Skip to content

Commit

Permalink
fix(gatsby-source-wordpress): HTML image regex's (#29778) (#29816)
Browse files Browse the repository at this point in the history
Co-authored-by: gatsbybot <mathews.kyle+gatsbybot@gmail.com>
(cherry picked from commit f6edccf)

Co-authored-by: Tyler Barnes <tyler@gatsbyjs.com>
  • Loading branch information
GatsbyJS Bot and TylerBarnes committed Feb 27, 2021
1 parent cb499e6 commit bca7951
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 31 deletions.
25 changes: 25 additions & 0 deletions packages/gatsby-source-wordpress/__tests__/process-node.test.js
@@ -0,0 +1,25 @@
import {
getImgSrcRemoteFileMatchesFromNodeString,
getImgTagMatchesWithUrl,
} from "../dist/steps/source-nodes/create-nodes/process-node"

// Regression test: image URLs that contain non-ASCII characters (here "©")
// must be picked up by both the src regex and the <img> tag matcher.
test(`HTML image transformation regex matches images`, async () => {
  const wpUrl = `http://wp.fakesite.com`

  // The same upload referenced three ways: https, http and root-relative.
  // Each src quote is preceded by a literal backslash, which is the form
  // the src regex looks for.
  const nodeString = `<img src=\\"https://wp.fakesite.com/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />
<img src=\\"http://wp.fakesite.com/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />
<img src=\\"/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />`

  // every src attribute should be found ...
  const srcMatches = getImgSrcRemoteFileMatchesFromNodeString(nodeString)
  expect(srcMatches).toHaveLength(3)

  // ... and so should every <img> tag
  const tagMatches = getImgTagMatchesWithUrl({ nodeString, wpUrl })
  expect(tagMatches).toHaveLength(3)
})
@@ -1,3 +1,4 @@
/* eslint-disable no-useless-escape */
import { isWebUri } from "valid-url"
import { fluid } from "gatsby-plugin-sharp"
import Img from "gatsby-image"
Expand Down Expand Up @@ -30,7 +31,7 @@ const getNodeEditLink = node => {

const findReferencedImageNodeIds = ({ nodeString, pluginOptions, node }) => {
// if the lazyNodes plugin option is set we don't need to find
// image node id's because those nodes will be fetched lazily in resolvers
// image node id's because those nodes will be fetched lazily in resolvers.
if (pluginOptions.type.MediaItem.lazyNodes) {
return []
}
Expand Down Expand Up @@ -327,6 +328,17 @@ const getCheerioElementFromMatch = wpUrl => ({ match, tag = `img` }) => {
}
}

/**
 * Converts <img> tag matches into cheerio elements and keeps only the ones
 * whose `src` attribute is a web URI.
 *
 * @param {object} args
 * @param {Array} args.imgTagMatches matches to convert
 * @param {string} args.wpUrl passed through to getCheerioElementFromMatch
 * @returns {Array} the converted elements that have a usable `src`
 */
const getCheerioElementsFromMatches = ({ imgTagMatches, wpUrl }) => {
  const toCheerioElement = getCheerioElementFromMatch(wpUrl)

  const hasWebUriSrc = ({ cheerioImg }) => {
    const { src } = cheerioImg.attribs

    // the src is URI-encoded before validation so that URLs containing
    // characters outside the URI character set are not rejected outright
    return src ? isWebUri(encodeURI(src)) : false
  }

  return imgTagMatches.map(toCheerioElement).filter(hasWebUriSrc)
}

const getLargestSizeFromSizesAttribute = sizesString => {
const sizesStringsArray = sizesString.split(`,`)

Expand Down Expand Up @@ -444,6 +456,28 @@ const cacheCreatedFileNodeBySrc = ({ node, src }) => {
}
}

// Matches the value of a `src=\"` attribute (the quote is preceded by a
// literal backslash) when that value looks like a media file URL.
//
// - The URL must start with an http(s)/ftp/file scheme, `www.`, `ftp.` or `/`.
// - It may then contain any characters except single or double quotes.
// - It must end in one of the listed file extensions.
// - An optional query string (`?...`, stopping at a backslash, quote, space
//   or dot) is captured separately.
// - The match must be followed by `\"`, a space or a dot (lookahead only).
//
// Capture group 1 is the URL up to and including the extension, group 2 is
// the query string (empty when there is none).
// Flags: global, case-insensitive, multiline.
const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:[^'"])*\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" \.]*|)(?=\\"| |\.)/gim

/**
 * Runs imgSrcRemoteFileRegex over a node string and returns the matches
 * that are not inside a JSON encoded string field.
 *
 * @param {string} nodeString the string to search
 * @returns {Array} execall matches whose first sub-match has no `\/\/`
 */
export const getImgSrcRemoteFileMatchesFromNodeString = nodeString => {
  const allSrcMatches = execall(imgSrcRemoteFileRegex, nodeString)

  return allSrcMatches.filter(({ subMatches }) => {
    // a url with escaped slashes is json encoded, which means it lives
    // inside a JSON encoded string field and must not be processed
    const url = subMatches[0]

    return !url.includes(`\\/\\/`)
  })
}

/**
 * Finds the <img> tags in a node string, ignoring any that sit inside
 * <pre> or <code> elements, and keeps the matches accepted by
 * `filterMatches(wpUrl)`.
 *
 * @param {object} args
 * @param {string} args.nodeString the string to search
 * @param {string} args.wpUrl passed to filterMatches to build the predicate
 * @returns {Array} execall matches for the remaining <img> tags
 */
export const getImgTagMatchesWithUrl = ({ nodeString, wpUrl }) => {
  // Each pre/code element is removed on its own: the body is matched lazily
  // (and across newlines) so that content BETWEEN two separate elements is
  // kept. A greedy match would swallow everything from the first opening tag
  // to the last closing tag, dropping real images in between.
  // The lookahead after the tag name stops `<preview>` / `<code-block>` from
  // being treated as pre/code, and `[^>]*` also accepts attribute-less tags.
  const searchableString = nodeString
    // we don't want to match images inside pre
    .replace(/<pre(?![\w-])[^>]*>[\w\W]*?<\/pre>/gim, ``)
    // and code tags, so temporarily remove those tags and everything inside them
    .replace(/<code(?![\w-])[^>]*>[\w\W]*?<\/code>/gim, ``)

  return execall(/<img([\w\W]+?)[\/]?>/gim, searchableString).filter(
    filterMatches(wpUrl)
  )
}

const replaceNodeHtmlImages = async ({
nodeString,
node,
Expand All @@ -456,38 +490,15 @@ const replaceNodeHtmlImages = async ({
return nodeString
}

const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" .]*|)(?=\\"| |\.)/gim
const imageUrlMatches = getImgSrcRemoteFileMatchesFromNodeString(nodeString)

const imageUrlMatches = execall(imgSrcRemoteFileRegex, nodeString).filter(
({ subMatches }) => {
// if our match is json encoded, that means it's inside a JSON
// encoded string field.
const isInJSON = subMatches[0].includes(`\\/\\/`)

// we shouldn't process encoded JSON, so skip this match if it's JSON
return !isInJSON
}
)

const imgTagMatches = execall(
/<img([\w\W]+?)[/]?>/gim,
nodeString
// we don't want to match images inside pre
.replace(/<pre([\w\W]+?)[/]?>.*(<\/pre>)/gim, ``)
// and code tags, so temporarily remove those tags and everything inside them
.replace(/<code([\w\W]+?)[/]?>.*(<\/code>)/gim, ``)
).filter(filterMatches(wpUrl))
const imgTagMatches = getImgTagMatchesWithUrl({ nodeString, wpUrl })

if (imageUrlMatches.length && imgTagMatches.length) {
const cheerioImages = imgTagMatches
.map(getCheerioElementFromMatch(wpUrl))
.filter(({ cheerioImg: { attribs } }) => {
if (!attribs.src) {
return false
}

return isWebUri(attribs.src)
})
const cheerioImages = getCheerioElementsFromMatches({
imgTagMatches,
wpUrl,
})

const htmlMatchesToMediaItemNodesMap = await fetchNodeHtmlImageMediaItemNodes(
{
Expand Down
Expand Up @@ -211,7 +211,8 @@ export const stripImageSizesFromUrl = url => {
const fileExtension = urlToFileExtension(url)

const imageSizesPattern = new RegExp(
`(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `.${fileExtension}` : ``}`
// eslint-disable-next-line no-useless-escape
`(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `\.${fileExtension}` : ``}`
)

let urlWithoutSizes = url.replace(imageSizesPattern, ``)
Expand Down

0 comments on commit bca7951

Please sign in to comment.