Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
after: done => {
if (!has_fixtures && !cheerio.browser) {
has_fixtures = nock.recorder.play();
// eslint-disable-next-line no-console
console.log(
`This is disabled for browser/node interop. To capture fixutres,
open ${'`src/test-helpers.js`'} and uncomment lines 58 and 59 and
the fs import at top of file.`
);
// const text = `const nock = require('nock');\n${has_fixtures.join('\n')}`;
// fs.writeFile(fp, text, done);
} else {
done();
}
},
};
before: () => {
if (cheerio.browser) return;
if (!has_fixtures) {
try {
require(`../${fp}`); // eslint-disable-line global-require, import/no-dynamic-require, max-len
has_fixtures = true;
} catch (e) {
nock.recorder.rec({
dont_print: true,
});
}
} else {
has_fixtures = false;
nock.recorder.rec({
dont_print: true,
});
}
},
import cheerio from 'cheerio';
// User agent string sent with requests outside the browser.
const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36';

// Default request headers. The browser does not like us setting the user
// agent, so the browser build sends no custom headers.
export const REQUEST_HEADERS = cheerio.browser
  ? {}
  : { 'User-Agent': DEFAULT_USER_AGENT };
// How long, in milliseconds, a resource fetch may run before timing out.
export const FETCH_TIMEOUT = 10 * 1000;

// Content types that we do not extract content from.
const BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];

// Alternation of every bad content type, e.g. "audio/mpeg|image/gif|...".
const badContentTypeAlternation = BAD_CONTENT_TYPES.join('|');

// Matches a string that is exactly one of the bad content types,
// ignoring case.
export const BAD_CONTENT_TYPES_RE = new RegExp(
  `^(${badContentTypeAlternation})$`,
  'i'
);
total_pages: 1,
rendered_pages: 1,
};
}
if (contentType === 'markdown') {
const turndownService = new TurndownService();
result.content = turndownService.turndown(result.content);
} else if (contentType === 'text') {
result.content = $.text($(result.content));
}
return { ...result, ...extendedTypes };
},
browser: !!cheerio.browser,
// A convenience method for getting a resource to work with,
// e.g., for a custom extractor generator.
//
// `url` is forwarded as the only argument to Resource.create, and whatever
// that call returns is handed back unchanged.
// NOTE(review): `parse` in this file awaits Resource.create, so the return
// value is presumably a Promise — confirm against Resource.
fetchResource(url) {
return Resource.create(url);
},
// Public entry point for adding an extractor: forwards `extractor` to
// addCustomExtractor and returns that call's result unchanged.
// NOTE(review): the expected shape of `extractor` and the return value are
// defined by addCustomExtractor, which is not visible here — confirm there.
addExtractor(extractor) {
return addCustomExtractor(extractor);
},
};
export default Mercury;
async parse(url, { html, ...opts } = {}) {
const {
fetchAllPages = true,
fallback = true,
contentType = 'html',
headers = {},
extend,
customExtractor,
} = opts;
// if no url was passed and this is the browser version,
// set url to window.location.href and load the html
// from the current page
if (!url && cheerio.browser) {
url = window.location.href; // eslint-disable-line no-undef
html = html || cheerio.html();
}
const parsedUrl = URL.parse(url);
if (!validateUrl(parsedUrl)) {
return {
error: true,
message:
'The url parameter passed does not look like a valid URL. Please check your URL and try again.',
};
}
const $ = await Resource.create(url, html, parsedUrl, headers);
encodeDoc({ content, contentType }) {
const encoding = getEncoding(contentType);
let decodedContent = iconv.decode(content, encoding);
let $ = cheerio.load(decodedContent);
// after first cheerio.load, check to see if encoding matches
const contentTypeSelector = cheerio.browser
? 'meta[http-equiv=content-type]'
: 'meta[http-equiv=content-type i]';
const metaContentType =
$(contentTypeSelector).attr('content') ||
$('meta[charset]').attr('charset');
const properEncoding = getEncoding(metaContentType);
// if encodings in the header/body dont match, use the one in the body
if (metaContentType && properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
}
return $;
},
};