Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
async function requestParseSearchItems ({requestOptions, xpath}) {
try {
const rsp = await request(requestOptions)
// 用htmlparser2转换一次再解析
let outerHTML = htmlparser2.DomUtils.getOuterHTML(htmlparser2.parseDOM(rsp))
const document = domParser.parseFromString(outerHTML)
return {items: parseDocument(document, xpath)}
} catch (e) {
console.error('解析失败', e)
return {err: e}
}
}
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
}
const title = DomUtils.findOne(elem => elem.name === "h1",
contentDiv.children, true);
const titleText = title && DomUtils.getText(title);
article.title = titleText || `Article No. ${index + 1}`;
let imageIndex = 0;
const imageUrls: Dictionary = {};
DomUtils.findOne(elem => {
// download images
if (elem.name === "img" && elem.attribs && elem.attribs.src) {
const src = elem.attribs.src;
const imageUrl = src.startsWith("http") ? src : this.parameters.drupalBaseUrl + src;
const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
const imageFileName = `article-${articleIndex}-${imageName}`;
const imageAssetPath = `${this.articlesDir}/${imageFileName}`;
export async function fetchArticle(context: IPlayContext, article: IArticle, url: string, index: number): Promise
{
const articleIndex = index.toString().padStart(2, "0");
console.log(`fetchArticle - fetching HTML from ${url}`);
const pageHtml = await fetch.text(url, "GET");
// parse the article's HTML content
const handler = new DomHandler();
const parser = new Parser(handler);
parser.write(pageHtml);
parser.done();
const dom = handler.dom;
// find parent of article content
const contentDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
dom, true);
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
async fetchArticle(article: IArticle, url: string, index: number): Promise
{
const articleIndex = index.toString().padStart(2, "0");
console.log(`fetchArticle - fetching HTML from ${url}`);
const pageHtml = await fetch.text(url, "GET");
// parse the article's HTML content
const handler = new DomHandler();
const parser = new Parser(handler);
parser.write(pageHtml);
parser.done();
const dom = handler.dom;
// find parent of article content
const contentDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
dom, true);
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
}
const title = DomUtils.findOne(elem => elem.name === "h1",
contentDiv.children, true);
const titleText = title && DomUtils.getText(title);
article.title = titleText || `Article No. ${index + 1}`;
let imageIndex = 0;
const imageUrls: Dictionary = {};
DomUtils.findOne(elem => {
// download images
if (elem.name === "img" && elem.attribs && elem.attribs.src) {
const src = elem.attribs.src;
const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
const imageFileName = `article-${articleIndex}-${imageName}`;
const imageAssetPath = `${context.articleDir}/${imageFileName}`;
context.files[imageAssetPath] = imageAssetPath;
elem.attribs.src = imageFileName; // relative to location of html file
imageUrls[imageUrl] = imageAssetPath;
imageIndex++;
}
// remove additional classes from all nodes
if (elem.attribs && elem.attribs.class) {
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
}
const title = DomUtils.findOne(elem => elem.name === "h1",
contentDiv.children, true);
const titleText = title && DomUtils.getText(title);
article.title = titleText || `Article No. ${index + 1}`;
let imageIndex = 0;
const imageUrls: Dictionary = {};
DomUtils.findOne(elem => {
// download images
if (elem.name === "img" && elem.attribs && elem.attribs.src) {
const src = elem.attribs.src;
const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
const imageFileName = `article-${articleIndex}-${imageName}`;
const imageAssetPath = `${context.articleDir}/${imageFileName}`;
}, function (body) {
var handler = new htmlparser.DefaultHandler()
var tokenizer = new (require('./parser'))
var parser = new htmlparser.Parser(handler);
tokenizer._cbs = new TokenizerProxy(parser._tokenizer._cbs)
parser._tokenizer = tokenizer
parser.parseComplete(body)
//console.log('=======')
// great. now it's time for a serializer.
//console.log( domutils.getOuterHTML(handler.dom[0]))
//console.log('=======')
//console.log(require('util').inspect(handler.dom[0], false, null))
var actual = new (xmldom.DOMParser)().parseFromString('')
actual.documentElement.parentNode.removeChild(actual.documentElement)
createXMLTemplate(actual, handler.dom[0])
// Why? Because. Because namespaces. Hateful namespaces.
var actual = new (xmldom.DOMParser)().parseFromString(actual.toString())
}, contentDiv.children, true);
// fetch all images
const urls = Object.keys(imageUrls);
const promises: Promise[] = urls.map(url => {
console.log(`fetchArticle - fetching image from ${url}`);
return fetch.buffer(url, "GET").then(image => {
const imageFileName = imageUrls[url];
const imageFilePath = this.getFilePath(imageFileName);
console.log(`fetchArticle - writing image to ${imageFilePath}`);
return fs.writeFile(imageFilePath, Buffer.from(image))
});
});
// write article HTML content
const contentHtml = DomUtils.getInnerHTML(contentDiv);
const articleFileName = `${this.articlesDir}/article-${articleIndex}.html`;
this.result.files[`scene_${articleFileName}`] = articleFileName;
const articleFilePath = this.getFilePath(articleFileName);
promises.push(fs.writeFile(articleFilePath, contentHtml));
return Promise.all(promises);
}
}, contentDiv.children, true);
// fetch all images
const urls = Object.keys(imageUrls);
const promises: Promise[] = urls.map(url => {
console.log(`fetchArticle - fetching image from ${url}`);
return fetch.buffer(url, "GET").then(image => {
const imageFileName = imageUrls[url];
const imageFilePath = path.resolve(context.job.jobDir, imageFileName);
console.log(`fetchArticle - writing image to ${imageFilePath}`);
return fs.writeFile(imageFilePath, Buffer.from(image))
});
});
// write article HTML content
const contentHtml = DomUtils.getInnerHTML(contentDiv);
const articleFileName = `${context.articleDir}/article-${articleIndex}.html`;
context.files[articleFileName] = articleFileName;
const articleFilePath = path.resolve(context.job.jobDir, articleFileName);
promises.push(fs.writeFile(articleFilePath, contentHtml));
return Promise.all(promises);
}
usedComponents () {
let tags = []
DomUtils.find((el) => {
let { name, attribs = {} } = el
// 记录所有非原生组件名
if (name && !isNativeTag(name)) {
tags.push(name)
}
let attrKeys = Object.keys(attribs)
/**
* 使用自定义组件是抽象组件
*/
if (/generic:/.test(attrKeys.join(';'))) {
attrKeys.forEach(key => {
/generic:/.test(key) && tags.push(attribs[key])
})