How to use htmlparser2 - 10 common examples

To help you get started, we’ve selected a few htmlparser2 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: xiandanin/magnetW — src/main/repository.js (view on GitHub)
/**
 * Requests a page and extracts search items from the response.
 *
 * Awaits `request(requestOptions)`, parses the response with htmlparser2 and
 * serializes it back to HTML, parses that HTML with `domParser`, and hands the
 * resulting document together with `xpath` to `parseDocument`.
 *
 * Takes a single object argument with these properties:
 * - `requestOptions`: forwarded unchanged to `request`
 * - `xpath`: forwarded unchanged to `parseDocument`
 *
 * Returns `{items}` holding the `parseDocument` result, or `{err}` holding the
 * caught error when any step inside the `try` throws.
 *
 * NOTE(review): this excerpt ends before the function's closing braces.
 */
async function requestParseSearchItems ({requestOptions, xpath}) {
  try {
    const rsp = await request(requestOptions)

    // Round-trip the response through htmlparser2 once before parsing it
    let outerHTML = htmlparser2.DomUtils.getOuterHTML(htmlparser2.parseDOM(rsp))
    const document = domParser.parseFromString(outerHTML)
    return {items: parseDocument(document, xpath)}
  } catch (e) {
    // Failures are logged and reported through the return value, not rethrown
    console.error('解析失败', e)
    return {err: e}
Source: Smithsonian/dpo-cook — source/server/tasks/MigratePlayTask.ts (view on GitHub)
if (!contentDiv) {
            throw new Error("Article content not found (no 'region-content' class)");

        // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
        const bodyDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
            contentDiv.children, true);

        if (bodyDiv) {
            const parent: any = bodyDiv.parent;
            bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));

        const title = DomUtils.findOne(elem => === "h1",
            contentDiv.children, true);

        const titleText = title && DomUtils.getText(title);
        article.title = titleText || `Article No. ${index + 1}`;

        let imageIndex = 0;
        const imageUrls: Dictionary = {};

        DomUtils.findOne(elem => {
            // download images
            if ( === "img" && elem.attribs && elem.attribs.src) {
                const src = elem.attribs.src;
                const imageUrl = src.startsWith("http") ? src : this.parameters.drupalBaseUrl + src;
                const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
                const imageFileName = `article-${articleIndex}-${imageName}`;
                const imageAssetPath = `${this.articlesDir}/${imageFileName}`;
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
export async function fetchArticle(context: IPlayContext, article: IArticle, url: string, index: number): Promise
    const articleIndex = index.toString().padStart(2, "0");

    console.log(`fetchArticle - fetching HTML from ${url}`);
    const pageHtml = await fetch.text(url, "GET");

    // parse the article's HTML content
    const handler = new DomHandler();
    const parser = new Parser(handler);
    const dom = handler.dom;

    // find parent of article content
    const contentDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
        dom, true);

    if (!contentDiv) {
        throw new Error("Article content not found (no 'region-content' class)");

    // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
    const bodyDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
        contentDiv.children, true);

    if (bodyDiv) {
        const parent: any = bodyDiv.parent;
        bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
Source: Smithsonian/dpo-cook — source/server/tasks/MigratePlayTask.ts (view on GitHub)
async fetchArticle(article: IArticle, url: string, index: number): Promise
        const articleIndex = index.toString().padStart(2, "0");

        console.log(`fetchArticle - fetching HTML from ${url}`);
        const pageHtml = await fetch.text(url, "GET");

        // parse the article's HTML content
        const handler = new DomHandler();
        const parser = new Parser(handler);
        const dom = handler.dom;

        // find parent of article content
        const contentDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
            dom, true);

        if (!contentDiv) {
            throw new Error("Article content not found (no 'region-content' class)");

        // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
        const bodyDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
            contentDiv.children, true);

        if (bodyDiv) {
            const parent: any = bodyDiv.parent;
            bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
if (bodyDiv) {
        const parent: any = bodyDiv.parent;
        bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));

    const title = DomUtils.findOne(elem => === "h1",
        contentDiv.children, true);

    const titleText = title && DomUtils.getText(title);
    article.title = titleText || `Article No. ${index + 1}`;

    let imageIndex = 0;
    const imageUrls: Dictionary = {};

    DomUtils.findOne(elem => {
        // download images
        if ( === "img" && elem.attribs && elem.attribs.src) {
            const src = elem.attribs.src;
            const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
            const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
            const imageFileName = `article-${articleIndex}-${imageName}`;
            const imageAssetPath = `${context.articleDir}/${imageFileName}`;
            context.files[imageAssetPath] = imageAssetPath;

            elem.attribs.src = imageFileName; // relative to location of html file
            imageUrls[imageUrl] = imageAssetPath;

        // remove additional classes from all nodes
        if (elem.attribs && elem.attribs.class) {
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
if (!contentDiv) {
        throw new Error("Article content not found (no 'region-content' class)");

    // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
    const bodyDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
        contentDiv.children, true);

    if (bodyDiv) {
        const parent: any = bodyDiv.parent;
        bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));

    const title = DomUtils.findOne(elem => === "h1",
        contentDiv.children, true);

    const titleText = title && DomUtils.getText(title);
    article.title = titleText || `Article No. ${index + 1}`;

    let imageIndex = 0;
    const imageUrls: Dictionary = {};

    DomUtils.findOne(elem => {
        // download images
        if ( === "img" && elem.attribs && elem.attribs.src) {
            const src = elem.attribs.src;
            const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
            const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
            const imageFileName = `article-${articleIndex}-${imageName}`;
            const imageAssetPath = `${context.articleDir}/${imageFileName}`;
Source: bigeasy/stencil — stencil.js (view on GitHub)
}, function (body) {
            var handler = new htmlparser.DefaultHandler()
            var tokenizer = new (require('./parser'))
            var parser = new htmlparser.Parser(handler);
            tokenizer._cbs = new TokenizerProxy(parser._tokenizer._cbs)
            parser._tokenizer = tokenizer
            // great. now it's time for a serializer.
            //console.log( domutils.getOuterHTML(handler.dom[0]))
            //console.log(require('util').inspect(handler.dom[0], false, null))
            var actual = new (xmldom.DOMParser)().parseFromString('')
            createXMLTemplate(actual, handler.dom[0])
            // Why? Because. Because namespaces. Hateful namespaces.
            var actual = new (xmldom.DOMParser)().parseFromString(actual.toString())
Source: Smithsonian/dpo-cook — source/server/tasks/MigratePlayTask.ts (view on GitHub)
}, contentDiv.children, true);

        // fetch all images
        const urls = Object.keys(imageUrls);
        const promises: Promise[] = => {
            console.log(`fetchArticle - fetching image from ${url}`);
            return fetch.buffer(url, "GET").then(image => {
                const imageFileName = imageUrls[url];
                const imageFilePath = this.getFilePath(imageFileName);
                console.log(`fetchArticle - writing image to ${imageFilePath}`);
                return fs.writeFile(imageFilePath, Buffer.from(image))

        // write article HTML content
        const contentHtml = DomUtils.getInnerHTML(contentDiv);
        const articleFileName = `${this.articlesDir}/article-${articleIndex}.html`;
        this.result.files[`scene_${articleFileName}`] = articleFileName;
        const articleFilePath = this.getFilePath(articleFileName);
        promises.push(fs.writeFile(articleFilePath, contentHtml));

        return Promise.all(promises);
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
}, contentDiv.children, true);

    // fetch all images
    const urls = Object.keys(imageUrls);
    const promises: Promise[] = => {
        console.log(`fetchArticle - fetching image from ${url}`);
        return fetch.buffer(url, "GET").then(image => {
            const imageFileName = imageUrls[url];
            const imageFilePath = path.resolve(context.job.jobDir, imageFileName);
            console.log(`fetchArticle - writing image to ${imageFilePath}`);
            return fs.writeFile(imageFilePath, Buffer.from(image))

    // write article HTML content
    const contentHtml = DomUtils.getInnerHTML(contentDiv);
    const articleFileName = `${context.articleDir}/article-${articleIndex}.html`;
    context.files[articleFileName] = articleFileName;
    const articleFilePath = path.resolve(context.job.jobDir, articleFileName);
    promises.push(fs.writeFile(articleFilePath, contentHtml));

    return Promise.all(promises);
Source: realywithoutname/mini-program-webpack-loader — src/classes/Wxml.js (view on GitHub)
usedComponents () {
    let tags = []
    DomUtils.find((el) => {
      let { name, attribs = {} } = el

      // 记录所有非原生组件名
      if (name && !isNativeTag(name)) {

      let attrKeys = Object.keys(attribs)

       * 使用自定义组件是抽象组件
      if (/generic:/.test(attrKeys.join(';'))) {
        attrKeys.forEach(key => {
          /generic:/.test(key) && tags.push(attribs[key])