How to use the htmlparser2.DomUtils.findOne function in htmlparser2

To help you get started, we’ve selected a few examples of htmlparser2's DomUtils.findOne function, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: Smithsonian/dpo-cook — source/server/tasks/MigratePlayTask.ts (view on GitHub)
if (!contentDiv) {
            throw new Error("Article content not found (no 'region-content' class)");
        }

        // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
        const bodyDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
            contentDiv.children, true);

        if (bodyDiv) {
            const parent: any = bodyDiv.parent;
            bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
            DomUtils.removeElement(bodyDiv);
        }

        const title = DomUtils.findOne(elem => elem.name === "h1",
            contentDiv.children, true);

        const titleText = title && DomUtils.getText(title);
        article.title = titleText || `Article No. ${index + 1}`;

        let imageIndex = 0;
        const imageUrls: Dictionary = {};

        DomUtils.findOne(elem => {
            // download images
            if (elem.name === "img" && elem.attribs && elem.attribs.src) {
                const src = elem.attribs.src;
                const imageUrl = src.startsWith("http") ? src : this.parameters.drupalBaseUrl + src;
                const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
                const imageFileName = `article-${articleIndex}-${imageName}`;
                const imageAssetPath = `${this.articlesDir}/${imageFileName}`;
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
export async function fetchArticle(context: IPlayContext, article: IArticle, url: string, index: number): Promise
{
    const articleIndex = index.toString().padStart(2, "0");

    console.log(`fetchArticle - fetching HTML from ${url}`);
    const pageHtml = await fetch.text(url, "GET");

    // parse the article's HTML content
    const handler = new DomHandler();
    const parser = new Parser(handler);
    parser.write(pageHtml);
    parser.done();
    const dom = handler.dom;

    // find parent of article content
    const contentDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
        dom, true);

    if (!contentDiv) {
        throw new Error("Article content not found (no 'region-content' class)");
    }

    // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
    const bodyDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
        contentDiv.children, true);

    if (bodyDiv) {
        const parent: any = bodyDiv.parent;
        bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
        DomUtils.removeElement(bodyDiv);
Source: Smithsonian/dpo-cook — source/server/tasks/MigratePlayTask.ts (view on GitHub)
async fetchArticle(article: IArticle, url: string, index: number): Promise
    {
        const articleIndex = index.toString().padStart(2, "0");

        console.log(`fetchArticle - fetching HTML from ${url}`);
        const pageHtml = await fetch.text(url, "GET");

        // parse the article's HTML content
        const handler = new DomHandler();
        const parser = new Parser(handler);
        parser.write(pageHtml);
        parser.done();
        const dom = handler.dom;

        // find parent of article content
        const contentDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
            dom, true);

        if (!contentDiv) {
            throw new Error("Article content not found (no 'region-content' class)");
        }

        // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
        const bodyDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
            contentDiv.children, true);

        if (bodyDiv) {
            const parent: any = bodyDiv.parent;
            bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
            DomUtils.removeElement(bodyDiv);
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
if (bodyDiv) {
        const parent: any = bodyDiv.parent;
        bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
        DomUtils.removeElement(bodyDiv);
    }

    const title = DomUtils.findOne(elem => elem.name === "h1",
        contentDiv.children, true);

    const titleText = title && DomUtils.getText(title);
    article.title = titleText || `Article No. ${index + 1}`;

    let imageIndex = 0;
    const imageUrls: Dictionary = {};

    DomUtils.findOne(elem => {
        // download images
        if (elem.name === "img" && elem.attribs && elem.attribs.src) {
            const src = elem.attribs.src;
            const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
            const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
            const imageFileName = `article-${articleIndex}-${imageName}`;
            const imageAssetPath = `${context.articleDir}/${imageFileName}`;
            context.files[imageAssetPath] = imageAssetPath;

            elem.attribs.src = imageFileName; // relative to location of html file
            imageUrls[imageUrl] = imageAssetPath;
            imageIndex++;
        }

        // remove additional classes from all nodes
        if (elem.attribs && elem.attribs.class) {
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
if (!contentDiv) {
        throw new Error("Article content not found (no 'region-content' class)");
    }

    // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
    const bodyDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
        contentDiv.children, true);

    if (bodyDiv) {
        const parent: any = bodyDiv.parent;
        bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
        DomUtils.removeElement(bodyDiv);
    }

    const title = DomUtils.findOne(elem => elem.name === "h1",
        contentDiv.children, true);

    const titleText = title && DomUtils.getText(title);
    article.title = titleText || `Article No. ${index + 1}`;

    let imageIndex = 0;
    const imageUrls: Dictionary = {};

    DomUtils.findOne(elem => {
        // download images
        if (elem.name === "img" && elem.attribs && elem.attribs.src) {
            const src = elem.attribs.src;
            const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
            const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
            const imageFileName = `article-${articleIndex}-${imageName}`;
            const imageAssetPath = `${context.articleDir}/${imageFileName}`;
Source: Smithsonian/dpo-cook — source/server/migration/playArticleTools.ts (view on GitHub)
const parser = new Parser(handler);
    parser.write(pageHtml);
    parser.done();
    const dom = handler.dom;

    // find parent of article content
    const contentDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
        dom, true);

    if (!contentDiv) {
        throw new Error("Article content not found (no 'region-content' class)");
    }

    // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
    const bodyDiv = DomUtils.findOne(elem =>
        elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
        contentDiv.children, true);

    if (bodyDiv) {
        const parent: any = bodyDiv.parent;
        bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
        DomUtils.removeElement(bodyDiv);
    }

    const title = DomUtils.findOne(elem => elem.name === "h1",
        contentDiv.children, true);

    const titleText = title && DomUtils.getText(title);
    article.title = titleText || `Article No. ${index + 1}`;

    let imageIndex = 0;
Source: Smithsonian/dpo-cook — source/server/tasks/MigratePlayTask.ts (view on GitHub)
const parser = new Parser(handler);
        parser.write(pageHtml);
        parser.done();
        const dom = handler.dom;

        // find parent of article content
        const contentDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
            dom, true);

        if (!contentDiv) {
            throw new Error("Article content not found (no 'region-content' class)");
        }

        // remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
        const bodyDiv = DomUtils.findOne(elem =>
            elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
            contentDiv.children, true);

        if (bodyDiv) {
            const parent: any = bodyDiv.parent;
            bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
            DomUtils.removeElement(bodyDiv);
        }

        const title = DomUtils.findOne(elem => elem.name === "h1",
            contentDiv.children, true);

        const titleText = title && DomUtils.getText(title);
        article.title = titleText || `Article No. ${index + 1}`;

        let imageIndex = 0;