How to use fathom-web - 10 common examples

To help you get started, we’ve selected a few fathom-web examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github endlessm / eos-knowledge-lib / docs / framework / tutorial / code / cc-ingester / index.js View on Github external
return scoreIfHas;
    }
    return 1;
};

// Fathom ruleset for extracting the main body of a blog post. Elements are
// tagged with the 'paragraphish' type and scored by several rules; the best
// cluster of those elements is exposed under the 'content' output key.
const rules = ruleset(
    // Isolate the actual blog post body text. Based on Fathom's example
    // Readability rules
    // Candidates: text-bearing block elements, with props computed by
    // scoreByLength.
    rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
        props(scoreByLength).type('paragraphish')),
    // Every candidate is additionally scored by byInverseLinkDensity.
    rule(type('paragraphish'), score(byInverseLinkDensity)),
    // <p> elements get a constant score of 4.5.
    rule(dom('p'), score(4.5).type('paragraphish')),

    // Tweaks for this particular blog
    // hasAncestor('article', 10) builds the scoring callback.
    // NOTE(review): presumably 10 for nodes under an <article> and 1
    // otherwise -- confirm against the helper's definition.
    rule(type('paragraphish'), score(hasAncestor('article', 10))),
    // Paragraphs inside .entry-summary get a score of 0.
    // NOTE(review): presumably this removes them from contention because
    // scores combine multiplicatively -- verify for the Fathom version in use.
    rule(dom('.entry-summary p'), score(0).type('paragraphish')),
    // <figure> elements also become candidates, with props from
    // scoreByImageSize.
    rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),

    // Find the best cluster of paragraph-ish nodes
    // The resulting nodes are passed through Futils.domSort before being
    // emitted as 'content'.
    // NOTE(review): presumably domSort restores document order -- confirm.
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
            differentDepthCost: 6.5,
            differentTagCost: 2,
            sameTagCost: 0.5,
            strideCost: 0,
        }),
        out('content').allThrough(Futils.domSort)));

async function ingestArticle(hatch, {title, link, date, author}) {
    let $ = await Libingester.util.fetch_html(link);
    const baseURI = Libingester.util.get_doc_base_uri($, link);
github WorldBrain / Memex / src / util / fathom-extractor.js View on Github external
// Initial tests of this are pretty inaccurate; lots to learn to be able to tweak the rules and use it well
// Readability-style content extraction: text-bearing elements are tagged as
// 'paragraphish', weighted, and the best cluster is emitted as 'content'.
const rules = ruleset(
    // Candidates: block-level text containers, with props from scoreByLength.
    rule(
        dom('p,div,li,blockquote,h1,h2,h3,h4,h5,h6'),
        props(scoreByLength).type('paragraphish'),
    ),
    // Weight every candidate by 1.5 * (1 - link density). When a
    // 'paragraphish' note exists, its inlineLength is handed to linkDensity.
    rule(
        type('paragraphish'),
        score(fnode => {
            const note = fnode.noteFor('paragraphish')
            let density
            if (note) {
                density = linkDensity(fnode, note.inlineLength)
            } else {
                density = linkDensity(fnode)
            }
            return (1 - density) * 1.5
        }),
    ),
    // <p> elements get a constant score of 4.5.
    rule(dom('p'), score(4.5).type('paragraphish')),
    // Emit the best cluster of candidates, passed through domSort, as 'content'.
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
            differentDepthCost: 6.5,
            differentTagCost: 2,
            sameTagCost: 0.5,
            strideCost: 0,
        }),
        out('content').allThrough(domSort),
    ),
)

export default rules
github mozilla / fathom-trainees / src / trainees.js View on Github external
const cMax = Math.max(r, g, b);
                const cMin = Math.min(r, g, b);
                const delta = cMax - cMin;
                const lightness = (cMax + cMin) / 2;
                const denom = (1 - (Math.abs(2 * lightness - 1)));
                // Return 0 if it's black (R, G, and B all 0).
                return (denom === 0) ? 0 : delta / denom;
            }

            /* The actual ruleset */

            // Overlay-detection ruleset: every <div> becomes an 'overlay'
            // candidate, five scoring rules apply the callbacks big,
            // nearlyOpaque, monochrome, suspiciousClassOrId and visible, and
            // the candidate(s) selected by max() are emitted under 'overlay'.
            // NOTE(review): each scoring rule carries a `name` option,
            // presumably so the trainer can key a coefficient to it -- confirm.
            const rules = ruleset([
                rule(dom('div'), type('overlay')),
                rule(type('overlay'), score(big), {name: 'big'}),
                rule(type('overlay'), score(nearlyOpaque), {name: 'nearlyOpaque'}),
                rule(type('overlay'), score(monochrome), {name: 'monochrome'}),
                rule(type('overlay'), score(suspiciousClassOrId), {name: 'classOrId'}),
                rule(type('overlay'), score(visible), {name: 'visible'}),
                rule(type('overlay').max(), out('overlay'))
            ]);
            return rules;
        }
    }
github WorldBrain / Memex / src / util / fathom-extractor.js View on Github external
} from 'fathom-web'
import { inlineTextLength, linkDensity } from 'fathom-web/utils'

/**
 * Callback handed to Fathom's `props`: scores a node by the inline text
 * length of its element.
 *
 * @param {fnode} fnode - Fathom node; only its `element` property is read
 * @return {any} Object whose `score` key is `inlineTextLength(element)`
 */
const scoreByLength = fnode => {
    const { element } = fnode
    return { score: inlineTextLength(element) }
}

// Based on: https://hacks.mozilla.org/2017/04/fathom-a-framework-for-understanding-web-pages/
// Meant to be similar to Readability-like extraction of a page's main-content
// Initial tests of this are pretty inaccurate; lots to learn to be able to tweak the rules and use it well
const rules = ruleset(
    rule(
        dom('p,div,li,blockquote,h1,h2,h3,h4,h5,h6'),
        props(scoreByLength).type('paragraphish'),
    ),
    rule(
        type('paragraphish'),
        score(fnode => {
            const paragraphishNote = fnode.noteFor('paragraphish')
            return paragraphishNote
                ? (1 - linkDensity(fnode, paragraphishNote.inlineLength)) * 1.5
                : (1 - linkDensity(fnode)) * 1.5
        }),
    ),
    rule(dom('p'), score(4.5).type('paragraphish')),
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
github WorldBrain / Memex / src / util / fathom-extractor.js View on Github external
* @param {fnode} fnode
 * @return {any} Object containing a `score` key derived from the element's text length
 */
const scoreByLength = fnode => {
    // Only the node's element is needed; its inline text length is the score.
    const { element } = fnode
    return { score: inlineTextLength(element) }
}

// Based on: https://hacks.mozilla.org/2017/04/fathom-a-framework-for-understanding-web-pages/
// Meant to be similar to Readability-like extraction of a page's main-content
// Initial tests of this are pretty inaccurate; lots to learn to be able to tweak the rules and use it well
const rules = ruleset(
    rule(
        dom('p,div,li,blockquote,h1,h2,h3,h4,h5,h6'),
        props(scoreByLength).type('paragraphish'),
    ),
    rule(
        type('paragraphish'),
        score(fnode => {
            const paragraphishNote = fnode.noteFor('paragraphish')
            return paragraphishNote
                ? (1 - linkDensity(fnode, paragraphishNote.inlineLength)) * 1.5
                : (1 - linkDensity(fnode)) * 1.5
        }),
    ),
    rule(dom('p'), score(4.5).type('paragraphish')),
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
            differentDepthCost: 6.5,
            differentTagCost: 2,
            sameTagCost: 0.5,
            strideCost: 0,
github endlessm / eos-knowledge-lib / docs / framework / tutorial / code / cc-ingester / index.js View on Github external
}
    return 1;
};

// Fathom ruleset for extracting the main body of a blog post. Elements are
// tagged with the 'paragraphish' type and scored by several rules; the best
// cluster of those elements is exposed under the 'content' output key.
const rules = ruleset(
    // Isolate the actual blog post body text. Based on Fathom's example
    // Readability rules
    // Candidates: text-bearing block elements, with props computed by
    // scoreByLength.
    rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
        props(scoreByLength).type('paragraphish')),
    // Every candidate is additionally scored by byInverseLinkDensity.
    rule(type('paragraphish'), score(byInverseLinkDensity)),
    // <p> elements get a constant score of 4.5.
    rule(dom('p'), score(4.5).type('paragraphish')),

    // Tweaks for this particular blog
    // hasAncestor('article', 10) builds the scoring callback.
    // NOTE(review): presumably 10 for nodes under an <article> and 1
    // otherwise -- confirm against the helper's definition.
    rule(type('paragraphish'), score(hasAncestor('article', 10))),
    // Paragraphs inside .entry-summary get a score of 0.
    // NOTE(review): presumably this removes them from contention because
    // scores combine multiplicatively -- verify for the Fathom version in use.
    rule(dom('.entry-summary p'), score(0).type('paragraphish')),
    // <figure> elements also become candidates, with props from
    // scoreByImageSize.
    rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),

    // Find the best cluster of paragraph-ish nodes
    // The resulting nodes are passed through Futils.domSort before being
    // emitted as 'content'.
    // NOTE(review): presumably domSort restores document order -- confirm.
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
            differentDepthCost: 6.5,
            differentTagCost: 2,
            sameTagCost: 0.5,
            strideCost: 0,
        }),
        out('content').allThrough(Futils.domSort)));

async function ingestArticle(hatch, {title, link, date, author}) {
    let $ = await Libingester.util.fetch_html(link);
    const baseURI = Libingester.util.get_doc_base_uri($, link);
github mozilla / price-tracker / src / extraction / fathom / ruleset_factory.js View on Github external
/**
       * Title rules
       */
      // consider all eligible h1 elements
      rule(dom('h1').when(this.isEligibleTitle.bind(this)), type('title')),
      // better score based on y-axis proximity to max scoring image element
      rule(type('title'), score(this.isNearImageTopOrBottom.bind(this)), {name: 'isNearImageTopOrBottom'}),
      // return title element(s) with max score
      rule(type('title').max(), out('title')),

      /**
       * Price rules
       */
      // 72% by itself, at [4, 4, 4, 4...]!:
      // consider all eligible span and h2 elements
      rule(dom('span, h2').when(this.isEligiblePrice.bind(this)), type('price')),
      // check if the element has a '$' in its innerText
      rule(type('price'), score(this.hasDollarSign.bind(this)), {name: 'hasDollarSign'}),
      // better score the closer the element is to the top of the page
      rule(type('price'), score(this.isAboveTheFold.bind(this)), {name: 'isAboveTheFoldPrice'}),
      // check if the id has "price" in it
      rule(type('price'), score(this.hasPriceInID.bind(this)), {name: 'hasPriceInID'}),
      rule(type('price'), score(this.hasPriceInParentID.bind(this)), {name: 'hasPriceInParentID'}),
      // check if any class names have "price" in them
      rule(type('price'), score(this.hasPriceInClassName.bind(this)), {name: 'hasPriceInClassName'}),
      rule(type('price'), score(this.hasPriceInParentClassName.bind(this)), {name: 'hasPriceInParentClassName'}),
      // better score for larger font size
      rule(type('price'), score(this.fontIsBig.bind(this)), {name: 'fontIsBig'}),
      // better score based on x-axis proximity to max scoring image element
      rule(type('price'), score(this.isNearImage.bind(this)), {name: 'isNearImage'}),
      // check if innerText has a price pattern
      rule(type('price'), score(this.hasPriceishPattern.bind(this)), {name: 'hasPriceishPattern'}),
github mozilla / price-tracker / src / extraction / fathom / ruleset_factory.js View on Github external
/**
       * Price rules
       */
      // 72% by itself, at [4, 4, 4, 4...]!:
      // consider all eligible span and h2 elements
      rule(dom('span, h2').when(this.isEligiblePrice.bind(this)), type('price')),
      // check if the element has a '$' in its innerText
      rule(type('price'), score(this.hasDollarSign.bind(this)), {name: 'hasDollarSign'}),
      // better score the closer the element is to the top of the page
      rule(type('price'), score(this.isAboveTheFold.bind(this)), {name: 'isAboveTheFoldPrice'}),
      // check if the id has "price" in it
      rule(type('price'), score(this.hasPriceInID.bind(this)), {name: 'hasPriceInID'}),
      rule(type('price'), score(this.hasPriceInParentID.bind(this)), {name: 'hasPriceInParentID'}),
      // check if any class names have "price" in them
      rule(type('price'), score(this.hasPriceInClassName.bind(this)), {name: 'hasPriceInClassName'}),
      rule(type('price'), score(this.hasPriceInParentClassName.bind(this)), {name: 'hasPriceInParentClassName'}),
      // better score for larger font size
      rule(type('price'), score(this.fontIsBig.bind(this)), {name: 'fontIsBig'}),
      // better score based on x-axis proximity to max scoring image element
      rule(type('price'), score(this.isNearImage.bind(this)), {name: 'isNearImage'}),
      // check if innerText has a price pattern
      rule(type('price'), score(this.hasPriceishPattern.bind(this)), {name: 'hasPriceishPattern'}),
      // return price element(s) with max score
      rule(type('price').max(), out('price')),
    ],
    coeffs,
    biases);
  }
}
github endlessm / eos-knowledge-lib / docs / framework / tutorial / code / cc-ingester2 / index.js View on Github external
return 1;
};

// Fathom ruleset for extracting the main body of a blog post, including
// embedded video wrappers. Elements are tagged with the 'paragraphish' type
// and scored by several rules; the best cluster of those elements is exposed
// under the 'content' output key.
const rules = ruleset(
    // Isolate the actual blog post body text. Based on Fathom's example
    // Readability rules
    // Candidates: text-bearing block elements, with props computed by
    // scoreByLength.
    rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
        props(scoreByLength).type('paragraphish')),
    // Every candidate is additionally scored by byInverseLinkDensity.
    rule(type('paragraphish'), score(byInverseLinkDensity)),
    // <p> elements get a constant score of 4.5.
    rule(dom('p'), score(4.5).type('paragraphish')),

    // Tweaks for this particular blog
    // hasAncestor('article', 10) builds the scoring callback.
    // NOTE(review): presumably it favours nodes under an <article> --
    // confirm against the helper's definition.
    rule(type('paragraphish'), score(hasAncestor('article', 10))),
    // Paragraphs inside .entry-summary get a score of 0.
    // NOTE(review): presumably this removes them from contention because
    // scores combine multiplicatively -- verify for the Fathom version in use.
    rule(dom('.entry-summary p'), score(0).type('paragraphish')),
    // <figure> elements also become candidates, with props from
    // scoreByImageSize.
    rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
    // Video wrappers become candidates with a fixed score of 100 and a note
    // of {length: 1}.
    // NOTE(review): presumably the note mimics the shape other rules expect
    // from scoreByLength -- confirm.
    rule(dom('.jetpack-video-wrapper'), props(() => ({
        score: 100,
        note: {length: 1},
    })).type('paragraphish')),

    // Find the best cluster of paragraph-ish nodes
    // The resulting nodes are passed through Futils.domSort before being
    // emitted as 'content'.
    // NOTE(review): presumably domSort restores document order -- confirm.
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
            differentDepthCost: 6.5,
            differentTagCost: 2,
            sameTagCost: 0.5,
            strideCost: 0,
        }),
        out('content').allThrough(Futils.domSort)));

async function ingestArticle(hatch, {title, link, date, author}) {
github endlessm / eos-knowledge-lib / docs / framework / tutorial / code / cc-ingester2 / index.js View on Github external
props(scoreByLength).type('paragraphish')),
    rule(type('paragraphish'), score(byInverseLinkDensity)),
    rule(dom('p'), score(4.5).type('paragraphish')),

    // Tweaks for this particular blog
    rule(type('paragraphish'), score(hasAncestor('article', 10))),
    rule(dom('.entry-summary p'), score(0).type('paragraphish')),
    rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
    rule(dom('.jetpack-video-wrapper'), props(() => ({
        score: 100,
        note: {length: 1},
    })).type('paragraphish')),

    // Find the best cluster of paragraph-ish nodes
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
            differentDepthCost: 6.5,
            differentTagCost: 2,
            sameTagCost: 0.5,
            strideCost: 0,
        }),
        out('content').allThrough(Futils.domSort)));

async function ingestArticle(hatch, {title, link, date, author}) {
    let $ = await Libingester.util.fetch_html(link);
    const baseURI = Libingester.util.get_doc_base_uri($, link);

    const imageURI = $('meta[property="og:image"]').attr('content');
    const synopsis = $('meta[property="og:description"]').attr('content');
    const lastModified = $('meta[property="article:modified_time"]')
        .attr('content');