How to use cleaners - 10 common examples

To help you get started, we’ve selected a few cleaners examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github postlight / mercury-parser / src / extractors / generic / author / extractor.js View on Github external
// Second, look through our selectors looking for potential authors.
    author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }

    // Last, use our looser regular-expression based selectors for
    // potential authors.
    // eslint-disable-next-line no-restricted-syntax
    for (const [selector, regex] of BYLINE_SELECTORS_RE) {
      const node = $(selector);
      if (node.length === 1) {
        const text = node.text();
        if (regex.test(text)) {
          return cleanAuthor(text);
        }
      }
    }

    return null;
  },
};
github postlight / mercury-parser / src / extractors / generic / author / extractor.js View on Github external
extract({ $, metaCache }) {
    let author;

    // First, check to see if we have a matching
    // meta tag that we can make use of.
    author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }

    // Second, look through our selectors looking for potential authors.
    author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }

    // Last, use our looser regular-expression based selectors for
    // potential authors.
    // eslint-disable-next-line no-restricted-syntax
    for (const [selector, regex] of BYLINE_SELECTORS_RE) {
      const node = $(selector);
      if (node.length === 1) {
        const text = node.text();
        if (regex.test(text)) {
github postlight / mercury-parser / src / extractors / generic / author / extractor.js View on Github external
extract({ $, metaCache }) {
    let author;

    // First, check to see if we have a matching
    // meta tag that we can make use of.
    author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }

    // Second, look through our selectors looking for potential authors.
    author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }

    // Last, use our looser regular-expression based selectors for
    // potential authors.
    // eslint-disable-next-line no-restricted-syntax
    for (const [selector, regex] of BYLINE_SELECTORS_RE) {
      const node = $(selector);
      if (node.length === 1) {
        const text = node.text();
        if (regex.test(text)) {
          return cleanAuthor(text);
        }
      }
    }

    return null;
github postlight / mercury-parser / src / extractors / generic / lead-image-url / extractor.js View on Github external
);

    if (topScore > 0) {
      cleanUrl = cleanImage(topUrl);

      if (cleanUrl) return cleanUrl;
    }

    // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like .
    // eslint-disable-next-line no-restricted-syntax
    for (const selector of LEAD_IMAGE_URL_SELECTORS) {
      const $node = $(selector).first();
      const src = $node.attr('src');
      if (src) {
        cleanUrl = cleanImage(src);
        if (cleanUrl) return cleanUrl;
      }

      const href = $node.attr('href');
      if (href) {
        cleanUrl = cleanImage(href);
        if (cleanUrl) return cleanUrl;
      }

      const value = $node.attr('value');
      if (value) {
        cleanUrl = cleanImage(value);
        if (cleanUrl) return cleanUrl;
      }
    }
github postlight / mercury-parser / src / extractors / generic / lead-image-url / extractor.js View on Github external
.prepend(html);
    }

    // Check to see if we have a matching meta tag that we can make use of.
    // Moving this higher because common practice is now to use large
    // images on things like Open Graph or Twitter cards.
    // images usually have for things like Open Graph.
    const imageUrl = extractFromMeta(
      $,
      LEAD_IMAGE_URL_META_TAGS,
      metaCache,
      false
    );

    if (imageUrl) {
      cleanUrl = cleanImage(imageUrl);

      if (cleanUrl) return cleanUrl;
    }

    // Next, try to find the "best" image via the content.
    // We'd rather not have to fetch each image and check dimensions,
    // so try to do some analysis and determine them instead.
    const $content = $(content);
    const imgs = $('img', $content).toArray();
    const imgScores = {};

    imgs.forEach((img, index) => {
      const $img = $(img);
      const src = $img.attr('src');

      if (!src) return;
github postlight / mercury-parser / src / extractors / generic / lead-image-url / extractor.js View on Github external
score += scoreAttr($img);
      score += scoreByParents($img);
      score += scoreBySibling($img);
      score += scoreByDimensions($img);
      score += scoreByPosition(imgs, index);

      imgScores[src] = score;
    });

    const [topUrl, topScore] = Reflect.ownKeys(imgScores).reduce(
      (acc, key) => (imgScores[key] > acc[1] ? [key, imgScores[key]] : acc),
      [null, 0]
    );

    if (topScore > 0) {
      cleanUrl = cleanImage(topUrl);

      if (cleanUrl) return cleanUrl;
    }

    // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like .
    // eslint-disable-next-line no-restricted-syntax
    for (const selector of LEAD_IMAGE_URL_SELECTORS) {
      const $node = $(selector).first();
      const src = $node.attr('src');
      if (src) {
        cleanUrl = cleanImage(src);
        if (cleanUrl) return cleanUrl;
      }

      const href = $node.attr('href');
github postlight / mercury-parser / src / extractors / generic / lead-image-url / extractor.js View on Github external
}

    // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like .
    // eslint-disable-next-line no-restricted-syntax
    for (const selector of LEAD_IMAGE_URL_SELECTORS) {
      const $node = $(selector).first();
      const src = $node.attr('src');
      if (src) {
        cleanUrl = cleanImage(src);
        if (cleanUrl) return cleanUrl;
      }

      const href = $node.attr('href');
      if (href) {
        cleanUrl = cleanImage(href);
        if (cleanUrl) return cleanUrl;
      }

      const value = $node.attr('value');
      if (value) {
        cleanUrl = cleanImage(value);
        if (cleanUrl) return cleanUrl;
      }
    }

    return null;
  },
};
github postlight / mercury-parser / src / extractors / generic / date-published / extractor.js View on Github external
datePublished = extractFromMeta(
      $,
      DATE_PUBLISHED_META_TAGS,
      metaCache,
      false
    );
    if (datePublished) return cleanDatePublished(datePublished);

    // Second, look through our selectors looking for potential
    // date_published's.
    datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
    if (datePublished) return cleanDatePublished(datePublished);

    // Lastly, look to see if a dately string exists in the URL
    datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
    if (datePublished) return cleanDatePublished(datePublished);

    return null;
  },
};
github postlight / mercury-parser / src / extractors / generic / date-published / extractor.js View on Github external
let datePublished;
    // First, check to see if we have a matching meta tag
    // that we can make use of.
    // Don't try cleaning tags from this string
    datePublished = extractFromMeta(
      $,
      DATE_PUBLISHED_META_TAGS,
      metaCache,
      false
    );
    if (datePublished) return cleanDatePublished(datePublished);

    // Second, look through our selectors looking for potential
    // date_published's.
    datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
    if (datePublished) return cleanDatePublished(datePublished);

    // Lastly, look to see if a dately string exists in the URL
    datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
    if (datePublished) return cleanDatePublished(datePublished);

    return null;
  },
};
github postlight / mercury-parser / src / extractors / generic / date-published / extractor.js View on Github external
extract({ $, url, metaCache }) {
    let datePublished;
    // First, check to see if we have a matching meta tag
    // that we can make use of.
    // Don't try cleaning tags from this string
    datePublished = extractFromMeta(
      $,
      DATE_PUBLISHED_META_TAGS,
      metaCache,
      false
    );
    if (datePublished) return cleanDatePublished(datePublished);

    // Second, look through our selectors looking for potential
    // date_published's.
    datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
    if (datePublished) return cleanDatePublished(datePublished);

    // Lastly, look to see if a dately string exists in the URL
    datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
    if (datePublished) return cleanDatePublished(datePublished);

    return null;
  },
};

cleaners

Cleans & validates untrusted data, with TypeScript & Flow support

MIT
Latest version published 2 months ago

Package Health Score

65 / 100
Full package analysis