// Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
const Asset = require('../../models/asset');
const scraper = require('../../services/scraper');
const Assets = require('../../services/assets');
const { createLogger } = require('../../services/logging');
const logger = createLogger('jobs:scraper');
const fetch = require('node-fetch');
const { merge } = require('lodash');
const { version } = require('../../package.json');
const { SCRAPER_HEADERS } = require('../../config');
// Load the scraper with the rules.
// Shared metascraper instance: bundled rule packages for the common fields
// (title/description/image/author/date) plus two project-local rules
// (./rules/modified, ./rules/section).
// NOTE(review): `.load([...])` is the legacy metascraper API — confirm the
// installed metascraper version still exposes it.
const metascraper = require('metascraper').load([
require('metascraper-title')(),
require('metascraper-description')(),
require('metascraper-image')(),
require('metascraper-author')(),
require('metascraper-date')(),
require('./rules/modified')(),
require('./rules/section')(),
]);
// Parse the operator-supplied scraper headers (TALK_SCRAPER_HEADERS) from
// their JSON string form. An unparsable value is a fatal configuration error.
let customHeaders = {};
try {
  // Guard the falsy case: JSON.parse(undefined) throws even when no custom
  // headers were configured at all.
  // NOTE(review): assumes SCRAPER_HEADERS defaults to '{}' in config, in
  // which case this guard is behavior-identical — confirm.
  if (SCRAPER_HEADERS) {
    customHeaders = JSON.parse(SCRAPER_HEADERS);
  }
} catch (err) {
  // Include the underlying parse error so the operator can see what was wrong
  // with the value before the process aborts.
  console.error('Cannot parse TALK_SCRAPER_HEADERS', err);
  throw err;
}
// NOTE(review): this section duplicated the imports, metascraper setup, and
// custom-header parsing already defined at the top of the file, redeclaring
// `fetch`, `merge`, `SCRAPER_HEADERS`, `version`, `metascraper`, and
// `customHeaders` — redeclaring `const`/`let` bindings is a SyntaxError.
// Only the names that are new to this section are kept.
const ProxyAgent = require('proxy-agent');
const kue = require('../kue');
const { SCRAPER_PROXY_URL } = require('../../config');
/* eslint-disable no-unused-vars */
const errors = require('@feathersjs/errors');
const mongoose = require('mongoose');
const { URL } = require('url');
// NOTE(review): a second `const metascraper = require('metascraper').load(...)`
// was declared here; `metascraper` is already bound above and redeclaring a
// `const` is a SyntaxError, so the duplicate has been dropped. If extra rules
// are needed, add them to the single metascraper setup at the top of the file.
const got = require('got');
const Metaphor = require('metaphor');
// Metaphor link-preview engine: previews disabled, tweet metadata enabled.
const engine = new Metaphor.Engine({
  preview: false,
  tweet: true,
});
const getMetadata = async (targetURL, Provider) => {
const data = {
metaphor: {},
// get link metadata
// TODO: add more services and use the metascraper to fill some metadata on the article
const metascraper = require('metascraper').load([
require('metascraper-author')(),
require('metascraper-date')(),
require('metascraper-description')(),
require('metascraper-image')(),
require('metascraper-logo')(),
require('metascraper-clearbit-logo')(),
require('metascraper-logo-favicon')(),
require('metascraper-publisher')(),
require('metascraper-title')(),
require('metascraper-url')(),
require('metascraper-youtube')(),
]);
const got = require('got');
const _ = require('lodash');
const getMetadata = async (targetUrl, app) => {
import Metascraper from 'metascraper'
import fetch from 'node-fetch'
import { ApolloError } from 'apollo-server'
import isEmpty from 'lodash/isEmpty'
import isArray from 'lodash/isArray'
import mergeWith from 'lodash/mergeWith'
import findProvider from './findProvider'
const error = require('debug')('embed:error')
const metascraper = Metascraper([
require('metascraper-author')(),
require('metascraper-date')(),
require('metascraper-description')(),
require('metascraper-image')(),
require('metascraper-lang')(),
require('metascraper-lang-detector')(),
require('metascraper-logo')(),
// require('metascraper-clearbit-logo')(),
require('metascraper-publisher')(),
require('metascraper-title')(),
require('metascraper-url')(),
require('metascraper-audio')(),
require('metascraper-soundcloud')(),
require('metascraper-video')(),
require('metascraper-youtube')(),
getTags(url) {
Metascraper
.scrapeUrl(url)
.then(metadata => {
const hasTags = !_.isEmpty(metadata);
this.setState({ tags: metadata, hasTags });
})
.catch(console.info);
}
const { parse } = require('url');
const { send } = require('micro');
const cache = require('memory-cache');
// NOTE(review): this section redeclared `got` and `metascraper` (both already
// bound earlier in the file — redeclaring a `const` is a SyntaxError), and its
// rule list included `metascraper-logo-favicon` twice. The duplicate
// declarations have been dropped; if the additional rules it carried
// (amazon, soundcloud, video-provider, logo/publisher/url) are needed, fold
// them — once each — into the single metascraper setup at the top of the file.
// Resolver: fetch the page at `url`, run metascraper over the HTML, and
// return the metadata keyed by the url itself.
scrape: async (source, { url }) => {
const { body: html } = await got(url);
const metadata = await Metascraper({ html, url });
return {
...metadata,
id: url,
};
// NOTE(review): everything below is unreachable — the function has already
// returned above. This looks like a second implementation spliced in by
// mistake (it uses a different Metascraper API plus `getRules`/`getImages`
// helpers not visible here); delete one of the two implementations.
return Metascraper.scrapeUrl(url, getRules(new URL(url)))
.then(getImages)
.then(metadata => ({
...metadata,
id: url,
}))
.catch(() => ({}))
}
},
router.post('/', (req, res, next) => {
if (!req.body.url) {
return res.status(400).json({
type: 'error',
error_code: 400,
error_message: 'Invalid request. Missing url',
});
}
const timer = logger.time('extract.post').namespace(req.body.url);
Metascraper.scrapeUrl(req.body.url).then(
data => {
const payload = {
url: data.url || req.body.url || '',
title: data.title || 'Unable to scrape title.',
content:
data.description ||
"Error: Unable to scrape description from the provided url. You'll have to do this on your own.",
author: data.publisher || 'Unable to scrape author.',
image: data.image || '',
};
cache.put(req.body.url, payload, TWENTY_FOUR_HOURS);
logger.log(Object.assign({}, { type: 'info' }, payload));
res.status(200).json(payload);
},
e => {
timer.log();
function onRequestDataExtraction(message, reply) {
logger.log(message);
const timer = logger.time('extract.post').namespace(message);
const cachedResult = cache.get(message.url);
if (cachedResult) {
return reply(cachedResult);
}
Metascraper.scrapeUrl(message.url)
.then(data => {
timer.log();
const payload = {
url: data.url || message.url,
title: data.title || 'Unable to scrape title.',
content: data.description ||
"Error: Unable to scrape description from the provided url. You'll have to do this on your own.",
author: data.publisher || 'Unable to scrape author.',
image: data.image || '',
};
cache.put(message.url, payload, TWENTY_FOUR_HOURS);
logger.log(Object.assign({}, { type: 'info' }, payload));
reply(payload);
})
.catch(e => {
timer.log();