Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def aleph_emit(context, data):
if not settings.ALEPH_HOST:
context.log.warning("No $MEMORIOUS_ALEPH_HOST, skipping upload...")
return
if not settings.ALEPH_API_KEY:
context.log.warning("No $MEMORIOUS_ALEPH_API_KEY, skipping upload...")
return
session_id = 'memorious:%s' % context.crawler.name
api = AlephAPI(settings.ALEPH_HOST, settings.ALEPH_API_KEY,
session_id=session_id)
collection_id = get_collection_id(context, api)
if collection_id is None:
context.log.warning("Cannot get aleph collection.")
return
content_hash = data.get('content_hash')
source_url = data.get('source_url', data.get('url'))
foreign_id = data.get('foreign_id', data.get('request_id', source_url))
if context.skip_incremental(collection_id, foreign_id, content_hash):
context.log.info("Skip aleph upload: %s", foreign_id)
return
meta = {
'crawler': context.crawler.name,
'foreign_id': foreign_id,
def extract(self, collector, document):
    """Emit tags for the entities the extraction service finds in a document.

    The document's texts (from ``self.text_iterator``) are sent to the
    service through ``EntityExtractStub``. Every returned entity whose type
    has an entry in ``self.TYPES`` is emitted on ``collector`` with its
    label, mapped tag type and weight; entities of other types are skipped.

    A ``self.Error`` raised anywhere in that exchange is logged together
    with its traceback and the channel is reset; it is not re-raised.
    """
    # Save collectors for the 'polyglot' and 'spacy' origins before emitting.
    # NOTE(review): presumably this replaces tags stored under those older
    # origins with nothing -- confirm against DocumentTagCollector.save().
    DocumentTagCollector(document, 'polyglot').save()
    DocumentTagCollector(document, 'spacy').save()
    try:
        service = EntityExtractStub(self.channel)
        texts = self.text_iterator(document)
        entities = service.Extract(texts)
        for entity in entities.entities:
            type_ = self.TYPES.get(entity.type)
            if type_ is None:
                continue
            collector.emit(entity.label, type_, weight=entity.weight)
        log.info('Extracted %s entities.', len(collector))
    except self.Error:
        # log.exception() records the active exception and traceback itself,
        # so the exception does not need to be bound to a name.
        log.exception("gRPC Error: %s", self.SERVICE)
        self.reset_channel()
def extract(self, collector, document):
    """Run remote entity extraction for ``document`` and emit the results.

    Texts produced by ``self.text_iterator`` are passed to the service's
    ``Extract`` call. Each entity in the response whose type appears in
    ``self.TYPES`` is emitted on ``collector``; the others are ignored.
    On ``self.Error`` a warning with the error's code and details is
    logged and the channel is reset.
    """
    # Collectors for the 'polyglot' and 'spacy' origins are saved up front.
    # NOTE(review): presumably clears tags of those origins -- confirm.
    for origin in ('polyglot', 'spacy'):
        DocumentTagCollector(document, origin).save()

    try:
        stub = EntityExtractStub(self.channel)
        response = stub.Extract(self.text_iterator(document))
        for found in response.entities:
            tag_type = self.TYPES.get(found.type)
            if tag_type is not None:
                collector.emit(found.label, tag_type, weight=found.weight)
        log.info('Extracted %s entities.', len(collector))
    except self.Error as err:
        log.warning("gRPC [%s]: %s", err.code(), err.details())
        self.reset_channel()
self.valid = False
class PersonResult(NamedResult):
    """Named result in the PERSON category.

    A result that is still valid after base-class initialisation is marked
    invalid when its label contains no space character.
    """
    category = ExtractedEntity.PERSON

    def __init__(self, ctx, label, start, end):
        super(PersonResult, self).__init__(ctx, label, start, end)
        # Already marked invalid: leave the result untouched.
        if not self.valid:
            return
        # A label without any space is rejected.
        if ' ' not in self.label:
            self.valid = False
class LocationResult(NamedResult):
    """Named result in the LOCATION category, resolved to countries."""
    # A single resolver is created with the class and shared by all results.
    resolver = LocationResolver()
    category = ExtractedEntity.LOCATION

    def __init__(self, ctx, label, start, end):
        super(LocationResult, self).__init__(ctx, label, start, end)
        # The lookup uses the label exactly as passed in, not self.label.
        resolved = self.resolver.get_countries(label)
        self.countries = resolved
class LanguageResult(Result):
    """Result in the LANGUAGE category with a normalised label."""
    category = ExtractedEntity.LANGUAGE

    def __init__(self, ctx, label, start, end):
        # Strip surrounding whitespace and lower-case the label before the
        # base class receives it.
        normalised = label.strip().lower()
        super(LanguageResult, self).__init__(ctx, normalised, start, end)
class IPAddressResult(Result):
"""Pull IPv4, IPv6 - and validate using on-board Python tools."""
import logging
from alephclient.services.entityextract_pb2_grpc import EntityExtractStub
from alephclient.services.entityextract_pb2 import ExtractedEntity
from aleph import settings
from aleph.services import ServiceClientMixin
from aleph.analyze.analyzer import EntityAnalyzer, TextIterator
from aleph.model import DocumentTag, DocumentTagCollector
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Short alias for the ``Value`` lookup of the ExtractedEntity.Type enum.
# NOTE(review): presumably maps an enum name to its number (protobuf enum
# wrapper); it is not referenced in the visible part of the file -- confirm
# that it is still needed.
TYPE = ExtractedEntity.Type.Value
# Entity analyzer tied to the service named by settings.ENTITIES_SERVICE.
class EntityExtractor(EntityAnalyzer, TextIterator, ServiceClientMixin):
SERVICE = settings.ENTITIES_SERVICE
ORIGIN = 'ner'
# Extracted entity types mapped to document tag types. COMPANY and
# ORGANIZATION both map to the organization tag; types that are not listed
# have no mapping.
TYPES = {
ExtractedEntity.PERSON: DocumentTag.TYPE_PERSON,
ExtractedEntity.ORGANIZATION: DocumentTag.TYPE_ORGANIZATION,
ExtractedEntity.COMPANY: DocumentTag.TYPE_ORGANIZATION,
}
def __init__(self):
# Record what has_channel() reports at construction time.
self.active = self.has_channel()
def extract(self, collector, document):
# NOTE(review): only the first statement of this method is visible here; the
# rest appears to be cut off -- confirm against the full source.
DocumentTagCollector(document, 'polyglot').save()
class IPAddressResult(Result):
    """Result in the IPADDRESS category, validated with ``ip_address``.

    A label that parses is replaced, as both key and label, by the string
    form of the parsed address; one that does not parse marks the result
    invalid.
    """
    category = ExtractedEntity.IPADDRESS

    def __init__(self, ctx, label, start, end):
        super(IPAddressResult, self).__init__(ctx, label, start, end)
        try:
            canonical = str(ip_address(label))
            # Key first, then label: same order as a chained assignment.
            self.key = canonical
            self.label = canonical
        except ValueError:
            # Not a parseable IPv4 or IPv6 address.
            self.valid = False
class EmailResult(Result):
    """Result in the EMAIL category, keyed by ``label_key`` of its label.

    The result is valid only when ``label_key`` yields a key.
    """
    category = ExtractedEntity.EMAIL

    def __init__(self, ctx, label, start, end):
        super(EmailResult, self).__init__(ctx, label, start, end)
        derived_key = self.label_key(self.label)
        self.key = derived_key
        # No key means the label could not be used as an e-mail result.
        self.valid = derived_key is not None
        # TODO: do we want to do TLD -> country?
class PhoneResult(Result):
FORMAT = phonenumbers.PhoneNumberFormat.E164
category = ExtractedEntity.PHONE
def __init__(self, ctx, label, start, end):
super(PhoneResult, self).__init__(ctx, label, start, end)
number = self._parse(label)
for country in ctx.countries:
def clean_label(text, category=None):
    """Clean a candidate entity label, or reject it.

    When ``CLEANUP`` matches the text, only its ``term`` group is kept. The
    text is then passed through ``collapse_spaces`` and checked for length.

    Args:
        text: The raw label; may be ``None``.
        category: Entity category. For ``None`` or
            ``ExtractedEntity.PERSON`` the label must contain a space.

    Returns:
        The cleaned label, or ``None`` when the input is ``None``, longer
        than ``MAX_LENGTH``, empty or shorter than ``MIN_LENGTH`` after
        cleaning, or a person/uncategorised label without a space.
    """
    if text is None or len(text) > MAX_LENGTH:
        return None
    match = CLEANUP.match(text)
    if match is not None:
        text = match.group('term')
        # group() returns None for a group that took no part in the match;
        # reject that here instead of failing on len() further down.
        if text is None:
            return None
    text = collapse_spaces(text)
    # ``not text`` covers the empty string (and a None result) in one test.
    if not text or len(text) < MIN_LENGTH:
        return None
    if category in (None, ExtractedEntity.PERSON) and ' ' not in text:
        return None
    return text
def extract_spacy(self, text):
    """Yield an ``ExtractedEntity`` for each usable spaCy entity in ``text``.

    An entity is usable when its spaCy label has an entry in
    ``SPACY_TYPES`` and its text is non-empty after stripping. Any
    exception raised while processing is logged and ends the iteration.

    NOTE(review): ``start``/``end`` are copied from the span's ``start`` and
    ``end``; in spaCy those are token offsets rather than character
    offsets -- confirm that consumers expect token positions.
    """
    try:
        for span in self.spacy(text).ents:
            mapped = SPACY_TYPES.get(span.label_)
            stripped = span.text.strip()
            # Skip unmapped labels and entities that are only whitespace.
            if mapped is None or not stripped:
                continue
            found = ExtractedEntity()
            found.text = stripped
            found.type = mapped
            found.start = span.start
            found.end = span.end
            yield found
    except Exception:
        log.exception("spaCy failed")
self.valid = False
class EmailResult(Result):
    """Result in the EMAIL category, keyed by ``label_key`` of its label.

    The result is valid only when ``label_key`` yields a key.
    """
    category = ExtractedEntity.EMAIL

    def __init__(self, ctx, label, start, end):
        super(EmailResult, self).__init__(ctx, label, start, end)
        derived_key = self.label_key(self.label)
        self.key = derived_key
        # No key means the label could not be used as an e-mail result.
        self.valid = derived_key is not None
        # TODO: do we want to do TLD -> country?
class PhoneResult(Result):
FORMAT = phonenumbers.PhoneNumberFormat.E164
category = ExtractedEntity.PHONE
def __init__(self, ctx, label, start, end):
    """Parse ``label`` as a phone number and normalise the result.

    The label is parsed without a region first; if that fails, each
    country in ``ctx.countries`` is tried as the region until one parse
    succeeds. On success ``countries`` holds the number's region code and
    ``label``/``key`` hold the number formatted with ``self.FORMAT``.
    ``valid`` records whether any parse succeeded.
    """
    super(PhoneResult, self).__init__(ctx, label, start, end)
    number = self._parse(label)
    if number is None:
        for country in ctx.countries:
            number = self._parse(label, country)
            # Stop at the first region that yields a number; the remaining
            # countries could not change the outcome.
            if number is not None:
                break
    self.valid = number is not None
    if number is not None:
        self.countries = [geocoder.region_code_for_number(number)]
        self.label = phonenumbers.format_number(number, self.FORMAT)
        self.key = self.label
def _parse(self, number, region=None):
try:
num = phonenumbers.parse(number, region)
def make_entity(self, text, type_, start, end):
    """Build an ``ExtractedEntity`` from the given text, type and offsets.

    Returns a new entity with ``text``, ``type``, ``start`` and ``end``
    set from the arguments of the same meaning.
    """
    built = ExtractedEntity()
    # Fields are assigned left to right: text, type, then start, end.
    built.text, built.type = text, type_
    built.start, built.end = start, end
    return built