How to use alephclient - 10 common examples

To help you get started, we’ve selected a few alephclient examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github alephdata / memorious / memorious / operations / aleph.py View on Github external
def aleph_emit(context, data):
    if not settings.ALEPH_HOST:
        context.log.warning("No $MEMORIOUS_ALEPH_HOST, skipping upload...")
        return
    if not settings.ALEPH_API_KEY:
        context.log.warning("No $MEMORIOUS_ALEPH_API_KEY, skipping upload...")
        return

    session_id = 'memorious:%s' % context.crawler.name
    api = AlephAPI(settings.ALEPH_HOST, settings.ALEPH_API_KEY,
                   session_id=session_id)
    collection_id = get_collection_id(context, api)
    if collection_id is None:
        context.log.warning("Cannot get aleph collection.")
        return

    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return

    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
github alephdata / aleph / aleph / analyze / extract_entity.py View on Github external
def extract(self, collector, document):
        """Run remote entity extraction for ``document`` and feed ``collector``.

        Entities returned by the gRPC ``EntityExtractStub`` whose type appears
        in ``self.TYPES`` are emitted on ``collector`` with their label and
        weight; entities of any other type are dropped. A ``self.Error`` raised
        anywhere in the remote exchange is logged with its traceback and the
        channel is reset; the error is not re-raised.
        """
        # Collectors for these two origins are created and saved with nothing
        # emitted on them; presumably this clears tags left by the older
        # extractors -- TODO confirm against DocumentTagCollector.save().
        for origin in ('polyglot', 'spacy'):
            DocumentTagCollector(document, origin).save()
        try:
            stub = EntityExtractStub(self.channel)
            response = stub.Extract(self.text_iterator(document))
            for item in response.entities:
                tag_type = self.TYPES.get(item.type)
                if tag_type is not None:
                    collector.emit(item.label, tag_type, weight=item.weight)
            log.info('Extracted %s entities.', len(collector))
        except self.Error:
            log.exception("gRPC Error: %s", self.SERVICE)
            self.reset_channel()
github alephdata / aleph / aleph / analyze / extract_entity.py View on Github external
def extract(self, collector, document):
        """Run remote entity extraction for ``document`` and feed ``collector``.

        Entities returned by the gRPC ``EntityExtractStub`` whose type appears
        in ``self.TYPES`` are emitted on ``collector`` with their label and
        weight; entities of any other type are dropped. A ``self.Error`` raised
        anywhere in the remote exchange is logged as a warning (status code and
        details) and the channel is reset; the error is not re-raised.
        """
        # Collectors for these two origins are created and saved with nothing
        # emitted on them; presumably this clears tags left by the older
        # extractors -- TODO confirm against DocumentTagCollector.save().
        for origin in ('polyglot', 'spacy'):
            DocumentTagCollector(document, origin).save()
        try:
            stub = EntityExtractStub(self.channel)
            response = stub.Extract(self.text_iterator(document))
            for item in response.entities:
                tag_type = self.TYPES.get(item.type)
                if tag_type is not None:
                    collector.emit(item.label, tag_type, weight=item.weight)
            log.info('Extracted %s entities.', len(collector))
        except self.Error as rpc_error:
            status = rpc_error.code()
            log.warning("gRPC [%s]: %s", status, rpc_error.details())
            self.reset_channel()
github alephdata / aleph / services / extract-entities / entityextractor / result.py View on Github external
self.valid = False


class PersonResult(NamedResult):
    """Named result in the PERSON category; labels without a space are invalid."""
    category = ExtractedEntity.PERSON

    def __init__(self, ctx, label, start, end):
        """Initialise through ``NamedResult`` and reject single-token labels."""
        super(PersonResult, self).__init__(ctx, label, start, end)
        if not self.valid:
            # Already rejected by the parent class; ``self.label`` is not
            # inspected in that case.
            return
        if ' ' not in self.label:
            self.valid = False


class LocationResult(NamedResult):
    """Locations are being mapped to countries."""
    # A single resolver is built when the class body executes and is shared
    # by every instance.
    resolver = LocationResolver()
    category = ExtractedEntity.LOCATION

    def __init__(self, ctx, label, start, end):
        """Initialise through ``NamedResult`` and resolve countries.

        The lookup uses the ``label`` argument as passed in, not ``self.label``.
        """
        super(LocationResult, self).__init__(ctx, label, start, end)
        resolved = self.resolver.get_countries(label)
        self.countries = resolved


class LanguageResult(Result):
    """Result in the LANGUAGE category with a stripped, lower-cased label."""
    category = ExtractedEntity.LANGUAGE

    def __init__(self, ctx, label, start, end):
        """Normalise ``label`` and then delegate to ``Result.__init__``."""
        normalised = label.strip().lower()
        super(LanguageResult, self).__init__(ctx, normalised, start, end)


class IPAddressResult(Result):
    """Pull IPv4, IPv6 - and validate using on-board Python tools."""
github alephdata / aleph / aleph / analyze / extract_entity.py View on Github external
import logging
from alephclient.services.entityextract_pb2_grpc import EntityExtractStub
from alephclient.services.entityextract_pb2 import ExtractedEntity

from aleph import settings
from aleph.services import ServiceClientMixin
from aleph.analyze.analyzer import EntityAnalyzer, TextIterator
from aleph.model import DocumentTag, DocumentTagCollector

# Module-level logger named after this module.
log = logging.getLogger(__name__)
# The ``Value`` attribute of the ``ExtractedEntity.Type`` enum; presumably the
# protobuf enum-wrapper lookup from a type name to its number -- TODO confirm.
TYPE = ExtractedEntity.Type.Value


class EntityExtractor(EntityAnalyzer, TextIterator, ServiceClientMixin):
    SERVICE = settings.ENTITIES_SERVICE
    ORIGIN = 'ner'
    TYPES = {
        ExtractedEntity.PERSON: DocumentTag.TYPE_PERSON,
        ExtractedEntity.ORGANIZATION: DocumentTag.TYPE_ORGANIZATION,
        ExtractedEntity.COMPANY: DocumentTag.TYPE_ORGANIZATION,
    }

    def __init__(self):
        """Store on ``self.active`` whatever ``self.has_channel()`` returns."""
        channel_available = self.has_channel()
        self.active = channel_available

    def extract(self, collector, document):
        DocumentTagCollector(document, 'polyglot').save()
github alephdata / aleph / services / extract-entities / entityextractor / result.py View on Github external
class IPAddressResult(Result):
    """Pull IPv4, IPv6 - and validate using on-board Python tools."""
    category = ExtractedEntity.IPADDRESS

    def __init__(self, ctx, label, start, end):
        """Initialise through ``Result`` and canonicalise the address.

        The raw ``label`` argument is parsed with ``ip_address``. On success
        its string form becomes both ``self.key`` and ``self.label``; a
        ``ValueError`` marks the result as invalid instead.
        """
        super(IPAddressResult, self).__init__(ctx, label, start, end)
        try:
            parsed = ip_address(label)
        except ValueError:
            self.valid = False
        else:
            canonical = str(parsed)
            self.key = canonical
            self.label = canonical


class EmailResult(Result):
    """Result in the EMAIL category, keyed by ``label_key`` of its label."""
    category = ExtractedEntity.EMAIL

    def __init__(self, ctx, label, start, end):
        """Initialise through ``Result``; valid only when a key is produced."""
        super(EmailResult, self).__init__(ctx, label, start, end)
        self.key = self.label_key(self.label)
        has_key = self.key is not None
        self.valid = has_key
        # TODO: do we want to do TLD -> country?


class PhoneResult(Result):
    FORMAT = phonenumbers.PhoneNumberFormat.E164
    category = ExtractedEntity.PHONE

    def __init__(self, ctx, label, start, end):
        super(PhoneResult, self).__init__(ctx, label, start, end)
        number = self._parse(label)
        for country in ctx.countries:
github alephdata / aleph / services / extract-entities / entityextractor / normalize.py View on Github external
def clean_label(text, category=None):
    """Return a cleaned-up label, or ``None`` when the text is rejected.

    The text is rejected when it is ``None``, longer than ``MAX_LENGTH``,
    empty or shorter than ``MIN_LENGTH`` after clean-up, or -- when
    ``category`` is ``None`` or ``ExtractedEntity.PERSON`` -- contains no
    space. If ``CLEANUP`` matches, only its ``term`` group is kept before
    spaces are collapsed.
    """
    if text is None:
        return None
    if len(text) > MAX_LENGTH:
        return None
    found = CLEANUP.match(text)
    candidate = text if found is None else found.group('term')
    candidate = collapse_spaces(candidate)
    size = len(candidate)
    if size == 0 or size < MIN_LENGTH:
        return None
    needs_space = category in (None, ExtractedEntity.PERSON)
    if needs_space and ' ' not in candidate:
        return None
    return candidate
github alephdata / aleph / services / extract-entities / entityextractor / service.py View on Github external
def extract_spacy(self, text):
        """Yield an ``ExtractedEntity`` per spaCy entity that has a mapped type.

        Entities whose ``label_`` is missing from ``SPACY_TYPES``, or whose
        stripped text is empty, are skipped. Any ``Exception`` raised while
        running the pipeline or building entities is logged and ends the
        generator early instead of propagating.
        """
        try:
            for span in self.spacy(text).ents:
                mapped = SPACY_TYPES.get(span.label_)
                surface = span.text.strip()
                if mapped is None or not len(surface):
                    continue
                entity = ExtractedEntity()
                entity.text = surface
                entity.type = mapped
                # NOTE(review): if ``self.spacy`` is a regular spaCy pipeline,
                # Span.start / Span.end are token indices rather than
                # character offsets -- confirm that is what consumers expect.
                entity.start = span.start
                entity.end = span.end
                yield entity
        except Exception:
            log.exception("spaCy failed")
github alephdata / aleph / services / extract-entities / entityextractor / result.py View on Github external
self.valid = False


class EmailResult(Result):
    """Result in the EMAIL category, keyed by ``label_key`` of its label."""
    category = ExtractedEntity.EMAIL

    def __init__(self, ctx, label, start, end):
        """Initialise through ``Result``; valid only when a key is produced."""
        super(EmailResult, self).__init__(ctx, label, start, end)
        self.key = self.label_key(self.label)
        has_key = self.key is not None
        self.valid = has_key
        # TODO: do we want to do TLD -> country?


class PhoneResult(Result):
    FORMAT = phonenumbers.PhoneNumberFormat.E164
    category = ExtractedEntity.PHONE

    def __init__(self, ctx, label, start, end):
        """Initialise through ``Result`` and normalise the phone number.

        ``label`` is first parsed without a region and then, while no parse
        has succeeded, once per entry of ``ctx.countries``. A successful parse
        sets ``countries`` to the number's region code and rewrites ``label``
        and ``key`` to the number formatted with ``self.FORMAT``; otherwise
        the result is marked invalid.
        """
        super(PhoneResult, self).__init__(ctx, label, start, end)
        # Assumes ``_parse`` returns None on failure -- its body is not fully
        # visible here; TODO confirm.
        parsed = self._parse(label)
        for region in ctx.countries:
            if parsed is not None:
                # Already parsed; remaining regions are iterated but ignored.
                continue
            parsed = self._parse(label, region)
        if parsed is None:
            self.valid = False
            return
        self.valid = True
        self.countries = [geocoder.region_code_for_number(parsed)]
        self.label = phonenumbers.format_number(parsed, self.FORMAT)
        self.key = self.label

    def _parse(self, number, region=None):
        try:
            num = phonenumbers.parse(number, region)
github alephdata / aleph / services / extract-entities / entityextractor / service.py View on Github external
def make_entity(self, text, type_, start, end):
        """Build an ``ExtractedEntity`` carrying the given text, type, start and end."""
        entity = ExtractedEntity()
        # Fields are assigned one by one, in the same order as they are listed.
        for field, value in (
            ('text', text),
            ('type', type_),
            ('start', start),
            ('end', end),
        ):
            setattr(entity, field, value)
        return entity