How to use the quantulum3.load function in quantulum3

To help you get started, we’ve selected a few quantulum3 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nielstron / quantulum3 / quantulum3 / tests.py View on Github external
unit = l.NAMES[item['unit']]
                except KeyError:
                    try:
                        entity = item['entity']
                    except KeyError:
                        print(('Could not find %s, provide "derived" and'
                               ' "entity"' % item['unit']))
                        return
                    if entity == 'unknown':
                        derived = [{
                            'base': l.NAMES[i['base']].entity.name,
                            'power': i['power']
                        } for i in item['dimensions']]
                        entity = c.Entity(name='unknown', dimensions=derived)
                    elif entity in l.ENTITIES:
                        entity = l.ENTITIES[entity]
                    else:
                        print(('Could not find %s, provide "derived" and'
                               ' "entity"' % item['unit']))
                        return
                    unit = c.Unit(
                        name=item['unit'],
                        dimensions=item['dimensions'],
                        entity=entity)
                try:
                    span = next(
                        re.finditer(re.escape(item['surface']),
                                    test['req'])).span()
                except StopIteration:
                    print('Surface mismatch for "%s"' % test['req'])
                    return
                uncert = None
github nielstron / quantulum3 / quantulum3 / disambiguate.py View on Github external
def disambiguate_unit(unit_surface, text, lang="en_US"):
    """
    Pick the unit that an ambiguous name, symbol or abbreviation refers to.

    When ``clf.USE_CLF`` is set the decision is delegated to the classifier.
    Otherwise the surface is looked up in the unit tables of ``lang`` and,
    if several candidates remain, the classifier-free heuristic chooses one.

    :param unit_surface: surface string of the unit as found in the text
    :param text: text the unit was found in, used as context
    :param lang: language code selecting the unit tables
    :returns (str) unit name of the resolved unit, "unk" if none was found
    """
    if clf.USE_CLF:
        return clf.disambiguate_unit(unit_surface, text, lang).name

    # Lookup order: exact symbol, exact surface, lower-cased surface,
    # lower-cased symbol. Later tables are only consulted while the earlier
    # ones produced nothing.
    candidates = load.units(lang).symbols[unit_surface]
    if not candidates:
        candidates = load.units(lang).surfaces[unit_surface]
    if not candidates:
        candidates = load.units(lang).surfaces_lower[unit_surface.lower()]
    if not candidates:
        candidates = load.units(lang).symbols_lower[unit_surface.lower()]

    n_candidates = len(candidates)
    if n_candidates > 1:
        chosen = no_clf.disambiguate_no_classifier(candidates, text, lang)
    elif n_candidates == 1:
        chosen = next(iter(candidates))
    else:
        # Nothing matched: keep the empty result so the fallback below fires
        chosen = candidates

    return chosen.name if chosen else "unk"
github nielstron / quantulum3 / quantulum3 / no_classifier.py View on Github external
def disambiguate_no_classifier(entities, text, lang="en_US"):
    """
    Disambiguate the entity or unit without a classifier.

    Every candidate is scored against the training-set entries whose "unit"
    field equals the candidate's name: ``count`` is the number of
    space-separated training words for which ``word in text`` holds, and
    ``relative`` is ``count`` divided by the summed ``len()`` of those
    training texts. The candidate with the highest ``relative`` score is
    tracked; ties are broken in favour of the higher ``count``.

    :param entities: iterable of candidates, each exposing a ``name`` attribute
    :param text: text the candidate occurs in (checked with ``word in text``)
    :param lang: language code handed to ``load.training_set``
    :return: per the original docstring, the single entity or unit that has
        been chosen. NOTE(review): no ``return`` statement is visible in this
        excerpt -- presumably ``max_entity`` is returned; confirm.
    """
    word_sets = load.training_set(lang)

    # Best candidate seen so far together with its absolute and relative score
    max_entity, max_count, max_relative = None, 0, 0
    for entity in entities:
        count = 0
        total = 0
        for word_set in word_sets:
            if word_set["unit"] == entity.name:
                # NOTE(review): ``total`` grows by len() of the text itself
                # (its character count if it is a str), whereas ``count`` is
                # incremented per word -- confirm this ratio is intended.
                total += len(word_set["text"])
                for word in word_set["text"].split(" "):
                    count += 1 if word in text else 0
        try:
            relative = count / total
        except ZeroDivisionError:
            # No training text matched this candidate's name
            relative = 0
        # A candidate without any matching word (count == 0) never replaces
        # the initial (None, 0, 0) state, because neither comparison holds.
        if relative > max_relative or (relative == max_relative and count > max_count):
            max_entity, max_count, max_relative = entity, count, relative
github nielstron / quantulum3 / quantulum3 / classifier.py View on Github external
def disambiguate_unit(unit, text, lang="en_US"):
    """
    Resolve ambiguity between units with same names, symbols or abbreviations.

    :param unit: surface string of the unit to look up
    :param text: text the unit occurs in; fed to the classifier and quoted
        in the error message
    :param lang: language code passed to ``load.units`` and ``classifier``
    :raises KeyError: if none of the four lookup tables yields a truthy result

    NOTE(review): this excerpt ends after the scores are sorted; what is
    returned is not visible here.
    """

    # Lookup order: exact symbol, exact surface, lower-cased surface,
    # lower-cased symbol -- the first truthy hit wins.
    new_unit = (
        load.units(lang).symbols.get(unit)
        or load.units(lang).surfaces.get(unit)
        or load.units(lang).surfaces_lower.get(unit.lower())
        or load.units(lang).symbols_lower.get(unit.lower())
    )
    if not new_unit:
        raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))

    # Only consult the classifier when more than one candidate is left
    if len(new_unit) > 1:
        # Vectorise the cleaned text (a single sample) and take the score row
        # of that sample.
        transformed = classifier(lang).tfidf_model.transform([clean_text(text, lang)])
        scores = classifier(lang).classifier.predict_proba(transformed).tolist()[0]
        # Pair every score with the target name at the same position
        scores = zip(scores, classifier(lang).target_names)

        # Keep only the (score, name) pairs whose name is one of the candidates
        names = [i.name for i in new_unit]
        scores = [i for i in scores if i[1] in names]

        # Order by score, highest first
        scores = sorted(scores, key=lambda x: x[0], reverse=True)
github nielstron / quantulum3 / quantulum3 / _lang / en_US / parser.py View on Github external
def build_quantity(orig_text, text, item, values, unit, surface, span, uncert):
    """
    Build a Quantity object out of extracted information.
    """
    # TODO rerun if change occurred
    # Re parse unit if a change occurred
    dimension_change = True

    # Extract "absolute " ...
    _absolute = "absolute "
    if (
        unit.name == "dimensionless"
        and _absolute == orig_text[span[0] - len(_absolute) : span[0]]
    ):
        unit = load.units(lang).names["kelvin"]
        unit.original_dimensions = unit.dimensions
        surface = _absolute + surface
        span = (span[0] - len(_absolute), span[1])
        dimension_change = True

    # Usually "$3T" does not stand for "dollar tesla"
    # this holds as well for "3k miles"
    # TODO use classifier to decide if 3K is 3 thousand or 3 Kelvin
    if unit.entity.dimensions:
        if (
            len(unit.entity.dimensions) > 1
            and unit.entity.dimensions[0]["base"] == "currency"
            and unit.original_dimensions[1]["surface"] in reg.suffixes(lang).keys()
        ):
            suffix = unit.original_dimensions[1]["surface"]
            # Only apply if at least last value is suffixed by k, M, etc
github nielstron / quantulum3 / scripts / extract_vere.py View on Github external
if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(
                v.query(surface), topn=topn, min_similarity=min_similarity)
            training_set.append({
                'unit':
                name,
                'text':
                ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')

    with open(
            os.path.join(load.TOPDIR, 'similars.json'), 'w',
            encoding='utf8') as file:
        json.dump(training_set, file, sort_keys=True, indent=4)
github nielstron / quantulum3 / quantulum3 / _lang / en_US / parser.py View on Github external
and len(unit.original_dimensions) > 1
        and unit.original_dimensions[-1]["base"] == "count"
    ):
        unit.original_dimensions = unit.original_dimensions[:-1]
        dimension_change = True
        surface = surface[:-5]
        span = (span[0], span[1] - 5)
        _LOGGER.debug('\tCorrect for "time"')

    if dimension_change:
        if unit.original_dimensions:
            unit = parser.get_unit_from_dimensions(
                unit.original_dimensions, orig_text, lang
            )
        else:
            unit = load.units(lang).names["dimensionless"]

    # Discard irrelevant txt2float extractions, cardinal numbers, codes etc.
    if (
        surface.lower() in ["a", "an", "one"]
        or re.search(r"1st|2nd|3rd|[04-9]th", surface)
        or re.search(r"\d+[A-Z]+\d+", surface)
        or re.search(r"\ba second\b", surface, re.IGNORECASE)
    ):
        _LOGGER.debug('\tMeaningless quantity ("%s"), discard', surface)
        return

    objs = []
    for value in values:
        obj = cls.Quantity(
            value=value,
            unit=unit,
github nielstron / quantulum3 / quantulum3 / parser.py View on Github external
def get_entity_from_dimensions(dimensions, text, lang="en_US"):
    """
    Derive the entity a unit measures (e.g. "volume" for "m^3") from the
    unit's dimensions.

    Each dimension's base unit is replaced by the name of that unit's entity,
    the result is sorted by base and turned into a lookup key, and the key is
    handed to the entity disambiguation. If that yields nothing, an "unknown"
    entity carrying the derived dimensions is returned instead.

    :param dimensions: iterable of dicts with the keys "base" and "power"
    :param text: text the unit was found in, passed on for disambiguation
    :param lang: language code selecting the unit tables
    :return: the entity found, or a new entity named "unknown"
    """
    derived = []
    for dimension in dimensions:
        base_unit = load.units(lang).names[dimension["base"]]
        derived.append(
            {"base": base_unit.entity.name, "power": dimension["power"]}
        )

    # The key is built from a base-sorted copy; ``derived`` keeps input order
    key = load.get_key_from_dimensions(
        sorted(derived, key=lambda entry: entry["base"])
    )

    entity = dis.disambiguate_entity(key, text, lang)
    if entity is not None:
        return entity

    _LOGGER.debug("\tCould not find entity for: %s", key)
    return cls.Entity(name="unknown", dimensions=derived)