How to use the quantulum3.load.units function in quantulum3

To help you get started, we’ve selected a few quantulum3 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nielstron / quantulum3 / quantulum3 / parser.py View on Github external
def get_unit(item, text, lang="en_US"):
    """
    Extract unit from regex hit.
    """

    group_units = ["prefix", "unit1", "unit2", "unit3", "unit4"]
    group_operators = ["operator1", "operator2", "operator3", "operator4"]
    # How much of the end is removed because of an "incorrect" regex match
    unit_shortening = 0

    item_units = [item.group(i) for i in group_units if item.group(i)]

    if len(item_units) == 0:
        unit = load.units(lang).names["dimensionless"]
    else:
        derived, slash = [], False
        multiplication_operator = False
        for index in range(0, 5):
            unit = item.group(group_units[index])
            operator_index = None if index < 1 else group_operators[index - 1]
            operator = None if index < 1 else item.group(operator_index)

            # disallow spaces as operators in units expressed in their symbols
            # Enforce consistency among multiplication and division operators
            # Single exceptions are colloquial number abbreviations (5k miles)
            if operator in reg.multiplication_operators(lang) or (
                operator is None
                and unit
                and not (index == 1 and unit in reg.suffixes(lang))
            ):
github nielstron / quantulum3 / quantulum3 / classifier.py View on Github external
def disambiguate_unit(unit, text, lang="en_US"):
    """
    Resolve ambiguity between units with same names, symbols or abbreviations.

    :param unit: string to look up in the unit tables (``.lower()`` is
        called on it)
    :param text: text the unit came from; used in the error message and, when
        several candidates match, as input to the classifier
    :param lang: language code handed to ``load.units`` and ``classifier``
    :raises KeyError: if none of the lookup tables yields a match for ``unit``
    """

    # Try the lookup tables in order: exact symbol, exact surface, then
    # lower-cased surface and lower-cased symbol. The first truthy result wins.
    new_unit = (
        load.units(lang).symbols.get(unit)
        or load.units(lang).surfaces.get(unit)
        or load.units(lang).surfaces_lower.get(unit.lower())
        or load.units(lang).symbols_lower.get(unit.lower())
    )
    if not new_unit:
        raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))

    # More than one candidate: score the cleaned text with the classifier.
    if len(new_unit) > 1:
        transformed = classifier(lang).tfidf_model.transform([clean_text(text, lang)])
        # Only a single text is transformed, hence the first (only) row.
        scores = classifier(lang).classifier.predict_proba(transformed).tolist()[0]
        # Pair each probability with the classifier's target name.
        scores = zip(scores, classifier(lang).target_names)

        # Filter for possible names
        names = [i.name for i in new_unit]
        scores = [i for i in scores if i[1] in names]

        # Sort by rank
        # NOTE(review): the rest of this function is not visible in this excerpt.
github nielstron / quantulum3 / quantulum3 / regex.py View on Github external
1: $
        2: 20
        3: /
        4: h
        5: None
        6: None
        7: None
        8: None
        9: None
        10: None

    """

    op_keys = sorted(list(operators(lang)), key=len, reverse=True)
    unit_keys = sorted(
        list(load.units(lang).surfaces.keys()) + list(load.units(lang).symbols.keys()),
        key=len,
        reverse=True,
    )
    symbol_keys = sorted(
        list(load.units(lang).prefix_symbols.keys()), key=len, reverse=True
    )

    exponent = exponents_regex(lang).format(superscripts=unicode_superscript_regex())

    all_ops = "|".join([r"{}".format(re.escape(i)) for i in op_keys])
    all_units = "|".join([r"{}".format(re.escape(i)) for i in unit_keys])
    all_symbols = "|".join([r"{}".format(re.escape(i)) for i in symbol_keys])

    pattern = r"""
        (?<!\w)                                     # "begin" of word
        (?P<prefix>(?:%s)(?![a-zA-Z]))?         # Currencies, mainly
github nielstron / quantulum3 / quantulum3 / _lang / en_US / parser.py View on Github external
and unit.original_dimensions[1]["surface"] in reg.suffixes(lang).keys()
        ):
            suffix = unit.original_dimensions[1]["surface"]
            # Only apply if at least last value is suffixed by k, M, etc
            if re.search(r"\d{}\b".format(suffix), text):
                values = [value * reg.suffixes(lang)[suffix] for value in values]
                unit.original_dimensions = [
                    unit.original_dimensions[0]
                ] + unit.original_dimensions[2:]
                dimension_change = True

        elif unit.original_dimensions[0]["surface"] in reg.suffixes(lang).keys():
            # k/M etc is only applied if non-symbolic surfaces of other units
            # (because colloquial) or currency units
            symbolic = all(
                dim["surface"] in load.units(lang).names[dim["base"]].symbols
                for dim in unit.original_dimensions[1:]
            )
            if not symbolic:
                suffix = unit.original_dimensions[0]["surface"]
                values = [value * reg.suffixes(lang)[suffix] for value in values]
                unit.original_dimensions = unit.original_dimensions[1:]
                dimension_change = True

    # Usually "1990s" stands for the decade, not the amount of seconds
    elif re.match(r"[1-2]\d\d0s", surface):
        unit.original_dimensions = []
        dimension_change = True
        surface = surface[:-1]
        span = (span[0], span[1] - 1)
        _LOGGER.debug('\tCorrect for "1990s" pattern')
github nielstron / quantulum3 / quantulum3 / _lang / en_US / load.py View on Github external
def build_common_words():
    """
    Collect common words that do not collide with any known unit.

    Reads ``common-words.txt`` from ``TOPDIR`` line by line, skipping lines
    that start with ``#``. Each remaining word (trailing whitespace stripped)
    and its plural, as produced by ``load.pluralize``, is kept unless it is
    found in the unit surfaces (``surfaces_all``) or symbols for ``lang``.

    NOTE(review): ``lang`` is neither a parameter nor assigned here; it is
    presumably a module-level name -- confirm against the full file.

    :return: ``defaultdict(list)`` mapping a word length to the kept words of
        that length, in the order they were encountered
    """
    path = os.path.join(TOPDIR, "common-words.txt")

    # Nothing in the loop below modifies the unit tables, so look them up
    # once instead of four times per line.
    units = load.units(lang)
    unit_surfaces = units.surfaces_all
    unit_symbols = units.symbols

    words = defaultdict(list)  # Collect words based on length
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            word = line.rstrip()
            # The word and its plural are filtered independently: either one
            # may be kept even when the other collides with a unit.
            for candidate in (word, load.pluralize(word)):
                if candidate not in unit_surfaces and candidate not in unit_symbols:
                    words[len(candidate)].append(candidate)
    return words