How to use the mindmeld.models.helpers.get_ngram function in mindmeld

To help you get started, we’ve selected a few examples of `mindmeld.models.helpers.get_ngram`, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github cisco / mindmeld / mindmeld / models / query_features.py View on Github external
]
                            )
                            + 1
                        ),
                        "p_joint": math.log(
                            len(gazes[entity_type]["index"][get_ngram(tokens, i, 2)])
                            + 1
                        ),
                    },
                    {
                        "length": 3,
                        "position": 0,
                        "p_ngram": math.log(
                            sum(
                                [
                                    len(g["index"][get_ngram(tokens, i - 1, 3)])
                                    for g in gazes.values()
                                ]
                            )
                            + 1
                        ),
                        "p_joint": math.log(
                            len(
                                gazes[entity_type]["index"][get_ngram(tokens, i - 1, 3)]
                            )
                            + 1
                        ),
                    },
                ]

                for window_feature in window_features:
                    features = {
github cisco / mindmeld / mindmeld / models / query_features.py View on Github external
elif i == end - 1:
                    pos_attr = "end"
                else:
                    pos_attr = "cont"

                # Basic existence features
                positional_gaz_prefix = "in_gaz|type:{}|segment:{}".format(
                    entity_type, pos_attr
                )

                # Basic Positional features
                feature_sequence[i][positional_gaz_prefix] = 1

                features = {
                    # Features for ngram before the span
                    "|ngram_before|length:{}".format(1): get_ngram(
                        tokens, start - 1, 1
                    ),
                    # Features for ngram after the span
                    "|ngram_after|length:{}".format(1): get_ngram(tokens, end, 1),
                    # Features for ngram at start of span
                    "|ngram_first|length:{}".format(1): get_ngram(tokens, start, 1),
                    # Features for ngram at end of span
                    "|ngram_last|length:{}".format(1): get_ngram(tokens, end - 1, 1),
                    # Popularity features
                    "|pop": pop,
                    # Character length features
                    "|log_char_len": math.log(len(entity)),
                    "|pct_char_len": len(entity) / len(" ".join(tokens)),
                    # entity PMI and conditional prob
                    "|pmi": p_total + p_joint - p_entity_type - p_entity,
                    "|class_prob": p_total + p_joint - p_entity,
github cisco / mindmeld / mindmeld / models / query_features.py View on Github external
# Basic existence features
                positional_gaz_prefix = "in_gaz|type:{}|segment:{}".format(
                    entity_type, pos_attr
                )

                # Basic Positional features
                feature_sequence[i][positional_gaz_prefix] = 1

                features = {
                    # Features for ngram before the span
                    "|ngram_before|length:{}".format(1): get_ngram(
                        tokens, start - 1, 1
                    ),
                    # Features for ngram after the span
                    "|ngram_after|length:{}".format(1): get_ngram(tokens, end, 1),
                    # Features for ngram at start of span
                    "|ngram_first|length:{}".format(1): get_ngram(tokens, start, 1),
                    # Features for ngram at end of span
                    "|ngram_last|length:{}".format(1): get_ngram(tokens, end - 1, 1),
                    # Popularity features
                    "|pop": pop,
                    # Character length features
                    "|log_char_len": math.log(len(entity)),
                    "|pct_char_len": len(entity) / len(" ".join(tokens)),
                    # entity PMI and conditional prob
                    "|pmi": p_total + p_joint - p_entity_type - p_entity,
                    "|class_prob": p_total + p_joint - p_entity,
                    "|output_prob": p_total + p_joint - p_entity_type,
                }

                for key, value in features.items():
github cisco / mindmeld / mindmeld / models / query_features.py View on Github external
),
                    },
                    {
                        "length": 2,
                        "position": 1,
                        "p_ngram": math.log(
                            sum(
                                [
                                    len(g["index"][get_ngram(tokens, i, 2)])
                                    for g in gazes.values()
                                ]
                            )
                            + 1
                        ),
                        "p_joint": math.log(
                            len(gazes[entity_type]["index"][get_ngram(tokens, i, 2)])
                            + 1
                        ),
                    },
                    {
                        "length": 3,
                        "position": 0,
                        "p_ngram": math.log(
                            sum(
                                [
                                    len(g["index"][get_ngram(tokens, i - 1, 3)])
                                    for g in gazes.values()
                                ]
                            )
                            + 1
                        ),
                        "p_joint": math.log(
github cisco / mindmeld / mindmeld / models / query_features.py View on Github external
positional_gaz_prefix = "in_gaz|type:{}|segment:{}".format(
                    entity_type, pos_attr
                )

                # Basic Positional features
                feature_sequence[i][positional_gaz_prefix] = 1

                features = {
                    # Features for ngram before the span
                    "|ngram_before|length:{}".format(1): get_ngram(
                        tokens, start - 1, 1
                    ),
                    # Features for ngram after the span
                    "|ngram_after|length:{}".format(1): get_ngram(tokens, end, 1),
                    # Features for ngram at start of span
                    "|ngram_first|length:{}".format(1): get_ngram(tokens, start, 1),
                    # Features for ngram at end of span
                    "|ngram_last|length:{}".format(1): get_ngram(tokens, end - 1, 1),
                    # Popularity features
                    "|pop": pop,
                    # Character length features
                    "|log_char_len": math.log(len(entity)),
                    "|pct_char_len": len(entity) / len(" ".join(tokens)),
                    # entity PMI and conditional prob
                    "|pmi": p_total + p_joint - p_entity_type - p_entity,
                    "|class_prob": p_total + p_joint - p_entity,
                    "|output_prob": p_total + p_joint - p_entity_type,
                }

                for key, value in features.items():
                    for prefix in [gaz_feat_prefix, positional_gaz_prefix]:
                        feature_sequence[i][prefix + key] = value
github cisco / mindmeld / mindmeld / models / query_features.py View on Github external
},
                    {
                        "length": 3,
                        "position": 0,
                        "p_ngram": math.log(
                            sum(
                                [
                                    len(g["index"][get_ngram(tokens, i - 1, 3)])
                                    for g in gazes.values()
                                ]
                            )
                            + 1
                        ),
                        "p_joint": math.log(
                            len(
                                gazes[entity_type]["index"][get_ngram(tokens, i - 1, 3)]
                            )
                            + 1
                        ),
                    },
                ]

                for window_feature in window_features:
                    features = {
                        "|length:{}|pos:{}|pmi".format(
                            window_feature["length"], window_feature["position"]
                        ): p_total
                        + window_feature["p_joint"]
                        - p_entity_type
                        - window_feature["p_ngram"],
                        "|length:{}|pos:{}|class_prob".format(
                            window_feature["length"], window_feature["position"]
github cisco / mindmeld / mindmeld / models / entity_features.py View on Github external
def _extractor(example, resources):
    """Build n-gram features positioned relative to the current entity's start.

    Args:
        example: A 3-item sequence ``(query, entities, entity_index)``. The
            query must expose ``normalized_tokens`` and the selected entity
            must expose ``token_span.start``.
        resources: Unused; accepted only to satisfy the extractor signature.

    Returns:
        dict: Maps each feature name (which encodes the n-gram length and the
        start offset) to the value ``get_ngram`` returns for that window.
    """
    del resources  # this extractor needs nothing beyond the example itself
    query, entities, entity_index = example
    tokens = query.normalized_tokens
    # Offsets from the enclosing configuration are applied to this token index.
    anchor = entities[entity_index].token_span.start
    name_template = "bag_of_words|ngram_before|length:{}|pos:{}"

    return {
        name_template.format(size, offset): get_ngram(tokens, anchor + offset, size)
        for size, offsets in ngram_lengths_to_start_positions.items()
        for offset in offsets
    }
github cisco / mindmeld / mindmeld / models / query_features.py View on Github external
]
                            )
                            + 1
                        ),
                        "p_joint": math.log(
                            len(gazes[entity_type]["index"][get_ngram(tokens, i, 1)])
                            + 1
                        ),
                    },
                    {
                        "length": 2,
                        "position": -1,
                        "p_ngram": math.log(
                            sum(
                                [
                                    len(g["index"][get_ngram(tokens, i - 1, 2)])
                                    for g in gazes.values()
                                ]
                            )
                            + 1
                        ),
                        "p_joint": math.log(
                            len(
                                gazes[entity_type]["index"][get_ngram(tokens, i - 1, 2)]
                            )
                            + 1
                        ),
                    },
                    {
                        "length": 2,
                        "position": 1,
                        "p_ngram": math.log(