import math

# Context: this snippet runs once per token index i. `tokens` holds the
# normalized query tokens, `gazes` maps each entity type to its gazetteer
# data, and `features`, `p_total`, and `p_entity_type` are defined by the
# surrounding extractor.
#
# Each window records log-scaled gazetteer counts for an n-gram around
# token i: `p_ngram` counts the n-gram across all gazetteers, `p_joint`
# counts it within the gazetteer for `entity_type`.
window_features = [
    {
        "length": 1,
        "position": 0,
        "p_ngram": math.log(
            sum(len(g["index"][get_ngram(tokens, i, 1)]) for g in gazes.values()) + 1
        ),
        "p_joint": math.log(
            len(gazes[entity_type]["index"][get_ngram(tokens, i, 1)]) + 1
        ),
    },
    {
        "length": 2,
        "position": -1,
        "p_ngram": math.log(
            sum(len(g["index"][get_ngram(tokens, i - 1, 2)]) for g in gazes.values())
            + 1
        ),
        "p_joint": math.log(
            len(gazes[entity_type]["index"][get_ngram(tokens, i - 1, 2)]) + 1
        ),
    },
    {
        "length": 2,
        "position": 1,
        "p_ngram": math.log(
            sum(len(g["index"][get_ngram(tokens, i, 2)]) for g in gazes.values()) + 1
        ),
        "p_joint": math.log(
            len(gazes[entity_type]["index"][get_ngram(tokens, i, 2)]) + 1
        ),
    },
    {
        "length": 3,
        "position": 0,
        "p_ngram": math.log(
            sum(len(g["index"][get_ngram(tokens, i - 1, 3)]) for g in gazes.values())
            + 1
        ),
        "p_joint": math.log(
            len(gazes[entity_type]["index"][get_ngram(tokens, i - 1, 3)]) + 1
        ),
    },
]

# Turn each window into PMI-style scores. Note features.update(...) rather
# than features = {...}, which would discard the previous iterations.
for window_feature in window_features:
    length, position = window_feature["length"], window_feature["position"]
    p_joint, p_ngram = window_feature["p_joint"], window_feature["p_ngram"]
    features.update(
        {
            "|length:{}|pos:{}|pmi".format(length, position): (
                p_total + p_joint - p_entity_type - p_ngram
            ),
            "|length:{}|pos:{}|class_prob".format(length, position): (
                p_total + p_joint - p_ngram
            ),
            "|length:{}|pos:{}|output_prob".format(length, position): (
                p_total + p_joint - p_entity_type
            ),
        }
    )
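The snippets here assume a get_ngram helper that returns the space-joined
n-gram of a given length starting at a token index, substituting a sentinel
for out-of-range positions (which is why reads like start - 1 or end are
safe at the query boundaries). A minimal sketch under that assumption; the
sentinel value and docstring are illustrative, not the canonical helper:

def get_ngram(tokens, start, length):
    """Return the n-gram of `length` tokens beginning at index `start`."""
    out_of_bounds = "<$>"  # assumed sentinel for out-of-range indices
    return " ".join(
        tokens[index] if 0 <= index < len(tokens) else out_of_bounds
        for index in range(start, start + length)
    )

For example, get_ngram(["play", "a", "song"], -1, 2) returns "<$> play".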
# Gazetteer span features: runs for each token index i in a candidate span
# [start, end) that matched gazetteer entry `entity` of type `entity_type`.
# `pop`, `p_total`, `p_entity`, `p_entity_type`, and `p_joint` are
# precomputed from the gazetteer counts for `entity`.

# Generic gazetteer features
gaz_feat_prefix = "in_gaz|type:{}".format(entity_type)
feature_sequence[i][gaz_feat_prefix] = 1

# Positional features: does token i start, end, or continue the span?
if i == start:
    pos_attr = "start"
elif i == end - 1:
    pos_attr = "end"
else:
    pos_attr = "cont"

# Basic existence features
positional_gaz_prefix = "in_gaz|type:{}|segment:{}".format(entity_type, pos_attr)
# Basic positional features
feature_sequence[i][positional_gaz_prefix] = 1

features = {
    # Features for ngram before the span
    "|ngram_before|length:{}".format(1): get_ngram(tokens, start - 1, 1),
    # Features for ngram after the span
    "|ngram_after|length:{}".format(1): get_ngram(tokens, end, 1),
    # Features for ngram at start of span
    "|ngram_first|length:{}".format(1): get_ngram(tokens, start, 1),
    # Features for ngram at end of span
    "|ngram_last|length:{}".format(1): get_ngram(tokens, end - 1, 1),
    # Popularity features
    "|pop": pop,
    # Character length features
    "|log_char_len": math.log(len(entity)),
    "|pct_char_len": len(entity) / len(" ".join(tokens)),
    # Entity PMI and conditional probabilities
    "|pmi": p_total + p_joint - p_entity_type - p_entity,
    "|class_prob": p_total + p_joint - p_entity,
    "|output_prob": p_total + p_joint - p_entity_type,
}
for key, value in features.items():
    # Emit each feature under both the generic and the positional prefix
    for prefix in [gaz_feat_prefix, positional_gaz_prefix]:
        feature_sequence[i][prefix + key] = value
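For context, the code above assumes each gazetteer in `gazes` exposes a
total entity count, an index from strings to the gazetteer entries that
contain them, and a popularity score per entity. The sketch below mirrors
the field names used in the snippets, but the toy values and the defaultdict
choice are assumptions (a plain dict would raise KeyError for unseen
n-grams, so the real index must supply an empty default):

import math
from collections import defaultdict

gazes = {
    "city": {
        "total_entities": 2,
        # string -> ids of gazetteer entries containing that string
        "index": defaultdict(set, {"san francisco": {0}, "new york": {1}}),
        # entity -> popularity score
        "pop_dict": defaultdict(float, {"san francisco": 0.9, "new york": 1.0}),
    },
}

entity, entity_type = "san francisco", "city"
pop = gazes[entity_type]["pop_dict"][entity]
p_total = math.log(sum(g["total_entities"] for g in gazes.values()) + 1)
p_entity_type = math.log(gazes[entity_type]["total_entities"] + 1)
p_entity = math.log(sum(len(g["index"][entity]) for g in gazes.values()) + 1)
p_joint = math.log(len(gazes[entity_type]["index"][entity]) + 1)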
# Bag-of-words features around the current entity. This closure relies on
# `ngram_lengths_to_start_positions`, a mapping from n-gram length to start
# offsets relative to the entity's first token, and on get_ngram above.
def _extractor(example, resources):
    del resources
    query, entities, entity_index = example
    features = {}
    tokens = query.normalized_tokens
    current_entity = entities[entity_index]
    current_entity_token_start = current_entity.token_span.start

    for length, starts in ngram_lengths_to_start_positions.items():
        for start in starts:
            feat_name = "bag_of_words|ngram_before|length:{}|pos:{}".format(
                length, start
            )
            features[feat_name] = get_ngram(
                tokens, current_entity_token_start + start, length
            )
    return features
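A hypothetical invocation, using lightweight stand-ins for the query and
entity objects (only the attributes the extractor touches) and an example
ngram_lengths_to_start_positions config; all names and values below are
illustrative:

from collections import namedtuple

TokenSpan = namedtuple("TokenSpan", ["start"])
Entity = namedtuple("Entity", ["token_span"])
Query = namedtuple("Query", ["normalized_tokens"])

ngram_lengths_to_start_positions = {1: [-1, 0], 2: [-2]}

query = Query(normalized_tokens=["book", "a", "flight", "to", "boston"])
entities = [Entity(token_span=TokenSpan(start=4))]

print(_extractor((query, entities, 0), resources=None))
# {'bag_of_words|ngram_before|length:1|pos:-1': 'to',
#  'bag_of_words|ngram_before|length:1|pos:0': 'boston',
#  'bag_of_words|ngram_before|length:2|pos:-2': 'flight to'}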