Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this block reads as a truncated snippet export — leading
# indentation has been stripped and the function is cut off mid-expression
# (the parenthesis opened at `p_total = (` is never closed in this view).
# Restore from the original source before use.
@requires(GAZETTEER_RSC)
def extract_in_gaz_ngram_features(**args):
"""Returns a feature extractor for surrounding ngrams in gazetteers
"""
# **args is accepted only to keep a uniform factory signature; it is unused.
del args
def _extractor(query, resources):
def get_ngram_gaz_features(query, gazes, entity_type):
tokens = query.normalized_tokens
# one (initially empty) feature dict per query token
feat_seq = [{} for _ in tokens]
for i, _ in enumerate(feat_seq):
feat_prefix = "in_gaz|type:{}|ngram".format(entity_type)
# entity PMI and conditional prob
p_total = (
# NOTE(review): expression truncated below this line in this view
math.log(sum([g["total_entities"] for g in gazes.values()]) + 1) / 2
# NOTE(review): truncated snippet — indentation stripped; the loop body and the
# closing `return _extractor` are cut off below this block.
@requires(WORD_FREQ_RSC)
def extract_gaz_freq(**args):
"""
Extract frequency bin features for each gazetteer
Returns:
(function): A feature extraction function that returns the log of the \
count of query tokens within each gazetteer's frequency bins.
"""
# unused; kept only for a uniform extractor-factory signature
del args
def _extractor(query, resources):
tokens = query.normalized_tokens
# accumulates counts keyed by frequency-bin feature name
freq_features = defaultdict(int)
for tok in tokens:
# "IV" (in-vocabulary) when the token exists in the word-frequency
# resource, "OOV" (out-of-vocabulary) otherwise
query_freq = "OOV" if resources[WORD_FREQ_RSC].get(tok) is None else "IV"
# NOTE(review): truncated snippet — indentation stripped; the `_extractor`
# definition and return are cut off below this block.
@requires(WORD_NGRAM_FREQ_RSC)
def extract_bag_of_words_features(
ngram_lengths_to_start_positions, thresholds=(0,), **args
):
"""Returns a bag-of-words feature extractor.
Args:
ngram_lengths_to_start_positions (dict)
thresholds (int): Cut off value to include word in n-gram vocab
Returns:
(function) The feature extractor.
"""
threshold_list = list(thresholds)
# Right-pad the thresholds with 0 so that every ngram-length key has a
# corresponding cutoff, even when fewer thresholds than lengths were given.
word_thresholds = threshold_list + [0] * (
len(ngram_lengths_to_start_positions.keys()) - len(threshold_list)
)
# NOTE(review): truncated snippet — indentation stripped; the body of
# `_extractor` past its first statement is cut off below this block.
@requires(GAZETTEER_RSC)
def extract_in_gaz_feature(scaling=1, **args):
"""Returns a feature extractor that generates a set of features indicating the presence
of query n-grams in different entity gazetteers. Used by the domain and intent classifiers
when the 'in-gaz' feature is specified in the config.
Args:
scaling (int): A multiplicative scale factor to the ``ratio_pop`` and ``ratio`` features of
the in-gaz feature set.
Returns:
function: Returns an extractor function
"""
# unused; kept only for a uniform extractor-factory signature
del args
def _extractor(query, resources):
# feature values default to 0.0 when a key is first touched
in_gaz_features = defaultdict(float)
# NOTE(review): truncated snippet — indentation stripped; the body of the
# trailing `if` (and the `return _extractor`) is cut off below this block.
@requires(QUERY_FREQ_RSC)
def extract_query_string(scaling=1000, **args):
"""
Extract whole query string as a feature.
Returns:
(function) A feature extraction function that takes a query and \
returns the whole query string for exact matching
"""
# NOTE(review): unlike the sibling factories, `args` is NOT deleted here — it
# is closed over and consulted inside `_extractor` (see the ENABLE_STEMMING
# lookup below).
def _extractor(query, resources):
# wrap the query text in angle brackets to form the exact-match key
query_key = "<{}>".format(query.normalized_text)
if query_key in resources[QUERY_FREQ_RSC]:
return {"exact|query:{}".format(query_key): scaling}
if args.get(ENABLE_STEMMING, False):
@requires(ENABLE_STEMMING)
def enabled_stemming(**args):
    """Feature extractor for enabling stemming of the query
    """

    def _extractor(query, resources):
        # Deliberate no-op: registering this extractor merely signals that
        # stemming is turned on; it produces no features of its own.
        del query
        del resources

    # `args` is accepted only so the factory signature matches its siblings.
    del args
    return _extractor
# NOTE(review): truncated snippet — indentation stripped; the rest of
# `_extractor` and its return are cut off below this block. Also note `args`
# is not deleted here — cannot tell from this view whether it is used later.
@requires(WORD_FREQ_RSC)
def extract_freq(bins=5, **args):
"""
Extract frequency bin features.
Args:
bins (int): The number of frequency bins (besides OOV)
Returns:
(function): A feature extraction function that returns the log of the \
count of query tokens within each frequency bin.
"""
def _extractor(query, resources):
tokens = query.normalized_tokens
stemmed_tokens = query.stemmed_tokens
# NOTE(review): truncated snippet — indentation stripped; the `for entity`
# loop body and the `return _extractor` are cut off below this block.
@requires(SYS_TYPES_RSC)
def extract_sys_candidate_features(start_positions=(0,), **args):
"""Return an extractor for features based on a heuristic guess of numeric
candidates at/near the current token.
Args:
start_positions (tuple): positions relative to current token (=0)
Returns:
(function) The feature extractor.
"""
# unused; kept only for a uniform extractor-factory signature
del args
def _extractor(query, resources):
# one (initially empty) feature dict per query token
feat_seq = [{} for _ in query.normalized_tokens]
# system entity candidates resolved against the configured system types
system_entities = query.get_system_entity_candidates(resources[SYS_TYPES_RSC])
for entity in system_entities:
# NOTE(review): truncated snippet — indentation stripped; the feature-building
# logic and the `return _extractor` are cut off below this block.
@requires(WORD_FREQ_RSC)
def extract_edge_ngrams(lengths=(1,), **args):
"""
Extract ngrams of some specified lengths.
Args:
lengths (list of int): The ngram length.
Returns:
(function) An feature extraction function that takes a query and \
returns ngrams of the specified lengths at start and end of query.
"""
# unused; kept only for a uniform extractor-factory signature
del args
def _extractor(query, resources):
tokens = query.normalized_tokens
feats = {}
@requires(GAZETTEER_RSC)
def extract_in_gaz_features(**args):
"""Returns a feature extractor that finds any gazetteer matches against the input query"""
del args
def _extractor(example, resources):
_, entities, entity_index = example
features = {}
current_entity = entities[entity_index]
domain_gazes = resources[GAZETTEER_RSC]
for gaz_name, gaz in domain_gazes.items():
if current_entity.normalized_text in gaz["pop_dict"]:
feat_name = "in_gaz|type:{}".format(gaz_name)
features[feat_name] = 1