def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.chars = aggregators.len(
        self.datasources.text,
        name=self._name + ".chars"
    )
    "`int` : The number of characters in the text"

    self.numeric_chars = aggregators.sum(
        mappers.map(len, self.datasources.numbers),
        name=self._name + ".numeric_chars", returns=int
    )
    "`int` : The number of numeric characters in the text"

    self.whitespace_chars = aggregators.sum(
        mappers.map(len, self.datasources.whitespaces),
        name=self._name + ".whitespace_chars", returns=int
    )
    "`int` : The number of whitespace characters in the text"

    self.markup_chars = aggregators.sum(
        mappers.map(len, self.datasources.markups),
        name=self._name + ".markup_chars", returns=int
    )
    "`int` : The number of wikitext markup characters in the text"
    self.cjk_chars = aggregators.sum(
        mappers.map(len, self.datasources.cjks),
        name=self._name + ".cjk_chars", returns=int
    )
    "`int` : The number of CJK characters in the text"

    # Word frequency deltas for non-stopwords
    self.non_stopword_delta_increase = aggregators.sum(
        filters.positive(non_stopword_delta_values),
        name=name + ".non_stopword_delta_increase",
        returns=int
    )
    "`int` : The sum of word frequency delta increases for non-stopwords"

    self.non_stopword_delta_decrease = aggregators.sum(
        filters.negative(non_stopword_delta_values),
        name=name + ".non_stopword_delta_decrease",
        returns=int
    )
    "`int` : The sum of word frequency delta decreases for non-stopwords"
    # Proportional word frequency deltas
    stopword_prop_delta_values = \
        dicts.values(self.datasources.stopword_prop_delta)

    self.stopword_prop_delta_sum = aggregators.sum(
        stopword_prop_delta_values,
        name=name + ".stopword_prop_delta_sum"
    )
    "`float` : The sum of proportional word frequency deltas for stopwords"

    self.stopword_prop_delta_increase = aggregators.sum(
        filters.positive(stopword_prop_delta_values),
        name=name + ".stopword_prop_delta_increase"
    )
    """
    `float` : The sum of proportional word frequency delta increases for
    stopwords
    """

    self.stopword_prop_delta_decrease = aggregators.sum(
        filters.negative(stopword_prop_delta_values),
        name=name + ".stopword_prop_delta_decrease"
    )
    """
    `float` : The sum of proportional word frequency delta decreases for
    stopwords
    """
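# --- Usage sketch (ours, not part of the snippet above). Assuming
# revscoring is installed, a character-count feature like `chars` can be
# solved directly from a text cache; `wikitext.revision.chars` is the
# library's public counterpart of the `self.chars` feature defined above.
from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve
from revscoring.features import wikitext

text = "Foo had 1,305 followers."
print(solve(wikitext.revision.chars,
            cache={revision_oriented.revision.text: text}))  # 24, len(text)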
# References
revision = Revision(
    "euwiki.revision.revision",
    wikitext.revision.datasources,
)
paragraphs = mappers.map(
    str, revision.paragraphs_sentences_and_whitespace,
    name="euwiki.revision.paragraphs"
)
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="euwiki.revision.paragraphs_without_refs"
)
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="euwiki.revision.paragraphs_without_refs_total_length"
)
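# A standalone check of the regex above using only the stdlib `re`
# module (this demo is ours, not part of the euwiki feature list): the
# pattern accepts non-blank paragraphs that contain no <ref> tag.
import re

PARA_WITHOUT_REFS = re.compile(r"^(?!\s*$)((?!<ref>)(.|\n))*$")

assert PARA_WITHOUT_REFS.match("Plain, uncited paragraph text.")
assert not PARA_WITHOUT_REFS.match("Cited text.<ref>Some book</ref>")
assert not PARA_WITHOUT_REFS.match("   ")  # blank paragraphs are rejected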
local_wiki = [
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    # category_links,
    # category_links / max(wikitext.revision.content_chars, 1),
    infobox_templates,
    log(cn_templates + 1),
    cn_templates / max(wikitext.revision.content_chars, 1),
    log(paragraphs_without_refs_total_length + 1),
    basque.dictionary.revision.dict_words,
    basque.dictionary.revision.dict_words / max(wikitext.revision.words, 1),
    english.dictionary.revision.dict_words,
    english.dictionary.revision.dict_words / max(wikitext.revision.words, 1),
]
def __init__(self, name, revision_datasources):
    super().__init__(name)
    self.datasources = revision_datasources

    self.unique_stems = aggregators.len(
        dicts.keys(self.datasources.stem_frequency),
        name=name + ".stems"
    )
    """
    `int` : A count of unique stemmed words.
    """

    self.stem_chars = aggregators.sum(
        mappers.map(len, self.datasources.stems),
        name=name + ".stems_length",
        returns=int
    )
    """
    `int` : A count of characters in stemmed words.
    """

    if hasattr(self.datasources, 'parent'):
        self.parent = Revision(name + ".parent", self.datasources.parent)
        """
        :class:`~revscoring.languages.features.stemmed.Revision` : The
        parent revision
        """
    if hasattr(self.datasources, 'diff'):
        self.diff = Diff(name + ".diff", self.datasources.diff)
        """
        :class:`~revscoring.languages.features.stemmed.Diff` : The
        difference between this revision and its parent
        """


def filter_paragraphs_without_ref_tags(segment):
    "Check whether a segment has more than 10 words and no ref tags"
    words = 0
    refs = 0
    for t in segment.tokens():
        words += t.type == "word"
        refs += t.type in ("ref_open", "ref_close", "ref_singleton")
    return words > 10 and refs == 0
paragraphs_without_refs = filters.filter(
    filter_paragraphs_without_ref_tags,
    wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="ptwiki.revision.paragraphs_without_refs"
)
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, mappers.map(str, paragraphs_without_refs)),
    name="ptwiki.revision.paragraphs_without_refs_total_length"
)
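# A standalone sanity check of the filter above. `FakeSegment` and
# `Token` are hypothetical stand-ins for the segmenter's real types;
# only the counting logic comes from the function itself.
from collections import namedtuple

Token = namedtuple("Token", ["type"])

class FakeSegment:
    def __init__(self, types):
        self._tokens = [Token(t) for t in types]

    def tokens(self):
        return self._tokens

long_uncited = FakeSegment(["word"] * 11)
cited = FakeSegment(["word"] * 11 + ["ref_open", "word", "ref_close"])
assert filter_paragraphs_without_ref_tags(long_uncited)
assert not filter_paragraphs_without_ref_tags(cited)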
local_wiki = [
    all_images,
    all_images / max(wikitext.revision.content_chars, 1),
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    all_ref_tags,
    all_ref_tags / max(wikitext.revision.content_chars, 1),
    all_cite_templates,
    all_cite_templates / max(wikitext.revision.content_chars, 1),
    proportion_of_templated_references,
    non_templated_references,
    non_templated_references / max(wikitext.revision.content_chars, 1),
]
revision_lang_map = Datasource(
    "revision.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.text])
parent_lang_map = Datasource(
    "revision.parent.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.parent.text])
parent_lang_vector = vectorizers.vectorize(
    parent_lang_map, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.parent.lang_vector")

lang_delta = frequencies.delta(parent_lang_map, revision_lang_map)
lang_delta_vector = vectorizers.vectorize(
    lang_delta, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.diff.lang_delta_vector")
lang_delta_sum_diff = aggregators.sum(
    mappers.abs(dicts.values(lang_delta)),
    name="revision.diff.lang_delta_sum_diff")
def process_title_lang_match(title_lang, lang_delta):
    return lang_delta.get(title_lang, 0.0)


parent_lang_match = Feature("revision.parent.lang_match",
                            process_title_lang_match,
                            depends_on=[translation_title_lang,
                                        parent_lang_map],
                            returns=float)
match_lang_delta = Feature("revision.diff.match_lang_delta",
                           process_title_lang_match,
                           depends_on=[translation_title_lang,
                                       lang_delta],
                           returns=float)
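# What frequencies.delta computes, sketched in plain Python. This is an
# illustration of the semantics, not revscoring's implementation: for
# every language key in either map, the change from parent to current.
def lang_delta_sketch(parent_map, current_map):
    keys = set(parent_map) | set(current_map)
    return {k: current_map.get(k, 0.0) - parent_map.get(k, 0.0)
            for k in keys}

delta = lang_delta_sketch({"en": 0.75, "pt": 0.25}, {"en": 0.5, "pt": 0.5})
assert delta == {"en": -0.25, "pt": 0.25}
# lang_delta_sum_diff above is then the sum of absolute values:
assert sum(abs(v) for v in delta.values()) == 0.5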
def __init__(self, name, revision_datasources):
    super().__init__(name)
    self.datasources = revision_datasources

    self.dict_words = aggregators.len(self.datasources.dict_words)
    "`int` : A count of the number of dictionary words in the revision"

    self.non_dict_words = \
        aggregators.len(self.datasources.non_dict_words)
    "`int` : A count of the number of non-dictionary words in the revision"

    if hasattr(self.datasources, 'parent'):
        self.parent = Revision(name + ".parent", self.datasources.parent)
        """
        :class:`~revscoring.languages.features.dictionary.Revision` : The
        parent revision
        """

    if hasattr(self.datasources, 'diff'):
        self.diff = Diff(name + ".diff", self.datasources.diff)
        """
        :class:`~revscoring.languages.features.dictionary.Diff` : The
        difference between this revision and its parent
        """
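# Usage sketch (ours; assumes revscoring and its enchant-backed English
# dictionary are installed). The public feature
# `english.dictionary.revision.dict_words` corresponds to the
# `self.dict_words` defined above.
from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve
from revscoring.languages import english

print(solve(english.dictionary.revision.dict_words,
            cache={revision_oriented.revision.text: "This is real text"}))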
from itertools import groupby
from . import datasources
from .....datasources.meta import mappers
from ....feature import Feature
from ....meta import aggregators
from .util import prefix
chars_added = aggregators.sum(
    mappers.map(len, datasources.segments_added),
    name=prefix + ".chars_added", returns=int
)
"""
A count of the number of characters added in this edit.
"""

chars_removed = aggregators.sum(
    mappers.map(len, datasources.segments_removed),
    name=prefix + ".chars_removed", returns=int
)
"""
A count of the number of characters removed in this edit.
"""
numeric_chars_added = aggregators.sum(
    # Completed to match the pattern above; assumes a `numbers_added`
    # datasource parallel to `segments_added`/`segments_removed`.
    mappers.map(len, datasources.numbers_added),
    name=prefix + ".numeric_chars_added", returns=int
)
"""
A count of the number of numeric characters added in this edit.
"""
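# Usage sketch (ours): diff features need both the parent and the
# current text in the cache. We assume the public feature set
# `wikitext.revision.diff` mirrors the `chars_added` defined above.
from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve
from revscoring.features import wikitext

cache = {revision_oriented.revision.parent.text: "This is a sentence.",
         revision_oriented.revision.text: "This is a longer sentence."}
print(solve(wikitext.revision.diff.chars_added, cache=cache))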
unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"

# Status
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN, name=name + '.revision.is_human')
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH, name=name + '.revision.has_birthday')
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH, name=name + '.revision.dead')
is_blp = has_birthday.and_(not_(dead))
local_wiki = [
    is_human,
    is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    references_count,
    wikimedia_references_count,
    wikimedia_references_count / modifiers.max(references_count, 1),
    external_references_count,
    external_references_count / modifiers.max(references_count, 1),
    unique_references_count,
    unique_references_count / modifiers.max(references_count, 1)
]

item_quality = wikibase.item + local_wiki
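# Why the modifiers.max(..., 1) guard, shown in plain Python
# (illustration only, not revscoring code): it keeps the reference
# ratios defined for items that cite nothing at all.
refs = 0
unique_refs = 0
assert unique_refs / max(refs, 1) == 0.0  # no ZeroDivisionError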
self.number_delta_sum = aggregators.sum(
    dicts.values(self.datasources.number_delta),
    name=self._name + ".number_delta_sum"
)
"`int` : The sum of delta changes in the number frequency table"
self.number_delta_increase = aggregators.sum(
    filters.positive(dicts.values(self.datasources.number_delta)),
    name=self._name + ".number_delta_increase"
)
"`int` : The sum of delta increases in the number frequency table"

self.number_delta_decrease = aggregators.sum(
    filters.negative(dicts.values(self.datasources.number_delta)),
    name=self._name + ".number_delta_decrease"
)
"`int` : The sum of delta decreases in the number frequency table"

self.number_prop_delta_sum = aggregators.sum(
    dicts.values(self.datasources.number_prop_delta),
    name=self._name + ".number_prop_delta_sum"
)
"""
`float` : The sum of proportional delta changes in the number
frequency table
"""

self.number_prop_delta_increase = aggregators.sum(
    filters.positive(dicts.values(self.datasources.number_prop_delta)),
    name=self._name + ".number_prop_delta_increase"
)
"""
`float` : The sum of proportional delta increases in the number
frequency table
"""