def test_scoring_context():
    from collections import namedtuple

    from revscoring import dependencies
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")

    characters = Feature("characters", lambda word, len: len(word),
                         returns=int,
                         depends_on=[fake_data, len_func])

    is_fake = Feature("is_fake", lambda word, fake: word == fake,
                      returns=bool,
                      depends_on=[fake_data, literal_fake])

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])

    def fake_extract(rev_ids, dependents, caches=None):
        caches = caches if caches is not None else {}
        for rev_id in rev_ids:
            if rev_id % 5 != 0:
                cache = caches.get(rev_id, {})
                values = dependencies.solve(dependents,
                                            context={len_func: lambda: len},
                                            cache=cache)
                values = list(values)
                caches[rev_id] = cache
                yield None, values
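
    # Hypothetical usage sketch (not part of the original test): bundle
    # fake_extract into the FakeExtractor namedtuple and extract the
    # `characters` feature. `len_func` is injected via the solve() context
    # inside fake_extract, and rev_ids divisible by 5 are skipped there.
    extractor = FakeExtractor(fake_extract, dependencies.solve, None)
    for error, values in extractor.extract([1, 2, 3], [characters]):
        assert error is None
        assert values == [4]  # len("fake") == 4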
def __init__(self, language, error_if_missing=False):
    self.language = language
    self.prefix = language.__name__ + "." + self.MODULE_NAME + "."

    self.words_list = TokenFilter(
        self.prefix + "words",
        self.DATASOURCE_MODULE.tokens,
        token_is_word,
        if_none=raise_rnf if error_if_missing else None
    )
    """
    Returns a list of word tokens.
    """

    self.words = Feature(
        self.prefix + "words", len,
        returns=int,
        depends_on=[self.words_list]
    )
    """
    A count of the number of words in the revision.
    """

    self.content_words_list = TokenFilter(
        self.prefix + "content_words",
        self.DATASOURCE_MODULE.content_tokens,
        token_is_word,
        if_none=raise_rnf if error_if_missing else None
    )
    """
    Returns a list of words that appear in the (non-markup) content of the
    revision.
    """
"number_changed_sitelinks", process_no_changed_sitelinks, returns=int,
depends_on=[sitelinks_differ])
def process_no_added_labels(labels_differ):
    return len(labels_differ.added())

number_added_labels = Feature(
    "number_added_labels", process_no_added_labels, returns=int,
    depends_on=[labels_differ])

def process_no_removed_labels(labels_differ):
    return len(labels_differ.removed())

number_removed_labels = Feature(
    "number_removed_labels", process_no_removed_labels, returns=int,
    depends_on=[labels_differ])

def process_no_changed_labels(labels_differ):
    return len(labels_differ.changed())

number_changed_labels = Feature(
    "number_changed_labels", process_no_changed_labels, returns=int,
    depends_on=[labels_differ])

def process_no_added_descriptions(descriptions_differ):
    return len(descriptions_differ.added())

number_added_descriptions = Feature(
    "number_added_descriptions", process_no_added_descriptions, returns=int,
    depends_on=[descriptions_differ])
from wb_vandalism.datasources.parsed_revision_text import item
from revscoring.features import Feature
from Levenshtein import ratio
from .feature import has_property_value
import pywikibase

def process_no_claims(item):
    no_claims = 0
    for property_name in item.claims:
        no_claims += len(item.claims[property_name])
    return no_claims

number_claims = Feature("number_claims", process_no_claims, returns=int,
                        depends_on=[item])
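
# Illustrative example (hypothetical data): for an item whose claims dict is
# {"P31": [claim_a], "P569": [claim_b, claim_c]}, number_claims solves to 3,
# since every property's claim list contributes its length to the sum.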
def process_no_aliases(item):
    no_aliases = 0
    for lang in item.aliases:
        no_aliases += len(item.aliases[lang])
    return no_aliases

number_aliases = Feature("number_aliases", process_no_aliases, returns=int,
                         depends_on=[item])
def process_no_sources(item):
    no_sources = 0
    # Assumes each claim exposes a .sources list (pywikibase claim API).
    for property_name in item.claims:
        for claim in item.claims[property_name]:
            no_sources += len(claim.sources)
    return no_sources
depends_on=[badges_differ, current_item, past_item])
# There is no need for changed badges.
def process_mean_distance_desc(parent, current, differ):
    changed = differ.changed()
    if not changed:
        return 0.0
    distance = 0
    for lang in changed:
        distance += (
            1 - ratio(current.descriptions[lang], parent.descriptions[lang]))
    return distance / len(changed)

mean_distance_descriptions = Feature(
    "mean_distance_descriptions", process_mean_distance_desc, returns=float,
    depends_on=[past_item, current_item, descriptions_differ])
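
# Worked example (illustrative values): if one description changed from
# "painter" to "Dutch painter", Levenshtein ratio("Dutch painter", "painter")
# is 0.7, so the mean distance over that single changed language is
# 1 - 0.7 = 0.3.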
def process_mean_distance_labels(parent, current, differ):
    changed = differ.changed()
    if not changed:
        return 0.0
    distance = 0
    for lang in changed:
        distance += 1 - ratio(current.labels[lang], parent.labels[lang])
    return distance / len(changed)

mean_distance_labels = Feature(
    "mean_distance_labels", process_mean_distance_labels, returns=float,
    depends_on=[past_item, current_item, labels_differ])

# There is no need for changed aliases.

def process_no_added_claims(added_claims):
    return len(added_claims)

number_added_claims = Feature(
    "number_added_claims", process_no_added_claims, returns=int,
    depends_on=[added_claims])

def process_no_removed_claims(removed_claims):
    return len(removed_claims)

number_removed_claims = Feature(
    "number_removed_claims", process_no_removed_claims, returns=int,
    depends_on=[removed_claims])

def process_no_changed_claims(changed_claims):
    return len(changed_claims)

number_changed_claims = Feature(
    "number_changed_claims", process_no_changed_claims, returns=int,
    depends_on=[changed_claims])
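
# Hypothetical sketch: like any revscoring Feature, these resolve through
# the dependency solver, and datasource values can be supplied via the
# cache, e.g.
#
#   from revscoring.dependencies import solve
#   list(solve([number_added_claims], cache={added_claims: [c1, c2]}))
#   # -> [2]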
def process_no_changed_identifiers(changed_claims):
    counter = 0
    for old, new in changed_claims:
        # External identifiers are the claims whose target is a plain string.
        if isinstance(old.target, str):
            counter += 1
    return counter

number_changed_identifiers = Feature(
    "number_changed_identifiers", process_no_changed_identifiers, returns=int,
    depends_on=[changed_claims])
"number_removed_labels", process_no_removed_labels, returns=int,
depends_on=[labels_differ])
def process_no_changed_labels(labels_differ):
return len(labels_differ.changed())
number_changed_labels = Feature(
"number_changed_labels", process_no_changed_labels, returns=int,
depends_on=[labels_differ])
def process_no_added_descriptions(descriptions_differ):
return len(descriptions_differ.added())
number_added_descriptions = Feature(
"number_added_descriptions", process_no_added_descriptions, returns=int,
depends_on=[descriptions_differ])
def process_no_removed_descriptions(descriptions_differ):
    return len(descriptions_differ.removed())

number_removed_descriptions = Feature(
    "number_removed_descriptions", process_no_removed_descriptions,
    returns=int, depends_on=[descriptions_differ])

def process_no_changed_descriptions(descriptions_differ):
    return len(descriptions_differ.changed())

number_changed_descriptions = Feature(
    "number_changed_descriptions", process_no_changed_descriptions,
    returns=int, depends_on=[descriptions_differ])
P569_changed = has_property_changed('P569')  # date of birth
P18_changed = has_property_changed('P18')  # image
P109_changed = has_property_changed('P109')  # signature
P373_changed = has_property_changed('P373')  # Commons category
P856_changed = has_property_changed('P856')  # official website
def process_no_added_sources(added_sources):
    return len(added_sources)

number_added_sources = Feature(
    "number_added_sources", process_no_added_sources, returns=int,
    depends_on=[added_sources])

def process_no_removed_sources(removed_sources):
    return len(removed_sources)

number_removed_sources = Feature(
    "number_removed_sources", process_no_removed_sources, returns=int,
    depends_on=[removed_sources])

def process_no_added_qualifiers(added_qualifiers):
    return len(added_qualifiers)

number_added_qualifiers = Feature(
    "number_added_qualifiers", process_no_added_qualifiers, returns=int,
    depends_on=[added_qualifiers])
"""
A count of the number of words in the revision.
"""
self.content_words_list = TokenFilter(
self.prefix + "content_words",
self.DATASOURCE_MODULE.content_tokens,
token_is_word,
if_none=raise_rnf if error_if_missing else None
)
"""
Returns a list of words that appear in the (non-markup) content of the
revision.
"""
    self.content_words = Feature(
        self.prefix + "content_words", len,
        returns=int,
        depends_on=[self.content_words_list]
    )
    """
    A count of the number of words in the (non-markup) content of the
    revision.
    """

    if language.resources.stopwords is not None and \
            language.resources.stemmer is not None:
        self.infonoise = Infonoise(
            self.prefix + "infonoise",
            language.resources.stopwords,
            language.resources.stemmer.stem,
            self.content_words_list
        )
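
        # Hedged note: Infonoise is revscoring's information-to-noise
        # metric; roughly, the share of content that survives stopword
        # removal and stemming. See revscoring.features.Infonoise for the
        # exact definition.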
import platform
import sys

from pkg_resources import VersionConflict

from .about import (__author__, __author_email__, __description__, __name__,
                    __url__, __version__)
from .datasources import Datasource
from .dependencies import Dependent, DependentSet
from .extractors import Extractor
from .features import Feature, FeatureVector
from .score_processor import ScoreProcessor
from .scoring import Model
if sys.version_info <= (3, 0):
    raise VersionConflict(
        "Revscoring requires Python '>=3' " +
        "but your Python version is " +
        platform.python_version())
__all__ = [Datasource, Dependent, DependentSet, Extractor, Feature,
           FeatureVector, Model, ScoreProcessor,
           __name__, __version__, __author__, __author_email__,
           __description__, __url__]
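
# Hypothetical end-to-end sketch of the API re-exported above, following
# revscoring's documented usage; the model path and rev_id are placeholders.
#
#   import mwapi
#   from revscoring import Model
#   from revscoring.extractors.api.extractor import Extractor
#
#   with open("models/enwiki.damaging.linear_svc.model") as f:
#       scorer_model = Model.load(f)
#   extractor = Extractor(mwapi.Session(host="https://en.wikipedia.org"))
#   feature_values = list(extractor.extract(123456789, scorer_model.features))
#   print(scorer_model.score(feature_values))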