Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_trim():
    """
    Exercise ``trim`` on a single feature, a flat list of features, a list
    that also contains a derived feature, and a nested expression.
    """
    datasource = Datasource("derp1")
    first = Feature("foobar1", returns=int)
    second = Feature("foobar2", returns=int, depends_on=[datasource])
    five = Constant(value=5)
    vector = FeatureVector("foobar3", returns=int, depends_on=[five])

    # A lone feature comes back as itself.
    assert list(trim(first)) == [first]

    # A list of plain features is returned unchanged and in order.
    assert list(trim([first, second, vector])) == [first, second, vector]

    # The derived ``first + second`` contributes nothing beyond the features
    # that are already listed.
    with_sum = [first, second, first + second, vector]
    assert list(trim(with_sum)) == [first, second, vector]

    # A nested expression is reduced to the two features it is built from.
    nested = log(max(first - second, 1))
    assert (list(trim(nested)) ==
            [first, second])
from ..datasources import page_creation, revision, site
from .feature import Feature
def process_is_content_namespace(revision_metadata, namespace_map):
    """
    Look up the namespace of the revision's page and return its ``content``
    attribute.

    :Parameters:
        revision_metadata
            Object exposing a ``page_namespace`` attribute, used as the key
            into `namespace_map`
        namespace_map
            Mapping from namespace key to an object exposing ``content``
    """
    namespace = namespace_map[revision_metadata.page_namespace]
    return namespace.content
# Boolean feature computed by ``process_is_content_namespace`` from the
# revision metadata and the site's namespace map.
is_content_namespace = Feature(
    "page.is_content_namespace",
    process_is_content_namespace,
    returns=bool,
    depends_on=[revision.metadata, site.namespace_map],
)
"""
Represents whether this page is in a content namespace or not.

:Returns:
    bool

:Example:
    .. code-block:: python

        >>> from revscoring.features import page
        >>> list(extractor.extract(655097130, [page.is_content_namespace]))
        [True]
"""
def __init__(self, name, revision_datasources):
    """
    Initialise the parent with the user's last-revision datasources and
    attach the ``seconds_since`` feature.

    :Parameters:
        name : `str`
            Prefix used when naming the features created here
        revision_datasources
            Datasources of the revision; ``user.last_revision`` and
            ``timestamp`` are read from it
    """
    # The same datasource group feeds both the parent initializer and the
    # feature's dependencies, so look it up once.
    last_revision = revision_datasources.user.last_revision
    super().__init__(name, last_revision)

    self.seconds_since = Feature(
        name + ".seconds_since",
        _process_seconds_since,
        returns=int,
        depends_on=[last_revision.timestamp,
                    revision_datasources.timestamp],
    )
    "`int`: The number of seconds since the user last saved an edit"
from ..datasources import contiguous_segments_added
from .feature import Feature
def process(contiguous_segments_added):
    """Return the number of items in `contiguous_segments_added`."""
    segment_count = len(contiguous_segments_added)
    return segment_count
# Integer feature computed by ``process`` from the
# ``contiguous_segments_added`` datasource.
segments_added = Feature(
    "segments_added",
    process,
    returns=int,
    depends_on=[contiguous_segments_added],
)
def __init__(self, name, revision_datasources):
super().__init__(name)
self.datasources = revision_datasources.user
if hasattr(self.datasources, 'info'):
self.seconds_since_registration = Feature(
name + ".seconds_since_registration",
_process_seconds_since_registration,
returns=int,
depends_on=[revision_datasources.user.id,
revision_datasources.user.info.registration,
revision_datasources.timestamp])
"""
`int` : The number of seconds since the user registered their
account -- or zero in the case of anons. If the user has a
registration date that is *after* the revision timestamp
(should be impossible, but happens sometimes), the user is assumed
to be 1 year old.
"""
if hasattr(self.datasources, 'last_revision'):
self.last_revision = LastUserRevision(
self.badges_removed = aggregators.len(self.datasources.badges_removed)
"`int` : The number of badges removed"
self.badges_changed = aggregators.len(self.datasources.badges_changed)
"`int` : The number of badges changed"
# AF/38
self.proportion_of_qid_added = Feature(
name + ".proportion_of_qid_added",
_process_proportion_of_qid_added,
returns=float, depends_on=[self.datasources.parent_entity,
self.datasources.revision_entity]
)
"`float` : The proportion of Q# added."
# AF/38
self.proportion_of_language_added = Feature(
name + ".proportion_of_language_added",
_process_proportion_of_language_added,
returns=float, depends_on=[self.datasources.parent_entity,
self.datasources.revision_entity]
)
"`float` : The proportion of language added."
self.proportion_of_links_added = Feature(
name + ".proportion_of_links_added",
_process_proportion_of_links_added,
returns=float, depends_on=[self.datasources.parent_entity,
self.datasources.revision_entity]
)
"`float` : The proportion of links added."
self.identifiers_changed = Feature(
The name of a property (usually preceded by "P")
value : `mixed`
The value to match
name : `str`
A name to associate with the Feature. If not set, the
feature's name will be
'has_property_value(<property>, <value>)'
"""
if name is None:
name = self._name + ".has_property_value({0}, {1})" \
.format(repr(property), repr(value))
return HasPropertyValue(name, property, value, self.datasources.entity)
class HasPropertyValue(Feature):
    """
    Boolean feature that reports whether an item has a statement for a given
    property whose data value, rendered with ``str()``, equals a given value.

    :Parameters:
        name : `str`
            The name of the feature
        property
            Key looked up in the item's ``properties`` mapping
        value
            Value compared against each statement's stringified data value
        item_datasource
            The single dependency; its value is passed to ``_process``
    """

    def __init__(self, name, property, value, item_datasource):
        # Both attributes are set before the parent initializer runs.
        self.property = property
        self.value = value
        super().__init__(
            name,
            self._process,
            returns=bool,
            depends_on=[item_datasource],
        )

    def _process(self, item):
        """Return whether ``self.value`` is among the item's property values."""
        # A property the item does not have is treated as having no
        # statements at all.
        rendered_values = (
            str(statement.claim.datavalue)
            for statement in item.properties.get(self.property, [])
        )
        return self.value in rendered_values
from ..datasources import first_revision_metadata, revision_metadata
from .feature import Feature
def process(first_revision_metadata, revision_metadata):
    """
    Return the revision's ``timestamp`` minus the first revision's
    ``timestamp``.
    """
    current_timestamp = revision_metadata.timestamp
    first_timestamp = first_revision_metadata.timestamp
    return current_timestamp - first_timestamp
# Integer feature computed by ``process`` from the metadata of the page's
# first revision and of the revision itself.
page_age_in_seconds = Feature(
    "page_age_in_seconds",
    process,
    returns=int,
    depends_on=[first_revision_metadata, revision_metadata],
)
import re
from ..datasources import revision_metadata
from .feature import Feature
# Matches ``/* ... */`` with at least one character between the delimiters.
SECTION_COMMENT_RE = re.compile(r"\/\*([^\*]|\*[^\/])+\*\/")


def process(revision_metadata):
    """
    Return whether the revision's comment begins with a ``/* ... */`` marker.

    A comment of `None` yields `False`. ``match`` is used, so the marker only
    counts when it sits at the very start of the comment.
    """
    comment = revision_metadata.comment
    if comment is None:
        return False
    return SECTION_COMMENT_RE.match(comment) is not None
# Boolean feature computed by ``process`` from the revision metadata.
is_section_comment = Feature(
    "is_section_comment",
    process,
    returns=bool,
    depends_on=[revision_metadata],
)
def validate(self, value):
    """
    Check `value` against the type stored in ``self.returns``.

    :Returns:
        `value` unchanged when it is an instance of ``self.returns``

    :Raises:
        ValueError
            When `value` is not an instance of ``self.returns``
    """
    if not isinstance(value, self.returns):
        raise ValueError("Expected {0}, but got {1} instead."
                         .format(self.returns, type(value)))
    return value
@classmethod
def or_constant(cls, val):
    """
    Return `val` itself when it is already a `Feature`; otherwise wrap it
    in a `Constant`.

    :Parameters:
        val : `mixed`
            A `Feature` or any value to be wrapped in a `Constant`
    """
    # The implicit first argument of a classmethod is the class, hence
    # ``cls`` rather than ``self``. It is not used in the body.
    return val if isinstance(val, Feature) else Constant(val)
class Constant(Feature):
"""
A special sub-type of `revscoring.Feature` that returns a constant value.
:Parameters:
value : `mixed`
Any type of potential feature value
name : `str`
A name to give the feature
"""
def __init__(self, value, name=None):
    """
    Store `value` and initialise the parent with no dependencies and
    ``type(value)`` as the return type.

    When `name` is `None`, ``str(value)`` is used as the name.
    """
    self.value = value
    feature_name = str(value) if name is None else name
    super().__init__(
        feature_name,
        self._process,
        returns=type(value),
        depends_on=[],
    )