Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
Swedish Wikipedia
+++++++++++++++++
"""
from revscoring.features import wikitext
from revscoring.features.modifiers import max
from revscoring.languages import swedish
from . import wikipedia
# Templates whose name matches "Källa behövs" (space or underscore between
# the words) or "Kb" -- presumably the Swedish "citation needed" markers.
cn_templates = wikitext.revision.template_names_matching(
    r"Källa[ _]behövs|Kb",
    name="svwiki.revision.cn_templates",
)
# Links
# The namespace alternatives are grouped so that the trailing colon applies
# to every one of them.  Ungrouped (``Category|Kategori\:``), the colon
# belonged only to the last alternative, so a bare "Category", "File" or
# "Image" prefix satisfied the pattern without any namespace separator.
# NOTE(review): these feature names lack the ``svwiki.`` prefix used by
# "svwiki.revision.cn_templates"; left unchanged here -- confirm before
# renaming, since the name is a runtime identifier.
category_links = wikitext.revision.wikilink_titles_matching(
    r"(?:Category|Kategori)\:", name="revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
    r"(?:File|Image|Fil)\:", name="revision.image_links")
local_wiki = [
image_links,
image_links / max(wikitext.revision.content_chars, 1),
category_links,
category_links / max(wikitext.revision.content_chars, 1),
swedish.dictionary.revision.dict_words,
swedish.dictionary.revision.dict_words / max(wikitext.revision.words, 1),
English Wikipedia
+++++++++++++++++
"""
import re
from revscoring.datasources.meta import filters, mappers
from revscoring.features import wikitext
from revscoring import Feature
from revscoring.features.meta import aggregators
from revscoring.features.modifiers import log, max, sub
from revscoring.languages import english
from . import wikipedia
# Templates
# Templates whose name matches "infobox".
infobox_templates = wikitext.revision.template_names_matching(
    r"infobox",
    name="enwiki.revision.infobox_templates",
)
# Regex fragments for template names; they are joined with "|" into a single
# pattern where the feature is built.
CN_TEMPLATES = [r"Citation[_ ]needed", r"Cn", r"Fact"]
# Templates matching any of the CN_TEMPLATES name patterns.
cn_templates = wikitext.revision.template_names_matching(
    "|".join(CN_TEMPLATES),
    name="enwiki.revision.cn_templates",
)

# Templates whose name matches "Who".
who_templates = wikitext.revision.template_names_matching(
    "Who",
    name="enwiki.revision.who_templates",
)

# Templates whose name matches "Main".
main_article_templates = wikitext.revision.template_names_matching(
    "Main",
    name="enwiki.main_article_templates",
)
CITE_TEMPLATES = [
r"Cite",
r"Harvard[_ ]citation[_ ]no[_ ]brackets", r"harvnb",
r"Harvard citation", r"harv",
from revscoring.features import wikitext, modifiers
# Article-level wikitext features: two raw size measures, then -- for each
# structural count below -- the count itself immediately followed by that
# count divided by ``max(content_chars, 1)`` (presumably so an empty page
# cannot cause a division by zero).
article = [
    wikitext.revision.chars,
    wikitext.revision.content_chars,
    *(
        feature
        for count in (
            wikitext.revision.ref_tags,
            wikitext.revision.wikilinks,
            wikitext.revision.external_links,
            wikitext.revision.headings_by_level(2),
            wikitext.revision.headings_by_level(3),
        )
        for feature in (
            count,
            count / modifiers.max(wikitext.revision.content_chars, 1),
        )
    ),
]
wikitext.revision.diff.url_delta_sum,
wikitext.revision.diff.url_delta_increase,
wikitext.revision.diff.url_delta_decrease,
wikitext.revision.diff.url_prop_delta_sum,
wikitext.revision.diff.url_prop_delta_increase,
wikitext.revision.diff.url_prop_delta_decrease,
wikitext.revision.diff.word_delta_sum,
wikitext.revision.diff.word_delta_increase,
wikitext.revision.diff.word_delta_decrease,
wikitext.revision.diff.word_prop_delta_sum,
wikitext.revision.diff.word_prop_delta_increase,
wikitext.revision.diff.word_prop_delta_decrease,
wikitext.revision.diff.uppercase_word_delta_sum,
wikitext.revision.diff.uppercase_word_delta_increase,
wikitext.revision.diff.uppercase_word_delta_decrease,
wikitext.revision.diff.uppercase_word_prop_delta_sum,
wikitext.revision.diff.uppercase_word_prop_delta_increase,
wikitext.revision.diff.uppercase_word_prop_delta_decrease,
wikitext.revision.diff.punctuation_delta_sum,
wikitext.revision.diff.punctuation_delta_increase,
wikitext.revision.diff.punctuation_delta_decrease,
wikitext.revision.diff.punctuation_prop_delta_sum,
wikitext.revision.diff.punctuation_prop_delta_increase,
wikitext.revision.diff.punctuation_prop_delta_decrease,
wikitext.revision.diff.break_delta_sum,
wikitext.revision.diff.break_delta_increase,
wikitext.revision.diff.break_delta_decrease,
wikitext.revision.diff.break_prop_delta_sum,
wikitext.revision.diff.break_prop_delta_increase,
wikitext.revision.diff.break_prop_delta_decrease,
## token edit features
wikitext.revision.diff.segments_added,
from revscoring.languages import english
from . import mediawiki, wikipedia, wikitext
# Features specific to English Wikipedia.
local_wiki = [
    # Edit comment matches one of a handful of known words.
    revision_oriented.revision.comment_matches(
        r"copy|edit|npov|wp:?el",
        name="enwiki.revision.comment.has_known_word",
    ),
    # Edit comment contains the literal "[[WP:AES|←]]" marker.
    revision_oriented.revision.comment_matches(
        r"\[\[WP:AES\|←\]\]",
        name="enwiki.revision.comment.is_aes",
    ),
    # Templates matching "^cite" in this revision, minus those in its parent.
    sub(
        wikitext_features.revision.template_names_matching(r"^cite"),
        wikitext_features.revision.parent.template_names_matching(r"^cite"),
        name="enwiki.revision.diff.cite_templates_added",
    ),
]
# Diff-based badword match features, in this order: the three ``delta``
# attributes (sum, increase, decrease) followed by the three ``prop_delta``
# attributes (sum, increase, decrease).
badwords = [
    getattr(english.badwords.revision.diff, "match_" + kind + "_" + part)
    for kind in ("delta", "prop_delta")
    for part in ("sum", "increase", "decrease")
]
informals = [
english.informals.revision.diff.match_delta_sum,
english.informals.revision.diff.match_delta_increase,
# Regex fragments naming the shortened-footnote templates; joined with "|"
# where the matching feature is built.
SFN_TEMPLATES = [
    r"Shortened footnote template",
    r"sfn",
    r"Sfnp",
    r"Sfnm",
    r"Sfnmp",
]
# Templates matching any of the SFN_TEMPLATES name patterns.
shortened_footnote_templates = wikitext.revision.template_names_matching(
    "|".join(SFN_TEMPLATES),
    name="enwiki.revision.shortened_footnote_templates",
)
# Derived reference features.  ``max`` and ``sub`` here are the names this
# module imports from revscoring.features.modifiers, not the builtins.

# Shortened-footnote templates plus <ref> tags.
all_ref_tags = shortened_footnote_templates + wikitext.revision.ref_tags

# Cite templates plus shortened-footnote templates.
all_cite_templates = cite_templates + shortened_footnote_templates

# Templated references over all references; the denominator is at least 1.
proportion_of_templated_references = (
    all_cite_templates / max(all_ref_tags, 1))

# References without a template; never below 0.
non_templated_references = max(all_ref_tags - all_cite_templates, 0)

# All templates minus the citation-related ones.
non_cite_templates = sub(
    wikitext.revision.templates,
    all_cite_templates,
    name="enwiki.revision.non_cite_templates",
)
# Links
# Wikilinks whose title matches the "Category:" namespace prefix.
category_links = wikitext.revision.wikilink_titles_matching(
    r"Category\:", name="enwiki.revision.category_links")
# Wikilinks whose title matches the "File:" or "Image:" namespace prefix.
# The alternatives are grouped so the colon applies to both; ungrouped
# (``File|Image\:``), the colon belonged only to "Image", so a bare "File"
# prefix satisfied the pattern without any namespace separator.
image_links = wikitext.revision.wikilink_titles_matching(
    r"(?:File|Image)\:", name="enwiki.revision.image_links")
# Templates whose name matches "Wide image", "Tall image", "scalable image",
# "Panorama" or "Panorama 2".
image_templates = wikitext.revision.template_names_matching(
    r"((Wide|Tall|scalable) image)|Panorama|Panorama 2",
    name="enwiki.revision.image_template",
)
def get_images(strs):
name="ruwiki.revision.cn_templates")
# Links
# The namespace alternatives are grouped so that the trailing colon applies
# to every one of them.  Ungrouped (``Category|Категория\:``), the colon
# belonged only to the last alternative, so a bare "Category", "File" or
# "Image" prefix satisfied the pattern without any namespace separator.
category_links = wikitext.revision.wikilink_titles_matching(
    r"(?:Category|Категория)\:", name="revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
    r"(?:File|Image|Файл)\:", name="revision.image_links")
# Features specific to Russian Wikipedia.  Each count is listed together
# with a ratio whose denominator is floored at 1 via ``max``.
local_wiki = [
    # Stemmed characters, raw and per content character.
    russian.stemmed.revision.stem_chars,
    (russian.stemmed.revision.stem_chars
     / max(wikitext.revision.content_chars, 1)),
    # Image links, raw and per content character.
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    # Category links, raw and per content character.
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    # Dictionary words, raw and per word.
    russian.dictionary.revision.dict_words,
    russian.dictionary.revision.dict_words / max(wikitext.revision.words, 1),
    # cn_templates matches, raw and per content character.
    cn_templates,
    cn_templates / max(wikitext.revision.content_chars, 1),
]

# Local features followed by the shared ``wikipedia.article`` list.
wp10 = local_wiki + wikipedia.article
# TODO: This ends up being case insensitive even though that doesn't
# make any sense.
weird_regexes = [
    # A capital letter (A-Z or U+00C0-U+00DD) in the middle of a word.
    r"\w[^\WA-Z\u00c0-\u00dd]*[A-Z\u00c0-\u00dd][^\WA-Z\u00c0-\u00dd]+",
    # A non-word, non-space character in the middle of a word.
    r"\w+[^\w\s]\w+",
    # Doubled angle brackets ("<<" / ">>") typed in place of real
    # French quotes (« or »).
    r"<<|>>",
]
# RegexMatches built from the weird_regexes patterns.
weird_word_things = RegexMatches(
    "wikitext.revision.weird_word_things",
    weird_regexes,
)
# Length of the list of tokens matching a single brace, bracket, pipe,
# semicolon, backslash, slash or colon.  This is a raw length, not a
# proportion.
nonsense_markup = aggregators.len(
    wikitext.revision.datasources.tokens_matching(r"[\{\}\[\]\|\;\\\/\:]"),
    name="wikitext.revision.nonsense_markup",
)
# ... (in another page
# ...)
# TODO
# <big>,<small>,<center>,<div>,<span>,<b>,<i>,<poem>,<section>,''',''
# Tags whose name matches one of: big, small, center, div, span, b, i, poem,
# section.
# NOTE(review): the short alternatives "b" and "i" may also match longer tag
# names, depending on how the library anchors the pattern -- confirm.
good_tags = wikitext.revision.tag_names_matching(
    r"big|small|center|div|span|b|i|poem|section",
    name="wikitext.revision.good_tags",
)
# Length of the list of tokens matching ''' or '' (wikitext bold/italic
# quote runs).
# NOTE(review): the feature name is spelled "wiktext", not "wikitext".  It
# is reproduced as-is because the name is a runtime identifier; confirm
# whether anything depends on it before correcting the spelling.
expected_markup = aggregators.len(
    wikitext.revision.datasources.tokens_matching(r"'''|''"),
    name="wiktext.revision.expected_markup",
)
page = [
wikitext.revision.chars,
Portuguese Wikipedia
++++++++++++++++++++
"""
import re
from revscoring import Feature
from revscoring.datasources.meta import filters, mappers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators
from revscoring.features.modifiers import log, max, sub
from revscoring.languages import portuguese
from . import wikipedia
# Templates
# Templates whose name matches "Info" or "Infobox".
infobox_templates = wikitext.revision.template_names_matching(
    r"(Info|Infobox)",
    name="ptwiki.revision.infobox_templates",
)
# Regex fragments for the "Carece de fontes" template family (space or
# underscore between words); joined with "|" where the feature is built.
CN_TEMPLATES = [
    r"Carece[ _]de[ _]fontes",
    r"Carece[ _]de[ _]fontes2",
    r"Carece[ _]de[ _]fontes/bloco",
    r"Carece[ _]de[ _]fontes/bloco2",
]
# Templates matching any of the CN_TEMPLATES name patterns.
cn_templates = wikitext.revision.template_names_matching(
    "|".join(CN_TEMPLATES),
    name="ptwiki.revision.cn_templates",
)
MAIN_TEMPLATES = [
r"Artigo[ _]principal",
r"Ver[ _]artigo[ _]principal",
r"Principal",
r"Ver[ _]também artigo[ _]principal",
r"Main",
r"Detalhes",
"""
Russian Wikipedia
+++++++++++++++++
"""
from revscoring.features import wikitext
from revscoring.features.modifiers import max
from revscoring.languages import russian
from . import wikipedia
# Templates whose name matches "Нет АИ" (space or underscore between the
# words) -- presumably the Russian "citation needed" marker.
cn_templates = wikitext.revision.template_names_matching(
    r"Нет[ _]АИ",
    name="ruwiki.revision.cn_templates",
)
# Links
# The namespace alternatives are grouped so that the trailing colon applies
# to every one of them.  Ungrouped (``Category|Категория\:``), the colon
# belonged only to the last alternative, so a bare "Category", "File" or
# "Image" prefix satisfied the pattern without any namespace separator.
# NOTE(review): these feature names lack the ``ruwiki.`` prefix used by
# "ruwiki.revision.cn_templates"; left unchanged here -- confirm before
# renaming, since the name is a runtime identifier.
category_links = wikitext.revision.wikilink_titles_matching(
    r"(?:Category|Категория)\:", name="revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
    r"(?:File|Image|Файл)\:", name="revision.image_links")
local_wiki = [
russian.stemmed.revision.stem_chars,
(russian.stemmed.revision.stem_chars /
max(wikitext.revision.content_chars, 1)),
image_links,
image_links / max(wikitext.revision.content_chars, 1),
category_links,