How to use the snowballstemmer.stemmer function in snowballstemmer

To help you get started, we’ve selected a few snowballstemmer examples based on popular ways it is used in public projects.

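Before diving into the examples, here is a minimal sketch of the core API: snowballstemmer.stemmer(name) returns a stemmer object for the named algorithm, and stemWord stems a single token. The outputs in the comments assume the standard English algorithm.

import snowballstemmer

# Create a stemmer for a named Snowball algorithm.
stemmer = snowballstemmer.stemmer('english')

# stemWord stems one token at a time.
print(stemmer.stemWord('running'))  # -> run
print(stemmer.stemWord('cats'))     # -> cat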

github shibukawa / snowball_py / sample / testapp.py
import re
import sys

import snowballstemmer


def usage():
    # Hypothetical stub; the sample's real usage() prints its command-line help.
    print("usage: testapp.py [algorithm] word...")


def main():
    argv = sys.argv
    if len(argv) < 2:
        usage()
        return
    algorithm = 'english'
    if len(argv) > 2:
        algorithm = argv[1]
        argv = argv[2:]
    else:
        argv = argv[1:]
    stemmer = snowballstemmer.stemmer(algorithm)
    splitter = re.compile(r"[\s\.-]")
    for arg in argv:
        for word in splitter.split(arg):
            if word == '':
                continue
            original = word.lower()
            print(original + " -> " + stemmer.stemWord(original))
if __name__ == '__main__':
    main()
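Run from the command line, the sample prints each token alongside its stem. Expected output, assuming the English algorithm:

$ python testapp.py english running dogs
running -> run
dogs -> dog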
github sphinx-doc / sphinx / sphinx / search / tr.py
def init(self, options: Dict) -> None:
    self.stemmer = snowballstemmer.stemmer('turkish')
github sphinx-doc / sphinx / sphinx / search / sv.py
def init(self, options: Dict) -> None:
    self.stemmer = snowballstemmer.stemmer('swedish')
github sphinx-doc / sphinx / sphinx / search / ro.py
def init(self, options: Dict) -> None:
    self.stemmer = snowballstemmer.stemmer('romanian')
github sphinx-doc / sphinx / sphinx / search / hu.py
def init(self, options: Dict) -> None:
    self.stemmer = snowballstemmer.stemmer('hungarian')
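Each of these Sphinx search modules passes a different language name to snowballstemmer.stemmer. To check which names are valid, the library's algorithms() helper lists every bundled algorithm:

import snowballstemmer

# algorithms() returns the names accepted by snowballstemmer.stemmer(),
# including the 'turkish', 'swedish', 'romanian' and 'hungarian' used above.
for name in snowballstemmer.algorithms():
    print(name)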
github takahiro-777 / nlp_tutorial / nlp100_Python / 72 / main.py
'twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,'
	'will,with,would,yet,you,your').lower().split(',')


def is_stopword(str):
	'''Return whether the given string is a stop word.
	Upper and lower case are treated the same.

	Returns:
	True if the string is a stop word, False otherwise.
	'''
	return str.lower() in stop_words


# feature extraction
stemmer = snowballstemmer.stemmer('english')
word_counter = Counter()

with codecs.open(fname_sentiment, 'r', fencoding) as file_in:
	for line in file_in:
		for word in line[3:].split(' '):		# line[3:] drops the polarity label

			# strip leading/trailing whitespace
			word = word.strip()

			# skip stop words
			if is_stopword(word):
				continue

			# stem the word
			word = stemmer.stemWord(word)
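The excerpt ends just after stemming; presumably the loop then tallies each stemmed word in word_counter. Here is a self-contained sketch of the same stem-and-count pattern, with a hypothetical two-line corpus and a shortened stop-word list standing in for the originals:

from collections import Counter

import snowballstemmer

stemmer = snowballstemmer.stemmer('english')
word_counter = Counter()

# Hypothetical sample; the real script reads polarity-labelled sentences from a file.
lines = ['+1 the movies were wonderful', '-1 acting was painfully slow']

for line in lines:
    for word in line[3:].split(' '):    # line[3:] drops the polarity label
        word = word.strip()
        if word in ('the', 'was', 'were'):    # stand-in for the full stop-word list
            continue
        word_counter[stemmer.stemWord(word)] += 1

print(word_counter.most_common(3))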
github facebookresearch / craftassist / python / base_agent / ttad / generation_dialogues / build_scene.py
import os
import sys
import uuid
import json

TTAD_GEN_DIR = os.path.dirname(os.path.realpath(__file__))
CRAFTASSIST_DIR = os.path.join(TTAD_GEN_DIR, "../../")
sys.path.append(CRAFTASSIST_DIR)
from generate_data import *
import re
from dialogue_objects.interpreter_helper import coref_resolve, interpret_shape_schematic
from word_maps import SPECIAL_SHAPE_FNS, SPECIAL_SHAPES_CANONICALIZE, SPAWN_OBJECTS
from size_words import size_str_to_int
import shapes
import block_data
import snowballstemmer

stemmer = snowballstemmer.stemmer("english")

# from word2number.w2n import word_to_num

CHOICES = [Move, Build, Destroy, Dig, Copy, Fill, Spawn, Dance]


#############################################################
# modified from size_words...
#############################################################

RANGES = {
    "tiny": (2, 3),
    "small": (2, 3),
    "medium": (2, 4),
    "large": (4, 5),
    "huge": (5, 6),
github arangodb / arangodb / 3rdParty / snowball / python / stemwords.py
import codecs

import snowballstemmer


def stemming(lang, input, output, encoding, pretty):
    stemmer = snowballstemmer.stemmer(lang)
    with codecs.open(output, "w", encoding) as outfile:
        with codecs.open(input, "r", encoding) as infile:
            for original in infile.readlines():
                original = original.strip()
                # Convert only ASCII-letters to lowercase, to match C behavior
                original = ''.join((c.lower() if 'A' <= c <= 'Z' else c for c in original))
                stemmed = stemmer.stemWord(original)
                if pretty == 0:
                    if stemmed != "":
                        outfile.write(stemmed)
                elif pretty == 1:
                    outfile.write(original + " -> " + stemmed)  # write() takes a single string
                elif pretty == 2:
                    outfile.write(original)
                    if len(original) < 30:
                        outfile.write(" " * (30 - len(original)))
github neegor / wanish / wanish / summarizer.py
# lang_identifier, split_multi, word_tokenizer, LANG_CODES and similarity()
# are defined elsewhere in the wanish module.
from itertools import combinations

import networkx as nx
import snowballstemmer


def textrank(text, hdr):
    # identify the most likely language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    # split the text into sentences
    sentences = [sentence for sentence in split_multi(text)]

    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))

    words = [set(stemmer.stemWord(word) for word in word_tokenizer(sentence.lower()) if word.isalpha())
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code