How to use Unidecode - 10 common examples

To help you get started, we’ve selected a few Unidecode examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github huseinzol05 / Python-DevOps / basic / 1.autopep8 / malaya / main.py View on Github external
def textcleaning(string):
    """Normalise raw text into a cleaned, lower-case, space-joined string.

    Steps, in order:

    1. drop whitespace-separated tokens containing '#' or '@', then remove
       URLs (``http...`` and ``www. ...``);
    2. transliterate with ``unidecode`` and put a space after every '.' and ',';
    3. delete every character that is not an ASCII letter, a quote, a hyphen
       or a space;
    4. lower-case, tokenize with ``word_tokenize`` and keep the tokens that
       ``isWord`` accepts;
    5. drop tokens containing any entry of ``list_laughing`` and tokens whose
       first half equals their second half (e.g. "haha");
    6. shorten every run of one repeated character to at most two;
    7. drop tokens present in ``STOPWORDS``.

    Returns the remaining tokens joined by single spaces.
    """
    kept = [tok for tok in string.split() if '#' not in tok and '@' not in tok]
    # The dot after "www" is escaped so only a literal "www." starts a URL;
    # unescaped, it matched any character and deleted words such as "wwwhat".
    string = re.sub(r'http\S+|www\.\S+', '', ' '.join(kept))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    # The text was transliterated just above and only ASCII was inserted
    # since, so a second unidecode() pass here would be a no-op.
    string = re.sub(r'[^\'\"A-Za-z\- ]+', '', string)
    tokens = [
        y.strip() for y in word_tokenize(string.lower()) if isWord(y.strip())
    ]
    tokens = [
        y for y in tokens
        if all(k not in y for k in list_laughing)
        and y[:len(y) // 2] != y[len(y) // 2:]
    ]
    string = ' '.join(tokens).lower()
    # groupby yields runs of identical characters; keep at most two per run.
    string = ''.join(''.join(run)[:2] for _, run in itertools.groupby(string))
    return ' '.join([y for y in string.split() if y not in STOPWORDS])
github avian2 / unidecode / tests / test_unidecode.py View on Github external
o = self.unidecode(s)

            self.assertEqual('THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG 1234567890', o)

    def test_enclosed_alphanumerics(self):
        # Enclosed alphanumeric symbols must transliterate to the plain
        # ASCII text below (a, A, 20, (20), 20., 20, 10, 0 concatenated).
        enclosed = _u('ⓐⒶ⑳⒇⒛⓴⓾⓿')
        expected = 'aA20(20)20.20100'
        self.assertEqual(expected, self.unidecode(enclosed))


class TestUnidecode(BaseTestUnidecode, unittest.TestCase):
    # Runs the tests inherited from BaseTestUnidecode against unidecode().
    # staticmethod() stops the function from being bound as an instance
    # method, so self.unidecode(s) passes only s.
    unidecode = staticmethod(unidecode)

class TestUnidecodeExpectASCII(BaseTestUnidecode, unittest.TestCase):
    # Runs the tests inherited from BaseTestUnidecode against
    # unidecode_expect_ascii(); staticmethod() keeps self from being passed
    # as the first argument.
    unidecode = staticmethod(unidecode_expect_ascii)

class TestUnidecodeExpectNonASCII(BaseTestUnidecode, unittest.TestCase):
    # Runs the tests inherited from BaseTestUnidecode against
    # unidecode_expect_nonascii(); staticmethod() keeps self from being
    # passed as the first argument.
    unidecode = staticmethod(unidecode_expect_nonascii)

# Run the unittest runner only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    unittest.main()
github avian2 / unidecode / tests / test_unidecode.py View on Github external
def test_enclosed_alphanumerics(self):
        # Enclosed alphanumeric symbols must transliterate to the plain
        # ASCII text below (a, A, 20, (20), 20., 20, 10, 0 concatenated).
        enclosed = _u('ⓐⒶ⑳⒇⒛⓴⓾⓿')
        expected = 'aA20(20)20.20100'
        self.assertEqual(expected, self.unidecode(enclosed))


class TestUnidecode(BaseTestUnidecode, unittest.TestCase):
    # Runs the tests inherited from BaseTestUnidecode against unidecode().
    # staticmethod() stops the function from being bound as an instance
    # method, so self.unidecode(s) passes only s.
    unidecode = staticmethod(unidecode)

class TestUnidecodeExpectASCII(BaseTestUnidecode, unittest.TestCase):
    # Runs the tests inherited from BaseTestUnidecode against
    # unidecode_expect_ascii(); staticmethod() keeps self from being passed
    # as the first argument.
    unidecode = staticmethod(unidecode_expect_ascii)

class TestUnidecodeExpectNonASCII(BaseTestUnidecode, unittest.TestCase):
    # Runs the tests inherited from BaseTestUnidecode against
    # unidecode_expect_nonascii(); staticmethod() keeps self from being
    # passed as the first argument.
    unidecode = staticmethod(unidecode_expect_nonascii)

# Run the unittest runner only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    unittest.main()
github Wordseer / wordseer / app / corenlp / corenlp.py View on Github external
def parse_parser_results(text):
    """ This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and then returns a Python list of dictionaries, one for each parsed
    sentence.
    """
    results = {"sentences": []}
    state = STATE_START
    lines = unidecode(text.decode('utf-8')).split("\n")
    for index, line in enumerate(lines):
        line = line.strip()

        if line.startswith("Sentence #"):
            sentence = {'words': [], 'parsetree': [], 'dependencies': []}
            results["sentences"].append(sentence)
            state = STATE_TEXT

        elif state == STATE_TEXT:
            sentence['text'] = remove_escapes(line)
            state = STATE_WORDS

        elif state == STATE_WORDS:
            if not line.startswith("[Text="):
                raise ParserError('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
github ekansa / open-context-py / opencontext_py / apps / exports / exptables / templating.py View on Github external
"""
        consolidated_ids = []
        consolidated_tuple_list = []
        look_person_list = raw_person_list
        last_count = self.exp_tab.row_count
        for raw_person in raw_person_list:
            # print('ID consolidated: ' + str(len(consolidated_ids)))
            if 'count' in raw_person:
                count = float(raw_person['count'])
            else:
                count = last_count
            last_count = count
            act_name = raw_person['label']
            if not isinstance(act_name, str):
                act_name = '[Not named]'
            act_uniname = unidecode(act_name)
            act_id = raw_person['id']
            for look_person in look_person_list:
                look_id = look_person['id']
                if not isinstance(look_person['label'], str):
                    look_person['label'] = '[Not named]'
                if look_id != act_id and \
                   look_id not in consolidated_ids and \
                   (act_name == look_person['label'] or act_uniname == unidecode(look_person['label'])):
                    # same name but different record for a person,
                    # lets consolidate it
                    consolidated_ids.append(look_person['id'])
                    count += float(look_person['count'])
            if act_id not in consolidated_ids:
                # print('Adding ' + str(unidecode(act_name)))
                person_tuple = (act_name, count)
                consolidated_tuple_list.append(person_tuple)
github spro / char-rnn.pytorch / helpers.py View on Github external
def read_file(filename):
    """Read a text file and return its ASCII transliteration and length.

    The file is opened in text mode with the platform default encoding,
    read in full and passed through ``unidecode.unidecode``.

    Returns a ``(text, length)`` tuple where ``length`` is ``len(text)``
    of the transliterated string.
    """
    # The context manager closes the handle even if read() raises; the
    # original left it open until garbage collection.
    with open(filename) as handle:
        text = unidecode.unidecode(handle.read())
    return text, len(text)
github maximilianh / pubMunch / lib / pubCrawlLib-broken.py View on Github external
def writeDocIdStatus(outDir, pmid, msg, longMsg, details=None):
    """Append one tab-separated status line to the doc status file in outDir.

    The line holds ``str(pmid)`` and the unidecode-transliterated ``msg``,
    followed by ``longMsg`` and ``details`` (also transliterated) when they
    are not None. The same fields are logged, comma-joined, at INFO level.

    outDir  -- directory holding the PMIDSTATNAME status file
    pmid    -- document identifier, converted with str()
    msg     -- short status text
    longMsg -- longer description, or None to omit the column
    details -- extra details, or None (default) to omit the column
    """
    row = [str(pmid), unidecode.unidecode(msg)]
    if longMsg is not None:
        row.append(unidecode.unidecode(longMsg))
    if details is not None:
        row.append(unidecode.unidecode(details))
    # NOTE(review): encoding to utf8 and then joining with str separators
    # relies on Python 2 string semantics; under Python 3 this yields bytes
    # and the joins below would raise TypeError -- confirm the interpreter.
    row = [ x.encode('utf8') for x in row ]
    logging.info('Document status (pmid, logType, desc, details): %s' % ','.join(row))

    fname = join(outDir, PMIDSTATNAME)
    # Mode 'a' creates the file when it does not exist, so no isfile() check
    # is needed. The with-block guarantees the handle is closed; the original
    # never closed it. The file is opened only after the row has been built,
    # so a failure above no longer leaves an open handle behind.
    with open(fname, 'a') as outFh:
        outFh.write('\t'.join(row) + '\n')
github project-alice-assistant / ProjectAliceSkills / Tools / JsonValidator / src / DialogValidation.py View on Github external
def searchMissingSlotValues(values: list, slot: dict) -> list:
	"""Return the entries of ``values`` that the slot definition does not cover.

	Comparison is done on the ``unidecode``-transliterated, lower-cased form
	of each string, against every ``slot['values'][i]['value']`` and every
	entry of its optional ``'synonyms'`` list.

	:param values: strings to look up
	:param slot: slot definition; read keys are ``'automaticallyExtensible'``,
		``'values'`` and, per value, ``'value'`` and optional ``'synonyms'``
	:return: the original (un-normalised) strings from ``values`` with no
		match, in input order; an empty list when the slot is automatically
		extensible
	"""
	if slot['automaticallyExtensible']:
		return list()

	# A set gives O(1) membership tests; the original scanned a list once
	# per entry of ``values``.
	knownValues = set()
	for slotValue in slot['values']:
		knownValues.add(unidecode(slotValue['value']).lower())
		knownValues.update(unidecode(x).lower() for x in slotValue.get('synonyms', list()))

	return [value for value in values if unidecode(value).lower() not in knownValues]
github vrypan / bucket3 / bucket3 / b3tools.py View on Github external
def slugify(str):
    """Turn a (possibly percent-encoded) string into a lower-case ASCII slug.

    The input is URL-unquoted, transliterated with ``unidecode.unidecode``
    and lower-cased; every run of non-word characters is then replaced by a
    single '-'.
    """
    # Credit: http://stackoverflow.com/a/8366771
    unquoted = urllib.parse.unquote(str)
    ascii_lower = unidecode.unidecode(unquoted).lower()
    return re.sub(r'\W+', '-', ascii_lower)
github Pinafore / qb / bin / run_clm.py View on Github external
for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)
                                     ]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki, flags.wiki_location,
                                         qb, flags.question_db,
                                         source, flags.source_location,
                                         flags.max_pages,
                                         min_pages=min_answers):
            norm_title = lm.normalize_title(corpus, title)
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                print("Adding train doc %i, %s (%s)" %
                      (doc_num, unidecode(title), corpus))
                start = time.time()
            lm.add_train(norm_title, text)
            lm.add_train("compare_%i" % lm.compare(norm_title), text)

    print("Done training")
    if flags.lm_out:
        # Create the extractor object and write out the pickle
        o = open(flags.lm_out, 'w')
        lm.write_lm(o)

Unidecode

ASCII transliterations of Unicode text

GPL-2.0
Latest version published 12 months ago

Package Health Score

81 / 100
Full package analysis