How to use epitran - 10 common examples

To help you get started, we’ve selected a few epitran examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dmort27 / epitran / epitran / bin / testvectorgen.py View on Github external
def main(code, space, infile):
    """Print category/case/orth/phon/vector records for each word in a file.

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        space (str): name of the IPA space to load
        infile (str): path to a UTF-8, tab-separated file whose first column
            holds the words to analyze
    """
    vec = epitran.vector.VectorsWithIPASpace(code, space)
    with codecs.open(infile, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) > 1:
                word = fields[0]
                # Print the text directly; the original's .encode('utf-8')
                # makes Python 3 print byte reprs like b'WORD: ...'.
                print(u"WORD: {}".format(word))
                segs = vec.word_to_segs(word)
                for record in segs:
                    cat, case, orth, phon, id_, vector = record
                    print(u"Category: {}".format(cat))
                    print(u"Case: {}".format(case))
                    print(u"Orthographic: {}".format(orth))
                    print(u"Phonetic: {}".format(phon))
                    print(u"Vector: {}".format(vector))
github dmort27 / epitran / epitran / flite.py View on Github external
def english_g2p(self, text):
        """Convert English text to IPA via the flite ``t2p`` tool.

        Returns the empty-string conversion when ``t2p`` is missing or
        exits with a non-zero status.
        """
        normalized = self.normalize(text)
        arpa = ''
        try:
            raw = subprocess.check_output(['t2p', '"{}"'.format(normalized)])
        except OSError:
            logging.warning('t2p (from flite) is not installed.')
        except subprocess.CalledProcessError:
            logging.warning('Non-zero exit status from t2p.')
        else:
            arpa = raw.decode('utf-8')
        return self.arpa_to_ipa(arpa)


class FliteLexLookup(Flite):
    """Flite G2P using lex_lookup.

    Variant of ``Flite`` that shells out to the ``lex_lookup`` tool
    instead of ``t2p``.
    """

    def arpa_text_to_list(self, arpa_text):
        # lex_lookup wraps its output in delimiters; drop the first and
        # last characters, then split on single spaces.
        return arpa_text[1:-1].split(' ')

    def english_g2p(self, text):
        # NOTE(review): this snippet appears truncated — the final return
        # (presumably self.arpa_to_ipa(arpa_text), mirroring Flite) is not
        # shown here.
        text = self.normalize(text).lower()
        try:
            arpa_text = subprocess.check_output(['lex_lookup', text])
            arpa_text = arpa_text.decode('utf-8')
        except OSError:
            logging.warning('lex_lookup (from flite) is not installed.')
            arpa_text = ''
        except subprocess.CalledProcessError:
            logging.warning('Non-zero exit status from lex_lookup.')
            arpa_text = ''
github dmort27 / epitran / epitran / bin / epitranscribe.py View on Github external
def main(code):
    """Transliterate text read from stdin to IPA on stdout.

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
    """
    epi = epitran.Epitran(code)
    for line in sys.stdin:
        # Python 3: sys.stdin already yields str, so the original
        # line.decode('utf-8') raised AttributeError (and stdout.write
        # rejects the bytes produced by .encode()).
        line = unicodedata.normalize('NFD', line.lower())
        sys.stdout.write(epi.transliterate(line))
github neulab / cmu-ner / utils / segnerfts / orm_morph.py View on Github external
# dictionary.
#
######################################
setSWords = []
# setSList = open("/home/data/LoReHLT17/internal/Lexicons/orm_lexicon/setS_wordlist.txt", "r")
# Read one word per line; the context manager closes the file afterwards
# (the original left the handle open for the life of the process).
with open("../utils/segnerfts/res/setS_wordlist.txt", "r") as setSList:
    for line in setSList:
        setSWords.append(line.strip())

def get_freq_dist():
    """Build a FreqDist over the Brown corpus plus the set-S word list."""
    dist = FreqDist()
    for source in (brown.words(), setSWords):
        dist.update(source)
    return dist
# Module-level Epitran instance for Oromo (Latin script); g2p is a
# convenience alias for its transliterate method.
epi = epitran.Epitran("orm-Latn")
g2p = epi.transliterate

def stripFinalVowel(string):
    """Strip a trailing long (doubled) or short vowel from *string*.

    A final double vowel ("aa", "ee", "ii", "oo", "uu") is removed as a
    unit; otherwise a single final vowel is removed; otherwise the input
    is returned unchanged. The empty string is returned as-is (the
    original raised IndexError on "" via string[-1]).
    """
    if string.endswith(("aa", "ee", "ii", "oo", "uu")):
        return string[:-2]
    if string.endswith(("a", "e", "i", "o", "u")):
        return string[:-1]
    return string

def get_dictionary(dict_filenames):
    """Build a defaultdict(list) mapping L1 entries to L2 translations.

    NOTE(review): this snippet is truncated, and the body below mixes
    tabs and spaces (a TabError under Python 3) — needs reindenting at
    the source.
    """
    l1_to_l2 = defaultdict(list)

    for dict_filename in dict_filenames:
	if os.path.isfile(dict_filename):
    		with open(dict_filename, "r", encoding="utf-8") as fin:
github dmort27 / epitran / epitran / reromanize.py View on Github external
def __init__(self, code, table, decompose=True, cedict_file=None):
        """Construct object for re-romanizing Epitran output.

        This class converts orthographic input, via Epitran, to a more
        conventional romanization that should be more readable to most humans.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
            table (str): Name of re-romanization table
            decompose (bool): apply decomposing normalization
            cedict_file (str): path to a CC-CEDICT dictionary file, forwarded
                unchanged to Epitran; None to omit (presumably used for
                Chinese support — confirm against Epitran's constructor)
        """
        self.epi = epitran.Epitran(code, cedict_file=cedict_file)
        self.mapping = self._load_reromanizer(table, decompose)
github dmort27 / epitran / epitran / space.py View on Github external
def __init__(self, code, space_names):
        """Construct a Space object

        Space objects take strings (corresponding to segments) and return
        integers, placing them in an integer space that can be translated into
        a one-hot vector.

        The resulting object has a dictionary-like interface that supports
        indexing and iteration over "keys".

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
            codes joined to ISO 15924 codes with "-"
        """
        self.epi = Epitran(code)
        # Segment -> integer index mapping built from the named spaces.
        self.dict = self._load_space(space_names)
github dmort27 / epitran / epitran / bin / connl2ipaspace.py View on Github external
def main(code, op, infiles, output):
    """Accumulate an IPA segment space over the input files and print it.

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        op: truthy to scan files with add_file_op, falsy for add_file_gen
        infiles (list): input filenames to scan
        output: destination passed through to print_space
    """
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    # Select the per-file scanner once; the original re-evaluated this
    # loop-invariant conditional on every iteration.
    add_file = add_file_op if op else add_file_gen
    for fn in infiles:
        # Lazy %-style logging args; the original's .encode('utf-8')
        # would log a bytes repr (b'...') under Python 3.
        logging.debug(u'Scanning:\t%s', fn)
        space.update(add_file(epi, ft, fn))
    print_space(output, space)
github dmort27 / epitran / epitran / vector.py View on Github external
def __init__(self, code, space_names):
        """Constructs VectorsWithIPASpace object

        A VectorsWithIPASpace object takes orthographic words, via the
        word_to_segs method, and returns a list of tuples consisting of
        category (letter or punctuation), lettercase, orthographic form,
        phonetic form, id within an IPA space, and articulatory feature
        vector.

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        self.epi = Epitran(code)
        self.space = Space(code, space_names)
github dmort27 / epitran / epitran / simple.py View on Github external
def _load_g2p_map(self, code, rev):
        """Load the code table for the specified language.

        Args:
            code (str): ISO 639-3 code plus "-" plus ISO 15924 code for the
                        language/script to be loaded
            rev (boolean): True for reversing the table (for reverse transliterating)

        Raises:
            DatafileError: if the map file is missing, lacks the
                "Orth"/"Phon" header, or has a malformed row.
        """
        g2p = defaultdict(list)
        gr_by_line = defaultdict(list)
        # Reverse maps live in files suffixed "_rev".
        code += '_rev' if rev else ''
        try:
            path = os.path.join('data', 'map', code + '.csv')
            path = pkg_resources.resource_filename(__name__, path)
        except IndexError:
            raise DatafileError('Add an appropriately-named mapping to the data/maps directory.')
        # NOTE(review): binary mode plus csv.reader(..., encoding=...)
        # implies the unicodecsv package (stdlib csv.reader takes no
        # encoding argument) — confirm which csv this module imports.
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            orth, phon = next(reader)
            if orth != 'Orth' or phon != 'Phon':
                raise DatafileError('Header is ["{}", "{}"] instead of ["Orth", "Phon"].'.format(orth, phon))
            for (i, fields) in enumerate(reader):
                try:
                    graph, phon = fields
                except ValueError:
                    # i is 0-based and excludes the header, hence i + 2.
                    raise DatafileError('Map file is not well formed at line {}.'.format(i + 2))
                # Normalize both sides to NFD so lookups compose predictably.
                graph = unicodedata.normalize('NFD', graph)
                phon = unicodedata.normalize('NFD', phon)
                g2p[graph].append(phon)
                gr_by_line[graph].append(i)
        # NOTE(review): snippet is truncated here — the handling of
        # duplicate graphemes below is incomplete in this view.
        if self._one_to_many_gr_by_line_map(g2p):
            graph, lines = self._one_to_many_gr_by_line_map(gr_by_line)
github dmort27 / epitran / epitran / rules.py View on Github external
# NOTE(review): fragment — the matching `if` and enclosing loop/function
# are not shown; `i` and `line` are bound earlier in the original file.
else:
                line = self._sub_symbols(line)
                # Parse "a -> b / X _ Y" rewrite-rule notation.
                r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
                try:
                    a, b, X, Y = r.groups()
                except AttributeError:
                    # re.match returned None: the rule line did not parse.
                    raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
                # '#' in the environment becomes a regex anchor (^ or $).
                X, Y = X.replace('#', '^'), Y.replace('#', '$')
                # '0' is removed — presumably it denotes the empty string
                # in rule notation; confirm against the rule-file docs.
                a, b = a.replace('0', ''), b.replace('0', '')
                try:
                    # Named groups sw1/sw2 in the source pattern mark a
                    # metathesis (swap) rule.
                    if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
                        return self._fields_to_function_metathesis(a, X, Y)
                    else:
                        return self._fields_to_function(a, b, X, Y)
                except Exception as e:
                    # NOTE(review): the format string below contains a stray
                    # combining diacritic (U+032A) before '{}' — likely
                    # unintended; left byte-identical here.
                    raise DatafileError('Line {}: "{}" cannot be compiled as regex: ̪{}'.format(i + 1, line, e))