# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def main(code, space, infile):
    """Print the IPA-space analysis of every word in a tab-separated file.

    Args:
        code (str): ISO 639-3 and ISO 15924 codes joined with a hyphen,
            selecting the Epitran mapping
        space (str): name of the IPA space
        infile (str): path to a UTF-8, tab-separated file whose first column
            holds the word to analyze
    """
    vec = epitran.vector.VectorsWithIPASpace(code, space)
    with codecs.open(infile, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split('\t')
            # Skip lines without at least two tab-separated fields.
            if len(fields) <= 1:
                continue
            word = fields[0]
            print(u"WORD: {}".format(word).encode('utf-8'))
            for cat, case, orth, phon, id_, vector in vec.word_to_segs(word):
                print(u"Category: {}".format(cat).encode('utf-8'))
                print(u"Case: {}".format(case).encode('utf-8'))
                print(u"Orthographic: {}".format(orth).encode('utf-8'))
                print(u"Phonetic: {}".format(phon).encode('utf-8'))
                print(u"Vector: {}".format(vector).encode('utf-8'))
def english_g2p(self, text):
    """Convert English text to IPA using the ``t2p`` binary from flite.

    Args:
        text (unicode): English text to transcribe

    Returns:
        unicode: IPA transcription; empty transcription on failure
    """
    text = self.normalize(text)
    try:
        # BUG FIX: with a list argv there is no shell, so the original
        # '"{}"'.format(text) sent literal double-quote characters to t2p as
        # part of the word.  Pass the bare text, as the lex_lookup-based
        # sibling method does.
        arpa_text = subprocess.check_output(['t2p', text])
        arpa_text = arpa_text.decode('utf-8')
    except OSError:
        logging.warning('t2p (from flite) is not installed.')
        arpa_text = ''
    except subprocess.CalledProcessError:
        logging.warning('Non-zero exit status from t2p.')
        arpa_text = ''
    return self.arpa_to_ipa(arpa_text)
class FliteLexLookup(Flite):
    """Flite G2P using lex_lookup."""

    def arpa_text_to_list(self, arpa_text):
        """Strip the enclosing delimiter characters and split on spaces."""
        return arpa_text[1:-1].split(' ')

    def english_g2p(self, text):
        """Convert English text to IPA using the ``lex_lookup`` binary.

        Args:
            text (unicode): English text to transcribe

        Returns:
            unicode: IPA transcription; empty transcription on failure
        """
        text = self.normalize(text).lower()
        try:
            arpa_text = subprocess.check_output(['lex_lookup', text])
            arpa_text = arpa_text.decode('utf-8')
        except OSError:
            logging.warning('lex_lookup (from flite) is not installed.')
            arpa_text = ''
        except subprocess.CalledProcessError:
            logging.warning('Non-zero exit status from lex_lookup.')
            arpa_text = ''
        # BUG FIX: the original fell off the end and implicitly returned None;
        # convert the ARPAbet output to IPA and return it, mirroring the
        # t2p-based sibling implementation.
        return self.arpa_to_ipa(arpa_text)
def main(code):
    """Transliterate stdin to IPA line by line (Python 2 byte-stream I/O).

    Args:
        code (str): ISO 639-3 and ISO 15924 codes joined with a hyphen,
            selecting the Epitran mapping
    """
    epi = epitran.Epitran(code)
    for raw in sys.stdin:
        text = unicodedata.normalize('NFD', raw.decode('utf-8').lower())
        sys.stdout.write(epi.transliterate(text).encode('utf-8'))
# dictionary.
#
######################################
# Words from the setS word list, one per line, whitespace-stripped.
setSWords = []
# setSList = open("/home/data/LoReHLT17/internal/Lexicons/orm_lexicon/setS_wordlist.txt", "r")
# BUG FIX: the original file handle was opened and never closed; `with`
# guarantees the handle is released even if reading raises.
with open("../utils/segnerfts/res/setS_wordlist.txt", "r") as setSList:
    for line in setSList:
        setSWords.append(line.strip())
def get_freq_dist():
    """Build a word-frequency distribution over the Brown corpus plus the
    setS word list."""
    dist = FreqDist()
    for word_source in (brown.words(), setSWords):
        dist.update(word_source)
    return dist
# Module-level Epitran transliterator for Oromo in Latin script ("orm-Latn"),
# constructed once at import time.
epi = epitran.Epitran("orm-Latn")
# Alias so callers can write g2p(text) for grapheme-to-phoneme conversion.
g2p = epi.transliterate
def stripFinalVowel(string):
    """Strip a word-final vowel, treating doubled (long) vowels as one unit.

    A final long vowel ("aa", "ee", "ii", "oo", "uu") is removed as a pair;
    otherwise a single final short vowel is removed.  Words ending in a
    consonant are returned unchanged.

    Args:
        string (str): word to strip

    Returns:
        str: the word without its final vowel (possibly empty)
    """
    # BUG FIX: the original indexed string[-1] unconditionally, raising
    # IndexError on the empty string.
    if not string:
        return string
    if string.endswith(("aa", "ee", "ii", "oo", "uu")):
        return string[:-2]
    if string[-1] in "aeiou":
        return string[:-1]
    return string
def get_dictionary(dict_filenames):
    """Load L1-to-L2 dictionary entries from a list of dictionary files.

    NOTE(review): this definition is truncated in this view — the body of
    the ``with`` block is not visible, so the entry format and return value
    cannot be documented here.

    Args:
        dict_filenames (list): paths to dictionary files; paths that are not
            existing files are silently skipped
    """
    l1_to_l2 = defaultdict(list)  # maps an L1 key to a list of L2 values
    for dict_filename in dict_filenames:
        if os.path.isfile(dict_filename):
            with open(dict_filename, "r", encoding="utf-8") as fin:
def __init__(self, code, table, decompose=True, cedict_file=None):
    """Construct object for re-romanizing Epitran output.

    This class converts orthographic input, via Epitran, to a more
    conventional romanization that should be more readable to most humans.

    Args:
        code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
        table (str): Name of re-romanization table
        decompose (bool): apply decomposing normalization
        cedict_file (str): path passed through to the Epitran constructor
            (presumably a CC-CEDICT dictionary used for Mandarin — confirm
            against the Epitran class)
    """
    self.epi = epitran.Epitran(code, cedict_file=cedict_file)
    self.mapping = self._load_reromanizer(table, decompose)
def __init__(self, code, space_names):
    """Construct a Space object.

    Space objects take strings (corresponding to segments) and return
    integers, placing them in an integer space that can be translated into
    a one-hot vector.

    The resulting object has a dictionary-like interface that supports
    indexing and iteration over "keys".

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        space_names (list): list of space names consisting of ISO 639-3
            codes joined to ISO 15924 codes with "-"
    """
    self.epi = Epitran(code)  # orthography -> IPA transliterator
    self.dict = self._load_space(space_names)  # backing segment-to-int map
def main(code, op, infiles, output):
    """Accumulate a segment space over the input files and print it.

    Args:
        code (str): ISO 639-3 and ISO 15924 codes joined with a hyphen,
            selecting the Epitran mapping
        op (bool): if truthy, scan files with ``add_file_op``; otherwise
            with ``add_file_gen``
        infiles (list): names of the input files to scan
        output (str): destination passed through to ``print_space``
    """
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    # `op` never changes between iterations, so choose the scanning
    # function once instead of re-selecting it inside the loop.
    add_file = add_file_op if op else add_file_gen
    space = Counter()
    for fn in infiles:
        logging.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        space.update(add_file(epi, ft, fn))
    print_space(output, space)
def __init__(self, code, space_names):
    """Construct a VectorsWithIPASpace object.

    A VectorsWithIPASpace object takes orthographic words, via the
    word_to_segs method, and returns a list of tuples consisting of category
    (letter or punctuation), lettercase, orthographic form, phonetic form,
    id within an IPA space, and articulatory feature vector.

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        space_names (list): list of space names consisting of ISO 639-3
            codes joined to ISO 15924 codes with "-"
    """
    self.epi = Epitran(code)  # orthography -> IPA transliterator
    self.space = Space(code, space_names)  # segment -> integer-id space
def _load_g2p_map(self, code, rev):
    """Load the code table for the specified language.

    Args:
        code (str): ISO 639-3 code plus "-" plus ISO 15924 code for the
            language/script to be loaded
        rev (boolean): True for reversing the table (for reverse
            transliterating)

    Raises:
        DatafileError: the mapping file cannot be located or is malformed
    """
    g2p = defaultdict(list)  # grapheme -> list of phoneme strings
    gr_by_line = defaultdict(list)  # grapheme -> data-row indices where it occurs
    # Reverse tables live beside forward ones under a "_rev" suffix.
    code += '_rev' if rev else ''
    try:
        path = os.path.join('data', 'map', code + '.csv')
        path = pkg_resources.resource_filename(__name__, path)
    except IndexError:
        raise DatafileError('Add an appropriately-named mapping to the data/maps directory.')
    with open(path, 'rb') as f:
        # NOTE(review): the stdlib csv.reader takes no `encoding` argument;
        # this presumably relies on a csv-compatible module (e.g. unicodecsv)
        # imported as `csv` — confirm against the file's imports.
        reader = csv.reader(f, encoding='utf-8')
        # The first row must be the literal header ["Orth", "Phon"].
        orth, phon = next(reader)
        if orth != 'Orth' or phon != 'Phon':
            raise DatafileError('Header is ["{}", "{}"] instead of ["Orth", "Phon"].'.format(orth, phon))
        for (i, fields) in enumerate(reader):
            try:
                graph, phon = fields
            except ValueError:
                # i + 2: account for the header row and 1-based line numbers.
                raise DatafileError('Map file is not well formed at line {}.'.format(i + 2))
            # NFD-normalize both sides so later lookups are canonical.
            graph = unicodedata.normalize('NFD', graph)
            phon = unicodedata.normalize('NFD', phon)
            g2p[graph].append(phon)
            gr_by_line[graph].append(i)
    if self._one_to_many_gr_by_line_map(g2p):
        graph, lines = self._one_to_many_gr_by_line_map(gr_by_line)
        # NOTE(review): this function is truncated in this view — the
        # handling of the one-to-many case and the return value are not
        # visible; indentation of this trailing `if` is reconstructed.
# NOTE(review): orphaned fragment — this `else:` branch belongs to a sound-
# rule compiling method whose `def` header (and the `if` matching this
# `else:`) is not visible in this view.  Code left byte-identical; layout
# cannot be reconstructed safely, so only comments are added.
else:
# Substitute symbol macros, then parse "a -> b / X _ Y" rewrite notation.
line = self._sub_symbols(line)
r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
try:
a, b, X, Y = r.groups()
except AttributeError:
raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
# '#' marks a word boundary: anchor X at string start and Y at string end.
X, Y = X.replace('#', '^'), Y.replace('#', '$')
# '0' denotes the empty string (insertion/deletion rules).
a, b = a.replace('0', ''), b.replace('0', '')
try:
# Rules whose source side uses named groups sw1/sw2 swap two segments.
if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
return self._fields_to_function_metathesis(a, X, Y)
else:
return self._fields_to_function(a, b, X, Y)
except Exception as e:
raise DatafileError('Line {}: "{}" cannot be compiled as regex: ̪{}'.format(i + 1, line, e))