How to use the smt.utils.utility.mkcorpus function in smt

To help you get started, we’ve selected a few smt examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github kenkov / smt / test / test_phrase.py View on Github external
def test_symmetrization(self):
        # Verify that symmetrization() yields the expected set of index
        # pairs for one sentence pair, given a small parallel corpus.
        sentence_pairs = [
            ("僕 は 男 です", "I am a man"),
            ("私 は 女 です", "I am a girl"),
            ("私 は 先生 です", "I am a teacher"),
            ("彼女 は 先生 です", "She is a teacher"),
            ("彼 は 先生 です", "He is a teacher"),
        ]
        # The sentence pair under test, tokenized on whitespace.
        first_tokens = "私 は 先生 です".split()
        second_tokens = "I am a teacher".split()
        expected = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
        result = symmetrization(first_tokens,
                                second_tokens,
                                mkcorpus(sentence_pairs))
        self.assertEqual(result, expected)
github kenkov / smt / test / test_phrase.py View on Github external
(('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
                    ('dass', 'er', 'im', 'haus', 'bleibt')),
                   (('will', 'stay'), ('bleibt',)),
                   (('will', 'stay', 'in', 'the', 'house'),
                    ('im', 'haus', 'bleibt'))])
        self.assertEqual(phrase_extract(es, fs, alignment), ans)

        # another test
        es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
        sentenses = [("僕 は 男 です", "I am a man"),
                     ("私 は 女 です", "I am a girl"),
                     ("私 は 先生 です", "I am a teacher"),
                     ("彼女 は 先生 です", "She is a teacher"),
                     ("彼 は 先生 です", "He is a teacher"),
                     ]
        corpus = mkcorpus(sentenses)
        alignment = symmetrization(es, fs, corpus)
        ans = set([(('\xe3\x81\xaf',
                     '\xe5\x85\x88\xe7\x94\x9f',
                     '\xe3\x81\xa7\xe3\x81\x99'),
                    ('a', 'teacher')),
                   (('\xe5\x85\x88\xe7\x94\x9f',), ('teacher',)),
                   (('\xe7\xa7\x81',), ('I', 'am')),
                   (('\xe7\xa7\x81',
                     '\xe3\x81\xaf',
                     '\xe5\x85\x88\xe7\x94\x9f',
                     '\xe3\x81\xa7\xe3\x81\x99'),
                    ('I', 'am', 'a', 'teacher'))])
        self.assertEqual(phrase_extract(es, fs, alignment), ans)
github kenkov / smt / smt / phrase / word_alignment.py View on Github external
(8, 8), (9, 9), (5, 10), (6, 10)]
    from smt.utils.utility import matrix
    print(matrix(len(es), len(fs), e2f, es, fs))
    print(matrix(len(es), len(fs), f2e, es, fs))
    ali = _alignment(es, fs, e2f, f2e)
    print(matrix(len(es), len(fs), ali, es, fs))

    # test for symmetrization
    from smt.utils.utility import mkcorpus
    sentenses = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher"),
                 ]
    corpus = mkcorpus(sentenses)
    es = "私 は 先生 です".split()
    fs = "I am a teacher".split()
    syn = symmetrization(es, fs, corpus)
    pprint(syn)
    print(matrix(len(es), len(fs), syn, es, fs))
github kenkov / smt / smt / phrase / phrase_extract.py View on Github external
#                 (6, 10),
    #                 (7, 8),
    #                 (8, 8),
    #                 (9, 9)])
    #pprint(phrase_extract(es, fs, alignment))

    # test2
    from smt.utils.utility import mkcorpus
    from word_alignment import symmetrization
    sentenses = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher"),
                 ]
    corpus = mkcorpus(sentenses)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)
    for e, f in ext:
        print(' '.join(e), "<->", ' '.join(f))

    ## phrases
    fs = "I am a teacher".split()
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
github kenkov / smt / smt / phrase / phrase_extract.py View on Github external
def test_available_phrases():
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentenses = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher"),
                 ]

    corpus = mkcorpus(sentenses)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)
    ans = ("は 先生 です <-> a teacher",
           "先生 <-> teacher"
           "私 <-> I am"
           "私 は 先生 です <-> I am a teacher")
    for e, f in ext:
        print("{} {} {}".format(' '.join(e), "<->", ' '.join(f)))

    ## phrases
    fs = "I am a teacher".split()
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
    ans = {((1, 'I'), (2, 'am')),
           ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
github kenkov / smt / smt / ibmmodel / ibmmodel2.py View on Github external
def train(sentences, loop_count=1000):
    """Build a corpus from ``sentences`` and train on it.

    ``sentences`` is handed to ``utility.mkcorpus`` unchanged; the resulting
    corpus and ``loop_count`` are forwarded to ``_train``, whose result is
    returned as-is.

    NOTE(review): ``loop_count`` is presumably the number of training
    iterations -- confirm against ``_train``.
    """
    return _train(utility.mkcorpus(sentences), loop_count)