How to use cltk - 10 common examples

To help you get started, we’ve selected a few cltk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github cltk / cltk / cltk / phonology / orthophonology.py View on Github external
def matches(self, other):
        """Delegate ``matches`` to the phoneme built from this feature.

        The feature is wrapped with ``make_phoneme`` and the call is forwarded
        to that object's own ``matches`` method with ``other`` unchanged.
        """
        as_phoneme = make_phoneme(self)
        return as_phoneme.matches(other)

    def __eq__(self, other):
        """Compare two feature values, requiring the exact same class.

        Values of different types are never equal, even if their integer
        values coincide; same-type values fall back to ``IntEnum.__eq__``.
        """
        # Exact type comparison (not isinstance) is deliberate here.
        if type(self) != type(other):
            return False
        return IntEnum.__eq__(self, other)

    def __floordiv__(self, other):
        """Apply ``//`` to the phoneme built from this feature and ``other``.

        The feature is wrapped with ``make_phoneme`` first; the meaning of
        ``//`` is whatever that phoneme object defines.
        """
        as_phoneme = make_phoneme(self)
        return as_phoneme // other


class Consonantal(PhonologicalFeature):
    """Two-valued phonological feature with members ``neg`` and ``pos``.

    Member values are assigned by ``auto()`` in declaration order.
    Presumably ``pos`` marks a consonantal sound and ``neg`` a
    non-consonantal one -- confirm against the callers.
    """
    neg = auto()
    pos = auto()


class Voiced(PhonologicalFeature):
    """Two-valued phonological feature with members ``neg`` and ``pos``.

    Member values are assigned by ``auto()`` in declaration order.
    Presumably ``pos`` marks a voiced sound and ``neg`` a voiceless one --
    confirm against the callers.
    """
    neg = auto()
    pos = auto()


class Aspirated(PhonologicalFeature):
    """Two-valued phonological feature with members ``neg`` and ``pos``.

    Member values are assigned by ``auto()`` in declaration order.
    Presumably ``pos`` marks an aspirated sound and ``neg`` an unaspirated
    one -- confirm against the callers.
    """
    neg = auto()
    pos = auto()


class Geminate(PhonologicalFeature):
    """Two-valued phonological feature with members ``neg`` and ``pos``.

    Member values are assigned by ``auto()`` in declaration order.
    Presumably ``pos`` marks a geminate (doubled) sound and ``neg`` a
    single one -- confirm against the callers.
    """
    neg = auto()
    pos = auto()


class Roundedness(PhonologicalFeature):
    neg = auto()
github cltk / cltk / cltk / phonology / orthophonology.py View on Github external
return make_phoneme(self) <= other

    def __ge__(self, other):
        """Apply ``>=`` to the phoneme built from this feature and ``other``.

        The feature is wrapped with ``make_phoneme`` first; the ordering used
        is whatever that phoneme object defines.
        """
        as_phoneme = make_phoneme(self)
        return as_phoneme >= other

    def matches(self, other):
        """Delegate ``matches`` to the phoneme built from this feature.

        The feature is wrapped with ``make_phoneme`` and the call is forwarded
        to that object's own ``matches`` method with ``other`` unchanged.
        """
        as_phoneme = make_phoneme(self)
        return as_phoneme.matches(other)

    def __eq__(self, other):
        """Compare two feature values, requiring the exact same class.

        Values of different types are never equal, even if their integer
        values coincide; same-type values fall back to ``IntEnum.__eq__``.
        """
        # Exact type comparison (not isinstance) is deliberate here.
        if type(self) != type(other):
            return False
        return IntEnum.__eq__(self, other)

    def __floordiv__(self, other):
        """Apply ``//`` to the phoneme built from this feature and ``other``.

        The feature is wrapped with ``make_phoneme`` first; the meaning of
        ``//`` is whatever that phoneme object defines.
        """
        as_phoneme = make_phoneme(self)
        return as_phoneme // other


class Consonantal(PhonologicalFeature):
    """Two-valued phonological feature with members ``neg`` and ``pos``.

    Member values are assigned by ``auto()`` in declaration order.
    Presumably ``pos`` marks a consonantal sound and ``neg`` a
    non-consonantal one -- confirm against the callers.
    """
    neg = auto()
    pos = auto()


class Voiced(PhonologicalFeature):
    """Two-valued phonological feature with members ``neg`` and ``pos``.

    Member values are assigned by ``auto()`` in declaration order.
    Presumably ``pos`` marks a voiced sound and ``neg`` a voiceless one --
    confirm against the callers.
    """
    neg = auto()
    pos = auto()


class Aspirated(PhonologicalFeature):
    """Two-valued phonological feature with members ``neg`` and ``pos``.

    Member values are assigned by ``auto()`` in declaration order.
    Presumably ``pos`` marks an aspirated sound and ``neg`` an unaspirated
    one -- confirm against the callers.
    """
    neg = auto()
    pos = auto()


class Geminate(PhonologicalFeature):
    neg = auto()
github cltk / cltk / cltk / corpus / greek / tei.py View on Github external
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext."""
    file = os.path.expanduser(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek/data/tlg0627/tlg021/tlg0627.tlg021.1st1K-grc1.xml')
    xml_dir = os.path.normpath(get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.normpath(get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        xml_name = xml_name.rstrip('.xml')
        xml_name += '.txt'

        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
github cltk / cltk / cltk / corpus / utils / importer.py View on Github external
# git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
        # self._download_corpus(corpus_type, corpus_name, path)
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
        type_dir = os.path.expanduser(type_dir_rel)
        repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
        repo_name = repo_name.rstrip('.git')
        target_dir = os.path.join(type_dir, repo_name)
        target_file = os.path.join(type_dir, repo_name, 'README.md')
        # check if corpus already present
        # if not, clone
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
                logger.info(msg)
                Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                                progress=ProgressPrinter())
            except CorpusImportError as corpus_imp_err:
                msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
                logger.error(msg)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
                logger.info(msg)
                git_origin.pull()
            except CorpusImportError as corpus_imp_err:
                msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
github cltk / cltk / cltk / corpus / utils / importer.py View on Github external
logger.info(msg)
            if corpus_name in ('phi5', 'phi7', 'tlg'):
                if corpus_name == 'phi5':
                    # normalize path for checking dir
                    if local_path.endswith('/'):
                        local_path = local_path[:-1]
                    # check for right corpus dir
                    if os.path.split(local_path)[1] != 'PHI5':
                        logger.info("Directory must be named 'PHI5'.")
                if corpus_name == 'phi7':
                    # normalize local_path for checking dir
                    if local_path.endswith('/'):
                        local_path = local_path[:-1]
                    # check for right corpus dir
                    if os.path.split(local_path)[1] != 'PHI7':
                        logger.info("Directory must be named 'PHI7'.")
                if corpus_name == 'tlg':
                    # normalize path for checking dir
                    if local_path.endswith('/'):
                        local_path = local_path[:-1]
                    # check for right corpus dir
                    if os.path.split(local_path)[1] != 'TLG_E':
                        logger.info("Directory must be named 'TLG_E'.")
                # move the dir-checking commands into a function
                data_dir = os.path.expanduser(CLTK_DATA_DIR)
                originals_dir = os.path.join(data_dir, 'originals')
                # check for `originals` dir; if not present mkdir
                if not os.path.isdir(originals_dir):
                    os.makedirs(originals_dir)
                    msg = "Wrote directory at '{}'.".format(originals_dir)
                    logger.info(msg)
                tlg_originals_dir = os.path.join(data_dir,
github cltk / cltk / cltk / tag / lapos.py View on Github external
def tag_sentence(self, sentence):
        """Tag using Lapos model.

        TODO: Figure out how to pre-load model (loading is really slow). Or force users to bulk-convert files or strings.
        """
        fp_lapos = os.path.expanduser('~/cltk_data/multilingual/software/lapos')
        fp_model = os.path.expanduser('~/cltk_data/{0}/model/{1}_models_cltk/taggers/pos'.format(self.language, self.language))  # rel from Lapos dir
        try:
            lapos_command = 'cd {0} && echo "{1}" | ./lapos -t -m {2}'.format(fp_lapos, sentence, fp_model)
            p_out = subprocess.check_output(lapos_command,
                                            shell=True,
                                            stderr=subprocess.STDOUT,
                                            universal_newlines=True)
        except subprocess.CalledProcessError as cp_err:
            logger.error('Lapos call failed. Check installation.')
            logger.error(sentence)
            print(cp_err)
            raise

        # Parse output from Lapos
        # TODO: Make this cleaner/faster
        output_list = p_out.split('\n')
        output_list_filtered = [l for l in output_list if not l.startswith('loading the models')]
        output_list_filtered = [l for l in output_list_filtered if not l == 'done']
        output_list_filtered = [l for l in output_list_filtered if l]

        for line in output_list_filtered:
            word_tags = line.split(' ')
            tagged_sentence = []
            for word_tag in word_tags:
                word, tag = word_tag.split('/')
                word_tag_tuple = (word, tag)
github cltk / cltk / cltk / prosody / latin / VerseScanner.py View on Github external
words = line.split(" ")
        space_list = StringUtils.space_list(line)
        corrected_words = []
        for word in words:
            found = False
            for prefix in self.constants.PREFIXES:
                if word.startswith(prefix) and word != prefix:
                    corrected_words.append(self.syllabifier.convert_consonantal_i(prefix))
                    corrected_words.append(
                        self.syllabifier.convert_consonantal_i(word[len(prefix):]))
                    found = True
                    break
            if not found:
                corrected_words.append(self.syllabifier.convert_consonantal_i(word))
        new_line = StringUtils.join_syllables_spaces(corrected_words, space_list)
        char_list = StringUtils.overwrite(list(new_line),
                                          r"\b[iī][{}]".format(
                                              self.constants.VOWELS + self.constants.ACCENTED_VOWELS),
                                          "j")
        char_list = StringUtils.overwrite(char_list,
                                          r"\b[I][{}]".format(self.constants.VOWELS_WO_I),
                                          "J")
        char_list = StringUtils.overwrite(char_list, r"[{}][i][{}]".format(
            self.constants.VOWELS_WO_I, self.constants.VOWELS),
                                          "j", 1)
        return "".join(char_list)
github cltk / cltk / cltk / prosody / latin / VerseScanner.py View on Github external
def transform_i_to_j(self, line: str) -> str:
        """Transform instances of consonantal i to j
        :param line:
        :return:

        >>> print(VerseScanner().transform_i_to_j("iactātus"))
        jactātus
        >>> print(VerseScanner().transform_i_to_j("bracchia"))
        bracchia
        """

        words = line.split(" ")
        space_list = StringUtils.space_list(line)
        corrected_words = []
        for word in words:
            found = False
            for prefix in self.constants.PREFIXES:
                if word.startswith(prefix) and word != prefix:
                    corrected_words.append(self.syllabifier.convert_consonantal_i(prefix))
                    corrected_words.append(
                        self.syllabifier.convert_consonantal_i(word[len(prefix):]))
                    found = True
                    break
            if not found:
                corrected_words.append(self.syllabifier.convert_consonantal_i(word))
        new_line = StringUtils.join_syllables_spaces(corrected_words, space_list)
        char_list = StringUtils.overwrite(list(new_line),
                                          r"\b[iī][{}]".format(
                                              self.constants.VOWELS + self.constants.ACCENTED_VOWELS),
github cltk / cltk / cltk / prosody / latin / VerseScanner.py View on Github external
words = line.split(" ")
        space_list = StringUtils.space_list(line)
        corrected_words = []
        for word in words:
            found = False
            for prefix in self.constants.PREFIXES:
                if word.startswith(prefix) and word != prefix:
                    corrected_words.append(self.syllabifier.convert_consonantal_i(prefix))
                    corrected_words.append(
                        self.syllabifier.convert_consonantal_i(word[len(prefix):]))
                    found = True
                    break
            if not found:
                corrected_words.append(self.syllabifier.convert_consonantal_i(word))
        new_line = StringUtils.join_syllables_spaces(corrected_words, space_list)
        char_list = StringUtils.overwrite(list(new_line),
                                          r"\b[iī][{}]".format(
                                              self.constants.VOWELS + self.constants.ACCENTED_VOWELS),
                                          "j")
        char_list = StringUtils.overwrite(char_list,
                                          r"\b[I][{}]".format(self.constants.VOWELS_WO_I),
                                          "J")
        char_list = StringUtils.overwrite(char_list, r"[{}][i][{}]".format(
            self.constants.VOWELS_WO_I, self.constants.VOWELS),
                                          "j", 1)
        return "".join(char_list)
github cltk / cltk / cltk / corpus / utils / importer.py View on Github external
def _get_corpus_properties(self, corpus_name):
        """Look up the property record of a corpus available for import.

        Scans ``self.all_corpora`` in order and returns the first entry whose
        ``'name'`` key equals ``corpus_name``.

        :type corpus_name: str
        :param corpus_name: Name of available corpus.
        :rtype : dict
        :returns: The matching entry of ``self.all_corpora`` (the entries are
            indexed by the ``'name'`` key, so they are expected to be dicts).
        :raises CorpusImportError: If the corpus list cannot be read, or if no
            entry is named ``corpus_name``.
        """
        try:
            corpora = self.all_corpora
        except (NameError, AttributeError) as lookup_error:
            # A missing attribute raises AttributeError, not NameError; catch
            # both so the caller always sees a CorpusImportError here.
            msg = 'Corpus not available for language ' \
                  '"%s": %s' % (self.language, lookup_error)
            logger.error(msg)
            # Keep the original exception as the cause for easier debugging.
            raise CorpusImportError(msg) from lookup_error
        for corpus_properties in corpora:
            if corpus_properties['name'] == corpus_name:
                return corpus_properties
        msg = 'Corpus "%s" not available for the ' \
              '"%s" language.' % (corpus_name, self.language)
        logger.error(msg)
        raise CorpusImportError(msg)