How to use the cltk.utils.file_operations.open_pickle function in cltk

To help you get started, we’ve selected a few cltk examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github cltk / cltk / cltk / tag / pos.py View on Github external
def tag_ngram_12_backoff(self, untagged_string: str):
        """Tag parts of speech using a 1-, 2-gram backoff tagger.

        :param untagged_string: an untagged, untokenized string of text
        :type untagged_string: str
        :return: the tagged output produced by the unpickled tagger's
            ``tag`` method (presumably a list of (token, tag) pairs — per
            the usual NLTK tagger contract)
        """
        tokens = wordpunct_tokenize(untagged_string)
        # Load the pre-trained tagger lazily from its pickle on every call.
        tagger = open_pickle(self.available_taggers['ngram_12_backoff'])
        return tagger.tag(tokens)
github cltk / cltk / cltk / tokenize / latin / sentence.py View on Github external
def __init__(self: object, language:str = 'latin', strict:bool = False):
        """Set up a Latin Punkt sentence tokenizer and load its pickled model.

        :param language : language for sentence tokenization
        :type language: str
        :param strict : allow for stricter punctuation for sentence tokenization
        :type strict: bool
        """
        self.lang_vars = LatinLanguageVars()
        self.strict = strict
        # Base class is always initialized as Latin, regardless of the
        # ``language`` argument passed in.
        super().__init__(language='latin', lang_vars=self.lang_vars)
        self.models_path = LatinPunktSentenceTokenizer.models_path

        try:
            self.model =  open_pickle(os.path.join(self.models_path, 'latin_punkt.pickle'))
        except FileNotFoundError as err:
            # Re-raise the same exception type with a user-facing hint about
            # the missing latin_models_cltk corpus.
            raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)

        # NOTE(review): this assigns to a class attribute of
        # PunktLanguageVars, so it affects every tokenizer instance
        # process-wide, not just this one — confirm this is intended.
        if self.strict:
            PunktLanguageVars.sent_end_chars=STRICT_PUNCTUATION
        else:
            PunktLanguageVars.sent_end_chars=PUNCTUATION
github cltk / cltk / cltk / lemmatize / latin / backoff.py View on Github external
def __init__(self, train, seed=3):
        """Store training data and load pickled Latin lemma models.

        :param train: training sentences for the backoff chain — format not
            visible here; TODO confirm against caller
        :param seed: random seed used later when shuffling the training data

        NOTE(review): this snippet appears truncated — the original
        function continues past what is shown here.
        """
        self.train = train
        self.seed = seed
        
        # Models are expected inside the latin_models_cltk corpus in cltk_data.
        rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
        path = os.path.expanduser(rel_path)

        # Check for presence of LATIN_OLD_MODEL
        file = 'latin_lemmata_cltk.pickle'      

        old_model_path = os.path.join(path, file)
        if os.path.isfile(old_model_path):
            self.LATIN_OLD_MODEL = open_pickle(old_model_path)
        else:
            # Missing model is non-fatal: fall back to an empty dict and warn.
            self.LATIN_OLD_MODEL = {}
            print('The file %s is not available in cltk_data' % file)  
        
        # Check for presence of LATIN_MODEL
        file = 'latin_model.pickle'      

        model_path = os.path.join(path, file)
        if os.path.isfile(model_path):
            self.LATIN_MODEL = open_pickle(model_path)
        else:
            # Same best-effort fallback as above.
            self.LATIN_MODEL = {}
            print('The file %s is not available in cltk_data' % file)  
        
        # Check for presence of misc_patterns
        self.latin_sub_patterns = latin_sub_patterns
github cltk / cltk / cltk / tag / pos.py View on Github external
def tag_tnt(self, untagged_string: str):
        """Tag parts of speech with the TnT (Trigrams'n'Tags) tagger.

        :param untagged_string: an untagged, untokenized string of text
        :type untagged_string: str
        :return: tagged output as produced by the unpickled tagger's
            ``tag`` method
        """
        words = wordpunct_tokenize(untagged_string)
        path = self.available_taggers['tnt']
        # Unpickle the trained TnT tagger and apply it to the token list.
        return open_pickle(path).tag(words)
github cltk / cltk / cltk / tokenize / sentence.py View on Github external
def __init__(self, language: str = None, lang_vars: object = None):
        """Initialize the tokenizer; if a language is given, load its Punkt model.

        :param language : language for sentence tokenization
        :type language: str
        :param lang_vars: optional language-variables object passed to the base class
        """
        self.language = language
        self.lang_vars = lang_vars
        super().__init__(language=self.language)
        if not self.language:
            return
        # A language was supplied: resolve its models directory and unpickle
        # the matching ``<language>_punkt.pickle`` model.
        self.models_path = self._get_models_path(self.language)
        model_file = os.path.join(os.path.expanduser(self.models_path),
                                  f'{self.language}_punkt.pickle')
        try:
            self.model = open_pickle(model_file)
        except FileNotFoundError as err:
            # Re-raise the same exception type with a hint about the
            # missing models corpus.
            raise type(err)(BasePunktSentenceTokenizer.missing_models_message)
github cltk / cltk / cltk / tag / pos.py View on Github external
def tag_ngram_123_backoff(self, untagged_string: str):
        """Tag parts of speech using a 1-, 2-, 3-gram backoff tagger.

        :param untagged_string: an untagged, untokenized string of text
        :type untagged_string: str
        :return: tagged output as produced by the unpickled tagger's
            ``tag`` method
        """
        # Tokenize first, then hand the tokens to the pre-trained tagger
        # loaded from its pickle.
        trained_tagger = open_pickle(self.available_taggers['ngram_123_backoff'])
        return trained_tagger.tag(wordpunct_tokenize(untagged_string))
github cltk / cltk / cltk / lemmatize / latin / backoff.py View on Github external
# Check for presence of LATIN_OLD_MODEL
        # NOTE(review): this snippet is a mid-function fragment — the
        # enclosing ``def`` is not visible here, and ``_randomize_data``
        # below is cut off before its body completes.
        file = 'latin_lemmata_cltk.pickle'      

        old_model_path = os.path.join(path, file)
        if os.path.isfile(old_model_path):
            self.LATIN_OLD_MODEL = open_pickle(old_model_path)
        else:
            # Missing model is non-fatal: fall back to an empty dict and warn.
            self.LATIN_OLD_MODEL = {}
            print('The file %s is not available in cltk_data' % file)  
        
        # Check for presence of LATIN_MODEL
        file = 'latin_model.pickle'      

        model_path = os.path.join(path, file)
        if os.path.isfile(model_path):
            self.LATIN_MODEL = open_pickle(model_path)
        else:
            # Same best-effort fallback as above.
            self.LATIN_MODEL = {}
            print('The file %s is not available in cltk_data' % file)  
        
        # Check for presence of misc_patterns
        self.latin_sub_patterns = latin_sub_patterns

        # Check for presence of verb_patterns
        self.latin_verb_patterns = latin_verb_patterns

        # Check for presence of latin_pps
        self.latin_pps = latin_pps

        def _randomize_data(train, seed):
            # Seed the RNG so the shuffle below (not shown — truncated)
            # is reproducible.
            import random
            random.seed(seed)
github cltk / cltk / cltk / lemmatize / greek / backoff.py View on Github external
def __init__(self: object, train: List[list] = None, seed: int = 3, verbose: bool = False):
        """Load pickled Greek training data and lemma models for the backoff chain.

        :param train: training sentences — note the value passed in is
            immediately overwritten by the pickled sentences below
        :param seed: random seed used when shuffling the training data
        :param verbose: verbosity flag stored on the instance

        NOTE(review): this snippet appears truncated — ``_randomize_data``
        below is cut off before it returns.
        """
        self.models_path = BackoffGreekLemmatizer.models_path

        missing_models_message = "BackoffGreekLemmatizer requires the ```greek_models_cltk``` to be in cltk_data. Please load this corpus."

        try:
            self.train =  open_pickle(os.path.join(self.models_path, 'greek_lemmatized_sents.pickle'))
            self.GREEK_OLD_MODEL =  open_pickle(os.path.join(self.models_path, 'greek_lemmata_cltk.pickle'))
            self.GREEK_MODEL =  open_pickle(os.path.join(self.models_path, 'greek_model.pickle'))
        except FileNotFoundError as err:
            # Re-raise the same exception type with a user-facing hint about
            # the missing corpus.
            raise type(err)(missing_models_message)

        self.greek_sub_patterns = greek_sub_patterns # Move to greek_models_cltk

        self.seed = seed
        self.VERBOSE=verbose

        def _randomize_data(train: List[list], seed: int):
            # Deterministic shuffle, then split into POS/lemma train/test slices.
            import random
            random.seed(seed)
            random.shuffle(train)
            pos_train_sents = train[:4000]
            lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
            train_sents = lem_train_sents[:4000]
            test_sents = lem_train_sents[4000:5000]
github cltk / cltk / cltk / lemmatize / greek / backoff.py View on Github external
def __init__(self: object, train: List[list] = None, seed: int = 3, verbose: bool = False):
        """Load pickled Greek training data and lemma models for the backoff chain.

        :param train: training sentences — note the value passed in is
            immediately overwritten by the pickled sentences below
        :param seed: random seed used when shuffling the training data
        :param verbose: verbosity flag stored on the instance

        NOTE(review): this snippet appears truncated — ``_randomize_data``
        below is cut off mid-body.
        """
        self.models_path = BackoffGreekLemmatizer.models_path

        missing_models_message = "BackoffGreekLemmatizer requires the ```greek_models_cltk``` to be in cltk_data. Please load this corpus."

        try:
            self.train =  open_pickle(os.path.join(self.models_path, 'greek_lemmatized_sents.pickle'))
            self.GREEK_OLD_MODEL =  open_pickle(os.path.join(self.models_path, 'greek_lemmata_cltk.pickle'))
            self.GREEK_MODEL =  open_pickle(os.path.join(self.models_path, 'greek_model.pickle'))
        except FileNotFoundError as err:
            # Re-raise the same exception type with a user-facing hint about
            # the missing corpus.
            raise type(err)(missing_models_message)

        self.greek_sub_patterns = greek_sub_patterns # Move to greek_models_cltk

        self.seed = seed
        self.VERBOSE=verbose

        def _randomize_data(train: List[list], seed: int):
            # Deterministic shuffle, then a token/lemma pair projection.
            import random
            random.seed(seed)
            random.shuffle(train)
            pos_train_sents = train[:4000]
            lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
github cltk / cltk / cltk / tag / pos.py View on Github external
def tag_trigram(self, untagged_string: str):
        """Tag parts of speech with a trigram tagger.

        :param untagged_string: an untagged, untokenized string of text
        :type untagged_string: str
        :return: tagged output as produced by the unpickled tagger's
            ``tag`` method
        """
        # Resolve the pickle for the trigram model, load it, and tag the
        # word-punctuation tokens of the input string.
        model_location = self.available_taggers['trigram']
        trigram_tagger = open_pickle(model_location)
        result = trigram_tagger.tag(wordpunct_tokenize(untagged_string))
        return result