How to use danlp.download.DEFAULT_CACHE_DIR in danlp

To help you get started, we’ve selected a few danlp examples based on popular ways the library is used in public projects.
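
danlp.download.DEFAULT_CACHE_DIR is the module-level constant that danlp's download helpers fall back to when no cache directory is given; it normally points to a .danlp folder in the user's home directory. Every snippet below follows the same pattern, sketched here under the assumption that download_model and _unzip_process_func are imported from danlp.download as in recent danlp releases:

from danlp.download import DEFAULT_CACHE_DIR, download_model, _unzip_process_func

# Download (and unzip) a model archive into the default cache directory.
# The returned path points to the extracted model inside DEFAULT_CACHE_DIR.
model_path = download_model('flair.ner', DEFAULT_CACHE_DIR,
                            process_func=_unzip_process_func, verbose=True)
print(DEFAULT_CACHE_DIR)  # typically ~/.danlp
print(model_path)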


github alexandrainst / danlp / tests / test_ner_tagger.py
def test_flair_tagger(self):
        # Download model beforehand
        download_model('flair.ner', DEFAULT_CACHE_DIR, process_func=_unzip_process_func, verbose=True)
        print("Downloaded the flair model")

        # Load the NER tagger using the DaNLP wrapper
        flair_model = load_flair_ner_model()

        # Using the flair NER tagger
        sentence = Sentence('jeg hopper pΓ₯ en bil som er rΓΈd sammen med Jens-Peter E. Hansen')
        flair_model.predict(sentence)

        expected_string = "jeg hopper pΓ₯ en bil som er rΓΈd sammen med Jens-Peter  E.  Hansen "

        self.assertEqual(sentence.to_tagged_string(), expected_string)
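
Outside a test, the same model is more naturally queried for entity spans than compared against a tagged string. A small sketch, assuming load_flair_ner_model can be imported from danlp.models as in the test above and that the span API matches the flair versions danlp was built against:

from danlp.models import load_flair_ner_model
from flair.data import Sentence

ner_model = load_flair_ner_model()  # downloads 'flair.ner' into DEFAULT_CACHE_DIR if missing

sentence = Sentence('jeg hopper pΓ₯ en bil som er rΓΈd sammen med Jens-Peter E. Hansen')
ner_model.predict(sentence)

# Iterate over the predicted entity spans instead of the raw tagged string
for span in sentence.get_spans('ner'):
    print(span.text, span.tag)
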
github alexandrainst / danlp / tests / test_flair_models.py
def test_flair_tagger(self):
        # Download model beforehand
        download_model('flair.pos', DEFAULT_CACHE_DIR, process_func=_unzip_process_func, verbose=True)
        print("Downloaded the flair model")

        # Load the POS tagger using the DaNLP wrapper
        flair_model = load_flair_pos_model()

        # Using the flair POS tagger
        sentence = Sentence('jeg hopper pΓ₯ en bil som er rΓΈd sammen med Jens-Peter E. Hansen')
        flair_model.predict(sentence)

        expected_string = "jeg  hopper  pΓ₯  en  bil  som  er " \
                          " rΓΈd  sammen  med  Jens-Peter  E.  Hansen "

        self.assertEqual(sentence.to_tagged_string(), expected_string)
github alexandrainst / danlp / tests / test_spacy_model.py
def test_download(self):
        # Download model beforehand
        model_path = download_model('spacy', DEFAULT_CACHE_DIR,
                                    process_func=_unzip_process_func,
                                    verbose=True)

        info = spacy.info(model_path)
        self.assertListEqual(info['pipeline'], ['tagger', 'parser', 'ner'])
        self.assertEqual(info['lang'], 'da')
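
The path returned by download_model can also be passed straight to spacy.load, so the pipeline checked above (tagger, parser, ner) can be used directly. A short sketch reusing the same download call:

import spacy
from danlp.download import DEFAULT_CACHE_DIR, download_model, _unzip_process_func

model_path = download_model('spacy', DEFAULT_CACHE_DIR,
                            process_func=_unzip_process_func, verbose=True)
nlp = spacy.load(model_path)  # load the Danish pipeline from the cached path

doc = nlp('Jens-Peter E. Hansen kΓΈrer i en rΓΈd bil')
print([(token.text, token.pos_) for token in doc])
print([(ent.text, ent.label_) for ent in doc.ents])
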
github alexandrainst / danlp / danlp / models / embeddings.py
def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR,
                        verbose: bool = False):
    """

    Available wordembeddings:
    - wiki.da.wv
    - cc.da.wv
    - conll17.da.wv
    - news.da.wv
    - sketchengine.da.wv

    Available subword embeddings:
    - wiki.da.swv
    - cc.da.swv
    - sketchengine.da.swv

    :param pretrained_embedding:
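
The names listed in the docstring are the accepted values for pretrained_embedding. A usage sketch with the Wikipedia word vectors, assuming the return value behaves like a gensim KeyedVectors object:

from danlp.models.embeddings import load_wv_with_gensim

# Downloads the embeddings into DEFAULT_CACHE_DIR on first use
word_vectors = load_wv_with_gensim('wiki.da.wv', verbose=True)

print(word_vectors.most_similar('kat', topn=5))  # nearest neighbours of 'kat'
print(word_vectors['hund'].shape)                # the raw vector for 'hund'
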
github alexandrainst / danlp / danlp / datasets / sentiment.py
def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
                               cache_dir: str = DEFAULT_CACHE_DIR,
                               clean_up_raw_data: bool = True,
                               verbose: bool = True):
    from zipfile import ZipFile

    twitter_api = construct_twitter_api_connection()
    
    model_name = meta_info['name']
    full_path = os.path.join(cache_dir, model_name) + meta_info['file_extension']

    with ZipFile(tmp_file_path, 'r') as zip_file:  # Extract files to cache_dir
        file_list = zip_file.namelist()
        extract_single_file_from_zip(cache_dir, file_list[0], full_path, zip_file)
    file_path = os.path.join(cache_dir, 'twitter.sentiment' + '.csv')
    df = pd.read_csv(file_path)

    twitter_ids = list(df['twitterid'])
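
This processing function sits behind danlp's Twitter sentiment dataset: only tweet ids and labels are distributed in the zip file, and the texts are rehydrated through the Twitter API, which is why a Twitter connection is constructed first. The user-facing side would look roughly like the sketch below; the TwitterSent class name and the load_with_pandas return value are assumptions based on the other danlp datasets, and valid Twitter API credentials must be configured:

from danlp.datasets import TwitterSent

# Downloads the zip, extracts it into the cache and runs the processing function,
# which fetches the tweet texts through the Twitter API (credentials required).
twitter_sent = TwitterSent()
df_val, df_train = twitter_sent.load_with_pandas()  # split order assumed
print(df_train.head())
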
github alexandrainst / danlp / danlp / models / embeddings.py
def load_pytorch_embedding_layer(pretrained_embedding: str,
                                 cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """

    :param pretrained_embedding: name of the pretrained word embedding, e.g. 'wiki.da.wv'
    :param cache_dir: the directory for storing cached models
    :param verbose: whether to print download progress
    :return: a PyTorch Embedding module and the id2word list
    """
    word_embeddings_available(pretrained_embedding, can_use_subword=False)
    import torch
    from torch.nn import Embedding

    word_vectors = load_wv_with_gensim(pretrained_embedding,
                                       cache_dir=cache_dir, verbose=verbose)
    weights = torch.FloatTensor(word_vectors.vectors)

    return Embedding.from_pretrained(weights), word_vectors.index2word
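
A short usage sketch for the PyTorch wrapper above; the mapping from word to embedding row goes through the returned id2word list (the example words are assumed to be in the vocabulary):

import torch
from danlp.models.embeddings import load_pytorch_embedding_layer

embedding_layer, id2word = load_pytorch_embedding_layer('wiki.da.wv')
word2id = {word: idx for idx, word in enumerate(id2word)}

# Embed a small batch of tokens by looking up their row indices
token_ids = torch.tensor([word2id['hund'], word2id['kat']])
vectors = embedding_layer(token_ids)
print(vectors.shape)  # (2, embedding_dim)
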
github alexandrainst / danlp / danlp / models / embeddings.py
def _process_embeddings_for_spacy(tmp_file_path: str, meta_info: dict,
                                  cache_dir: str = DEFAULT_CACHE_DIR,
                                  clean_up_raw_data: bool = True,
                                  verbose: bool = False):
    """
    To use pretrained embeddings with spaCy, the embeddings need to be stored in
    a specific format. This function converts embeddings saved in the binary
    word2vec format to a spaCy model using spaCy's init_model() function.
    The generated files are saved in the cache_dir under a folder called
    .spacy.

    More information on converting pretrained word embeddings to spaCy models here:
    https://spacy.io/usage/vectors-similarity#custom

    :param str tmp_file_path: the file name of the embedding binary file
    :param dict meta_info: metadata (name, file extension) for the embedding being processed
    :param str cache_dir: the directory for storing cached data
    :param bool clean_up_raw_data: whether to delete the raw downloaded file afterwards
    :param bool verbose: whether to print status messages
    """
github alexandrainst / danlp / danlp / models / flair_models.py
def load_flair_pos_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """

    :param cache_dir:
    :param verbose:
    :return:
    """
    from flair.models import SequenceTagger

    model_weight_path = download_model('flair.pos', cache_dir, process_func=_unzip_process_func, verbose=verbose)

    # using the flair model
    flair_model = SequenceTagger.load(model_weight_path)

    return flair_model
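
Typical non-test usage mirrors the POS test shown earlier; a minimal sketch:

from danlp.models.flair_models import load_flair_pos_model
from flair.data import Sentence

pos_model = load_flair_pos_model(verbose=True)  # downloads 'flair.pos' on first use

sentence = Sentence('jeg hopper pΓ₯ en bil som er rΓΈd')
pos_model.predict(sentence)
print(sentence.to_tagged_string())  # tokens interleaved with their predicted POS tags
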
github alexandrainst / danlp / danlp / datasets / sentiment.py
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        self.dataset_name1 = 'lcc1.sentiment'
        self.file_extension1 = DATASETS[self.dataset_name1]['file_extension']

        self.dataset_dir1 = download_dataset(self.dataset_name1, cache_dir=cache_dir)
        self.file_path1 = os.path.join(self.dataset_dir1, self.dataset_name1 + self.file_extension1)
        
        self.dataset_name2 = 'lcc2.sentiment'
        self.file_extension2 = DATASETS[self.dataset_name2]['file_extension']

        self.dataset_dir2 = download_dataset(self.dataset_name2, cache_dir=cache_dir)
        self.file_path2 = os.path.join(self.dataset_dir2, self.dataset_name2 + self.file_extension2)
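
The constructor only downloads and locates the two LCC files; reading them is left to the dataset's loader. A sketch, assuming the class is exported as LccSentiment from danlp.datasets and offers a load_with_pandas method like the other danlp datasets:

from danlp.datasets import LccSentiment

lcc = LccSentiment()         # downloads lcc1.sentiment and lcc2.sentiment into the cache
df = lcc.load_with_pandas()  # DataFrame with the texts and sentiment scores (shape assumed)
print(df.head())
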
github alexandrainst / danlp / danlp / models / bert_models.py
def load_bert_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Wrapper function to ensure that all models in danlp are
    loaded in a similar way
    :param cache_dir:
    :param verbose:
    :return:
    """
    return BertNer(cache_dir, verbose)
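
A usage sketch for the BERT NER wrapper; the predict call returning a token list and a matching label list is an assumption based on danlp's documented BertNer interface:

from danlp.models.bert_models import load_bert_ner_model

bert = load_bert_ner_model()  # downloads the BERT NER weights into DEFAULT_CACHE_DIR on first use

# Assumed to return the tokens and one IOB-style tag per token
tokens, labels = bert.predict('Jens-Peter E. Hansen bor i KΓΈbenhavn')
for token, label in zip(tokens, labels):
    print(token, label)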