How to use the torchtext.vocab.Vectors class in torchtext

To help you get started, we’ve selected a few examples based on popular ways torchtext.vocab.Vectors is used in public projects.

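Each example below constructs a torchtext.vocab.Vectors object from a plain-text embedding file (word2vec or GloVe style) and, in most cases, hands it to Field.build_vocab(..., vectors=...), which copies the matching rows into vocab.vectors. A minimal, hedged sketch of that pattern, assuming the legacy torchtext.data API and placeholder files train.csv and embeddings.vec:

from torchtext import data            # legacy API (torchtext.legacy.data in torchtext >= 0.9)
from torchtext.vocab import Vectors

# Placeholder fields and dataset; train.csv is assumed to have text,label columns.
TEXT = data.Field(tokenize=str.split, lower=True)
LABEL = data.LabelField()
dataset = data.TabularDataset('train.csv', format='csv',
                              fields=[('text', TEXT), ('label', LABEL)])

# Load pre-trained embeddings from a local word2vec/GloVe-style text file.
vectors = Vectors(name='embeddings.vec', cache='.vector_cache')

# Copy the pre-trained rows into the vocabulary; TEXT.vocab.vectors becomes a
# FloatTensor of shape (len(TEXT.vocab), embedding_dim).
TEXT.build_vocab(dataset, vectors=vectors)
embedding_weights = TEXT.vocab.vectors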

github AnubhavGupta3377 / Text-Classification-Models-Pytorch / Model_TextRNN / utils.py
        train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
        train_data = data.Dataset(train_examples, datafields)
        
        test_df = self.get_pandas_df(test_file)
        test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
        test_data = data.Dataset(test_examples, datafields)
        
        # If validation file exists, load it. Otherwise get validation data from training data
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()]
            val_data = data.Dataset(val_examples, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)
        
        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab
        
        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)
        
        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)
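
The word_embeddings tensor collected here (TEXT.vocab.vectors) is typically used to initialise a model's embedding layer; a hedged follow-up sketch, not part of the quoted repository, with a placeholder tensor standing in for the real vocabulary matrix:

import torch
import torch.nn as nn

# Stand-in for TEXT.vocab.vectors from the snippet above.
word_embeddings = torch.randn(1000, 300)   # (vocab_size, embedding_dim) placeholder

# freeze=False keeps the pre-trained vectors trainable.
embedding_layer = nn.Embedding.from_pretrained(word_embeddings, freeze=False)
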
github NLPatVCU / medinify / medinify / sentiment / rnn_review_classifier.py
    def __init__(self, w2v_file):
        """
        Initializes RNNReviewClassifier
        :param w2v_file: embedding file
        """
        vectors = Vectors(w2v_file)
        self.vectors = vectors
github GauthierDmn / question_generation / preprocessing.py
    def load_data(self, train_file, val_file, glove_dir):
        # Loading saved data
        train_dataset = torch.load(train_file)
        train_examples = train_dataset['examples']

        val_dataset = torch.load(val_file)
        val_examples = val_dataset['examples']

        # Generating torchtext dataset class
        fields = [('src', self.src_field), ('trg', self.trg_field), ('feat', self.src_feat_field)]
        train_dataset = data.Dataset(fields=fields, examples=train_examples)
        val_dataset = data.Dataset(fields=fields, examples=val_examples)

        # Loading GloVE vectors
        vec = vocab.Vectors(os.path.join(glove_dir, "glove.6B.{}d.txt".format(config.word_embedding_size)))

        # Building field vocabulary
        self.src_field.build_vocab(train_dataset, vectors=vec, max_size=config.in_vocab_size)
        self.trg_field.build_vocab(train_dataset, vectors=vec, max_size=config.out_vocab_size)
        self.src_feat_field.build_vocab(train_dataset, vectors=vec, max_size=config.out_vocab_size)

        src_vocab, trg_vocab, src_feat_vocab = self.generate_vocabs()
        vocabs = {'src_vocab': src_vocab, 'trg_vocab': trg_vocab, 'src_feat_vocab': src_feat_vocab}

        return train_dataset, val_dataset, vocabs
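
The snippet points Vectors at a local glove.6B.{dim}d.txt file; torchtext also ships a GloVe subclass of Vectors that downloads and caches those same files on first use. A hedged equivalent for the 100-dimensional case (cache directory illustrative):

from torchtext import vocab

# Downloads/caches glove.6B.100d.txt on first use instead of requiring a local copy.
vec = vocab.GloVe(name='6B', dim=100, cache='.vector_cache')
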
github anhaidgroup / deepmatcher / deepmatcher / data / torchtext_extensions.py
logger = logging.getLogger(__name__)


class FastText(vocab.Vectors):

    url_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/'

    def __init__(self, suffix='wiki-news-300d-1M.vec.zip', **kwargs):
        url = self.url_base + suffix
        base, ext = os.path.splitext(suffix)
        name = suffix if ext == '.vec' else base
        super(FastText, self).__init__(name, url=url, **kwargs)


class FastTextBinary(vocab.Vectors):

    url_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.zip'
    name_base = 'wiki.{}.bin'

    def __init__(self, language='en', cache=None):
        """
        Arguments:
           language: Language of fastText pre-trained embedding model
           cache: directory for cached model
         """
        cache = os.path.expanduser(cache)
        url = FastTextBinary.url_base.format(language)
        name = FastTextBinary.name_base.format(language)

        self.cache(name, cache, url=url)
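
The FastText subclass above mainly wires a download URL into Vectors: when the named file is missing from the cache directory, Vectors fetches url, unpacks the archive, and then loads the extracted .vec file. A hedged, minimal equivalent without subclassing (URL taken from the snippet, cache directory illustrative):

from torchtext.vocab import Vectors

# Downloads wiki-news-300d-1M.vec.zip into .vector_cache on first use,
# then loads the extracted .vec file.
vec = Vectors(name='wiki-news-300d-1M.vec',
              cache='.vector_cache',
              url='https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip')
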
github smilelight / lightNLP / lightnlp / sl / ner / tool.py
    def get_vectors(self, path: str):
        logger.info('loading vectors from {}'.format(path))
        vectors = Vectors(path)
        logger.info('successfully loaded vectors')
        return vectors
github Morphl-AI / MorphL-Model-User-Search-Intent / preprocessing / usi_csv_preprocessing.py
import torchtext.vocab as vocab
import torch.tensor as tensor
from pyspark.sql.types import ArrayType
from pyspark.sql.types import DoubleType


# Load env variables
MASTER_URL = 'local[*]'
APPLICATION_NAME = 'preprocessor'
MORPHL_SERVER_IP_ADDRESS = getenv('MORPHL_SERVER_IP_ADDRESS')
MORPHL_CASSANDRA_USERNAME = getenv('MORPHL_CASSANDRA_USERNAME')
MORPHL_CASSANDRA_PASSWORD = getenv('MORPHL_CASSANDRA_PASSWORD')
MORPHL_CASSANDRA_KEYSPACE = getenv('MORPHL_CASSANDRA_KEYSPACE')

# Load word embeddings tensor
embedding = vocab.Vectors(name="/opt/glove/glove.6B.100d.txt", cache='/opt/glove')

# Function that returns a dataframe from a cassandra table


def fetch_from_cassandra(c_table_name, spark_session):
    load_options = {
        'keyspace': MORPHL_CASSANDRA_KEYSPACE,
        'table': c_table_name,
        'spark.cassandra.input.fetch.size_in_rows': '150'}

    df = (spark_session.read.format('org.apache.spark.sql.cassandra')
                            .options(**load_options)
                            .load())

    return df
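
Once constructed like this, the Vectors object can also be queried directly, without building a Field vocabulary first; a hedged sketch reusing the file path from the snippet (the looked-up token is arbitrary):

import torchtext.vocab as vocab

glove = vocab.Vectors(name="/opt/glove/glove.6B.100d.txt", cache='/opt/glove')
print(glove.dim)               # embedding dimensionality, here 100
print(glove['search'].shape)   # torch.Size([100]); out-of-vocabulary tokens fall back to unk_init
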
github AveryLiu / TD-DMN / iterator.py
    def get_iters(self, train_batch_size, fold_num, vec_name, vec_cache):
        # Load data splits
        train, test = data.TabularDataset.splits(path="./data/fold_{}".format(fold_num), train="train.tsv",
                                                      test="test.tsv", format="tsv",
                                                      fields=[("TEXT", self.text_doc), ("ENTITY", self.entity_doc),
                                                              ("LABEL", self.label_doc),
                                                              ("OFFSET", self.offset_doc),
                                                              ("LENGTH", self.length_doc),
                                                              ("WORD_ATTN", self.word_attn_doc),
                                                              ("SENT_ATTN", self.sent_attn_doc),
                                                              ("DOC_ID", self.doc_id)])

        # First load vectors
        vector = Vectors(name=vec_name, cache=vec_cache)

        # Build vocabs
        self.text_doc.build_vocab(train, test, vectors=vector)
        self.entity_doc.build_vocab(train, test)
        self.label_doc.build_vocab(train, test)

        # Get iterators
        train_iter, test_iter = data.BucketIterator.splits((train, test),
                                                           sort=False, batch_sizes=(train_batch_size, 2),
                                                           repeat=True)
        train_iter.shuffle = True
        return train_iter, test_iter
github TianyuZhuuu / CHIP2018 / src / _05_prepare_data.py
        self.train_df = train_df[['q1_wid', 'q2_wid', 'q1_cid', 'q2_cid', 'label']]

        test_df['q1_wid'] = test_df['qid1'].apply(lambda qid: question_df.loc[qid]['wid'])
        test_df['q2_wid'] = test_df['qid2'].apply(lambda qid: question_df.loc[qid]['wid'])
        test_df['q1_cid'] = test_df['qid1'].apply(lambda qid: question_df.loc[qid]['cid'])
        test_df['q2_cid'] = test_df['qid2'].apply(lambda qid: question_df.loc[qid]['cid'])
        self.test_df = test_df[['q1_wid', 'q2_wid', 'q1_cid', 'q2_cid']]

        self.word_embedding_path = word_path
        self.char_embedding_path = char_path

        cache = '../cache'
        if not os.path.exists(cache):
            os.mkdir(cache)

        self.word_vectors = Vectors(self.word_embedding_path, cache)
        self.char_vectors = Vectors(self.char_embedding_path, cache)
        self.word_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
        self.char_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
        self.wordTEXT = data.Field(batch_first=True)
        self.charTEXT = data.Field(batch_first=True)
        self.LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

        train_dataset = self.generate_dataset()
        test_dataset = self.generate_dataset(role='test')
        self.wordTEXT.build_vocab(train_dataset, test_dataset, min_freq=1, vectors=self.word_vectors)
        self.charTEXT.build_vocab(train_dataset, test_dataset, min_freq=1, vectors=self.char_vectors)
        self.word_embedding = self.wordTEXT.vocab.vectors
        self.char_embedding = self.charTEXT.vocab.vectors
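
The two unk_init assignments above determine how tokens that are present in the vocabulary but absent from the embedding files get initialised; the same callable can be passed to the Vectors constructor instead. A hedged sketch with an illustrative path:

from torch.nn import init
from torchtext.vocab import Vectors

# Equivalent to assigning .unk_init after construction: tokens missing from the
# embedding file get vectors drawn from U(-0.05, 0.05) instead of zeros.
word_vectors = Vectors('word_embeddings.txt', cache='../cache',
                       unk_init=lambda x: init.uniform_(x, -0.05, 0.05))
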
github NLPatVCU / medinify / medinify / process / dataloder_processor.py
    def get_features(self, dataset):
        dataset = remove_neutral(dataset)
        vectors = Vectors(dataset.word_embeddings)
        self.get_labels(dataset)
        fields = {'text': ('text', self.text_field), 'label': ('label', self.label_field)}
        text = dataset.data_table[dataset.text_column].to_numpy()
        labels = dataset.data_table['label'].to_numpy()
        examples = [Example.fromdict(
            data={'text': text[x], 'label': labels[x]}, fields=fields) for x in range(labels.shape[0])]
        torch_dataset = TorchtextDataset(examples, {'text': self.text_field, 'label': self.label_field})
        try:
            self.text_field.vocab
        except AttributeError:
            self.text_field.build_vocab(torch_dataset, vectors=vectors)
            self.label_field.build_vocab(torch_dataset)
        loader = BucketIterator(torch_dataset, batch_size=25)
        return loader