How to use the torchtext.vocab module in torchtext

To help you get started, we’ve selected a few torchtext.vocab examples, based on popular ways it is used in public projects.

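Before the examples, here is a minimal sketch of the two patterns they revolve around: building a torchtext.vocab.Vocab from a token Counter, and loading pretrained vectors such as GloVe. It assumes the legacy torchtext API (roughly 0.8 and earlier, where Vocab accepts a Counter directly); the toy sentence, the max_size value, and the cache path are placeholders, not anything required by the library.

from collections import Counter

import torchtext

# Count tokens, then build a vocabulary with <unk> and <pad> specials.
counter = Counter("the quick brown fox jumps over the lazy dog".split())
vocab = torchtext.vocab.Vocab(counter, specials=["<unk>", "<pad>"], max_size=10000)
print(vocab.stoi["the"], len(vocab.itos))

# Load pretrained GloVe vectors; they are downloaded into the cache directory on first use.
glove = torchtext.vocab.GloVe(name="6B", dim=100, cache=".vector_cache")
print(glove.vectors[glove.stoi["fox"]].shape)  # torch.Size([100])

Most of the snippets below combine these two pieces in some form: they count tokens from a dataset, build a Vocab over them, and then copy rows from a pretrained Vectors object (GloVe, FastText, CharNGram) into an embedding matrix.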

github facebookresearch / pythia / pythia / utils / vocab.py View on Github external
which will be used to collect vectors
        embedding_name : str
            Embedding name picked up from the list of the pretrained aliases
            mentioned above
        """
        super(IntersectedVocab, self).__init__(vocab_file, *args, **kwargs)

        self.type = "intersected"

        name = embedding_name.split(".")[0]
        dim = embedding_name.split(".")[2][:-1]
        middle = embedding_name.split(".")[1]

        class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

        if not hasattr(vocab, class_name):
            from pythia.common.registry import registry

            writer = registry.get("writer")
            error = "Unknown embedding type: %s" % name, "error"
            if writer is not None:
                writer.write(error, "error")
            raise RuntimeError(error)

        params = [middle]

        if name == "glove":
            params.append(int(dim))

        vector_cache = os.path.join(get_pythia_root(), ".vector_cache")
        embedding = getattr(vocab, class_name)(*params, cache=vector_cache)
github anhaidgroup / deepmatcher / deepmatcher / data / field.py View on Github external
if not isinstance(vec, vocab.Vectors):
                vec_name = vec
                vec_data = cls._cached_vec_data.get(vec_name)
                if vec_data is None:
                    parts = vec_name.split('.')
                    if parts[0] == 'fasttext':
                        if parts[2] == 'bin':
                            vec_data = FastTextBinary(language=parts[1], cache=cache)
                        elif parts[2] == 'vec' and parts[1] == 'wiki':
                            vec_data = FastText(
                                suffix='wiki-news-300d-1M.vec.zip', cache=cache)
                        elif parts[2] == 'vec' and parts[1] == 'crawl':
                            vec_data = FastText(
                                suffix='crawl-300d-2M.vec.zip', cache=cache)
                if vec_data is None:
                    vec_data = vocab.pretrained_aliases[vec_name](cache=cache)
                cls._cached_vec_data[vec_name] = vec_data
                vec_datas.append(vec_data)
            else:
                vec_datas.append(vec)

        return vec_datas
github facebookresearch / pytext / pytext / fields / dict_field.py View on Github external
if isinstance(arg, textdata.Dataset):
                sources += [
                    getattr(arg, name)
                    for name, field in arg.fields.items()
                    if field is self
                ]
            else:
                sources.append(arg)

        counter = Counter()
        for data in sources:
            for x in data:
                if len(x) > 0:
                    counter.update(x[0])
        specials = [self.unk_token, self.pad_token]
        self.vocab = vocab.Vocab(counter, specials=specials, **kwargs)
github facebookresearch / ParlAI / parlai / agents / legacy_agents / seq2seq / seq2seq_v0.py View on Github external
init = 'glove-twitter'
                        name = 'twitter.27B'
                        pretrained_dim = 200
                    else:
                        init = 'glove'
                        name = '840B'
                    embs = vocab.GloVe(
                        name=name,
                        dim=pretrained_dim,
                        cache=modelzoo_path(
                            self.opt.get('datapath'), 'models:glove_vectors'
                        ),
                    )
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(
                        language='en',
                        cache=modelzoo_path(
                            self.opt.get('datapath'), 'models:fasttext_vectors'
                        ),
                    )
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['embeddingsize'] != pretrained_dim:
                    rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
github uvavision / Text2Scene / lib / layout_utils.py View on Github external
def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = []
        for idx, word in enumerate(['<pad>', '<sos>', '<eos>']):
            self.word2index[word] = idx
            self.index2word.append(word)
            self.word2count[word] = 1
        self.n_words = 3  

        self.glovec = torchtext.vocab.GloVe(cache=osp.join(this_dir, '..', 'data', 'caches'))
github anhaidgroup / deepmatcher / deepmatcher / data / torchtext_extensions.py View on Github external
    lambda **kwargs: vocab.CharNGram(**kwargs),
    "fasttext.en.300d":
github Uehwan / 3-D-Scene-Graph / model / vis_tuning.py View on Github external
from graphviz import Digraph
import webcolors
import pprint
import math
from scipy.stats import norm
from color_histogram.core.hist_3d import Hist3D
#import pcl # cd python-pcl -> python setup.py build-ext -i -> python setup.py install
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import torchtext #0. install torchtext==0.2.3 (pip install torchtext==0.2.3)
from torch.nn.functional import cosine_similarity
from collections import Counter
import pcl
import os.path as osp
import os
fasttext = torchtext.vocab.FastText()
_GRAY = (218, 227, 218)
_GREEN = (18, 127, 15)
_WHITE = (255, 255, 255)


class same_node_detection(object):
    def __init__(self):
        self.compare_all = False
        self.class_weight = 10.0/20.0
        self.pose_weight = 8.0/20.0
        self.color_weight = 2.0/20.0

    def compare_class(self, curr_cls, prev_cls, cls_score ):
        similar_cls = False
        same_cls = 0
        score = 0
github zomux / nmtlab / nmtlab / utils / vocab.py View on Github external
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torchtext.vocab
import pickle
from collections import Counter, defaultdict

DEFAULT_SPECIAL_TOKENS = ["", "<s>", "</s>", "UNK"]


class Vocab(torchtext.vocab.Vocab):
    
    def __init__(self, path=None, unk_token="UNK", picklable=False):
        self._unk_token = unk_token
        self.itos = []
        if picklable:
            self.stoi = {}
        else:
            self.stoi = defaultdict(lambda: 3)
        if path:
            self.load(path)
    
    def size(self):
        return len(self.itos)

    def initialize(self, special_tokens=None):
        if special_tokens is None:
github facebookresearch / pythia / pythia / utils / vocab.py View on Github external
if not hasattr(vocab, class_name):
            from pythia.common.registry import registry

            writer = registry.get("writer")
            error = "Unknown embedding type: %s" % name, "error"
            if writer is not None:
                writer.write(error, "error")
            raise RuntimeError(error)

        params = [middle]

        if name == "glove":
            params.append(int(dim))

        vector_cache = os.path.join(get_pythia_root(), ".vector_cache")
        embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

        self.vectors = torch.empty(
            (self.get_size(), len(embedding.vectors[0])), dtype=torch.float
        )

        self.embedding_dim = len(embedding.vectors[0])

        for i in range(0, 4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        for i in range(4, self.get_size()):
            word = self.itos[i]
            embedding_index = embedding.stoi.get(word, None)

            if embedding_index is None:
                self.vectors[i] = self.vectors[self.UNK_INDEX].clone()
github shuoyangd / hoolock / preprocess.py View on Github external
def main(options):
  # first pass: collecting vocab
  conll_reader = utils.io.CoNLLReader(open(options.train_conll_file))
  tokens = []
  postags = []
  for sent in conll_reader:
    for row in sent:
      tokens.append(row["FORM"])
      postags.append(row["UPOSTAG"])
  conll_reader.close()
  vocab = torchtext.vocab.Vocab(collections.Counter(tokens), specials=["", ""],
                                max_size=options.vocab_size)
  postags = list(set(postags))
  postags.append("")
  postags.append("")
  postag2idx = dict((pair[1], pair[0]) for pair in enumerate(postags))

  oracle_reader = utils.io.OracleReader(open(options.train_oracle_file))
  actions = []
  for sent in oracle_reader:
    for row in sent:
      actions.append(make_action_str(row))
  actions = list(set(actions))
  actions.append("")
  actions.append("")
  action2idx = dict((pair[1], pair[0]) for pair in enumerate(actions))