datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))
for dataset in datasets_for_vocab_creation:
    if dataset not in all_datasets:
        raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

logger.info("From dataset instances, %s will be considered for vocabulary creation.",
            ", ".join(datasets_for_vocab_creation))
vocab = Vocabulary.from_params(
    params.pop("vocabulary", {}),
    (instance for key, dataset in all_datasets.items()
     for instance in dataset
     if key in datasets_for_vocab_creation)
)

model = Model.from_params(vocab=vocab, params=params.pop('model'))
model = transfer_prev_model_weights_to_new_model(prev_best_model, model)

# Initializing the model can have side effect of expanding the vocabulary
vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

iterator = DataIterator.from_params(params.pop("iterator"))
iterator.index_with(vocab)

validation_iterator_params = params.pop("validation_iterator", None)
if validation_iterator_params:
    validation_iterator = DataIterator.from_params(validation_iterator_params)
    validation_iterator.index_with(vocab)
else:
    validation_iterator = None

train_data = all_datasets['train']
validation_data = all_datasets.get('validation')
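# A minimal, illustrative sketch (not taken from the source) of the kind of configuration
# the fragment above pops keys from; the exact keys and values are assumptions and will
# differ per experiment.
from allennlp.common.params import Params

example_params = Params({
    "datasets_for_vocab_creation": ["train"],
    "vocabulary": {"min_count": {"tokens": 3}},
    "iterator": {"type": "bucket",
                 "sorting_keys": [["tokens", "num_tokens"]],
                 "batch_size": 32},
    "validation_iterator": {"type": "basic", "batch_size": 64},
    "model": {"type": "basic_classifier"},  # hypothetical model type, for illustration only
})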
import logging

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.common.checks import ConfigurationError
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from .sentence_tokenizer import SentenceTokenizer
from allennlp.data.tokenizers.word_filter import StopwordFilter, PassThroughWordFilter

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("textcat")
class TextCatReader(DatasetReader):
    """
    Reads tokens and their topic labels.

    Assumes that the data in the ``file_path`` provided to ``_read`` is tab-separated, containing
    (at least) the two fields 'tokens' and 'category', in no particular order, with one
    document/label pair per line. (This means that documents must not contain newlines or tabs.)

    Example:

        category        tokens
        sample_label_1  This is a document. It contains a couple of sentences.
        sample_label_1  This is another document. It also contains two sentences.
        sample_label_2  This document has a different label.

    and so on.
    """
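# A small, self-contained sketch (illustrative only; the file name and labels are made up)
# that writes a file in the tab-separated format described in the docstring above.
import csv

with open("example_textcat.tsv", "w", newline="") as handle:
    writer = csv.writer(handle, delimiter="\t")
    writer.writerow(["category", "tokens"])
    writer.writerow(["sample_label_1", "This is a document. It contains a couple of sentences."])
    writer.writerow(["sample_label_2", "This document has a different label."])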
from typing import List

import torch

from allennlp.common.checks import ConfigurationError


def _get_combination(combination: str, tensors: List[torch.Tensor]) -> torch.Tensor:
    if combination.isdigit():
        index = int(combination) - 1
        return tensors[index]
    else:
        if len(combination) != 3:
            raise ConfigurationError("Invalid combination: " + combination)
        first_tensor = _get_combination(combination[0], tensors)
        second_tensor = _get_combination(combination[2], tensors)
        operation = combination[1]
        if operation == '*':
            return first_tensor * second_tensor
        elif operation == '/':
            return first_tensor / second_tensor
        elif operation == '+':
            return first_tensor + second_tensor
        elif operation == '-':
            return first_tensor - second_tensor
        else:
            raise ConfigurationError("Invalid operation: " + operation)
def text_to_instance(self, nc: str) -> Instance:
    tokenized_nc = self._tokenizer.tokenize(nc)
    nc_field = TextField(tokenized_nc, self._token_indexers)
    w1_field, w2_field, nc_seq_field = nc_field, nc_field, nc_field

    constituents = nc.split('_')
    if len(constituents) == 2:
        w1, w2 = constituents
        tokenized_w1 = self._tokenizer.tokenize(w1)
        w1_field = TextField(tokenized_w1, self._token_indexers)
        tokenized_w2 = self._tokenizer.tokenize(w2)
        w2_field = TextField(tokenized_w2, self._token_indexers)
        tokenized_nc_seq = self._tokenizer.tokenize(' '.join((w1, w2)))
        nc_seq_field = TextField(tokenized_nc_seq, self._token_indexers)

    fields = {'nc': nc_field, 'w1': w1_field, 'w2': w2_field, 'nc_seq': nc_seq_field}
    return Instance(fields)
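# A standalone sketch of the field layout built above, under the assumption of a
# WordTokenizer and a single-id token indexer (the actual reader may be configured
# differently).
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
indexers = {"tokens": SingleIdTokenIndexer()}

nc = "olive_oil"
w1, w2 = nc.split("_")
fields = {
    "nc": TextField(tokenizer.tokenize(nc), indexers),
    "w1": TextField(tokenizer.tokenize(w1), indexers),
    "w2": TextField(tokenizer.tokenize(w2), indexers),
    "nc_seq": TextField(tokenizer.tokenize(" ".join((w1, w2))), indexers),
}
instance = Instance(fields)
print(sorted(instance.fields))  # ['nc', 'nc_seq', 'w1', 'w2']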
def _make_instance_from_text(self, sent_tokens, pred_index, annotations=None, sent_id=None):
    instance_dict = {}

    if isinstance(sent_tokens, str):
        sent_tokens = sent_tokens.split()
    sent_tokens = cleanse_sentence_text(sent_tokens)
    text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
    instance_dict['text'] = text_field
    instance_dict['predicate_indicator'] = SequenceLabelField(
        [1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)

    if annotations is not None:
        for i, slot_name in enumerate(self._slot_labels):
            span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s" % slot_name)
                                   for ann in annotations for span in ann.all_spans])
            instance_dict['span_slot_%s' % slot_name] = span_slot

        labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field)
                                        for ann in annotations for span in ann.all_spans])
        instance_dict['labeled_spans'] = labeled_span_field

        if self._bio_labels:
            bio_labels = ["O"] * len(sent_tokens)
            bio_labels[pred_index] = "B-V"
def _forward_loop(self,
                  source: Dict[str, torch.Tensor],
                  alias_database: AliasDatabase,
                  mention_type: torch.Tensor,
                  raw_entity_ids: Dict[str, torch.Tensor],
                  entity_ids: Dict[str, torch.Tensor],
                  parent_ids: Dict[str, torch.Tensor],
                  relations: Dict[str, torch.Tensor],
                  shortlist: Dict[str, torch.Tensor],
                  shortlist_inds: torch.Tensor) -> Dict[str, torch.Tensor]:
    # Get the token mask and extract indexed text fields.
    # shape: (batch_size, sequence_length)
    target_mask = get_text_field_mask(source)
    source = source['tokens']
    raw_entity_ids = raw_entity_ids['raw_entity_ids']
    entity_ids = entity_ids['entity_ids']
    parent_ids = parent_ids['entity_ids']
    relations = relations['relations']

    logger.debug('Source & Target shape: %s', source.shape)
    logger.debug('Entity ids shape: %s', entity_ids.shape)
    logger.debug('Relations & Parent ids shape: %s', relations.shape)
    logger.debug('Shortlist shape: %s', shortlist['entity_ids'].shape)

    # Embed source tokens.
    # shape: (batch_size, sequence_length, embedding_dim)
    encoded, alpha_loss, beta_loss = self._encode_source(source)
    splits = [self.token_embedding_dim] + [self.entity_embedding_dim] * 2
    encoded_token, encoded_head, encoded_relation = encoded.split(splits, dim=-1)
# NOTE: the condition opening this if/elif was cut off in the snippet; the flag name below
# is hypothetical and stands in for "keep examples whose query could not be parsed".
if action_sequence is None and keep_if_unparseable:
    action_sequence = []
elif action_sequence is None:
    return None

index_fields: List[Field] = []
production_rule_fields: List[Field] = []

for production_rule in all_actions:
    nonterminal, _ = production_rule.split(" ->")
    production_rule = " ".join(production_rule.split(" "))
    field = ProductionRuleField(
        production_rule, self._world.is_global_rule(nonterminal), nonterminal=nonterminal
    )
    production_rule_fields.append(field)

valid_actions_field = ListField(production_rule_fields)
fields["valid_actions"] = valid_actions_field

action_map = {
    action.rule: i  # type: ignore
    for i, action in enumerate(valid_actions_field.field_list)
}

for production_rule in action_sequence:
    index_fields.append(IndexField(action_map[production_rule], valid_actions_field))
if not action_sequence:
    index_fields = [IndexField(-1, valid_actions_field)]

action_sequence_field = ListField(index_fields)
fields["action_sequence"] = action_sequence_field
return Instance(fields)
import json
import os

import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.decomposition import TruncatedSVD
from fasttext import load_model  # assuming the `fasttext` Python bindings

from allennlp.common.file_utils import cached_path
from allennlp.models.archival import load_archive
from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile
from allennlp.predictors import Predictor

EMBEDDING_DIM = 200
MEDLINE_WORD_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/medline_word_prob.json'
DISCOURSE_MODEL_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/model.tar.gz'
PUBMED_PRETRAINED_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/wikipedia-pubmed-and-PMC-w2v.txt.gz'
TRAIN_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/train_labels.json'
VALIDATION_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/validation_labels.json'
TEST_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/test_labels.json'

archive = load_archive(DISCOURSE_MODEL_PATH)  # discourse model
predictor = Predictor.from_archive(archive, 'discourse_predictor')

assert os.path.exists('wiki.en.bin')
ft_model = load_model('wiki.en.bin')  # fastText word vectors

with open(cached_path(MEDLINE_WORD_PATH), 'r') as medline_file:
    p_dict = json.load(medline_file)


def read_embedding(pretrained_path=PUBMED_PRETRAINED_PATH):
    """
    Read the PubMed pretrained embeddings from Amazon S3 and
    return a dictionary mapping tokens to vectors.
    """
    embeddings = {}
    with EmbeddingsTextFile(pretrained_path) as embeddings_file:
        for line in embeddings_file:
            token = line.split(' ', 1)[0]
            if token in p_dict:
                fields = line.rstrip().split(' ')
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    return embeddings
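# Illustrative use of read_embedding (the lookup token and the zero-vector fallback are
# assumptions for this example, not part of the source).
embeddings = read_embedding()
vector = embeddings.get("patients", np.zeros(EMBEDDING_DIM, dtype="float32"))
print(vector.shape)  # (EMBEDDING_DIM,)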
def _read(self, file_path):
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        columns = data_file.readline().strip('\n').split('\t')
        for line in data_file.readlines():
            if not line:
                continue
            items = line.strip("\n").split("\t")
            tokens = items[columns.index("tokens")]
            category = items[columns.index("category")]
            instance = self.text_to_instance(tokens=tokens, category=category)
            if instance is not None:
                yield instance