"This is a sentence with tab",
"This is a sentence with multiple tabs",
]
for tokenizer in tokenizers:
    for text in texts:
        # Important: we do not expect whitespace to be preserved after tokenization.
        # This means: \t, \n, " " etc. will all resolve to a single " ".
        # This makes no difference for BERT + XLNet, but it does for RoBERTa.

        # 1. original tokenize function from the transformers repo on the full sentence
        standardized_whitespace_text = " ".join(text.split())  # remove multiple whitespaces
        tokenized = tokenizer.tokenize(standardized_whitespace_text)

        # 2. our tokenizer with metadata on "whitespace-tokenized words"
        tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)

        # verify that tokenization of the full sequence equals tokenization of the "whitespace-tokenized words"
        assert tokenized_meta["tokens"] == tokenized, f"Failed using {tokenizer.__class__.__name__}"

        # verify that offsets align back to the original text
        if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
            # contains [UNK] tokens that are impossible to match back to the original text space
            continue
        for tok, offset in zip(tokenized_meta["tokens"], tokenized_meta["offsets"]):
            # subword tokens carry special chars depending on the model type;
            # strip them to align with the original text
            tok = re.sub(r"^(##|Ġ|▁)", "", tok)
            # tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
            original_tok = text[offset:offset + len(tok)]
            assert tok == original_tok, f"Offset alignment wrong for {tokenizer.__class__.__name__} and text '{text}'"
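# Illustrative sketch (not part of the original snippets): the "offsets" returned by
# tokenize_with_metadata are character positions into the raw text, so every token can be mapped
# back to the span it came from. The helper below is hypothetical and only packages the alignment
# logic from the loop above.
def token_spans(text, tokenizer):
    meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)
    spans = []
    for tok, offset in zip(meta["tokens"], meta["offsets"]):
        clean = re.sub(r"^(##|Ġ|▁)", "", tok)  # strip model-specific subword prefixes
        spans.append((tok, offset, offset + len(clean)))
    return spans

# e.g. token_spans("This is a sentence with tab", tokenizers[0])
# roughly -> [('This', 0, 4), ('is', 5, 7), ...]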
lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
tokenizers = []
for lang_name in lang_names:
    t = Tokenizer.load(lang_name, lower_case=False)
    t.add_tokens(new_tokens=["neverseentokens"])
    tokenizers.append(t)

basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

for tokenizer in tokenizers:
    save_dir = "testsave"
    tokenizer_type = tokenizer.__class__.__name__
    tokenizer.save_pretrained(save_dir)
    tokenizer_loaded = Tokenizer.load(save_dir, tokenizer_class=tokenizer_type)
    tokenized_before = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
    tokenized_after = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer_loaded)
    assert tokenized_before == tokenized_after
lang_model = "bert-base-cased"
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False
)
# deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt")
tokenizer.add_tokens(new_tokens=["neverseentokens"])
basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

# original tokenizer from the transformers repo
tokenized = tokenizer.tokenize(basic_text)
assert tokenized == ['Some', 'Text', 'with', 'neverseentokens', 'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars']

# ours with metadata
tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
assert tokenized_meta["tokens"] == tokenized
assert tokenized_meta["offsets"] == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]
lang_model = "bert-base-cased"
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False
)
basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"
# original tokenizer from the transformers repo
tokenized = tokenizer.tokenize(basic_text)
assert tokenized == ['Some', 'Text', 'with', 'never', '##see', '##nto', '##ken', '##s', 'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars']
# ours with metadata
tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
assert tokenized_meta["tokens"] == tokenized
assert tokenized_meta["offsets"] == [0, 5, 10, 15, 20, 23, 26, 29, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
assert tokenized_meta["start_of_word"] == [True, True, True, True, False, False, False, False, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]
def apply_tokenization(self, dictionary):
    """ This performs tokenization on all documents and questions. The result is a list (unnested)
    where each entry is a dictionary for one document-question pair (potentially multiple answers). """
    raw_baskets = []
    document_text = dictionary["context"]
    document_tokenized = tokenize_with_metadata(document_text, self.tokenizer)
    document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]]
    questions = dictionary["qas"]
    for question in questions:
        squad_id = question["id"]
        question_text = question["question"]
        question_tokenized = tokenize_with_metadata(question_text, self.tokenizer)
        question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]]
        answers = []
        for answer in question["answers"]:
            a = {"text": answer["text"],
                 "offset": answer["answer_start"]}
            answers.append(a)
        raw = {"document_text": document_text,
               "document_tokens": document_tokenized["tokens"],
               "document_offsets": document_tokenized["offsets"],
               "document_start_of_word": document_start_of_word,
               "question_text": question_text,
               "question_tokens": question_tokenized["tokens"],
               "question_offsets": question_tokenized["offsets"],
               "question_start_of_word": question_start_of_word,
               "answers": answers,
               "is_impossible": question["is_impossible"]}
        raw_baskets.append(raw)
    return raw_baskets
def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    # truncate tokens, offsets and start_of_word to a max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None,
                                                       tokenizer=self.tokenizer,
                                                       max_seq_len=self.max_seq_len)
    # Samples don't have labels during inference mode
    if "label" in dictionary:
        label = float(dictionary["label"])
        scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
        dictionary["label"] = scaled_label
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
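# Illustrative sketch (not part of the original snippets): the scaling above treats
# self.tasks["regression"]["label_list"] as a (shift, scale) pair, e.g. a mean and standard
# deviation computed over the training labels. The numbers below are made up.
label_mean, label_scale = 3.0, 1.5      # hypothetical label_list = [3.0, 1.5]
raw_label = 4.5
scaled_label = (raw_label - label_mean) / label_scale  # -> 1.0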
assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
doc = dictionary["doc"]
samples = []
# create one sample for each sentence in the doc (except for the very last one -> "nextSentence" is impossible)
for idx in range(len(doc) - 1):
    tokenized = {}
    if self.next_sent_pred:
        text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
        sample_in_clear_text = {
            "text_a": text_a,
            "text_b": text_b,
            "nextsentence_label": is_next_label,
        }
        # tokenize
        tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
        tokenized["text_b"] = tokenize_with_metadata(text_b, self.tokenizer)
        # truncate to max_seq_len
        for seq_name in ["tokens", "offsets", "start_of_word"]:
            tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                seq_a=tokenized["text_a"][seq_name],
                seq_b=tokenized["text_b"][seq_name],
                tokenizer=self.tokenizer,
                max_seq_len=self.max_seq_len)
        samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
    # if we don't do next sentence prediction, we feed in a single sentence
    else:
        text_a = doc[idx]
        sample_in_clear_text = {
            "text_a": text_a,
            "text_b": None,
        }
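# Illustrative sketch (not part of the original snippets): truncate_sequences, as called above,
# takes a pair of sequences plus the tokenizer and max_seq_len and returns a 3-tuple whose first
# two elements are the truncated seq_a and seq_b. "tokenizer" is assumed to be any tokenizer
# loaded via Tokenizer.load as in the earlier snippets.
pair_a = tokenize_with_metadata("First sentence of the pair.", tokenizer)
pair_b = tokenize_with_metadata("Second sentence of the pair.", tokenizer)
tokens_a, tokens_b, _ = truncate_sequences(seq_a=pair_a["tokens"], seq_b=pair_b["tokens"],
                                           tokenizer=tokenizer, max_seq_len=16)
# together with any special tokens the model adds, the two truncated lists should fit into 16 positions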