Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_smart_open_without_suffix():
    """Check that utils.smart_open works on paths with no compression suffix.

    The same suffix-less file is written twice — first gzip-compressed, then
    plain text — and smart_open must transparently read 10 lines back in
    both cases, i.e. it detects compression from content, not the filename.
    """
    with TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'test')
        for is_compressed in (True, False):
            _touch_file(path, compressed=is_compressed, empty=False)
            with utils.smart_open(path) as handle:
                assert len(handle.readlines()) == 10
def rerank(args: argparse.Namespace):
"""
Reranks a list of hypotheses according to a sentence-level metric.
Writes all output to STDOUT.
:param args: Namespace object holding CLI arguments.
"""
reranker = Reranker(args.metric, args.return_score)
with utils.smart_open(args.reference) as reference, utils.smart_open(args.hypotheses) as hypotheses:
for i, (reference_line, hypothesis_line) in enumerate(zip(reference, hypotheses), 1):
reference = reference_line.strip()
# Expects a JSON object with keys containing at least 'translations',
# as returned by sockeye.translate's nbest output
hypotheses = json.loads(hypothesis_line.strip())
utils.check_condition('translations' in hypotheses,
"Reranking requires nbest JSON input with 'translations' key present.")
num_hypotheses = len(hypotheses['translations'])
if not num_hypotheses > 1:
logger.info("Line %d contains %d hypotheses. Nothing to rerank.", i, num_hypotheses)
reranked_hypotheses = hypotheses
else:
reranked_hypotheses = reranker.rerank(hypotheses, reference)
if args.output_best:
pad_to_multiple_of: Optional[int] = None,
num_pointers: int = 0) -> Vocab:
"""
Creates vocabulary from paths to a file in sentence-per-line format. A sentence is just a whitespace delimited
list of tokens. Note that special symbols like the beginning of sentence (BOS) symbol will be added to the
vocabulary.
:param paths: List of paths to files with one sentence per line.
:param num_words: Optional maximum number of words in the vocabulary.
:param min_count: Minimum occurrences of words to be included in the vocabulary.
:param pad_to_multiple_of: If not None, pads the vocabulary to a size that is the next multiple of this int.
:return: Word-to-id mapping.
"""
with ExitStack() as stack:
logger.info("Building vocabulary from dataset(s): %s", paths)
files = (stack.enter_context(utils.smart_open(path)) for path in paths) # pylint: disable=no-member
return build_vocab(chain(*files), num_words, min_count, pad_to_multiple_of, num_pointers)
def read_content(path: str, limit: Optional[int] = None) -> Iterator[List[str]]:
    """
    Lazily yields the token list of each sentence in a file.

    :param path: Path to a file with one whitespace-tokenized sentence per line.
    :param limit: If given, stop after this many lines have been yielded.
    :return: Iterator over per-line lists of tokens.
    """
    with smart_open(path) as data_in:
        lines_read = 0
        for sentence in data_in:
            # Stop once exactly `limit` lines have been produced.
            if limit is not None and lines_read == limit:
                break
            lines_read += 1
            yield list(get_tokens(sentence))