How to use the sockeye.utils.smart_open function in sockeye

To help you get started, we’ve selected a few sockeye examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: awslabs/sockeye, file test/unit/test_utils.py (view on GitHub)
def test_smart_open_without_suffix():
    """
    Checks that utils.smart_open can read a file whose name has no suffix.

    The same path is written twice via _touch_file, first with compressed=True
    and then with compressed=False; each time the file is expected to contain
    exactly 10 lines when read back through utils.smart_open.
    """
    with TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'test')
        # Order matters: the compressed variant is written and checked first,
        # then the same path is overwritten with the uncompressed variant.
        for is_compressed in (True, False):
            _touch_file(path, compressed=is_compressed, empty=False)
            with utils.smart_open(path) as handle:
                lines = handle.readlines()
            assert len(lines) == 10
Source: awslabs/sockeye, file sockeye/rerank.py (view on GitHub)
def rerank(args: argparse.Namespace):
    """
    Reranks a list of hypotheses according to a sentence-level metric.
    Writes all output to STDOUT.

    :param args: Namespace object holding CLI arguments.
    """
    reranker = Reranker(args.metric, args.return_score)

    with utils.smart_open(args.reference) as reference, utils.smart_open(args.hypotheses) as hypotheses:
        for i, (reference_line, hypothesis_line) in enumerate(zip(reference, hypotheses), 1):
            reference = reference_line.strip()
            # Expects a JSON object with keys containing at least 'translations',
            # as returned by sockeye.translate's nbest output
            hypotheses = json.loads(hypothesis_line.strip())
            utils.check_condition('translations' in hypotheses,
                                  "Reranking requires nbest JSON input with 'translations' key present.")
            num_hypotheses = len(hypotheses['translations'])

            if not num_hypotheses > 1:
                logger.info("Line %d contains %d hypotheses. Nothing to rerank.", i, num_hypotheses)
                reranked_hypotheses = hypotheses
            else:
                reranked_hypotheses = reranker.rerank(hypotheses, reference)

            if args.output_best:
Source: awslabs/sockeye, file sockeye/vocab.py (view on GitHub)
pad_to_multiple_of: Optional[int] = None,
                     num_pointers: int = 0) -> Vocab:
    """
    Creates vocabulary from paths to a file in sentence-per-line format. A sentence is just a whitespace delimited
    list of tokens. Note that special symbols like the beginning of sentence (BOS) symbol will be added to the
    vocabulary.

    :param paths: List of paths to files with one sentence per line.
    :param num_words: Optional maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :param pad_to_multiple_of: If not None, pads the vocabulary to a size that is the next multiple of this int.
    :return: Word-to-id mapping.
    """
    with ExitStack() as stack:
        logger.info("Building vocabulary from dataset(s): %s", paths)
        files = (stack.enter_context(utils.smart_open(path)) for path in paths)  # pylint: disable=no-member
        return build_vocab(chain(*files), num_words, min_count, pad_to_multiple_of, num_pointers)
Source: Cartus/DCGCN, file sockeye/data_io.py (view on GitHub)
def read_content(path: str, limit: Optional[int] = None) -> Iterator[List[str]]:
    """
    Lazily yields the tokens of each line read from path.

    Iteration stops as soon as the zero-based line index equals limit, so a
    limit of 0 yields nothing and a limit of None reads every line.

    :param path: Path to the file containing sentences, opened via smart_open.
    :param limit: Number of lines to read from path, or None for no limit.
    :return: Iterator over lists of tokens, one list per line.
    """
    with smart_open(path) as source:
        for index, raw_line in enumerate(source):
            if limit is None or index != limit:
                yield list(get_tokens(raw_line))
            else:
                # Limit reached: finish the generator (the file is closed on exit).
                return