How to use the dataset.ClusteredBatcher function in dataset

To help you get started, we’ve selected a few dataset.ClusteredBatcher examples, based on popular ways it is used in public projects.

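Across these examples the constructor follows one pattern: ClusteredBatcher(batch_size, sort_key, flag, flag). Examples are ordered by a length-derived key (ContextLenKey, ContextLenBucketedKey, NParagraphsSortKey) so each batch holds similarly sized items and padding stays small; the two trailing booleans appear to control batch shuffling and truncation, so check docqa's dataset module for their exact semantics. The sketch below is a minimal, hypothetical reimplementation of the idea, not document-qa's code:

import random

def clustered_batches(examples, batch_size, sort_key, shuffle=True):
    # Hypothetical sketch of length-clustered batching, not docqa's code.
    # Sorting by the key puts similarly sized examples in the same batch
    # (less padding); shuffling batch order keeps training varied.
    ordered = sorted(examples, key=sort_key)
    batches = [ordered[i:i + batch_size]
               for i in range(0, len(ordered), batch_size)]
    if shuffle:
        random.shuffle(batches)
    return batches

# Toy usage: cluster strings by length, three per batch.
for batch in clustered_batches(["a", "bb", "ccc", "dddd", "ee", "f"], 3, len):
    print(batch)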

github allenai / document-qa / experimental / squad_text_labels.py
def test_build_training_data():
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    data = PreprocessedData(SquadCorpus(),
                            TagTextAnswers(),
                            ParagraphAndQuestionDatasetBuilder(train_batching, eval_batching),
                            eval_on_verified=False,
                            sample=20, sample_dev=20
                            # sample_dev=100, sample=100, eval_on_verified=False
                            )
    data.preprocess()
    data = data.get_train()
    for batch in data.get_epoch():
        for x in batch:
            print(x.answer.answer_spans.shape)
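
Running this snippet outside its original file also needs document-qa's imports. The module paths below are assumptions based on the package layout these scripts came from; verify them against your checkout, since modules have moved between versions.

# Assumed import paths (unverified; check your allenai/document-qa checkout):
from docqa.dataset import ClusteredBatcher
from docqa.data_processing.preprocessed_corpus import PreprocessedData
from docqa.data_processing.qa_training_data import (
    ContextLenBucketedKey, ContextLenKey, ParagraphAndQuestionDatasetBuilder)
from docqa.squad.squad_data import SquadCorpus
# TagTextAnswers ships with the same package; its module path is omitted here.
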
github allenai / document-qa / experimental / tmp.py
def show():
    stop = NltkPlusStopWords(True)
    prep = ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 3),
                                  WithIndicators(True, True), intern=True)
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    builder = ParagraphAndQuestionsBuilder(train_batching, eval_batching)
    data = PreprocessedData(TriviaQaWebDataset(), prep, builder, eval_on_verified=False,
                            sample_dev=20, sample=100)
    data.preprocess(1)

    for batch in list(data.get_train().get_epoch())[:10]:
        for point in batch:
            print(" ".join(point.question))
            print(point.answer.answer_text)
            context = list(point.get_context())
            for s, e in point.answer.answer_spans:
                context[s] = "{{" + context[s]
                context[e] = context[e] + "}}"
            print(" ".join(context))
            input()
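
The {{ }} markup in show() is a handy trick on its own: it makes answer spans visible in plain-text output. A standalone version (mark_spans is a hypothetical helper, not part of document-qa):

def mark_spans(tokens, spans, open_tok="{{", close_tok="}}"):
    # Wrap each inclusive (start, end) token span in visible markers,
    # exactly as show() does above.
    tokens = list(tokens)
    for s, e in spans:
        tokens[s] = open_tok + tokens[s]
        tokens[e] = tokens[e] + close_tok
    return " ".join(tokens)

print(mark_spans("the cat sat on the mat".split(), [(1, 2)]))
# -> the {{cat sat}} on the mat
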
github allenai / document-qa / train_squad / train_text_answers.py
        # ... (start of the model definition omitted in this excerpt)
        predictor=WithFixedContextPredictionLayer(
            # BiRecurrentMapper(GruCellSpec(40)),
            ResidualLayer(BiRecurrentMapper(GruCellSpec(80))),
            AttentionEncoder(post_process=MapperSeq(FullyConnected(25, activation="tanh"), DropoutLayer(0.8))),
            WithProjectedProduct(include_tiled=True),
            ChainBiMapper(
                first_layer=BiRecurrentMapper(GruCellSpec(80)),
                second_layer=BiRecurrentMapper(GruCellSpec(80))
            ),
            aggregate="sum"
        )
    )
    with open(__file__, "r") as f:
        notes = f.read()

    train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = PreprocessedData(SquadCorpus(),
                            TagTextAnswers(),
                            ParagraphAndQuestionDatasetBuilder(train_batching, eval_batching),
                            # sample=20, sample_dev=20,
                            eval_on_verified=False)
    data.preprocess()

    eval = [LossEvaluator(), BoundedSquadSpanEvaluator(bound=[17])]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, False)
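
One detail worth copying from these training scripts: each one reads its own source and passes it to the trainer as notes, so the exact configuration that produced a model is stored with it.

# Self-documenting pattern used throughout these scripts: save the
# script's own source alongside the trained model for reproducibility.
with open(__file__, "r") as f:
    notes = f.read()
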
github allenai / document-qa / train_triviaqa / train_open_qa.py
                                        # ... (earlier layers of the model definition omitted in this excerpt)
                                        )),
                                        VariationalDropoutLayer(0.8)),
        predictor=ConfidencePredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum"
        )
    )
    with open(__file__, "r") as f:
        notes = f.read()

    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    builder = RandomParagraphsBuilder(train_batching, eval_batching, 0.5)
    prep = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400),
                                             ShallowOpenWebRanker(12),
                                             intern=True,
                                             require_an_answer=True)
    data = PreprocessedData(TriviaQaWebDataset(), prep, builder, eval_on_verified=False)
    eval = [LossEvaluator(), ConfidenceEvaluator(8)]

    data.preprocess(6, 1000)
    data.cache_preprocess("tfidf-open-top12.pkl.gz")
    # data.load_preprocess("tfidf-open-top12.pkl.gz")

    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes)
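
Note the caching pattern here: preprocessing is expensive, so the first run writes the result to disk and later runs replace the preprocess/cache pair with a single load (the .pkl.gz filename is the script's own choice; any path works):

# First run: preprocess with 6 workers in chunks of 1000, then cache.
data.preprocess(6, 1000)
data.cache_preprocess("tfidf-open-top12.pkl.gz")

# Later runs: comment out the two lines above and load the cache instead.
data.load_preprocess("tfidf-open-top12.pkl.gz")
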
github allenai / document-qa / train_squad / train2.py
        # ... (start of the model definition omitted in this excerpt)
        predictor=WithFixedContextPredictionLayer(
            ResidualLayer(recurrent_layer),
            AttentionEncoder(post_process=MapperSeq(FullyConnected(25, activation="tanh"), DropoutLayer(0.8))),
            WithProjectedProduct(include_tiled=True),
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer
            ),
            IndependentBoundsJointLoss()
        )
    )
    with open(__file__, "r") as f:
        notes = f.read()

    train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(), None, train_batching, eval_batching)

    eval = [LossEvaluator(), SpanProbability(), BoundedSquadSpanEvaluator(bound=[17])]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, False)
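
The SQuAD scripts wire the same pair of batchers in through two different entry points. Both calls below are taken from the snippets on this page: PreprocessedData when a preprocessing step (here TagTextAnswers) is needed, and DocumentQaTrainingData when the batchers can be passed straight through.

# With a preprocessing step (train_text_answers.py above):
data = PreprocessedData(SquadCorpus(), TagTextAnswers(),
                        ParagraphAndQuestionDatasetBuilder(train_batching, eval_batching),
                        eval_on_verified=False)

# Without one (train2.py above):
data = DocumentQaTrainingData(SquadCorpus(), None, train_batching, eval_batching)
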
github allenai / document-qa / train_squad / train1.py
                                            # ... (earlier layers of the model definition omitted in this excerpt)
                                            StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                                            FullyConnected(dim * 2, activation="relu")
                                        )),
                                        VariationalDropoutLayer(0.8)),
        predictor=BoundsPredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer
            ),
        )
    )

    with open(__file__, "r") as f:
        notes = f.read()

    train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(), None, train_batching, eval_batching)

    eval = [LossEvaluator(), SpanProbability(), BoundedSquadSpanEvaluator(bound=[17])]
    trainer.start_training(data, model, train_params, eval, trainer.ModelDir(out), notes)
github allenai / document-qa / experimental / train_paragraph_selection / train_context_selection.py
            # ... (start of the model definition omitted in this excerpt)
            DropoutLayer(0.8),
            BiRecurrentMapper(GruCellSpec(50)),
            FullyConnected(30, activation="tanh")
        ),
        merge_with_features=ConcatLayer(),
        map_joint=NullMapper(),
        encode_joint_features=RecurrentEncoder(GruCellSpec(25), None),
        process=SequenceMapperSeq(BiRecurrentMapper(GruCellSpec(25)), FullyConnected(10)),
        predictor=SoftmaxPrediction(),
        any_features=True
    )

    with open(__file__, "r") as f:
        notes = f.read()

    train_batching = ClusteredBatcher(45, NParagraphsSortKey(), True, False)
    eval_batching = ClusteredBatcher(45, NParagraphsSortKey(), False, False)
    data = PreprocessedData(
        TriviaQaWebDataset(), fe,
        SelectionWithContextDatasetBuilder(train_batching, eval_batching),
        eval_on_verified=False,
        hold_out_train=(0, 5000),
        # sample=200, sample_dev=200,
    )

    # data.preprocess(8, chunk_size=1000)
    # data.cache_preprocess("unigram-para-held-out.pkl")
    data.load_preprocess("unigram-para-held-out.pkl")

    eval = [LossEvaluator(), AnyTopNEvaluator([1, 2, 3, 4]),
            PercentAnswerEvaluator([1, 2, 3, 4]), TotalAnswersEvaluator([1, 2, 3, 4])]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, False)
github allenai / document-qa / train_triviaqa / train_web2.py
            # ... (start of the predictor definition omitted in this excerpt)
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            span_predictor=IndependentBoundsNoAnswerOption()
        )
    )

    with open(__file__, "r") as f:
        notes = f.read()

    stop = NltkPlusStopWords(True)
    prep = ExtractMultiParagraphs(MergeParagraphs(400), TopTfIdf(stop, 4),
                                  model.preprocessor, intern=True, require_an_answer=True)

    eval_batching = ClusteredBatcher(150, ContextLenKey(), False, False)
    eval_builder = RandomParagraphsBuilder(eval_batching, eval_batching, 0.5)
    train_builder = StratifyParagraphSetsBuilder(35, 35, True, True)

    data = PreprocessedData(TriviaQaWebDataset(), prep, train_builder, eval_builder, eval_on_verified=False)
    data.preprocess(6, 1000)
    eval = [LossEvaluator(), ConfidenceEvaluator(8)]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, None)
github allenai / document-qa / train_triviaqa / train_web.py
                # ... (start of the predictor definition omitted in this excerpt)
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum"
        )
    )

    with open(__file__, "r") as f:
        notes = f.read()

    prep = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400), ShallowOpenWebRanker(16),
                                             model.preprocessor, intern=True, require_an_answer=True)

    eval_batching = ClusteredBatcher(180, ContextLenKey(), False, True)
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, True)

    eval_builder = RandomParagraphsBuilder(eval_batching, eval_batching, 0.5, 2)
    train_builder = StratifyParagraphsBuilder(train_batching, 2)

    data = PreprocessedData(TriviaQaWebDataset(), prep, train_builder, eval_builder, eval_on_verified=False)
    data.preprocess(6, 1000)
    eval = [LossEvaluator(), ConfidenceEvaluator(8)]
    trainer.start_training(data, model, train_params, eval, model_dir.ModelDir(out), notes, None)