How to use the kaggle.corpus.Data function in kaggle

To help you get started, we’ve selected a few kaggle examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github maigimenez / jon-siamese / kaggle / corpus.py View on Github external
'is_duplicate'].values.tolist()
        assert len(sim_questions) == len(sim_tags)
        assert len(set(sim_tags)) == 1
        assert next(iter(set(sim_tags))) == 1

        # Save each pair of similar questions within the data class
        for question, tag in zip(sim_questions, sim_tags):
            if preprocess:
                q1 = preprocess_sentence(question[1])
                q2 = preprocess_sentence(question[2])
            else:
                q1 = question[1]
                q2 = question[2]

            if q1 and q2:
                self._sim_data.append(Data(question[0], q1, q2, tag, [0, 1]))
github maigimenez / jon-siamese / kaggle / corpus.py View on Github external
def load_kaggle(self, preprocess):
        """Load the Kaggle CSV at ``self._corpora_path`` into this object.

        The file is read with pandas (first row used as the header), stored
        in ``self._data_frame``, and its column keys are printed.

        * When ``self._partition == 'test'``, ``self._test_data`` is rebuilt
          with one ``Data(test_id, question1, question2)`` per row. A question
          that is not a ``str`` is stored as ``""`` (presumably these are the
          NaN values pandas yields for empty cells -- confirm). Rows where
          neither question is a ``str`` are not stored; their id is printed.
        * Otherwise, loading is delegated to ``self.load_sim_quora`` and
          ``self.load_non_sim_quora``.

        Args:
            preprocess: when truthy, every question that is present is passed
                through ``preprocess_sentence`` before being stored.

        Raises:
            AssertionError: if the test partition does not yield exactly
                2345796 stored pairs.
        """
        self._data_frame = pd.read_csv(self._corpora_path, header=0)
        print(self._data_frame.keys())

        if self._partition != 'test':
            self.load_sim_quora(preprocess=preprocess)
            self.load_non_sim_quora(preprocess=preprocess)
            return

        self._test_data = []
        columns = ['test_id', 'question1', 'question2']
        for q_id, q1, q2 in self._data_frame[columns].values:
            has_q1 = isinstance(q1, str)
            has_q2 = isinstance(q2, str)
            if not (has_q1 or has_q2):
                # Nothing usable in this row: report its id and skip it.
                print(q_id)
                continue

            # Preprocess whichever question is present so the stored pairs
            # are uniform; previously a lone question was stored raw even
            # when preprocessing had been requested.
            if not has_q1:
                q1 = ""
            elif preprocess:
                q1 = preprocess_sentence(q1)

            if not has_q2:
                q2 = ""
            elif preprocess:
                q2 = preprocess_sentence(q2)

            self._test_data.append(Data(q_id, q1, q2))

        # Sanity check on the number of stored pairs. NOTE(review): 2345796
        # is presumably the row count of the Kaggle test file -- confirm.
        assert len(self._test_data) == 2345796, \
            "unexpected number of test pairs: %d" % len(self._test_data)
github maigimenez / jon-siamese / kaggle / corpus.py View on Github external
assert len(set(non_sim_tags)) == 1
        assert next(iter(set(non_sim_tags))) == 0

        # Save each pair of non similar questions within the data class
        for question, tag in zip(non_sim_questions, non_sim_tags):
            # There are two errors in the dataset: ids 105796 and 201871
            # don't have a pair of questions.
            # The `if q1 and q2` check below prevents storing these entries.
            if preprocess:
                q1 = preprocess_sentence(question[1])
                q2 = preprocess_sentence(question[2])
            else:
                q1 = question[1] if isinstance(question[1], str) else None
                q2 = question[2] if isinstance(question[2], str) else None
            if q1 and q2:
                self._non_sim_data.append(Data(question[0], q1, q2, tag, [1, 0]))
github maigimenez / jon-siamese / kaggle / corpus.py View on Github external
def load_kaggle(self, preprocess):
        """Load the Kaggle CSV at ``self._corpora_path`` into this object.

        The file is read with pandas (first row used as the header), stored
        in ``self._data_frame``, and its column keys are printed.

        * When ``self._partition == 'test'``, ``self._test_data`` is rebuilt
          with one ``Data(test_id, question1, question2)`` per row. A question
          that is not a ``str`` is stored as ``""`` (presumably these are the
          NaN values pandas yields for empty cells -- confirm). Rows where
          neither question is a ``str`` are not stored; their id is printed.
        * Otherwise, loading is delegated to ``self.load_sim_quora`` and
          ``self.load_non_sim_quora``.

        Args:
            preprocess: when truthy, every question that is present is passed
                through ``preprocess_sentence`` before being stored.

        Raises:
            AssertionError: if the test partition does not yield exactly
                2345796 stored pairs.
        """
        self._data_frame = pd.read_csv(self._corpora_path, header=0)
        print(self._data_frame.keys())

        if self._partition != 'test':
            self.load_sim_quora(preprocess=preprocess)
            self.load_non_sim_quora(preprocess=preprocess)
            return

        self._test_data = []
        columns = ['test_id', 'question1', 'question2']
        for q_id, q1, q2 in self._data_frame[columns].values:
            has_q1 = isinstance(q1, str)
            has_q2 = isinstance(q2, str)
            if not (has_q1 or has_q2):
                # Nothing usable in this row: report its id and skip it.
                print(q_id)
                continue

            # Preprocess whichever question is present so the stored pairs
            # are uniform; previously a lone question was stored raw even
            # when preprocessing had been requested.
            if not has_q1:
                q1 = ""
            elif preprocess:
                q1 = preprocess_sentence(q1)

            if not has_q2:
                q2 = ""
            elif preprocess:
                q2 = preprocess_sentence(q2)

            self._test_data.append(Data(q_id, q1, q2))

        # Sanity check on the number of stored pairs. NOTE(review): 2345796
        # is presumably the row count of the Kaggle test file -- confirm.
        assert len(self._test_data) == 2345796, \
            "unexpected number of test pairs: %d" % len(self._test_data)
github maigimenez / jon-siamese / kaggle / corpus.py View on Github external
def load_kaggle(self, preprocess):
        """Load the Kaggle CSV at ``self._corpora_path`` into this object.

        The file is read with pandas (first row used as the header), stored
        in ``self._data_frame``, and its column keys are printed.

        * When ``self._partition == 'test'``, ``self._test_data`` is rebuilt
          with one ``Data(test_id, question1, question2)`` per row. A question
          that is not a ``str`` is stored as ``""`` (presumably these are the
          NaN values pandas yields for empty cells -- confirm). Rows where
          neither question is a ``str`` are not stored; their id is printed.
        * Otherwise, loading is delegated to ``self.load_sim_quora`` and
          ``self.load_non_sim_quora``.

        Args:
            preprocess: when truthy, every question that is present is passed
                through ``preprocess_sentence`` before being stored.

        Raises:
            AssertionError: if the test partition does not yield exactly
                2345796 stored pairs.
        """
        self._data_frame = pd.read_csv(self._corpora_path, header=0)
        print(self._data_frame.keys())

        if self._partition != 'test':
            self.load_sim_quora(preprocess=preprocess)
            self.load_non_sim_quora(preprocess=preprocess)
            return

        self._test_data = []
        columns = ['test_id', 'question1', 'question2']
        for q_id, q1, q2 in self._data_frame[columns].values:
            has_q1 = isinstance(q1, str)
            has_q2 = isinstance(q2, str)
            if not (has_q1 or has_q2):
                # Nothing usable in this row: report its id and skip it.
                print(q_id)
                continue

            # Preprocess whichever question is present so the stored pairs
            # are uniform; previously a lone question was stored raw even
            # when preprocessing had been requested.
            if not has_q1:
                q1 = ""
            elif preprocess:
                q1 = preprocess_sentence(q1)

            if not has_q2:
                q2 = ""
            elif preprocess:
                q2 = preprocess_sentence(q2)

            self._test_data.append(Data(q_id, q1, q2))

        # Sanity check on the number of stored pairs. NOTE(review): 2345796
        # is presumably the row count of the Kaggle test file -- confirm.
        assert len(self._test_data) == 2345796, \
            "unexpected number of test pairs: %d" % len(self._test_data)