How to use the pandera.Column class in pandera

To help you get started, we’ve selected a few pandera examples, based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pandera-dev / pandera / tests / test_pandera.py View on Github external
def test_dataframe_schema():
    """Build a ``DataFrameSchema`` with one checked ``Column`` per dtype.

    The checks on columns "a", "b", "g" and "i" are created with
    ``element_wise=True``; the checks on "c", "d", "e" and "f" do not set
    ``element_wise``.
    """
    columns = {
        # Checks created with element_wise=True.
        "a": Column(Int, Check(lambda v: v > 0, element_wise=True)),
        "b": Column(Float, Check(lambda v: 0 <= v <= 10, element_wise=True)),
        # Checks created without element_wise.
        "c": Column(String, Check(lambda s: set(s) == {"x", "y", "z"})),
        "d": Column(Bool, Check(lambda s: s.mean() > 0.5)),
        "e": Column(Category, Check(lambda s: set(s) == {"c1", "c2", "c3"})),
        "f": Column(Object, Check(lambda s: s.isin([(1,), (2,), (3,)]))),
        # Temporal columns, again with element_wise=True.
        "g": Column(
            DateTime,
            Check(lambda v: v >= pd.Timestamp("2015-01-01"), element_wise=True),
        ),
        "i": Column(
            Timedelta,
            Check(lambda v: v < pd.Timedelta(10, unit="D"), element_wise=True),
        ),
    }
    schema = DataFrameSchema(columns)
github pandera-dev / pandera / tests / test_pandera.py View on Github external
def test_coerce_dtype():
    """Check that ``coerce=True`` yields the dtypes declared in the schema.

    The same frame is validated against two schemas: one that sets
    ``coerce=True`` on every ``Column`` and one that sets it once on the
    ``DataFrameSchema``. For both, the validated "column1" and "column2"
    must carry the ``Int`` and ``DateTime`` dtypes and every value of
    "column3" must be a ``str``.
    """
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, 3],
        # column4 has no entry in either schema below.
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # `Series.iteritems()` was removed in pandas 2.0; `Series.items()`
        # yields the same (index, value) pairs and also exists in older
        # pandas releases.
        for _, x in result.column3.items():
            assert isinstance(x, str)
github pandera-dev / pandera / tests / test_dtypes.py View on Github external
def test_category_dtype():
    schema = DataFrameSchema(
        columns={
            "col": Column(
                dtypes.Category,
                checks=[
                    Check(lambda s: set(s) == {"A", "B", "C"}),
                    Check(lambda s:
                          s.cat.categories.tolist() == ["A", "B", "C"]),
                    Check(lambda s: s.isin(["A", "B", "C"]))
                ],
                nullable=False
            ),
        },
        coerce=False
    )
    validated_df = schema.validate(
        pd.DataFrame(
            {"col": pd.Series(["A", "B", "A", "B", "C"], dtype="category")}
        )
github pandera-dev / pandera / tests / test_pandera.py View on Github external
def test_check_groupby_multiple_columns():
    """Validate a frame whose "col1" check is grouped by "col2" and "col3".

    The check looks up the ``("bar", True)`` group and requires its sum to
    be 16; validation must hand back a ``DataFrame`` with exactly the three
    schema columns.
    """
    expected_columns = {"col1", "col2", "col3"}

    frame = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })

    # Rows with col2 == "bar" and col3 == True hold 7 and 9 -> 7 + 9 == 16.
    grouped_sum_check = Check(
        lambda groups: groups[("bar", True)].sum() == 16,
        groupby=["col2", "col3"],
    )
    membership_check = Check(lambda values: values.isin(["foo", "bar"]))

    schema = DataFrameSchema({
        "col1": Column(Int, [grouped_sum_check]),
        "col2": Column(String, membership_check),
        "col3": Column(Bool),
    })

    validated = schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 3
    assert set(validated.columns) == expected_columns
github pandera-dev / pandera / tests / test_pandera.py View on Github external
def test_multi_index_columns():
    """Validate a frame whose column labels are two-level tuples.

    Each schema key is a ``(level0, level1)`` tuple that matches a column
    of the frame built below; validation must return a ``DataFrame``.
    """
    schema = DataFrameSchema({
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        # `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0;
        # `pd.Timestamp` performs the same comparison on every version.
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp("2019-01-01")))
    })
    validated_df = schema.validate(
        pd.DataFrame({
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4)
        })
    )
    assert isinstance(validated_df, pd.DataFrame)
github pandera-dev / pandera / tests / test_hypotheses.py View on Github external
def test_dataframe_hypothesis_checks():

    df = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    hypothesis_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                relationship_kwargs={"alpha": 0.5},
            ),
            # one-sample test
            Hypothesis(
                test=stats.ttest_1samp,
                samples=["col1"],
                relationship=lambda stat, pvalue, alpha=0.01: (
github pandera-dev / pandera / tests / test_decorators.py View on Github external
    # Schema handed to `check_output`: a single Float column named "column2".
    column2_float_schema = DataFrameSchema({"column2": Column(Float)})

    @check_output(column2_float_schema)
    def test_func(df):
        """Return ``df`` unchanged."""
        return df
github pandera-dev / pandera / tests / test_pandera.py View on Github external
def test_dtypes():
    for dtype in [
            dtypes.Float,
            dtypes.Float16,
            dtypes.Float32,
            dtypes.Float64]:
        schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
        validated_df = schema.validate(
            pd.DataFrame(
                {"col": [-123.1, -7654.321, 1.0, 1.1, 1199.51, 5.1, 4.6]},
                dtype=dtype.value))
        assert isinstance(validated_df, pd.DataFrame)

    for dtype in [
            dtypes.Int,
            dtypes.Int8,
            dtypes.Int16,
            dtypes.Int32,
            dtypes.Int64]:
        schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
        validated_df = schema.validate(
            pd.DataFrame(
                {"col": [-712, -4, -321, 0, 1, 777, 5, 123, 9000]},