Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_index_joining_non_overlapping():
    """Join on the index when the second frame has one extra trailing row.

    The comparison as a whole is expected not to match, while the columns and
    the overlapping rows do; the extra row must be reported as unique to df2.
    """
    left = pd.DataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}])
    right = pd.DataFrame(
        [
            {"a": "hi", "b": 2},
            {"a": "bye", "b": 2},
            {"a": "back fo mo", "b": 3},
        ]
    )
    result = datacompy.Compare(left, right, on_index=True)

    assert not result.matches()
    assert result.all_columns_match()
    assert result.intersect_rows_match()

    # Row bookkeeping: nothing is unique to df1, exactly one row is unique to df2.
    assert result.df1_unq_rows.shape[0] == 0
    assert result.df2_unq_rows.shape[0] == 1
    assert result.df2_unq_rows["a"].tolist() == ["back fo mo"]
def test_compare_df_setter_good():
    """Check the frames and join columns stored on Compare for valid input.

    The second frame uses upper-case column names, and the join columns are
    passed in mixed case; the stored ``join_columns`` are expected in lower case.
    """
    left = pd.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}])
    right = pd.DataFrame([{"A": 1, "B": 2}, {"A": 2, "B": 3}])

    # Each case: (join columns as passed in, join columns expected on the object).
    cases = (
        (["a"], ["a"]),
        (["A", "b"], ["a", "b"]),
    )
    for requested, expected in cases:
        result = datacompy.Compare(left, right, requested)
        assert result.df1.equals(left)
        assert result.df2.equals(right)
        assert result.join_columns == expected
def test_index_with_joins_with_ignore_spaces():
    """Compare frames whose string values differ only by surrounding spaces.

    With ``ignore_spaces=False`` (joined on the index) the overlapping rows are
    expected to differ; with ``ignore_spaces=True`` (joined on column "a") the
    comparison is expected to match completely.
    """
    left = pd.DataFrame([{"a": 1, "b": " A"}, {"a": 2, "b": "A"}])
    right = pd.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A "}])

    strict = datacompy.Compare(left, right, on_index=True, ignore_spaces=False)
    assert not strict.matches()
    assert strict.all_columns_match()
    assert strict.all_rows_overlap()
    assert not strict.intersect_rows_match()

    lenient = datacompy.Compare(left, right, "a", ignore_spaces=True)
    # Every check must pass once whitespace is ignored.
    for check in (
        lenient.matches,
        lenient.all_columns_match,
        lenient.all_rows_overlap,
        lenient.intersect_rows_match,
    ):
        assert check()
# NOTE(review): the statements below look like the body of a test whose `def`
# line is not visible here (presumably one exercising `cast_column_names_lower`)
# -- confirm against the original test module before relying on them.
# should match: the frames differ only in the case of the "b"/"B" column name
# and Compare is called with its default arguments
df1 = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
df2 = pd.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]})
compare = datacompy.Compare(df1, df2, join_columns=["a"])
assert compare.matches()
# should not match: same frames, but cast_column_names_lower=False is passed
df1 = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
df2 = pd.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]})
compare = datacompy.Compare(df1, df2, join_columns=["a"], cast_column_names_lower=False)
assert not compare.matches()
# test join column
# should match: here the join column name itself differs in case ("a" vs "A")
df1 = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
df2 = pd.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]})
compare = datacompy.Compare(df1, df2, join_columns=["a"])
assert compare.matches()
# should fail because "a" is not found in df2
df1 = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
df2 = pd.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]})
expected_message = "df2 must have all columns from join_columns"
# The constructor call on the next line is the statement expected to raise.
with raises(ValueError, match=expected_message):
compare = datacompy.Compare(df1, df2, join_columns=["a"], cast_column_names_lower=False)
def test_dupes_with_nulls():
    """Compare frames with duplicated join keys and NaN in a join column.

    Both ``fld_1`` and ``fld_2`` are join columns; the first frame repeats some
    ``fld_1`` keys and ``fld_2`` is NaN for every row but the first. The
    comparison is expected to report ``subset()`` as true.
    """
    missing = np.nan
    left = pd.DataFrame(
        {
            "fld_1": [1, 2, 2, 3, 3, 4, 5, 5],
            "fld_2": ["A"] + [missing] * 7,
        }
    )
    right = pd.DataFrame(
        {
            "fld_1": [1, 2, 3, 4, 5],
            "fld_2": ["A"] + [missing] * 4,
        }
    )
    result = datacompy.Compare(left, right, join_columns=["fld_1", "fld_2"])
    assert result.subset()
def test_index_with_joins_with_ignore_case():
    """Compare frames whose string values differ only by letter case.

    With ``ignore_case=False`` (joined on the index) the overlapping rows are
    expected to differ; with ``ignore_case=True`` (joined on column "a") the
    comparison is expected to match completely.
    """
    left = pd.DataFrame([{"a": 1, "b": "a"}, {"a": 2, "b": "A"}])
    right = pd.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "a"}])

    strict = datacompy.Compare(left, right, on_index=True, ignore_case=False)
    assert not strict.matches()
    assert strict.all_columns_match()
    assert strict.all_rows_overlap()
    assert not strict.intersect_rows_match()

    lenient = datacompy.Compare(left, right, "a", ignore_case=True)
    # Every check must pass once letter case is ignored.
    for check in (
        lenient.matches,
        lenient.all_columns_match,
        lenient.all_rows_overlap,
        lenient.intersect_rows_match,
    ):
        assert check()
def test_simple_dupes_one_field_two_vals():
    """Compare two identical frames whose join column "a" holds a duplicated key.

    Both rows share ``a == 1`` and differ only in "b"; the comparison is
    expected to match, and rendering the report must not raise.
    """
    df1 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}])
    df2 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}])
    compare = datacompy.Compare(df1, df2, join_columns=["a"])
    assert compare.matches()
    # Just render the report to make sure it renders; the returned value is
    # deliberately not bound because nothing inspects it.
    compare.report()
def test_compare_df_setter_bad():
    """Check the errors raised by Compare for invalid frames or join columns.

    The constructor results inside the ``raises`` blocks are not bound to a
    name: each call is expected to raise, so a binding would never be used.
    """
    # Columns "a" and "A" are distinct here but differ only by case.
    df = pd.DataFrame([{"a": 1, "A": 2}, {"a": 2, "A": 2}])

    with raises(TypeError, match="df1 must be a pandas DataFrame"):
        datacompy.Compare("a", "a", ["a"])
    with raises(ValueError, match="df1 must have all columns from join_columns"):
        datacompy.Compare(df, df.copy(), ["b"])
    with raises(ValueError, match="df1 must have unique column names"):
        datacompy.Compare(df, df.copy(), ["a"])

    # Duplicated values in a join column are accepted; df1 is stored unchanged.
    df_dupe = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 3}])
    assert datacompy.Compare(df_dupe, df_dupe.copy(), ["a", "b"]).df1.equals(df_dupe)
def test_compare_df_setter_bad_index():
    """Check the errors raised by Compare for invalid frames when joining on the index.

    This test was previously defined twice with identical bodies, so the second
    definition silently replaced the first; a single definition is kept.
    """
    # Columns "a" and "A" are distinct here but differ only by case.
    df = pd.DataFrame([{"a": 1, "A": 2}, {"a": 2, "A": 2}])

    # Each constructor call is expected to raise, so its result is not bound.
    with raises(TypeError, match="df1 must be a pandas DataFrame"):
        datacompy.Compare("a", "a", on_index=True)
    with raises(ValueError, match="df1 must have unique column names"):
        datacompy.Compare(df, df.copy(), on_index=True)