From 5a4299cd0e63a96b764ade1ef7ef04bc4e33c96a Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 6 Jul 2022 17:02:51 +0200 Subject: [PATCH 1/2] feat: add a high level dataframe diff utility class --- owid/datautils/dataframes.py | 173 ++++++++++++++++++++++++++ tests/test_dataframes.py | 232 +++++++++++++++++++++++++++++++++++ 2 files changed, 405 insertions(+) diff --git a/owid/datautils/dataframes.py b/owid/datautils/dataframes.py index f5f6971..50bb629 100644 --- a/owid/datautils/dataframes.py +++ b/owid/datautils/dataframes.py @@ -218,6 +218,179 @@ def are_equal( return equal, compared +class DataFrameHighLevelDiff: + """Class for comparing two dataframes. + + It assumes that all nans are identical, and compares floats by means of certain absolute and relative tolerances. + Construct this class by passing two dataframes of possibly different shape. Then check the are_structurally_equal + property to see if the column and row sets of the two dataframes match and/or check the are_equal flag to also + check for equality of values. + + For cases where there is a difference, various member fields on this class give indications of what is different + (e.g. columns missing in dataframe 1 or 2, index values missing in dataframe 1 or 2, etc.). + + Parameters + ---------- + df1 : pd.DataFrame + First dataframe. + df2 : pd.DataFrame + Second dataframe. + absolute_tolerance : float + Absolute tolerance to assume in the comparison of each cell in the dataframes. A value a of an element in df1 is + considered equal to the corresponding element b at the same position in df2, if: + abs(a - b) <= absolute_tolerance + relative_tolerance : float + Relative tolerance to assume in the comparison of each cell in the dataframes. A value a of an element in df1 is + considered equal to the corresponding element b at the same position in df2, if: + abs(a - b) / abs(b) <= relative_tolerance + + """ + + df1: pd.DataFrame + df2: pd.DataFrame + columns_missing_in_df1: List[str] + columns_missing_in_df2: List[str] + index_columns_missing_in_df1: List[str] + index_columns_missing_in_df2: List[str] + index_values_missing_in_df1: pd.Index + index_values_missing_in_df2: pd.Index + duplicate_index_values_in_df1: pd.Series + duplicate_index_values_in_df2: pd.Series + value_differences: Optional[pd.DataFrame] + + def __init__( + self, + df1: pd.DataFrame, + df2: pd.DataFrame, + absolute_tolerance: float, + relative_tolerance: float, + ): + self.df1 = df1 + self.df2 = df2 + self.absolute_tolerance = absolute_tolerance + self.relative_tolerance = relative_tolerance + self.diff() + + @property + def columns_with_differences(self) -> Any: + """Return the columns that are different in the two dataframes. + + This will be an array of index values. If the index is a MultiIndex, the index values will be tuples. + """ + if self.value_differences is None: + return pd.array([]) + return self.value_differences.columns.values + + @property + def rows_with_differences(self) -> Any: + """Return the row indices that are different in the two dataframes. + + This will be an array of index values. If the index is a MultiIndex, the index values will be tuples. + """ + if self.value_differences is None: + return pd.array([]) + return self.value_differences.index.values + + def diff(self) -> None: + """Diff the two dataframes. + + This can be a somewhat slow operation + """ + self.columns_missing_in_df1 = sorted( + set(self.df2.columns) - set(self.df1.columns) + ) + self.columns_missing_in_df2 = sorted( + set(self.df1.columns) - set(self.df2.columns) + ) + self.index_columns_missing_in_df1 = sorted( + set(self.df2.index.names) - set(self.df1.index.names) + ) + self.index_columns_missing_in_df2 = sorted( + set(self.df1.index.names) - set(self.df2.index.names) + ) + self.index_values_missing_in_df1 = self.df2.index.difference(self.df1.index) + self.index_values_missing_in_df2 = self.df1.index.difference(self.df2.index) + self.duplicate_index_values_in_df1 = self.df1[ + self.df1.index.duplicated() + ].index.values + self.duplicate_index_values_in_df2 = self.df2[ + self.df2.index.duplicated() + ].index.values + if self.are_structurally_equal: + # We don't use the compare function here from above because it builds a new + # dataframe and we want to leave indices intact so we can know which rows and columns + # were different once we drop the ones with no differences + diffs = self.df1.eq(self.df2) + + # Eq above does not take tolerance into account so compare again with tolerance + # for columns that are numeric. this could probably be sped up with a check on any on + # the column first but would have to be benchmarked + for col in diffs.columns: + if (self.df1[col].dtype in (object, "category")) or ( + self.df2[col].dtype in (object, "category") + ): + # Apply a direct comparison for strings or categories + pass + else: + # For numeric data, consider them equal within certain absolute and relative tolerances. + compared_values = np.isclose( + self.df1[col].values, + self.df2[col].values, + atol=self.absolute_tolerance, + rtol=self.relative_tolerance, + ) + # Treat nans as equal. + compared_values[ + pd.isnull(self.df1[col].values) + & pd.isnull(self.df2[col].values) + ] = True + diffs[col] = compared_values + + # We now have a dataframe with the same shape and indices as df1 and df2, filled with + # True where the values are the same. We want to use true for different values, so invert + # element-wise now + diffs = ~diffs + + if diffs.empty: + self.value_differences = None + else: + # Get a copy of diffs with all rows dropped where all values in a row are False + # (i.e. where df1 and df2 have identical values for all columns) + rows_with_diffs = diffs[diffs.any(axis=1)] + if rows_with_diffs.empty or not rows_with_diffs.any().any(): + self.value_differences = None + else: + # Now figure out all columns where there is at least one difference + columns_with_diffs = diffs.any(axis=0) + if not columns_with_diffs.any(): + self.value_differences = None + else: + # Here we drop the columns that did not have differences. We are left with a dataframe + # with the original indices and only the rows and columns let with differences. + self.value_differences = rows_with_diffs.loc[ + :, columns_with_diffs + ] + + @property + def are_structurally_equal(self) -> bool: + """Check if the two dataframes are structurally equal (i.e. same columns, same index values, ...).""" + return not ( + any(self.columns_missing_in_df1) + or any(self.columns_missing_in_df2) + or any(self.index_columns_missing_in_df1) + or any(self.index_columns_missing_in_df2) + or any(self.index_values_missing_in_df1) + or any(self.index_values_missing_in_df2) + or any(self.duplicate_index_values_in_df1) + or any(self.duplicate_index_values_in_df2) + ) + + @property + def are_equal(self) -> bool: + """Check if the two dataframes are equal, both structurally and cell-wise.""" + return self.are_structurally_equal and self.value_differences is None + + def groupby_agg( df: pd.DataFrame, groupby_columns: Union[List[str], str], diff --git a/tests/test_dataframes.py b/tests/test_dataframes.py index e17196a..527c53d 100644 --- a/tests/test_dataframes.py +++ b/tests/test_dataframes.py @@ -223,6 +223,238 @@ def test_on_dataframes_with_object_columns_with_nans(self): )[0] +class TestDataFrameHighLevelDiff: + def test_simple_equal_dataframes_are_equal(self): + df = pd.DataFrame( + { + "year": [2001, 2003, 2003, 2003, 2002, 2002], + "value_01": [1, 2, 3, 4, 5, 6], + } + ) + diff = dataframes.DataFrameHighLevelDiff( + df, + df, + absolute_tolerance=1e-8, + relative_tolerance=0.5, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == False + assert any(diff.index_columns_missing_in_df1) == False + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == False + assert any(diff.index_values_missing_in_df2) == False + assert any(diff.duplicate_index_values_in_df1) == False + assert any(diff.duplicate_index_values_in_df2) == False + assert diff.are_structurally_equal == True + assert diff.are_equal + + def test_more_complex_equal_dataframes_are_equal(self): + df = pd.DataFrame( + { + "year": [2001, 2002, 2003, 2004, 2005, 2006] * 2, + "country": ["a"] * 6 + ["b"] * 6, + "value_01": [1, 2, 3, 4, 5, 6] * 2, + } + ) + df.set_index(["year", "country"], inplace=True) + diff = dataframes.DataFrameHighLevelDiff( + df, + df, + absolute_tolerance=1e-8, + relative_tolerance=0.5, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == False + assert any(diff.index_columns_missing_in_df1) == False + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == False + assert any(diff.index_values_missing_in_df2) == False + assert any(diff.duplicate_index_values_in_df1) == False + assert any(diff.duplicate_index_values_in_df2) == False + assert diff.are_structurally_equal == True + assert diff.are_equal + + def test_detects_duplicate_index_values(self): + df = pd.DataFrame( + { + "year": [2001] * 12, + "country": ["a"] * 6 + ["b"] * 6, + "value_01": [1, 2, 3, 4, 5, 6] * 2, + } + ) + df.set_index(["year", "country"], inplace=True) + diff = dataframes.DataFrameHighLevelDiff( + df, + df, + absolute_tolerance=1e-8, + relative_tolerance=0.5, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == False + assert any(diff.index_columns_missing_in_df1) == False + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == False + assert any(diff.index_values_missing_in_df2) == False + assert any(diff.duplicate_index_values_in_df1) == True + assert any(diff.duplicate_index_values_in_df2) == True + assert diff.are_structurally_equal == False + assert not diff.are_equal + + def test_detects_missing_index(self): + df = pd.DataFrame( + { + "year": [2001, 2002, 2003, 2004, 2005, 2006] * 2, + "country": ["a"] * 6 + ["b"] * 6, + "value_01": [1, 2, 3, 4, 5, 6] * 2, + } + ) + df2 = df.set_index(["year", "country"], inplace=False) + diff = dataframes.DataFrameHighLevelDiff( + df, + df2, + absolute_tolerance=1e-8, + relative_tolerance=0.5, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == True + assert any(diff.index_columns_missing_in_df1) == True + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == True + assert any(diff.index_values_missing_in_df2) == True + assert any(diff.duplicate_index_values_in_df1) == False + assert any(diff.duplicate_index_values_in_df2) == False + assert diff.are_structurally_equal == False + assert not diff.are_equal + + def test_detects_missing_index_values(self): + df = pd.DataFrame( + { + "year": [2001, 2002, 2003, 2004, 2005, 2006] * 2, + "country": ["a"] * 6 + ["b"] * 6, + "value_01": [1, 2, 3, 4, 5, 6] * 2, + } + ) + df.set_index(["year", "country"], inplace=True) + df2 = df.copy() + df2.drop((2006, "b"), inplace=True) + diff = dataframes.DataFrameHighLevelDiff( + df, + df2, + absolute_tolerance=1e-8, + relative_tolerance=0.5, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == False + assert any(diff.index_columns_missing_in_df1) == False + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == False + assert any(diff.index_values_missing_in_df2) == True + assert any(diff.duplicate_index_values_in_df1) == False + assert any(diff.duplicate_index_values_in_df2) == False + assert diff.are_structurally_equal == False + assert not diff.are_equal + + def test_detects_data_changes(self): + df = pd.DataFrame( + { + "year": [2001, 2002, 2003, 2004, 2005, 2006] * 2, + "country": ["a"] * 6 + ["b"] * 6, + "value_01": [1, 2, 3, 4, 5, 6] * 2, + } + ) + df.set_index(["year", "country"], inplace=True) + df2 = df.copy() + df2.loc[(2006, "b"), "value_01"] = 7 + diff = dataframes.DataFrameHighLevelDiff( + df, + df2, + absolute_tolerance=1e-8, + relative_tolerance=0.05, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == False + assert any(diff.index_columns_missing_in_df1) == False + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == False + assert any(diff.index_values_missing_in_df2) == False + assert any(diff.duplicate_index_values_in_df1) == False + assert any(diff.duplicate_index_values_in_df2) == False + assert diff.are_structurally_equal == True + assert not diff.are_equal + assert diff.value_differences is not None and diff.value_differences.shape == ( + 1, + 1, + ) + assert type(diff.value_differences.index) == pd.MultiIndex + assert list(diff.rows_with_differences) == [(2006, "b")] + assert list(diff.columns_with_differences) == ["value_01"] + + def test_detects_data_changes_with_enough_tolerance(self): + df = pd.DataFrame( + { + "year": [2001, 2002, 2003, 2004, 2005, 2006] * 2, + "country": ["a"] * 6 + ["b"] * 6, + "value_01": [1, 2, 3, 4, 5, 6] * 2, + } + ) + df.set_index(["year", "country"], inplace=True) + df2 = df.copy() + df2.loc[(2006, "b"), "value_01"] = 7 + diff = dataframes.DataFrameHighLevelDiff( + df, + df2, + absolute_tolerance=1e-8, + relative_tolerance=0.3, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == False + assert any(diff.index_columns_missing_in_df1) == False + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == False + assert any(diff.index_values_missing_in_df2) == False + assert any(diff.duplicate_index_values_in_df1) == False + assert any(diff.duplicate_index_values_in_df2) == False + assert diff.are_structurally_equal == True + assert diff.are_equal + assert diff.value_differences is None + + def test_detects_data_changes2(self): + df = pd.DataFrame( + { + "year": [2001, 2002, 2003, 2004, 2005, 2006] * 2, + "country": ["a"] * 6 + ["b"] * 6, + "value_01": [1, 2, 3, 4, 5, 6] * 2, + } + ) + df.set_index(["year", "country"], inplace=True) + df2 = df.copy() + df2.loc[(2006, "b"), "value_01"] = 7 + df2.loc[(2006, "a"), "value_01"] = 8 + diff = dataframes.DataFrameHighLevelDiff( + df, + df2, + absolute_tolerance=1e-8, + relative_tolerance=0.05, + ) + assert any(diff.columns_missing_in_df1) == False + assert any(diff.columns_missing_in_df2) == False + assert any(diff.index_columns_missing_in_df1) == False + assert any(diff.index_columns_missing_in_df2) == False + assert any(diff.index_values_missing_in_df1) == False + assert any(diff.index_values_missing_in_df2) == False + assert any(diff.duplicate_index_values_in_df1) == False + assert any(diff.duplicate_index_values_in_df2) == False + assert diff.are_structurally_equal == True + assert not diff.are_equal + assert diff.value_differences is not None and diff.value_differences.shape == ( + 2, + 1, + ) + assert type(diff.value_differences.index) == pd.MultiIndex + assert list(diff.rows_with_differences) == [(2006, "a"), (2006, "b")] + assert list(diff.columns_with_differences) == ["value_01"] + + class TestGroupbyAggregate: def test_default_aggregate_single_groupby_column_as_string(self): df_in = pd.DataFrame( From 84dc1c031adafb27df8a4ee84053e165e10e2341 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Fri, 8 Jul 2022 21:22:37 +0200 Subject: [PATCH 2/2] feat: add highlevel diff summary line generator Also incorporates PR feedback. 4 new tests are failing, these still need to be discussed. --- owid/datautils/dataframes.py | 334 +++++++++++++++++++++++++++++++---- owid/datautils/utils.py | 91 ++++++++++ tests/test_dataframes.py | 166 ++++++++++++++++- 3 files changed, 550 insertions(+), 41 deletions(-) create mode 100644 owid/datautils/utils.py diff --git a/owid/datautils/dataframes.py b/owid/datautils/dataframes.py index 50bb629..aa2ab50 100644 --- a/owid/datautils/dataframes.py +++ b/owid/datautils/dataframes.py @@ -1,10 +1,15 @@ """Objects related to pandas dataframes.""" -from typing import Tuple, Union, List, Any, Dict, Optional, cast, Callable +from typing import Generator, Tuple, Union, List, Any, Dict, Optional, cast, Callable import numpy as np import pandas as pd from pandas.api.types import union_categoricals +from owid.datautils.utils import ( + yield_formatted_if_not_empty, + get_compact_list_description, + yield_list_lines, +) from owid.datautils.common import ExceptionFromDocstring, warn_on_list_of_entities @@ -218,17 +223,20 @@ def are_equal( return equal, compared -class DataFrameHighLevelDiff: +class HighLevelDiff: """Class for comparing two dataframes. It assumes that all nans are identical, and compares floats by means of certain absolute and relative tolerances. Construct this class by passing two dataframes of possibly different shape. Then check the are_structurally_equal property to see if the column and row sets of the two dataframes match and/or check the are_equal flag to also - check for equality of values. + check for equality of values. The other fields give detailed information on what is different between the two + dataframes. For cases where there is a difference, various member fields on this class give indications of what is different (e.g. columns missing in dataframe 1 or 2, index values missing in dataframe 1 or 2, etc.). + The get_description_lines method fetches a list of strings that compactly describe the differences for humans. + Parameters ---------- df1 : pd.DataFrame @@ -250,35 +258,46 @@ class DataFrameHighLevelDiff: df2: pd.DataFrame columns_missing_in_df1: List[str] columns_missing_in_df2: List[str] + columns_shared: List[str] index_columns_missing_in_df1: List[str] index_columns_missing_in_df2: List[str] + index_columns_shared: List[str] index_values_missing_in_df1: pd.Index index_values_missing_in_df2: pd.Index + index_values_shared: pd.Index duplicate_index_values_in_df1: pd.Series duplicate_index_values_in_df2: pd.Series - value_differences: Optional[pd.DataFrame] + value_differences: Optional[pd.DataFrame] = None def __init__( self, df1: pd.DataFrame, df2: pd.DataFrame, - absolute_tolerance: float, - relative_tolerance: float, + absolute_tolerance: float = 1e-08, + relative_tolerance: float = 1e-05, ): self.df1 = df1 self.df2 = df2 self.absolute_tolerance = absolute_tolerance self.relative_tolerance = relative_tolerance - self.diff() + self._diff() + + @property + def value_differences_count(self) -> int: + """Get number of cells in the structural overlap of the two dataframes that differ by more than tolerance.""" + if self.value_differences is None: + return 0 + else: + return int(self.value_differences.sum().sum()) @property def columns_with_differences(self) -> Any: - """Return the columns that are different in the two dataframes. + """Get the columns that are different in the two dataframes. This will be an array of index values. If the index is a MultiIndex, the index values will be tuples. """ if self.value_differences is None: - return pd.array([]) + return np.array([]) return self.value_differences.columns.values @property @@ -288,61 +307,72 @@ def rows_with_differences(self) -> Any: This will be an array of index values. If the index is a MultiIndex, the index values will be tuples. """ if self.value_differences is None: - return pd.array([]) + return np.array([]) return self.value_differences.index.values - def diff(self) -> None: + def _diff(self) -> None: """Diff the two dataframes. This can be a somewhat slow operation """ - self.columns_missing_in_df1 = sorted( - set(self.df2.columns) - set(self.df1.columns) - ) - self.columns_missing_in_df2 = sorted( - set(self.df1.columns) - set(self.df2.columns) - ) - self.index_columns_missing_in_df1 = sorted( - set(self.df2.index.names) - set(self.df1.index.names) - ) - self.index_columns_missing_in_df2 = sorted( - set(self.df1.index.names) - set(self.df2.index.names) + df1_columns_set = set(self.df1.columns) + df2_columns_set = set(self.df2.columns) + self.columns_missing_in_df1 = sorted(df2_columns_set - df1_columns_set) + self.columns_missing_in_df2 = sorted(df1_columns_set - df2_columns_set) + self.columns_shared = sorted(df1_columns_set.intersection(df2_columns_set)) + + df1_index_names = set(self.df1.index.names) + df2_index_names = set(self.df2.index.names) + self.index_columns_missing_in_df1 = sorted(df2_index_names - df1_index_names) + self.index_columns_missing_in_df2 = sorted(df1_index_names - df2_index_names) + self.index_columns_shared = sorted( + df1_index_names.intersection(df2_index_names) ) + self.index_values_missing_in_df1 = self.df2.index.difference(self.df1.index) self.index_values_missing_in_df2 = self.df1.index.difference(self.df2.index) + self.index_values_shared = self.df2.index.intersection(self.df1.index) self.duplicate_index_values_in_df1 = self.df1[ self.df1.index.duplicated() ].index.values self.duplicate_index_values_in_df2 = self.df2[ self.df2.index.duplicated() ].index.values - if self.are_structurally_equal: + + # Now we calculate the value differences in the intersection of the two dataframes. + if self.columns_shared and any(self.index_values_shared): + df1_intersected = self.df1.loc[ + self.index_values_shared, list(self.columns_shared) + ] + df2_intersected = self.df2.loc[ + self.index_values_shared, list(self.columns_shared) + ] # We don't use the compare function here from above because it builds a new # dataframe and we want to leave indices intact so we can know which rows and columns # were different once we drop the ones with no differences - diffs = self.df1.eq(self.df2) + diffs = df1_intersected.eq(df2_intersected) # Eq above does not take tolerance into account so compare again with tolerance # for columns that are numeric. this could probably be sped up with a check on any on # the column first but would have to be benchmarked for col in diffs.columns: - if (self.df1[col].dtype in (object, "category")) or ( - self.df2[col].dtype in (object, "category") + if (df1_intersected[col].dtype in (object, "category")) or ( + df2_intersected[col].dtype in (object, "category") ): # Apply a direct comparison for strings or categories pass else: # For numeric data, consider them equal within certain absolute and relative tolerances. compared_values = np.isclose( - self.df1[col].values, - self.df2[col].values, + df1_intersected[col].values, + df2_intersected[col].values, atol=self.absolute_tolerance, rtol=self.relative_tolerance, ) # Treat nans as equal. compared_values[ - pd.isnull(self.df1[col].values) - & pd.isnull(self.df2[col].values) + pd.isnull(df1_intersected[col].values) + & pd.isnull(df2_intersected[col].values) ] = True diffs[col] = compared_values @@ -366,7 +396,7 @@ def diff(self) -> None: self.value_differences = None else: # Here we drop the columns that did not have differences. We are left with a dataframe - # with the original indices and only the rows and columns let with differences. + # with the original indices and only the rows and columns with differences. self.value_differences = rows_with_diffs.loc[ :, columns_with_diffs ] @@ -388,7 +418,247 @@ def are_structurally_equal(self) -> bool: @property def are_equal(self) -> bool: """Check if the two dataframes are equal, both structurally and cell-wise.""" - return self.are_structurally_equal and self.value_differences is None + return self.are_structurally_equal and self.are_overlapping_values_equal + + @property + def are_overlapping_values_equal(self) -> bool: + """Check if the values within the overlapping columns and rows of the two dataframes are equal.""" + return self.value_differences is None + + @property + def df1_value_differences(self) -> Optional[pd.DataFrame]: + """Get a sliced version of df1 that contains only the columns and rows that differ from df2. + + Note that this only includes the part of the dataframe that has structural overlap with + the other dataframe (i.e. extra columns or rows are not included). + """ + if self.value_differences is None: + return None + return cast( + pd.DataFrame, + self.df1.loc[self.value_differences.index, self.value_differences.columns], + ) + + @property + def df2_value_differences(self) -> Optional[pd.DataFrame]: + """Get a sliced version of df2 that contains only the columns and rows that differ from df2. + + Note that this only includes the part of the dataframe that has structural overlap with + the other dataframe (i.e. extra columns or rows are not included). + """ + if self.value_differences is None: + return None + return cast( + pd.DataFrame, + self.df2.loc[self.value_differences.index, self.value_differences.columns], + ) + + def get_description_lines_for_diff( + self, + df1_label: str, + df2_label: str, + use_color_tags: bool = False, + preview_different_dataframe_values: bool = False, + show_shared: bool = False, + truncate_lists_longer_than: int = 20, + ) -> Generator[str, None, None]: + """Generate a human readable description of the differences between the two dataframes. + + It is returned as a generator of strings, roughly one line per string yielded + (dataframe printing is done by pandas as one string and is returned as a single yielded item) + """ + red, red_end = ("[red]", "[/red]") if use_color_tags else ("", "") + green, green_end = ("[green]", "[/green]") if use_color_tags else ("", "") + blue, blue_end = ("[blue]", "[/blue]") if use_color_tags else ("", "") + + if self.are_equal: + yield (f"{green}{df1_label} is equal to {df2_label}{green_end}") + else: + yield (f"{red}{df1_label} is not equal to {df2_label}{red_end}") + + if self.are_structurally_equal: + yield (f"The structure is {green}identical{green_end}") + else: + yield (f"The structure is {red}different{red_end}") + + # The structure below works like this: we have a property that is a list + # (e.g. self.columns_missing_in_df1) that can be empty or have elements. + # If the list is empty we don't want to yield any lines. If the list has elements + # we want to yield a line. Additionally, we also want to truncate lines with many + # elements if they are too long. We use yield_formatted_if_not_empty on most of the + # member properties to output the differences if there are any. + + # Structural differences + if show_shared: + yield from yield_formatted_if_not_empty( + self.columns_shared, + lambda item: yield_list_lines( + f"{blue}Shared columns{blue_end}", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + f"{red}No shared columns{red_end}", + ) + yield from yield_formatted_if_not_empty( + self.columns_missing_in_df1, + lambda item: yield_list_lines( + f"Columns missing in {df1_label}", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + ) + yield from yield_formatted_if_not_empty( + self.columns_missing_in_df2, + lambda item: yield_list_lines( + f"Columns missing in {df2_label}", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + ) + if show_shared: + yield from yield_formatted_if_not_empty( + self.index_columns_shared, + lambda item: yield_list_lines( + f"{blue}Shared index columns{blue_end}", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + f"{red}No shared index columns{red_end}", + ) + yield from yield_formatted_if_not_empty( + self.index_columns_missing_in_df1, + lambda item: yield_list_lines( + f"Index columns missing in {df1_label}", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + ) + yield from yield_formatted_if_not_empty( + self.index_columns_missing_in_df2, + lambda item: yield_list_lines( + f"Index columns missing in {df2_label}", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + ) + if show_shared: + yield from yield_formatted_if_not_empty( + self.index_values_shared, + lambda item: yield_list_lines( + f"{blue}Shared index values{blue_end}", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + f"{red}No shared index values{red_end}", + ) + yield from yield_formatted_if_not_empty( + self.index_values_missing_in_df1, + lambda item: yield_list_lines( + f"Index values missing in {df1_label}", + get_compact_list_description( + item, + self.df1.index.names, + max_items=truncate_lists_longer_than, + ), + ), + ) + yield from yield_formatted_if_not_empty( + self.index_values_missing_in_df2, + lambda item: yield_list_lines( + f"Index values missing in {df2_label}", + get_compact_list_description( + item, + self.df2.index.names, + max_items=truncate_lists_longer_than, + ), + ), + ) + yield from yield_formatted_if_not_empty( + self.duplicate_index_values_in_df1, + lambda item: yield_list_lines( + f"Duplicate index values in {df1_label}", + get_compact_list_description( + item, + self.df1.index.names, + max_items=truncate_lists_longer_than, + ), + ), + ) + yield from yield_formatted_if_not_empty( + self.duplicate_index_values_in_df2, + lambda item: yield_list_lines( + f"Duplicate index values in {df2_label}", + get_compact_list_description( + item, + self.df2.index.names, + max_items=truncate_lists_longer_than, + ), + ), + ) + + # Show "coordinates" where there are value differences + # This is done in compact form, e.g. if you have 10 new years for 200 countries + # that would be 2000 values but instead we unpack the hierarchical index tuples + # and show that a (shortened) list for the 200 countries and the 10 new years. + if self.value_differences is not None: + yield ( + f"Values in the shared columns/rows are {red}different{red_end}. " + + f"({self.value_differences_count} different cells)" + ) + yield from yield_formatted_if_not_empty( + self.columns_with_differences, + lambda item: yield_list_lines( + "Columns with diffs", + get_compact_list_description( + item, max_items=truncate_lists_longer_than + ), + ), + ) + yield from yield_formatted_if_not_empty( + self.rows_with_differences, + lambda item: yield_list_lines( + "Rows with diffs", + get_compact_list_description( + item, + self.df1.index.names, + max_items=truncate_lists_longer_than, + ), + ), + ) + + # This prints the two dataframes one after the other sliced to + # only the area where they have differences + if preview_different_dataframe_values: + if ( + self.value_differences + and self.columns_shared + and self.index_values_shared + ): + yield f"Values with differences in {df1_label}:" + yield ( + str( + self.df1.loc[ + self.value_differences.index, self.value_differences.columns + ] + ) + ) + yield f"Values with differences in {df2_label}:" + yield ( + str( + self.df2.loc[ + self.value_differences.index, self.value_differences.columns + ] + ) + ) + else: + yield "The datasets have no overlapping columns/rows." def groupby_agg( diff --git a/owid/datautils/utils.py b/owid/datautils/utils.py new file mode 100644 index 0000000..d78d9dd --- /dev/null +++ b/owid/datautils/utils.py @@ -0,0 +1,91 @@ +"""Functions related to the dataframe HighLevelDiff class.""" + +from typing import Callable, Generator, Iterable, List, Any, Optional + + +def get_list_description_with_max_length(items: List[Any], max_items: int = 20) -> str: + """Return a string representation for a list, potentially shortened in the middle.""" + if len(items) > max_items: + return ( + f"[{len(items)} items] " + + f'{", ".join(str(item) for item in items[:int(max_items/2)])} ... "' + + f'{", ".join(str(item) for item in items[-int(max_items/2):])}' + ) + else: + return ", ".join(str(item) for item in items) + + +def yield_list_lines( + description: str, items: Iterable[Any] +) -> Generator[str, None, None]: + """Yield a list of lines for a list of items. + + If the sublist is a single item then no newline is inserted. If the sublist has more than one item + then the description is printed as a header and the items are printed on separate lines with a sligh indent. + """ + sublines = [item for item in items] + if len(sublines) > 1: + yield f"{description}:" + for subline in sublines: + if subline != "": + yield f" {subline}" + elif len(sublines) == 1: + yield f"{description}: {sublines[0]}" + + +def get_compact_list_description( + items_iterable: Iterable[Any], + tuple_headers: Optional[List[str]] = None, + max_items: int = 20, +) -> Generator[str, None, None]: + """Get a compact desription of a list. + + If the list is numeric and monotonic then it gets compacted into a range like 2000-2015. If + the list contains tuples then the tuples are deconstructed into their components and the + components are compacted individually. Long lists (above max_items items) are + shortened in the middle. + """ + items = set(items_iterable) + if not items: + yield "[]" + elif all(isinstance(item, int) for item in items): + sorted_items = sorted(items) + if len(items) == 1: + yield str(sorted_items[0]) + if len(items) == 2: + yield f"{sorted_items[0]}, {sorted_items[1]}" + if len(items) > 2: + if len(items) == sorted_items[-1] - sorted_items[0]: + yield f"{sorted_items[0]}-{sorted_items[-1]}" + else: + yield get_list_description_with_max_length(sorted_items, max_items) + elif all(isinstance(item, tuple) for item in items): + transposed = zip(*items) + lines = [ + line for item in transposed for line in get_compact_list_description(item) + ] + if tuple_headers and len(tuple_headers) == len(lines): + yield from ( + f"{header}: {line}" for header, line in zip(tuple_headers, lines) + ) + else: + yield from lines + else: + sorted_items = sorted(items) + yield get_list_description_with_max_length(sorted_items, max_items) + + +def yield_formatted_if_not_empty( + item: Any, + format_function: Callable[[Any], Generator[str, None, None]], + fallback_message: str = "", +) -> Generator[str, None, None]: + """Yield an item formatted with the given function if it is not empty. + + This is a useful helper to avoid duplicating property/function access in if blocks and + then again in the block body. + """ + if item is not None and any(item): + yield from format_function(item) + elif fallback_message != "": + yield fallback_message diff --git a/tests/test_dataframes.py b/tests/test_dataframes.py index 527c53d..be00503 100644 --- a/tests/test_dataframes.py +++ b/tests/test_dataframes.py @@ -223,7 +223,155 @@ def test_on_dataframes_with_object_columns_with_nans(self): )[0] -class TestDataFrameHighLevelDiff: +class TestAreDataFramesEqualWithHighLevelDiff: + def are_equal(self, df1: pd.DataFrame, df2: pd.DataFrame, **kwargs: float) -> bool: + return dataframes.HighLevelDiff(df1, df2, **kwargs).are_equal + + def test_on_equal_dataframes_with_one_integer_column(self): + assert self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2, 3]}), + df2=pd.DataFrame({"col_01": [1, 2, 3]}), + ) + + def test_on_almost_equal_dataframes_but_differing_by_one_element(self): + assert not self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2, 3]}), + df2=pd.DataFrame({"col_01": [1, 2, 0]}), + ) + + def test_on_almost_equal_dataframes_but_differing_by_type(self): + assert not self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2, 3]}), + df2=pd.DataFrame({"col_01": [1, 2, 3.0]}), + ) + + def test_on_equal_dataframes_containing_nans(self): + assert self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2, np.nan]}), + df2=pd.DataFrame({"col_01": [1, 2, np.nan]}), + ) + + def test_on_equal_dataframes_containing_only_nans(self): + assert self.are_equal( + df1=pd.DataFrame({"col_01": [np.nan, np.nan]}), + df2=pd.DataFrame({"col_01": [np.nan, np.nan]}), + ) + + def test_on_equal_dataframes_both_empty(self): + assert self.are_equal(df1=pd.DataFrame(), df2=pd.DataFrame()) + + def test_on_equal_dataframes_with_various_types_of_columns(self): + assert self.are_equal( + df1=pd.DataFrame( + { + "col_01": [1, 2], + "col_02": [0.1, 0.2], + "col_03": ["1", "2"], + "col_04": [True, False], + } + ), + df2=pd.DataFrame( + { + "col_01": [1, 2], + "col_02": [0.1, 0.2], + "col_03": ["1", "2"], + "col_04": [True, False], + } + ), + ) + + def test_on_almost_equal_dataframes_but_columns_sorted_differently(self): + assert not self.are_equal( + df1=pd.DataFrame( + { + "col_01": [1, 2], + "col_02": [0.1, 0.2], + "col_03": ["1", "2"], + "col_04": [True, False], + } + ), + df2=pd.DataFrame( + { + "col_02": [0.1, 0.2], + "col_01": [1, 2], + "col_03": ["1", "2"], + "col_04": [True, False], + } + ), + ) + + def test_on_unequal_dataframes_with_all_columns_different(self): + assert not self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2], "col_02": [0.1, 0.2]}), + df2=pd.DataFrame({"col_03": [0.1, 0.2], "col_04": [1, 2]}), + ) + + def test_on_unequal_dataframes_with_some_common_columns(self): + assert not self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2], "col_02": [0.1, 0.2]}), + df2=pd.DataFrame({"col_01": [1, 2], "col_03": [1, 2]}), + ) + + def test_on_equal_dataframes_given_large_absolute_tolerance(self): + assert self.are_equal( + df1=pd.DataFrame({"col_01": [10, 20]}), + df2=pd.DataFrame({"col_01": [11, 21]}), + absolute_tolerance=1, + relative_tolerance=1e-8, + ) + + def test_on_unequal_dataframes_given_large_absolute_tolerance(self): + assert not self.are_equal( + df1=pd.DataFrame({"col_01": [10, 20]}), + df2=pd.DataFrame({"col_01": [11, 21]}), + absolute_tolerance=0.9, + relative_tolerance=1e-8, + ) + + def test_on_equal_dataframes_given_large_relative_tolerance(self): + assert self.are_equal( + df1=pd.DataFrame({"col_01": [1]}), + df2=pd.DataFrame({"col_01": [2]}), + absolute_tolerance=1e-8, + relative_tolerance=0.5, + ) + + def test_on_unequal_dataframes_given_large_relative_tolerance(self): + assert not self.are_equal( + df1=pd.DataFrame({"col_01": [1]}), + df2=pd.DataFrame({"col_01": [2]}), + absolute_tolerance=1e-8, + relative_tolerance=0.49, + ) + + def test_on_equal_dataframes_with_non_numeric_indexes(self): + assert self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2], "col_02": ["a", "b"]}).set_index( + "col_02" + ), + df2=pd.DataFrame({"col_01": [1, 2], "col_02": ["a", "b"]}).set_index( + "col_02" + ), + ) + + def test_on_dataframes_of_equal_values_but_different_indexes(self): + assert not self.are_equal( + df1=pd.DataFrame({"col_01": [1, 2], "col_02": ["a", "b"]}).set_index( + "col_02" + ), + df2=pd.DataFrame({"col_01": [1, 2], "col_02": ["a", "c"]}).set_index( + "col_02" + ), + ) + + def test_on_dataframes_with_object_columns_with_nans(self): + assert self.are_equal( + df1=pd.DataFrame({"col_01": [np.nan, "b", "c"]}), + df2=pd.DataFrame({"col_01": [np.nan, "b", "c"]}), + ) + + +class TestHighLevelDiff: def test_simple_equal_dataframes_are_equal(self): df = pd.DataFrame( { @@ -231,7 +379,7 @@ def test_simple_equal_dataframes_are_equal(self): "value_01": [1, 2, 3, 4, 5, 6], } ) - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df, absolute_tolerance=1e-8, @@ -257,7 +405,7 @@ def test_more_complex_equal_dataframes_are_equal(self): } ) df.set_index(["year", "country"], inplace=True) - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df, absolute_tolerance=1e-8, @@ -283,7 +431,7 @@ def test_detects_duplicate_index_values(self): } ) df.set_index(["year", "country"], inplace=True) - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df, absolute_tolerance=1e-8, @@ -309,7 +457,7 @@ def test_detects_missing_index(self): } ) df2 = df.set_index(["year", "country"], inplace=False) - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df2, absolute_tolerance=1e-8, @@ -337,7 +485,7 @@ def test_detects_missing_index_values(self): df.set_index(["year", "country"], inplace=True) df2 = df.copy() df2.drop((2006, "b"), inplace=True) - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df2, absolute_tolerance=1e-8, @@ -365,7 +513,7 @@ def test_detects_data_changes(self): df.set_index(["year", "country"], inplace=True) df2 = df.copy() df2.loc[(2006, "b"), "value_01"] = 7 - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df2, absolute_tolerance=1e-8, @@ -400,7 +548,7 @@ def test_detects_data_changes_with_enough_tolerance(self): df.set_index(["year", "country"], inplace=True) df2 = df.copy() df2.loc[(2006, "b"), "value_01"] = 7 - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df2, absolute_tolerance=1e-8, @@ -430,7 +578,7 @@ def test_detects_data_changes2(self): df2 = df.copy() df2.loc[(2006, "b"), "value_01"] = 7 df2.loc[(2006, "a"), "value_01"] = 8 - diff = dataframes.DataFrameHighLevelDiff( + diff = dataframes.HighLevelDiff( df, df2, absolute_tolerance=1e-8,