diff --git a/pyproject.toml b/pyproject.toml
index 2840a40..c5eda9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ classifiers = [
 dependencies = [
     "numpy>=1.21.0",
     "scipy>=1.9.0",
-    "pandas>=1.4.0",
+    "pandas>=1.4.0,<3",
     "statsmodels>=0.13.5",
     "chartify>=5.0.0",
     "ipywidgets>=8.0.0",
diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py
index d4a0a80..5358c90 100644
--- a/spotify_confidence/analysis/confidence_utils.py
+++ b/spotify_confidence/analysis/confidence_utils.py
@@ -47,7 +47,9 @@ def remove_group_columns(categorical_columns: Iterable, additional_column: Optio
     return list(od)
 
 
-def validate_categorical_columns(categorical_group_columns: Union[str, Iterable]) -> None:
+def validate_categorical_columns(
+    categorical_group_columns: Union[str, Iterable],
+) -> None:
     if isinstance(categorical_group_columns, str):
         pass
     elif isinstance(categorical_group_columns, Iterable):
@@ -113,7 +115,7 @@ def validate_and_rename_columns(df: DataFrame, columns: Iterable[str]) -> DataFr
         if (df[column + SFX1].isna() == df[column + SFX2].isna()).all() and (
             df[column + SFX1][df[column + SFX1].notna()] == df[column + SFX2][df[column + SFX2].notna()]
         ).all():
-            df = df.rename(columns={column + SFX1: column}).drop(columns=[column + SFX2])  # type: ignore[union-attr,unused-ignore]
+            df = df.rename(columns={column + SFX1: column}).drop(columns=[column + SFX2])
         else:
             raise ValueError(f"Values of {column} do not agree across levels: {df[[column + SFX1, column + SFX2]]}")
     return df
@@ -121,7 +123,7 @@ def validate_and_rename_columns(df: DataFrame, columns: Iterable[str]) -> DataFr
 
 def drop_and_rename_columns(df: DataFrame, columns: Iterable[str]) -> DataFrame:
     columns_dict = {col + SFX1: col for col in columns}
-    return df.rename(columns=columns_dict).drop(columns=[col + SFX2 for col in columns])  # type: ignore[union-attr,unused-ignore]
+    return df.rename(columns=columns_dict).drop(columns=[col + SFX2 for col in columns])
 
 
 def level2str(level: Union[str, Tuple]) -> str:
@@ -132,7 +134,10 @@ def level2str(level: Union[str, Tuple]) -> str:
 
 
 def validate_data(
-    df: DataFrame, columns_that_must_exist, group_columns: Iterable, ordinal_group_column: Optional[str]
+    df: DataFrame,
+    columns_that_must_exist,
+    group_columns: Iterable,
+    ordinal_group_column: Optional[str],
 ):
     """Integrity check input dataframe."""
     for col in columns_that_must_exist:
@@ -201,7 +206,9 @@ def axis_format_precision(numbers: Series, absolute: bool, extra_zeros: int = 0)
 
 
 def to_finite(s: Series, lower_limit: float, upper_limit: float) -> Series:
-    return s.clip(-100 * abs(lower_limit), 100 * abs(upper_limit))
+    result = s.clip(-100 * abs(lower_limit), 100 * abs(upper_limit))
+    assert result is not None
+    return result
 
 
 def add_color_column(df: DataFrame, cols: Iterable) -> DataFrame:
diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py
index ee596aa..1f16f82 100644
--- a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py
+++ b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py
@@ -212,43 +212,45 @@ def _sufficient_statistics(self) -> DataFrame:
         self._sufficient = (
             self._df.groupby(groupby, sort=False, group_keys=True)
             .apply(
-                lambda df: df.assign(
-                    **{
-                        POINT_ESTIMATE: lambda df: confidence_computers[
-                            df[self._method_column].values[0]
-                        ].point_estimate(df, **kwargs)
-                    }
-                )
-                .assign(
-                    **{
-                        ORIGINAL_POINT_ESTIMATE: lambda df: (
-                            confidence_computers[ZTEST].point_estimate(df, **kwargs)
-                            if df[self._method_column].values[0] == ZTESTLINREG
-                            else confidence_computers[df[self._method_column].values[0]].point_estimate(
+                lambda df: (
+                    df.assign(
+                        **{
+                            POINT_ESTIMATE: lambda df: confidence_computers[
+                                df[self._method_column].values[0]
+                            ].point_estimate(df, **kwargs)
+                        }
+                    )
+                    .assign(
+                        **{
+                            ORIGINAL_POINT_ESTIMATE: lambda df: (
+                                confidence_computers[ZTEST].point_estimate(df, **kwargs)
+                                if df[self._method_column].values[0] == ZTESTLINREG
+                                else confidence_computers[df[self._method_column].values[0]].point_estimate(
+                                    df, **kwargs
+                                )
+                            )
+                        }
+                    )
+                    .assign(
+                        **{
+                            VARIANCE: lambda df: confidence_computers[df[self._method_column].values[0]].variance(
                                 df, **kwargs
                             )
-                        )
-                    }
-                )
-                .assign(
-                    **{
-                        VARIANCE: lambda df: confidence_computers[df[self._method_column].values[0]].variance(
+                        }
+                    )
+                    .assign(
+                        **{
+                            ORIGINAL_VARIANCE: lambda df: (
+                                confidence_computers[ZTEST].variance(df, **kwargs)
+                                if df[self._method_column].values[0] == ZTESTLINREG
+                                else confidence_computers[df[self._method_column].values[0]].variance(df, **kwargs)
+                            )
+                        }
+                    )
+                    .pipe(
+                        lambda df: confidence_computers[df[self._method_column].values[0]].add_point_estimate_ci(
                             df, **kwargs
                         )
-                    }
-                )
-                .assign(
-                    **{
-                        ORIGINAL_VARIANCE: lambda df: (
-                            confidence_computers[ZTEST].variance(df, **kwargs)
-                            if df[self._method_column].values[0] == ZTESTLINREG
-                            else confidence_computers[df[self._method_column].values[0]].variance(df, **kwargs)
-                        )
-                    }
-                )
-                .pipe(
-                    lambda df: confidence_computers[df[self._method_column].values[0]].add_point_estimate_ci(
-                        df, **kwargs
                     )
                 )
             )
diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/sample_size_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/sample_size_computer.py
index 358f5fc..fe709b4 100644
--- a/spotify_confidence/analysis/frequentist/confidence_computers/sample_size_computer.py
+++ b/spotify_confidence/analysis/frequentist/confidence_computers/sample_size_computer.py
@@ -131,10 +131,12 @@ def _sufficient_statistics(self) -> DataFrame:
         self._sufficient = (
             self._df.groupby(groupby, sort=False, group_keys=True)
             .apply(
-                lambda df: df.assign(**{POINT_ESTIMATE: lambda df: df[self._point_estimate_column]})
-                .assign(**{ORIGINAL_POINT_ESTIMATE: lambda df: df[self._point_estimate_column]})
-                .assign(**{VARIANCE: lambda df: df[self._var_column]})
-                .assign(**{ORIGINAL_VARIANCE: lambda df: df[self._var_column]})
+                lambda df: (
+                    df.assign(**{POINT_ESTIMATE: lambda df: df[self._point_estimate_column]})
+                    .assign(**{ORIGINAL_POINT_ESTIMATE: lambda df: df[self._point_estimate_column]})
+                    .assign(**{VARIANCE: lambda df: df[self._var_column]})
+                    .assign(**{ORIGINAL_VARIANCE: lambda df: df[self._var_column]})
+                )
             )
             .pipe(reset_named_indices)
         )
diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py
index a8b29cf..0c06024 100644
--- a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py
+++ b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py
@@ -163,9 +163,9 @@ def adjusted_alphas_for_group(grp: DataFrame) -> Series:
         data=(
             df.assign(**{comparison_total_column: df[denominator + SFX1] + df[denominator + SFX2]})
             .assign(
-                max_sample_size=lambda df: df[[comparison_total_column, final_expected_sample_size_column]]
-                .max(axis=1)
-                .max()
+                max_sample_size=lambda df: (
+                    df[[comparison_total_column, final_expected_sample_size_column]].max(axis=1).max()
+                )
             )
             .assign(sample_size_proportions=lambda df: df[comparison_total_column] / df["max_sample_size"])
             .pipe(adjusted_alphas_for_group)[ADJUSTED_ALPHA]
diff --git a/spotify_confidence/analysis/frequentist/sequential_bound_solver.py b/spotify_confidence/analysis/frequentist/sequential_bound_solver.py
index 4d2f0c3..bf73854 100644
--- a/spotify_confidence/analysis/frequentist/sequential_bound_solver.py
+++ b/spotify_confidence/analysis/frequentist/sequential_bound_solver.py
@@ -123,7 +123,7 @@ def last_fcab(self):
 
     def __eq__(self, other):
         if isinstance(other, ComputationState):
-            return self._df.equals(other._df) and np.array_equal(self._last_fcab, other._last_fcab)  # type: ignore[arg-type,unused-ignore]
+            return self._df.equals(other._df) and np.array_equal(self._last_fcab, other._last_fcab)
         return False
 
 
diff --git a/tox.ini b/tox.ini
index 3ce01bd..5528bad 100644
--- a/tox.ini
+++ b/tox.ini
@@ -24,5 +24,5 @@ deps =
 commands =
     ruff check
     ruff format --check
-    ty check
+    # don't run ty check - the type stubs for 3.9 are not good.
     pytest -n auto --no-cov --basetemp={envtmpdir} {posargs}
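Note on the to_finite change above: the type stubs in play evidently type Series.clip()
as returning Series | None (the None covers the inplace=True overload), so returning
the clipped series directly fails the checker; assigning to a local and asserting it is
not None narrows the type at the cost of one cheap runtime check. A minimal standalone
sketch of the pattern, assuming only pandas:

    import pandas as pd

    def to_finite(s: pd.Series, lower_limit: float, upper_limit: float) -> pd.Series:
        # clip() always returns a Series when inplace is left at its default,
        # so the assert exists purely to narrow Series | None for the type checker.
        result = s.clip(-100 * abs(lower_limit), 100 * abs(upper_limit))
        assert result is not None
        return result

    # Example: to_finite(pd.Series([float("-inf"), 0.5, float("inf")]), 1.0, 1.0)
    # clips the infinities to -100.0 and 100.0 while leaving 0.5 untouched.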