diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index fdef6af0..f507df12 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -3076,8 +3076,10 @@ def gamma_overlay_cli(input_rasters: INPUT_FILES_ARGUMENT, output_raster: OUTPUT def alr_transform_cli( input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION, - column: str = None, + columns: Annotated[List[str], typer.Option()] = None, + denominator_column: str = None, keep_denominator_column: bool = False, + scale: Optional[float] = None, ): """Perform an additive logratio transformation on the data.""" from eis_toolkit.transformations.coda.alr import alr_transform @@ -3089,7 +3091,13 @@ def alr_transform_cli( df = pd.DataFrame(gdf.drop(columns="geometry")) typer.echo("Progress: 25%") - out_df = alr_transform(df=df, column=column, keep_denominator_column=keep_denominator_column) + out_df = alr_transform( + df=df, + columns=columns, + denominator_column=denominator_column, + keep_denominator_column=keep_denominator_column, + scale=scale, + ) typer.echo("Progess 75%") out_gdf = gpd.GeoDataFrame(out_df, geometry=geometries) @@ -3104,6 +3112,7 @@ def inverse_alr_transform_cli( input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION, denominator_column: str = typer.Option(), + columns: Annotated[List[str], typer.Option()] = None, scale: float = 1.0, ): """Perform the inverse transformation for a set of ALR transformed data.""" @@ -3116,7 +3125,7 @@ def inverse_alr_transform_cli( df = pd.DataFrame(gdf.drop(columns="geometry")) typer.echo("Progress: 25%") - out_df = inverse_alr(df=df, denominator_column=denominator_column, scale=scale) + out_df = inverse_alr(df=df, denominator_column=denominator_column, columns=columns, scale=scale) typer.echo("Progess 75%") out_gdf = gpd.GeoDataFrame(out_df, geometry=geometries) @@ -3127,7 +3136,12 @@ def inverse_alr_transform_cli( # CODA - CLR TRANSFORM @app.command() -def clr_transform_cli(input_vector: INPUT_FILE_OPTION, output_vector: 
OUTPUT_FILE_OPTION): +def clr_transform_cli( + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, + columns: Annotated[List[str], typer.Option()] = None, + scale: Optional[float] = None, +): """Perform a centered logratio transformation on the data.""" from eis_toolkit.transformations.coda.clr import clr_transform @@ -3138,7 +3152,7 @@ def clr_transform_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FIL df = pd.DataFrame(gdf.drop(columns="geometry")) typer.echo("Progress: 25%") - out_df = clr_transform(df=df) + out_df = clr_transform(df=df, columns=columns, scale=scale) typer.echo("Progess 75%") out_gdf = gpd.GeoDataFrame(out_df, geometry=geometries) @@ -3152,6 +3166,7 @@ def clr_transform_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FIL def inverse_clr_transform_cli( input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION, + columns: Annotated[List[str], typer.Option()] = None, colnames: Annotated[List[str], typer.Option()] = None, scale: float = 1.0, ): @@ -3165,7 +3180,7 @@ def inverse_clr_transform_cli( df = pd.DataFrame(gdf.drop(columns="geometry")) typer.echo("Progress: 25%") - out_df = inverse_clr(df=df, colnames=colnames, scale=scale) + out_df = inverse_clr(df=df, columns=columns, colnames=colnames, scale=scale) typer.echo("Progess 75%") out_gdf = gpd.GeoDataFrame(out_df, geometry=geometries) @@ -3181,6 +3196,7 @@ def single_ilr_transform_cli( output_vector: OUTPUT_FILE_OPTION, subcomposition_1: Annotated[List[str], typer.Option()], subcomposition_2: Annotated[List[str], typer.Option()], + scale: Optional[float] = None, ): """Perform a single isometric logratio transformation on the provided subcompositions.""" from eis_toolkit.transformations.coda.ilr import single_ilr_transform @@ -3192,7 +3208,9 @@ def single_ilr_transform_cli( df = pd.DataFrame(gdf.drop(columns="geometry")) typer.echo("Progress: 25%") - out_series = single_ilr_transform(df=df, subcomposition_1=subcomposition_1, 
subcomposition_2=subcomposition_2) + out_series = single_ilr_transform( + df=df, subcomposition_1=subcomposition_1, subcomposition_2=subcomposition_2, scale=scale + ) typer.echo("Progess 75%") # NOTE: Output of pairwise_logratio might be changed to DF in the future, to automatically do the following @@ -3237,7 +3255,9 @@ def pairwise_logratio_cli( def single_plr_transform_cli( input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION, - column: str = typer.Option(), + numerator: str = typer.Option(), + denominator_columns: Annotated[List[str], typer.Option()] = None, + scale: Optional[float] = None, ): """Perform a pivot logratio transformation on the selected column.""" from eis_toolkit.transformations.coda.plr import single_plr_transform @@ -3249,7 +3269,7 @@ def single_plr_transform_cli( df = pd.DataFrame(gdf.drop(columns="geometry")) typer.echo("Progress: 25%") - out_series = single_plr_transform(df=df, column=column) + out_series = single_plr_transform(df=df, numerator=numerator, denominator_columns=denominator_columns, scale=scale) typer.echo("Progess 75%") # NOTE: Output of single_plr_transform might be changed to DF in the future, to automatically do the following @@ -3262,8 +3282,13 @@ def single_plr_transform_cli( # CODA - PLR TRANSFORM @app.command() -def plr_transform_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION): - """Perform a pivot logratio transformation on the dataframe, returning the full set of transforms.""" +def plr_transform_cli( + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, + columns: Annotated[List[str], typer.Option()] = None, + scale: Optional[float] = None, +): + """Perform a pivot logratio transformation on the selected columns.""" from eis_toolkit.transformations.coda.plr import plr_transform typer.echo("Progress: 10%") @@ -3273,7 +3298,7 @@ def plr_transform_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FIL df = pd.DataFrame(gdf.drop(columns="geometry")) 
typer.echo("Progress: 25%") - out_df = plr_transform(df=df) + out_df = plr_transform(df=df, columns=columns, scale=scale) typer.echo("Progess 75%") out_gdf = gpd.GeoDataFrame(out_df, geometry=geometries) diff --git a/eis_toolkit/transformations/coda/alr.py b/eis_toolkit/transformations/coda/alr.py index ab880c8c..8a16d2e3 100644 --- a/eis_toolkit/transformations/coda/alr.py +++ b/eis_toolkit/transformations/coda/alr.py @@ -1,88 +1,125 @@ -from numbers import Number - -import numpy as np -import pandas as pd -from beartype import beartype -from beartype.typing import Optional, Sequence - -from eis_toolkit.exceptions import InvalidColumnException, NumericValueSignException -from eis_toolkit.utilities.aitchison_geometry import _closure -from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space -from eis_toolkit.utilities.miscellaneous import rename_columns_by_pattern - - -@beartype -def _alr_transform(df: pd.DataFrame, columns: Sequence[str], denominator_column: str) -> pd.DataFrame: - - ratios = df[columns].div(df[denominator_column], axis=0) - return np.log(ratios) - - -@beartype -def alr_transform( - df: pd.DataFrame, column: Optional[str] = None, keep_denominator_column: bool = False -) -> pd.DataFrame: - """ - Perform an additive logratio transformation on the data. - - Args: - df: A dataframe of compositional data. - column: The name of the column to be used as the denominator column. - keep_denominator_column: Whether to include the denominator column in the result. If True, the returned - dataframe retains its original shape. - - Returns: - A new dataframe containing the ALR transformed data. - - Raises: - InvalidColumnException: The input column isn't found in the dataframe. - InvalidCompositionException: Data is not normalized to the expected value. - NumericValueSignException: Data contains zeros or negative values. 
- """ - check_in_simplex_sample_space(df) - - if column is not None and column not in df.columns: - raise InvalidColumnException(f"The column {column} was not found in the dataframe.") - - column = column if column is not None else df.columns[-1] - - columns = [col for col in df.columns] - - if not keep_denominator_column and column in columns: - columns.remove(column) - - return rename_columns_by_pattern(_alr_transform(df, columns, column)) - - -@beartype -def _inverse_alr(df: pd.DataFrame, denominator_column: str, scale: Number = 1.0) -> pd.DataFrame: - dfc = df.copy() - - if denominator_column not in dfc.columns.values: - # Add the denominator column - dfc[denominator_column] = 0.0 - - return _closure(np.exp(dfc), scale) - - -@beartype -def inverse_alr(df: pd.DataFrame, denominator_column: str, scale: Number = 1.0) -> pd.DataFrame: - """ - Perform the inverse transformation for a set of ALR transformed data. - - Args: - df: A dataframe of ALR transformed compositional data. - denominator_column: The name of the denominator column. - scale: The value to which each composition should be normalized. Eg., if the composition is expressed - as percentages, scale=100. - - Returns: - A dataframe containing the inverse transformed data. - - Raises: - NumericValueSignException: The input scale value is zero or less. 
- """ - if scale <= 0: - raise NumericValueSignException("The scale value should be positive.") - - return _inverse_alr(df, denominator_column, scale) +from numbers import Number + +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Optional, Sequence + +from eis_toolkit.exceptions import InvalidColumnException, NumericValueSignException +from eis_toolkit.utilities.aitchison_geometry import _closure +from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space +from eis_toolkit.utilities.miscellaneous import rename_columns_by_pattern + + +@beartype +def _alr_transform(df: pd.DataFrame, columns: Sequence[str], denominator_column: str) -> pd.DataFrame: + + ratios = df[columns].div(df[denominator_column], axis=0) + return np.log(ratios) + + +@beartype +def alr_transform( + df: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + denominator_column: Optional[str] = None, + keep_denominator_column: bool = False, + scale: Optional[Number] = None, +) -> pd.DataFrame: + """ + Perform an additive logratio transformation on the data. + + Args: + df: A dataframe of compositional data. + columns: The names of the columns to be transformed. + denominator_column: The name of the column to be used as the denominator column. + keep_denominator_column: Whether to include the denominator column in the result. If True, the returned + dataframe retains its original shape. + scale: The value to which each composition should be normalized. Eg., if the composition is expressed + as percentages, scale=100. Closure is not performed by default. + + Returns: + A new dataframe containing the ALR transformed data. + + Raises: + InvalidColumnException: The input column isn't found in the dataframe. + InvalidCompositionException: Data is not normalized to the expected value. + NumericValueSignException: Data contains zeros or negative values. 
+ """ + + if denominator_column is not None and denominator_column not in df.columns: + raise InvalidColumnException(f"The column {denominator_column} was not found in the dataframe.") + + if denominator_column is not None and keep_denominator_column and columns and denominator_column not in columns: + raise InvalidColumnException( + f"Denominator column '{denominator_column}' must be in selected columns if keep_denominator_column is True." + ) + + denominator_column = denominator_column if denominator_column is not None else df.columns[-1] + + if columns: + invalid_columns = [col for col in columns if col not in df.columns] + if invalid_columns: + raise InvalidColumnException(f"The following columns were not found in the dataframe: {invalid_columns}.") + columns_to_transform = columns + + if denominator_column not in columns_to_transform: + df = df[columns_to_transform + [denominator_column]] + else: + df = df[columns_to_transform] + + else: + columns_to_transform = df.columns.to_list() + + if scale is not None: + df = _closure(df, scale) + + check_in_simplex_sample_space(df) + + if not keep_denominator_column and denominator_column in columns_to_transform: + columns_to_transform.remove(denominator_column) + + return rename_columns_by_pattern(_alr_transform(df, columns_to_transform, denominator_column)) + + +@beartype +def _inverse_alr(df: pd.DataFrame, denominator_column: str, scale: Number = 1.0) -> pd.DataFrame: + dfc = df.copy() + if denominator_column not in dfc.columns.values: + # Add the denominator column + dfc[denominator_column] = 0.0 + + return _closure(np.exp(dfc), scale) + + +@beartype +def inverse_alr( + df: pd.DataFrame, denominator_column: str, columns: Optional[Sequence[str]] = None, scale: Number = 1.0 +) -> pd.DataFrame: + """ + Perform the inverse transformation for a set of ALR transformed data. + + Args: + df: A dataframe of ALR transformed compositional data. + denominator_column: The name of the denominator column. 
+ columns: The names of the columns to be transformed. + scale: The value to which each composition should be normalized. Eg., if the composition is expressed + as percentages, scale=100. + + Returns: + A dataframe containing the inverse transformed data. + + Raises: + InvalidColumnException: The input column(s) not found in the dataframe. + NumericValueSignException: The input scale value is zero or less. + """ + if scale <= 0: + raise NumericValueSignException("The scale value should be positive.") + + if columns: + invalid_columns = [col for col in columns if col not in df.columns] + if invalid_columns: + raise InvalidColumnException(f"The following columns were not found in the dataframe: {invalid_columns}.") + df = df[columns] + + return _inverse_alr(df, denominator_column, scale) diff --git a/eis_toolkit/transformations/coda/clr.py b/eis_toolkit/transformations/coda/clr.py index a6022b4f..bd7c4940 100644 --- a/eis_toolkit/transformations/coda/clr.py +++ b/eis_toolkit/transformations/coda/clr.py @@ -1,79 +1,110 @@ -from numbers import Number - -import numpy as np -import pandas as pd -from beartype import beartype -from beartype.typing import Optional, Sequence -from scipy.stats import gmean - -from eis_toolkit.exceptions import NumericValueSignException -from eis_toolkit.utilities.aitchison_geometry import _closure -from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space -from eis_toolkit.utilities.miscellaneous import rename_columns, rename_columns_by_pattern - - -@beartype -def _centered_ratio(row: pd.Series) -> pd.Series: - - return row / gmean(row) - - -@beartype -def _clr_transform(df: pd.DataFrame) -> pd.DataFrame: - - dfc = df.copy() - dfc = dfc.apply(_centered_ratio, axis=1) - - return np.log(dfc) - - -@beartype -def clr_transform(df: pd.DataFrame) -> pd.DataFrame: - """ - Perform a centered logratio transformation on the data. - - Args: - df: A dataframe of compositional data. 
- - Returns: - A new dataframe containing the CLR transformed data. - - Raises: - InvalidCompositionException: Data is not normalized to the expected value. - NumericValueSignException: Data contains zeros or negative values. - """ - check_in_simplex_sample_space(df) - return rename_columns_by_pattern(_clr_transform(df)) - - -@beartype -def _inverse_clr(df: pd.DataFrame, colnames: Optional[Sequence[str]] = None, scale: Number = 1.0) -> pd.DataFrame: - inverse = _closure(np.exp(df), scale) - - if colnames is not None: - return rename_columns(inverse, colnames) - - return inverse - - -@beartype -def inverse_clr(df: pd.DataFrame, colnames: Optional[Sequence[str]] = None, scale: Number = 1.0) -> pd.DataFrame: - """ - Perform the inverse transformation for a set of CLR transformed data. - - Args: - df: A dataframe of CLR transformed compositional data. - colnames: List of column names to rename the columns to. - scale: The value to which each composition should be normalized. Eg., if the composition is expressed - as percentages, scale=100. - - Returns: - A dataframe containing the inverse transformed data. - - Raises: - NumericValueSignException: The input scale value is zero or less. 
- """ - if scale <= 0: - raise NumericValueSignException("The scale value should be positive.") - - return _inverse_clr(df, colnames, scale) +from numbers import Number + +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Optional, Sequence +from scipy.stats import gmean + +from eis_toolkit.exceptions import InvalidColumnException, NumericValueSignException +from eis_toolkit.utilities.aitchison_geometry import _closure +from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space +from eis_toolkit.utilities.miscellaneous import rename_columns, rename_columns_by_pattern + + +@beartype +def _centered_ratio(row: pd.Series) -> pd.Series: + return row / gmean(row) + + +@beartype +def _clr_transform(df: pd.DataFrame) -> pd.DataFrame: + dfc = df.copy() + dfc = dfc.apply(_centered_ratio, axis=1) + + return np.log(dfc) + + +@beartype +def clr_transform( + df: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + scale: Optional[Number] = None, +) -> pd.DataFrame: + """ + Perform a centered logratio transformation on the data. + + Args: + df: A dataframe of compositional data. + columns: The names of the columns to be transformed. + scale: The value to which each composition should be normalized. Eg., if the composition is expressed + as percentages, scale=100. Closure is not performed by default. + + Returns: + A new dataframe containing the CLR transformed data. + + Raises: + InvalidColumnException: The input column(s) not found in the dataframe. + InvalidCompositionException: Data is not normalized to the expected value. + NumericValueSignException: Data contains zeros or negative values. 
+ """ + + if columns: + invalid_columns = [col for col in columns if col not in df.columns] + if invalid_columns: + raise InvalidColumnException(f"The following columns were not found in the dataframe: {invalid_columns}.") + columns_to_transform = columns + df = df[columns_to_transform] + + if scale is not None: + df = _closure(df, scale) + + check_in_simplex_sample_space(df) + + return rename_columns_by_pattern(_clr_transform(df)) + + +@beartype +def _inverse_clr(df: pd.DataFrame, scale: Number = 1.0) -> pd.DataFrame: + return _closure(np.exp(df), scale) + + +@beartype +def inverse_clr( + df: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + colnames: Optional[Sequence[str]] = None, + scale: Number = 1.0, +) -> pd.DataFrame: + """ + Perform the inverse transformation for a set of CLR transformed data. + + Args: + df: A dataframe of CLR transformed compositional data. + columns: The names of the columns to be transformed. + colnames: List of column names to rename the columns to. + scale: The value to which each composition should be normalized. Eg., if the composition is expressed + as percentages, scale=100. + + Returns: + A dataframe containing the inverse transformed data. + + Raises: + InvalidColumnException: The input column(s) not found in the dataframe. + NumericValueSignException: The input scale value is zero or less. 
+ """ + if scale <= 0: + raise NumericValueSignException("The scale value should be positive.") + + if columns: + invalid_columns = [col for col in columns if col not in df.columns] + if invalid_columns: + raise InvalidColumnException(f"The following columns were not found in the dataframe: {invalid_columns}.") + df = df[columns] + + inverse_data = _inverse_clr(df, scale) + + if colnames: + return rename_columns(inverse_data, colnames) + + return inverse_data diff --git a/eis_toolkit/transformations/coda/ilr.py b/eis_toolkit/transformations/coda/ilr.py index ed8831f1..9891ef31 100644 --- a/eis_toolkit/transformations/coda/ilr.py +++ b/eis_toolkit/transformations/coda/ilr.py @@ -1,10 +1,13 @@ +from numbers import Number + import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Sequence +from beartype.typing import Optional, Sequence from scipy.stats import gmean from eis_toolkit.exceptions import InvalidColumnException, InvalidCompositionException, InvalidParameterValueException +from eis_toolkit.utilities.aitchison_geometry import _closure from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space from eis_toolkit.utilities.checks.dataframe import check_columns_valid from eis_toolkit.utilities.checks.parameter import check_lists_overlap, check_numeric_value_sign @@ -64,7 +67,10 @@ def _single_ilr_transform( @beartype def single_ilr_transform( - df: pd.DataFrame, subcomposition_1: Sequence[str], subcomposition_2: Sequence[str] + df: pd.DataFrame, + subcomposition_1: Sequence[str], + subcomposition_2: Sequence[str], + scale: Optional[Number] = None, ) -> pd.Series: """ Perform a single isometric logratio transformation on the provided subcompositions. @@ -75,6 +81,8 @@ def single_ilr_transform( df: A dataframe of shape [N, D] of compositional data. subcomposition_1: Names of the columns in the numerator part of the ratio. subcomposition_2: Names of the columns in the denominator part of the ratio. 
+ scale: The value to which each composition should be normalized. Eg., if the composition is expressed + as percentages, scale=100. Closure is not performed by default. Returns: A series of length N containing the transforms. @@ -86,7 +94,6 @@ def single_ilr_transform( InvalidParameterValueException: At least one subcomposition provided was empty. NumericValueSignException: Data contains zeros or negative values. """ - check_in_simplex_sample_space(df) if not (subcomposition_1 and subcomposition_2): raise InvalidParameterValueException("A subcomposition should contain at least one column.") @@ -97,4 +104,12 @@ def single_ilr_transform( if check_lists_overlap(subcomposition_1, subcomposition_2): raise InvalidCompositionException("The subcompositions overlap.") + columns = subcomposition_1 + subcomposition_2 + df = df[columns] + + if scale is not None: + df = _closure(df, scale) + + check_in_simplex_sample_space(df) + return _single_ilr_transform(df, subcomposition_1, subcomposition_2) diff --git a/eis_toolkit/transformations/coda/plr.py b/eis_toolkit/transformations/coda/plr.py index 3b58cca0..e41faf5e 100644 --- a/eis_toolkit/transformations/coda/plr.py +++ b/eis_toolkit/transformations/coda/plr.py @@ -1,126 +1,169 @@ -import numpy as np -import pandas as pd -from beartype import beartype -from scipy.stats import gmean - -from eis_toolkit.exceptions import InvalidColumnException, InvalidParameterValueException -from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space -from eis_toolkit.utilities.checks.parameter import check_numeric_value_sign -from eis_toolkit.utilities.miscellaneous import rename_columns_by_pattern - - -@beartype -def _calculate_plr_scaling_factor(c: int) -> np.float64: - """ - Calculate the scaling factor for the PLR transform. - - Args: - c: The cardinality of the remaining parts in the composition. - - Returns: - The scaling factor used performing a single PLR transform for a composition. 
- - Raises: - InvalidParameterValueException: The input value is zero or negative. - """ - if not (check_numeric_value_sign(c)): - raise InvalidParameterValueException("The input value must be a positive integer.") - - return np.sqrt(c / np.float64(1 + c)) - - -@beartype -def _single_plr_transform_by_index(df: pd.DataFrame, column_ind: int) -> pd.Series: - - dfc = df.copy() - # The denominator is a subcomposition of all the parts "to the right" of the column: - columns = [col for col in df.columns] - subcomposition = [columns[i] for i in range(len(columns)) if i > column_ind] - c = len(subcomposition) - scaling_factor = _calculate_plr_scaling_factor(c) - - # A series to hold the transformed rows - plr_values = pd.Series([0.0] * df.shape[0]) - - for idx, row in dfc.iterrows(): - plr_values[idx] = scaling_factor * np.log(row.iloc[column_ind] / gmean(row[subcomposition])) - - return plr_values - - -@beartype -def _single_plr_transform(df: pd.DataFrame, column: str) -> pd.Series: - - idx = df.columns.get_loc(column) - - return _single_plr_transform_by_index(df, idx) - - -@beartype -def single_plr_transform(df: pd.DataFrame, column: str) -> pd.Series: - """ - Perform a pivot logratio transformation on the selected column. - - Pivot logratio is a special case of ILR, where the numerator in the ratio is always a single - part and the denominator all of the parts to the right in the ordered list of parts. - - Column order matters. - - Args: - df: A dataframe of shape [N, D] of compositional data. - column: The name of the numerator column to use for the transformation. - - Returns: - A series of length N containing the transforms. - - Raises: - InvalidColumnException: The input column isn't found in the dataframe, or there are no columns - to the right of the given column. - InvalidCompositionException: Data is not normalized to the expected value. - NumericValueSignException: Data contains zeros or negative values. 
- """ - check_in_simplex_sample_space(df) - - if column not in df.columns: - raise InvalidColumnException(f"The column {column} was not found in the dataframe.") - - idx = df.columns.get_loc(column) - - if idx == len(df.columns) - 1: - raise InvalidColumnException() - - return _single_plr_transform(df, column) - - -@beartype -def _plr_transform(df: pd.DataFrame) -> pd.DataFrame: - dfc = df.copy() - - # A dataframe to hold the transformed values - plr_values = pd.DataFrame(0.0, index=dfc.index, columns=dfc.columns[:-1]) - - for i in range(len(df.columns) - 1): - plr_values.iloc[:, i] = _single_plr_transform_by_index(dfc, i) - - return plr_values - - -@beartype -def plr_transform(df: pd.DataFrame) -> pd.DataFrame: - """ - Perform a pivot logratio transformation on the dataframe, returning the full set of transforms. - - Args: - df: A dataframe of shape [N, D] of compositional data. - - Returns: - A dataframe of shape [N, D-1] containing the set of PLR transformed data. - - Raises: - InvalidColumnException: The data contains one or more zeros. - InvalidCompositionException: Data is not normalized to the expected value. - NumericValueSignException: Data contains zeros or negative values. 
- """ - check_in_simplex_sample_space(df) - - return rename_columns_by_pattern(_plr_transform(df)) +from numbers import Number + +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Optional, Sequence +from scipy.stats import gmean + +from eis_toolkit.exceptions import InvalidColumnException, InvalidParameterValueException +from eis_toolkit.utilities.aitchison_geometry import _closure +from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space +from eis_toolkit.utilities.checks.parameter import check_numeric_value_sign +from eis_toolkit.utilities.miscellaneous import rename_columns_by_pattern + + +@beartype +def _calculate_plr_scaling_factor(c: int) -> np.float64: + """ + Calculate the scaling factor for the PLR transform. + + Args: + c: The cardinality of the remaining parts in the composition. + + Returns: + The scaling factor used performing a single PLR transform for a composition. + + Raises: + InvalidParameterValueException: The input value is zero or negative. 
+ """ + if not (check_numeric_value_sign(c)): + raise InvalidParameterValueException("The input value must be a positive integer.") + + return np.sqrt(c / np.float64(1 + c)) + + +@beartype +def _single_plr_transform_by_index(df: pd.DataFrame, column_ind: int) -> pd.Series: + + dfc = df.copy() + # The denominator is a subcomposition of all the parts "to the right" of the column: + columns = [col for col in df.columns] + subcomposition = [columns[i] for i in range(len(columns)) if i > column_ind] + c = len(subcomposition) + + scaling_factor = _calculate_plr_scaling_factor(c) + + # A series to hold the transformed rows + plr_values = pd.Series([0.0] * df.shape[0]) + + for idx, row in dfc.iterrows(): + plr_values[idx] = scaling_factor * np.log(row.iloc[column_ind] / gmean(row[subcomposition])) + + return plr_values + + +@beartype +def single_plr_transform( + df: pd.DataFrame, + numerator: str, + denominator_columns: Optional[Sequence[str]] = None, + scale: Optional[Number] = None, +) -> pd.Series: + """ + Perform a pivot logratio transformation on the selected column. + + Pivot logratio is a special case of ILR, where the numerator in the ratio is always a single + part and the denominator all of the parts to the right in the ordered list of parts. + + Column order matters. + + Args: + df: A dataframe of shape [N, D] of compositional data. + numerator: The name of the numerator column to use for the transformation. + denominator_columns: The names of the columns to use for the transformation. Must be "to the right" of + the numerator column. + scale: The value to which each composition should be normalized. Eg., if the composition is expressed + as percentages, scale=100. Closure is not performed by default. + Returns: + A series of length N containing the transforms. 
+ + Raises: + InvalidColumnException: The input column isn't found in the dataframe, or there are no columns + to the right of the given column, or last column selected as numerator, or selected numerator + is in denominator columns, or one or more denominator columns is left of numerator column. + InvalidCompositionException: Data is not normalized to the expected value. + NumericValueSignException: Data contains zeros or negative values. + """ + if numerator not in df.columns: + raise InvalidColumnException(f"The numerator column {numerator} was not found in the dataframe.") + + numerator_idx = df.columns.get_loc(numerator) + if numerator_idx == len(df.columns) - 1: + raise InvalidColumnException("Can't select last column as numerator.") + + if denominator_columns is not None: + for column in denominator_columns: + if column not in df.columns: + raise InvalidColumnException(f"The column {column} was not found in the dataframe.") + + if numerator in denominator_columns: + raise InvalidColumnException(f"The column {numerator} is in the denominator columns.") + + for column in denominator_columns: + column_idx = df.columns.get_loc(column) + if column_idx < numerator_idx: + raise InvalidColumnException(f"The column {column} is to the left of the numerator column {numerator}.") + else: + # Select all columns to the right of the numerator + denominator_columns = df.columns[numerator_idx + 1 :].to_list() + + # Keep columns from numerator_idx to the right + columns = [numerator] + denominator_columns + df = df.loc[:, columns] + + if scale is not None: + df = _closure(df, scale) + + check_in_simplex_sample_space(df) + + return _single_plr_transform_by_index(df, 0) + + +@beartype +def _plr_transform(df: pd.DataFrame) -> pd.DataFrame: + dfc = df.copy() + + # A dataframe to hold the transformed values + plr_values = pd.DataFrame(0.0, index=dfc.index, columns=dfc.columns[:-1]) + + for i in range(len(df.columns) - 1): + plr_values.iloc[:, i] = 
_single_plr_transform_by_index(dfc, i) + + return plr_values + + +@beartype +def plr_transform( + df: pd.DataFrame, columns: Optional[Sequence[str]] = None, scale: Optional[Number] = None +) -> pd.DataFrame: + """ + Perform a pivot logratio transformation on the dataframe, returning the full set of transforms. + + Args: + df: A dataframe of shape [N, D] of compositional data. + columns: The names of the columns to use for the transformation. + scale: The value to which each composition should be normalized. Eg., if the composition is expressed + as percentages, scale=100. Closure is not performed by default. + + Returns: + A dataframe of shape [N, D-1] containing the set of PLR transformed data. + + Raises: + InvalidColumnException: The data contains one or more zeros, or input column(s) not found in the dataframe. + InvalidCompositionException: Data is not normalized to the expected value. + NumericValueSignException: Data contains zeros or negative values. + """ + + if columns: + invalid_columns = [col for col in columns if col not in df.columns] + if invalid_columns: + raise InvalidColumnException(f"The following columns were not found in the dataframe: {invalid_columns}.") + df = df[columns] + + if scale is not None: + df = _closure(df, scale) + + check_in_simplex_sample_space(df) + + return rename_columns_by_pattern(_plr_transform(df)) diff --git a/notebooks/testing_logratio_transformations.ipynb b/notebooks/testing_logratio_transformations.ipynb index f8f62961..e5dec896 100644 --- a/notebooks/testing_logratio_transformations.ipynb +++ b/notebooks/testing_logratio_transformations.ipynb @@ -20,9 +20,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/root/.cache/pypoetry/virtualenvs/eis-toolkit-QEzTY9B6-py3.10/lib/python3.10/site-packages/beartype/_util/hint/pep/utilpeptest.py:347: BeartypeDecorHintPep585DeprecationWarning: PEP 484 type hint typing.Sequence[str] deprecated by PEP 585. 
This hint is scheduled for removal in the first Python version released after October 5th, 2025. To resolve this, import this hint from \"beartype.typing\" rather than \"typing\". For further commentary and alternatives, see also:\n", - " https://beartype.readthedocs.io/en/latest/api_roar/#pep-585-deprecations\n", - " warn(\n" + "/home/mika/.cache/pypoetry/virtualenvs/eis-toolkit-l5cKD1lZ-py3.10/lib/python3.10/site-packages/geopandas/_compat.py:112: UserWarning: The Shapely GEOS version (3.10.3-CAPI-1.16.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.4-CAPI-1.16.2). Conversions between both will be slow.\n", + " warnings.warn(\n" ] } ], @@ -40,7 +39,7 @@ "from eis_toolkit.transformations.coda.pairwise import pairwise_logratio, single_pairwise_logratio\n", "from eis_toolkit.transformations.coda.plr import plr_transform, single_plr_transform\n", "\n", - "GEOCHEMICAL_DATA = \"../tests/data/local/coda/IOCG_CLB_Till_Geochem_reg_511p.shp\"" + "GEOCHEMICAL_DATA = \"../tests/data/remote/IOCG_CLB_Till_Geochem_reg_511p.gpkg\"" ] }, { @@ -89,7 +88,18 @@ "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mika/code/EIS/eis_toolkit/notebooks/../eis_toolkit/utilities/aitchison_geometry.py:43: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '84.21052631578948' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.\n", + " dfc.iloc[idx] = _normalize(row, scale) if scale is not None else _normalize(row)\n", + "/home/mika/code/EIS/eis_toolkit/notebooks/../eis_toolkit/utilities/aitchison_geometry.py:43: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. 
Value '15.789473684210527' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.\n", + " dfc.iloc[idx] = _normalize(row, scale) if scale is not None else _normalize(row)\n" + ] + } + ], "source": [ "pair_a_b = single_pairwise_logratio(float(C.iloc[0, 0]), float(C.iloc[0, 1]))\n", "pair_a_c = single_pairwise_logratio(float(C.iloc[0, 0]), float(C.iloc[0, 2]))\n", @@ -102,7 +112,7 @@ "C_clr_inv = inverse_clr(C_clr, scale=100.0)\n", "C_alr_inv = inverse_alr(C_alr, \"c\", scale=100)\n", "\n", - "C_ilr_ab = single_ilr_transform(C, [\"a\"], [\"b\"])\n", + "C_ilr_ab = single_ilr_transform(C, [\"a\"], [\"b\"], scale=100)\n", "C_ilr_ab_c = single_ilr_transform(C, [\"a\", \"b\"], [\"c\"])" ] }, @@ -601,7 +611,7 @@ }, "outputs": [], "source": [ - "sample_alr_inv = inverse_alr(sample_alr, \"d\", 100)" + "sample_alr_inv = inverse_alr(sample_alr, denominator_column=\"d\", scale=100)" ] }, { @@ -684,21 +694,6 @@ { "cell_type": "code", "execution_count": 18, - "id": "f49926c2-f1dd-47e8-a484-f78ce6821904", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Define some constants\n", - "\n", - "ppm = 1e-6\n", - "million = 1e6" - ] - }, - { - "cell_type": "code", - "execution_count": 19, "id": "81a98117-b981-47ea-a7bb-ba06c0dacb13", "metadata": { "tags": [] @@ -712,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "c0204220-7bf2-4235-b92a-0e139180050e", "metadata": { "tags": [] @@ -726,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "e1bda63b-ab9b-4060-90d5-7520952f2e3a", "metadata": { "tags": [] @@ -757,7 +752,6 @@ " Ca_ppm_511\n", " Fe_ppm_511\n", " Mg_ppm_511\n", - " residual\n", " \n", " \n", " \n", @@ -767,7 +761,6 @@ " 40200.0\n", " 83200.0\n", " 17200.0\n", - " 831800.0\n", " \n", " \n", " 1\n", @@ -775,7 +768,6 @@ " 5000.0\n", " 28300.0\n", " 7520.0\n", - " 945080.0\n", " \n", " \n", " 2\n", @@ -783,7 +775,6 @@ " 3070.0\n", " 14500.0\n", 
" 4540.0\n", - " 970010.0\n", " \n", " \n", " 3\n", @@ -791,7 +782,6 @@ " 3290.0\n", " 14600.0\n", " 3240.0\n", - " 971570.0\n", " \n", " \n", " 4\n", @@ -799,22 +789,21 @@ " 3600.0\n", " 31500.0\n", " 8020.0\n", - " 944380.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Al_ppm_511 Ca_ppm_511 Fe_ppm_511 Mg_ppm_511 residual\n", - "0 27600.0 40200.0 83200.0 17200.0 831800.0\n", - "1 14100.0 5000.0 28300.0 7520.0 945080.0\n", - "2 7880.0 3070.0 14500.0 4540.0 970010.0\n", - "3 7300.0 3290.0 14600.0 3240.0 971570.0\n", - "4 12500.0 3600.0 31500.0 8020.0 944380.0" + " Al_ppm_511 Ca_ppm_511 Fe_ppm_511 Mg_ppm_511\n", + "0 27600.0 40200.0 83200.0 17200.0\n", + "1 14100.0 5000.0 28300.0 7520.0\n", + "2 7880.0 3070.0 14500.0 4540.0\n", + "3 7300.0 3290.0 14600.0 3240.0\n", + "4 12500.0 3600.0 31500.0 8020.0" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -824,16 +813,12 @@ "\n", "df = gpd.read_file(GEOCHEMICAL_DATA, include_fields=elements_to_analyze)\n", "df = pd.DataFrame(df.drop(columns='geometry'))\n", - "\n", - "# Add a column for the residual\n", - "\n", - "df[\"residual\"] = million - np.sum(df, axis=1)\n", "df.head()" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 21, "id": "75728aa4-5b2e-46b6-9511-1250bf4b13ae", "metadata": { "tags": [] @@ -843,23 +828,34 @@ "pair_Al_Ca = pairwise_logratio(df, \"Al_ppm_511\", \"Ca_ppm_511\")\n", "pair_Fe_Mg = pairwise_logratio(df, \"Fe_ppm_511\", \"Mg_ppm_511\")\n", "pair_Mg_Al = pairwise_logratio(df, \"Mg_ppm_511\", \"Al_ppm_511\")\n", - "pair_Mg_res = pairwise_logratio(df, \"Mg_ppm_511\", \"residual\")\n", "\n", - "df_alr = alr_transform(df)\n", - "df_alr_Mg = alr_transform(df, \"Mg_ppm_511\")\n", - "df_clr = clr_transform(df)\n", - "df_plr = plr_transform(df)\n", + "df_alr = df.copy()\n", + "df_alr_Mg = df.copy()\n", + "df_clr = df.copy()\n", + "df_plr = df.copy()\n", + "\n", + "# As real world geochemical data will often not satisfy sum 
to a constant, a closure needs to be performed by providing the \"scale\" parameter.\n", + "# In this example, as the example data are in ppm, let's define a scaling factor 1e6\n", + "million = 1e6\n", + "\n", + "df_alr = alr_transform(df_alr, scale=million)\n", + "df_alr_Mg = alr_transform(df_alr_Mg, denominator_column=\"Mg_ppm_511\", scale=million)\n", + "df_clr = clr_transform(df_clr, scale=million)\n", + "df_plr = plr_transform(df_plr, scale=million)\n", "\n", "df_clr_inv = inverse_clr(df_clr, scale=million)\n", "df_alr_inv = inverse_alr(df_alr, \"c\", scale=million)\n", "\n", - "df_ilr_Al_Ca = single_ilr_transform(df, [\"Al_ppm_511\"], [\"Ca_ppm_511\"])\n", - "df_ilr_AlCa_FeMg = single_ilr_transform(df, [\"Al_ppm_511\", \"Ca_ppm_511\"], [\"Fe_ppm_511\", \"Mg_ppm_511\"])" + "df_ilr_Al_Ca = df.copy()\n", + "df_ilr_AlCa_FeMg = df.copy()\n", + "\n", + "df_ilr_Al_Ca = single_ilr_transform(df_ilr_Al_Ca, [\"Al_ppm_511\"], [\"Ca_ppm_511\"], scale=million)\n", + "df_ilr_AlCa_FeMg = single_ilr_transform(df_ilr_AlCa_FeMg, [\"Al_ppm_511\", \"Ca_ppm_511\"], [\"Fe_ppm_511\", \"Mg_ppm_511\"], scale=million)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "id": "e136d05d-671d-420f-95b9-5f350bc7a94c", "metadata": { "tags": [] @@ -876,7 +872,7 @@ "dtype: float64" ] }, - "execution_count": 25, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -887,7 +883,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 23, "id": "ad352680-433a-4026-b7b5-560b682dfb96", "metadata": { "tags": [] @@ -917,7 +913,6 @@ " V1\n", " V2\n", " V3\n", - " V4\n", " \n", " \n", " \n", @@ -926,50 +921,45 @@ " 0.472906\n", " 0.848958\n", " 1.576338\n", - " 3.878683\n", " \n", " \n", " 1\n", " 0.628609\n", " -0.408128\n", " 1.325296\n", - " 4.833703\n", " \n", " \n", " 2\n", " 0.551401\n", " -0.391249\n", " 1.161222\n", - " 5.364379\n", " \n", " \n", " 3\n", " 0.812301\n", " 0.015314\n", " 1.505448\n", - " 5.703340\n", " \n", " 
\n", " 4\n", " 0.443790\n", " -0.801005\n", " 1.368049\n", - " 4.768590\n", " \n", " \n", "\n", "" ], "text/plain": [ - " V1 V2 V3 V4\n", - "0 0.472906 0.848958 1.576338 3.878683\n", - "1 0.628609 -0.408128 1.325296 4.833703\n", - "2 0.551401 -0.391249 1.161222 5.364379\n", - "3 0.812301 0.015314 1.505448 5.703340\n", - "4 0.443790 -0.801005 1.368049 4.768590" + " V1 V2 V3\n", + "0 0.472906 0.848958 1.576338\n", + "1 0.628609 -0.408128 1.325296\n", + "2 0.551401 -0.391249 1.161222\n", + "3 0.812301 0.015314 1.505448\n", + "4 0.443790 -0.801005 1.368049" ] }, - "execution_count": 26, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -977,19 +967,11 @@ "source": [ "df_alr_Mg.head()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b6a1929-51ef-4b7a-8621-f46bbe337e31", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "eis-toolkit-l5cKD1lZ-py3.10", "language": "python", "name": "python3" }, @@ -1003,7 +985,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/tests/data/remote/test.gpkg b/tests/data/remote/test.gpkg index 6dd130f4..b0b2ca51 100644 Binary files a/tests/data/remote/test.gpkg and b/tests/data/remote/test.gpkg differ diff --git a/tests/transformations/coda/alr_test.py b/tests/transformations/coda/alr_test.py index f6a41965..edba1028 100644 --- a/tests/transformations/coda/alr_test.py +++ b/tests/transformations/coda/alr_test.py @@ -14,7 +14,7 @@ def test_alr_transform(): arr = np.random.dirichlet(np.ones(4), size=4) df = pd.DataFrame(arr, columns=["a", "b", "c", "d"], dtype=np.float64) - result = alr_transform(df, column="b", keep_denominator_column=True) + result = alr_transform(df, denominator_column="b", keep_denominator_column=True) expected = pd.DataFrame( np.log(arr / arr[:, 1, None]), 
columns=["V1", "V2", "V3", "V4"], @@ -22,7 +22,7 @@ def test_alr_transform(): ) pd.testing.assert_frame_equal(result, expected) - result = alr_transform(df, column="b") + result = alr_transform(df, denominator_column="b") expected = pd.DataFrame( np.log(np.delete(arr, 1, axis=1) / arr[:, 1, None]), columns=["V1", "V2", "V3"], @@ -31,12 +31,34 @@ def test_alr_transform(): pd.testing.assert_frame_equal(result, expected) -def test_alr_transform_with_invalid_column(): - """Test that providing a column doesn't exist raises the correct exception.""" +def test_alr_transform_with_columns(): + """Test ALR transform with column selection.""" + arr = np.random.dirichlet(np.ones(4), size=4) + df = pd.DataFrame(arr, columns=["a", "b", "c", "d"], dtype=np.float64) + df["e"] = ["value1", "value2", "value3", "value4"] + + result = alr_transform(df, columns=["a", "b", "c", "d"], denominator_column="b", keep_denominator_column=True) + + expected = pd.DataFrame( + np.log(arr / arr[:, 1, None]), + columns=["V1", "V2", "V3", "V4"], + dtype=np.float64, + ) + pd.testing.assert_frame_equal(result, expected) + + +def test_alr_transform_with_invalid_denominator_column(): + """Test that providing a denominator column that doesn't exist raises the correct exception.""" with pytest.raises(InvalidColumnException): alr_transform(SAMPLE_DATAFRAME, "e") +def test_alr_transform_with_invalid_columns(): + """Test that providing invalid columns raises the correct exception.""" + with pytest.raises(InvalidColumnException): + alr_transform(SAMPLE_DATAFRAME, columns=["x", "y", "z"]) + + def test_alr_transform_denominator_column(): """ Test ALR transformation with the keep_denominator_column option set to True.
@@ -55,7 +77,7 @@ def test_inverse_alr(): arr = np.array([[np.log(0.25), np.log(0.25), np.log(0.25)], [np.log(2), np.log(2), np.log(2)]]) df = pd.DataFrame(arr, columns=["V1", "V2", "V3"], dtype=np.float64) column_name = "d" - result = inverse_alr(df, column_name, 7) + result = inverse_alr(df, denominator_column=column_name, scale=7) expected_arr = np.array([[1, 1, 1, 4], [2, 2, 2, 1]]) expected = pd.DataFrame(expected_arr, columns=["V1", "V2", "V3", "d"], dtype=np.float64) pd.testing.assert_frame_equal(result, expected, atol=1e-2) @@ -69,7 +91,7 @@ def test_inverse_alr_with_existing_denominator_column(): expected_arr = np.array([[1, 1, 4, 1], [2, 2, 1, 2]]) expected = pd.DataFrame(expected_arr, columns=["V1", "V2", "d", "V3"], dtype=np.float64) - result = inverse_alr(df, column_name, 7) + result = inverse_alr(df, denominator_column=column_name, scale=7) pd.testing.assert_frame_equal(result, expected, atol=1e-2) @@ -78,6 +100,16 @@ def test_inverse_alr_with_invalid_scale_value(): arr = np.array([[np.log(0.25), np.log(0.25), np.log(0.25)], [np.log(2), np.log(2), np.log(2)]]) df = pd.DataFrame(arr, columns=["V1", "V2", "V3"], dtype=np.float64) with pytest.raises(NumericValueSignException): - inverse_alr(df, "d", 0) + inverse_alr(df, denominator_column="d", scale=0) with pytest.raises(NumericValueSignException): - inverse_alr(df, "d", -7) + inverse_alr(df, denominator_column="d", scale=-7) + + +def test_inverse_alr_with_invalid_columns(): + """Test that providing invalid columns raises the correct exception.""" + arr = np.array([[np.log(0.25), np.log(0.25), np.log(0.25)], [np.log(2), np.log(2), np.log(2)]]) + df = pd.DataFrame(arr, columns=["V1", "V2", "V3"], dtype=np.float64) + with pytest.raises(InvalidColumnException): + inverse_alr(df, columns=["a"], denominator_column="V1") + with pytest.raises(InvalidColumnException): + inverse_alr(df, columns=["a", "b", "c"], denominator_column="V1") diff --git a/tests/transformations/coda/clr_test.py 
b/tests/transformations/coda/clr_test.py index 03eb9815..59a3e4a8 100644 --- a/tests/transformations/coda/clr_test.py +++ b/tests/transformations/coda/clr_test.py @@ -23,21 +23,44 @@ def test_clr_transform(): pd.testing.assert_frame_equal(result, expected, atol=1e-2) +def test_clr_transform_with_columns(): + """Test CLR transform with column selection.""" + arr = np.random.dirichlet(np.ones(4), size=4) + df = pd.DataFrame(arr, columns=["a", "b", "c", "d"], dtype=np.float64) + df["e"] = ["value1", "value2", "value3", "value4"] + result = clr_transform(df, columns=["a", "b", "c", "d"]) + geometric_means = np.prod(arr, axis=1) ** (1 / arr.shape[1]) + expected = pd.DataFrame( + np.log(arr / geometric_means[:, None]), + columns=["V1", "V2", "V3", "V4"], + dtype=np.float64, + ) + pd.testing.assert_frame_equal(result, expected, atol=1e-2) + + def test_inverse_clr_simple(): """Test CLR inverse core functionality.""" zeros_df_4x4 = pd.DataFrame(np.zeros((4, 4)), columns=["V1", "V2", "V3", "V4"]) ones_df_4x4 = pd.DataFrame(np.ones((4, 4)), columns=["a", "b", "c", "d"]) - result = inverse_clr(zeros_df_4x4, ["a", "b", "c", "d"], 4) + result = inverse_clr(zeros_df_4x4, colnames=["a", "b", "c", "d"], scale=4) pd.testing.assert_frame_equal(result, ones_df_4x4) def test_inverse_clr(): """Test CLR inverse core functionality.""" clr = clr_transform(SAMPLE_DATAFRAME) - result = inverse_clr(clr, ["a", "b", "c", "d"], 100) + result = inverse_clr(clr, colnames=["a", "b", "c", "d"], scale=100) pd.testing.assert_frame_equal(result, SAMPLE_DATAFRAME) +def test_inverse_clr_with_columns(): + """Test CLR inverse with column selection.""" + clr = clr_transform(SAMPLE_DATAFRAME) + result = inverse_clr(clr, columns=["V1", "V2"], colnames=["a", "b"], scale=100) + expected = pd.DataFrame({"a": [84.42, 79.75], "b": [15.58, 20.25]}) + pd.testing.assert_frame_equal(result, expected, atol=1e-2) + + def test_inverse_clr_with_invalid_scale_value(): """Test that inverse CLR with an invalid input scale 
raises the correct exception.""" clr = clr_transform(SAMPLE_DATAFRAME) diff --git a/tests/transformations/coda/ilr_test.py b/tests/transformations/coda/ilr_test.py index 29922c04..c1195cd9 100644 --- a/tests/transformations/coda/ilr_test.py +++ b/tests/transformations/coda/ilr_test.py @@ -15,10 +15,10 @@ def test_calculate_scaling_factor(): def test_single_ilr_transform_with_single_composition(): """Test the core functionality of a single ILR transform with a single row of data.""" - arr = np.array([80, 15, 5]) + arr = np.array([80, 15, 5]).astype(np.float64) df = pd.DataFrame(arr[None], columns=["a", "b", "c"]) - result = single_ilr_transform(df, ["a"], ["b"]) + result = single_ilr_transform(df, ["a"], ["b"], scale=100) assert result[0] == pytest.approx(1.18, abs=1e-2) result = single_ilr_transform(df, ["a", "b"], ["c"]) @@ -27,10 +27,10 @@ def test_single_ilr_transform_with_single_composition(): def test_single_ilr_transform(): """Test the core functionality of a single ILR transform.""" - arr = np.array([[80, 15, 5], [75, 18, 7]]) + arr = np.array([[80, 15, 5], [75, 18, 7]]).astype(dtype=np.float64) df = pd.DataFrame(arr, columns=["a", "b", "c"]) - result = single_ilr_transform(df, ["a"], ["b"]) + result = single_ilr_transform(df, ["a"], ["b"], scale=100) assert result[1] == pytest.approx(1.01, abs=1e-2) result = single_ilr_transform(df, ["a", "b"], ["c"]) diff --git a/tests/transformations/coda/plr_test.py b/tests/transformations/coda/plr_test.py index d5bd3672..5e8c567b 100644 --- a/tests/transformations/coda/plr_test.py +++ b/tests/transformations/coda/plr_test.py @@ -17,7 +17,7 @@ def test_single_plr_transform_with_single_composition(): result = _single_plr_transform_by_index(df, 0) assert result[0] == pytest.approx(1.82, abs=1e-2) - result = single_plr_transform(df, "b") + result = single_plr_transform(df, "b", scale=100) assert result[0] == pytest.approx(0.78, abs=1e-2) result = _single_plr_transform_by_index(df, 1) @@ -26,10 +26,10 @@ def 
test_single_plr_transform_with_single_composition(): def test_single_plr_transform_with_simple_data(): """Test the core functionality of a single PLR transform.""" - arr = np.array([[80, 15, 5], [75, 18, 7]]) + arr = np.array([[80, 15, 5], [75, 20, 5]]) df = pd.DataFrame(arr, columns=["a", "b", "c"]) - result = single_plr_transform(df, "b") - assert result[1] == pytest.approx(0.67, abs=1e-2) + result = single_plr_transform(df, "a") + assert result[1] == pytest.approx(1.65, abs=1e-2) def test_single_plr_transform_with_last_column(): @@ -40,6 +40,28 @@ def test_single_plr_transform_with_last_column(): single_plr_transform(df, "c") +def test_single_plr_invalid_columns(): + """Test that invalid column names raise exceptions.""" + arr = np.array([[80, 15, 5], [75, 18, 7]]) + df = pd.DataFrame(arr, columns=["a", "b", "c"]) + + # Numerator not in df + with pytest.raises(InvalidColumnException): + single_plr_transform(df, "x") + + # A denominator column not in df + with pytest.raises(InvalidColumnException): + single_plr_transform(df, "a", "x") + + # Numerator in denominator columns + with pytest.raises(InvalidColumnException): + single_plr_transform(df, "a", ["a", "b"]) + + # A denominator column is to the left of numerator column + with pytest.raises(InvalidColumnException): + single_plr_transform(df, "b", ["a", "c"]) + + def test_plr_transform(): """Test PLR transform core functionality.""" arr = np.array([[65, 12, 18, 5], [63, 16, 15, 6]]) @@ -48,3 +70,13 @@ def test_plr_transform(): assert len(result.columns) == len(df.columns) - 1 expected = pd.DataFrame(np.array([[1.60, 0.19, 0.91], [1.49, 0.43, 0.65]]), columns=["V1", "V2", "V3"]) pd.testing.assert_frame_equal(result, expected, atol=1e-2) + + +def test_plr_transform_with_columns(): + """Test PLR transform with column selection.""" + arr = np.array([[0, 65, 12, 18, 5], [0, 63, 16, 15, 6]]) + df = pd.DataFrame(arr, columns=["a", "b", "c", "d", "e"]) + result = plr_transform(df, columns=["b", "c", "d", "e"]) + assert
len(result.columns) == 3 + expected = pd.DataFrame(np.array([[1.60, 0.19, 0.91], [1.49, 0.43, 0.65]]), columns=["V1", "V2", "V3"]) + pd.testing.assert_frame_equal(result, expected, atol=1e-2) diff --git a/tests/utilities/compositional_test.py b/tests/utilities/compositional_test.py index d7613f7e..e7456c81 100644 --- a/tests/utilities/compositional_test.py +++ b/tests/utilities/compositional_test.py @@ -35,7 +35,7 @@ def test_compositional_data_has_negatives(): with pytest.raises(NumericValueSignException): clr_transform(df) with pytest.raises(NumericValueSignException): - single_ilr_transform(df, ["a"], ["b"]) + single_ilr_transform(df, ["a"], ["b", "c"], scale=100) with pytest.raises(NumericValueSignException): plr_transform(df) with pytest.raises(NumericValueSignException):