Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/normality_test.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Normality test

::: eis_toolkit.exploratory_analyses.normality_test
126 changes: 126 additions & 0 deletions eis_toolkit/exploratory_analyses/normality_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from numbers import Number

import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Dict, Optional, Sequence, Tuple
from scipy.stats import shapiro

from eis_toolkit.exceptions import (
EmptyDataException,
InvalidColumnException,
InvalidDataShapeException,
InvalidRasterBandException,
NonNumericDataException,
SampleSizeExceededException,
)
from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe


@beartype
def normality_test_dataframe(
    data: pd.DataFrame, columns: Optional[Sequence[str]] = None
) -> Dict[str, Tuple[float, float]]:
    """
    Compute Shapiro-Wilk test for normality on the input DataFrame.

    Nodata values are dropped automatically, separately for each column.

    Args:
        data: Dataframe containing the input data.
        columns: Column selection. If None, normality is tested for all columns.

    Returns:
        Test statistic and p_value for each selected column in a dictionary.

    Raises:
        EmptyDataException: The input data is empty.
        InvalidColumnException: All selected columns were not found in the input data.
        NonNumericDataException: Selected data or columns contains non-numeric data.
        SampleSizeExceededException: Input data exceeds the maximum of 5000 samples.
    """
    if check_empty_dataframe(data):
        raise EmptyDataException("The input Dataframe is empty.")

    if columns is not None:
        if not check_columns_valid(data, columns):
            raise InvalidColumnException("All selected columns were not found in the input DataFrame.")
        if not check_columns_numeric(data, columns):
            raise NonNumericDataException("The selected columns contain non-numeric data.")
    else:
        if not check_columns_numeric(data, data.columns):
            raise NonNumericDataException("The input data contain non-numeric data.")
        columns = data.columns

    statistics = {}
    for column in columns:
        # Drop nodata per column: a NaN in one column must not discard valid
        # samples of the other columns. (Previously dropna() was applied only
        # when `columns` was given, so with the default selection NaN values
        # reached shapiro() unfiltered.)
        column_data = data[column].dropna()
        if len(column_data) > 5000:
            raise SampleSizeExceededException(f"Sample size for column '{column}' exceeds the limit of 5000 samples.")
        statistics[column] = shapiro(column_data)

    return statistics


@beartype
def normality_test_array(
    data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None
) -> Dict[int, Tuple[float, float]]:
    """
    Compute Shapiro-Wilk test for normality on the input Numpy array.

    It is assumed that 3D input array represents multiband raster and the first dimension is the number of bands
    (same shape as Rasterio reads a raster into an array). Normality is calculated for each band separately.
    NaN values and optionally a specified nodata value are masked out before calculations.

    Args:
        data: Numpy array containing the input data. Array should either be 1D, 2D or 3D.
        bands: Band selection. Applies only if input array is 3D. If None, normality is tested for each band.
        nodata_value: Nodata value to be masked out. Optional parameter.

    Returns:
        Test statistic and p_value for each selected band in a dictionary.

    Raises:
        EmptyDataException: The input data is empty.
        InvalidRasterBandException: All selected bands were not found in the input data.
        InvalidDataShapeException: Input data has incorrect number of dimensions (> 3).
        SampleSizeExceededException: Input data exceeds the maximum of 5000 samples.
    """
    if data.size == 0:
        raise EmptyDataException("The input Numpy array is empty.")

    if data.ndim == 1 or data.ndim == 2:
        # Treat 1D/2D input as a single-band raster.
        prepared_data = np.expand_dims(data, axis=0)
        bands = range(1)

    elif data.ndim == 3:
        if bands is not None:
            if not all(band < len(data) for band in bands):
                raise InvalidRasterBandException("All selected bands were not found in the input array.")
        else:
            bands = range(len(data))
        prepared_data = data

    else:
        raise InvalidDataShapeException(f"The input data has unexpected number of dimensions: {data.ndim}.")

    statistics = {}

    for band in bands:
        flattened_data = prepared_data[band].ravel()

        # NaN never compares equal to anything (including itself), so the old
        # `flattened_data == np.nan` masked nothing; np.isnan is required.
        invalid_mask = np.isnan(flattened_data)
        if nodata_value is not None:
            # Union of the masks: an element is invalid if it is NaN OR the
            # nodata value (the old `&` kept only their intersection, i.e. nothing).
            invalid_mask |= flattened_data == nodata_value
        # Select the valid samples explicitly; scipy's shapiro() ignores the
        # mask of a np.ma.masked_array, and len() of one counts masked
        # elements, so a plain filtered array is both correct and simpler.
        valid_data = flattened_data[~invalid_mask]

        if valid_data.size > 5000:
            raise SampleSizeExceededException(f"Sample size for band '{band}' exceeds the limit of 5000 samples.")

        statistics[band] = shapiro(valid_data)

    return statistics
78 changes: 4 additions & 74 deletions eis_toolkit/exploratory_analyses/statistical_tests.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Dict, Literal, Optional, Sequence, Tuple, Union
from scipy.stats import chi2_contingency, shapiro

from eis_toolkit.exceptions import (
EmptyDataException,
EmptyDataFrameException,
InvalidColumnException,
InvalidParameterValueException,
NonNumericDataException,
SampleSizeExceededException,
)
from beartype.typing import Literal, Optional, Sequence
from scipy.stats import chi2_contingency

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe


Expand Down Expand Up @@ -57,68 +49,6 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se
return statistics


@beartype
def normality_test(
    data: Union[pd.DataFrame, np.ndarray], columns: Optional[Sequence[str]] = None
) -> Union[Dict[str, Tuple[float, float]], Tuple[float, float]]:
    """Compute Shapiro-Wilk test for normality on the input data.

    Args:
        data: Dataframe or Numpy array containing the input data.
        columns: Optional columns to be used for testing. Ignored for Numpy
            array input (the array branch below never reads it).

    Returns:
        Test statistics for each variable, output differs based on input data type.
        Numpy array input returns a Tuple of statistic and p_value.
        Dataframe input returns a dictionary where keys are column names
        and values are tuples containing the statistic and p-value.

    Raises:
        EmptyDataException: The input data is empty.
        InvalidColumnException: All selected columns were not found in the input data.
        NonNumericDataException: Selected data or columns contains non-numeric data.
        SampleSizeExceededException: Input data exceeds the maximum of 5000 samples.
    """
    statistics = {}
    # DataFrame branch: test each (selected) column separately.
    if isinstance(data, pd.DataFrame):
        if check_empty_dataframe(data):
            raise EmptyDataException("The input Dataframe is empty.")

        if columns is not None:
            if not check_columns_valid(data, columns):
                raise InvalidColumnException("All selected columns were not found in the input DataFrame.")
            if not check_columns_numeric(data, columns):
                raise NonNumericDataException("The selected columns contain non-numeric data.")

            # Row-wise dropna over the selected subset: a NaN in any selected
            # column discards the whole row for every selected column.
            data = data[columns].dropna()

        else:
            if not check_columns_numeric(data, data.columns):
                raise NonNumericDataException("The input data contain non-numeric data.")
            # NOTE(review): no dropna() in this branch — NaN values reach
            # shapiro() when testing all columns.
            columns = data.columns

        for column in columns:
            if len(data[column]) > 5000:
                raise SampleSizeExceededException(f"Sample size for '{column}' exceeds the limit of 5000 samples.")
            statistic, p_value = shapiro(data[column])
            statistics[column] = (statistic, p_value)

    # Numpy array branch: one test over all elements, returned as a bare tuple.
    else:
        if data.size == 0:
            raise EmptyDataException("The input numpy array is empty.")
        # NOTE(review): len() of a multi-dimensional array is the length of
        # the first axis, not the element count, so this limit under-counts
        # samples for 2D/3D input; it is also checked before NaN removal.
        if len(data) > 5000:
            raise SampleSizeExceededException("Sample size exceeds the limit of 5000 samples.")

        # Boolean-mask indexing already yields a 1-D array of the non-NaN
        # elements, so the flatten() below is effectively a no-op.
        nan_mask = np.isnan(data)
        data = data[~nan_mask]

        flattened_data = data.flatten()
        statistic, p_value = shapiro(flattened_data)
        statistics = (statistic, p_value)

    return statistics


@beartype
def correlation_matrix(
data: pd.DataFrame,
Expand Down
Loading