Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/normality_test.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Normality test

::: eis_toolkit.exploratory_analyses.normality_test
126 changes: 126 additions & 0 deletions eis_toolkit/exploratory_analyses/normality_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from numbers import Number

import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Dict, Optional, Sequence, Tuple
from scipy.stats import shapiro

from eis_toolkit.exceptions import (
EmptyDataException,
InvalidColumnException,
InvalidDataShapeException,
InvalidRasterBandException,
NonNumericDataException,
SampleSizeExceededException,
)
from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe


@beartype
def normality_test_dataframe(
    data: pd.DataFrame, columns: Optional[Sequence[str]] = None
) -> Dict[str, Tuple[float, float]]:
    """
    Compute Shapiro-Wilk test for normality on the input DataFrame.

    Nodata values are dropped automatically, separately for each column.

    Args:
        data: Dataframe containing the input data.
        columns: Column selection. If None, normality is tested for all columns.

    Returns:
        Test statistic and p_value for each selected column in a dictionary.

    Raises:
        EmptyDataException: The input data is empty.
        InvalidColumnException: All selected columns were not found in the input data.
        NonNumericDataException: Selected data or columns contains non-numeric data.
        SampleSizeExceededException: Input data exceeds the maximum of 5000 samples.
    """
    if check_empty_dataframe(data):
        raise EmptyDataException("The input Dataframe is empty.")

    if columns is not None:
        if not check_columns_valid(data, columns):
            raise InvalidColumnException("All selected columns were not found in the input DataFrame.")
        if not check_columns_numeric(data, columns):
            raise NonNumericDataException("The selected columns contain non-numeric data.")
    else:
        if not check_columns_numeric(data, data.columns):
            raise NonNumericDataException("The input data contain non-numeric data.")
        columns = data.columns

    statistics = {}
    for column in columns:
        # Drop nodata per column: a NaN in one column must not discard valid
        # samples of the other columns. (Previously dropna() was applied only
        # when `columns` was given, so with the default selection NaN values
        # reached shapiro() unfiltered.)
        column_data = data[column].dropna()
        if len(column_data) > 5000:
            raise SampleSizeExceededException(f"Sample size for column '{column}' exceeds the limit of 5000 samples.")
        statistics[column] = shapiro(column_data)

    return statistics


@beartype
def normality_test_array(
    data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None
) -> Dict[int, Tuple[float, float]]:
    """
    Compute Shapiro-Wilk test for normality on the input Numpy array.

    It is assumed that 3D input array represents multiband raster and the first dimension is the number of bands
    (same shape as Rasterio reads a raster into an array). Normality is calculated for each band separately.
    NaN values and optionally a specified nodata value are masked out before calculations.

    Args:
        data: Numpy array containing the input data. Array should either be 1D, 2D or 3D.
        bands: Band selection. Applies only if input array is 3D. If None, normality is tested for each band.
        nodata_value: Nodata value to be masked out. Optional parameter.

    Returns:
        Test statistic and p_value for each selected band in a dictionary.

    Raises:
        EmptyDataException: The input data is empty.
        InvalidRasterBandException: All selected bands were not found in the input data.
        InvalidDataShapeException: Input data has incorrect number of dimensions (> 3).
        SampleSizeExceededException: Input data exceeds the maximum of 5000 samples.
    """
    if data.size == 0:
        raise EmptyDataException("The input Numpy array is empty.")

    if data.ndim == 1 or data.ndim == 2:
        # Treat 1D/2D input as a single-band raster.
        prepared_data = np.expand_dims(data, axis=0)
        bands = range(1)

    elif data.ndim == 3:
        if bands is not None:
            if not all(band < len(data) for band in bands):
                raise InvalidRasterBandException("All selected bands were not found in the input array.")
        else:
            bands = range(len(data))
        prepared_data = data

    else:
        raise InvalidDataShapeException(f"The input data has unexpected number of dimensions: {data.ndim}.")

    statistics = {}

    for band in bands:
        flattened_data = prepared_data[band].ravel()

        # NaN never compares equal to anything (including itself), so the old
        # `flattened_data == np.nan` masked nothing; np.isnan is required.
        invalid_mask = np.isnan(flattened_data)
        if nodata_value is not None:
            # Union of the masks: an element is invalid if it is NaN OR the
            # nodata value (the old `&` kept only their intersection, i.e. nothing).
            invalid_mask |= flattened_data == nodata_value
        # Select the valid samples explicitly; scipy's shapiro() ignores the
        # mask of a np.ma.masked_array, and len() of one counts masked
        # elements, so a plain filtered array is both correct and simpler.
        valid_data = flattened_data[~invalid_mask]

        if valid_data.size > 5000:
            raise SampleSizeExceededException(f"Sample size for band '{band}' exceeds the limit of 5000 samples.")

        statistics[band] = shapiro(valid_data)

    return statistics
78 changes: 4 additions & 74 deletions eis_toolkit/exploratory_analyses/statistical_tests.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Dict, Literal, Optional, Sequence, Tuple, Union
from scipy.stats import chi2_contingency, shapiro

from eis_toolkit.exceptions import (
EmptyDataException,
EmptyDataFrameException,
InvalidColumnException,
InvalidParameterValueException,
NonNumericDataException,
SampleSizeExceededException,
)
from beartype.typing import Literal, Optional, Sequence
from scipy.stats import chi2_contingency

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe


Expand Down Expand Up @@ -57,68 +49,6 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se
return statistics


@beartype
def normality_test(
    data: Union[pd.DataFrame, np.ndarray], columns: Optional[Sequence[str]] = None
) -> Union[Dict[str, Tuple[float, float]], Tuple[float, float]]:
    """Compute Shapiro-Wilk test for normality on the input data.

    Args:
        data: Dataframe or Numpy array containing the input data.
        columns: Optional columns to be used for testing. Ignored for Numpy
            array input (the array branch below never reads it).

    Returns:
        Test statistics for each variable, output differs based on input data type.
        Numpy array input returns a Tuple of statistic and p_value.
        Dataframe input returns a dictionary where keys are column names
        and values are tuples containing the statistic and p-value.

    Raises:
        EmptyDataException: The input data is empty.
        InvalidColumnException: All selected columns were not found in the input data.
        NonNumericDataException: Selected data or columns contains non-numeric data.
        SampleSizeExceededException: Input data exceeds the maximum of 5000 samples.
    """
    statistics = {}
    # DataFrame branch: test each (selected) column separately.
    if isinstance(data, pd.DataFrame):
        if check_empty_dataframe(data):
            raise EmptyDataException("The input Dataframe is empty.")

        if columns is not None:
            if not check_columns_valid(data, columns):
                raise InvalidColumnException("All selected columns were not found in the input DataFrame.")
            if not check_columns_numeric(data, columns):
                raise NonNumericDataException("The selected columns contain non-numeric data.")

            # Row-wise dropna over the selected subset: a NaN in any selected
            # column discards the whole row for every selected column.
            data = data[columns].dropna()

        else:
            if not check_columns_numeric(data, data.columns):
                raise NonNumericDataException("The input data contain non-numeric data.")
            # NOTE(review): no dropna() in this branch — NaN values reach
            # shapiro() when testing all columns.
            columns = data.columns

        for column in columns:
            if len(data[column]) > 5000:
                raise SampleSizeExceededException(f"Sample size for '{column}' exceeds the limit of 5000 samples.")
            statistic, p_value = shapiro(data[column])
            statistics[column] = (statistic, p_value)

    # Numpy array branch: one test over all elements, returned as a bare tuple.
    else:
        if data.size == 0:
            raise EmptyDataException("The input numpy array is empty.")
        # NOTE(review): len() of a multi-dimensional array is the length of
        # the first axis, not the element count, so this limit under-counts
        # samples for 2D/3D input; it is also checked before NaN removal.
        if len(data) > 5000:
            raise SampleSizeExceededException("Sample size exceeds the limit of 5000 samples.")

        # Boolean-mask indexing already yields a 1-D array of the non-NaN
        # elements, so the flatten() below is effectively a no-op.
        nan_mask = np.isnan(data)
        data = data[~nan_mask]

        flattened_data = data.flatten()
        statistic, p_value = shapiro(flattened_data)
        statistics = (statistic, p_value)

    return statistics


@beartype
def correlation_matrix(
data: pd.DataFrame,
Expand Down
Loading