Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,8 @@
## 0.7.4

- Added start and end parameters to plotting functions.

## 0.8.0

- Added `remove_duplicates` and `select_time_range` univariate processing functions.
- Added a matplotlib-based `plot_mpl` plotting method for signals and a `check_missing_values` time series statistic.
- Updated the required Python version to >3.10 and added matplotlib as a dependency.
515 changes: 502 additions & 13 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
[tool.poetry]
name = "meteaudata"
version = "0.7.4"
version = "0.8.0"
description = "A lightweight package for tracking metadata about time series to create repeatable data pipelines."
authors = ["Jean-David Therrien <jeandavidt@gmail.com>"]
license = "CC-BY4"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.9"
python = ">3.10,<4.0"
pandas = ">=1.4"
ipython = "^8.2"
pyyaml = "^6.0.1"
pydantic = "^2.0"
plotly = "^5.21.0"
matplotlib = "3.10.1"

[tool.poetry.group.dev.dependencies]
mypy = "^1.9.0"
Expand Down
6 changes: 6 additions & 0 deletions src/meteaudata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@
from meteaudata.processing_steps.univariate.interpolate import ( # noqa: F401
linear_interpolation,
)
from meteaudata.processing_steps.univariate.remove_duplicates import (
remove_duplicates, # noqa: F401
)
from meteaudata.processing_steps.univariate.replace import replace_ranges # noqa: F401
from meteaudata.processing_steps.univariate.resample import resample # noqa: F401
from meteaudata.processing_steps.univariate.select_time_range import (
select_time_range, # noqa: F401
)
from meteaudata.processing_steps.univariate.subset import subset # noqa: F401
from meteaudata.types import ( # noqa: F401
DataProvenance,
Expand Down
1 change: 1 addition & 0 deletions src/meteaudata/processing_steps/multivariate/average.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional

import pandas as pd

from meteaudata.types import (
DataProvenance,
FunctionInfo,
Expand Down
76 changes: 76 additions & 0 deletions src/meteaudata/processing_steps/univariate/remove_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import datetime
from typing import Literal

import pandas as pd

from meteaudata.types import (
FunctionInfo,
Parameters,
ProcessingStep,
ProcessingType,
)


def remove_duplicates(
    input_series: list[pd.Series],
    *args,
    keep: Literal["first", "last", False] = "first",
    **kwargs,
) -> list[tuple[pd.Series, list[ProcessingStep]]]:
    """
    A processing function to remove duplicate sample points from time series data.

    The function checks for duplicated index entries and retains only the desired
    occurrence of each duplicate:
        first -> keeps the first encountered row with that index
        last -> keeps the last encountered row with that index
        False -> removes all rows whose index is duplicated

    Args:
        input_series (list[pd.Series]): List of input time series to be processed.
        keep (Literal["first", "last", False]): Which occurrence of a duplicated
            index to retain. Defaults to "first".

    Returns:
        list[tuple[pd.Series, list[ProcessingStep]]]: Time series with duplicates
            removed, including metadata about the processing steps.

    Raises:
        IndexError: If a series does not have a pd.DatetimeIndex.
    """

    func_info = FunctionInfo(
        name="remove_duplicates",
        version="0.1",
        author="Loes Verhaeghe",
        reference="Loes Verhaeghe with the help of chat gpt",
    )

    processing_step = ProcessingStep(
        # Deduplication drops rows rather than changing the sampling rate, so it
        # is categorized as FILTERING (was mislabelled RESAMPLING).
        type=ProcessingType.FILTERING,
        parameters=Parameters(keep=keep),
        function_info=func_info,
        description="A processing function to remove duplicate sample points from time series",
        run_datetime=datetime.datetime.now(),
        requires_calibration=False,
        input_series_names=[str(col.name) for col in input_series],
        suffix="DEDUPLICATED",
    )

    outputs = []

    for col in input_series:
        col = col.copy()
        col_name = col.name
        # Series are named "<signal>_<suffix>"; the signal part may itself contain
        # underscores, so strip only the final "_<suffix>" segment (same fix as
        # applied in resample.py).
        signal = "_".join(str(col_name).split("_")[:-1])

        # Ensure the series has a proper datetime index
        if not isinstance(col.index, pd.DatetimeIndex):
            raise IndexError(
                f"Series {col.name} has index type {type(col.index)}. Please provide pd.DatetimeIndex."
            )

        # Drop rows whose index is duplicated, keeping the occurrence chosen by `keep`.
        filtered_col = col.loc[~col.index.duplicated(keep=keep)]

        # Update the series name with the processing step suffix
        filtered_col.name = f"{signal}_{processing_step.suffix}"

        # Append the filtered series along with the processing step metadata
        outputs.append((filtered_col, [processing_step]))

    return outputs
2 changes: 1 addition & 1 deletion src/meteaudata/processing_steps/univariate/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def resample(
for col in input_series:
col = col.copy()
col_name = col.name
signal, _ = str(col_name).split("_")
signal = "_".join(str(col_name).split("_")[:-1])
if not isinstance(col.index, (pd.DatetimeIndex, pd.TimedeltaIndex)):
raise IndexError(
f"Series {col.name} has index type {type(col.index)}. Please provide either pd.DatetimeIndex or pd.TimedeltaIndex"
Expand Down
76 changes: 76 additions & 0 deletions src/meteaudata/processing_steps/univariate/select_time_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import datetime

import pandas as pd

from meteaudata.types import (
FunctionInfo,
Parameters,
ProcessingStep,
ProcessingType,
)


def select_time_range(
    input_series: list[pd.Series], start_time: str, end_time: str, *args, **kwargs
) -> list[tuple[pd.Series, list[ProcessingStep]]]:
    """
    A processing function to filter time series data within a specified time range.

    The function accepts a start and end time, and keeps only the samples whose
    timestamps fall inside the (inclusive) range.

    Args:
        input_series (list[pd.Series]): List of input time series to be processed.
        start_time (str): Start of the time range (e.g., "2023-10-01 00:00:00").
        end_time (str): End of the time range (e.g., "2023-10-20 00:00:00").

    Returns:
        list[tuple[pd.Series, list[ProcessingStep]]]: Filtered time series with metadata about the processing steps.

    Raises:
        IndexError: If a series does not have a pd.DatetimeIndex.
    """

    func_info = FunctionInfo(
        name="select_time_range",
        version="0.1",
        author="Loes Verhaeghe",
        reference="Loes Verhaeghe with the help of chat gpt",
    )

    # Record the bounds as the caller provided them (strings), before parsing.
    parameters = Parameters(start_time=start_time, end_time=end_time)

    processing_step = ProcessingStep(
        type=ProcessingType.FILTERING,
        parameters=parameters,
        function_info=func_info,
        description="A processing function to select data between a specific time range",
        run_datetime=datetime.datetime.now(),
        requires_calibration=False,
        input_series_names=[str(col.name) for col in input_series],
        suffix="TIME-SLICE",
    )

    outputs = []

    # Parse the bounds once, outside the loop. Use new names instead of
    # shadowing the string parameters already stored in `parameters`.
    start_ts = pd.to_datetime(start_time)
    end_ts = pd.to_datetime(end_time)

    for col in input_series:
        col = col.copy()
        col_name = col.name
        # Series are named "<signal>_<suffix>"; the signal part may itself contain
        # underscores, so strip only the final "_<suffix>" segment (same fix as
        # applied in resample.py).
        signal = "_".join(str(col_name).split("_")[:-1])

        # Ensure the series has a proper datetime index
        if not isinstance(col.index, pd.DatetimeIndex):
            raise IndexError(
                f"Series {col.name} has index type {type(col.index)}. Please provide pd.DatetimeIndex."
            )

        # Keep samples inside the inclusive [start_ts, end_ts] window.
        filtered_col = col[(col.index >= start_ts) & (col.index <= end_ts)]

        # Update the series name with the processing step suffix
        filtered_col.name = f"{signal}_{processing_step.suffix}"

        # Append the filtered series along with the processing step metadata
        outputs.append((filtered_col, [processing_step]))

    return outputs
23 changes: 23 additions & 0 deletions src/meteaudata/timeseries_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from meteaudata.types import (
TimeSeries,
)


def check_missing_values(input_ts: "TimeSeries") -> int:
    """
    Count the missing values in a time series.

    The function checks for any missing (NaN) values in the underlying data of
    the input time series.

    Args:
        input_ts (TimeSeries): Input time series whose data is inspected.

    Returns:
        int: The number of NaN values in the time series data.
    """

    data = input_ts.series

    # `.sum()` over the boolean null-mask yields a numpy integer; cast it so the
    # declared return type (a plain int) is honoured.
    return int(data.isnull().sum())
44 changes: 44 additions & 0 deletions src/meteaudata/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path
from typing import Any, Optional, Protocol, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
Expand Down Expand Up @@ -545,6 +546,8 @@ def plot(
)
return fig

# TODO: implement plot_mpl for the time_series object.

def remove_duplicated_steps(self):
steps = self.processing_steps
new_steps = []
Expand Down Expand Up @@ -1012,6 +1015,45 @@ def plot(
)
return fig

def plot_mpl(
    self,
    ts_names: list[str],
    title: Optional[str] = None,
    y_axis: Optional[str] = None,
    x_axis: Optional[str] = None,
) -> "plt.Figure":
    """
    Render the selected time series in a matplotlib figure.

    Args:
        ts_names: Names of the time series (keys of self.time_series) to plot.
        title: Figure title. Defaults to a title built from the signal name.
        y_axis: Y-axis label. Defaults to "<name> (<units>)".
        x_axis: X-axis label. Defaults to "Time".

    Returns:
        The matplotlib Figure containing the plotted series. (The original
        annotation said None, but the method has always returned the figure.)
    """
    # Set default titles if not provided
    if not title:
        title = f"Time series plot of {self.name}"
    if not y_axis:
        y_axis = f"{self.name} ({self.units})"
    if not x_axis:
        x_axis = "Time"

    # Create the figure; subsequent pyplot calls draw onto it as the
    # current figure.
    fig = plt.figure(figsize=(12, 6))

    # Plot each requested time series, labelled by its name for the legend.
    for ts_name in ts_names:
        ts = self.time_series[ts_name].series
        # NOTE(review): assumes each entry's `.series` is a pandas Series
        # indexed by time — confirm against TimeSeries.
        plt.plot(ts, label=ts_name)

    # Add title and axis labels
    plt.title(title)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)

    # Show legend
    plt.legend()

    # Tidy the layout and hand the figure back to the caller.
    plt.tight_layout()
    return fig

def build_dependency_graph(self, ts_name: str) -> list[dict[str, Any]]:
"""
Build a data structure that represents all the processing steps and their dependencies for a given time series.
Expand Down Expand Up @@ -1564,6 +1606,8 @@ def plot(
)
return fig

# TODO: implement plot_mpl for the dataset object.

def __eq__(self, other):
if not isinstance(other, Dataset):
return False
Expand Down
14 changes: 14 additions & 0 deletions tests/test_metEAUdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
from meteaudata.processing_steps.univariate import (
interpolate,
prediction,
remove_duplicates,
replace,
resample,
select_time_range,
subset,
)
from meteaudata.types import DataProvenance, Dataset, Signal, TimeSeries
Expand Down Expand Up @@ -91,9 +93,21 @@ def sample_dataset():
reason="sensor calibration procedure",
replace_with=np.nan,
)
# pd.date_range(start="2020-01-01", freq="6min", periods=100),)
signal = signal.process(
[f"{signal_name}_RESAMPLED#1"], interpolate.linear_interpolation
)
signal = signal.process(
[f"{signal_name}_LIN-INT#1"],
select_time_range.select_time_range,
"2020-01-01 00:00:00",
"2020-01-01 02:00:00",
)
signal = signal.process(
[f"{signal_name}_TIME-SLICE#1"],
remove_duplicates.remove_duplicates,
keep="last",
)
return dataset


Expand Down
11 changes: 0 additions & 11 deletions tests/test_snippets.py

This file was deleted.