Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,8 @@
## 0.7.4

- Added start and end parameters to plotting functions.

## 0.8.0

- Added `remove_duplicates` and `select_time_range` univariate processing functions.
- Added a matplotlib-based `plot_mpl` plotting method for signals and a `check_missing_values` time series statistic.
- Updated the required Python version to >3.10 and added matplotlib as a dependency.
515 changes: 502 additions & 13 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
[tool.poetry]
name = "meteaudata"
version = "0.7.4"
version = "0.8.0"
description = "A lightweight package for tracking metadata about time series to create repeatable data pipelines."
authors = ["Jean-David Therrien <jeandavidt@gmail.com>"]
license = "CC-BY4"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.9"
python = ">3.10,<4.0"
pandas = ">=1.4"
ipython = "^8.2"
pyyaml = "^6.0.1"
pydantic = "^2.0"
plotly = "^5.21.0"
matplotlib = "3.10.1"

[tool.poetry.group.dev.dependencies]
mypy = "^1.9.0"
Expand Down
6 changes: 6 additions & 0 deletions src/meteaudata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@
from meteaudata.processing_steps.univariate.interpolate import ( # noqa: F401
linear_interpolation,
)
from meteaudata.processing_steps.univariate.remove_duplicates import (
remove_duplicates, # noqa: F401
)
from meteaudata.processing_steps.univariate.replace import replace_ranges # noqa: F401
from meteaudata.processing_steps.univariate.resample import resample # noqa: F401
from meteaudata.processing_steps.univariate.select_time_range import (
select_time_range, # noqa: F401
)
from meteaudata.processing_steps.univariate.subset import subset # noqa: F401
from meteaudata.types import ( # noqa: F401
DataProvenance,
Expand Down
1 change: 1 addition & 0 deletions src/meteaudata/processing_steps/multivariate/average.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional

import pandas as pd

from meteaudata.types import (
DataProvenance,
FunctionInfo,
Expand Down
76 changes: 76 additions & 0 deletions src/meteaudata/processing_steps/univariate/remove_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import datetime
from typing import Literal

import pandas as pd

from meteaudata.types import (
FunctionInfo,
Parameters,
ProcessingStep,
ProcessingType,
)


def remove_duplicates(
    input_series: list[pd.Series],
    *args,
    keep: Literal["first", "last", False] = "first",
    **kwargs,
) -> list[tuple[pd.Series, list[ProcessingStep]]]:
    """
    A processing function to remove duplicate sample points from time series data.

    The function checks for duplicated index entries and retains only the desired
    occurrence of each duplicate:
        first -> keeps the first encountered row with that index
        last -> keeps the last encountered row with that index
        False -> removes all rows whose index is duplicated

    Args:
        input_series (list[pd.Series]): List of input time series to be processed.
        keep (Literal["first", "last", False]): Which occurrence of a duplicated
            index to retain. Defaults to "first".

    Returns:
        list[tuple[pd.Series, list[ProcessingStep]]]: Time series with duplicates
            removed, including metadata about the processing steps.

    Raises:
        IndexError: If a series does not have a pd.DatetimeIndex.
    """

    func_info = FunctionInfo(
        name="remove_duplicates",
        version="0.1",
        author="Loes Verhaeghe",
        reference="Loes Verhaeghe with the help of chat gpt",
    )

    processing_step = ProcessingStep(
        # Deduplication drops rows rather than changing the sampling rate, so it
        # is categorized as FILTERING (was mislabelled RESAMPLING).
        type=ProcessingType.FILTERING,
        parameters=Parameters(keep=keep),
        function_info=func_info,
        description="A processing function to remove duplicate sample points from time series",
        run_datetime=datetime.datetime.now(),
        requires_calibration=False,
        input_series_names=[str(col.name) for col in input_series],
        suffix="DEDUPLICATED",
    )

    outputs = []

    for col in input_series:
        col = col.copy()
        col_name = col.name
        # Series are named "<signal>_<suffix>"; the signal part may itself contain
        # underscores, so strip only the final "_<suffix>" segment (same fix as
        # applied in resample.py).
        signal = "_".join(str(col_name).split("_")[:-1])

        # Ensure the series has a proper datetime index
        if not isinstance(col.index, pd.DatetimeIndex):
            raise IndexError(
                f"Series {col.name} has index type {type(col.index)}. Please provide pd.DatetimeIndex."
            )

        # Drop rows whose index is duplicated, keeping the occurrence chosen by `keep`.
        filtered_col = col.loc[~col.index.duplicated(keep=keep)]

        # Update the series name with the processing step suffix
        filtered_col.name = f"{signal}_{processing_step.suffix}"

        # Append the filtered series along with the processing step metadata
        outputs.append((filtered_col, [processing_step]))

    return outputs
2 changes: 1 addition & 1 deletion src/meteaudata/processing_steps/univariate/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def resample(
for col in input_series:
col = col.copy()
col_name = col.name
signal, _ = str(col_name).split("_")
signal = "_".join(str(col_name).split("_")[:-1])
if not isinstance(col.index, (pd.DatetimeIndex, pd.TimedeltaIndex)):
raise IndexError(
f"Series {col.name} has index type {type(col.index)}. Please provide either pd.DatetimeIndex or pd.TimedeltaIndex"
Expand Down
76 changes: 76 additions & 0 deletions src/meteaudata/processing_steps/univariate/select_time_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import datetime

import pandas as pd

from meteaudata.types import (
FunctionInfo,
Parameters,
ProcessingStep,
ProcessingType,
)


def select_time_range(
    input_series: list[pd.Series], start_time: str, end_time: str, *args, **kwargs
) -> list[tuple[pd.Series, list[ProcessingStep]]]:
    """
    A processing function to filter time series data within a specified time range.

    The function accepts a start and end time, and keeps only the samples whose
    timestamps fall inside the (inclusive) range.

    Args:
        input_series (list[pd.Series]): List of input time series to be processed.
        start_time (str): Start of the time range (e.g., "2023-10-01 00:00:00").
        end_time (str): End of the time range (e.g., "2023-10-20 00:00:00").

    Returns:
        list[tuple[pd.Series, list[ProcessingStep]]]: Filtered time series with metadata about the processing steps.

    Raises:
        IndexError: If a series does not have a pd.DatetimeIndex.
    """

    func_info = FunctionInfo(
        name="select_time_range",
        version="0.1",
        author="Loes Verhaeghe",
        reference="Loes Verhaeghe with the help of chat gpt",
    )

    # Record the bounds as the caller provided them (strings), before parsing.
    parameters = Parameters(start_time=start_time, end_time=end_time)

    processing_step = ProcessingStep(
        type=ProcessingType.FILTERING,
        parameters=parameters,
        function_info=func_info,
        description="A processing function to select data between a specific time range",
        run_datetime=datetime.datetime.now(),
        requires_calibration=False,
        input_series_names=[str(col.name) for col in input_series],
        suffix="TIME-SLICE",
    )

    outputs = []

    # Parse the bounds once, outside the loop. Use new names instead of
    # shadowing the string parameters already stored in `parameters`.
    start_ts = pd.to_datetime(start_time)
    end_ts = pd.to_datetime(end_time)

    for col in input_series:
        col = col.copy()
        col_name = col.name
        # Series are named "<signal>_<suffix>"; the signal part may itself contain
        # underscores, so strip only the final "_<suffix>" segment (same fix as
        # applied in resample.py).
        signal = "_".join(str(col_name).split("_")[:-1])

        # Ensure the series has a proper datetime index
        if not isinstance(col.index, pd.DatetimeIndex):
            raise IndexError(
                f"Series {col.name} has index type {type(col.index)}. Please provide pd.DatetimeIndex."
            )

        # Keep samples inside the inclusive [start_ts, end_ts] window.
        filtered_col = col[(col.index >= start_ts) & (col.index <= end_ts)]

        # Update the series name with the processing step suffix
        filtered_col.name = f"{signal}_{processing_step.suffix}"

        # Append the filtered series along with the processing step metadata
        outputs.append((filtered_col, [processing_step]))

    return outputs
23 changes: 23 additions & 0 deletions src/meteaudata/timeseries_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from meteaudata.types import (
TimeSeries,
)


def check_missing_values(input_ts: "TimeSeries") -> int:
    """
    Count the missing values in a time series.

    The function checks for any missing (NaN) values in the underlying data of
    the input time series.

    Args:
        input_ts (TimeSeries): Input time series whose data is inspected.

    Returns:
        int: The number of NaN values in the time series data.
    """

    data = input_ts.series

    # `.sum()` over the boolean null-mask yields a numpy integer; cast it so the
    # declared return type (a plain int) is honoured.
    return int(data.isnull().sum())
44 changes: 44 additions & 0 deletions src/meteaudata/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path
from typing import Any, Optional, Protocol, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
Expand Down Expand Up @@ -545,6 +546,8 @@ def plot(
)
return fig

# TODO: implement plot_mpl for the time_series object.

def remove_duplicated_steps(self):
steps = self.processing_steps
new_steps = []
Expand Down Expand Up @@ -1012,6 +1015,45 @@ def plot(
)
return fig

def plot_mpl(
    self,
    ts_names: list[str],
    title: Optional[str] = None,
    y_axis: Optional[str] = None,
    x_axis: Optional[str] = None,
) -> "plt.Figure":
    """
    Render the selected time series in a matplotlib figure.

    Args:
        ts_names: Names of the time series (keys of self.time_series) to plot.
        title: Figure title. Defaults to a title built from the signal name.
        y_axis: Y-axis label. Defaults to "<name> (<units>)".
        x_axis: X-axis label. Defaults to "Time".

    Returns:
        The matplotlib Figure containing the plotted series. (The original
        annotation said None, but the method has always returned the figure.)
    """
    # Set default titles if not provided
    if not title:
        title = f"Time series plot of {self.name}"
    if not y_axis:
        y_axis = f"{self.name} ({self.units})"
    if not x_axis:
        x_axis = "Time"

    # Create the figure; subsequent pyplot calls draw onto it as the
    # current figure.
    fig = plt.figure(figsize=(12, 6))

    # Plot each requested time series, labelled by its name for the legend.
    for ts_name in ts_names:
        ts = self.time_series[ts_name].series
        # NOTE(review): assumes each entry's `.series` is a pandas Series
        # indexed by time — confirm against TimeSeries.
        plt.plot(ts, label=ts_name)

    # Add title and axis labels
    plt.title(title)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)

    # Show legend
    plt.legend()

    # Tidy the layout and hand the figure back to the caller.
    plt.tight_layout()
    return fig

def build_dependency_graph(self, ts_name: str) -> list[dict[str, Any]]:
"""
Build a data structure that represents all the processing steps and their dependencies for a given time series.
Expand Down Expand Up @@ -1564,6 +1606,8 @@ def plot(
)
return fig

# TODO: implement plot_mpl for the dataset object.

def __eq__(self, other):
if not isinstance(other, Dataset):
return False
Expand Down
14 changes: 14 additions & 0 deletions tests/test_metEAUdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
from meteaudata.processing_steps.univariate import (
interpolate,
prediction,
remove_duplicates,
replace,
resample,
select_time_range,
subset,
)
from meteaudata.types import DataProvenance, Dataset, Signal, TimeSeries
Expand Down Expand Up @@ -91,9 +93,21 @@ def sample_dataset():
reason="sensor calibration procedure",
replace_with=np.nan,
)
# pd.date_range(start="2020-01-01", freq="6min", periods=100),)
signal = signal.process(
[f"{signal_name}_RESAMPLED#1"], interpolate.linear_interpolation
)
signal = signal.process(
[f"{signal_name}_LIN-INT#1"],
select_time_range.select_time_range,
"2020-01-01 00:00:00",
"2020-01-01 02:00:00",
)
signal = signal.process(
[f"{signal_name}_TIME-SLICE#1"],
remove_duplicates.remove_duplicates,
keep="last",
)
return dataset


Expand Down
11 changes: 0 additions & 11 deletions tests/test_snippets.py

This file was deleted.