From fcf85501fca0db629aea352a10345cf08493d1ee Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 2 Feb 2026 10:16:10 +0100 Subject: [PATCH 1/5] clarify docstring --- src/osekit/core_api/spectro_dataset.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/osekit/core_api/spectro_dataset.py b/src/osekit/core_api/spectro_dataset.py index 8e4c51e3..5217372c 100644 --- a/src/osekit/core_api/spectro_dataset.py +++ b/src/osekit/core_api/spectro_dataset.py @@ -376,7 +376,14 @@ def link_audio_dataset( first: int = 0, last: int | None = None, ) -> None: - """Link the ``SpectroData`` of the ``SpectroDataset`` to the ``AudioData`` of the ``AudioDataset``. + """Link the ``SpectroDataset`` to the ``AudioDataset``. + + The ``SpectroData`` of the ``SpectroDataset`` will be linked to + the ``AudioData`` of the ``AudioDataset``. + + For each ``SpectroData`` of the ``SpectroDataset``, there must be + an ``AudioData`` in the ``AudioDataset`` that has the same ``begin`` + and ``end``. 
Parameters ---------- From 6564f7eedf6b79e8de4d8d04fb6eb6335fc9a567 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 2 Feb 2026 15:09:15 +0100 Subject: [PATCH 2/5] refactor test_link_audio_dataset --- tests/test_spectro.py | 209 ++++++++++++++++-------------------------- 1 file changed, 78 insertions(+), 131 deletions(-) diff --git a/tests/test_spectro.py b/tests/test_spectro.py index fc23e14c..8779116e 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import gc from contextlib import nullcontext from pathlib import Path @@ -727,163 +728,109 @@ def test_link_audio_data( @pytest.mark.parametrize( ( - "audio_files", - "ads1_data_duration", - "ads2_data_duration", - "ads2_sample_rate", + "audio_data_params", + "spectro_data_params", "start_index", "stop_index", "expected_exception", ), [ pytest.param( - { - "duration": 1, - "sample_rate": 1_024, - "nb_files": 1, - "date_begin": pd.Timestamp("2024-01-01 12:00:00"), - }, - Timedelta(seconds=0.1), - Timedelta(seconds=0.1), - 1_024, + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], None, None, nullcontext(), id="default_indexes_is_full_dataset", ), - pytest.param( - { - "duration": 1, - "sample_rate": 1_024, - "nb_files": 1, - "date_begin": pd.Timestamp("2024-01-01 12:00:00"), - }, - Timedelta(seconds=0.1), - Timedelta(seconds=0.1), - 1_024, - 2, - 6, - nullcontext(), - id="link_a_part_of_the_data", - ), - pytest.param( - { - "duration": 1, - "sample_rate": 1_024, - "nb_files": 1, - "date_begin": pd.Timestamp("2024-01-01 12:00:00"), - }, - Timedelta(seconds=0.1), - Timedelta(seconds=0.1), - 2_048, - None, - None, - 
pytest.raises( - ValueError, - match="The sample rate of the audio data doesn't match.", - ), - id="different_sample_rate", - ), - pytest.param( - { - "duration": 1, - "sample_rate": 1_024, - "nb_files": 1, - "date_begin": pd.Timestamp("2024-01-01 12:00:00"), - }, - Timedelta(seconds=0.1), - Timedelta(seconds=0.5), - 1_024, - None, - None, - pytest.raises( - ValueError, - match="The audio dataset doesn't contain the same number of data as the" - " spectro dataset.", - ), - id="different_number_of_data", - ), - pytest.param( - { - "duration": 1, - "sample_rate": 1_024, - "nb_files": 1, - "date_begin": pd.Timestamp("2024-01-01 12:00:00"), - }, - Timedelta(seconds=0.1), - Timedelta(seconds=0.101), - 1_024, - None, - None, - pytest.raises(ValueError, match="The end of the audio data doesn't match."), - id="different_end_of_first_data", - ), ], - indirect=["audio_files"], ) def test_link_audio_dataset( - audio_files: pytest.fixture, - tmp_path: pytest.fixture, - ads1_data_duration: Timedelta, - ads2_data_duration: Timedelta, - ads2_sample_rate: float, + patch_audio_data: None, + audio_data_params: list[ + tuple[Timestamp, Timestamp, float] + ], # begin, end, sample_rate + spectro_data_params: list[ + tuple[Timestamp, Timestamp, float] + ], # begin, end, sample_rate start_index: int, stop_index: int, - expected_exception: type[Exception], + expected_exception: contextlib.AbstractContextManager, ) -> None: - ads1 = AudioDataset.from_folder( - tmp_path, - strptime_format=TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED, - data_duration=ads1_data_duration, + ads_origin = AudioDataset( + [ + AudioData( + begin=sd_params[0], + end=sd_params[1], + mocked_value=[1.0] + * round((sd_params[1] - sd_params[0]).total_seconds() * sd_params[2]), + ) + for sd_params in spectro_data_params + ], ) - ads2 = AudioDataset.from_folder( - tmp_path, - strptime_format=TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED, - data_duration=ads2_data_duration, + + ads_dest = AudioDataset( + [ + AudioData( + 
begin=ad_params[0], + end=ad_params[1], + mocked_value=[1.0] + * round((ad_params[1] - ad_params[0]).total_seconds() * ad_params[2]), + ) + for ad_params in audio_data_params + ], ) - ads2.sample_rate = ads2_sample_rate sds = SpectroDataset.from_audio_dataset( - ads1, - fft=ShortTimeFFT(hamming(128), 128, ads1.sample_rate), + audio_dataset=ads_origin, + fft=ShortTimeFFT(hamming(16), 16, ads_origin.sample_rate), ) - with expected_exception as e: - assert sds.link_audio_dataset(ads2, first=start_index, last=stop_index) == e + origin_ids = {id(ad) for ad in ads_origin.data} + dest_ids = {id(ad) for ad in ads_dest.data} - if type(expected_exception) is not nullcontext: - return + assert all(id(sd.audio_data) in origin_ids for sd in sds.data) + assert not any(id(sd.audio_data) in dest_ids for sd in sds.data) - start_index = 0 if start_index is None else start_index - stop_index = len(ads1.data) if stop_index is None else stop_index - - for idx, sd in enumerate(sds.data): - if idx in range(start_index, stop_index): - assert sd.audio_data is not ads1.data[idx] - assert sd.audio_data is ads2.data[idx] - else: - assert sd.audio_data is ads1.data[idx] - assert sd.audio_data is not ads2.data[idx] - - # linking should fail if the length of the audio datasets differ: - ads_err = AudioDataset( - [*ads2.data, ads2.data[0]], - ) # Adding one data to the destination ads - with pytest.raises( - ValueError, - match=r"The audio dataset doesn't contain the same number of data as the " - "spectro dataset.", - ): - sds.link_audio_dataset(ads_err) + with expected_exception: + sds.link_audio_dataset( + audio_dataset=ads_dest, + first=start_index, + last=stop_index, + ) - # linking should fail if any of the data can't be linked - ads_err = AudioDataset(ads1.data) - ads1.data[-1].sample_rate = ads2_sample_rate * 0.5 - with pytest.raises( - ValueError, - match=r"The sample rate of the audio data doesn't match.", - ): - sds.link_audio_dataset(ads_err) + first_linked = 0 if start_index is None 
else start_index + last_linked = len(sds.data) if stop_index is None else stop_index + + relinked_sd = sds.data[first_linked:last_linked] + not_relinked_sd = [sd for sd in sds.data if sd not in relinked_sd] + + assert all(id(sd.audio_data) in origin_ids for sd in not_relinked_sd) + assert not any(id(sd.audio_data) in dest_ids for sd in not_relinked_sd) + assert all(id(sd.audio_data) in dest_ids for sd in relinked_sd) + assert not any(id(sd.audio_data) in origin_ids for sd in relinked_sd) @pytest.mark.parametrize( From 065dee88d437659f6b2595158e7a34cbf4a357bb Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 2 Feb 2026 15:46:33 +0100 Subject: [PATCH 3/5] refactor SpectroDataset.link_audio_dataset() to account for larger AudioDatasets --- src/osekit/core_api/spectro_dataset.py | 23 ++---- tests/test_spectro.py | 107 +++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 23 deletions(-) diff --git a/src/osekit/core_api/spectro_dataset.py b/src/osekit/core_api/spectro_dataset.py index 5217372c..796da576 100644 --- a/src/osekit/core_api/spectro_dataset.py +++ b/src/osekit/core_api/spectro_dataset.py @@ -395,23 +395,16 @@ def link_audio_dataset( Index of the last ``SpectroData`` and ``AudioData`` to link. """ - if len(audio_dataset.data) != len(self.data): - msg = ( - "The audio dataset doesn't contain the same number of data" - " as the spectro dataset." 
- ) - raise ValueError(msg) - last = len(self.data) if last is None else last - for sd, ad in list( - zip( - sorted(self.data, key=lambda d: (d.begin, d.end)), - sorted(audio_dataset.data, key=lambda d: (d.begin, d.end)), - strict=False, - ), - )[first:last]: - sd.link_audio_data(ad) + ad_dict = {(ad.begin, ad.end): ad for ad in audio_dataset.data} + + for sd in self.data[first:last]: + key = (sd.begin, sd.end) + if key not in ad_dict: + msg = f"No AudioData found for SpectroData {sd}" + raise ValueError(msg) + sd.link_audio_data(ad_dict[key]) def update_json_audio_data(self, first: int, last: int) -> None: """Update the serialized ``json`` file with the spectro data from first to last. diff --git a/tests/test_spectro.py b/tests/test_spectro.py index 8779116e..d576b5b8 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -732,6 +732,7 @@ def test_link_audio_data( "spectro_data_params", "start_index", "stop_index", + "expected_relinked_data_idxs", "expected_exception", ), [ @@ -762,9 +763,101 @@ def test_link_audio_data( ], None, None, + [0, 1], nullcontext(), id="default_indexes_is_full_dataset", ), + pytest.param( + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:02"), + Timestamp("1994-02-27 00:00:03"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:02"), + Timestamp("1994-02-27 00:00:03"), + 100.0, + ), + ], + 1, + 2, + [1], + nullcontext(), + id="link_a_part_of_the_data", + ), + pytest.param( + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 150.0, + ), + ], + None, + 
None, + [], + pytest.raises( + ValueError, + match=r"The sample rate of the audio data doesn't match.", + ), + id="different_sample_rate", + ), + pytest.param( + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + None, + None, + [0], + nullcontext(), + id="fewer_spectro_data_should_be_ok", + ), ], ) def test_link_audio_dataset( @@ -777,6 +870,7 @@ def test_link_audio_dataset( ], # begin, end, sample_rate start_index: int, stop_index: int, + expected_relinked_data_idxs: list[int], expected_exception: contextlib.AbstractContextManager, ) -> None: ads_origin = AudioDataset( @@ -784,8 +878,8 @@ def test_link_audio_dataset( AudioData( begin=sd_params[0], end=sd_params[1], - mocked_value=[1.0] - * round((sd_params[1] - sd_params[0]).total_seconds() * sd_params[2]), + sample_rate=sd_params[2], + mocked_value=[1.0], ) for sd_params in spectro_data_params ], @@ -796,8 +890,8 @@ def test_link_audio_dataset( AudioData( begin=ad_params[0], end=ad_params[1], - mocked_value=[1.0] - * round((ad_params[1] - ad_params[0]).total_seconds() * ad_params[2]), + sample_rate=ad_params[2], + mocked_value=[1.0], ) for ad_params in audio_data_params ], @@ -821,10 +915,7 @@ def test_link_audio_dataset( last=stop_index, ) - first_linked = 0 if start_index is None else start_index - last_linked = len(sds.data) if stop_index is None else stop_index - - relinked_sd = sds.data[first_linked:last_linked] + relinked_sd = [sds.data[idx] for idx in expected_relinked_data_idxs] not_relinked_sd = [sd for sd in sds.data if sd not in relinked_sd] assert all(id(sd.audio_data) in origin_ids for sd in not_relinked_sd) From d41b754b6e96d59545ae620e388956e8a1c23faa Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 2 Feb 2026 15:53:00 +0100 Subject: [PATCH 4/5] add remaining 
SpectroDataset.link_audio_dataset() tests --- tests/test_spectro.py | 102 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/tests/test_spectro.py b/tests/test_spectro.py index d576b5b8..f226e666 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -858,6 +858,108 @@ def test_link_audio_data( nullcontext(), id="fewer_spectro_data_should_be_ok", ), + pytest.param( + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:02"), + Timestamp("1994-02-27 00:00:03"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + None, + None, + [], + pytest.raises( + ValueError, + match=rf"No AudioData found for SpectroData " + rf"{ + Timestamp('1994-02-27 00:00:01').strftime( + TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED + ) + }", + ), + id="not_found_spectrodata_should_raise", + ), + pytest.param( + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:03"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + None, + None, + [], + pytest.raises( + ValueError, + match=rf"No AudioData found for SpectroData " + rf"{ + Timestamp('1994-02-27 00:00:01').strftime( + TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED + ) + }", + ), + id="found_begin_but_not_end_should_raise", + ), + pytest.param( + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + None, + None, + [], + pytest.raises( + ValueError, + match=rf"No AudioData found for SpectroData " + rf"{ + Timestamp('1994-02-27 00:00:00').strftime( + 
TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED + ) + }", + ), + id="found_end_but_not_begin_should_raise", + ), ], ) def test_link_audio_dataset( From 5c05929be971368a547a5d2d70c51367d7dff3bd Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 2 Feb 2026 16:10:51 +0100 Subject: [PATCH 5/5] add test case for SpectroDataset.link_audio_dataset() --- tests/test_spectro.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_spectro.py b/tests/test_spectro.py index f226e666..6fe76ae0 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -960,6 +960,32 @@ def test_link_audio_data( ), id="found_end_but_not_begin_should_raise", ), + pytest.param( + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ], + [ + ( + Timestamp("1994-02-27 00:00:00"), + Timestamp("1994-02-27 00:00:01"), + 100.0, + ), + ( + Timestamp("1994-02-27 00:00:01"), + Timestamp("1994-02-27 00:00:02"), + 100.0, + ), + ], + None, + 1, # Excludes the sd that doesn't have an ad counterpart in ads + [0], + nullcontext(), + id="missing_audiodata_counterparts_of_excluded_spectrodata_shouldnt_raise", + ), ], ) def test_link_audio_dataset(