From 518522e6850ebf91254334ba5bad48266ea51e80 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Wed, 20 Apr 2022 16:06:48 +0200
Subject: [PATCH] Add method to find coverage gaps

---
 typhon/files/fileset.py            | 39 ++++++++++++++++++++++++++++
 typhon/tests/files/test_fileset.py | 46 ++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/typhon/files/fileset.py b/typhon/files/fileset.py
index be1f199d..fa80d6dc 100644
--- a/typhon/files/fileset.py
+++ b/typhon/files/fileset.py
@@ -2998,6 +2998,45 @@ def write(self, data, file_info, in_background=False, **write_args):
         else:
             self.handler.write(data, file_info, **write_args)
 
+    def find_coverage_gaps(self, start_time, end_time):
+        """Find coverage gaps between timestamps.
+
+        Between ``start_time`` and ``end_time``, yield the time intervals for
+        which the fileset does *not* have data.
+
+        Files are handled in the order in which :meth:`find` yields them
+        (expected to be sorted by starting time).  A file that ends before the
+        end of an earlier file never moves the covered range backwards, so
+        overlapping or nested files do not produce spurious gaps.
+
+        Args:
+            start_time: Start time in any form understood by
+                :class:`~pandas.Timestamp`.
+            end_time: Ending time in the same form.
+
+        Yields:
+            Zero or more instances of :class:`~pandas.Interval`
+            corresponding to each time segment in the overall interval that is
+            *not* covered by data files.
+        """
+
+        # Normalise the limits once, so that the comparisons below also
+        # work when they are passed as strings.
+        start = pd.Timestamp(start_time)
+        end = pd.Timestamp(end_time)
+
+        covered_until = start
+        for fi in self.find(start_time, end_time, no_files_error=False):
+            file_start = pd.Timestamp(fi.times[0])
+            file_end = pd.Timestamp(fi.times[1])
+            if file_start > covered_until:
+                yield pd.Interval(covered_until, file_start)
+            # A file nested in an earlier one must not move the end of the
+            # covered range backwards, hence the maximum.
+            covered_until = max(covered_until, file_end)
+        if covered_until < end:
+            yield pd.Interval(covered_until, end)
+
 
 class FileSetManager(dict):
     def __init__(self, *args, **kwargs):
diff --git a/typhon/tests/files/test_fileset.py b/typhon/tests/files/test_fileset.py
index 81e432b3..7568e121 100644
--- a/typhon/tests/files/test_fileset.py
+++ b/typhon/tests/files/test_fileset.py
@@ -8,6 +8,8 @@
 import shutil
 import logging
 
+import pandas as pd
+
 from typhon.files import FileHandler, FileInfo, FileSet, FileSetManager
 from typhon.files.utils import get_testfiles_directory
 
@@ -823,3 +825,47 @@ def test_compare_fileinfo(self):
         assert f2 != f4
         assert f3 != f4
         assert f1 != "fake/path"
+
+    def test_coverage_gaps(self, tmp_path):
+        """Test that coverage gaps are correctly found."""
+
+        (tmp_path / "abisko").mkdir(parents=True)
+        fs = FileSet(
+            tmp_path / "abisko" /
+            "y{year}-m{month}-d{day}-h{hour}-m{minute}-he{end_hour}-me{end_minute}")
+
+        pI = pd.Interval
+        pT = pd.Timestamp
+        gaps = list(fs.find_coverage_gaps(
+            datetime.datetime(1900, 1, 1, 0, 0),
+            datetime.datetime(1900, 1, 1, 0, 10)))
+        assert gaps == [
+            pI(pT("1900-01-01T00:00:00"), pT("1900-01-01T00:10:00"))]
+        (tmp_path / "abisko" / "y1900-m01-d01-h00-m02-he00-me04").touch()
+        (tmp_path / "abisko" / "y1900-m01-d01-h00-m06-he00-me07").touch()
+        gaps = list(fs.find_coverage_gaps(
+            datetime.datetime(1900, 1, 1, 0, 0),
+            datetime.datetime(1900, 1, 1, 0, 15)))
+        assert gaps == [
+            pI(pT("1900-01-01T00:00:00"), pT("1900-01-01T00:02:00")),
+            pI(pT("1900-01-01T00:04:00"), pT("1900-01-01T00:06:00")),
+            pI(pT("1900-01-01T00:07:00"), pT("1900-01-01T00:15:00"))]
+        (tmp_path / "abisko" / "y1900-m01-d01-h00-m07-he00-me12").touch()
+        gaps = list(fs.find_coverage_gaps(
+            datetime.datetime(1900, 1, 1, 0, 7),
+            datetime.datetime(1900, 1, 1, 0, 12)))
+        assert gaps == []
+        # A file nested within another one must not reopen a gap.
+        (tmp_path / "abisko" / "y1900-m01-d01-h00-m08-he00-me09").touch()
+        gaps = list(fs.find_coverage_gaps(
+            datetime.datetime(1900, 1, 1, 0, 0),
+            datetime.datetime(1900, 1, 1, 0, 15)))
+        assert gaps == [
+            pI(pT("1900-01-01T00:00:00"), pT("1900-01-01T00:02:00")),
+            pI(pT("1900-01-01T00:04:00"), pT("1900-01-01T00:06:00")),
+            pI(pT("1900-01-01T00:12:00"), pT("1900-01-01T00:15:00"))]
+        # Limits may be given in any form understood by pd.Timestamp.
+        gaps = list(fs.find_coverage_gaps(
+            "1900-01-01T00:00:00", "1900-01-01T00:02:00"))
+        assert gaps == [
+            pI(pT("1900-01-01T00:00:00"), pT("1900-01-01T00:02:00"))]