Skip to content

Commit 2f4e72b

Browse files
authored
docs(cache): add docstrings to cache (#7)
1 parent d8b80d6 commit 2f4e72b

8 files changed

Lines changed: 263 additions & 42 deletions

File tree

docs/api/cache_io.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
::: tools.cache.CacheIO

docs/api/video_slicer.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
::: tools.video_slicer.VideoSlicer

mkdocs.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ nav:
3131
- API Reference:
3232
- Main: api/data_forge.md
3333
- Image Comparer: api/img_comparer.md
34+
- Video slicer: api/video_slicer.md
3435
- Hasher:
3536
- Base Hasher: api/base_hasher.md
3637
- DHash: api/dhash.md
@@ -48,6 +49,7 @@ nav:
4849
- TXT writer: api/yolo_writer.md
4950
- Mixins:
5051
- FileRemoverMixin: api/file_remover.md
52+
- CacheIO: api/cache_io.md
5153
- CLI:
5254
- CLI launching: cli/data_forge.md
5355
- Default Settings: cli/default_values.md

requirements.txt

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,51 @@
1-
mkdocs>=1.6.0
2-
mkdocs-material>=9.5.0
3-
mkdocstrings[python]>=0.26.0
4-
mkdocs-autorefs>=1.2.0
5-
pydantic>=2.10.0
6-
pydantic-settings>=2.8.0
7-
python-dotenv>=1.0.0
8-
PyYAML>=6.0.0
9-
numpy>=1.24.0,<2.0.0
10-
opencv-python>=4.8.0
11-
requests>=2.31.0
12-
pytest>=8.0.0
13-
python-dateutil>=2.8.0
14-
watchdog>=4.0.0
15-
xmltodict
16-
pillow
1+
annotated-types==0.7.0
2+
babel==2.18.0
3+
backrefs==6.1
4+
certifi==2026.1.4
5+
charset-normalizer==3.4.4
6+
click==8.3.1
7+
colorama==0.4.6
8+
exceptiongroup==1.3.1
9+
ghp-import==2.1.0
10+
griffe==1.15.0
11+
idna==3.11
12+
iniconfig==2.3.0
13+
Jinja2==3.1.6
14+
Markdown==3.10.1
15+
MarkupSafe==3.0.3
16+
mergedeep==1.3.4
17+
mkdocs==1.6.1
18+
mkdocs-autorefs==1.4.3
19+
mkdocs-get-deps==0.2.0
20+
mkdocs-material==9.7.1
21+
mkdocs-material-extensions==1.3.1
22+
mkdocstrings==1.0.2
23+
mkdocstrings-python==2.0.1
24+
numpy==1.26.4
25+
opencv-python==4.11.0.86
26+
packaging==26.0
27+
paginate==0.5.7
28+
pandas==3.0.0
29+
pathspec==1.0.4
30+
pillow==12.1.0
31+
platformdirs==4.5.1
32+
pluggy==1.6.0
33+
pyarrow==23.0.0
34+
pydantic==2.12.5
35+
pydantic-settings==2.12.0
36+
pydantic_core==2.41.5
37+
Pygments==2.19.2
38+
pymdown-extensions==10.20.1
39+
pytest==9.0.2
40+
python-dateutil==2.9.0.post0
41+
python-dotenv==1.2.1
42+
PyYAML==6.0.3
43+
pyyaml_env_tag==1.1
44+
requests==2.32.5
45+
six==1.17.0
46+
tomli==2.4.0
47+
typing-inspection==0.4.2
48+
typing_extensions==4.15.0
49+
urllib3==2.6.3
50+
watchdog==6.0.0
51+
xmltodict==1.0.2

tests/test_cache.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import pytest
2+
import numpy as np
3+
import pandas as pd
4+
from pathlib import Path
5+
from unittest.mock import MagicMock, patch
6+
7+
from tools.cache import CacheIO
8+
from const_utils.default_values import AppSettings
9+
10+
11+
@pytest.fixture
def mock_settings():
    """Build AppSettings pointed at throwaway log/cache locations."""
    return AppSettings(
        log_path=Path("./test_log"),
        cache_file_path=Path("./test_cache"),
    )
19+
20+
21+
@pytest.fixture
def cache_io_instance(mock_settings):
    """Construct a CacheIO wired to the mock settings fixture."""
    instance = CacheIO(settings=mock_settings)
    return instance
25+
26+
27+
@pytest.fixture
def temp_cache_file(tmp_path):
    """Return a parquet cache file path inside pytest's per-test tmp dir."""
    cache_path = tmp_path / "test_cache.parquet"
    return cache_path
31+
32+
33+
@pytest.fixture
def sample_hash_map():
    """Two fake file paths mapped to small boolean hash arrays."""
    hashes = {}
    hashes[Path("/path/to/file1.jpg")] = np.array([True, False, True], dtype=bool)
    hashes[Path("/path/to/file2.png")] = np.array([False, True, False], dtype=bool)
    return hashes
40+
41+
42+
def test_cache_io_init(cache_io_instance):
    """CacheIO construction exposes its settings and a configured logger."""
    instance = cache_io_instance
    assert isinstance(instance, CacheIO)
    assert instance.settings is not None
    assert instance.logger is not None
47+
48+
49+
def test_save_empty_hash_map(cache_io_instance, temp_cache_file):
    """Saving an empty hash map must be a no-op: no file is written."""
    cache_io_instance.save({}, temp_cache_file)
    assert temp_cache_file.exists() is False
54+
55+
56+
def test_save_valid_hash_map(cache_io_instance, temp_cache_file, sample_hash_map):
    """A non-empty hash map is persisted to the target parquet file."""
    cache_io_instance.save(sample_hash_map, temp_cache_file)
    assert temp_cache_file.exists() is True
60+
61+
62+
def test_load_non_existent_file(cache_io_instance, temp_cache_file):
    """Loading a missing cache file yields an empty dictionary."""
    result = cache_io_instance.load(temp_cache_file)
    assert result == {}
66+
67+
68+
def test_load_damaged_file(cache_io_instance, temp_cache_file):
    """A read failure on an existing cache file falls back to an empty dict."""
    temp_cache_file.write_text("corrupted data")
    # NOTE(review): real parquet corruption typically surfaces as pyarrow
    # errors (OSError/ArrowInvalid), not EOFError — confirm CacheIO.load's
    # except clause covers those before trusting this mocked scenario.
    with patch('pandas.read_parquet', side_effect=EOFError):
        result = cache_io_instance.load(temp_cache_file)
    assert result == {}
74+
75+
76+
def test_load_valid_file(cache_io_instance, temp_cache_file, sample_hash_map):
    """Round-trip: a hash map saved to parquet loads back unchanged."""
    cache_io_instance.save(sample_hash_map, temp_cache_file)
    restored = cache_io_instance.load(temp_cache_file)

    assert len(restored) == len(sample_hash_map)
    for path, expected_hash in sample_hash_map.items():
        assert path in restored
        assert np.array_equal(restored[path], expected_hash)
85+
86+
87+
def test_generate_cache_filename_no_custom_name():
    """Without a custom name the filename is derived from the source path."""
    filename = CacheIO.generate_cache_filename(
        Path("/home/user/images"), "dhash", 16, None
    )
    assert filename.startswith("cache_")
    assert "_dimages" in filename
    assert "dhash_s16.parquet" in filename
96+
97+
98+
def test_generate_cache_filename_with_custom_name():
    """A custom name is used verbatim as the filename stem."""
    filename = CacheIO.generate_cache_filename(
        Path("/home/user/videos"), "phash", 32, Path("my_video_cache")
    )
    assert filename == "my_video_cache_phash_s32.parquet"
106+
107+
108+
def test_generate_cache_filename_with_custom_name_with_suffix():
    """A custom name already carrying .parquet is not doubled up."""
    filename = CacheIO.generate_cache_filename(
        Path("/home/user/documents"), "ahash", 8, Path("doc_cache.parquet")
    )
    assert filename == "doc_cache_ahash_s8.parquet"

tools/cache.py

Lines changed: 66 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,23 @@
11
import hashlib
2-
import pickle
32
from pathlib import Path
43
from typing import Dict, Optional
54
import numpy as np
5+
import pandas as pd
66

77
from const_utils.default_values import AppSettings
88
from logger.logger import LoggerConfigurator
99
from logger.logger_protocol import LoggerProtocol
1010

1111

1212
class CacheIO:
13-
SUFFIX = ".pkl"
13+
SUFFIX = ".parquet"
1414
def __init__(self, settings: AppSettings):
15-
"""
16-
saving and reading cache files for faster loading data
15+
"""Initializes the CacheIO class.
16+
17+
This class helps to save and load cache files. It makes data loading faster.
18+
19+
Args:
20+
settings (AppSettings): The application settings for logging and paths.
1721
"""
1822
self.settings = settings
1923
self.logger = LoggerConfigurator.setup(
@@ -23,33 +27,78 @@ def __init__(self, settings: AppSettings):
2327
)
2428

2529

26-
def load(self: LoggerProtocol, cache_file: Path) -> Dict[Path, np.ndarray]:
    """Load cached hashes from a parquet file.

    Reads the parquet file and converts it back into a dictionary mapping
    file paths to boolean hash arrays.

    Args:
        cache_file (Path): The path to the cache file.

    Returns:
        Dict[Path, np.ndarray]: Mapping of file path to boolean hash array.
            An empty dictionary if the file does not exist or is damaged.
    """
    if not cache_file.exists():
        self.logger.warning(f"Cache file {cache_file} does not exist")
        return {}

    try:
        self.logger.info(f"Loading cache file {cache_file}")
        df = pd.read_parquet(cache_file)
        # Iterating the columns in lockstep avoids the per-row overhead
        # of DataFrame.iterrows().
        return {
            Path(path): np.array(hash_values, dtype=bool)
            for path, hash_values in zip(df['path'], df['hash'])
        }
    # EOFError alone misses real-world corruption: pandas/pyarrow raise
    # OSError or ArrowInvalid (a ValueError subclass) on a damaged file.
    except (EOFError, OSError, ValueError):
        self.logger.warning(f"Cache file {cache_file} is damaged. Deleting cache file")
        # The message promised deletion but the old code never did it;
        # remove the damaged file so the next run starts clean.
        cache_file.unlink(missing_ok=True)
        return {}
3957

58+
4059
def save(self: LoggerProtocol, hash_map: Dict[Path, np.ndarray], cache_file: Path) -> None:
    """Persist a hash map to a parquet cache file.

    Converts the dictionary into a DataFrame and writes it with pyarrow.
    An empty hash map is skipped entirely — no file is created.

    Args:
        hash_map (Dict[Path, np.ndarray]): The dictionary of hashes to save.
        cache_file (Path): The path where the cache file will be saved.
    """
    if not hash_map:
        self.logger.warning("Hash map is empty, skipping saving cache file")
        return

    cache_file.parent.mkdir(parents=True, exist_ok=True)
    self.logger.info(f"Saving {len(hash_map)} hashes to {cache_file.name}")

    try:
        records = [
            {'path': str(p), 'hash': h.tolist()}
            for p, h in hash_map.items()
        ]
        df = pd.DataFrame(records)
        df.to_parquet(cache_file, engine="pyarrow", compression="snappy", index=False)
        # Plain string: the original used an f-string with no placeholders.
        self.logger.info("Cache saved successfully.")
    except Exception as e:
        # Broad catch is deliberate: cache persistence is best-effort and
        # must never crash the caller; the failure is logged instead.
        self.logger.error(f"Critical error saving cache: {e}")
4986

5087
@classmethod
5188
def generate_cache_filename(cls, source_path: Path, hash_type: str, core_size: int, cache_name: Optional[Path]) -> str:
52-
"""generate cache filename from source_path """
89+
"""Creates a unique name for the cache file.
90+
91+
The name is based on the folder path, hash type, and size.
92+
93+
Args:
94+
source_path (Path): The folder path being processed.
95+
hash_type (str): The type of hash used (e.g., 'dhash').
96+
core_size (int): The size of the hash.
97+
cache_name (Optional[Path]): A custom name for the cache file, if provided.
98+
99+
Returns:
100+
str: The generated filename for the cache.
101+
"""
53102
suffix = f"{hash_type}_s{core_size}{cls.SUFFIX}"
54103
if cache_name is None:
55104
abs_path = str(source_path.resolve())

tools/video_slicer.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,31 @@
22
import cv2
33

44
class VideoSlicer:
5-
"""slicing video file and saving it to target directory"""
5+
"""A class to cut a video into many images.
6+
7+
This class takes a video file and saves its frames as separate image files in a folder.
8+
"""
69
def __init__(self):
    """Initializes the VideoSlicer.

    Nothing has been sliced yet, so the completion flag starts as False.
    """
    # Name-mangled flag; exposed read-only through the `sliced` property.
    self.__sliced: bool = False
815

916

1017
def slice(self, source_file: Path, target_dir: Path, suffix: str = ".jpg", step: float = 1) -> tuple:
11-
"""
12-
slicing video file and saving it to target directory
13-
:param source_file: path to video file
14-
:param target_dir: target dir where sliced images from video file will be saved
15-
:param suffix: a suffix to add to image filename
16-
:param step: time step for saving image in seconds
17-
:return: count of images saved in target_dir
18+
"""Cuts the video into images and saves them to a folder.
19+
20+
Args:
21+
source_file (Path): The path to the video file you want to cut.
22+
target_dir (Path): The folder where you want to save the images.
23+
suffix (str): The file extension for the images (for example, '.jpg'). Defaults to '.jpg'.
24+
step (float): How many seconds to wait between saving images. Defaults to 1.
25+
26+
Returns:
27+
tuple: A tuple containing:
28+
- bool: True if the video was sliced successfully, False otherwise.
29+
- int: The total number of images saved.
1830
"""
1931
cap = cv2.VideoCapture(str(source_file))
2032

@@ -44,6 +56,12 @@ def slice(self, source_file: Path, target_dir: Path, suffix: str = ".jpg", step:
4456
self.__sliced = True
4557
return self.sliced, img_counter
4658

59+
4760
@property
def sliced(self) -> bool:
    """Whether a slice() run has completed.

    Returns:
        bool: True once slice() has been called and finished, False otherwise.
    """
    return self.__sliced

tst_commands.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,6 @@
6060
MAPPING[Commands.dedup].append("16")
6161

6262

63-
sys.argv = MAPPING[Commands.convert_annotations]
63+
sys.argv = MAPPING[Commands.dedup]
6464
app = DataForge()
6565
app.execute()

0 commit comments

Comments
 (0)