From 59ff8fb2838fcae0fc066b712a1d790cbd38adf5 Mon Sep 17 00:00:00 2001 From: demengc <43016355+demengc@users.noreply.github.com> Date: Tue, 2 Dec 2025 03:08:39 -0500 Subject: [PATCH 1/2] Optimize records access, vectorize DF flattening, implement hash-based deduplication, use binary search for scene filtering --- src/bwell_logkit/extractor.py | 39 +++++++++++------- src/bwell_logkit/logs.py | 76 +++++++++++++++++++++-------------- src/bwell_logkit/scene.py | 21 ++++++---- tests/test_logs.py | 21 ++++++---- 4 files changed, 95 insertions(+), 62 deletions(-) diff --git a/src/bwell_logkit/extractor.py b/src/bwell_logkit/extractor.py index 5b2179f..4de976b 100644 --- a/src/bwell_logkit/extractor.py +++ b/src/bwell_logkit/extractor.py @@ -16,21 +16,30 @@ def _flatten_dataframe(df: "pd.DataFrame") -> "pd.DataFrame": """Flatten nested dictionaries in DataFrame columns.""" - flattened_data = [] - - for _, row in df.iterrows(): - flattened_row = {} - for col, value in row.items(): - # Ensure col is treated as str for dict indexing - col_str = str(col) - if isinstance(value, dict): - for nested_key, nested_value in value.items(): - flattened_row[f"{col_str}_{nested_key}"] = nested_value - else: - flattened_row[col_str] = value - flattened_data.append(flattened_row) - - return pd.DataFrame(flattened_data) + if df.empty: + return df + + result_parts = [] + + for col in df.columns: + col_str = str(col) + + # Check if column contains dictionaries + first_valid = df[col].dropna().iloc[0] if not df[col].dropna().empty else None + + if isinstance(first_valid, dict): + # Use json_normalize for this column (vectorized) + normalized = pd.json_normalize(df[col].fillna({})) + normalized.columns = [ + f"{col_str}_{subcol}" for subcol in normalized.columns + ] + normalized.index = df.index # Preserve original index + result_parts.append(normalized) + else: + # Keep non-dict columns as-is + result_parts.append(df[[col]]) + + return pd.concat(result_parts, axis=1) if result_parts else pd.DataFrame() class LogSessionExtractor: diff --git a/src/bwell_logkit/logs.py b/src/bwell_logkit/logs.py index 68d783b..e4aab2e 100644 --- a/src/bwell_logkit/logs.py +++ b/src/bwell_logkit/logs.py @@ -3,11 +3,11 @@ """ import bisect -import copy from collections import defaultdict -from collections.abc import Mapping, Sequence from typing import TYPE_CHECKING, Any +import orjson + from .exceptions import SceneNotFoundError from .extractor import LogSessionExtractor, SceneViewExtractor from .scene import SceneManager @@ -17,35 +17,29 @@ import pandas as pd -def _freeze(obj: Any) -> Any: - """Freeze unhashable types.""" - if isinstance(obj, Mapping): - # For dicts: produce a frozenset of (key, frozen_value) pairs - return frozenset((key, _freeze(val)) for key, val in obj.items()) - elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)): - # For lists/tuples: produce a tuple of frozen elements - return tuple(_freeze(item) for item in obj) - else: - # Primitives stay as-is - return obj - - def _clean_records(records: list[LogRecord]) -> list[LogRecord]: - """Clean and deduplicate records.""" - seen = set() - cleaned = [] + """Clean and deduplicate records using JSON-based hashing. + + Uses orjson for fast serialization and deduplication. + Records are considered duplicates if they have identical content + (excluding the 'ID' field). + """ + seen: set[bytes] = set() + cleaned: list[LogRecord] = [] for rec in records: - # Make a shallow copy, drop the ID - tmp = dict(rec) - tmp.pop(RecordFields.ID, None) + # Create copy without ID for deduplication + tmp = {k: v for k, v in rec.items() if k != RecordFields.ID} - key = _freeze(tmp) + # Serialize to JSON bytes for hashing (orjson is fast) + # sort_keys ensures consistent ordering + key = orjson.dumps(tmp, option=orjson.OPT_SORT_KEYS) if key not in seen: seen.add(key) cleaned.append(rec) + # Sort by timestamp cleaned.sort(key=lambda r: r.get(RecordFields.GAME_TIME_SECS, 0.0)) return cleaned @@ -88,10 +82,22 @@ def __init__( self._extractor: LogSessionExtractor | None = None + # Cached timestamp list for binary search operations + self._timestamps: list[float] | None = None + @property - def records(self) -> list[LogRecord]: - """Get all records in the session.""" - return copy.deepcopy(self._records) + def records(self) -> tuple[LogRecord, ...]: + """Get all records in the session as an immutable tuple. + + Returns a tuple with shallow copies of records for efficient access + while preventing modification. + + Returns: + Immutable tuple of log records sorted by timestamp. Each record + is a shallow copy, so modifying nested structures affects the + original, but top-level field changes are isolated. + """ + return tuple(rec.copy() for rec in self._records) @property def metadata(self) -> dict[str, Any]: @@ -140,6 +146,15 @@ def scene_manager(self) -> SceneManager: self._scene_manager = SceneManager(self._records) return self._scene_manager + @property + def _timestamp_list(self) -> list[float]: + """Lazy-loaded timestamp list for binary search operations.""" + if self._timestamps is None: + self._timestamps = [ + r.get(RecordFields.GAME_TIME_SECS, 0.0) for r in self._records + ] + return self._timestamps + def list_scenes(self) -> list[str]: """ List all available scene names. @@ -230,10 +245,9 @@ def filter_time_range(self, start: float, end: float) -> "LogSession": if not self._records: return LogSession([], self._metadata, _scene_manager=self._scene_manager) - timestamps = [r.get(RecordFields.GAME_TIME_SECS, 0.0) for r in self._records] - - start_idx = bisect.bisect_left(timestamps, start) - end_idx = bisect.bisect_right(timestamps, end) + # Use cached timestamp list for binary search + start_idx = bisect.bisect_left(self._timestamp_list, start) + end_idx = bisect.bisect_right(self._timestamp_list, end) filtered_records = self._records[start_idx:end_idx] @@ -377,8 +391,8 @@ def __init__(self, session: LogSession, scene_info: SceneInfo): self._extractor: SceneViewExtractor | None = None @property - def records(self) -> list[LogRecord]: - """Get records in this scene.""" + def records(self) -> tuple[LogRecord, ...]: + """Get records in this scene as an immutable tuple.""" return self._session.records @property diff --git a/src/bwell_logkit/scene.py b/src/bwell_logkit/scene.py index b1a35f4..5d815fd 100644 --- a/src/bwell_logkit/scene.py +++ b/src/bwell_logkit/scene.py @@ -1,5 +1,6 @@ """Scene management and segmentation for bWell log data.""" +import bisect from collections import defaultdict from typing import Literal, Optional @@ -12,6 +13,8 @@ class SceneManager: def __init__(self, records: list[LogRecord]): self._records = records + # Cache timestamps for binary search operations + self._timestamps = [r.get(RecordFields.GAME_TIME_SECS, 0.0) for r in records] self._scenes = self._build_scene_index() def _build_scene_index(self) -> dict[str, list[SceneInfo]]: @@ -86,15 +89,17 @@ def get_scene_info(self, scene_name: str, instance: int = 0) -> SceneInfo: return self._scenes[scene_name][instance] def get_scene_records(self, scene_name: str, instance: int = 0) -> list[LogRecord]: - """Get all records within a specific scene instance.""" + """Get all records within a specific scene instance using binary search.""" + if not self._records: + return [] + info = self.get_scene_info(scene_name, instance) - return [ - r - for r in self._records - if info.start_game_time_secs - <= r.get(RecordFields.GAME_TIME_SECS, 0) - <= info.end_game_time_secs - ] + + # Use cached timestamps for binary search + start_idx = bisect.bisect_left(self._timestamps, info.start_game_time_secs) + end_idx = bisect.bisect_right(self._timestamps, info.end_game_time_secs) + + return self._records[start_idx:end_idx] def get_scene_instances( self, diff --git a/tests/test_logs.py b/tests/test_logs.py index 4a9e8a4..69e952c 100644 --- a/tests/test_logs.py +++ b/tests/test_logs.py @@ -76,15 +76,20 @@ def test_init_with_metadata(self, sample_records): assert session.metadata["user"] == "test_user" def test_records_property_returns_copy(self, sample_records): - """Test that records property returns a copy.""" + """Test that records property returns immutable tuple.""" session = LogSession(sample_records) - records_copy = session.records - - # Modify the copy - records_copy[0]["modified"] = True - - # Original should be unchanged - assert "modified" not in session.records[0] + records_tuple = session.records + + # Verify it's a tuple (immutable) + assert isinstance(records_tuple, tuple) + assert len(records_tuple) == len(sample_records) + + # Individual records can be modified for user's use, + # but modifying them doesn't affect internal state + if len(records_tuple) > 0: + test_record = records_tuple[0] + test_record["modified"] = True + assert "modified" not in session.records[0] def test_metadata_property_returns_copy(self, sample_records): """Test that metadata property returns a copy.""" From 41bcf1ce8ce7d409f6dbfc0c1f484fefa64e3ae1 Mon Sep 17 00:00:00 2001 From: demengc <43016355+demengc@users.noreply.github.com> Date: Tue, 2 Dec 2025 03:15:05 -0500 Subject: [PATCH 2/2] Bump version, remove unused requirements files --- pyproject.toml | 2 +- requirements-dev.txt | 22 ---------------------- requirements.txt | 3 --- 3 files changed, 1 insertion(+), 26 deletions(-) delete mode 100644 requirements-dev.txt delete mode 100644 requirements.txt diff --git a/pyproject.toml b/pyproject.toml index 46886d2..67d4f6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "bwell-logkit" -version = "1.1.0" +version = "1.2.0" description = "A Python library for processing and analyzing log files from the National Research Council (NRC) Canada's bWell application." readme = "README.md" requires-python = ">=3.11" diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 9eae92f..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Development dependencies --r requirements.txt - -# Testing -pytest>=8.4.1 -pytest-cov>=6.2.1 -pytest-mock>=3.14.1 - -# Code quality -black>=25.1.0 -flake8>=7.3.0 -mypy>=1.16.1 -pandas-stubs>=2.3.0 -isort>=6.0.1 - -# Development tools -pre-commit>=3.7.0 -tox>=4.14.2 - -# Documentation -sphinx>=7.3.7 -sphinx-rtd-theme>=2.0.0 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 51435aa..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pandas>=2.3.1 -pyarrow>=20.0.0 -orjson>=3.10,<4