From 4d46f1d00ab9e6757f29a8223f9118a1375a7619 Mon Sep 17 00:00:00 2001
From: Zo Bot
Date: Mon, 22 Jun 2026 06:04:58 +0000
Subject: [PATCH] parse Apache-style timestamps with a numeric timezone offset

The Apache combined log format always emits timestamps in the form
'01/Jan/2025:12:00:00 -0700' (day/Mon/year:hour:minute:second signhhmm).
The formats list in _try_parse_datetime only had the offset-less
'%d/%b/%Y:%H:%M:%S' form, so any Apache log line whose timestamp
included the numeric offset returned None from _try_parse_datetime,
which cascaded to parse_timestamp returning None, which made the apache
parser's _parse_timestamp fall back to the same offset-less format
inside the parser and silently drop the timezone information.

The result is that --start-time / --end-time filtering on a log file
using the canonical Apache combined format with offsets either selected
the wrong window (because the time zones silently disagreed) or
produced no entries at all (because the parser-level fallback used
ts_str.split()[0], which left the offset in the string and made the
follow-up strptime call fail with the colon in the offset).

Add a '%d/%b/%Y:%H:%M:%S %z' entry to the formats list so the
offset-aware Apache timestamp is parsed in one shot, and add
tests/test_utils.py with three regression tests (positive offset,
negative offset, offset-less still works) so the next refactor of the
format list doesn't drop the offset form.
--- src/log_analyzer_cli/utils.py | 1 + tests/test_utils.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 tests/test_utils.py diff --git a/src/log_analyzer_cli/utils.py b/src/log_analyzer_cli/utils.py index 4d518a3..c3fe948 100644 --- a/src/log_analyzer_cli/utils.py +++ b/src/log_analyzer_cli/utils.py @@ -43,6 +43,7 @@ def _try_parse_datetime(ts_str: str) -> Optional[datetime]: "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z", "%d/%b/%Y:%H:%M:%S", + "%d/%b/%Y:%H:%M:%S %z", "%b %d %H:%M:%S", "%Y/%m/%d %H:%M:%S", ] diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..df91c3d --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,34 @@ +"""Tests for log_analyzer_cli.utils.""" + +from log_analyzer_cli.utils import _try_parse_datetime + + +def test_apache_timestamp_with_positive_offset(): + # Apache combined log format always includes a timezone offset. + # Before the fix, _try_parse_datetime returned None for this input + # because the format list had only "%d/%b/%Y:%H:%M:%S" (no %z). + result = _try_parse_datetime("01/Jan/2025:12:00:00 +0000") + assert result is not None + assert result.year == 2025 + assert result.month == 1 + assert result.day == 1 + assert result.hour == 12 + assert result.utcoffset().total_seconds() == 0 + + +def test_apache_timestamp_with_negative_offset(): + # Real-world Apache access lines commonly use a negative offset. + # 10/Oct/2000:13:55:36 -0700 is the canonical example from the Apache docs. + result = _try_parse_datetime("10/Oct/2000:13:55:36 -0700") + assert result is not None + assert result.year == 2000 + assert result.hour == 13 + assert result.utcoffset().total_seconds() == -7 * 3600 + + +def test_apache_timestamp_without_offset_still_works(): + # The naive form (no offset) must still parse for back-compat with + # logs that predate the offset inclusion. 
+ result = _try_parse_datetime("01/Jan/2025:12:00:00") + assert result is not None + assert result.hour == 12