From 72549905ce3f66d9798143cdefbeadc8a41a2a18 Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Mon, 22 Jun 2026 01:46:27 +0000 Subject: [PATCH] filter log entries from parsed levels and timezone-normalise CLI filters The CLI's `_parse_file` filtered by `detect_log_level(line)` (raw text scan) and `parse_timestamp(line)` (extracted from the raw line) before calling `parser.parse(line)`. Both disagree with the values the parser actually assigns, so the filters and the analyzer were working from different inputs. Level filter: an Apache 200 with "ERROR" in the User-Agent was INFO per the parser (200 status code) but ERROR per the raw-text scan, so `--levels INFO` wrongly dropped it; an Apache 500 with no level keyword in the line was ERROR per the parser but UNKNOWN per the raw-text scan, so `--levels ERROR` wrongly dropped it. Fix: parse the line first, then filter by `parsed.level`. Time filter: the JSON parser returns timezone-aware datetimes (the Z suffix is converted to +00:00 in `_parse_timestamp_string`); the CLI parsed `--start-time` / `--end-time` as naive datetimes and then compared them, raising "can't compare offset-naive and offset-aware datetimes" inside the outer try/except in `analyze()`. Fix: attach `tzinfo=timezone.utc` to the parsed filter values so a naive `YYYY-MM-DD HH:MM:SS` argument is treated as UTC and the comparison works against both naive and aware log timestamps. Includes two regression tests: - `test_analyze_time_filter_naive_start_with_aware_log` passes a naive `--start-time` against a JSON log with `Z` timestamps and asserts the late entry is kept and the early one is dropped - `test_analyze_level_filter_uses_parser_level_not_raw_text` writes an Apache log with a 200 + "ERROR" UA line and a 500 line, and checks that `--levels ERROR` keeps only the 500 and `--levels INFO` keeps only the 200 All 49 tests in the suite pass. --- src/log_analyzer_cli/cli.py | 51 ++++++++++++++++++++++------------- tests/test_cli.py | 54 +++++++++++++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 21 deletions(-) diff --git a/src/log_analyzer_cli/cli.py b/src/log_analyzer_cli/cli.py index 6134fc3..fbe06d4 100644 --- a/src/log_analyzer_cli/cli.py +++ b/src/log_analyzer_cli/cli.py @@ -5,6 +5,7 @@ import sys from pathlib import Path from typing import Optional +from datetime import timezone import click @@ -104,6 +105,7 @@ def analyze( except ValueError: click.echo(f"Error: Invalid start-time format. Use YYYY-MM-DD HH:MM:SS", err=True) sys.exit(1) + start_dt = start_dt.replace(tzinfo=timezone.utc) end_dt = None if end_time: @@ -112,6 +114,7 @@ def analyze( except ValueError: click.echo(f"Error: Invalid end-time format. Use YYYY-MM-DD HH:MM:SS", err=True) sys.exit(1) + end_dt = end_dt.replace(tzinfo=timezone.utc) parser = _get_parser(log_format, file) @@ -191,38 +194,48 @@ def _parse_file( start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, ): - """Parse log file with optional filtering.""" + """Parse log file with optional filtering. + + Level filtering and time filtering both operate on the *parsed* entry + rather than the raw line. Filtering on the raw line's text (via + ``detect_log_level``) can disagree with the level the parser actually + assigns — e.g. an Apache 200 response with the word "ERROR" in the + User-Agent string is ``INFO`` per the parser (200 status) but + ``ERROR`` per the raw-text scan, while an Apache 500 response with + no level keyword in the line is ``ERROR`` per the parser but + ``UNKNOWN`` per the raw-text scan. Parsing first means the level and + timestamp filter against the same values the analyzer will use. + """ entries = [] - + from log_analyzer_cli.parsers import ParsedEntry from log_analyzer_cli.utils import detect_log_level, parse_timestamp import re - + compiled_pattern = re.compile(search_pattern) if search_pattern else None - + for line in read_log_file(file_path): line = line.rstrip("\n\r") if not line: continue - - if include_levels: - level = detect_log_level(line) - if level not in include_levels: - continue - + if compiled_pattern and not compiled_pattern.search(line): continue - - timestamp = parse_timestamp(line) - if start_time and timestamp and timestamp < start_time: + + parsed = parser.parse(line) + if not parsed: continue - if end_time and timestamp and timestamp > end_time: + + if include_levels and parsed.level not in include_levels: continue - - parsed = parser.parse(line) - if parsed: - entries.append(parsed) - + + if start_time and parsed.timestamp and parsed.timestamp < start_time: + continue + if end_time and parsed.timestamp and parsed.timestamp > end_time: + continue + + entries.append(parsed) + return entries diff --git a/tests/test_cli.py b/tests/test_cli.py index 9c0dbe3..cdd836e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -87,9 +87,59 @@ def test_analyze_time_filter(self, runner, json_file): def test_analyze_auto_format_detection(self, runner, json_file): result = runner.invoke(main, ["analyze", str(json_file), "--format", "auto"]) assert result.exit_code == 0 - + + def test_analyze_time_filter_naive_start_with_aware_log(self, runner, tmp_path): + """Naive --start-time must compare against TZ-aware log timestamps without raising.""" + import json as json_mod + log = tmp_path / "tz-aware.log" + log.write_text( + json_mod.dumps({ + "timestamp": "2025-03-20T10:15:32.123Z", + "level": "INFO", + "message": "Started", + }) + "\n" + + json_mod.dumps({ + "timestamp": "2025-03-20T10:17:00.000Z", + "level": "ERROR", + "message": "Failed", + }) + "\n" + ) + result = runner.invoke( + main, + ["analyze", str(log), "-f", "json", "-o", "json", + "--start-time", "2025-03-20 10:16:00"], + ) + assert result.exit_code == 0, result.output + assert '"parsed_entries": 1' in result.output + assert "ERROR" in result.output + assert "Started" not in result.output + + def test_analyze_level_filter_uses_parser_level_not_raw_text(self, runner, tmp_path): + """--levels filters by the parser's level, not the raw-line keyword scan.""" + log = tmp_path / "apache.log" + log.write_text( + '127.0.0.1 - - [20/Mar/2025:10:15:32 +0000] ' + '"GET / HTTP/1.1" 200 1234 "-" "Mozilla/ERROR/5.0"\n' + '127.0.0.1 - - [20/Mar/2025:10:15:33 +0000] ' + '"GET /api HTTP/1.1" 500 5678 "-" "curl/8.0"\n' + ) + result = runner.invoke( + main, + ["analyze", str(log), "-f", "apache", "-o", "json", "-l", "ERROR"], + ) + assert result.exit_code == 0, result.output + assert '"parsed_entries": 1' in result.output + assert "ERROR" in result.output + result = runner.invoke( + main, + ["analyze", str(log), "-f", "apache", "-o", "json", "-l", "INFO"], + ) + assert result.exit_code == 0, result.output + assert '"parsed_entries": 1' in result.output + assert "INFO" in result.output + def test_help(self, runner): result = runner.invoke(main, ["--help"]) assert result.exit_code == 0 assert "analyze" in result.output - assert "formats" in result.output + assert "formats" in result.output \ No newline at end of file