From 72549905ce3f66d9798143cdefbeadc8a41a2a18 Mon Sep 17 00:00:00 2001
From: Zo Bot <github-automation@zo.computer>
Date: Mon, 22 Jun 2026 01:46:27 +0000
Subject: [PATCH] filter log entries from parsed levels and timezone-normalise
 CLI filters

The CLI's `_parse_file` filtered by `detect_log_level(line)` (raw text scan)
and `parse_timestamp(line)` (extracted from the raw line) before calling
`parser.parse(line)`. Both disagree with the values the parser actually
assigns, so the filters and the analyzer were working from different
inputs.

Level filter: an Apache 200 with "ERROR" in the User-Agent was INFO per
the parser (200 status code) but ERROR per the raw-text scan, so
`--levels INFO` wrongly dropped it; an Apache 500 with no level keyword
in the line was ERROR per the parser but UNKNOWN per the raw-text scan,
so `--levels ERROR` wrongly dropped it. Fix: parse the line first, then
filter by `parsed.level`.

Time filter: the JSON parser returns timezone-aware datetimes (the Z
suffix is converted to +00:00 in `_parse_timestamp_string`); the CLI
parsed `--start-time` / `--end-time` as naive datetimes and then
compared them, raising "can't compare offset-naive and offset-aware
datetimes" inside the outer try/except in `analyze()`. Fix: attach
`tzinfo=timezone.utc` to the parsed filter values so a naive
`YYYY-MM-DD HH:MM:SS` argument is treated as UTC and the comparison
works against both naive and aware log timestamps.

Includes two regression tests:
- `test_analyze_time_filter_naive_start_with_aware_log` passes a naive
  `--start-time` against a JSON log with `Z` timestamps and asserts the
  late entry is kept and the early one is dropped
- `test_analyze_level_filter_uses_parser_level_not_raw_text` writes an
  Apache log with a 200 + "ERROR" UA line and a 500 line, and checks
  that `--levels ERROR` keeps only the 500 and `--levels INFO` keeps
  only the 200

All 49 tests in the suite pass.
---
 src/log_analyzer_cli/cli.py | 51 ++++++++++++++++++++++-------------
 tests/test_cli.py           | 54 +++++++++++++++++++++++++++++++++++--
 2 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/src/log_analyzer_cli/cli.py b/src/log_analyzer_cli/cli.py
index 6134fc3..fbe06d4 100644
--- a/src/log_analyzer_cli/cli.py
+++ b/src/log_analyzer_cli/cli.py
@@ -5,6 +5,7 @@
 import sys
 from pathlib import Path
 from typing import Optional
+from datetime import timezone
 
 import click
 
@@ -104,6 +105,7 @@ def analyze(
             except ValueError:
                 click.echo(f"Error: Invalid start-time format. Use YYYY-MM-DD HH:MM:SS", err=True)
                 sys.exit(1)
+            start_dt = start_dt.replace(tzinfo=timezone.utc)
         
         end_dt = None
         if end_time:
@@ -112,6 +114,7 @@ def analyze(
             except ValueError:
                 click.echo(f"Error: Invalid end-time format. Use YYYY-MM-DD HH:MM:SS", err=True)
                 sys.exit(1)
+            end_dt = end_dt.replace(tzinfo=timezone.utc)
         
         parser = _get_parser(log_format, file)
         
@@ -191,38 +194,48 @@ def _parse_file(
     start_time: Optional[datetime] = None,
     end_time: Optional[datetime] = None,
 ):
-    """Parse log file with optional filtering."""
+    """Parse log file with optional filtering.
+
+    Level filtering and time filtering both operate on the *parsed* entry
+    rather than the raw line. Filtering on the raw line's text (via
+    ``detect_log_level``) can disagree with the level the parser actually
+    assigns — e.g. an Apache 200 response with the word "ERROR" in the
+    User-Agent string is ``INFO`` per the parser (200 status) but
+    ``ERROR`` per the raw-text scan, while an Apache 500 response with
+    no level keyword in the line is ``ERROR`` per the parser but
+    ``UNKNOWN`` per the raw-text scan. Parsing first means the level and
+    timestamp filter against the same values the analyzer will use.
+    """
     entries = []
-    
+
     from log_analyzer_cli.parsers import ParsedEntry
     from log_analyzer_cli.utils import detect_log_level, parse_timestamp
     import re
-    
+
     compiled_pattern = re.compile(search_pattern) if search_pattern else None
-    
+
     for line in read_log_file(file_path):
         line = line.rstrip("\n\r")
         if not line:
             continue
-        
-        if include_levels:
-            level = detect_log_level(line)
-            if level not in include_levels:
-                continue
-        
+
         if compiled_pattern and not compiled_pattern.search(line):
             continue
-        
-        timestamp = parse_timestamp(line)
-        if start_time and timestamp and timestamp < start_time:
+
+        parsed = parser.parse(line)
+        if not parsed:
             continue
-        if end_time and timestamp and timestamp > end_time:
+
+        if include_levels and parsed.level not in include_levels:
             continue
-        
-        parsed = parser.parse(line)
-        if parsed:
-            entries.append(parsed)
-    
+
+        if start_time and parsed.timestamp and parsed.timestamp < start_time:
+            continue
+        if end_time and parsed.timestamp and parsed.timestamp > end_time:
+            continue
+
+        entries.append(parsed)
+
     return entries
 
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 9c0dbe3..cdd836e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -87,9 +87,59 @@ def test_analyze_time_filter(self, runner, json_file):
     def test_analyze_auto_format_detection(self, runner, json_file):
         result = runner.invoke(main, ["analyze", str(json_file), "--format", "auto"])
         assert result.exit_code == 0
-    
+
+    def test_analyze_time_filter_naive_start_with_aware_log(self, runner, tmp_path):
+        """Naive --start-time must compare against TZ-aware log timestamps without raising."""
+        import json as json_mod
+        log = tmp_path / "tz-aware.log"
+        log.write_text(
+            json_mod.dumps({
+                "timestamp": "2025-03-20T10:15:32.123Z",
+                "level": "INFO",
+                "message": "Started",
+            }) + "\n" +
+            json_mod.dumps({
+                "timestamp": "2025-03-20T10:17:00.000Z",
+                "level": "ERROR",
+                "message": "Failed",
+            }) + "\n"
+        )
+        result = runner.invoke(
+            main,
+            ["analyze", str(log), "-f", "json", "-o", "json",
+             "--start-time", "2025-03-20 10:16:00"],
+        )
+        assert result.exit_code == 0, result.output
+        assert '"parsed_entries": 1' in result.output
+        assert "ERROR" in result.output
+        assert "Started" not in result.output
+
+    def test_analyze_level_filter_uses_parser_level_not_raw_text(self, runner, tmp_path):
+        """--levels filters by the parser's level, not the raw-line keyword scan."""
+        log = tmp_path / "apache.log"
+        log.write_text(
+            '127.0.0.1 - - [20/Mar/2025:10:15:32 +0000] '
+            '"GET / HTTP/1.1" 200 1234 "-" "Mozilla/ERROR/5.0"\n'
+            '127.0.0.1 - - [20/Mar/2025:10:15:33 +0000] '
+            '"GET /api HTTP/1.1" 500 5678 "-" "curl/8.0"\n'
+        )
+        result = runner.invoke(
+            main,
+            ["analyze", str(log), "-f", "apache", "-o", "json", "-l", "ERROR"],
+        )
+        assert result.exit_code == 0, result.output
+        assert '"parsed_entries": 1' in result.output
+        assert "ERROR" in result.output
+        result = runner.invoke(
+            main,
+            ["analyze", str(log), "-f", "apache", "-o", "json", "-l", "INFO"],
+        )
+        assert result.exit_code == 0, result.output
+        assert '"parsed_entries": 1' in result.output
+        assert "INFO" in result.output
+
     def test_help(self, runner):
         result = runner.invoke(main, ["--help"])
         assert result.exit_code == 0
         assert "analyze" in result.output
-        assert "formats" in result.output
+        assert "formats" in result.output
\ No newline at end of file