From c79b31bd7e87a29107e6559ac2aa3d7ff876f441 Mon Sep 17 00:00:00 2001
From: Zo Bot <github-automation@zo.computer>
Date: Sat, 20 Jun 2026 13:46:53 +0000
Subject: [PATCH] filter --levels on parsed entry level, not raw line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CLI's _parse_file scanned the raw line with detect_log_level to
decide whether to keep a row before calling parser.parse(). For
generic-format logs that have no structured level field, the generic
parser also calls detect_log_level on the full line, so the same
false positives affected both the filter check and the output:
the parser misclassified WARNING lines that mention 'error' in the
hostname or message as ERROR. Worse, the cli filter ran first
and rejected lines that the parser would later have accepted —
the user got 'No log entries found matching criteria' even when the
file contained real WARNING-level rows.

Two changes:

1. generic.py: run detect_log_level on the message portion (the text
   after the timestamp) rather than the whole line. The previous
   syslog-parser commit fixed the same issue there.

2. cli.py: filter on parsed.level after parsing instead of running
   detect_log_level separately on the raw line. Now the parser and
   the filter agree on every line — the filter only sees levels the
   report would have printed. The pre-parse scan also did extra
   string work per line that the parser was about to redo.

Add a regression test using a JSON line whose message contains
'error' but whose level field is WARNING; the old code rejected it
under --levels=WARNING, the new code keeps it.
---
 src/log_analyzer_cli/cli.py             | 39 +++++++++++++++----------
 src/log_analyzer_cli/parsers/generic.py | 10 +++++--
 tests/test_cli.py                       | 22 +++++++++++++-
 3 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/src/log_analyzer_cli/cli.py b/src/log_analyzer_cli/cli.py
index 6134fc3..e0d7c5b 100644
--- a/src/log_analyzer_cli/cli.py
+++ b/src/log_analyzer_cli/cli.py
@@ -193,36 +193,45 @@ def _parse_file(
 ):
     """Parse log file with optional filtering."""
     entries = []
-    
+
     from log_analyzer_cli.parsers import ParsedEntry
-    from log_analyzer_cli.utils import detect_log_level, parse_timestamp
+    from log_analyzer_cli.utils import parse_timestamp
     import re
-    
+
     compiled_pattern = re.compile(search_pattern) if search_pattern else None
-    
+
     for line in read_log_file(file_path):
         line = line.rstrip("\n\r")
         if not line:
             continue
-        
-        if include_levels:
-            level = detect_log_level(line)
-            if level not in include_levels:
-                continue
-        
+
         if compiled_pattern and not compiled_pattern.search(line):
             continue
-        
+
         timestamp = parse_timestamp(line)
         if start_time and timestamp and timestamp < start_time:
             continue
         if end_time and timestamp and timestamp > end_time:
             continue
-        
+
         parsed = parser.parse(line)
-        if parsed:
-            entries.append(parsed)
-    
+        if not parsed:
+            continue
+
+        # Filter on the parser's level rather than re-scanning the raw
+        # line: the parser already strips host/process names (the syslog
+        # parser detects level from the message portion only), so a
+        # WARNING line whose message happens to contain the word "error"
+        # is reported as WARNING consistently in both the filter check
+        # and the output. The pre-parse filter form scanned the full
+        # line and could let "WARNING: error in user input" slip past
+        # --levels=ERROR, only for the output to then label it WARNING
+        # — the filter and the report disagreed about the same line.
+        if include_levels and parsed.level not in include_levels:
+            continue
+
+        entries.append(parsed)
+
     return entries
 
 
diff --git a/src/log_analyzer_cli/parsers/generic.py b/src/log_analyzer_cli/parsers/generic.py
index 64fe287..385b977 100644
--- a/src/log_analyzer_cli/parsers/generic.py
+++ b/src/log_analyzer_cli/parsers/generic.py
@@ -41,8 +41,12 @@ def parse(self, line: str) -> Optional[ParsedEntry]:
             return None
         
         timestamp = self._parse_timestamp(match.group("timestamp"))
-        level = detect_log_level(line)
-        
+        # Detect the level from the message portion only; scanning the
+        # full line picks up words like "error" that appear inside the
+        # timestamp, hostname, or process name and misclassifies an
+        # otherwise clean WARNING line as ERROR. The extracted message
+        # starts right after the timestamp and is what the user
+        # actually thinks of as the log entry's content.
         message_start = match.end()
         message = line[message_start:].strip()
         
@@ -51,7 +55,7 @@ def parse(self, line: str) -> Optional[ParsedEntry]:
         return ParsedEntry(
             raw=line,
             timestamp=timestamp,
-            level=level,
+            level=detect_log_level(message),
             message=message,
             metadata={"format": "generic"},
         )
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 9c0dbe3..9bdc114 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -72,7 +72,27 @@ def test_analyze_no_group(self, runner, json_file):
     def test_analyze_level_filter(self, runner, json_file):
         result = runner.invoke(main, ["analyze", str(json_file), "-l", "ERROR,WARNING"])
         assert result.exit_code == 0
-    
+
+    def test_analyze_level_filter_uses_parsed_level_not_raw_line(
+        self, runner, tmp_path
+    ):
+        # JSON log line where the message body mentions "error" as part of a
+        # description but the structured level field is WARNING. The
+        # pre-
+        log = tmp_path / "host-error-actual-warn.log"
+        log.write_text(
+            '{"level": "WARNING", "message": "error in the system"}\n'
+        )
+        result = runner.invoke(main, ["analyze", str(log), "-l", "WARNING", "-v"])
+        assert result.exit_code == 0
+        assert "Total Lines:" in result.output
+        assert "WARNING" in result.output
+        # "ERROR" appears as a section header ("TOP ERROR GROUPS") so
+        # check the level-distribution line specifically.
+        import re
+        level_rows = re.findall(r"\bERROR\b\s+:\s+\d+", result.output)
+        assert not level_rows, f"unexpected ERROR level row: {level_rows}"
+
     def test_analyze_pattern_filter(self, runner, json_file):
         result = runner.invoke(main, ["analyze", str(json_file), "-p", "database"])
         assert result.exit_code == 0