From c79b31bd7e87a29107e6559ac2aa3d7ff876f441 Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Sat, 20 Jun 2026 13:46:53 +0000 Subject: [PATCH] filter --levels on parsed entry level, not raw line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CLI's _parse_file scanned the raw line with detect_log_level to decide whether to keep a row before calling parser.parse(). For generic-format logs that have no structured level field, the generic parser also calls detect_log_level on the full line, so the same false positives affected both the filter check and the output: the parser misclassified WARNING lines that mention 'error' in the hostname or message as ERROR. Worse, the cli filter ran first and rejected lines that the parser would later have accepted — the user got 'No log entries found matching criteria' even when the file contained real WARNING-level rows. Two changes: 1. generic.py: run detect_log_level on the message portion (the text after the timestamp) rather than the whole line. The previous syslog-parser commit fixed the same issue there. 2. cli.py: filter on parsed.level after parsing instead of running detect_log_level separately on the raw line. Now the parser and the filter agree on every line — the filter only sees levels the report would have printed. The pre-parse scan also did extra string work per line that the parser was about to redo. Add a regression test using a JSON line whose message contains 'error' but whose level field is WARNING; the old code rejected it under --levels=WARNING, the new code keeps it. --- src/log_analyzer_cli/cli.py | 39 +++++++++++++++---------- src/log_analyzer_cli/parsers/generic.py | 10 +++++-- tests/test_cli.py | 22 +++++++++++++- 3 files changed, 52 insertions(+), 19 deletions(-) diff --git a/src/log_analyzer_cli/cli.py b/src/log_analyzer_cli/cli.py index 6134fc3..e0d7c5b 100644 --- a/src/log_analyzer_cli/cli.py +++ b/src/log_analyzer_cli/cli.py @@ -193,36 +193,45 @@ def _parse_file( ): """Parse log file with optional filtering.""" entries = [] - + from log_analyzer_cli.parsers import ParsedEntry - from log_analyzer_cli.utils import detect_log_level, parse_timestamp + from log_analyzer_cli.utils import parse_timestamp import re - + compiled_pattern = re.compile(search_pattern) if search_pattern else None - + for line in read_log_file(file_path): line = line.rstrip("\n\r") if not line: continue - - if include_levels: - level = detect_log_level(line) - if level not in include_levels: - continue - + if compiled_pattern and not compiled_pattern.search(line): continue - + timestamp = parse_timestamp(line) if start_time and timestamp and timestamp < start_time: continue if end_time and timestamp and timestamp > end_time: continue - + parsed = parser.parse(line) - if parsed: - entries.append(parsed) - + if not parsed: + continue + + # Filter on the parser's level rather than re-scanning the raw + # line: the parser already strips host/process names (the syslog + # parser detects level from the message portion only), so a + # WARNING line whose message happens to contain the word "error" + # is reported as WARNING consistently in both the filter check + # and the output. The pre-parse filter form scanned the full + # line and could let "WARNING: error in user input" slip past + # --levels=ERROR, only for the output to then label it WARNING + # — the filter and the report disagreed about the same line. + if include_levels and parsed.level not in include_levels: + continue + + entries.append(parsed) + return entries diff --git a/src/log_analyzer_cli/parsers/generic.py b/src/log_analyzer_cli/parsers/generic.py index 64fe287..385b977 100644 --- a/src/log_analyzer_cli/parsers/generic.py +++ b/src/log_analyzer_cli/parsers/generic.py @@ -41,8 +41,12 @@ def parse(self, line: str) -> Optional[ParsedEntry]: return None timestamp = self._parse_timestamp(match.group("timestamp")) - level = detect_log_level(line) - + # Detect the level from the message portion only; scanning the + # full line picks up words like "error" that appear inside the + # timestamp, hostname, or process name and misclassifies an + # otherwise clean WARNING line as ERROR. The extracted message + # starts right after the timestamp and is what the user + # actually thinks of as the log entry's content. message_start = match.end() message = line[message_start:].strip() @@ -51,7 +55,7 @@ def parse(self, line: str) -> Optional[ParsedEntry]: return ParsedEntry( raw=line, timestamp=timestamp, - level=level, + level=detect_log_level(message), message=message, metadata={"format": "generic"}, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 9c0dbe3..9bdc114 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -72,7 +72,27 @@ def test_analyze_no_group(self, runner, json_file): def test_analyze_level_filter(self, runner, json_file): result = runner.invoke(main, ["analyze", str(json_file), "-l", "ERROR,WARNING"]) assert result.exit_code == 0 - + + def test_analyze_level_filter_uses_parsed_level_not_raw_line( + self, runner, tmp_path + ): + # JSON log line where the message body mentions "error" as part of a + # description but the structured level field is WARNING. The + # pre- + log = tmp_path / "host-error-actual-warn.log" + log.write_text( + '{"level": "WARNING", "message": "error in the system"}\n' + ) + result = runner.invoke(main, ["analyze", str(log), "-l", "WARNING", "-v"]) + assert result.exit_code == 0 + assert "Total Lines:" in result.output + assert "WARNING" in result.output + # "ERROR" appears as a section header ("TOP ERROR GROUPS") so + # check the level-distribution line specifically. + import re + level_rows = re.findall(r"\bERROR\b\s+:\s+\d+", result.output) + assert not level_rows, f"unexpected ERROR level row: {level_rows}" + def test_analyze_pattern_filter(self, runner, json_file): result = runner.invoke(main, ["analyze", str(json_file), "-p", "database"]) assert result.exit_code == 0