diff --git a/src/log_analyzer_cli/cli.py b/src/log_analyzer_cli/cli.py index 6134fc3..e0d7c5b 100644 --- a/src/log_analyzer_cli/cli.py +++ b/src/log_analyzer_cli/cli.py @@ -193,36 +193,45 @@ def _parse_file( ): """Parse log file with optional filtering.""" entries = [] - + from log_analyzer_cli.parsers import ParsedEntry - from log_analyzer_cli.utils import detect_log_level, parse_timestamp + from log_analyzer_cli.utils import parse_timestamp import re - + compiled_pattern = re.compile(search_pattern) if search_pattern else None - + for line in read_log_file(file_path): line = line.rstrip("\n\r") if not line: continue - - if include_levels: - level = detect_log_level(line) - if level not in include_levels: - continue - + if compiled_pattern and not compiled_pattern.search(line): continue - + timestamp = parse_timestamp(line) if start_time and timestamp and timestamp < start_time: continue if end_time and timestamp and timestamp > end_time: continue - + parsed = parser.parse(line) - if parsed: - entries.append(parsed) - + if not parsed: + continue + + # Filter on the parser's level rather than re-scanning the raw + # line: the parser already strips host/process names (the syslog + # parser detects level from the message portion only), so a + # WARNING line whose message happens to contain the word "error" + # is reported as WARNING consistently in both the filter check + # and the output. The pre-parse filter form scanned the full + # line and could let "WARNING: error in user input" slip past + # --levels=ERROR, only for the output to then label it WARNING + # — the filter and the report disagreed about the same line. + if include_levels and parsed.level not in include_levels: + continue + + entries.append(parsed) + return entries diff --git a/src/log_analyzer_cli/parsers/generic.py b/src/log_analyzer_cli/parsers/generic.py index 64fe287..385b977 100644 --- a/src/log_analyzer_cli/parsers/generic.py +++ b/src/log_analyzer_cli/parsers/generic.py @@ -41,8 +41,12 @@ def parse(self, line: str) -> Optional[ParsedEntry]: return None timestamp = self._parse_timestamp(match.group("timestamp")) - level = detect_log_level(line) - + # Detect the level from the message portion only; scanning the + # full line picks up words like "error" that appear inside the + # timestamp, hostname, or process name and misclassifies an + # otherwise clean WARNING line as ERROR. The extracted message + # starts right after the timestamp and is what the user + # actually thinks of as the log entry's content. message_start = match.end() message = line[message_start:].strip() @@ -51,7 +55,7 @@ def parse(self, line: str) -> Optional[ParsedEntry]: return ParsedEntry( raw=line, timestamp=timestamp, - level=level, + level=detect_log_level(message), message=message, metadata={"format": "generic"}, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 9c0dbe3..9bdc114 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -72,7 +72,27 @@ def test_analyze_no_group(self, runner, json_file): def test_analyze_level_filter(self, runner, json_file): result = runner.invoke(main, ["analyze", str(json_file), "-l", "ERROR,WARNING"]) assert result.exit_code == 0 - + + def test_analyze_level_filter_uses_parsed_level_not_raw_line( + self, runner, tmp_path + ): + # JSON log line where the message body mentions "error" as part of a + # description but the structured level field is WARNING. The + # pre- + log = tmp_path / "host-error-actual-warn.log" + log.write_text( + '{"level": "WARNING", "message": "error in the system"}\n' + ) + result = runner.invoke(main, ["analyze", str(log), "-l", "WARNING", "-v"]) + assert result.exit_code == 0 + assert "Total Lines:" in result.output + assert "WARNING" in result.output + # "ERROR" appears as a section header ("TOP ERROR GROUPS") so + # check the level-distribution line specifically. + import re + level_rows = re.findall(r"\bERROR\b\s+:\s+\d+", result.output) + assert not level_rows, f"unexpected ERROR level row: {level_rows}" + def test_analyze_pattern_filter(self, runner, json_file): result = runner.invoke(main, ["analyze", str(json_file), "-p", "database"]) assert result.exit_code == 0