Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 32 additions & 19 deletions src/log_analyzer_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
from pathlib import Path
from typing import Optional
from datetime import timezone

import click

Expand Down Expand Up @@ -104,6 +105,7 @@ def analyze(
except ValueError:
click.echo(f"Error: Invalid start-time format. Use YYYY-MM-DD HH:MM:SS", err=True)
sys.exit(1)
start_dt = start_dt.replace(tzinfo=timezone.utc)

end_dt = None
if end_time:
Expand All @@ -112,6 +114,7 @@ def analyze(
except ValueError:
click.echo(f"Error: Invalid end-time format. Use YYYY-MM-DD HH:MM:SS", err=True)
sys.exit(1)
end_dt = end_dt.replace(tzinfo=timezone.utc)

parser = _get_parser(log_format, file)

Expand Down Expand Up @@ -191,38 +194,48 @@ def _parse_file(
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
):
"""Parse log file with optional filtering."""
"""Parse log file with optional filtering.

Level filtering and time filtering both operate on the *parsed* entry
rather than the raw line. Filtering on the raw line's text (via
``detect_log_level``) can disagree with the level the parser actually
assigns — e.g. an Apache 200 response with the word "ERROR" in the
User-Agent string is ``INFO`` per the parser (200 status) but
``ERROR`` per the raw-text scan, while an Apache 500 response with
no level keyword in the line is ``ERROR`` per the parser but
``UNKNOWN`` per the raw-text scan. Parsing first means the level and
timestamp filter against the same values the analyzer will use.
"""
entries = []

from log_analyzer_cli.parsers import ParsedEntry
from log_analyzer_cli.utils import detect_log_level, parse_timestamp
import re

compiled_pattern = re.compile(search_pattern) if search_pattern else None

for line in read_log_file(file_path):
line = line.rstrip("\n\r")
if not line:
continue

if include_levels:
level = detect_log_level(line)
if level not in include_levels:
continue


if compiled_pattern and not compiled_pattern.search(line):
continue
timestamp = parse_timestamp(line)
if start_time and timestamp and timestamp < start_time:

parsed = parser.parse(line)
if not parsed:
continue
if end_time and timestamp and timestamp > end_time:

if include_levels and parsed.level not in include_levels:
continue

parsed = parser.parse(line)
if parsed:
entries.append(parsed)


if start_time and parsed.timestamp and parsed.timestamp < start_time:
continue
if end_time and parsed.timestamp and parsed.timestamp > end_time:
continue
Comment on lines +232 to +235

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verification script: uses ripgrep to locate the new timestamp comparisons in
# the CLI and the places where timestamps are parsed or constructed.
#
# Abort on the first failing command, unset variable, or failed pipeline stage.
# NOTE(review): rg exits non-zero when nothing matches, so under `set -e` a
# search with no hits would stop the script early — confirm that is intended.
set -euo pipefail

# 1. The start/end comparisons against parsed.timestamp, with 2 lines of context.
echo "== Compare points introduced in CLI =="
rg -n -C2 --type=py 'parsed\.timestamp < start_time|parsed\.timestamp > end_time' src/log_analyzer_cli/cli.py

# 2. Timestamp construction and timezone handling inside the parsers package.
echo
echo "== Parser timestamp construction (look for naive datetime creation) =="
rg -n -C3 --type=py 'def parse\(|ParsedEntry\(|timestamp\s*=|strptime\(|fromisoformat\(|tzinfo|timezone|astimezone' src/log_analyzer_cli/parsers

# 3. Timestamp parsing helpers anywhere in the package.
echo
echo "== Any timestamp parsing helpers outside parsers =="
rg -n -C3 --type=py 'parse_timestamp|_parse_timestamp|strptime\(|fromisoformat\(' src/log_analyzer_cli

Repository: HrachShah/log-analyzer-cli

Length of output: 22486


🏁 Script executed:

# Check lines around 108 and 117 for UTC coercion
head -130 src/log_analyzer_cli/cli.py | tail -40

Repository: HrachShah/log-analyzer-cli

Length of output: 1535


🏁 Script executed:

# Check lines 225-245 to see full filter context and variable names
sed -n '225,245p' src/log_analyzer_cli/cli.py

Repository: HrachShah/log-analyzer-cli

Length of output: 525


Datetime comparison will fail when parsers return naive timestamps.

Lines 232–234 compare parsed.timestamp against UTC-aware datetimes (start_time and end_time passed from lines 108 and 117). Python raises TypeError when comparing aware and naive datetime objects. Multiple parsers emit naive datetimes:

  • syslog.py (lines 128–131): Returns naive datetime from strptime()
  • generic.py (lines 87–92): Returns naive datetime from strptime()
  • apache.py (line 119): Returns naive datetime when logs lack timezone info
  • json_log.py (line 81): Returns naive datetime for certain timestamp values

When a log entry has a naive timestamp, the comparison at lines 232–234 raises TypeError: can't compare offset-naive and offset-aware datetimes. The outer exception handler (lines 96–99) does not list TypeError, so the error is not caught there and the command crashes.

Normalize naive timestamps before comparison, or convert all timestamps to a consistent timezone state.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/log_analyzer_cli/cli.py` around lines 232 - 235, The datetime comparison
logic that checks parsed.timestamp against start_time and end_time will fail
with a TypeError when the timestamp is naive (lacks timezone info), because
Python cannot compare aware and naive datetime objects. Before performing the
comparison checks in the filtering block (where start_time and end_time are
compared), normalize any naive timestamps by detecting if parsed.timestamp lacks
tzinfo and converting it to a UTC-aware datetime object. This ensures all
datetime comparisons are between objects in the same timezone state, preventing
the TypeError from occurring.


entries.append(parsed)

return entries


Expand Down
54 changes: 52 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,59 @@ def test_analyze_time_filter(self, runner, json_file):
def test_analyze_auto_format_detection(self, runner, json_file):
    """Run ``analyze`` with ``--format auto`` and expect a zero exit code."""
    cli_args = ["analyze", str(json_file), "--format", "auto"]
    outcome = runner.invoke(main, cli_args)
    assert outcome.exit_code == 0


def test_analyze_time_filter_naive_start_with_aware_log(self, runner, tmp_path):
    """A naive ``--start-time`` must filter ``Z``-suffixed log timestamps cleanly.

    Two JSON entries are written (10:15:32 INFO "Started", 10:17:00 ERROR
    "Failed") and the filter starts at 10:16:00, so the test expects a zero
    exit code and exactly one reported entry, without the "Started" message.
    """
    import json as json_mod

    records = [
        {
            "timestamp": "2025-03-20T10:15:32.123Z",
            "level": "INFO",
            "message": "Started",
        },
        {
            "timestamp": "2025-03-20T10:17:00.000Z",
            "level": "ERROR",
            "message": "Failed",
        },
    ]
    # One JSON document per line, each terminated by a newline.
    log_path = tmp_path / "tz-aware.log"
    log_path.write_text("".join(json_mod.dumps(rec) + "\n" for rec in records))

    cli_args = [
        "analyze", str(log_path),
        "-f", "json",
        "-o", "json",
        "--start-time", "2025-03-20 10:16:00",
    ]
    outcome = runner.invoke(main, cli_args)

    assert outcome.exit_code == 0, outcome.output
    report = outcome.output
    assert '"parsed_entries": 1' in report
    assert "ERROR" in report
    assert "Started" not in report

def test_analyze_level_filter_uses_parser_level_not_raw_text(self, runner, tmp_path):
    """``--levels`` must filter on the parser's level, not a raw-text keyword scan.

    The log holds a 200 response whose User-Agent contains the word "ERROR"
    and a 500 response with no level keyword. Filtering by ERROR and then by
    INFO is each expected to report exactly one parsed entry.
    """
    ok_with_error_in_agent = (
        '127.0.0.1 - - [20/Mar/2025:10:15:32 +0000] '
        '"GET / HTTP/1.1" 200 1234 "-" "Mozilla/ERROR/5.0"\n'
    )
    server_error_no_keyword = (
        '127.0.0.1 - - [20/Mar/2025:10:15:33 +0000] '
        '"GET /api HTTP/1.1" 500 5678 "-" "curl/8.0"\n'
    )
    log_path = tmp_path / "apache.log"
    log_path.write_text(ok_with_error_in_agent + server_error_no_keyword)

    # Same invocation and assertions for each level, ERROR first and then INFO.
    for wanted_level in ("ERROR", "INFO"):
        outcome = runner.invoke(
            main,
            ["analyze", str(log_path), "-f", "apache", "-o", "json", "-l", wanted_level],
        )
        assert outcome.exit_code == 0, outcome.output
        assert '"parsed_entries": 1' in outcome.output
        assert wanted_level in outcome.output

def test_help(self, runner):
    """``--help`` exits with code 0 and its output contains "analyze" and "formats"."""
    result = runner.invoke(main, ["--help"])
    assert result.exit_code == 0
    # Each expected name is checked once; the duplicated "formats" assertion
    # added nothing and has been dropped.
    for expected_name in ("analyze", "formats"):
        assert expected_name in result.output
Loading