From 330a6fd8d330d8c10b7f80b615ab2f9c736a4e22 Mon Sep 17 00:00:00 2001
From: Zo Bot
Date: Mon, 22 Jun 2026 05:53:13 +0000
Subject: [PATCH] match Apache combined log user field as a non-empty token

The COMBINED_PATTERN in ApacheParser used (?P<user>\s+), which only
matches whitespace. Real Apache combined log lines have a non-empty user
field: anonymous requests use a single '-' and authenticated requests use
the username (frank, alice, etc.). With \s+ as the group, every real
combined log line failed to match and the parser silently fell through to
the COMMON_PATTERN, which doesn't capture referer or user_agent.

Switched to (?P<user>\S+), matching the shape used by COMMON_PATTERN and
by every reference implementation. Verified that:

    192.168.1.10 - - [ts] "GET / HTTP/1.0" 200 2326 "-" "Mozilla/5.0"
    192.168.1.10 - frank [ts] "GET /api HTTP/1.1" 200 512 "https://x/" "curl/8.0"

both now parse with user / referer / user_agent in metadata instead of
the anonymous-only fallback that was happening before.

Added two test_parsers.py cases that pin the new behaviour: one for an
authenticated request with non-empty user + referer + user_agent, and one
for the anonymous '-' user.
--- src/log_analyzer_cli/parsers/apache.py | 2 +- tests/test_parsers.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/log_analyzer_cli/parsers/apache.py b/src/log_analyzer_cli/parsers/apache.py index f294dfd..735618d 100644 --- a/src/log_analyzer_cli/parsers/apache.py +++ b/src/log_analyzer_cli/parsers/apache.py @@ -24,7 +24,7 @@ class ApacheParser(LogParser): COMBINED_PATTERN = re.compile( r'^(?P\S+)\s+' r'(?P\S+)\s+' - r'(?P\s+)' + r'(?P\S+)\s+' r'\[(?P[^\]]+)\]\s+' r'"(?P[^"]+)"\s+' r'(?P\d{3})\s+' diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 0248a28..2855eeb 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -131,6 +131,31 @@ def test_parse_apache_warning_status(self): assert entry is not None assert entry.level == "WARNING" + def test_parse_apache_combined_captures_user_field(self): + # The previous COMBINED_PATTERN used (?P<user>\s+), so any combined + # line with a non-empty user (e.g. an authenticated request behind + # a mod_auth or a basic-auth proxy) silently fell through to the + # COMMON_PATTERN, which doesn't capture referer or user_agent. + parser = ApacheParser() + line = '192.168.1.10 - frank [20/Mar/2025:10:15:32 +0000] "GET /api HTTP/1.1" 200 512 "https://x.example/" "curl/8.0"' + entry = parser.parse(line) + + assert entry is not None + assert entry.metadata["user"] == "frank" + assert entry.metadata["referer"] == "https://x.example/" + assert entry.metadata["user_agent"] == "curl/8.0" + + def test_parse_apache_combined_anonymous_user(self): + # The anonymous user is conventionally a single '-' (not whitespace), + # so the regex needs to match a non-empty non-whitespace token. 
+ parser = ApacheParser() + line = '192.168.1.10 - - [20/Mar/2025:10:15:32 +0000] "GET /index.html HTTP/1.1" 200 2326 "-" "Mozilla/5.0"' + entry = parser.parse(line) + + assert entry is not None + assert entry.metadata["user"] == "-" + assert entry.metadata["user_agent"] == "Mozilla/5.0" + class TestGenericParser: """Tests for GenericParser."""