feat: Improve execve trace parsing and event handling

scc-tw · scc-tw · commit 189bfb99b576 · 2025-05-02T15:13:06.000+08:00
- Add robust execve trace parsing with memory pointer resolution
- Implement fallback mechanisms for command resolution
- Add UnparsedEvent handling in event processing
- Fix event aggregation to handle raw trace lines
- Update summary generation to filter invalid events
- Add memory reading utilities for process inspection

This commit improves the reliability of execve trace parsing by:
1. Adding support for pointer-based execve traces
2. Implementing memory reading to resolve command names
3. Adding proper fallback to /proc/<pid>/cmdline
4. Gracefully handling unparsed events in the pipeline
5. Ensuring event aggregation and summary generation work with mixed event types

The changes maintain backward compatibility while adding support for
newer trace formats that only provide memory pointers.
diff --git a/linux_edr/app.py b/linux_edr/app.py
@@ -11,7 +11,7 @@
 from .config import Config
 from .report_manager import ReportManager
 from .models import Cell
-from .domain.models.events import BaseSyscallEvent, ExecveEvent
+from .domain.models.events import BaseSyscallEvent, ExecveEvent, UnparsedEvent
 
 
 def setup_logging(debug: bool = False) -> None:
@@ -327,12 +327,20 @@ def _process_event(self, evt: BaseSyscallEvent) -> None:
         if self.verbose_debug:
             self._log_debug_event(evt)
 
-        # If the trace reader already produced a validated ExecveEvent model, buffer it directly.
         if isinstance(evt, BaseSyscallEvent):
+            # Parsed syscall event – store its dict representation.
             self.agg.add(evt.model_dump() if hasattr(evt, "model_dump") else evt.dict())
             return
-        else:
-            logging.warning(f"Invalid event type: {type(evt)}")
+
+        # Handle unparsed trace lines gracefully.
+        if isinstance(evt, UnparsedEvent):
+            # Optionally buffer raw lines for troubleshooting; they will be ignored by most
+            # downstream processing since they lack a "command" key.
+            self.agg.add({"raw_line": evt.raw_line})
+            return
+
+        # For truly unexpected types, log a warning (emitted for every such event).
+        logging.warning(f"Invalid event type: {type(evt)}")
 
     def _log_debug_event(self, evt: BaseSyscallEvent) -> None:
         """
diff --git a/linux_edr/summary.py b/linux_edr/summary.py
@@ -21,8 +21,12 @@ def build_summary(
     now = datetime.now(timezone.utc)
     start = now - timedelta(minutes=window_minutes)
 
-    # Count occurrences of each command
-    counts = Counter(evt["command"] for evt in events)
+    # Count occurrences of each command, ignoring events without a command field (e.g., UnparsedEvent)
+    counts = Counter(
+        evt["command"]
+        for evt in events
+        if isinstance(evt, dict) and evt.get("command")
+    )
     proc_summary: Dict[str, int] = dict(counts)
 
     report_data = {
diff --git a/linux_edr/trace.py b/linux_edr/trace.py
@@ -5,7 +5,8 @@
 import logging
 import time
 import re
-from typing import Generator, Optional, Union
+import mmap
+from typing import Generator, Optional, Union, Tuple, List
 
 # Default path to the kernel's trace_pipe
 TRACE_PATH = "/sys/kernel/tracing/trace_pipe"
@@ -24,6 +25,79 @@
     ExecveEvent, ForkEvent, CloneEvent, ConnectEvent, BaseSyscallEvent, UnparsedEvent
 )
 
+# ----------------------- Helpers to resolve execve pointers -----------------------
+
+def _read_string_from_mem(pid: int, address: int, max_len: int = 4096) -> Optional[str]:
+    """Read a NUL-terminated string from another process's memory (best effort).
+
+    Reads up to ``max_len`` bytes from ``/proc/<pid>/mem`` at ``address``, which
+    typically requires root / CAP_SYS_PTRACE. If no NUL byte is found within
+    that window, the truncated data is returned as-is.
+
+    Args:
+        pid: Process ID whose memory to read.
+        address: Address of the string in that process.
+        max_len: Maximum number of bytes to read; non-positive values yield None.
+
+    Returns:
+        Decoded string (invalid UTF-8 replaced) if successful, otherwise None.
+    """
+    # A raw read(-1) would mean "read everything"; reject bad sizes and offsets early.
+    if max_len <= 0 or address < 0:
+        return None
+    try:
+        with open(f"/proc/{pid}/mem", "rb", buffering=0) as mem_fd:
+            mem_fd.seek(address)
+            data = mem_fd.read(max_len)
+    except (OSError, ValueError, OverflowError):
+        # Process gone, access denied, or address unmapped / out of range.
+        return None
+    if not data:
+        return None
+    return data.split(b"\x00", 1)[0].decode("utf-8", errors="replace")
+
+def _resolve_execve_cmd(pid: int, filename_ptr: str) -> Tuple[str, List[str]]:
+    """Resolve the command (and, where possible, args) for a pointer-only execve trace.
+
+    Strategy:
+        1. Read the filename string via /proc/<pid>/mem at the supplied pointer.
+        2. If that yields nothing usable, fall back to /proc/<pid>/cmdline.
+
+    Both sources are read when this function runs, not when the syscall fired,
+    so the result is best effort. ``args`` is only filled in by the cmdline
+    fallback (argv[1:], empty entries dropped); a memory hit returns [].
+
+    Args:
+        pid: The PID from the trace event.
+        filename_ptr: Hex string pointer to filename (may include trailing comma).
+
+    Returns:
+        Tuple of (command basename, args list); ("<unknown>", []) if unresolved.
+    """
+    cmd: str = "<unknown>"
+    args: List[str] = []
+
+    try:
+        address = int(filename_ptr.strip().rstrip(","), 16)
+    except ValueError:
+        address = 0  # Malformed pointer token; go straight to the fallback.
+    if address and (s := _read_string_from_mem(pid, address)):
+        # A path ending in "/" has an empty basename; keep "<unknown>" then.
+        cmd = os.path.basename(s) or cmd
+
+    if cmd == "<unknown>":
+        try:
+            with open(f"/proc/{pid}/cmdline", "rb") as f:
+                parts = f.read().split(b"\x00")
+        except OSError:
+            parts = []  # The cmdline file could not be opened or read.
+        # An empty argv[0] is not a usable command name; stay unresolved.
+        if parts and (name := os.path.basename(parts[0].decode("utf-8", errors="replace"))):
+            cmd = name
+            args = [p.decode("utf-8", errors="replace") for p in parts[1:] if p]
+
+    return cmd, args
+
 class TraceReader:
     """
     Non-blocking reader for trace_pipe via selectors.
@@ -138,8 +212,18 @@ def _parse_line(self, line: str) -> Optional[BaseSyscallEvent]:
             ts, pid_str, cmd_args = m.groups()
             pid = int(pid_str)
             parts = cmd_args.split() if cmd_args else []
-            if parts:
-                return ExecveEvent(timestamp=ts, pid=pid, command=parts[0].strip('"'), args=parts[1:])
+            if not parts:
+                return None
+
+            # Typical trace contents: "filename: 7ffcb..., argv: 7ff..., envp: ..."
+            if parts[0].startswith("filename:"):
+                # Extract pointer after "filename:" which is at index 1
+                filename_ptr = parts[1] if len(parts) > 1 else "0"
+                command, argv = _resolve_execve_cmd(pid, filename_ptr)
+                return ExecveEvent(timestamp=ts, pid=pid, command=command, args=argv)
+
+            # Fallback: treat the first token as command as before
+            return ExecveEvent(timestamp=ts, pid=pid, command=parts[0].strip('"'), args=parts[1:])
 
         # fork
         if m := FORK_PATTERN.search(line):