NVIDIA · assinchu · Jun 24, 2026
diff --git a/src/skillspector/nodes/report.py b/src/skillspector/nodes/report.py
@@ -22,6 +22,8 @@
 from __future__ import annotations
 
 import json
+import re
+from dataclasses import replace
 from datetime import UTC, datetime
 from io import StringIO
 from typing import Literal
@@ -65,6 +67,38 @@
 }
 
 
+# Scanned skill content (and LLM output that quotes it) can carry ANSI escape
+# sequences and control bytes (e.g. NUL, ESC) into finding text. Left in, they
+# make a rendered report register as binary — GitLab/editors show "download"
+# instead of the Markdown, and terminals emit garbled output. Strip them from
+# every finding text field so all report formats stay clean UTF-8.
+_ANSI_RE = re.compile(r"\x1b\[[0-9;?]*[ -/]*[@-~]")
+_CONTROL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")
+
+# Finding free-text fields that may carry scanned/LLM content.
+_SANITIZED_FIELDS = (
+    "message",
+    "explanation",
+    "remediation",
+    "finding",
+    "context",
+    "matched_text",
+    "code_snippet",
+)
+
+
+def _clean_text(value: str | None) -> str | None:
+    """Strip ANSI escape sequences and disallowed control chars (keep tab/newline)."""
+    if not isinstance(value, str):
+        return value
+    return _CONTROL_RE.sub("", _ANSI_RE.sub("", value))
+
+
+def _sanitize_finding(finding: Finding) -> Finding:
+    """Return a copy of *finding* with control/ANSI bytes stripped from text fields."""
+    return replace(finding, **{f: _clean_text(getattr(finding, f)) for f in _SANITIZED_FIELDS})
+
+
 def _severity_to_sarif_level(severity: str) -> Literal["error", "warning", "note"]:
     """Map Finding.severity to SARIF result level."""
     return {
@@ -533,6 +567,10 @@ def report(state: SkillspectorState) -> dict[str, object]:
     """
     raw_findings = state.get("findings", [])
     filtered_findings = state.get("filtered_findings", raw_findings)
+    # Strip ANSI/control bytes once here so every downstream format (terminal,
+    # json, markdown, sarif) and the returned findings stay clean UTF-8. Applied
+    # before partition/dedup so active and suppressed findings are both clean.
+    filtered_findings = [_sanitize_finding(f) for f in filtered_findings]
     component_metadata = state.get("component_metadata") or []
     components = state.get("components") or []
     file_cache = state.get("file_cache") or {}

diff --git a/tests/nodes/test_report_sanitizer.py b/tests/nodes/test_report_sanitizer.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for report-output sanitization (ANSI / control-byte stripping)."""
+
+from __future__ import annotations
+
+import pytest
+
+from skillspector.models import Finding
+from skillspector.nodes.report import _clean_text, _sanitize_finding, report
+from skillspector.state import SkillspectorState
+
+
+def _dirty_finding() -> Finding:
+    return Finding(
+        rule_id="E2",
+        message="creds \x1b[31mleak\x1b[0m here\x00",
+        severity="HIGH",
+        confidence=0.9,
+        file="a/SKILL.md",
+        start_line=5,
+        remediation="redact \x1b[1mnow\x1b[0m",
+        context="line with \x07 bell and \x1b[0m reset",
+    )
+
+
+def test_clean_text_strips_ansi_and_control_keeps_readable() -> None:
+    assert _clean_text("a\x1b[31mb\x1b[0mc\x00d") == "abcd"
+    # Tabs and newlines are preserved.
+    assert _clean_text("a\tb\nc") == "a\tb\nc"
+    # Emoji / multibyte UTF-8 is untouched.
+    assert _clean_text("🔴 HIGH") == "🔴 HIGH"
+    # Non-strings pass through.
+    assert _clean_text(None) is None
+
+
+def test_sanitize_finding_cleans_text_fields_only() -> None:
+    cleaned = _sanitize_finding(_dirty_finding())
+    assert "\x1b" not in cleaned.message and "\x00" not in cleaned.message
+    assert "leak" in cleaned.message and "here" in cleaned.message
+    assert "\x1b" not in (cleaned.remediation or "")
+    assert "\x07" not in (cleaned.context or "")
+    # Non-text fields are unchanged.
+    assert cleaned.rule_id == "E2"
+    assert cleaned.start_line == 5
+
+
+@pytest.mark.parametrize("fmt", ["markdown", "json", "sarif", "terminal"])
+def test_report_emits_clean_utf8_for_all_formats(fmt: str) -> None:
+    """No ANSI/control bytes leak into any report format."""
+    state: SkillspectorState = {
+        "filtered_findings": [_dirty_finding()],
+        "component_metadata": [],
+        "has_executable_scripts": False,
+        "manifest": {},
+        "skill_path": None,
+        "output_format": fmt,
+    }
+    body = report(state)["report_body"]
+    assert "\x00" not in body, f"NUL leaked into {fmt}"
+    assert "\x1b" not in body, f"ESC leaked into {fmt}"
+    # The readable content survives the sanitization.
+    assert "leak" in body and "here" in body