Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions src/skillspector/nodes/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from __future__ import annotations

import json
import re
from dataclasses import replace
from datetime import UTC, datetime
from io import StringIO
from typing import Literal
Expand Down Expand Up @@ -65,6 +67,38 @@
}


# Scanned skill content (and LLM output that quotes it) can carry ANSI escape
# sequences and control bytes (e.g. NUL, ESC) into finding text. Left in, they
# make a rendered report register as binary — GitLab/editors show "download"
# instead of the Markdown, and terminals emit garbled output. Strip them from
# every finding text field so all report formats stay clean UTF-8.
_ANSI_RE = re.compile(r"\x1b\[[0-9;?]*[ -/]*[@-~]")
_CONTROL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")

# Finding free-text fields that may carry scanned/LLM content.
_SANITIZED_FIELDS = (
"message",
"explanation",
"remediation",
"finding",
"context",
"matched_text",
"code_snippet",
)


def _clean_text(value: str | None) -> str | None:
"""Strip ANSI escape sequences and disallowed control chars (keep tab/newline)."""
if not isinstance(value, str):
return value
return _CONTROL_RE.sub("", _ANSI_RE.sub("", value))


def _sanitize_finding(finding: Finding) -> Finding:
"""Return a copy of *finding* with control/ANSI bytes stripped from text fields."""
return replace(finding, **{f: _clean_text(getattr(finding, f)) for f in _SANITIZED_FIELDS})


def _severity_to_sarif_level(severity: str) -> Literal["error", "warning", "note"]:
"""Map Finding.severity to SARIF result level."""
return {
Expand Down Expand Up @@ -533,6 +567,10 @@ def report(state: SkillspectorState) -> dict[str, object]:
"""
raw_findings = state.get("findings", [])
filtered_findings = state.get("filtered_findings", raw_findings)
# Strip ANSI/control bytes once here so every downstream format (terminal,
# json, markdown, sarif) and the returned findings stay clean UTF-8. Applied
# before partition/dedup so active and suppressed findings are both clean.
filtered_findings = [_sanitize_finding(f) for f in filtered_findings]
component_metadata = state.get("component_metadata") or []
components = state.get("components") or []
file_cache = state.get("file_cache") or {}
Expand Down
76 changes: 76 additions & 0 deletions tests/nodes/test_report_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for report-output sanitization (ANSI / control-byte stripping)."""

from __future__ import annotations

import pytest

from skillspector.models import Finding
from skillspector.nodes.report import _clean_text, _sanitize_finding, report
from skillspector.state import SkillspectorState


def _dirty_finding() -> Finding:
return Finding(
rule_id="E2",
message="creds \x1b[31mleak\x1b[0m here\x00",
severity="HIGH",
confidence=0.9,
file="a/SKILL.md",
start_line=5,
remediation="redact \x1b[1mnow\x1b[0m",
context="line with \x07 bell and \x1b[0m reset",
)


def test_clean_text_strips_ansi_and_control_keeps_readable() -> None:
assert _clean_text("a\x1b[31mb\x1b[0mc\x00d") == "abcd"
# Tabs and newlines are preserved.
assert _clean_text("a\tb\nc") == "a\tb\nc"
# Emoji / multibyte UTF-8 is untouched.
assert _clean_text("🔴 HIGH") == "🔴 HIGH"
# Non-strings pass through.
assert _clean_text(None) is None


def test_sanitize_finding_cleans_text_fields_only() -> None:
cleaned = _sanitize_finding(_dirty_finding())
assert "\x1b" not in cleaned.message and "\x00" not in cleaned.message
assert "leak" in cleaned.message and "here" in cleaned.message
assert "\x1b" not in (cleaned.remediation or "")
assert "\x07" not in (cleaned.context or "")
# Non-text fields are unchanged.
assert cleaned.rule_id == "E2"
assert cleaned.start_line == 5


@pytest.mark.parametrize("fmt", ["markdown", "json", "sarif", "terminal"])
def test_report_emits_clean_utf8_for_all_formats(fmt: str) -> None:
"""No ANSI/control bytes leak into any report format."""
state: SkillspectorState = {
"filtered_findings": [_dirty_finding()],
"component_metadata": [],
"has_executable_scripts": False,
"manifest": {},
"skill_path": None,
"output_format": fmt,
}
body = report(state)["report_body"]
assert "\x00" not in body, f"NUL leaked into {fmt}"
assert "\x1b" not in body, f"ESC leaked into {fmt}"
# The readable content survives the sanitization.
assert "leak" in body and "here" in body