From 752d01a6f3b3bd6a0220967414411890adc10006 Mon Sep 17 00:00:00 2001
From: Dmitry Sidorov <dmitrii.s.sidorov@gmail.com>
Date: Thu, 14 Aug 2025 02:17:37 +0200
Subject: [PATCH] Add an ability to print CFG to pdf and display it in the
 output

Just use opt --passes=dot-cfg and the magic happens.
---
 .gitignore                 |   5 +
 Dockerfile                 |   4 +-
 Dockerfile.backend         |   4 +-
 README.md                  |   5 +-
 backend/server.py          | 263 ++++++++++++++++++++++++++++---------
 session_temps/.gitkeep     |   1 +
 setup_backend.sh           |   3 +-
 src/app/ExplorerContent.js | 101 ++++++++++++--
 tests/test_opt_dot_pdf.py  |  40 ++++++
 9 files changed, 347 insertions(+), 79 deletions(-)
 create mode 100644 session_temps/.gitkeep
 create mode 100644 tests/test_opt_dot_pdf.py

diff --git a/.gitignore b/.gitignore
index 04ccd0d..5ee2f45 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,8 @@ next-env.d.ts
 
 # ignore stored sessions
 StoredSessions/*
+!StoredSessions/.gitkeep
+
+# session temps
+session_temps/*
+!session_temps/.gitkeep
diff --git a/Dockerfile b/Dockerfile
index 4c3789b..f2b5bbf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,7 +16,7 @@ WORKDIR /app
 # Install minimal tooling
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-      ca-certificates wget curl gnupg lsb-release software-properties-common && \
+      ca-certificates wget curl gnupg lsb-release software-properties-common graphviz && \
     rm -rf /var/lib/apt/lists/*
 
 # Add LLVM 22 repository
@@ -46,7 +46,7 @@ RUN python3 -m venv /opt/venv && \
     /opt/venv/bin/pip install --pre torch-mlir torchvision \
       --extra-index-url=https://download.pytorch.org/whl/nightly/cpu \
       -f https://github.com/llvm/torch-mlir-release/releases/expanded_assets/dev-wheels && \
-    /opt/venv/bin/pip install triton fastapi uvicorn pytest httpx
+    /opt/venv/bin/pip install triton fastapi uvicorn pytest httpx PyPDF2
 
 # Create non-root user and fix permissions
 RUN useradd -u 10001 -m --shell /usr/sbin/nologin appuser && \
diff --git a/Dockerfile.backend b/Dockerfile.backend
index 0308dba..2283aa7 100644
--- a/Dockerfile.backend
+++ b/Dockerfile.backend
@@ -9,7 +9,7 @@ WORKDIR /app
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-      ca-certificates wget curl gnupg lsb-release software-properties-common && \
+      ca-certificates wget curl gnupg lsb-release software-properties-common graphviz && \
     rm -rf /var/lib/apt/lists/*
 
 RUN useradd -u 10001 -m --shell /usr/sbin/nologin appuser && \
@@ -28,7 +28,7 @@ RUN python3 -m venv /opt/venv && \
     /opt/venv/bin/pip install --pre torch-mlir torchvision \
       --extra-index-url=https://download.pytorch.org/whl/nightly/cpu \
       -f https://github.com/llvm/torch-mlir-release/releases/expanded_assets/dev-wheels && \
-    /opt/venv/bin/pip install triton fastapi uvicorn pydantic
+    /opt/venv/bin/pip install triton fastapi uvicorn pydantic PyPDF2
 
 RUN chown -R appuser:appuser /home/appuser/.cache /app
 
diff --git a/README.md b/README.md
index 835e25b..38dcf6f 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ tracing models through various IR stages and transformations.
 - Torch-MLIR
 - LLVM with mlir-opt
 - Triton
+- graphviz - needed if you want PytorchExplorer to render the CFG of LLVM IR as a PDF.
 
 To setup PyTorch and Torch-MLIR it's a good idea to visit https://github.com/llvm/torch-mlir repository and follow instructions from there.
 
@@ -85,7 +86,7 @@ source setup_backend.sh
 
 If you already have a working venv for Torch-MLIR, you can just install FastAPI and testing dependencies:
 ```bash
-pip install fastapi uvicorn pytest httpx
+pip install fastapi uvicorn pytest httpx PyPDF2
 ```
 
 ### Run the application
@@ -200,6 +201,8 @@ on the right.
    individually.
 5. Hit **Store Session** to save your work. The backend returns a short ID which
    can be appended to the URL (e.g. `/abc123`) to reload the same session later.
+6. To render the CFG of LLVM IR as a PDF, run the standard LLVM
+   `opt --passes=dot-cfg`; the CFG will be rendered in the output window.
 
 ## Implementation details
 
diff --git a/backend/server.py b/backend/server.py
index 73dda68..ef67041 100644
--- a/backend/server.py
+++ b/backend/server.py
@@ -9,8 +9,10 @@
 import atexit
 import logging
 import traceback
+import base64
 from typing import List, Optional, Tuple
 import re
+from pathlib import Path
 
 from contextlib import redirect_stdout, redirect_stderr
 
@@ -31,6 +33,11 @@
     CompilerPipelineError,
 )
 
+try:
+    from PyPDF2 import PdfMerger
+except Exception:
+    PdfMerger = None
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
@@ -46,6 +53,14 @@
 
 cached_triton_runs = {}
 
+# Where to store per-request temporary artifacts (DOT/PDF/IR)
+# Default: <project_root>/session_temps, override with PE_SESSION_TEMPS
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+SESSION_TEMPS_ROOT = os.environ.get(
+    "PE_SESSION_TEMPS", str(PROJECT_ROOT / "session_temps")
+)
+os.makedirs(SESSION_TEMPS_ROOT, exist_ok=True)
+
 TORCH_MLIR_OPT_PATH = os.environ.get("TORCH_MLIR_OPT_PATH", "")
 LLVM_BIN_PATH = os.environ.get("LLVM_BIN_PATH", "")
 TRITON_OPT_PATH = os.environ.get("TRITON_OPT_PATH", "")
@@ -132,11 +147,13 @@ def split_cmd_arguments(cmd: str) -> List[str]:
 
 # Run torch-mlir-opt and/or mlir-opt and/or opt etc.
 def run_external_opt_tool_file(
-    input_path: str, cmd: str, tool: str, output_path: str
+    input_path: str, cmd: str, tool: str, output_path: str, cwd: Optional[str] = None
 ) -> Tuple[bool, str]:
     args = [tool] + split_cmd_arguments(cmd) + [input_path, "-o", output_path]
     try:
-        result = subprocess.run(args, capture_output=True, text=True, check=True)
+        result = subprocess.run(
+            args, capture_output=True, text=True, check=True, cwd=cwd
+        )
         return (True, result.stderr or "")
     except subprocess.CalledProcessError as e:
         logger.error(
@@ -151,80 +168,202 @@ def run_external_opt_tool_file(
         raise CompilerPipelineError(f"Unexpected error while running '{tool}': {e}")
 
 
+def _read_file_safe(path: str) -> Tuple[str, bool]:
+    # Read a file returning its text or base64 if binary.
+    # Returns a tuple of (content, is_binary). If the file cannot be decoded as
+    # UTF-8, it is assumed to be binary and returned base64-encoded.
+
+    with open(path, "rb") as f:
+        data = f.read()
+    try:
+        return data.decode("utf-8"), False
+    except UnicodeDecodeError:
+        encoded = base64.b64encode(data).decode("utf-8")
+        return encoded, True
+
+
 # Utility for custom pipeline.
 def apply_optional_passes(
     ir: str, pipeline: List[Tuple[str, str]], dump_each: bool = False
 ) -> str:
     uid = uuid.uuid4().hex
     output = ""
-    temp_files = []
+    pdf_blocks: List[Tuple[str, str]] = []
 
-    # Step 1: Write initial IR to a file.
-    with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f:
-        f.write(ir)
-        f.flush()
-        prev_path = f.name
-        temp_files.append(prev_path)
-
-    if dump_each:
-        output += f"\n\n===== Initial IR =====\n{ir}"
-
-    # Step 2: Apply pipeline stages.
-    for index, (tool, flags) in enumerate(pipeline):
-        tool_path = None
-
-        if tool == "torch-mlir-opt":
-            tool_path = os.path.join(TORCH_MLIR_OPT_PATH, "torch-mlir-opt")
-        elif tool == "mlir-opt":
-            tool_path = os.path.join(LLVM_BIN_PATH, "mlir-opt")
-        elif tool == "mlir-translate":
-            tool_path = os.path.join(LLVM_BIN_PATH, "mlir-translate")
-        elif tool == "opt":
-            flags += " -S"
-            tool_path = os.path.join(LLVM_BIN_PATH, "opt")
-        elif tool == "llc":
-            tool_path = os.path.join(LLVM_BIN_PATH, "llc")
-        elif tool == "triton-opt":
-            tool_path = os.path.join(TRITON_OPT_PATH, "triton-opt")
-        elif tool == "triton-llvm-opt":
-            tool_path = os.path.join(TRITON_OPT_PATH, "triton-llvm-opt")
-        elif tool == "user-tool":
-            tokens = split_cmd_arguments(flags)
-            if not tokens:
-                raise CompilerPipelineError("Empty user-tool invocation")
-            tool_path = tokens[0]
-            flags = " ".join(tokens[1:])
-        else:
-            raise CompilerPipelineError(f"Unknown pipeline tool: '{tool}'")
+    # Make one session dir under SESSION_TEMPS_ROOT (default: <project_root>/session_temps).
+    session_dir = tempfile.mkdtemp(dir=SESSION_TEMPS_ROOT, prefix=f"sess_{uid}_")
+    try:
+        # Step 1: Write initial IR into the session dir.
+        prev_path = os.path.join(session_dir, f"ir_init_{uid}.ll")
+        with open(prev_path, "w") as f:
+            f.write(ir)
 
-        out_path = os.path.join(tempfile.gettempdir(), f"ir_step_{index}_{uid}")
-        temp_files.append(out_path)
+        if dump_each:
+            output += f"\n\n===== Initial IR =====\n{ir}"
+
+        # Step 2: Apply pipeline stages (all I/O + cwd inside session_dir).
+        for index, (tool, flags) in enumerate(pipeline):
+            if tool == "torch-mlir-opt":
+                tool_path = os.path.join(TORCH_MLIR_OPT_PATH, "torch-mlir-opt")
+            elif tool == "mlir-opt":
+                tool_path = os.path.join(LLVM_BIN_PATH, "mlir-opt")
+            elif tool == "mlir-translate":
+                tool_path = os.path.join(LLVM_BIN_PATH, "mlir-translate")
+            elif tool == "opt":
+                flags += " -S"
+                tool_path = os.path.join(LLVM_BIN_PATH, "opt")
+            elif tool == "llc":
+                tool_path = os.path.join(LLVM_BIN_PATH, "llc")
+            elif tool == "triton-opt":
+                tool_path = os.path.join(TRITON_OPT_PATH, "triton-opt")
+            elif tool == "triton-llvm-opt":
+                tool_path = os.path.join(TRITON_OPT_PATH, "triton-llvm-opt")
+            elif tool == "user-tool":
+                tokens = split_cmd_arguments(flags)
+                if not tokens:
+                    raise CompilerPipelineError("Empty user-tool invocation")
+                tool_path = tokens[0]
+                flags = " ".join(tokens[1:])
+                if os.path.basename(tool_path) == "opt" and "-S" not in flags.split():
+                    flags += " -S"
+            else:
+                raise CompilerPipelineError(f"Unknown pipeline tool: '{tool}'")
 
-        success, stderr = run_external_opt_tool_file(
-            prev_path, flags, tool_path, out_path
-        )
-        if not success:
-            raise CompilerPipelineError(f"{tool} failed: {stderr}")
+            out_path = os.path.join(session_dir, f"ir_step_{index}_{uid}.ll")
 
-        if dump_each:
-            with open(out_path, "r") as f:
-                stage_output = f.read()
-            output += f"\n\n===== IR after {tool} {flags} =====\n{stage_output}"
+            if "dot-cfg" in flags and "--dot-cfg-dir=" not in flags:
+                flags += f" --dot-cfg-dir={session_dir}"
 
-        prev_path = out_path
+            # Run the tool with cwd=session_dir so DOTs land here.
+            success, stderr = run_external_opt_tool_file(
+                prev_path, flags, tool_path, out_path, cwd=session_dir
+            )
+            if not success:
+                raise CompilerPipelineError(f"{tool} failed: {stderr}")
+
+            # Collect DOTs from session_dir -> convert to PDFs -> merge -> attach.
+            dot_files = sorted(
+                set(
+                    glob.glob(os.path.join(session_dir, "*.dot"))
+                    + glob.glob(os.path.join(session_dir, ".*.dot"))
+                )
+            )
+            pdf_paths: List[str] = []
 
-    if not dump_each:
-        with open(prev_path, "r") as f:
-            output = f.read()
+            # Only warn if user requested dot-cfg but no DOTs appeared.
+            if "dot-cfg" in flags and not dot_files:
+                logger.warning(
+                    "No *.dot emitted by -passes=dot-cfg; checked %s", session_dir
+                )
 
-    # Cleanup.
-    for path in temp_files:
-        try:
-            os.remove(path)
-        except Exception:
-            pass
+            # Convert DOT -> PDF.
+            if dot_files:
+                if not shutil.which("dot"):
+                    logger.error(
+                        "'dot' (graphviz) not found on PATH; cannot render CFG PDFs."
+                    )
+                else:
+                    for df in sorted(set(dot_files)):
+                        pdf_path = os.path.splitext(df)[0] + ".pdf"
+                        try:
+                            subprocess.run(
+                                ["dot", "-Tpdf", df, "-o", pdf_path], check=True
+                            )
+                            pdf_paths.append(pdf_path)
+                        except Exception as e:
+                            logger.error(f"Failed to convert {df} to PDF: {e}")
+
+                # Remove DOTs after conversion (keep PDFs).
+                for df in dot_files:
+                    try:
+                        os.remove(df)
+                    except Exception:
+                        pass
+
+            def _encode_and_attach(path_to_pdf: str):
+                try:
+                    with open(path_to_pdf, "rb") as pf:
+                        encoded = base64.b64encode(pf.read()).decode("utf-8")
+                    pdf_blocks.append((os.path.basename(path_to_pdf), encoded))
+                    if dump_each:
+                        nonlocal output
+                        output += f"\n\n===== DOT PDF {os.path.basename(path_to_pdf)} =====\n{encoded}"
+                except Exception as e:
+                    logger.error(f"Failed to read PDF {path_to_pdf}: {e}")
+
+            # Merge this stage's PDFs into a single file (also runs when there is only one).
+            if pdf_paths:
+                merged_ok = False
+                merged_path = os.path.join(session_dir, f"cfg-merged-stage-{index}.pdf")
+
+                if PdfMerger is not None:
+                    try:
+                        merger = PdfMerger()
+                        for p in sorted(pdf_paths):
+                            merger.append(p)
+                        with open(merged_path, "wb") as mf:
+                            merger.write(mf)
+                        merger.close()
+                        _encode_and_attach(merged_path)
+                        merged_ok = True
+                    except Exception as e:
+                        logger.error(f"PDF merge via PyPDF2 failed: {e}")
+
+                if not merged_ok and shutil.which("pdfunite"):
+                    try:
+                        cmd = ["pdfunite"] + sorted(pdf_paths) + [merged_path]
+                        subprocess.run(cmd, check=True)
+                        _encode_and_attach(merged_path)
+                        merged_ok = True
+                    except Exception as e:
+                        logger.error(f"PDF merge via pdfunite failed: {e}")
+
+                if not merged_ok:
+                    # Fall back to attaching individual PDFs.
+                    for p in sorted(pdf_paths):
+                        _encode_and_attach(p)
+
+            # Handle analysis-only stages (no output file produced).
+            wrote_output = os.path.exists(out_path)
+
+            if dump_each:
+                path_to_show = out_path if wrote_output else prev_path
+                stage_output, is_binary = _read_file_safe(path_to_show)
+                if not wrote_output:
+                    output += f"\n\n===== IR after {tool} {flags} (no new output; IR unchanged) =====\n"
+                else:
+                    output += f"\n\n===== IR after {tool} {flags} =====\n"
+                output += stage_output
+
+            if wrote_output:
+                prev_path = out_path
+
+        # Final assembly.
+        if not dump_each:
+            with open(prev_path, "rb") as f:
+                data = f.read()
+            try:
+                output = data.decode("utf-8")
+            except UnicodeDecodeError:
+                encoded = base64.b64encode(data).decode("utf-8")
+                if data.startswith(b"%PDF"):
+                    pdf_blocks.insert(
+                        0, (os.path.basename(prev_path) + ".pdf", encoded)
+                    )
+                    output = ""
+                else:
+                    output = f"===== BINARY OUTPUT {os.path.basename(prev_path)} =====\n{encoded}"
+
+            for name, encoded in pdf_blocks:
+                if output:
+                    output += "\n\n"
+                output += f"===== DOT PDF {name} =====\n{encoded}"
 
-    return output
+        return output
+
+    finally:
+        # Nuke the whole session dir; PDFs/DOTs/IR intermediates are all ephemeral.
+        shutil.rmtree(session_dir, ignore_errors=True)
 
 
 # Torch graph IR.
diff --git a/session_temps/.gitkeep b/session_temps/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/session_temps/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/setup_backend.sh b/setup_backend.sh
index 2ba1e20..f855dcd 100644
--- a/setup_backend.sh
+++ b/setup_backend.sh
@@ -16,7 +16,8 @@ sudo apt-get update
 sudo apt-get -y install \
     llvm-22-dev \
     llvm-22-tools \
-    mlir-22-tools
+    mlir-22-tools \
+    graphviz
 
 echo "Exporting LLVM 22 tools path..."
 export PATH=/usr/lib/llvm-22/bin:$PATH
diff --git a/src/app/ExplorerContent.js b/src/app/ExplorerContent.js
index 92f5425..7b0976b 100644
--- a/src/app/ExplorerContent.js
+++ b/src/app/ExplorerContent.js
@@ -540,6 +540,24 @@ export default function ExplorerContent() {
       setLocalCmd(customToolCmd[irWin.id] || "");
     }, [customToolCmd, irWin.id]);
 
+    const extractPdfs = (output) => {
+      const regex = /===== DOT PDF ([^=]+?) =====\n([\s\S]*?)(?=\n=====|$)/g;
+      let match;
+      let lastIndex = 0;
+      let text = "";
+      const pdfs = [];
+      while ((match = regex.exec(output)) !== null) {
+        text += output.slice(lastIndex, match.index);
+        pdfs.push({
+          name: match[1],
+          data: match[2].trim().replace(/\s+/g, ""),
+        });
+        lastIndex = regex.lastIndex;
+      }
+      text += output.slice(lastIndex);
+      return { textContent: text.trim(), pdfs };
+    };
+
     return (
       <>
         <div
@@ -945,23 +963,84 @@ export default function ExplorerContent() {
             </div>
             <div
               style={{
+                display: "flex",
+                flexDirection: "column",
                 flex: 1,
                 minHeight: 0,
-                overflow: "auto",
               }}
             >
-              <Editor
-                height="100%"
-                language="mlir"
-                value={
+              {(() => {
+                const displayOutput =
                   exploreStage.windowId === irWin.id
                     ? filterToStage(irWin.output, exploreStage.stageIdx)
-                    : irWin.output
-                }
-                onChange={() => {}}
-                theme={theme === "light" ? "mlirTheme" : "mlirThemeDark"}
-                options={{ readOnly: true }}
-              />
+                    : irWin.output;
+                const { textContent, pdfs } = extractPdfs(displayOutput);
+
+                const editorOnly = pdfs.length === 0;
+
+                return (
+                  <>
+                    {/* Keep editor full-height if no PDFs; otherwise give it a small fixed height */}
+                    <div
+                      style={{
+                        flex: editorOnly ? "1 1 0" : "0 0 auto",
+                        minHeight: 0,
+                      }}
+                    >
+                      <Editor
+                        height={editorOnly ? "100%" : "260px"}
+                        language="mlir"
+                        value={textContent}
+                        onChange={() => {}}
+                        theme={
+                          theme === "light" ? "mlirTheme" : "mlirThemeDark"
+                        }
+                        options={{ readOnly: true }}
+                      />
+                    </div>
+
+                    {/* PDF area fills all remaining space */}
+                    {pdfs.length > 0 && (
+                      <div
+                        style={{
+                          flex: "1 1 0",
+                          minHeight: 0,
+                          overflow: "auto",
+                        }}
+                      >
+                        {pdfs.length === 1 ? (
+                          <iframe
+                            title={pdfs[0].name}
+                            src={`data:application/pdf;base64,${pdfs[0].data}`}
+                            style={{
+                              width: "100%",
+                              height: "100%",
+                              border: "none",
+                            }}
+                          />
+                        ) : (
                          // If there are multiple PDFs, stack them in a grid;
                          // each gets a fixed 480px height and the container scrolls.
+                          <div style={{ display: "grid", gap: 8 }}>
+                            {pdfs.map((p, i) => (
+                              <iframe
+                                key={i}
+                                title={p.name}
+                                src={`data:application/pdf;base64,${p.data}`}
+                                style={{
+                                  width: "100%",
+                                  height: "480px",
+                                  border: "none",
+                                }}
+                              />
+                            ))}
+                          </div>
+                        )}
+                      </div>
+                    )}
+                  </>
+                );
+              })()}
             </div>
           </>
         )}
diff --git a/tests/test_opt_dot_pdf.py b/tests/test_opt_dot_pdf.py
new file mode 100644
index 0000000..be3a199
--- /dev/null
+++ b/tests/test_opt_dot_pdf.py
@@ -0,0 +1,40 @@
+import os
+import httpx
+import pytest
+
+BASE_URL = os.environ.get("API_URL", "http://localhost:8000/generate_ir")
+
+LLVM_IR = """
+define i32 @foo(i32 %x) {
+entry:
+  %0 = add i32 %x, 1
+  ret i32 %0
+}
+"""
+
+
+def test_opt_dot_generates_pdf():
+    payload = {
+        "code": LLVM_IR,
+        "ir_type": "raw_ir",
+        "selected_language": "raw_ir",
+        "torch_mlir_opt": "",
+        "mlir_opt": "",
+        "mlir_translate": "",
+        "llvm_opt": "-passes=dot-cfg",
+        "llc": "",
+        "triton_opt": "",
+        "triton_llvm_opt": "",
+        "user_tool": "",
+        "dump_after_each_opt": False,
+    }
+    resp = httpx.post(BASE_URL, json=payload)
+    assert resp.status_code == 200, resp.text
+    out = resp.json()["output"]
+
+    # IR is still printed
+    assert "define i32 @foo" in out
+
+    # At least one embedded PDF block from the DOT conversion
+    assert "===== DOT PDF " in out
+    assert "JVBERi0" in out  # base64 PDF header