-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
240 lines (195 loc) · 8.36 KB
/
main.py
File metadata and controls
240 lines (195 loc) · 8.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import streamlit as st
import fitz
from PIL import Image
import io
import os
from geometry_utils import geometry_selector
# ---------------------------------
# Suppress MuPDF warnings (harmless)
# ---------------------------------
# Turn off MuPDF's error/warning display for the whole process.
fitz.TOOLS.mupdf_display_errors(False)
# Configure the Streamlit page: browser title "PDF Wizard 2026", wide layout.
st.set_page_config(page_title="PDF Wizard 2026", layout="wide")
# ---------------------------------
# PDF RENDERING HELPERS
# ---------------------------------
def render_single_page_for_canvas(file_bytes, zoom=1.4):
    """Render the first page of a PDF for the Kill Zones canvas.

    Args:
        file_bytes: Raw bytes of the PDF document.
        zoom: Scale factor applied on both axes when rasterizing the page.

    Returns:
        A dict with keys:
            "page":   always 0 (only the first page is rendered),
            "image":  the rendered page as an RGB PIL image,
            "width":  page width taken from ``page.rect`` (unzoomed),
            "height": page height taken from ``page.rect`` (unzoomed),
            "bytes":  the rendered page encoded as PNG bytes.
    """
    # The context manager closes the document even if rendering raises,
    # so a malformed page cannot leak the underlying MuPDF handle.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        page = doc[0]
        width = page.rect.width
        height = page.rect.height
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        img_bytes = pix.tobytes("png")
        # convert() forces a full decode, so the image no longer depends on
        # the pixmap or the document once we leave this block.
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
    return {"page": 0, "image": img, "width": width, "height": height, "bytes": img_bytes}
@st.cache_data(show_spinner=False)
def render_previews(file_bytes, keywords=(), zoom=1.4, max_pages=25, x_offset=0, y_offset=0):
    """Render page previews for Kill Words, highlighting keyword matches.

    Args:
        file_bytes: Raw bytes of the PDF document.
        keywords: Iterable of strings to search for on each page. Pass a
            tuple so the argument is hashable for ``st.cache_data``.
        zoom: Scale factor applied on both axes when rasterizing.
        max_pages: Upper bound on the number of pages rendered.
        x_offset: Horizontal shift added to every match rectangle.
        y_offset: Vertical shift added to every match rectangle.

    Returns:
        A list of dicts, one per rendered page, with keys "page" (0-based
        index), "image" (PIL image), "width" and "height" (from
        ``page.rect``, unzoomed).
    """
    mat = fitz.Matrix(zoom, zoom)
    previews = []
    # The context manager guarantees the document is closed even when a page
    # fails to search or render.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for i in range(min(len(doc), max_pages)):
            page = doc[i]
            for word in keywords:
                for rect in page.search_for(word):
                    # Shift the hit rectangle by the user-supplied correction
                    # before drawing the highlight.
                    rect.x0 += x_offset
                    rect.x1 += x_offset
                    rect.y0 += y_offset
                    rect.y1 += y_offset
                    page.add_highlight_annot(rect)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            previews.append(
                {"page": i, "image": img, "width": page.rect.width, "height": page.rect.height}
            )
    return previews
def render_from_pdf_bytes(file_bytes, zoom=1.4, max_pages=10):
    """Convert PDF bytes to a list of PIL images (used for sanitized preview).

    Args:
        file_bytes: Raw bytes of the PDF document.
        zoom: Scale factor applied on both axes when rasterizing.
        max_pages: Upper bound on the number of pages rendered.

    Returns:
        A list of PIL images, one per rendered page, in page order.
    """
    mat = fitz.Matrix(zoom, zoom)
    # The context manager closes the document even if a page fails to render.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        page_count = min(len(doc), max_pages)
        return [
            Image.open(io.BytesIO(doc[i].get_pixmap(matrix=mat, alpha=False).tobytes("png")))
            for i in range(page_count)
        ]
# ---------------------------------
# REDACTION ENGINE
# ---------------------------------
class RedactorEngine:
    """Applies redactions to an in-memory PDF and exports the result.

    The engine keeps the original bytes so the working document can be
    re-opened from scratch via ``reset()``. Redactions are applied to the
    working document in place, so successive calls accumulate until reset.
    """

    def __init__(self, original_bytes):
        """Store the source PDF bytes and open the working document."""
        self.original_bytes = original_bytes
        self.doc = None
        self.reset()

    def reset(self):
        """Discard all applied redactions by re-opening the original bytes."""
        previous = getattr(self, "doc", None)
        # Open the fresh copy first so a failure leaves the old document usable.
        self.doc = fitz.open(stream=self.original_bytes, filetype="pdf")
        # Release the previous working document; without this every reset
        # leaks a MuPDF document handle.
        if previous is not None and not previous.is_closed:
            previous.close()

    def kill_by_coordinates(self, coords, cw, ch, x_offset=0, y_offset=0):
        """Redact the same rectangles on every page of the document.

        Args:
            coords: Iterable of ``(x0, y0, x1, y1)`` rectangles expressed in
                canvas coordinates.
            cw: Canvas width the coordinates refer to; must be positive.
            ch: Canvas height the coordinates refer to; must be positive.
            x_offset: Horizontal shift added after scaling to page space.
            y_offset: Vertical shift added after scaling to page space.

        Returns:
            The redacted PDF as bytes.

        Raises:
            ValueError: If ``cw`` or ``ch`` is not positive (scaling would
                otherwise divide by zero or mirror the rectangles).
        """
        if cw <= 0 or ch <= 0:
            raise ValueError(
                f"Canvas dimensions must be positive, got cw={cw!r}, ch={ch!r}"
            )
        for page in self.doc:
            # Canvas -> page scale factors are constant for a given page.
            scale_x = page.rect.width / cw
            scale_y = page.rect.height / ch
            for x0, y0, x1, y1 in coords:
                target = fitz.Rect(
                    x0 * scale_x + x_offset,
                    y0 * scale_y + y_offset,
                    x1 * scale_x + x_offset,
                    y1 * scale_y + y_offset,
                )
                page.add_redact_annot(target, fill=(0, 0, 0))
            # images=2: per the PyMuPDF docs, blank out image pixels that
            # overlap a redaction rectangle.
            page.apply_redactions(images=2)
        return self._export()

    def sanitize_keywords(self, keywords, progress, x_offset=0, y_offset=0):
        """Redact every occurrence of the given keywords on every page.

        Args:
            keywords: Iterable of strings to search for.
            progress: Object with a ``progress(fraction)`` method; it is
                called once per page with ``(page_index + 1) / page_count``.
            x_offset: Horizontal shift added to every match rectangle.
            y_offset: Vertical shift added to every match rectangle.

        Returns:
            The redacted PDF as bytes.
        """
        total = len(self.doc)
        for i, page in enumerate(self.doc):
            progress.progress((i + 1) / total)
            for word in keywords:
                for rect in page.search_for(word):
                    rect.x0 += x_offset
                    rect.x1 += x_offset
                    rect.y0 += y_offset
                    rect.y1 += y_offset
                    page.add_redact_annot(rect, fill=(0, 0, 0))
            # images=0: per the PyMuPDF docs, leave images untouched.
            page.apply_redactions(images=0)
        return self._export()

    def _export(self):
        """Serialize the working document to bytes (garbage-collected, deflated)."""
        out = io.BytesIO()
        self.doc.save(out, garbage=4, deflate=True)
        return out.getvalue()
# ---------------------------------
# UI
# ---------------------------------
def main():
    """Run the Streamlit UI: upload a PDF, redact by keyword or zone, preview, download.

    Per-session state lives in ``st.session_state``:
        engine:        the ``RedactorEngine`` for the current upload,
        last_file:     name of the upload the engine was built from,
        keywords:      keywords currently highlighted / eligible for sanitizing,
        sanitized_pdf: bytes of the last redacted export, or ``None``.
    """
    st.sidebar.title("PDF Censor")
    st.markdown("# PDF Censor")
    st.markdown(
        "**⚠️ 100% Privacy Guaranteed — Uploaded PDFs are never stored, logged, or shared.** "
        "All processing happens in-memory."
    )

    uploaded = st.sidebar.file_uploader("Upload PDF", type="pdf")
    if not uploaded:
        st.info("Upload a source PDF to begin analysis.")
        return

    file_bytes = uploaded.getvalue()

    # (Re)build the engine the first time through or when a differently named
    # file is uploaded.
    if "engine" not in st.session_state or st.session_state.get("last_file") != uploaded.name:
        st.session_state.engine = RedactorEngine(file_bytes)
        st.session_state.last_file = uploaded.name
        st.session_state.keywords = []
        st.session_state.sanitized_pdf = None

    base, ext = os.path.splitext(uploaded.name)
    sanitized_name = f"{base}_sanitized{ext}"

    st.sidebar.divider()
    with st.sidebar.expander("Offset Correction (points)", expanded=False):
        x_offset = st.number_input("Horizontal Offset (→ / ←)", value=0.0)
        y_offset = st.number_input("Vertical Offset (↓ / ↑)", value=0.0)

    def reset_application_state():
        """Purge all engine data and force-reset UI widgets."""
        st.session_state.engine.reset()
        st.session_state.sanitized_pdf = None
        # Clear the keyword list this function actually reads elsewhere;
        # otherwise highlights and the Sanitize button survive a reset.
        st.session_state.keywords = []
        # NOTE(review): these two keys are not read anywhere in this file;
        # kept in case another module depends on them — confirm and remove.
        st.session_state.word_previews = []
        st.session_state.active_keywords = []
        # Deleting a widget's key makes it fall back to its default value
        # on the next run.
        for widget_key in ("mode_selection", "kill_zone_canvas"):
            if widget_key in st.session_state:
                del st.session_state[widget_key]
        st.cache_data.clear()

    if st.sidebar.button("🔄 Reset All Changes", width="stretch"):
        reset_application_state()
        st.rerun()

    # The key binds the radio to st.session_state so the reset above can
    # clear it.
    mode = st.sidebar.radio(
        "Mode",
        ["Kill Words", "Kill Zones"],
        key="mode_selection",
    )

    # -------------------------
    # KILL ZONES
    # -------------------------
    kill_zone_page = None
    if mode == "Kill Zones":
        kill_zone_page = render_single_page_for_canvas(file_bytes)
        rects, cw, ch = geometry_selector(kill_zone_page)
        if rects and st.sidebar.button("Execute Global Kill", type="primary"):
            st.session_state.sanitized_pdf = st.session_state.engine.kill_by_coordinates(
                rects, cw, ch, x_offset, y_offset
            )

    # -------------------------
    # KILL WORDS
    # -------------------------
    kill_word_previews = []
    if mode == "Kill Words":
        raw = st.sidebar.text_area(
            "Keywords (comma separated)",
            placeholder="e.g., Confidential, John Doe, 123-45-6789",
        )
        if st.sidebar.button("Apply Highlights"):
            st.session_state.keywords = [k.strip() for k in raw.split(",") if k.strip()]
            # New highlights invalidate any previously sanitized output.
            st.session_state.sanitized_pdf = None

        # Keywords are passed as a tuple so the cached renderer can hash them.
        kill_word_previews = render_previews(
            file_bytes,
            tuple(st.session_state.keywords),
            x_offset=x_offset,
            y_offset=y_offset,
        )

        if st.session_state.keywords and st.sidebar.button("Sanitize", type="primary"):
            with st.status("Purging Data…"):
                bar = st.progress(0)
                st.session_state.sanitized_pdf = st.session_state.engine.sanitize_keywords(
                    st.session_state.keywords, bar, x_offset, y_offset
                )

    # -------------------------
    # PREVIEW AREA
    # -------------------------
    if st.session_state.sanitized_pdf:
        st.subheader("Sanitized Preview")
        imgs = render_from_pdf_bytes(st.session_state.sanitized_pdf)
        for i, img in enumerate(imgs):
            st.image(img, caption=f"Sanitized Page {i+1}", width="stretch")
        st.sidebar.download_button(
            "📥 Download PDF", st.session_state.sanitized_pdf, sanitized_name, width="stretch"
        )
    else:
        st.subheader("Source Preview")
        if mode == "Kill Zones" and kill_zone_page:
            st.image(kill_zone_page["image"], caption="Page 1", width="stretch")
        elif mode == "Kill Words":
            for p in kill_word_previews:
                st.image(p["image"], caption=f"Page {p['page']+1}", width="stretch")
# Run the app when this file is executed as the main module.
if __name__ == "__main__":
    main()