ScanIt/server.py at main · Didfu/ScanIt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#!/usr/bin/env python
import uvicorn
import argparse
import connexion
import os
from flask import send_from_directory, redirect, request, send_file, jsonify
from flask_cors import CORS
from backend import AVAILABLE_MODELS
from dotenv import load_dotenv
from io import BytesIO
import json
from datetime import datetime
import pdfplumber
import logging
import tempfile
logging.getLogger("pdfminer").setLevel(logging.ERROR)
from connexion import request as connexion_request


# === PATH SETUP ===
# Only go one level up (project root = gltr/)
# PROJECT_ROOT is the parent of the directory that contains this file.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
# Location of the .env file that is loaded in the ENVIRONMENT section below.
dotenv_path = os.path.join(PROJECT_ROOT, ".env")

# Custom PDF save directory inside project root
# (created at import time; generated reports are written here and served
# back by the download_pdf route)
pdf_dir = os.path.join(PROJECT_ROOT, "reports")
os.makedirs(pdf_dir, exist_ok=True)

# Temporary PDF uploads directory
# (a "scanit_uploads" folder under the system temp dir, created at import time)
temp_pdf_dir = os.path.join(tempfile.gettempdir(), "scanit_uploads")
os.makedirs(temp_pdf_dir, exist_ok=True)


# === ENVIRONMENT ===
load_dotenv(dotenv_path)
# Value of the environment variable literally named "key" (None when unset);
# analyze() forwards it to the model as `fastdetect_api_key`.
key = os.getenv("key")

# NOTE(review): not referenced anywhere else in the visible part of this file.
CONFIG_FILE_NAME = 'lmf.yml'
# Registry of loaded models: project name -> Project wrapper.
projects = {}


# Connexion application object; routes and the API spec are attached below.
app = connexion.App(__name__)

# =====================================================================
# MODEL WRAPPER
# =====================================================================
class Project:
    """Pair a model class with its (optionally pre-built) instance.

    Args:
        model_cls: Zero-argument callable that constructs the model.
        config_name: Name stored as ``config`` and used in log/error messages.
        preload: When true (the default) the model is built immediately.
    """

    def __init__(self, model_cls, config_name, preload=True):
        self.config = config_name
        self._model_cls = model_cls
        self._lm_instance = None

        if not preload:
            return

        # Build the model up front so later requests do not pay for loading.
        print(f"Pre-loading model '{self.config}'...")
        self._lm_instance = self._model_cls()
        print(f"Model '{self.config}' loaded successfully")

    @property
    def lm(self):
        """The built model; raises ``RuntimeError`` when none is available."""
        instance = self._lm_instance
        if instance is not None:
            return instance
        raise RuntimeError(f"Model {self.config} was not pre-loaded")

def get_all_projects():
    """Map every registered project name to that project's ``config`` value."""
    return {name: entry.config for name, entry in projects.items()}

# =====================================================================
# ANALYZE ENDPOINT
# =====================================================================
def analyze(analyze_request):
    """Analyze a text using the selected project/model.

    Args:
        analyze_request: Mapping read with ``.get``. Keys used:
            ``project`` (name looked up in ``projects``), ``text``,
            optional ``pdf_path``, ``topk`` (default 20),
            ``include_detectgpt`` / ``include_fastdetect`` /
            ``include_factcheck`` / ``generate_gltr_viz`` (default True)
            and ``max_claims`` (default 5).

    Returns:
        A dict with two entries: ``"request"`` echoes the effective
        parameters, ``"result"`` holds the values copied from the model's
        ``check_probabilities`` output plus ``pdf_filename``. On any failure
        (invalid text, unknown project, exception from the model) ``"result"``
        carries an ``"error"`` message instead of raising.
    """
    print("=== ANALYZE FUNCTION CALLED ===")

    project = analyze_request.get('project')
    text = analyze_request.get('text')
    pdf_path = analyze_request.get('pdf_path')  # Optional PDF path

    print(f"Project requested: '{project}'")
    print(f"Available projects: {list(projects.keys())}")
    # Only slice real strings: a missing 'text' used to raise TypeError on
    # this line, outside the try block, and escaped as an unhandled error.
    preview = text[:100] if isinstance(text, str) else text
    print(f"Text: '{preview}...'")
    if pdf_path:
        print(f"PDF path provided: {pdf_path}")

    topk = analyze_request.get('topk', 20)
    include_detectgpt = analyze_request.get('include_detectgpt', True)
    include_fastdetect = analyze_request.get('include_fastdetect', True)
    include_factcheck = analyze_request.get('include_factcheck', True)
    max_claims = analyze_request.get('max_claims', 5)
    generate_gltr_viz = analyze_request.get('generate_gltr_viz', True)
    fastdetect_api_key = key

    res = {}

    try:
        if not isinstance(text, str):
            print("✗ Request has no usable 'text' field!")
            res["error"] = "Missing or invalid 'text'"
        elif project in projects:
            print(f"✓ Project '{project}' found!")
            p = projects[project]

            if hasattr(p, 'lm'):
                print("Calling check_probabilities...")

                # Set PDF path if provided for GLTR overlay
                if pdf_path and os.path.exists(pdf_path):
                    print(f"Setting PDF path for GLTR overlay: {pdf_path}")
                    p.lm.set_current_pdf_path(pdf_path)

                lm_res = p.lm.check_probabilities(
                    text,
                    topk=topk,
                    include_detectgpt=include_detectgpt,
                    include_fastdetect=include_fastdetect,
                    fastdetect_api_key=fastdetect_api_key,
                    include_factcheck=include_factcheck,
                    max_claims=max_claims,
                    generate_gltr_viz=generate_gltr_viz,
                )

                print("✓ check_probabilities returned!")

                # Copy key results
                res["pred_topk"] = lm_res.get("pred_topk", [])
                res["real_topk"] = lm_res.get("real_topk", [])
                res["bpe_strings"] = lm_res.get("bpe_strings", [])
                res["detectgpt"] = lm_res.get("detectgpt", {})
                res["fastdetect"] = lm_res.get("fastdetect", {})
                res["factcheck"] = lm_res.get("factcheck", [])
                res["gltr_image"] = lm_res.get("gltr_image")

                # === Auto-generate PDF report ===
                # Best effort: a failed report must not discard the analysis.
                try:
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    pdf_filename = f"analysis_report_{timestamp}.pdf"
                    pdf_path_out = os.path.join(pdf_dir, pdf_filename)

                    p.lm.generate_pdf_report(lm_res, output_path=pdf_path_out)
                    print(f"✓ PDF report saved: {pdf_path_out}")
                    res["pdf_filename"] = pdf_filename
                except Exception as e:
                    print(f"✗ Failed to generate PDF: {e}")
                    res["pdf_filename"] = None

                print(f"✓ Analysis complete. GLTR image: {res.get('gltr_image')}")
            else:
                print("✗ Project object has no 'lm' attribute!")
                res["error"] = f"Project '{project}' has no model attached"
        else:
            print(f"✗ Project '{project}' NOT in projects dict!")
            # Report the failure to the caller instead of returning an empty,
            # seemingly successful result.
            res["error"] = f"Project '{project}' not found"

    except Exception as e:
        print(f"✗✗✗ EXCEPTION in analyze: {e}")
        import traceback
        traceback.print_exc()
        res["error"] = str(e)

    return {
        "request": {
            'project': project,
            'text': text,
            'topk': topk,
            'include_detectgpt': include_detectgpt,
            'include_fastdetect': include_fastdetect,
            'include_factcheck': include_factcheck,
            'max_claims': max_claims,
            'generate_gltr_viz': generate_gltr_viz,
            'has_fastdetect_key': bool(fastdetect_api_key),
            'has_pdf_path': bool(pdf_path)
        },
        "result": res
    }

# =====================================================================
# MANUAL PDF REPORT GENERATION
# =====================================================================
def generate_report(report_request):
    """Build a PDF report from analysis data supplied by the caller.

    Reads ``project``, ``analysis_data`` and optional ``input_text`` from
    ``report_request``. Returns the PDF as an attachment on success, or an
    ``({"error": ...}, status)`` pair: 400 when a required field is missing,
    404 for an unknown project, 500 for any exception raised along the way.
    """
    print("=== GENERATE_REPORT FUNCTION CALLED ===")

    try:
        project = report_request.get('project')
        analysis_data = report_request.get('analysis_data')
        input_text = report_request.get('input_text', '')

        if not (project and analysis_data):
            return {"error": "Missing required fields: project and analysis_data"}, 400

        if project not in projects:
            return {"error": f"Project '{project}' not found"}, 404

        print(f"Generating PDF report for project: {project}")

        # Field name -> fallback used when the analysis data lacks that field.
        fallbacks = {
            "bpe_strings": [],
            "real_topk": [],
            "pred_topk": [],
            "detectgpt": None,
            "fastdetect": None,
            "factcheck": [],
            "gltr_image": None,
        }
        payload = {"input_text": input_text}
        for field, fallback in fallbacks.items():
            payload[field] = analysis_data.get(field, fallback)

        model = projects[project].lm
        report_stream = model.generate_pdf_report(payload)

        print("✓ PDF report generated successfully")

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        return send_file(
            report_stream,
            mimetype='application/pdf',
            as_attachment=True,
            download_name=f"factcheck_report_{stamp}.pdf",
        )

    except Exception as e:
        print(f"✗ PDF generation error: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e)}, 500

# =====================================================================
# ROUTES
# =====================================================================
@app.route('/')
def serve_react():
    """Serve ``index.html`` from the ``client/dist`` directory."""
    dist_dir, entry_page = 'client/dist', 'index.html'
    return send_from_directory(dist_dir, entry_page)

@app.route('/client/<path:path>')
def send_static(path):
    """Serve the requested file from the ``client`` directory."""
    static_root = 'client'
    return send_from_directory(static_root, path)

@app.route('/data/<path:path>')
def send_data(path):
    """Serve the requested file from the directory given by ``--dir``."""
    data_root = args.dir
    return send_from_directory(data_root, path)

@app.route('/api/health')
def health_check():
    """Report service status together with the names of the loaded projects."""
    loaded_names = list(projects.keys())
    report = {"status": "healthy"}
    report["projects"] = loaded_names
    report["models_loaded"] = len(projects)
    return report

@app.route('/api/gltr_image/<path:filename>')
def serve_gltr_image(filename):
    """Serve ``filename`` from the system temporary directory."""
    from tempfile import gettempdir

    # NOTE(review): this exposes every readable file under the system temp
    # directory, not only GLTR images — confirm that is intended.
    return send_from_directory(gettempdir(), filename)

@app.route('/api/download_pdf/<path:filename>')
def download_pdf(filename):
    """Serve previously generated PDF reports from project /reports folder.

    Returns the file as an attachment, or ``({"error": ...}, 404)`` when
    ``filename`` does not name a regular file inside the reports directory.
    """
    reports_root = os.path.abspath(pdf_dir)
    target = os.path.abspath(os.path.join(reports_root, filename))

    # The route accepts slashes, so normalise the path and refuse anything that
    # lands outside the reports directory before touching the filesystem;
    # otherwise "../" names could be used to probe for arbitrary files.
    inside_reports = target.startswith(reports_root + os.sep)
    if not inside_reports or not os.path.isfile(target):
        return {"error": f"File not found: {filename}"}, 404

    return send_from_directory(pdf_dir, filename, as_attachment=True, mimetype='application/pdf')

@app.route('/api/extract_pdf', methods=['POST'])
def extract_pdf():
    """Extract text from an uploaded PDF and keep the file for GLTR overlay.

    Expects a multipart upload in the ``file`` field. The upload is stored in
    the temporary uploads directory and its text is extracted page by page.

    Returns:
        JSON with ``text`` (stripped extracted text), ``pdf_path`` (where the
        upload was saved) and ``pages`` (page count); a 400 JSON error when no
        file or a non-PDF name is given; a 500 JSON error if saving or
        extraction fails.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']

    # The filename comes from the client: keep only its final path component
    # (treating both slash styles as separators) so it cannot steer the save
    # location outside the uploads directory. Also tolerates a missing name.
    original_name = os.path.basename((file.filename or '').replace('\\', '/'))

    if not original_name.lower().endswith('.pdf'):
        return jsonify({'error': 'File must be a PDF'}), 400

    try:
        # Save uploaded PDF temporarily for GLTR overlay
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_filename = f"upload_{timestamp}_{original_name}"
        temp_pdf_path = os.path.join(temp_pdf_dir, temp_filename)
        file.save(temp_pdf_path)
        print(f"Saved uploaded PDF to: {temp_pdf_path}")

        # Extract text from PDF. The page count is taken from the same handle
        # so the document is opened once and always closed.
        page_chunks = []
        with pdfplumber.open(temp_pdf_path) as pdf:
            page_count = len(pdf.pages)
            for page in pdf.pages:
                # Extract text with layout preserved
                page_text = page.extract_text(layout=True)
                if page_text:
                    page_chunks.append(page_text + '\n\n')
        text = ''.join(page_chunks)

        # Re-join hyphens that were extracted with surrounding spaces
        text = text.replace(' - ', '-')

        return jsonify({
            'text': text.strip(),
            'pdf_path': temp_pdf_path,  # Return path for GLTR overlay
            'pages': page_count
        })
    except Exception as e:
        print(f"PDF extraction error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
def analyze_with_pdf(text, project='gemma-3n-E2B-it', file=None, topk=20,
                     include_detectgpt=True, include_fastdetect=True,
                     include_factcheck=True, max_claims=5,
                     generate_gltr_viz=True):
    """Analyze text with an optional PDF overlay for the GLTR visualization.

    Connexion automatically passes formData parameters as function arguments.

    Args:
        text: Text to analyze; an empty value yields a 400 error.
        project: Name of the project/model to use.
        file: Optional uploaded file object exposing ``filename`` and
            ``save(path)``. It is stored in the temporary uploads directory
            only when its name ends in ``.pdf``; other files are ignored.
        topk, include_detectgpt, include_fastdetect, include_factcheck,
        max_claims, generate_gltr_viz: Forwarded unchanged to ``analyze``.

    Returns:
        The dict produced by ``analyze``, or ``({"error": ...}, status)`` with
        status 400 (no text) or 500 (unexpected exception).
    """
    print("=== ANALYZE_WITH_PDF FUNCTION CALLED ===")

    try:
        if not text:
            return {"error": "No text provided"}, 400

        print(f"Project: {project}")
        print(f"Text length: {len(text)}")
        print(f"File provided: {file is not None}")

        # Handle optional PDF file
        pdf_path = None
        if file is not None:
            # The filename is client-controlled: reduce it to its last path
            # component (either slash style) so it cannot direct the save
            # outside the uploads directory.
            upload_name = os.path.basename((file.filename or '').replace('\\', '/'))
            if upload_name.lower().endswith('.pdf'):
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                temp_filename = f"analyze_{timestamp}_{upload_name}"
                pdf_path = os.path.join(temp_pdf_dir, temp_filename)
                file.save(pdf_path)
                print(f"Saved PDF for analysis: {pdf_path}")

        # Build request dictionary for analyze function
        analyze_request = {
            'project': project,
            'text': text,
            'pdf_path': pdf_path,
            'topk': topk,
            'include_detectgpt': include_detectgpt,
            'include_fastdetect': include_fastdetect,
            'include_factcheck': include_factcheck,
            'max_claims': max_claims,
            'generate_gltr_viz': generate_gltr_viz
        }

        print(f"Calling analyze with project: {project}")
        print(f"Available projects: {list(projects.keys())}")

        # Delegate the actual work to analyze()
        return analyze(analyze_request)

    except Exception as e:
        print(f"Error in analyze_with_pdf: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e)}, 500
# =====================================================================
# CLEANUP ENDPOINT (Optional)
# =====================================================================
@app.route('/api/cleanup_temp_pdfs', methods=['POST'])
def cleanup_temp_pdfs():
    """Delete uploaded temp PDFs whose modification time is over an hour old.

    Responds with JSON ``{'success': True, 'removed_count': N}``, or with a
    500 JSON error carrying the exception message if anything fails.
    """
    try:
        import time

        max_age_seconds = 3600  # one hour
        now = time.time()
        purged = 0

        for entry_name in os.listdir(temp_pdf_dir):
            candidate = os.path.join(temp_pdf_dir, entry_name)
            # Skip anything that is not a regular file.
            if not os.path.isfile(candidate):
                continue
            # Skip files that are still fresh enough to keep.
            if now - os.path.getmtime(candidate) <= max_age_seconds:
                continue
            os.remove(candidate)
            purged += 1
            print(f"Removed old temp PDF: {entry_name}")

        payload = {'success': True, 'removed_count': purged}
        return jsonify(payload)
    except Exception as e:
        return jsonify({'error': str(e)}), 500

# =====================================================================
# MAIN
# =====================================================================
# Register the API described in server.yaml on the Connexion app.
# NOTE(review): presumably this spec maps operations to functions in this
# module such as analyze / generate_report / analyze_with_pdf — confirm in
# server.yaml.
app.add_api('server.yaml')

# Command-line options. --port is kept as a string and converted with int()
# when the server is started below.
parser = argparse.ArgumentParser()
parser.add_argument("--model", default='gemma-3n-E2B-it')
parser.add_argument("--address", default="127.0.0.1")
parser.add_argument("--port", default="5001")
parser.add_argument("--dir", type=str, default=os.path.abspath('data'))
parser.add_argument("--no_cors", action='store_true')

# Parse at import time, ignoring unrecognised arguments, so that `args`
# (read by the send_data route via args.dir) exists on every code path.
args, _ = parser.parse_known_args()

if __name__ == '__main__':
    # Run as a script: parse strictly (unknown arguments are an error).
    args = parser.parse_args()

    # CORS is enabled on the underlying app unless --no_cors is given.
    if not args.no_cors:
        CORS(app.app, headers='Content-Type')

    # NOTE(review): nothing is added to `projects` on this path, so the
    # registry is still empty when the server starts here — confirm models are
    # expected to be loaded only through the import path below.
    app.run(port=int(args.port), host=args.address)
else:
    # Imported as a module: load the requested model eagerly.
    args, _ = parser.parse_known_args()
    try:
        model = AVAILABLE_MODELS[args.model]
    except KeyError:
        # Unknown model name: fall back to the default model class.
        model = AVAILABLE_MODELS['gemma-3n-E2B-it']
    # NOTE(review): after a fallback the project is still registered under the
    # requested (unknown) name rather than 'gemma-3n-E2B-it' — confirm intended.
    projects[args.model] = Project(model, args.model)