-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcontract.py
More file actions
109 lines (94 loc) · 3.7 KB
/
contract.py
File metadata and controls
109 lines (94 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import shutil
import textwrap
import uuid
from pathlib import Path

import pytesseract
from flask import Flask, render_template_string, request, send_file, send_from_directory, jsonify
from flask_cors import CORS
from googletrans import Translator
from pdf2image import convert_from_path
from PIL import Image
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from transformers import pipeline
# Flask application object; CORS(app) applies flask-cors to it.
app = Flask(__name__)
CORS(app)

# Path of the Tesseract executable used by pytesseract. The TESSERACT_CMD
# environment variable overrides the development-machine default below, so the
# service can run on hosts where Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\\Users\\Ghade\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe',
)

# Summarization pipeline (t5-small), created once at import time and reused
# by every call to summarize_clauses().
summarizer = pipeline("summarization", model="t5-small")
def extract_text(file_path):
    """Run OCR on a contract file and return the recognized text.

    The input is first copied to a uniquely named working file under
    ``uploads``. A ``.pdf`` input is rasterized with pdf2image (150 dpi) and
    every page is passed to Tesseract; any other extension is opened as a
    single image. OCR is run with the ``eng+ara`` language setting. The
    working copy is removed before returning, whether or not OCR succeeded.

    The Poppler binaries directory can be overridden with the ``POPPLER_PATH``
    environment variable.

    Args:
        file_path: Path of the file to read.

    Returns:
        The recognized text (for PDFs, one newline-terminated chunk per page).
        On any failure the error is printed and the string
        ``"Error processing file: <reason>"`` is returned instead of raising.
    """
    working_copy = None
    try:
        ext = Path(file_path).suffix
        # Do not rely on the caller having created the directory already.
        os.makedirs("uploads", exist_ok=True)
        working_copy = os.path.join("uploads", f"safe_{uuid.uuid4().hex}{ext}")
        shutil.copy(file_path, working_copy)

        if ext.lower() == ".pdf":
            poppler_path = os.environ.get("POPPLER_PATH", r"C:\poppler-24.08.0\Library\bin")
            pages = convert_from_path(working_copy, dpi=150, poppler_path=poppler_path, fmt='jpeg')
            return "".join(
                pytesseract.image_to_string(page, lang="eng+ara") + "\n"
                for page in pages
            )

        with Image.open(working_copy) as img:
            return pytesseract.image_to_string(img, lang="eng+ara")
    except Exception as e:
        # Callers expect a string back, so failures are reported in-band.
        print("OCR ERROR:", e)
        return f"Error processing file: {e}"
    finally:
        # The working copy is only needed during OCR; without this every call
        # would leave a "safe_*" file behind in the uploads directory.
        if working_copy is not None:
            try:
                os.remove(working_copy)
            except OSError:
                pass
def summarize_clauses(text, lang="en"):
    """Summarize each non-blank line of *text* as a separate clause.

    The text is split on newlines; every line that is non-empty after
    stripping is passed to the module-level ``summarizer``. The summary length
    cap is 60% of the clause's word count, clamped to the range 10..50. When
    *lang* is ``"ar"`` each summary is translated to Arabic with googletrans.

    Args:
        text: Text to split into clauses. Empty or ``None`` yields ``[]``.
        lang: Output language; only ``"ar"`` triggers translation.

    Returns:
        A list of ``{"clause": ..., "summary": ...}`` dicts in input order.
        If summarizing or translating a clause fails, its ``summary`` is
        ``"Summarization error: <reason>"`` and processing continues.
    """
    if not text:
        return []

    # Build the translation client only when a translation will be needed, so
    # the default path does not depend on googletrans at all.
    translator = Translator() if lang == "ar" else None

    results = []
    for raw_line in text.split('\n'):
        clause = raw_line.strip()
        if not clause:
            continue
        try:
            word_count = len(clause.split())
            max_len = min(50, max(10, int(word_count * 0.6)))
            summary = summarizer(clause, max_length=max_len, min_length=5, do_sample=False)[0]['summary_text']
            if translator is not None:
                summary = translator.translate(summary, dest="ar").text
            results.append({"clause": clause, "summary": summary})
        except Exception as e:
            # Best effort: one bad clause must not abort the whole report.
            results.append({"clause": clause, "summary": f"Summarization error: {str(e)}"})
    return results
def save_pdf(clauses, filename, max_chars=90):
    """Write the clause/summary pairs to a letter-size PDF report.

    The report starts with a title line, followed by a "Clause: ..." line, a
    "Summary: ..." line and a blank spacer for every item. Text starts 40
    points from the left edge, lines are 20 points apart, and a new page is
    started once the cursor drops below 50 points from the bottom.

    Args:
        clauses: Iterable of dicts with ``"clause"`` and ``"summary"`` keys.
        filename: Path of the PDF file to create.
        max_chars: Maximum characters per drawn line. Longer entries are
            wrapped onto additional lines instead of running off the page.
            The default is an approximation for the canvas default font.
    """
    # NOTE(review): the canvas default font is used throughout; Arabic
    # summaries may not render with it -- confirm and register a suitable
    # font if needed.
    c = canvas.Canvas(filename, pagesize=letter)
    _, height = letter
    y = height - 40
    c.drawString(40, y, "Contract Analysis Report")
    y -= 30
    for item in clauses:
        for entry in (f"Clause: {item['clause']}", f"Summary: {item['summary']}", ""):
            # wrap() returns [] for an empty string; keep the blank spacer line.
            for line in textwrap.wrap(entry, width=max_chars) or [""]:
                c.drawString(40, y, line)
                y -= 20
                if y < 50:
                    c.showPage()
                    y = height - 40
    c.save()
@app.route('/contract-analysis', methods=['POST'])
def analyze_contract():
    """Handle a contract upload: OCR it, summarize it, and build a PDF report.

    Expects a multipart POST with the file in the ``contract_image`` field and
    an optional ``language`` form field (default ``"en"``). The upload is
    stored under ``uploads`` with a random name that keeps the original
    extension.

    Returns:
        JSON with ``results`` (the first three clause summaries) and
        ``pdf_url`` (download link for the full report), or JSON with a single
        ``error`` key when no file was sent, OCR failed, or nothing could be
        summarized.
    """
    if 'contract_image' not in request.files:
        return jsonify({"error": "No file uploaded."})
    image = request.files['contract_image']
    # A file field submitted without a chosen file arrives with an empty name.
    if not image.filename:
        return jsonify({"error": "No file uploaded."})
    lang = request.form.get('language', 'en')

    ext = os.path.splitext(image.filename)[1]
    unique_name = f"{uuid.uuid4().hex}{ext}"
    os.makedirs("uploads", exist_ok=True)
    image_path = os.path.join("uploads", unique_name)
    image.save(image_path)

    extracted_text = extract_text(image_path)
    # extract_text() reports failures in-band; surface them as an error
    # instead of summarizing the error message as if it were contract text.
    if extracted_text.startswith("Error processing file:"):
        return jsonify({"error": extracted_text})

    results = summarize_clauses(extracted_text, lang)
    if not results:
        return jsonify({"error": "No analysis results returned."})

    # Generate PDF
    pdf_name = f"analysis_{uuid.uuid4().hex}.pdf"
    pdf_path = os.path.join("uploads", pdf_name)
    save_pdf(results, pdf_path)

    return jsonify({
        "results": results[:3],  # Preview only: first 3 items; the PDF has all of them
        # Build the link from the host the client actually used rather than a
        # fixed localhost address.
        "pdf_url": f"{request.host_url}download/{pdf_name}"
    })
@app.route('/download/<pdf_name>')
def download_pdf(pdf_name):
    """Send a previously generated report from ``uploads`` as an attachment.

    Args:
        pdf_name: File name taken from the URL.

    Returns:
        The file as a download response. send_from_directory answers with
        404 when the name does not resolve to a file inside the directory.
    """
    # The absolute path pins the lookup to the same working-directory-relative
    # "uploads" folder the reports are written to, and send_from_directory
    # refuses names that would escape that folder.
    uploads_dir = os.path.abspath("uploads")
    return send_from_directory(uploads_dir, pdf_name, as_attachment=True)
# Start Flask's built-in server on port 5004 when this file is run as a script.
# NOTE(review): debug=True is a development setting -- confirm it is switched
# off before the service is exposed beyond the local machine.
if __name__ == "__main__":
    app.run(port=5004, debug=True)