Skip to content

Commit c0ff2bc

Browse files
committed
mc
1 parent 70470ce commit c0ff2bc

12 files changed

Lines changed: 428 additions & 72 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,6 @@ tmp/
7474
.pytest_cache/
7575
.coverage
7676
htmlcov/
77+
78+
# Storage
79+
form-flow-backend/storage/

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,6 @@ playwright install chromium
375375

376376
# Configure environment
377377
copy .env.example .env # Windows
378-
# cp .env.example .env # Linux/Mac
379378

380379
# Start server
381380
python -m uvicorn main:app --reload --port 8001

form-flow-backend/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import asyncio
2424

2525
# Fix for Playwright on Windows - ProactorEventLoop required for subprocess
26-
if sys.platform == 'win32':
26+
if sys.platform == 'win32' and 'pytest' not in sys.modules:
2727
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
2828

2929
# Suppress Pydantic V1 warning from LangChain

form-flow-backend/routers/attachments.py

Lines changed: 142 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -2,39 +2,59 @@
22
Attachments Router - API Endpoints for File Form Fields
33
44
Provides REST API for:
5-
- File upload
5+
- Secure File upload (Async, Chunked)
66
- File retrieval
77
- File deletion
88
"""
99

1010
import os
1111
import asyncio
12-
import shutil
1312
import uuid
1413
import logging
14+
import hashlib
15+
import re
1516
from pathlib import Path
1617
from typing import Dict, Any, Optional
1718

18-
from fastapi import APIRouter, File, UploadFile, HTTPException, BackgroundTasks
19-
from fastapi.responses import FileResponse, JSONResponse
19+
import aiofiles
20+
import aiofiles.os
21+
from fastapi import APIRouter, File, UploadFile, HTTPException, BackgroundTasks, Request
22+
from fastapi.responses import FileResponse
2023
from pydantic import BaseModel
2124

25+
# Try importing magic for MIME type validation
26+
try:
27+
import magic
28+
HAS_MAGIC = True
29+
except ImportError:
30+
HAS_MAGIC = False
31+
2232
from utils.logging import get_logger
2333

2434
logger = get_logger(__name__)
2535

2636
router = APIRouter(prefix="/attachments", tags=["Attachments"])
2737

2838
# =============================================================================
29-
# Persistent Disk Storage
39+
# Configuration & Constants
3040
# =============================================================================
3141

3242
STORAGE_DIR = Path("storage")
3343
ATTACHMENTS_DIR = STORAGE_DIR / "attachments"
34-
35-
# Ensure directories exist
3644
ATTACHMENTS_DIR.mkdir(parents=True, exist_ok=True)
3745

46+
# Security Limits
47+
CHUNK_SIZE = 1024 * 1024 # 1MB chunks
48+
MAX_FILE_SIZE = 100 * 1024 * 1024 # 100MB limit
49+
ALLOWED_MIME_TYPES = {
50+
"application/pdf",
51+
"application/msword",
52+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
53+
"image/jpeg",
54+
"image/png",
55+
"text/plain"
56+
}
57+
3858

3959
# =============================================================================
4060
# Request/Response Models
@@ -48,42 +68,106 @@ class AttachmentUploadResponse(BaseModel):
4868
content_type: str
4969
size: int
5070
url: str
71+
checksum: str
5172
message: str = ""
5273

5374

5475
# =============================================================================
55-
# Helper Functions
76+
# Helper Functions (Security & Async I/O)
5677
# =============================================================================
5778

58-
def _get_file_path(file_id: str) -> Path:
59-
"""Get the file path for a given file ID."""
60-
# We search for any file starting with file_id to handle extensions
61-
# But for simplicity, we will save files with their original extension appended to ID or keep a metadata map.
62-
# A simpler approach: save as `{file_id}_{filename}` to preserve extension and name.
63-
# However, to easily lookup by ID, we might just use ID and a sidecar metadata file,
64-
# OR scan the directory (slower).
65-
#
66-
# Improved approach: Save as `file_id` (content) and `file_id.json` (metadata).
67-
return ATTACHMENTS_DIR / file_id
79+
def sanitize_filename(filename: Optional[str]) -> str:
    """
    Reduce a client-supplied filename to a safe, non-empty base name.

    Steps, in order:
      1. Drop any directory components (both ``/`` and ``\\`` are treated as
         separators, regardless of the host platform).
      2. Remove null bytes.
      3. Replace every character that is not a word character, dot or dash
         with an underscore. ``\\w`` is Unicode-aware, so non-ASCII letters
         and digits are kept.
      4. Fall back to ``"attachment"`` when nothing usable is left.

    Args:
        filename: Name as sent by the client. May be ``None`` or empty.

    Returns:
        A name containing only word characters, dots and dashes; never empty
        and never made up solely of dots (``"."`` / ``".."``).
    """
    if not filename:
        return "attachment"

    # os.path.basename only splits on "\\" when running on Windows; normalise
    # first so "C:\\dir\\x.txt" loses its directory part on every platform.
    filename = os.path.basename(filename.replace("\\", "/"))

    # Strip null bytes before the substitution so they vanish instead of
    # turning into underscores.
    filename = filename.replace("\0", "")

    # Allow only safe characters (word characters, dot, dash).
    filename = re.sub(r"[^\w.-]", "_", filename)

    # "", "." and ".." are not usable as file names.
    if not filename.strip("."):
        return "attachment"
    return filename
93+
94+
95+
def validate_mime_type(content: bytes, declared_type: str) -> bool:
    """
    Compare the sniffed MIME type of ``content`` with ``declared_type``.

    The check is advisory only: this function currently returns True on
    every path. A mismatch, a missing python-magic installation, or a
    detection error is written to the log and the upload is not blocked.

    Args:
        content: Bytes handed to python-magic for type detection.
        declared_type: MIME type claimed by the client.

    Returns:
        Always True.
    """
    if not HAS_MAGIC:
        logger.warning("python-magic not installed, skipping strict MIME validation")
        return True

    try:
        detected_type = magic.Magic(mime=True).from_buffer(content)

        # A generic declared type carries no claim to verify; anything else
        # must equal the detected type exactly to count as a match.
        is_generic = declared_type == "application/octet-stream"
        if not is_generic and detected_type != declared_type:
            logger.warning(f"MIME mismatch: declared={declared_type}, detected={detected_type}")
    except Exception as e:
        logger.error(f"MIME validation error: {e}")

    # Non-blocking by design: problems are logged above, never enforced.
    return True
69122

70-
async def _save_attachment_async(file_id: str, file: UploadFile) -> Dict[str, Any]:
    """
    Stream an uploaded file to disk in chunks, enforcing the size limit.

    Data is written to ``<file_id>.tmp`` inside ATTACHMENTS_DIR and renamed
    to ``<file_id>`` only after the whole upload has been written, so a
    partially written file never appears under the final name.

    Args:
        file_id: Identifier used as the stored file name.
        file: Upload to read from, CHUNK_SIZE bytes at a time.

    Returns:
        Dict with ``"size"`` (total bytes written) and ``"checksum"``
        (SHA-256 hex digest of the content).

    Raises:
        HTTPException: 413 when the upload exceeds MAX_FILE_SIZE.
        Exception: Any I/O error is re-raised after the temp file is removed.
    """
    file_path = ATTACHMENTS_DIR / file_id
    temp_path = ATTACHMENTS_DIR / f"{file_id}.tmp"

    file_hash = hashlib.sha256()
    total_size = 0

    try:
        async with aiofiles.open(temp_path, "wb") as f:
            while True:
                chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    break

                total_size += len(chunk)
                # Check before writing so an oversized upload is rejected
                # without first being stored in full.
                if total_size > MAX_FILE_SIZE:
                    raise HTTPException(
                        status_code=413,
                        detail=f"File too large. Maximum size is {MAX_FILE_SIZE/1024/1024}MB"
                    )

                file_hash.update(chunk)
                await f.write(chunk)

        # Rename temp file to final file
        await aiofiles.os.rename(temp_path, file_path)
    except BaseException:
        # BaseException (not Exception) so that task cancellation, e.g. a
        # client disconnect, also removes the partial temp file.
        try:
            await aiofiles.os.remove(temp_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            # Never let a cleanup failure mask the original error.
            logger.warning(f"Failed to remove temp file for {file_id}: {cleanup_error}")
        raise

    return {
        "size": total_size,
        "checksum": file_hash.hexdigest()
    }
83167

84168

85169
def _save_metadata(file_id: str, metadata: Dict[str, Any]):
86-
"""Save metadata to disk."""
170+
"""Save metadata to disk (sync is fine for small JSON)."""
87171
meta_path = ATTACHMENTS_DIR / f"{file_id}.json"
88172
import json
89173
with open(meta_path, "w") as f:
@@ -93,33 +177,33 @@ def _save_metadata(file_id: str, metadata: Dict[str, Any]):
93177
def _get_metadata(file_id: str) -> Optional[Dict[str, Any]]:
    """
    Load the JSON metadata sidecar for an attachment.

    Args:
        file_id: Attachment identifier; metadata is read from
            ``<file_id>.json`` inside ATTACHMENTS_DIR.

    Returns:
        The parsed metadata, or None when the sidecar file does not exist
        or cannot be read/parsed. Read and parse failures are logged
        instead of being silently discarded.
    """
    import json

    meta_path = ATTACHMENTS_DIR / f"{file_id}.json"
    try:
        with open(meta_path, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        # A missing sidecar is the normal "unknown attachment" case.
        return None
    except Exception as e:
        # Unreadable or corrupt metadata: report it, but keep the
        # never-raises contract.
        logger.error(f"Failed to read metadata for {file_id}: {e}")
        return None
107188

108189

109190
async def _cleanup_attachment(file_id: str, delay_seconds: float = 86400):
    """
    Remove an attachment and its metadata sidecar after a delay.

    Args:
        file_id: Attachment identifier; both ``<file_id>`` and
            ``<file_id>.json`` inside ATTACHMENTS_DIR are removed.
        delay_seconds: How long to wait before deleting. Defaults to
            86400 (24 hours).

    Returns:
        None. Never raises: removal failures are logged as warnings, and a
        cancellation during the wait ends the task without deleting anything.
    """
    try:
        await asyncio.sleep(delay_seconds)
    except asyncio.CancelledError:
        return

    targets = (ATTACHMENTS_DIR / file_id, ATTACHMENTS_DIR / f"{file_id}.json")
    all_removed = True

    # Handle each path independently so that one missing or failing file
    # does not prevent removal of the other.
    for target in targets:
        try:
            await aiofiles.os.remove(target)
        except FileNotFoundError:
            # Already gone (e.g. deleted through the API) - nothing to do.
            continue
        except Exception as e:
            all_removed = False
            logger.warning(f"Cleanup failed for {file_id}: {e}")

    if all_removed:
        logger.info(f"🧹 Cleaned up attachment {file_id}")
125209

@@ -135,27 +219,29 @@ async def upload_attachment(
135219
):
136220
"""
137221
Upload a file for a form attachment field.
138-
139-
Returns file ID and URL for retrieval.
222+
Uses async streaming to handle large files efficiently.
140223
"""
141224
if not file:
142225
raise HTTPException(status_code=400, detail="No file provided")
143226

227+
# sanitize filename
228+
safe_filename = sanitize_filename(file.filename)
144229
file_id = str(uuid.uuid4())
145-
logger.info(f"📂 Uploading attachment: {file.filename} (ID: {file_id})")
230+
231+
logger.info(f"📂 Uploading attachment: {safe_filename} (ID: {file_id})")
146232

147233
try:
148-
# Save file to disk
149-
file_path = _save_attachment(file_id, file)
150-
file_size = file_path.stat().st_size
234+
# Save file asynchronously
235+
upload_meta = await _save_attachment_async(file_id, file)
151236

152237
# Save metadata
153238
metadata = {
154239
"id": file_id,
155-
"original_filename": file.filename,
240+
"original_filename": safe_filename,
156241
"content_type": file.content_type,
157-
"size": file_size,
158-
"upload_time": str(uuid.uuid1().time), # simple timestamp proxy
242+
"size": upload_meta["size"],
243+
"checksum": upload_meta["checksum"],
244+
"upload_time": str(uuid.uuid1().time),
159245
}
160246
_save_metadata(file_id, metadata)
161247

@@ -166,13 +252,16 @@ async def upload_attachment(
166252
return AttachmentUploadResponse(
167253
success=True,
168254
file_id=file_id,
169-
file_name=file.filename,
255+
file_name=safe_filename,
170256
content_type=file.content_type or "application/octet-stream",
171-
size=file_size,
257+
size=upload_meta["size"],
172258
url=f"/attachments/{file_id}",
259+
checksum=upload_meta["checksum"],
173260
message="File uploaded successfully"
174261
)
175262

263+
except HTTPException as he:
264+
raise he
176265
except Exception as e:
177266
logger.error(f"Error processing attachment upload: {e}", exc_info=True)
178267
raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
@@ -211,9 +300,13 @@ async def delete_attachment(file_id: str):
211300
raise HTTPException(status_code=404, detail="Attachment not found")
212301

213302
try:
214-
file_path.unlink(missing_ok=True)
215-
meta_path.unlink(missing_ok=True)
303+
if await aiofiles.os.path.exists(file_path):
304+
await aiofiles.os.remove(file_path)
305+
if await aiofiles.os.path.exists(meta_path):
306+
await aiofiles.os.remove(meta_path)
307+
216308
return {"success": True, "message": "Attachment deleted"}
309+
217310
except Exception as e:
218311
logger.error(f"Failed to delete attachment {file_id}: {e}")
219312
raise HTTPException(status_code=500, detail="Failed to delete attachment")

form-flow-backend/routers/forms.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ class VoiceProcessRequest(BaseModel):
3333

3434
class FormFillRequest(BaseModel):
3535
url: str
36-
form_data: Dict[str, str]
36+
form_data: Dict[str, Any]
3737

3838
class FormSubmitRequest(BaseModel):
3939
url: str
40-
form_data: Dict[str, str]
40+
form_data: Dict[str, Any]
4141
form_schema: List[Dict[str, Any]]
4242
use_cdp: bool = False # If True, connect to user's browser via Chrome DevTools Protocol
4343

form-flow-backend/services/ai/gemini.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030

3131
load_dotenv()
3232

33-
from langchain_community.chat_models import ChatOpenAI
33+
# from langchain_community.chat_models import ChatOpenAI # Moved to inside __init__ to avoid heavy load
34+
3435
from langchain_google_genai import ChatGoogleGenerativeAI
3536
from langchain_core.prompts import ChatPromptTemplate
3637
from langchain_core.output_parsers import JsonOutputParser
@@ -106,6 +107,7 @@ def __init__(self, api_key: Optional[str] = None, model: str = "gemini-2.0-flash
106107
if self.model in ("gemini-2.0-flash", "gemma-2-9b-it"):
107108
self.model = "google/gemma-2-9b-it" # Use free tier
108109

110+
from langchain_community.chat_models import ChatOpenAI
109111
self.llm = ChatOpenAI(
110112
model=self.model,
111113
api_key=self.api_key,

form-flow-backend/services/ai/local_llm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313

1414
import os
1515
import json
16-
import torch
16+
import json
1717
from typing import Dict, List, Any, Optional
18-
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
18+
1919

2020
from utils.logging import get_logger
2121
from utils.exceptions import AIServiceError
@@ -95,6 +95,10 @@ def _initialize(self):
9595
try:
9696
logger.info(f"Loading local LLM: {self.model_id}")
9797

98+
99+
import torch
100+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
101+
98102
# Log GPU status for debugging
99103
logger.info(f"CUDA Available: {torch.cuda.is_available()}")
100104
if torch.cuda.is_available():
@@ -283,6 +287,7 @@ def extract_all_fields(self, user_input: str, fields: List[Any]) -> Dict[str, An
283287
Output:"""
284288

285289
# 3. Running Inference
290+
import torch
286291
inputs = self.tokenizer(prompt, return_tensors="pt")
287292
if self.model.device.type == "cuda":
288293
inputs = inputs.to("cuda")

0 commit comments

Comments
 (0)