Skip to content

Commit 0f5ae43

Browse files
authored
implement new api endpoint (#129)
1 parent 7876f3a commit 0f5ae43

18 files changed

Lines changed: 189 additions & 136 deletions

File tree

rag-engine/src/layers/chunking_embedding/chunk_document.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,8 @@ def chunk_document(
5151

5252
# ---- FINAL CLEANUP ----
5353
chunks = _deduplicate_chunks(chunks)
54+
if len(chunks) == 0:
55+
raise ValueError("No text found in your pdf!, make sure it is not image")
5456

5557
return chunks
5658

rag-engine/src/process/__init__.py renamed to rag-engine/src/layers/data_extractor/extractor/__init__.py

File renamed without changes.

rag-engine/src/layers/data_extractor/extractor.py renamed to rag-engine/src/layers/data_extractor/extractor/pdf.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,13 +17,12 @@
1717
# ===============================
1818
# PUBLIC ENTRY
1919
# ===============================
20-
def pdf(pdf_bytes: bytes) -> tuple[list[Page], dict]:
20+
def extract_data(pdf_bytes: bytes) -> tuple[list[Page], dict]:
2121
pages_output: list[Page] = []
2222
metadata = {}
2323

2424
try:
2525
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf_doc:
26-
metadata["_document_id"] = str(uuid.uuid4())
2726
metadata["_file_type"] = "pdf"
2827
metadata["_page_count"] = len(pdf_doc.pages)
2928
metadata["_file_metadata"] = pdf_doc.metadata

rag-engine/src/layers/structure_analyzer/analyzer/__init__.py

Whitespace-only changes.

rag-engine/src/layers/structure_analyzer/analyzer.py renamed to rag-engine/src/layers/structure_analyzer/analyzer/pdf.py

File renamed without changes.

rag-engine/src/main.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,14 @@
11
from dotenv import load_dotenv
22
from fastapi import FastAPI
3-
from src.process.controller import router as process
3+
from src.store.routers import store_upload_router, store_url_router
44
from .logging import configure_logging, LogLevels
55
from pathlib import Path
66

77

8-
env_path = Path(__file__).parent / '.env'
8+
env_path = Path(__file__).parent.parent / '.env'
99

1010
load_dotenv(dotenv_path=env_path)
1111
configure_logging(LogLevels.info)
1212
app = FastAPI()
13-
app.include_router(process)
13+
app.include_router(store_upload_router)
14+
app.include_router(store_url_router)

rag-engine/src/process/controller.py

Lines changed: 0 additions & 72 deletions
This file was deleted.

rag-engine/src/process/models.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

rag-engine/src/process/service.py

Lines changed: 0 additions & 38 deletions
This file was deleted.

rag-engine/src/store/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)