Skip to content

Commit e1a062e

Browse files
committed
setup Dengine
1 parent f482cb8 commit e1a062e

14 files changed

Lines changed: 269 additions & 0 deletions

File tree

d-engine/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.venv
2+
__pycache__

d-engine/pyrightconfig.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"exclude": [ ".venv" ],
3+
"venvPath": ".",
4+
"venv": ".venv",
5+
}

d-engine/requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
fastapi
2+
python-dotenv
3+
pydantic
4+
pdfplumber
5+
requests

d-engine/src/__init__.py

Whitespace-only changes.

d-engine/src/layers/__init__.py

Whitespace-only changes.

d-engine/src/layers/data_extractor/__init__.py

Whitespace-only changes.
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import io
2+
import re
3+
import uuid
4+
import pdfplumber
5+
6+
from src.process.models import PageContent
7+
8+
9+
def pdf(pdf_bytes: bytes) -> list[PageContent]:
    """Extract text, tables and image bounding boxes from each page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        One entry per page, in document order. Each entry is a plain dict
        with the keys ``page_number`` (starting at 1), ``text``, ``tables``,
        ``images``, ``width`` and ``height``.

    Raises:
        ValueError: If anything fails while opening or reading the PDF. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        # Named ``document`` so the handle does not shadow this function.
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            return [
                {
                    "page_number": page_number,
                    "text": _page_text(page),
                    "tables": _page_tables(page),
                    "images": _page_images(page),
                    "width": page.width,
                    "height": page.height,
                }
                for page_number, page in enumerate(document.pages, start=1)
            ]
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Error processing PDF: {e}") from e


def _page_text(page) -> str:
    """Rebuild a page's text line by line from its words, then normalize it."""
    words = page.extract_words(
        x_tolerance=2, y_tolerance=2, keep_blank_chars=False
    )
    # Words sharing the same rounded ``top`` coordinate form one line.
    rows = {}
    for word in words:
        rows.setdefault(round(word["top"], 1), []).append(word)
    # Lines are emitted top to bottom, words within a line left to right.
    text_lines = [
        " ".join(w["text"] for w in sorted(rows[top], key=lambda w: w["x0"]))
        for top in sorted(rows)
    ]
    return normalize_text("\n".join(text_lines))


def _page_tables(page) -> list:
    """Return the extracted tables of a page, skipping completely empty ones."""
    tables = []
    for table in page.find_tables():
        data = table.extract()
        # Keep a table only if at least one cell has truthy content.
        if data and any(any(cell for cell in row) for row in data):
            tables.append(data)
    return tables


def _page_images(page) -> list[dict]:
    """Return one dict per image on the page with a fresh id and its box."""
    return [
        {
            "id": str(uuid.uuid4()),
            "x0": img.get("x0"),
            "top": img.get("top"),
            "x1": img.get("x1"),
            "bottom": img.get("bottom"),
            "width": img.get("width"),
            "height": img.get("height"),
        }
        for img in page.images
    ]
59+
60+
61+
def normalize_text(text: str) -> str:
    """Run the full text clean-up pipeline over extracted page text.

    The individual clean-up steps are applied in a fixed order, then trailing
    whitespace is removed from every line, runs of three or more newlines are
    collapsed to two, and the result is stripped.
    """
    cleanup_steps = (
        fix_hyphen_breaks,
        remove_page_numbers,
        remove_dot_lines,
        remove_lonely_symbols,
        fix_merged_words,
        normalize_spaces,
    )
    for step in cleanup_steps:
        text = step(text)

    right_trimmed = "\n".join(row.rstrip() for row in text.splitlines())
    collapsed = re.sub(r"\n{3,}", "\n\n", right_trimmed)
    return collapsed.strip()
73+
74+
75+
def fix_hyphen_breaks(text: str) -> str:
    """Re-join words that were hyphenated across a line break.

    A hyphen immediately followed by a newline is removed only when it sits
    between two word characters, e.g. ``"exam-\\nple"`` becomes ``"example"``.
    A free-standing dash at the end of a line (``"a -\\nb"``) or a bullet on
    its own line is left untouched.
    """
    # Lookarounds keep the surrounding characters and drop only "-\n".
    return re.sub(r"(?<=\w)-\n(?=\w)", "", text)
78+
79+
80+
def remove_page_numbers(text: str) -> str:
    """Drop every line that, once stripped, consists only of digits."""
    kept = [row for row in text.splitlines() if not row.strip().isdigit()]
    return "\n".join(kept)
91+
92+
93+
def normalize_spaces(text: str) -> str:
    """Collapse each run of spaces and tabs into a single space.

    Newlines are not touched.
    """
    blank_run = re.compile(r"[ \t]+")
    return blank_run.sub(" ", text)
95+
96+
97+
def remove_dot_lines(text: str) -> str:
    """Drop lines made up solely of five or more dots.

    Each dot may be followed by one whitespace character; surrounding
    whitespace on the line is ignored for the check.
    """
    dots_only = re.compile(r"^(\.\s?){5,}$")
    return "\n".join(
        row for row in text.splitlines() if not dots_only.match(row.strip())
    )
105+
106+
107+
def remove_lonely_symbols(text: str) -> str:
    """Drop every line whose stripped content is at most two characters long.

    Blank lines fall under this rule and are removed as well.
    """
    return "\n".join(
        row for row in text.splitlines() if len(row.strip()) > 2
    )
115+
116+
117+
def fix_merged_words(text: str) -> str:
    """Insert a space wherever an ASCII lowercase letter is directly followed
    by an ASCII uppercase letter (``"helloWorld"`` -> ``"hello World"``)."""
    case_boundary = re.compile(r"([a-z])([A-Z])")
    return case_boundary.sub(
        lambda hit: f"{hit.group(1)} {hit.group(2)}", text
    )
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from pydantic import BaseModel
2+
3+
class ImagePage(BaseModel):
    """Identifier and bounding box of one image located on a page."""

    # Unique identifier of the image.
    id: str

    # Edges of the bounding box.
    x0: float
    top: float
    x1: float
    bottom: float

    # Size of the bounding box.
    width: float
    height: float
11+
12+
class PageContent(BaseModel):
    """Extracted content of a single page: text, images and tables."""

    page_number: int
    text: str
    images: list[ImagePage]
    # Each table is a list of rows, each row a list of cells. pdfplumber's
    # Table.extract() reports a cell that is missing from the grid as None
    # (not ""), so None has to be accepted or validation rejects such tables.
    tables: list[list[list[str | None]]]

d-engine/src/logging.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import logging
2+
from enum import StrEnum
3+
4+
5+
# Log record layout used when the DEBUG level is selected: level, message,
# then the source location (file path, function name, line number).
LOG_FORMAT_DEBUG = "%(levelname)s:%(message)s:%(pathname)s:%(funcName)s:%(lineno)d"
6+
7+
8+
class LogLevels(StrEnum):
9+
info = "INFO"
10+
warn = "WARN"
11+
error = "ERROR"
12+
debug = "DEBUG"
13+
14+
15+
def configure_logging(log_level: str = LogLevels.error):
    """Configure the root logger through ``logging.basicConfig``.

    Args:
        log_level: Desired level name; matched case-insensitively against the
            values of ``LogLevels``. Anything else falls back to ERROR.

    The DEBUG level additionally switches the record layout to
    ``LOG_FORMAT_DEBUG``; all other levels use the default format.
    """
    requested = str(log_level).upper()
    known_levels = [member.value for member in LogLevels]

    if requested not in known_levels:
        # Unknown level name: fall back to ERROR.
        logging.basicConfig(level=LogLevels.error)
    elif requested == LogLevels.debug:
        logging.basicConfig(level=requested, format=LOG_FORMAT_DEBUG)
    else:
        logging.basicConfig(level=requested)

d-engine/src/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from fastapi import FastAPI
2+
from src.process.controller import router as process
3+
from .logging import configure_logging, LogLevels
4+
5+
# Set up logging at INFO level before the application object is created.
configure_logging(LogLevels.info)
6+
# FastAPI application instance.
app = FastAPI()
7+
# Register the routes exposed by the process controller's router.
app.include_router(process)

0 commit comments

Comments
 (0)