-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf.py
More file actions
71 lines (48 loc) · 1.91 KB
/
pdf.py
File metadata and controls
71 lines (48 loc) · 1.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from PyPDF2 import PdfReader, PdfWriter
from aiofiles import tempfile
import os
import aiohttp
import asyncio
import logging
import json
from tika import TikaCollect
# Number of queued page uploads that triggers a flush to the collector.
# Read once at import time from the TIKA_CHUNK environment variable
# (default "8"); a non-integer value raises ValueError on import.
CHUNK = int(os.getenv("TIKA_CHUNK", "8"))

# Module logger; messages are emitted under the "uvicorn.error" logger name.
logger = logging.getLogger("uvicorn.error")
async def single_page(headers, url, file_name: str, safe_dir: str, session):
    """Upload one file with an HTTP PUT and return the response.

    The file is opened only when it resolves to a location inside
    ``safe_dir``; otherwise nothing is read or sent.

    Args:
        headers: Request headers forwarded to ``session.put``.
        url: Target URL of the PUT request.
        file_name: Path of the file whose content is sent as the body.
        safe_dir: Directory that ``file_name`` must be located in.
        session: Object whose ``put(url, data=..., headers=...)`` is usable
            as an async context manager (``page_requests`` passes an
            ``aiohttp.ClientSession``).

    Returns:
        A ``(body, response_headers)`` tuple, where ``body`` is the result
        of ``await response.read()``, or ``None`` when ``file_name`` is not
        inside ``safe_dir``.
    """
    # Compare resolved paths component-wise. A plain string-prefix test
    # would also accept sibling directories ("/tmp/a" vs "/tmp/ab/x") and
    # paths that leave the directory through ".." segments or symlinks.
    root = os.path.normcase(os.path.realpath(safe_dir))
    target = os.path.normcase(os.path.realpath(file_name))
    try:
        inside = os.path.commonpath([root, target]) == root
    except ValueError:
        # commonpath refuses paths that cannot share a root
        # (for example, different drives on Windows).
        inside = False
    if not inside:
        return None

    with open(file_name, "rb") as file:
        async with session.put(url, data=file, headers=headers) as response:
            response_headers = response.headers
            body = await response.read()
            return (body, response_headers)
async def page_requests(headers, url, file):
    """Split a PDF into single-page files and upload them in batches.

    Every page of ``file`` is written to its own PDF inside a temporary
    directory. One ``single_page`` coroutine is then queued per page file,
    and the queue is handed to ``TikaCollect.async_collect`` each time it
    reaches ``CHUNK`` entries, plus once more for any remainder.

    Args:
        headers: Request headers passed through to ``single_page``.
        url: Target URL passed through to ``single_page``.
        file: PDF input given to ``PdfReader(stream=...)``.

    Returns:
        Whatever ``TikaCollect.get_request()`` returns after all batches
        have been collected.
    """
    reader = PdfReader(stream=file)
    page_files = []
    async with tempfile.TemporaryDirectory() as temp_dir:
        logger.info("Start pdf process!")

        # Phase 1: write each page out as a standalone one-page PDF.
        for index, pdf_page in enumerate(reader.pages):
            writer = PdfWriter()
            writer.add_page(pdf_page)
            target = os.path.abspath(
                os.path.join(temp_dir, f"page-{index:05d}.pdf")
            )
            # Only write files whose path starts with the temp directory.
            if not target.startswith(temp_dir):
                continue
            with open(target, 'wb') as handle:
                writer.write(handle)
            page_files.append(target)

        total = len(page_files)
        collector = TikaCollect()

        # Phase 2: queue one upload per page and flush in CHUNK-sized groups.
        async with aiohttp.ClientSession() as session:
            batch = []
            for number, page_file in enumerate(page_files):
                logger.info(f"Process page: {number} of {total}")
                batch.append(
                    single_page(headers, url, page_file, temp_dir, session)
                )
                if len(batch) >= CHUNK:
                    await collector.async_collect(batch)
                    batch.clear()
            # Flush whatever did not fill a complete batch.
            if batch:
                await collector.async_collect(batch)
                batch.clear()
    return collector.get_request()