Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
270 changes: 270 additions & 0 deletions week1/community-contributions/day2-jmmz-summarize-PDF-database.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "3b77d329",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import ollama\n",
"import kagglehub\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"from pypdf import PdfReader\n",
"from IPython.display import Markdown, display"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fc0475b",
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables\n",
"load_dotenv()\n",
"api_key = os.getenv(\"OPENAI_API_KEY\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2cdeaeee",
"metadata": {},
"outputs": [],
"source": [
"!pip install kagglehub pypdf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8720f777",
"metadata": {},
"outputs": [],
"source": [
"import kagglehub\n",
"\n",
"# Download latest version\n",
"path = kagglehub.dataset_download(\"manisha717/dataset-of-pdf-files\")\n",
"\n",
"print(\"Path to dataset files:\", path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a0a9497",
"metadata": {},
"outputs": [],
"source": [
"class ResumePdf:\n",
"\n",
" def __init__(self, system_prompt: str = None, model_by_api: str = 'gpt'):\n",
" if system_prompt is None:\n",
" self.system_prompt = \"Eres un escritor experto, sabes a la perfección cómo resumir cualquier tipo de texto.\\\n",
" Tu misión es resumir con claridad, de manera que resaltes los puntos claves, el autor, la fecha y sobre \\\n",
" todo, debes ser capaz de conservar el sentido y sentimiento del texto según el área de conocimiento \\\n",
" (podría ser una carta de renuncia, una carta de amor, una carta para informar sobre avances \\\n",
" en un negocio, un paper científico, etc.) \\\n",
" Es altamente probable que la información esté en inglés, por tanto, debes traducir al español siempre.\\\n",
" Responde en formato markdown.\"\n",
" else:\n",
" self.system_prompt = system_prompt\n",
"\n",
" self.model_by_api = model_by_api\n",
" self.ollama_api = \"http://localhost:11434/api/chat\"\n",
"\n",
" def get_text_pdf(self, path):\n",
" reader = PdfReader(path)\n",
" full_text = '\\n'.join((page.extract_text() for i, page in enumerate(reader.pages) if i != 0))\n",
" return full_text\n",
"\n",
" def pdf_context(self, text: str) -> str:\n",
" user_prompt = \"Por favor, resume el contenido del \\\n",
" pdf en máximo 500 palabras. Trata de obtener la idea global y describir las secciones \\\n",
" más importantes del texto\\n\\n\"\n",
" user_prompt += f\"{text}\"\n",
"\n",
" return user_prompt\n",
"\n",
" def messages_for(self, path_pdf):\n",
" full_text = self.get_text_pdf(path=path_pdf)\n",
" return [\n",
" {\"role\": \"system\", \"content\": self.system_prompt},\n",
" {\"role\": \"user\", \"content\": self.pdf_context(text=full_text)}\n",
" ]\n",
" \n",
" def display_summary(self, summary):\n",
" display(Markdown(summary))\n",
" \n",
" def summarize_pdf(self, path_pdf):\n",
" messages = self.messages_for(path_pdf)\n",
"\n",
" if self.model_by_api == 'gpt':\n",
" client = OpenAI(api_key=api_key)\n",
" response = client.chat.completions.create(model=\"gpt-4o-mini\", \n",
" messages=messages)\n",
" \n",
" summary = response.choices[0].message.content\n",
"\n",
" elif self.model_by_api == 'ollama_local':\n",
" response = ollama.chat(model=\"llama3.2\", \n",
" messages=messages)\n",
" summary = response['message']['content']\n",
" \n",
" elif self.model_by_api == 'ollama_openai':\n",
" ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', \n",
" api_key='ollama')\n",
" response = ollama_via_openai.chat.completions.create(\n",
" model=\"llama3.2\",\n",
" messages=messages\n",
" )\n",
" summary = response.choices[0].message.content\n",
" else:\n",
" raise ValueError(f\"Unknown model_by_api: {self.model_by_api}\")\n",
" \n",
" return summary\n",
" \n",
" def generate_resumens(self, lista_paths_pdf):\n",
" for path_pdf in lista_paths_pdf:\n",
" try:\n",
" summary = self.summarize_pdf(path_pdf=path_pdf)\n",
" self.display_summary(summary) \n",
" yield {\n",
" \"path\": path_pdf,\n",
" \"summary\": summary\n",
" }\n",
" except Exception as e:\n",
" self.display_summary(f'## Error {e} with document {path_pdf}')\n",
" yield {\n",
" \"path\": path_pdf,\n",
" \"error\": str(e)\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "003fc878",
"metadata": {},
"outputs": [],
"source": [
"path_pdfs_database = \"path/to/pdf_bank\"\n",
"all_entries = os.listdir(path_pdfs_database)\n",
"files = [entry for entry in all_entries if os.path.isfile(os.path.join(path_pdfs_database, entry))]\n",
"files_path = [os.path.join(path_pdfs_database, f) for f in files]\n",
"\n",
"model_api = 'gpt' # o \"ollama_local\", \"ollama_openai\"\n",
"resumidor = ResumePdf(model_by_api=model_api)\n",
"\n",
"resumens = resumidor.generate_resumens(files_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "316739c2",
"metadata": {},
"outputs": [],
"source": [
"resumen = next(resumens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21768e5e",
"metadata": {},
"outputs": [],
"source": [
"resumen"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc14f8ec",
"metadata": {},
"outputs": [],
"source": [
"resumen = next(resumens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3db5b051",
"metadata": {},
"outputs": [],
"source": [
"resumen"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef7a95f9",
"metadata": {},
"outputs": [],
"source": [
"resumen = next(resumens)\n",
"resumen"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8e549f3",
"metadata": {},
"outputs": [],
"source": [
"# Store summaries for postprocessing\n",
"\n",
"path_pdfs_database = \"C:/Users/mario/.cache/kagglehub/datasets/manisha717/dataset-of-pdf-files/versions/1/Pdf/\"\n",
"all_entries = os.listdir(path_pdfs_database)\n",
"files = [entry for entry in all_entries if os.path.isfile(os.path.join(path_pdfs_database, entry))]\n",
"files_path = [os.path.join(path_pdfs_database, f) for f in files]\n",
"\n",
"model_api = 'gpt' # o \"ollama_local\", \"ollama_openai\"\n",
"resumidor = ResumePdf(model_by_api=model_api)\n",
"\n",
"resumens = resumidor.generate_resumens(files_path)\n",
"\n",
"summaries = {}\n",
"\n",
"for i, resumen in enumerate(resumens):\n",
" if 'error' in resumen:\n",
" print(f\"❌ Error procesando {resumen['path']}: {resumen['error']}\")\n",
" else:\n",
" print(f\"✅ {resumen['path']}:\\n\")\n",
" summaries[resumen['path']] = resumen['summary']\n",
" print(\"\\n\" + \"=\"*80 + \"\\n\")\n",
"\n",
" if i == 3:\n",
" break"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}