diff --git a/frontend/llama_stack_ui/distribution/ui/modules/local_extractors.py b/frontend/llama_stack_ui/distribution/ui/modules/local_extractors.py
new file mode 100644
--- /dev/null
+++ b/frontend/llama_stack_ui/distribution/ui/modules/local_extractors.py
@@ -0,0 +1,92 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import io
+import logging
+import os
+
+from docx import Document
+
+logger = logging.getLogger(__name__)
+
+LOCAL_SUPPORTED_EXTENSIONS = [".docx"]
+PROVIDER_SUPPORTED_EXTENSIONS = [".txt", ".pdf", ".md"]
+
+
+def extract_text_from_docx(file) -> str:
+    """Extract all text content from a .docx file.
+
+    Reads paragraph text first, then table cell text. A merged cell is
+    reported by python-docx once per grid column it spans; such consecutive
+    repeats within a row are emitted only once.
+
+    Args:
+        file: File-like object containing .docx data
+
+    Returns:
+        str: Extracted text with paragraphs and cells separated by newlines
+    """
+    doc = Document(file)
+    parts = [p.text for p in doc.paragraphs]
+
+    for table in doc.tables:
+        for row in table.rows:
+            previous_cell = None
+            for cell in row.cells:
+                # Skip the repeated entries produced by a horizontally
+                # merged cell so its text is not duplicated.
+                if cell is previous_cell:
+                    continue
+                previous_cell = cell
+                parts.append(cell.text)
+
+    return "\n".join(parts)
+
+
+def extract_text(file, filename: str) -> str:
+    """Extract text from a locally supported file type.
+
+    Routes to the appropriate extractor based on file extension.
+
+    Args:
+        file: File-like object with document data
+        filename: Original filename used to determine the file type
+
+    Returns:
+        str: Extracted plain text content
+
+    Raises:
+        ValueError: If the file extension is not locally supported
+    """
+    ext = os.path.splitext(filename)[1].lower()
+
+    if ext == ".docx":
+        return extract_text_from_docx(file)
+    else:
+        raise ValueError(f"Unsupported file type for local extraction: {ext}")
+
+
+def create_text_file_from_extracted_content(
+    content: str, original_filename: str
+) -> io.BytesIO:
+    """Wrap extracted text as an in-memory .txt file for the Llama Stack API.
+
+    Creates a BytesIO object with .name and .size attributes so it can be
+    passed directly to the files.create API endpoint.
+
+    Args:
+        content: Extracted plain text to wrap
+        original_filename: Original filename; the stem is reused with a .txt extension
+
+    Returns:
+        io.BytesIO: In-memory text file ready for upload
+    """
+    text_bytes = content.encode("utf-8")
+    text_file = io.BytesIO(text_bytes)
+    stem = os.path.splitext(original_filename)[0]
+    text_file.name = f"{stem}.txt"
+    text_file.size = len(text_bytes)
+    return text_file
diff --git a/frontend/llama_stack_ui/distribution/ui/page/distribution/vector_dbs.py b/frontend/llama_stack_ui/distribution/ui/page/distribution/vector_dbs.py
index 49571f6..65d83b7 100644
--- a/frontend/llama_stack_ui/distribution/ui/page/distribution/vector_dbs.py
+++ b/frontend/llama_stack_ui/distribution/ui/page/distribution/vector_dbs.py
@@ -13,6 +13,11 @@
 
 from llama_stack_ui.distribution.ui.modules.utils import get_vector_db_name, data_url_from_file
 from llama_stack_ui.distribution.ui.modules.api import llama_stack_api
+from llama_stack_ui.distribution.ui.modules.local_extractors import (
+    extract_text,
+    create_text_file_from_extracted_content,
+    LOCAL_SUPPORTED_EXTENSIONS,
+)
 
 # RAGDocument removed in 0.6.1 - using new files API instead
 
@@ -295,9 +300,22 @@ def _upload_documents_to_database(vector_db_name, uploaded_files, vector_db_id=N
     actual_db_id = vector_db_id or vector_db_name
     with st.spinner(f"Uploading {len(uploaded_files)} file(s) to '{vector_db_name}'..."):
         for uploaded_file in uploaded_files:
+            original_filename = uploaded_file.name
+            file_ext = os.path.splitext(original_filename)[1].lower()
+
+            # Auto-detect DOCX and extract locally, let server handle PDF/TXT
+            if file_ext in LOCAL_SUPPORTED_EXTENSIONS:
+                st.caption(f"📄 Extracting text from {original_filename}...")
+                text_content = extract_text(uploaded_file, original_filename)
+                file_to_upload = create_text_file_from_extracted_content(
+                    text_content, original_filename
+                )
+            else:
+                file_to_upload = uploaded_file
+
             # Step 1: Upload file content to get file_id
             file_obj = llama_stack_api.client.files.create(
-                file=uploaded_file,
+                file=file_to_upload,
                 purpose='assistants'
             )
 
@@ -305,7 +323,7 @@ def _upload_documents_to_database(vector_db_name, uploaded_files, vector_db_id=N
             llama_stack_api.client.vector_stores.files.create(
                 vector_store_id=actual_db_id,
                 file_id=file_obj.id,
-                attributes={"source": uploaded_file.name}
+                attributes={"source": original_filename}
             )
 
     # Success
diff --git a/frontend/pyproject.toml b/frontend/pyproject.toml
index cfe2f1d..103693d 100644
--- a/frontend/pyproject.toml
+++ b/frontend/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
     "llama-stack==__LLAMASTACK_VERSION__",
     "fire",
     "asyncpg",
+    "python-docx",
 ]
 
 [tool.setuptools]