Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import io
import logging
import os

from docx import Document

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Extensions whose text this module extracts locally (see extract_text).
LOCAL_SUPPORTED_EXTENSIONS = [".docx"]
# Extensions this module does not extract itself; presumably these are parsed
# by the provider/server after upload -- TODO confirm against the upload path.
PROVIDER_SUPPORTED_EXTENSIONS = [".txt", ".pdf", ".md"]


def extract_text_from_docx(file) -> str:
    """Extract all text content from a .docx file.

    Collects the text of every body paragraph first, then the text of every
    cell of every top-level table, walking each table row by row. Table text
    is therefore appended after all paragraph text rather than interleaved
    in document order.

    A merged table cell contributes its text exactly once, even though
    python-docx lists it once per grid position it spans.

    Args:
        file: File-like object containing .docx data

    Returns:
        str: Extracted text with paragraphs and cell texts separated by newlines
    """
    doc = Document(file)
    parts = [p.text for p in doc.paragraphs]

    for table in doc.tables:
        # python-docx repeats a merged cell in ``row.cells`` for every grid
        # column (horizontal merge) and every row (vertical merge) it covers.
        # The _Cell wrappers may differ between accesses, but they all share
        # one underlying <w:tc> element, so de-duplicate on that element.
        # Keeping the elements in the set also keeps them alive, so their
        # identity stays stable for the duration of the loop.
        seen_cells = set()
        for row in table.rows:
            for cell in row.cells:
                tc = cell._tc
                if tc in seen_cells:
                    continue
                seen_cells.add(tc)
                parts.append(cell.text)

    return "\n".join(parts)


def extract_text(file, filename: str) -> str:
    """Extract plain text from a file whose type is handled locally.

    The extension of ``filename`` (compared case-insensitively) selects the
    extractor. Only ``.docx`` is handled; it is delegated to
    ``extract_text_from_docx``.

    Args:
        file: File-like object with document data
        filename: Original filename; only its extension is inspected

    Returns:
        str: Extracted plain text content

    Raises:
        ValueError: If the file extension is not locally supported
    """
    _, raw_suffix = os.path.splitext(filename)
    suffix = raw_suffix.lower()

    # Guard clause: reject anything that is not a locally supported type.
    if suffix != ".docx":
        raise ValueError(f"Unsupported file type for local extraction: {suffix}")

    return extract_text_from_docx(file)


def create_text_file_from_extracted_content(
    content: str, original_filename: str
) -> io.BytesIO:
    """Package extracted text as an in-memory ``.txt`` file.

    The text is UTF-8 encoded into a BytesIO buffer, which is then given two
    extra attributes: ``name`` (the original filename with its extension
    replaced by ``.txt``) and ``size`` (the encoded length in bytes). The
    buffer is returned positioned at the start, ready to be read or uploaded.

    Args:
        content: Extracted plain text to wrap
        original_filename: Original filename; everything before its final
            extension is reused as the new name's stem

    Returns:
        io.BytesIO: In-memory text file carrying ``name`` and ``size``
    """
    payload = content.encode("utf-8")
    base, _ = os.path.splitext(original_filename)

    buffer = io.BytesIO(payload)
    buffer.size = len(payload)  # byte length, not character count
    buffer.name = base + ".txt"
    return buffer
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@

from llama_stack_ui.distribution.ui.modules.utils import get_vector_db_name, data_url_from_file
from llama_stack_ui.distribution.ui.modules.api import llama_stack_api
from llama_stack_ui.distribution.ui.modules.local_extractors import (
extract_text,
create_text_file_from_extracted_content,
LOCAL_SUPPORTED_EXTENSIONS,
)
# RAGDocument removed in 0.6.1 - using new files API instead


Expand Down Expand Up @@ -295,17 +300,30 @@ def _upload_documents_to_database(vector_db_name, uploaded_files, vector_db_id=N
actual_db_id = vector_db_id or vector_db_name
with st.spinner(f"Uploading {len(uploaded_files)} file(s) to '{vector_db_name}'..."):
for uploaded_file in uploaded_files:
original_filename = uploaded_file.name
file_ext = os.path.splitext(original_filename)[1].lower()

# Auto-detect DOCX and extract locally, let server handle PDF/TXT
if file_ext in LOCAL_SUPPORTED_EXTENSIONS:
st.caption(f"📄 Extracting text from {original_filename}...")
text_content = extract_text(uploaded_file, original_filename)
file_to_upload = create_text_file_from_extracted_content(
text_content, original_filename
)
else:
file_to_upload = uploaded_file

# Step 1: Upload file content to get file_id
file_obj = llama_stack_api.client.files.create(
file=uploaded_file,
file=file_to_upload,
purpose='assistants'
)

# Step 2: Add file to vector store (chunking handled server-side)
llama_stack_api.client.vector_stores.files.create(
vector_store_id=actual_db_id,
file_id=file_obj.id,
attributes={"source": uploaded_file.name}
attributes={"source": original_filename}
)

# Success
Expand Down
1 change: 1 addition & 0 deletions frontend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies = [
"llama-stack==__LLAMASTACK_VERSION__",
"fire",
"asyncpg",
"python-docx",
]

[tool.setuptools]
Expand Down