diff --git a/AGENTS.md b/AGENTS.md index 91f9d58..b8da0d7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -8,7 +8,7 @@ Solr MCP Server is a Spring AI Model Context Protocol (MCP) server that enables - **Status:** Apache incubating project (v0.0.2-SNAPSHOT) - **Java:** 25+ (centralized in build.gradle.kts) -- **Framework:** Spring Boot 3.5.8, Spring AI 1.1.2 +- **Framework:** Spring Boot 3.5.8, Spring AI 1.1.4 - **License:** Apache 2.0 ## Common Commands @@ -44,7 +44,7 @@ PROFILES=http ./gradlew bootRun # HTTP mode Four service classes expose MCP tools via `@McpTool` annotations: - **SearchService** (`search/`) - Full-text search with filtering, faceting, sorting, pagination -- **IndexingService** (`indexing/`) - Document indexing supporting JSON, CSV, XML formats +- **IndexingService** (`indexing/`) - Document indexing supporting JSON, CSV, XML formats and file uploads (text extracted by chat client) - **CollectionService** (`metadata/`) - List collections, get stats, health checks - **SchemaService** (`metadata/`) - Schema introspection @@ -53,6 +53,7 @@ Four service classes expose MCP tools via `@McpTool` annotations: `indexing/documentcreator/` uses strategy pattern for format parsing: - `SolrDocumentCreator` - Common interface - `JsonDocumentCreator`, `CsvDocumentCreator`, `XmlDocumentCreator` - Format implementations +- `FileDocumentCreator` - File content indexing (text already extracted by AI chat client) - `IndexingDocumentCreator` - Orchestrator that delegates to format-specific creators - `FieldNameSanitizer` - Automatic field name validation for Solr compatibility diff --git a/src/main/java/org/apache/solr/mcp/server/indexing/IndexingService.java b/src/main/java/org/apache/solr/mcp/server/indexing/IndexingService.java index e52ca7b..ff7de74 100644 --- a/src/main/java/org/apache/solr/mcp/server/indexing/IndexingService.java +++ b/src/main/java/org/apache/solr/mcp/server/indexing/IndexingService.java @@ -356,6 +356,52 @@ public void 
indexXmlDocuments(@McpToolParam(description = "Solr collection to in
         indexDocuments(collection, schemalessDoc);
     }
 
+    /**
+     * Indexes a document from file content into a Solr collection.
+     *
+     * <p>
+     * This method accepts text content that has been extracted from a file by the
+     * AI chat client. When a user uploads a file (PDF, Word, etc.) through their
+     * chat client, the client extracts the text and passes it to this tool along
+     * with the original filename.
+     *
+     * <p>
+     * A single SolrInputDocument is created with the following fields:
+     *
+     * <ul>
+     * <li>{@code id} - a randomly generated UUID</li>
+     * <li>{@code content} - the extracted text content</li>
+     * <li>{@code filename} - the original filename</li>
+     * </ul>
+     *
+     * @param collection
+     *            the name of the Solr collection to index into
+     * @param content
+     *            the text content extracted from the file
+     * @param filename
+     *            the original filename (e.g. "report.pdf")
+     * @throws IOException
+     *             if there are I/O errors during Solr communication
+     * @throws SolrServerException
+     *             if Solr server encounters errors during indexing
+     * @see IndexingDocumentCreator#createSchemalessDocumentsFromFile(String,
+     *      String)
+     * @see #indexDocuments(String, List)
+     */
+    @PreAuthorize("isAuthenticated()")
+    @McpTool(name = "index-file-document", description = "Index a document from file content into a Solr collection. "
+            + "Use this when a user uploads a file (PDF, Word, Excel, etc.) and the text has "
+            + "already been extracted by the chat client. Pass the extracted text as content "
+            + "along with the original filename.")
+    public void indexFileDocument(@McpToolParam(description = "Solr collection to index into") String collection,
+            @McpToolParam(description = "Text content extracted from the file") String content,
+            @McpToolParam(description = "Original filename with extension (e.g. 'report.pdf')") String filename)
+            throws IOException, SolrServerException {
+        List<SolrInputDocument> documents = indexingDocumentCreator.createSchemalessDocumentsFromFile(content,
+                filename);
+        indexDocuments(collection, documents);
+    }
+
     /**
      * Indexes a list of SolrInputDocument objects into a Solr collection using
      * batch processing.
diff --git a/src/main/java/org/apache/solr/mcp/server/indexing/documentcreator/FileDocumentCreator.java b/src/main/java/org/apache/solr/mcp/server/indexing/documentcreator/FileDocumentCreator.java
new file mode 100644
index 0000000..3235517
--- /dev/null
+++ b/src/main/java/org/apache/solr/mcp/server/indexing/documentcreator/FileDocumentCreator.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.mcp.server.indexing.documentcreator;
+
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.UUID;
+import org.apache.solr.common.SolrInputDocument;
+import org.springframework.stereotype.Component;
+
+/**
+ * Creates a SolrInputDocument from text content extracted from a file.
+ *
+ * <p>
+ * This creator handles documents uploaded through AI chat clients, where the
+ * client has already extracted the text content from the original file (PDF,
+ * Word, etc.). It produces a single SolrInputDocument containing the text
+ * content and the original filename as metadata.
+ *
+ * <p>
+ * This class does not implement {@link SolrDocumentCreator} because it requires
+ * a filename parameter in addition to the content string.
+ *
+ * @see IndexingDocumentCreator#createSchemalessDocumentsFromFile(String,
+ *      String)
+ */
+@Component
+public class FileDocumentCreator {
+
+    private static final int MAX_INPUT_SIZE_BYTES = 10 * 1024 * 1024;
+
+    /**
+     * Creates a SolrInputDocument from the provided text content and filename.
+     *
+     * @param content
+     *            the text content extracted from the file
+     * @param filename
+     *            the original filename (stored as metadata for search and
+     *            filtering)
+     * @return a list containing a single SolrInputDocument
+     * @throws DocumentProcessingException
+     *             if content or filename is null or blank, or content exceeds the size limit
+     */
+    public List<SolrInputDocument> create(String content, String filename) throws DocumentProcessingException {
+        if (content == null || content.isBlank()) {
+            throw new DocumentProcessingException("File content cannot be null or empty");
+        }
+        if (filename == null || filename.isBlank()) {
+            throw new DocumentProcessingException("Filename cannot be null or empty");
+        }
+        if (content.getBytes(StandardCharsets.UTF_8).length > MAX_INPUT_SIZE_BYTES) {
+            throw new DocumentProcessingException(
+                    "Input too large: exceeds maximum size of " + MAX_INPUT_SIZE_BYTES + " bytes");
+        }
+
+        SolrInputDocument doc = new SolrInputDocument();
+        doc.addField("id", UUID.randomUUID().toString());
+        doc.addField("content", content);
+        doc.addField("filename", filename);
+        return List.of(doc);
+    }
+
+}
diff --git a/src/main/java/org/apache/solr/mcp/server/indexing/documentcreator/IndexingDocumentCreator.java b/src/main/java/org/apache/solr/mcp/server/indexing/documentcreator/IndexingDocumentCreator.java
index f20489a..0400926 100644
--- a/src/main/java/org/apache/solr/mcp/server/indexing/documentcreator/IndexingDocumentCreator.java
+++ b/src/main/java/org/apache/solr/mcp/server/indexing/documentcreator/IndexingDocumentCreator.java
@@ -62,11 +62,14 @@
public class IndexingDocumentCreator {
 
     private final JsonDocumentCreator jsonDocumentCreator;
 
+    private final FileDocumentCreator fileDocumentCreator;
+
     public IndexingDocumentCreator(XmlDocumentCreator xmlDocumentCreator, CsvDocumentCreator csvDocumentCreator,
-            JsonDocumentCreator jsonDocumentCreator) {
+            JsonDocumentCreator jsonDocumentCreator, FileDocumentCreator fileDocumentCreator) {
         this.xmlDocumentCreator = xmlDocumentCreator;
         this.csvDocumentCreator = csvDocumentCreator;
         this.jsonDocumentCreator = jsonDocumentCreator;
+        this.fileDocumentCreator = fileDocumentCreator;
     }
 
     /**
@@ -134,4 +137,33 @@ public List<SolrInputDocument> createSchemalessDocumentsFromXml(String xml) thro
 
         return xmlDocumentCreator.create(xml);
     }
+
+    /**
+     * Creates a SolrInputDocument from text content extracted from a file.
+     *
+     * <p>
+     * This method is intended for documents uploaded through AI chat clients, where
+     * the client has already extracted the text content from the original file.
+     *
+     * @param content
+     *            the text content extracted from the file
+     * @param filename
+     *            the original filename (stored as metadata)
+     * @return list of SolrInputDocument objects ready for indexing
+     * @throws IllegalArgumentException
+     *             if content or filename is null or blank
+     * @throws DocumentProcessingException
+     *             if {@link FileDocumentCreator} rejects the content (e.g. it exceeds the size limit)
+     * @see FileDocumentCreator
+     */
+    public List<SolrInputDocument> createSchemalessDocumentsFromFile(String content, String filename)
+            throws DocumentProcessingException {
+        if (content == null || content.isBlank()) {
+            throw new IllegalArgumentException("File content cannot be null or empty");
+        }
+        if (filename == null || filename.isBlank()) {
+            throw new IllegalArgumentException("Filename cannot be null or empty");
+        }
+        return fileDocumentCreator.create(content, filename);
+    }
 }
diff --git a/src/test/java/org/apache/solr/mcp/server/indexing/FileIndexingTest.java b/src/test/java/org/apache/solr/mcp/server/indexing/FileIndexingTest.java
new file mode 100644
index 0000000..3185e3d
--- /dev/null
+++ b/src/test/java/org/apache/solr/mcp/server/indexing/FileIndexingTest.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.mcp.server.indexing;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.List;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.mcp.server.indexing.documentcreator.IndexingDocumentCreator;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.TestPropertySource;
+
+/**
+ * Integration test for file document indexing through
+ * {@link IndexingDocumentCreator}.
+ */
+@SpringBootTest
+@TestPropertySource(locations = "classpath:application.properties")
+class FileIndexingTest {
+
+    @Autowired
+    private IndexingDocumentCreator indexingDocumentCreator;
+
+    @Test
+    void testCreateSchemalessDocumentsFromFile() throws Exception {
+        String content = "This is the text extracted from a PDF about Apache Solr full-text search.";
+
+        List<SolrInputDocument> documents = indexingDocumentCreator.createSchemalessDocumentsFromFile(content,
+                "test-document.pdf");
+
+        assertThat(documents).hasSize(1);
+
+        SolrInputDocument doc = documents.getFirst();
+        assertThat(doc.getFieldValue("content")).isEqualTo(content);
+        assertThat(doc.getFieldValue("filename")).isEqualTo("test-document.pdf");
+        assertThat(doc.getFieldValue("id")).isNotNull();
+    }
+
+    @Test
+    void testCreateSchemalessDocumentsFromFileWithNullContent() {
+        assertThatThrownBy(() -> indexingDocumentCreator.createSchemalessDocumentsFromFile(null, "test.txt"))
+                .isInstanceOf(IllegalArgumentException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void testCreateSchemalessDocumentsFromFileWithNullFilename() {
+        assertThatThrownBy(() -> indexingDocumentCreator.createSchemalessDocumentsFromFile("content", null))
+                .isInstanceOf(IllegalArgumentException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void testCreateSchemalessDocumentsFromFileWithInvalidContent() {
+        assertThatThrownBy(() -> indexingDocumentCreator.createSchemalessDocumentsFromFile(" ", "test.txt"))
+                .isInstanceOf(IllegalArgumentException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void testCreateSchemalessDocumentsFromFilePreservesMultilineContent() throws Exception {
+        String content = """
+                Chapter 1: Introduction to Search
+
+                Apache Solr provides distributed indexing, replication, and
+                load-balanced querying. It is designed for scalability and
+                fault tolerance.
+
+                Chapter 2: Getting Started
+                """;
+
+        List<SolrInputDocument> documents = indexingDocumentCreator.createSchemalessDocumentsFromFile(content,
+                "guide.docx");
+
+        assertThat(documents).hasSize(1);
+        assertThat(documents.getFirst().getFieldValue("content").toString()).contains("Chapter 1");
+        assertThat(documents.getFirst().getFieldValue("content").toString()).contains("scalability");
+    }
+
+}
diff --git a/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceDirectTest.java b/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceDirectTest.java
index 4d57b34..75ffa74 100644
--- a/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceDirectTest.java
+++ b/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceDirectTest.java
@@ -47,7 +47,7 @@ class IndexingServiceDirectTest {
     @BeforeEach
     void setUp() {
         indexingDocumentCreator = new IndexingDocumentCreator(new XmlDocumentCreator(), new CsvDocumentCreator(),
-                new JsonDocumentCreator());
+                new JsonDocumentCreator(), new FileDocumentCreator());
         indexingService = new IndexingService(solrClient, indexingDocumentCreator);
     }
 
diff --git a/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceTest.java b/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceTest.java
index c0bec69..1eb9e1b 100644
---
a/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceTest.java
+++ b/src/test/java/org/apache/solr/mcp/server/indexing/IndexingServiceTest.java
@@ -31,6 +31,7 @@
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.mcp.server.TestcontainersConfiguration;
 import org.apache.solr.mcp.server.indexing.documentcreator.CsvDocumentCreator;
+import org.apache.solr.mcp.server.indexing.documentcreator.FileDocumentCreator;
 import org.apache.solr.mcp.server.indexing.documentcreator.IndexingDocumentCreator;
 import org.apache.solr.mcp.server.indexing.documentcreator.JsonDocumentCreator;
 import org.apache.solr.mcp.server.indexing.documentcreator.XmlDocumentCreator;
@@ -77,8 +78,9 @@ void setUp() throws Exception {
 
         CsvDocumentCreator csvDocumentCreator = new CsvDocumentCreator();
         JsonDocumentCreator jsonDocumentCreator = new JsonDocumentCreator();
+        FileDocumentCreator fileDocumentCreator = new FileDocumentCreator();
         indexingDocumentCreator = new IndexingDocumentCreator(xmlDocumentCreator, csvDocumentCreator,
-                jsonDocumentCreator);
+                jsonDocumentCreator, fileDocumentCreator);
         indexingService = new IndexingService(solrClient, indexingDocumentCreator);
 
         searchService = new SearchService(solrClient);
@@ -757,6 +759,50 @@ void testDirectSanitizeFieldName() throws Exception {
             assertEquals("Value 6", doc.getFieldValue("trailing_underscores"));
             assertEquals("Value 7", doc.getFieldValue("multiple_underscores"));
         }
+
+        @Test
+        void testIndexFileDocumentAndSearch() throws Exception {
+            String content = "Apache Solr provides distributed indexing and search with unique_file_test_marker_42";
+            String filename = "solr-guide.pdf";
+
+            indexingService.indexFileDocument(COLLECTION_NAME, content, filename);
+
+            // Search by content
+            SearchResponse result = searchService.search(COLLECTION_NAME, "content:unique_file_test_marker_42", null, null,
+                    null, null, null);
+
+            assertNotNull(result);
+            List<Map<String, Object>> documents = result.documents();
+            assertEquals(1, documents.size());
+
+            Map<String, Object> doc = documents.getFirst();
+            assertNotNull(doc.get("content"));
+            String returnedContent = (String) getFieldValue(doc, "content");
+            assertTrue(returnedContent.contains("unique_file_test_marker_42"));
+            assertEquals("solr-guide.pdf", getFieldValue(doc, "filename"));
+            assertNotNull(doc.get("id"));
+        }
+
+        @Test
+        void testIndexFileDocumentSearchByFilename() throws Exception {
+            String content = "This is a quarterly financial report for Q3 2025.";
+            String filename = "q3_2025_financials_unique_test.xlsx";
+
+            indexingService.indexFileDocument(COLLECTION_NAME, content, filename);
+
+            // Search by filename
+            SearchResponse result = searchService.search(COLLECTION_NAME, "filename:q3_2025_financials_unique_test.xlsx",
+                    null, null, null, null, null);
+
+            assertNotNull(result);
+            List<Map<String, Object>> documents = result.documents();
+            assertEquals(1, documents.size());
+
+            Map<String, Object> doc = documents.getFirst();
+            assertEquals("q3_2025_financials_unique_test.xlsx", getFieldValue(doc, "filename"));
+            String returnedContent = (String) getFieldValue(doc, "content");
+            assertTrue(returnedContent.contains("quarterly financial report"));
+        }
     }
 
     @Nested
@@ -1015,6 +1061,36 @@ void indexCsvDocuments_WhenSolrClientThrowsIOException_ShouldPropagateException(
             verify(solrClient).add(eq("test_collection"), any(SolrInputDocument.class));
         }
 
+        @Test
+        void indexFileDocument_WithValidInput_ShouldIndexDocuments() throws Exception {
+            String content = "Extracted text from PDF";
+            String filename = "report.pdf";
+            List<SolrInputDocument> mockDocs = createMockDocuments(1);
+            when(indexingDocumentCreator.createSchemalessDocumentsFromFile(content, filename)).thenReturn(mockDocs);
+            when(solrClient.add(eq("test_collection"), any(Collection.class))).thenReturn(null);
+            when(solrClient.commit("test_collection")).thenReturn(null);
+
+            indexingService.indexFileDocument("test_collection", content, filename);
+
+            verify(indexingDocumentCreator).createSchemalessDocumentsFromFile(content, filename);
+            verify(solrClient).add(eq("test_collection"), any(Collection.class));
+            verify(solrClient).commit("test_collection");
+        }
+
+        @Test
+        void indexFileDocument_WhenDocumentCreatorThrowsException_ShouldPropagateException() throws Exception {
+            String content = "";
+            String filename = "report.pdf";
+            when(indexingDocumentCreator.createSchemalessDocumentsFromFile(content, filename))
+                    .thenThrow(new IllegalArgumentException("File content cannot be null or empty"));
+
+            assertThrows(IllegalArgumentException.class, () -> {
+                indexingService.indexFileDocument("test_collection", content, filename);
+            });
+            verify(solrClient, never()).add(anyString(), any(Collection.class));
+            verify(solrClient, never()).commit(anyString());
+        }
+
         @Test
         void indexDocuments_WithRuntimeException_ShouldRetryIndividually() throws Exception {
             List<SolrInputDocument> docs = createMockDocuments(2);
diff --git a/src/test/java/org/apache/solr/mcp/server/indexing/documentcreator/FileDocumentCreatorTest.java b/src/test/java/org/apache/solr/mcp/server/indexing/documentcreator/FileDocumentCreatorTest.java
new file mode 100644
index 0000000..77ee618
--- /dev/null
+++ b/src/test/java/org/apache/solr/mcp/server/indexing/documentcreator/FileDocumentCreatorTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.mcp.server.indexing.documentcreator;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.List;
+import org.apache.solr.common.SolrInputDocument;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+class FileDocumentCreatorTest {
+
+    private FileDocumentCreator fileDocumentCreator;
+
+    @BeforeEach
+    void setUp() {
+        fileDocumentCreator = new FileDocumentCreator();
+    }
+
+    @Test
+    void create_withValidInput_shouldReturnDocumentWithContentAndFilename() {
+        String content = "This is the text extracted from a PDF about Apache Solr.";
+
+        List<SolrInputDocument> docs = fileDocumentCreator.create(content, "report.pdf");
+
+        assertThat(docs).hasSize(1);
+        SolrInputDocument doc = docs.getFirst();
+        assertThat(doc.getFieldValue("content")).isEqualTo(content);
+        assertThat(doc.getFieldValue("filename")).isEqualTo("report.pdf");
+        assertThat(doc.getFieldValue("id")).isNotNull();
+    }
+
+    @Test
+    void create_shouldGenerateUniqueIds() {
+        String content = "Some document content.";
+
+        List<SolrInputDocument> docs1 = fileDocumentCreator.create(content, "file1.txt");
+        List<SolrInputDocument> docs2 = fileDocumentCreator.create(content, "file2.txt");
+
+        assertThat(docs1.getFirst().getFieldValue("id")).isNotEqualTo(docs2.getFirst().getFieldValue("id"));
+    }
+
+    @Test
+    void create_withNullContent_shouldThrowException() {
+        assertThatThrownBy(() -> fileDocumentCreator.create(null, "test.txt"))
+                .isInstanceOf(DocumentProcessingException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void create_withEmptyContent_shouldThrowException() {
+        assertThatThrownBy(() -> fileDocumentCreator.create("", "test.txt"))
+                .isInstanceOf(DocumentProcessingException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void create_withBlankContent_shouldThrowException() {
+        assertThatThrownBy(() -> fileDocumentCreator.create(" ", "test.txt"))
+                .isInstanceOf(DocumentProcessingException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void create_withNullFilename_shouldThrowException() {
+        assertThatThrownBy(() -> fileDocumentCreator.create("Some content", null))
+                .isInstanceOf(DocumentProcessingException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void create_withEmptyFilename_shouldThrowException() {
+        assertThatThrownBy(() -> fileDocumentCreator.create("Some content", ""))
+                .isInstanceOf(DocumentProcessingException.class).hasMessageContaining("null or empty");
+    }
+
+    @Test
+    void create_withOversizedContent_shouldThrowException() {
+        String largeContent = "x".repeat(11 * 1024 * 1024);
+
+        assertThatThrownBy(() -> fileDocumentCreator.create(largeContent, "large.pdf"))
+                .isInstanceOf(DocumentProcessingException.class).hasMessageContaining("too large");
+    }
+
+    @Test
+    void create_withMultilineContent_shouldPreserveContent() {
+        String content = """
+                Chapter 1: Introduction
+
+                Apache Solr is an open source search platform.
+                It provides full-text search, faceting, and more.
+
+                Chapter 2: Architecture
+                """;
+
+        List<SolrInputDocument> docs = fileDocumentCreator.create(content, "book.docx");
+
+        assertThat(docs).hasSize(1);
+        assertThat(docs.getFirst().getFieldValue("content").toString()).contains("Chapter 1");
+        assertThat(docs.getFirst().getFieldValue("content").toString()).contains("Architecture");
+    }
+
+}