From 3b0a0afca51cfd9facd3558b423c2549783f81c3 Mon Sep 17 00:00:00 2001 From: Eric Deandrea Date: Mon, 15 Jun 2026 15:56:29 -0400 Subject: [PATCH] feat(api): support /v1/convert/source/batch endpoint Add batch document conversion support for the new endpoint introduced in docling-serve v1.22.0. The batch endpoint processes multiple document sources asynchronously, returning a task ID for tracking progress. New types: - BatchConvertDocumentRequest: request model with sources, target, options, and optional webhook callbacks - CallbackSpec: webhook callback specification for progress notifications - Jackson2/3ValidationErrorDetailListDeserializer: handle server 422 responses where detail is a string instead of a list API changes: - DoclingServeConvertApi: add convertSourceBatch() and convertSourceBatchAsync() methods - S3Source: add maxNumElements field for capping objects in batch mode - ValidationError: fix deserialization when detail is a plain string Closes #540 Signed-off-by: Eric Deandrea Signed-off-by: Eric Deandrea --- .../serve/api/DoclingServeConvertApi.java | 38 +++ .../request/BatchConvertDocumentRequest.java | 93 ++++++ .../api/convert/request/CallbackSpec.java | 58 ++++ .../api/convert/request/source/S3Source.java | 11 + ...ValidationErrorDetailListDeserializer.java | 24 ++ ...ValidationErrorDetailListDeserializer.java | 24 ++ .../serve/api/validation/ValidationError.java | 6 + .../serve/client/DoclingServeClient.java | 32 ++- .../client/operations/ConvertOperations.java | 14 + .../AbstractDoclingServeClientTests.java | 271 ++++++++++++++++++ docs/src/doc/docs/whats-new.md | 4 + 11 files changed, 569 insertions(+), 6 deletions(-) create mode 100644 docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/BatchConvertDocumentRequest.java create mode 100644 docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/CallbackSpec.java create mode 100644 
docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson2ValidationErrorDetailListDeserializer.java create mode 100644 docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson3ValidationErrorDetailListDeserializer.java diff --git a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/DoclingServeConvertApi.java b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/DoclingServeConvertApi.java index a8503462..91ff50b1 100644 --- a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/DoclingServeConvertApi.java +++ b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/DoclingServeConvertApi.java @@ -6,8 +6,10 @@ import org.jspecify.annotations.Nullable; +import ai.docling.serve.api.convert.request.BatchConvertDocumentRequest; import ai.docling.serve.api.convert.request.ConvertDocumentRequest; import ai.docling.serve.api.convert.response.ConvertDocumentResponse; +import ai.docling.serve.api.task.response.TaskStatusPollResponse; import ai.docling.serve.api.util.FileUtils; import ai.docling.serve.api.util.ValidationUtils; @@ -107,6 +109,42 @@ default CompletionStage convertFilesAsync(@Nullable Con return convertSourceAsync(createRequest(request, files)); } + /** + * Submits a batch conversion request for processing multiple document sources asynchronously. + * + *

This method posts the batch request to the server, which returns a task status containing + * a task ID. The caller can then use {@link DoclingServeTaskApi#pollTaskStatus(ai.docling.serve.api.task.request.TaskStatusPollRequest)} + * and {@link DoclingServeTaskApi#convertTaskResult(ai.docling.serve.api.task.request.TaskResultRequest)} + * to track and retrieve results. + * + * @param request the {@link BatchConvertDocumentRequest} containing the sources, target, conversion options, and optional callbacks. + * @return a {@link TaskStatusPollResponse} containing the task ID and initial status. + * @throws ai.docling.serve.api.validation.ValidationException If request validation fails for any reason. + */ + TaskStatusPollResponse convertSourceBatch(BatchConvertDocumentRequest request); + + /** + * Submits a batch conversion request and automatically polls for completion. + * + *

This method submits the batch request, polls the task status in the background, + * and completes the returned future with the conversion result when all documents + * have been processed. + * + *

Example usage: + *

<pre>{@code
+   * client.convertSourceBatchAsync(request)
+   *     .thenAccept(response -> System.out.println("Batch complete"))
+   *     .exceptionally(ex -> { ex.printStackTrace(); return null; });
+   * }</pre>
+ * + * @param request the {@link BatchConvertDocumentRequest} containing the sources, target, conversion options, and optional callbacks. + * @return a {@link CompletionStage} that completes with the {@link ConvertDocumentResponse} + * when all documents have been processed, or completes exceptionally if the + * batch conversion fails or times out. + * @throws ai.docling.serve.api.validation.ValidationException If request validation fails for any reason. + */ + CompletionStage convertSourceBatchAsync(BatchConvertDocumentRequest request); + private ConvertDocumentRequest createRequest(@Nullable ConvertDocumentRequest request, Path... files) { ValidationUtils.ensureNotEmpty(files, "files"); diff --git a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/BatchConvertDocumentRequest.java b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/BatchConvertDocumentRequest.java new file mode 100644 index 00000000..ec5d55ba --- /dev/null +++ b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/BatchConvertDocumentRequest.java @@ -0,0 +1,93 @@ +package ai.docling.serve.api.convert.request; + +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonSetter; +import com.fasterxml.jackson.annotation.Nulls; + +import ai.docling.serve.api.convert.request.options.ConvertDocumentOptions; +import ai.docling.serve.api.convert.request.source.Source; +import ai.docling.serve.api.convert.request.target.Target; + +/** + * Represents a request to batch convert document sources. The batch endpoint processes multiple + * documents asynchronously and returns a task ID for tracking progress. Sources can be HTTP URLs + * or S3 buckets, and results are delivered to a presigned URL or S3 target. + * + *

This class is serialized into JSON to conform to the API specification using + * {@link JsonProperty} annotations. Fields with {@code null} values or empty collections + * are omitted from the serialized JSON using {@link JsonInclude}. + */ +@JsonInclude(JsonInclude.Include.NON_EMPTY) +@tools.jackson.databind.annotation.JsonDeserialize(builder = BatchConvertDocumentRequest.Builder.class) +@lombok.extern.jackson.Jacksonized +@lombok.Builder(toBuilder = true) +@lombok.Getter +@lombok.ToString +public class BatchConvertDocumentRequest { + /** + * List of document sources to be converted. + * Each source can be an HTTP URL or S3 reference. + * + * @param sources the list of document sources + * @return the list of document sources + */ + @JsonProperty("sources") + @JsonSetter(nulls = Nulls.AS_EMPTY) + @lombok.Singular + private List sources; + + /** + * Target specification for where the converted documents should be delivered. + * Must be either a {@link ai.docling.serve.api.convert.request.target.PresignedUrlTarget} + * or {@link ai.docling.serve.api.convert.request.target.S3Target}. + * + * @param target the output target + * @return the output target + */ + @JsonProperty("target") + @lombok.NonNull + private Target target; + + /** + * Options controlling the document conversion process. + * Includes settings for OCR, output formats, processing pipelines, and more. + * + * @param options the conversion options + * @return the conversion options + */ + @JsonProperty("options") + @lombok.NonNull + @lombok.Builder.Default + private ConvertDocumentOptions options = ConvertDocumentOptions.builder().build(); + + /** + * Webhook callbacks for receiving progress notifications during batch processing. 
+ * + * @param callbacks the list of callback specifications + * @return the list of callback specifications + */ + @JsonProperty("callbacks") + @JsonSetter(nulls = Nulls.AS_EMPTY) + @lombok.Singular + private List callbacks; + + /** + * Builder for creating {@link BatchConvertDocumentRequest} instances. + * Generated by Lombok's {@code @Builder} annotation. + * + *

Builder methods: + *

<ul>
+   *   <li>{@code source(Source)} - Add a single document source</li>
+   *   <li>{@code sources(List<Source>)} - Set the list of document sources</li>
+   *   <li>{@code target(Target)} - Set the output target</li>
+   *   <li>{@code options(ConvertDocumentOptions)} - Set the conversion options</li>
+   *   <li>{@code callback(CallbackSpec)} - Add a single callback specification</li>
+   *   <li>{@code callbacks(List<CallbackSpec>)} - Set the list of callback specifications</li>
+   * </ul>
+ */ + @tools.jackson.databind.annotation.JsonPOJOBuilder(withPrefix = "") + public static class Builder { } +} diff --git a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/CallbackSpec.java b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/CallbackSpec.java new file mode 100644 index 00000000..984183ff --- /dev/null +++ b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/CallbackSpec.java @@ -0,0 +1,58 @@ +package ai.docling.serve.api.convert.request; + +import java.net.URI; +import java.util.Map; + +import org.jspecify.annotations.Nullable; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonSetter; +import com.fasterxml.jackson.annotation.Nulls; + +/** + * Represents a webhook callback specification for batch conversion progress notifications. + * When configured, the server sends POST requests to the specified URL with progress updates + * as documents are processed. + */ +@JsonInclude(JsonInclude.Include.NON_EMPTY) +@tools.jackson.databind.annotation.JsonDeserialize(builder = CallbackSpec.Builder.class) +@lombok.extern.jackson.Jacksonized +@lombok.Builder(toBuilder = true) +@lombok.Getter +@lombok.ToString +public class CallbackSpec { + /** + * The webhook URL that receives POST progress updates. + * + * @param url the webhook URL + * @return the webhook URL + */ + @JsonProperty("url") + @lombok.NonNull + private URI url; + + /** + * Additional headers sent with callback requests. + * + * @param headers the additional headers + * @return the additional headers + */ + @JsonProperty("headers") + @JsonSetter(nulls = Nulls.AS_EMPTY) + @lombok.Singular + private Map headers; + + /** + * Custom CA certificate (PEM) for endpoint verification. 
+ * + * @param caCert the CA certificate in PEM format + * @return the CA certificate in PEM format + */ + @JsonProperty("ca_cert") + @Nullable + private String caCert; + + @tools.jackson.databind.annotation.JsonPOJOBuilder(withPrefix = "") + public static class Builder { } +} diff --git a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/source/S3Source.java b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/source/S3Source.java index cb9d1336..59469af7 100644 --- a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/source/S3Source.java +++ b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/convert/request/source/S3Source.java @@ -69,6 +69,17 @@ public final class S3Source extends Source { @lombok.Builder.Default private boolean verifySsl = true; + /** + * Maximum number of elements to read from S3. When set, limits the number of objects processed. + * Useful for batch conversions where the S3 source may contain a large number of objects. + * + * @param maxNumElements Maximum number of elements to read from S3. + * @return Maximum number of elements to read from S3. 
+   */
+  @JsonProperty("max_num_elements")
+  @org.jspecify.annotations.Nullable
+  private Integer maxNumElements;
+
   @tools.jackson.databind.annotation.JsonPOJOBuilder(withPrefix = "")
   public static class Builder { }
 }
diff --git a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson2ValidationErrorDetailListDeserializer.java b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson2ValidationErrorDetailListDeserializer.java
new file mode 100644
index 00000000..89f69cef
--- /dev/null
+++ b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson2ValidationErrorDetailListDeserializer.java
@@ -0,0 +1,51 @@
+package ai.docling.serve.api.serialization;
+
+import java.io.IOException;
+import java.util.List;
+
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+
+import ai.docling.serve.api.validation.ValidationErrorDetail;
+
+/**
+ * Jackson 2 deserializer that reads a value as a list of {@link ValidationErrorDetail}.
+ *
+ * <p>A plain JSON string is wrapped in a single detail whose message is that string;
+ * any other value is read as a list of details.
+ */
+public class Jackson2ValidationErrorDetailListDeserializer extends JsonDeserializer<List<ValidationErrorDetail>> {
+
+  /**
+   * Reads the current value as a list of validation error details.
+   *
+   * @param p the parser positioned on the value to read
+   * @param ctxt the active deserialization context
+   * @return a single-element list for a JSON string, otherwise the list read from the value
+   * @throws IOException if the value cannot be read
+   */
+  @Override
+  public List<ValidationErrorDetail> deserialize(JsonParser p, DeserializationContext ctxt) throws IOException {
+    if (p.currentToken() == JsonToken.VALUE_STRING) {
+      return List.of(ValidationErrorDetail.builder().message(p.getText()).build());
+    }
+
+    var type = ctxt.getTypeFactory().constructCollectionType(List.class, ValidationErrorDetail.class);
+    return ctxt.readValue(p, type);
+  }
+
+  /**
+   * Returns an empty list as the "empty" value. The inherited default delegates to the
+   * null value, so null handling configured as {@code Nulls.AS_EMPTY} would otherwise
+   * produce {@code null} instead of an empty list.
+   *
+   * @param ctxt the active deserialization context
+   * @return an immutable empty list
+   */
+  @Override
+  public List<ValidationErrorDetail> getEmptyValue(DeserializationContext ctxt) {
+    return List.of();
+  }
+}
diff --git a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson3ValidationErrorDetailListDeserializer.java b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson3ValidationErrorDetailListDeserializer.java
new file mode 100644
index 00000000..a66428b8
--- /dev/null
+++ b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/serialization/Jackson3ValidationErrorDetailListDeserializer.java
@@ -0,0 +1,51 @@
+package ai.docling.serve.api.serialization;
+
+import tools.jackson.core.JacksonException;
+import tools.jackson.core.JsonParser;
+import tools.jackson.core.JsonToken;
+import tools.jackson.databind.DeserializationContext;
+import tools.jackson.databind.ValueDeserializer;
+
+import java.util.List;
+
+import ai.docling.serve.api.validation.ValidationErrorDetail;
+
+/**
+ * Jackson 3 deserializer that reads a value as a list of {@link ValidationErrorDetail}.
+ *
+ * <p>A plain JSON string is wrapped in a single detail whose message is that string;
+ * any other value is read as a list of details.
+ */
+public class Jackson3ValidationErrorDetailListDeserializer extends ValueDeserializer<List<ValidationErrorDetail>> {
+
+  /**
+   * Reads the current value as a list of validation error details.
+   *
+   * @param p the parser positioned on the value to read
+   * @param ctxt the active deserialization context
+   * @return a single-element list for a JSON string, otherwise the list read from the value
+   * @throws JacksonException if the value cannot be read
+   */
+  @Override
+  public List<ValidationErrorDetail> deserialize(JsonParser p, DeserializationContext ctxt) throws JacksonException {
+    if (p.currentToken() == JsonToken.VALUE_STRING) {
+      return List.of(ValidationErrorDetail.builder().message(p.getText()).build());
+    }
+
+    var type = ctxt.getTypeFactory().constructCollectionType(List.class, ValidationErrorDetail.class);
+    return ctxt.readValue(p, type);
+  }
+
+  /**
+   * Returns an empty list as the "empty" value. The inherited default delegates to the
+   * null value, so null handling configured as {@code Nulls.AS_EMPTY} would otherwise
+   * produce {@code null} instead of an empty list.
+   *
+   * @param ctxt the active deserialization context
+   * @return an immutable empty list
+   */
+  @Override
+  public List<ValidationErrorDetail> getEmptyValue(DeserializationContext ctxt) {
+    return List.of();
+  }
+}
diff --git a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/validation/ValidationError.java b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/validation/ValidationError.java
index 821faba1..e3a23102 100644
--- a/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/validation/ValidationError.java
+++ b/docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/validation/ValidationError.java
@@ -6,6 +6,10 @@
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonSetter;
 import com.fasterxml.jackson.annotation.Nulls;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+
+import ai.docling.serve.api.serialization.Jackson2ValidationErrorDetailListDeserializer;
+import ai.docling.serve.api.serialization.Jackson3ValidationErrorDetailListDeserializer;
 
 /**
  * Represents a validation error with customizable serialization and deserialization behavior.
@@ -34,6 +38,8 @@ public class ValidationError { */ @JsonProperty("detail") @JsonSetter(nulls = Nulls.AS_EMPTY) + @JsonDeserialize(using = Jackson2ValidationErrorDetailListDeserializer.class) + @tools.jackson.databind.annotation.JsonDeserialize(using = Jackson3ValidationErrorDetailListDeserializer.class) @lombok.Singular private List errorDetails; diff --git a/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/DoclingServeClient.java b/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/DoclingServeClient.java index eb172aa0..acbde249 100644 --- a/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/DoclingServeClient.java +++ b/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/DoclingServeClient.java @@ -21,6 +21,7 @@ import java.util.Optional; import java.util.concurrent.CompletionStage; import java.util.concurrent.Flow.Subscriber; +import java.util.stream.Collectors; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; @@ -38,6 +39,7 @@ import ai.docling.serve.api.clear.request.ClearConvertersRequest; import ai.docling.serve.api.clear.request.ClearResultsRequest; import ai.docling.serve.api.clear.response.ClearResponse; +import ai.docling.serve.api.convert.request.BatchConvertDocumentRequest; import ai.docling.serve.api.convert.request.ConvertDocumentRequest; import ai.docling.serve.api.convert.response.ConvertDocumentResponse; import ai.docling.serve.api.health.HealthCheckResponse; @@ -47,6 +49,7 @@ import ai.docling.serve.api.util.Utils; import ai.docling.serve.api.util.ValidationUtils; import ai.docling.serve.api.validation.ValidationError; +import ai.docling.serve.api.validation.ValidationErrorDetail; import ai.docling.serve.api.validation.ValidationException; import ai.docling.serve.client.operations.ChunkOperations; import ai.docling.serve.client.operations.ClearOperations; @@ -302,8 +305,8 @@ protected T getResponse(HttpRequest request, HttpResponse 
response, Class var statusCode = response.statusCode(); - if(statusCode >= 400) { - if(StreamResponse.class.equals(expectedReturnType)) { + if (statusCode >= 400) { + if (StreamResponse.class.equals(expectedReturnType)) { // typical 4XX & 5XX responses are usually accompanied by JSON response bodies // hence, reading the stream here. try (InputStream is = (InputStream) body){ @@ -313,17 +316,24 @@ protected T getResponse(HttpRequest request, HttpResponse response, Class } } - if(statusCode == 422) { + if (statusCode == 422) { + var validationError = readValue(body.toString(), ValidationError.class); + var errorText = validationError.getErrorDetails() + .stream() + .map(ValidationErrorDetail::getMessage) + .filter(Objects::nonNull) + .collect(Collectors.joining("\n")); + throw new ValidationException( - readValue(body.toString(), ValidationError.class), - "An error occurred while making %s request to %s".formatted(request.method(), request.uri()) + validationError, + "An error occurred while making %s request to %s:\n%s".formatted(request.method(), request.uri(), errorText) ); } else { throw new DoclingServeClientException("An error occurred: %s".formatted(body.toString()), statusCode, body.toString()); } } - if(StreamResponse.class.equals(expectedReturnType)) { + if (StreamResponse.class.equals(expectedReturnType)) { return (T) StreamResponse .builder() .headers(headerName -> response.headers().firstValue(headerName)) @@ -394,6 +404,16 @@ public CompletionStage convertSourceAsync(ConvertDocume return this.convertOps.convertSourceAsync(request); } + @Override + public TaskStatusPollResponse convertSourceBatch(BatchConvertDocumentRequest request) { + return this.convertOps.convertSourceBatch(request); + } + + @Override + public CompletionStage convertSourceBatchAsync(BatchConvertDocumentRequest request) { + return this.convertOps.convertSourceBatchAsync(request); + } + private class LoggingBodyPublisher implements BodyPublisher { private final BodyPublisher delegate; 
private final String stringContent; diff --git a/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/operations/ConvertOperations.java b/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/operations/ConvertOperations.java index 7b96fb54..53022860 100644 --- a/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/operations/ConvertOperations.java +++ b/docling-serve/docling-serve-client/src/main/java/ai/docling/serve/client/operations/ConvertOperations.java @@ -5,6 +5,7 @@ import ai.docling.serve.api.DoclingServeConvertApi; import ai.docling.serve.api.DoclingServeTaskApi; +import ai.docling.serve.api.convert.request.BatchConvertDocumentRequest; import ai.docling.serve.api.convert.request.ConvertDocumentRequest; import ai.docling.serve.api.convert.request.target.PresignedUrlTarget; import ai.docling.serve.api.convert.request.target.PutTarget; @@ -13,6 +14,7 @@ import ai.docling.serve.api.convert.response.ConvertDocumentResponse; import ai.docling.serve.api.convert.response.ZipArchiveConvertDocumentResponse; import ai.docling.serve.api.task.request.TaskResultRequest; +import ai.docling.serve.api.task.response.TaskStatusPollResponse; import ai.docling.serve.api.util.Utils; import ai.docling.serve.api.util.ValidationUtils; @@ -82,4 +84,16 @@ private RequestContext createRequestContext(String uri, I request, public CompletionStage convertSourceAsync(ConvertDocumentRequest request) { return executeAsync(request, "/v1/convert/source/async"); } + + @Override + public TaskStatusPollResponse convertSourceBatch(BatchConvertDocumentRequest request) { + ValidationUtils.ensureNotNull(request, "request"); + return this.httpOperations.executePost( + createRequestContext("/v1/convert/source/batch", request, TaskStatusPollResponse.class)); + } + + @Override + public CompletionStage convertSourceBatchAsync(BatchConvertDocumentRequest request) { + return executeAsync(request, "/v1/convert/source/batch"); + } } diff --git 
a/docling-serve/docling-serve-client/src/test/java/ai/docling/serve/client/AbstractDoclingServeClientTests.java b/docling-serve/docling-serve-client/src/test/java/ai/docling/serve/client/AbstractDoclingServeClientTests.java index 78713036..5c17d431 100644 --- a/docling-serve/docling-serve-client/src/test/java/ai/docling/serve/client/AbstractDoclingServeClientTests.java +++ b/docling-serve/docling-serve-client/src/test/java/ai/docling/serve/client/AbstractDoclingServeClientTests.java @@ -66,6 +66,8 @@ import ai.docling.serve.api.clear.request.ClearConvertersRequest; import ai.docling.serve.api.clear.request.ClearResultsRequest; import ai.docling.serve.api.clear.response.ClearResponse; +import ai.docling.serve.api.convert.request.BatchConvertDocumentRequest; +import ai.docling.serve.api.convert.request.CallbackSpec; import ai.docling.serve.api.convert.request.ConvertDocumentRequest; import ai.docling.serve.api.convert.request.options.ConvertDocumentOptions; import ai.docling.serve.api.convert.request.options.ImageRefMode; @@ -1180,6 +1182,275 @@ void convertAsyncFilesNotRegularFile() { .isThrownBy(() -> getDoclingClient().convertFilesAsync(Path.of("src", "test", "resources"))) .withMessage("File (src/test/resources) is not a regular file"); } + + @Test + void shouldConvertSourceBatchWithPresignedUrlTarget() { + var request = BatchConvertDocumentRequest.builder() + .source( + HttpSource.builder() + .url(URI.create("https://arxiv.org/pdf/2408.09869")) + .build() + ) + .source( + HttpSource.builder() + .url(URI.create("https://arxiv.org/pdf/2501.17887")) + .build() + ) + .target(PresignedUrlTarget.builder().build()) + .build(); + + var wireMockServer = getWiremockServer(); + + wireMockServer.stubFor( + post("/v1/convert/source/batch") + .withRequestBody(equalToJson(writeValueAsString(request))) + .withHeader("Content-Type", equalTo("application/json")) + .withHeader("Accept", equalTo("application/json")) + .willReturn(okJson(""" + { + "task_id": "batch-task-123", + 
"task_type": "convert", + "task_status": "pending", + "task_position": 1, + "task_meta": null + } + """)) + ); + + var response = getDoclingClient(false, true).convertSourceBatch(request); + assertThat(response).isNotNull(); + assertThat(response.getTaskId()).isEqualTo("batch-task-123"); + assertThat(response.getTaskStatus()).isEqualTo(TaskStatus.PENDING); + + wireMockServer.verify( + 1, + postRequestedFor(urlPathEqualTo("/v1/convert/source/batch")) + .withHeader("Content-Type", equalTo("application/json")) + .withRequestBody( + matchingJsonPath("$.sources[0].kind", equalTo("http")) + .and(matchingJsonPath("$.sources[0].url", equalTo("https://arxiv.org/pdf/2408.09869"))) + .and(matchingJsonPath("$.sources[1].kind", equalTo("http"))) + .and(matchingJsonPath("$.sources[1].url", equalTo("https://arxiv.org/pdf/2501.17887"))) + .and(matchingJsonPath("$.target.kind", equalTo("presigned_url"))) + ) + ); + } + + @Test + void shouldConvertSourceBatchAsyncWithPresignedUrlTarget() { + var request = BatchConvertDocumentRequest.builder() + .source( + HttpSource.builder() + .url(URI.create("https://arxiv.org/pdf/2408.09869")) + .build() + ) + .target(PresignedUrlTarget.builder().build()) + .build(); + + var wireMockServer = getWiremockServer(); + + wireMockServer.stubFor( + post("/v1/convert/source/batch") + .withHeader("Content-Type", equalTo("application/json")) + .withHeader("Accept", equalTo("application/json")) + .willReturn(okJson(""" + { + "task_id": "batch-async-task-001", + "task_type": "convert", + "task_status": "pending", + "task_position": 1, + "task_meta": null + } + """)) + ); + + wireMockServer.stubFor( + get(urlPathEqualTo("/v1/status/poll/batch-async-task-001")) + .willReturn(okJson(""" + { + "task_id": "batch-async-task-001", + "task_type": "convert", + "task_status": "success", + "task_position": 0, + "task_meta": null + } + """)) + ); + + wireMockServer.stubFor( + get(urlPathEqualTo("/v1/result/batch-async-task-001")) + .willReturn(okJson(""" + { + 
"processing_time": 42.5, + "num_converted": 1, + "num_succeeded": 1, + "num_partially_succeeded": 0, + "num_failed": 0, + "documents": [ + { + "source_index": 0, + "source_uri": "https://arxiv.org/pdf/2408.09869", + "filename": "2408.09869", + "status": "success", + "errors": [], + "timings": {}, + "artifacts": [ + { + "artifact_type": "markdown", + "mime_type": "text/markdown", + "uri": "https://storage.example.com/2408.09869.md", + "url_expires_at": "2026-06-15T12:00:00Z" + } + ] + } + ] + } + """)) + ); + + var response = getDoclingClient(false, true) + .convertSourceBatchAsync(request).toCompletableFuture().join(); + + assertThat(response).isNotNull(); + assertThat(response.getResponseType()).isEqualTo(ResponseType.PRE_SIGNED_URL_RESPONSE); + assertThat(response).isInstanceOf(PreSignedUrlConvertResponse.class); + + var presignedResponse = (PreSignedUrlConvertResponse) response; + assertThat(presignedResponse.getProcessingTime()).isEqualTo(42.5); + assertThat(presignedResponse.getNumConverted()).isEqualTo(1); + assertThat(presignedResponse.getNumSucceeded()).isEqualTo(1); + assertThat(presignedResponse.getDocuments()).hasSize(1); + + var doc = presignedResponse.getDocuments().get(0); + assertThat(doc.getSourceUri()).isEqualTo("https://arxiv.org/pdf/2408.09869"); + assertThat(doc.getStatus()).isEqualTo(ConversionStatus.SUCCESS); + assertThat(doc.getArtifacts()).hasSize(1); + assertThat(doc.getArtifacts().get(0).getArtifactType()).isEqualTo(ArtifactType.MARKDOWN); + assertThat(doc.getArtifacts().get(0).getUri()).isEqualTo(URI.create("https://storage.example.com/2408.09869.md")); + + } + + @Test + void shouldConvertSourceBatchWithS3SourceAndS3Target() { + var request = BatchConvertDocumentRequest.builder() + .source( + S3Source.builder() + .endpoint("source-s3-endpoint") + .bucket("source-bucket") + .accessKey("source-access-key") + .secretKey("source-secret-key") + .keyPrefix("incoming/") + .maxNumElements(500) + .verifySsl(false) + .build() + ) + .target( + 
S3Target.builder() + .endpoint("target-s3-endpoint") + .bucket("target-bucket") + .accessKey("target-access-key") + .secretKey("target-secret-key") + .keyPrefix("converted/") + .verifySsl(false) + .build() + ).build(); + + var wireMockServer = getWiremockServer(); + + wireMockServer.stubFor( + post("/v1/convert/source/batch") + .withRequestBody(equalToJson(writeValueAsString(request))) + .withHeader("Content-Type", equalTo("application/json")) + .withHeader("Accept", equalTo("application/json")) + .willReturn(okJson(""" + { + "task_id": "batch-s3-task-456", + "task_type": "convert", + "task_status": "pending", + "task_position": 2, + "task_meta": null + } + """)) + ); + + var response = getDoclingClient(false, true).convertSourceBatch(request); + assertThat(response).isNotNull(); + assertThat(response.getTaskId()).isEqualTo("batch-s3-task-456"); + assertThat(response.getTaskStatus()).isEqualTo(TaskStatus.PENDING); + + wireMockServer.verify( + 1, + postRequestedFor(urlPathEqualTo("/v1/convert/source/batch")) + .withHeader("Content-Type", equalTo("application/json")) + .withRequestBody( + matchingJsonPath("$.sources[0].kind", equalTo("s3")) + .and(matchingJsonPath("$.sources[0].endpoint", equalTo("source-s3-endpoint"))) + .and(matchingJsonPath("$.sources[0].bucket", equalTo("source-bucket"))) + .and(matchingJsonPath("$.sources[0].key_prefix", equalTo("incoming/"))) + .and(matchingJsonPath("$.sources[0].max_num_elements", equalTo("500"))) + .and(matchingJsonPath("$.target.kind", equalTo("s3"))) + .and(matchingJsonPath("$.target.endpoint", equalTo("target-s3-endpoint"))) + .and(matchingJsonPath("$.target.bucket", equalTo("target-bucket"))) + .and(matchingJsonPath("$.target.key_prefix", equalTo("converted/"))) + ) + ); + } + + @Test + void shouldConvertSourceBatchWithCallbacks() { + var request = BatchConvertDocumentRequest.builder() + .source( + HttpSource.builder() + .url(URI.create("https://arxiv.org/pdf/2408.09869")) + .build() + ) + 
.target(PresignedUrlTarget.builder().build()) + .callback( + CallbackSpec.builder() + .url(URI.create("https://my-app.example.com/docling/progress")) + .header("Authorization", "Bearer token123") + .build() + ) + .build(); + + var wireMockServer = getWiremockServer(); + + wireMockServer.stubFor( + post("/v1/convert/source/batch") + .withRequestBody(equalToJson(writeValueAsString(request))) + .withHeader("Content-Type", equalTo("application/json")) + .withHeader("Accept", equalTo("application/json")) + .willReturn(okJson(""" + { + "task_id": "batch-callback-789", + "task_type": "convert", + "task_status": "pending", + "task_position": 1, + "task_meta": null + } + """)) + ); + + var response = getDoclingClient(false, true).convertSourceBatch(request); + assertThat(response).isNotNull(); + assertThat(response.getTaskId()).isEqualTo("batch-callback-789"); + + wireMockServer.verify( + 1, + postRequestedFor(urlPathEqualTo("/v1/convert/source/batch")) + .withRequestBody( + matchingJsonPath("$.callbacks[0].url", equalTo("https://my-app.example.com/docling/progress")) + .and(matchingJsonPath("$.callbacks[0].headers.Authorization", equalTo("Bearer token123"))) + ) + ); + } + + @Test + void convertSourceBatchNullRequest() { + assertThatExceptionOfType(IllegalArgumentException.class) + .isThrownBy(() -> getDoclingClient().convertSourceBatch(null)) + .withMessage("request cannot be null"); + } } @Nested diff --git a/docs/src/doc/docs/whats-new.md b/docs/src/doc/docs/whats-new.md index e837ff59..87546dde 100644 --- a/docs/src/doc/docs/whats-new.md +++ b/docs/src/doc/docs/whats-new.md @@ -25,6 +25,10 @@ Docling Java {{ gradle.project_version }} includes important breaking changes, a ### {{ gradle.project_version }} +* **New batch conversion support** — Added `convertSourceBatch()` and `convertSourceBatchAsync()` methods to `DoclingServeConvertApi` for the new `/v1/convert/source/batch` endpoint. 
Submit multiple HTTP or S3 sources for batch processing with optional webhook callbacks for progress notifications. Requires docling-serve v1.22.0+. +* **New `BatchConvertDocumentRequest`** — Request model for batch conversions, supporting `sources` (HTTP or S3), `target` (PresignedUrlTarget or S3Target), conversion `options`, and optional `callbacks` (webhook specifications). +* **New `CallbackSpec`** — Webhook callback specification for receiving progress notifications during batch processing, with `url`, `headers`, and optional `caCert` fields. +* **New `maxNumElements` field on `S3Source`** — Caps the number of S3 objects processed in a single batch, useful when the source bucket contains many objects. * **New `PresignedUrlTarget` request target** — Request server-managed presigned-URL delivery by setting `target` to `PresignedUrlTarget`. The docling-serve instance uploads each output artifact to its configured object storage and returns time-limited presigned download URLs in the response. Requires docling-serve v1.22.0+. * **New `PreSignedUrlConvertResponse` response type** — Returned when using `PresignedUrlTarget`. Contains per-document results in a `documents` list, where each `DocumentArtifactItem` carries the conversion status and a list of `ArtifactRef` entries with presigned download URLs for each output format. * **New supporting types** — `DocumentArtifactItem`, `ArtifactRef`, `ArtifactType`, `ConversionStatus`, `ProfilingItem`, `ProfilingScope`.