Skip to content

Commit 20f097a

Browse files
pk-zipstack and claude committed
UN-2836 [FEAT] Return full text contents of input file in API response
Add `include_extracted_text` parameter to API deployment endpoints that returns the full extracted text of each input file at the top level of each file result, independent of `include_metadata` and the `ENABLE_HIGHLIGHT_API_DEPLOYMENT` configuration flag. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 93c0634 commit 20f097a

5 files changed

Lines changed: 48 additions & 2 deletions

File tree

backend/api_v2/api_deployment_views.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ def post(
8181
timeout = serializer.validated_data.get(ApiExecution.TIMEOUT_FORM_DATA)
8282
include_metadata = serializer.validated_data.get(ApiExecution.INCLUDE_METADATA)
8383
include_metrics = serializer.validated_data.get(ApiExecution.INCLUDE_METRICS)
84+
include_extracted_text = serializer.validated_data.get(
85+
ApiExecution.INCLUDE_EXTRACTED_TEXT
86+
)
8487
use_file_history = serializer.validated_data.get(ApiExecution.USE_FILE_HISTORY)
8588
tag_names = serializer.validated_data.get(ApiExecution.TAGS)
8689
llm_profile_id = serializer.validated_data.get(ApiExecution.LLM_PROFILE_ID)
@@ -117,6 +120,7 @@ def post(
117120
timeout=timeout,
118121
include_metadata=include_metadata,
119122
include_metrics=include_metrics,
123+
include_extracted_text=include_extracted_text,
120124
use_file_history=use_file_history,
121125
tag_names=tag_names,
122126
llm_profile_id=llm_profile_id,
@@ -171,6 +175,9 @@ def get(
171175
execution_id = serializer.validated_data.get(ApiExecution.EXECUTION_ID)
172176
include_metadata = serializer.validated_data.get(ApiExecution.INCLUDE_METADATA)
173177
include_metrics = serializer.validated_data.get(ApiExecution.INCLUDE_METRICS)
178+
include_extracted_text = serializer.validated_data.get(
179+
ApiExecution.INCLUDE_EXTRACTED_TEXT
180+
)
174181

175182
# Fetch execution status
176183
response: ExecutionResponse = DeploymentHelper.get_execution_status(execution_id)
@@ -218,6 +225,7 @@ def get(
218225
deployment_execution_dto=deployment_execution_dto,
219226
include_metadata=include_metadata,
220227
include_metrics=include_metrics,
228+
include_extracted_text=include_extracted_text,
221229
)
222230
return Response(
223231
data={

backend/api_v2/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ class ApiExecution:
55
TIMEOUT_FORM_DATA: str = "timeout"
66
INCLUDE_METADATA: str = "include_metadata"
77
INCLUDE_METRICS: str = "include_metrics"
8+
INCLUDE_EXTRACTED_TEXT: str = "include_extracted_text"
89
USE_FILE_HISTORY: str = "use_file_history" # Undocumented parameter
910
EXECUTION_ID: str = "execution_id"
1011
TAGS: str = "tags"

backend/api_v2/deployment_helper.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def execute_workflow(
155155
timeout: int,
156156
include_metadata: bool = False,
157157
include_metrics: bool = False,
158+
include_extracted_text: bool = False,
158159
use_file_history: bool = False,
159160
tag_names: list[str] = [],
160161
llm_profile_id: str | None = None,
@@ -275,7 +276,10 @@ def execute_workflow(
275276
)
276277
if not enable_highlight:
277278
result.remove_result_metadata_keys(["highlight_data"])
278-
result.remove_result_metadata_keys(["extracted_text"])
279+
if not include_extracted_text:
280+
result.remove_result_metadata_keys(["extracted_text"])
281+
if include_extracted_text:
282+
result.promote_extracted_text()
279283
if include_metadata or include_metrics:
280284
cls._enrich_result_with_usage_metadata(result)
281285
if not include_metadata:
@@ -458,6 +462,7 @@ def process_completed_execution(
458462
deployment_execution_dto: Any,
459463
include_metadata: bool,
460464
include_metrics: bool,
465+
include_extracted_text: bool = False,
461466
) -> None:
462467
"""Enrich and clean up the response for a completed execution."""
463468
api_deployment = deployment_execution_dto.api
@@ -476,7 +481,10 @@ def process_completed_execution(
476481
)
477482
if not enable_highlight:
478483
response.remove_result_metadata_keys(["highlight_data"])
479-
response.remove_result_metadata_keys(["extracted_text"])
484+
if not include_extracted_text:
485+
response.remove_result_metadata_keys(["extracted_text"])
486+
if include_extracted_text:
487+
response.promote_extracted_text()
480488
if include_metadata or include_metrics:
481489
DeploymentHelper._enrich_result_with_usage_metadata(response)
482490
if not include_metadata:

backend/api_v2/serializers.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,9 @@ class ExecutionRequestSerializer(TagParamsSerializer):
210210
If -1 it corresponds to async execution. Defaults to -1
211211
include_metadata (bool): Flag to include metadata in API response
212212
include_metrics (bool): Flag to include metrics in API response
213+
include_extracted_text (bool): Flag to include the full extracted text
214+
of the input file in the API response. The extracted text is returned
215+
at the top level of each file result, independent of include_metadata.
213216
use_file_history (bool): Flag to use FileHistory to save and retrieve
214217
responses quickly. This is undocumented to the user and can be
215218
helpful for demos.
@@ -232,6 +235,7 @@ class ExecutionRequestSerializer(TagParamsSerializer):
232235
)
233236
include_metadata = BooleanField(default=False)
234237
include_metrics = BooleanField(default=False)
238+
include_extracted_text = BooleanField(default=False)
235239
use_file_history = BooleanField(default=False)
236240

237241
presigned_urls = ListField(child=URLField(), required=False)
@@ -408,6 +412,7 @@ class ExecutionQuerySerializer(Serializer):
408412
execution_id = CharField(required=True)
409413
include_metadata = BooleanField(default=False)
410414
include_metrics = BooleanField(default=False)
415+
include_extracted_text = BooleanField(default=False)
411416

412417
def validate_execution_id(self, value):
413418
"""Trim spaces, validate UUID format, and check if execution_id exists."""

backend/workflow_manager/workflow_v2/dto.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,30 @@ def remove_inner_result_metadata(self) -> None:
129129
if isinstance(result, dict):
130130
result.pop("metadata", None)
131131

132+
def promote_extracted_text(self) -> None:
    """Copy extracted_text from metadata to the top level of each file result.

    This allows extracted_text to be returned independently of
    include_metadata. The value is copied, not moved: the entry inside
    ``metadata`` is left untouched.

    After promotion, the extracted_text appears as:
        result[i]["extracted_text"] = "..."

    Items are skipped silently when any level of the expected structure is
    missing or has an unexpected type (``self.result`` not a list, item not
    a dict, ``item["result"]`` not a dict, ``metadata`` not a dict), or when
    ``extracted_text`` is absent or ``None``.
    """
    if not isinstance(self.result, list):
        return

    for item in self.result:
        if not isinstance(item, dict):
            continue

        result = item.get("result")
        if not isinstance(result, dict):
            continue

        # ``dict.get(key, {})`` only falls back when the key is absent; a
        # "metadata" key that is present but null (or any non-dict) would
        # otherwise raise AttributeError on the ``.get`` below.
        metadata = result.get("metadata")
        if not isinstance(metadata, dict):
            continue

        extracted_text = metadata.get("extracted_text")
        if extracted_text is not None:
            item["extracted_text"] = extracted_text
155+
132156
def remove_result_metrics(self) -> None:
133157
"""Removes the 'metrics' key from the 'result' dictionary within each
134158
'result' dictionary in the 'result' list attribute of the instance.

0 commit comments

Comments
 (0)