diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 0c2ecec6..86b0e83d 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.20.0" + ".": "0.21.0" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index 40b629f2..0196ef48 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 17 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/reducto%2Freductoai-1b16fbf5337f188d0b66a5992f0d241be80c46c45412ef9830cb19b11437d1c6.yml -openapi_spec_hash: 88f89b5803058bfa20d5da05c2bcf754 -config_hash: 9dd1f73da997aefc8516b226e0e7fed7 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/reducto/reductoai-b36868a5180fdbbf68e1ec04645b9f3cafae6e8149be90e2b57a249fc8e5f070.yml +openapi_spec_hash: 22a35f1691ac8540ee6141d7c800119a +config_hash: 9fa10baf03f994be027bf73b29ac8572 diff --git a/README.md b/README.md index 16329cb2..055265b1 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,15 @@ and offers both synchronous and asynchronous clients powered by [httpx](https:// It is generated with [Stainless](https://www.stainless.com/). +## MCP Server + +Use the Reducto MCP Server to enable AI assistants to interact with this API, allowing them to explore endpoints, make test requests, and use documentation to help integrate this SDK into your application. 
+ +[![Add to Cursor](https://cursor.com/deeplink/mcp-install-dark.svg)](https://cursor.com/en-US/install-mcp?name=reductoai-mcp&config=eyJjb21tYW5kIjoibnB4IiwiYXJncyI6WyIteSIsInJlZHVjdG9haS1tY3AiXSwiZW52Ijp7IlJFRFVDVE9fQVBJX0tFWSI6Ik15IEFQSSBLZXkifX0) +[![Install in VS Code](https://img.shields.io/badge/_-Add_to_VS_Code-blue?style=for-the-badge&logo=data:image/svg%2bxml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIGZpbGw9Im5vbmUiIHZpZXdCb3g9IjAgMCA0MCA0MCI+PHBhdGggZmlsbD0iI0VFRSIgZmlsbC1ydWxlPSJldmVub2RkIiBkPSJNMzAuMjM1IDM5Ljg4NGEyLjQ5MSAyLjQ5MSAwIDAgMS0xLjc4MS0uNzNMMTIuNyAyNC43OGwtMy40NiAyLjYyNC0zLjQwNiAyLjU4MmExLjY2NSAxLjY2NSAwIDAgMS0xLjA4Mi4zMzggMS42NjQgMS42NjQgMCAwIDEtMS4wNDYtLjQzMWwtMi4yLTJhMS42NjYgMS42NjYgMCAwIDEgMC0yLjQ2M0w3LjQ1OCAyMCA0LjY3IDE3LjQ1MyAxLjUwNyAxNC41N2ExLjY2NSAxLjY2NSAwIDAgMSAwLTIuNDYzbDIuMi0yYTEuNjY1IDEuNjY1IDAgMCAxIDIuMTMtLjA5N2w2Ljg2MyA1LjIwOUwyOC40NTIuODQ0YTIuNDg4IDIuNDg4IDAgMCAxIDEuODQxLS43MjljLjM1MS4wMDkuNjk5LjA5MSAxLjAxOS4yNDVsOC4yMzYgMy45NjFhMi41IDIuNSAwIDAgMSAxLjQxNSAyLjI1M3YuMDk5LS4wNDVWMzMuMzd2LS4wNDUuMDk1YTIuNTAxIDIuNTAxIDAgMCAxLTEuNDE2IDIuMjU3bC04LjIzNSAzLjk2MWEyLjQ5MiAyLjQ5MiAwIDAgMS0xLjA3Ny4yNDZabS43MTYtMjguOTQ3LTExLjk0OCA5LjA2MiAxMS45NTIgOS4wNjUtLjAwNC0xOC4xMjdaIi8+PC9zdmc+)](https://vscode.stainless.com/mcp/%7B%22name%22%3A%22reductoai-mcp%22%2C%22command%22%3A%22npx%22%2C%22args%22%3A%5B%22-y%22%2C%22reductoai-mcp%22%5D%2C%22env%22%3A%7B%22REDUCTO_API_KEY%22%3A%22My%20API%20Key%22%7D%7D) + +> Note: You may need to set environment variables in your MCP client. + ## Documentation The REST API documentation can be found on [docs.reductoai.com](https://docs.reductoai.com). The full API of this library can be found in [api.md](api.md). 
@@ -129,6 +138,23 @@ response = client.parse.run( print(response.enhance) ``` +## File uploads + +Request parameters that correspond to file uploads can be passed as `bytes`, or a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance or a tuple of `(filename, contents, media type)`. + +```python +from pathlib import Path +from reducto import Reducto + +client = Reducto() + +client.upload( + file=Path("/path/to/file"), +) +``` + +The async client uses the exact same interface. If you pass a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, the file contents will be read asynchronously automatically. + ## Handling errors When the library is unable to connect to the API (for example, due to network connection problems or a timeout), a subclass of `reducto.APIConnectionError` is raised. diff --git a/api.md b/api.md index f0f17287..2b7460ac 100644 --- a/api.md +++ b/api.md @@ -1,7 +1,39 @@ # Shared Types ```python -from reducto.types import Upload +from reducto.types import ( + AdvancedCitationsConfig, + AdvancedProcessingOptions, + ArrayExtractConfig, + AsyncEditResponse, + AsyncExtractResponse, + AsyncParseResponse, + AsyncPipelineResponse, + AsyncSplitResponse, + BaseProcessingOptions, + Chunking, + ChunkingConfig, + ClassifyResponse, + DirectWebhookConfig, + EditResponse, + EnrichConfig, + ExperimentalProcessingOptions, + ExtractResponse, + FigureAgentic, + FigureSummaryConfig, + LargeTableChunkingConfig, + PageRange, + ParseResponse, + PipelineResponse, + SplitLargeTables, + SplitResponse, + SvixWebhookConfig, + TableAgentic, + TableSummaryConfig, + TextAgentic, + Upload, + WebhookConfigNew, +) ``` # Reducto @@ -25,10 +57,8 @@ Types: from reducto.types import ( AsyncConfigV3, AsyncParseConfig, - AsyncParseResponse, Enhance, Formatting, - ParseResponse, Retrieval, Settings, Spreadsheet, @@ -39,7 +69,7 @@ from reducto.types import ( Methods: - client.parse.run(\*\*params) -> ParseRunResponse -- 
client.parse.run_job(\*\*params) -> AsyncParseResponse +- client.parse.run_job(\*\*params) -> AsyncParseResponse # Extract @@ -48,7 +78,6 @@ Types: ```python from reducto.types import ( AsyncExtractConfig, - AsyncExtractResponse, ExtractSettings, ExtractUsage, Instructions, @@ -61,65 +90,52 @@ from reducto.types import ( Methods: - client.extract.run(\*\*params) -> ExtractRunResponse -- client.extract.run_job(\*\*params) -> AsyncExtractResponse +- client.extract.run_job(\*\*params) -> AsyncExtractResponse # Split Types: ```python -from reducto.types import ( - DeepSplitPageEvidence, - ParseUsage, - SplitCategory, - SplitResponse, - SplitTableOptions, - SplitRunJobResponse, -) +from reducto.types import DeepSplitPageEvidence, ParseUsage, SplitCategory, SplitTableOptions ``` Methods: -- client.split.run(\*\*params) -> SplitResponse -- client.split.run_job(\*\*params) -> SplitRunJobResponse +- client.split.run(\*\*params) -> SplitResponse +- client.split.run_job(\*\*params) -> AsyncSplitResponse # Edit Types: ```python -from reducto.types import BoundingBox, EditOptions, EditResponse, EditWidget, EditRunJobResponse +from reducto.types import BoundingBox, EditOptions, EditWidget ``` Methods: -- client.edit.run(\*\*params) -> EditResponse -- client.edit.run_job(\*\*params) -> EditRunJobResponse +- client.edit.run(\*\*params) -> EditResponse +- client.edit.run_job(\*\*params) -> AsyncEditResponse # Pipeline Types: ```python -from reducto.types import PipelineResponse, PipelineSettings, PipelineRunJobResponse +from reducto.types import PipelineSettings ``` Methods: -- client.pipeline.run(\*\*params) -> PipelineResponse -- client.pipeline.run_job(\*\*params) -> PipelineRunJobResponse +- client.pipeline.run(\*\*params) -> PipelineResponse +- client.pipeline.run_job(\*\*params) -> AsyncPipelineResponse # Classify -Types: - -```python -from reducto.types import ClassifyResponse, PageRange -``` - Methods: -- client.classify.run(\*\*params) -> ClassifyResponse +- 
client.classify.run(\*\*params) -> ClassifyResponse # Webhook @@ -138,7 +154,7 @@ Methods: Types: ```python -from reducto.types import ExtractResponse, JobGetResponse, JobGetAllResponse +from reducto.types import JobGetResponse, JobGetAllResponse ``` Methods: diff --git a/pyproject.toml b/pyproject.toml index 22d7bf0e..354ab96e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "reductoai" -version = "0.20.0" +version = "0.21.0" description = "The official Python library for the reducto API" dynamic = ["readme"] license = "Apache-2.0" @@ -168,7 +168,7 @@ show_error_codes = true # # We also exclude our `tests` as mypy doesn't always infer # types correctly and Pyright will still catch any type errors. -exclude = ['src/reducto/_files.py', '_dev/.*.py', 'tests/.*'] +exclude = ["src/reducto/_files.py", "_dev/.*.py", "tests/.*"] strict_equality = true implicit_reexport = true diff --git a/scripts/bootstrap b/scripts/bootstrap index b430fee3..fe8451e4 100755 --- a/scripts/bootstrap +++ b/scripts/bootstrap @@ -4,7 +4,7 @@ set -e cd "$(dirname "$0")/.." -if [ -f "Brewfile" ] && [ "$(uname -s)" = "Darwin" ] && [ "$SKIP_BREW" != "1" ] && [ -t 0 ]; then +if [ -f "Brewfile" ] && [ "$(uname -s)" = "Darwin" ] && [ "${SKIP_BREW:-}" != "1" ] && [ -t 0 ]; then brew bundle check >/dev/null 2>&1 || { echo -n "==> Install Homebrew dependencies? 
(y/N): " read -r response diff --git a/src/reducto/_base_client.py b/src/reducto/_base_client.py index 1e31cb4d..ee1f4314 100644 --- a/src/reducto/_base_client.py +++ b/src/reducto/_base_client.py @@ -558,6 +558,10 @@ def _build_request( files = cast(HttpxRequestFiles, ForceMultipartDict()) prepared_url = self._prepare_url(options.url) + # preserve hard-coded query params from the url + if params and prepared_url.query: + params = {**dict(prepared_url.params.items()), **params} + prepared_url = prepared_url.copy_with(raw_path=prepared_url.raw_path.split(b"?", 1)[0]) if "_" in prepared_url.host: # work around https://github.com/encode/httpx/discussions/2880 kwargs["extensions"] = {"sni_hostname": prepared_url.host.replace("_", "-")} diff --git a/src/reducto/_client.py b/src/reducto/_client.py index 901feebe..81b298b8 100644 --- a/src/reducto/_client.py +++ b/src/reducto/_client.py @@ -11,6 +11,7 @@ from . import _exceptions from ._qs import Querystring from .types import client_upload_params +from ._files import deepcopy_with_paths from ._types import ( Body, Omit, @@ -18,6 +19,7 @@ Headers, Timeout, NotGiven, + FileTypes, Transport, ProxiesTypes, RequestOptions, @@ -26,6 +28,8 @@ ) from ._utils import ( is_given, + is_mapping_t, + extract_files, maybe_transform, get_async_library, async_maybe_transform, @@ -147,6 +151,15 @@ def __init__( except KeyError as exc: raise ValueError(f"Unknown environment: {environment}") from exc + custom_headers_env = os.environ.get("REDUCTO_CUSTOM_HEADERS") + if custom_headers_env is not None: + parsed: dict[str, str] = {} + for line in custom_headers_env.split("\n"): + colon = line.find(":") + if colon >= 0: + parsed[line[:colon].strip()] = line[colon + 1 :].strip() + default_headers = {**parsed, **(default_headers if is_mapping_t(default_headers) else {})} + super().__init__( version=__version__, base_url=base_url, @@ -315,7 +328,7 @@ def upload( self, *, extension: Optional[str] | Omit = omit, - file: Optional[str] | Omit = omit, + 
file: Optional[FileTypes] | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -335,9 +348,17 @@ def upload( timeout: Override the client-level default timeout for this request, in seconds """ + body = deepcopy_with_paths({"file": file}, [["file"]]) + files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) + if files: + # It should be noted that the actual Content-Type header that will be + # sent to the server will contain a `boundary` parameter, e.g. + # multipart/form-data; boundary=---abc-- + extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})} return self.post( "/upload", - body=maybe_transform({"file": file}, client_upload_params.ClientUploadParams), + body=maybe_transform(body, client_upload_params.ClientUploadParams), + files=files, options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, @@ -450,6 +471,15 @@ def __init__( except KeyError as exc: raise ValueError(f"Unknown environment: {environment}") from exc + custom_headers_env = os.environ.get("REDUCTO_CUSTOM_HEADERS") + if custom_headers_env is not None: + parsed: dict[str, str] = {} + for line in custom_headers_env.split("\n"): + colon = line.find(":") + if colon >= 0: + parsed[line[:colon].strip()] = line[colon + 1 :].strip() + default_headers = {**parsed, **(default_headers if is_mapping_t(default_headers) else {})} + super().__init__( version=__version__, base_url=base_url, @@ -618,7 +648,7 @@ async def upload( self, *, extension: Optional[str] | Omit = omit, - file: Optional[str] | Omit = omit, + file: Optional[FileTypes] | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
# The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -638,9 +668,17 @@ async def upload( timeout: Override the client-level default timeout for this request, in seconds """ + body = deepcopy_with_paths({"file": file}, [["file"]]) + files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) + if files: + # It should be noted that the actual Content-Type header that will be + # sent to the server will contain a `boundary` parameter, e.g. + # multipart/form-data; boundary=---abc-- + extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})} return await self.post( "/upload", - body=await async_maybe_transform({"file": file}, client_upload_params.ClientUploadParams), + body=await async_maybe_transform(body, client_upload_params.ClientUploadParams), + files=files, options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, diff --git a/src/reducto/_files.py b/src/reducto/_files.py index cc14c14f..76c6988d 100644 --- a/src/reducto/_files.py +++ b/src/reducto/_files.py @@ -3,8 +3,8 @@ import io import os import pathlib -from typing import overload -from typing_extensions import TypeGuard +from typing import Sequence, cast, overload +from typing_extensions import TypeVar, TypeGuard import anyio @@ -17,7 +17,9 @@ HttpxFileContent, HttpxRequestFiles, ) -from ._utils import is_tuple_t, is_mapping_t, is_sequence_t +from ._utils import is_list, is_mapping, is_tuple_t, is_mapping_t, is_sequence_t + +_T = TypeVar("_T") def is_base64_file_input(obj: object) -> TypeGuard[Base64FileInput]: @@ -34,7 +36,7 @@ def assert_is_file_content(obj: object, *, key: str | None = None) -> None: if not is_file_content(obj): prefix = f"Expected entry at `{key}`" if key is not None else f"Expected file input `{obj!r}`" raise RuntimeError( - f"{prefix} to be bytes, an io.IOBase instance, PathLike or a tuple but received {type(obj)} instead." 
+ f"{prefix} to be bytes, an io.IOBase instance, PathLike or a tuple but received {type(obj)} instead. See https://github.com/reductoai/reducto-python-sdk/tree/main#file-uploads" ) from None @@ -97,7 +99,7 @@ async def async_to_httpx_files(files: RequestFiles | None) -> HttpxRequestFiles elif is_sequence_t(files): files = [(key, await _async_transform_file(file)) for key, file in files] else: - raise TypeError("Unexpected file type input {type(files)}, expected mapping or sequence") + raise TypeError(f"Unexpected file type input {type(files)}, expected mapping or sequence") return files @@ -121,3 +123,51 @@ async def async_read_file_content(file: FileContent) -> HttpxFileContent: return await anyio.Path(file).read_bytes() return file + + +def deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]]) -> _T: + """Copy only the containers along the given paths. + + Used to guard against mutation by extract_files without copying the entire structure. + Only dicts and lists that lie on a path are copied; everything else + is returned by reference. + + For example, given paths=[["foo", "files", "file"]] and the structure: + { + "foo": { + "bar": {"baz": {}}, + "files": {"file": <content>} + } + } + The root dict, "foo", and "files" are copied (they lie on the path). + "bar" and "baz" are returned by reference (off the path). 
+ """ + return _deepcopy_with_paths(item, paths, 0) + + +def _deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]], index: int) -> _T: + if not paths: + return item + if is_mapping(item): + key_to_paths: dict[str, list[Sequence[str]]] = {} + for path in paths: + if index < len(path): + key_to_paths.setdefault(path[index], []).append(path) + + # if no path continues through this mapping, it won't be mutated and copying it is redundant + if not key_to_paths: + return item + + result = dict(item) + for key, subpaths in key_to_paths.items(): + if key in result: + result[key] = _deepcopy_with_paths(result[key], subpaths, index + 1) + return cast(_T, result) + if is_list(item): + array_paths = [path for path in paths if index < len(path) and path[index] == ""] + + # if no path expects a list here, nothing will be mutated inside it - return by reference + if not array_paths: + return cast(_T, item) + return cast(_T, [_deepcopy_with_paths(entry, array_paths, index + 1) for entry in item]) + return item diff --git a/src/reducto/_models.py b/src/reducto/_models.py index 1819e148..0ec36119 100644 --- a/src/reducto/_models.py +++ b/src/reducto/_models.py @@ -25,7 +25,9 @@ ClassVar, Protocol, Required, + Annotated, ParamSpec, + TypeAlias, TypedDict, TypeGuard, final, @@ -79,7 +81,15 @@ from ._constants import RAW_RESPONSE_HEADER if TYPE_CHECKING: + from pydantic import GetCoreSchemaHandler, ValidatorFunctionWrapHandler + from pydantic_core import CoreSchema, core_schema from pydantic_core.core_schema import ModelField, ModelSchema, LiteralSchema, ModelFieldsSchema +else: + try: + from pydantic_core import CoreSchema, core_schema + except ImportError: + CoreSchema = None + core_schema = None __all__ = ["BaseModel", "GenericModel"] @@ -396,6 +406,76 @@ def model_dump_json( ) +class _EagerIterable(list[_T], Generic[_T]): + """ + Accepts any Iterable[T] input (including generators), consumes it + eagerly, and validates all items upfront. 
+ + Validation preserves the original container type where possible + (e.g. a set[T] stays a set[T]). Serialization (model_dump / JSON) + always emits a list — round-tripping through model_dump() will not + restore the original container type. + """ + + @classmethod + def __get_pydantic_core_schema__( + cls, + source_type: Any, + handler: GetCoreSchemaHandler, + ) -> CoreSchema: + (item_type,) = get_args(source_type) or (Any,) + item_schema: CoreSchema = handler.generate_schema(item_type) + list_of_items_schema: CoreSchema = core_schema.list_schema(item_schema) + + return core_schema.no_info_wrap_validator_function( + cls._validate, + list_of_items_schema, + serialization=core_schema.plain_serializer_function_ser_schema( + cls._serialize, + info_arg=False, + ), + ) + + @staticmethod + def _validate(v: Iterable[_T], handler: "ValidatorFunctionWrapHandler") -> Any: + original_type: type[Any] = type(v) + + # Normalize to list so list_schema can validate each item + if isinstance(v, list): + items: list[_T] = v + else: + try: + items = list(v) + except TypeError as e: + raise TypeError("Value is not iterable") from e + + # Validate items against the inner schema + validated: list[_T] = handler(items) + + # Reconstruct original container type + if original_type is list: + return validated + # str(list) produces the list's repr, not a string built from items, + # so skip reconstruction for str and its subclasses. 
+ if issubclass(original_type, str): + return validated + try: + return original_type(validated) + except (TypeError, ValueError): + # If the type cannot be reconstructed, just return the validated list + return validated + + @staticmethod + def _serialize(v: Iterable[_T]) -> list[_T]: + """Always serialize as a list so Pydantic's JSON encoder is happy.""" + if isinstance(v, list): + return v + return list(v) + + +EagerIterable: TypeAlias = Annotated[Iterable[_T], _EagerIterable] + + def _construct_field(value: object, field: FieldInfo, key: str) -> object: if value is None: return field_get_default(field) diff --git a/src/reducto/_qs.py b/src/reducto/_qs.py index ada6fd3f..4127c19c 100644 --- a/src/reducto/_qs.py +++ b/src/reducto/_qs.py @@ -2,17 +2,13 @@ from typing import Any, List, Tuple, Union, Mapping, TypeVar from urllib.parse import parse_qs, urlencode -from typing_extensions import Literal, get_args +from typing_extensions import get_args -from ._types import NotGiven, not_given +from ._types import NotGiven, ArrayFormat, NestedFormat, not_given from ._utils import flatten _T = TypeVar("_T") - -ArrayFormat = Literal["comma", "repeat", "indices", "brackets"] -NestedFormat = Literal["dots", "brackets"] - PrimitiveData = Union[str, int, float, bool, None] # this should be Data = Union[PrimitiveData, "List[Data]", "Tuple[Data]", "Mapping[str, Data]"] # https://github.com/microsoft/pyright/issues/3555 @@ -101,7 +97,10 @@ def _stringify_item( items.extend(self._stringify_item(key, item, opts)) return items elif array_format == "indices": - raise NotImplementedError("The array indices format is not supported yet") + items = [] + for i, item in enumerate(value): + items.extend(self._stringify_item(f"{key}[{i}]", item, opts)) + return items elif array_format == "brackets": items = [] key = key + "[]" diff --git a/src/reducto/_types.py b/src/reducto/_types.py index 6c70e52c..1cd25278 100644 --- a/src/reducto/_types.py +++ b/src/reducto/_types.py @@ -47,6 +47,9 @@ 
ModelT = TypeVar("ModelT", bound=pydantic.BaseModel) _T = TypeVar("_T") +ArrayFormat = Literal["comma", "repeat", "indices", "brackets"] +NestedFormat = Literal["dots", "brackets"] + # Approximates httpx internal ProxiesTypes and RequestFiles types # while adding support for `PathLike` instances diff --git a/src/reducto/_utils/__init__.py b/src/reducto/_utils/__init__.py index 10cb66d2..1c090e51 100644 --- a/src/reducto/_utils/__init__.py +++ b/src/reducto/_utils/__init__.py @@ -24,7 +24,6 @@ coerce_integer as coerce_integer, file_from_path as file_from_path, strip_not_given as strip_not_given, - deepcopy_minimal as deepcopy_minimal, get_async_library as get_async_library, maybe_coerce_float as maybe_coerce_float, get_required_header as get_required_header, diff --git a/src/reducto/_utils/_utils.py b/src/reducto/_utils/_utils.py index eec7f4a1..199cd231 100644 --- a/src/reducto/_utils/_utils.py +++ b/src/reducto/_utils/_utils.py @@ -17,11 +17,11 @@ ) from pathlib import Path from datetime import date, datetime -from typing_extensions import TypeGuard +from typing_extensions import TypeGuard, get_args import sniffio -from .._types import Omit, NotGiven, FileTypes, HeadersLike +from .._types import Omit, NotGiven, FileTypes, ArrayFormat, HeadersLike _T = TypeVar("_T") _TupleT = TypeVar("_TupleT", bound=Tuple[object, ...]) @@ -40,25 +40,45 @@ def extract_files( query: Mapping[str, object], *, paths: Sequence[Sequence[str]], + array_format: ArrayFormat = "brackets", ) -> list[tuple[str, FileTypes]]: """Recursively extract files from the given dictionary based on specified paths. A path may look like this ['foo', 'files', '<array>', 'data']. + ``array_format`` controls how ``<array>`` segments contribute to the emitted + field name. Supported values: ``"brackets"`` (``foo[]``), ``"repeat"`` and + ``"comma"`` (``foo``), ``"indices"`` (``foo[0]``, ``foo[1]``). + Note: this mutates the given dictionary. 
""" files: list[tuple[str, FileTypes]] = [] for path in paths: - files.extend(_extract_items(query, path, index=0, flattened_key=None)) + files.extend(_extract_items(query, path, index=0, flattened_key=None, array_format=array_format)) return files +def _array_suffix(array_format: ArrayFormat, array_index: int) -> str: + if array_format == "brackets": + return "[]" + if array_format == "indices": + return f"[{array_index}]" + if array_format == "repeat" or array_format == "comma": + # Both repeat the bare field name for each file part; there is no + # meaningful way to comma-join binary parts. + return "" + raise NotImplementedError( + f"Unknown array_format value: {array_format}, choose from {', '.join(get_args(ArrayFormat))}" + ) + + def _extract_items( obj: object, path: Sequence[str], *, index: int, flattened_key: str | None, + array_format: ArrayFormat, ) -> list[tuple[str, FileTypes]]: try: key = path[index] @@ -75,9 +95,11 @@ def _extract_items( if is_list(obj): files: list[tuple[str, FileTypes]] = [] - for entry in obj: - assert_is_file_content(entry, key=flattened_key + "[]" if flattened_key else "") - files.append((flattened_key + "[]", cast(FileTypes, entry))) + for array_index, entry in enumerate(obj): + suffix = _array_suffix(array_format, array_index) + emitted_key = (flattened_key + suffix) if flattened_key else suffix + assert_is_file_content(entry, key=emitted_key) + files.append((emitted_key, cast(FileTypes, entry))) return files assert_is_file_content(obj, key=flattened_key) @@ -86,8 +108,9 @@ def _extract_items( index += 1 if is_dict(obj): try: - # We are at the last entry in the path so we must remove the field - if (len(path)) == index: + # Remove the field if there are no more dict keys in the path, + # only "" traversal markers or end. 
+ if all(p == "" for p in path[index:]): item = obj.pop(key) else: item = obj[key] @@ -105,6 +128,7 @@ def _extract_items( path, index=index, flattened_key=flattened_key, + array_format=array_format, ) elif is_list(obj): if key != "": @@ -116,9 +140,12 @@ def _extract_items( item, path, index=index, - flattened_key=flattened_key + "[]" if flattened_key is not None else "[]", + flattened_key=( + (flattened_key if flattened_key is not None else "") + _array_suffix(array_format, array_index) + ), + array_format=array_format, ) - for item in obj + for array_index, item in enumerate(obj) ] ) @@ -176,21 +203,6 @@ def is_iterable(obj: object) -> TypeGuard[Iterable[object]]: return isinstance(obj, Iterable) -def deepcopy_minimal(item: _T) -> _T: - """Minimal reimplementation of copy.deepcopy() that will only copy certain object types: - - - mappings, e.g. `dict` - - list - - This is done for performance reasons. - """ - if is_mapping(item): - return cast(_T, {k: deepcopy_minimal(v) for k, v in item.items()}) - if is_list(item): - return cast(_T, [deepcopy_minimal(entry) for entry in item]) - return item - - # copied from https://github.com/Rapptz/RoboDanny def human_join(seq: Sequence[str], *, delim: str = ", ", final: str = "or") -> str: size = len(seq) diff --git a/src/reducto/_version.py b/src/reducto/_version.py index 497850e3..30639123 100644 --- a/src/reducto/_version.py +++ b/src/reducto/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
__title__ = "reducto" -__version__ = "0.20.0" # x-release-please-version +__version__ = "0.21.0" # x-release-please-version diff --git a/src/reducto/resources/classify.py b/src/reducto/resources/classify.py index 8702e75f..18128dfa 100644 --- a/src/reducto/resources/classify.py +++ b/src/reducto/resources/classify.py @@ -18,7 +18,7 @@ async_to_streamed_response_wrapper, ) from .._base_client import make_request_options -from ..types.classify_response import ClassifyResponse +from ..types.shared.classify_response import ClassifyResponse __all__ = ["ClassifyResource", "AsyncClassifyResource"] diff --git a/src/reducto/resources/edit.py b/src/reducto/resources/edit.py index 48cae916..9a3a56dc 100644 --- a/src/reducto/resources/edit.py +++ b/src/reducto/resources/edit.py @@ -18,10 +18,11 @@ async_to_streamed_response_wrapper, ) from .._base_client import make_request_options -from ..types.edit_response import EditResponse from ..types.edit_widget_param import EditWidgetParam from ..types.edit_options_param import EditOptionsParam -from ..types.edit_run_job_response import EditRunJobResponse +from ..types.shared.edit_response import EditResponse +from ..types.shared.async_edit_response import AsyncEditResponse +from ..types.shared_params.webhook_config_new import WebhookConfigNew __all__ = ["EditResource", "AsyncEditResource"] @@ -117,14 +118,14 @@ def run_job( edit_options: EditOptionsParam | Omit = omit, form_schema: Optional[Iterable[EditWidgetParam]] | Omit = omit, priority: bool | Omit = omit, - webhook: edit_run_job_params.Webhook | Omit = omit, + webhook: WebhookConfigNew | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
extra_headers: Headers | None = None, extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> EditRunJobResponse: + ) -> AsyncEditResponse: """Edit Async Args: @@ -171,7 +172,7 @@ def run_job( options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=EditRunJobResponse, + cast_to=AsyncEditResponse, ) @@ -266,14 +267,14 @@ async def run_job( edit_options: EditOptionsParam | Omit = omit, form_schema: Optional[Iterable[EditWidgetParam]] | Omit = omit, priority: bool | Omit = omit, - webhook: edit_run_job_params.Webhook | Omit = omit, + webhook: WebhookConfigNew | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> EditRunJobResponse: + ) -> AsyncEditResponse: """Edit Async Args: @@ -320,7 +321,7 @@ async def run_job( options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=EditRunJobResponse, + cast_to=AsyncEditResponse, ) diff --git a/src/reducto/resources/extract.py b/src/reducto/resources/extract.py index 699226c5..743f1705 100644 --- a/src/reducto/resources/extract.py +++ b/src/reducto/resources/extract.py @@ -26,8 +26,8 @@ from ..types.parse_options_param import ParseOptionsParam from ..types.extract_run_response import ExtractRunResponse from ..types.async_config_v3_param import AsyncConfigV3Param -from ..types.async_extract_response import AsyncExtractResponse from ..types.extract_settings_param import ExtractSettingsParam +from ..types.shared.async_extract_response import 
AsyncExtractResponse __all__ = ["ExtractResource", "AsyncExtractResource"] diff --git a/src/reducto/resources/parse.py b/src/reducto/resources/parse.py index 5beacb07..1d23710e 100644 --- a/src/reducto/resources/parse.py +++ b/src/reducto/resources/parse.py @@ -28,8 +28,8 @@ from ..types.formatting_param import FormattingParam from ..types.spreadsheet_param import SpreadsheetParam from ..types.parse_run_response import ParseRunResponse -from ..types.async_parse_response import AsyncParseResponse from ..types.async_config_v3_param import AsyncConfigV3Param +from ..types.shared.async_parse_response import AsyncParseResponse __all__ = ["ParseResource", "AsyncParseResource"] diff --git a/src/reducto/resources/pipeline.py b/src/reducto/resources/pipeline.py index 025334d8..f65d6c3c 100644 --- a/src/reducto/resources/pipeline.py +++ b/src/reducto/resources/pipeline.py @@ -16,10 +16,10 @@ async_to_streamed_response_wrapper, ) from .._base_client import make_request_options -from ..types.pipeline_response import PipelineResponse from ..types.async_config_v3_param import AsyncConfigV3Param from ..types.pipeline_settings_param import PipelineSettingsParam -from ..types.pipeline_run_job_response import PipelineRunJobResponse +from ..types.shared.pipeline_response import PipelineResponse +from ..types.shared.async_pipeline_response import AsyncPipelineResponse __all__ = ["PipelineResource", "AsyncPipelineResource"] @@ -111,7 +111,7 @@ def run_job( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> PipelineRunJobResponse: + ) -> AsyncPipelineResponse: """ Pipeline Async @@ -153,7 +153,7 @@ def run_job( options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=PipelineRunJobResponse, + cast_to=AsyncPipelineResponse, ) @@ -244,7 +244,7 @@ async def run_job( extra_query: Query | None = None, extra_body: Body | None = 
None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> PipelineRunJobResponse: + ) -> AsyncPipelineResponse: """ Pipeline Async @@ -286,7 +286,7 @@ async def run_job( options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=PipelineRunJobResponse, + cast_to=AsyncPipelineResponse, ) diff --git a/src/reducto/resources/split.py b/src/reducto/resources/split.py index ae2bcab7..553a42bb 100644 --- a/src/reducto/resources/split.py +++ b/src/reducto/resources/split.py @@ -18,12 +18,12 @@ async_to_streamed_response_wrapper, ) from .._base_client import make_request_options -from ..types.split_response import SplitResponse from ..types.parse_options_param import ParseOptionsParam from ..types.split_category_param import SplitCategoryParam from ..types.async_config_v3_param import AsyncConfigV3Param -from ..types.split_run_job_response import SplitRunJobResponse +from ..types.shared.split_response import SplitResponse from ..types.split_table_options_param import SplitTableOptionsParam +from ..types.shared.async_split_response import AsyncSplitResponse __all__ = ["SplitResource", "AsyncSplitResource"] @@ -126,7 +126,7 @@ def run_job( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> SplitRunJobResponse: + ) -> AsyncSplitResponse: """ Split Async @@ -175,7 +175,7 @@ def run_job( options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=SplitRunJobResponse, + cast_to=AsyncSplitResponse, ) @@ -277,7 +277,7 @@ async def run_job( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> SplitRunJobResponse: + ) -> AsyncSplitResponse: """ Split Async @@ -326,7 +326,7 @@ async def run_job( options=make_request_options( extra_headers=extra_headers, 
extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=SplitRunJobResponse, + cast_to=AsyncSplitResponse, ) diff --git a/src/reducto/types/__init__.py b/src/reducto/types/__init__.py index 9994a7aa..3f217832 100644 --- a/src/reducto/types/__init__.py +++ b/src/reducto/types/__init__.py @@ -2,28 +2,52 @@ from __future__ import annotations -from .shared import Upload as Upload +from .shared import ( + Upload as Upload, + Chunking as Chunking, + PageRange as PageRange, + TextAgentic as TextAgentic, + EditResponse as EditResponse, + EnrichConfig as EnrichConfig, + TableAgentic as TableAgentic, + FigureAgentic as FigureAgentic, + ParseResponse as ParseResponse, + SplitResponse as SplitResponse, + ChunkingConfig as ChunkingConfig, + ExtractResponse as ExtractResponse, + ClassifyResponse as ClassifyResponse, + PipelineResponse as PipelineResponse, + SplitLargeTables as SplitLargeTables, + WebhookConfigNew as WebhookConfigNew, + AsyncEditResponse as AsyncEditResponse, + SvixWebhookConfig as SvixWebhookConfig, + ArrayExtractConfig as ArrayExtractConfig, + AsyncParseResponse as AsyncParseResponse, + AsyncSplitResponse as AsyncSplitResponse, + TableSummaryConfig as TableSummaryConfig, + DirectWebhookConfig as DirectWebhookConfig, + FigureSummaryConfig as FigureSummaryConfig, + AsyncExtractResponse as AsyncExtractResponse, + AsyncPipelineResponse as AsyncPipelineResponse, + BaseProcessingOptions as BaseProcessingOptions, + AdvancedCitationsConfig as AdvancedCitationsConfig, + LargeTableChunkingConfig as LargeTableChunkingConfig, + AdvancedProcessingOptions as AdvancedProcessingOptions, + ExperimentalProcessingOptions as ExperimentalProcessingOptions, +) from .v3_extract import V3Extract as V3Extract from .edit_widget import EditWidget as EditWidget from .parse_usage import ParseUsage as ParseUsage from .bounding_box import BoundingBox as BoundingBox -from .edit_response import EditResponse as EditResponse from .enhance_param import EnhanceParam as 
EnhanceParam -from .extract_usage import ExtractUsage as ExtractUsage -from .parse_response import ParseResponse as ParseResponse from .settings_param import SettingsParam as SettingsParam -from .split_response import SplitResponse as SplitResponse from .edit_run_params import EditRunParams as EditRunParams from .retrieval_param import RetrievalParam as RetrievalParam -from .extract_response import ExtractResponse as ExtractResponse from .formatting_param import FormattingParam as FormattingParam from .job_get_response import JobGetResponse as JobGetResponse -from .page_range_param import PageRangeParam as PageRangeParam from .parse_run_params import ParseRunParams as ParseRunParams from .split_run_params import SplitRunParams as SplitRunParams -from .classify_response import ClassifyResponse as ClassifyResponse from .edit_widget_param import EditWidgetParam as EditWidgetParam -from .pipeline_response import PipelineResponse as PipelineResponse from .spreadsheet_param import SpreadsheetParam as SpreadsheetParam from .bounding_box_param import BoundingBoxParam as BoundingBoxParam from .edit_options_param import EditOptionsParam as EditOptionsParam @@ -36,7 +60,6 @@ from .parse_options_param import ParseOptionsParam as ParseOptionsParam from .pipeline_run_params import PipelineRunParams as PipelineRunParams from .api_version_response import APIVersionResponse as APIVersionResponse -from .async_parse_response import AsyncParseResponse as AsyncParseResponse from .client_upload_params import ClientUploadParams as ClientUploadParams from .extract_run_response import ExtractRunResponse as ExtractRunResponse from .job_get_all_response import JobGetAllResponse as JobGetAllResponse @@ -45,15 +68,11 @@ from .split_run_job_params import SplitRunJobParams as SplitRunJobParams from .webhook_run_response import WebhookRunResponse as WebhookRunResponse from .async_config_v3_param import AsyncConfigV3Param as AsyncConfigV3Param -from .edit_run_job_response import EditRunJobResponse 
as EditRunJobResponse -from .async_extract_response import AsyncExtractResponse as AsyncExtractResponse from .extract_run_job_params import ExtractRunJobParams as ExtractRunJobParams from .extract_settings_param import ExtractSettingsParam as ExtractSettingsParam -from .split_run_job_response import SplitRunJobResponse as SplitRunJobResponse from .pipeline_run_job_params import PipelineRunJobParams as PipelineRunJobParams from .pipeline_settings_param import PipelineSettingsParam as PipelineSettingsParam from .async_parse_config_param import AsyncParseConfigParam as AsyncParseConfigParam from .deep_split_page_evidence import DeepSplitPageEvidence as DeepSplitPageEvidence -from .pipeline_run_job_response import PipelineRunJobResponse as PipelineRunJobResponse from .split_table_options_param import SplitTableOptionsParam as SplitTableOptionsParam from .async_extract_config_param import AsyncExtractConfigParam as AsyncExtractConfigParam diff --git a/src/reducto/types/async_config_v3_param.py b/src/reducto/types/async_config_v3_param.py index 8d2e2156..500b235f 100644 --- a/src/reducto/types/async_config_v3_param.py +++ b/src/reducto/types/async_config_v3_param.py @@ -3,30 +3,14 @@ from __future__ import annotations from typing import Union, Optional -from typing_extensions import Literal, Required, TypeAlias, TypedDict +from typing_extensions import TypeAlias, TypedDict -from .._types import SequenceNotStr +from .shared_params.svix_webhook_config import SvixWebhookConfig +from .shared_params.direct_webhook_config import DirectWebhookConfig -__all__ = ["AsyncConfigV3Param", "Webhook", "WebhookSvixWebhookConfig", "WebhookDirectWebhookConfig"] +__all__ = ["AsyncConfigV3Param", "Webhook"] - -class WebhookSvixWebhookConfig(TypedDict, total=False): - channels: SequenceNotStr[str] - """ - A list of Svix channels the message will be delivered down, omit to send to all - channels. 
- """ - - mode: Literal["svix"] - - -class WebhookDirectWebhookConfig(TypedDict, total=False): - url: Required[str] - - mode: Literal["direct"] - - -Webhook: TypeAlias = Union[WebhookSvixWebhookConfig, WebhookDirectWebhookConfig] +Webhook: TypeAlias = Union[SvixWebhookConfig, DirectWebhookConfig] class AsyncConfigV3Param(TypedDict, total=False): diff --git a/src/reducto/types/classify_run_params.py b/src/reducto/types/classify_run_params.py index 5c3db299..3bf70235 100644 --- a/src/reducto/types/classify_run_params.py +++ b/src/reducto/types/classify_run_params.py @@ -6,7 +6,7 @@ from typing_extensions import Required, TypeAlias, TypedDict from .._types import SequenceNotStr -from .page_range_param import PageRangeParam +from .shared_params import page_range from .shared_params.upload import Upload __all__ = ["ClassifyRunParams", "Input", "ClassificationSchema", "PageRange"] @@ -63,4 +63,4 @@ class ClassificationSchema(TypedDict, total=False): """ -PageRange: TypeAlias = Union[PageRangeParam, Iterable[PageRangeParam], Iterable[int]] +PageRange: TypeAlias = Union[page_range.PageRange, Iterable[page_range.PageRange], Iterable[int]] diff --git a/src/reducto/types/client_upload_params.py b/src/reducto/types/client_upload_params.py index 60f0acc0..c8d7c223 100644 --- a/src/reducto/types/client_upload_params.py +++ b/src/reducto/types/client_upload_params.py @@ -5,10 +5,12 @@ from typing import Optional from typing_extensions import TypedDict +from .._types import FileTypes + __all__ = ["ClientUploadParams"] class ClientUploadParams(TypedDict, total=False): extension: Optional[str] - file: Optional[str] + file: Optional[FileTypes] diff --git a/src/reducto/types/edit_run_job_params.py b/src/reducto/types/edit_run_job_params.py index 5e3f628c..0d8372a2 100644 --- a/src/reducto/types/edit_run_job_params.py +++ b/src/reducto/types/edit_run_job_params.py @@ -3,14 +3,14 @@ from __future__ import annotations from typing import Union, Iterable, Optional -from typing_extensions 
import Literal, Required, TypeAlias, TypedDict +from typing_extensions import Required, TypeAlias, TypedDict -from .._types import SequenceNotStr from .edit_widget_param import EditWidgetParam from .edit_options_param import EditOptionsParam from .shared_params.upload import Upload +from .shared_params.webhook_config_new import WebhookConfigNew -__all__ = ["EditRunJobParams", "DocumentURL", "Webhook"] +__all__ = ["EditRunJobParams", "DocumentURL"] class EditRunJobParams(TypedDict, total=False): @@ -42,27 +42,7 @@ class EditRunJobParams(TypedDict, total=False): jobs. """ - webhook: Webhook + webhook: WebhookConfigNew DocumentURL: TypeAlias = Union[str, Upload] - - -class Webhook(TypedDict, total=False): - channels: SequenceNotStr[str] - """ - A list of Svix channels the message will be delivered down, omit to send to all - channels. - """ - - metadata: object - """JSON metadata included in webhook request body""" - - mode: Literal["disabled", "svix", "direct"] - """The mode to use for webhook delivery. - - Defaults to 'disabled'. We recommend using 'svix' for production environments. 
- """ - - url: str - """The URL to send the webhook to (if using direct webhoook).""" diff --git a/src/reducto/types/enhance_param.py b/src/reducto/types/enhance_param.py index 85d5569c..386c86f1 100644 --- a/src/reducto/types/enhance_param.py +++ b/src/reducto/types/enhance_param.py @@ -2,47 +2,16 @@ from __future__ import annotations -from typing import Union, Iterable, Optional -from typing_extensions import Literal, Required, TypeAlias, TypedDict +from typing import Union, Iterable +from typing_extensions import TypeAlias, TypedDict -__all__ = ["EnhanceParam", "Agentic", "AgenticTableAgentic", "AgenticFigureAgentic", "AgenticTextAgentic"] +from .shared_params.text_agentic import TextAgentic +from .shared_params.table_agentic import TableAgentic +from .shared_params.figure_agentic import FigureAgentic +__all__ = ["EnhanceParam", "Agentic"] -class AgenticTableAgentic(TypedDict, total=False): - scope: Required[Literal["table"]] - - prompt: Optional[str] - """Custom prompt for table agentic.""" - - -class AgenticFigureAgentic(TypedDict, total=False): - scope: Required[Literal["figure"]] - - advanced_chart_agent: bool - """If True, use the advanced chart agent. Defaults to False.""" - - prompt: Optional[str] - """Custom prompt for figure agentic.""" - - return_overlays: bool - """If True, return overlays for the figure. - - This is so you can use the overlays to double check the quality of the - extraction - """ - - -class AgenticTextAgentic(TypedDict, total=False): - scope: Required[Literal["text"]] - - prompt: Optional[str] - """Custom instructions for agentic text. - - Note: This only applies to form regions (key-value). 
- """ - - -Agentic: TypeAlias = Union[AgenticTableAgentic, AgenticFigureAgentic, AgenticTextAgentic] +Agentic: TypeAlias = Union[TableAgentic, FigureAgentic, TextAgentic] class EnhanceParam(TypedDict, total=False): diff --git a/src/reducto/types/extract_response.py b/src/reducto/types/extract_response.py deleted file mode 100644 index 947aa58e..00000000 --- a/src/reducto/types/extract_response.py +++ /dev/null @@ -1,27 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List, Optional - -from .._models import BaseModel -from .extract_usage import ExtractUsage - -__all__ = ["ExtractResponse"] - - -class ExtractResponse(BaseModel): - citations: Optional[List[object]] = None - """The citations corresponding to the extracted response.""" - - result: List[object] - """The extracted response in your provided schema. - - This is a list of dictionaries. If disable_chunking is True (default), then it - will be a list of length one. - """ - - usage: ExtractUsage - - job_id: Optional[str] = None - - studio_link: Optional[str] = None - """The link to the studio pipeline for the document.""" diff --git a/src/reducto/types/extract_run_response.py b/src/reducto/types/extract_run_response.py index e343fcfa..d19ea49a 100644 --- a/src/reducto/types/extract_run_response.py +++ b/src/reducto/types/extract_run_response.py @@ -4,7 +4,7 @@ from typing_extensions import TypeAlias from .v3_extract import V3Extract -from .async_extract_response import AsyncExtractResponse +from .shared.async_extract_response import AsyncExtractResponse __all__ = ["ExtractRunResponse"] diff --git a/src/reducto/types/extract_usage.py b/src/reducto/types/extract_usage.py deleted file mode 100644 index 2d6a0a30..00000000 --- a/src/reducto/types/extract_usage.py +++ /dev/null @@ -1,18 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Optional -from typing_extensions import Literal - -from .._models import BaseModel - -__all__ = ["ExtractUsage"] - - -class ExtractUsage(BaseModel): - num_fields: int - - num_pages: int - - credits: Optional[float] = None - - extract_mode: Optional[Literal["super_agent", "extract", "spreadsheet_agent"]] = None diff --git a/src/reducto/types/job_get_response.py b/src/reducto/types/job_get_response.py index 5190ae91..f38acf91 100644 --- a/src/reducto/types/job_get_response.py +++ b/src/reducto/types/job_get_response.py @@ -6,12 +6,12 @@ from .._models import BaseModel from .v3_extract import V3Extract -from .edit_response import EditResponse -from .parse_response import ParseResponse -from .split_response import SplitResponse -from .extract_response import ExtractResponse -from .classify_response import ClassifyResponse -from .pipeline_response import PipelineResponse +from .shared.edit_response import EditResponse +from .shared.parse_response import ParseResponse +from .shared.split_response import SplitResponse +from .shared.extract_response import ExtractResponse +from .shared.classify_response import ClassifyResponse +from .shared.pipeline_response import PipelineResponse __all__ = [ "JobGetResponse", diff --git a/src/reducto/types/parse_run_response.py b/src/reducto/types/parse_run_response.py index f12989e7..6c78116a 100644 --- a/src/reducto/types/parse_run_response.py +++ b/src/reducto/types/parse_run_response.py @@ -3,8 +3,8 @@ from typing import Union from typing_extensions import TypeAlias -from .parse_response import ParseResponse -from .async_parse_response import AsyncParseResponse +from .shared.parse_response import ParseResponse +from .shared.async_parse_response import AsyncParseResponse __all__ = ["ParseRunResponse"] diff --git a/src/reducto/types/parse_usage.py b/src/reducto/types/parse_usage.py index 74abb42d..fe1e375d 100644 --- a/src/reducto/types/parse_usage.py +++ b/src/reducto/types/parse_usage.py @@ -1,6 +1,7 @@ # File 
generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Dict, Optional +from typing import Dict, List, Optional +from typing_extensions import Literal from .._models import BaseModel @@ -13,3 +14,32 @@ class ParseUsage(BaseModel): credit_breakdown: Optional[Dict[str, float]] = None credits: Optional[float] = None + + page_billing_breakdown: Optional[ + Dict[ + str, + List[ + Literal[ + "page", + "html_page", + "docx_native_page", + "agentic", + "complex", + "chart_agent", + "spreadsheet_cells", + "billable_spreadsheet_pages", + "enrich_table", + "figure_summary", + "table_summary", + "key_value", + "agentic_text", + "promptable_agentic_text", + ] + ], + ] + ] = None + """Per-page breakdown of features used. + + Maps 1-indexed page numbers (as strings) to the list of billing features applied + on that page (e.g. 'page', 'complex', 'chart_agent'). + """ diff --git a/src/reducto/types/pipeline_run_job_response.py b/src/reducto/types/pipeline_run_job_response.py deleted file mode 100644 index efe6e839..00000000 --- a/src/reducto/types/pipeline_run_job_response.py +++ /dev/null @@ -1,9 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from .._models import BaseModel - -__all__ = ["PipelineRunJobResponse"] - - -class PipelineRunJobResponse(BaseModel): - job_id: str diff --git a/src/reducto/types/retrieval_param.py b/src/reducto/types/retrieval_param.py index 3cf32952..cbff9cf6 100644 --- a/src/reducto/types/retrieval_param.py +++ b/src/reducto/types/retrieval_param.py @@ -2,31 +2,12 @@ from __future__ import annotations -from typing import List, Optional +from typing import List from typing_extensions import Literal, TypedDict -__all__ = ["RetrievalParam", "Chunking"] +from .shared_params.chunking import Chunking - -class Chunking(TypedDict, total=False): - chunk_mode: Literal["variable", "section", "page", "disabled", "block", "page_sections"] - """Choose how to partition chunks. 
- - Variable mode chunks by character length and visual context. Section mode chunks - by section headers. Page mode chunks according to pages. Page sections mode - chunks first by page, then by sections within each page. Disabled returns one - single chunk. - """ - - chunk_overlap: int - """Number of characters of overlap to include from adjacent chunks. Defaults to 0.""" - - chunk_size: Optional[int] - """ - The approximate size of chunks (in characters) that the document will be split - into. Defaults to null, in which case the chunk size is variable between 250 - - 1500 characters. - """ +__all__ = ["RetrievalParam"] class RetrievalParam(TypedDict, total=False): diff --git a/src/reducto/types/settings_param.py b/src/reducto/types/settings_param.py index efa50cad..93130646 100644 --- a/src/reducto/types/settings_param.py +++ b/src/reducto/types/settings_param.py @@ -6,11 +6,11 @@ from typing_extensions import Literal, TypeAlias, TypedDict from .._types import SequenceNotStr -from .page_range_param import PageRangeParam +from .shared_params import page_range __all__ = ["SettingsParam", "PageRange"] -PageRange: TypeAlias = Union[PageRangeParam, Iterable[PageRangeParam], Iterable[int], SequenceNotStr[str]] +PageRange: TypeAlias = Union[page_range.PageRange, Iterable[page_range.PageRange], Iterable[int], SequenceNotStr[str]] class SettingsParam(TypedDict, total=False): diff --git a/src/reducto/types/shared/__init__.py b/src/reducto/types/shared/__init__.py index e9bf9399..bb73a836 100644 --- a/src/reducto/types/shared/__init__.py +++ b/src/reducto/types/shared/__init__.py @@ -1,3 +1,33 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
from .upload import Upload as Upload +from .chunking import Chunking as Chunking +from .page_range import PageRange as PageRange +from .text_agentic import TextAgentic as TextAgentic +from .edit_response import EditResponse as EditResponse +from .enrich_config import EnrichConfig as EnrichConfig +from .table_agentic import TableAgentic as TableAgentic +from .figure_agentic import FigureAgentic as FigureAgentic +from .parse_response import ParseResponse as ParseResponse +from .split_response import SplitResponse as SplitResponse +from .chunking_config import ChunkingConfig as ChunkingConfig +from .extract_response import ExtractResponse as ExtractResponse +from .classify_response import ClassifyResponse as ClassifyResponse +from .pipeline_response import PipelineResponse as PipelineResponse +from .split_large_tables import SplitLargeTables as SplitLargeTables +from .webhook_config_new import WebhookConfigNew as WebhookConfigNew +from .async_edit_response import AsyncEditResponse as AsyncEditResponse +from .svix_webhook_config import SvixWebhookConfig as SvixWebhookConfig +from .array_extract_config import ArrayExtractConfig as ArrayExtractConfig +from .async_parse_response import AsyncParseResponse as AsyncParseResponse +from .async_split_response import AsyncSplitResponse as AsyncSplitResponse +from .table_summary_config import TableSummaryConfig as TableSummaryConfig +from .direct_webhook_config import DirectWebhookConfig as DirectWebhookConfig +from .figure_summary_config import FigureSummaryConfig as FigureSummaryConfig +from .async_extract_response import AsyncExtractResponse as AsyncExtractResponse +from .async_pipeline_response import AsyncPipelineResponse as AsyncPipelineResponse +from .base_processing_options import BaseProcessingOptions as BaseProcessingOptions +from .advanced_citations_config import AdvancedCitationsConfig as AdvancedCitationsConfig +from .advanced_processing_options import AdvancedProcessingOptions as AdvancedProcessingOptions +from 
.large_table_chunking_config import LargeTableChunkingConfig as LargeTableChunkingConfig +from .experimental_processing_options import ExperimentalProcessingOptions as ExperimentalProcessingOptions diff --git a/src/reducto/types/shared/advanced_citations_config.py b/src/reducto/types/shared/advanced_citations_config.py new file mode 100644 index 00000000..bc4f525a --- /dev/null +++ b/src/reducto/types/shared/advanced_citations_config.py @@ -0,0 +1,12 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel + +__all__ = ["AdvancedCitationsConfig"] + + +class AdvancedCitationsConfig(BaseModel): + numerical_confidence: Optional[bool] = None + """If True, enable numeric citation confidence scores. Defaults to False.""" diff --git a/src/reducto/types/shared/advanced_processing_options.py b/src/reducto/types/shared/advanced_processing_options.py new file mode 100644 index 00000000..8fb4c1fe --- /dev/null +++ b/src/reducto/types/shared/advanced_processing_options.py @@ -0,0 +1,135 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Union, Optional +from typing_extensions import Literal, TypeAlias + +from . import page_range +from ..._models import BaseModel +from .large_table_chunking_config import LargeTableChunkingConfig + +__all__ = ["AdvancedProcessingOptions", "PageRange"] + +PageRange: TypeAlias = Union[page_range.PageRange, List[page_range.PageRange], List[int], List[str]] + + +class AdvancedProcessingOptions(BaseModel): + add_page_markers: Optional[bool] = None + """If True, add page markers to the output (e.g. + + [[PAGE 1 BEGINS HERE]] and [[PAGE 1 ENDS HERE]] added as blocks to the content). + Defaults to False. + """ + + continue_hierarchy: Optional[bool] = None + """ + A flag to indicate if the hierarchy of the document should be continued from + chunk to chunk. 
+ """ + + document_password: Optional[str] = None + """Password to decrypt password-protected documents.""" + + enable_change_tracking: Optional[bool] = None + """ + Enables model-based detection of underlines and strikethroughs, adding / + tags to OCR text. Works with any extraction mode. Defaults to False. + """ + + enable_highlight_detection: Optional[bool] = None + """If True, enable highlight detection. + + Highlighted text will be surrounded by tags in the output. Defaults to + False. + """ + + exclude_hidden_rows_cols: Optional[bool] = None + """Skip hidden rows and cols in Excel files. Defaults to False.""" + + exclude_hidden_sheets: Optional[bool] = None + """Skip hidden sheets in Excel files. Defaults to False.""" + + filter_line_numbers: Optional[bool] = None + """If True, filter out line numbers from the output. Defaults to False.""" + + force_file_extension: Optional[str] = None + """Force the URL to be downloaded as a specific file extension (e.g. .png).""" + + ignore_watermarks: Optional[bool] = None + """If True, ignore and remove watermarks from OCR output. Defaults to False.""" + + include_color_information: Optional[bool] = None + """ + If True, preserve Excel cell colours in the extracted spreadsheet text using + LaTeX colour commands. + """ + + include_dropdown_information: Optional[bool] = None + """ + If True, include dropdown options and the selected value when rendering + spreadsheet cells. + """ + + include_formula_information: Optional[bool] = None + """ + If True, preserve formula information in spreadsheet cells by wrapping text with + LaTeX formula commands during parsing. + """ + + keep_line_breaks: Optional[bool] = None + """If line breaks should be preserved in the text.""" + + large_table_chunking: Optional[LargeTableChunkingConfig] = None + """ + The configuration options for large table chunking (currently only supported on + spreadsheet and CSV files). 
+ """ + + merge_tables: Optional[bool] = None + """ + A flag to indicate if consecutive tables with the same number of columns should + be merged across breaks and spaces. + """ + + ocr_system: Optional[ + Literal["highres", "multilingual", "combined", "reducto", "legacy", "reducto-v2", "reducto-v3"] + ] = None + """The OCR system to use. + + Highres is recommended for documents with English characters. Legacy uses an + alternative OCR backend. + """ + + page_range: Optional[PageRange] = None + """The page range to process (1-indexed). + + By default, the entire document is processed. For spreadsheets, you can also + provide a list of sheet names. + """ + + persist_results: Optional[bool] = None + """If True, persist the results indefinitely. Defaults to False.""" + + read_comments: Optional[bool] = None + """If True, pull in PDF comments from the document. Defaults to False.""" + + remove_text_formatting: Optional[bool] = None + """If True, remove text formatting from the output (e.g. + + hyphens for list items). Defaults to False. + """ + + return_ocr_data: Optional[bool] = None + """If True, return OCR data in the result. Defaults to False.""" + + spreadsheet_table_clustering: Optional[Literal["default", "disabled", "intelligent"]] = None + """ + In a spreadsheet with different tables inside, we enable splitting up the tables + by default. Intelligent mode applies more powerful models for superior accuracy, + at 5× the default per-cell rate. Disabling will register as one large table. + """ + + table_output_format: Optional[Literal["html", "json", "md", "jsonbbox", "dynamic", "ai_json", "csv"]] = None + """The mode to use for table output. + + Dynamic returns md for simpler tables and html for more complex tables. 
+ """ diff --git a/src/reducto/types/shared/array_extract_config.py b/src/reducto/types/shared/array_extract_config.py new file mode 100644 index 00000000..bf2c5a58 --- /dev/null +++ b/src/reducto/types/shared/array_extract_config.py @@ -0,0 +1,29 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["ArrayExtractConfig"] + + +class ArrayExtractConfig(BaseModel): + enabled: Optional[bool] = None + """ + Array extraction allows you to extract long lists of information from lengthy + documents. It makes parallel calls on overlapping sections of the document. + """ + + mode: Optional[Literal["auto", "legacy", "streaming", "no_overlap"]] = None + """The array extraction version to use.""" + + pages_per_segment: Optional[int] = None + """Length of each segment, in pages, for parallel calls with array extraction.""" + + streaming_extract_item_density: Optional[int] = None + """Number of items to extract in each stream call. + + Lower numbers will increase quality but be much slower. 50 works well for most + documents with tables. + """ diff --git a/src/reducto/types/edit_run_job_response.py b/src/reducto/types/shared/async_edit_response.py similarity index 50% rename from src/reducto/types/edit_run_job_response.py rename to src/reducto/types/shared/async_edit_response.py index 967f46d8..da6fcba4 100644 --- a/src/reducto/types/edit_run_job_response.py +++ b/src/reducto/types/shared/async_edit_response.py @@ -1,9 +1,9 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from .._models import BaseModel +from ..._models import BaseModel -__all__ = ["EditRunJobResponse"] +__all__ = ["AsyncEditResponse"] -class EditRunJobResponse(BaseModel): +class AsyncEditResponse(BaseModel): job_id: str diff --git a/src/reducto/types/async_extract_response.py b/src/reducto/types/shared/async_extract_response.py similarity index 84% rename from src/reducto/types/async_extract_response.py rename to src/reducto/types/shared/async_extract_response.py index 7ee83afa..5bafe1ca 100644 --- a/src/reducto/types/async_extract_response.py +++ b/src/reducto/types/shared/async_extract_response.py @@ -1,6 +1,6 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from .._models import BaseModel +from ..._models import BaseModel __all__ = ["AsyncExtractResponse"] diff --git a/src/reducto/types/async_parse_response.py b/src/reducto/types/shared/async_parse_response.py similarity index 84% rename from src/reducto/types/async_parse_response.py rename to src/reducto/types/shared/async_parse_response.py index 41610236..9df7fe5d 100644 --- a/src/reducto/types/async_parse_response.py +++ b/src/reducto/types/shared/async_parse_response.py @@ -1,6 +1,6 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from .._models import BaseModel +from ..._models import BaseModel __all__ = ["AsyncParseResponse"] diff --git a/src/reducto/types/shared/async_pipeline_response.py b/src/reducto/types/shared/async_pipeline_response.py new file mode 100644 index 00000000..ca6a8829 --- /dev/null +++ b/src/reducto/types/shared/async_pipeline_response.py @@ -0,0 +1,9 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from ..._models import BaseModel + +__all__ = ["AsyncPipelineResponse"] + + +class AsyncPipelineResponse(BaseModel): + job_id: str diff --git a/src/reducto/types/split_run_job_response.py b/src/reducto/types/shared/async_split_response.py similarity index 50% rename from src/reducto/types/split_run_job_response.py rename to src/reducto/types/shared/async_split_response.py index 404b32fe..5294328b 100644 --- a/src/reducto/types/split_run_job_response.py +++ b/src/reducto/types/shared/async_split_response.py @@ -1,9 +1,9 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from .._models import BaseModel +from ..._models import BaseModel -__all__ = ["SplitRunJobResponse"] +__all__ = ["AsyncSplitResponse"] -class SplitRunJobResponse(BaseModel): +class AsyncSplitResponse(BaseModel): job_id: str diff --git a/src/reducto/types/shared/base_processing_options.py b/src/reducto/types/shared/base_processing_options.py new file mode 100644 index 00000000..50f69666 --- /dev/null +++ b/src/reducto/types/shared/base_processing_options.py @@ -0,0 +1,67 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ..._models import BaseModel +from .chunking_config import ChunkingConfig +from .table_summary_config import TableSummaryConfig +from .figure_summary_config import FigureSummaryConfig + +__all__ = ["BaseProcessingOptions"] + + +class BaseProcessingOptions(BaseModel): + chunking: Optional[ChunkingConfig] = None + """The configuration options for chunking. + + Chunking is commonly used for RAG usecases. + """ + + extraction_mode: Optional[Literal["ocr", "metadata", "hybrid"]] = None + """The mode to use for extraction. + + Metadata/hybrid are only recommended with high quality metadata embeddings. 
+ """ + + figure_summary: Optional[FigureSummaryConfig] = None + """The configuration options for figure summarization.""" + + filter_blocks: Optional[ + List[ + Literal[ + "Header", + "Footer", + "Title", + "Section Header", + "Page Number", + "List Item", + "Figure", + "Table", + "Key Value", + "Text", + "Comment", + "Signature", + ] + ] + ] = None + """A list of block types to filter from chunk content. + + Pass blocks to filter them from content. By default, no blocks are filtered. + """ + + force_url_result: Optional[bool] = None + """ + Force the result to be returned in URL form (by default only used for very large + responses). + """ + + ocr_mode: Optional[Literal["standard", "agentic"]] = None + """The mode to use for OCR. + + Agentic mode adds an extra pass, correcting any table/text mistakes at a small + cost. + """ + + table_summary: Optional[TableSummaryConfig] = None + """The configuration options for table summarization.""" diff --git a/src/reducto/types/shared/chunking.py b/src/reducto/types/shared/chunking.py new file mode 100644 index 00000000..b56c1250 --- /dev/null +++ b/src/reducto/types/shared/chunking.py @@ -0,0 +1,29 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["Chunking"] + + +class Chunking(BaseModel): + chunk_mode: Optional[Literal["variable", "section", "page", "disabled", "block", "page_sections"]] = None + """Choose how to partition chunks. + + Variable mode chunks by character length and visual context. Section mode chunks + by section headers. Page mode chunks according to pages. Page sections mode + chunks first by page, then by sections within each page. Disabled returns one + single chunk. + """ + + chunk_overlap: Optional[int] = None + """Number of characters of overlap to include from adjacent chunks. 
Defaults to 0.""" + + chunk_size: Optional[int] = None + """ + The approximate size of chunks (in characters) that the document will be split + into. Defaults to null, in which case the chunk size is variable between 250 - + 1500 characters. + """ diff --git a/src/reducto/types/shared/chunking_config.py b/src/reducto/types/shared/chunking_config.py new file mode 100644 index 00000000..078ad35d --- /dev/null +++ b/src/reducto/types/shared/chunking_config.py @@ -0,0 +1,29 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["ChunkingConfig"] + + +class ChunkingConfig(BaseModel): + chunk_mode: Optional[Literal["variable", "section", "page", "block", "disabled", "page_sections"]] = None + """Choose how to partition chunks. + + Variable mode chunks by character length and visual context. Section mode chunks + by section headers. Page mode chunks according to pages. Page sections mode + chunks first by page, then by sections within each page. Disabled returns one + single chunk. + """ + + chunk_overlap: Optional[int] = None + """Number of characters of overlap to include from adjacent chunks. Defaults to 0.""" + + chunk_size: Optional[int] = None + """ + The approximate size of chunks (in characters) that the document will be split + into. Defaults to None, in which case the chunk size is variable between 250 - + 1500 characters. 
+ """ diff --git a/src/reducto/types/classify_response.py b/src/reducto/types/shared/classify_response.py similarity index 93% rename from src/reducto/types/classify_response.py rename to src/reducto/types/shared/classify_response.py index 05d58978..1a2e07cc 100644 --- a/src/reducto/types/classify_response.py +++ b/src/reducto/types/shared/classify_response.py @@ -3,7 +3,7 @@ from typing import List, Optional from typing_extensions import Literal -from .._models import BaseModel +from ..._models import BaseModel __all__ = [ "ClassifyResponse", @@ -54,3 +54,5 @@ class ClassifyResponse(BaseModel): response_confidence: Optional[ResponseConfidence] = None """Overall confidence breakdown for classification response.""" + + response_type: Optional[Literal["classify"]] = None diff --git a/src/reducto/types/shared/direct_webhook_config.py b/src/reducto/types/shared/direct_webhook_config.py new file mode 100644 index 00000000..b0bff755 --- /dev/null +++ b/src/reducto/types/shared/direct_webhook_config.py @@ -0,0 +1,14 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["DirectWebhookConfig"] + + +class DirectWebhookConfig(BaseModel): + url: str + + mode: Optional[Literal["direct"]] = None diff --git a/src/reducto/types/edit_response.py b/src/reducto/types/shared/edit_response.py similarity index 74% rename from src/reducto/types/edit_response.py rename to src/reducto/types/shared/edit_response.py index 91c3217f..b8a4cf01 100644 --- a/src/reducto/types/edit_response.py +++ b/src/reducto/types/shared/edit_response.py @@ -1,10 +1,11 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
from typing import List, Optional +from typing_extensions import Literal -from .._models import BaseModel -from .edit_widget import EditWidget -from .parse_usage import ParseUsage +from ..._models import BaseModel +from ..edit_widget import EditWidget +from ..parse_usage import ParseUsage __all__ = ["EditResponse"] @@ -19,6 +20,8 @@ class EditResponse(BaseModel): List of widgets with their types, descriptions, and bounding boxes. """ + response_type: Optional[Literal["edit"]] = None + usage: Optional[ParseUsage] = None """ Usage information for the edit operation, including number of pages and credits diff --git a/src/reducto/types/shared/enrich_config.py b/src/reducto/types/shared/enrich_config.py new file mode 100644 index 00000000..12ab6e81 --- /dev/null +++ b/src/reducto/types/shared/enrich_config.py @@ -0,0 +1,23 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["EnrichConfig"] + + +class EnrichConfig(BaseModel): + enabled: Optional[bool] = None + """ + If enabled, a large language/vision model will be used to postprocess the + extracted content. Note: enabling enrich requires tables be outputted in + markdown format. Defaults to False. + """ + + mode: Optional[Literal["standard", "page", "table"]] = None + """The mode to use for enrichment. Defaults to standard""" + + prompt: Optional[str] = None + """Add information to the prompt for enrichment.""" diff --git a/src/reducto/types/shared/experimental_processing_options.py b/src/reducto/types/shared/experimental_processing_options.py new file mode 100644 index 00000000..6533dd4f --- /dev/null +++ b/src/reducto/types/shared/experimental_processing_options.py @@ -0,0 +1,138 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from typing import TYPE_CHECKING, Dict, Optional +from typing_extensions import Literal + +from pydantic import Field as FieldInfo + +from ..._models import BaseModel +from .enrich_config import EnrichConfig + +__all__ = ["ExperimentalProcessingOptions"] + + +class ExperimentalProcessingOptions(BaseModel): + chunk_table_blocks: Optional[bool] = None + """ + If True, split table blocks into smaller chunks based on the specified chunk + size in the chunking option. Defaults to False. + """ + + danger_filter_wide_boxes: Optional[bool] = None + """You probably shouldn't use this. + + If True, filter out boxes with width greater than 50% of the document width. + Defaults to False. You probably don't want to use this. + """ + + detect_signatures: Optional[bool] = None + """If True, detect signatures in the document. Defaults to False.""" + + disable_office_external_links: Optional[bool] = None + """ + If True, configure LibreOffice conversion to block linked content from untrusted + documents. Defaults to True on-prem and False elsewhere. + """ + + embed_text_metadata_pdf: Optional[bool] = None + """ + If extracted OCR text metadata should be embedded back into the returned PDF, + overwriting any existing text. Defaults to False. + """ + + enable_checkboxes: Optional[bool] = None + """ + Use an experimental checkbox detection model to add checkboxes to the output, + defaults to False + """ + + enable_equations: Optional[bool] = None + """ + Use an experimental equation detection model to add equations to the output, + defaults to False + """ + + enable_scripts: Optional[bool] = None + """ + Add tag around subscripts and tag around superscripts, defaults to + False + """ + + enrich: Optional[EnrichConfig] = None + """The configuration options for enrichment.""" + + latency_sensitive: Optional[bool] = None + """If True, the job will be processed with lower latency and higher priority. + + Uses 2x the cost of a regular job. Defaults to False. 
+ """ + + layout_enrichment: Optional[bool] = None + """ + Layout enrichment is a beta feature that improves our layout and reading order + performance at the cost of increased latency. Defaults to False. + """ + + layout_model: Optional[ + Literal[ + "default", + "beta", + "dfine", + "rfdetr", + "rfdetr0302", + "rfdetr0303", + "rfdetrbase0218", + "rfdetr0304", + "rfdetr0306", + "qwen35_27b_0317", + ] + ] = None + """The layout model to use for the document. + + This will be deprecated in the future. + """ + + native_office_conversion: Optional[bool] = None + """ + Instead of using LibreOffice, when enabled, this flag uses a Windows VM to + convert files. This is slower but more accurate. + """ + + promptable_agentic_text_on_regular_blocks: Optional[bool] = None + """ + If True, enable two-stage LLM pipeline for agentic text correction on regular + text blocks. Defaults to False. + """ + + return_figure_images: Optional[bool] = None + """If figure images should be returned in the result. Defaults to False.""" + + return_page_images: Optional[bool] = None + """If full page images should be returned in the result. Defaults to False.""" + + return_table_images: Optional[bool] = None + """If table images should be returned in the result. Defaults to False.""" + + rotate_figures: Optional[bool] = None + """ + Use an orientation model to detect and rotate figures as needed, defaults to + False + """ + + rotate_pages: Optional[bool] = None + """Use an orientation model to detect and rotate pages as needed, defaults to True""" + + user_specified_timeout_seconds: Optional[float] = None + """A user specified timeout, defaults to None""" + + if TYPE_CHECKING: + # Some versions of Pydantic <2.8.0 have a bug and don’t allow assigning a + # value to this field, so for compatibility we avoid doing it at runtime. 
+ __pydantic_extra__: Dict[str, object] = FieldInfo(init=False) # pyright: ignore[reportIncompatibleVariableOverride] + + # Stub to indicate that arbitrary properties are accepted. + # To access properties that are not valid identifiers you can use `getattr`, e.g. + # `getattr(obj, '$type')` + def __getattr__(self, attr: str) -> object: ... + else: + __pydantic_extra__: Dict[str, object] diff --git a/src/reducto/types/shared/extract_response.py b/src/reducto/types/shared/extract_response.py new file mode 100644 index 00000000..b071e9fc --- /dev/null +++ b/src/reducto/types/shared/extract_response.py @@ -0,0 +1,8 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Dict +from typing_extensions import TypeAlias + +__all__ = ["ExtractResponse"] + +ExtractResponse: TypeAlias = Dict[str, object] diff --git a/src/reducto/types/shared/figure_agentic.py b/src/reducto/types/shared/figure_agentic.py new file mode 100644 index 00000000..2846fe2d --- /dev/null +++ b/src/reducto/types/shared/figure_agentic.py @@ -0,0 +1,25 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["FigureAgentic"] + + +class FigureAgentic(BaseModel): + scope: Literal["figure"] + + advanced_chart_agent: Optional[bool] = None + """If True, use the advanced chart agent. Defaults to False.""" + + prompt: Optional[str] = None + """Custom prompt for figure agentic.""" + + return_overlays: Optional[bool] = None + """If True, return overlays for the figure. 
+ + This is so you can use the overlays to double check the quality of the + extraction + """ diff --git a/src/reducto/types/shared/figure_summary_config.py b/src/reducto/types/shared/figure_summary_config.py new file mode 100644 index 00000000..7e8a9d59 --- /dev/null +++ b/src/reducto/types/shared/figure_summary_config.py @@ -0,0 +1,25 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel + +__all__ = ["FigureSummaryConfig"] + + +class FigureSummaryConfig(BaseModel): + advanced_chart_agent: Optional[bool] = None + """If True, use the advanced chart agent. Defaults to False.""" + + enabled: Optional[bool] = None + """If figure summarization should be performed.""" + + override: Optional[bool] = None + """If the figure summary prompt should override our default prompt.""" + + prompt: Optional[str] = None + """Add information to the prompt for figure summarization. + + Note any visual cues that should be incorporated. Example: 'When provided a + diagram, extract all of the figure content verbatim.' + """ diff --git a/src/reducto/types/shared/large_table_chunking_config.py b/src/reducto/types/shared/large_table_chunking_config.py new file mode 100644 index 00000000..f0b033d6 --- /dev/null +++ b/src/reducto/types/shared/large_table_chunking_config.py @@ -0,0 +1,21 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel + +__all__ = ["LargeTableChunkingConfig"] + + +class LargeTableChunkingConfig(BaseModel): + enabled: Optional[bool] = None + """ + If large tables should be chunked into smaller tables, currently only supported + on spreadsheet and CSV files. + """ + + size: Optional[int] = None + """The max row/column size for a table to be chunked. + + Defaults to 50. Header rows/columns are persisted based on heuristics. 
+ """ diff --git a/src/reducto/types/shared/page_range.py b/src/reducto/types/shared/page_range.py new file mode 100644 index 00000000..9ecdd60a --- /dev/null +++ b/src/reducto/types/shared/page_range.py @@ -0,0 +1,15 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel + +__all__ = ["PageRange"] + + +class PageRange(BaseModel): + end: Optional[int] = None + """The page number to stop processing at (1-indexed).""" + + start: Optional[int] = None + """The page number to start processing from (1-indexed).""" diff --git a/src/reducto/types/parse_response.py b/src/reducto/types/shared/parse_response.py similarity index 89% rename from src/reducto/types/parse_response.py rename to src/reducto/types/shared/parse_response.py index d9fa50da..cafaeaf8 100644 --- a/src/reducto/types/parse_response.py +++ b/src/reducto/types/shared/parse_response.py @@ -3,9 +3,9 @@ from typing import Dict, List, Union, Optional from typing_extensions import Literal, TypeAlias -from .._models import BaseModel -from .parse_usage import ParseUsage -from .bounding_box import BoundingBox +from ..._models import BaseModel +from ..parse_usage import ParseUsage +from ..bounding_box import BoundingBox __all__ = [ "ParseResponse", @@ -177,8 +177,19 @@ class ParseResponse(BaseModel): usage: ParseUsage + parse_mode: Optional[Literal["base", "lite"]] = None + """Which pipeline produced this response. + + `lite` means Reducto Flash Lite served the request; `base` is the standard + pipeline. Optional / nullable for forward compatibility — older API instances or + persisted responses written before this field existed will leave it `None`; + treat `None` as `base`. 
+ """ + pdf_url: Optional[str] = None """The storage URL of the converted PDF file.""" + response_type: Optional[Literal["parse"]] = None + studio_link: Optional[str] = None """The link to the studio pipeline for the document.""" diff --git a/src/reducto/types/pipeline_response.py b/src/reducto/types/shared/pipeline_response.py similarity index 55% rename from src/reducto/types/pipeline_response.py rename to src/reducto/types/shared/pipeline_response.py index cd7faafb..c2e2deda 100644 --- a/src/reducto/types/pipeline_response.py +++ b/src/reducto/types/shared/pipeline_response.py @@ -1,47 +1,36 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. from typing import List, Union, Optional -from typing_extensions import TypeAlias +from typing_extensions import Literal, TypeAlias -from .._models import BaseModel -from .v3_extract import V3Extract -from .parse_usage import ParseUsage +from ..._models import BaseModel +from ..v3_extract import V3Extract +from ..parse_usage import ParseUsage from .edit_response import EditResponse from .parse_response import ParseResponse from .split_response import SplitResponse from .extract_response import ExtractResponse -__all__ = [ - "PipelineResponse", - "Result", - "ResultExtract", - "ResultExtractExtractVariant0", - "ResultExtractExtractVariant0Result", - "ResultParse", -] +__all__ = ["PipelineResponse", "Result", "ResultExtractUnionMember0", "ResultParse"] -ResultExtractExtractVariant0Result: TypeAlias = Union[ExtractResponse, V3Extract] - -class ResultExtractExtractVariant0(BaseModel): +class ResultExtractUnionMember0(BaseModel): """This is the response format for Extract -> Split Pipelines""" page_range: List[int] - result: ResultExtractExtractVariant0Result + result: Union[ExtractResponse, V3Extract] split_name: str partition: Optional[str] = None -ResultExtract: TypeAlias = Union[List[ResultExtractExtractVariant0], ExtractResponse, V3Extract, None] - ResultParse: TypeAlias = 
Union[ParseResponse, List[ParseResponse], None] class Result(BaseModel): - extract: Optional[ResultExtract] = None + extract: Union[List[ResultExtractUnionMember0], ExtractResponse, V3Extract, None] = None parse: Optional[ResultParse] = None @@ -56,3 +45,5 @@ class PipelineResponse(BaseModel): result: Result usage: ParseUsage + + response_type: Optional[Literal["pipeline"]] = None diff --git a/src/reducto/types/shared/split_large_tables.py b/src/reducto/types/shared/split_large_tables.py new file mode 100644 index 00000000..33409b77 --- /dev/null +++ b/src/reducto/types/shared/split_large_tables.py @@ -0,0 +1,38 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Union, Optional +from typing_extensions import TypeAlias + +from ..._models import BaseModel + +__all__ = ["SplitLargeTables", "Size", "SizeSplitLargeTableSizes"] + + +class SizeSplitLargeTableSizes(BaseModel): + column: Optional[int] = None + """The number of columns to include in each chunk when splitting large tables. + + Does not chunk columns if set to None. + """ + + row: Optional[int] = None + """The number of rows to include in each chunk when splitting large tables. + + Does not chunk rows if set to None. + """ + + +Size: TypeAlias = Union[int, SizeSplitLargeTableSizes] + + +class SplitLargeTables(BaseModel): + enabled: Optional[bool] = None + """If True, split large tables into smaller tables. Defaults to True.""" + + size: Optional[Size] = None + """The size of the tables to split into. + + Defaults to 50. Use 'row' and 'column' to independently specify the number of + rows and columns to include when splitting. If you only want to split by rows or + columns, set the other value to None. 
+ """ diff --git a/src/reducto/types/split_response.py b/src/reducto/types/shared/split_response.py similarity index 88% rename from src/reducto/types/split_response.py rename to src/reducto/types/shared/split_response.py index b3544af4..d6e74073 100644 --- a/src/reducto/types/split_response.py +++ b/src/reducto/types/shared/split_response.py @@ -3,9 +3,9 @@ from typing import Dict, List, Union, Optional from typing_extensions import Literal, TypeAlias -from .._models import BaseModel -from .parse_usage import ParseUsage -from .deep_split_page_evidence import DeepSplitPageEvidence +from ..._models import BaseModel +from ..parse_usage import ParseUsage +from ..deep_split_page_evidence import DeepSplitPageEvidence __all__ = [ "SplitResponse", @@ -69,3 +69,5 @@ class SplitResponse(BaseModel): """The split result.""" usage: ParseUsage + + response_type: Optional[Literal["split"]] = None diff --git a/src/reducto/types/shared/svix_webhook_config.py b/src/reducto/types/shared/svix_webhook_config.py new file mode 100644 index 00000000..4cbb5627 --- /dev/null +++ b/src/reducto/types/shared/svix_webhook_config.py @@ -0,0 +1,18 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["SvixWebhookConfig"] + + +class SvixWebhookConfig(BaseModel): + channels: Optional[List[str]] = None + """ + A list of Svix channels the message will be delivered down, omit to send to all + channels. + """ + + mode: Optional[Literal["svix"]] = None diff --git a/src/reducto/types/shared/table_agentic.py b/src/reducto/types/shared/table_agentic.py new file mode 100644 index 00000000..3cd206e6 --- /dev/null +++ b/src/reducto/types/shared/table_agentic.py @@ -0,0 +1,21 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TableAgentic"] + + +class TableAgentic(BaseModel): + scope: Literal["table"] + + mode: Optional[Literal["default", "auto"]] = None + """ + Routing mode for table agentic: 'default' runs enrichment on all tables, 'auto' + uses the router to skip tables where enrichment is unlikely to help. + """ + + prompt: Optional[str] = None + """Custom prompt for table agentic.""" diff --git a/src/reducto/types/shared/table_summary_config.py b/src/reducto/types/shared/table_summary_config.py new file mode 100644 index 00000000..f4b4c776 --- /dev/null +++ b/src/reducto/types/shared/table_summary_config.py @@ -0,0 +1,15 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel + +__all__ = ["TableSummaryConfig"] + + +class TableSummaryConfig(BaseModel): + enabled: Optional[bool] = None + """If table summarization should be performed.""" + + prompt: Optional[str] = None + """Add information to the prompt for table summarization.""" diff --git a/src/reducto/types/shared/text_agentic.py b/src/reducto/types/shared/text_agentic.py new file mode 100644 index 00000000..e2074786 --- /dev/null +++ b/src/reducto/types/shared/text_agentic.py @@ -0,0 +1,18 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TextAgentic"] + + +class TextAgentic(BaseModel): + scope: Literal["text"] + + prompt: Optional[str] = None + """Custom instructions for agentic text. + + Note: This only applies to form regions (key-value). 
+ """ diff --git a/src/reducto/types/shared/webhook_config_new.py b/src/reducto/types/shared/webhook_config_new.py new file mode 100644 index 00000000..e28c8b6d --- /dev/null +++ b/src/reducto/types/shared/webhook_config_new.py @@ -0,0 +1,28 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["WebhookConfigNew"] + + +class WebhookConfigNew(BaseModel): + channels: Optional[List[str]] = None + """ + A list of Svix channels the message will be delivered down, omit to send to all + channels. + """ + + metadata: Optional[object] = None + """JSON metadata included in webhook request body""" + + mode: Optional[Literal["disabled", "svix", "direct"]] = None + """The mode to use for webhook delivery. + + Defaults to 'disabled'. We recommend using 'svix' for production environments. + """ + + url: Optional[str] = None + """The URL to send the webhook to (if using direct webhook).""" diff --git a/src/reducto/types/shared_params/__init__.py b/src/reducto/types/shared_params/__init__.py index e9bf9399..4a4299d3 100644 --- a/src/reducto/types/shared_params/__init__.py +++ b/src/reducto/types/shared_params/__init__.py @@ -1,3 +1,12 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
from .upload import Upload as Upload +from .chunking import Chunking as Chunking +from .page_range import PageRange as PageRange +from .text_agentic import TextAgentic as TextAgentic +from .table_agentic import TableAgentic as TableAgentic +from .figure_agentic import FigureAgentic as FigureAgentic +from .split_large_tables import SplitLargeTables as SplitLargeTables +from .webhook_config_new import WebhookConfigNew as WebhookConfigNew +from .svix_webhook_config import SvixWebhookConfig as SvixWebhookConfig +from .direct_webhook_config import DirectWebhookConfig as DirectWebhookConfig diff --git a/src/reducto/types/shared_params/chunking.py b/src/reducto/types/shared_params/chunking.py new file mode 100644 index 00000000..2d79eda7 --- /dev/null +++ b/src/reducto/types/shared_params/chunking.py @@ -0,0 +1,29 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal, TypedDict + +__all__ = ["Chunking"] + + +class Chunking(TypedDict, total=False): + chunk_mode: Literal["variable", "section", "page", "disabled", "block", "page_sections"] + """Choose how to partition chunks. + + Variable mode chunks by character length and visual context. Section mode chunks + by section headers. Page mode chunks according to pages. Page sections mode + chunks first by page, then by sections within each page. Disabled returns one + single chunk. + """ + + chunk_overlap: int + """Number of characters of overlap to include from adjacent chunks. Defaults to 0.""" + + chunk_size: Optional[int] + """ + The approximate size of chunks (in characters) that the document will be split + into. Defaults to null, in which case the chunk size is variable between 250 - + 1500 characters. 
+ """ diff --git a/src/reducto/types/shared_params/direct_webhook_config.py b/src/reducto/types/shared_params/direct_webhook_config.py new file mode 100644 index 00000000..841219f1 --- /dev/null +++ b/src/reducto/types/shared_params/direct_webhook_config.py @@ -0,0 +1,13 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["DirectWebhookConfig"] + + +class DirectWebhookConfig(TypedDict, total=False): + url: Required[str] + + mode: Literal["direct"] diff --git a/src/reducto/types/shared_params/figure_agentic.py b/src/reducto/types/shared_params/figure_agentic.py new file mode 100644 index 00000000..607034fb --- /dev/null +++ b/src/reducto/types/shared_params/figure_agentic.py @@ -0,0 +1,25 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["FigureAgentic"] + + +class FigureAgentic(TypedDict, total=False): + scope: Required[Literal["figure"]] + + advanced_chart_agent: bool + """If True, use the advanced chart agent. Defaults to False.""" + + prompt: Optional[str] + """Custom prompt for figure agentic.""" + + return_overlays: bool + """If True, return overlays for the figure. 
+ + This is so you can use the overlays to double check the quality of the + extraction + """ diff --git a/src/reducto/types/page_range_param.py b/src/reducto/types/shared_params/page_range.py similarity index 83% rename from src/reducto/types/page_range_param.py rename to src/reducto/types/shared_params/page_range.py index d2b9e2a5..780830b7 100644 --- a/src/reducto/types/page_range_param.py +++ b/src/reducto/types/shared_params/page_range.py @@ -5,10 +5,10 @@ from typing import Optional from typing_extensions import TypedDict -__all__ = ["PageRangeParam"] +__all__ = ["PageRange"] -class PageRangeParam(TypedDict, total=False): +class PageRange(TypedDict, total=False): end: Optional[int] """The page number to stop processing at (1-indexed).""" diff --git a/src/reducto/types/shared_params/split_large_tables.py b/src/reducto/types/shared_params/split_large_tables.py new file mode 100644 index 00000000..82c1c45b --- /dev/null +++ b/src/reducto/types/shared_params/split_large_tables.py @@ -0,0 +1,38 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Union, Optional +from typing_extensions import TypeAlias, TypedDict + +__all__ = ["SplitLargeTables", "Size", "SizeSplitLargeTableSizes"] + + +class SizeSplitLargeTableSizes(TypedDict, total=False): + column: Optional[int] + """The number of columns to include in each chunk when splitting large tables. + + Does not chunk columns if set to None. + """ + + row: Optional[int] + """The number of rows to include in each chunk when splitting large tables. + + Does not chunk rows if set to None. + """ + + +Size: TypeAlias = Union[int, SizeSplitLargeTableSizes] + + +class SplitLargeTables(TypedDict, total=False): + enabled: bool + """If True, split large tables into smaller tables. Defaults to True.""" + + size: Size + """The size of the tables to split into. + + Defaults to 50. 
Use 'row' and 'column' to independently specify the number of + rows and columns to include when splitting. If you only want to split by rows or + columns, set the other value to None. + """ diff --git a/src/reducto/types/shared_params/svix_webhook_config.py b/src/reducto/types/shared_params/svix_webhook_config.py new file mode 100644 index 00000000..cf571641 --- /dev/null +++ b/src/reducto/types/shared_params/svix_webhook_config.py @@ -0,0 +1,19 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import Literal, TypedDict + +from ..._types import SequenceNotStr + +__all__ = ["SvixWebhookConfig"] + + +class SvixWebhookConfig(TypedDict, total=False): + channels: SequenceNotStr[str] + """ + A list of Svix channels the message will be delivered down, omit to send to all + channels. + """ + + mode: Literal["svix"] diff --git a/src/reducto/types/shared_params/table_agentic.py b/src/reducto/types/shared_params/table_agentic.py new file mode 100644 index 00000000..ff586b84 --- /dev/null +++ b/src/reducto/types/shared_params/table_agentic.py @@ -0,0 +1,21 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["TableAgentic"] + + +class TableAgentic(TypedDict, total=False): + scope: Required[Literal["table"]] + + mode: Literal["default", "auto"] + """ + Routing mode for table agentic: 'default' runs enrichment on all tables, 'auto' + uses the router to skip tables where enrichment is unlikely to help. 
+ """ + + prompt: Optional[str] + """Custom prompt for table agentic.""" diff --git a/src/reducto/types/shared_params/text_agentic.py b/src/reducto/types/shared_params/text_agentic.py new file mode 100644 index 00000000..be3ae8b2 --- /dev/null +++ b/src/reducto/types/shared_params/text_agentic.py @@ -0,0 +1,18 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["TextAgentic"] + + +class TextAgentic(TypedDict, total=False): + scope: Required[Literal["text"]] + + prompt: Optional[str] + """Custom instructions for agentic text. + + Note: This only applies to form regions (key-value). + """ diff --git a/src/reducto/types/shared_params/webhook_config_new.py b/src/reducto/types/shared_params/webhook_config_new.py new file mode 100644 index 00000000..0ebd8e3c --- /dev/null +++ b/src/reducto/types/shared_params/webhook_config_new.py @@ -0,0 +1,29 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import Literal, TypedDict + +from ..._types import SequenceNotStr + +__all__ = ["WebhookConfigNew"] + + +class WebhookConfigNew(TypedDict, total=False): + channels: SequenceNotStr[str] + """ + A list of Svix channels the message will be delivered down, omit to send to all + channels. + """ + + metadata: object + """JSON metadata included in webhook request body""" + + mode: Literal["disabled", "svix", "direct"] + """The mode to use for webhook delivery. + + Defaults to 'disabled'. We recommend using 'svix' for production environments. 
+ """ + + url: str + """The URL to send the webhook to (if using direct webhook).""" diff --git a/src/reducto/types/split_table_options_param.py b/src/reducto/types/split_table_options_param.py index 740e863c..a4c29707 100644 --- a/src/reducto/types/split_table_options_param.py +++ b/src/reducto/types/split_table_options_param.py @@ -8,6 +8,12 @@ class SplitTableOptionsParam(TypedDict, total=False): + allow_page_overlap: bool + """If True, a page can belong to multiple categories/partitions. + + If False, each page must belong to exactly one category. Defaults to True. + """ + table_cutoff: Literal["truncate", "preserve"] """ If tables should be truncated to the first few rows or if all content should be diff --git a/src/reducto/types/spreadsheet_param.py b/src/reducto/types/spreadsheet_param.py index 4a3e1a12..43dfcbd4 100644 --- a/src/reducto/types/spreadsheet_param.py +++ b/src/reducto/types/spreadsheet_param.py @@ -2,40 +2,12 @@ from __future__ import annotations -from typing import List, Union, Optional -from typing_extensions import Literal, TypeAlias, TypedDict +from typing import List +from typing_extensions import Literal, TypedDict -__all__ = ["SpreadsheetParam", "SplitLargeTables", "SplitLargeTablesSize", "SplitLargeTablesSizeSplitLargeTableSizes"] +from .shared_params.split_large_tables import SplitLargeTables - -class SplitLargeTablesSizeSplitLargeTableSizes(TypedDict, total=False): - column: Optional[int] - """The number of columns to include in each chunk when splitting large tables. - - Does not chunk columns if set to None. - """ - - row: Optional[int] - """The number of rows to include in each chunk when splitting large tables. - - Does not chunk rows if set to None. - """ - - -SplitLargeTablesSize: TypeAlias = Union[int, SplitLargeTablesSizeSplitLargeTableSizes] - - -class SplitLargeTables(TypedDict, total=False): - enabled: bool - """If True, split large tables into smaller tables. 
Defaults to True.""" - - size: SplitLargeTablesSize - """The size of the tables to split into. - - Defaults to 50. Use 'row' and 'column' to independently specify the number of - rows and columns to include when splitting. If you only want to split by rows or - columns, set the other value to None. - """ +__all__ = ["SpreadsheetParam"] class SpreadsheetParam(TypedDict, total=False): diff --git a/src/reducto/types/v3_extract.py b/src/reducto/types/v3_extract.py index e2b61eb0..2f197e6f 100644 --- a/src/reducto/types/v3_extract.py +++ b/src/reducto/types/v3_extract.py @@ -1,24 +1,8 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Union, Optional - -from .._models import BaseModel -from .extract_usage import ExtractUsage +from typing import Dict +from typing_extensions import TypeAlias __all__ = ["V3Extract"] - -class V3Extract(BaseModel): - result: Union[List[object], object] - """The extracted response in your provided schema. - - This is a list of dictionaries. If disable_chunking is True (default), then it - will be a list of length one. 
- """ - - usage: ExtractUsage - - job_id: Optional[str] = None - - studio_link: Optional[str] = None - """The link to the studio pipeline for the document.""" +V3Extract: TypeAlias = Dict[str, object] diff --git a/tests/api_resources/test_classify.py b/tests/api_resources/test_classify.py index 04bdff54..5a2a554f 100644 --- a/tests/api_resources/test_classify.py +++ b/tests/api_resources/test_classify.py @@ -9,7 +9,7 @@ from reducto import Reducto, AsyncReducto from tests.utils import assert_matches_type -from reducto.types import ClassifyResponse +from reducto.types.shared import ClassifyResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") diff --git a/tests/api_resources/test_client.py b/tests/api_resources/test_client.py index 66d4adb5..4d11b754 100644 --- a/tests/api_resources/test_client.py +++ b/tests/api_resources/test_client.py @@ -56,7 +56,7 @@ def test_method_upload(self, client: Reducto) -> None: def test_method_upload_with_all_params(self, client: Reducto) -> None: client_ = client.upload( extension="extension", - file="file", + file=b"Example data", ) assert_matches_type(Upload, client_, path=["response"]) @@ -127,7 +127,7 @@ async def test_method_upload(self, async_client: AsyncReducto) -> None: async def test_method_upload_with_all_params(self, async_client: AsyncReducto) -> None: client = await async_client.upload( extension="extension", - file="file", + file=b"Example data", ) assert_matches_type(Upload, client, path=["response"]) diff --git a/tests/api_resources/test_edit.py b/tests/api_resources/test_edit.py index 426d8c2c..c47b3356 100644 --- a/tests/api_resources/test_edit.py +++ b/tests/api_resources/test_edit.py @@ -9,7 +9,7 @@ from reducto import Reducto, AsyncReducto from tests.utils import assert_matches_type -from reducto.types import EditResponse, EditRunJobResponse +from reducto.types.shared import EditResponse, AsyncEditResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -95,7 
+95,7 @@ def test_method_run_job(self, client: Reducto) -> None: document_url="string", edit_instructions="edit_instructions", ) - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + assert_matches_type(AsyncEditResponse, edit, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -135,7 +135,7 @@ def test_method_run_job_with_all_params(self, client: Reducto) -> None: "url": "url", }, ) - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + assert_matches_type(AsyncEditResponse, edit, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -148,7 +148,7 @@ def test_raw_response_run_job(self, client: Reducto) -> None: assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" edit = response.parse() - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + assert_matches_type(AsyncEditResponse, edit, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -161,7 +161,7 @@ def test_streaming_response_run_job(self, client: Reducto) -> None: assert response.http_request.headers.get("X-Stainless-Lang") == "python" edit = response.parse() - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + assert_matches_type(AsyncEditResponse, edit, path=["response"]) assert cast(Any, response.is_closed) is True @@ -249,7 +249,7 @@ async def test_method_run_job(self, async_client: AsyncReducto) -> None: document_url="string", edit_instructions="edit_instructions", ) - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + assert_matches_type(AsyncEditResponse, edit, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -289,7 +289,7 @@ async def test_method_run_job_with_all_params(self, async_client: AsyncReducto) "url": "url", }, ) - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + 
assert_matches_type(AsyncEditResponse, edit, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -302,7 +302,7 @@ async def test_raw_response_run_job(self, async_client: AsyncReducto) -> None: assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" edit = await response.parse() - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + assert_matches_type(AsyncEditResponse, edit, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -315,6 +315,6 @@ async def test_streaming_response_run_job(self, async_client: AsyncReducto) -> N assert response.http_request.headers.get("X-Stainless-Lang") == "python" edit = await response.parse() - assert_matches_type(EditRunJobResponse, edit, path=["response"]) + assert_matches_type(AsyncEditResponse, edit, path=["response"]) assert cast(Any, response.is_closed) is True diff --git a/tests/api_resources/test_extract.py b/tests/api_resources/test_extract.py index 8a3a4202..a6f018df 100644 --- a/tests/api_resources/test_extract.py +++ b/tests/api_resources/test_extract.py @@ -11,8 +11,8 @@ from tests.utils import assert_matches_type from reducto.types import ( ExtractRunResponse, - AsyncExtractResponse, ) +from reducto.types.shared import AsyncExtractResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -42,6 +42,7 @@ def test_method_run_with_all_params_overload_1(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -158,6 +159,7 @@ def test_method_run_with_all_params_overload_2(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -274,6 +276,7 @@ def test_method_run_job_with_all_params(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -388,6 +391,7 @@ async def 
test_method_run_with_all_params_overload_1(self, async_client: AsyncRe "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -504,6 +508,7 @@ async def test_method_run_with_all_params_overload_2(self, async_client: AsyncRe "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -620,6 +625,7 @@ async def test_method_run_job_with_all_params(self, async_client: AsyncReducto) "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], diff --git a/tests/api_resources/test_parse.py b/tests/api_resources/test_parse.py index 4590de2b..7159e156 100644 --- a/tests/api_resources/test_parse.py +++ b/tests/api_resources/test_parse.py @@ -11,8 +11,8 @@ from tests.utils import assert_matches_type from reducto.types import ( ParseRunResponse, - AsyncParseResponse, ) +from reducto.types.shared import AsyncParseResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -37,6 +37,7 @@ def test_method_run_with_all_params_overload_1(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -137,6 +138,7 @@ def test_method_run_with_all_params_overload_2(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -238,6 +240,7 @@ def test_method_run_job_with_all_params(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -337,6 +340,7 @@ async def test_method_run_with_all_params_overload_1(self, async_client: AsyncRe "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -437,6 +441,7 @@ async def test_method_run_with_all_params_overload_2(self, async_client: AsyncRe "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -538,6 +543,7 @@ async def test_method_run_job_with_all_params(self, async_client: AsyncReducto) "agentic": [ { "scope": "table", + "mode": "default", 
"prompt": "prompt", } ], diff --git a/tests/api_resources/test_pipeline.py b/tests/api_resources/test_pipeline.py index 9bbcdeb7..db9aa933 100644 --- a/tests/api_resources/test_pipeline.py +++ b/tests/api_resources/test_pipeline.py @@ -9,10 +9,7 @@ from reducto import Reducto, AsyncReducto from tests.utils import assert_matches_type -from reducto.types import ( - PipelineResponse, - PipelineRunJobResponse, -) +from reducto.types.shared import PipelineResponse, AsyncPipelineResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -74,7 +71,7 @@ def test_method_run_job(self, client: Reducto) -> None: input="string", pipeline_id="pipeline_id", ) - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -92,7 +89,7 @@ def test_method_run_job_with_all_params(self, client: Reducto) -> None: }, settings={"document_password": "document_password"}, ) - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -105,7 +102,7 @@ def test_raw_response_run_job(self, client: Reducto) -> None: assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" pipeline = response.parse() - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -118,7 +115,7 @@ def test_streaming_response_run_job(self, client: Reducto) -> None: assert response.http_request.headers.get("X-Stainless-Lang") == "python" pipeline = response.parse() - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + 
assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) assert cast(Any, response.is_closed) is True @@ -182,7 +179,7 @@ async def test_method_run_job(self, async_client: AsyncReducto) -> None: input="string", pipeline_id="pipeline_id", ) - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -200,7 +197,7 @@ async def test_method_run_job_with_all_params(self, async_client: AsyncReducto) }, settings={"document_password": "document_password"}, ) - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -213,7 +210,7 @@ async def test_raw_response_run_job(self, async_client: AsyncReducto) -> None: assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" pipeline = await response.parse() - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -226,6 +223,6 @@ async def test_streaming_response_run_job(self, async_client: AsyncReducto) -> N assert response.http_request.headers.get("X-Stainless-Lang") == "python" pipeline = await response.parse() - assert_matches_type(PipelineRunJobResponse, pipeline, path=["response"]) + assert_matches_type(AsyncPipelineResponse, pipeline, path=["response"]) assert cast(Any, response.is_closed) is True diff --git a/tests/api_resources/test_split.py b/tests/api_resources/test_split.py index 408699d5..ba85395a 100644 --- a/tests/api_resources/test_split.py +++ b/tests/api_resources/test_split.py @@ -9,10 +9,7 @@ from reducto import Reducto, AsyncReducto from tests.utils import 
assert_matches_type -from reducto.types import ( - SplitResponse, - SplitRunJobResponse, -) +from reducto.types.shared import SplitResponse, AsyncSplitResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -51,6 +48,7 @@ def test_method_run_with_all_params(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -98,7 +96,10 @@ def test_method_run_with_all_params(self, client: Reducto) -> None: }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) assert_matches_type(SplitResponse, split, path=["response"]) @@ -153,7 +154,7 @@ def test_method_run_job(self, client: Reducto) -> None: } ], ) - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -180,6 +181,7 @@ def test_method_run_job_with_all_params(self, client: Reducto) -> None: "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -227,10 +229,13 @@ def test_method_run_job_with_all_params(self, client: Reducto) -> None: }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -248,7 +253,7 @@ def test_raw_response_run_job(self, client: Reducto) -> None: assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" split = response.parse() - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) 
@pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -266,7 +271,7 @@ def test_streaming_response_run_job(self, client: Reducto) -> None: assert response.http_request.headers.get("X-Stainless-Lang") == "python" split = response.parse() - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) assert cast(Any, response.is_closed) is True @@ -307,6 +312,7 @@ async def test_method_run_with_all_params(self, async_client: AsyncReducto) -> N "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -354,7 +360,10 @@ async def test_method_run_with_all_params(self, async_client: AsyncReducto) -> N }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) assert_matches_type(SplitResponse, split, path=["response"]) @@ -409,7 +418,7 @@ async def test_method_run_job(self, async_client: AsyncReducto) -> None: } ], ) - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -436,6 +445,7 @@ async def test_method_run_job_with_all_params(self, async_client: AsyncReducto) "agentic": [ { "scope": "table", + "mode": "default", "prompt": "prompt", } ], @@ -483,10 +493,13 @@ async def test_method_run_job_with_all_params(self, async_client: AsyncReducto) }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -504,7 +517,7 @@ async def test_raw_response_run_job(self, async_client: AsyncReducto) -> None: 
assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" split = await response.parse() - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) @pytest.mark.skip(reason="Mock server tests are disabled") @parametrize @@ -522,6 +535,6 @@ async def test_streaming_response_run_job(self, async_client: AsyncReducto) -> N assert response.http_request.headers.get("X-Stainless-Lang") == "python" split = await response.parse() - assert_matches_type(SplitRunJobResponse, split, path=["response"]) + assert_matches_type(AsyncSplitResponse, split, path=["response"]) assert cast(Any, response.is_closed) is True diff --git a/tests/test_client.py b/tests/test_client.py index 157a4300..95bc33e2 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -427,6 +427,30 @@ def test_default_query_option(self) -> None: client.close() + def test_hardcoded_query_params_in_url(self, client: Reducto) -> None: + request = client._build_request(FinalRequestOptions(method="get", url="/foo?beta=true")) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true"} + + request = client._build_request( + FinalRequestOptions( + method="get", + url="/foo?beta=true", + params={"limit": "10", "page": "abc"}, + ) + ) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true", "limit": "10", "page": "abc"} + + request = client._build_request( + FinalRequestOptions( + method="get", + url="/files/a%2Fb?beta=true", + params={"limit": "10"}, + ) + ) + assert request.url.raw_path == b"/files/a%2Fb?beta=true&limit=10" + def test_request_extra_json(self, client: Reducto) -> None: request = client._build_request( FinalRequestOptions( @@ -1328,6 +1352,30 @@ async def test_default_query_option(self) -> None: await client.close() + async def test_hardcoded_query_params_in_url(self, async_client: AsyncReducto) -> None: + request = 
async_client._build_request(FinalRequestOptions(method="get", url="/foo?beta=true")) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true"} + + request = async_client._build_request( + FinalRequestOptions( + method="get", + url="/foo?beta=true", + params={"limit": "10", "page": "abc"}, + ) + ) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true", "limit": "10", "page": "abc"} + + request = async_client._build_request( + FinalRequestOptions( + method="get", + url="/files/a%2Fb?beta=true", + params={"limit": "10"}, + ) + ) + assert request.url.raw_path == b"/files/a%2Fb?beta=true&limit=10" + def test_request_extra_json(self, client: Reducto) -> None: request = client._build_request( FinalRequestOptions( diff --git a/tests/test_deepcopy.py b/tests/test_deepcopy.py deleted file mode 100644 index f2e61051..00000000 --- a/tests/test_deepcopy.py +++ /dev/null @@ -1,58 +0,0 @@ -from reducto._utils import deepcopy_minimal - - -def assert_different_identities(obj1: object, obj2: object) -> None: - assert obj1 == obj2 - assert id(obj1) != id(obj2) - - -def test_simple_dict() -> None: - obj1 = {"foo": "bar"} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - - -def test_nested_dict() -> None: - obj1 = {"foo": {"bar": True}} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert_different_identities(obj1["foo"], obj2["foo"]) - - -def test_complex_nested_dict() -> None: - obj1 = {"foo": {"bar": [{"hello": "world"}]}} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert_different_identities(obj1["foo"], obj2["foo"]) - assert_different_identities(obj1["foo"]["bar"], obj2["foo"]["bar"]) - assert_different_identities(obj1["foo"]["bar"][0], obj2["foo"]["bar"][0]) - - -def test_simple_list() -> None: - obj1 = ["a", "b", "c"] - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - - -def test_nested_list() -> None: - obj1 = ["a", [1, 2, 3]] 
- obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert_different_identities(obj1[1], obj2[1]) - - -class MyObject: ... - - -def test_ignores_other_types() -> None: - # custom classes - my_obj = MyObject() - obj1 = {"foo": my_obj} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert obj1["foo"] is my_obj - - # tuples - obj3 = ("a", "b") - obj4 = deepcopy_minimal(obj3) - assert obj3 is obj4 diff --git a/tests/test_extract_files.py b/tests/test_extract_files.py index a4c8a66f..f3cccbea 100644 --- a/tests/test_extract_files.py +++ b/tests/test_extract_files.py @@ -4,7 +4,7 @@ import pytest -from reducto._types import FileTypes +from reducto._types import FileTypes, ArrayFormat from reducto._utils import extract_files @@ -35,6 +35,12 @@ def test_multiple_files() -> None: assert query == {"documents": [{}, {}]} +def test_top_level_file_array() -> None: + query = {"files": [b"file one", b"file two"], "title": "hello"} + assert extract_files(query, paths=[["files", ""]]) == [("files[]", b"file one"), ("files[]", b"file two")] + assert query == {"title": "hello"} + + @pytest.mark.parametrize( "query,paths,expected", [ @@ -62,3 +68,24 @@ def test_ignores_incorrect_paths( expected: list[tuple[str, FileTypes]], ) -> None: assert extract_files(query, paths=paths) == expected + + +@pytest.mark.parametrize( + "array_format,expected_top_level,expected_nested", + [ + ("brackets", [("files[]", b"a"), ("files[]", b"b")], [("items[][file]", b"a"), ("items[][file]", b"b")]), + ("repeat", [("files", b"a"), ("files", b"b")], [("items[file]", b"a"), ("items[file]", b"b")]), + ("comma", [("files", b"a"), ("files", b"b")], [("items[file]", b"a"), ("items[file]", b"b")]), + ("indices", [("files[0]", b"a"), ("files[1]", b"b")], [("items[0][file]", b"a"), ("items[1][file]", b"b")]), + ], +) +def test_array_format_controls_file_field_names( + array_format: ArrayFormat, + expected_top_level: list[tuple[str, FileTypes]], + expected_nested: 
list[tuple[str, FileTypes]], +) -> None: + top_level = {"files": [b"a", b"b"]} + assert extract_files(top_level, paths=[["files", ""]], array_format=array_format) == expected_top_level + + nested = {"items": [{"file": b"a"}, {"file": b"b"}]} + assert extract_files(nested, paths=[["items", "", "file"]], array_format=array_format) == expected_nested diff --git a/tests/test_files.py b/tests/test_files.py index b5e73903..23535a30 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -4,7 +4,8 @@ import pytest from dirty_equals import IsDict, IsList, IsBytes, IsTuple -from reducto._files import to_httpx_files, async_to_httpx_files +from reducto._files import to_httpx_files, deepcopy_with_paths, async_to_httpx_files +from reducto._utils import extract_files readme_path = Path(__file__).parent.parent.joinpath("README.md") @@ -49,3 +50,99 @@ def test_string_not_allowed() -> None: "file": "foo", # type: ignore } ) + + +def assert_different_identities(obj1: object, obj2: object) -> None: + assert obj1 == obj2 + assert obj1 is not obj2 + + +class TestDeepcopyWithPaths: + def test_copies_top_level_dict(self) -> None: + original = {"file": b"data", "other": "value"} + result = deepcopy_with_paths(original, [["file"]]) + assert_different_identities(result, original) + + def test_file_value_is_same_reference(self) -> None: + file_bytes = b"contents" + original = {"file": file_bytes} + result = deepcopy_with_paths(original, [["file"]]) + assert_different_identities(result, original) + assert result["file"] is file_bytes + + def test_list_popped_wholesale(self) -> None: + files = [b"f1", b"f2"] + original = {"files": files, "title": "t"} + result = deepcopy_with_paths(original, [["files", ""]]) + assert_different_identities(result, original) + result_files = result["files"] + assert isinstance(result_files, list) + assert_different_identities(result_files, files) + + def test_nested_array_path_copies_list_and_elements(self) -> None: + elem1 = {"file": b"f1", "extra": 1} + 
elem2 = {"file": b"f2", "extra": 2} + original = {"items": [elem1, elem2]} + result = deepcopy_with_paths(original, [["items", "", "file"]]) + assert_different_identities(result, original) + result_items = result["items"] + assert isinstance(result_items, list) + assert_different_identities(result_items, original["items"]) + assert_different_identities(result_items[0], elem1) + assert_different_identities(result_items[1], elem2) + + def test_empty_paths_returns_same_object(self) -> None: + original = {"foo": "bar"} + result = deepcopy_with_paths(original, []) + assert result is original + + def test_multiple_paths(self) -> None: + f1 = b"file1" + f2 = b"file2" + original = {"a": f1, "b": f2, "c": "unchanged"} + result = deepcopy_with_paths(original, [["a"], ["b"]]) + assert_different_identities(result, original) + assert result["a"] is f1 + assert result["b"] is f2 + assert result["c"] is original["c"] + + def test_extract_files_does_not_mutate_original_top_level(self) -> None: + file_bytes = b"contents" + original = {"file": file_bytes, "other": "value"} + + copied = deepcopy_with_paths(original, [["file"]]) + extracted = extract_files(copied, paths=[["file"]]) + + assert extracted == [("file", file_bytes)] + assert original == {"file": file_bytes, "other": "value"} + assert copied == {"other": "value"} + + def test_extract_files_does_not_mutate_original_nested_array_path(self) -> None: + file1 = b"f1" + file2 = b"f2" + original = { + "items": [ + {"file": file1, "extra": 1}, + {"file": file2, "extra": 2}, + ], + "title": "example", + } + + copied = deepcopy_with_paths(original, [["items", "", "file"]]) + extracted = extract_files(copied, paths=[["items", "", "file"]]) + + assert [entry for _, entry in extracted] == [file1, file2] + assert original == { + "items": [ + {"file": file1, "extra": 1}, + {"file": file2, "extra": 2}, + ], + "title": "example", + } + assert copied == { + "items": [ + {"extra": 1}, + {"extra": 2}, + ], + "title": "example", + } diff --git 
a/tests/test_models.py b/tests/test_models.py index d14b760a..3e0e297f 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,7 +1,8 @@ import json -from typing import TYPE_CHECKING, Any, Dict, List, Union, Optional, cast +from typing import TYPE_CHECKING, Any, Dict, List, Union, Iterable, Optional, cast from datetime import datetime, timezone -from typing_extensions import Literal, Annotated, TypeAliasType +from collections import deque +from typing_extensions import Literal, Annotated, TypedDict, TypeAliasType import pytest import pydantic @@ -9,7 +10,7 @@ from reducto._utils import PropertyInfo from reducto._compat import PYDANTIC_V1, parse_obj, model_dump, model_json -from reducto._models import DISCRIMINATOR_CACHE, BaseModel, construct_type +from reducto._models import DISCRIMINATOR_CACHE, BaseModel, EagerIterable, construct_type class BasicModel(BaseModel): @@ -961,3 +962,56 @@ def __getattr__(self, attr: str) -> Item: ... assert model.a.prop == 1 assert isinstance(model.a, Item) assert model.other == "foo" + + +# NOTE: Workaround for Pydantic Iterable behavior. +# Iterable fields are replaced with a ValidatorIterator and may be consumed +# during serialization, which can cause subsequent dumps to return empty data. 
+# See: https://github.com/pydantic/pydantic/issues/9541 +@pytest.mark.parametrize( + "data, expected_validated", + [ + ([1, 2, 3], [1, 2, 3]), + ((1, 2, 3), (1, 2, 3)), + (set([1, 2, 3]), set([1, 2, 3])), + (iter([1, 2, 3]), [1, 2, 3]), + ([], []), + ((x for x in [1, 2, 3]), [1, 2, 3]), + (map(lambda x: x, [1, 2, 3]), [1, 2, 3]), + (frozenset([1, 2, 3]), frozenset([1, 2, 3])), + (deque([1, 2, 3]), deque([1, 2, 3])), + ], + ids=["list", "tuple", "set", "iterator", "empty", "generator", "map", "frozenset", "deque"], +) +@pytest.mark.skipif(PYDANTIC_V1, reason="this is only supported in pydantic v2") +def test_iterable_construction(data: Iterable[int], expected_validated: Iterable[int]) -> None: + class TypeWithIterable(TypedDict): + items: EagerIterable[int] + + class Model(BaseModel): + data: TypeWithIterable + + m = Model.model_validate({"data": {"items": data}}) + assert m.data["items"] == expected_validated + + # Verify repeated dumps don't lose data (the original bug) + assert m.model_dump()["data"]["items"] == list(expected_validated) + assert m.model_dump()["data"]["items"] == list(expected_validated) + + +@pytest.mark.skipif(PYDANTIC_V1, reason="this is only supported in pydantic v2") +def test_iterable_construction_str_falls_back_to_list() -> None: + # str is iterable (over chars), but str(list_of_chars) produces the list's repr + # rather than reconstructing a string from items. We special-case str to fall + # back to list instead of attempting reconstruction. + class TypeWithIterable(TypedDict): + items: EagerIterable[str] + + class Model(BaseModel): + data: TypeWithIterable + + m = Model.model_validate({"data": {"items": "hello"}}) + + # falls back to list of chars rather than calling str(["h", "e", "l", "l", "o"]) + assert m.data["items"] == ["h", "e", "l", "l", "o"] + assert m.model_dump()["data"]["items"] == ["h", "e", "l", "l", "o"]