diff --git a/.release-please-manifest.json b/.release-please-manifest.json index cb9d2541..7f3f5c84 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.22.0" + ".": "0.23.0" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index d5bade16..a6410018 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 17 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/reducto%2Freductoai-9dcbb133ea8d4e419314a5fddc878258610a80acfe82604b328a7eb3cb4f8f5f.yml -openapi_spec_hash: 10479661e623fc6063eafbba3f2b5dff -config_hash: 14efe98ee80f1e66ef35443e52f02953 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/reducto/reductoai-d32017996d4322082da97c33ea8a288f509d91a032960f3e00026c2c3688c188.yml +openapi_spec_hash: 06f95c19ecff0d30451aa1dbe6fb7c6e +config_hash: 9fa10baf03f994be027bf73b29ac8572 diff --git a/CHANGELOG.md b/CHANGELOG.md index 14e73b73..038b4a09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,47 @@ # Changelog +## 0.23.0 (2026-05-01) + +Full Changelog: [v0.22.0...v0.23.0](https://github.com/reductoai/reducto-python-sdk/compare/v0.22.0...v0.23.0) + +### Features + +* **api:** api update ([f6a1682](https://github.com/reductoai/reducto-python-sdk/commit/f6a16828595a63b3f026a029b89f2eba36b8203c)) +* **api:** api update ([d413d50](https://github.com/reductoai/reducto-python-sdk/commit/d413d50ae1a168ac2fa06c89fc73293eb5cd60f3)) +* **api:** api update ([808ab77](https://github.com/reductoai/reducto-python-sdk/commit/808ab7734b1690182be6e6fa636689e7d34b6b56)) +* **api:** api update ([c47f11a](https://github.com/reductoai/reducto-python-sdk/commit/c47f11afcacac318209e04b49dc3ba3a4fa549b6)) +* **api:** api update ([f7b90fa](https://github.com/reductoai/reducto-python-sdk/commit/f7b90fa8da10e32ec26f71313bd8eef7aac4bf83)) +* **api:** api update ([604578f](https://github.com/reductoai/reducto-python-sdk/commit/604578fc82a9e0e9cd1c96140b288d5f7e9cfc96)) +* **api:** api update ([117ec65](https://github.com/reductoai/reducto-python-sdk/commit/117ec654884ac13bdbd9e86e40441d7a5169aeda)) +* **api:** api update ([68580e6](https://github.com/reductoai/reducto-python-sdk/commit/68580e616d0743c04631a6a03f8f0c230dae9393)) +* **pypi:** reducto package aliasing ([c19a947](https://github.com/reductoai/reducto-python-sdk/commit/c19a94757e2d42ac015e8aaae4647f1bba2c5ade)) +* support setting headers via env ([6b6b3ad](https://github.com/reductoai/reducto-python-sdk/commit/6b6b3adb6e74e10716fdd5640b310b3d20734c56)) + + +### Bug Fixes + +* **ci:** `reducto` shim publishing ([48a5558](https://github.com/reductoai/reducto-python-sdk/commit/48a5558ed79ef8c8796890b486eb5bae92cbc21f)) +* **client:** preserve hardcoded query params when merging with user params ([72d4eac](https://github.com/reductoai/reducto-python-sdk/commit/72d4eac3b2514c35b26fdafeecb1e64cc067ea80)) +* ensure file data are only sent as 1 parameter ([5a5d5ee](https://github.com/reductoai/reducto-python-sdk/commit/5a5d5eed5f5bf34df796e73355824dafc30c492b)) +* use correct field name format for multipart file arrays ([00f933d](https://github.com/reductoai/reducto-python-sdk/commit/00f933debb5a49d3ee915f70283a1e85f61fa611)) + + +### Performance Improvements + +* **client:** optimize file structure copying in multipart requests ([fae7194](https://github.com/reductoai/reducto-python-sdk/commit/fae719428bbc73108ed0e3e569c5c65af3cea90a)) + + +### Chores + +* add Renovate config and SHA-pin GitHub Actions ([5bbef22](https://github.com/reductoai/reducto-python-sdk/commit/5bbef22b185f99a82a48d4dfe578b86b7931bef6)) +* configure new SDK language ([a54762b](https://github.com/reductoai/reducto-python-sdk/commit/a54762b2a40c2da5e5b7e8dee56717bde8c94e61)) +* **internal:** more robust bootstrap script ([197c371](https://github.com/reductoai/reducto-python-sdk/commit/197c37118331c2b09e73bb7e600e283bd4713cd4)) + + +### Styles + +* format renovate.json with Prettier ([668ab0e](https://github.com/reductoai/reducto-python-sdk/commit/668ab0ecd82574651f35f1c0b3910e1e845cedd4)) + ## 0.21.0 (2026-03-29) Full Changelog: [v0.20.0...v0.21.0](https://github.com/reductoai/reducto-python-sdk/compare/v0.20.0...v0.21.0) diff --git a/README.md b/README.md index f0fc57fd..055265b1 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,15 @@ and offers both synchronous and asynchronous clients powered by [httpx](https:// It is generated with [Stainless](https://www.stainless.com/). +## MCP Server + +Use the Reducto MCP Server to enable AI assistants to interact with this API, allowing them to explore endpoints, make test requests, and use documentation to help integrate this SDK into your application. + +[![Add to Cursor](https://cursor.com/deeplink/mcp-install-dark.svg)](https://cursor.com/en-US/install-mcp?name=reductoai-mcp&config=eyJjb21tYW5kIjoibnB4IiwiYXJncyI6WyIteSIsInJlZHVjdG9haS1tY3AiXSwiZW52Ijp7IlJFRFVDVE9fQVBJX0tFWSI6Ik15IEFQSSBLZXkifX0) +[![Install in VS Code](https://img.shields.io/badge/_-Add_to_VS_Code-blue?style=for-the-badge&logo=data:image/svg%2bxml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIGZpbGw9Im5vbmUiIHZpZXdCb3g9IjAgMCA0MCA0MCI+PHBhdGggZmlsbD0iI0VFRSIgZmlsbC1ydWxlPSJldmVub2RkIiBkPSJNMzAuMjM1IDM5Ljg4NGEyLjQ5MSAyLjQ5MSAwIDAgMS0xLjc4MS0uNzNMMTIuNyAyNC43OGwtMy40NiAyLjYyNC0zLjQwNiAyLjU4MmExLjY2NSAxLjY2NSAwIDAgMS0xLjA4Mi4zMzggMS42NjQgMS42NjQgMCAwIDEtMS4wNDYtLjQzMWwtMi4yLTJhMS42NjYgMS42NjYgMCAwIDEgMC0yLjQ2M0w3LjQ1OCAyMCA0LjY3IDE3LjQ1MyAxLjUwNyAxNC41N2ExLjY2NSAxLjY2NSAwIDAgMSAwLTIuNDYzbDIuMi0yYTEuNjY1IDEuNjY1IDAgMCAxIDIuMTMtLjA5N2w2Ljg2MyA1LjIwOUwyOC40NTIuODQ0YTIuNDg4IDIuNDg4IDAgMCAxIDEuODQxLS43MjljLjM1MS4wMDkuNjk5LjA5MSAxLjAxOS4yNDVsOC4yMzYgMy45NjFhMi41IDIuNSAwIDAgMSAxLjQxNSAyLjI1M3YuMDk5LS4wNDVWMzMuMzd2LS4wNDUuMDk1YTIuNTAxIDIuNTAxIDAgMCAxLTEuNDE2IDIuMjU3bC04LjIzNSAzLjk2MWEyLjQ5MiAyLjQ5MiAwIDAgMS0xLjA3Ny4yNDZabS43MTYtMjguOTQ3LTExLjk0OCA5LjA2MiAxMS45NTIgOS4wNjUtLjAwNC0xOC4xMjdaIi8+PC9zdmc+)](https://vscode.stainless.com/mcp/%7B%22name%22%3A%22reductoai-mcp%22%2C%22command%22%3A%22npx%22%2C%22args%22%3A%5B%22-y%22%2C%22reductoai-mcp%22%5D%2C%22env%22%3A%7B%22REDUCTO_API_KEY%22%3A%22My%20API%20Key%22%7D%7D) + +> Note: You may need to set environment variables in your MCP client. + ## Documentation The REST API documentation can be found on [docs.reductoai.com](https://docs.reductoai.com). The full API of this library can be found in [api.md](api.md). diff --git a/pyproject.toml b/pyproject.toml index e50cc10c..2245acf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "reductoai" -version = "0.22.0" +version = "0.23.0" description = "The official Python library for the reducto API" dynamic = ["readme"] license = "Apache-2.0" diff --git a/scripts/bootstrap b/scripts/bootstrap index b430fee3..fe8451e4 100755 --- a/scripts/bootstrap +++ b/scripts/bootstrap @@ -4,7 +4,7 @@ set -e cd "$(dirname "$0")/.." -if [ -f "Brewfile" ] && [ "$(uname -s)" = "Darwin" ] && [ "$SKIP_BREW" != "1" ] && [ -t 0 ]; then +if [ -f "Brewfile" ] && [ "$(uname -s)" = "Darwin" ] && [ "${SKIP_BREW:-}" != "1" ] && [ -t 0 ]; then brew bundle check >/dev/null 2>&1 || { echo -n "==> Install Homebrew dependencies? (y/N): " read -r response diff --git a/src/reducto/_base_client.py b/src/reducto/_base_client.py index 1e31cb4d..ee1f4314 100644 --- a/src/reducto/_base_client.py +++ b/src/reducto/_base_client.py @@ -558,6 +558,10 @@ def _build_request( files = cast(HttpxRequestFiles, ForceMultipartDict()) prepared_url = self._prepare_url(options.url) + # preserve hard-coded query params from the url + if params and prepared_url.query: + params = {**dict(prepared_url.params.items()), **params} + prepared_url = prepared_url.copy_with(raw_path=prepared_url.raw_path.split(b"?", 1)[0]) if "_" in prepared_url.host: # work around https://github.com/encode/httpx/discussions/2880 kwargs["extensions"] = {"sni_hostname": prepared_url.host.replace("_", "-")} diff --git a/src/reducto/_client.py b/src/reducto/_client.py index df400032..81b298b8 100644 --- a/src/reducto/_client.py +++ b/src/reducto/_client.py @@ -11,6 +11,7 @@ from . import _exceptions from ._qs import Querystring from .types import client_upload_params +from ._files import deepcopy_with_paths from ._types import ( Body, Omit, @@ -27,9 +28,9 @@ ) from ._utils import ( is_given, + is_mapping_t, extract_files, maybe_transform, - deepcopy_minimal, get_async_library, async_maybe_transform, ) @@ -150,6 +151,15 @@ def __init__( except KeyError as exc: raise ValueError(f"Unknown environment: {environment}") from exc + custom_headers_env = os.environ.get("REDUCTO_CUSTOM_HEADERS") + if custom_headers_env is not None: + parsed: dict[str, str] = {} + for line in custom_headers_env.split("\n"): + colon = line.find(":") + if colon >= 0: + parsed[line[:colon].strip()] = line[colon + 1 :].strip() + default_headers = {**parsed, **(default_headers if is_mapping_t(default_headers) else {})} + super().__init__( version=__version__, base_url=base_url, @@ -338,7 +348,7 @@ def upload( timeout: Override the client-level default timeout for this request, in seconds """ - body = deepcopy_minimal({"file": file}) + body = deepcopy_with_paths({"file": file}, [["file"]]) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) if files: # It should be noted that the actual Content-Type header that will be @@ -461,6 +471,15 @@ def __init__( except KeyError as exc: raise ValueError(f"Unknown environment: {environment}") from exc + custom_headers_env = os.environ.get("REDUCTO_CUSTOM_HEADERS") + if custom_headers_env is not None: + parsed: dict[str, str] = {} + for line in custom_headers_env.split("\n"): + colon = line.find(":") + if colon >= 0: + parsed[line[:colon].strip()] = line[colon + 1 :].strip() + default_headers = {**parsed, **(default_headers if is_mapping_t(default_headers) else {})} + super().__init__( version=__version__, base_url=base_url, @@ -649,7 +668,7 @@ async def upload( timeout: Override the client-level default timeout for this request, in seconds """ - body = deepcopy_minimal({"file": file}) + body = deepcopy_with_paths({"file": file}, [["file"]]) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) if files: # It should be noted that the actual Content-Type header that will be diff --git a/src/reducto/_files.py b/src/reducto/_files.py index 7f056768..a4b9a7b1 100644 --- a/src/reducto/_files.py +++ b/src/reducto/_files.py @@ -3,8 +3,8 @@ import io import os import pathlib -from typing import overload -from typing_extensions import TypeGuard +from typing import Sequence, cast, overload +from typing_extensions import TypeVar, TypeGuard import anyio @@ -17,7 +17,9 @@ HttpxFileContent, HttpxRequestFiles, ) -from ._utils import is_tuple_t, is_mapping_t, is_sequence_t +from ._utils import is_list, is_mapping, is_tuple_t, is_mapping_t, is_sequence_t + +_T = TypeVar("_T") def is_base64_file_input(obj: object) -> TypeGuard[Base64FileInput]: @@ -121,3 +123,51 @@ async def async_read_file_content(file: FileContent) -> HttpxFileContent: return await anyio.Path(file).read_bytes() return file + + +def deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]]) -> _T: + """Copy only the containers along the given paths. + + Used to guard against mutation by extract_files without copying the entire structure. + Only dicts and lists that lie on a path are copied; everything else + is returned by reference. + + For example, given paths=[["foo", "files", "file"]] and the structure: + { + "foo": { + "bar": {"baz": {}}, + "files": {"file": } + } + } + The root dict, "foo", and "files" are copied (they lie on the path). + "bar" and "baz" are returned by reference (off the path). + """ + return _deepcopy_with_paths(item, paths, 0) + + +def _deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]], index: int) -> _T: + if not paths: + return item + if is_mapping(item): + key_to_paths: dict[str, list[Sequence[str]]] = {} + for path in paths: + if index < len(path): + key_to_paths.setdefault(path[index], []).append(path) + + # if no path continues through this mapping, it won't be mutated and copying it is redundant + if not key_to_paths: + return item + + result = dict(item) + for key, subpaths in key_to_paths.items(): + if key in result: + result[key] = _deepcopy_with_paths(result[key], subpaths, index + 1) + return cast(_T, result) + if is_list(item): + array_paths = [path for path in paths if index < len(path) and path[index] == ""] + + # if no path expects a list here, nothing will be mutated inside it - return by reference + if not array_paths: + return cast(_T, item) + return cast(_T, [_deepcopy_with_paths(entry, array_paths, index + 1) for entry in item]) + return item diff --git a/src/reducto/_qs.py b/src/reducto/_qs.py index de8c99bc..4127c19c 100644 --- a/src/reducto/_qs.py +++ b/src/reducto/_qs.py @@ -2,17 +2,13 @@ from typing import Any, List, Tuple, Union, Mapping, TypeVar from urllib.parse import parse_qs, urlencode -from typing_extensions import Literal, get_args +from typing_extensions import get_args -from ._types import NotGiven, not_given +from ._types import NotGiven, ArrayFormat, NestedFormat, not_given from ._utils import flatten _T = TypeVar("_T") - -ArrayFormat = Literal["comma", "repeat", "indices", "brackets"] -NestedFormat = Literal["dots", "brackets"] - PrimitiveData = Union[str, int, float, bool, None] # this should be Data = Union[PrimitiveData, "List[Data]", "Tuple[Data]", "Mapping[str, Data]"] # https://github.com/microsoft/pyright/issues/3555 diff --git a/src/reducto/_types.py b/src/reducto/_types.py index 6c70e52c..1cd25278 100644 --- a/src/reducto/_types.py +++ b/src/reducto/_types.py @@ -47,6 +47,9 @@ ModelT = TypeVar("ModelT", bound=pydantic.BaseModel) _T = TypeVar("_T") +ArrayFormat = Literal["comma", "repeat", "indices", "brackets"] +NestedFormat = Literal["dots", "brackets"] + # Approximates httpx internal ProxiesTypes and RequestFiles types # while adding support for `PathLike` instances diff --git a/src/reducto/_utils/__init__.py b/src/reducto/_utils/__init__.py index 10cb66d2..1c090e51 100644 --- a/src/reducto/_utils/__init__.py +++ b/src/reducto/_utils/__init__.py @@ -24,7 +24,6 @@ coerce_integer as coerce_integer, file_from_path as file_from_path, strip_not_given as strip_not_given, - deepcopy_minimal as deepcopy_minimal, get_async_library as get_async_library, maybe_coerce_float as maybe_coerce_float, get_required_header as get_required_header, diff --git a/src/reducto/_utils/_utils.py b/src/reducto/_utils/_utils.py index eec7f4a1..199cd231 100644 --- a/src/reducto/_utils/_utils.py +++ b/src/reducto/_utils/_utils.py @@ -17,11 +17,11 @@ ) from pathlib import Path from datetime import date, datetime -from typing_extensions import TypeGuard +from typing_extensions import TypeGuard, get_args import sniffio -from .._types import Omit, NotGiven, FileTypes, HeadersLike +from .._types import Omit, NotGiven, FileTypes, ArrayFormat, HeadersLike _T = TypeVar("_T") _TupleT = TypeVar("_TupleT", bound=Tuple[object, ...]) @@ -40,25 +40,45 @@ def extract_files( query: Mapping[str, object], *, paths: Sequence[Sequence[str]], + array_format: ArrayFormat = "brackets", ) -> list[tuple[str, FileTypes]]: """Recursively extract files from the given dictionary based on specified paths. A path may look like this ['foo', 'files', '', 'data']. + ``array_format`` controls how ```` segments contribute to the emitted + field name. Supported values: ``"brackets"`` (``foo[]``), ``"repeat"`` and + ``"comma"`` (``foo``), ``"indices"`` (``foo[0]``, ``foo[1]``). + Note: this mutates the given dictionary. """ files: list[tuple[str, FileTypes]] = [] for path in paths: - files.extend(_extract_items(query, path, index=0, flattened_key=None)) + files.extend(_extract_items(query, path, index=0, flattened_key=None, array_format=array_format)) return files +def _array_suffix(array_format: ArrayFormat, array_index: int) -> str: + if array_format == "brackets": + return "[]" + if array_format == "indices": + return f"[{array_index}]" + if array_format == "repeat" or array_format == "comma": + # Both repeat the bare field name for each file part; there is no + # meaningful way to comma-join binary parts. + return "" + raise NotImplementedError( + f"Unknown array_format value: {array_format}, choose from {', '.join(get_args(ArrayFormat))}" + ) + + def _extract_items( obj: object, path: Sequence[str], *, index: int, flattened_key: str | None, + array_format: ArrayFormat, ) -> list[tuple[str, FileTypes]]: try: key = path[index] @@ -75,9 +95,11 @@ def _extract_items( if is_list(obj): files: list[tuple[str, FileTypes]] = [] - for entry in obj: - assert_is_file_content(entry, key=flattened_key + "[]" if flattened_key else "") - files.append((flattened_key + "[]", cast(FileTypes, entry))) + for array_index, entry in enumerate(obj): + suffix = _array_suffix(array_format, array_index) + emitted_key = (flattened_key + suffix) if flattened_key else suffix + assert_is_file_content(entry, key=emitted_key) + files.append((emitted_key, cast(FileTypes, entry))) return files assert_is_file_content(obj, key=flattened_key) @@ -86,8 +108,9 @@ def _extract_items( index += 1 if is_dict(obj): try: - # We are at the last entry in the path so we must remove the field - if (len(path)) == index: + # Remove the field if there are no more dict keys in the path, + # only "" traversal markers or end. + if all(p == "" for p in path[index:]): item = obj.pop(key) else: item = obj[key] @@ -105,6 +128,7 @@ def _extract_items( path, index=index, flattened_key=flattened_key, + array_format=array_format, ) elif is_list(obj): if key != "": @@ -116,9 +140,12 @@ def _extract_items( item, path, index=index, - flattened_key=flattened_key + "[]" if flattened_key is not None else "[]", + flattened_key=( + (flattened_key if flattened_key is not None else "") + _array_suffix(array_format, array_index) + ), + array_format=array_format, ) - for item in obj + for array_index, item in enumerate(obj) ] ) @@ -176,21 +203,6 @@ def is_iterable(obj: object) -> TypeGuard[Iterable[object]]: return isinstance(obj, Iterable) -def deepcopy_minimal(item: _T) -> _T: - """Minimal reimplementation of copy.deepcopy() that will only copy certain object types: - - - mappings, e.g. `dict` - - list - - This is done for performance reasons. - """ - if is_mapping(item): - return cast(_T, {k: deepcopy_minimal(v) for k, v in item.items()}) - if is_list(item): - return cast(_T, [deepcopy_minimal(entry) for entry in item]) - return item - - # copied from https://github.com/Rapptz/RoboDanny def human_join(seq: Sequence[str], *, delim: str = ", ", final: str = "or") -> str: size = len(seq) diff --git a/src/reducto/_version.py b/src/reducto/_version.py index 956dea69..7f4900a2 100644 --- a/src/reducto/_version.py +++ b/src/reducto/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "reducto" -__version__ = "0.22.0" # x-release-please-version +__version__ = "0.23.0" # x-release-please-version diff --git a/src/reducto/types/__init__.py b/src/reducto/types/__init__.py index c0641ad1..3f217832 100644 --- a/src/reducto/types/__init__.py +++ b/src/reducto/types/__init__.py @@ -40,7 +40,6 @@ from .parse_usage import ParseUsage as ParseUsage from .bounding_box import BoundingBox as BoundingBox from .enhance_param import EnhanceParam as EnhanceParam -from .extract_usage import ExtractUsage as ExtractUsage from .settings_param import SettingsParam as SettingsParam from .edit_run_params import EditRunParams as EditRunParams from .retrieval_param import RetrievalParam as RetrievalParam diff --git a/src/reducto/types/extract_usage.py b/src/reducto/types/extract_usage.py deleted file mode 100644 index 2d6a0a30..00000000 --- a/src/reducto/types/extract_usage.py +++ /dev/null @@ -1,18 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import Optional -from typing_extensions import Literal - -from .._models import BaseModel - -__all__ = ["ExtractUsage"] - - -class ExtractUsage(BaseModel): - num_fields: int - - num_pages: int - - credits: Optional[float] = None - - extract_mode: Optional[Literal["super_agent", "extract", "spreadsheet_agent"]] = None diff --git a/src/reducto/types/parse_usage.py b/src/reducto/types/parse_usage.py index 74abb42d..17b20133 100644 --- a/src/reducto/types/parse_usage.py +++ b/src/reducto/types/parse_usage.py @@ -1,6 +1,7 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Dict, Optional +from typing import Dict, List, Optional +from typing_extensions import Literal from .._models import BaseModel @@ -13,3 +14,26 @@ class ParseUsage(BaseModel): credit_breakdown: Optional[Dict[str, float]] = None credits: Optional[float] = None + + page_billing_breakdown: Optional[ + Dict[ + str, + List[ + Literal[ + "page", + "html_page", + "docx_native_page", + "agentic", + "complex", + "chart_agent", + "spreadsheet_cells", + "billable_spreadsheet_pages", + ] + ], + ] + ] = None + """Per-page breakdown of features used. + + Maps 1-indexed page numbers (as strings) to the list of billing features applied + on that page (e.g. 'page', 'complex', 'chart_agent'). + """ diff --git a/src/reducto/types/shared/advanced_processing_options.py b/src/reducto/types/shared/advanced_processing_options.py index 9b55e5bb..8fb4c1fe 100644 --- a/src/reducto/types/shared/advanced_processing_options.py +++ b/src/reducto/types/shared/advanced_processing_options.py @@ -90,7 +90,9 @@ class AdvancedProcessingOptions(BaseModel): be merged across breaks and spaces. """ - ocr_system: Optional[Literal["highres", "multilingual", "combined", "reducto", "legacy", "reducto-v2"]] = None + ocr_system: Optional[ + Literal["highres", "multilingual", "combined", "reducto", "legacy", "reducto-v2", "reducto-v3"] + ] = None """The OCR system to use. Highres is recommended for documents with English characters. Legacy uses an diff --git a/src/reducto/types/shared/experimental_processing_options.py b/src/reducto/types/shared/experimental_processing_options.py index 1f4d9ac2..6533dd4f 100644 --- a/src/reducto/types/shared/experimental_processing_options.py +++ b/src/reducto/types/shared/experimental_processing_options.py @@ -75,7 +75,16 @@ class ExperimentalProcessingOptions(BaseModel): layout_model: Optional[ Literal[ - "default", "beta", "rfdetr", "rfdetr0302", "rfdetr0303", "rfdetrbase0218", "rfdetr0304", "qwen35_27b_0317" + "default", + "beta", + "dfine", + "rfdetr", + "rfdetr0302", + "rfdetr0303", + "rfdetrbase0218", + "rfdetr0304", + "rfdetr0306", + "qwen35_27b_0317", ] ] = None """The layout model to use for the document. diff --git a/src/reducto/types/shared/extract_response.py b/src/reducto/types/shared/extract_response.py index 1744516d..b071e9fc 100644 --- a/src/reducto/types/shared/extract_response.py +++ b/src/reducto/types/shared/extract_response.py @@ -1,27 +1,8 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Optional - -from ..._models import BaseModel -from ..extract_usage import ExtractUsage +from typing import Dict +from typing_extensions import TypeAlias __all__ = ["ExtractResponse"] - -class ExtractResponse(BaseModel): - citations: Optional[List[object]] = None - """The citations corresponding to the extracted response.""" - - result: List[object] - """The extracted response in your provided schema. - - This is a list of dictionaries. If disable_chunking is True (default), then it - will be a list of length one. - """ - - usage: ExtractUsage - - job_id: Optional[str] = None - - studio_link: Optional[str] = None - """The link to the studio pipeline for the document.""" +ExtractResponse: TypeAlias = Dict[str, object] diff --git a/src/reducto/types/shared/parse_response.py b/src/reducto/types/shared/parse_response.py index 8b54180f..a12ff7dc 100644 --- a/src/reducto/types/shared/parse_response.py +++ b/src/reducto/types/shared/parse_response.py @@ -177,6 +177,15 @@ class ParseResponse(BaseModel): usage: ParseUsage + parse_mode: Optional[Literal["base", "lite"]] = None + """Which pipeline produced this response. + + `lite` means Reducto Flash Lite served the request; `base` is the standard + pipeline. Optional / nullable for forward compatibility — older API instances or + persisted responses written before this field existed will leave it `None`; + treat `None` as `base`. + """ + pdf_url: Optional[str] = None """The storage URL of the converted PDF file.""" diff --git a/src/reducto/types/shared/pipeline_response.py b/src/reducto/types/shared/pipeline_response.py index 3e6d9c59..1c5359dd 100644 --- a/src/reducto/types/shared/pipeline_response.py +++ b/src/reducto/types/shared/pipeline_response.py @@ -11,16 +11,7 @@ from .split_response import SplitResponse from .extract_response import ExtractResponse -__all__ = [ - "PipelineResponse", - "Result", - "ResultExtract", - "ResultExtractUnionMember0", - "ResultExtractUnionMember0Result", - "ResultParse", -] - -ResultExtractUnionMember0Result: TypeAlias = Union[ExtractResponse, V3Extract] +__all__ = ["PipelineResponse", "Result", "ResultExtractUnionMember0", "ResultParse"] class ResultExtractUnionMember0(BaseModel): @@ -28,20 +19,18 @@ class ResultExtractUnionMember0(BaseModel): page_range: List[int] - result: ResultExtractUnionMember0Result + result: Union[ExtractResponse, V3Extract] split_name: str partition: Optional[str] = None -ResultExtract: TypeAlias = Union[List[ResultExtractUnionMember0], ExtractResponse, V3Extract, None] - ResultParse: TypeAlias = Union[ParseResponse, List[ParseResponse], None] class Result(BaseModel): - extract: Optional[ResultExtract] = None + extract: Union[List[ResultExtractUnionMember0], ExtractResponse, V3Extract, None] = None parse: Optional[ResultParse] = None diff --git a/src/reducto/types/split_table_options_param.py b/src/reducto/types/split_table_options_param.py index 740e863c..a4c29707 100644 --- a/src/reducto/types/split_table_options_param.py +++ b/src/reducto/types/split_table_options_param.py @@ -8,6 +8,12 @@ class SplitTableOptionsParam(TypedDict, total=False): + allow_page_overlap: bool + """If True, a page can belong to multiple categories/partitions. + + If False, each page must belong to exactly one category. Defaults to True. + """ + table_cutoff: Literal["truncate", "preserve"] """ If tables should be truncated to the first few rows or if all content should be diff --git a/src/reducto/types/v3_extract.py b/src/reducto/types/v3_extract.py index e2b61eb0..2f197e6f 100644 --- a/src/reducto/types/v3_extract.py +++ b/src/reducto/types/v3_extract.py @@ -1,24 +1,8 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Union, Optional - -from .._models import BaseModel -from .extract_usage import ExtractUsage +from typing import Dict +from typing_extensions import TypeAlias __all__ = ["V3Extract"] - -class V3Extract(BaseModel): - result: Union[List[object], object] - """The extracted response in your provided schema. - - This is a list of dictionaries. If disable_chunking is True (default), then it - will be a list of length one. - """ - - usage: ExtractUsage - - job_id: Optional[str] = None - - studio_link: Optional[str] = None - """The link to the studio pipeline for the document.""" +V3Extract: TypeAlias = Dict[str, object] diff --git a/tests/api_resources/test_split.py b/tests/api_resources/test_split.py index 2de2376f..af41c588 100644 --- a/tests/api_resources/test_split.py +++ b/tests/api_resources/test_split.py @@ -95,7 +95,10 @@ def test_method_run_with_all_params(self, client: Reducto) -> None: }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) assert_matches_type(SplitResponse, split, path=["response"]) @@ -224,7 +227,10 @@ def test_method_run_job_with_all_params(self, client: Reducto) -> None: }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) assert_matches_type(AsyncSplitResponse, split, path=["response"]) @@ -351,7 +357,10 @@ async def test_method_run_with_all_params(self, async_client: AsyncReducto) -> N }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) assert_matches_type(SplitResponse, split, path=["response"]) @@ -480,7 +489,10 @@ async def test_method_run_job_with_all_params(self, async_client: AsyncReducto) }, }, }, - settings={"table_cutoff": "truncate"}, + settings={ + "allow_page_overlap": True, + "table_cutoff": "truncate", + }, split_rules="split_rules", ) assert_matches_type(AsyncSplitResponse, split, path=["response"]) diff --git a/tests/test_client.py b/tests/test_client.py index 157a4300..95bc33e2 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -427,6 +427,30 @@ def test_default_query_option(self) -> None: client.close() + def test_hardcoded_query_params_in_url(self, client: Reducto) -> None: + request = client._build_request(FinalRequestOptions(method="get", url="/foo?beta=true")) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true"} + + request = client._build_request( + FinalRequestOptions( + method="get", + url="/foo?beta=true", + params={"limit": "10", "page": "abc"}, + ) + ) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true", "limit": "10", "page": "abc"} + + request = client._build_request( + FinalRequestOptions( + method="get", + url="/files/a%2Fb?beta=true", + params={"limit": "10"}, + ) + ) + assert request.url.raw_path == b"/files/a%2Fb?beta=true&limit=10" + def test_request_extra_json(self, client: Reducto) -> None: request = client._build_request( FinalRequestOptions( @@ -1328,6 +1352,30 @@ async def test_default_query_option(self) -> None: await client.close() + async def test_hardcoded_query_params_in_url(self, async_client: AsyncReducto) -> None: + request = async_client._build_request(FinalRequestOptions(method="get", url="/foo?beta=true")) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true"} + + request = async_client._build_request( + FinalRequestOptions( + method="get", + url="/foo?beta=true", + params={"limit": "10", "page": "abc"}, + ) + ) + url = httpx.URL(request.url) + assert dict(url.params) == {"beta": "true", "limit": "10", "page": "abc"} + + request = async_client._build_request( + FinalRequestOptions( + method="get", + url="/files/a%2Fb?beta=true", + params={"limit": "10"}, + ) + ) + assert request.url.raw_path == b"/files/a%2Fb?beta=true&limit=10" + def test_request_extra_json(self, client: Reducto) -> None: request = client._build_request( FinalRequestOptions( diff --git a/tests/test_deepcopy.py b/tests/test_deepcopy.py deleted file mode 100644 index f2e61051..00000000 --- a/tests/test_deepcopy.py +++ /dev/null @@ -1,58 +0,0 @@ -from reducto._utils import deepcopy_minimal - - -def assert_different_identities(obj1: object, obj2: object) -> None: - assert obj1 == obj2 - assert id(obj1) != id(obj2) - - -def test_simple_dict() -> None: - obj1 = {"foo": "bar"} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - - -def test_nested_dict() -> None: - obj1 = {"foo": {"bar": True}} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert_different_identities(obj1["foo"], obj2["foo"]) - - -def test_complex_nested_dict() -> None: - obj1 = {"foo": {"bar": [{"hello": "world"}]}} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert_different_identities(obj1["foo"], obj2["foo"]) - assert_different_identities(obj1["foo"]["bar"], obj2["foo"]["bar"]) - assert_different_identities(obj1["foo"]["bar"][0], obj2["foo"]["bar"][0]) - - -def test_simple_list() -> None: - obj1 = ["a", "b", "c"] - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - - -def test_nested_list() -> None: - obj1 = ["a", [1, 2, 3]] - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert_different_identities(obj1[1], obj2[1]) - - -class MyObject: ... - - -def test_ignores_other_types() -> None: - # custom classes - my_obj = MyObject() - obj1 = {"foo": my_obj} - obj2 = deepcopy_minimal(obj1) - assert_different_identities(obj1, obj2) - assert obj1["foo"] is my_obj - - # tuples - obj3 = ("a", "b") - obj4 = deepcopy_minimal(obj3) - assert obj3 is obj4 diff --git a/tests/test_extract_files.py b/tests/test_extract_files.py index a4c8a66f..f3cccbea 100644 --- a/tests/test_extract_files.py +++ b/tests/test_extract_files.py @@ -4,7 +4,7 @@ import pytest -from reducto._types import FileTypes +from reducto._types import FileTypes, ArrayFormat from reducto._utils import extract_files @@ -35,6 +35,12 @@ def test_multiple_files() -> None: assert query == {"documents": [{}, {}]} +def test_top_level_file_array() -> None: + query = {"files": [b"file one", b"file two"], "title": "hello"} + assert extract_files(query, paths=[["files", ""]]) == [("files[]", b"file one"), ("files[]", b"file two")] + assert query == {"title": "hello"} + + @pytest.mark.parametrize( "query,paths,expected", [ @@ -62,3 +68,24 @@ def test_ignores_incorrect_paths( expected: list[tuple[str, FileTypes]], ) -> None: assert extract_files(query, paths=paths) == expected + + +@pytest.mark.parametrize( + "array_format,expected_top_level,expected_nested", + [ + ("brackets", [("files[]", b"a"), ("files[]", b"b")], [("items[][file]", b"a"), ("items[][file]", b"b")]), + ("repeat", [("files", b"a"), ("files", b"b")], [("items[file]", b"a"), ("items[file]", b"b")]), + ("comma", [("files", b"a"), ("files", b"b")], [("items[file]", b"a"), ("items[file]", b"b")]), + ("indices", [("files[0]", b"a"), ("files[1]", b"b")], [("items[0][file]", b"a"), ("items[1][file]", b"b")]), + ], +) +def test_array_format_controls_file_field_names( + array_format: ArrayFormat, + expected_top_level: list[tuple[str, FileTypes]], + expected_nested: list[tuple[str, FileTypes]], +) -> None: + top_level = {"files": [b"a", b"b"]} + assert extract_files(top_level, paths=[["files", ""]], array_format=array_format) == expected_top_level + + nested = {"items": [{"file": b"a"}, {"file": b"b"}]} + assert extract_files(nested, paths=[["items", "", "file"]], array_format=array_format) == expected_nested diff --git a/tests/test_files.py b/tests/test_files.py index b5e73903..23535a30 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -4,7 +4,8 @@ import pytest from dirty_equals import IsDict, IsList, IsBytes, IsTuple -from reducto._files import to_httpx_files, async_to_httpx_files +from reducto._files import to_httpx_files, deepcopy_with_paths, async_to_httpx_files +from reducto._utils import extract_files readme_path = Path(__file__).parent.parent.joinpath("README.md") @@ -49,3 +50,99 @@ def test_string_not_allowed() -> None: "file": "foo", # type: ignore } ) + + +def assert_different_identities(obj1: object, obj2: object) -> None: + assert obj1 == obj2 + assert obj1 is not obj2 + + +class TestDeepcopyWithPaths: + def test_copies_top_level_dict(self) -> None: + original = {"file": b"data", "other": "value"} + result = deepcopy_with_paths(original, [["file"]]) + assert_different_identities(result, original) + + def test_file_value_is_same_reference(self) -> None: + file_bytes = b"contents" + original = {"file": file_bytes} + result = deepcopy_with_paths(original, [["file"]]) + assert_different_identities(result, original) + assert result["file"] is file_bytes + + def test_list_popped_wholesale(self) -> None: + files = [b"f1", b"f2"] + original = {"files": files, "title": "t"} + result = deepcopy_with_paths(original, [["files", ""]]) + assert_different_identities(result, original) + result_files = result["files"] + assert isinstance(result_files, list) + assert_different_identities(result_files, files) + + def test_nested_array_path_copies_list_and_elements(self) -> None: + elem1 = {"file": b"f1", "extra": 1} + elem2 = {"file": b"f2", "extra": 2} + original = {"items": [elem1, elem2]} + result = deepcopy_with_paths(original, [["items", "", "file"]]) + assert_different_identities(result, original) + result_items = result["items"] + assert isinstance(result_items, list) + assert_different_identities(result_items, original["items"]) + assert_different_identities(result_items[0], elem1) + assert_different_identities(result_items[1], elem2) + + def test_empty_paths_returns_same_object(self) -> None: + original = {"foo": "bar"} + result = deepcopy_with_paths(original, []) + assert result is original + + def test_multiple_paths(self) -> None: + f1 = b"file1" + f2 = b"file2" + original = {"a": f1, "b": f2, "c": "unchanged"} + result = deepcopy_with_paths(original, [["a"], ["b"]]) + assert_different_identities(result, original) + assert result["a"] is f1 + assert result["b"] is f2 + assert result["c"] is original["c"] + + def test_extract_files_does_not_mutate_original_top_level(self) -> None: + file_bytes = b"contents" + original = {"file": file_bytes, "other": "value"} + + copied = deepcopy_with_paths(original, [["file"]]) + extracted = extract_files(copied, paths=[["file"]]) + + assert extracted == [("file", file_bytes)] + assert original == {"file": file_bytes, "other": "value"} + assert copied == {"other": "value"} + + def test_extract_files_does_not_mutate_original_nested_array_path(self) -> None: + file1 = b"f1" + file2 = b"f2" + original = { + "items": [ + {"file": file1, "extra": 1}, + {"file": file2, "extra": 2}, + ], + "title": "example", + } + + copied = deepcopy_with_paths(original, [["items", "", "file"]]) + extracted = extract_files(copied, paths=[["items", "", "file"]]) + + assert [entry for _, entry in extracted] == [file1, file2] + assert original == { + "items": [ + {"file": file1, "extra": 1}, + {"file": file2, "extra": 2}, + ], + "title": "example", + } + assert copied == { + "items": [ + {"extra": 1}, + {"extra": 2}, + ], + "title": "example", + }