From 59b8841c8bfcc098659eb2f78ea8b5ae1242a39e Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 11:36:17 +1200 Subject: [PATCH 01/13] chore(spec): vendor Data Extraction API OpenAPI spec (2026-05-25) The Data Extraction API (`POST /extraction/parse`) ships on a separate OpenAPI document from the existing DWS Processor API. Vendor the public spec so the new typed client surface is anchored to a checked-in source of truth. The Processor API spec stays at `dws-api-spec.yml`; the Data Extraction spec lives alongside it at `dws-data-extraction-spec.yml`. --- dws-data-extraction-spec.yml | 1132 ++++++++++++++++++++++++++++++++++ 1 file changed, 1132 insertions(+) create mode 100644 dws-data-extraction-spec.yml diff --git a/dws-data-extraction-spec.yml b/dws-data-extraction-spec.yml new file mode 100644 index 0000000..c67aa12 --- /dev/null +++ b/dws-data-extraction-spec.yml @@ -0,0 +1,1132 @@ +openapi: 3.1.0 +info: + version: '2026-05-25' + title: Nutrient Data Extraction API + description: |- + Nutrient Data Extraction API is an HTTP API that extracts structured content from documents. + Upload a PDF or image and receive extracted text elements, tables, formulas, key-value pairs, + and other structural elements with spatial data — or get a whole-document markdown representation. + + Four processing modes are available: choose `text` for plain text, `structure` for OCR-backed structure, + `understand` for deeper document analysis, or `agentic` for complex documents that need visual reasoning + and self-correction. + + # API Versioning + + Nutrient Data Extraction API is versioned using date-based versions in the form `YYYY-MM-DD` (for example, `2026-05-25`). + + Requests can override the API key's default version by sending the `x-nutrient-api-version` header. If the header is omitted, the request uses the latest version that was available when the API key was created. 
+ + Supported API versions: + + | Version | Status | Notes | + | ------------ | ------- | ------------------------------------------------------------------ | + | `2026-05-25` | Current | Initial Data Extraction versioned contract. No older versions yet. | + contact: + name: Nutrient Data Extraction API + url: https://www.nutrient.io/api/ + license: + name: End User License Agreement + url: https://www.nutrient.io/api/terms/ +servers: + - url: https://api.nutrient.io + description: Base URL for Nutrient Data Extraction API endpoints. +security: + - BearerToken: [] +tags: + - name: Authorization + description: |- + Nutrient Data Extraction API uses an HTTP authorization header to map each request to the user making the request. You're required to provide your API token in the authorization header with each request. Otherwise, the API will return an error. + + The authorization header has the following shape: + + ``` + Authorization: Bearer pdf_live + ``` + + `pdf_live` is an API key that can be retrieved by logging in to the [Data Extraction API dashboard](https://www.nutrient.io/data-extraction/api_keys/). + + Because this API allows full access to credits you purchased for the Data Extraction API, it's only meant to be used by your backend services, which we assume are fully trusted. + - name: Data Extraction + description: Extract structured content from documents. + - name: File Type Support + description: |- + Nutrient Data Extraction API supports importing documents in different file formats: + + * PDFs + * Image documents + * Office files (Word, Excel, PowerPoint, etc.) 
+ + The following table shows the allowed file extensions and their MIME types: + + | File Extension | MIME Type | + | -------------- | ------------------------------------------------------------------------- | + | PDF | application/pdf | + | DOC | application/msword | + | DOCX | application/vnd.openxmlformats-officedocument.wordprocessingml.document | + | DOCM | application/vnd.ms-word.document.macroEnabled.12 | + | DOTX | application/vnd.openxmlformats-officedocument.wordprocessingml.template | + | DOTM | application/vnd.ms-word.template.macroEnabled.12 | + | XLS | application/vnd.ms-excel | + | XLSX | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | + | XLSM | application/vnd.ms-excel.sheet.macroEnabled.12 | + | XLSB | application/vnd.ms-excel.sheet.binary.macroEnabled.12 | + | XLTX | application/vnd.openxmlformats-officedocument.spreadsheetml.template | + | XLTM | application/vnd.ms-excel.template.macroEnabled.12 | + | XLAM | application/vnd.ms-excel.addin.macroEnabled.12 | + | PPT, PPS | application/vnd.ms-powerpoint | + | PPTX | application/vnd.openxmlformats-officedocument.presentationml.presentation | + | PPTM | application/vnd.ms-powerpoint.presentation.macroEnabled.12 | + | PPSX | application/vnd.openxmlformats-officedocument.presentationml.slideshow | + | PPSM | application/vnd.ms-powerpoint.slideshow.macroEnabled.12 | + | POTX | application/vnd.openxmlformats-officedocument.presentationml.template | + | POTM | application/vnd.ms-powerpoint.template.macroEnabled.12 | + | PPAM | application/vnd.ms-powerpoint.addin.macroEnabled.12 | + | RTF | application/rtf | + | ODT | application/vnd.oasis.opendocument.text | + | TXT | text/plain | + | BMP | image/bmp | + | JPG/JPEG | image/jpeg | + | PNG | image/png | + | TIFF | image/tiff | + | HEIC | image/heic | + | GIF | image/gif | + | WEBP | image/webp | + | SVG | image/svg+xml | + | TGA | image/x-tga | + | EPS | image/postscript | + - name: OCR Language Support + description: |- + The OCR 
action supports a wide range of languages for text extraction. You can specify languages using either: + - **Full language name** (lowercase, e.g., `english`, `german`) - available for commonly used languages + - **ISO 639-2 language code** (e.g., `eng`, `deu`) - available for all languages + - **ISO 639-2 language code with variant** (e.g., `chi_sim_vert` or `deu_frak`) + + | Description | Language code | Full language name | + | -------------------------------- | -------------- | ------------------ | + | Afrikaans | `afr` | | + | Albanian | `sqi` | | + | Amharic | `amh` | | + | Arabic | `ara` | | + | Armenian | `hye` | | + | Assamese | `asm` | | + | Azerbaijani | `aze` | | + | Azerbaijani - Cyrillic | `aze_cyrl` | | + | Basque | `eus` | | + | Belarusian | `bel` | | + | Bengali | `ben` | | + | Bosnian | `bos` | | + | Breton | `bre` | | + | Bulgarian | `bul` | | + | Burmese | `mya` | | + | Catalan; Valencian | `cat` | | + | Cebuano | `ceb` | | + | Central Khmer | `khm` | | + | Cherokee | `chr` | | + | Chinese - Simplified | `chi_sim` | | + | Chinese - Simplified (Vertical) | `chi_sim_vert` | | + | Chinese - Traditional | `chi_tra` | | + | Chinese - Traditional (Vertical) | `chi_tra_vert` | | + | Corsican | `cos` | | + | Croatian | `hrv` | `croatian` | + | Czech | `ces` | `czech` | + | Danish | `dan` | `danish` | + | Danish - Fraktur | `dan_frak` | | + | Dhivehi; Maldivian | `div` | | + | Dutch; Flemish | `nld` | `dutch` | + | Dzongkha | `dzo` | | + | English | `eng` | `english` | + | English, Middle (1100-1500) | `enm` | | + | Esperanto | `epo` | | + | Estonian | `est` | | + | Faroese | `fao` | | + | Filipino | `fil` | | + | Finnish | `fin` | `finnish` | + | French | `fra` | `french` | + | French, Middle (ca. 
1400-1600) | `frm` | | + | Galician | `glg` | | + | Georgian | `kat` | | + | Georgian - Old | `kat_old` | | + | German | `deu` | `german` | + | German - Fraktur | `deu_frak` | | + | German Fraktur | `frk` | | + | Greek, Ancient | `grc` | | + | Greek, Modern | `ell` | | + | Gujarati | `guj` | | + | Haitian; Haitian Creole | `hat` | | + | Hebrew | `heb` | | + | Hindi | `hin` | | + | Hungarian | `hun` | | + | Icelandic | `isl` | | + | Indonesian | `ind` | `indonesian` | + | Inuktitut | `iku` | | + | Irish | `gle` | | + | Italian | `ita` | `italian` | + | Italian - Old | `ita_old` | | + | Japanese | `jpn` | | + | Japanese (Vertical) | `jpn_vert` | | + | Javanese | `jav` | | + | Kannada | `kan` | | + | Kazakh | `kaz` | | + | Kirghiz; Kyrgyz | `kir` | | + | Korean | `kor` | | + | Korean (Vertical) | `kor_vert` | | + | Kurdish | `kur` | | + | Kurmanji (Kurdish) | `kmr` | | + | Lao | `lao` | | + | Latin | `lat` | | + | Latvian | `lav` | | + | Lithuanian | `lit` | | + | Luxembourgish | `ltz` | | + | Macedonian | `mkd` | | + | Malay | `msa` | `malay` | + | Malayalam | `mal` | | + | Maltese | `mlt` | | + | Maori | `mri` | | + | Marathi | `mar` | | + | Math/Equation detection | `equ` | | + | Mongolian | `mon` | | + | Nepali | `nep` | | + | Norwegian | `nor` | `norwegian` | + | Occitan | `oci` | | + | Oriya | `ori` | | + | Panjabi; Punjabi | `pan` | | + | Persian | `fas` | | + | Polish | `pol` | `polish` | + | Portuguese | `por` | `portuguese` | + | Pushto; Pashto | `pus` | | + | Quechua | `que` | | + | Romanian; Moldavian | `ron` | | + | Russian | `rus` | | + | Sanskrit | `san` | | + | Scottish Gaelic | `gla` | | + | Serbian | `srp` | `serbian` | + | Serbian - Latin | `srp_latn` | | + | Sindhi | `snd` | | + | Sinhala; Sinhalese | `sin` | | + | Slovak | `slk` | `slovak` | + | Slovak - Fraktur | `slk_frak` | | + | Slovenian | `slv` | `slovenian` | + | Spanish; Castilian | `spa` | `spanish` | + | Spanish - Old | `spa_old` | | + | Sundanese | `sun` | | + | Swahili | `swa` | | + | 
Swedish | `swe` | `swedish` | + | Syriac | `syr` | | + | Tagalog | `tgl` | | + | Tajik | `tgk` | | + | Tamil | `tam` | | + | Tatar | `tat` | | + | Telugu | `tel` | | + | Thai | `tha` | | + | Tibetan | `bod` | | + | Tigrinya | `tir` | | + | Tonga | `ton` | | + | Turkish | `tur` | `turkish` | + | Uighur; Uyghur | `uig` | | + | Ukrainian | `ukr` | | + | Urdu | `urd` | | + | Uzbek | `uzb` | | + | Uzbek - Cyrillic | `uzb_cyrl` | | + | Vietnamese | `vie` | | + | Welsh | `cym` | | + | Western Frisian | `fry` | | + | Yiddish | `yid` | | + | Yoruba | `yor` | | +externalDocs: + description: Nutrient Data Extraction API guides + url: https://www.nutrient.io/guides/data-extraction/ +paths: + /extraction/parse: + post: + summary: Extract data from a document + operationId: extraction-parse + parameters: + - $ref: '#/components/parameters/NutrientApiVersion' + description: |- + Extract structured content from a document. Returns either typed document elements + with spatial data or a whole-document markdown representation. + + Four processing modes are available: + - **text** — Plain text extraction powered by Document Engine. Only supports `markdown` output. + - **structure** — OCR-backed structured extraction with spatial element output. + - **understand** — Deeper document analysis with structured extraction and semantic enrichment. + - **agentic** — AI-powered analysis for complex documents that need visual reasoning and self-correction. + + You can provide the input document in three ways: + - **Multipart form upload** via `multipart/form-data` with a `file` field and optional JSON `instructions`. + - **URL-based input** via `application/json` with a `url` field pointing to a remote document. + - **Raw binary upload** via `application/pdf` (or other supported content types). 
+ requestBody: + content: + multipart/form-data: + encoding: + file: + contentType: application/pdf, application/msword, application/vnd.openxmlformats-officedocument.wordprocessingml.document, application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet, application/vnd.ms-powerpoint, application/vnd.openxmlformats-officedocument.presentationml.presentation, application/rtf, image/png, image/jpeg, image/tiff, image/bmp, image/gif, image/webp + instructions: + contentType: application/json + schema: + type: object + properties: + file: + type: string + format: binary + description: The document to parse. + example: + instructions: + description: |- + JSON-serialized processing instructions. Omit to use all defaults + (`mode: "understand"` with spatial element output). + type: object + properties: + url: + type: string + format: uri + description: |- + URL of a remote document to parse. Use this instead of the `file` field + to process a document hosted at a public URL. + example: https://storage.example.com/invoice.pdf + mode: + $ref: '#/components/schemas/Mode' + output: + $ref: '#/components/schemas/OutputOptions' + options: + $ref: '#/components/schemas/ProcessingOptions' + application/json: + schema: + type: object + required: + - url + properties: + url: + type: string + format: uri + description: URL of a remote document to parse. + example: https://storage.example.com/invoice.pdf + mode: + $ref: '#/components/schemas/Mode' + output: + $ref: '#/components/schemas/OutputOptions' + options: + $ref: '#/components/schemas/ProcessingOptions' + example: + url: https://storage.example.com/invoice.pdf + mode: understand + output: + format: spatial + application/pdf: + schema: + type: string + format: binary + description: Raw PDF document for direct upload. + image/png: + schema: + type: string + format: binary + description: Raw PNG image for direct upload. 
+ image/jpeg: + schema: + type: string + format: binary + description: Raw JPEG image for direct upload. + image/tiff: + schema: + type: string + format: binary + description: Raw TIFF image for direct upload. + responses: + '200': + description: Extraction completed successfully. + content: + application/json: + schema: + $ref: '#/components/schemas/ParseResponse' + examples: + spatialElements: + summary: Spatial elements output (structure mode) + value: + status: 200 + requestId: req_e5f6g7h8 + output: + elements: + - id: a1b2c3d4-1111-4000-8000-000000000001 + type: paragraph + role: Title + text: Quarterly Report + confidence: 0.95 + readingOrder: 0 + bounds: + x: 200 + 'y': 139 + width: 1111 + height: 97 + page: + pageIndex: 0 + pageNumber: 1 + width: 1700 + height: 2200 + - id: a1b2c3d4-2222-4000-8000-000000000002 + type: paragraph + role: Text + text: Revenue grew 15% year-over-year. + confidence: 0.97 + readingOrder: 1 + bounds: + x: 200 + 'y': 278 + width: 1300 + height: 69 + page: + pageIndex: 0 + pageNumber: 1 + width: 1700 + height: 2200 + metrics: + processingTimeMs: 4200 + pagesProcessed: 1 + usage: + data_extraction_credits: + cost: 1.5 + remainingCredits: 850 + configuration: + mode: structure + outputFormat: spatial + markdownOutput: + summary: Markdown output (text mode) + value: + status: 200 + requestId: req_a1b2c3d4 + output: + markdown: |- + # Document Title + + First paragraph of text... + + ## Section Two + + More content here... + metrics: + processingTimeMs: 312 + pagesProcessed: 1 + usage: + data_extraction_credits: + cost: 1 + remainingCredits: 850 + configuration: + mode: text + outputFormat: markdown + '400': + description: The request is malformed. Invalid parameters, unsupported file format, or missing required fields. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ParseErrorResponse' + example: + status: 400 + requestId: req_err_001 + errorMessage: The request is malformed + errorDetails: + source: request + code: invalid_request + failingPaths: + - path: $.mode + details: 'invalid mode: ''vlm''. Expected: text, structure, understand, agentic' + '401': + description: You are unauthorized. Sent when no API token is specified, or when the API token you specified isn't valid. + '402': + description: Insufficient credits for this request. + content: + application/json: + schema: + $ref: '#/components/schemas/ParseErrorResponse' + example: + status: 402 + requestId: req_err_002 + errorMessage: Insufficient credits. This request requires 2 credits, 0 remaining. + '413': + description: The uploaded file exceeds the maximum allowed size for your plan. + '429': + description: Too many requests. You have exceeded the rate limit for your subscription. + '500': + description: An internal processing error occurred. Please retry or contact support with the `requestId`. + content: + application/json: + schema: + $ref: '#/components/schemas/ParseErrorResponse' + example: + status: 500 + requestId: req_err_003 + errorMessage: Processing failed. Please retry or contact support with the requestId. + errorDetails: + source: maestro + code: maestro_error + '503': + description: The processing backend is temporarily unavailable. Please retry later. + tags: + - Data Extraction +components: + parameters: + NutrientApiVersion: + name: x-nutrient-api-version + in: header + required: false + description: |- + Optional API version override for this request. + + If omitted, the request uses the latest version that was available when the API key was created. + + See the [API Versioning](#description/api-versioning) section for the list of supported versions. 
+ schema: + type: string + enum: + - '2026-05-25' + example: '2026-05-25' + securitySchemes: + BearerToken: + type: http + scheme: bearer + schemas: + Mode: + type: string + description: |- + Processing pipeline. + - `text` — Plain text extraction powered by Document Engine. Only supports `markdown` output format. + - `structure` — OCR-backed structured extraction with spatial element output. + - `understand` — Deeper document analysis with structured extraction and semantic enrichment. + - `agentic` — AI-powered analysis for complex documents that need visual reasoning and self-correction. + enum: + - text + - structure + - understand + - agentic + default: understand + example: understand + OutputOptions: + type: object + description: |- + Output configuration. When provided, `format` is required. + Default format depends on the mode: `text` defaults to `markdown`; + `structure`, `understand`, and `agentic` default to `spatial`. + required: + - format + properties: + format: + type: string + description: |- + The output format. + - `spatial` — Flat typed elements with bounding boxes, confidence scores, reading order, and page references. + Not available with `text` mode. + - `markdown` — Whole-document markdown representation. + enum: + - spatial + - markdown + example: spatial + includeWords: + type: boolean + description: |- + Include word-level OCR data nested inside paragraph and table cell elements. + Only applicable when `format` is `spatial`. + default: false + example: false + ProcessingOptions: + type: object + description: Additional processing options. + properties: + language: + description: |- + OCR language hint. Only supported for `structure`, `understand`, and `agentic` modes. + Accepts lowercase language names (`"english"`, `"german"`) + or ISO 639-2 language codes (`"eng"`, `"deu"`). Multilingual OCR can be expressed + as an array (`["eng", "spa"]`) or a `+`-joined string (`"eng+spa"`). 
+ default: eng + oneOf: + - type: string + example: english + - type: array + items: + type: string + example: + - eng + - spa + ParseResponse: + type: object + required: + - status + - requestId + - output + - metrics + - configuration + properties: + status: + type: integer + description: HTTP status code. + enum: + - 200 + example: 200 + requestId: + type: string + description: Unique request identifier for debugging and support. + example: req_e5f6g7h8 + output: + $ref: '#/components/schemas/ParseOutput' + metrics: + $ref: '#/components/schemas/Metrics' + usage: + $ref: '#/components/schemas/Usage' + configuration: + $ref: '#/components/schemas/Configuration' + ParseOutput: + type: object + description: |- + Extracted content. Contains either `elements` (for spatial format) or `markdown` + (for markdown format), never both. + properties: + elements: + type: array + description: |- + Flat list of document elements across all pages, ordered by reading order. + Present when `output.format` is `spatial`. + items: + $ref: '#/components/schemas/Element' + markdown: + type: string + description: |- + Whole-document markdown content. + Present when `output.format` is `markdown`. + example: |- + # Document Title + + First paragraph of text... + Metrics: + type: object + required: + - processingTimeMs + - pagesProcessed + properties: + processingTimeMs: + type: number + description: Total processing time in milliseconds. + example: 4200 + pagesProcessed: + type: integer + description: Number of pages processed. + example: 1 + Usage: + type: object + properties: + data_extraction_credits: + type: object + required: + - cost + - remainingCredits + properties: + cost: + type: number + description: Credits consumed by this request. + example: 2 + remainingCredits: + type: number + description: Remaining credits in the account. 
+ example: 850 + Configuration: + type: object + required: + - mode + - outputFormat + properties: + mode: + $ref: '#/components/schemas/Mode' + outputFormat: + type: string + description: The output format that was used for this request. + enum: + - spatial + - markdown + example: spatial + ParseErrorResponse: + type: object + required: + - status + - requestId + - errorMessage + properties: + status: + type: integer + description: HTTP status code. + example: 400 + requestId: + type: string + description: Unique request identifier for debugging and support. + example: req_err_001 + errorMessage: + type: string + description: Human-readable error summary. + example: The request is malformed + errorDetails: + type: object + description: Structured error details. Present on validation and processing errors. + properties: + source: + type: string + description: |- + Error origin. + - `request` — Validation errors (invalid parameters, unsupported format). + - `processing` — Backend processing failures. + - `maestro` — Maestro engine failures. + example: request + code: + type: string + description: Machine-readable error code stable enough for client branching. + example: invalid_request + failingPaths: + type: array + description: List of invalid fields. Present on validation errors. + items: + type: object + properties: + path: + type: string + description: JSON path to the invalid field. + example: $.mode + details: + type: string + description: Human-readable validation error. + example: 'invalid mode: ''vlm''. 
Expected: text, structure, understand, agentic' + Element: + oneOf: + - $ref: '#/components/schemas/ParagraphElement' + - $ref: '#/components/schemas/FormulaElement' + - $ref: '#/components/schemas/PictureElement' + - $ref: '#/components/schemas/TableElement' + - $ref: '#/components/schemas/KeyValueRegionElement' + - $ref: '#/components/schemas/HandwritingElement' + discriminator: + propertyName: type + mapping: + paragraph: '#/components/schemas/ParagraphElement' + formula: '#/components/schemas/FormulaElement' + picture: '#/components/schemas/PictureElement' + table: '#/components/schemas/TableElement' + keyValueRegion: '#/components/schemas/KeyValueRegionElement' + handwriting: '#/components/schemas/HandwritingElement' + ElementBase: + type: object + required: + - id + - type + - bounds + - confidence + - readingOrder + - page + properties: + id: + type: string + format: uuid + description: Unique element identifier. + example: a1b2c3d4-1111-4000-8000-000000000001 + bounds: + $ref: '#/components/schemas/Bounds' + confidence: + type: number + minimum: 0 + maximum: 1 + description: Detection confidence score. + example: 0.95 + readingOrder: + type: integer + minimum: 0 + description: Reading order index within the page. + example: 0 + page: + $ref: '#/components/schemas/PageRef' + ParagraphElement: + allOf: + - $ref: '#/components/schemas/ElementBase' + - type: object + required: + - type + - text + properties: + type: + type: string + enum: + - paragraph + role: + description: Semantic role of the paragraph. Null when the role is undetermined. + oneOf: + - type: string + enum: + - Text + - Title + - SectionHeader + - Header + - Footer + - Caption + - Footnote + - ListItem + - PageNumber + - Code + - CheckboxSelected + - CheckboxUnselected + - type: 'null' + example: Text + text: + type: string + description: Extracted text content. + example: Revenue grew 15% year-over-year. + words: + description: Word-level OCR data. Present when `includeWords` is `true`. 
+ oneOf: + - type: array + items: + $ref: '#/components/schemas/Word' + - type: 'null' + FormulaElement: + allOf: + - $ref: '#/components/schemas/ElementBase' + - type: object + required: + - type + - latex + properties: + type: + type: string + enum: + - formula + latex: + type: string + description: LaTeX representation of the formula. + example: r = r_0 e^{kt} + PictureElement: + allOf: + - $ref: '#/components/schemas/ElementBase' + - type: object + required: + - type + - classification + - classificationConfidence + - altDescription + properties: + type: + type: string + enum: + - picture + classification: + type: string + description: Image classification category (chart, photo, diagram, etc.). + example: chart + classificationConfidence: + type: number + minimum: 0 + maximum: 1 + description: Confidence score for the classification. + example: 0.91 + altDescription: + type: string + description: AI-generated alternative text description. + example: Bar chart showing quarterly revenue growth across regions + captionIds: + description: IDs of associated caption paragraph elements. + oneOf: + - type: array + items: + type: string + format: uuid + - type: 'null' + footnoteIds: + description: IDs of associated footnote paragraph elements. + oneOf: + - type: array + items: + type: string + format: uuid + - type: 'null' + TableElement: + allOf: + - $ref: '#/components/schemas/ElementBase' + - type: object + required: + - type + - rowCount + - columnCount + - cells + properties: + type: + type: string + enum: + - table + rowCount: + type: integer + minimum: 0 + description: Number of rows in the table. + example: 3 + columnCount: + type: integer + minimum: 0 + description: Number of columns in the table. + example: 3 + cells: + type: array + description: Cell-level data. + items: + $ref: '#/components/schemas/TableCell' + captionIds: + description: IDs of associated caption paragraph elements. 
+ oneOf: + - type: array + items: + type: string + format: uuid + - type: 'null' + footnoteIds: + description: IDs of associated footnote paragraph elements. + oneOf: + - type: array + items: + type: string + format: uuid + - type: 'null' + KeyValueRegionElement: + allOf: + - $ref: '#/components/schemas/ElementBase' + - type: object + required: + - type + - pairs + properties: + type: + type: string + enum: + - keyValueRegion + pairs: + type: array + description: Detected key-value pairs. + items: + $ref: '#/components/schemas/KeyValuePair' + HandwritingElement: + allOf: + - $ref: '#/components/schemas/ElementBase' + - type: object + required: + - type + - text + properties: + type: + type: string + enum: + - handwriting + text: + type: string + description: Extracted handwritten text content. + example: John Doe + words: + description: Word-level OCR data. Present when `includeWords` is `true`. + oneOf: + - type: array + items: + $ref: '#/components/schemas/Word' + - type: 'null' + Bounds: + type: object + required: + - x + - 'y' + - width + - height + description: |- + Bounding box of an element on the page. `(x, y)` is the top-left corner of the box. + Origin is the top-left of the page, with x increasing right and y increasing down. + + Coordinates are always expressed in render-space pixels. + `page.width` and `page.height` describe the same pixel canvas as every element, + word, and table cell bound on that page. + properties: + x: + type: number + description: X coordinate of the top-left corner (distance from page left edge). + example: 100 + 'y': + type: number + description: Y coordinate of the top-left corner (distance from page top edge). + example: 50 + width: + type: number + description: Width of the bounding box. + example: 400 + height: + type: number + description: Height of the bounding box. + example: 35 + Word: + type: object + required: + - text + - bounds + - confidence + description: Word-level OCR result. 
+ properties: + text: + type: string + description: The word text. + example: Revenue + bounds: + $ref: '#/components/schemas/Bounds' + confidence: + type: number + minimum: 0 + maximum: 1 + description: OCR confidence score. + example: 0.95 + PageRef: + type: object + required: + - pageIndex + - pageNumber + - width + - height + description: |- + Source page reference. Provides the page index and the full page dimensions, + which define the coordinate space that all element bounds on this page are relative to. + properties: + pageIndex: + type: integer + minimum: 0 + description: 0-based page index. + example: 0 + pageNumber: + type: integer + minimum: 1 + description: 1-based page number. + example: 1 + width: + type: number + description: Page width in render-space pixels. + example: 1700 + height: + type: number + description: Page height in render-space pixels. + example: 2200 + TableCell: + type: object + required: + - id + - bounds + - confidence + - row + - column + - rowSpan + - colSpan + - text + properties: + id: + type: string + description: Unique cell identifier. + example: c-001 + bounds: + $ref: '#/components/schemas/Bounds' + confidence: + type: number + minimum: 0 + maximum: 1 + description: Detection confidence score. + example: 0.94 + row: + type: integer + minimum: 0 + description: 0-indexed row. + example: 0 + column: + type: integer + minimum: 0 + description: 0-indexed column. + example: 0 + rowSpan: + type: integer + minimum: 1 + description: Number of rows this cell spans. + default: 1 + example: 1 + colSpan: + type: integer + minimum: 1 + description: Number of columns this cell spans. + default: 1 + example: 1 + text: + type: string + description: Extracted text content. + example: Region + words: + description: Word-level OCR data. Present when `includeWords` is `true`. 
+ oneOf: + - type: array + items: + $ref: '#/components/schemas/Word' + - type: 'null' + KeyValuePair: + type: object + required: + - id + properties: + id: + type: string + description: Unique identifier for the pair. + example: kvp-001 + key: + description: The key/question entity. Null when only a value was detected. + oneOf: + - $ref: '#/components/schemas/KeyValueEntity' + - type: 'null' + value: + description: The value/answer entity. Null when only a key was detected. + oneOf: + - $ref: '#/components/schemas/KeyValueEntity' + - type: 'null' + relationshipConfidence: + description: Confidence for the key-value relationship. + oneOf: + - type: number + minimum: 0 + maximum: 1 + - type: 'null' + example: 0.93 + KeyValueEntity: + type: object + required: + - id + - bounds + - confidence + - entityType + - value + properties: + id: + type: string + description: Unique entity identifier. + example: kve-001 + bounds: + $ref: '#/components/schemas/Bounds' + confidence: + type: number + minimum: 0 + maximum: 1 + description: Detection confidence score. + example: 0.92 + entityType: + type: string + description: Entity type. + enum: + - QUESTION + - ANSWER + - '' + example: QUESTION + value: + description: Extracted value. 
+ example: Invoice Number +x-tagGroups: + - name: Endpoints + tags: + - Data Extraction + - name: Reference + tags: + - File Type Support + - OCR Language Support From 758d576bea34af9d100bd48ba9cad3b114aca716 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 11:36:28 +1200 Subject: [PATCH 02/13] feat(types): add Data Extraction API types for /extraction/parse Introduce hand-written types mirroring the public Data Extraction OpenAPI 3.1 contract (version 2026-05-25): - ParseMode (text | structure | understand | agentic) - ParseOutputFormat (spatial | markdown), ParseOutputOptions - ParseInstructions and ParseOptions request shapes - ParseResponseSpatial / ParseResponseMarkdown discriminated by output payload - Per-element types: ParagraphElement, FormulaElement, PictureElement, TableElement (with ParseTableCell), KeyValueRegionElement (with KeyValuePair / KeyValueEntity), HandwritingElement, and shared ParseElementBase / ParseBounds / ParsePageRef / ParseWord - ParseErrorResponse with structured failingPaths - ParseMetrics, ParseUsage (carrying data_extraction_credits), ParseConfiguration The Data Extraction API bills against a separate extraction-credits bucket from the processor API; type JSDoc makes the distinction explicit so client code does not conflate the two billing buckets. Wires the new endpoint into RequestTypeMap / ResponseTypeMap so the existing HTTP layer stays type-safe end-to-end. 
--- src/types/http.ts | 11 ++ src/types/index.ts | 1 + src/types/parse.ts | 400 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 412 insertions(+) create mode 100644 src/types/parse.ts diff --git a/src/types/http.ts b/src/types/http.ts index 64185e4..2420e10 100644 --- a/src/types/http.ts +++ b/src/types/http.ts @@ -1,5 +1,6 @@ import type { components, operations } from '../generated/api-types'; import type { NormalizedFileData } from '../inputs'; +import type { ParseInstructions, ParseResponse } from './parse'; import type { ValueOf } from '@typescript-eslint/eslint-plugin/dist/util'; export type RequestTypeMap = { @@ -26,6 +27,15 @@ export type RequestTypeMap = { file?: NormalizedFileData; }; '/tokens': components['schemas']['CreateAuthTokenParameters']; + /** + * `/extraction/parse` request body. Use exactly one of: + * - `file` + optional `instructions` for multipart upload (local files, buffers, streams). + * - `instructions.url` only for URL-based input (sent as `application/json`). 
+ */ + '/extraction/parse': { + instructions: ParseInstructions; + file?: NormalizedFileData; + }; }; DELETE: { '/tokens': { id: string }; @@ -42,6 +52,7 @@ export type ResponseTypeMap = { '/sign': string; '/ai/redact': string; '/tokens': components['schemas']['CreateAuthTokenResponse']; + '/extraction/parse': ParseResponse; }; DELETE: { '/tokens': undefined; diff --git a/src/types/index.ts b/src/types/index.ts index b813af9..85684b3 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -2,5 +2,6 @@ export * from './common'; export * from './inputs'; export * from './workflow'; export * from './http'; +export * from './parse'; // Re-export generated types for convenience export type { components, operations, paths } from '../generated/api-types'; diff --git a/src/types/parse.ts b/src/types/parse.ts new file mode 100644 index 0000000..d13ef9c --- /dev/null +++ b/src/types/parse.ts @@ -0,0 +1,400 @@ +/** + * Type definitions for the Nutrient Data Extraction API (`POST /extraction/parse`). + * + * These types mirror the public OpenAPI 3.1 contract published at + * https://www.nutrient.io/api/reference/data-extraction/public/ (version `2026-05-25`). + * + * Note on billing: `/extraction/parse` is billed against **extraction credits**, a + * bucket separate from the **processor API credits** consumed by `/build`, `/sign`, + * OCR, conversion, and the other endpoints on the rest of `NutrientClient`. The + * response surfaces this explicitly in `usage.data_extraction_credits`. + * + * @see ParseResponse for the full response shape + * @see ParseMode for the four processing pipelines + */ + +/** + * Processing pipeline for `/extraction/parse`. + * + * Each mode bills a different amount of **extraction credits** per page, drawn from + * the account's extraction-credits bucket (separate from processor API credits). + * + * - `text` — Plain text extraction. Markdown output only. 1 extraction credit/page. 
+ * - `structure` — OCR-backed structured extraction with spatial element output. 1.5 extraction credits/page. + * - `understand` — Deeper document analysis with semantic enrichment. 9 extraction credits/page. (Default) + * - `agentic` — VLM-augmented extraction for complex documents needing visual reasoning. 18 extraction credits/page. + * + * The `agentic` mode may not yet be activated on every account; if it isn't, the + * server returns a `400` with an `errorDetails.code` you can branch on. + */ +export type ParseMode = 'text' | 'structure' | 'understand' | 'agentic'; + +/** + * Output format for `/extraction/parse`. + * + * - `spatial` — Flat list of typed elements (paragraph, table, formula, picture, + * keyValueRegion, handwriting) with bounding boxes, confidence, reading order, + * and page references. Not available with `mode: 'text'`. + * - `markdown` — Whole-document Markdown representation, suited for RAG, search + * indexing, and content pipelines. + */ +export type ParseOutputFormat = 'spatial' | 'markdown'; + +/** + * Output configuration for `/extraction/parse`. + * + * Defaults: `text` mode emits `markdown`; `structure`, `understand`, and `agentic` + * emit `spatial`. + */ +export interface ParseOutputOptions { + /** Output format. Required when `output` is provided. */ + format: ParseOutputFormat; + /** + * Include word-level OCR data nested inside paragraph and table cell elements. + * Only applicable when `format` is `'spatial'`. Defaults to `false`. + */ + includeWords?: boolean; +} + +/** + * Additional processing options for `/extraction/parse`. + */ +export interface ParseProcessingOptions { + /** + * OCR language hint. Only honoured for `structure`, `understand`, and `agentic` modes. + * + * Accepts: + * - A lowercase language name (`'english'`, `'german'`). + * - An ISO 639-2 code (`'eng'`, `'deu'`). + * - A `+`-joined string for multilingual OCR (`'eng+spa'`). + * - An array of codes (`['eng', 'spa']`). 
+ * + * Defaults to `'eng'` server-side. + */ + language?: string | string[]; +} + +/** + * Instruction payload sent to `/extraction/parse`. All fields are optional; an empty + * object resolves to `mode: 'understand'` with spatial output server-side. + */ +export interface ParseInstructions { + /** + * URL of a remote document to parse. Used by the JSON request shape; when + * passing a local file or buffer, omit this field. + */ + url?: string; + mode?: ParseMode; + output?: ParseOutputOptions; + options?: ParseProcessingOptions; +} + +/** + * Bounding box of an element on the page. + * + * `(x, y)` is the top-left corner. The origin is the top-left of the page, with + * x increasing right and y increasing down. Coordinates are in render-space pixels; + * `page.width` and `page.height` describe the same pixel canvas. + */ +export interface ParseBounds { + /** Distance from the page's left edge to the box's left edge, in pixels. */ + x: number; + /** Distance from the page's top edge to the box's top edge, in pixels. */ + y: number; + /** Width of the bounding box in pixels. */ + width: number; + /** Height of the bounding box in pixels. */ + height: number; +} + +/** + * Source page reference for an extracted element. + */ +export interface ParsePageRef { + /** 0-based page index. */ + pageIndex: number; + /** 1-based page number for human-facing labels. */ + pageNumber: number; + /** Page width in render-space pixels (matches the bounds coordinate space). */ + width: number; + /** Page height in render-space pixels. */ + height: number; +} + +/** + * Word-level OCR result. Included inside `ParagraphElement.words`, + * `HandwritingElement.words`, and `ParseTableCell.words` when + * `output.includeWords === true`. + */ +export interface ParseWord { + /** The word's text. */ + text: string; + bounds: ParseBounds; + /** OCR confidence score in `[0, 1]`. */ + confidence: number; +} + +/** + * Semantic role of a paragraph element. `null` when the role is undetermined. 
+ */ +export type ParagraphRole = + | 'Text' + | 'Title' + | 'SectionHeader' + | 'Header' + | 'Footer' + | 'Caption' + | 'Footnote' + | 'ListItem' + | 'PageNumber' + | 'Code' + | 'CheckboxSelected' + | 'CheckboxUnselected'; + +/** Fields shared by every spatial element. */ +export interface ParseElementBase { + /** Unique element identifier (UUID). */ + id: string; + bounds: ParseBounds; + /** Detection confidence score in `[0, 1]`. */ + confidence: number; + /** Reading order index within the page. */ + readingOrder: number; + page: ParsePageRef; +} + +export interface ParagraphElement extends ParseElementBase { + type: 'paragraph'; + role?: ParagraphRole | null; + text: string; + /** Word-level OCR data. Present only when `includeWords` is `true`. */ + words?: ParseWord[] | null; +} + +export interface FormulaElement extends ParseElementBase { + type: 'formula'; + /** LaTeX representation of the formula. */ + latex: string; +} + +export interface PictureElement extends ParseElementBase { + type: 'picture'; + /** Image classification category (e.g. `chart`, `photo`, `diagram`). */ + classification: string; + /** Confidence score for the classification in `[0, 1]`. */ + classificationConfidence: number; + /** AI-generated alternative text. */ + altDescription: string; + /** IDs of associated caption paragraph elements. */ + captionIds?: string[] | null; + /** IDs of associated footnote paragraph elements. */ + footnoteIds?: string[] | null; +} + +/** A single cell inside a `TableElement`. */ +export interface ParseTableCell { + id: string; + bounds: ParseBounds; + /** Detection confidence score in `[0, 1]`. */ + confidence: number; + /** 0-indexed row. */ + row: number; + /** 0-indexed column. */ + column: number; + /** Number of rows this cell spans. */ + rowSpan: number; + /** Number of columns this cell spans. */ + colSpan: number; + text: string; + /** Word-level OCR data. Present only when `includeWords` is `true`. 
*/ + words?: ParseWord[] | null; +} + +export interface TableElement extends ParseElementBase { + type: 'table'; + rowCount: number; + columnCount: number; + cells: ParseTableCell[]; + captionIds?: string[] | null; + footnoteIds?: string[] | null; +} + +/** Question or answer entity within a `KeyValuePair`. */ +export interface KeyValueEntity { + id: string; + bounds: ParseBounds; + /** Detection confidence score in `[0, 1]`. */ + confidence: number; + /** Entity type. The empty string is returned when the role is unclassified. */ + entityType: 'QUESTION' | 'ANSWER' | ''; + /** Extracted value (text or other primitive). */ + value: unknown; +} + +export interface KeyValuePair { + id: string; + /** The key/question entity. `null` when only a value was detected. */ + key?: KeyValueEntity | null; + /** The value/answer entity. `null` when only a key was detected. */ + value?: KeyValueEntity | null; + /** Confidence for the key-value relationship in `[0, 1]`. */ + relationshipConfidence?: number | null; +} + +export interface KeyValueRegionElement extends ParseElementBase { + type: 'keyValueRegion'; + pairs: KeyValuePair[]; +} + +export interface HandwritingElement extends ParseElementBase { + type: 'handwriting'; + text: string; + /** Word-level OCR data. Present only when `includeWords` is `true`. */ + words?: ParseWord[] | null; +} + +/** + * Discriminated union of every spatial element type. Use the `type` field for + * narrowing. + */ +export type ParseElement = + | ParagraphElement + | FormulaElement + | PictureElement + | TableElement + | KeyValueRegionElement + | HandwritingElement; + +/** + * Processing metrics for a `/extraction/parse` call. + */ +export interface ParseMetrics { + processingTimeMs: number; + pagesProcessed: number; +} + +/** + * Extraction-credit usage for a `/extraction/parse` call. + * + * **Extraction credits** are a separate billing bucket from processor API credits; + * an extraction call never debits processor credits and vice-versa. 
+ */ +export interface ParseUsage { + data_extraction_credits?: { + /** Extraction credits consumed by this request. */ + cost: number; + /** Remaining extraction credits in the account. */ + remainingCredits: number; + }; +} + +/** + * Echoes the resolved configuration the server used for this request. + */ +export interface ParseConfiguration { + mode: ParseMode; + outputFormat: ParseOutputFormat; +} + +/** + * Successful `/extraction/parse` response with spatial element output. + * + * Narrow on `configuration.outputFormat === 'spatial'` (or `'markdown'` for the + * other variant) to access `output.elements` vs `output.markdown` with type safety. + */ +export interface ParseResponseSpatial { + status: 200; + /** Unique request identifier for debugging and support. */ + requestId: string; + output: { + elements: ParseElement[]; + /** Always absent on spatial responses. Kept on the shape so consumers can use + * a single `output` property without conditional access. */ + markdown?: undefined; + }; + metrics: ParseMetrics; + usage?: ParseUsage; + configuration: ParseConfiguration & { outputFormat: 'spatial' }; +} + +/** + * Successful `/extraction/parse` response with whole-document Markdown output. + */ +export interface ParseResponseMarkdown { + status: 200; + /** Unique request identifier for debugging and support. */ + requestId: string; + output: { + markdown: string; + /** Always absent on markdown responses. */ + elements?: undefined; + }; + metrics: ParseMetrics; + usage?: ParseUsage; + configuration: ParseConfiguration & { outputFormat: 'markdown' }; +} + +/** + * Discriminated union of every successful `/extraction/parse` response. Narrow on + * `configuration.outputFormat` (or simply branch on `output.markdown` / + * `output.elements`) to pick between the two output shapes. + */ +export type ParseResponse = ParseResponseSpatial | ParseResponseMarkdown; + +/** + * Path-level error detail returned inside `ParseErrorDetails.failingPaths`. 
+ */ +export interface ParseErrorFailingPath { + /** JSON path to the invalid field (e.g. `$.mode`). */ + path: string; + /** Human-readable validation message. */ + details: string; +} + +/** + * Structured error details returned by the server on validation/processing errors. + */ +export interface ParseErrorDetails { + /** + * Error origin: + * - `request` — validation errors (invalid parameters, unsupported format). + * - `processing` — backend processing failures. + * - `maestro` — Maestro engine failures. + */ + source?: string; + /** Machine-readable error code stable enough for client branching. */ + code?: string; + /** Per-field validation errors. Present on validation responses. */ + failingPaths?: ParseErrorFailingPath[]; +} + +/** + * Error response envelope returned on 4xx/5xx responses from `/extraction/parse`. + * + * The TypeScript client surfaces this body as the `details` of the thrown + * `APIError` / `ValidationError` / `AuthenticationError`. + */ +export interface ParseErrorResponse { + status: number; + requestId: string; + errorMessage: string; + errorDetails?: ParseErrorDetails; +} + +/** + * Options accepted by {@link import('../client').NutrientClient.parse | NutrientClient.parse}. + * + * All fields are optional; the server falls back to `mode: 'understand'` with + * spatial output when nothing is provided. + */ +export interface ParseOptions { + mode?: ParseMode; + output?: ParseOutputOptions; + /** OCR language hint. See {@link ParseProcessingOptions.language}. */ + language?: string | string[]; + /** + * Optional API-version override sent as the `x-nutrient-api-version` header. + * Defaults to the version pinned at API-key creation time. 
+ */ + apiVersion?: string; +} From 9ce0a03a8d935b9392f43adf157967e32811565d Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 11:36:37 +1200 Subject: [PATCH 03/13] feat(client): support /extraction/parse with parse() and convenience wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds first-class client methods for the Data Extraction API: - parse(input, options?) — full-fidelity call against POST /extraction/parse, supporting local files, buffers, streams, and URL inputs. Handles multipart upload for binary inputs and JSON body for URL-only requests. - parseToMarkdown(input, mode?) — convenience wrapper returning the whole- document Markdown string directly. Defaults to mode='text' (cheapest). - parseElements(input, mode?, includeWords?) — convenience wrapper returning the typed spatial-elements array. Defaults to mode='structure'. Threads x-nutrient-api-version through the HTTP layer when the caller pins a specific API version. JSDoc on every new method makes the billing distinction explicit: the Data Extraction API bills against extraction credits, a separate bucket from the processor API credits used by the rest of NutrientClient. The full set of new types is re-exported from the package root. 
--- src/client.ts | 183 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/http.ts | 28 ++++++++ src/index.ts | 32 +++++++++ 3 files changed, 238 insertions(+), 5 deletions(-) diff --git a/src/client.ts b/src/client.ts index 815fcc9..18ac103 100644 --- a/src/client.ts +++ b/src/client.ts @@ -8,12 +8,16 @@ import type { OutputTypeMap, WorkflowResult, UrlInput, + ParseInstructions, + ParseOptions, + ParseResponse, + ParseElement, } from './types'; import { ValidationError, NutrientError } from './errors'; import { workflow } from './workflow'; import type { components, operations } from './generated/api-types'; import { BuildActions } from './build'; -import { processFileInput, isRemoteFileInput } from './inputs'; +import { processFileInput, isRemoteFileInput, getRemoteUrl } from './inputs'; import { sendRequest } from './http'; import type { NormalizedFileData } from './inputs'; import type { ApplicableAction } from './builders/workflow'; @@ -1704,10 +1708,7 @@ export class NutrientClient { * fs.writeFileSync('modified-document.pdf', Buffer.from(result.buffer)); * ``` */ - async deletePages( - pdf: FileInputWithUrl, - pageIndices: number[], - ): Promise { + async deletePages(pdf: FileInputWithUrl, pageIndices: number[]): Promise { if (!pageIndices || pageIndices.length === 0) { throw new ValidationError('At least one page index is required for deletion'); } @@ -1808,4 +1809,176 @@ export class NutrientClient { .execute(); return this.processTypedWorkflowResult(result); } + + /** + * Extracts structured content from a document via the Nutrient Data Extraction API + * (`POST /extraction/parse`). + * + * Four processing modes are available, each billed against the account's + * **extraction credits** bucket (a separate billing bucket from the + * **processor API credits** used by the rest of `NutrientClient`): + * + * - `text` — Plain text extraction. Markdown output only. 1 extraction credit/page. 
+ * - `structure` — OCR-backed structured extraction with spatial elements. 1.5 extraction credits/page. + * - `understand` — Deeper document analysis with semantic enrichment. 9 extraction credits/page. (Default) + * - `agentic` — VLM-augmented extraction for complex documents. 18 extraction credits/page. + * + * Two output formats: + * - `spatial` (default for `structure`/`understand`/`agentic`) — Typed elements + * with bounds, confidence, reading order, and page refs. + * - `markdown` (default for `text`) — Whole-document Markdown for RAG and search. + * + * @param input - The document to parse. Accepts local files, buffers, streams, or a URL. + * @param options - Optional parse configuration (mode, output format, language, API version). + * @returns Promise resolving to the `/extraction/parse` response. Narrow on + * `output.markdown` / `output.elements` for type-safe field access, or + * read `configuration.outputFormat` for the server-resolved value. + * + * @example + * ```typescript + * // Whole-document Markdown for a RAG pipeline (cheapest mode). + * const md = await client.parse('invoice.pdf', { mode: 'text' }); + * if (md.output.markdown !== undefined) { + * console.log(md.output.markdown); + * } + * + * // Spatial elements for a born-OCR scan. + * const spatial = await client.parse('scan.pdf', { + * mode: 'structure', + * output: { format: 'spatial', includeWords: true }, + * language: ['eng', 'spa'], + * }); + * if (spatial.output.elements !== undefined) { + * for (const el of spatial.output.elements) { + * if (el.type === 'paragraph') console.log(el.text); + * } + * } + * + * // Parse a remote document (URL input). + * const remote = await client.parse('https://example.com/doc.pdf'); + * + * // Cost reporting — extraction credits, not processor API credits. 
+ * console.log('Extraction credits used:', remote.usage?.data_extraction_credits?.cost); + * + * // Or skip the discriminant entirely with the convenience wrappers: + * const justMarkdown = await client.parseToMarkdown('invoice.pdf'); + * const justElements = await client.parseElements('scan.pdf', 'understand'); + * ``` + */ + async parse(input: FileInputWithUrl, options?: ParseOptions): Promise { + const instructions: ParseInstructions = {}; + if (options?.mode !== undefined) instructions.mode = options.mode; + if (options?.output !== undefined) instructions.output = options.output; + if (options?.language !== undefined) { + instructions.options = { language: options.language }; + } + + const headers: Record | undefined = + options?.apiVersion !== undefined + ? { 'x-nutrient-api-version': options.apiVersion } + : undefined; + + // URL input → JSON body + const remoteUrl = getRemoteUrl(input); + if (remoteUrl !== null) { + instructions.url = remoteUrl; + const response = await sendRequest( + { + method: 'POST', + endpoint: '/extraction/parse', + data: { instructions }, + ...(headers ? { headers } : {}), + }, + this.options, + 'json', + ); + return response.data; + } + + // Local file input → multipart upload + const normalizedFile = await processFileInput(input as FileInput); + const response = await sendRequest( + { + method: 'POST', + endpoint: '/extraction/parse', + data: { instructions, file: normalizedFile }, + ...(headers ? { headers } : {}), + }, + this.options, + 'json', + ); + return response.data; + } + + /** + * Convenience wrapper around {@link NutrientClient.parse} that returns the + * whole-document Markdown directly. Billed against **extraction credits** + * (1 credit/page for `text`, 1.5 for `structure`, 9 for `understand`, 18 for + * `agentic`). + * + * @param input - The document to parse. + * @param mode - Processing mode (defaults to `'text'` for cheapest Markdown extraction). + * @returns Promise resolving to the Markdown string. 
+ * + * @example + * ```typescript + * const markdown = await client.parseToMarkdown('document.pdf'); + * console.log(markdown); + * ``` + */ + async parseToMarkdown( + input: FileInputWithUrl, + mode: ParseOptions['mode'] = 'text', + ): Promise { + const result = await this.parse(input, { + mode, + output: { format: 'markdown' }, + }); + if (result.output.markdown === undefined) { + throw new NutrientError( + 'parseToMarkdown expected markdown output, server returned ' + + result.configuration.outputFormat, + 'PARSE_OUTPUT_MISMATCH', + { configuration: result.configuration as unknown as Record }, + ); + } + return result.output.markdown; + } + + /** + * Convenience wrapper around {@link NutrientClient.parse} that returns the + * spatial elements array directly. Not available with `mode: 'text'`. + * Billed against **extraction credits** (1.5/page for `structure`, 9 for + * `understand`, 18 for `agentic`). + * + * @param input - The document to parse. + * @param mode - Processing mode (defaults to `'structure'`). Must not be `'text'`. + * @param includeWords - Include word-level OCR data inside paragraphs and table cells. + * @returns Promise resolving to the array of spatial elements. 
+ * + * @example + * ```typescript + * const elements = await client.parseElements('scan.pdf', 'understand'); + * const tables = elements.filter(e => e.type === 'table'); + * ``` + */ + async parseElements( + input: FileInputWithUrl, + mode: Exclude = 'structure', + includeWords = false, + ): Promise { + const result = await this.parse(input, { + mode, + output: { format: 'spatial', includeWords }, + }); + if (result.output.elements === undefined) { + throw new NutrientError( + 'parseElements expected spatial output, server returned ' + + result.configuration.outputFormat, + 'PARSE_OUTPUT_MISMATCH', + { configuration: result.configuration as unknown as Record }, + ); + } + return result.output.elements; + } } diff --git a/src/http.ts b/src/http.ts index 2dfb051..fd0d0c0 100644 --- a/src/http.ts +++ b/src/http.ts @@ -158,6 +158,34 @@ function prepareRequestBody; + const { file, instructions } = typedConfig.data; + + if (file) { + // Multipart upload: file + JSON instructions + const formData = new FormData(); + appendFileToFormData(formData, 'file', file); + if (instructions && Object.keys(instructions).length > 0) { + formData.append('instructions', JSON.stringify(instructions), { + contentType: 'application/json', + }); + } + axiosConfig.data = formData; + axiosConfig.headers = { + ...axiosConfig.headers, + ...formData.getHeaders(), + }; + } else { + // URL-only request → JSON body + axiosConfig.data = instructions; + axiosConfig.headers = { + ...axiosConfig.headers, + 'Content-Type': 'application/json', + }; + } + return axiosConfig; } } diff --git a/src/index.ts b/src/index.ts index 65f091d..acae9e9 100644 --- a/src/index.ts +++ b/src/index.ts @@ -36,6 +36,38 @@ export type { OutputTypeMap, TypedWorkflowResult, WorkflowDryRunResult, + + // Data Extraction (`/extraction/parse`) types + ParseMode, + ParseOutputFormat, + ParseOutputOptions, + ParseProcessingOptions, + ParseInstructions, + ParseOptions, + ParseResponse, + ParseResponseSpatial, + 
ParseResponseMarkdown, + ParseElement, + ParagraphElement, + ParagraphRole, + FormulaElement, + PictureElement, + TableElement, + ParseTableCell, + KeyValueRegionElement, + KeyValuePair, + KeyValueEntity, + HandwritingElement, + ParseElementBase, + ParseBounds, + ParsePageRef, + ParseWord, + ParseMetrics, + ParseUsage, + ParseConfiguration, + ParseErrorResponse, + ParseErrorDetails, + ParseErrorFailingPath, } from './types'; // Utility exports From e110ab9794f683ba1f89a90e1e46b9b3bba04888 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 11:36:48 +1200 Subject: [PATCH 04/13] test(parse): cover request shape, modes, output formats, and error paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 19 unit tests around the new /extraction/parse surface: - Request shape: multipart vs JSON, apiVersion header forwarding, option serialisation (language, output, includeWords), default behaviour. - Mode coverage: all four modes (text, structure, understand, agentic) round-trip through the instructions payload. - Output coverage: spatial elements and whole-document Markdown variants validated end-to-end, including extraction-credit accounting on the response (data_extraction_credits, not processor credits). - Error paths: HTTP-layer ValidationError propagation, file-input preflight failures surfaced before the request leaves the process. - Convenience wrappers: parseToMarkdown and parseElements default modes and includeWords forwarding, plus defensive output-mismatch errors. Adds examples/src/parse_smoke.ts — a live operator-runnable smoke test that prints a parsed summary plus extraction-credit usage. Documents the build/pack/install/run recipe in the file header. 
--- examples/src/parse_smoke.ts | 163 +++++++++++++ src/__tests__/unit/parse.test.ts | 406 +++++++++++++++++++++++++++++++ 2 files changed, 569 insertions(+) create mode 100644 examples/src/parse_smoke.ts create mode 100644 src/__tests__/unit/parse.test.ts diff --git a/examples/src/parse_smoke.ts b/examples/src/parse_smoke.ts new file mode 100644 index 0000000..2f24672 --- /dev/null +++ b/examples/src/parse_smoke.ts @@ -0,0 +1,163 @@ +/** + * Live smoke test for the Data Extraction API (`POST /extraction/parse`). + * + * Invocation (from the `examples/` directory, after building the package): + * + * # 1. Build + pack the parent package + * cd .. + * npm install && npm run build && npm pack + * + * # 2. Install the packed tarball into examples/ + * cd examples + * npm install + * + * # 3. Run the smoke test + * NUTRIENT_API_KEY=pdf_live_... npx tsx src/parse_smoke.ts + * + * Optional environment: + * NUTRIENT_PARSE_INPUT — Local file path or https:// URL to parse. + * Defaults to `assets/sample.pdf`. + * NUTRIENT_PARSE_MODE — `text` | `structure` | `understand` | `agentic`. Defaults to `text`. + * NUTRIENT_PARSE_OUTPUT — `markdown` | `spatial`. Server default depends on mode. + * NUTRIENT_PARSE_INCLUDE_WORDS — Truthy value enables word-level OCR data in spatial output. + * NUTRIENT_PARSE_BASE_URL — Override base URL (e.g. staging). + * + * The script prints a short summary plus the first elements / first 800 chars of + * markdown for inspection. It exits non-zero on failure. It is intentionally + * read-only: it does not write files, push branches, or modify the worktree. + * + * Billing note: every call against `/extraction/parse` debits the account's + * **extraction credits** bucket (separate from the **processor API credits** + * used by `/build`, `/sign`, OCR, etc.). The cheapest mode (`text`) costs + * 1 extraction credit per page, so this smoke test against a 1-page PDF + * costs 1 extraction credit. 
+ */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { NutrientClient } from '@nutrient-sdk/dws-client-typescript'; +import type { + ParseMode, + ParseOptions, + ParseOutputFormat, +} from '@nutrient-sdk/dws-client-typescript'; + +function envBool(name: string): boolean { + const v = process.env[name]; + return v !== undefined && v !== '' && v !== '0' && v.toLowerCase() !== 'false'; +} + +function parseModeOrDie(value: string | undefined, fallback: ParseMode): ParseMode { + const candidate = value ?? fallback; + const allowed: ParseMode[] = ['text', 'structure', 'understand', 'agentic']; + if (!allowed.includes(candidate as ParseMode)) { + console.error(`Invalid NUTRIENT_PARSE_MODE='${candidate}'. Allowed: ${allowed.join(', ')}`); + process.exit(2); + } + return candidate as ParseMode; +} + +function parseOutputOrUndefined(value: string | undefined): ParseOutputFormat | undefined { + if (value === undefined || value === '') return undefined; + if (value === 'markdown' || value === 'spatial') return value; + console.error(`Invalid NUTRIENT_PARSE_OUTPUT='${value}'. Allowed: markdown, spatial`); + process.exit(2); +} + +async function main(): Promise { + const apiKey = process.env['NUTRIENT_API_KEY']; + if (!apiKey) { + console.error('Error: NUTRIENT_API_KEY is not set. Export your DWS API key before running:'); + console.error(' export NUTRIENT_API_KEY=pdf_live_...'); + process.exit(1); + } + + const inputArg = + process.env['NUTRIENT_PARSE_INPUT'] ?? 
path.resolve(__dirname, '..', 'assets', 'sample.pdf'); + const isUrl = /^https?:\/\//i.test(inputArg); + if (!isUrl && !fs.existsSync(inputArg)) { + console.error(`Input file not found: ${inputArg}`); + process.exit(1); + } + + const mode = parseModeOrDie(process.env['NUTRIENT_PARSE_MODE'], 'text'); + const outputFormat = parseOutputOrUndefined(process.env['NUTRIENT_PARSE_OUTPUT']); + const includeWords = envBool('NUTRIENT_PARSE_INCLUDE_WORDS'); + + const client = new NutrientClient({ + apiKey, + ...(process.env['NUTRIENT_PARSE_BASE_URL'] + ? { baseUrl: process.env['NUTRIENT_PARSE_BASE_URL'] } + : {}), + }); + + const options: ParseOptions = { mode }; + if (outputFormat !== undefined) { + options.output = { format: outputFormat }; + if (outputFormat === 'spatial') options.output.includeWords = includeWords; + } + + console.log('--- /extraction/parse smoke test ---'); + console.log('Input:', inputArg); + console.log('Mode:', mode); + console.log('Output:', options.output ?? ''); + console.log(''); + + const started = Date.now(); + const result = await client.parse(inputArg, options); + const elapsed = Date.now() - started; + + console.log('--- response summary ---'); + console.log('Wall time: ', `${elapsed}ms`); + console.log('Server processing time (ms): ', result.metrics.processingTimeMs); + console.log('Pages processed: ', result.metrics.pagesProcessed); + console.log('Configured mode: ', result.configuration.mode); + console.log('Configured output format: ', result.configuration.outputFormat); + console.log('Extraction credits used: ', result.usage?.data_extraction_credits?.cost); + console.log( + 'Extraction credits remaining: ', + result.usage?.data_extraction_credits?.remainingCredits, + ); + console.log('Request ID: ', result.requestId); + console.log(''); + + if (result.output.markdown !== undefined) { + const md = result.output.markdown; + console.log('--- markdown (first 800 chars) ---'); + console.log(md.length > 800 ? 
md.slice(0, 800) + '\n…[truncated]' : md); + } else { + const elements = result.output.elements; + console.log(`--- spatial elements (${elements.length} total) ---`); + for (const el of elements.slice(0, 5)) { + const summary: Record = { + type: el.type, + id: el.id, + page: el.page.pageIndex, + confidence: el.confidence, + }; + if (el.type === 'paragraph' || el.type === 'handwriting') { + summary['textPreview'] = el.text.slice(0, 80); + } else if (el.type === 'table') { + summary['rowsXcols'] = `${el.rowCount}x${el.columnCount}`; + } else if (el.type === 'formula') { + summary['latex'] = el.latex; + } else if (el.type === 'picture') { + summary['classification'] = el.classification; + } else if (el.type === 'keyValueRegion') { + summary['pairs'] = el.pairs.length; + } + console.log(summary); + } + if (elements.length > 5) console.log(`… ${elements.length - 5} more elements`); + } +} + +main().catch((err: unknown) => { + const e = err as { message?: string; statusCode?: number; details?: unknown; code?: string }; + console.error('--- /extraction/parse smoke test FAILED ---'); + console.error('Message: ', e.message ?? 
String(err)); + if (e.code !== undefined) console.error('Error code: ', e.code); + if (e.statusCode !== undefined) console.error('HTTP status:', e.statusCode); + if (e.details !== undefined) console.error('Details: ', e.details); + process.exit(1); +}); diff --git a/src/__tests__/unit/parse.test.ts b/src/__tests__/unit/parse.test.ts new file mode 100644 index 0000000..1e48b78 --- /dev/null +++ b/src/__tests__/unit/parse.test.ts @@ -0,0 +1,406 @@ +import { NutrientClient } from '../../client'; +import type { + ParseResponseMarkdown, + ParseResponseSpatial, + ParagraphElement, + TableElement, +} from '../../types'; +import { NutrientError, ValidationError } from '../../errors'; +import * as inputsModule from '../../inputs'; +import * as httpModule from '../../http'; + +jest.mock('../../inputs'); +jest.mock('../../http'); + +const mockSendRequest = httpModule.sendRequest as jest.MockedFunction< + typeof httpModule.sendRequest +>; +const mockProcessFileInput = inputsModule.processFileInput as jest.MockedFunction< + typeof inputsModule.processFileInput +>; +const mockGetRemoteUrl = inputsModule.getRemoteUrl as jest.MockedFunction< + typeof inputsModule.getRemoteUrl +>; + +const sampleSpatialResponse: ParseResponseSpatial = { + status: 200, + requestId: 'req_e5f6g7h8', + output: { + elements: [ + { + id: 'a1b2c3d4-1111-4000-8000-000000000001', + type: 'paragraph', + role: 'Title', + text: 'Quarterly Report', + confidence: 0.95, + readingOrder: 0, + bounds: { x: 200, y: 139, width: 1111, height: 97 }, + page: { pageIndex: 0, pageNumber: 1, width: 1700, height: 2200 }, + } satisfies ParagraphElement, + { + id: 'a1b2c3d4-2222-4000-8000-000000000002', + type: 'table', + rowCount: 2, + columnCount: 2, + cells: [ + { + id: 'c-001', + bounds: { x: 100, y: 200, width: 200, height: 50 }, + confidence: 0.92, + row: 0, + column: 0, + rowSpan: 1, + colSpan: 1, + text: 'Region', + }, + ], + confidence: 0.92, + readingOrder: 1, + bounds: { x: 100, y: 200, width: 600, height: 200 }, + 
page: { pageIndex: 0, pageNumber: 1, width: 1700, height: 2200 }, + } satisfies TableElement, + ], + }, + metrics: { processingTimeMs: 4200, pagesProcessed: 1 }, + usage: { data_extraction_credits: { cost: 1.5, remainingCredits: 850 } }, + configuration: { mode: 'structure', outputFormat: 'spatial' }, +}; + +const sampleMarkdownResponse: ParseResponseMarkdown = { + status: 200, + requestId: 'req_a1b2c3d4', + output: { markdown: '# Document Title\n\nFirst paragraph.' }, + metrics: { processingTimeMs: 312, pagesProcessed: 1 }, + usage: { data_extraction_credits: { cost: 1, remainingCredits: 849 } }, + configuration: { mode: 'text', outputFormat: 'markdown' }, +}; + +const normalizedFile = { + data: Buffer.from('%PDF-1.4 fake'), + filename: 'doc.pdf', +}; + +function makeClient(): NutrientClient { + return new NutrientClient({ apiKey: 'test-key' }); +} + +describe('NutrientClient.parse()', () => { + beforeEach(() => { + jest.clearAllMocks(); + mockProcessFileInput.mockResolvedValue(normalizedFile); + mockGetRemoteUrl.mockReturnValue(null); + }); + + describe('request shape', () => { + it('sends a multipart POST to /extraction/parse for a local file', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + await makeClient().parse('document.pdf', { mode: 'structure' }); + + expect(mockSendRequest).toHaveBeenCalledTimes(1); + const call = mockSendRequest.mock.calls[0]?.[0] as { + method: string; + endpoint: string; + data: { instructions: { mode?: string }; file?: unknown }; + }; + expect(call.method).toBe('POST'); + expect(call.endpoint).toBe('/extraction/parse'); + expect(call.data.file).toBe(normalizedFile); + expect(call.data.instructions).toEqual({ mode: 'structure' }); + }); + + it('sends a JSON POST to /extraction/parse for a URL input', async () => { + mockGetRemoteUrl.mockReturnValue('https://example.com/doc.pdf'); + mockSendRequest.mockResolvedValue({ + data: 
sampleMarkdownResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + await makeClient().parse('https://example.com/doc.pdf', { mode: 'text' }); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + method: string; + endpoint: string; + data: { instructions: { url?: string; mode?: string }; file?: unknown }; + }; + expect(call.endpoint).toBe('/extraction/parse'); + expect(call.data.file).toBeUndefined(); + expect(call.data.instructions).toEqual({ + mode: 'text', + url: 'https://example.com/doc.pdf', + }); + expect(mockProcessFileInput).not.toHaveBeenCalled(); + }); + + it('forwards the apiVersion option as x-nutrient-api-version', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + await makeClient().parse('document.pdf', { + mode: 'understand', + apiVersion: '2026-05-25', + }); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + headers?: Record<string, string>; + }; + expect(call.headers).toEqual({ 'x-nutrient-api-version': '2026-05-25' }); + }); + + it('serialises language and output options into instructions', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + await makeClient().parse('document.pdf', { + mode: 'understand', + output: { format: 'spatial', includeWords: true }, + language: ['eng', 'spa'], + }); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + data: { + instructions: { + mode?: string; + output?: { format: string; includeWords?: boolean }; + options?: { language?: unknown }; + }; + }; + }; + expect(call.data.instructions).toEqual({ + mode: 'understand', + output: { format: 'spatial', includeWords: true }, + options: { language: ['eng', 'spa'] }, + }); + }); + + it('omits optional fields when not provided', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', 
+ headers: {}, + } as never); + + await makeClient().parse('document.pdf'); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + data: { instructions: object }; + headers?: Record<string, string>; + }; + expect(call.data.instructions).toEqual({}); + expect(call.headers).toBeUndefined(); + }); + }); + + describe('mode coverage', () => { + const modes = ['text', 'structure', 'understand', 'agentic'] as const; + + it.each(modes)('serialises mode=%s into instructions', async (mode) => { + mockSendRequest.mockResolvedValue({ + data: + mode === 'text' + ? sampleMarkdownResponse + : { ...sampleSpatialResponse, configuration: { mode, outputFormat: 'spatial' } }, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const result = await makeClient().parse('document.pdf', { mode }); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + data: { instructions: { mode?: string } }; + }; + expect(call.data.instructions.mode).toBe(mode); + // The mocked response echoes the configured mode, so result.configuration.mode + // round-trips correctly for downstream branching. 
+ expect(result.configuration.mode).toBe(mode); + }); + }); + + describe('output-shape coverage', () => { + it('returns spatial elements when configuration.outputFormat is spatial', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const result = (await makeClient().parse('document.pdf', { + mode: 'structure', + output: { format: 'spatial' }, + })) as ParseResponseSpatial; + + expect(result.configuration.outputFormat).toBe('spatial'); + expect(Array.isArray(result.output.elements)).toBe(true); + expect(result.output.elements[0]?.type).toBe('paragraph'); + }); + + it('returns whole-document Markdown when configuration.outputFormat is markdown', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleMarkdownResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const result = (await makeClient().parse('document.pdf', { + mode: 'text', + output: { format: 'markdown' }, + })) as ParseResponseMarkdown; + + expect(result.configuration.outputFormat).toBe('markdown'); + expect(result.output.markdown.startsWith('# ')).toBe(true); + }); + + it('surfaces extraction-credit usage (not processor credits)', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const result = await makeClient().parse('document.pdf'); + // The field name `data_extraction_credits` is the explicit billing-bucket + // marker so callers cannot confuse it with processor API credits. + expect(result.usage?.data_extraction_credits?.cost).toBe(1.5); + expect(result.usage?.data_extraction_credits?.remainingCredits).toBe(850); + }); + }); + + describe('error paths', () => { + it('propagates ValidationError from the HTTP layer (e.g. 
400 invalid mode)', async () => { + mockSendRequest.mockRejectedValue( + new ValidationError('The request is malformed', { + errorDetails: { source: 'request', code: 'invalid_request' }, + }), + ); + + await expect( + makeClient().parse('document.pdf', { mode: 'understand' }), + ).rejects.toBeInstanceOf(ValidationError); + }); + + it('propagates errors raised by the file input layer', async () => { + mockProcessFileInput.mockRejectedValue( + new ValidationError('File not found: missing.pdf', { filePath: 'missing.pdf' }), + ); + + await expect(makeClient().parse('missing.pdf')).rejects.toBeInstanceOf(ValidationError); + expect(mockSendRequest).not.toHaveBeenCalled(); + }); + }); +}); + +describe('NutrientClient.parseToMarkdown()', () => { + beforeEach(() => { + jest.clearAllMocks(); + mockProcessFileInput.mockResolvedValue(normalizedFile); + mockGetRemoteUrl.mockReturnValue(null); + }); + + it('returns the markdown string and defaults to mode=text', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleMarkdownResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const md = await makeClient().parseToMarkdown('document.pdf'); + expect(md).toBe('# Document Title\n\nFirst paragraph.'); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + data: { instructions: { mode?: string; output?: { format: string } } }; + }; + expect(call.data.instructions.mode).toBe('text'); + expect(call.data.instructions.output).toEqual({ format: 'markdown' }); + }); + + it('throws NutrientError on output mismatch (defensive)', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, // server returned spatial despite our markdown ask + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + await expect(makeClient().parseToMarkdown('document.pdf')).rejects.toBeInstanceOf( + NutrientError, + ); + }); +}); + +describe('NutrientClient.parseElements()', () => { + beforeEach(() => { + jest.clearAllMocks(); + 
mockProcessFileInput.mockResolvedValue(normalizedFile); + mockGetRemoteUrl.mockReturnValue(null); + }); + + it('returns the elements array and defaults to mode=structure', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const elements = await makeClient().parseElements('document.pdf'); + expect(elements).toHaveLength(2); + expect(elements[0]?.type).toBe('paragraph'); + expect(elements[1]?.type).toBe('table'); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + data: { + instructions: { mode?: string; output?: { format: string; includeWords?: boolean } }; + }; + }; + expect(call.data.instructions.mode).toBe('structure'); + expect(call.data.instructions.output).toEqual({ format: 'spatial', includeWords: false }); + }); + + it('forwards includeWords=true into the request', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + await makeClient().parseElements('document.pdf', 'understand', true); + + const call = mockSendRequest.mock.calls[0]?.[0] as { + data: { instructions: { output?: { includeWords?: boolean } } }; + }; + expect(call.data.instructions.output?.includeWords).toBe(true); + }); + + it('throws NutrientError when the server returned markdown instead of spatial', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleMarkdownResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + await expect(makeClient().parseElements('document.pdf')).rejects.toBeInstanceOf(NutrientError); + }); +}); From c8b5dcc1515032aa7c85178187fde3db31e635ed Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 11:37:01 +1200 Subject: [PATCH 05/13] docs: document /extraction/parse surface and extraction-credit billing - README: new "Data Extraction (/extraction/parse)" section with mode/ credit table, request examples for spatial + 
Markdown outputs, URL input, convenience wrappers, and a pointer to the smoke example. - docs/METHODS.md: new entries for parse, parseToMarkdown, parseElements inserted alongside the existing extract* convenience methods. - LLM_DOC.md: inject the same three method signatures so coding agents steered by this rule file know about parse and the extraction-credits bucket. - CHANGELOG.md: Unreleased entry covering the new client surface, the newly-exported public types, the live smoke script, and an explicit call-out that /extraction/parse bills against extraction credits (separate from processor API credits). Every doc surface that mentions cost says "extraction credits" explicitly so downstream readers cannot conflate the two billing buckets. --- CHANGELOG.md | 23 ++++++++++++++- LLM_DOC.md | 47 +++++++++++++++++++++++++++++++ README.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ docs/METHODS.md | 65 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 208 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 39e5b9c..b7cdd5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -_Nothing yet._ +### Added + +- First-class client support for the Data Extraction API (`POST /extraction/parse`). + - `NutrientClient.parse(input, options?)` — full request/response surface with + typed support for all four modes (`text`, `structure`, `understand`, `agentic`) + and both output formats (`spatial`, `markdown`). + - `NutrientClient.parseToMarkdown(input, mode?)` — convenience wrapper returning + the whole-document Markdown string directly. + - `NutrientClient.parseElements(input, mode?, includeWords?)` — convenience + wrapper returning the spatial elements array directly. 
+ - Public types: `ParseMode`, `ParseOutputFormat`, `ParseOutputOptions`, + `ParseInstructions`, `ParseOptions`, `ParseResponse`, `ParseResponseSpatial`, + `ParseResponseMarkdown`, `ParseElement` (and per-element types like + `ParagraphElement`, `TableElement`, `KeyValueRegionElement`), plus error and + metadata shapes (`ParseErrorResponse`, `ParseMetrics`, `ParseUsage`, + `ParseConfiguration`). + - Live smoke script at `scripts/smoke_parse.ts` for verifying against the + production endpoint. + - Billing note: `/extraction/parse` debits the account's **extraction + credits** bucket, which is separate from the **processor API credits** used + by the rest of `NutrientClient`. The response surfaces this explicitly in + `usage.data_extraction_credits`. ## [2.0.0] - 2026-01-27 diff --git a/LLM_DOC.md b/LLM_DOC.md index 620364d..ead5f6d 100644 --- a/LLM_DOC.md +++ b/LLM_DOC.md @@ -461,6 +461,53 @@ if (kvps && kvps.length > 0) { } ``` +#### parse(input, options?) +Extracts structured content from a document via the Data Extraction API (`POST /extraction/parse`). + +Billed against **extraction credits** (a separate bucket from processor API credits used by every other method). 
Mode costs per page: +- `text` — 1 extraction credit (Markdown only) +- `structure` — 1.5 extraction credits (spatial elements) +- `understand` — 9 extraction credits (default) +- `agentic` — 18 extraction credits + +```typescript +// Full call: spatial elements with bounding boxes, confidence, reading order +const result = await client.parse('invoice.pdf', { + mode: 'understand', + output: { format: 'spatial', includeWords: true }, + language: ['eng', 'spa'], +}); + +if (result.output.elements !== undefined) { + for (const el of result.output.elements) { + if (el.type === 'paragraph') console.log(el.text); + } +} + +// Extraction-credit accounting (separate from processor credits): +console.log(result.usage?.data_extraction_credits?.cost); + +// URL input (server fetches the URL): +const remote = await client.parse('https://example.com/doc.pdf', { mode: 'text' }); +``` + +#### parseToMarkdown(input, mode?) +Convenience wrapper that returns just the whole-document Markdown string. Defaults to `mode='text'` (cheapest, 1 extraction credit/page). + +```typescript +const markdown = await client.parseToMarkdown('document.pdf'); +const richer = await client.parseToMarkdown('scan.pdf', 'understand'); +``` + +#### parseElements(input, mode?, includeWords?) +Convenience wrapper that returns just the array of spatial elements. Defaults to `mode='structure'`. Cannot use `mode='text'`. + +```typescript +const elements = await client.parseElements('document.pdf'); +const tables = elements.filter(e => e.type === 'table'); +const withWords = await client.parseElements('scan.pdf', 'understand', true); +``` + #### flatten(file, annotationIds?) Flattens annotations in a PDF document. diff --git a/README.md b/README.md index a760c6d..14050d8 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,80 @@ const mergedPdf = await client.merge(['doc1.pdf', 'doc2.pdf', 'doc3.pdf']); For a complete list of available methods with examples, see the [Methods Documentation](docs/METHODS.md). 
+## Data Extraction (`/extraction/parse`) + +In addition to the document-processing endpoints, the client supports the +[Data Extraction API](https://www.nutrient.io/api/reference/data-extraction/public/) +for extracting structured content from PDFs, images, and Office files. + +**Billing**: `/extraction/parse` is billed against the account's **extraction +credits** bucket, which is **separate** from the **processor API credits** +consumed by `convert`, `ocr`, `sign`, `merge`, and the other endpoints. The +two buckets never debit each other. + +Four processing modes, each with its own extraction-credit cost per page: + +| Mode | Cost (per page) | Use case | +| ------------ | ---------------------- | ------------------------------------------------------------------------------ | +| `text` | 1 extraction credit | Fast Markdown extraction from born-digital documents. No OCR. | +| `structure` | 1.5 extraction credits | OCR-backed structured extraction with spatial elements and bounding boxes. | +| `understand` | 9 extraction credits | AI-augmented parsing for complex layouts, OCR correction, formulas. (Default.) | +| `agentic` | 18 extraction credits | VLM-augmented extraction for the deepest visual understanding. | + +Two output formats: `spatial` (typed elements + bounds + confidence + reading +order, default for non-text modes) or `markdown` (whole-document Markdown, ideal +for RAG, default for `text`). + +```typescript +import { NutrientClient } from '@nutrient-sdk/dws-client-typescript'; + +const client = new NutrientClient({ apiKey: process.env.NUTRIENT_API_KEY! }); + +// Full /extraction/parse call with explicit mode + output. 
+const result = await client.parse('invoice.pdf', { + mode: 'understand', + output: { format: 'spatial', includeWords: true }, + language: ['eng', 'spa'], +}); + +// Narrow on the present field for type-safe access: +if (result.output.elements !== undefined) { + for (const el of result.output.elements) { + if (el.type === 'paragraph') console.log(el.text); + if (el.type === 'table') console.log(`${el.rowCount}x${el.columnCount} table`); + } +} + +// Extraction-credit accounting is returned per request: +console.log( + 'Extraction credits used:', + result.usage?.data_extraction_credits?.cost, +); +console.log( + 'Extraction credits remaining:', + result.usage?.data_extraction_credits?.remainingCredits, +); + +// Convenience: get just the Markdown (cheapest mode by default). +const markdown = await client.parseToMarkdown('document.pdf'); + +// Convenience: get just the spatial elements. +const elements = await client.parseElements('scan.pdf', 'understand'); + +// URL input works the same way — the server fetches the URL. +const remote = await client.parse('https://example.com/document.pdf', { + mode: 'text', +}); +``` + +The full set of public types — `ParseMode`, `ParseElement`, `ParagraphElement`, +`TableElement`, `KeyValueRegionElement`, `ParseResponse`, etc. — is exported +from the package root for downstream typing. + +A live smoke script that calls the real endpoint with a sample PDF and prints a +parsed summary is available at `examples/src/parse_smoke.ts`. See the file +header for the run recipe. + ## Workflow System diff --git a/docs/METHODS.md b/docs/METHODS.md index 7fbd578..04da5bf 100644 --- a/docs/METHODS.md +++ b/docs/METHODS.md @@ -455,6 +455,71 @@ if (kvps && kvps.length > 0) { } ``` +##### parse(input, options?) +Extracts structured content via the Data Extraction API (`POST /extraction/parse`). 
+ +This is the cleanest entry point for parsing PDFs, images, and Office files into either: +- Spatial elements (typed components — paragraphs, tables, formulas, pictures, key-value regions, handwriting — with bounding boxes, confidence scores, and reading order), or +- Whole-document Markdown (ideal for RAG and search indexing). + +**Billing**: `/extraction/parse` is billed against the account's **extraction credits** bucket, separate from the **processor API credits** used by every other method in this client. Per-page cost depends on the mode: + +| Mode | Extraction credits per page | +| ------------ | --------------------------- | +| `text` | 1 | +| `structure` | 1.5 | +| `understand` | 9 (default) | +| `agentic` | 18 | + +```typescript +// Cheapest mode: born-digital PDF → Markdown for a RAG pipeline. +const md = await client.parse('invoice.pdf', { mode: 'text' }); +if (md.output.markdown !== undefined) { + console.log(md.output.markdown); +} + +// OCR-backed spatial extraction. +const spatial = await client.parse('scan.pdf', { + mode: 'structure', + output: { format: 'spatial', includeWords: true }, + language: ['eng', 'spa'], +}); +if (spatial.output.elements !== undefined) { + for (const el of spatial.output.elements) { + if (el.type === 'paragraph') console.log(el.text); + } +} + +// URL input (the server fetches the URL — no client-side download needed). +const remote = await client.parse('https://example.com/document.pdf'); + +// Extraction-credit accounting: +console.log(remote.usage?.data_extraction_credits?.cost); +console.log(remote.usage?.data_extraction_credits?.remainingCredits); +``` + +##### parseToMarkdown(input, mode?) +Convenience wrapper that calls `parse()` with `output.format = 'markdown'` and returns the Markdown string directly. Defaults to `mode='text'` (1 extraction credit/page). 
+ +```typescript +const markdown = await client.parseToMarkdown('document.pdf'); +const richer = await client.parseToMarkdown('scan.pdf', 'understand'); +``` + +##### parseElements(input, mode?, includeWords?) +Convenience wrapper that calls `parse()` with `output.format = 'spatial'` and returns the elements array directly. Defaults to `mode='structure'` (1.5 extraction credits/page). Pass `mode='text'` is rejected at compile time since `text` mode does not produce spatial output. + +```typescript +const elements = await client.parseElements('document.pdf'); + +// Get word-level OCR data nested inside paragraphs and table cells. +const withWords = await client.parseElements('scan.pdf', 'understand', true); + +// Filter by element type. +const tables = elements.filter(e => e.type === 'table'); +const paragraphs = elements.filter(e => e.type === 'paragraph'); +``` + ##### flatten(file, annotationIds?) Flattens annotations in a PDF document. From 17e419e670a8ff27c472ddd2829441521d3ca1a6 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 12:01:29 +1200 Subject: [PATCH 06/13] docs: fix smoke script path and parseElements doc fragment - CHANGELOG: correct path to live smoke script - METHODS.md: fix dangling sentence on parseElements compile-time guard --- CHANGELOG.md | 2 +- docs/METHODS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7cdd5b..bc38355 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `ParagraphElement`, `TableElement`, `KeyValueRegionElement`), plus error and metadata shapes (`ParseErrorResponse`, `ParseMetrics`, `ParseUsage`, `ParseConfiguration`). - - Live smoke script at `scripts/smoke_parse.ts` for verifying against the + - Live smoke script at `examples/src/parse_smoke.ts` for verifying against the production endpoint. 
- Billing note: `/extraction/parse` debits the account's **extraction credits** bucket, which is separate from the **processor API credits** used diff --git a/docs/METHODS.md b/docs/METHODS.md index 04da5bf..26251ee 100644 --- a/docs/METHODS.md +++ b/docs/METHODS.md @@ -507,7 +507,7 @@ const richer = await client.parseToMarkdown('scan.pdf', 'understand'); ``` ##### parseElements(input, mode?, includeWords?) -Convenience wrapper that calls `parse()` with `output.format = 'spatial'` and returns the elements array directly. Defaults to `mode='structure'` (1.5 extraction credits/page). Pass `mode='text'` is rejected at compile time since `text` mode does not produce spatial output. +Convenience wrapper that calls `parse()` with `output.format = 'spatial'` and returns the elements array directly. Defaults to `mode='structure'` (1.5 extraction credits/page). Passing `mode='text'` is rejected at compile time since `text` mode does not produce spatial output. ```typescript const elements = await client.parseElements('document.pdf'); From 217e98e9b75e76badea7fa7be558058512c04363 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 20:02:37 +1200 Subject: [PATCH 07/13] refactor(types): extract ExtractionCredits to dedicated module Factor the inline extraction-credit billing shape out of ParseUsage into a standalone ExtractionCredits interface in src/types/extraction_credits.ts, mirroring the Python client's type-factoring approach. ParseUsage.data_extraction_credits now references ExtractionCredits instead of an anonymous inline type, making the billing object reusable if future endpoints surface the same shape. ExtractionCredits is re-exported from the package root alongside the other parse types. 
--- src/index.ts | 1 + src/types/extraction_credits.ts | 18 ++++++++++++++++++ src/types/index.ts | 1 + src/types/parse.ts | 13 +++++++------ 4 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 src/types/extraction_credits.ts diff --git a/src/index.ts b/src/index.ts index acae9e9..40af1d6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -38,6 +38,7 @@ export type { WorkflowDryRunResult, // Data Extraction (`/extraction/parse`) types + ExtractionCredits, ParseMode, ParseOutputFormat, ParseOutputOptions, diff --git a/src/types/extraction_credits.ts b/src/types/extraction_credits.ts new file mode 100644 index 0000000..1dbcd31 --- /dev/null +++ b/src/types/extraction_credits.ts @@ -0,0 +1,18 @@ +/** + * Extraction-credit usage returned by the Data Extraction API + * (`POST /extraction/parse`). + * + * **Extraction credits** are a separate billing bucket from the + * **processor API credits** consumed by `/build`, `/sign`, OCR, and + * every other endpoint on `NutrientClient`. An extraction call never + * debits processor credits and vice-versa. + * + * The server surfaces this object at + * `ParseResponse.usage.data_extraction_credits`. + */ +export interface ExtractionCredits { + /** Extraction credits consumed by this request. */ + cost: number; + /** Remaining extraction credits in the account after this request. 
*/ + remainingCredits: number; +} diff --git a/src/types/index.ts b/src/types/index.ts index 85684b3..b47b66e 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -2,6 +2,7 @@ export * from './common'; export * from './inputs'; export * from './workflow'; export * from './http'; +export * from './extraction_credits'; export * from './parse'; // Re-export generated types for convenience export type { components, operations, paths } from '../generated/api-types'; diff --git a/src/types/parse.ts b/src/types/parse.ts index d13ef9c..4b67db4 100644 --- a/src/types/parse.ts +++ b/src/types/parse.ts @@ -1,3 +1,7 @@ +import type { ExtractionCredits } from './extraction_credits'; + +export type { ExtractionCredits }; + /** * Type definitions for the Nutrient Data Extraction API (`POST /extraction/parse`). * @@ -278,14 +282,11 @@ export interface ParseMetrics { * * **Extraction credits** are a separate billing bucket from processor API credits; * an extraction call never debits processor credits and vice-versa. + * + * See {@link ExtractionCredits} for the shape of the billing object. */ export interface ParseUsage { - data_extraction_credits?: { - /** Extraction credits consumed by this request. */ - cost: number; - /** Remaining extraction credits in the account. */ - remainingCredits: number; - }; + data_extraction_credits?: ExtractionCredits; } /** From 0fd9776df2d9d704efc5e43d24efa571949c5d2f Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 20:02:47 +1200 Subject: [PATCH 08/13] docs(client): rewrite parse() JSDoc with use-case-first framing Lead with the "Designed for" preamble naming the three canonical workflows (RAG/search indexing, form/invoice extraction, layout-aware understanding) before describing modes and output formats. Broaden the @param input description to explicitly mention non-PDF inputs (Office documents, images), matching the actual endpoint capability rather than implying PDF-only like sign(). 
Update the @example block to show a form/invoice extraction recipe alongside the RAG recipe, and replace the generic paragraph-walk with a keyValueRegion traversal that a form-extraction caller can copy directly. --- src/client.ts | 92 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/src/client.ts b/src/client.ts index 18ac103..e77d2fd 100644 --- a/src/client.ts +++ b/src/client.ts @@ -1814,55 +1814,79 @@ export class NutrientClient { * Extracts structured content from a document via the Nutrient Data Extraction API * (`POST /extraction/parse`). * - * Four processing modes are available, each billed against the account's - * **extraction credits** bucket (a separate billing bucket from the - * **processor API credits** used by the rest of `NutrientClient`): - * - * - `text` — Plain text extraction. Markdown output only. 1 extraction credit/page. - * - `structure` — OCR-backed structured extraction with spatial elements. 1.5 extraction credits/page. - * - `understand` — Deeper document analysis with semantic enrichment. 9 extraction credits/page. (Default) - * - `agentic` — VLM-augmented extraction for complex documents. 18 extraction credits/page. - * - * Two output formats: - * - `spatial` (default for `structure`/`understand`/`agentic`) — Typed elements - * with bounds, confidence, reading order, and page refs. - * - `markdown` (default for `text`) — Whole-document Markdown for RAG and search. - * - * @param input - The document to parse. Accepts local files, buffers, streams, or a URL. - * @param options - Optional parse configuration (mode, output format, language, API version). - * @returns Promise resolving to the `/extraction/parse` response. Narrow on - * `output.markdown` / `output.elements` for type-safe field access, or - * read `configuration.outputFormat` for the server-resolved value. 
+ * Designed for **content-extraction workflows** where the goal is to feed document + * content into a downstream pipeline rather than render or transform the document: + * + * - **RAG / search indexing / content migration** — use `output.format: 'markdown'` + * to get a whole-document Markdown string ready for chunking, embedding, and + * indexing in a vector store or search engine. + * - **Form and invoice extraction** — use `output.format: 'spatial'` (default) to + * get a typed element list (paragraphs, tables, keyValueRegions, etc.) with + * bounding boxes and confidence scores per element. + * - **Layout-aware document understanding** — combine `mode: 'understand'` or + * `mode: 'agentic'` with spatial output for deep layout reconstruction and + * semantic classification, including agentic workflows. + * + * See the README's Data Extraction section for per-mode positioning, a + * "when to use which mode" table, and worked recipes (RAG ingestion, + * form/invoice extraction). + * + * **Billing**: billed against **extraction credits**, a separate bucket from the + * **processor API credits** used by every other method on this client. Per-page + * costs: `text` 1 cr, `structure` 1.5 cr, `understand` 9 cr, `agentic` 18 cr. + * + * @param input - The document to parse. Accepts local files (paths, Buffers, + * streams), or a URL string / `{ type: 'url', url: '...' }` object. The endpoint + * accepts a range of document formats — PDFs, Office documents (Word, Excel, + * PowerPoint), and images. Unlike `sign()`, parsing is not restricted to PDFs. + * @param options - Optional parse configuration: + * - `mode` — processing pipeline (`'text'` | `'structure'` | `'understand'` | `'agentic'`). + * - `output.format` — `'spatial'` for typed elements or `'markdown'` for Markdown. + * - `output.includeWords` — include word-level OCR data inside elements. + * - `language` — OCR language hint (string or array of ISO 639-2 codes). 
+ * - `apiVersion` — optional API-version header override. + * @returns Promise resolving to the full `/extraction/parse` response envelope. + * Narrow on `output.markdown` / `output.elements` for type-safe field access, + * or read `configuration.outputFormat` for the server-resolved value. + * Extraction-credit accounting is at `usage.data_extraction_credits`. * * @example * ```typescript - * // Whole-document Markdown for a RAG pipeline (cheapest mode). - * const md = await client.parse('invoice.pdf', { mode: 'text' }); + * // RAG ingestion — born-digital PDF → Markdown, cheapest path (1 cr/page). + * const md = await client.parse('whitepaper.pdf', { mode: 'text' }); * if (md.output.markdown !== undefined) { * console.log(md.output.markdown); * } * - * // Spatial elements for a born-OCR scan. - * const spatial = await client.parse('scan.pdf', { - * mode: 'structure', - * output: { format: 'spatial', includeWords: true }, - * language: ['eng', 'spa'], - * }); + * // Form/invoice extraction — spatial elements with bounds and confidence. + * const spatial = await client.parse('invoice.pdf', { mode: 'understand' }); * if (spatial.output.elements !== undefined) { * for (const el of spatial.output.elements) { - * if (el.type === 'paragraph') console.log(el.text); + * if (el.type === 'keyValueRegion') { + * for (const pair of el.pairs) { + * console.log(pair.key?.value, '→', pair.value?.value); + * } + * } * } * } * - * // Parse a remote document (URL input). + * // OCR-backed extraction with word-level data and multilingual hint. + * const scan = await client.parse('scan.pdf', { + * mode: 'structure', + * output: { format: 'spatial', includeWords: true }, + * language: ['eng', 'spa'], + * }); + * + * // URL input — the server fetches the document, no client-side download. * const remote = await client.parse('https://example.com/doc.pdf'); * - * // Cost reporting — extraction credits, not processor API credits. 
- * console.log('Extraction credits used:', remote.usage?.data_extraction_credits?.cost); + * // Extraction-credit accounting (separate from processor API credits). + * console.log('Credits used:', remote.usage?.data_extraction_credits?.cost); + * console.log('Credits left:', remote.usage?.data_extraction_credits?.remainingCredits); * - * // Or skip the discriminant entirely with the convenience wrappers: - * const justMarkdown = await client.parseToMarkdown('invoice.pdf'); - * const justElements = await client.parseElements('scan.pdf', 'understand'); + * // Convenience wrappers skip output-format discrimination entirely: + * const markdown = await client.parseToMarkdown('whitepaper.pdf'); + * const elements = await client.parseElements('invoice.pdf', 'understand'); * ``` */ async parse(input: FileInputWithUrl, options?: ParseOptions): Promise<ParseResponse> { From 8e2140418554b75c81c175ce5e28e366afceb5e4 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 27 May 2026 20:02:57 +1200 Subject: [PATCH 09/13] docs: rewrite Data Extraction section with use-case-first framing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructure the README's /extraction/parse section to lead with use cases (RAG ingestion, form/invoice extraction, layout-aware understanding) before the mode table and code, matching the Python client's documentation approach. Add: - "Choosing an output format" table (markdown vs spatial, with shape and best-for columns). - "Modes — when to use which" table with credit costs and decision guidance. - Two worked recipes: RAG ingestion (PDF → Markdown → embed) and form/invoice extraction (PDF → spatial elements → structured object), each with the convenience-wrapper alternative shown alongside. - Explicit note that the endpoint accepts PDFs, Office documents, and images — not PDFs only. - Mention of the new ExtractionCredits type in the exported-types list. 
Update METHODS.md parse/parseToMarkdown/parseElements entries to match: lead with use-case positioning, add a parameters table, align examples with the recipe pattern from the README. --- README.md | 164 +++++++++++++++++++++++++++++++++--------------- docs/METHODS.md | 104 ++++++++++++++++++++---------- 2 files changed, 186 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index 14050d8..3a2edc6 100644 --- a/README.md +++ b/README.md @@ -135,73 +135,139 @@ For a complete list of available methods with examples, see the [Methods Documen ## Data Extraction (`/extraction/parse`) -In addition to the document-processing endpoints, the client supports the -[Data Extraction API](https://www.nutrient.io/api/reference/data-extraction/public/) -for extracting structured content from PDFs, images, and Office files. +`client.parse()` exposes Nutrient's Data Extraction API. It's designed for +**content-extraction workflows** where you need to feed document content into a +downstream pipeline rather than render or transform the document itself: -**Billing**: `/extraction/parse` is billed against the account's **extraction -credits** bucket, which is **separate** from the **processor API credits** -consumed by `convert`, `ocr`, `sign`, `merge`, and the other endpoints. The -two buckets never debit each other. +- **RAG / search indexing / content migration** — pull a clean Markdown + representation of a document for chunking, embedding, and indexing in a + vector store or search engine. +- **Form and invoice extraction** — pull structured fields (key/value pairs, + tables, semantic regions) out of business documents with bounding boxes and + confidence scores attached to every element. +- **Layout-aware document understanding** — get a typed, page-anchored element + list (paragraphs with semantic roles, tables with cell spans, formulas in + LaTeX, pictures, handwriting) suitable for building document-comprehension + tooling, including agentic workflows. 
-Four processing modes, each with its own extraction-credit cost per page: +The endpoint accepts PDFs, Office documents (Word, Excel, PowerPoint), and +images. Unlike `sign()`, it is not restricted to PDFs. -| Mode | Cost (per page) | Use case | -| ------------ | ---------------------- | ------------------------------------------------------------------------------ | -| `text` | 1 extraction credit | Fast Markdown extraction from born-digital documents. No OCR. | -| `structure` | 1.5 extraction credits | OCR-backed structured extraction with spatial elements and bounding boxes. | -| `understand` | 9 extraction credits | AI-augmented parsing for complex layouts, OCR correction, formulas. (Default.) | -| `agentic` | 18 extraction credits | VLM-augmented extraction for the deepest visual understanding. | +### Choosing an output format -Two output formats: `spatial` (typed elements + bounds + confidence + reading -order, default for non-text modes) or `markdown` (whole-document Markdown, ideal -for RAG, default for `text`). +| Format | Best for | Shape | +| ------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | +| `markdown` | RAG, search indexing, content migration — anywhere structured text beats spatial data | `response.output.markdown` — a single Markdown string | +| `spatial` (default) | Form/invoice extraction, layout reconstruction, flows that need per-element confidence | `response.output.elements` — flat array of typed elements | + +### Quick start ```typescript import { NutrientClient } from '@nutrient-sdk/dws-client-typescript'; const client = new NutrientClient({ apiKey: process.env.NUTRIENT_API_KEY! }); -// Full /extraction/parse call with explicit mode + output. 
-const result = await client.parse('invoice.pdf', { - mode: 'understand', - output: { format: 'spatial', includeWords: true }, - language: ['eng', 'spa'], -}); - -// Narrow on the present field for type-safe access: +// Spatial elements (default) — paragraphs, tables, key-value regions, etc. +const result = await client.parse('contract.pdf', { mode: 'understand' }); if (result.output.elements !== undefined) { for (const el of result.output.elements) { - if (el.type === 'paragraph') console.log(el.text); if (el.type === 'table') console.log(`${el.rowCount}x${el.columnCount} table`); } } -// Extraction-credit accounting is returned per request: -console.log( - 'Extraction credits used:', - result.usage?.data_extraction_credits?.cost, -); -console.log( - 'Extraction credits remaining:', - result.usage?.data_extraction_credits?.remainingCredits, -); - -// Convenience: get just the Markdown (cheapest mode by default). -const markdown = await client.parseToMarkdown('document.pdf'); - -// Convenience: get just the spatial elements. -const elements = await client.parseElements('scan.pdf', 'understand'); - -// URL input works the same way — the server fetches the URL. -const remote = await client.parse('https://example.com/document.pdf', { - mode: 'text', -}); +// Whole-document Markdown from a born-digital PDF. +const mdResult = await client.parse('report.pdf', { mode: 'text' }); +if (mdResult.output.markdown !== undefined) { + console.log(mdResult.output.markdown); +} +``` + +### Modes — when to use which + +| Mode | Credits / page | When to use | +| ------------ | -------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `text` | 1 | Born-digital documents only. No OCR, no AI. Fastest and cheapest path to Markdown. | +| `structure` | 1.5 | OCR-based segmentation with bounding boxes. Handles scanned documents, images, and any input that requires OCR. 
| +| `understand` | 9 | Full pipeline with AI augmentation on top of OCR. Most accurate for tables, multi-column layouts, formulas, and forms. | +| `agentic` | 18 | Builds on `understand` and adds a vision-language model. Best for image descriptions and complex visual layouts. | + +### Recipes + +**RAG ingestion** — PDF → Markdown → chunks → embeddings → vector store: + +```typescript +const result = await client.parse('whitepaper.pdf', { mode: 'text' }); +const markdown = result.output.markdown!; +// Then: chunk on headings, embed, push to your vector store. +``` + +For born-digital PDFs, `mode: 'text'` is the cheapest path (1 credit/page). +For scanned PDFs or images, switch to `mode: 'structure'` so OCR runs. + +Or use the convenience wrapper: + +```typescript +const markdown = await client.parseToMarkdown('whitepaper.pdf'); +``` + +**Form/invoice extraction** — PDF → spatial elements → structured object: + +```typescript +const result = await client.parse('invoice.pdf', { mode: 'understand' }); +const elements = result.output.elements!; + +// Pull key/value pairs from form regions. +const fields: Record<string, unknown> = {}; +for (const el of elements) { + if (el.type === 'keyValueRegion') { + for (const pair of el.pairs) { + if (pair.key && pair.value) { + fields[String(pair.key.value)] = pair.value.value; + } + } + } +} + +// Walk tables — each cell carries row/col indices and span counts. +for (const el of elements) { + if (el.type === 'table') { + console.log(`Table: ${el.rowCount}×${el.columnCount}`); + for (const cell of el.cells) { + console.log(` [${cell.row}][${cell.column}] ${cell.text}`); + } + } +} +``` + +For complex documents that mix dense images with text, step up to +`mode: 'agentic'` so the VLM produces image descriptions and semantic +classifications (18 credits/page). 
+ +Or use the convenience wrapper to skip output-format discrimination entirely: + +```typescript +const elements = await client.parseElements('invoice.pdf', 'understand'); +``` + +### Billing — extraction credits vs processor credits + +`/extraction/parse` is billed against **extraction credits**, a separate +billing bucket from the **processor API credits** consumed by `convert`, +`ocr`, `sign`, `merge`, and every other endpoint on this client. The two +buckets never debit each other. + +Extraction-credit accounting is returned per request: + +```typescript +const result = await client.parse('document.pdf', { mode: 'structure' }); +const usage = result.usage?.data_extraction_credits; +console.log(`Cost: ${usage?.cost} extraction credits`); +console.log(`Remaining: ${usage?.remainingCredits} extraction credits`); ``` -The full set of public types — `ParseMode`, `ParseElement`, `ParagraphElement`, -`TableElement`, `KeyValueRegionElement`, `ParseResponse`, etc. — is exported -from the package root for downstream typing. +The full set of public types — `ExtractionCredits`, `ParseMode`, `ParseElement`, +`ParagraphElement`, `TableElement`, `KeyValueRegionElement`, `ParseResponse`, +etc. — is exported from the package root for downstream typing. A live smoke script that calls the real endpoint with a sample PDF and prints a parsed summary is available at `examples/src/parse_smoke.ts`. See the file diff --git a/docs/METHODS.md b/docs/METHODS.md index 26251ee..5647eb5 100644 --- a/docs/METHODS.md +++ b/docs/METHODS.md @@ -456,68 +456,106 @@ if (kvps && kvps.length > 0) { ``` ##### parse(input, options?) -Extracts structured content via the Data Extraction API (`POST /extraction/parse`). +Calls the Data Extraction API (`POST /extraction/parse`) to extract structured +content from a document. 
Designed for **RAG ingestion**, **search indexing**, +**content migration**, and **form/invoice extraction** workflows where the goal +is to feed document content into a downstream pipeline rather than render or +transform the document itself. -This is the cleanest entry point for parsing PDFs, images, and Office files into either: -- Spatial elements (typed components — paragraphs, tables, formulas, pictures, key-value regions, handwriting — with bounding boxes, confidence scores, and reading order), or -- Whole-document Markdown (ideal for RAG and search indexing). +Accepts PDFs, Office documents (Word, Excel, PowerPoint), and images as input. +Unlike `sign()`, it is not restricted to PDFs. -**Billing**: `/extraction/parse` is billed against the account's **extraction credits** bucket, separate from the **processor API credits** used by every other method in this client. Per-page cost depends on the mode: +Billed against **extraction credits** — a separate billing bucket from the +processor API credits consumed by every other method on this client. See the +[README's Data Extraction section](../README.md#data-extraction-extractionparse) +for the full positioning, the per-mode comparison table, and worked recipes. -| Mode | Extraction credits per page | -| ------------ | --------------------------- | -| `text` | 1 | -| `structure` | 1.5 | -| `understand` | 9 (default) | -| `agentic` | 18 | +**Parameters**: +- `input: FileInputWithUrl` — The document to parse. Accepts local files (paths, + Buffers, streams), a URL string, or a `{ type: 'url', url: '...' }` object. + The endpoint accepts PDFs, Office documents, and images. +- `options?: ParseOptions` — Optional configuration: + - `mode: ParseMode` — `'text'` (1 cr/page, born-digital, Markdown only), + `'structure'` (1.5 cr/page, OCR + spatial layout), + `'understand'` (9 cr/page, AI-augmented, default), + or `'agentic'` (18 cr/page, VLM-augmented). 
+ - `output.format: ParseOutputFormat` — `'spatial'` (typed elements with bounds + and confidence) or `'markdown'` (whole-document Markdown string). + - `output.includeWords` — include word-level OCR data inside elements. + - `language` — OCR language hint (`'eng'`, `'deu'`, `['eng', 'spa']`, etc.). + - `apiVersion` — optional `x-nutrient-api-version` header override. + +**Returns**: `ParseResponse` — full response envelope with `output`, `metrics`, +`configuration`, and `usage.data_extraction_credits` (cost + remaining balance). + +```typescript +// RAG ingestion — born-digital PDF → Markdown (1 extraction credit/page). +const result = await client.parse('whitepaper.pdf', { mode: 'text' }); +if (result.output.markdown !== undefined) { + console.log(result.output.markdown); +} -```typescript -// Cheapest mode: born-digital PDF → Markdown for a RAG pipeline. -const md = await client.parse('invoice.pdf', { mode: 'text' }); -if (md.output.markdown !== undefined) { - console.log(md.output.markdown); +// Form extraction — typed spatial elements with bounds and confidence. +const invoice = await client.parse('invoice.pdf', { mode: 'understand' }); +if (invoice.output.elements !== undefined) { + for (const el of invoice.output.elements) { + if (el.type === 'keyValueRegion') { + for (const pair of el.pairs) { + console.log(pair.key?.value, '→', pair.value?.value); + } + } + } } -// OCR-backed spatial extraction. -const spatial = await client.parse('scan.pdf', { +// OCR-backed extraction with word-level data and multilingual hint. +const scan = await client.parse('scan.pdf', { mode: 'structure', output: { format: 'spatial', includeWords: true }, language: ['eng', 'spa'], }); -if (spatial.output.elements !== undefined) { - for (const el of spatial.output.elements) { - if (el.type === 'paragraph') console.log(el.text); - } -} -// URL input (the server fetches the URL — no client-side download needed). +// URL input — the server fetches the document, no client-side download. 
const remote = await client.parse('https://example.com/document.pdf'); -// Extraction-credit accounting: -console.log(remote.usage?.data_extraction_credits?.cost); -console.log(remote.usage?.data_extraction_credits?.remainingCredits); +// Billing — extraction credits, not processor credits. +const usage = remote.usage?.data_extraction_credits; +console.log(`Cost: ${usage?.cost} extraction credits`); +console.log(`Remaining: ${usage?.remainingCredits} extraction credits`); ``` ##### parseToMarkdown(input, mode?) -Convenience wrapper that calls `parse()` with `output.format = 'markdown'` and returns the Markdown string directly. Defaults to `mode='text'` (1 extraction credit/page). +Convenience wrapper that calls `parse()` with `output.format = 'markdown'` and +returns the Markdown string directly. Defaults to `mode='text'` (1 extraction +credit/page) — the cheapest path for born-digital PDFs. Switch to +`mode='structure'` for scanned documents or images so OCR runs. ```typescript +// Born-digital PDF → Markdown (cheapest). const markdown = await client.parseToMarkdown('document.pdf'); -const richer = await client.parseToMarkdown('scan.pdf', 'understand'); + +// Scanned document or image → OCR-backed Markdown. +const scanned = await client.parseToMarkdown('scan.pdf', 'structure'); + +// AI-augmented Markdown for complex layouts. +const rich = await client.parseToMarkdown('report.pdf', 'understand'); ``` ##### parseElements(input, mode?, includeWords?) -Convenience wrapper that calls `parse()` with `output.format = 'spatial'` and returns the elements array directly. Defaults to `mode='structure'` (1.5 extraction credits/page). Passing `mode='text'` is rejected at compile time since `text` mode does not produce spatial output. +Convenience wrapper that calls `parse()` with `output.format = 'spatial'` and +returns the spatial elements array directly. Defaults to `mode='structure'` +(1.5 extraction credits/page). 
Passing `mode='text'` is rejected at compile +time — `text` mode only produces Markdown, not spatial elements. ```typescript +// OCR-backed spatial elements. const elements = await client.parseElements('document.pdf'); -// Get word-level OCR data nested inside paragraphs and table cells. -const withWords = await client.parseElements('scan.pdf', 'understand', true); +// AI-augmented extraction with word-level OCR data. +const withWords = await client.parseElements('invoice.pdf', 'understand', true); // Filter by element type. const tables = elements.filter(e => e.type === 'table'); -const paragraphs = elements.filter(e => e.type === 'paragraph'); +const kvRegions = elements.filter(e => e.type === 'keyValueRegion'); ``` ##### flatten(file, annotationIds?) From 075f084b576d356d733a7eefd688a21f74a440ff Mon Sep 17 00:00:00 2001 From: nickwinder Date: Thu, 28 May 2026 14:10:23 +1200 Subject: [PATCH 10/13] feat(client): route parse() via DWS Extract key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DWS Extract is a separate product from DWS Processor with its own API key and credit pool. Calling /extraction/parse with the Processor key returns 403. Add an optional `extractApiKey` constructor option (string or async getter) that parse() prefers over apiKey when set; every non-parse method keeps using apiKey. Falls back to apiKey when extractApiKey is omitted, so tenants with a single global DWS key still work. The routing happens via a per-call options copy that swaps apiKey to the extract key — leaves this.options untouched and covers both the multipart file-input path and the JSON url-input path. Drop the bundled parse smoke script — its dual-key dance and pack/install recipe were superseded by the unit-test coverage of the request shape, response handling, and routing. Live verification against a real account belongs to ad-hoc developer sessions, not committed scaffolding. Mirrors PR #47 on the Python sibling client. 
--- CHANGELOG.md | 7 +- LLM_DOC.md | 11 ++ README.md | 27 ++++- docs/METHODS.md | 5 +- examples/src/parse_smoke.ts | 163 ------------------------------ src/__tests__/unit/client.test.ts | 32 ++++++ src/__tests__/unit/parse.test.ts | 78 ++++++++++++++ src/client.ts | 35 ++++++- src/types/common.ts | 14 +++ 9 files changed, 199 insertions(+), 173 deletions(-) delete mode 100644 examples/src/parse_smoke.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index bc38355..88c77da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - First-class client support for the Data Extraction API (`POST /extraction/parse`). + - `NutrientClient` accepts an `extractApiKey` option (string or async getter) + that `parse()` uses in place of `apiKey`. Data Extraction is a separate + product with its own credit pool, so the Processor key returns 403 against + `/extraction/parse`. When `extractApiKey` is omitted, `parse()` falls back + to `apiKey`, which works on tenants with global DWS keys. - `NutrientClient.parse(input, options?)` — full request/response surface with typed support for all four modes (`text`, `structure`, `understand`, `agentic`) and both output formats (`spatial`, `markdown`). @@ -23,8 +28,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `ParagraphElement`, `TableElement`, `KeyValueRegionElement`), plus error and metadata shapes (`ParseErrorResponse`, `ParseMetrics`, `ParseUsage`, `ParseConfiguration`). - - Live smoke script at `examples/src/parse_smoke.ts` for verifying against the - production endpoint. - Billing note: `/extraction/parse` debits the account's **extraction credits** bucket, which is separate from the **processor API credits** used by the rest of `NutrientClient`. 
The response surfaces this explicitly in diff --git a/LLM_DOC.md b/LLM_DOC.md index ead5f6d..5369e0b 100644 --- a/LLM_DOC.md +++ b/LLM_DOC.md @@ -470,6 +470,17 @@ Billed against **extraction credits** (a separate bucket from processor API cred - `understand` — 9 extraction credits (default) - `agentic` — 18 extraction credits +Data Extraction is a separate product with its own API key. Pass it as `extractApiKey` on the client constructor: + +```typescript +const client = new NutrientClient({ + apiKey: process.env.NUTRIENT_API_KEY!, + extractApiKey: process.env.NUTRIENT_EXTRACT_API_KEY!, +}); +``` + +Falls back to `apiKey` when `extractApiKey` is omitted (only works on tenants with global DWS keys). + ```typescript // Full call: spatial elements with bounding boxes, confidence, reading order const result = await client.parse('invoice.pdf', { diff --git a/README.md b/README.md index 3a2edc6..30b4e72 100644 --- a/README.md +++ b/README.md @@ -160,12 +160,33 @@ images. Unlike `sign()`, it is not restricted to PDFs. | `markdown` | RAG, search indexing, content migration — anywhere structured text beats spatial data | `response.output.markdown` — a single Markdown string | | `spatial` (default) | Form/invoice extraction, layout reconstruction, flows that need per-element confidence | `response.output.elements` — flat array of typed elements | +### Setup — separate Extract API key + +Data Extraction is a separate product from the DWS Processor with its own +credit pool and its own API key. Pass both keys when constructing the client: + +```typescript +const client = new NutrientClient({ + apiKey: process.env.NUTRIENT_API_KEY!, // Processor key + extractApiKey: process.env.NUTRIENT_EXTRACT_API_KEY!, // Data Extraction key +}); +``` + +`extractApiKey` is consulted only by `parse()`, `parseToMarkdown()`, and +`parseElements()`. Every other method on the client (`convert`, `sign`, `ocr`, +`merge`, …) keeps using `apiKey`. 
If you omit `extractApiKey`, the parse +methods fall back to `apiKey` — that fallback only works on tenants whose +single DWS key authorises both products. + ### Quick start ```typescript import { NutrientClient } from '@nutrient-sdk/dws-client-typescript'; -const client = new NutrientClient({ apiKey: process.env.NUTRIENT_API_KEY! }); +const client = new NutrientClient({ + apiKey: process.env.NUTRIENT_API_KEY!, + extractApiKey: process.env.NUTRIENT_EXTRACT_API_KEY!, +}); // Spatial elements (default) — paragraphs, tables, key-value regions, etc. const result = await client.parse('contract.pdf', { mode: 'understand' }); @@ -269,10 +290,6 @@ The full set of public types — `ExtractionCredits`, `ParseMode`, `ParseElement `ParagraphElement`, `TableElement`, `KeyValueRegionElement`, `ParseResponse`, etc. — is exported from the package root for downstream typing. -A live smoke script that calls the real endpoint with a sample PDF and prints a -parsed summary is available at `examples/src/parse_smoke.ts`. See the file -header for the run recipe. - ## Workflow System diff --git a/docs/METHODS.md b/docs/METHODS.md index 5647eb5..9c05fbd 100644 --- a/docs/METHODS.md +++ b/docs/METHODS.md @@ -463,13 +463,16 @@ is to feed document content into a downstream pipeline rather than render or transform the document itself. Accepts PDFs, Office documents (Word, Excel, PowerPoint), and images as input. -Unlike `sign()`, it is not restricted to PDFs. Billed against **extraction credits** — a separate billing bucket from the processor API credits consumed by every other method on this client. See the [README's Data Extraction section](../README.md#data-extraction-extractionparse) for the full positioning, the per-mode comparison table, and worked recipes. +Requires a Data Extraction API key — pass it as `extractApiKey` on the client +constructor (see [Setup — separate Extract API key](../README.md#setup--separate-extract-api-key)). +Falls back to `apiKey` if `extractApiKey` is omitted. 
+ **Parameters**: - `input: FileInputWithUrl` — The document to parse. Accepts local files (paths, Buffers, streams), a URL string, or a `{ type: 'url', url: '...' }` object. diff --git a/examples/src/parse_smoke.ts b/examples/src/parse_smoke.ts deleted file mode 100644 index 2f24672..0000000 --- a/examples/src/parse_smoke.ts +++ /dev/null @@ -1,163 +0,0 @@ -/** - * Live smoke test for the Data Extraction API (`POST /extraction/parse`). - * - * Invocation (from the `examples/` directory, after building the package): - * - * # 1. Build + pack the parent package - * cd .. - * npm install && npm run build && npm pack - * - * # 2. Install the packed tarball into examples/ - * cd examples - * npm install - * - * # 3. Run the smoke test - * NUTRIENT_API_KEY=pdf_live_... npx tsx src/parse_smoke.ts - * - * Optional environment: - * NUTRIENT_PARSE_INPUT — Local file path or https:// URL to parse. - * Defaults to `assets/sample.pdf`. - * NUTRIENT_PARSE_MODE — `text` | `structure` | `understand` | `agentic`. Defaults to `text`. - * NUTRIENT_PARSE_OUTPUT — `markdown` | `spatial`. Server default depends on mode. - * NUTRIENT_PARSE_INCLUDE_WORDS — Truthy value enables word-level OCR data in spatial output. - * NUTRIENT_PARSE_BASE_URL — Override base URL (e.g. staging). - * - * The script prints a short summary plus the first elements / first 800 chars of - * markdown for inspection. It exits non-zero on failure. It is intentionally - * read-only: it does not write files, push branches, or modify the worktree. - * - * Billing note: every call against `/extraction/parse` debits the account's - * **extraction credits** bucket (separate from the **processor API credits** - * used by `/build`, `/sign`, OCR, etc.). The cheapest mode (`text`) costs - * 1 extraction credit per page, so this smoke test against a 1-page PDF - * costs 1 extraction credit. 
- */ - -import * as fs from 'fs'; -import * as path from 'path'; -import { NutrientClient } from '@nutrient-sdk/dws-client-typescript'; -import type { - ParseMode, - ParseOptions, - ParseOutputFormat, -} from '@nutrient-sdk/dws-client-typescript'; - -function envBool(name: string): boolean { - const v = process.env[name]; - return v !== undefined && v !== '' && v !== '0' && v.toLowerCase() !== 'false'; -} - -function parseModeOrDie(value: string | undefined, fallback: ParseMode): ParseMode { - const candidate = value ?? fallback; - const allowed: ParseMode[] = ['text', 'structure', 'understand', 'agentic']; - if (!allowed.includes(candidate as ParseMode)) { - console.error(`Invalid NUTRIENT_PARSE_MODE='${candidate}'. Allowed: ${allowed.join(', ')}`); - process.exit(2); - } - return candidate as ParseMode; -} - -function parseOutputOrUndefined(value: string | undefined): ParseOutputFormat | undefined { - if (value === undefined || value === '') return undefined; - if (value === 'markdown' || value === 'spatial') return value; - console.error(`Invalid NUTRIENT_PARSE_OUTPUT='${value}'. Allowed: markdown, spatial`); - process.exit(2); -} - -async function main(): Promise<void> { - const apiKey = process.env['NUTRIENT_API_KEY']; - if (!apiKey) { - console.error('Error: NUTRIENT_API_KEY is not set. Export your DWS API key before running:'); - console.error(' export NUTRIENT_API_KEY=pdf_live_...'); - process.exit(1); - } - - const inputArg = - process.env['NUTRIENT_PARSE_INPUT'] ?? 
path.resolve(__dirname, '..', 'assets', 'sample.pdf'); - const isUrl = /^https?:\/\//i.test(inputArg); - if (!isUrl && !fs.existsSync(inputArg)) { - console.error(`Input file not found: ${inputArg}`); - process.exit(1); - } - - const mode = parseModeOrDie(process.env['NUTRIENT_PARSE_MODE'], 'text'); - const outputFormat = parseOutputOrUndefined(process.env['NUTRIENT_PARSE_OUTPUT']); - const includeWords = envBool('NUTRIENT_PARSE_INCLUDE_WORDS'); - - const client = new NutrientClient({ - apiKey, - ...(process.env['NUTRIENT_PARSE_BASE_URL'] - ? { baseUrl: process.env['NUTRIENT_PARSE_BASE_URL'] } - : {}), - }); - - const options: ParseOptions = { mode }; - if (outputFormat !== undefined) { - options.output = { format: outputFormat }; - if (outputFormat === 'spatial') options.output.includeWords = includeWords; - } - - console.log('--- /extraction/parse smoke test ---'); - console.log('Input:', inputArg); - console.log('Mode:', mode); - console.log('Output:', options.output ?? ''); - console.log(''); - - const started = Date.now(); - const result = await client.parse(inputArg, options); - const elapsed = Date.now() - started; - - console.log('--- response summary ---'); - console.log('Wall time: ', `${elapsed}ms`); - console.log('Server processing time (ms): ', result.metrics.processingTimeMs); - console.log('Pages processed: ', result.metrics.pagesProcessed); - console.log('Configured mode: ', result.configuration.mode); - console.log('Configured output format: ', result.configuration.outputFormat); - console.log('Extraction credits used: ', result.usage?.data_extraction_credits?.cost); - console.log( - 'Extraction credits remaining: ', - result.usage?.data_extraction_credits?.remainingCredits, - ); - console.log('Request ID: ', result.requestId); - console.log(''); - - if (result.output.markdown !== undefined) { - const md = result.output.markdown; - console.log('--- markdown (first 800 chars) ---'); - console.log(md.length > 800 ? 
md.slice(0, 800) + '\n…[truncated]' : md); - } else { - const elements = result.output.elements; - console.log(`--- spatial elements (${elements.length} total) ---`); - for (const el of elements.slice(0, 5)) { - const summary: Record = { - type: el.type, - id: el.id, - page: el.page.pageIndex, - confidence: el.confidence, - }; - if (el.type === 'paragraph' || el.type === 'handwriting') { - summary['textPreview'] = el.text.slice(0, 80); - } else if (el.type === 'table') { - summary['rowsXcols'] = `${el.rowCount}x${el.columnCount}`; - } else if (el.type === 'formula') { - summary['latex'] = el.latex; - } else if (el.type === 'picture') { - summary['classification'] = el.classification; - } else if (el.type === 'keyValueRegion') { - summary['pairs'] = el.pairs.length; - } - console.log(summary); - } - if (elements.length > 5) console.log(`… ${elements.length - 5} more elements`); - } -} - -main().catch((err: unknown) => { - const e = err as { message?: string; statusCode?: number; details?: unknown; code?: string }; - console.error('--- /extraction/parse smoke test FAILED ---'); - console.error('Message: ', e.message ?? 
String(err)); - if (e.code !== undefined) console.error('Error code: ', e.code); - if (e.statusCode !== undefined) console.error('HTTP status:', e.statusCode); - if (e.details !== undefined) console.error('Details: ', e.details); - process.exit(1); -}); diff --git a/src/__tests__/unit/client.test.ts b/src/__tests__/unit/client.test.ts index 7773062..956a0d2 100644 --- a/src/__tests__/unit/client.test.ts +++ b/src/__tests__/unit/client.test.ts @@ -232,6 +232,38 @@ describe('NutrientClient', () => { ).toThrow('Base URL must be a string'); }); + it('should accept a string extractApiKey', () => { + const client = new NutrientClient({ + apiKey: 'processor-key', + extractApiKey: 'extract-key', + }); + expect(client).toBeDefined(); + }); + + it('should accept an async extractApiKey getter', () => { + const client = new NutrientClient({ + apiKey: 'processor-key', + extractApiKey: (): Promise => Promise.resolve('extract-key'), + }); + expect(client).toBeDefined(); + }); + + it('should throw ValidationError for invalid extractApiKey type', () => { + expect( + () => + new NutrientClient({ + apiKey: 'processor-key', + extractApiKey: 123 as unknown as string, + }), + ).toThrow(ValidationError); + expect( + () => + new NutrientClient({ + apiKey: 'processor-key', + extractApiKey: 123 as unknown as string, + }), + ).toThrow('Extract API key must be a string or a function that returns a Promise'); + }); }); describe('workflow()', () => { diff --git a/src/__tests__/unit/parse.test.ts b/src/__tests__/unit/parse.test.ts index 1e48b78..a9cbb18 100644 --- a/src/__tests__/unit/parse.test.ts +++ b/src/__tests__/unit/parse.test.ts @@ -307,6 +307,84 @@ describe('NutrientClient.parse()', () => { expect(mockSendRequest).not.toHaveBeenCalled(); }); }); + + describe('Data Extraction API key routing', () => { + it('routes parse() via extractApiKey when set, leaving apiKey untouched', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 
'OK', + headers: {}, + } as never); + + const client = new NutrientClient({ + apiKey: 'processor-key', + extractApiKey: 'extract-key', + }); + await client.parse('document.pdf'); + + const passedOptions = mockSendRequest.mock.calls[0]?.[1]; + expect(passedOptions?.apiKey).toBe('extract-key'); + // Original client options must not be mutated. + expect(client['options'].apiKey).toBe('processor-key'); + expect(client['options'].extractApiKey).toBe('extract-key'); + }); + + it('falls back to apiKey when extractApiKey is not provided', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const client = new NutrientClient({ apiKey: 'processor-key' }); + await client.parse('document.pdf'); + + const passedOptions = mockSendRequest.mock.calls[0]?.[1]; + expect(passedOptions?.apiKey).toBe('processor-key'); + }); + + it('forwards an extractApiKey getter unchanged so http.ts resolves it lazily', async () => { + mockSendRequest.mockResolvedValue({ + data: sampleSpatialResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const extractGetter = jest.fn(() => Promise.resolve('lazy-extract-key')); + const client = new NutrientClient({ + apiKey: 'processor-key', + extractApiKey: extractGetter, + }); + await client.parse('document.pdf'); + + const passedOptions = mockSendRequest.mock.calls[0]?.[1]; + expect(passedOptions?.apiKey).toBe(extractGetter); + // The client itself does not invoke the getter — that's http.ts's job. 
+ expect(extractGetter).not.toHaveBeenCalled(); + }); + + it('uses extractApiKey for URL inputs too', async () => { + mockGetRemoteUrl.mockReturnValue('https://example.com/doc.pdf'); + mockSendRequest.mockResolvedValue({ + data: sampleMarkdownResponse, + status: 200, + statusText: 'OK', + headers: {}, + } as never); + + const client = new NutrientClient({ + apiKey: 'processor-key', + extractApiKey: 'extract-key', + }); + await client.parse('https://example.com/doc.pdf', { mode: 'text' }); + + const passedOptions = mockSendRequest.mock.calls[0]?.[1]; + expect(passedOptions?.apiKey).toBe('extract-key'); + }); + }); }); describe('NutrientClient.parseToMarkdown()', () => { diff --git a/src/client.ts b/src/client.ts index e77d2fd..f0daacb 100644 --- a/src/client.ts +++ b/src/client.ts @@ -71,6 +71,13 @@ function normalizePageParams( * return token; * } * }); + * + * // Data Extraction (`parse()`) needs its own key — it's a separate product + * // with its own credit pool. Pass both: + * const client = new NutrientClient({ + * apiKey: 'your-processor-key', + * extractApiKey: 'your-extract-key', + * }); * ``` */ export class NutrientClient { @@ -115,6 +122,16 @@ export class NutrientClient { if (options.baseUrl && typeof options.baseUrl !== 'string') { throw new ValidationError('Base URL must be a string'); } + + if ( + options.extractApiKey !== undefined && + typeof options.extractApiKey !== 'string' && + typeof options.extractApiKey !== 'function' + ) { + throw new ValidationError( + 'Extract API key must be a string or a function that returns a Promise', + ); + } } /** @@ -1835,6 +1852,11 @@ export class NutrientClient { * **processor API credits** used by every other method on this client. Per-page * costs: `text` 1 cr, `structure` 1.5 cr, `understand` 9 cr, `agentic` 18 cr. * + * **Authentication**: Data Extraction is a separate product with its own API + * key. Pass it via `new NutrientClient({ apiKey, extractApiKey })`. 
If + * `extractApiKey` is omitted, this method falls back to `apiKey`, which only + * succeeds when the key is a global DWS key authorised for both products. + * * @param input - The document to parse. Accepts local files (paths, Buffers, * streams), or a URL string / `{ type: 'url', url: '...' }` object. The endpoint * accepts a range of document formats — PDFs, Office documents (Word, Excel, @@ -1902,6 +1924,15 @@ export class NutrientClient { ? { 'x-nutrient-api-version': options.apiVersion } : undefined; + // Data Extraction is a separate product with its own API key. Route the + // request via a per-call options copy so the rest of the client (which + // talks to the Processor API) keeps using the main key. Falls back to + // apiKey when extractApiKey is unset. + const parseOptions: NutrientClientOptions = + this.options.extractApiKey !== undefined + ? { ...this.options, apiKey: this.options.extractApiKey } + : this.options; + // URL input → JSON body const remoteUrl = getRemoteUrl(input); if (remoteUrl !== null) { @@ -1913,7 +1944,7 @@ export class NutrientClient { data: { instructions }, ...(headers ? { headers } : {}), }, - this.options, + parseOptions, 'json', ); return response.data; @@ -1928,7 +1959,7 @@ export class NutrientClient { data: { instructions, file: normalizedFile }, ...(headers ? { headers } : {}), }, - this.options, + parseOptions, 'json', ); return response.data; diff --git a/src/types/common.ts b/src/types/common.ts index 2d889ff..7edd362 100644 --- a/src/types/common.ts +++ b/src/types/common.ts @@ -32,4 +32,18 @@ export interface NutrientClientOptions { * Timeout in milliseconds */ timeout?: number; + + /** + * Optional API key (or async getter) for the Nutrient DWS **Data Extraction** + * product. Required by `parse()` because Data Extraction is a separate + * product from the Processor API and has its own credit pool — using a + * Processor key against `/extraction/parse` returns 403. 
+ * + * If omitted, `parse()` falls back to `apiKey`. That fallback works on + * tenants where a single global DWS key authorises both products. + * + * No other client method uses this key — `convert`, `sign`, `ocr`, etc. + * always use `apiKey`. + */ + extractApiKey?: string | (() => Promise); } From 48ec2124f15cdb3f212308b59b9de8d45dfcb38c Mon Sep 17 00:00:00 2001 From: nickwinder Date: Thu, 28 May 2026 14:10:41 +1200 Subject: [PATCH 11/13] refactor(types): derive parse types from generated OpenAPI spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `npm run generate:types:extract` that runs openapi-typescript against the vendored dws-data-extraction-spec.yml into src/generated/extract-types.ts, peer to the existing `generate:types` flow for the Processor spec. Rewrite src/types/parse.ts so the schema primitives derive from the generated `components['schemas']` rather than being hand-rolled: - ParseMode, ParseOutputFormat - ParseElement and the six element subtypes (ParagraphElement, FormulaElement, PictureElement, TableElement, KeyValueRegionElement, HandwritingElement) - ParseElementBase, ParseBounds, ParsePageRef, ParseWord - ParseTableCell, KeyValuePair, KeyValueEntity - ParseMetrics, ParseUsage, ParseConfiguration - ParseErrorResponse, ParseErrorDetails, ParseErrorFailingPath - ParagraphRole (now `NonNullable<ParagraphElement['role']>`) Keep five types hand-composed where they add something the spec doesn't express: - ParseOutputOptions / ParseInstructions — the spec marks `OutputOptions.includeWords` as required, but the server has a default and clients shouldn't be forced to pass it. - ParseResponseSpatial / ParseResponseMarkdown — cross-field discriminated narrowing (`elements?: undefined` / `markdown?: undefined`) the spec's ParseOutput doesn't model, letting callers write `if (output.markdown !== undefined)` without per-call `?.` access. 
- ParseOptions — adds the client-only `apiVersion` header concern that isn't a body field in the spec. Net: ~210 lines of hand-rolled type definitions deleted, replaced with one-line aliases that re-route through the generated schema. The public surface (every exported name) is unchanged. --- package.json | 3 +- src/generated/extract-types.ts | 689 +++++++++++++++++++++++++++++++++ src/types/parse.ts | 291 ++++---------- 3 files changed, 763 insertions(+), 220 deletions(-) create mode 100644 src/generated/extract-types.ts diff --git a/package.json b/package.json index 69cdbed..e6b9d48 100644 --- a/package.json +++ b/package.json @@ -85,7 +85,8 @@ "format:check": "prettier --check \"src/**/*.ts\"", "typecheck": "tsc --noEmit", "prepublishOnly": "npm run build && npm run test", - "generate:types": "openapi-typescript dws-api-spec.yml -o src/generated/api-types.ts" + "generate:types": "openapi-typescript dws-api-spec.yml -o src/generated/api-types.ts", + "generate:types:extract": "openapi-typescript dws-data-extraction-spec.yml -o src/generated/extract-types.ts" }, "dependencies": { "axios": "^1.13.2", diff --git a/src/generated/extract-types.ts b/src/generated/extract-types.ts new file mode 100644 index 0000000..6fc62fb --- /dev/null +++ b/src/generated/extract-types.ts @@ -0,0 +1,689 @@ +/** + * This file was auto-generated by openapi-typescript. + * Do not make direct changes to the file. + */ + +export interface paths { + "/extraction/parse": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** + * Extract data from a document + * @description Extract structured content from a document. Returns either typed document elements + * with spatial data or a whole-document markdown representation. + * + * Four processing modes are available: + * - **text** — Plain text extraction powered by Document Engine. Only supports `markdown` output. 
+ * - **structure** — OCR-backed structured extraction with spatial element output. + * - **understand** — Deeper document analysis with structured extraction and semantic enrichment. + * - **agentic** — AI-powered analysis for complex documents that need visual reasoning and self-correction. + * + * You can provide the input document in three ways: + * - **Multipart form upload** via `multipart/form-data` with a `file` field and optional JSON `instructions`. + * - **URL-based input** via `application/json` with a `url` field pointing to a remote document. + * - **Raw binary upload** via `application/pdf` (or other supported content types). + */ + post: operations["extraction-parse"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; +} +export type webhooks = Record; +export interface components { + schemas: { + /** + * @description Processing pipeline. + * - `text` — Plain text extraction powered by Document Engine. Only supports `markdown` output format. + * - `structure` — OCR-backed structured extraction with spatial element output. + * - `understand` — Deeper document analysis with structured extraction and semantic enrichment. + * - `agentic` — AI-powered analysis for complex documents that need visual reasoning and self-correction. + * @default understand + * @example understand + * @enum {string} + */ + Mode: "text" | "structure" | "understand" | "agentic"; + /** + * @description Output configuration. When provided, `format` is required. + * Default format depends on the mode: `text` defaults to `markdown`; + * `structure`, `understand`, and `agentic` default to `spatial`. + */ + OutputOptions: { + /** + * @description The output format. + * - `spatial` — Flat typed elements with bounding boxes, confidence scores, reading order, and page references. + * Not available with `text` mode. + * - `markdown` — Whole-document markdown representation. 
+ * @example spatial + * @enum {string} + */ + format: "spatial" | "markdown"; + /** + * @description Include word-level OCR data nested inside paragraph and table cell elements. + * Only applicable when `format` is `spatial`. + * @default false + * @example false + */ + includeWords: boolean; + }; + /** @description Additional processing options. */ + ProcessingOptions: { + /** + * @description OCR language hint. Only supported for `structure`, `understand`, and `agentic` modes. + * Accepts lowercase language names (`"english"`, `"german"`) + * or ISO 639-2 language codes (`"eng"`, `"deu"`). Multilingual OCR can be expressed + * as an array (`["eng", "spa"]`) or a `+`-joined string (`"eng+spa"`). + * @default eng + */ + language: string | string[]; + }; + ParseResponse: { + /** + * @description HTTP status code. + * @example 200 + * @enum {integer} + */ + status: 200; + /** + * @description Unique request identifier for debugging and support. + * @example req_e5f6g7h8 + */ + requestId: string; + output: components["schemas"]["ParseOutput"]; + metrics: components["schemas"]["Metrics"]; + usage?: components["schemas"]["Usage"]; + configuration: components["schemas"]["Configuration"]; + }; + /** + * @description Extracted content. Contains either `elements` (for spatial format) or `markdown` + * (for markdown format), never both. + */ + ParseOutput: { + /** + * @description Flat list of document elements across all pages, ordered by reading order. + * Present when `output.format` is `spatial`. + */ + elements?: components["schemas"]["Element"][]; + /** + * @description Whole-document markdown content. + * Present when `output.format` is `markdown`. + * @example # Document Title + * + * First paragraph of text... + */ + markdown?: string; + }; + Metrics: { + /** + * @description Total processing time in milliseconds. + * @example 4200 + */ + processingTimeMs: number; + /** + * @description Number of pages processed. 
+ * @example 1 + */ + pagesProcessed: number; + }; + Usage: { + data_extraction_credits?: { + /** + * @description Credits consumed by this request. + * @example 2 + */ + cost: number; + /** + * @description Remaining credits in the account. + * @example 850 + */ + remainingCredits: number; + }; + }; + Configuration: { + mode: components["schemas"]["Mode"]; + /** + * @description The output format that was used for this request. + * @example spatial + * @enum {string} + */ + outputFormat: "spatial" | "markdown"; + }; + ParseErrorResponse: { + /** + * @description HTTP status code. + * @example 400 + */ + status: number; + /** + * @description Unique request identifier for debugging and support. + * @example req_err_001 + */ + requestId: string; + /** + * @description Human-readable error summary. + * @example The request is malformed + */ + errorMessage: string; + /** @description Structured error details. Present on validation and processing errors. */ + errorDetails?: { + /** + * @description Error origin. + * - `request` — Validation errors (invalid parameters, unsupported format). + * - `processing` — Backend processing failures. + * - `maestro` — Maestro engine failures. + * @example request + */ + source?: string; + /** + * @description Machine-readable error code stable enough for client branching. + * @example invalid_request + */ + code?: string; + /** @description List of invalid fields. Present on validation errors. */ + failingPaths?: { + /** + * @description JSON path to the invalid field. + * @example $.mode + */ + path?: string; + /** + * @description Human-readable validation error. + * @example invalid mode: 'vlm'. 
Expected: text, structure, understand, agentic + */ + details?: string; + }[]; + }; + }; + Element: components["schemas"]["ParagraphElement"] | components["schemas"]["FormulaElement"] | components["schemas"]["PictureElement"] | components["schemas"]["TableElement"] | components["schemas"]["KeyValueRegionElement"] | components["schemas"]["HandwritingElement"]; + ElementBase: { + /** + * Format: uuid + * @description Unique element identifier. + * @example a1b2c3d4-1111-4000-8000-000000000001 + */ + id: string; + bounds: components["schemas"]["Bounds"]; + /** + * @description Detection confidence score. + * @example 0.95 + */ + confidence: number; + /** + * @description Reading order index within the page. + * @example 0 + */ + readingOrder: number; + page: components["schemas"]["PageRef"]; + }; + ParagraphElement: components["schemas"]["ElementBase"] & { + /** @enum {string} */ + type: "paragraph"; + /** + * @description Semantic role of the paragraph. Null when the role is undetermined. + * @example Text + */ + role?: ("Text" | "Title" | "SectionHeader" | "Header" | "Footer" | "Caption" | "Footnote" | "ListItem" | "PageNumber" | "Code" | "CheckboxSelected" | "CheckboxUnselected") | null; + /** + * @description Extracted text content. + * @example Revenue grew 15% year-over-year. + */ + text: string; + /** @description Word-level OCR data. Present when `includeWords` is `true`. */ + words?: components["schemas"]["Word"][] | null; + } & { + /** + * @description discriminator enum property added by openapi-typescript + * @enum {string} + */ + type: "paragraph"; + }; + FormulaElement: components["schemas"]["ElementBase"] & { + /** @enum {string} */ + type: "formula"; + /** + * @description LaTeX representation of the formula. 
+ * @example r = r_0 e^{kt} + */ + latex: string; + } & { + /** + * @description discriminator enum property added by openapi-typescript + * @enum {string} + */ + type: "formula"; + }; + PictureElement: components["schemas"]["ElementBase"] & { + /** @enum {string} */ + type: "picture"; + /** + * @description Image classification category (chart, photo, diagram, etc.). + * @example chart + */ + classification: string; + /** + * @description Confidence score for the classification. + * @example 0.91 + */ + classificationConfidence: number; + /** + * @description AI-generated alternative text description. + * @example Bar chart showing quarterly revenue growth across regions + */ + altDescription: string; + /** @description IDs of associated caption paragraph elements. */ + captionIds?: string[] | null; + /** @description IDs of associated footnote paragraph elements. */ + footnoteIds?: string[] | null; + } & { + /** + * @description discriminator enum property added by openapi-typescript + * @enum {string} + */ + type: "picture"; + }; + TableElement: components["schemas"]["ElementBase"] & { + /** @enum {string} */ + type: "table"; + /** + * @description Number of rows in the table. + * @example 3 + */ + rowCount: number; + /** + * @description Number of columns in the table. + * @example 3 + */ + columnCount: number; + /** @description Cell-level data. */ + cells: components["schemas"]["TableCell"][]; + /** @description IDs of associated caption paragraph elements. */ + captionIds?: string[] | null; + /** @description IDs of associated footnote paragraph elements. */ + footnoteIds?: string[] | null; + } & { + /** + * @description discriminator enum property added by openapi-typescript + * @enum {string} + */ + type: "table"; + }; + KeyValueRegionElement: components["schemas"]["ElementBase"] & { + /** @enum {string} */ + type: "keyValueRegion"; + /** @description Detected key-value pairs. 
*/ + pairs: components["schemas"]["KeyValuePair"][]; + } & { + /** + * @description discriminator enum property added by openapi-typescript + * @enum {string} + */ + type: "keyValueRegion"; + }; + HandwritingElement: components["schemas"]["ElementBase"] & { + /** @enum {string} */ + type: "handwriting"; + /** + * @description Extracted handwritten text content. + * @example John Doe + */ + text: string; + /** @description Word-level OCR data. Present when `includeWords` is `true`. */ + words?: components["schemas"]["Word"][] | null; + } & { + /** + * @description discriminator enum property added by openapi-typescript + * @enum {string} + */ + type: "handwriting"; + }; + /** + * @description Bounding box of an element on the page. `(x, y)` is the top-left corner of the box. + * Origin is the top-left of the page, with x increasing right and y increasing down. + * + * Coordinates are always expressed in render-space pixels. + * `page.width` and `page.height` describe the same pixel canvas as every element, + * word, and table cell bound on that page. + */ + Bounds: { + /** + * @description X coordinate of the top-left corner (distance from page left edge). + * @example 100 + */ + x: number; + /** + * @description Y coordinate of the top-left corner (distance from page top edge). + * @example 50 + */ + y: number; + /** + * @description Width of the bounding box. + * @example 400 + */ + width: number; + /** + * @description Height of the bounding box. + * @example 35 + */ + height: number; + }; + /** @description Word-level OCR result. */ + Word: { + /** + * @description The word text. + * @example Revenue + */ + text: string; + bounds: components["schemas"]["Bounds"]; + /** + * @description OCR confidence score. + * @example 0.95 + */ + confidence: number; + }; + /** + * @description Source page reference. Provides the page index and the full page dimensions, + * which define the coordinate space that all element bounds on this page are relative to. 
+ */ + PageRef: { + /** + * @description 0-based page index. + * @example 0 + */ + pageIndex: number; + /** + * @description 1-based page number. + * @example 1 + */ + pageNumber: number; + /** + * @description Page width in render-space pixels. + * @example 1700 + */ + width: number; + /** + * @description Page height in render-space pixels. + * @example 2200 + */ + height: number; + }; + TableCell: { + /** + * @description Unique cell identifier. + * @example c-001 + */ + id: string; + bounds: components["schemas"]["Bounds"]; + /** + * @description Detection confidence score. + * @example 0.94 + */ + confidence: number; + /** + * @description 0-indexed row. + * @example 0 + */ + row: number; + /** + * @description 0-indexed column. + * @example 0 + */ + column: number; + /** + * @description Number of rows this cell spans. + * @default 1 + * @example 1 + */ + rowSpan: number; + /** + * @description Number of columns this cell spans. + * @default 1 + * @example 1 + */ + colSpan: number; + /** + * @description Extracted text content. + * @example Region + */ + text: string; + /** @description Word-level OCR data. Present when `includeWords` is `true`. */ + words?: components["schemas"]["Word"][] | null; + }; + KeyValuePair: { + /** + * @description Unique identifier for the pair. + * @example kvp-001 + */ + id: string; + /** @description The key/question entity. Null when only a value was detected. */ + key?: components["schemas"]["KeyValueEntity"] | null; + /** @description The value/answer entity. Null when only a key was detected. */ + value?: components["schemas"]["KeyValueEntity"] | null; + /** + * @description Confidence for the key-value relationship. + * @example 0.93 + */ + relationshipConfidence?: number | null; + }; + KeyValueEntity: { + /** + * @description Unique entity identifier. + * @example kve-001 + */ + id: string; + bounds: components["schemas"]["Bounds"]; + /** + * @description Detection confidence score. 
+ * @example 0.92 + */ + confidence: number; + /** + * @description Entity type. + * @example QUESTION + * @enum {string} + */ + entityType: "QUESTION" | "ANSWER" | ""; + /** + * @description Extracted value. + * @example Invoice Number + */ + value: unknown; + }; + }; + responses: never; + parameters: { + /** + * @description Optional API version override for this request. + * + * If omitted, the request uses the latest version that was available when the API key was created. + * + * See the [API Versioning](#description/api-versioning) section for the list of supported versions. + * @example 2026-05-25 + */ + NutrientApiVersion: "2026-05-25"; + }; + requestBodies: never; + headers: never; + pathItems: never; +} +export type $defs = Record; +export interface operations { + "extraction-parse": { + parameters: { + query?: never; + header?: { + /** + * @description Optional API version override for this request. + * + * If omitted, the request uses the latest version that was available when the API key was created. + * + * See the [API Versioning](#description/api-versioning) section for the list of supported versions. + * @example 2026-05-25 + */ + "x-nutrient-api-version"?: components["parameters"]["NutrientApiVersion"]; + }; + path?: never; + cookie?: never; + }; + requestBody?: { + content: { + "multipart/form-data": { + /** + * Format: binary + * @description The document to parse. + * @example + */ + file?: string; + /** + * @description JSON-serialized processing instructions. Omit to use all defaults + * (`mode: "understand"` with spatial element output). + */ + instructions?: { + /** + * Format: uri + * @description URL of a remote document to parse. Use this instead of the `file` field + * to process a document hosted at a public URL. 
+ * @example https://storage.example.com/invoice.pdf + */ + url?: string; + mode?: components["schemas"]["Mode"]; + output?: components["schemas"]["OutputOptions"]; + options?: components["schemas"]["ProcessingOptions"]; + }; + }; + /** + * @example { + * "url": "https://storage.example.com/invoice.pdf", + * "mode": "understand", + * "output": { + * "format": "spatial" + * } + * } + */ + "application/json": { + /** + * Format: uri + * @description URL of a remote document to parse. + * @example https://storage.example.com/invoice.pdf + */ + url: string; + mode?: components["schemas"]["Mode"]; + output?: components["schemas"]["OutputOptions"]; + options?: components["schemas"]["ProcessingOptions"]; + }; + "application/pdf": string; + "image/png": string; + "image/jpeg": string; + "image/tiff": string; + }; + }; + responses: { + /** @description Extraction completed successfully. */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["ParseResponse"]; + }; + }; + /** @description The request is malformed. Invalid parameters, unsupported file format, or missing required fields. */ + 400: { + headers: { + [name: string]: unknown; + }; + content: { + /** + * @example { + * "status": 400, + * "requestId": "req_err_001", + * "errorMessage": "The request is malformed", + * "errorDetails": { + * "source": "request", + * "code": "invalid_request", + * "failingPaths": [ + * { + * "path": "$.mode", + * "details": "invalid mode: 'vlm'. Expected: text, structure, understand, agentic" + * } + * ] + * } + * } + */ + "application/json": components["schemas"]["ParseErrorResponse"]; + }; + }; + /** @description You are unauthorized. Sent when no API token is specified, or when the API token you specified isn't valid. */ + 401: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Insufficient credits for this request. 
*/ + 402: { + headers: { + [name: string]: unknown; + }; + content: { + /** + * @example { + * "status": 402, + * "requestId": "req_err_002", + * "errorMessage": "Insufficient credits. This request requires 2 credits, 0 remaining." + * } + */ + "application/json": components["schemas"]["ParseErrorResponse"]; + }; + }; + /** @description The uploaded file exceeds the maximum allowed size for your plan. */ + 413: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Too many requests. You have exceeded the rate limit for your subscription. */ + 429: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description An internal processing error occurred. Please retry or contact support with the `requestId`. */ + 500: { + headers: { + [name: string]: unknown; + }; + content: { + /** + * @example { + * "status": 500, + * "requestId": "req_err_003", + * "errorMessage": "Processing failed. Please retry or contact support with the requestId.", + * "errorDetails": { + * "source": "maestro", + * "code": "maestro_error" + * } + * } + */ + "application/json": components["schemas"]["ParseErrorResponse"]; + }; + }; + /** @description The processing backend is temporarily unavailable. Please retry later. */ + 503: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + }; + }; +} diff --git a/src/types/parse.ts b/src/types/parse.ts index 4b67db4..2961c80 100644 --- a/src/types/parse.ts +++ b/src/types/parse.ts @@ -1,3 +1,4 @@ +import type { components } from '../generated/extract-types'; import type { ExtractionCredits } from './extraction_credits'; export type { ExtractionCredits }; @@ -5,33 +6,36 @@ export type { ExtractionCredits }; /** * Type definitions for the Nutrient Data Extraction API (`POST /extraction/parse`). * - * These types mirror the public OpenAPI 3.1 contract published at - * https://www.nutrient.io/api/reference/data-extraction/public/ (version `2026-05-25`). 
+ * The primitive schemas (element types, bounds, page references, request options) + * are derived from `src/generated/extract-types.ts`, which is generated from + * `dws-data-extraction-spec.yml` by `npm run generate:types:extract`. The + * narrowed response unions (`ParseResponseSpatial` / `ParseResponseMarkdown`) + * and the client-facing `ParseOptions` surface are hand-composed on top. * - * Note on billing: `/extraction/parse` is billed against **extraction credits**, a - * bucket separate from the **processor API credits** consumed by `/build`, `/sign`, - * OCR, conversion, and the other endpoints on the rest of `NutrientClient`. The + * Billing note: `/extraction/parse` is billed against **extraction credits**, a + * bucket separate from the **processor API credits** consumed by `/build`, + * `/sign`, OCR, conversion, and the other endpoints on `NutrientClient`. The * response surfaces this explicitly in `usage.data_extraction_credits`. * * @see ParseResponse for the full response shape * @see ParseMode for the four processing pipelines */ +type Schemas = components['schemas']; + /** * Processing pipeline for `/extraction/parse`. * - * Each mode bills a different amount of **extraction credits** per page, drawn from - * the account's extraction-credits bucket (separate from processor API credits). + * Each mode bills a different amount of **extraction credits** per page, drawn + * from the account's extraction-credits bucket (separate from processor API + * credits). * * - `text` — Plain text extraction. Markdown output only. 1 extraction credit/page. * - `structure` — OCR-backed structured extraction with spatial element output. 1.5 extraction credits/page. * - `understand` — Deeper document analysis with semantic enrichment. 9 extraction credits/page. (Default) * - `agentic` — VLM-augmented extraction for complex documents needing visual reasoning. 18 extraction credits/page. 
- * - * The `agentic` mode may not yet be activated on every account; if it isn't, the - * server returns a `400` with an `errorDetails.code` you can branch on. */ -export type ParseMode = 'text' | 'structure' | 'understand' | 'agentic'; +export type ParseMode = Schemas['Mode']; /** * Output format for `/extraction/parse`. @@ -42,20 +46,24 @@ export type ParseMode = 'text' | 'structure' | 'understand' | 'agentic'; * - `markdown` — Whole-document Markdown representation, suited for RAG, search * indexing, and content pipelines. */ -export type ParseOutputFormat = 'spatial' | 'markdown'; +export type ParseOutputFormat = Schemas['Configuration']['outputFormat']; /** * Output configuration for `/extraction/parse`. * * Defaults: `text` mode emits `markdown`; `structure`, `understand`, and `agentic` - * emit `spatial`. + * emit `spatial`. `includeWords` defaults to `false` server-side and is only + * honoured when `format` is `'spatial'`. + * + * Hand-written (not derived from the spec) because the spec marks `includeWords` + * as required; in practice it has a server-side default and clients omit it. */ export interface ParseOutputOptions { /** Output format. Required when `output` is provided. */ format: ParseOutputFormat; /** * Include word-level OCR data nested inside paragraph and table cell elements. - * Only applicable when `format` is `'spatial'`. Defaults to `false`. + * Only applicable when `format` is `'spatial'`. Defaults to `false` server-side. */ includeWords?: boolean; } @@ -63,24 +71,14 @@ export interface ParseOutputOptions { /** * Additional processing options for `/extraction/parse`. */ -export interface ParseProcessingOptions { - /** - * OCR language hint. Only honoured for `structure`, `understand`, and `agentic` modes. - * - * Accepts: - * - A lowercase language name (`'english'`, `'german'`). - * - An ISO 639-2 code (`'eng'`, `'deu'`). - * - A `+`-joined string for multilingual OCR (`'eng+spa'`). - * - An array of codes (`['eng', 'spa']`). 
- * - * Defaults to `'eng'` server-side. - */ - language?: string | string[]; -} +export type ParseProcessingOptions = Schemas['ProcessingOptions']; /** - * Instruction payload sent to `/extraction/parse`. All fields are optional; an empty - * object resolves to `mode: 'understand'` with spatial output server-side. + * Instruction payload sent to `/extraction/parse`. All fields are optional; an + * empty object resolves to `mode: 'understand'` with spatial output server-side. + * + * Hand-written because the spec's `OutputOptions` makes `includeWords` required; + * see {@link ParseOutputOptions}. */ export interface ParseInstructions { /** @@ -97,211 +95,86 @@ export interface ParseInstructions { * Bounding box of an element on the page. * * `(x, y)` is the top-left corner. The origin is the top-left of the page, with - * x increasing right and y increasing down. Coordinates are in render-space pixels; - * `page.width` and `page.height` describe the same pixel canvas. + * x increasing right and y increasing down. Coordinates are in render-space + * pixels; `page.width` and `page.height` describe the same pixel canvas. */ -export interface ParseBounds { - /** Distance from the page's left edge to the box's left edge, in pixels. */ - x: number; - /** Distance from the page's top edge to the box's top edge, in pixels. */ - y: number; - /** Width of the bounding box in pixels. */ - width: number; - /** Height of the bounding box in pixels. */ - height: number; -} +export type ParseBounds = Schemas['Bounds']; /** - * Source page reference for an extracted element. + * Source page reference for an extracted element. Defines the coordinate space + * that all element bounds on the page are relative to. */ -export interface ParsePageRef { - /** 0-based page index. */ - pageIndex: number; - /** 1-based page number for human-facing labels. */ - pageNumber: number; - /** Page width in render-space pixels (matches the bounds coordinate space). 
*/ - width: number; - /** Page height in render-space pixels. */ - height: number; -} +export type ParsePageRef = Schemas['PageRef']; /** * Word-level OCR result. Included inside `ParagraphElement.words`, * `HandwritingElement.words`, and `ParseTableCell.words` when * `output.includeWords === true`. */ -export interface ParseWord { - /** The word's text. */ - text: string; - bounds: ParseBounds; - /** OCR confidence score in `[0, 1]`. */ - confidence: number; -} +export type ParseWord = Schemas['Word']; + +/** Fields shared by every spatial element. */ +export type ParseElementBase = Schemas['ElementBase']; + +export type ParagraphElement = Schemas['ParagraphElement']; /** * Semantic role of a paragraph element. `null` when the role is undetermined. */ -export type ParagraphRole = - | 'Text' - | 'Title' - | 'SectionHeader' - | 'Header' - | 'Footer' - | 'Caption' - | 'Footnote' - | 'ListItem' - | 'PageNumber' - | 'Code' - | 'CheckboxSelected' - | 'CheckboxUnselected'; +export type ParagraphRole = NonNullable; -/** Fields shared by every spatial element. */ -export interface ParseElementBase { - /** Unique element identifier (UUID). */ - id: string; - bounds: ParseBounds; - /** Detection confidence score in `[0, 1]`. */ - confidence: number; - /** Reading order index within the page. */ - readingOrder: number; - page: ParsePageRef; -} +export type FormulaElement = Schemas['FormulaElement']; -export interface ParagraphElement extends ParseElementBase { - type: 'paragraph'; - role?: ParagraphRole | null; - text: string; - /** Word-level OCR data. Present only when `includeWords` is `true`. */ - words?: ParseWord[] | null; -} - -export interface FormulaElement extends ParseElementBase { - type: 'formula'; - /** LaTeX representation of the formula. */ - latex: string; -} - -export interface PictureElement extends ParseElementBase { - type: 'picture'; - /** Image classification category (e.g. `chart`, `photo`, `diagram`). 
*/ - classification: string; - /** Confidence score for the classification in `[0, 1]`. */ - classificationConfidence: number; - /** AI-generated alternative text. */ - altDescription: string; - /** IDs of associated caption paragraph elements. */ - captionIds?: string[] | null; - /** IDs of associated footnote paragraph elements. */ - footnoteIds?: string[] | null; -} +export type PictureElement = Schemas['PictureElement']; /** A single cell inside a `TableElement`. */ -export interface ParseTableCell { - id: string; - bounds: ParseBounds; - /** Detection confidence score in `[0, 1]`. */ - confidence: number; - /** 0-indexed row. */ - row: number; - /** 0-indexed column. */ - column: number; - /** Number of rows this cell spans. */ - rowSpan: number; - /** Number of columns this cell spans. */ - colSpan: number; - text: string; - /** Word-level OCR data. Present only when `includeWords` is `true`. */ - words?: ParseWord[] | null; -} +export type ParseTableCell = Schemas['TableCell']; -export interface TableElement extends ParseElementBase { - type: 'table'; - rowCount: number; - columnCount: number; - cells: ParseTableCell[]; - captionIds?: string[] | null; - footnoteIds?: string[] | null; -} +export type TableElement = Schemas['TableElement']; /** Question or answer entity within a `KeyValuePair`. */ -export interface KeyValueEntity { - id: string; - bounds: ParseBounds; - /** Detection confidence score in `[0, 1]`. */ - confidence: number; - /** Entity type. The empty string is returned when the role is unclassified. */ - entityType: 'QUESTION' | 'ANSWER' | ''; - /** Extracted value (text or other primitive). */ - value: unknown; -} +export type KeyValueEntity = Schemas['KeyValueEntity']; -export interface KeyValuePair { - id: string; - /** The key/question entity. `null` when only a value was detected. */ - key?: KeyValueEntity | null; - /** The value/answer entity. `null` when only a key was detected. 
*/ - value?: KeyValueEntity | null; - /** Confidence for the key-value relationship in `[0, 1]`. */ - relationshipConfidence?: number | null; -} +export type KeyValuePair = Schemas['KeyValuePair']; -export interface KeyValueRegionElement extends ParseElementBase { - type: 'keyValueRegion'; - pairs: KeyValuePair[]; -} +export type KeyValueRegionElement = Schemas['KeyValueRegionElement']; -export interface HandwritingElement extends ParseElementBase { - type: 'handwriting'; - text: string; - /** Word-level OCR data. Present only when `includeWords` is `true`. */ - words?: ParseWord[] | null; -} +export type HandwritingElement = Schemas['HandwritingElement']; /** * Discriminated union of every spatial element type. Use the `type` field for * narrowing. */ -export type ParseElement = - | ParagraphElement - | FormulaElement - | PictureElement - | TableElement - | KeyValueRegionElement - | HandwritingElement; +export type ParseElement = Schemas['Element']; /** * Processing metrics for a `/extraction/parse` call. */ -export interface ParseMetrics { - processingTimeMs: number; - pagesProcessed: number; -} +export type ParseMetrics = Schemas['Metrics']; /** * Extraction-credit usage for a `/extraction/parse` call. * - * **Extraction credits** are a separate billing bucket from processor API credits; - * an extraction call never debits processor credits and vice-versa. + * **Extraction credits** are a separate billing bucket from processor API + * credits; an extraction call never debits processor credits and vice-versa. * * See {@link ExtractionCredits} for the shape of the billing object. */ -export interface ParseUsage { - data_extraction_credits?: ExtractionCredits; -} +export type ParseUsage = Schemas['Usage']; /** * Echoes the resolved configuration the server used for this request. 
*/ -export interface ParseConfiguration { - mode: ParseMode; - outputFormat: ParseOutputFormat; -} +export type ParseConfiguration = Schemas['Configuration']; /** * Successful `/extraction/parse` response with spatial element output. * - * Narrow on `configuration.outputFormat === 'spatial'` (or `'markdown'` for the - * other variant) to access `output.elements` vs `output.markdown` with type safety. + * Hand-composed over the generated `ParseOutput` schema: the spec marks both + * `elements` and `markdown` as optional on the same object, so callers without + * narrowing have to use `?.` everywhere. These narrowed variants pin one field + * present and the other `undefined`, allowing `if (output.markdown !== undefined)` + * to discriminate cleanly. */ export interface ParseResponseSpatial { status: 200; @@ -336,38 +209,21 @@ export interface ParseResponseMarkdown { } /** - * Discriminated union of every successful `/extraction/parse` response. Narrow on - * `configuration.outputFormat` (or simply branch on `output.markdown` / + * Discriminated union of every successful `/extraction/parse` response. Narrow + * on `configuration.outputFormat` (or simply branch on `output.markdown` / * `output.elements`) to pick between the two output shapes. */ export type ParseResponse = ParseResponseSpatial | ParseResponseMarkdown; -/** - * Path-level error detail returned inside `ParseErrorDetails.failingPaths`. - */ -export interface ParseErrorFailingPath { - /** JSON path to the invalid field (e.g. `$.mode`). */ - path: string; - /** Human-readable validation message. */ - details: string; -} +/** Path-level error detail returned inside `ParseErrorResponse.errorDetails.failingPaths`. */ +export type ParseErrorFailingPath = NonNullable< + NonNullable['failingPaths'] +>[number]; /** * Structured error details returned by the server on validation/processing errors. 
*/ -export interface ParseErrorDetails { - /** - * Error origin: - * - `request` — validation errors (invalid parameters, unsupported format). - * - `processing` — backend processing failures. - * - `maestro` — Maestro engine failures. - */ - source?: string; - /** Machine-readable error code stable enough for client branching. */ - code?: string; - /** Per-field validation errors. Present on validation responses. */ - failingPaths?: ParseErrorFailingPath[]; -} +export type ParseErrorDetails = NonNullable; /** * Error response envelope returned on 4xx/5xx responses from `/extraction/parse`. @@ -375,24 +231,21 @@ export interface ParseErrorDetails { * The TypeScript client surfaces this body as the `details` of the thrown * `APIError` / `ValidationError` / `AuthenticationError`. */ -export interface ParseErrorResponse { - status: number; - requestId: string; - errorMessage: string; - errorDetails?: ParseErrorDetails; -} +export type ParseErrorResponse = Schemas['ParseErrorResponse']; /** * Options accepted by {@link import('../client').NutrientClient.parse | NutrientClient.parse}. * * All fields are optional; the server falls back to `mode: 'understand'` with - * spatial output when nothing is provided. + * spatial output when nothing is provided. Hand-written because `apiVersion` is + * a client-only concern (header override, not body) and the request-side options + * mirror the ergonomic surface in {@link ParseInstructions}. */ export interface ParseOptions { mode?: ParseMode; output?: ParseOutputOptions; /** OCR language hint. See {@link ParseProcessingOptions.language}. */ - language?: string | string[]; + language?: ParseProcessingOptions['language']; /** * Optional API-version override sent as the `x-nutrient-api-version` header. * Defaults to the version pinned at API-key creation time. 
From a04fd7fbe1a5a8f1ad76a673334c0e4e672f8f4b Mon Sep 17 00:00:00 2001 From: nickwinder Date: Thu, 28 May 2026 16:16:38 +1200 Subject: [PATCH 12/13] refactor(types): collapse parse types into http.ts and namespace the spec re-export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most APIs in this client (sign, ocr, watermark, redact, etc.) don't have a dedicated `src/types/<api>.ts` file — they reach types via `components['schemas']['X']` from `src/generated/api-types.ts`. The `src/types/parse.ts` and `src/types/extraction_credits.ts` files added on this branch were an outlier: most of their content was thin one-line aliases over the generated extract spec. Collapse to the rest-of-codebase pattern: - Delete `src/types/parse.ts` (was 254 lines, mostly aliases). - Delete `src/types/extraction_credits.ts` (single hand-rolled interface that duplicated the generated `Usage.data_extraction_credits` shape). - Move the 5 hand-composed types into `src/types/http.ts` (it already imports `ParseInstructions` / `ParseResponse` to type the endpoint maps): `ParseOutputOptions`, `ParseInstructions`, `ParseOptions`, `ParseResponseSpatial`, `ParseResponseMarkdown`, plus the derived `ExtractionCredits` alias. Each carries the JSDoc explaining why it's hand-composed instead of derived. - Drop the 23 cosmetic spec-alias exports from the package root. Consumers who need element-subtype types reach them via the new `extractComponents['schemas']['ParagraphElement']` namespace re-export, mirroring how Processor types are exposed via the existing `components` namespace. The package's public surface still exports the 7 hand-composed types (`ParseOutputOptions`, `ParseInstructions`, `ParseOptions`, `ParseResponse`, `ParseResponseSpatial`, `ParseResponseMarkdown`, `ExtractionCredits`) by name. Internal consumers (`src/client.ts`, the parse unit tests) shift to `extractComponents['schemas']['X']` for spec-derived types.
Net: -290 lines on the type-definition surface, no behaviour change. --- CHANGELOG.md | 12 +- README.md | 20 ++- docs/METHODS.md | 4 +- src/__tests__/unit/parse.test.ts | 10 +- src/client.ts | 4 +- src/index.ts | 38 ++--- src/types/extraction_credits.ts | 18 --- src/types/http.ts | 132 +++++++++++++++- src/types/index.ts | 10 +- src/types/parse.ts | 254 ------------------------------- 10 files changed, 183 insertions(+), 319 deletions(-) delete mode 100644 src/types/extraction_credits.ts delete mode 100644 src/types/parse.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 88c77da..600ed04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,12 +22,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 the whole-document Markdown string directly. - `NutrientClient.parseElements(input, mode?, includeWords?)` — convenience wrapper returning the spatial elements array directly. - - Public types: `ParseMode`, `ParseOutputFormat`, `ParseOutputOptions`, - `ParseInstructions`, `ParseOptions`, `ParseResponse`, `ParseResponseSpatial`, - `ParseResponseMarkdown`, `ParseElement` (and per-element types like - `ParagraphElement`, `TableElement`, `KeyValueRegionElement`), plus error and - metadata shapes (`ParseErrorResponse`, `ParseMetrics`, `ParseUsage`, - `ParseConfiguration`). + - Public types: hand-composed `ParseOutputOptions`, `ParseInstructions`, + `ParseOptions`, `ParseResponse`, `ParseResponseSpatial`, `ParseResponseMarkdown`, + and `ExtractionCredits`. The spec primitives (`Mode`, `Element` and the six + subtypes, `Bounds`, `PageRef`, `Word`, `Metrics`, `Usage`, `Configuration`, + `ParseErrorResponse`, etc.) are accessible via the `extractComponents` + namespace re-export — same pattern as `components` for the Processor spec. - Billing note: `/extraction/parse` debits the account's **extraction credits** bucket, which is separate from the **processor API credits** used by the rest of `NutrientClient`. 
The response surfaces this explicitly in diff --git a/README.md b/README.md index 30b4e72..bcdeea0 100644 --- a/README.md +++ b/README.md @@ -286,9 +286,23 @@ console.log(`Cost: ${usage?.cost} extraction credits`); console.log(`Remaining: ${usage?.remainingCredits} extraction credits`); ``` -The full set of public types — `ExtractionCredits`, `ParseMode`, `ParseElement`, -`ParagraphElement`, `TableElement`, `KeyValueRegionElement`, `ParseResponse`, -etc. — is exported from the package root for downstream typing. +The hand-composed types (`ExtractionCredits`, `ParseOptions`, `ParseInstructions`, +`ParseResponse`, `ParseResponseSpatial`, `ParseResponseMarkdown`, +`ParseOutputOptions`) are exported from the package root. The spec primitives — +`Mode`, `Element` and the six element subtypes, `Bounds`, `PageRef`, `Word`, +`TableCell`, `KeyValuePair`, `KeyValueEntity`, `Metrics`, `Usage`, +`Configuration`, `ParseErrorResponse`, etc. — live under the `extractComponents` +namespace: + +```typescript +import type { extractComponents } from '@nutrient-sdk/dws-client-typescript'; + +type ParagraphElement = extractComponents['schemas']['ParagraphElement']; +type TableElement = extractComponents['schemas']['TableElement']; +``` + +This mirrors how the Processor types are exposed via the existing `components` +namespace. ## Workflow System diff --git a/docs/METHODS.md b/docs/METHODS.md index 9c05fbd..8017b6a 100644 --- a/docs/METHODS.md +++ b/docs/METHODS.md @@ -478,11 +478,11 @@ Falls back to `apiKey` if `extractApiKey` is omitted. Buffers, streams), a URL string, or a `{ type: 'url', url: '...' }` object. The endpoint accepts PDFs, Office documents, and images. 
- `options?: ParseOptions` — Optional configuration: - - `mode: ParseMode` — `'text'` (1 cr/page, born-digital, Markdown only), + - `mode` — `'text'` (1 cr/page, born-digital, Markdown only), `'structure'` (1.5 cr/page, OCR + spatial layout), `'understand'` (9 cr/page, AI-augmented, default), or `'agentic'` (18 cr/page, VLM-augmented). - - `output.format: ParseOutputFormat` — `'spatial'` (typed elements with bounds + - `output.format` — `'spatial'` (typed elements with bounds and confidence) or `'markdown'` (whole-document Markdown string). - `output.includeWords` — include word-level OCR data inside elements. - `language` — OCR language hint (`'eng'`, `'deu'`, `['eng', 'spa']`, etc.). diff --git a/src/__tests__/unit/parse.test.ts b/src/__tests__/unit/parse.test.ts index a9cbb18..dd65f0c 100644 --- a/src/__tests__/unit/parse.test.ts +++ b/src/__tests__/unit/parse.test.ts @@ -1,11 +1,9 @@ import { NutrientClient } from '../../client'; -import type { - ParseResponseMarkdown, - ParseResponseSpatial, - ParagraphElement, - TableElement, -} from '../../types'; +import type { ParseResponseMarkdown, ParseResponseSpatial, extractComponents } from '../../types'; import { NutrientError, ValidationError } from '../../errors'; + +type ParagraphElement = extractComponents['schemas']['ParagraphElement']; +type TableElement = extractComponents['schemas']['TableElement']; import * as inputsModule from '../../inputs'; import * as httpModule from '../../http'; diff --git a/src/client.ts b/src/client.ts index f0daacb..82c51cf 100644 --- a/src/client.ts +++ b/src/client.ts @@ -11,11 +11,11 @@ import type { ParseInstructions, ParseOptions, ParseResponse, - ParseElement, } from './types'; import { ValidationError, NutrientError } from './errors'; import { workflow } from './workflow'; import type { components, operations } from './generated/api-types'; +import type { components as extractComponents } from './generated/extract-types'; import { BuildActions } from './build'; import { 
processFileInput, isRemoteFileInput, getRemoteUrl } from './inputs'; import { sendRequest } from './http'; @@ -2021,7 +2021,7 @@ export class NutrientClient { input: FileInputWithUrl, mode: Exclude = 'structure', includeWords = false, - ): Promise { + ): Promise { const result = await this.parse(input, { mode, output: { format: 'spatial', includeWords }, diff --git a/src/index.ts b/src/index.ts index 40af1d6..7531d67 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,38 +37,26 @@ export type { TypedWorkflowResult, WorkflowDryRunResult, - // Data Extraction (`/extraction/parse`) types + // Data Extraction (`/extraction/parse`) — hand-composed client-facing types. + // Schema primitives (Mode, Element and the six subtypes, Bounds, PageRef, + // Word, Metrics, Usage, Configuration, ParseErrorResponse, etc.) live in the + // `extractComponents` namespace below — same pattern as `components` for the + // Processor spec. ExtractionCredits, - ParseMode, - ParseOutputFormat, ParseOutputOptions, - ParseProcessingOptions, ParseInstructions, ParseOptions, ParseResponse, ParseResponseSpatial, ParseResponseMarkdown, - ParseElement, - ParagraphElement, - ParagraphRole, - FormulaElement, - PictureElement, - TableElement, - ParseTableCell, - KeyValueRegionElement, - KeyValuePair, - KeyValueEntity, - HandwritingElement, - ParseElementBase, - ParseBounds, - ParsePageRef, - ParseWord, - ParseMetrics, - ParseUsage, - ParseConfiguration, - ParseErrorResponse, - ParseErrorDetails, - ParseErrorFailingPath, + + // Generated spec namespaces + components, + operations, + paths, + extractComponents, + extractOperations, + extractPaths, } from './types'; // Utility exports diff --git a/src/types/extraction_credits.ts b/src/types/extraction_credits.ts deleted file mode 100644 index 1dbcd31..0000000 --- a/src/types/extraction_credits.ts +++ /dev/null @@ -1,18 +0,0 @@ -/** - * Extraction-credit usage returned by the Data Extraction API - * (`POST /extraction/parse`). 
- * - * **Extraction credits** are a separate billing bucket from the - * **processor API credits** consumed by `/build`, `/sign`, OCR, and - * every other endpoint on `NutrientClient`. An extraction call never - * debits processor credits and vice-versa. - * - * The server surfaces this object at - * `ParseResponse.usage.data_extraction_credits`. - */ -export interface ExtractionCredits { - /** Extraction credits consumed by this request. */ - cost: number; - /** Remaining extraction credits in the account after this request. */ - remainingCredits: number; -} diff --git a/src/types/http.ts b/src/types/http.ts index 2420e10..778ee02 100644 --- a/src/types/http.ts +++ b/src/types/http.ts @@ -1,8 +1,138 @@ import type { components, operations } from '../generated/api-types'; +import type { components as extractComponents } from '../generated/extract-types'; import type { NormalizedFileData } from '../inputs'; -import type { ParseInstructions, ParseResponse } from './parse'; import type { ValueOf } from '@typescript-eslint/eslint-plugin/dist/util'; +type ExtractSchemas = extractComponents['schemas']; + +// ───────────────────────────────────────────────────────────────────────────── +// `/extraction/parse` — hand-composed request and response types +// +// The schema primitives (Mode, OutputFormat, Element and the six element +// subtypes, Bounds, PageRef, Word, TableCell, KeyValuePair, KeyValueEntity, +// Metrics, Usage, Configuration, ParseErrorResponse) live in the generated +// extract-types and are accessible to consumers via the `extractComponents` +// re-export from the package root. The types defined below are the four +// shapes the spec doesn't express on its own: +// +// - `ParseOutputOptions` / `ParseInstructions` — spec marks +// `OutputOptions.includeWords` as required but the server defaults it. 
+// - `ParseResponseSpatial` / `ParseResponseMarkdown` — cross-field +// discriminated narrowing so `if (output.markdown !== undefined)` works +// without per-call `?.` access. +// - `ParseOptions` — adds the client-only `apiVersion` header concern that +// isn't a body field in the spec. +// - `ExtractionCredits` — derived alias for the billing-bucket sub-shape. +// ───────────────────────────────────────────────────────────────────────────── + +/** + * Extraction-credit usage returned by the Data Extraction API + * (`POST /extraction/parse`). + * + * **Extraction credits** are a separate billing bucket from the **processor + * API credits** consumed by `/build`, `/sign`, OCR, and every other endpoint + * on `NutrientClient`. An extraction call never debits processor credits and + * vice-versa. The server surfaces this object at + * `ParseResponse.usage.data_extraction_credits`. + */ +export type ExtractionCredits = NonNullable<ExtractSchemas['Usage']['data_extraction_credits']>; + +/** + * Output configuration for `/extraction/parse`. + * + * Defaults: `text` mode emits `markdown`; `structure`, `understand`, and + * `agentic` emit `spatial`. `includeWords` defaults to `false` server-side and + * is only honoured when `format` is `'spatial'`. Hand-written because the spec + * marks `includeWords` as required. + */ +export interface ParseOutputOptions { + /** Output format. */ + format: ExtractSchemas['Configuration']['outputFormat']; + /** + * Include word-level OCR data nested inside paragraph and table cell + * elements. Only applicable when `format` is `'spatial'`. + */ + includeWords?: boolean; +} + +/** + * Instruction payload sent to `/extraction/parse`. All fields are optional; an + * empty object resolves to `mode: 'understand'` with spatial output server-side. + */ +export interface ParseInstructions { + /** + * URL of a remote document to parse. Used by the JSON request shape; when + * passing a local file or buffer, omit this field.
+ */ + url?: string; + mode?: ExtractSchemas['Mode']; + output?: ParseOutputOptions; + options?: ExtractSchemas['ProcessingOptions']; +} + +/** + * Options accepted by `NutrientClient.parse()`. Hand-written because + * `apiVersion` is a client-only header override, not a body field in the spec. + */ +export interface ParseOptions { + mode?: ExtractSchemas['Mode']; + output?: ParseOutputOptions; + /** OCR language hint. Only honoured for `structure` / `understand` / `agentic` modes. */ + language?: ExtractSchemas['ProcessingOptions']['language']; + /** + * Optional API-version override sent as the `x-nutrient-api-version` header. + * Defaults to the version pinned at API-key creation time. + */ + apiVersion?: string; +} + +/** + * Successful `/extraction/parse` response with spatial element output. + * + * Hand-composed over the generated `ParseOutput` schema: the spec marks both + * `elements` and `markdown` as optional on the same object, forcing `?.` access + * at every call site. These narrowed variants pin one field present and the + * other `undefined`, allowing `if (output.markdown !== undefined)` to + * discriminate cleanly. + */ +export interface ParseResponseSpatial { + status: 200; + /** Unique request identifier for debugging and support. */ + requestId: string; + output: { + elements: ExtractSchemas['Element'][]; + markdown?: undefined; + }; + metrics: ExtractSchemas['Metrics']; + usage?: ExtractSchemas['Usage']; + configuration: ExtractSchemas['Configuration'] & { outputFormat: 'spatial' }; +} + +/** Successful `/extraction/parse` response with whole-document Markdown output. */ +export interface ParseResponseMarkdown { + status: 200; + /** Unique request identifier for debugging and support. 
*/ + requestId: string; + output: { + markdown: string; + elements?: undefined; + }; + metrics: ExtractSchemas['Metrics']; + usage?: ExtractSchemas['Usage']; + configuration: ExtractSchemas['Configuration'] & { outputFormat: 'markdown' }; +} + +/** + * Discriminated union of every successful `/extraction/parse` response. Narrow + * on `configuration.outputFormat` (or simply branch on `output.markdown` / + * `output.elements`) to pick between the two output shapes. + */ +export type ParseResponse = ParseResponseSpatial | ParseResponseMarkdown; + +// ───────────────────────────────────────────────────────────────────────────── +// Endpoint request/response type maps +// ───────────────────────────────────────────────────────────────────────────── + export type RequestTypeMap = { GET: { '/account/info': undefined; diff --git a/src/types/index.ts b/src/types/index.ts index b47b66e..757ea2a 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -2,7 +2,13 @@ export * from './common'; export * from './inputs'; export * from './workflow'; export * from './http'; -export * from './extraction_credits'; -export * from './parse'; // Re-export generated types for convenience export type { components, operations, paths } from '../generated/api-types'; +// Re-export Data Extraction (`/extraction/parse`) spec types under a namespace +// so consumers can access element subtypes, schemas, and operations without a +// name collision with the Processor types above. 
+export type { + components as extractComponents, + operations as extractOperations, + paths as extractPaths, +} from '../generated/extract-types'; diff --git a/src/types/parse.ts b/src/types/parse.ts deleted file mode 100644 index 2961c80..0000000 --- a/src/types/parse.ts +++ /dev/null @@ -1,254 +0,0 @@ -import type { components } from '../generated/extract-types'; -import type { ExtractionCredits } from './extraction_credits'; - -export type { ExtractionCredits }; - -/** - * Type definitions for the Nutrient Data Extraction API (`POST /extraction/parse`). - * - * The primitive schemas (element types, bounds, page references, request options) - * are derived from `src/generated/extract-types.ts`, which is generated from - * `dws-data-extraction-spec.yml` by `npm run generate:types:extract`. The - * narrowed response unions (`ParseResponseSpatial` / `ParseResponseMarkdown`) - * and the client-facing `ParseOptions` surface are hand-composed on top. - * - * Billing note: `/extraction/parse` is billed against **extraction credits**, a - * bucket separate from the **processor API credits** consumed by `/build`, - * `/sign`, OCR, conversion, and the other endpoints on `NutrientClient`. The - * response surfaces this explicitly in `usage.data_extraction_credits`. - * - * @see ParseResponse for the full response shape - * @see ParseMode for the four processing pipelines - */ - -type Schemas = components['schemas']; - -/** - * Processing pipeline for `/extraction/parse`. - * - * Each mode bills a different amount of **extraction credits** per page, drawn - * from the account's extraction-credits bucket (separate from processor API - * credits). - * - * - `text` — Plain text extraction. Markdown output only. 1 extraction credit/page. - * - `structure` — OCR-backed structured extraction with spatial element output. 1.5 extraction credits/page. - * - `understand` — Deeper document analysis with semantic enrichment. 9 extraction credits/page. 
(Default) - * - `agentic` — VLM-augmented extraction for complex documents needing visual reasoning. 18 extraction credits/page. - */ -export type ParseMode = Schemas['Mode']; - -/** - * Output format for `/extraction/parse`. - * - * - `spatial` — Flat list of typed elements (paragraph, table, formula, picture, - * keyValueRegion, handwriting) with bounding boxes, confidence, reading order, - * and page references. Not available with `mode: 'text'`. - * - `markdown` — Whole-document Markdown representation, suited for RAG, search - * indexing, and content pipelines. - */ -export type ParseOutputFormat = Schemas['Configuration']['outputFormat']; - -/** - * Output configuration for `/extraction/parse`. - * - * Defaults: `text` mode emits `markdown`; `structure`, `understand`, and `agentic` - * emit `spatial`. `includeWords` defaults to `false` server-side and is only - * honoured when `format` is `'spatial'`. - * - * Hand-written (not derived from the spec) because the spec marks `includeWords` - * as required; in practice it has a server-side default and clients omit it. - */ -export interface ParseOutputOptions { - /** Output format. Required when `output` is provided. */ - format: ParseOutputFormat; - /** - * Include word-level OCR data nested inside paragraph and table cell elements. - * Only applicable when `format` is `'spatial'`. Defaults to `false` server-side. - */ - includeWords?: boolean; -} - -/** - * Additional processing options for `/extraction/parse`. - */ -export type ParseProcessingOptions = Schemas['ProcessingOptions']; - -/** - * Instruction payload sent to `/extraction/parse`. All fields are optional; an - * empty object resolves to `mode: 'understand'` with spatial output server-side. - * - * Hand-written because the spec's `OutputOptions` makes `includeWords` required; - * see {@link ParseOutputOptions}. - */ -export interface ParseInstructions { - /** - * URL of a remote document to parse. 
Used by the JSON request shape; when - * passing a local file or buffer, omit this field. - */ - url?: string; - mode?: ParseMode; - output?: ParseOutputOptions; - options?: ParseProcessingOptions; -} - -/** - * Bounding box of an element on the page. - * - * `(x, y)` is the top-left corner. The origin is the top-left of the page, with - * x increasing right and y increasing down. Coordinates are in render-space - * pixels; `page.width` and `page.height` describe the same pixel canvas. - */ -export type ParseBounds = Schemas['Bounds']; - -/** - * Source page reference for an extracted element. Defines the coordinate space - * that all element bounds on the page are relative to. - */ -export type ParsePageRef = Schemas['PageRef']; - -/** - * Word-level OCR result. Included inside `ParagraphElement.words`, - * `HandwritingElement.words`, and `ParseTableCell.words` when - * `output.includeWords === true`. - */ -export type ParseWord = Schemas['Word']; - -/** Fields shared by every spatial element. */ -export type ParseElementBase = Schemas['ElementBase']; - -export type ParagraphElement = Schemas['ParagraphElement']; - -/** - * Semantic role of a paragraph element. `null` when the role is undetermined. - */ -export type ParagraphRole = NonNullable; - -export type FormulaElement = Schemas['FormulaElement']; - -export type PictureElement = Schemas['PictureElement']; - -/** A single cell inside a `TableElement`. */ -export type ParseTableCell = Schemas['TableCell']; - -export type TableElement = Schemas['TableElement']; - -/** Question or answer entity within a `KeyValuePair`. */ -export type KeyValueEntity = Schemas['KeyValueEntity']; - -export type KeyValuePair = Schemas['KeyValuePair']; - -export type KeyValueRegionElement = Schemas['KeyValueRegionElement']; - -export type HandwritingElement = Schemas['HandwritingElement']; - -/** - * Discriminated union of every spatial element type. Use the `type` field for - * narrowing. 
- */ -export type ParseElement = Schemas['Element']; - -/** - * Processing metrics for a `/extraction/parse` call. - */ -export type ParseMetrics = Schemas['Metrics']; - -/** - * Extraction-credit usage for a `/extraction/parse` call. - * - * **Extraction credits** are a separate billing bucket from processor API - * credits; an extraction call never debits processor credits and vice-versa. - * - * See {@link ExtractionCredits} for the shape of the billing object. - */ -export type ParseUsage = Schemas['Usage']; - -/** - * Echoes the resolved configuration the server used for this request. - */ -export type ParseConfiguration = Schemas['Configuration']; - -/** - * Successful `/extraction/parse` response with spatial element output. - * - * Hand-composed over the generated `ParseOutput` schema: the spec marks both - * `elements` and `markdown` as optional on the same object, so callers without - * narrowing have to use `?.` everywhere. These narrowed variants pin one field - * present and the other `undefined`, allowing `if (output.markdown !== undefined)` - * to discriminate cleanly. - */ -export interface ParseResponseSpatial { - status: 200; - /** Unique request identifier for debugging and support. */ - requestId: string; - output: { - elements: ParseElement[]; - /** Always absent on spatial responses. Kept on the shape so consumers can use - * a single `output` property without conditional access. */ - markdown?: undefined; - }; - metrics: ParseMetrics; - usage?: ParseUsage; - configuration: ParseConfiguration & { outputFormat: 'spatial' }; -} - -/** - * Successful `/extraction/parse` response with whole-document Markdown output. - */ -export interface ParseResponseMarkdown { - status: 200; - /** Unique request identifier for debugging and support. */ - requestId: string; - output: { - markdown: string; - /** Always absent on markdown responses. 
*/ - elements?: undefined; - }; - metrics: ParseMetrics; - usage?: ParseUsage; - configuration: ParseConfiguration & { outputFormat: 'markdown' }; -} - -/** - * Discriminated union of every successful `/extraction/parse` response. Narrow - * on `configuration.outputFormat` (or simply branch on `output.markdown` / - * `output.elements`) to pick between the two output shapes. - */ -export type ParseResponse = ParseResponseSpatial | ParseResponseMarkdown; - -/** Path-level error detail returned inside `ParseErrorResponse.errorDetails.failingPaths`. */ -export type ParseErrorFailingPath = NonNullable< - NonNullable['failingPaths'] ->[number]; - -/** - * Structured error details returned by the server on validation/processing errors. - */ -export type ParseErrorDetails = NonNullable; - -/** - * Error response envelope returned on 4xx/5xx responses from `/extraction/parse`. - * - * The TypeScript client surfaces this body as the `details` of the thrown - * `APIError` / `ValidationError` / `AuthenticationError`. - */ -export type ParseErrorResponse = Schemas['ParseErrorResponse']; - -/** - * Options accepted by {@link import('../client').NutrientClient.parse | NutrientClient.parse}. - * - * All fields are optional; the server falls back to `mode: 'understand'` with - * spatial output when nothing is provided. Hand-written because `apiVersion` is - * a client-only concern (header override, not body) and the request-side options - * mirror the ergonomic surface in {@link ParseInstructions}. - */ -export interface ParseOptions { - mode?: ParseMode; - output?: ParseOutputOptions; - /** OCR language hint. See {@link ParseProcessingOptions.language}. */ - language?: ParseProcessingOptions['language']; - /** - * Optional API-version override sent as the `x-nutrient-api-version` header. - * Defaults to the version pinned at API-key creation time. 
- */ - apiVersion?: string; -} From 97fe278149cdc04c496ccb16a01340c64ef67d3f Mon Sep 17 00:00:00 2001 From: nickwinder Date: Thu, 28 May 2026 16:17:40 +1200 Subject: [PATCH 13/13] fix: address code-review findings on the Data Extraction surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five findings from review: 1. Empty-string `extractApiKey` bypassed constructor validation. `apiKey` uses `!options.apiKey` (falsy, catches `''`); the new `extractApiKey` validator only checked `!== undefined` plus the type guard, so `extractApiKey: ''` passed, propagated into the per-call options as `apiKey: ''`, and produced `Authorization: Bearer ` with no token — surfacing as a confusing server-side 401 instead of a constructor-time `ValidationError`. Add an explicit empty-string check. 2. `extractErrorMessage` in `src/http.ts` checked snake_case (`error_message`, `error_description`) and generic message fields but not `errorMessage` (camelCase) — the field DWS Extract returns on every 4xx/5xx. Result: the server's specific message (e.g. `"invalid mode: 'vlm'"`) was silently replaced by the generic `HTTP <status>: <statusText>` string. Add `errorMessage` to the priority list. 3. `parse()` accepted `mode='text' + output.format='spatial'` and let the server reject with 400. The Python sibling client adds a client-side `ValidationError` for this case (after reviewer feedback). The TS `parseElements()` wrapper blocked it at the type level via `Exclude`, but the low-level `parse()` did not. Add a pre-flight runtime guard. 4. `RequestTypeMap` JSDoc on `/extraction/parse` claimed `instructions` was optional for multipart upload, but the type definition marks it required and the implementation always passes it (an empty object when no options are supplied). Update the comment to match the type. 5. `parse()` `@param options.language` JSDoc described the field as "string or array of ISO 639-2 codes". 
The underlying spec also accepts lowercase language names (`'english'`, `'german'`) and `+`-joined multilingual strings (`'eng+spa'`). Document all four accepted forms. Adds three unit tests (empty-string `extractApiKey`, `errorMessage` extraction, text+spatial pre-flight rejection). 292 tests pass. --- src/__tests__/unit/client.test.ts | 9 ++++++++ src/__tests__/unit/http.test.ts | 29 +++++++++++++++++++++++++ src/__tests__/unit/parse.test.ts | 10 +++++++++ src/client.ts | 36 +++++++++++++++++++++++-------- src/http.ts | 4 ++++ src/types/http.ts | 5 +++-- 6 files changed, 82 insertions(+), 11 deletions(-) diff --git a/src/__tests__/unit/client.test.ts b/src/__tests__/unit/client.test.ts index 956a0d2..bc2eff1 100644 --- a/src/__tests__/unit/client.test.ts +++ b/src/__tests__/unit/client.test.ts @@ -264,6 +264,15 @@ describe('NutrientClient', () => { }), ).toThrow('Extract API key must be a string or a function that returns a Promise'); }); + + it('should throw ValidationError for empty-string extractApiKey', () => { + expect( + () => new NutrientClient({ apiKey: 'processor-key', extractApiKey: '' }), + ).toThrow(ValidationError); + expect( + () => new NutrientClient({ apiKey: 'processor-key', extractApiKey: '' }), + ).toThrow('Extract API key must not be an empty string'); + }); }); describe('workflow()', () => { diff --git a/src/__tests__/unit/http.test.ts b/src/__tests__/unit/http.test.ts index d60bc2f..a13619a 100644 --- a/src/__tests__/unit/http.test.ts +++ b/src/__tests__/unit/http.test.ts @@ -318,6 +318,35 @@ describe('HTTP Layer', () => { }); }); + it('should surface the camelCase errorMessage field from Extract API errors', async () => { + // DWS Extract returns `errorMessage` (camelCase) on every 4xx/5xx, not `message`. + const mockResponse = { + data: { + status: 400, + requestId: 'req_err_001', + errorMessage: "invalid mode: 'vlm'. 
Expected: text, structure, understand, agentic", + errorDetails: { source: 'request', code: 'invalid_request' }, + }, + status: 400, + statusText: 'Bad Request', + headers: {}, + }; + + mockedAxios.mockResolvedValueOnce(mockResponse); + + const config: RequestConfig<'POST', '/extraction/parse'> = { + endpoint: '/extraction/parse', + method: 'POST', + data: { instructions: {} }, + }; + + await expect(sendRequest(config, mockClientOptions, 'json')).rejects.toMatchObject({ + name: 'ValidationError', + message: "invalid mode: 'vlm'. Expected: text, structure, understand, agentic", + statusCode: 400, + }); + }); + it('should handle network errors', async () => { const networkError = { isAxiosError: true, diff --git a/src/__tests__/unit/parse.test.ts b/src/__tests__/unit/parse.test.ts index dd65f0c..76f5bb6 100644 --- a/src/__tests__/unit/parse.test.ts +++ b/src/__tests__/unit/parse.test.ts @@ -304,6 +304,16 @@ describe('NutrientClient.parse()', () => { await expect(makeClient().parse('missing.pdf')).rejects.toBeInstanceOf(ValidationError); expect(mockSendRequest).not.toHaveBeenCalled(); }); + + it("rejects mode='text' + output.format='spatial' before the network call", async () => { + // text mode emits markdown only — the server returns 400 for this combination. + // The client should surface a ValidationError without a network round-trip. 
+ await expect( + makeClient().parse('document.pdf', { mode: 'text', output: { format: 'spatial' } }), + ).rejects.toBeInstanceOf(ValidationError); + expect(mockSendRequest).not.toHaveBeenCalled(); + expect(mockProcessFileInput).not.toHaveBeenCalled(); + }); }); describe('Data Extraction API key routing', () => { diff --git a/src/client.ts b/src/client.ts index 82c51cf..c63b808 100644 --- a/src/client.ts +++ b/src/client.ts @@ -123,14 +123,18 @@ export class NutrientClient { throw new ValidationError('Base URL must be a string'); } - if ( - options.extractApiKey !== undefined && - typeof options.extractApiKey !== 'string' && - typeof options.extractApiKey !== 'function' - ) { - throw new ValidationError( - 'Extract API key must be a string or a function that returns a Promise', - ); + if (options.extractApiKey !== undefined) { + if ( + typeof options.extractApiKey !== 'string' && + typeof options.extractApiKey !== 'function' + ) { + throw new ValidationError( + 'Extract API key must be a string or a function that returns a Promise', + ); + } + if (options.extractApiKey === '') { + throw new ValidationError('Extract API key must not be an empty string'); + } } } @@ -1865,7 +1869,9 @@ export class NutrientClient { * - `mode` — processing pipeline (`'text'` | `'structure'` | `'understand'` | `'agentic'`). * - `output.format` — `'spatial'` for typed elements or `'markdown'` for Markdown. * - `output.includeWords` — include word-level OCR data inside elements. - * - `language` — OCR language hint (string or array of ISO 639-2 codes). + * - `language` — OCR language hint. Accepts a lowercase language name + * (`'english'`, `'german'`), an ISO 639-2 code (`'eng'`, `'deu'`), an + * array (`['eng', 'spa']`), or a `+`-joined multilingual string (`'eng+spa'`). * - `apiVersion` — optional API-version header override. * @returns Promise resolving to the full `/extraction/parse` response envelope. 
* Narrow on `output.markdown` / `output.elements` for type-safe field access, @@ -1912,6 +1918,18 @@ export class NutrientClient { * ``` */ async parse(input: FileInputWithUrl, options?: ParseOptions): Promise { + // `text` mode emits markdown only — the server rejects this combination + // with a 400. Reject client-side so the caller gets a clear error without + // a network round-trip. Note: `parseElements()` blocks this at the type + // level via `Exclude`, but the low-level + // `parse()` accepts any combination, so the runtime guard is needed here. + if (options?.mode === 'text' && options?.output?.format === 'spatial') { + throw new ValidationError( + "mode='text' is not supported with output.format='spatial'. " + + "Use output.format='markdown', or switch to mode='structure' / 'understand' / 'agentic' for spatial elements.", + ); + } + const instructions: ParseInstructions = {}; if (options?.mode !== undefined) instructions.mode = options.mode; if (options?.output !== undefined) instructions.output = options.output; diff --git a/src/http.ts b/src/http.ts index fd0d0c0..4704c71 100644 --- a/src/http.ts +++ b/src/http.ts @@ -284,6 +284,10 @@ function extractErrorMessage(data: unknown): string | null { if (typeof errorData['error_message'] === 'string') { return errorData['error_message']; } + // DWS Extract uses `errorMessage` (camelCase) on every 4xx/5xx response. + if (typeof errorData['errorMessage'] === 'string') { + return errorData['errorMessage']; + } // Common error message fields if (typeof errorData['message'] === 'string') { diff --git a/src/types/http.ts b/src/types/http.ts index 778ee02..a5a1c5e 100644 --- a/src/types/http.ts +++ b/src/types/http.ts @@ -158,8 +158,9 @@ export type RequestTypeMap = { }; '/tokens': components['schemas']['CreateAuthTokenParameters']; /** - * `/extraction/parse` request body. Use exactly one of: - * - `file` + optional `instructions` for multipart upload (local files, buffers, streams). 
+ * `/extraction/parse` request body. `instructions` is always sent (callers + * may pass an empty object for server defaults). Use exactly one of: + * - `file` + `instructions` for multipart upload (local files, buffers, streams). * - `instructions.url` only for URL-based input (sent as `application/json`). */ '/extraction/parse': {