Skip to content

Commit b8d0fc3

Browse files
authored
Set 'easyocr' for docling-standard pipeline (opendatahub-io#72)
##
1 parent bd734df commit b8d0fc3

6 files changed

Lines changed: 14 additions & 14 deletions

File tree

kubeflow-pipelines/common/components.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def download_docling_models(
200200
progress=True,
201201
with_layout=True,
202202
with_tableformer=True,
203-
with_easyocr=False,
203+
with_easyocr=True,
204204
)
205205
elif pipeline_type == "vlm" and remote_model_endpoint_enabled:
206206
# VLM pipeline with remote model endpoint: Download minimal required models

kubeflow-pipelines/docling-standard/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ The following configuration options are available as KFP parameters when you _Cr
1818
- `docling_image_export_mode`: Image export mode for the document. In `embedded` mode, the image is embedded as a base64-encoded string. With `placeholder`, only the position of the image is marked in the output. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.
1919
- `docling_num_threads`: Number of threads to be used internally by the Docling engine.
2020
- `docling_ocr`: If enabled, the bitmap content will be processed using OCR.
21-
- `docling_ocr_engine`: The OCR engine to use. `tesseract`, `tesserocr`, or `rapidocr`.
21+
- `docling_ocr_engine`: The OCR engine to use. Currently the only available value is `easyocr`.
2222
- `docling_pdf_backend`: The PDF backend to use. `pypdfium2`, `dlparse_v1`, `dlparse_v2`, or `dlparse_v4`.
2323
- `docling_table_mode`: The mode to use in the table structure model. `accurate` or `fast`.
2424
- `docling_timeout_per_document`: Timeout for each single document conversion.

kubeflow-pipelines/docling-standard/standard_components.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def docling_convert_standard(
2424
timeout_per_document: int = 300,
2525
ocr: bool = True,
2626
force_ocr: bool = False,
27-
ocr_engine: str = "tesseract",
27+
ocr_engine: str = "easyocr",
2828
allow_external_plugins: bool = False,
2929
enrich_code: bool = False,
3030
enrich_formula: bool = False,

kubeflow-pipelines/docling-standard/standard_convert_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def convert_pipeline(
2828
docling_timeout_per_document: int = 300,
2929
docling_ocr: bool = True,
3030
docling_force_ocr: bool = False,
31-
docling_ocr_engine: str = "tesseract",
31+
docling_ocr_engine: str = "easyocr",
3232
docling_allow_external_plugins: bool = False,
3333
docling_enrich_code: bool = False,
3434
docling_enrich_formula: bool = False,

kubeflow-pipelines/docling-standard/standard_convert_pipeline_compiled.yaml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# docling_image_export_mode: str [Default: 'embedded']
1212
# docling_num_threads: int [Default: 4.0]
1313
# docling_ocr: bool [Default: True]
14-
# docling_ocr_engine: str [Default: 'tesseract']
14+
# docling_ocr_engine: str [Default: 'easyocr']
1515
# docling_pdf_backend: str [Default: 'dlparse_v4']
1616
# docling_table_mode: str [Default: 'accurate']
1717
# docling_timeout_per_document: int [Default: 300.0]
@@ -98,7 +98,7 @@ components:
9898
isOptional: true
9999
parameterType: BOOLEAN
100100
ocr_engine:
101-
defaultValue: tesseract
101+
defaultValue: easyocr
102102
description: Engine to use for OCR.
103103
isOptional: true
104104
parameterType: STRING
@@ -327,7 +327,7 @@ deploymentSpec:
327327
\ image_export_mode: str = \"embedded\",\n table_mode: str = \"accurate\"\
328328
,\n num_threads: int = 4,\n timeout_per_document: int = 300,\n \
329329
\ ocr: bool = True,\n force_ocr: bool = False,\n ocr_engine: str =\
330-
\ \"tesseract\",\n allow_external_plugins: bool = False,\n enrich_code:\
330+
\ \"easyocr\",\n allow_external_plugins: bool = False,\n enrich_code:\
331331
\ bool = False,\n enrich_formula: bool = False,\n enrich_picture_classes:\
332332
\ bool = False,\n enrich_picture_description: bool = False,\n):\n \
333333
\ \"\"\"\n Convert a list of PDF files to JSON and Markdown using Docling\
@@ -476,9 +476,9 @@ deploymentSpec:
476476
\n if pipeline_type == \"standard\":\n # Standard pipeline: download\
477477
\ traditional models\n download_models(\n output_dir=output_path_p,\n\
478478
\ progress=True,\n with_layout=True,\n \
479-
\ with_tableformer=True,\n with_easyocr=False,\n )\n \
480-
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
481-
\ # VLM pipeline with remote model endpoint: Download minimal required\
479+
\ with_tableformer=True,\n with_easyocr=True,\n )\n \
480+
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
481+
\ # VLM pipeline with remote model endpoint: Download minimal required\
482482
\ models\n # Only models set are what lives in fabianofranz repo\n\
483483
\ # TODO: figure out what needs to be downloaded or removed\n \
484484
\ download_models(\n output_dir=output_path_p,\n \
@@ -742,7 +742,7 @@ root:
742742
isOptional: true
743743
parameterType: BOOLEAN
744744
docling_ocr_engine:
745-
defaultValue: tesseract
745+
defaultValue: easyocr
746746
isOptional: true
747747
parameterType: STRING
748748
docling_pdf_backend:

kubeflow-pipelines/docling-vlm/vlm_convert_pipeline_compiled.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -389,9 +389,9 @@ deploymentSpec:
389389
\n if pipeline_type == \"standard\":\n # Standard pipeline: download\
390390
\ traditional models\n download_models(\n output_dir=output_path_p,\n\
391391
\ progress=True,\n with_layout=True,\n \
392-
\ with_tableformer=True,\n with_easyocr=False,\n )\n \
393-
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
394-
\ # VLM pipeline with remote model endpoint: Download minimal required\
392+
\ with_tableformer=True,\n with_easyocr=True,\n )\n \
393+
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
394+
\ # VLM pipeline with remote model endpoint: Download minimal required\
395395
\ models\n # Only models set are what lives in fabianofranz repo\n\
396396
\ # TODO: figure out what needs to be downloaded or removed\n \
397397
\ download_models(\n output_dir=output_path_p,\n \

0 commit comments

Comments
 (0)