Set 'easyocr' for docling-standard pipeline (opendatahub-io#72)

alimaredia · web-flow · commit b8d0fc3dd3e3 · 2025-12-19T19:09:59.000Z
##
diff --git a/kubeflow-pipelines/common/components.py b/kubeflow-pipelines/common/components.py
@@ -200,7 +200,7 @@ def download_docling_models(
             progress=True,
             with_layout=True,
             with_tableformer=True,
-            with_easyocr=False,
+            with_easyocr=True,
         )
     elif pipeline_type == "vlm" and remote_model_endpoint_enabled:
         # VLM pipeline with remote model endpoint: Download minimal required models
diff --git a/kubeflow-pipelines/docling-standard/README.md b/kubeflow-pipelines/docling-standard/README.md
@@ -18,7 +18,7 @@ The following configuration options are available as KFP parameters when you _Cr
 - `docling_image_export_mode`: Image export mode for the document. In `embedded` mode, the image is embedded as base64 encoded string. With `placeholder`, only the position of the image is marked in the output. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.
 - `docling_num_threads`: Number of threads to be used internally by the Docling engine.
 - `docling_ocr`: If enabled, the bitmap content will be processed using OCR.
-- `docling_ocr_engine`: The OCR engine to use. `tesseract`, `tesserocr`, or `rapidocr`.
+- `docling_ocr_engine`: The OCR engine to use. Currently the only supported value is `easyocr`.
 - `docling_pdf_backend`: The PDF backend to use. `pypdfium2`, `dlparse_v1`, `dlparse_v2`, or `dlparse_v4`.
 - `docling_table_mode`: The mode to use in the table structure model. `accurate` or `fast`.
 - `docling_timeout_per_document`: Timeout for each single document conversion.
diff --git a/kubeflow-pipelines/docling-standard/standard_components.py b/kubeflow-pipelines/docling-standard/standard_components.py
@@ -24,7 +24,7 @@ def docling_convert_standard(
     timeout_per_document: int = 300,
     ocr: bool = True,
     force_ocr: bool = False,
-    ocr_engine: str = "tesseract",
+    ocr_engine: str = "easyocr",
     allow_external_plugins: bool = False,
     enrich_code: bool = False,
     enrich_formula: bool = False,
diff --git a/kubeflow-pipelines/docling-standard/standard_convert_pipeline.py b/kubeflow-pipelines/docling-standard/standard_convert_pipeline.py
@@ -28,7 +28,7 @@ def convert_pipeline(
     docling_timeout_per_document: int = 300,
     docling_ocr: bool = True,
     docling_force_ocr: bool = False,
-    docling_ocr_engine: str = "tesseract",
+    docling_ocr_engine: str = "easyocr",
     docling_allow_external_plugins: bool = False,
     docling_enrich_code: bool = False,
     docling_enrich_formula: bool = False,
diff --git a/kubeflow-pipelines/docling-standard/standard_convert_pipeline_compiled.yaml b/kubeflow-pipelines/docling-standard/standard_convert_pipeline_compiled.yaml
@@ -11,7 +11,7 @@
 #    docling_image_export_mode: str [Default: 'embedded']
 #    docling_num_threads: int [Default: 4.0]
 #    docling_ocr: bool [Default: True]
-#    docling_ocr_engine: str [Default: 'tesseract']
+#    docling_ocr_engine: str [Default: 'easyocr']
 #    docling_pdf_backend: str [Default: 'dlparse_v4']
 #    docling_table_mode: str [Default: 'accurate']
 #    docling_timeout_per_document: int [Default: 300.0]
@@ -98,7 +98,7 @@ components:
           isOptional: true
           parameterType: BOOLEAN
         ocr_engine:
-          defaultValue: tesseract
+          defaultValue: easyocr
           description: Engine to use for OCR.
           isOptional: true
           parameterType: STRING
@@ -327,7 +327,7 @@ deploymentSpec:
           \    image_export_mode: str = \"embedded\",\n    table_mode: str = \"accurate\"\
           ,\n    num_threads: int = 4,\n    timeout_per_document: int = 300,\n   \
           \ ocr: bool = True,\n    force_ocr: bool = False,\n    ocr_engine: str =\
-          \ \"tesseract\",\n    allow_external_plugins: bool = False,\n    enrich_code:\
+          \ \"easyocr\",\n    allow_external_plugins: bool = False,\n    enrich_code:\
           \ bool = False,\n    enrich_formula: bool = False,\n    enrich_picture_classes:\
           \ bool = False,\n    enrich_picture_description: bool = False,\n):\n   \
           \ \"\"\"\n    Convert a list of PDF files to JSON and Markdown using Docling\
@@ -476,9 +476,9 @@ deploymentSpec:
           \n    if pipeline_type == \"standard\":\n        # Standard pipeline: download\
           \ traditional models\n        download_models(\n            output_dir=output_path_p,\n\
           \            progress=True,\n            with_layout=True,\n           \
-          \ with_tableformer=True,\n            with_easyocr=False,\n        )\n \
-          \   elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
-          \       # VLM pipeline with remote model endpoint: Download minimal required\
+          \ with_tableformer=True,\n            with_easyocr=True,\n        )\n  \
+          \  elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n  \
+          \      # VLM pipeline with remote model endpoint: Download minimal required\
           \ models\n        # Only models set are what lives in fabianofranz repo\n\
           \        # TODO: figure out what needs to be downloaded or removed\n   \
           \     download_models(\n            output_dir=output_path_p,\n        \
@@ -742,7 +742,7 @@ root:
         isOptional: true
         parameterType: BOOLEAN
       docling_ocr_engine:
-        defaultValue: tesseract
+        defaultValue: easyocr
         isOptional: true
         parameterType: STRING
       docling_pdf_backend:
diff --git a/kubeflow-pipelines/docling-vlm/vlm_convert_pipeline_compiled.yaml b/kubeflow-pipelines/docling-vlm/vlm_convert_pipeline_compiled.yaml
@@ -389,9 +389,9 @@ deploymentSpec:
           \n    if pipeline_type == \"standard\":\n        # Standard pipeline: download\
           \ traditional models\n        download_models(\n            output_dir=output_path_p,\n\
           \            progress=True,\n            with_layout=True,\n           \
-          \ with_tableformer=True,\n            with_easyocr=False,\n        )\n \
-          \   elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
-          \       # VLM pipeline with remote model endpoint: Download minimal required\
+          \ with_tableformer=True,\n            with_easyocr=True,\n        )\n  \
+          \  elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n  \
+          \      # VLM pipeline with remote model endpoint: Download minimal required\
           \ models\n        # Only models set are what lives in fabianofranz repo\n\
           \        # TODO: figure out what needs to be downloaded or removed\n   \
           \     download_models(\n            output_dir=output_path_p,\n        \

Original file line number	Diff line number	Diff line change
`@@ -200,7 +200,7 @@ def download_docling_models(`
`200`	`200`	`progress=True,`
`201`	`201`	`with_layout=True,`
`202`	`202`	`with_tableformer=True,`
`203`		`- with_easyocr=False,`
	`203`	`+ with_easyocr=True,`
`204`	`204`	`)`
`205`	`205`	`elif pipeline_type == "vlm" and remote_model_endpoint_enabled:`
`206`	`206`	`# VLM pipeline with remote model endpoint: Download minimal required models`