From b680b59d17a6a8cb367077313c1f1f7167d4b4b2 Mon Sep 17 00:00:00 2001
From: snow2white <164228644@qq.com>
Date: Fri, 16 Jan 2026 00:36:20 +0800
Subject: [PATCH] 0116v1

---
 .DS_Store                                   | Bin 0 -> 6148 bytes
 nlp/README.md                               |   9 +-
 nlp/inference_paddleOCR0.9B_detection.ipynb | 896 ++++++++++++++++++++
 3 files changed, 901 insertions(+), 4 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 nlp/inference_paddleOCR0.9B_detection.ipynb
diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..9e0398212f617f7441988f3eef716f8bfe0c362a
GIT binary patch
literal 6148
zcmeHKU2oGc6uoXbno>Z00BJ8sk$4@YgAEDs(sirEB;a8|An~D+tX+vn<I<+<q^eS%
z;Xm*fc;%PyUwDFZZBI(l@(LC5t?c92=i1lDaa<FTXis~)M0Fx^kr>NYk*zUq=d@xa
zQ^U?e^^qRO@-&QPxWcs+RspNP<|)8?cZ&iVP@fXY?q6;>Kx+I_ra~`+8Bv!mGW(o)
z-wHhLaY{XKWu7T+7nmhFrSk&kOMz2WJ)O}R`kZi|loULE7CaoxCn&N?g958UPwAKr
zsYB1nr%t|4HTSQXxjPi&H!QGt=2=#ahKMqz6mi}~St9m;;=GqqA?xKp`<O-r7VEHv
z70xCj^Nsx;j^b#NHk&_0$tiDLskjxl=Dzbk$eEu8>2%T##&7uPg_Lo4Sq{RNQGeF0
z-#nB_8bnEdtQ?|VAC$MRqNFEhZ8=SP6Xomc2Dj=~yY=n)e81V;@gD5%FLu27qx&fD
zJ=|L?s_v~j_a2{gKgPpEen#Xlfm1=*b%U3{Gc$8O_s2;r6J8r;J2Zw~gMt#gzF8%z
zbb?hopd)$)Y(VE;-h)?9+{monwVX$nUel9xe(D{`j#NiHefoqojZ|r<_X*y^2G;mu
zWyj!s$c>A@H`dJRxV4Rq&<17|unPPu1^D~mAu+ZLE;Xu82Qqa90G3g$3}ya9z&@VA
zmcgY)w7`UR1!`Aet{6hQquw*TmcgY)?M}j6K7<)rm>Y^vqoaRM(Mhy4+T1E&704^#
z=*K#r|D8YI|MO1v$tqwK_^%WY<s<*7jVYP4b!BpV)_O?qk=U5G)TmsLnd4X$_$b~+
bQie8<3&57arAE}i>>mLogUzf0e^r6s<wp3^

literal 0
HcmV?d00001

diff --git a/nlp/README.md b/nlp/README.md
index a2c9dae..ca135c7 100644
--- a/nlp/README.md
+++ b/nlp/README.md
@@ -2,11 +2,11 @@
 
 This directory contains ready-to-use Natural Language Processing application notebooks built with MindSpore. Each notebook demonstrates a complete or partial workflow—training, finetuning, or inference—along with a brief introduction to the model used.
 
-## Application List
+## Application List 
 
-| No. | Model | Description                     |
-| :-- | :---- | :------------------------------ |
-| 1   | / | This section is empty for now — feel free to contribute your first application! |
+| No.  | Model          | Description                                                  |
+| :--- | :------------- | :----------------------------------------------------------- |
+| 1    | paddleOCR 0.9B | Includes notebooks for paddleOCR 0.9B inference on tasks such as text detection |
 
 ## Contributing New NLP Applications
 
@@ -16,3 +16,4 @@ To contribute a new NLP application:
 2. If the model does not yet have its own directory, create a new one following the existing structure.
 3. Follow the notebook writing and naming standards in the [Contributing Guidelines](https://github.com/mindspore-courses/applications/wiki/Contributing-Guidelines).
 4. Update the application list in the README if required.
+
diff --git a/nlp/inference_paddleOCR0.9B_detection.ipynb b/nlp/inference_paddleOCR0.9B_detection.ipynb
new file mode 100644
index 0000000..68c87a9
--- /dev/null
+++ b/nlp/inference_paddleOCR0.9B_detection.ipynb
@@ -0,0 +1,896 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3f1b18b8-bca7-44cd-b9be-41e798846b8e",
+   "metadata": {},
+   "source": [
+    "# 基于MindSpore NLP的PaddleOCR-VL图像文本读取识别开发\n",
+    "## 基本环境信息：\n",
+    "\n",
+    "Python = 3.10\n",
+    "\n",
+    "CANN = 8.2.RC1\n",
+    "\n",
+    "MindSpore = 2.7.0\n",
+    "\n",
+    "MindSpore NLP = 0.5.1\n",
+    "\n",
+    "## 其他主要依赖库与版本：\n",
+    "\n",
+    "transformers==4.57.1\n",
+    "\n",
+    "diffusers==0.35.2\n",
+    "\n",
+    "einops\n",
+    "\n",
+    "torchvision"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d273fd8e-5e69-4a08-98bb-f3f945376e3d",
+   "metadata": {},
+   "source": [
+    "PaddleOCR-VL 是一款面向文档解析的 SOTA 且资源高效的模型。其核心组件为 PaddleOCR-VL-0.9B，这是一种紧凑而强大的视觉语言模型（VLM），由 NaViT 风格的动态分辨率视觉编码器与 ERNIE-4.5-0.3B 语言模型组成，以实现精准的元素识别。该创新模型高效支持 109 种语言，并在识别复杂元素（如文本、表格、公式和图表）方面表现出色，同时保持极低的资源消耗。通过在广泛使用的公开基准与内部基准上的全面评测，PaddleOCR-VL 在页级文档解析与元素级识别两方面均达到 SOTA 表现。它显著优于现有方案，对比顶级 VLM 亦具强竞争力，并具备快速的推理速度。这些优势使其非常适合在真实场景中落地部署。\n",
+    "在本案例中，基于MindSpore框架和mindnlp库，利用PaddleOCR-VL 0.9B模型实现常用场景下的OCR识别与基础案例开发。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "24149edd-534f-43dc-a241-a2489dbecc8e",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## 表格识别\n",
+    "首先从表格识别（Table Recognition）开始，这里使用 Transformers 接口加载 Hugging Face 上的模型，先加载 PaddleOCR‑VL‑0.9B 多模态模型及其配套的 tokenizer 和 processor，然后从URL读取一张包含文字信息的图片，构造Table Recognition指令并通过对话模板转换为模型可识别的输入格式；接着由 processor 将图像与文本进行联合编码并送入模型，在 Ascend NPU 上执行生成式推理，最后将模型生成的 token 解码为可读文本，得到图片中表格的结构化识别结果。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0844e400-ec86-438d-bb4a-bec8de6b1b30",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
+      "  setattr(self, word, getattr(machar, word).flat[0])\n",
+      "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
+      "  return self._float_to_str(self.smallest_subnormal)\n",
+      "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
+      "  setattr(self, word, getattr(machar, word).flat[0])\n",
+      "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
+      "  return self._float_to_str(self.smallest_subnormal)\n",
+      "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/torchvision/io/image.py:14: UserWarning: Failed to load image Python extension: 'not support import any ops for now.'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n",
+      "  warn(\n",
+      "Modular Diffusers is currently an experimental feature under active development. The API is subject to breaking changes in future releases.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "444dcbb98c40471f8805e93ac0501bb4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f71cec0464a54875a319a2a3137aa667",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "configuration_paddleocr_vl.py: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "A new version of the following files was downloaded from https://huggingface.co/lvyufeng/PaddleOCR-VL-0.9B:\n",
+      "- configuration_paddleocr_vl.py\n",
+      ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "453f5d40ee564be698ecfb43cfe6b1d2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "modeling_paddleocr_vl.py: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "A new version of the following files was downloaded from https://huggingface.co/lvyufeng/PaddleOCR-VL-0.9B:\n",
+      "- modeling_paddleocr_vl.py\n",
+      ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "21da668b1b47448780d2913327841278",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c17712e02bf14ca3aed01b8478655e18",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "generation_config.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6433cb75042b433484324a16a170101a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b1b684fab87d4349aa6abcdc18d5730c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.model:   0%|          | 0.00/1.61M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f21523e660414b96afadd33e68280568",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/11.2M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "289a09ca48ce4eb6acba668dca2cc1b6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "added_tokens.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d013c4ac0cc346b09385493a295d3e73",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "68c8384141ef4534b56d1d67660fb296",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "chat_template.jinja: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e5e19a4614614a85a64104b25c15969c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "processor_config.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f87f22e9ccc9425b8e4e31203ce3ee7d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "processing_paddleocr_vl.py: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "A new version of the following files was downloaded from https://huggingface.co/lvyufeng/PaddleOCR-VL-0.9B:\n",
+      "- processing_paddleocr_vl.py\n",
+      ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dfdedc49653c4c3c9bdd73a7620f6394",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "preprocessor_config.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "834f61be4cc84cb7b38d712892ff4b2f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "image_processing.py: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Keyword argument `format` is not a valid argument for this processor and will be ignored.\n",
+      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mindtorch.Size([1, 1204])\n",
+      "User: Table Recognition:\n",
+      "R&D QUALITY IMPROVEMENT SUGGESTION/SOLUTION FORM\n",
+      "Name/Phone Ext.: M. Hamann. P. Harper. P. Martinez Date: 9/3/92\n",
+      "Supervisor/Manager: J. S. Wigand\n",
+      "R&D Group: Licensee\n",
+      "Suggestion: Discontinue coal retention analyses on licensee submitted product samples. (Note: Coal Retention testing is not performed by most licensees. Other B&W physical measurements as ends stability and inspection for soft spots in cigarettes are thought to be sufficient measures to assure cigarette physical integrity. The proposed action will increase laboratory productivity.)\n",
+      "Suggested Solution(s): Delete coal retention from the list of standard analyses performed on licensee submitted product samples. Special requests for coal retention testing could still be submitted on an exception basis.\n",
+      "Have you contacted your Manager/Supervisor?\n",
+      "___ Yes\n",
+      "___ No\n",
+      "Manager Comments: Manager, please contact suggester and forward comments to the Quality Council.\n",
+      "qip.wp\n",
+      "597005708\n"
+     ]
+    }
+   ],
+   "source": [
+    "import mindspore\n",
+    "import mindnlp\n",
+    "from transformers import AutoModel, AutoProcessor, AutoTokenizer\n",
+    "from transformers.image_utils import load_image\n",
+    "\n",
+    "\n",
+    "model = AutoModel.from_pretrained(\"lvyufeng/PaddleOCR-VL-0.9B\", trust_remote_code=True, dtype=mindspore.float16, device_map='auto')\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"lvyufeng/PaddleOCR-VL-0.9B\")\n",
+    "processor = AutoProcessor.from_pretrained(\"lvyufeng/PaddleOCR-VL-0.9B\", trust_remote_code=True)\n",
+    "\n",
+    "image = load_image(\n",
+    "    \"https://hf-mirror.com/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg\"\n",
+    ")\n",
+    "\n",
+    "query = 'Table Recognition:'\n",
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": query,\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "text = tokenizer.apply_chat_template(messages, tokenize=False)\n",
+    "inputs = processor(image, text=text, return_tensors=\"pt\", format=True).to('cuda')\n",
+    "generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=1024)\n",
+    "print(generate_ids.shape)\n",
+    "decoded_output = processor.decode(\n",
+    "    generate_ids[0], skip_special_tokens=True\n",
+    ")\n",
+    "print(decoded_output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "30c4f72c-d2a9-4fd5-890b-5e69c0126126",
+   "metadata": {},
+   "source": [
+    "### 输出后处理\n",
+    "可以看到输出了表格里对应的文本段，这样看起来比较麻烦，实际开发中这样读取出来也需要后处理。可以先以模型生成的 decoded_output 作为原始文本输入，利用正则表达式封装的 extract 方法，从文本中按字段规则依次抽取姓名、日期、主管、研发组、建议内容、解决方案、经理评语和文档编号等关键信息，组织成一个字典结构；随后将该结构化结果打印出来，并通过 json.dumps 转换为格式化的 JSON 输出，在保留中文字符的同时进行缩进美化，最终得到可直接用于存储或接口传输的结构化数据。\n",
+    "将原来的长文本转化成可读性更强、可操作性更强的数据格式。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c8248b46-9adf-4814-a96b-0e99d2b3acd9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'name': 'M. Hamann. P. Harper. P. Martinez', 'date': '9/3/92', 'supervisor': 'J. S. Wigand', 'group': 'Licensee', 'suggestion': 'Discontinue coal retention analyses on licensee submitted product samples. (Note: Coal Retention testing is not performed by most licensees. Other B&W physical measurements as ends stability and inspection for soft spots in cigarettes are thought to be sufficient measures to assure cigarette physical integrity. The proposed action will increase laboratory productivity.)', 'solution': 'Delete coal retention from the list of standard analyses performed on licensee submitted product samples. Special requests for coal retention testing could still be submitted on an exception basis.', 'manager_comments': 'Manager, please contact suggester and forward comments to the Quality Council.', 'document_id': '597005708'}\n",
+      "####################################################################################################\n",
+      "{\n",
+      "    \"name\": \"M. Hamann. P. Harper. P. Martinez\",\n",
+      "    \"date\": \"9/3/92\",\n",
+      "    \"supervisor\": \"J. S. Wigand\",\n",
+      "    \"group\": \"Licensee\",\n",
+      "    \"suggestion\": \"Discontinue coal retention analyses on licensee submitted product samples. (Note: Coal Retention testing is not performed by most licensees. Other B&W physical measurements as ends stability and inspection for soft spots in cigarettes are thought to be sufficient measures to assure cigarette physical integrity. The proposed action will increase laboratory productivity.)\",\n",
+      "    \"solution\": \"Delete coal retention from the list of standard analyses performed on licensee submitted product samples. Special requests for coal retention testing could still be submitted on an exception basis.\",\n",
+      "    \"manager_comments\": \"Manager, please contact suggester and forward comments to the Quality Council.\",\n",
+      "    \"document_id\": \"597005708\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "import json\n",
+    "\n",
+    "text = decoded_output\n",
+    "\n",
+    "def extract(pattern, text):\n",
+    "    m = re.search(pattern, text, re.S)\n",
+    "    return m.group(1).strip() if m else None\n",
+    "\n",
+    "data = {\n",
+    "    \"name\": extract(r\"Name/Phone Ext\\.\\:\\s*(.*?)\\s*Date\\:\", text),\n",
+    "    \"date\": extract(r\"Date\\:\\s*([0-9/]+)\", text),\n",
+    "    \"supervisor\": extract(r\"Supervisor/Manager\\:\\s*(.*?)\\s*R&D Group\\:\", text),\n",
+    "    \"group\": extract(r\"R&D Group\\:\\s*(.*?)\\s*Suggestion\\:\", text),\n",
+    "    \"suggestion\": extract(r\"Suggestion\\:\\s*(.*?)\\s*Suggested Solution\", text),\n",
+    "    \"solution\": extract(r\"Suggested Solution\\(s\\)\\:\\s*(.*?)\\s*Have you contacted\", text),\n",
+    "    \"manager_comments\": extract(r\"Manager Comments\\:\\s*(.*?)\\s*(qip\\.wp|\\n\\d{6,}$)\", text),\n",
+    "    \"document_id\": extract(r\"\\n(\\d{6,})$\", text)\n",
+    "}\n",
+    "\n",
+    "\n",
+    "print(data)\n",
+    "print('#'*100)\n",
+    "# JSON 输出\n",
+    "json_output = json.dumps(\n",
+    "    data,\n",
+    "    ensure_ascii=False,  #  保留中文\n",
+    "    indent=4,             # 缩进美化\n",
+    ")\n",
+    "\n",
+    "print(json_output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17478049-a326-480f-8055-ef391a2fc7c4",
+   "metadata": {},
+   "source": [
+    "### 数据库操作\n",
+    "实际从图片等提取出文本数据后，往往要进行存储，后续可以接RAG等操作，形成一套完整的工作流。这里以sqlite3为例，将读取并整理的文本数据进行入库操作。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ccd62f09-6fc3-4d9e-b387-7533fa06b5d5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import sqlite3\n",
+    "import json\n",
+    "\n",
+    "conn = sqlite3.connect(\"ocr_records.db\")\n",
+    "cursor = conn.cursor()\n",
+    "\n",
+    "cursor.execute(\"\"\"\n",
+    "CREATE TABLE IF NOT EXISTS raw_json (\n",
+    "    id INTEGER PRIMARY KEY AUTOINCREMENT,\n",
+    "    document_id TEXT,\n",
+    "    payload TEXT\n",
+    ")\n",
+    "\"\"\")\n",
+    "\n",
+    "cursor.execute(\"\"\"\n",
+    "INSERT INTO raw_json (document_id, payload)\n",
+    "VALUES (?, ?)\n",
+    "\"\"\", (\n",
+    "    data[\"document_id\"],\n",
+    "    json.dumps(data, ensure_ascii=False)\n",
+    "))\n",
+    "\n",
+    "conn.commit()\n",
+    "conn.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "67d44334-7cec-4ed0-9310-07db43536518",
+   "metadata": {},
+   "source": [
+    "存储后也可以以相同的逻辑进行读取。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "3b1cd5d1-abc0-4ee7-aff3-3dec84ebb830",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'name': 'M. Hamann. P. Harper. P. Martinez', 'date': '9/3/92', 'supervisor': 'J. S. Wigand', 'group': 'Licensee', 'suggestion': 'Discontinue coal retention analyses on licensee submitted product samples. (Note: Coal Retention testing is not performed by most licensees. Other B&W physical measurements as ends stability and inspection for soft spots in cigarettes are thought to be sufficient measures to assure cigarette physical integrity. The proposed action will increase laboratory productivity.)', 'solution': 'Delete coal retention from the list of standard analyses performed on licensee submitted product samples. Special requests for coal retention testing could still be submitted on an exception basis.', 'manager_comments': 'Manager, please contact suggester and forward comments to the Quality Council.', 'document_id': '597005708'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sqlite3\n",
+    "import json\n",
+    "\n",
+    "conn = sqlite3.connect(\"ocr_records.db\")\n",
+    "cursor = conn.cursor()\n",
+    "\n",
+    "cursor.execute(\n",
+    "    \"SELECT payload FROM raw_json WHERE document_id = ?\",\n",
+    "    (\"597005708\",)\n",
+    ")\n",
+    "\n",
+    "row = cursor.fetchone()\n",
+    "conn.close()\n",
+    "\n",
+    "if row:\n",
+    "    data = json.loads(row[0])\n",
+    "    print(data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c7da994-2734-4c6c-9a01-8a4efea18bbe",
+   "metadata": {},
+   "source": [
+    "## 通用函数构建\n",
+    "上述案例是对输入的图片直接进行处理，实际开发中可能不太方便，这时候就可以封装成一个函数，后续也方便拓展成一个Agent完成具体功能。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "537c5dd6-3d10-4d7e-aaf0-152e474f3db7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def run_vision_task(\n",
+    "    image_url,\n",
+    "    query,\n",
+    "    model,\n",
+    "    processor,\n",
+    "    tokenizer,\n",
+    "    device=\"cuda\",\n",
+    "    max_new_tokens=1024,\n",
+    "    do_sample=False,\n",
+    "    num_beams=1,\n",
+    "):\n",
+    "    \"\"\"\n",
+    "    通用视觉-语言推理函数（OCR / 表格识别等）\n",
+    "    \"\"\"\n",
+    "    # 加载图片\n",
+    "    image = load_image(image_url)\n",
+    "\n",
+    "    # 构造对话\n",
+    "    messages = [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": query,\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "    # 构造模型输入\n",
+    "    text = tokenizer.apply_chat_template(messages, tokenize=False)\n",
+    "    inputs = processor(\n",
+    "        image,\n",
+    "        text=text,\n",
+    "        return_tensors=\"pt\",\n",
+    "        format=True\n",
+    "    ).to(device)\n",
+    "\n",
+    "    # 推理\n",
+    "    generate_ids = model.generate(\n",
+    "        **inputs,\n",
+    "        do_sample=do_sample,\n",
+    "        num_beams=num_beams,\n",
+    "        max_new_tokens=max_new_tokens\n",
+    "    )\n",
+    "\n",
+    "    # 解码输出\n",
+    "    decoded_output = processor.decode(\n",
+    "        generate_ids[0],\n",
+    "        skip_special_tokens=True\n",
+    "    )\n",
+    "\n",
+    "    return {\n",
+    "        \"token_shape\": generate_ids.shape,\n",
+    "        \"text\": decoded_output\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "86e0b842-7317-4bbf-a71a-df2e4191e939",
+   "metadata": {},
+   "source": [
+    "这里选取paddle官方示例图片进行演示，提取一张报刊的文本，不过输出有些粗糙。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "44ba65b4-1b53-422a-8fba-7733bc7ffdec",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mindtorch.Size([1, 3680])\n",
+      "User: OCR:\n",
+      "助力双方交往\n",
+      "助力双方交往\n",
+      "本报记者 沈小晓\n",
+      "任彦 黄培昭\n",
+      "搭建友谊桥梁\n",
+      "身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等，曼妙的舞姿赢得现场观众阵阵掌声。这是日前厄立特里亚高等教育与研究院孔子学院(以下简称\"厄特孔院\")举办\"喜迎新年\"中国歌舞比赛的场景。\n",
+      "中国和厄立特里亚传统友谊深厚。近年来，在高质量共建\"一带一路\"框架下，中厄两国人文交流不断深化，互利合作的民意基础日益深厚。\n",
+      "“学好中文，我们的未来不是梦”\n",
+      "“鲜花曾告诉我你怎样走过，大地知道你心中的每一个角落……”厄立特里亚阿斯马拉大学综合楼二层，一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门，学生们正跟着老师学唱中文歌曲《同一首歌》。\n",
+      "这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意，老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起，学生们边唱边随着节拍摇动身体，现场气氛热烈。\n",
+      "“这是中文歌曲初级班，共有32人。学生大部分来自首都阿斯马拉的中小学，年龄最小的仅有6岁。”尤斯拉告诉记者。\n",
+      "尤斯拉今年23岁，是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文，在2017年第十届\"汉语桥\"世界中学生中文比赛中获得厄立特里亚赛区第一名，并和同伴代表厄立特里亚前往中国参加决赛，获得团体优胜奖。2022年起，尤斯拉开始在厄特孔院兼职教授中文歌曲，每周末两个课时。\"中国文化博大精深，我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。\n",
+      "“姐姐，你想去中国吗?”“非常想！我想去看故宫、爬长城。”尤斯拉的学生中有一对能歌善舞的姐妹，姐姐露娅今年15岁，妹妹莉娅14岁，两人都已在厄特孔院学习多年，中文说得格外流利。\n",
+      "露娅对记者说：“这些年来，怀着对中文和中国文化的热爱，我们姐妹俩始终相互鼓励，一起学习。我们的中文一天比一天好，还学会了中文歌和中国舞。我们一定要到中国去。学好中文，我们的未来不是梦！”\n",
+      "据厄特孔院中方院长黄鸣飞介绍，这所孔院成立于2013年3月，由贵州财经大学和\n",
+      "厄立特里亚高等教育与研究院合作建立，开设了中国语言课程和中国文化课程，注册学生2万余人次。10余年来，厄特孔院已成为当地民众了解中国的一扇窗口。\n",
+      "黄鸣飞表示，随着来学习中文的人日益增多，阿斯马拉大学教学点已难以满足教学需要。2024年4月，由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设，预计今年上半年竣工，建成后将为厄特孔院提供全新的办学场地。\n",
+      "“在中国学习的经历让我看到更广阔的世界”\n",
+      "多年来，厄立特里亚广大赴华留学生和培训人员积极投身国家建设，成为助力该国发展的人才和厄中友好的见证者和推动者。\n",
+      "在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位，研究方向是女性领导力与社会发展。其间，她实地走访中国多个地区，获得了观察中国社会发展的第一手资料。\n",
+      "谈起在中国求学的经历，约翰娜记忆犹新：“中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行，中国创造了发展奇迹，这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”\n",
+      "正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前，在北京师范大学获得硕士学位后，穆卢盖塔在社交媒体上写下这样一段话：“这是我人生的重要一步，自此我拥有了一双坚固的鞋子，赋予我穿越荆棘的力量。”\n",
+      "穆卢盖塔密切关注中国在经济、科技、教育等领域的发展，“中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界，从中受益匪浅。”\n",
+      "23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年，在中国书法、中国画等方面表现十分优秀，在2024年厄立特里亚赛区的“汉语桥\"比赛中获得一等奖。莉迪亚说：“学习中国书法让我的内心变得安宁和\n"
+     ]
+    }
+   ],
+   "source": [
+    "ocr_result = run_vision_task(\n",
+    "    image_url=\"https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png\",\n",
+    "    query=\"OCR:\",\n",
+    "    model=model,\n",
+    "    processor=processor,\n",
+    "    tokenizer=tokenizer\n",
+    ")\n",
+    "\n",
+    "print(ocr_result[\"token_shape\"])\n",
+    "print(ocr_result[\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e16797d8-5d57-4435-a383-a5b910b45c5b",
+   "metadata": {},
+   "source": [
+    "同样的，可以对输出的文本进行清洗操作，并且结构化输出，这里以类似md格式输出文章。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7508f7db-2f02-475b-a392-02fcd35ddd08",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "User: OCR:助力双方交往助力双方交往本报记者 沈小晓任彦 黄培昭搭建友谊桥梁身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等，曼妙的舞姿赢得现场观众阵阵掌声。这是日前厄立特里亚高等教育与研究院孔子学院(以下简称\"厄特孔院\")举办\"喜迎新年\"中国歌舞比赛的场景。\n",
+      "\n",
+      "中国和厄立特里亚传统友谊深厚。近年来，在高质量共建\"一带一路\"框架下，中厄两国人文交流不断深化，互利合作的民意基础日益深厚。\n",
+      "\n",
+      "> “学好中文，我们的未来不是梦”\n",
+      "\n",
+      "> “鲜花曾告诉我你怎样走过，大地知道你心中的每一个角落……”\n",
+      "\n",
+      "厄立特里亚阿斯马拉大学综合楼二层，一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门，学生们正跟着老师学唱中文歌曲《同一首歌》。\n",
+      "\n",
+      "这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意，老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起，学生们边唱边随着节拍摇动身体，现场气氛热烈。\n",
+      "\n",
+      "> “这是中文歌曲初级班，共有32人。学生大部分来自首都阿斯马拉的中小学，年龄最小的仅有6岁。”\n",
+      "\n",
+      "尤斯拉告诉记者。\n",
+      "\n",
+      "尤斯拉今年23岁，是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文，在2017年第十届\"汉语桥\"世界中学生中文比赛中获得厄立特里亚赛区第一名，并和同伴代表厄立特里亚前往中国参加决赛，获得团体优胜奖。2022年起，尤斯拉开始在厄特孔院兼职教授中文歌曲，每周末两个课时。\"中国文化博大精深，我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。\n",
+      "\n",
+      "> “姐姐，你想去中国吗?”\n",
+      "\n",
+      "> “非常想！我想去看故宫、爬长城。”\n",
+      "\n",
+      "尤斯拉的学生中有一对能歌善舞的姐妹，姐姐露娅今年15岁，妹妹莉娅14岁，两人都已在厄特孔院学习多年，中文说得格外流利。\n",
+      "\n",
+      "> “这些年来，怀着对中文和中国文化的热爱，我们姐妹俩始终相互鼓励，一起学习。我们的中文一天比一天好，还学会了中文歌和中国舞。我们一定要到中国去。学好中文，我们的未来不是梦！”\n",
+      "\n",
+      "露娅对记者说：\n",
+      "\n",
+      "据厄特孔院中方院长黄鸣飞介绍，这所孔院成立于2013年3月，由贵州财经大学和厄立特里亚高等教育与研究院合作建立，开设了中国语言课程和中国文化课程，注册学生2万余人次。10余年来，厄特孔院已成为当地民众了解中国的一扇窗口。\n",
+      "\n",
+      "黄鸣飞表示，随着来学习中文的人日益增多，阿斯马拉大学教学点已难以满足教学需要。2024年4月，由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设，预计今年上半年竣工，建成后将为厄特孔院提供全新的办学场地。\n",
+      "\n",
+      "# “在中国学习的经历让我看到更广阔的世界”\n",
+      "\n",
+      "多年来，厄立特里亚广大赴华留学生和培训人员积极投身国家建设，成为助力该国发展的人才和厄中友好的见证者和推动者。\n",
+      "\n",
+      "在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位，研究方向是女性领导力与社会发展。其间，她实地走访中国多个地区，获得了观察中国社会发展的第一手资料。\n",
+      "\n",
+      "> “中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行，中国创造了发展奇迹，这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”\n",
+      "\n",
+      "谈起在中国求学的经历，约翰娜记忆犹新：\n",
+      "\n",
+      "> “这是我人生的重要一步，自此我拥有了一双坚固的鞋子，赋予我穿越荆棘的力量。”\n",
+      "\n",
+      "正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前，在北京师范大学获得硕士学位后，穆卢盖塔在社交媒体上写下这样一段话：\n",
+      "\n",
+      "> “中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界，从中受益匪浅。”\n",
+      "\n",
+      "穆卢盖塔密切关注中国在经济、科技、教育等领域的发展，\n",
+      "\n",
+      "23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年，在中国书法、中国画等方面表现十分优秀，在2024年厄立特里亚赛区的“汉语桥\"比赛中获得一等奖。莉迪亚说：“学习中国书法让我的内心变得安宁和\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "from typing import List\n",
+    "\n",
+    "\n",
+    "# ---------- 基础工具 ----------\n",
+    "\n",
+    "def clean_lines(text: str) -> List[str]:\n",
+    "    \"\"\"\n",
+    "    基础清洗：去空行、去首尾空格\n",
+    "    \"\"\"\n",
+    "    return [l.strip() for l in text.splitlines() if l.strip()]\n",
+    "\n",
+    "\n",
+    "def merge_sentences(lines: List[str]) -> List[str]:\n",
+    "    \"\"\"\n",
+    "    合并被 OCR 错误断行的句子\n",
+    "    规则：上一行没有以句末标点结束，就合并\n",
+    "    \"\"\"\n",
+    "    merged = []\n",
+    "    buf = \"\"\n",
+    "\n",
+    "    for line in lines:\n",
+    "        if not buf:\n",
+    "            buf = line\n",
+    "            continue\n",
+    "\n",
+    "        if re.search(r\"[。！？！”\\\"]$\", buf):\n",
+    "            merged.append(buf)\n",
+    "            buf = line\n",
+    "        else:\n",
+    "            buf += line\n",
+    "\n",
+    "    if buf:\n",
+    "        merged.append(buf)\n",
+    "\n",
+    "    return merged\n",
+    "\n",
+    "\n",
+    "# ---------- 结构判断 ----------\n",
+    "\n",
+    "def is_title(line: str) -> bool:\n",
+    "    \"\"\"\n",
+    "    通用标题判断（弱规则）\n",
+    "    \"\"\"\n",
+    "    return (\n",
+    "        len(line) <= 20\n",
+    "        and not re.search(r\"[，。！？：；]\", line)\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def split_quotes(line: str):\n",
+    "    \"\"\"\n",
+    "    拆分引语与正文\n",
+    "    返回：quotes, rest\n",
+    "    \"\"\"\n",
+    "    quotes = re.findall(r\"“[^”]+”\", line)\n",
+    "    rest = re.sub(r\"“[^”]+”\", \"\", line).strip()\n",
+    "    return quotes, rest\n",
+    "\n",
+    "\n",
+    "# ---------- 主流程 ----------\n",
+    "\n",
+    "def ocr_to_markdown(text: str) -> str:\n",
+    "    lines = clean_lines(text)\n",
+    "    lines = merge_sentences(lines)\n",
+    "\n",
+    "    md = []\n",
+    "    title_used = False\n",
+    "\n",
+    "    for line in lines:\n",
+    "\n",
+    "        # 主标题（只取第一个）\n",
+    "        if not title_used and is_title(line):\n",
+    "            md.append(f\"# {line}\")\n",
+    "            title_used = True\n",
+    "            continue\n",
+    "\n",
+    "        # 处理引语\n",
+    "        if \"“\" in line and \"”\" in line:\n",
+    "            quotes, rest = split_quotes(line)\n",
+    "\n",
+    "            for q in quotes:\n",
+    "                md.append(f\"> {q}\")\n",
+    "\n",
+    "            if rest:\n",
+    "                md.append(rest)\n",
+    "\n",
+    "            continue\n",
+    "\n",
+    "        # 普通正文\n",
+    "        md.append(line)\n",
+    "\n",
+    "    return \"\\n\\n\".join(md)\n",
+    "\n",
+    "\n",
+    "# ---------- 使用示例 ----------\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    markdown = ocr_to_markdown(ocr_result[\"text\"])\n",
+    "    print(markdown)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2f4b041-157b-4615-9f62-9223718b5cc1",
+   "metadata": {},
+   "source": [
+    "## 小结\n",
+    "PaddleOCR-VL 0.9B虽然参数量不大，但效果还是很不错的，也适合个人搭建自己的小工作流进行使用，推荐大家尝试尝试。"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}