From b680b59d17a6a8cb367077313c1f1f7167d4b4b2 Mon Sep 17 00:00:00 2001 From: snow2white <164228644@qq.com> Date: Fri, 16 Jan 2026 00:36:20 +0800 Subject: [PATCH] 0116v1 --- .DS_Store | Bin 0 -> 6148 bytes nlp/README.md | 9 +- nlp/inference_paddleOCR0.9B_detection.ipynb | 896 ++++++++++++++++++++ 3 files changed, 901 insertions(+), 4 deletions(-) create mode 100644 .DS_Store create mode 100644 nlp/inference_paddleOCR0.9B_detection.ipynb diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9e0398212f617f7441988f3eef716f8bfe0c362a GIT binary patch literal 6148 zcmeHKU2oGc6uoXbno>Z00BJ8sk$4@YgAEDs(sirEB;a8|An~D+tX+vnNYk*zUq=d@xa zQ^U?e^^qRO@-&QPxWcs+RspNP<|)8?cZ&iVP@fXY?q6;>Kx+I_ra~`+8Bv!mGW(o) z-wHhLaY{XKWu7T+7nmhFrSk&kOMz2WJ)O}R`kZi|loULE7CaoxCn&N?g958UPwAKr zsYB1nr%t|4HTSQXxjPi&H!QGt=2=#ahKMqz6mi}~St9m;;=GqqA?xKp`2%T##&7uPg_Lo4Sq{RNQGeF0 z-#nB_8bnEdtQ?|VAC$MRqNFEhZ8=SP6Xomc2Dj=~yY=n)e81V;@gD5%FLu27qx&fD zJ=|L?s_v~j_a2{gKgPpEen#Xlfm1=*b%U3{Gc$8O_s2;r6J8r;J2Zw~gMt#gzF8%z zbb?hopd)$)Y(VE;-h)?9+{monwVX$nUel9xe(D{`j#NiHefoqojZ|r<_X*y^2G;mu zWyj!s$c>A@H`dJRxV4Rq&<17|unPPu1^D~mAu+ZLE;Xu82Qqa90G3g$3}ya9z&@VA zmcgY)w7`UR1!`Aet{6hQquw*TmcgY)?M}j6K7<)rm>Y^vqoaRM(Mhy4+T1E&704^# z=*K#r|D8YI|MO1v$tqwK_^%WY>mLogUzf0e^r6s type is zero.\n", + " setattr(self, word, getattr(machar, word).flat[0])\n", + "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.\n", + " return self._float_to_str(self.smallest_subnormal)\n", + "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.\n", + " setattr(self, word, getattr(machar, word).flat[0])\n", + "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.\n", + " return self._float_to_str(self.smallest_subnormal)\n", + "/home/mindspore/miniconda/envs/jupyter/lib/python3.10/site-packages/torchvision/io/image.py:14: UserWarning: Failed to load image Python extension: 'not support import any ops for now.'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n", + " warn(\n", + "Modular Diffusers is currently an experimental feature under active development. The API is subject to breaking changes in future releases.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "444dcbb98c40471f8805e93ac0501bb4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f71cec0464a54875a319a2a3137aa667", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "configuration_paddleocr_vl.py: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A new version of the following files was downloaded from https://huggingface.co/lvyufeng/PaddleOCR-VL-0.9B:\n", + "- configuration_paddleocr_vl.py\n", + ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "453f5d40ee564be698ecfb43cfe6b1d2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "modeling_paddleocr_vl.py: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A new version of the following files was downloaded from https://huggingface.co/lvyufeng/PaddleOCR-VL-0.9B:\n", + "- modeling_paddleocr_vl.py\n", + ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "21da668b1b47448780d2913327841278", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "model.safetensors: 0%| | 0.00/1.92G [00:00 “学好中文,我们的未来不是梦”\n", + "\n", + "> “鲜花曾告诉我你怎样走过,大地知道你心中的每一个角落……”\n", + "\n", + "厄立特里亚阿斯马拉大学综合楼二层,一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门,学生们正跟着老师学唱中文歌曲《同一首歌》。\n", + "\n", + "这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起,学生们边唱边随着节拍摇动身体,现场气氛热烈。\n", + "\n", + "> “这是中文歌曲初级班,共有32人。学生大部分来自首都阿斯马拉的中小学,年龄最小的仅有6岁。”\n", + "\n", + "尤斯拉告诉记者。\n", + "\n", + "尤斯拉今年23岁,是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文,在2017年第十届\"汉语桥\"世界中学生中文比赛中获得厄立特里亚赛区第一名,并和同伴代表厄立特里亚前往中国参加决赛,获得团体优胜奖。2022年起,尤斯拉开始在厄特孔院兼职教授中文歌曲,每周末两个课时。\"中国文化博大精深,我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。\n", + "\n", + "> “姐姐,你想去中国吗?”\n", + "\n", + "> “非常想!我想去看故宫、爬长城。”\n", + "\n", + "尤斯拉的学生中有一对能歌善舞的姐妹,姐姐露娅今年15岁,妹妹莉娅14岁,两人都已在厄特孔院学习多年,中文说得格外流利。\n", + "\n", + "> “这些年来,怀着对中文和中国文化的热爱,我们姐妹俩始终相互鼓励,一起学习。我们的中文一天比一天好,还学会了中文歌和中国舞。我们一定要到中国去。学好中文,我们的未来不是梦!”\n", + "\n", + "露娅对记者说:\n", + "\n", + "据厄特孔院中方院长黄鸣飞介绍,这所孔院成立于2013年3月,由贵州财经大学和厄立特里亚高等教育与研究院合作建立,开设了中国语言课程和中国文化课程,注册学生2万余人次。10余年来,厄特孔院已成为当地民众了解中国的一扇窗口。\n", + "\n", + "黄鸣飞表示,随着来学习中文的人日益增多,阿斯马拉大学教学点已难以满足教学需要。2024年4月,由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设,预计今年上半年竣工,建成后将为厄特孔院提供全新的办学场地。\n", + "\n", + "# “在中国学习的经历让我看到更广阔的世界”\n", + "\n", + "多年来,厄立特里亚广大赴华留学生和培训人员积极投身国家建设,成为助力该国发展的人才和厄中友好的见证者和推动者。\n", + "\n", + "在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位,研究方向是女性领导力与社会发展。其间,她实地走访中国多个地区,获得了观察中国社会发展的第一手资料。\n", + "\n", + "> “中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行,中国创造了发展奇迹,这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”\n", + "\n", + "谈起在中国求学的经历,约翰娜记忆犹新:\n", + "\n", + "> “这是我人生的重要一步,自此我拥有了一双坚固的鞋子,赋予我穿越荆棘的力量。”\n", + "\n", + "正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前,在北京师范大学获得硕士学位后,穆卢盖塔在社交媒体上写下这样一段话:\n", + "\n", + "> “中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界,从中受益匪浅。”\n", + "\n", + "穆卢盖塔密切关注中国在经济、科技、教育等领域的发展,\n", + "\n", + "23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年,在中国书法、中国画等方面表现十分优秀,在2024年厄立特里亚赛区的“汉语桥\"比赛中获得一等奖。莉迪亚说:“学习中国书法让我的内心变得安宁和\n" + ] + } + ], + "source": [ + "import re\n", + "from typing import List\n", + "\n", + "\n", + "# ---------- 基础工具 ----------\n", + "\n", + "def clean_lines(text: str) -> List[str]:\n", + " \"\"\"\n", + " 基础清洗:去空行、去首尾空格\n", + " \"\"\"\n", + " return [l.strip() for l in text.splitlines() if l.strip()]\n", + "\n", + "\n", + "def merge_sentences(lines: List[str]) -> List[str]:\n", + " \"\"\"\n", + " 合并被 OCR 错误断行的句子\n", + " 规则:上一行没有以句末标点结束,就合并\n", + " \"\"\"\n", + " merged = []\n", + " buf = \"\"\n", + "\n", + " for line in lines:\n", + " if not buf:\n", + " buf = line\n", + " continue\n", + "\n", + " if re.search(r\"[。!?!”\\\"]$\", buf):\n", + " merged.append(buf)\n", + " buf = line\n", + " else:\n", + " buf += line\n", + "\n", + " if buf:\n", + " merged.append(buf)\n", + "\n", + " return merged\n", + "\n", + "\n", + "# ---------- 结构判断 ----------\n", + "\n", + "def is_title(line: str) -> bool:\n", + " \"\"\"\n", + " 通用标题判断(弱规则)\n", + " \"\"\"\n", + " return (\n", + " len(line) <= 20\n", + " and not re.search(r\"[,。!?:;]\", line)\n", + " )\n", + "\n", + "\n", + "def split_quotes(line: str):\n", + " \"\"\"\n", + " 拆分引语与正文\n", + " 返回:quotes, rest\n", + " \"\"\"\n", + " quotes = re.findall(r\"“[^”]+”\", line)\n", + " rest = re.sub(r\"“[^”]+”\", \"\", line).strip()\n", + " return quotes, rest\n", + "\n", + "\n", + "# ---------- 主流程 ----------\n", + "\n", + "def ocr_to_markdown(text: str) -> str:\n", + " lines = clean_lines(text)\n", + " lines = merge_sentences(lines)\n", + "\n", + " md = []\n", + " title_used = False\n", + "\n", + " for line in lines:\n", + "\n", + " # 主标题(只取第一个)\n", + " if not title_used and is_title(line):\n", + " md.append(f\"# {line}\")\n", + " title_used = True\n", + " continue\n", + "\n", + " # 处理引语\n", + " if \"“\" in line and \"”\" in line:\n", + " quotes, rest = split_quotes(line)\n", + "\n", + " for q in quotes:\n", + " md.append(f\"> {q}\")\n", + "\n", + " if rest:\n", + " md.append(rest)\n", + "\n", + " continue\n", + "\n", + " # 普通正文\n", + " md.append(line)\n", + "\n", + " return \"\\n\\n\".join(md)\n", + "\n", + "\n", + "# ---------- 使用示例 ----------\n", + "\n", + "if __name__ == \"__main__\":\n", + " markdown = ocr_to_markdown(ocr_result[\"text\"])\n", + " print(markdown)" + ] + }, + { + "cell_type": "markdown", + "id": "a2f4b041-157b-4615-9f62-9223718b5cc1", + "metadata": {}, + "source": [ + "## 小结\n", + "paddle-ocr 0.9B虽然参数量不大,但效果还是很不错的,也适合个人搭建自己的小工作流进行使用,推荐大家尝试尝试。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}