From eacd98578641ad4719a8262aaee26c1770a22d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Thu, 26 Feb 2026 14:45:19 +0800 Subject: [PATCH] fix: File memory parsing to output a list-type result --- .../read_multi_modal/file_content_parser.py | 117 +++++++++++------- src/memos/templates/mem_reader_prompts.py | 34 +++-- 2 files changed, 92 insertions(+), 59 deletions(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 2b49d63ba..1b4add398 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -50,7 +50,9 @@ class FileContentParser(BaseMessageParser): """Parser for file content parts.""" - def _get_doc_llm_response(self, chunk_text: str, custom_tags: list[str] | None = None) -> dict: + def _get_doc_llm_response( + self, chunk_text: str, custom_tags: list[str] | None = None + ) -> dict | list: """ Call LLM to extract memory from document chunk. Uses doc prompts from DOC_PROMPT_DICT. @@ -60,7 +62,7 @@ def _get_doc_llm_response(self, chunk_text: str, custom_tags: list[str] | None = custom_tags: Optional list of custom tags for LLM extraction Returns: - Parsed JSON response from LLM or empty dict if failed + Parsed JSON response from LLM (dict or list) or empty dict if failed """ if not self.llm: logger.warning("[FileContentParser] LLM not available for fine mode") @@ -777,35 +779,49 @@ def _make_fallback( return [_make_fallback(idx, text, "no_llm") for idx, text in valid_chunks] # Process single chunk with LLM extraction (worker function) - def _process_chunk(chunk_idx: int, chunk_text: str) -> TextualMemoryItem: - """Process chunk with LLM, fallback to raw on failure.""" + def _process_chunk(chunk_idx: int, chunk_text: str) -> list[TextualMemoryItem]: + """Process chunk with LLM, fallback to raw on failure. Returns list of memory items.""" try: response_json = self._get_doc_llm_response(chunk_text, custom_tags) if response_json: - value = response_json.get("value", "").strip() - if value: - tags = response_json.get("tags", []) - tags = tags if isinstance(tags, list) else [] - tags.extend(["mode:fine", "multimodal:file"]) - - llm_mem_type = response_json.get("memory_type", memory_type) - if llm_mem_type not in ["LongTermMemory", "UserMemory"]: - llm_mem_type = memory_type - - return _make_memory_item( - value=value, - mem_type=llm_mem_type, - tags=tags, - key=response_json.get("key"), - chunk_idx=chunk_idx, - chunk_content=chunk_text, - ) + # Handle list format response + response_list = response_json.get("memory list", []) + memory_items = [] + for item_data in response_list: + if not isinstance(item_data, dict): + continue + + value = item_data.get("value", "").strip() + if value: + tags = item_data.get("tags", []) + tags = tags if isinstance(tags, list) else [] + tags.extend(["mode:fine", "multimodal:file"]) + key_str = item_data.get("key", "") + + llm_mem_type = item_data.get("memory_type", memory_type) + if llm_mem_type not in ["LongTermMemory", "UserMemory"]: + llm_mem_type = memory_type + + memory_item = _make_memory_item( + value=value, + mem_type=llm_mem_type, + tags=tags, + key=key_str, + chunk_idx=chunk_idx, + chunk_content=chunk_text, + ) + memory_items.append(memory_item) + + if memory_items: + return memory_items + else: + return [_make_fallback(chunk_idx, chunk_text)] except Exception as e: logger.error(f"[FileContentParser] LLM error for chunk {chunk_idx}: {e}") # Fallback to raw chunk logger.warning(f"[FileContentParser] Fallback to raw for chunk {chunk_idx}") - return _make_fallback(chunk_idx, chunk_text) + return [_make_fallback(chunk_idx, chunk_text)] def _relate_chunks(items: list[TextualMemoryItem]) -> None: """ @@ -853,30 +869,37 @@ def get_chunk_idx(item: TextualMemoryItem) -> int: ): chunk_idx = futures[future] try: - node = future.result() - memory_items.append(node) - - # Check if this node is a fallback by checking tags - is_fallback = any(tag.startswith("fallback:") for tag in node.metadata.tags) - if is_fallback: - fallback_count += 1 - - # save raw file - node_id = node.id - if node.memory != node.metadata.sources[0].content: - chunk_node = _make_memory_item( - value=node.metadata.sources[0].content, - mem_type="RawFileMemory", - tags=[ - "mode:fine", - "multimodal:file", - f"chunk:{chunk_idx + 1}/{total_chunks}", - ], - chunk_idx=chunk_idx, - chunk_content="", - ) - chunk_node.metadata.summary_ids = [node_id] - memory_items.append(chunk_node) + nodes = future.result() + memory_items.extend(nodes) + + # Check if any node is a fallback by checking tags + has_fallback = False + for node in nodes: + is_fallback = any(tag.startswith("fallback:") for tag in node.metadata.tags) + if is_fallback: + fallback_count += 1 + has_fallback = True + + # save raw file only if no fallback (all nodes are LLM-extracted) + if not has_fallback and nodes: + # Use first node's source info for raw file + first_node = nodes[0] + if first_node.metadata.sources and len(first_node.metadata.sources) > 0: + # Collect all node IDs for summary_ids + node_ids = [node.id for node in nodes] + chunk_node = _make_memory_item( + value=first_node.metadata.sources[0].content, + mem_type="RawFileMemory", + tags=[ + "mode:fine", + "multimodal:file", + f"chunk:{chunk_idx + 1}/{total_chunks}", + ], + chunk_idx=chunk_idx, + chunk_content="", + ) + chunk_node.metadata.summary_ids = node_ids + memory_items.append(chunk_node) except Exception as e: tqdm.write(f"[ERROR] Chunk {chunk_idx} failed: {e}") diff --git a/src/memos/templates/mem_reader_prompts.py b/src/memos/templates/mem_reader_prompts.py index e4f1ca334..f431bd041 100644 --- a/src/memos/templates/mem_reader_prompts.py +++ b/src/memos/templates/mem_reader_prompts.py @@ -244,12 +244,17 @@ Return a single valid JSON object with the following structure: -Return valid JSON: { - "key": , - "memory_type": "LongTermMemory", - "value": , - "tags": + "memory list": [ + { + "key": , + "memory_type": "LongTermMemory", + "value": , + "tags": + } + ... + ], + "summary": } Language rules: @@ -264,7 +269,7 @@ Your Output:""" SIMPLE_STRUCT_DOC_READER_PROMPT_ZH = """您是搜索与检索系统的文本分析专家。 -您的任务是处理文档片段,并生成一个结构化的 JSON 对象。 +您的任务是处理文档片段,并生成一个结构化的 JSON 列表对象。 请执行以下操作: 1. 识别反映文档中事实内容、见解、决策或含义的关键信息——包括任何显著的主题、结论或数据点,使读者无需阅读原文即可充分理解该片段的核心内容。 @@ -281,14 +286,19 @@ - 优先考虑完整性和保真度,而非简洁性。 - 不要泛化或跳过可能具有上下文意义的细节。 -返回一个有效的 JSON 对象,结构如下: +返回有效的 JSON 对象: -返回有效的 JSON: { - "key": <字符串,`value` 字段的简洁标题>, - "memory_type": "LongTermMemory", - "value": <一段清晰准确的段落,全面总结文档片段中的主要观点、论据和信息——若输入摘要为英文,则用英文;若为中文,则用中文>, - "tags": <相关主题关键词列表(例如,["截止日期", "团队", "计划"])> + "memory list": [ + { + "key": <字符串,`value` 字段的简洁标题>, + "memory_type": "LongTermMemory", + "value": <一段清晰准确的段落,全面总结文档片段中的主要观点、论据和信息——若输入摘要为英文,则用英文;若为中文,则用中文>, + "tags": <相关主题关键词列表(例如,["截止日期", "团队", "计划"])> + } + ... + ], + "summary": <简洁总结原文内容,与输入语言一致> } 语言规则: