From 3bd0959d4f7684afe7c3069c7717f1a443b966c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Mon, 2 Feb 2026 11:06:31 +0800 Subject: [PATCH 1/2] fix: add fileurl to memoryvalue --- .../read_multi_modal/file_content_parser.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index fbc704d0b..00da08b1c 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -412,7 +412,6 @@ def parse_fast( # Extract file parameters (all are optional) file_data = file_info.get("file_data", "") file_id = file_info.get("file_id", "") - filename = file_info.get("filename", "") file_url_flag = False # Build content string based on available information content_parts = [] @@ -433,25 +432,12 @@ def parse_fast( # Check if it looks like a URL elif file_data.startswith(("http://", "https://", "file://")): file_url_flag = True - content_parts.append(f"[File URL: {file_data}]") else: # TODO: split into multiple memory items content_parts.append(file_data) else: content_parts.append(f"[File Data: {type(file_data).__name__}]") - # Priority 2: If file_id is provided, reference it - if file_id: - content_parts.append(f"[File ID: {file_id}]") - - # Priority 3: If filename is provided, include it - if filename: - content_parts.append(f"[Filename: {filename}]") - - # If no content can be extracted, create a placeholder - if not content_parts: - content_parts.append("[File: unknown]") - # Combine content parts content = " ".join(content_parts) From 55bb67d3164d603c9f12f5604f2f7a5fb469eba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Mon, 2 Feb 2026 17:24:29 +0800 Subject: [PATCH 2/2] Extract the phrases from the key and input them into the tags. --- src/memos/mem_reader/read_multi_modal/base.py | 2 ++ .../mem_reader/read_multi_modal/file_content_parser.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/memos/mem_reader/read_multi_modal/base.py b/src/memos/mem_reader/read_multi_modal/base.py index 95d427864..737a3fe1e 100644 --- a/src/memos/mem_reader/read_multi_modal/base.py +++ b/src/memos/mem_reader/read_multi_modal/base.py @@ -15,6 +15,7 @@ TextualMemoryItem, TreeNodeTextualMemoryMetadata, ) +from memos.memories.textual.tree_text_memory.retrieve.retrieve_utils import FastTokenizer from memos.utils import timed from .utils import detect_lang, get_text_splitter @@ -90,6 +91,7 @@ def __init__(self, embedder, llm=None): """ self.embedder = embedder self.llm = llm + self.tokenizer = FastTokenizer(use_jieba=True, use_stopwords=True) @abstractmethod def create_source( diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 00da08b1c..9f4ab94c2 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -710,7 +710,7 @@ def _make_fallback( chunk_idx: int, chunk_text: str, reason: str = "raw" ) -> TextualMemoryItem: """Create fallback memory item with raw chunk text.""" - return _make_memory_item( + raw_chunk_mem = _make_memory_item( value=chunk_text, tags=[ "mode:fine", @@ -721,6 +721,11 @@ def _make_fallback( chunk_idx=chunk_idx, chunk_content=chunk_text, ) + tags_list = self.tokenizer.tokenize_mixed(raw_chunk_mem.metadata.key) + tags_list = [tag for tag in tags_list if len(tag) > 1] + tags_list = sorted(tags_list, key=len, reverse=True) + raw_chunk_mem.metadata.tags.extend(tags_list[:5]) + return raw_chunk_mem # Handle empty chunks case if not valid_chunks: