diff --git a/src/memos/mem_reader/simple_struct.py b/src/memos/mem_reader/simple_struct.py index 0f74adead..13515c038 100644 --- a/src/memos/mem_reader/simple_struct.py +++ b/src/memos/mem_reader/simple_struct.py @@ -67,9 +67,18 @@ def detect_lang(text): try: if not text or not isinstance(text, str): return "en" + cleaned_text = text + # remove role and timestamp + cleaned_text = re.sub( + r"\b(user|assistant|query|answer)\s*:", "", cleaned_text, flags=re.IGNORECASE + ) + cleaned_text = re.sub(r"\[[\d\-:\s]+\]", "", cleaned_text) + + # extract chinese characters chinese_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]" - chinese_chars = re.findall(chinese_pattern, text) - if len(chinese_chars) / len(re.sub(r"[\s\d\W]", "", text)) > 0.3: + chinese_chars = re.findall(chinese_pattern, cleaned_text) + text_without_special = re.sub(r"[\s\d\W]", "", cleaned_text) + if text_without_special and len(chinese_chars) / len(text_without_special) > 0.3: return "zh" return "en" except Exception: @@ -466,15 +475,11 @@ def get_scene_data_info(self, scene_data: list, type: str) -> list[str]: if type == "chat": for items in scene_data: result = [] - for item in items: - # Convert dictionary to string - if "chat_time" in item: - result.append(item) - else: - result.append(item) + for i, item in enumerate(items): + result.append(item) if len(result) >= 10: results.append(result) - context = copy.deepcopy(result[-2:]) + context = copy.deepcopy(result[-2:]) if i + 1 < len(items) else [] result = context if result: results.append(result) diff --git a/src/memos/templates/instruction_completion.py b/src/memos/templates/instruction_completion.py index 03ae52c77..b88ff474c 100644 --- a/src/memos/templates/instruction_completion.py +++ b/src/memos/templates/instruction_completion.py @@ -45,7 +45,10 @@ def instruct_completion( "zh": "隐式偏好 > ", "en": "implicit preference > ", } - lang = detect_lang(explicit_pref_str + implicit_pref_str) + lang = detect_lang( + explicit_pref_str.replace("Explicit Preference:\n", "") + + implicit_pref_str.replace("Implicit Preference:\n", "") + ) if not explicit_pref_str and not implicit_pref_str: return "", ""