From a0dfab482f4cd93605628719c04568e6678ff46d Mon Sep 17 00:00:00 2001 From: YumemiDream <1803068130@qq.com> Date: Thu, 4 Jun 2026 19:41:19 +0800 Subject: [PATCH 1/6] fix: prevent plugin from overwriting manually edited jargon meanings - Add meaning_edited flag to Jargon ORM and dataclass - Set meaning_edited=True when user edits meaning via dashboard - _should_infer_meaning skips jargon with meaning_edited=True - Auto-migration will add the new column on startup --- models/jargon.py | 2 ++ models/orm/jargon.py | 2 ++ services/database/facades/jargon_facade.py | 4 ++++ services/jargon/jargon_miner.py | 3 +++ webui/services/jargon_service.py | 1 + 5 files changed, 12 insertions(+) diff --git a/models/jargon.py b/models/jargon.py index a3272c5f..109b1d0b 100644 --- a/models/jargon.py +++ b/models/jargon.py @@ -20,6 +20,7 @@ class Jargon: count: int = 1 # 出现次数 last_inference_count: int = 0 # 上次推断时的count值 is_complete: bool = False # 是否完成所有推断 (count>=100) + meaning_edited: bool = False # 用户是否手动编辑过释义 is_global: bool = False # 是否全局黑话 chat_id: str = "" # 群组ID created_at: Optional[datetime] = None # 创建时间 @@ -36,6 +37,7 @@ def to_dict(self) -> dict: 'count': self.count, 'last_inference_count': self.last_inference_count, 'is_complete': self.is_complete, + 'meaning_edited': self.meaning_edited, 'is_global': self.is_global, 'chat_id': self.chat_id, 'created_at': self.created_at.isoformat() if self.created_at else None, diff --git a/models/orm/jargon.py b/models/orm/jargon.py index b8cf462f..cf313279 100644 --- a/models/orm/jargon.py +++ b/models/orm/jargon.py @@ -19,6 +19,7 @@ class Jargon(Base): count = Column(Integer, default=1) last_inference_count = Column(Integer, default=0) is_complete = Column(Boolean, default=False) + meaning_edited = Column(Boolean, default=False) is_global = Column(Boolean, default=False) chat_id = Column(String(255), nullable=False, index=True) # 使用 BigInteger 存储 Unix 时间戳(自动迁移会将 DATETIME 转换为 BIGINT) @@ -46,6 +47,7 @@ def to_dict(self): 'count': 
self.count, 'last_inference_count': self.last_inference_count, 'is_complete': self.is_complete, + 'meaning_edited': self.meaning_edited, 'is_global': self.is_global, 'chat_id': self.chat_id, 'created_at': self.created_at, diff --git a/services/database/facades/jargon_facade.py b/services/database/facades/jargon_facade.py index f978fd59..448e1d20 100644 --- a/services/database/facades/jargon_facade.py +++ b/services/database/facades/jargon_facade.py @@ -164,6 +164,9 @@ async def update_jargon(self, jargon_data: Dict[str, Any]) -> bool: record.meaning = json.dumps(meaning_val, ensure_ascii=False) else: record.meaning = str(meaning_val) if meaning_val is not None else None + # Only mark meaning_edited when explicitly set (not from inference) + if jargon_data.get('meaning_edited'): + record.meaning_edited = True if 'is_jargon' in jargon_data: record.is_jargon = jargon_data['is_jargon'] if 'count' in jargon_data: @@ -476,6 +479,7 @@ async def search_jargon( 'is_jargon': r.is_jargon, 'count': r.count or 0, 'is_complete': r.is_complete, + 'meaning_edited': r.meaning_edited or False, 'is_global': r.is_global or False, 'chat_id': r.chat_id, 'created_at': r.created_at, diff --git a/services/jargon/jargon_miner.py b/services/jargon/jargon_miner.py index 97ae33a2..b30002f1 100644 --- a/services/jargon/jargon_miner.py +++ b/services/jargon/jargon_miner.py @@ -299,6 +299,8 @@ def _should_infer_meaning(self, jargon: Jargon) -> bool: """ if jargon.is_complete: return False + if jargon.meaning_edited: + return False count = jargon.count or 0 last_inference = jargon.last_inference_count or 0 @@ -512,6 +514,7 @@ async def save_or_update_jargon( count=existing_dict.get('count', 1), last_inference_count=existing_dict.get('last_inference_count', 0), is_complete=existing_dict.get('is_complete', False), + meaning_edited=existing_dict.get('meaning_edited', False), is_global=existing_dict.get('is_global', False), chat_id=existing_dict.get('chat_id', ''), 
created_at=existing_dict.get('created_at'), diff --git a/webui/services/jargon_service.py b/webui/services/jargon_service.py index ef4badb9..2f82192e 100644 --- a/webui/services/jargon_service.py +++ b/webui/services/jargon_service.py @@ -352,6 +352,7 @@ async def update_jargon( payload["content"] = content if meaning is not None: payload["meaning"] = meaning + payload["meaning_edited"] = True if len(payload) <= 1: return False, "没有需要更新的字段", self._format_jargon_for_frontend(current) From 0194dd411170b5c17afdc1fcc51f3ea3654d694f Mon Sep 17 00:00:00 2001 From: YumemiDream <1803068130@qq.com> Date: Thu, 4 Jun 2026 19:58:42 +0800 Subject: [PATCH 2/6] fix: change jargon count to actual chat occurrence frequency - Add sync_jargon_counts to facade: bulk-update count from statistical filter's term frequency table - mine_jargon syncs filter frequencies to DB before inference - Remove manual count+1 in save_or_update_jargon (count now managed by frequency sync) - Inference thresholds [3,6,10,20,40,60,100] now reflect actual chat occurrences, not LLM validation pass count --- services/database/facades/jargon_facade.py | 46 +++++++++++++++++++ .../database/sqlalchemy_database_manager.py | 3 ++ services/jargon/jargon_miner.py | 3 +- services/learning/message_pipeline.py | 10 ++++ 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/services/database/facades/jargon_facade.py b/services/database/facades/jargon_facade.py index 448e1d20..cba48273 100644 --- a/services/database/facades/jargon_facade.py +++ b/services/database/facades/jargon_facade.py @@ -195,6 +195,52 @@ async def update_jargon(self, jargon_data: Dict[str, Any]) -> bool: self._logger.error(f"[JargonFacade] 更新黑话失败: {e}", exc_info=True) return False + # 3b. sync_jargon_counts + async def sync_jargon_counts( + self, chat_id: str, term_frequencies: Dict[str, int] + ) -> int: + """Sync occurrence counts for existing jargon in a group. + + Only updates jargon terms that already exist in DB for *chat_id*. 
+ Returns the number of records updated. + + Args: + chat_id: Group ID. + term_frequencies: ``{term: actual_chat_count}`` from the + statistical filter. + """ + if not term_frequencies: + return 0 + + updated = 0 + try: + async with self.get_session() as session: + stmt = select(Jargon).where( + Jargon.chat_id == chat_id, + Jargon.content.in_(list(term_frequencies.keys())), + ) + result = await session.execute(stmt) + records = result.scalars().all() + + now = int(time.time()) + for record in records: + new_count = term_frequencies.get(record.content, 0) + if new_count > (record.count or 0): + record.count = new_count + record.updated_at = now + updated += 1 + + if updated: + await session.commit() + self._logger.debug( + f"[JargonFacade] 同步黑话出现次数: " + f"chat={chat_id}, updated={updated}" + ) + except Exception as e: + self._logger.error(f"[JargonFacade] 同步黑话出现次数失败: {e}", exc_info=True) + + return updated + # 4. get_jargon_statistics async def get_jargon_statistics(self, group_id: str = None) -> Dict[str, Any]: """获取黑话学习统计信息 diff --git a/services/database/sqlalchemy_database_manager.py b/services/database/sqlalchemy_database_manager.py index 82fc5b4a..e13af6bc 100644 --- a/services/database/sqlalchemy_database_manager.py +++ b/services/database/sqlalchemy_database_manager.py @@ -960,6 +960,9 @@ async def insert_jargon(self, jargon_data: Dict[str, Any]) -> Optional[int]: async def update_jargon(self, jargon_data: Dict[str, Any]) -> bool: return await self._call_jargon("update_jargon", False, jargon_data) + async def sync_jargon_counts(self, chat_id: str, term_frequencies: Dict[str, int]) -> int: + return await self._call_jargon("sync_jargon_counts", 0, chat_id, term_frequencies) + async def get_jargon_statistics(self, group_id: str = None) -> Dict[str, Any]: return await self._call_jargon( "get_jargon_statistics", diff --git a/services/jargon/jargon_miner.py b/services/jargon/jargon_miner.py index b30002f1..589d59d2 100644 --- a/services/jargon/jargon_miner.py +++ 
b/services/jargon/jargon_miner.py @@ -521,8 +521,7 @@ async def save_or_update_jargon( updated_at=existing_dict.get('updated_at') ) - # 更新现有记录 - existing.count = (existing.count or 0) + 1 + # 更新现有记录(count 由频率同步管理,此处不递增) # 合并 raw_content existing_list = safe_parse_llm_json(existing.raw_content) or [] diff --git a/services/learning/message_pipeline.py b/services/learning/message_pipeline.py index 6e8be3a2..3be8d547 100644 --- a/services/learning/message_pipeline.py +++ b/services/learning/message_pipeline.py @@ -256,6 +256,16 @@ async def mine_jargon(self, group_id: str) -> None: if not statistical_candidates: statistical_candidates = None + # Sync actual chat occurrence counts to DB + term_freq = self._jargon_statistical_filter._group_term_freq.get(group_id) + if term_freq: + try: + await self._db_manager.sync_jargon_counts( + group_id, dict(term_freq) + ) + except Exception as e: + logger.debug(f"[JargonMining] Frequency sync failed: {e}") + await jargon_miner.run_once( chat_messages, len(recent_messages), From 4ef77bf7761bf310e2d9ee9f201be93d3181d0bb Mon Sep 17 00:00:00 2001 From: YumemiDream <1803068130@qq.com> Date: Thu, 4 Jun 2026 20:08:37 +0800 Subject: [PATCH 3/6] Revert "fix: change jargon count to actual chat occurrence frequency" This reverts commit 0194dd411170b5c17afdc1fcc51f3ea3654d694f. --- services/database/facades/jargon_facade.py | 46 ------------------- .../database/sqlalchemy_database_manager.py | 3 -- services/jargon/jargon_miner.py | 3 +- services/learning/message_pipeline.py | 10 ---- 4 files changed, 2 insertions(+), 60 deletions(-) diff --git a/services/database/facades/jargon_facade.py b/services/database/facades/jargon_facade.py index cba48273..448e1d20 100644 --- a/services/database/facades/jargon_facade.py +++ b/services/database/facades/jargon_facade.py @@ -195,52 +195,6 @@ async def update_jargon(self, jargon_data: Dict[str, Any]) -> bool: self._logger.error(f"[JargonFacade] 更新黑话失败: {e}", exc_info=True) return False - # 3b. 
sync_jargon_counts - async def sync_jargon_counts( - self, chat_id: str, term_frequencies: Dict[str, int] - ) -> int: - """Sync occurrence counts for existing jargon in a group. - - Only updates jargon terms that already exist in DB for *chat_id*. - Returns the number of records updated. - - Args: - chat_id: Group ID. - term_frequencies: ``{term: actual_chat_count}`` from the - statistical filter. - """ - if not term_frequencies: - return 0 - - updated = 0 - try: - async with self.get_session() as session: - stmt = select(Jargon).where( - Jargon.chat_id == chat_id, - Jargon.content.in_(list(term_frequencies.keys())), - ) - result = await session.execute(stmt) - records = result.scalars().all() - - now = int(time.time()) - for record in records: - new_count = term_frequencies.get(record.content, 0) - if new_count > (record.count or 0): - record.count = new_count - record.updated_at = now - updated += 1 - - if updated: - await session.commit() - self._logger.debug( - f"[JargonFacade] 同步黑话出现次数: " - f"chat={chat_id}, updated={updated}" - ) - except Exception as e: - self._logger.error(f"[JargonFacade] 同步黑话出现次数失败: {e}", exc_info=True) - - return updated - # 4. 
get_jargon_statistics async def get_jargon_statistics(self, group_id: str = None) -> Dict[str, Any]: """获取黑话学习统计信息 diff --git a/services/database/sqlalchemy_database_manager.py b/services/database/sqlalchemy_database_manager.py index e13af6bc..82fc5b4a 100644 --- a/services/database/sqlalchemy_database_manager.py +++ b/services/database/sqlalchemy_database_manager.py @@ -960,9 +960,6 @@ async def insert_jargon(self, jargon_data: Dict[str, Any]) -> Optional[int]: async def update_jargon(self, jargon_data: Dict[str, Any]) -> bool: return await self._call_jargon("update_jargon", False, jargon_data) - async def sync_jargon_counts(self, chat_id: str, term_frequencies: Dict[str, int]) -> int: - return await self._call_jargon("sync_jargon_counts", 0, chat_id, term_frequencies) - async def get_jargon_statistics(self, group_id: str = None) -> Dict[str, Any]: return await self._call_jargon( "get_jargon_statistics", diff --git a/services/jargon/jargon_miner.py b/services/jargon/jargon_miner.py index 589d59d2..b30002f1 100644 --- a/services/jargon/jargon_miner.py +++ b/services/jargon/jargon_miner.py @@ -521,7 +521,8 @@ async def save_or_update_jargon( updated_at=existing_dict.get('updated_at') ) - # 更新现有记录(count 由频率同步管理,此处不递增) + # 更新现有记录 + existing.count = (existing.count or 0) + 1 # 合并 raw_content existing_list = safe_parse_llm_json(existing.raw_content) or [] diff --git a/services/learning/message_pipeline.py b/services/learning/message_pipeline.py index 3be8d547..6e8be3a2 100644 --- a/services/learning/message_pipeline.py +++ b/services/learning/message_pipeline.py @@ -256,16 +256,6 @@ async def mine_jargon(self, group_id: str) -> None: if not statistical_candidates: statistical_candidates = None - # Sync actual chat occurrence counts to DB - term_freq = self._jargon_statistical_filter._group_term_freq.get(group_id) - if term_freq: - try: - await self._db_manager.sync_jargon_counts( - group_id, dict(term_freq) - ) - except Exception as e: - logger.debug(f"[JargonMining] 
Frequency sync failed: {e}") - await jargon_miner.run_once( chat_messages, len(recent_messages), From 28a9de16ac82ac1ba39d83a8683014400710efc5 Mon Sep 17 00:00:00 2001 From: YumemiDream <1803068130@qq.com> Date: Thu, 4 Jun 2026 20:09:16 +0800 Subject: [PATCH 4/6] fix: remove jargon sort by occurrences (count not reliable) --- web_res/static/html/dashboard.html | 3 --- 1 file changed, 3 deletions(-) diff --git a/web_res/static/html/dashboard.html b/web_res/static/html/dashboard.html index 71368b6b..011c1c87 100644 --- a/web_res/static/html/dashboard.html +++ b/web_res/static/html/dashboard.html @@ -2489,7 +2489,6 @@

黑话与批次

- @@ -6399,8 +6398,6 @@

${escapeHtml(item.title)}

items.reverse(); } else if (state.jargon.sort === 'name') { items.sort((a, b) => (a.term || a.word || '').localeCompare(b.term || b.word || '')); - } else if (state.jargon.sort === 'occurrences') { - items.sort((a, b) => (b.occurrences || 0) - (a.occurrences || 0)); } state.jargon.items = items; From 6a9b68f71698b3a8aa5af4642f5bdff96f6dcbdf Mon Sep 17 00:00:00 2001 From: YumemiDream <1803068130@qq.com> Date: Thu, 4 Jun 2026 20:26:39 +0800 Subject: [PATCH 5/6] fix: prioritize real dialogue pairs over LLM-generated patterns Style learning now extracts user->bot pairs from actual chat history first (chronologically matched), falling back to LLM-generated expression patterns only when no real pairs are found. --- services/core_learning/progressive_learning.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/services/core_learning/progressive_learning.py b/services/core_learning/progressive_learning.py index c220cb97..0087c013 100644 --- a/services/core_learning/progressive_learning.py +++ b/services/core_learning/progressive_learning.py @@ -1206,13 +1206,9 @@ async def _save_style_learning_record(self, group_id: str, style_analysis: Dict[ logger.debug(f"群组 {group_id} 没有风格分析结果且没有消息,跳过风格学习记录保存") return - # 1. 保存表达模式到 expression_patterns 表 - expression_patterns = style_analysis_dict.get('expression_patterns', []) - expression_patterns = self._filter_expression_patterns(expression_patterns) - - # 在 fewshot 模式下,style_analysis 可能不包含 expression_patterns。 - # 此时从数据库获取 bot 消息与用户消息合并,提取 user->bot 对话对。 - if not expression_patterns and messages: + # 1. 
优先从真实对话中提取 user->bot 对话对(逻辑连贯) + expression_patterns = [] + if messages: try: merged = await self._merge_bot_messages_for_pairs(group_id, messages) if merged: @@ -1220,6 +1216,11 @@ async def _save_style_learning_record(self, group_id: str, style_analysis: Dict[ except Exception as pair_err: logger.debug(f"提取 fewshot 对话对失败: {pair_err}") + # 真实对话对不足时,回退到 LLM 生成的表达模式 + if not expression_patterns: + expression_patterns = style_analysis_dict.get('expression_patterns', []) + expression_patterns = self._filter_expression_patterns(expression_patterns) + if expression_patterns: await self._save_expression_patterns(group_id, expression_patterns) From adaffe25d88bb210ea9deeff9ec348d4591cdf69 Mon Sep 17 00:00:00 2001 From: YumemiDream <1803068130@qq.com> Date: Thu, 4 Jun 2026 20:39:15 +0800 Subject: [PATCH 6/6] fix: deduplicate style learning dialogue pairs at extraction and injection - Extraction: _extract_fewshot_pairs_from_merged deduplicates by (situation, expression) content within a single batch - Extraction: _extract_style_dialog_pairs deduplicates learned_patterns and few-shot pairs - Injection: _build_style_begin_dialogs checks existing begin_dialogs for matching user messages before appending, preventing duplicates across multiple approved reviews --- .../core_learning/progressive_learning.py | 7 +++++ webui/services/persona_review_service.py | 27 +++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/services/core_learning/progressive_learning.py b/services/core_learning/progressive_learning.py index 0087c013..3a551598 100644 --- a/services/core_learning/progressive_learning.py +++ b/services/core_learning/progressive_learning.py @@ -1373,8 +1373,10 @@ def _extract_fewshot_pairs_from_merged( Mirrors the logic of ExpressionPatternLearner._extract_few_shot_pairs but operates on plain dicts and returns expression pattern dicts. + Deduplicates by (situation, expression) content. 
""" pairs = [] + seen = set() current_time = time.time() for i in range(len(merged) - 1): @@ -1400,6 +1402,11 @@ def _extract_fewshot_pairs_from_merged( if '@' in msg_text or '@' in nxt_text: continue + key = (msg_text[:50], nxt_text[:100]) + if key in seen: + continue + seen.add(key) + pairs.append({ 'situation': msg_text[:50], 'expression': nxt_text[:100], diff --git a/webui/services/persona_review_service.py b/webui/services/persona_review_service.py index f6f06348..ce8bad5d 100644 --- a/webui/services/persona_review_service.py +++ b/webui/services/persona_review_service.py @@ -176,19 +176,27 @@ def _build_change_payload( @staticmethod def _extract_style_dialog_pairs(review: Dict[str, Any]) -> List[Tuple[str, str]]: - """Extract style review dialog pairs from structured patterns or few-shot text.""" + """Extract style review dialog pairs from structured patterns or few-shot text. Deduplicates by content.""" dialog_pairs = [] + seen = set() learned_patterns = review.get('learned_patterns', []) for pattern in learned_patterns: situation = pattern.get('situation', '') if isinstance(pattern, dict) else '' expression = pattern.get('expression', '') if isinstance(pattern, dict) else '' if situation and expression: - dialog_pairs.append((str(situation), str(expression))) + key = (str(situation), str(expression)) + if key not in seen: + seen.add(key) + dialog_pairs.append(key) if not dialog_pairs: - dialog_pairs = PersonaReviewService._parse_few_shots_to_pairs( + for user_msg, assistant_msg in PersonaReviewService._parse_few_shots_to_pairs( review.get('few_shots_content', '') or '' - ) + ): + key = (user_msg, assistant_msg) + if key not in seen: + seen.add(key) + dialog_pairs.append(key) return dialog_pairs def _dialog_pairs_for_style_review(self, review: Dict[str, Any]) -> List[Tuple[str, str]]: @@ -199,10 +207,19 @@ def _build_style_begin_dialogs( current_begin_dialogs: List[str], dialog_pairs: List[Tuple[str, str]] ) -> List[str]: - """Append style examples and keep 
only latest style example pairs.""" + """Append style examples, skipping pairs whose user message already appears in begin_dialogs.""" updated_dialogs = PersonaReviewService._normalize_begin_dialogs(current_begin_dialogs) + # Build set of existing user messages for dedup + existing_user_msgs = set() + for d in updated_dialogs: + if isinstance(d, str) and d.startswith(STYLE_BEGIN_DIALOG_PREFIX): + existing_user_msgs.add(d[len(STYLE_BEGIN_DIALOG_PREFIX):].strip()) + for user_msg, assistant_msg in dialog_pairs: + if user_msg.strip() in existing_user_msgs: + continue + existing_user_msgs.add(user_msg.strip()) updated_dialogs.append(f"{STYLE_BEGIN_DIALOG_PREFIX}{user_msg}") updated_dialogs.append(str(assistant_msg))