2 changes: 2 additions & 0 deletions .gitignore
@@ -216,3 +216,5 @@ cython_debug/
 outputs

 evaluation/data/temporal_locomo
+test_add_pipeline.py
+test_file_pipeline.py
40 changes: 40 additions & 0 deletions src/memos/chunkers/base.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod

 from memos.configs.chunker import BaseChunkerConfig
+import re


 class Chunk:
@@ -22,3 +23,42 @@ def __init__(self, config: BaseChunkerConfig):
     @abstractmethod
     def chunk(self, text: str) -> list[Chunk]:
         """Chunk the given text into smaller chunks."""
+
+    def protect_urls(self, text: str) -> tuple[str, dict[str, str]]:
+        """
+        Protect URLs in text from being split during chunking.
+
+        Args:
+            text: Text to process
+
+        Returns:
+            tuple: (Text with URLs replaced by placeholders, URL mapping dictionary)
+        """
+        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
+        url_map = {}
+
+        def replace_url(match):
+            url = match.group(0)
+            placeholder = f"__URL_{len(url_map)}__"
+            url_map[placeholder] = url
+            return placeholder
+
+        protected_text = re.sub(url_pattern, replace_url, text)
+        return protected_text, url_map
+
+    def restore_urls(self, text: str, url_map: dict[str, str]) -> str:
+        """
+        Restore protected URLs in text back to their original form.
+
+        Args:
+            text: Text with URL placeholders
+            url_map: URL mapping dictionary from protect_urls
+
+        Returns:
+            str: Text with URLs restored
+        """
+        restored_text = text
+        for placeholder, url in url_map.items():
+            restored_text = restored_text.replace(placeholder, url)
+
Comment on lines +60 to +63 (Copilot AI, Feb 2, 2026):

The URL restoration uses sequential string replacements in a loop, which has O(n*m) complexity, where n is the number of URLs and m is the text length. For large texts with many URLs, this could be inefficient. Consider using a single regex substitution with a replacement function that looks up placeholders in the url_map dictionary, or building the result string in a single pass.

Suggested change:
-        restored_text = text
-        for placeholder, url in url_map.items():
-            restored_text = restored_text.replace(placeholder, url)
+        placeholder_pattern = r'__URL_\d+__'
+
+        def replacer(match: re.Match) -> str:
+            placeholder = match.group(0)
+            # If the placeholder is not found, leave it unchanged
+            return url_map.get(placeholder, placeholder)
+
+        restored_text = re.sub(placeholder_pattern, replacer, text)

+        return restored_text
Comment on lines +27 to +64 (Copilot AI, Feb 2, 2026):

The new URL protection and restoration functionality in the base chunker lacks test coverage. Since other chunkers have tests (e.g., test_sentence_chunker.py), tests should be added to verify that URLs are properly protected during chunking and restored afterwards, including edge cases such as URLs at chunk boundaries or multiple URLs in the same chunk.
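
A minimal sketch of such a test, exercising protect_urls/restore_urls through a trivial concrete subclass (the subclass, test name, and file are hypothetical, not part of this PR):

import re

from memos.chunkers.base import BaseChunker


class _DummyChunker(BaseChunker):
    """Concrete stand-in so the abstract base can be instantiated."""

    def __init__(self):
        pass  # skip config handling for this sketch

    def chunk(self, text):
        return [text]


def test_protect_and_restore_urls_roundtrip():
    chunker = _DummyChunker()
    text = "See https://example.com/a?b=1 and http://foo.bar/baz for details."

    protected, url_map = chunker.protect_urls(text)

    # Both URLs are replaced by placeholders and recorded in the map
    assert "https://example.com/a?b=1" not in protected
    assert re.findall(r"__URL_\d+__", protected) == list(url_map.keys())

    # Restoring yields the original text
    assert chunker.restore_urls(protected, url_map) == text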
4 changes: 3 additions & 1 deletion src/memos/chunkers/charactertext_chunker.py
@@ -36,6 +36,8 @@ def __init__(

     def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        chunks = self.chunker.split_text(text)
+        protected_text, url_map = self.protect_urls(text)
+        chunks = self.chunker.split_text(protected_text)
+        chunks = [self.restore_urls(chunk, url_map) for chunk in chunks]
         logger.debug(f"Generated {len(chunks)} chunks from input text")
         return chunks
96 changes: 94 additions & 2 deletions src/memos/chunkers/markdown_chunker.py
@@ -2,6 +2,8 @@
 from memos.dependency import require_python_package
 from memos.log import get_logger

+import re
+
 from .base import BaseChunker, Chunk


@@ -22,13 +24,15 @@ def __init__(
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
         recursive: bool = False,
+        auto_fix_headers: bool = True,

Copilot AI, Feb 2, 2026:

Trailing whitespace after the comma; it should be removed to maintain code cleanliness. (The two lines below differ only in the invisible trailing space.)

Suggested change:
-        auto_fix_headers: bool = True,
+        auto_fix_headers: bool = True,

     ):
         from langchain_text_splitters import (
             MarkdownHeaderTextSplitter,
             RecursiveCharacterTextSplitter,
         )

         self.config = config
+        self.auto_fix_headers = auto_fix_headers
         self.chunker = MarkdownHeaderTextSplitter(
             headers_to_split_on=config.headers_to_split_on
             if config

@@ -46,17 +50,105 @@ def __init__(

     def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        md_header_splits = self.chunker.split_text(text)
+        # Protect URLs first
+        protected_text, url_map = self.protect_urls(text)
+
+        # Auto-detect and fix malformed header hierarchy if enabled
+        if self.auto_fix_headers and self._detect_malformed_headers(protected_text):
+            logger.info("detected malformed header hierarchy, attempting to fix...")
+            protected_text = self._fix_header_hierarchy(protected_text)
+            logger.info("Header hierarchy fix completed")
+
+        md_header_splits = self.chunker.split_text(protected_text)
         chunks = []
         if self.chunker_recursive:
             md_header_splits = self.chunker_recursive.split_documents(md_header_splits)
         for doc in md_header_splits:
             try:
                 chunk = " ".join(list(doc.metadata.values())) + "\n" + doc.page_content
+                chunk = self.restore_urls(chunk, url_map)
                 chunks.append(chunk)
             except Exception as e:
                 logger.warning(f"warning chunking document: {e}")
-                chunks.append(doc.page_content)
+                restored_chunk = self.restore_urls(doc.page_content, url_map)
+                chunks.append(restored_chunk)
+        logger.info(f"Generated chunks: {chunks[:5]}")
         logger.debug(f"Generated {len(chunks)} chunks from input text")
         return chunks

+    def _detect_malformed_headers(self, text: str) -> bool:
+        """Detect if markdown has improper header hierarchy usage."""
+        # Extract all valid markdown header lines
+        header_levels = []
+        pattern = re.compile(r'^#{1,6}\s+.+')
+        for line in text.split('\n'):
+            stripped_line = line.strip()
+            if pattern.match(stripped_line):
+                hash_match = re.match(r'^(#+)', stripped_line)
+                if hash_match:
+                    level = len(hash_match.group(1))
+                    header_levels.append(level)
+
+        total_headers = len(header_levels)
+        if total_headers == 0:
+            logger.debug("No valid headers detected, skipping check")
+            return False
+
+        # Calculate level-1 header ratio
+        level1_count = sum(1 for level in header_levels if level == 1)
+
+        # Determine if malformed: >90% are level-1 when total > 5,
+        # OR all headers are level-1 when total <= 5
+        if total_headers > 5:
+            level1_ratio = level1_count / total_headers
+            if level1_ratio > 0.9:
+                logger.warning(
+                    f"Detected header hierarchy issue: {level1_count}/{total_headers} "
+                    f"({level1_ratio:.1%}) of headers are level 1"
+                )
+                return True
+        elif total_headers <= 5 and level1_count == total_headers:
+            logger.warning(
+                f"Detected header hierarchy issue: all {total_headers} headers are level 1"
+            )
+            return True
+        return False

+    def _fix_header_hierarchy(self, text: str) -> str:
+        """
+        Fix markdown header hierarchy by adjusting levels.
+
+        Strategy:
+        1. Keep the first header unchanged as level-1 parent
+        2. Increment all subsequent headers by 1 level (max level 6)
+        """
+        header_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
+        lines = text.split('\n')
+        fixed_lines = []
+        first_valid_header = False
+
+        for line in lines:
+            stripped_line = line.strip()
+            # Match valid header lines (invalid # lines kept as-is)
+            header_match = header_pattern.match(stripped_line)
+            if header_match:
+                current_hashes, title_content = header_match.groups()
+                current_level = len(current_hashes)
+
+                if not first_valid_header:
+                    # First valid header: keep original level unchanged
+                    fixed_line = f"{current_hashes} {title_content}"
+                    first_valid_header = True
+                    logger.debug(f"Keep first header at level {current_level}: {title_content[:50]}...")
+                else:
+                    # Subsequent headers: increment by 1, cap at level 6
+                    new_level = min(current_level + 1, 6)
+                    new_hashes = '#' * new_level
+                    fixed_line = f"{new_hashes} {title_content}"
+                    logger.debug(f"Adjust header level: {current_level} -> {new_level}: {title_content[:50]}...")

Comment on lines +143 to +147 (Copilot AI, Feb 2, 2026):

The header hierarchy fix strategy may not produce the desired results in all cases. The current approach increments all headers after the first by one level, but it does not account for the original hierarchy structure. For example, if the original has headers at levels [1, 1, 1], they become [1, 2, 2]; but if the original was [1, 2, 1], they become [1, 3, 2], which breaks the hierarchy (level 3 appears before level 2 is closed). A more robust approach would be to normalize all level-1 headers to level 2 except the first, preserving the relative structure of non-level-1 headers.
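
A minimal sketch of that alternative (hypothetical, not part of this PR): only level-1 headers after the first are demoted to level 2, and deeper headers keep their original levels.

import re


def normalize_level1_headers(text: str) -> str:
    """Demote every level-1 header except the first to level 2,
    leaving headers at level 2 and deeper untouched."""
    header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
    fixed_lines = []
    seen_first_level1 = False

    for line in text.split("\n"):
        match = header_pattern.match(line.strip())
        if match and len(match.group(1)) == 1:
            if seen_first_level1:
                # Subsequent level-1 headers become level 2
                fixed_lines.append(f"## {match.group(2)}")
                continue
            seen_first_level1 = True
        fixed_lines.append(line)

    return "\n".join(fixed_lines)

With this, [1, 2, 1] becomes [1, 2, 2] instead of [1, 3, 2], so no header level is skipped.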
+                fixed_lines.append(fixed_line)
+            else:
+                fixed_lines.append(line)
+
+        # Join with newlines to preserve original formatting
+        fixed_text = '\n'.join(fixed_lines)
+        return fixed_text

Comment on lines +78 to +154 (Copilot AI, Feb 2, 2026):

The new header hierarchy detection and fixing functionality lacks test coverage. Since there are tests for mem_reader components, tests should be added to verify that malformed header hierarchies are correctly detected and fixed, including edge cases such as all headers being level 1, mixed header levels, and empty documents.
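
A sketch of what one such test might look like (the test name is hypothetical, and passing config=None assumes the constructor's existing fallback to default headers_to_split_on; this is not verified against the PR):

from memos.chunkers.markdown_chunker import MarkdownChunker


def test_fix_header_hierarchy_all_level1():
    # Assumes config=None falls back to the default header configuration
    chunker = MarkdownChunker(config=None)
    text = "# Title\n# Section A\n# Section B"

    # All three headers are level 1, so the document is flagged as malformed
    assert chunker._detect_malformed_headers(text) is True

    fixed = chunker._fix_header_hierarchy(text)
    # First header is kept; subsequent headers are demoted one level
    assert fixed == "# Title\n## Section A\n## Section B"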
4 changes: 3 additions & 1 deletion src/memos/chunkers/sentence_chunker.py
@@ -43,11 +43,13 @@ def __init__(self, config: SentenceChunkerConfig):

     def chunk(self, text: str) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        chonkie_chunks = self.chunker.chunk(text)
+        protected_text, url_map = self.protect_urls(text)
+        chonkie_chunks = self.chunker.chunk(protected_text)

         chunks = []
         for c in chonkie_chunks:
             chunk = Chunk(text=c.text, token_count=c.token_count, sentences=c.sentences)
+            chunk = self.restore_urls(chunk.text, url_map)

Copilot AI, Feb 2, 2026:

The variable chunk is first assigned as a Chunk object, then immediately reassigned to a string by restore_urls(). This overwrites the Chunk object, so its token_count and sentences attributes are lost. The correct approach is to restore URLs on the text attribute and then append the Chunk object: chunk.text = self.restore_urls(chunk.text, url_map), followed by chunks.append(chunk).

Suggested change:
-            chunk = self.restore_urls(chunk.text, url_map)
+            chunk.text = self.restore_urls(chunk.text, url_map)
             chunks.append(chunk)

         logger.debug(f"Generated {len(chunks)} chunks from input text")
15 changes: 9 additions & 6 deletions src/memos/chunkers/simple_chunker.py
@@ -20,12 +20,15 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->

         Returns:
             List of text chunks
         """
-        if not text or len(text) <= chunk_size:
-            return [text] if text.strip() else []
+        protected_text, url_map = self.protect_urls(text)
+
+        if not protected_text or len(protected_text) <= chunk_size:
+            chunks = [protected_text] if protected_text.strip() else []
+            return [self.restore_urls(chunk, url_map) for chunk in chunks]

         chunks = []
         start = 0
-        text_len = len(text)
+        text_len = len(protected_text)

         while start < text_len:
             # Calculate end position
@@ -35,16 +38,16 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->

             if end < text_len:
                 # Try to break at newline, sentence end, or space
                 for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]:
-                    last_sep = text.rfind(separator, start, end)
+                    last_sep = protected_text.rfind(separator, start, end)
                     if last_sep != -1:
                         end = last_sep + len(separator)
                         break

-            chunk = text[start:end].strip()
+            chunk = protected_text[start:end].strip()
             if chunk:
                 chunks.append(chunk)

             # Move start position with overlap
             start = max(start + 1, end - chunk_overlap)

-        return chunks
+        return [self.restore_urls(chunk, url_map) for chunk in chunks]