2 changes: 2 additions & 0 deletions .gitignore
@@ -216,3 +216,5 @@ cython_debug/
 outputs

 evaluation/data/temporal_locomo
+test_add_pipeline.py
+test_file_pipeline.py
40 changes: 40 additions & 0 deletions src/memos/chunkers/base.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod

 from memos.configs.chunker import BaseChunkerConfig
+import re


 class Chunk:
@@ -22,3 +23,42 @@ def __init__(self, config: BaseChunkerConfig):
     @abstractmethod
     def chunk(self, text: str) -> list[Chunk]:
         """Chunk the given text into smaller chunks."""
+
+    def protect_urls(self, text: str) -> tuple[str, dict[str, str]]:
+        """
+        Protect URLs in text from being split during chunking.
+
+        Args:
+            text: Text to process
+
+        Returns:
+            tuple: (Text with URLs replaced by placeholders, URL mapping dictionary)
+        """
+        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
+        url_map = {}
+
+        def replace_url(match):
+            url = match.group(0)
+            placeholder = f"__URL_{len(url_map)}__"
+            url_map[placeholder] = url
+            return placeholder
+
+        protected_text = re.sub(url_pattern, replace_url, text)
+        return protected_text, url_map
+
+    def restore_urls(self, text: str, url_map: dict[str, str]) -> str:
+        """
+        Restore protected URLs in text back to their original form.
+
+        Args:
+            text: Text with URL placeholders
+            url_map: URL mapping dictionary from protect_urls
+
+        Returns:
+            str: Text with URLs restored
+        """
+        restored_text = text
+        for placeholder, url in url_map.items():
+            restored_text = restored_text.replace(placeholder, url)
+
Comment on lines +60 to +63 (Copilot AI, Feb 2, 2026):

The URL restoration uses sequential string replacements in a loop, which has O(n*m) complexity, where n is the number of URLs and m is the text length. For large texts with many URLs, this could be inefficient. Consider using a single regex substitution with a replacement function that looks up placeholders in the url_map dictionary, or building the result string in a single pass.

Suggested change:
-        restored_text = text
-        for placeholder, url in url_map.items():
-            restored_text = restored_text.replace(placeholder, url)
+        placeholder_pattern = r'__URL_\d+__'
+
+        def replacer(match: re.Match) -> str:
+            placeholder = match.group(0)
+            # If the placeholder is not found, leave it unchanged
+            return url_map.get(placeholder, placeholder)
+
+        restored_text = re.sub(placeholder_pattern, replacer, text)

+        return restored_text
Comment on lines +27 to +64 (Copilot AI, Feb 2, 2026):

The new URL protection and restoration functionality in the base chunker lacks test coverage. Since other chunkers have tests (e.g., test_sentence_chunker.py), tests should be added to verify that URLs are properly protected during chunking and restored afterwards, including edge cases such as URLs at chunk boundaries or multiple URLs in the same chunk.
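
A minimal sketch of such a test, exercising protect_urls/restore_urls through a trivial concrete subclass (the subclass, test name, and file are hypothetical, not part of this PR):

import re

from memos.chunkers.base import BaseChunker


class _DummyChunker(BaseChunker):
    """Concrete stand-in so the abstract base can be instantiated."""

    def __init__(self):
        pass  # skip config handling for this sketch

    def chunk(self, text):
        return [text]


def test_protect_and_restore_urls_roundtrip():
    chunker = _DummyChunker()
    text = "See https://example.com/a?b=1 and http://foo.bar/baz for details."

    protected, url_map = chunker.protect_urls(text)

    # Both URLs are replaced by placeholders and recorded in the map
    assert "https://example.com/a?b=1" not in protected
    assert re.findall(r"__URL_\d+__", protected) == list(url_map.keys())

    # Restoring yields the original text
    assert chunker.restore_urls(protected, url_map) == text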
4 changes: 3 additions & 1 deletion src/memos/chunkers/charactertext_chunker.py
@@ -36,6 +36,8 @@ def __init__(

     def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        chunks = self.chunker.split_text(text)
+        protected_text, url_map = self.protect_urls(text)
+        chunks = self.chunker.split_text(protected_text)
+        chunks = [self.restore_urls(chunk, url_map) for chunk in chunks]
         logger.debug(f"Generated {len(chunks)} chunks from input text")
         return chunks
96 changes: 94 additions & 2 deletions src/memos/chunkers/markdown_chunker.py
@@ -2,6 +2,8 @@
 from memos.dependency import require_python_package
 from memos.log import get_logger

+import re
+
 from .base import BaseChunker, Chunk


@@ -22,13 +24,15 @@ def __init__(
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
         recursive: bool = False,
+        auto_fix_headers: bool = True,

Copilot AI, Feb 2, 2026:

Trailing whitespace after the comma; it should be removed to maintain code cleanliness. (The two lines below differ only in the invisible trailing space.)

Suggested change:
-        auto_fix_headers: bool = True,
+        auto_fix_headers: bool = True,

     ):
         from langchain_text_splitters import (
             MarkdownHeaderTextSplitter,
             RecursiveCharacterTextSplitter,
         )

         self.config = config
+        self.auto_fix_headers = auto_fix_headers
         self.chunker = MarkdownHeaderTextSplitter(
             headers_to_split_on=config.headers_to_split_on
             if config

@@ -46,17 +50,105 @@ def __init__(

     def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        md_header_splits = self.chunker.split_text(text)
+        # Protect URLs first
+        protected_text, url_map = self.protect_urls(text)
+
+        # Auto-detect and fix malformed header hierarchy if enabled
+        if self.auto_fix_headers and self._detect_malformed_headers(protected_text):
+            logger.info("detected malformed header hierarchy, attempting to fix...")
+            protected_text = self._fix_header_hierarchy(protected_text)
+            logger.info("Header hierarchy fix completed")
+
+        md_header_splits = self.chunker.split_text(protected_text)
         chunks = []
         if self.chunker_recursive:
             md_header_splits = self.chunker_recursive.split_documents(md_header_splits)
         for doc in md_header_splits:
             try:
                 chunk = " ".join(list(doc.metadata.values())) + "\n" + doc.page_content
+                chunk = self.restore_urls(chunk, url_map)
                 chunks.append(chunk)
             except Exception as e:
                 logger.warning(f"warning chunking document: {e}")
-                chunks.append(doc.page_content)
+                restored_chunk = self.restore_urls(doc.page_content, url_map)
+                chunks.append(restored_chunk)
+        logger.info(f"Generated chunks: {chunks[:5]}")
         logger.debug(f"Generated {len(chunks)} chunks from input text")
         return chunks

+    def _detect_malformed_headers(self, text: str) -> bool:
+        """Detect if markdown has improper header hierarchy usage."""
+        # Extract all valid markdown header lines
+        header_levels = []
+        pattern = re.compile(r'^#{1,6}\s+.+')
+        for line in text.split('\n'):
+            stripped_line = line.strip()
+            if pattern.match(stripped_line):
+                hash_match = re.match(r'^(#+)', stripped_line)
+                if hash_match:
+                    level = len(hash_match.group(1))
+                    header_levels.append(level)
+
+        total_headers = len(header_levels)
+        if total_headers == 0:
+            logger.debug("No valid headers detected, skipping check")
+            return False
+
+        # Calculate level-1 header ratio
+        level1_count = sum(1 for level in header_levels if level == 1)
+
+        # Determine if malformed: >90% are level-1 when total > 5,
+        # OR all headers are level-1 when total <= 5
+        if total_headers > 5:
+            level1_ratio = level1_count / total_headers
+            if level1_ratio > 0.9:
+                logger.warning(
+                    f"Detected header hierarchy issue: {level1_count}/{total_headers} "
+                    f"({level1_ratio:.1%}) of headers are level 1"
+                )
+                return True
+        elif total_headers <= 5 and level1_count == total_headers:
+            logger.warning(
+                f"Detected header hierarchy issue: all {total_headers} headers are level 1"
+            )
+            return True
+        return False

+    def _fix_header_hierarchy(self, text: str) -> str:
+        """
+        Fix markdown header hierarchy by adjusting levels.
+
+        Strategy:
+        1. Keep the first header unchanged as level-1 parent
+        2. Increment all subsequent headers by 1 level (max level 6)
+        """
+        header_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
+        lines = text.split('\n')
+        fixed_lines = []
+        first_valid_header = False
+
+        for line in lines:
+            stripped_line = line.strip()
+            # Match valid header lines (invalid # lines kept as-is)
+            header_match = header_pattern.match(stripped_line)
+            if header_match:
+                current_hashes, title_content = header_match.groups()
+                current_level = len(current_hashes)
+
+                if not first_valid_header:
+                    # First valid header: keep original level unchanged
+                    fixed_line = f"{current_hashes} {title_content}"
+                    first_valid_header = True
+                    logger.debug(f"Keep first header at level {current_level}: {title_content[:50]}...")
+                else:
+                    # Subsequent headers: increment by 1, cap at level 6
+                    new_level = min(current_level + 1, 6)
+                    new_hashes = '#' * new_level
+                    fixed_line = f"{new_hashes} {title_content}"
+                    logger.debug(f"Adjust header level: {current_level} -> {new_level}: {title_content[:50]}...")

Comment on lines +143 to +147 (Copilot AI, Feb 2, 2026):

The header hierarchy fix strategy may not produce the desired results in all cases. The current approach increments all headers after the first by one level, but it does not account for the original hierarchy structure. For example, if the original has headers at levels [1, 1, 1], they become [1, 2, 2]; but if the original was [1, 2, 1], they become [1, 3, 2], which breaks the hierarchy (level 3 appears before level 2 is closed). A more robust approach would be to normalize all level-1 headers to level 2 except the first, preserving the relative structure of non-level-1 headers.
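
A minimal sketch of that alternative (hypothetical, not part of this PR): only level-1 headers after the first are demoted to level 2, and deeper headers keep their original levels.

import re


def normalize_level1_headers(text: str) -> str:
    """Demote every level-1 header except the first to level 2,
    leaving headers at level 2 and deeper untouched."""
    header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
    fixed_lines = []
    seen_first_level1 = False

    for line in text.split("\n"):
        match = header_pattern.match(line.strip())
        if match and len(match.group(1)) == 1:
            if seen_first_level1:
                # Subsequent level-1 headers become level 2
                fixed_lines.append(f"## {match.group(2)}")
                continue
            seen_first_level1 = True
        fixed_lines.append(line)

    return "\n".join(fixed_lines)

With this, [1, 2, 1] becomes [1, 2, 2] instead of [1, 3, 2], so no header level is skipped.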
+                fixed_lines.append(fixed_line)
+            else:
+                fixed_lines.append(line)
+
+        # Join with newlines to preserve original formatting
+        fixed_text = '\n'.join(fixed_lines)
+        return fixed_text

Comment on lines +78 to +154 (Copilot AI, Feb 2, 2026):

The new header hierarchy detection and fixing functionality lacks test coverage. Since there are tests for mem_reader components, tests should be added to verify that malformed header hierarchies are correctly detected and fixed, including edge cases such as all headers being level 1, mixed header levels, and empty documents.
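
A sketch of what one such test might look like (the test name is hypothetical, and passing config=None assumes the constructor's existing fallback to default headers_to_split_on; this is not verified against the PR):

from memos.chunkers.markdown_chunker import MarkdownChunker


def test_fix_header_hierarchy_all_level1():
    # Assumes config=None falls back to the default header configuration
    chunker = MarkdownChunker(config=None)
    text = "# Title\n# Section A\n# Section B"

    # All three headers are level 1, so the document is flagged as malformed
    assert chunker._detect_malformed_headers(text) is True

    fixed = chunker._fix_header_hierarchy(text)
    # First header is kept; subsequent headers are demoted one level
    assert fixed == "# Title\n## Section A\n## Section B"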
4 changes: 3 additions & 1 deletion src/memos/chunkers/sentence_chunker.py
@@ -43,11 +43,13 @@ def __init__(self, config: SentenceChunkerConfig):

     def chunk(self, text: str) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        chonkie_chunks = self.chunker.chunk(text)
+        protected_text, url_map = self.protect_urls(text)
+        chonkie_chunks = self.chunker.chunk(protected_text)

         chunks = []
         for c in chonkie_chunks:
             chunk = Chunk(text=c.text, token_count=c.token_count, sentences=c.sentences)
+            chunk = self.restore_urls(chunk.text, url_map)

Copilot AI, Feb 2, 2026:

The variable chunk is first assigned as a Chunk object, then immediately reassigned to a string by restore_urls(). This overwrites the Chunk object, so its token_count and sentences attributes are lost. The correct approach is to restore URLs on the text attribute and then append the Chunk object: chunk.text = self.restore_urls(chunk.text, url_map), followed by chunks.append(chunk).

Suggested change:
-            chunk = self.restore_urls(chunk.text, url_map)
+            chunk.text = self.restore_urls(chunk.text, url_map)
             chunks.append(chunk)

         logger.debug(f"Generated {len(chunks)} chunks from input text")
15 changes: 9 additions & 6 deletions src/memos/chunkers/simple_chunker.py
@@ -20,12 +20,15 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->

         Returns:
             List of text chunks
         """
-        if not text or len(text) <= chunk_size:
-            return [text] if text.strip() else []
+        protected_text, url_map = self.protect_urls(text)
+
+        if not protected_text or len(protected_text) <= chunk_size:
+            chunks = [protected_text] if protected_text.strip() else []
+            return [self.restore_urls(chunk, url_map) for chunk in chunks]

         chunks = []
         start = 0
-        text_len = len(text)
+        text_len = len(protected_text)

         while start < text_len:
             # Calculate end position
@@ -35,16 +38,16 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->

             if end < text_len:
                 # Try to break at newline, sentence end, or space
                 for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]:
-                    last_sep = text.rfind(separator, start, end)
+                    last_sep = protected_text.rfind(separator, start, end)
                     if last_sep != -1:
                         end = last_sep + len(separator)
                         break

-            chunk = text[start:end].strip()
+            chunk = protected_text[start:end].strip()
             if chunk:
                 chunks.append(chunk)

             # Move start position with overlap
             start = max(start + 1, end - chunk_overlap)

-        return chunks
+        return [self.restore_urls(chunk, url_map) for chunk in chunks]