Merged
Commits
22 commits
7a1e121  fix: fix three feature issues (Jan 21, 2026)
861ce14  Merge remote-tracking branch 'upstream/dev-20260119-v2.0.3' into dev-… (Jan 22, 2026)
b2ce876  fix:add uncommitted changes for the previous fix (Jan 26, 2026)
7f723c3  fix: optimize chunk strategy (Feb 2, 2026)
1e80f0c  Merge remote-tracking branch 'upstream/dev-20260202-v2.0.5' into dev-… (Feb 2, 2026)
edeb180  Optimize chunk strategy (Feb 2, 2026)
db143a6  add some comments (Feb 2, 2026)
e695eb2  fix: update is_markdown (Feb 6, 2026)
472d49d  Merge branch 'dev-20260224-v2.0.7' into Mozy403_dev-test1_0224 (Feb 26, 2026)
7afd15a  chunker fix (Feb 26, 2026)
ea9ba16  fix: add context during document processing (Feb 26, 2026)
7bd7970  merge: resolve conflict with upstream/dev-20260224-v2.0.7 (Feb 26, 2026)
302e4ef  Merge branch 'dev-20260224-v2.0.7' into Mozy403_dev-test1_0224 (Feb 27, 2026)
5978ca4  reformat (Feb 27, 2026)
47b608f  fix tests info (Feb 27, 2026)
61109e6  Merge branch 'dev-20260224-v2.0.7' into Mozy403_dev-test1_0224 (whipser030, Feb 27, 2026)
6e05fd6  Merge branch 'dev-20260224-v2.0.7' into Mozy403_dev-test1_0224 (Feb 27, 2026)
080a42f  Merge branch 'dev-20260302-v2.0.8' into Mozy403_dev-test1_0224 (whipser030, Feb 27, 2026)
915cc1d  Merge branch 'dev-test1' of https://github.com/Mozy403/MemOS into Moz… (Feb 27, 2026)
7437759  Merge branch 'dev-20260302-v2.0.8' into Mozy403_dev-test1_0224 (Feb 27, 2026)
f2c1d47  Merge branch 'Mozy403_dev-test1_0224' of github.com:whipser030/MemOS … (Feb 27, 2026)
2e98c75  Merge branch 'dev-20260302-v2.0.8' into Mozy403_dev-test1_0224 (Mar 2, 2026)
2 changes: 2 additions & 0 deletions .gitignore
@@ -216,3 +216,5 @@ cython_debug/
 outputs

 evaluation/data/temporal_locomo
+test_add_pipeline.py
+test_file_pipeline.py
41 changes: 41 additions & 0 deletions src/memos/chunkers/base.py
@@ -1,3 +1,5 @@
+import re
+
 from abc import ABC, abstractmethod

 from memos.configs.chunker import BaseChunkerConfig
@@ -22,3 +24,42 @@ def __init__(self, config: BaseChunkerConfig):
     @abstractmethod
     def chunk(self, text: str) -> list[Chunk]:
         """Chunk the given text into smaller chunks."""
+
+    def protect_urls(self, text: str) -> tuple[str, dict[str, str]]:
+        """
+        Protect URLs in text from being split during chunking.
+
+        Args:
+            text: Text to process
+
+        Returns:
+            tuple: (Text with URLs replaced by placeholders, URL mapping dictionary)
+        """
+        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
+        url_map = {}
+
+        def replace_url(match):
+            url = match.group(0)
+            placeholder = f"__URL_{len(url_map)}__"
+            url_map[placeholder] = url
+            return placeholder
+
+        protected_text = re.sub(url_pattern, replace_url, text)
+        return protected_text, url_map
+
+    def restore_urls(self, text: str, url_map: dict[str, str]) -> str:
+        """
+        Restore protected URLs in text back to their original form.
+
+        Args:
+            text: Text with URL placeholders
+            url_map: URL mapping dictionary from protect_urls
+
+        Returns:
+            str: Text with URLs restored
+        """
+        restored_text = text
+        for placeholder, url in url_map.items():
+            restored_text = restored_text.replace(placeholder, url)
+
+        return restored_text
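For intuition, the placeholder round-trip these two helpers implement can be reproduced standalone. The sketch below uses the same regex and `__URL_n__` placeholder format as the diff; the helper name `_replace` and the sample text are made up for illustration:

```python
import re

# Same pattern and placeholder format as protect_urls/restore_urls above.
url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
url_map: dict[str, str] = {}

def _replace(match: re.Match) -> str:
    placeholder = f"__URL_{len(url_map)}__"   # __URL_0__, __URL_1__, ...
    url_map[placeholder] = match.group(0)
    return placeholder

text = "See https://example.com/a/b?q=1 for details."
protected = re.sub(url_pattern, _replace, text)
assert protected == "See __URL_0__ for details."

# restore_urls then walks the map and substitutes each placeholder back.
restored = protected
for placeholder, url in url_map.items():
    restored = restored.replace(placeholder, url)
assert restored == text
```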
4 changes: 3 additions & 1 deletion src/memos/chunkers/charactertext_chunker.py
@@ -36,6 +36,8 @@ def __init__(

     def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        chunks = self.chunker.split_text(text)
+        protected_text, url_map = self.protect_urls(text)
+        chunks = self.chunker.split_text(protected_text)
+        chunks = [self.restore_urls(chunk, url_map) for chunk in chunks]
         logger.debug(f"Generated {len(chunks)} chunks from input text")
         return chunks
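Why this matters for a length-based splitter: a long URL usually exceeds the chunk window and gets cut mid-token, while its short placeholder fits in one piece. A hedged, self-contained illustration (naive fixed-width slicing stands in for the real splitter; the URL is made up):

```python
# A 120-character URL cannot survive 40-character windows intact,
# but the 9-character "__URL_0__" placeholder can.
long_url = "https://example.com/" + "a" * 100
text = f"see {long_url} for details"

naive = [text[i : i + 40] for i in range(0, len(text), 40)]
assert not any(long_url in c for c in naive)      # URL broken across chunks

protected = "see __URL_0__ for details"           # after protect_urls()
safe = [protected[i : i + 40] for i in range(0, len(protected), 40)]
assert any("__URL_0__" in c for c in safe)        # placeholder intact
```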
101 changes: 99 additions & 2 deletions src/memos/chunkers/markdown_chunker.py
@@ -1,3 +1,5 @@
+import re
+
 from memos.configs.chunker import MarkdownChunkerConfig
 from memos.dependency import require_python_package
 from memos.log import get_logger
@@ -22,13 +24,15 @@ def __init__(
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
         recursive: bool = False,
+        auto_fix_headers: bool = True,
     ):
         from langchain_text_splitters import (
             MarkdownHeaderTextSplitter,
             RecursiveCharacterTextSplitter,
         )

         self.config = config
+        self.auto_fix_headers = auto_fix_headers
         self.chunker = MarkdownHeaderTextSplitter(
             headers_to_split_on=config.headers_to_split_on
             if config
@@ -46,17 +50,110 @@ def __init__(

     def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        md_header_splits = self.chunker.split_text(text)
+        # Protect URLs first
+        protected_text, url_map = self.protect_urls(text)
+        # Auto-detect and fix malformed header hierarchy if enabled
+        if self.auto_fix_headers and self._detect_malformed_headers(protected_text):
+            logger.info("[Chunker:] detected malformed header hierarchy, attempting to fix...")
+            protected_text = self._fix_header_hierarchy(protected_text)
+            logger.info("[Chunker:] Header hierarchy fix completed")
+
+        md_header_splits = self.chunker.split_text(protected_text)
         chunks = []
         if self.chunker_recursive:
             md_header_splits = self.chunker_recursive.split_documents(md_header_splits)
         for doc in md_header_splits:
             try:
                 chunk = " ".join(list(doc.metadata.values())) + "\n" + doc.page_content
+                chunk = self.restore_urls(chunk, url_map)
                 chunks.append(chunk)
             except Exception as e:
                 logger.warning(f"warning chunking document: {e}")
-                chunks.append(doc.page_content)
+                restored_chunk = self.restore_urls(doc.page_content, url_map)
+                chunks.append(restored_chunk)
+        logger.info(f"Generated chunks: {chunks[:5]}")
         logger.debug(f"Generated {len(chunks)} chunks from input text")
         return chunks
+
+    def _detect_malformed_headers(self, text: str) -> bool:
+        """Detect if markdown has improper header hierarchy usage."""
+        # Extract all valid markdown header lines
+        header_levels = []
+        pattern = re.compile(r"^#{1,6}\s+.+")
+        for line in text.split("\n"):
+            stripped_line = line.strip()
+            if pattern.match(stripped_line):
+                hash_match = re.match(r"^(#+)", stripped_line)
+                if hash_match:
+                    level = len(hash_match.group(1))
+                    header_levels.append(level)
+
+        total_headers = len(header_levels)
+        if total_headers == 0:
+            logger.debug("No valid headers detected, skipping check")
+            return False
+
+        # Calculate level-1 header ratio
+        level1_count = sum(1 for level in header_levels if level == 1)

+        # Determine if malformed: >90% are level-1 when total > 5
+        # OR all headers are level-1 when total ≤ 5
+        if total_headers > 5:
+            level1_ratio = level1_count / total_headers
+            if level1_ratio > 0.9:
+                logger.warning(
+                    f"Detected header hierarchy issue: {level1_count}/{total_headers} "
+                    f"({level1_ratio:.1%}) of headers are level 1"
+                )
+                return True
+        elif total_headers <= 5 and level1_count == total_headers:
+            logger.warning(
+                f"Detected header hierarchy issue: all {total_headers} headers are level 1"
+            )
+            return True
+        return False
+
+    def _fix_header_hierarchy(self, text: str) -> str:
+        """
+        Fix markdown header hierarchy by adjusting levels.
+
+        Strategy:
+        1. Keep the first header unchanged as level-1 parent
+        2. Increment all subsequent headers by 1 level (max level 6)
+        """
+        header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
+        lines = text.split("\n")
+        fixed_lines = []
+        first_valid_header = False
+
+        for line in lines:
+            stripped_line = line.strip()
+            # Match valid header lines (invalid # lines kept as-is)
+            header_match = header_pattern.match(stripped_line)
+            if header_match:
+                current_hashes, title_content = header_match.groups()
+                current_level = len(current_hashes)
+
+                if not first_valid_header:
+                    # First valid header: keep original level unchanged
+                    fixed_line = f"{current_hashes} {title_content}"
+                    first_valid_header = True
+                    logger.debug(
+                        f"Keep first header at level {current_level}: {title_content[:50]}..."
+                    )
+                else:
+                    # Subsequent headers: increment by 1, cap at level 6
+                    new_level = min(current_level + 1, 6)
+                    new_hashes = "#" * new_level
+                    fixed_line = f"{new_hashes} {title_content}"
+                    logger.debug(
+                        f"Adjust header level: {current_level} -> {new_level}: {title_content[:50]}..."
+                    )
+                fixed_lines.append(fixed_line)
+            else:
+                fixed_lines.append(line)
+
+        # Join with newlines to preserve original formatting
+        fixed_text = "\n".join(fixed_lines)
+        logger.info(f"[Chunker:] Header hierarchy fix completed: {fixed_text[:50]}...")
+        return fixed_text
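For intuition about the thresholds, here is a hedged, self-contained rerun of the detection rule on a made-up document (plain `re`, no MemOS imports; the thresholds are copied from `_detect_malformed_headers`):

```python
import re

# Made-up markdown where every header is level 1.
doc = "# Title\nintro\n# Setup\nsteps\n# Usage\n# FAQ\n# License\n# Contact"

levels = [
    len(m.group(1))
    for line in doc.split("\n")
    if (m := re.match(r"^(#{1,6})\s+.+", line.strip()))
]
assert levels == [1, 1, 1, 1, 1, 1]

# 6 headers, all level 1: ratio 1.0 > 0.9 with total > 5, so it is flagged.
is_malformed = (
    len(levels) > 5 and levels.count(1) / len(levels) > 0.9
) or (0 < len(levels) <= 5 and levels.count(1) == len(levels))
assert is_malformed

# _fix_header_hierarchy would then keep "# Title" and demote the rest:
# "# Title", "## Setup", "## Usage", "## FAQ", "## License", "## Contact"
```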
4 changes: 3 additions & 1 deletion src/memos/chunkers/sentence_chunker.py
@@ -43,11 +43,13 @@ def __init__(self, config: SentenceChunkerConfig):

     def chunk(self, text: str) -> list[str] | list[Chunk]:
         """Chunk the given text into smaller chunks based on sentences."""
-        chonkie_chunks = self.chunker.chunk(text)
+        protected_text, url_map = self.protect_urls(text)
+        chonkie_chunks = self.chunker.chunk(protected_text)

         chunks = []
         for c in chonkie_chunks:
             chunk = Chunk(text=c.text, token_count=c.token_count, sentences=c.sentences)
+            chunk = self.restore_urls(chunk.text, url_map)
             chunks.append(chunk)

         logger.debug(f"Generated {len(chunks)} chunks from input text")
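One side effect visible in this hunk: `chunk = self.restore_urls(chunk.text, url_map)` rebinds `chunk` to a plain string, so the freshly built `Chunk`'s `token_count` and `sentences` are discarded and the method effectively returns `list[str]`. A metadata-preserving variant would look like the sketch below (a sketch only, not what this PR merged):

```python
# Sketch: restore URLs inside the text field while keeping the Chunk
# metadata intact (token_count and sentences come from chonkie).
for c in chonkie_chunks:
    chunk = Chunk(
        text=self.restore_urls(c.text, url_map),
        token_count=c.token_count,
        sentences=c.sentences,
    )
    chunks.append(chunk)
```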
15 changes: 9 additions & 6 deletions src/memos/chunkers/simple_chunker.py
@@ -20,12 +20,15 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->
         Returns:
             List of text chunks
         """
-        if not text or len(text) <= chunk_size:
-            return [text] if text.strip() else []
+        protected_text, url_map = self.protect_urls(text)
+
+        if not protected_text or len(protected_text) <= chunk_size:
+            chunks = [protected_text] if protected_text.strip() else []
+            return [self.restore_urls(chunk, url_map) for chunk in chunks]

         chunks = []
         start = 0
-        text_len = len(text)
+        text_len = len(protected_text)

         while start < text_len:
             # Calculate end position
@@ -35,16 +38,16 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->
             if end < text_len:
                 # Try to break at newline, sentence end, or space
                 for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]:
-                    last_sep = text.rfind(separator, start, end)
+                    last_sep = protected_text.rfind(separator, start, end)
                     if last_sep != -1:
                         end = last_sep + len(separator)
                         break

-            chunk = text[start:end].strip()
+            chunk = protected_text[start:end].strip()
             if chunk:
                 chunks.append(chunk)

             # Move start position with overlap
             start = max(start + 1, end - chunk_overlap)

-        return chunks
+        return [self.restore_urls(chunk, url_map) for chunk in chunks]
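To make the separator search and overlap arithmetic concrete, here is a hedged trace of one loop iteration on a made-up string; `chunk_size=25` and `chunk_overlap=5` are chosen purely for illustration:

```python
text = "Alpha beta gamma. Delta epsilon zeta."

chunk_size, chunk_overlap = 25, 5
start = 0
end = min(start + chunk_size, len(text))          # 25
# The separator loop searches backwards within [start, end);
# ". " is the first separator in the list that matches, at index 16.
last_sep = text.rfind(". ", start, end)
assert last_sep == 16
end = last_sep + len(". ")                        # 18: keep the separator
chunk = text[start:end].strip()
assert chunk == "Alpha beta gamma."
# The next window starts chunk_overlap characters back from `end`,
# and max(start + 1, ...) guarantees forward progress on tiny chunks.
start = max(start + 1, end - chunk_overlap)
assert start == 13
```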