Skip to content

Commit 244bf06

Browse files
ysyneuclaude
andcommitted
fix: clean and truncate doc content before Meilisearch upload
Strip frontmatter, HTML/MDX tags, and import statements from content before uploading to Meilisearch. Truncate cleaned content to 6000 chars to stay within DashScope text-embedding-v4's 8192-token input limit, which was causing batch embedding failures. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9443d97 commit 244bf06

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

scripts/upload.sh

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ BASE_URL="${BASE_URL:-https://docs.flashcat.cloud}"
2828
FULL_UPLOAD="${FULL_UPLOAD:-false}"
2929
DRY_RUN=false
3030
BATCH_SIZE=5
31+
# DashScope text-embedding-v4 accepts up to 8192 tokens per input.
32+
# Cap cleaned content at this many characters (not tokens) to stay safely under that limit.
33+
MAX_CONTENT_CHARS=6000
3134

3235
usage() {
3336
cat <<EOF
@@ -145,20 +148,33 @@ is_doc_file() {
145148
return 0
146149
}
147150

151+
#######################################
# Clean raw MDX/Markdown content for embedding.
#
# Steps, in order:
#   1. Drop a leading YAML frontmatter block (first line `---` up to the
#      next `---` line).
#   2. Drop lines that start with `import `.
#   3. Strip `<...>` tags that open and close on the same line.
#   4. Turn every run of whitespace (newlines included) into one space.
#   5. Truncate the result to MAX_CONTENT_CHARS characters.
#
# Globals:   MAX_CONTENT_CHARS (read; falls back to 6000 when unset/empty)
# Arguments: $1 - path of the file to clean
# Outputs:   cleaned, single-line text on stdout
# Returns:   0 on success (even when nothing is left after cleaning),
#            non-zero if the text could not be read/processed
#######################################
clean_content() {
  local file=$1
  local max=${MAX_CONTENT_CHARS:-6000}
  local text

  # Filtering is done in a single awk pass instead of `grep -v`: grep exits 1
  # when it selects no lines, which would fail the pipeline under `pipefail`
  # for files that contain nothing but frontmatter/imports.
  # `tr -s` both converts whitespace to spaces and squeezes repeated spaces.
  text=$(
    awk '
      NR == 1 && /^---$/ { skip = 1; next }   # frontmatter opens on line 1 only
      skip && /^---$/    { skip = 0; next }   # first closing fence ends it
      skip               { next }
      /^import /         { next }
      { gsub(/<[^>]*>/, ""); print }
    ' "$file" \
      | tr -s '[:space:]' ' '
  ) || return

  # Bash substring expansion counts characters according to the current
  # locale, so in a UTF-8 locale a multi-byte character is never cut in half
  # (byte-oriented `cut -c` could emit invalid UTF-8 at the boundary).
  printf '%s\n' "${text:0:$max}"
}
162+
148163
# Build a JSON document for a single file
149164
build_doc_json() {
150165
local file=$1
151-
local dir locale title doc_url id
166+
local dir locale title doc_url id content
152167
dir=$(dir_for_file "$file")
153168
locale=$(locale_for_file "$file")
154169
title=$(extract_title "$file")
155170
doc_url=$(extract_url "$file" "$dir" "$locale")
156171
id=$(file_to_id "$file")
172+
content=$(clean_content "$file")
157173

158174
jq -n \
159175
--arg id "$id" \
160176
--arg title "$title" \
161-
--rawfile content "$file" \
177+
--arg content "$content" \
162178
--arg locale "$locale" \
163179
--arg url "$doc_url" \
164180
'{id: $id, title: $title, content: $content, locale: $locale, url: $url}' 2>/dev/null

0 commit comments

Comments
 (0)