forked from lioensky/VCPToolBox
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTextChunker.js
More file actions
126 lines (104 loc) · 4.9 KB
/
TextChunker.js
File metadata and controls
126 lines (104 loc) · 4.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
// TextChunker.js
// Module-level configuration: loads config.env and derives the token budgets
// used as defaults by chunkText() below.
require('dotenv').config({ path: './config.env' });
const { get_encoding } = require("@dqbd/tiktoken"); // assumes tiktoken is installed for exact token counting
const encoding = get_encoding("cl100k_base"); // common for gpt-4, gpt-3.5 and embedding models
// Read the embedding model's max token count from config.env and apply an 85% safety margin.
// Falls back to 8000 when WhitelistEmbeddingModelMaxToken is unset or not a number.
const embeddingMaxToken = parseInt(process.env.WhitelistEmbeddingModelMaxToken, 10) || 8000;
const safeMaxTokens = Math.floor(embeddingMaxToken * 0.85);
const defaultOverlapTokens = Math.floor(safeMaxTokens * 0.1); // overlap is 10% of the safe maximum
console.log(`[TextChunker] 配置加载: MaxToken=${embeddingMaxToken}, SafeMaxTokens=${safeMaxTokens}, OverlapTokens=${defaultOverlapTokens}`);
/**
 * Smart text chunker: splits `text` into chunks of at most `maxTokens` tokens,
 * breaking on sentence/newline boundaries and carrying a token overlap between
 * consecutive chunks to preserve context continuity.
 *
 * @param {string} text - Raw text to split.
 * @param {number} maxTokens - Maximum number of tokens per chunk.
 * @param {number} overlapTokens - Number of overlapping tokens between chunks.
 * @returns {string[]} Array of trimmed, non-empty text chunks.
 */
function chunkText(text, maxTokens = safeMaxTokens, overlapTokens = defaultOverlapTokens) {
    if (!text) return [];
    // Split after sentence-ending punctuation (CJK and ASCII) or newlines;
    // the lookbehind keeps each delimiter attached to its sentence.
    const sentences = text.split(/(?<=[。?!.!?\n])/g);
    const chunks = [];
    let currentChunk = "";
    let currentTokens = 0;
    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const sentenceTokens = encoding.encode(sentence).length;
        // A single sentence longer than maxTokens cannot be accumulated —
        // flush the current chunk and force-split the sentence on its own.
        if (sentenceTokens > maxTokens) {
            if (currentChunk.trim()) {
                chunks.push(currentChunk.trim());
                currentChunk = "";
                currentTokens = 0;
            }
            chunks.push(...forceSplitLongText(sentence, maxTokens, overlapTokens));
            continue;
        }
        if (currentTokens + sentenceTokens > maxTokens) {
            chunks.push(currentChunk.trim());
            // Build the overlap from the trailing sentences of the flushed chunk.
            // BUGFIX: cap the overlap budget so that overlap + incoming sentence
            // never exceeds maxTokens — the original used overlapTokens unconditionally,
            // allowing chunks as large as maxTokens + overlapTokens.
            const overlapBudget = Math.min(overlapTokens, maxTokens - sentenceTokens);
            let overlapChunk = "";
            let overlapTokenCount = 0;
            for (let j = i - 1; j >= 0; j--) {
                const prevSentence = sentences[j];
                const prevSentenceTokens = encoding.encode(prevSentence).length;
                if (overlapTokenCount + prevSentenceTokens > overlapBudget) break;
                overlapChunk = prevSentence + overlapChunk;
                overlapTokenCount += prevSentenceTokens;
            }
            currentChunk = overlapChunk;
            currentTokens = overlapTokenCount;
        }
        currentChunk += sentence;
        currentTokens += sentenceTokens;
    }
    // Flush whatever remains after the last sentence.
    if (currentChunk.trim()) {
        chunks.push(currentChunk.trim());
    }
    return chunks;
}
/**
 * Force-split a text whose token count exceeds maxTokens.
 * Works directly on the token array, preferring to break near punctuation or
 * whitespace found within the last ~200 characters of each slice.
 *
 * @param {string} text - Overlong text to split.
 * @param {number} maxTokens - Maximum number of tokens per chunk.
 * @param {number} overlapTokens - Number of overlapping tokens between chunks.
 * @returns {string[]} Array of trimmed, non-empty text chunks.
 */
function forceSplitLongText(text, maxTokens, overlapTokens) {
    // BUGFIX: @dqbd/tiktoken's decode() returns UTF-8 bytes (Uint8Array), not a
    // string. The original passed that straight into string methods, so the
    // break-point search never matched and .trim() threw a TypeError. Decode the
    // bytes explicitly; tolerate string-returning tokenizer variants as well.
    const decodeTokens = (tokenSlice) => {
        const raw = encoding.decode(tokenSlice);
        return typeof raw === "string" ? raw : new TextDecoder().decode(raw);
    };
    // Preferred break characters, checked from the end of each slice.
    // (Hoisted out of the loop — the original rebuilt this array per iteration.)
    const breakPoints = ['\n', '。', '!', '?', ',', ';', ':', ' ', '\t'];
    const chunks = [];
    const tokens = encoding.encode(text);
    let start = 0;
    while (start < tokens.length) {
        let end = Math.min(start + maxTokens, tokens.length);
        if (end < tokens.length) {
            // Try to avoid cutting mid-word: search the last 200 chars for a
            // punctuation/whitespace break point.
            let piece = decodeTokens(tokens.slice(start, end));
            let bestBreakPoint = -1;
            for (let i = piece.length - 1; i >= Math.max(0, piece.length - 200); i--) {
                if (breakPoints.includes(piece[i])) {
                    bestBreakPoint = i + 1;
                    break;
                }
            }
            if (bestBreakPoint > 0) {
                piece = piece.substring(0, bestBreakPoint);
                // Re-encode the shortened piece to advance `end` accordingly.
                // NOTE(review): re-encoded token boundaries may not align exactly
                // with the original stream; treated here as an approximation,
                // matching the original's intent.
                end = start + encoding.encode(piece).length;
            }
            chunks.push(piece.trim());
        } else {
            // Final slice: take everything that remains.
            chunks.push(decodeTokens(tokens.slice(start)).trim());
        }
        // Next window starts overlapTokens before `end`; Math.max(start + 1, ...)
        // guarantees forward progress even when overlapTokens >= maxTokens.
        start = Math.max(start + 1, end - overlapTokens);
    }
    return chunks.filter(chunk => chunk.length > 0);
}
module.exports = { chunkText };