定义
检索增强生成(RAG):通过检索外部知识来缓解 AI 幻觉问题,提升回答的精准度
解决的问题
工程路径
实践
按语义、段落、代码块对文本进行分组(切块),本质上就是通过下面的切分函数来实现:
js
体验AI代码助手
代码解读
复制代码
/**
 * Splits Markdown-like text into chunks for embedding, keeping fenced code
 * blocks and paragraphs intact where possible.
 *
 * The text is first segmented into fenced code blocks (``` or ~~~) and the
 * plain text between them. Each segment is then packed greedily: paragraphs
 * (separated by blank lines) for text, individual lines for code blocks that
 * exceed `maxTokens`. Every chunk after the first is prefixed with the tail
 * of the previously emitted chunk, followed by a newline.
 *
 * Known limitations (unchanged from the original behavior):
 * - A single paragraph or code line larger than `maxTokens` is emitted as is.
 * - The overlap prefix is added after the size check, so an emitted chunk can
 *   exceed `maxTokens`.
 * - An unterminated fence treats the rest of the input as one code segment.
 *
 * Token counts come from `estTokens`, which is expected to be defined
 * elsewhere in this file.
 *
 * @param text Source text; CRLF line endings are normalized to LF.
 * @param maxTokens Upper bound, in estimated tokens, used when packing units.
 * @param overlapTokens Overlap size in tokens; converted to characters with a
 *   fixed factor of 4 characters per token.
 * @returns The chunks in document order.
 */
const smartSplitText = (text: string, maxTokens = 320, overlapTokens = 64): string[] => {
  const lines = text.replace(/\r\n/g, "\n").split("\n");
  const segs: Array<{ type: "code" | "text"; content: string }> = [];
  // Opening fence: ``` or ~~~, optionally followed by a language tag.
  const fenceOpen = /^(```|~~~)\s*([a-zA-Z0-9+._-]*)?\s*$/;

  let i = 0;
  let buf: string[] = [];
  while (i < lines.length) {
    const m = fenceOpen.exec(lines[i]);
    if (m) {
      // Flush the plain text collected before this fence.
      if (buf.length) {
        segs.push({ type: "text", content: buf.join("\n") });
        buf = [];
      }
      const marker = m[1];
      const start = i;
      i++;
      // The closing fence is the same marker, optionally followed by
      // whitespace. Comparing against the trimmed line avoids building a
      // regex per line and avoids the template-literal pitfall where `\s`
      // silently degrades to a literal "s".
      while (i < lines.length && lines[i].trimEnd() !== marker) i++;
      // Include the closing fence line when one was found.
      if (i < lines.length) i++;
      segs.push({ type: "code", content: lines.slice(start, i).join("\n") });
      continue;
    }
    buf.push(lines[i]);
    i++;
  }
  if (buf.length) segs.push({ type: "text", content: buf.join("\n") });

  const out: string[] = [];
  const overlapChars = Math.max(0, Math.floor(overlapTokens * 4));

  // Appends a chunk, prefixed with the tail of the previously emitted chunk.
  const pushWithOverlap = (chunk: string) => {
    if (out.length === 0) {
      out.push(chunk);
      return;
    }
    const prev = out[out.length - 1];
    let cut = Math.max(0, prev.length - overlapChars);
    // Do not start the tail in the middle of a surrogate pair; that would
    // leave a lone low surrogate at the beginning of the chunk.
    if (cut > 0 && cut < prev.length) {
      const before = prev.charCodeAt(cut - 1);
      const at = prev.charCodeAt(cut);
      if (before >= 0xd800 && before <= 0xdbff && at >= 0xdc00 && at <= 0xdfff) cut++;
    }
    const tail = prev.slice(cut);
    out.push(tail + (tail ? "\n" : "") + chunk);
  };

  // Greedily packs units into chunks whose estimated size stays within
  // maxTokens; a unit that is too large on its own becomes its own chunk.
  const packGreedily = (units: string[], joiner: string) => {
    let acc: string[] = [];
    for (const unit of units) {
      const candidate = acc.length ? acc.join(joiner) + joiner + unit : unit;
      if (estTokens(candidate) > maxTokens) {
        if (acc.length) pushWithOverlap(acc.join(joiner));
        acc = [unit];
      } else {
        acc.push(unit);
      }
    }
    if (acc.length) pushWithOverlap(acc.join(joiner));
  };

  for (const seg of segs) {
    if (seg.type === "code") {
      if (estTokens(seg.content) <= maxTokens) {
        // Keep a code block whole whenever it fits.
        pushWithOverlap(seg.content);
      } else {
        packGreedily(seg.content.split("\n"), "\n");
      }
    } else {
      const paras = seg.content
        .split(/\n{2,}/)
        .map((p) => p.trim())
        .filter(Boolean);
      packGreedily(paras, "\n\n");
    }
  }
  return out;
};
对文档的段落进行向量化,并记录每个段落在原始文档中的行号,从而在检索结果中展示指向原始文件的链接