RAG 体系认知

0 阅读1分钟

定义

检索增强生成(RAG,Retrieval-Augmented Generation):在生成回答之前先检索相关资料,用来缓解 AI 幻觉问题,提升回答的精准度

解决的问题

工程路径

实践

通过语义,段落,代码块对文档进行分组(切分),本质上就是通过下面的函数,把文档切成不超过 maxTokens 的片段,并在相邻片段之间保留一段重叠内容

js
 体验AI代码助手
 代码解读
复制代码
/**
 * Splits Markdown-like text into chunks for embedding.
 *
 * The input is first partitioned into fenced code segments (delimited by
 * ``` or ~~~) and plain-text segments. Each segment is then packed into
 * chunks whose estimated token count does not exceed `maxTokens` where
 * possible: code is packed line by line, text is packed paragraph by
 * paragraph (paragraphs are separated by one or more blank lines).
 *
 * Every chunk after the first is prefixed with the tail of the previously
 * emitted chunk so that neighbouring chunks share context.
 *
 * Limitations kept from the original behaviour:
 * - A single line of code or a single paragraph that is larger than
 *   `maxTokens` on its own is emitted as one oversized chunk.
 * - The overlap prefix is added on top of the packed chunk, so an emitted
 *   chunk can exceed `maxTokens`.
 * - An unterminated fence consumes the rest of the input as code.
 *
 * Relies on `estTokens`, which is defined outside this block
 * (NOTE(review): presumably returns an estimated token count for a string —
 * confirm against its definition).
 *
 * @param text - Source text; `\r\n` line endings are normalised to `\n`.
 * @param maxTokens - Target upper bound of estimated tokens per packed chunk.
 * @param overlapTokens - Overlap size; converted to characters as
 *   `overlapTokens * 4` (NOTE(review): looks like a ~4 chars/token heuristic).
 * @returns The chunks in document order; empty when the text has no content.
 */
const smartSplitText = (text: string, maxTokens = 320, overlapTokens = 64): string[] => {
  const lines = text.replace(/\r\n/g, "\n").split("\n");
  const segs: Array<{ type: "code" | "text"; content: string }> = [];
  let i = 0;
  let buf: string[] = [];
  while (i < lines.length) {
    // Opening fence: ``` or ~~~ with an optional language tag.
    const m = /^(```|~~~)\s*([a-zA-Z0-9+._-]*)?\s*$/.exec(lines[i]);
    if (m) {
      // Flush the plain text collected before this fence.
      if (buf.length) {
        segs.push({ type: "text", content: buf.join("\n") });
        buf = [];
      }
      // Built once per fence. The backslash must be doubled: inside a string
      // or template literal a bare "\s" collapses to the letter "s", which
      // made closing fences with trailing whitespace unrecognisable.
      const closing = new RegExp("^" + m[1] + "\\s*$");
      const start = i;
      i++;
      while (i < lines.length && !closing.test(lines[i])) i++;
      // Include the closing fence line when one was found.
      if (i < lines.length) i++;
      segs.push({ type: "code", content: lines.slice(start, i).join("\n") });
      continue;
    }
    buf.push(lines[i]);
    i++;
  }
  if (buf.length) segs.push({ type: "text", content: buf.join("\n") });

  const out: string[] = [];
  const overlapChars = Math.max(0, Math.floor(overlapTokens * 4));

  // Appends a chunk, prefixed with the tail of the previously emitted chunk.
  const pushWithOverlap = (chunk: string) => {
    if (out.length === 0) {
      out.push(chunk);
      return;
    }
    const prev = out[out.length - 1];
    let cut = Math.max(0, prev.length - overlapChars);
    // Never start the tail in the middle of a surrogate pair, otherwise the
    // chunk would begin with a lone low surrogate (broken character).
    if (cut > 0 && cut < prev.length) {
      const before = prev.charCodeAt(cut - 1);
      const at = prev.charCodeAt(cut);
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && at >= 0xdc00 && at <= 0xdfff;
      if (splitsPair) cut++;
    }
    const tail = prev.slice(cut);
    out.push(tail + (tail ? "\n" : "") + chunk);
  };

  // Greedily packs `units` (joined by `sep`) into chunks within maxTokens.
  const pack = (units: string[], sep: string) => {
    let acc: string[] = [];
    for (const unit of units) {
      const candidate = acc.length ? acc.join(sep) + sep + unit : unit;
      if (estTokens(candidate) > maxTokens) {
        if (acc.length) pushWithOverlap(acc.join(sep));
        acc = [unit];
      } else {
        acc.push(unit);
      }
    }
    if (acc.length) pushWithOverlap(acc.join(sep));
  };

  for (const seg of segs) {
    if (seg.type === "code") {
      // Keep a code block whole when it fits; otherwise split it by line.
      if (estTokens(seg.content) <= maxTokens) {
        pushWithOverlap(seg.content);
      } else {
        pack(seg.content.split("\n"), "\n");
      }
    } else {
      const paras = seg.content
        .split(/\n{2,}/)
        .map((p) => p.trim())
        .filter(Boolean);
      pack(paras, "\n\n");
    }
  }
  return out;
};

对文档的段落进行向量化,同时记录每个段落在原始文档中的行号,从而在检索结果中展示出指向原始文件的链接

image.png