Document loaders
Document loaders 用于加载外部数据源。LangChain 支持多种形式的数据源,加载方式是统一的:先实例化对应的 loader,再调用该实例的 load 方法,即 someLoader.load()。
TextLoader
const loader = new TextLoader("data/example.txt");
const docs = await loader.load();
// 打印一下docs的内容,包含一个文本数据和元数据信息
[
Document {
pageContent: "This covers how to load all documents in a directory.\n" +
"\n" +
"The second argument is a map of file extensions to loader factories. Each file will be passed to the matching loader, and the resulting documents will be concatenated together.",
metadata: { source: "data/example.txt" }
}
]
PDFLoader
从 PDF 文件中加载文档,用法与 TextLoader 相同(new PDFLoader(path) 后调用 load()),依赖 pdf-parse,默认每一页生成一个 Document。
DirectoryLoader
从文件夹中加载所有文件。第二个参数是文件扩展名到 loader 工厂函数的映射,每个文件会交给匹配的 loader 处理,最终所有结果合并为一个 docs 数组。
import { TextLoader } from "langchain/document_loaders/fs/text";
import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
import {
JSONLoader,
JSONLinesLoader,
} from "langchain/document_loaders/fs/json";
const loader = new DirectoryLoader(
"data",
{
".json": (path) => new JSONLoader(path,"/texts"),
".txt": (path) => new TextLoader(path),
}
);
const docs = await loader.load();
console.log({ docs });
GitHub loader
从 GitHub 仓库中加载文件
import { GithubRepoLoader } from "langchain/document_loaders/web/github";
import ignore from "ignore";
const loader = new GithubRepoLoader(
"https://github.com/nk-a/prismaAndTs",
// 可以选择分支、是否递归文件夹、ignore等
{
branch: "master",
recursive: false,
unknown: "warn",
ignorePaths: ["*.md", "yarn.lock", "*.json"],
accessToken: env["GITHUB_TOKEN"]
}
);
const docs = await loader.load()
// docs
Document {
pageContent: "# Environment variables declared in this file are automatically made available to Prisma.\n" +
"# See the documentation for more detail: https://pris.ly/d/prisma-schema#accessing-environment-variables-from-the-schema\n" +
"\n" +
"# Prisma supports the native connection string format for PostgreSQL, MySQL, SQLite, SQL Server, MongoDB and CockroachDB.\n" +
"# See the documentation for all the connection string options: https://pris.ly/d/connection-strings\n" +
"\n" +
'DATABASE_URL="postgresql://johndoe:randompassword@localhost:5432/mydb?schema=public"',
metadata: {
source: ".env",
repository: "https://github.com/nk-a/prismaAndTs",
branch: "master"
},
id: undefined
},...more
Web Loader
使用 cheerio 加载 web 文档
const loader = new CheerioWebBaseLoader("https://js.langchain.com/docs/how_to/#key-features",
// 使用选择器筛选
{
selector: "h3",
}
);
const docs = await loader.load();
// docs
[
Document {
pageContent: "Prompt templatesExample selectorsChat modelsMessagesLLMsOutput parsersDocument loadersText splittersEmbedding modelsVector storesRetrieversIndexingToolsAgentsCallbacksCustomGenerative UIMultimodalQ&A with RAGExtractionChatbotsQuery analysisQ&A over SQL + CSVQ&A over graph databasesEvaluationTracing",
metadata: { source: "https://js.langchain.com/docs/how_to/#key-features" }
}
]
search loader
从搜索结果中加载数据,常用的有两个loader,SerpAPILoader 和 SearchApiLoader
import { SerpAPILoader } from "langchain/document_loaders/web/serpapi";
// 注册serpapi,拿到api key
const apiKey = "your serp api key";
// 搜索条件
const question = "ollama";
const loader = new SerpAPILoader({ q: question, apiKey });
const docs = await loader.load();
// docs
Document {
pageContent: '{"position":1,"title":"Ollama","link":"https://ollama.com/","redirect_link":"https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://ollama.com/&ved=2ahUKEwjCw4HP7pqJAxVw6skDHTiBKaYQFnoECAkQAQ","displayed_link":"https://ollama.com","favicon":"https://serpapi.com/searches/6713de3cb7d31dcc2f1ed8ff/images/3cb8c12859411358c5e7adef4823a8bf4bb52d4dc2b49a62128ef853a28fe111.png","snippet":"Get up and running with large language models.","sitelinks":{"expanded":[{"title":"Library","link":"https://ollama.com/library","snippet":"LLaVA is a novel end-to-end trained large multimodal model ..."},{"title":"Download Ollama","link":"https://ollama.com/download","snippet":"curl -fsSL https://ollama.com/install.sh | sh. View script source ..."},{"title":"Download Ollama on Windows","link":"https://ollama.com/download/windows","snippet":"Download Ollama. macOS Linux Windows · Download for ..."},{"title":"Llama3.1","link":"https://ollama.com/library/llama3.1","snippet":"Llama 3.1 is a new state-of-the-art model from Meta available in 8B ..."},{"title":"Blog","link":"https://ollama.com/blog","snippet":"Ollama is now available on Windows in preview, making it ..."}]},"source":"Ollama"}',
metadata: { source: "SerpAPI", responseType: "organic_results" }
},...more]
custom loader
自定义document loader,有三种实现方式,分别是继承下面三个抽象类之一:
// 继承BaseDocumentLoader,实现load方法,直接返回Document列表
abstract class BaseDocumentLoader implements DocumentLoader {
abstract load(): Promise<Document[]>;
}
// 继承TextLoader,处理text file
abstract class TextLoader extends BaseDocumentLoader {
abstract parse(raw: string): Promise<string[]>;
}
// 继承BufferLoader
abstract class BufferLoader extends BaseDocumentLoader {
abstract parse(
raw: Buffer,
metadata: Document["metadata"]
): Promise<Document[]>;
}
Text Splitters
加载文档后,需要对文档数据做处理,更好地适配应用。最常见的场景就是对数据进行分割,以适配应用输入的上下文窗口,拆分的过程中同时需要保留其语义相关性。
拆分的过程可以抽象为三个步骤:
- 将长文本拆分为小且有意义的 chunks,比如文章按句子拆分等;
- 将小的 chunks 依次组合为更大的 chunk,直到达到设定的大小;
- 达到设定大小后,该 chunk 成为一个独立的文本片段,然后开始组合新的 chunk;相邻的 chunk 之间会有重叠的部分,以保留语义相关性。
recursively split text by characters
最基本的拆分方法:按照一个分隔符列表依次尝试拆分,如果拆分后的片段仍然过大,就用列表中的下一个分隔符继续递归拆分,默认的列表是["\n\n", "\n", " ", ""]
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
// 通过修改chunkSize和chunkOverlap 调试拆分结果
const splitter = new RecursiveCharacterTextSplitter({
// 拆分的chunk大小
chunkSize: 50,
// chunk之间重叠部分大小
chunkOverlap: 1,
// 自定义拆分符
separators: ["|", "##", ">", "-"],
});
const docOutput = await splitter.splitDocuments([
new Document({ pageContent: text }),
]);
split code
拆分代码
import { SupportedTextSplitterLanguages } from "langchain/text_splitter";
// 查询支持的语言列表
console.log(SupportedTextSplitterLanguages);
使用 getSeparatorsForLanguage 查看给定语言的分隔符
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
// javascript
RecursiveCharacterTextSplitter.getSeparatorsForLanguage("js");
//
[
"\nfunction ", "\nconst ",
"\nlet ", "\nvar ",
"\nclass ", "\nif ",
"\nfor ", "\nwhile ",
"\nswitch ", "\ncase ",
"\ndefault ", "\n\n",
"\n", " ",
""
]
以js为例,看看拆分的实际效果
const JS_CODE = `
function helloWorld() {
console.log("Hello, World!");
}
// Call the function
helloWorld();
`;
const jsSplitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
chunkSize: 60,
chunkOverlap: 0,
});
const jsDocs = await jsSplitter.createDocuments([JS_CODE]);
// jsDocs
[
Document {
pageContent: 'function helloWorld() {\n console.log("Hello, World!");\n}',
metadata: { loc: { lines: { from: 2, to: 4 } } },
id: undefined
},
Document {
pageContent: "// Call the function\nhelloWorld();",
metadata: { loc: { lines: { from: 6, to: 7 } } },
id: undefined
}
]