One Advanced Frontend Concept a Day - Day 8
Today's topic: Frontend AI - run large language models in the browser and build a local AI assistant with WebGPU + Transformers.js
Core concept: AI inference is moving from the cloud to the device
Thanks to WebGPU and quantization, small LLMs (such as Phi-3, Gemma-2B, and Qwen-1.8B) can now run directly in the browser.
Performance figures (MacBook M2):
Phi-3-mini (3.8B, 4-bit): 15-20 tokens/s
Gemma-2B (2.6B, 4-bit): 25-30 tokens/s
Zephyr-3B (3B, 4-bit): 20-25 tokens/s
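Before loading anything, check that WebGPU is actually available and plan a WASM fallback when it is not. A minimal detection sketch:
// Pick the best available backend (Transformers.js can also fall back on its own)
async function detectBackend() {
  if (navigator.gpu) {
    const adapter = await navigator.gpu.requestAdapter();
    if (adapter) return 'webgpu';
  }
  return 'wasm';
}
// const device = await detectBackend();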
🔬 Browser AI architecture
┌─────────────────────────────────────────────┐
│ JavaScript (control layer)                  │
│ - Text pre-/post-processing                 │
│ - KV cache management                       │
│ - Streaming output                          │
└─────────────────────────────────────────────┘
                      ↓
┌─────────────────────────────────────────────┐
│ Transformers.js (ONNX Runtime)              │
│ - Model loading and parsing                 │
│ - Tokenizer (BPE/SentencePiece)             │
│ - Tensor-operation abstraction layer        │
└─────────────────────────────────────────────┘
                      ↓
┌─────────────────────────────────────────────┐
│ WebGPU backend                              │
│ - Matrix-multiply acceleration (GEMM)       │
│ - Attention computation                     │
│ - Quantized inference (INT4/INT8)           │
└─────────────────────────────────────────────┘
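Before building the full assistant below, the same stack can be exercised in a few lines with the high-level pipeline API (a sketch assuming Transformers.js v3 and that the model id is available in ONNX format):
// Quick start: text generation on WebGPU via the pipeline API
import { pipeline } from '@huggingface/transformers';

const generator = await pipeline('text-generation', 'Xenova/phi-3-mini-4k-instruct', {
  device: 'webgpu', // GPU-accelerated inference
  dtype: 'q4f16'    // 4-bit quantized weights
});
const out = await generator('Explain WebGPU in one sentence.', { max_new_tokens: 64 });
console.log(out[0].generated_text);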
🚀 Full implementation: a local AI assistant
// 1. Install the dependency
// Note: the device/dtype options used below ship with Transformers.js v3,
// which is published as @huggingface/transformers (the successor to @xenova/transformers)
// npm install @huggingface/transformers
import { pipeline, env, AutoModelForCausalLM, AutoTokenizer } from '@huggingface/transformers';
// Runtime configuration
env.backends.onnx.wasm.numThreads = navigator.hardwareConcurrency; // threads for the WASM fallback
env.useBrowserCache = true; // persist downloaded model files in the browser cache
env.localModelPath = '/models/'; // optional: resolve model files from your own origin first
class LocalLLM {
constructor(config) {
this.modelName = config.modelName || 'Xenova/phi-3-mini-4k-instruct';
this.model = null;
this.tokenizer = null;
this.isLoading = false;
this.isGenerating = false;
this.maxTokens = config.maxTokens || 2048;
this.temperature = config.temperature || 0.7;
this.topK = config.topK || 50;
this.topP = config.topP || 0.95;
this.kvCache = null;
// Progress and streaming callbacks
this.onProgress = config.onProgress || (() => {});
this.onToken = config.onToken || (() => {});
this.onComplete = config.onComplete || (() => {});
}
async load() {
if (this.model) return;
this.isLoading = true;
this.onProgress({ status: 'loading', progress: 0 });
try {
// Load the tokenizer (downloaded once, then served from the browser cache)
this.tokenizer = await AutoTokenizer.from_pretrained(this.modelName, {
progress_callback: (progress) => {
this.onProgress({
status: 'loading-tokenizer',
progress: progress.progress
});
}
});
// Load the model on the WebGPU backend
this.model = await AutoModelForCausalLM.from_pretrained(this.modelName, {
dtype: 'q4f16', // 4-bit weights with fp16 activations; roughly 75% less memory
device: 'webgpu', // run inference on the GPU
progress_callback: (progress) => {
this.onProgress({
status: 'loading-model',
progress: progress.progress
});
}
});
this.isLoading = false;
this.onProgress({ status: 'ready', progress: 1 });
return true;
} catch (error) {
console.error('Model loading failed:', error);
this.isLoading = false;
throw error;
}
}
async generate(prompt, options = {}) {
if (!this.model) {
throw new Error('Model not loaded; call load() first');
}
if (this.isGenerating) {
console.warn('A generation task is already in progress');
return;
}
this.isGenerating = true;
const startTime = performance.now();
// Build the chat prompt (Phi-3 format)
const messages = [
{ role: 'system', content: 'You are a helpful AI assistant.' },
{ role: 'user', content: prompt }
];
const formattedPrompt = this.applyChatTemplate(messages);
// Tokenize the input
const inputIds = this.tokenizer.encode(formattedPrompt);
let currentIds = inputIds;
// KV cache (key optimization: avoids recomputing attention over already-processed tokens).
// Reset per prompt; null on the first forward pass. Note that in the real Transformers.js
// API, model.generate() manages past_key_values internally; this loop is illustrative.
this.kvCache = null;
let generatedTokens = [];
let tokenCount = 0;
const maxNewTokens = options.maxTokens || this.maxTokens;
// Autoregressive generation loop
for (let step = 0; step < maxNewTokens; step++) {
// Forward pass (simplified: the real API expects tensors, not plain arrays)
const outputs = await this.model.forward({
input_ids: currentIds,
past_key_values: this.kvCache,
use_cache: true
});
// Carry the updated KV cache forward
this.kvCache = outputs.past_key_values;
// Logits at the last position = the next-token distribution
const nextTokenLogits = outputs.logits.slice(-1)[0];
// Sample the next token
let nextTokenId = this.sampleToken(nextTokenLogits, {
temperature: options.temperature || this.temperature,
topK: options.topK || this.topK,
topP: options.topP || this.topP
});
// Stop at the end-of-sequence token
if (nextTokenId === this.tokenizer.eos_token_id) {
break;
}
// Decode the new token to text
const token = this.tokenizer.decode([nextTokenId]);
generatedTokens.push(token);
tokenCount++;
// Streaming callback
this.onToken(token, {
tokenCount,
timestamp: performance.now() - startTime
});
// Next step: feed only the new token; the KV cache covers the earlier context
currentIds = [[nextTokenId]];
}
// Generation complete
const fullText = generatedTokens.join('');
this.isGenerating = false;
this.onComplete(fullText, {
tokenCount,
totalTime: performance.now() - startTime,
tokensPerSecond: tokenCount / ((performance.now() - startTime) / 1000)
});
return fullText;
}
sampleToken(logits, options) {
// Top-K sampling (note: topP is accepted but not implemented in this sketch)
const logitsArray = Array.from(logits);
if (options.topK && options.topK < logitsArray.length) {
// Pick the top-K indices by logit value
const indices = logitsArray
.map((val, idx) => ({ val, idx }))
.sort((a, b) => b.val - a.val)
.slice(0, options.topK);
// Mask all other logits with -Infinity
const filtered = new Array(logitsArray.length).fill(-Infinity);
indices.forEach(({ idx, val }) => {
filtered[idx] = val;
});
// Apply temperature
const scaled = filtered.map(v => v / options.temperature);
return this.softmaxSample(scaled);
}
// Plain temperature sampling
const scaled = logitsArray.map(v => v / options.temperature);
return this.softmaxSample(scaled);
}
softmaxSample(logits) {
// Numerically stable softmax (subtract the max before exponentiating)
const maxLogit = Math.max(...logits);
const exp = logits.map(x => Math.exp(x - maxLogit));
const sum = exp.reduce((a, b) => a + b, 0);
const probs = exp.map(x => x / sum);
// Sample from the cumulative distribution
const random = Math.random();
let cumulative = 0;
for (let i = 0; i < probs.length; i++) {
cumulative += probs[i];
if (random < cumulative) return i;
}
return probs.length - 1;
}
applyChatTemplate(messages) {
// Phi-3 chat template (Transformers.js tokenizers also expose apply_chat_template for this)
let result = '';
for (const msg of messages) {
if (msg.role === 'system') {
result += `<|system|>\n${msg.content}<|end|>\n`;
} else if (msg.role === 'user') {
result += `<|user|>\n${msg.content}<|end|>\n`;
} else if (msg.role === 'assistant') {
result += `<|assistant|>\n${msg.content}<|end|>\n`;
}
}
result += '<|assistant|>\n';
return result;
}
// Unload the model and release memory
async unload() {
this.model = null;
this.tokenizer = null;
this.kvCache = null;
this.isGenerating = false;
// Hint the browser to garbage-collect (window.gc only exists behind a debug flag)
if (window.gc) window.gc();
}
// Model metadata
getInfo() {
return {
modelName: this.modelName,
isLoaded: !!this.model,
isLoading: this.isLoading,
isGenerating: this.isGenerating,
maxTokens: this.maxTokens,
temperature: this.temperature
};
}
}
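A minimal usage sketch of the class above (assumes the page has an element with id "output"):
// Wire the streaming callbacks to the page
const llm = new LocalLLM({
  onProgress: (p) => console.log(p.status, p.progress),
  onToken: (token) => { document.querySelector('#output').textContent += token; },
  onComplete: (text, stats) => console.log(`${stats.tokensPerSecond.toFixed(1)} tokens/s`)
});
await llm.load(); // downloads, caches, and compiles the model on first call
await llm.generate('Explain the KV cache in two sentences.');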
💡 Advanced optimization techniques
// 1. Use a Web Worker to keep the UI responsive
class WorkerLLM {
constructor() {
this.worker = new Worker('llm-worker.js', { type: 'module' });
this.callbacks = new Map();
this.requestId = 0;
// Resolve the matching pending promise when the worker replies
this.worker.onmessage = (e) => {
const { id, text } = e.data;
this.callbacks.get(id)?.(text);
this.callbacks.delete(id);
};
}
async generate(prompt) {
const id = this.requestId++;
return new Promise((resolve) => {
this.callbacks.set(id, resolve);
this.worker.postMessage({ id, type: 'generate', prompt });
});
}
}
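A matching worker-side sketch for llm-worker.js (the './local-llm.js' module exporting the LocalLLM class above is an assumption):
// llm-worker.js: runs the model off the main thread
import { LocalLLM } from './local-llm.js'; // hypothetical module wrapping the class above

const llm = new LocalLLM({ modelName: 'Xenova/phi-3-mini-4k-instruct' });
const ready = llm.load(); // start loading immediately; requests await it below

self.onmessage = async (e) => {
  const { id, type, prompt } = e.data;
  if (type !== 'generate') return;
  await ready;
  const text = await llm.generate(prompt);
  self.postMessage({ id, text }); // resolved by WorkerLLM's onmessage handler
};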
// 2. Speculative decoding: roughly a 2-3x speedup
// Sketch: generateFast() and verify() stand for draft-model decoding and a batched
// target-model forward pass; they are assumed interfaces, not Transformers.js APIs.
class SpeculativeDecoder {
constructor(draftModel, targetModel) {
this.draftModel = draftModel; // small model: drafts tokens quickly
this.targetModel = targetModel; // large model: verifies the drafts
}
async generate(prompt, numTokens) {
let tokens = [];
while (tokens.length < numTokens) {
// Draft model proposes 5 candidate tokens
const draftTokens = await this.draftModel.generateFast(prompt, 5);
// Target model verifies all candidates in a single parallel pass
const validations = await this.targetModel.verify(prompt, draftTokens);
// Keep the longest verified prefix
const accepted = this.findAccepted(draftTokens, validations);
tokens.push(...accepted);
// Simplified: a full implementation would append the target model's
// correction on a rejection and continue instead of stopping here
if (accepted.length < draftTokens.length) break;
}
return tokens;
}
// Accept draft tokens up to the first disagreement with the target model
findAccepted(draftTokens, validations) {
const accepted = [];
for (let i = 0; i < draftTokens.length; i++) {
if (validations[i] !== draftTokens[i]) break;
accepted.push(draftTokens[i]);
}
return accepted;
}
}
// 3. Context caching: avoid recomputing repeated conversation prefixes
class ContextCache {
constructor() {
this.cache = new Map();
this.maxSize = 10;
}
getOrCompute(sessionId, computeFn) {
if (this.cache.has(sessionId)) {
return this.cache.get(sessionId);
}
const result = computeFn();
if (this.cache.size >= this.maxSize) {
const firstKey = this.cache.keys().next().value;
this.cache.delete(firstKey);
}
this.cache.set(sessionId, result);
return result;
}
}
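For example, a session's tokenized system prompt can be computed once and reused across turns (a sketch; the session id is arbitrary):
// Usage sketch: cache per-session work keyed by a session id
const contextCache = new ContextCache();
const systemPromptIds = contextCache.getOrCompute('session-42', () =>
  llm.tokenizer.encode('You are a helpful AI assistant.')
);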
🎯 Today's challenge
Build a RAG system that runs entirely in the browser:
Requirements:
- Use a local LLM + a vector database (LanceDB)
- PDF upload and vectorization (with a Transformers.js embedding model)
- Similarity search + LLM answer generation
- Runs fully offline
class LocalRAG {
constructor() {
this.llm = new LocalLLM({ modelName: 'Xenova/phi-3-mini-4k-instruct' });
this.embedder = null;
this.vectorDB = null;
this.chunks = [];
}
async init() {
await this.llm.load();
// Load the embedding model
this.embedder = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
// Initialize the vector store (IndexedDB-backed; a minimal in-memory stand-in is sketched below)
this.vectorDB = await this.initVectorDB();
}
async addDocument(text, metadata) {
// Split the document into chunks
const chunks = this.chunkText(text, 512);
for (const chunk of chunks) {
// Embed the chunk
const embedding = await this.embedder(chunk, { pooling: 'mean' });
// Store vector, text, and metadata together
await this.vectorDB.add({
vector: embedding.data,
text: chunk,
metadata
});
}
}
async query(question, topK = 3) {
// Embed the question
const questionEmbedding = await this.embedder(question, { pooling: 'mean' });
// Similarity search over the stored chunks
const similar = await this.vectorDB.search(questionEmbedding.data, topK);
// Build the prompt from the retrieved context
const context = similar.map(s => s.text).join('\n\n');
const prompt = `Answer the question based on the following context:\n\nContext: ${context}\n\nQuestion: ${question}\n\nAnswer:`;
// Generate the answer with the local LLM
return await this.llm.generate(prompt);
}
chunkText(text, chunkSize) {
// Sentence-boundary-aware chunking
const sentences = text.split(/(?<=[.!?])/); // lookbehind keeps the delimiter with its sentence
const chunks = [];
let current = '';
for (const sentence of sentences) {
if ((current + sentence).length > chunkSize) {
chunks.push(current);
current = sentence;
} else {
current += sentence;
}
}
if (current) chunks.push(current);
return chunks;
}
}
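initVectorDB() is left as part of the challenge. A minimal in-memory stand-in with cosine-similarity search (a sketch; swap in LanceDB or an IndexedDB-backed store for the offline persistence requirement):
// Minimal in-memory vector store matching the add()/search() interface used above
class MemoryVectorDB {
  constructor() {
    this.items = [];
  }
  async add(item) {
    // item: { vector, text, metadata }
    this.items.push(item);
  }
  async search(queryVector, topK) {
    // Cosine similarity between the query and every stored vector
    const cosine = (a, b) => {
      let dot = 0, na = 0, nb = 0;
      for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
      }
      return dot / (Math.sqrt(na) * Math.sqrt(nb));
    };
    return this.items
      .map(item => ({ ...item, score: cosine(queryVector, item.vector) }))
      .sort((a, b) => b.score - a.score)
      .slice(0, topK);
  }
}
// Usage sketch (pdfText stands for text extracted from an uploaded PDF)
// const rag = new LocalRAG();
// await rag.init();
// await rag.addDocument(pdfText, { source: 'manual.pdf' });
// const answer = await rag.query('How do I reset the device?');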
📊 Performance optimization checklist
- WebGPU backend (typically 5-10x faster than the WebGL path)
- 4-bit quantization (roughly 75% less memory while retaining most of the quality)
- KV cache (avoids recomputation; 2-3x faster)
- Speculative decoding (2-3x speedup)
- Batched inference (handle multiple requests at once)
- Sharded model loading (shorter time to first load)
- IndexedDB persistence (repeat loads roughly 10x faster)
🚀 Models to try online
| Model | Download size | Memory | Speed | Quality |
|---|---|---|---|---|
| Phi-3-mini-4k | 2.2GB | 2.8GB | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
| Gemma-2B | 1.5GB | 1.9GB | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
| Qwen-1.8B | 1.1GB | 1.4GB | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ |
| TinyLlama-1.1B | 0.7GB | 0.9GB | ⭐⭐⭐⭐⭐ | ⭐⭐ |
Tomorrow's preview: compiler principles in the frontend - build a compiler of your own (from AST to code generation)
💡 Key insight: local AI is not just a privacy solution; it is also the key to low-latency, real-time interaction!