字符串编解码是计算机处理文本的核心机制。本文从JavaScript字符串API开始,深入解析Unicode、UTF-8、UTF-16编码原理,并演示 TextEncoder/TextDecoder 的使用。
JavaScript String 编解码相关 API
codePointAt() - 获取Unicode码点
// Read the Unicode code point that starts at a given index.
for (const sample of ['A', '中', '🌍']) {
  // Prints 65 (U+0041), 20013 (U+4E2D), 127757 (U+1F30D) in turn.
  console.log(sample.codePointAt(0));
}
// Surrogate pairs: index 0 yields the full code point, while index 1
// lands on the second code unit and yields only that unit.
const emoji = "🌍";
[0, 1].forEach((position) => {
  // Prints 1f30d for position 0, then df0d for position 1.
  console.log(`codePointAt(${position}):`, emoji.codePointAt(position).toString(16));
});
charCodeAt() - 获取UTF-16代码单元
// Read the UTF-16 code unit stored at a given index.
['A', '中', '🌍'].forEach((sample) => {
  // Prints 65, 20013, then 55356 (the high surrogate of the pair).
  console.log(sample.charCodeAt(0));
});
// A surrogate pair occupies two consecutive code units.
const emoji = "🌍";
for (let position = 0; position < 2; position += 1) {
  // Prints d83c for position 0, then df0d for position 1.
  console.log(`charCodeAt(${position}):`, emoji.charCodeAt(position).toString(16));
}
fromCodePoint() - 从码点创建字符串
// Build single-character strings from Unicode code points.
[65, 20013, 127757].forEach((codePoint) => {
  // Prints A, 中, 🌍 in turn.
  console.log(String.fromCodePoint(codePoint));
});
// Several code points passed in one call produce one multi-character string.
const helloCodePoints = [72, 101, 108, 108, 111];
console.log(String.fromCodePoint(...helloCodePoints)); // Hello
length - UTF-16代码单元数
// String length is the number of UTF-16 code units, not the number of characters.
const str1 = "Hello";
console.log(str1.length); // 5
const str2 = "Hello 世界! 🌍";
console.log(str2.length); // 12 (the emoji is a surrogate pair and counts as 2 code units)
// 获取实际字符数
/**
 * Counts the Unicode code points in a string.
 *
 * A surrogate pair counts as one; an unpaired surrogate also counts as one.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Number of code points in `str`.
 */
function getActualCharCount(str) {
  // Array.from uses the string iterator, which yields one item per code point.
  return Array.from(str).length;
}
console.log(getActualCharCount(str2)); // 11 (code points, i.e. the actual character count)
Unicode
Unicode是一个国际标准,为世界上每个字符分配唯一的数字标识符,称为码点(Code Point)。
// Unicode code point ranges.
const planeRanges = [
  ['基本多语言平面:', '0x0000 - 0xFFFF'], // commonly used characters
  ['补充平面:', '0x10000 - 0x10FFFF'], // emoji, special symbols
];
for (const [label, range] of planeRanges) {
  console.log(label, range);
}
// Print the code point of a few characters in hexadecimal.
for (const char of ['A', '中', '🌍']) {
  // Prints 41, 4e2d, 1f30d in turn.
  console.log(`${char}的码点:`, char.codePointAt(0).toString(16));
}
UTF-8 编码
UTF-8是变长编码,兼容ASCII,是Web标准:
UTF-8 编码规则
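- 1字节:ASCII字符 (U+0000–U+007F,即 0–127)
- 2字节:拉丁扩展、希腊、西里尔、希伯来、阿拉伯等字母 (U+0080–U+07FF,即 128–2047)
- 3字节:基本多语言平面的其余字符,如中日韩文字 (U+0800–U+FFFF,即 2048–65535)
- 4字节:补充平面字符,如表情符号 (U+10000–U+10FFFF,即 65536–1114111)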
// UTF-8 encoding demo.
const str = "Hello 世界! 🌍";
const encoder = new TextEncoder();
const utf8Bytes = encoder.encode(str);
console.log('UTF-8字节:', [...utf8Bytes]);
// [72, 101, 108, 108, 111, 32, 228, 184, 150, 231, 149, 140, 33, 32, 240, 159, 140, 141]
// Show the UTF-8 bytes of each character; for...of walks the string by code point.
for (const char of str) {
  const byteList = [...encoder.encode(char)].join(', ');
  console.log(`"${char}": [${byteList}]`);
}
UTF-8 编码算法
// UTF-8编码算法实现
/**
 * Encodes a JavaScript string as UTF-8.
 *
 * Unpaired surrogates cannot be represented in UTF-8, so each one is replaced
 * by U+FFFD (bytes EF BF BD), which is also what TextEncoder produces.
 *
 * @param {string} str - Text to encode.
 * @returns {number[]} The UTF-8 bytes, one array element per byte.
 */
function utf8Encode(str) {
  const bytes = [];
  // The string iterator yields whole code points, so a surrogate pair arrives
  // as one item and no manual index skipping is needed.
  for (const char of str) {
    let codePoint = char.codePointAt(0);
    // A value in the surrogate range here can only be an unpaired surrogate.
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
      codePoint = 0xFFFD;
    }
    if (codePoint < 0x80) {
      // 1 byte: 0xxxxxxx
      bytes.push(codePoint);
    } else if (codePoint < 0x800) {
      // 2 bytes: 110xxxxx 10xxxxxx
      bytes.push(0xC0 | (codePoint >> 6));
      bytes.push(0x80 | (codePoint & 0x3F));
    } else if (codePoint < 0x10000) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      bytes.push(0xE0 | (codePoint >> 12));
      bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
      bytes.push(0x80 | (codePoint & 0x3F));
    } else {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      bytes.push(0xF0 | (codePoint >> 18));
      bytes.push(0x80 | ((codePoint >> 12) & 0x3F));
      bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
      bytes.push(0x80 | (codePoint & 0x3F));
    }
  }
  return bytes;
}
UTF-16 编码
UTF-16是JavaScript字符串对外呈现的编码:length、charCodeAt() 和下标访问都以16位代码单元为单位(引擎内部的实际存储方式可以不同):
UTF-16 编码规则
- 基本多语言平面:单个16位代码单元
- 补充平面:两个16位代码单元(代理对)
// UTF-16 encoding demo.
const str = "Hello 世界! 🌍";
// Show the code point and every UTF-16 code unit of each character.
for (const char of str) {
  const codePoint = char.codePointAt(0);
  // char.length is 1 for a BMP character and 2 for a surrogate pair, so list
  // every code unit rather than only the first one.
  const codeUnits = Array.from(
    { length: char.length },
    (_, offset) => `0x${char.charCodeAt(offset).toString(16)}`,
  );
  console.log(`"${char}": 码点=U+${codePoint.toString(16)}, UTF-16=${codeUnits.join(' ')}`);
}
UTF-16 编码算法
// UTF-16编码算法实现
/**
 * Encodes a JavaScript string as UTF-16 bytes in little-endian order.
 *
 * Code points up to U+FFFF become one 16-bit unit; higher code points are
 * split into a high/low surrogate pair. No byte order mark is emitted.
 *
 * @param {string} str - Text to encode.
 * @returns {number[]} The bytes, low byte first for every 16-bit unit.
 */
function utf16Encode(str) {
  const bytes = [];
  // Appends one 16-bit unit as two bytes: low byte, then high byte.
  const pushLittleEndian = (unit) => {
    bytes.push(unit & 0xFF, (unit >> 8) & 0xFF);
  };
  // The string iterator hands over one code point at a time.
  for (const char of str) {
    const codePoint = char.codePointAt(0);
    if (codePoint <= 0xFFFF) {
      pushLittleEndian(codePoint);
      continue;
    }
    // Supplementary plane: subtract 0x10000, then split the remaining
    // 20 bits into two 10-bit halves carried by the surrogates.
    const offset = codePoint - 0x10000;
    pushLittleEndian(0xD800 + (offset >> 10));
    pushLittleEndian(0xDC00 + (offset & 0x3FF));
  }
  return bytes;
}
TextEncoder/TextDecoder 使用
基本用法
// Sample text mixing ASCII, CJK and an emoji.
const str = "Hello 世界! 🌍";
// Both are constructed without arguments, which selects UTF-8.
const encoder = new TextEncoder();
const decoder = new TextDecoder();
// String -> bytes (a Uint8Array).
const encoded = encoder.encode(str);
console.log('编码结果:', encoded);
// Bytes -> string; this round-trips to "Hello 世界! 🌍".
const decoded = decoder.decode(encoded);
console.log('解码结果:', decoded);
支持的编码格式
// TextEncoder only produces UTF-8.
console.log(new TextEncoder().encoding); // "utf-8"
// TextDecoder accepts many encoding labels.
const encodings = [
  'utf-8', // default
  'utf-16le', // little-endian UTF-16
  'utf-16be', // big-endian UTF-16
  'iso-8859-1', // Latin-1
  'windows-1251' // Cyrillic
];
// Probe support by constructing a decoder: the constructor throws for a label
// the runtime does not support. The instance itself is not needed.
const canDecode = (label) => {
  try {
    new TextDecoder(label);
    return true;
  } catch (e) {
    return false;
  }
};
for (const encoding of encodings) {
  console.log(canDecode(encoding) ? `支持编码: ${encoding}` : `不支持编码: ${encoding}`);
}
流式处理
// 分块编码
/**
 * Splits a string into chunks of roughly `chunkSize` UTF-16 code units and
 * UTF-8-encodes each chunk, logging the bytes of every chunk.
 *
 * A chunk boundary never falls inside a surrogate pair: when it would, the
 * chunk is extended by one code unit. Otherwise each half would be encoded as
 * U+FFFD and the character would be lost.
 *
 * @param {string} str - Text to encode.
 * @param {number} [chunkSize=3] - Target chunk length in UTF-16 code units.
 * @returns {Uint8Array[]} The encoded chunks, in order.
 * @throws {RangeError} If `chunkSize` is not a positive integer.
 */
function encodeInChunks(str, chunkSize = 3) {
  // A zero or negative step would never advance through the string.
  if (!Number.isInteger(chunkSize) || chunkSize < 1) {
    throw new RangeError('chunkSize 必须是正整数');
  }
  const encoder = new TextEncoder();
  const chunks = [];
  let start = 0;
  while (start < str.length) {
    let end = Math.min(start + chunkSize, str.length);
    // Boundary between a high and a low surrogate: keep the pair together.
    const splitsPair =
      end < str.length &&
      (str.charCodeAt(end - 1) & 0xFC00) === 0xD800 &&
      (str.charCodeAt(end) & 0xFC00) === 0xDC00;
    if (splitsPair) {
      end += 1;
    }
    const encoded = encoder.encode(str.slice(start, end));
    chunks.push(encoded);
    console.log(`块 ${chunks.length}:`, Array.from(encoded));
    start = end;
  }
  return chunks;
}
// 分块解码
/**
 * Decodes a sequence of UTF-8 byte chunks into one string, logging the text
 * produced by each chunk.
 *
 * A single decoder is used in streaming mode, so a multi-byte character split
 * across two chunks is held back until its remaining bytes arrive.
 *
 * @param {Array<Uint8Array>} encodedChunks - Byte chunks in stream order.
 * @returns {string} The concatenated decoded text.
 */
function decodeInChunks(encodedChunks) {
  const streamDecoder = new TextDecoder();
  const pieces = [];
  encodedChunks.forEach((bytes, position) => {
    // stream: true keeps an incomplete trailing sequence buffered in the decoder.
    const text = streamDecoder.decode(bytes, { stream: true });
    pieces.push(text);
    console.log(`解码块 ${position + 1}:`, text);
  });
  // A call without arguments ends the stream and flushes what is still buffered.
  const tail = streamDecoder.decode();
  if (tail) {
    pieces.push(tail);
    console.log('最终解码:', tail);
  }
  return pieces.join('');
}
// Round-trip demo: encode in chunks of 3, then decode the chunks as a stream.
const testStr = "Hello 世界! 🌍";
const chunks = encodeInChunks(testStr, 3);
const decoded = decodeInChunks(chunks);
// Print the original and the decoded text for comparison.
const report = [
  ['原始字符串:', testStr],
  ['解码结果:', decoded],
];
for (const [label, value] of report) {
  console.log(label, value);
}
错误处理
// Handling an invalid byte sequence: none of these bytes is valid UTF-8.
const invalidBytes = Uint8Array.of(0xFF, 0xFE, 0xFD);
// Default mode: invalid input is replaced, here printing ���.
const decoder1 = new TextDecoder('utf-8');
const lenientText = decoder1.decode(invalidBytes);
console.log('默认处理:', lenientText);
// Fatal mode: decoding throws instead of substituting.
const decoder2 = new TextDecoder('utf-8', { fatal: true });
try {
  decoder2.decode(invalidBytes);
} catch (e) {
  console.log('解码错误:', e.message);
}
实际应用场景
// 1. 文件上传处理
/**
 * Reads an uploaded file and returns its content decoded as UTF-8, logging
 * the decoded text.
 *
 * No encoding detection takes place: the bytes are always treated as UTF-8,
 * in non-fatal mode.
 *
 * @param {{arrayBuffer: function(): Promise<ArrayBuffer>}} file - Object
 *   exposing an async `arrayBuffer()` method, such as a File or Blob.
 * @returns {Promise<string>} The decoded text.
 */
async function handleFileUpload(file) {
  const bytes = new Uint8Array(await file.arrayBuffer());
  const text = new TextDecoder('utf-8', { fatal: false }).decode(bytes);
  console.log('文件内容:', text);
  return text;
}
// 2. 网络数据传输
/**
 * Fetches a URL and decodes the response body with the charset declared in
 * its Content-Type header, falling back to UTF-8 when none is declared.
 *
 * The charset parameter is matched case-insensitively, may be double-quoted,
 * and may be followed by further parameters
 * (e.g. `text/plain; Charset="utf-16le"; format=flowed`).
 *
 * @param {string} url - Resource to fetch.
 * @returns {Promise<string>} The decoded response text.
 * @throws {RangeError} If the declared charset is not a label TextDecoder supports.
 */
async function fetchTextData(url) {
  const response = await fetch(url);
  const arrayBuffer = await response.arrayBuffer();
  const contentType = response.headers.get('content-type') ?? '';
  // Capture only the charset value: either a quoted string or a bare token
  // that ends at the next ';' or whitespace.
  const match = /(?:^|;)\s*charset\s*=\s*(?:"([^"]*)"|([^;\s]*))/i.exec(contentType);
  // An absent or empty charset value falls back to UTF-8.
  const encoding = (match?.[1] ?? match?.[2] ?? '').trim() || 'utf-8';
  const decoder = new TextDecoder(encoding);
  return decoder.decode(arrayBuffer);
}
// 3. 二进制数据处理
/**
 * Decodes binary data as UTF-8 and returns its non-blank lines, trimmed.
 *
 * @param {ArrayBuffer|ArrayBufferView} binaryData - UTF-8 encoded bytes.
 * @returns {string[]} Each line split on "\n", trimmed, with empty results removed.
 */
function processBinaryData(binaryData) {
  const text = new TextDecoder('utf-8').decode(binaryData);
  const nonEmptyLines = [];
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    // Lines holding only whitespace are empty after trimming and are dropped.
    if (line.length > 0) {
      nonEmptyLines.push(line);
    }
  }
  return nonEmptyLines;
}
// 4. 编码转换
/**
 * Converts text to UTF-8 bytes.
 *
 * - When `text` is a byte buffer (ArrayBuffer or typed array), it is decoded
 *   with `fromEncoding` and re-encoded as UTF-8.
 * - When `text` is already a JavaScript string, it carries no source encoding,
 *   so it is encoded to UTF-8 as-is.
 *
 * TextEncoder can only emit UTF-8; any other target needs a third-party library.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} text - Input string, or bytes in `fromEncoding`.
 * @param {string} fromEncoding - Encoding label of the input bytes.
 * @param {string} toEncoding - Target encoding; only "utf-8"/"utf8" (any case) is accepted.
 * @returns {Uint8Array} The UTF-8 encoded bytes.
 * @throws {RangeError} If `fromEncoding` is not a label TextDecoder supports.
 * @throws {Error} If `toEncoding` is not UTF-8.
 */
function convertEncoding(text, fromEncoding, toEncoding) {
  // Built before the target check so an unknown source label is still
  // reported first, even when the input turns out to be a string.
  const decoder = new TextDecoder(fromEncoding);
  const encoder = new TextEncoder();
  const target = String(toEncoding).toLowerCase();
  if (target !== 'utf-8' && target !== 'utf8') {
    throw new Error('TextEncoder只支持UTF-8编码');
  }
  // Decode bytes with the source encoding; a string needs no decoding.
  const decodedText = typeof text === 'string' ? text : decoder.decode(text);
  return encoder.encode(decodedText);
}