字符串编解码详解

53 阅读5分钟

字符串编解码是计算机处理文本的核心机制。本文从JavaScript字符串API开始,深入解析Unicode、UTF-8、UTF-16编码原理,并演示 TextEncoder/TextDecoder 的使用。

JavaScript String 编解码相关 API

codePointAt() - 获取Unicode码点

// Read the Unicode code point that starts at a given index.
console.log('A'.codePointAt(0));        // 65 (U+0041)
console.log('中'.codePointAt(0));       // 20013 (U+4E2D)
console.log('🌍'.codePointAt(0));       // 127757 (U+1F30D)

// Surrogate pairs: index 0 yields the whole code point, while index 1 lands on
// the second code unit of the pair and yields only that unit's value.
const emoji = "🌍";
console.log('codePointAt(0):', emoji.codePointAt(0).toString(16)); // 1f30d
console.log('codePointAt(1):', emoji.codePointAt(1).toString(16)); // df0d (the second code unit)

charCodeAt() - 获取UTF-16代码单元

// Read the UTF-16 code unit stored at a given index.
console.log('A'.charCodeAt(0));         // 65
console.log('中'.charCodeAt(0));        // 20013
console.log('🌍'.charCodeAt(0));        // 55356 (high surrogate of the pair)

// Surrogate pair example: the emoji occupies two code units, read one at a time.
const emoji = "🌍";
console.log('charCodeAt(0):', emoji.charCodeAt(0).toString(16)); // d83c
console.log('charCodeAt(1):', emoji.charCodeAt(1).toString(16)); // df0d

fromCodePoint() - 从码点创建字符串

// Build a string from a single Unicode code point.
console.log(String.fromCodePoint(65));     // A
console.log(String.fromCodePoint(20013));  // 中
console.log(String.fromCodePoint(127757)); // 🌍

// Several code points can be passed at once; they are concatenated in order.
console.log(String.fromCodePoint(72, 101, 108, 108, 111)); // Hello

length - UTF-16代码单元数

// String length counts UTF-16 code units, not characters.
const str1 = "Hello";
console.log(str1.length); // 5

const str2 = "Hello 世界! 🌍";
console.log(str2.length); // 12 (10 single-unit characters + 2 units for the emoji; not the character count)

/**
 * Counts the Unicode code points in a string, treating each surrogate pair
 * as one character instead of two UTF-16 code units.
 *
 * @param {string} str - The string to measure.
 * @returns {number} Number of code points in `str`.
 */
function getActualCharCount(str) {
    let total = 0;
    let position = 0;
    while (position < str.length) {
        // A code point above U+FFFF occupies two code units, so step over both.
        const width = str.codePointAt(position) > 0xFFFF ? 2 : 1;
        position += width;
        total += 1;
    }
    return total;
}
console.log(getActualCharCount(str2)); // 11 (actual character count)

Unicode

Unicode 是一个国际标准,为世界上每个字符分配唯一的数字标识符,称为码点(Code Point),取值范围为 U+0000 到 U+10FFFF。

// Unicode code point ranges
console.log('基本多语言平面:', '0x0000 - 0xFFFF'); // Basic Multilingual Plane: commonly used characters
console.log('补充平面:', '0x10000 - 0x10FFFF');   // supplementary planes: emoji, special symbols

// Print the code point of a character in hexadecimal.
console.log('A的码点:', 'A'.codePointAt(0).toString(16)); // 41
console.log('中的码点:', '中'.codePointAt(0).toString(16)); // 4e2d
console.log('🌍的码点:', '🌍'.codePointAt(0).toString(16)); // 1f30d

UTF-8 编码

UTF-8 是一种变长编码(每个码点占 1~4 个字节),向后兼容 ASCII,也是 Web 上事实上的标准编码:

UTF-8 编码规则

  • 1字节:ASCII字符 (0-127)
  • 2字节:拉丁扩展、希腊、西里尔字母等 (128-2047)
  • 3字节:中文等基本多语言平面的其余字符 (2048-65535)
  • 4字节:表情符号等补充平面字符 (65536-1114111)
// UTF-8 encoding example
const str = "Hello 世界! 🌍";
const encoder = new TextEncoder();
const utf8Bytes = encoder.encode(str);

console.log('UTF-8字节:', Array.from(utf8Bytes));
// [72, 101, 108, 108, 111, 32, 228, 184, 150, 231, 149, 140, 33, 32, 240, 159, 140, 141]

// Show the UTF-8 bytes of each character. Array.from(str) splits the string
// into one entry per character before each is encoded separately; the `index`
// callback argument is not used.
Array.from(str).forEach((char, index) => {
    const charBytes = encoder.encode(char);
    console.log(`"${char}": [${Array.from(charBytes).join(', ')}]`);
});

UTF-8 编码算法

/**
 * Encodes a string as UTF-8 by hand, producing the same bytes as
 * `new TextEncoder().encode(str)`.
 *
 * Unpaired surrogates (U+D800..U+DFFF) have no valid UTF-8 form, so they are
 * replaced with U+FFFD (the replacement character) rather than being emitted
 * as an ill-formed 3-byte sequence.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 bytes, each in the range 0-255.
 */
function utf8Encode(str) {
    const bytes = [];

    // Iterating a string yields whole code points, so a surrogate pair arrives
    // as one element and no manual index skipping is needed.
    for (const char of str) {
        let codePoint = char.codePointAt(0);

        // Only an unpaired surrogate can still be in this range here.
        if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
            codePoint = 0xFFFD;
        }

        if (codePoint < 0x80) {
            // 1 byte: 0xxxxxxx
            bytes.push(codePoint);
        } else if (codePoint < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            bytes.push(
                0xC0 | (codePoint >> 6),
                0x80 | (codePoint & 0x3F),
            );
        } else if (codePoint < 0x10000) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            bytes.push(
                0xE0 | (codePoint >> 12),
                0x80 | ((codePoint >> 6) & 0x3F),
                0x80 | (codePoint & 0x3F),
            );
        } else {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            bytes.push(
                0xF0 | (codePoint >> 18),
                0x80 | ((codePoint >> 12) & 0x3F),
                0x80 | ((codePoint >> 6) & 0x3F),
                0x80 | (codePoint & 0x3F),
            );
        }
    }

    return bytes;
}

UTF-16 编码

JavaScript 字符串对外表现为 UTF-16 代码单元序列(length、charCodeAt 等都以代码单元为单位):

UTF-16 编码规则

  • 基本多语言平面:单个16位代码单元
  • 补充平面:两个16位代码单元(代理对)
// UTF-16 encoding example
const str = "Hello 世界! 🌍";

// Print each character's code point together with ALL of its UTF-16 code
// units. A character outside the Basic Multilingual Plane has two units (a
// surrogate pair), so reading only charCodeAt(0) would hide the low surrogate.
Array.from(str).forEach((char) => {
    const codePoint = char.codePointAt(0);
    const codeUnits = Array.from(
        { length: char.length },
        (_, offset) => `0x${char.charCodeAt(offset).toString(16)}`,
    );
    console.log(`"${char}": 码点=U+${codePoint.toString(16)}, UTF-16=${codeUnits.join(' ')}`);
});

UTF-16 编码算法

/**
 * Serialises a string to UTF-16 bytes in little-endian order (low byte first),
 * without a byte order mark.
 *
 * The string's code units are already the UTF-16 encoding of its code points
 * (a supplementary-plane character is stored as its high/low surrogate pair),
 * so each unit is written out directly as two bytes.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} Two bytes per UTF-16 code unit, each in the range 0-255.
 */
function utf16Encode(str) {
    const output = [];

    for (let index = 0; index < str.length; index += 1) {
        const unit = str.charCodeAt(index);
        const lowByte = unit & 0xFF;
        const highByte = (unit >> 8) & 0xFF;
        output.push(lowByte, highByte);
    }

    return output;
}

TextEncoder/TextDecoder 使用

基本用法

// Create an encoder and a decoder.
const encoder = new TextEncoder(); // defaults to UTF-8
const decoder = new TextDecoder(); // defaults to UTF-8

// Encode a string into bytes.
const str = "Hello 世界! 🌍";
const encoded = encoder.encode(str);
console.log('编码结果:', encoded); // Uint8Array

// Decode the bytes back into a string.
const decoded = decoder.decode(encoded);
console.log('解码结果:', decoded); // "Hello 世界! 🌍"

支持的编码格式

// TextEncoder only supports UTF-8.
console.log(new TextEncoder().encoding); // "utf-8"

// TextDecoder supports multiple encodings.
const encodings = [
    'utf-8',        // default
    'utf-16le',     // little-endian UTF-16
    'utf-16be',     // big-endian UTF-16
    'iso-8859-1',   // Latin-1
    'windows-1251'  // Cyrillic
];

// Probe each label: constructing a TextDecoder is used as the support check,
// and a thrown error is reported as "unsupported". The decoder instance itself
// is not used afterwards.
encodings.forEach(encoding => {
    try {
        const decoder = new TextDecoder(encoding);
        console.log(`支持编码: ${encoding}`);
    } catch (e) {
        console.log(`不支持编码: ${encoding}`);
    }
});

流式处理

/**
 * Encodes a string to UTF-8 in chunks of roughly `chunkSize` UTF-16 code
 * units, logging each chunk's bytes.
 *
 * A chunk boundary is never placed inside a surrogate pair: if a chunk would
 * end on a high surrogate whose low surrogate comes next, the chunk is
 * extended by one code unit. Otherwise each half would be encoded on its own
 * as U+FFFD and the character would be lost.
 *
 * @param {string} str - The string to encode.
 * @param {number} [chunkSize=3] - Target chunk length in UTF-16 code units;
 *   must be a positive integer.
 * @returns {Uint8Array[]} The UTF-8 bytes of each chunk, in order.
 * @throws {RangeError} If `chunkSize` is not a positive integer (a step of
 *   zero or less would never reach the end of the string).
 */
function encodeInChunks(str, chunkSize = 3) {
    if (!Number.isInteger(chunkSize) || chunkSize < 1) {
        throw new RangeError('chunkSize 必须是正整数');
    }

    const encoder = new TextEncoder();
    const chunks = [];
    let start = 0;

    while (start < str.length) {
        let end = Math.min(start + chunkSize, str.length);

        if (end < str.length) {
            const lastUnit = str.charCodeAt(end - 1);
            const nextUnit = str.charCodeAt(end);
            const endsOnHighSurrogate = lastUnit >= 0xD800 && lastUnit <= 0xDBFF;
            const nextIsLowSurrogate = nextUnit >= 0xDC00 && nextUnit <= 0xDFFF;
            if (endsOnHighSurrogate && nextIsLowSurrogate) {
                end += 1; // keep the pair together in this chunk
            }
        }

        const encoded = encoder.encode(str.slice(start, end));
        chunks.push(encoded);
        console.log(`块 ${chunks.length}:`, Array.from(encoded));
        start = end;
    }

    return chunks;
}

/**
 * Decodes a sequence of UTF-8 byte chunks into one string using a single
 * streaming TextDecoder, logging the text produced by each chunk.
 *
 * Every chunk is decoded with `{ stream: true }` so that bytes of a character
 * split across two chunks are held back until the rest arrives. A final
 * argument-less `decode()` call ends the stream and returns anything that was
 * still pending.
 *
 * @param {Uint8Array[]} encodedChunks - Byte chunks in their original order.
 * @returns {string} The concatenated decoded text.
 */
function decodeInChunks(encodedChunks) {
    const decoder = new TextDecoder();

    const streamed = encodedChunks.reduce((text, chunk, index) => {
        const piece = decoder.decode(chunk, { stream: true });
        console.log(`解码块 ${index + 1}:`, piece);
        return text + piece;
    }, '');

    // End of stream: collect whatever the decoder was still holding.
    const tail = decoder.decode();
    if (tail === '') {
        return streamed;
    }

    console.log('最终解码:', tail);
    return streamed + tail;
}

// Usage example: encode a string in chunks of 3, decode the chunks again, and
// print the original next to the decoded result for comparison.
const testStr = "Hello 世界! 🌍";
const chunks = encodeInChunks(testStr, 3);
const decoded = decodeInChunks(chunks);
console.log('原始字符串:', testStr);
console.log('解码结果:', decoded);

错误处理

// Handling an invalid byte sequence
const invalidBytes = new Uint8Array([0xFF, 0xFE, 0xFD]); // not a valid UTF-8 sequence

// Default behavior: invalid bytes are replaced in the output.
const decoder1 = new TextDecoder('utf-8');
console.log('默认处理:', decoder1.decode(invalidBytes)); // ���

// With { fatal: true } the decoder throws instead; the error message is logged.
const decoder2 = new TextDecoder('utf-8', { fatal: true });
try {
    decoder2.decode(invalidBytes);
} catch (e) {
    console.log('解码错误:', e.message);
}

实际应用场景

/**
 * 1. File upload handling: reads an uploaded file and returns its content as
 * text.
 *
 * No encoding detection takes place: the bytes are always decoded as UTF-8 in
 * non-fatal mode, so invalid sequences do not make the call throw.
 *
 * @param {{ arrayBuffer: () => Promise<ArrayBuffer> }} file - Any object with
 *   an `arrayBuffer()` method, such as a File or Blob.
 * @returns {Promise<string>} The decoded file content (also logged).
 */
async function handleFileUpload(file) {
    const bytes = new Uint8Array(await file.arrayBuffer());
    const utf8Decoder = new TextDecoder('utf-8', { fatal: false });
    const content = utf8Decoder.decode(bytes);

    console.log('文件内容:', content);
    return content;
}

/**
 * 2. Network data transfer: downloads a resource and decodes its body as text
 * using the charset declared in the response's Content-Type header.
 *
 * The charset parameter is matched case-insensitively, may be wrapped in
 * double quotes, and may be followed by further parameters (for example
 * `text/plain; charset="utf-8"; format=flowed`). When no charset is declared,
 * UTF-8 is used.
 *
 * @param {string} url - The resource to fetch.
 * @returns {Promise<string>} The decoded response body.
 * @throws {RangeError} If the declared charset is not a label TextDecoder
 *   supports.
 */
async function fetchTextData(url) {
    const response = await fetch(url);
    const arrayBuffer = await response.arrayBuffer();

    // Capture only the charset token itself: stop at a quote, semicolon or
    // whitespace so trailing parameters never end up in the encoding label.
    const contentType = response.headers.get('content-type') ?? '';
    const charsetMatch = /\bcharset\s*=\s*"?([^";\s]+)/i.exec(contentType);
    const encoding = charsetMatch ? charsetMatch[1] : 'utf-8';

    const decoder = new TextDecoder(encoding);
    return decoder.decode(arrayBuffer);
}

/**
 * 3. Binary data processing: decodes UTF-8 bytes and returns the non-blank
 * lines of the text.
 *
 * The text is split on '\n'; each line is trimmed, and lines that are empty
 * after trimming are dropped.
 *
 * @param {ArrayBuffer|ArrayBufferView} binaryData - UTF-8 encoded bytes.
 * @returns {string[]} Trimmed, non-empty lines in their original order.
 */
function processBinaryData(binaryData) {
    const decodedText = new TextDecoder('utf-8').decode(binaryData);
    const keptLines = [];

    for (const rawLine of decodedText.split('\n')) {
        const trimmed = rawLine.trim();
        if (trimmed.length > 0) {
            keptLines.push(trimmed);
        }
    }

    return keptLines;
}

/**
 * 4. Encoding conversion: converts text to UTF-8 bytes.
 *
 * `text` may be either of:
 *  - bytes (ArrayBuffer or a typed array / DataView) encoded in
 *    `fromEncoding`, which are decoded with that encoding first; or
 *  - a JavaScript string, which is already decoded Unicode text and is encoded
 *    directly. Pushing a string through a `fromEncoding` decoder would
 *    reinterpret its UTF-8 bytes in the wrong encoding and garble it.
 *
 * TextEncoder can only produce UTF-8, so any other target is rejected; a
 * third-party library is needed for other output encodings.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} text - Source text or bytes.
 * @param {string} fromEncoding - Encoding label of the source bytes.
 * @param {string} toEncoding - Target encoding; only 'utf-8' / 'utf8'
 *   (case-insensitive) is accepted.
 * @returns {Uint8Array} The text encoded as UTF-8.
 * @throws {RangeError} If `fromEncoding` is not a label TextDecoder supports.
 * @throws {Error} If `toEncoding` is not UTF-8.
 */
function convertEncoding(text, fromEncoding, toEncoding) {
    // Constructed up front so an unsupported source label is always reported,
    // even when the input is a string and the decoder is not used.
    const decoder = new TextDecoder(fromEncoding);

    const target = String(toEncoding).trim().toLowerCase();
    if (target !== 'utf-8' && target !== 'utf8') {
        throw new Error('TextEncoder只支持UTF-8编码');
    }

    const decodedText = typeof text === 'string' ? text : decoder.decode(text);
    return new TextEncoder().encode(decodedText);
}