大文件上传:分片上传 + 断点续传 + Worker 线程计算 Hash,崩溃率从 15% 降至 1%

阅读时长:1 分钟

一、背景与问题

在业务中,我们经常需要处理用户上传大文件的场景,比如:上传文档、报表、设计稿(几十 MB 到几 GB);上传视频、音频等媒体文件;上传压缩包、备份文件。直接用 <input type="file"> 上传大文件会遇到以下问题:

1. 内存溢出:整个文件读入内存,浏览器容易崩溃。
2. 网络波动:上传到一半断网或超时,整个文件需要重传。
3. 计算阻塞:计算文件 Hash 时主线程被阻塞,页面假死。
4. 体验差:用户看到进度条走到 99% 然后失败,只能从头再来。

我们的业务曾因此收到大量用户反馈,崩溃率高达 15%。经过调研和实践,我们总结出一套基于分片上传 + 断点续传 + Worker 并行计算 Hash 的方案。

二、方案概览

整体思路是将大文件切分成多个小块(分片),逐个上传,支持断点续传,并用 Web Worker 并行计算文件 Hash 用于去重和秒传。

核心技术点如下表所示:

| 技术点 | 作用 | 实现方式 |
| --- | --- | --- |
| 分片上传 | 避免内存溢出,支持并发 | File.slice() 切块 |
| 断点续传 | 网络波动后只传未完成分片 | 本地存储已上传分片索引 |
| Worker 计算 Hash | 不阻塞主线程 | Web Worker + SparkMD5 |
| 秒传 | 相同文件不再上传 | 对比服务端 Hash |

三、实现步骤

1. 分片上传核心逻辑

// Upload configuration
const CHUNK_SIZE = 2 * 1024 * 1024; // 2MB per chunk

class Uploader {
  /**
   * Chunked file uploader.
   *
   * @param {File} file - file to upload in chunks
   * @param {Object} [options]
   * @param {number} [options.chunkSize=CHUNK_SIZE] - bytes per chunk
   * @param {number} [options.concurrency=3] - max parallel chunk uploads
   */
  constructor(file, options = {}) {
    this.file = file;
    this.chunkSize = options.chunkSize || CHUNK_SIZE;
    this.uploadedChunks = new Set(); // indexes of chunks already uploaded
    this.concurrency = options.concurrency || 3;
  }

  /**
   * Number of chunks the file splits into. O(1) — fixes the original,
   * which called getChunks().length (rebuilding every chunk descriptor)
   * on each individual chunk upload.
   * @returns {number}
   */
  get totalChunks() {
    return Math.ceil(this.file.size / this.chunkSize);
  }

  /**
   * Split the file into chunk descriptors. File.slice() is lazy, so no
   * file data is copied here — only metadata objects are allocated.
   * @returns {{index:number,start:number,end:number,chunk:Blob,filename:string}[]}
   */
  getChunks() {
    const chunks = [];
    let index = 0;
    for (let start = 0; start < this.file.size; start += this.chunkSize) {
      const end = Math.min(start + this.chunkSize, this.file.size);
      chunks.push({
        index,
        start,
        end,
        chunk: this.file.slice(start, end),
        filename: `${this.file.name}_${index}`
      });
      index++;
    }
    return chunks;
  }

  /**
   * Upload a single chunk as a multipart POST.
   * @param {{index:number,chunk:Blob,filename:string}} chunk
   * @returns {Promise<Object>} parsed JSON response from the server
   * @throws {Error} when the server responds with a non-2xx status
   */
  async uploadChunk(chunk) {
    const formData = new FormData();
    formData.append('chunk', chunk.chunk);
    formData.append('filename', chunk.filename);
    formData.append('index', chunk.index);
    formData.append('totalChunks', this.totalChunks); // O(1), was getChunks().length

    const response = await fetch('/api/upload/chunk', {
      method: 'POST',
      body: formData
    });

    if (!response.ok) {
      throw new Error(`分片 ${chunk.index} 上传失败`);
    }

    return response.json();
  }
}

2. 断点续传:记录上传状态

class Uploader {
  constructor(file, options = {}) {
    // ...
    // Key is per (name, size), so re-selecting the same file resumes it.
    this.storageKey = `upload_${this.file.name}_${this.file.size}`;
    this.uploadedChunks = this.loadUploadedChunks();
  }

  /**
   * Restore the set of already-uploaded chunk indexes from localStorage.
   * Falls back to an empty set when storage is unavailable or corrupt.
   * @returns {Set<number>}
   */
  loadUploadedChunks() {
    try {
      const saved = localStorage.getItem(this.storageKey);
      return saved ? new Set(JSON.parse(saved)) : new Set();
    } catch {
      return new Set();
    }
  }

  /**
   * Mark a chunk as uploaded and persist the set. Persistence is
   * best-effort: a quota / private-mode failure must not abort an
   * otherwise successful chunk upload (the original threw here).
   * @param {number} index - chunk index that finished uploading
   */
  saveUploadedChunk(index) {
    this.uploadedChunks.add(index);
    try {
      localStorage.setItem(this.storageKey, JSON.stringify([...this.uploadedChunks]));
    } catch {
      // Storage full or unavailable — resume data is lost, but the
      // upload itself proceeds normally.
    }
  }

  /** Drop persisted resume state (call after a successful upload). */
  clearUploadState() {
    try {
      localStorage.removeItem(this.storageKey);
    } catch {
      // Ignore — stale state simply remains until storage is writable.
    }
  }

  /**
   * Upload all chunks, skipping ones already recorded, using a small
   * worker pool for bounded concurrency.
   * @returns {Promise<{index:number,success:boolean,error?:string}[]>}
   */
  async start() {
    const chunks = this.getChunks();
    const pendingChunks = chunks.filter((c) => !this.uploadedChunks.has(c.index));

    // Shared queue: each async worker pulls the next pending chunk.
    const queue = [...pendingChunks];
    const results = [];

    const worker = async () => {
      while (queue.length > 0) {
        const chunk = queue.shift();
        try {
          await this.uploadChunk(chunk);
          this.saveUploadedChunk(chunk.index);
          results.push({ index: chunk.index, success: true });
        } catch (error) {
          results.push({ index: chunk.index, success: false, error: error.message });
        }
      }
    };

    // Spin up at most `concurrency` workers, never more than pending chunks.
    const workers = Array(Math.min(this.concurrency, pendingChunks.length))
      .fill(null)
      .map(() => worker());

    await Promise.all(workers);
    return results;
  }
}

3. Worker 线程计算文件 Hash

计算大文件的 Hash 是耗时操作,必须放在 Worker 中,否则会阻塞主线程导致页面卡死。

// worker.js
importScripts('https://cdn.jsdelivr.net/npm/spark-md5@3.0.2/spark-md5.min.js');

self.onmessage = function (e) {
  const { file, chunkSize } = e.data;
  const hasher = new SparkMD5.ArrayBuffer();
  let cursor = 0;

  // Feed the file to SparkMD5 one slice at a time so memory stays bounded
  // regardless of file size; each FileReader load schedules the next read.
  const processNext = () => {
    if (cursor >= file.size) {
      // All slices consumed — emit the final digest.
      self.postMessage({ type: 'complete', hash: hasher.end() });
      return;
    }

    const reader = new FileReader();

    reader.onload = (loadEvent) => {
      hasher.append(loadEvent.target.result);
      cursor += chunkSize;

      // Report progress; cursor can overshoot file.size, so clamp to 100.
      self.postMessage({
        type: 'progress',
        progress: Math.min((cursor / file.size) * 100, 100)
      });

      processNext();
    };

    reader.onerror = () => {
      self.postMessage({ type: 'error', error: '文件读取失败' });
    };

    reader.readAsArrayBuffer(file.slice(cursor, cursor + chunkSize));
  };

  processNext();
};

// 主线程调用
/**
 * Compute the MD5 hash of a File inside a dedicated Web Worker so the
 * main thread never blocks.
 *
 * @param {File} file - file to hash
 * @param {number} [chunkSize=CHUNK_SIZE] - bytes read per iteration in the worker
 * @param {(progress: number) => void} [onProgress] - optional progress callback
 *   receiving a 0-100 percentage; when omitted, falls back to console.log
 *   (the original hard-coded behavior), so existing callers are unaffected
 * @returns {Promise<string>} resolves with the hex MD5 digest
 */
function calculateFileHash(file, chunkSize = CHUNK_SIZE, onProgress) {
  return new Promise((resolve, reject) => {
    const worker = new Worker('worker.js');

    // Single exit helper: guarantees the worker thread is released on
    // every terminal path (complete, error message, worker crash).
    const finish = (settle, value) => {
      worker.terminate();
      settle(value);
    };

    worker.postMessage({ file, chunkSize });

    worker.onmessage = (e) => {
      const { type, hash, progress, error } = e.data;

      if (type === 'complete') {
        finish(resolve, hash);
      } else if (type === 'progress') {
        if (onProgress) {
          onProgress(progress);
        } else {
          console.log(`Hash 计算进度: ${progress.toFixed(2)}%`);
        }
      } else if (type === 'error') {
        finish(reject, new Error(error));
      }
    };

    worker.onerror = (error) => finish(reject, error);
  });
}

// 使用
const file = document.querySelector('input[type="file"]').files[0];
calculateFileHash(file)
  .then((hash) => {
    console.log('文件 Hash:', hash);
    // 可以用这个 Hash 做秒传判断
  })
  .catch((error) => {
    // Fix: the original promise had no rejection handler, so a worker or
    // read failure became an unhandled rejection.
    console.error('Hash 计算失败:', error);
  });

4. 完整上传组件

// Uploader.js
/**
 * Chunked, resumable file uploader.
 *
 * Resume state (uploaded chunk indexes) is persisted in localStorage
 * keyed by (file name, file size), so re-selecting the same file after a
 * network failure only uploads the missing chunks.
 */
export class Uploader {
  /**
   * @param {File} file - file to upload
   * @param {Object} [options]
   * @param {number} [options.chunkSize=2*1024*1024] - bytes per chunk
   * @param {number} [options.concurrency=3] - max parallel chunk uploads
   * @param {({loaded:number,total:number,percentage:number}) => void} [options.onProgress]
   * @param {({success:boolean}) => void} [options.onComplete]
   * @param {(error: Error) => void} [options.onError]
   */
  constructor(file, options = {}) {
    this.file = file;
    this.chunkSize = options.chunkSize || 2 * 1024 * 1024;
    this.concurrency = options.concurrency || 3;
    this.storageKey = `upload_${file.name}_${file.size}`;
    this.uploadedChunks = this.loadUploadedChunks();
    this.onProgress = options.onProgress || (() => {});
    this.onComplete = options.onComplete || (() => {});
    this.onError = options.onError || (() => {});
  }

  /**
   * Number of chunks for this file. O(1) — fixes the original, which
   * re-ran getChunks() (rebuilding every descriptor) per chunk upload
   * and again in mergeFile.
   * @returns {number}
   */
  get totalChunks() {
    return Math.ceil(this.file.size / this.chunkSize);
  }

  /**
   * Restore uploaded-chunk indexes from localStorage; empty set when
   * storage is unavailable or the payload is corrupt.
   * @returns {Set<number>}
   */
  loadUploadedChunks() {
    try {
      const saved = localStorage.getItem(this.storageKey);
      return saved ? new Set(JSON.parse(saved)) : new Set();
    } catch {
      return new Set();
    }
  }

  /**
   * Record a finished chunk and persist the set. Best-effort: a storage
   * quota / private-mode failure must not abort the upload.
   * @param {number} index
   */
  saveUploadedChunk(index) {
    this.uploadedChunks.add(index);
    try {
      localStorage.setItem(this.storageKey, JSON.stringify([...this.uploadedChunks]));
    } catch {
      // Resume data lost, upload continues.
    }
  }

  /** Remove persisted resume state (after a successful merge). */
  clearUploadState() {
    try {
      localStorage.removeItem(this.storageKey);
    } catch {
      // Ignore — stale state stays until storage is writable.
    }
  }

  /**
   * Split the file into chunk descriptors (File.slice is lazy — no data
   * is copied here).
   * @returns {{index:number,start:number,end:number,chunk:Blob}[]}
   */
  getChunks() {
    const chunks = [];
    let index = 0;
    for (let start = 0; start < this.file.size; start += this.chunkSize) {
      const end = Math.min(start + this.chunkSize, this.file.size);
      chunks.push({ index, start, end, chunk: this.file.slice(start, end) });
      index++;
    }
    return chunks;
  }

  /**
   * Upload one chunk as a multipart POST.
   * @param {{index:number,chunk:Blob}} chunk
   * @returns {Promise<Object>} parsed JSON response
   * @throws {Error} on a non-2xx response
   */
  async uploadChunk(chunk) {
    const formData = new FormData();
    formData.append('chunk', chunk.chunk);
    formData.append('filename', this.file.name);
    formData.append('index', chunk.index);
    formData.append('totalChunks', this.totalChunks);
    formData.append('fileHash', this.fileHash); // 文件整体 Hash

    const response = await fetch('/api/upload/chunk', {
      method: 'POST',
      body: formData
    });

    if (!response.ok) {
      throw new Error(`分片 ${chunk.index} 上传失败`);
    }

    return response.json();
  }

  /**
   * Upload all pending chunks with bounded concurrency, then ask the
   * server to merge. Unlike the original, merge is only requested when
   * EVERY chunk succeeded — previously a partial failure still triggered
   * a merge and wiped the resume state, forcing a full re-upload.
   *
   * @param {string} fileHash - whole-file hash (used for dedupe/merge)
   */
  async start(fileHash) {
    this.fileHash = fileHash;
    const chunks = this.getChunks();
    const total = chunks.length;
    let uploaded = this.uploadedChunks.size;

    const report = () => {
      this.onProgress({
        loaded: uploaded,
        total,
        // Guard: a zero-byte file has no chunks; avoid NaN from 0/0.
        percentage: total === 0 ? 100 : (uploaded / total) * 100
      });
    };
    report(); // initial progress (may be > 0 when resuming)

    const queue = chunks.filter((c) => !this.uploadedChunks.has(c.index));

    const worker = async () => {
      while (queue.length > 0) {
        const chunk = queue.shift();
        try {
          await this.uploadChunk(chunk);
          this.saveUploadedChunk(chunk.index);
          uploaded++;
          report();
        } catch (error) {
          this.onError(error);
        }
      }
    };

    const workers = Array(Math.min(this.concurrency, queue.length))
      .fill(null)
      .map(() => worker());

    await Promise.allSettled(workers);

    // Do not merge (or clear resume state) unless every chunk arrived.
    if (this.uploadedChunks.size < total) {
      this.onError(new Error('部分分片上传失败,已保留断点,可重试'));
      return;
    }

    try {
      // 通知服务端合并文件
      await this.mergeFile();
    } catch (error) {
      // Route merge failures through onError instead of rejecting start()
      // (callers invoke start() without awaiting it).
      this.onError(error);
      return;
    }

    this.clearUploadState();
    this.onComplete({ success: true });
  }

  /**
   * Ask the server to concatenate all uploaded chunks into the final file.
   * @returns {Promise<Object>} parsed JSON response
   * @throws {Error} on a non-2xx response
   */
  async mergeFile() {
    const response = await fetch('/api/upload/merge', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        filename: this.file.name,
        fileHash: this.fileHash,
        totalChunks: this.totalChunks
      })
    });

    if (!response.ok) {
      throw new Error('文件合并失败');
    }

    return response.json();
  }
}

// React 组件中使用
import { useState, useCallback } from 'react';
import { Uploader } from './Uploader';

function FileUpload() {
  const [progress, setProgress] = useState(0);
  const [uploading, setUploading] = useState(false);

  const handleUpload = useCallback(async (e) => {
    const file = e.target.files[0];
    if (!file) return;

    setUploading(true);
    setProgress(0);

    // 先计算文件 Hash
    const hash = await calculateFileHash(file);

    const uploader = new Uploader(file, {
      chunkSize: 2 * 1024 * 1024,
      concurrency: 3,
      onProgress: ({ percentage }) => setProgress(percentage),
      onComplete: () => {
        setUploading(false);
        alert('上传成功!');
      },
      onError: (error) => {
        setUploading(false);
        alert(`上传失败: ${error.message}`);
      }
    });

    uploader.start(hash);
  }, []);

  return (
    <div>
      <input type="file" onChange={handleUpload} disabled={uploading} />
      {uploading && <progress value={progress} max="100">{progress.toFixed(1)}%</progress>}
    </div>
  );
}

5. 服务端合并(Node.js 示例)

// server.js (Express)
const express = require('express');
const fs = require('fs');
const path = require('path');
const multer = require('multer');
const cors = require('cors');

const app = express();
app.use(cors());

// Strip any directory components from client-supplied names to prevent
// path traversal (e.g. filename = "../../etc/passwd"). Both `filename`
// and `fileHash` come from untrusted request bodies.
const sanitize = (name) => path.basename(String(name));

// 上传分片 — park one chunk under uploads/chunks/<fileHash>/<index>
const upload = multer({ dest: 'uploads/chunks/' });
app.post('/api/upload/chunk', upload.single('chunk'), (req, res) => {
  const { index, fileHash } = req.body;

  // Validate the index: it names a file on disk and drives the merge loop.
  const chunkIndex = Number.parseInt(index, 10);
  if (!Number.isInteger(chunkIndex) || chunkIndex < 0) {
    return res.status(400).json({ error: '非法分片索引' });
  }

  const chunkDir = `uploads/chunks/${sanitize(fileHash)}`;

  // 确保目录存在
  if (!fs.existsSync(chunkDir)) {
    fs.mkdirSync(chunkDir, { recursive: true });
  }

  // Move multer's temp file to its final slot, named by chunk index.
  fs.renameSync(req.file.path, `${chunkDir}/${chunkIndex}`);

  res.json({ success: true, index: chunkIndex });
});

// 合并分片 — concatenate chunks 0..totalChunks-1 into the final file
app.post('/api/upload/merge', express.json(), async (req, res) => {
  const { filename, fileHash, totalChunks } = req.body;
  const chunkDir = `uploads/chunks/${sanitize(fileHash)}`;
  const uploadDir = 'uploads/files';
  const safeName = sanitize(filename);

  if (!fs.existsSync(uploadDir)) {
    fs.mkdirSync(uploadDir, { recursive: true });
  }

  const destPath = path.join(uploadDir, safeName);
  const writeStream = fs.createWriteStream(destPath);

  try {
    // 按顺序读取并写入 (order matters: chunk i must precede chunk i+1)
    for (let i = 0; i < totalChunks; i++) {
      const chunkPath = `${chunkDir}/${i}`;
      if (!fs.existsSync(chunkPath)) {
        writeStream.destroy();
        return res.status(400).json({ error: `分片 ${i} 不存在` });
      }

      writeStream.write(fs.readFileSync(chunkPath));
      fs.unlinkSync(chunkPath); // 删除已合并的分片
    }

    // Fix: wait until buffered data is flushed to disk before replying —
    // the original called end() and responded while writes could still
    // be pending (a crash would leave a truncated file reported as ok).
    await new Promise((resolve, reject) => {
      writeStream.on('error', reject);
      writeStream.end(resolve);
    });

    // 清理分片目录 (empty by now — every chunk was unlinked above)
    if (fs.existsSync(chunkDir)) {
      fs.rmdirSync(chunkDir);
    }

    // Fix: the original emitted a garbled template `/files/$(unknown)`.
    res.json({ success: true, url: `/files/${safeName}` });
  } catch (err) {
    res.status(500).json({ error: '文件合并失败' });
  }
});

app.listen(3000, () => console.log('Server running on port 3000'));

四、效果对比

| 指标 | 优化前 | 优化后 | 提升 |
| --- | --- | --- | --- |
| 崩溃率 | 15% | 1% | 93%↓ |
| 平均上传时间(1GB) | 30分钟(单次失败需重传) | 5分钟(断点续传) | 83%↓ |
| 主线程阻塞 | 明显卡顿 | 无感知 | 完全解决 |
| 用户体验 | 99% 失败需从头再来 | 自动续传,实时进度 | 质变 |

五、注意事项

1. 分片大小选择:2MB 是比较均衡的选择,太小会增加请求次数,太大会影响断点续传的精度。
2. 并发数控制:建议 3-5 个并发,过多会增加服务器压力。
3. Hash 计算优化:对于超大文件,可以只计算首尾分片的 Hash 做快速校验。
4. 服务端限流:防止恶意用户同时上传大量大文件。
5. 清理机制:定期清理未完成的上传分片,避免磁盘占满。

六、总结

通过分片上传 + 断点续传 + Worker 并行计算 Hash 的方案,我们成功将大文件上传的崩溃率从 15% 降至 1%,用户体验得到质的提升。核心要点:分片:避免内存溢出,支持并发上传。断点续传:网络波动后只传未完成分片。Worker 计算 Hash:不阻塞主线程。本地持久化:用 localStorage 记录上传状态。如果你也在处理大文件上传场景,不妨试试这套方案。

参考资料