// Node 自动化提取指定格式文件
//
// 49 阅读 · 1 分钟

//!/usr/bin/env node

const fs = require('fs');

const path = require('path');

const axios = require('axios');

const glob = require('glob');

const mkdirp = require('mkdirp');

const { URL } = require('url');

const mime = require('mime-types');

const crypto = require('crypto');

// Directory the script is launched from; everything is scanned relative to it.

const currentDir = process.cwd();

// Base name of the working directory; used by generatePrefix() as the leading
// part of every generated file name.

const currentDirName = path.basename(currentDir);

// File name of this script, passed to findFiles() as an ignore entry so the
// script does not scan itself.

const scriptFileName = path.basename(__filename);

// Text-like files that are scanned for remote URLs and inline <svg> markup.

const urlFileExtensions = ['.html', '.htm', '.md', '.markdown', '.ts', '.js', '.tsx', '.less'];

// Local image files that are copied into the output directory.

const imageFileExtensions = ['.svg', '.png', '.jpg', '.jpeg', '.gif'];

// Output location: <cwd>/downloaded_resources/img. The top-level directory name
// is also used by findFiles() to exclude previously generated output.

const outputDirName = 'downloaded_resources';

const imgDirName = 'img';

const outputDirPath = path.join(currentDir, outputDirName, imgDirName);

// Create the output directory (and any missing parents) at load time, before
// any file is written into it.

mkdirp.sync(outputDirPath);

// Matches absolute http(s) URLs hosted on mdn.alipayobjects.com.
// - Slashes and the host's dots are escaped so the literal parses and only the
//   exact host name matches.
// - The URL ends at whitespace, quotes, angle brackets, a closing parenthesis
//   or a backtick, so Markdown links `![](url)`, CSS `url(...)` and template
//   strings do not leak their delimiter into the captured URL.
const domainPattern = /https?:\/\/mdn\.alipayobjects\.com[^"\s'<>)`]+/g;

// Matches inline <svg>…</svg> elements, case-insensitively and across lines.
// The lazy quantifier stops at the first closing tag, so sibling SVGs are
// returned as separate matches.
const svgPattern = /<svg[\s\S]*?<\/svg>/gi;

/**
 * Replace characters that are unsafe in a file name.
 *
 * Every character outside `[a-zA-Z0-9._*-]` is replaced with `_` rather than
 * removed, so path separators stay visible as boundaries and distinct inputs
 * such as `a/bc` and `ab/c` no longer collapse to the same name. Existing
 * underscores are preserved.
 *
 * NOTE(review): `*` is kept because the original character class allowed it;
 * it is not a valid file-name character on Windows — confirm whether it
 * should be replaced as well.
 *
 * @param {string} unsafePath - Path fragment or file-name stem to clean.
 * @returns {string} The sanitized string (same length as the input).
 */
function sanitizePath(unsafePath) {
  return unsafePath.replace(/[^a-zA-Z0-9._*-]/g, '_');
}

/**
 * Build the file-name prefix `<currentDirName>__<subPath>`, with both parts
 * passed through sanitizePath().
 *
 * @param {string} subPath - Path fragment identifying where the resource came from.
 * @returns {string} The sanitized prefix.
 */
function generatePrefix(subPath) {
  // The scraped source had lost the backticks around this template literal.
  return `${sanitizePath(currentDirName)}__${sanitizePath(subPath)}`;
}

/**
 * Short fingerprint of a string: the first 8 hex characters of its MD5 digest.
 *
 * @param {string} str - Text to fingerprint.
 * @returns {string} Eight lowercase hexadecimal characters.
 */
function generateHash(str) {
  const md5 = crypto.createHash('md5');
  md5.update(str);
  const hexDigest = md5.digest('hex');
  return hexDigest.slice(0, 8);
}

/**
 * Report whether a path can be accessed (existence check only, F_OK).
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when the access check succeeds, `false`
 *   on any failure.
 */
async function fileExists(filePath) {
  // The async wrapper turns any failure of the probe, however it is raised,
  // into a rejection, which is then mapped to `false`.
  const probe = async () => fs.promises.access(filePath, fs.constants.F_OK);
  return probe().then(
    () => true,
    () => false,
  );
}

/**
 * Download one remote resource into `outputDir`.
 *
 * The local file name is `<prefix>.<ext>`, where the prefix comes from
 * generatePrefix() applied to the second-to-last segment of the URL path and
 * the extension is derived from the response's Content-Type (falling back to
 * `bin`). URLs whose path has fewer than two segments are skipped, and an
 * already existing target file is left untouched.
 *
 * Any failure inside the attempt triggers a retry; after the last retry the
 * failure is logged together with its reason. The function never rejects.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {string} outputDir - Directory the file is written into.
 * @param {number} [retries=3] - Additional attempts allowed after a failure.
 * @returns {Promise<void>}
 */
async function downloadResource(url, outputDir, retries = 3) {
  try {
    const response = await axios.get(url, { responseType: 'arraybuffer', timeout: 15000 });

    const parsedUrl = new URL(url);
    // Strip leading slashes so the first split segment is not empty.
    const relativePath = decodeURIComponent(parsedUrl.pathname.replace(/^\/+/, ''));
    const pathSegments = relativePath.split('/');
    if (pathSegments.length < 2) return;

    // The name is taken from the second-to-last segment — presumably because
    // the targeted URLs end in a generic last segment; TODO confirm.
    const fileNameBase = sanitizePath(pathSegments[pathSegments.length - 2]);
    const ext = mime.extension(response.headers['content-type']) || 'bin';
    const prefix = generatePrefix(fileNameBase);
    const fileName = `${prefix}.${ext}`;
    const localPath = path.join(outputDir, fileName);

    if (await fileExists(localPath)) return;

    await mkdirp(path.dirname(localPath));
    await fs.promises.writeFile(localPath, response.data);
    console.log(`✅ 下载: ${fileName}`);
  } catch (err) {
    if (retries > 0) {
      await downloadResource(url, outputDir, retries - 1);
    } else {
      // Surface the reason instead of discarding it.
      console.error(`❌ 下载失败: ${url}`, err instanceof Error ? err.message : err);
    }
  }
}

/**
 * Copy a local image into `outputDir` under a flattened, prefixed name.
 *
 * The target name is `<prefix>_<baseName><ext>`, where the prefix is built by
 * generatePrefix() from the file's directory relative to the working
 * directory. An already existing target is left untouched.
 *
 * @param {string} filePath - Path of the image to copy.
 * @param {string} outputDir - Directory the copy is written into.
 * @returns {Promise<void>}
 */
async function copyImageFile(filePath, outputDir) {
  const ext = path.extname(filePath);
  const baseName = sanitizePath(path.basename(filePath, ext));
  const subPath = path.relative(currentDir, path.dirname(filePath));
  const prefix = generatePrefix(subPath);
  // The scraped source had lost the backticks around these template literals.
  const fileName = `${prefix}_${baseName}${ext}`;
  const localPath = path.join(outputDir, fileName);

  if (await fileExists(localPath)) return;

  await mkdirp(path.dirname(localPath));
  await fs.promises.copyFile(filePath, localPath);
  console.log(`✅ 复制: ${fileName}`);
}

/**
 * Read a file as UTF-8 text and collect every inline SVG element in it.
 *
 * @param {string} filePath - File to scan.
 * @returns {Promise<string[]>} The matched `<svg>…</svg>` snippets; an empty
 *   array when nothing matches or the file cannot be processed.
 */
async function extractSVGs(filePath) {
  let snippets = [];
  try {
    const text = await fs.promises.readFile(filePath, 'utf-8');
    const found = text.match(svgPattern);
    if (found) {
      snippets = found;
    }
  } catch {
    // Best effort: any failure yields an empty result.
    snippets = [];
  }
  return snippets;
}

/**
 * Extract every inline SVG from a file and write each one to `outputDir`.
 *
 * Files are named `<prefix>_<baseName>_svg<N>.svg`, with N starting at 1 in
 * match order. Targets that already exist are not overwritten.
 *
 * @param {string} filePath - File to scan for inline SVG markup.
 * @param {string} outputDir - Directory the extracted SVG files are written into.
 * @returns {Promise<number>} Number of SVG snippets found in the file,
 *   including any that were skipped because the target already existed.
 */
async function extractAndSaveSVGs(filePath, outputDir) {
  const svgs = await extractSVGs(filePath);
  const subPath = path.relative(currentDir, path.dirname(filePath));
  const baseName = sanitizePath(path.basename(filePath, path.extname(filePath)));
  const prefix = generatePrefix(subPath);

  for (let i = 0; i < svgs.length; i++) {
    const svgContent = svgs[i];
    // The scraped source had lost the backticks around these template literals.
    const fileName = `${prefix}_${baseName}_svg${i + 1}.svg`;
    const localPath = path.join(outputDir, fileName);

    if (!(await fileExists(localPath))) {
      await fs.promises.writeFile(localPath, svgContent, 'utf-8');
      console.log(`✅ 提取 SVG: ${fileName}`);
    }
  }

  return svgs.length;
}

/**
 * Read a file as UTF-8 text and collect the distinct URLs matched by
 * `domainPattern`.
 *
 * @param {string} filePath - File to scan.
 * @returns {Promise<Set<string>>} Unique matched URLs; an empty set when
 *   nothing matches or the file cannot be processed.
 */
async function extractUrls(filePath) {
  let matches;
  try {
    const text = await fs.promises.readFile(filePath, 'utf-8');
    matches = text.match(domainPattern) || [];
  } catch {
    // Best effort: any failure yields an empty result.
    matches = [];
  }
  return new Set(matches);
}

/**
 * Recursively list files under `sourceDir` whose extension is in `extensions`.
 *
 * The output directory (`<outputDirName>/**`) is always excluded so that
 * previously generated files are not picked up again.
 *
 * @param {string} sourceDir - Directory to search (used as the glob `cwd`).
 * @param {string[]} extensions - Extensions with a leading dot, e.g. `['.md', '.ts']`.
 * @param {string[]} [ignoreFiles=[]] - Additional glob ignore patterns.
 * @returns {Promise<string[]>} Absolute paths of the matching files; rejects
 *   with the error reported by glob.
 */
function findFiles(sourceDir, extensions, ignoreFiles = []) {
  return new Promise((resolve, reject) => {
    // Builds an extglob such as `**/*.+(html|md|ts)`; the leading dot of each
    // extension is dropped because the pattern already supplies it.
    const alternatives = extensions.map((ext) => ext.replace('.', '')).join('|');
    const pattern = `**/*.+(${alternatives})`;

    glob(
      pattern,
      {
        cwd: sourceDir,
        absolute: true,
        ignore: [`${outputDirName}/**`, ...ignoreFiles],
      },
      (err, files) => (err ? reject(err) : resolve(files)),
    );
  });
}

/**
 * Run the whole extraction: scan text files for remote URLs and inline SVGs,
 * copy local images, download the collected URLs, then print a summary.
 *
 * @param {string} sourceDir - Directory to scan.
 * @param {string} outputDir - Directory all resources are written into.
 * @returns {Promise<void>}
 */
async function main(sourceDir, outputDir) {
  // The scraped source had lost the backticks around the template literals
  // used in the log lines below.
  console.log(`📁 当前目录: ${currentDirName}`);

  const urlFiles = await findFiles(sourceDir, urlFileExtensions, [scriptFileName]);
  const imageFiles = await findFiles(sourceDir, imageFileExtensions, [scriptFileName]);

  // URLs are de-duplicated across all files so each one is fetched once.
  const uniqueUrls = new Set();
  let downloadCount = 0;
  let copyCount = 0;
  let svgCount = 0;

  for (const file of urlFiles) {
    const urls = await extractUrls(file);
    urls.forEach((url) => uniqueUrls.add(url));
    svgCount += await extractAndSaveSVGs(file, outputDir);
  }

  for (const img of imageFiles) {
    await copyImageFile(img, outputDir);
    // Counts processed images; copyImageFile does not report whether it
    // actually copied or skipped an existing target.
    copyCount++;
    // Image files are also scanned for <svg> markup.
    svgCount += await extractAndSaveSVGs(img, outputDir);
  }

  for (const url of uniqueUrls) {
    await downloadResource(url, outputDir);
    // Counts attempted URLs; downloadResource handles its own failures and
    // does not report the outcome.
    downloadCount++;
  }

  console.log('\n✅ 全部完成');
  console.log(`📦 下载资源数: ${downloadCount}`);
  console.log(`🖼️ 本地复制数: ${copyCount}`);
  console.log(`🧬 内嵌 SVG 数: ${svgCount}`);
}

// Entry point. The returned promise is handled explicitly so that an
// unexpected failure is reported and reflected in the exit code instead of
// surfacing as an unhandled rejection.
main(currentDir, outputDirPath).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});