Scala中的文件读写-全文单词统计(二)

阅读量 26 · 预计阅读时长 4 分钟

1. 文件读取的多种方式

1.1 使用Scala原生Source类

import scala.io.Source
import java.nio.charset.{Charset, StandardCharsets}

object FileReadingExamples {

  // 1.1.1 Read the whole file into one String; the Source is always closed.
  def readEntireFile(filePath: String): String = {
    val source = Source.fromFile(filePath)
    try source.mkString
    finally source.close()
  }

  // 1.1.2 Lazy line iterator.
  // NOTE(review): the underlying Source is never closed here and the caller
  // cannot close it either, since only the Iterator is exposed. Prefer
  // processLines below (or scala.util.Using) when the handle must be released.
  def readLinesLazily(filePath: String): Iterator[String] =
    Source.fromFile(filePath).getLines()

  // 1.1.3 Read all lines, trim whitespace, and drop blank lines.
  def processLines(filePath: String): List[String] = {
    val source = Source.fromFile(filePath)
    try {
      source.getLines()
        .map(_.trim)        // strip leading/trailing whitespace
        .filter(_.nonEmpty) // drop blank lines
        .toList
    } finally source.close()
  }

  // 1.1.4 Read with an explicit character encoding (default UTF-8).
  // The implicit Codec is picked up by Source.fromFile.
  def readWithEncoding(filePath: String, encoding: String = "UTF-8"): String = {
    implicit val codec: scala.io.Codec = scala.io.Codec(encoding)
    val source = Source.fromFile(filePath)
    try source.mkString
    finally source.close()
  }

  // 1.1.5 Batched reading for large files.
  // Fixed: the hand-rolled Iterator with two mutable vars is replaced by the
  // standard `grouped`, which produces the same batches in the same order.
  // NOTE(review): as in 1.1.2, the Source is not closed if iteration stops early.
  def readInBatches(filePath: String, batchSize: Int = 1000): Iterator[List[String]] =
    Source.fromFile(filePath).getLines().grouped(batchSize).map(_.toList)
}

1.2 使用Java NIO(更高效的方式)

import java.nio.file.{Files, Paths, StandardOpenOption}
import java.nio.charset.StandardCharsets
import scala.collection.JavaConverters._

object JavaNIOExamples {

  // 2.1 Read every line eagerly (whole file in memory), decoded as UTF-8.
  def readAllLines(filePath: String): List[String] = {
    Files.readAllLines(Paths.get(filePath), StandardCharsets.UTF_8)
      .asScala
      .toList
  }

  // 2.2 Lazy line iterator over a file.
  // Fixed: Files.lines returns a Stream that keeps the file open; the original
  // never closed it, leaking a file descriptor. The stream is now closed once
  // the iterator is fully consumed. (Callers that abandon the iterator early
  // still leak; use readAllLines or scala.util.Using in that case.)
  def readLinesStream(filePath: String): Iterator[String] = {
    val stream = Files.lines(Paths.get(filePath), StandardCharsets.UTF_8)
    val underlying = stream.iterator().asScala
    new Iterator[String] {
      def hasNext: Boolean = {
        val more = underlying.hasNext
        if (!more) stream.close() // release the file handle at exhaustion
        more
      }
      def next(): String = underlying.next()
    }
  }

  // 2.3 Raw bytes, for binary files.
  def readBytes(filePath: String): Array[Byte] =
    Files.readAllBytes(Paths.get(filePath))
}

1.3 使用scala.util.Using(资源管理)

import scala.util.{Try, Using}
import scala.io.Source

object UsingResourceExamples {

  /** 3.1 Read a whole file; the Source is closed automatically and any failure comes back as Failure. */
  def readFileSafely(filePath: String): Try[String] =
    Using(Source.fromFile(filePath))(_.mkString)

  /** 3.2 Concatenate two files joined by a newline; Using.Manager closes both sources. */
  def mergeFiles(file1: String, file2: String): Try[String] =
    Using.Manager { use =>
      val first = use(Source.fromFile(file1)).mkString
      val second = use(Source.fromFile(file2)).mkString
      s"$first\n$second"
    }

  /** 3.3 A toy AutoCloseable resource, to show Using works with custom types too. */
  case class DatabaseConnection(url: String) extends AutoCloseable {
    def query(sql: String): String = s"Result of: $sql"
    override def close(): Unit = println("Closing database connection")
  }

  /** Run `f` against a connection that is guaranteed to be closed afterwards. */
  def withDatabase[A](url: String)(f: DatabaseConnection => A): Try[A] =
    Using(new DatabaseConnection(url))(f)
}

2. 文件写入的多种方式

2.1 基本写入操作

import java.io.{File, PrintWriter, FileWriter, BufferedWriter}
import java.nio.file.{Files, Paths, StandardOpenOption}

object FileWritingExamples {

  // 1.1 PrintWriter: simplest way to overwrite a text file.
  // Note: PrintWriter swallows IOExceptions; use checkError() if that matters.
  def writeWithPrintWriter(filePath: String, content: String): Unit = {
    val writer = new PrintWriter(new File(filePath))
    try {
      writer.print(content)
      writer.flush() // redundant before close(), kept for clarity
    } finally {
      writer.close()
    }
  }

  // 1.2 FileWriter in append mode: adds content plus a platform newline.
  def appendToFile(filePath: String, content: String): Unit = {
    val writer = new FileWriter(filePath, true) // true = append
    try {
      writer.write(content + System.lineSeparator())
      writer.flush()
    } finally {
      writer.close()
    }
  }

  // 1.3 BufferedWriter: better throughput when writing many lines.
  def writeWithBuffering(filePath: String, lines: List[String]): Unit = {
    val writer = new BufferedWriter(new FileWriter(filePath))
    try {
      lines.foreach { line =>
        writer.write(line)
        writer.newLine()
      }
      writer.flush()
    } finally {
      writer.close()
    }
  }

  // 1.4 Java NIO: single call that creates or truncates the target file.
  // Fixed: the snippet's import block did not import StandardCharsets, so this
  // method did not compile; the import is now local to the method.
  def writeWithNIO(filePath: String, content: String): Unit = {
    import java.nio.charset.StandardCharsets
    Files.write(
      Paths.get(filePath),
      content.getBytes(StandardCharsets.UTF_8),
      StandardOpenOption.CREATE,
      StandardOpenOption.TRUNCATE_EXISTING,
      StandardOpenOption.WRITE
    )
  }

  // 1.5 NIO append: creates the file if missing, otherwise appends.
  // Fixed: same missing StandardCharsets import as 1.4.
  def appendWithNIO(filePath: String, content: String): Unit = {
    import java.nio.charset.StandardCharsets
    Files.write(
      Paths.get(filePath),
      content.getBytes(StandardCharsets.UTF_8),
      StandardOpenOption.CREATE,
      StandardOpenOption.APPEND
    )
  }

  // 1.6 Write a list of lines, one per line, UTF-8 encoded.
  def writeLines(filePath: String, lines: List[String]): Unit = {
    import java.nio.charset.StandardCharsets
    import scala.collection.JavaConverters._

    Files.write(
      Paths.get(filePath),
      lines.asJava,
      StandardCharsets.UTF_8,
      StandardOpenOption.CREATE,
      StandardOpenOption.TRUNCATE_EXISTING
    )
  }
}

3. 单词统计的完整实现

3.1 基础单词统计器

import scala.collection.mutable
import scala.util.matching.Regex

class BasicWordCounter {

  // A word is a run of letters (any script) and/or apostrophes.
  private val wordPattern: Regex = """[\p{L}']+""".r

  // Common English stop words excluded from counts.
  private val stopWords: Set[String] = Set(
    "the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
    "a", "an", "is", "are", "was", "were", "be", "been", "being"
  )

  /**
   * Lower-case a word, strip surrounding quote marks and a trailing
   * possessive 's.
   */
  def normalizeWord(word: String): String = {
    word.toLowerCase
      .replaceAll("^['\"]+|['\"]+$", "")  // strip leading/trailing quote marks
      .replaceAll("['\"]s$", "")          // strip possessive 's
  }

  /**
   * Tokenize text into normalized words, dropping single-character tokens
   * and stop words. (Fixed comment: the length filter removes ALL
   * one-character tokens, including "a" and "I".)
   */
  def extractWords(text: String): List[String] = {
    wordPattern.findAllIn(text)
      .map(normalizeWord)
      .filter(_.length > 1)          // drop one-character tokens
      .filterNot(stopWords.contains) // drop stop words
      .toList
  }

  /**
   * Word -> frequency map.
   * Uses groupMapReduce (Scala 2.13+): a single pass instead of groupBy
   * followed by a per-group size computation, with the same result.
   */
  def countWords(text: String): Map[String, Int] =
    extractWords(text).groupMapReduce(identity)(_ => 1)(_ + _)

  /**
   * Read a file fully and count its words; the Source is closed even if
   * reading fails.
   */
  def countWordsInFile(filePath: String): Map[String, Int] = {
    import scala.io.Source

    val source = Source.fromFile(filePath)
    try countWords(source.mkString)
    finally source.close()
  }
}

3.2 高级单词统计器(支持多种功能)

import scala.io.Source
import java.io.File
import scala.collection.immutable.TreeMap
import scala.math.Ordering

/**
 * Per-word statistics within one analyzed text.
 *
 * @param word       the (possibly normalized) word
 * @param frequency  number of occurrences
 * @param percentage share of all counted words, in percent (0-100)
 * @param positions  character offsets where each occurrence starts
 */
case class WordStats(
  word: String,
  frequency: Int,
  percentage: Double,
  positions: List[Int]  // character offsets of each occurrence in the text
)

class AdvancedWordCounter {

  /**
   * Analysis options.
   *
   * @param ignoreCase      lower-case words before counting
   * @param filterStopWords drop language-specific stop words
   * @param minWordLength   minimum word length to keep
   * @param includeNumbers  also treat digit runs as word characters
   * @param language        stop-word language code ("en", "es", "fr")
   */
  case class Config(
    ignoreCase: Boolean = true,
    filterStopWords: Boolean = true,
    minWordLength: Int = 2,
    includeNumbers: Boolean = false,
    language: String = "en"
  )

  // Minimal stop-word lists keyed by language code.
  private val stopWordsByLang: Map[String, Set[String]] = Map(
    "en" -> Set("the", "be", "to", "of", "and", "a", "in", "that", "have", "i"),
    "es" -> Set("el", "la", "de", "que", "y", "a", "en", "un", "ser", "se"),
    "fr" -> Set("le", "de", "un", "à", "être", "et", "en", "avoir", "que", "pour")
  )

  /**
   * Extract (word, character-offset) pairs from the text, applying the
   * configured normalization, length, and stop-word filters.
   */
  def extractWordsAdvanced(
    text: String,
    config: Config = Config()
  ): List[(String, Int)] = {

    // Letters-only vs. letters-and-digits token pattern.
    val wordRegex = if (config.includeNumbers) {
      """[\p{L}\p{N}']+""".r
    } else {
      """[\p{L}']+""".r
    }

    // Unknown language codes simply disable stop-word filtering.
    val stopWords = if (config.filterStopWords) {
      stopWordsByLang.getOrElse(config.language, Set.empty[String])
    } else {
      Set.empty[String]
    }

    wordRegex.findAllMatchIn(text)
      .map { m =>
        val word = m.matched
        val normalized = if (config.ignoreCase) word.toLowerCase else word
        (normalized, m.start)
      }
      .filter { case (word, _) =>
        word.length >= config.minWordLength &&
        !stopWords.contains(word)
      }
      .toList
  }

  /**
   * Compute per-word statistics (frequency, percentage, sorted positions).
   * Safe on empty text: percentages cannot divide by zero.
   */
  def analyzeText(text: String, config: Config = Config()): List[WordStats] = {
    val wordOccurrences = extractWordsAdvanced(text, config)
    val totalWords = wordOccurrences.length

    val grouped = wordOccurrences.groupBy(_._1)

    grouped.map { case (word, occurrences) =>
      val freq = occurrences.length
      // Guard against totalWords == 0 (unreachable when grouped is empty,
      // but kept explicit for safety).
      val percentage = if (totalWords == 0) 0.0 else (freq.toDouble / totalWords) * 100
      val positions = occurrences.map(_._2)

      WordStats(word, freq, percentage, positions.sorted)
    }.toList
  }

  /**
   * Read a file, analyze it, and build a TextAnalysisReport.
   * The Source is closed even on failure; empty files yield zeroed ratios
   * instead of NaN.
   */
  def analyzeFile(filePath: String, config: Config = Config()): TextAnalysisReport = {
    val source = Source.fromFile(filePath)
    try {
      val text = source.mkString
      val wordStats = analyzeText(text, config)
      val totalOccurrences = wordStats.map(_.frequency).sum

      TextAnalysisReport(
        filePath = filePath,
        totalWords = totalOccurrences,
        uniqueWords = wordStats.length,
        mostFrequent = wordStats.sortBy(-_.frequency).take(10),
        leastFrequent = wordStats.filter(_.frequency == 1).take(10),
        vocabularyDensity =
          if (totalOccurrences == 0) 0.0
          else wordStats.length.toDouble / totalOccurrences,
        // Fixed: the original `wordStats.map(_.word.length * _.frequency)` is a
        // two-parameter lambda inside a one-argument map and does not compile.
        averageWordLength =
          if (totalOccurrences == 0) 0.0
          else wordStats.map(s => s.word.length * s.frequency).sum.toDouble / totalOccurrences
      )
    } finally {
      source.close()
    }
  }

  /**
   * Analyze every *.txt file directly inside a directory.
   *
   * @throws IllegalArgumentException if the path is not an existing directory
   */
  def analyzeDirectory(directoryPath: String, config: Config = Config()):
    Map[String, TextAnalysisReport] = {

    val dir = new File(directoryPath)
    if (!dir.exists() || !dir.isDirectory) {
      throw new IllegalArgumentException(s"$directoryPath is not a valid directory")
    }

    // listFiles returns null on I/O error; treat that as "no files".
    val textFiles = Option(dir.listFiles()).getOrElse(Array.empty[File])
      .filter(_.isFile)
      .filter(_.getName.endsWith(".txt"))
      .map(_.getAbsolutePath)

    // Fixed: the original called .par, which since Scala 2.13 requires the
    // separate scala-parallel-collections module (not imported by this
    // snippet) and therefore did not compile. Sequential map keeps the
    // snippet dependency-free with identical results.
    textFiles
      .map(filePath => filePath -> analyzeFile(filePath, config))
      .toMap
  }
}

/**
 * Aggregated analysis results for one file.
 *
 * @param filePath          path of the analyzed file
 * @param totalWords        total counted word occurrences
 * @param uniqueWords       number of distinct words
 * @param mostFrequent      top words by frequency (at most 10)
 * @param leastFrequent     sample of words occurring exactly once (at most 10)
 * @param vocabularyDensity unique words divided by total words
 * @param averageWordLength occurrence-weighted mean word length
 */
case class TextAnalysisReport(
  filePath: String,
  totalWords: Int,
  uniqueWords: Int,
  mostFrequent: List[WordStats],
  leastFrequent: List[WordStats],
  vocabularyDensity: Double,  // vocabulary density = unique words / total words
  averageWordLength: Double
)

3.3 流式处理(处理大文件)

import scala.io.Source
import java.util.concurrent.atomic.AtomicLong
import scala.collection.concurrent.TrieMap

class StreamingWordCounter(bufferSize: Int = 8192) {
  // NOTE(review): bufferSize is currently unused; kept for interface compatibility.

  // A word is a boundary-delimited run of letters and/or apostrophes.
  private val wordPattern = """\b[\p{L}']+\b""".r

  /**
   * Count word frequencies in a file, one batch of lines at a time.
   *
   * Fixed: the original called `batch.par`, which since Scala 2.13 requires
   * the separate scala-parallel-collections module (not imported by this
   * snippet) and did not compile; it also leaked the Source if an exception
   * was thrown mid-read. Batching is kept; processing is sequential.
   */
  def countWordsStreaming(filePath: String): TrieMap[String, AtomicLong] = {
    val wordCounts = TrieMap.empty[String, AtomicLong]

    val source = Source.fromFile(filePath)
    try {
      source.getLines()
        .grouped(1000)  // 1000 lines per batch
        .foreach { batch =>
          batch.foreach(line => processLine(line, wordCounts))
        }
    } finally {
      source.close()
    }
    wordCounts
  }

  /** Tokenize one line (lower-cased once) and bump the shared counters. */
  private def processLine(
    line: String,
    wordCounts: TrieMap[String, AtomicLong]
  ): Unit = {
    // The original lower-cased each word a second time and then did a second
    // map lookup; both were redundant and are removed here.
    wordPattern.findAllIn(line.toLowerCase).foreach { word =>
      wordCounts.getOrElseUpdate(word, new AtomicLong(0)).incrementAndGet()
    }
  }

  /**
   * Same counting, with progress logged every 10000 lines and a timing
   * summary at the end. The Source is closed even on failure.
   */
  def countWordsWithProgress(filePath: String): Map[String, Long] = {
    import scala.io.Codec
    implicit val codec: Codec = Codec.UTF8

    val wordCounts = TrieMap.empty[String, AtomicLong]
    val totalLines = new AtomicLong(0)
    val startTime = System.currentTimeMillis()

    val source = Source.fromFile(filePath)
    try {
      source.getLines().zipWithIndex.foreach { case (line, index) =>
        processLine(line, wordCounts)
        totalLines.incrementAndGet()

        // Progress line every 10000 lines (skipping line 0).
        if (index % 10000 == 0 && index > 0) {
          val elapsed = (System.currentTimeMillis() - startTime) / 1000.0
          println(f"Processed $index lines in $elapsed%.2f seconds")
        }
      }
    } finally {
      source.close()
    }

    val endTime = System.currentTimeMillis()
    val totalTime = (endTime - startTime) / 1000.0
    println(f"Total processing time: $totalTime%.2f seconds")
    println(s"Total lines: ${totalLines.get()}")
    println(s"Unique words: ${wordCounts.size}")

    // Snapshot the atomic counters into an immutable result.
    wordCounts.map { case (word, count) => word -> count.get() }.toMap
  }
}

3.4 单词统计结果输出与可视化

import java.io.{PrintWriter, File}
import scala.util.{Try, Success, Failure}

class WordCountReporter {
  
  /**
   * Generate reports for the given word counts in several formats
   * (plain text, CSV, JSON, HTML, plus a summary) under outputDir.
   * Any exception from the individual writers surfaces as a Failure.
   */
  def generateReport(
    wordCounts: Map[String, Int],
    outputDir: String,
    reportName: String
  ): Try[Unit] = Try {
    // Ensure the output directory exists
    val dir = new File(outputDir)
    if (!dir.exists()) dir.mkdirs()
    
    // 1. Plain-text report
    writeTextReport(wordCounts, s"$outputDir/$reportName.txt")
    
    // 2. CSV report
    writeCSVReport(wordCounts, s"$outputDir/$reportName.csv")
    
    // 3. JSON report (requires the json4s library on the classpath)
    writeJSONReport(wordCounts, s"$outputDir/$reportName.json")
    
    // 4. HTML report
    writeHTMLReport(wordCounts, s"$outputDir/$reportName.html")
    
    // 5. Statistics summary
    writeSummary(wordCounts, s"$outputDir/${reportName}_summary.txt")
  }
  
  // Fixed-width word/count table, most frequent first.
  private def writeTextReport(wordCounts: Map[String, Int], filePath: String): Unit = {
    val writer = new PrintWriter(new File(filePath))
    try {
      writer.println("WORD FREQUENCY REPORT")
      writer.println("=" * 50)
      writer.println()
      
      // Sort by descending frequency, ties broken alphabetically
      val sorted = wordCounts.toList
        .sortBy { case (word, count) => (-count, word) }
      
      sorted.foreach { case (word, count) =>
        writer.println(f"$word%-20s $count%6d")
      }
    } finally {
      writer.close()
    }
  }
  
  // CSV: word, count, and percentage of all occurrences.
  // NOTE(review): words are wrapped in double quotes but embedded double
  // quotes are not escaped; fine for the letter/apostrophe tokens the
  // counters upstream produce, but not general-purpose CSV quoting.
  private def writeCSVReport(wordCounts: Map[String, Int], filePath: String): Unit = {
    val writer = new PrintWriter(new File(filePath))
    try {
      writer.println("word,count,frequency_percentage")
      
      val total = wordCounts.values.sum.toDouble
      val sorted = wordCounts.toList.sortBy(-_._2)
      
      sorted.foreach { case (word, count) =>
        val percentage = (count / total) * 100
        writer.println(f""""$word",$count,$percentage%.4f%%""")
      }
    } finally {
      writer.close()
    }
  }
  
  // JSON report with metadata, the full count map, and the top-20 list.
  // Depends on json4s (third-party); the imports are method-local.
  private def writeJSONReport(wordCounts: Map[String, Int], filePath: String): Unit = {
    import org.json4s._
    import org.json4s.jackson.Serialization
    import org.json4s.jackson.Serialization.write
    
    implicit val formats: DefaultFormats = DefaultFormats
    
    val report = Map(
      "metadata" -> Map(
        "total_words" -> wordCounts.values.sum,
        "unique_words" -> wordCounts.size,
        "generated_at" -> java.time.LocalDateTime.now().toString
      ),
      "word_counts" -> wordCounts,
      "top_20_words" -> wordCounts.toList.sortBy(-_._2).take(20)
    )
    
    val json = write(report)
    val writer = new PrintWriter(new File(filePath))
    try {
      writer.write(json)
    } finally {
      writer.close()
    }
  }
  
  // Self-contained HTML page with a top-50 table and CSS bar visualization.
  // NOTE(review): words are interpolated into HTML without escaping; safe for
  // letter-only tokens, not for arbitrary input.
  private def writeHTMLReport(wordCounts: Map[String, Int], filePath: String): Unit = {
    val writer = new PrintWriter(new File(filePath))
    try {
      val topWords = wordCounts.toList.sortBy(-_._2).take(50)
      
      writer.println("""<!DOCTYPE html>
<html>
<head>
    <title>Word Frequency Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #4CAF50; color: white; }
        tr:nth-child(even) { background-color: #f2f2f2; }
        .bar { background-color: #4CAF50; height: 20px; }
    </style>
</head>
<body>
    <h1>Word Frequency Analysis</h1>
    <p>Total unique words: """ + wordCounts.size + """</p>
    <p>Total word occurrences: """ + wordCounts.values.sum + """</p>
    
    <h2>Top 50 Words</h2>
    <table>
        <tr>
            <th>Rank</th>
            <th>Word</th>
            <th>Count</th>
            <th>Frequency</th>
            <th>Visualization</th>
        </tr>""")
      
      // Bar widths are scaled relative to the most frequent word.
      val maxCount = topWords.headOption.map(_._2).getOrElse(1)
      
      topWords.zipWithIndex.foreach { case ((word, count), index) =>
        val percentage = (count.toDouble / wordCounts.values.sum) * 100
        val barWidth = (count.toDouble / maxCount) * 100
        
        writer.println(s"""
        <tr>
            <td>${index + 1}</td>
            <td>$word</td>
            <td>$count</td>
            <td>${"%.2f".format(percentage)}%</td>
            <td>
                <div class="bar" style="width: ${barWidth}%"></div>
            </td>
        </tr>""")
      }
      
      writer.println("""
    </table>
</body>
</html>""")
    } finally {
      writer.close()
    }
  }
  
  // Human-readable summary: totals, density, top 20, and misc statistics.
  private def writeSummary(wordCounts: Map[String, Int], filePath: String): Unit = {
    val writer = new PrintWriter(new File(filePath))
    try {
      val sorted = wordCounts.toList.sortBy(-_._2)
      val totalWords = wordCounts.values.sum
      
      writer.println("TEXT ANALYSIS SUMMARY")
      writer.println("=" * 50)
      writer.println()
      writer.println(s"Total word occurrences: $totalWords")
      writer.println(s"Unique words: ${wordCounts.size}")
      writer.println(f"Vocabulary density: ${wordCounts.size.toDouble / totalWords}%.4f")
      writer.println()
      
      writer.println("TOP 20 WORDS:")
      writer.println("-" * 30)
      sorted.take(20).foreach { case (word, count) =>
        val percentage = (count.toDouble / totalWords) * 100
        writer.println(f"$word%-15s $count%6d (${percentage}%.2f%%)")
      }
      
      writer.println()
      writer.println("STATISTICS:")
      writer.println("-" * 30)
      writer.println(f"Average word length: ${calculateAverageLength(wordCounts)}%.2f")
      writer.println(s"Most common word: ${sorted.headOption.map(_._1).getOrElse("N/A")}")
      writer.println(s"Least common words: ${sorted.takeRight(5).map(_._1).mkString(", ")}")
      writer.println(s"Words occurring once: ${sorted.count(_._2 == 1)}")
    } finally {
      writer.close()
    }
  }
  
  // Occurrence-weighted mean word length.
  // NOTE(review): divides by zero (NaN) if wordCounts is empty.
  private def calculateAverageLength(wordCounts: Map[String, Int]): Double = {
    val totalLength = wordCounts.map { case (word, count) => word.length * count }.sum
    val totalOccurrences = wordCounts.values.sum
    totalLength.toDouble / totalOccurrences
  }
}

3.5 使用示例

object WordCountApplication {

  /**
   * CLI entry point: analyze one text file, print a console summary,
   * and write report files.
   *
   * Usage: scala WordCountApplication <input-file> [output-dir]
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      println("Usage: scala WordCountApplication <input-file> [output-dir]")
      System.exit(1)
    }

    val inputFile = args(0)
    val outputDir = if (args.length > 1) args(1) else "./output"

    val counter = new AdvancedWordCounter()
    val reporter = new WordCountReporter()

    try {
      // Analyze the file with default configuration
      val report = counter.analyzeFile(inputFile)

      // Console summary
      println("=" * 60)
      println("TEXT ANALYSIS RESULTS")
      println("=" * 60)
      println(s"File: ${report.filePath}")
      println(s"Total words: ${report.totalWords}")
      println(s"Unique words: ${report.uniqueWords}")
      println(f"Vocabulary density: ${report.vocabularyDensity}%.4f")
      println(f"Average word length: ${report.averageWordLength}%.2f")
      println()

      println("TOP 10 MOST FREQUENT WORDS:")
      println("-" * 40)
      report.mostFrequent.foreach { stats =>
        println(f"${stats.word}%-15s ${stats.frequency}%6d (${stats.percentage}%.2f%%)")
      }

      // NOTE(review): TextAnalysisReport only exposes the top-10 words, so the
      // generated report files cover those 10 words, not the full count map.
      val wordCounts = report.mostFrequent.map(s => s.word -> s.frequency).toMap

      reporter.generateReport(wordCounts, outputDir, "word_analysis") match {
        case Success(_) =>
          println(s"\nReports generated successfully in: $outputDir")
        case Failure(exception) =>
          println(s"\nError generating reports: ${exception.getMessage}")
      }

    } catch {
      case e: Exception =>
        println(s"Error processing file: ${e.getMessage}")
        e.printStackTrace()
    }
  }

  /** Analyze every .txt file in a directory and print per-file plus aggregate stats. */
  def batchProcess(directory: String): Unit = {
    val counter = new AdvancedWordCounter()

    println(s"Processing all text files in: $directory")
    println("=" * 60)

    val reports = counter.analyzeDirectory(directory)

    reports.foreach { case (filePath, report) =>
      println(s"\nFile: ${new java.io.File(filePath).getName}")
      println(s"  Total words: ${report.totalWords}")
      println(s"  Unique words: ${report.uniqueWords}")
      println(s"  Top word: ${report.mostFrequent.headOption.map(_.word).getOrElse("N/A")}")
    }

    // Fold the per-file reports into one aggregate
    val summary = reports.values.foldLeft(SummaryStatistics()) { (acc, report) =>
      acc.copy(
        totalFiles = acc.totalFiles + 1,
        totalWords = acc.totalWords + report.totalWords,
        totalUniqueWords = acc.totalUniqueWords + report.uniqueWords
      )
    }

    println("\n" + "=" * 60)
    println("SUMMARY ACROSS ALL FILES:")
    println(s"Total files processed: ${summary.totalFiles}")
    println(s"Total words: ${summary.totalWords}")
    // Fixed: the original divided by totalFiles unconditionally (crash on an
    // empty directory) and used truncating integer division for the average.
    if (summary.totalFiles > 0) {
      println(f"Average words per file: ${summary.totalWords.toDouble / summary.totalFiles}%.1f")
    }
    println(s"Total unique words across all files: ${summary.totalUniqueWords}")
  }
}

// Accumulator for directory-level aggregate statistics (built by folding
// per-file TextAnalysisReport values).
case class SummaryStatistics(
  totalFiles: Int = 0,        // number of files processed
  totalWords: Int = 0,        // sum of word occurrences across all files
  totalUniqueWords: Int = 0   // sum of per-file unique counts (not globally deduplicated)
)

4. 性能优化技巧

4.1 内存优化

// Stream a large file chunk-by-chunk so the full text never sits in memory.
// Fixed: the original never closed the Source.
def processLargeFile(filePath: String): Unit = {
  import scala.io.Source

  // Running counts; withDefaultValue(0) lets us use `+= 1` without a contains-check.
  val wordCounts = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)

  val source = Source.fromFile(filePath)
  try {
    source.getLines()
      .grouped(10000)  // process 10000 lines per chunk
      .foreach { chunk =>
        chunk.foreach { line =>
          line.toLowerCase.split("\\W+")
            .filter(_.length > 1)
            .foreach { word =>
              wordCounts(word) += 1
            }
        }

        // Optionally flush partial results to bound memory use.
        if (wordCounts.size > 100000) {
          savePartialResults(wordCounts.toMap)  // defined elsewhere in the project
          wordCounts.clear()
        }
      }
  } finally {
    source.close()
  }
}

4.2 并行处理

import scala.collection.parallel.CollectionConverters._

// Count words by loading all lines eagerly, then fanning the tokenization out
// over a parallel collection and folding the groups back into a plain Map.
def parallelWordCount(filePath: String): Map[String, Int] = {
  import scala.io.Source

  // Materialize as a Vector so it can back a parallel collection.
  val source = Source.fromFile(filePath)
  val allLines =
    try source.getLines().toVector
    finally source.close()

  val tokens = allLines.par.flatMap(line => line.toLowerCase.split("\\W+"))
  tokens
    .filter(word => word.length > 1) // drop one-character tokens
    .groupBy(word => word)
    .mapValues(_.size)
    .seq // back to a sequential collection
    .toMap
}

4.3 缓存优化

import scala.collection.mutable

class CachedWordCounter {
  // Cache keyed by path; each entry stores the file's lastModified timestamp
  // at analysis time so stale entries are recomputed when the file changes.
  private val cache = mutable.Map[String, (Long, Map[String, Int])]()

  /**
   * Return cached word counts for the file, recomputing when the file's
   * modification time has changed since the cached analysis.
   * Fixed: the original read lastModified but never used it, so the cache
   * served stale results forever.
   */
  def countWordsWithCache(filePath: String): Map[String, Int] = {
    val lastModified = new java.io.File(filePath).lastModified()
    cache.get(filePath) match {
      case Some((cachedStamp, counts)) if cachedStamp == lastModified =>
        counts
      case _ =>
        val counts = new AdvancedWordCounter().analyzeFile(filePath).mostFrequent
          .map(s => s.word -> s.frequency)
          .toMap
        cache.update(filePath, (lastModified, counts))
        counts
    }
  }
}

这个完整的实现提供了:

  1. 多种文件读写方式:涵盖各种场景
  2. 完整的单词统计功能:包括基础统计、高级分析、流式处理
  3. 丰富的输出格式:文本、CSV、JSON、HTML
  4. 性能优化:内存管理、并行处理、缓存
  5. 错误处理和资源管理:使用Try、Using等
  6. 可扩展性:易于添加新功能