1. Multiple Ways to Read Files
1.1 Using Scala's Built-in Source Class
import scala.io.Source
object FileReadingExamples {
// 1.1.1 Basic read: load the whole file into one String
def readEntireFile(filePath: String): String = {
val source = Source.fromFile(filePath)
try {
source.mkString
} finally {
source.close()
}
}
// 1.1.2 Lazy line-by-line read
// CAUTION: the underlying Source is never closed here, so the file handle
// leaks; prefer the Using-based approach in section 1.3 for anything long-lived
def readLinesLazily(filePath: String): Iterator[String] = {
Source.fromFile(filePath).getLines()
}
// 1.1.3 Read and process each line
def processLines(filePath: String): List[String] = {
val source = Source.fromFile(filePath)
try {
source.getLines()
.map(_.trim) // strip leading/trailing whitespace
.filter(_.nonEmpty) // drop blank lines
.toList
} finally {
source.close()
}
}
// 1.1.4 Read with an explicit character encoding
def readWithEncoding(filePath: String, encoding: String = "UTF-8"): String = {
implicit val codec: scala.io.Codec = scala.io.Codec(encoding) // picked up implicitly by Source.fromFile
val source = Source.fromFile(filePath)
try {
source.mkString
} finally {
source.close()
}
}
// 1.1.5 Batched reads (a large-file optimization)
// NOTE: the Source stays open until the iterator is exhausted; this
// hand-rolled iterator is equivalent to lines.grouped(batchSize).map(_.toList)
def readInBatches(filePath: String, batchSize: Int = 1000): Iterator[List[String]] = {
val lines = Source.fromFile(filePath).getLines()
new Iterator[List[String]] {
def hasNext: Boolean = lines.hasNext
def next(): List[String] = {
var batch = List.empty[String]
var count = 0
while (lines.hasNext && count < batchSize) {
batch = lines.next() :: batch
count += 1
}
batch.reverse
}
}
}
}
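A minimal usage sketch for the readers above; the sample path data/sample.txt is hypothetical:
object FileReadingDemo {
  def main(args: Array[String]): Unit = {
    val path = "data/sample.txt" // hypothetical sample file
    // read the whole file at once and show the first 100 characters
    println(FileReadingExamples.readEntireFile(path).take(100))
    // trimmed, non-empty lines
    FileReadingExamples.processLines(path).take(5).foreach(println)
    // stream the file in batches of 500 lines
    FileReadingExamples.readInBatches(path, batchSize = 500)
      .foreach(batch => println(s"Batch of ${batch.size} lines"))
  }
}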
1.2 Using Java NIO (More Efficient)
import java.nio.file.{Files, Paths, StandardOpenOption}
import java.nio.charset.StandardCharsets
import scala.jdk.CollectionConverters._ // JavaConverters is deprecated since Scala 2.13
object JavaNIOExamples {
// 1.2.1 Read all lines at once
def readAllLines(filePath: String): List[String] = {
Files.readAllLines(Paths.get(filePath), StandardCharsets.UTF_8)
.asScala
.toList
}
// 1.2.2 Streaming read (memory-efficient)
// NOTE: Files.lines keeps the file handle open until the stream is closed;
// see the Using-based sketch after this object
def readLinesStream(filePath: String): Iterator[String] = {
Files.lines(Paths.get(filePath), StandardCharsets.UTF_8)
.iterator()
.asScala
}
// 1.2.3 Read raw bytes (binary files)
def readBytes(filePath: String): Array[Byte] = {
Files.readAllBytes(Paths.get(filePath))
}
}
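Because Files.lines keeps the file handle open, a safer variant wraps the stream in scala.util.Using (introduced in the next subsection). A minimal sketch, assuming Scala 2.13+:
import java.nio.file.{Files, Paths}
import java.nio.charset.StandardCharsets
import scala.util.{Try, Using}
import scala.jdk.CollectionConverters._

object NIOStreamDemo {
  // java.util.stream.Stream is AutoCloseable, so Using can release it;
  // the result must be materialized before the block exits
  def countNonEmptyLines(filePath: String): Try[Int] =
    Using(Files.lines(Paths.get(filePath), StandardCharsets.UTF_8)) { stream =>
      stream.iterator().asScala.count(_.trim.nonEmpty)
    }
}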
1.3 Using scala.util.Using (Resource Management)
import scala.util.{Try, Using}
import scala.io.Source
object UsingResourceExamples {
// 1.3.1 Automatic resource management with Using
def readFileSafely(filePath: String): Try[String] = {
Using(Source.fromFile(filePath)) { source =>
source.mkString
}
}
// 1.3.2 Managing multiple resources
def mergeFiles(file1: String, file2: String): Try[String] = {
Using.Manager { use =>
val source1 = use(Source.fromFile(file1))
val source2 = use(Source.fromFile(file2))
source1.mkString + "\n" + source2.mkString
}
}
// 1.3.3 Custom resources (anything AutoCloseable works)
case class DatabaseConnection(url: String) extends AutoCloseable {
def query(sql: String): String = s"Result of: $sql"
override def close(): Unit = println("Closing database connection")
}
def withDatabase[A](url: String)(f: DatabaseConnection => A): Try[A] = {
Using(new DatabaseConnection(url))(f)
}
}
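A quick sketch of how these Try-based helpers compose; the file path and connection URL are hypothetical:
object UsingDemo {
  def main(args: Array[String]): Unit = {
    import scala.util.{Success, Failure}
    // failures surface as values rather than thrown exceptions
    UsingResourceExamples.readFileSafely("data/sample.txt") match { // hypothetical path
      case Success(text) => println(s"Read ${text.length} characters")
      case Failure(e)    => println(s"Could not read file: ${e.getMessage}")
    }
    // the connection is closed automatically, even if the query throws
    UsingResourceExamples.withDatabase("jdbc:h2:mem:demo") { conn => // hypothetical URL
      conn.query("SELECT 1")
    }.foreach(println)
  }
}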
2. Multiple Ways to Write Files
2.1 Basic Write Operations
import java.io.{File, PrintWriter, FileWriter, BufferedWriter}
import java.nio.file.{Files, Paths, StandardOpenOption}
import java.nio.charset.StandardCharsets // needed by the NIO variants below
object FileWritingExamples {
// 2.1.1 Using PrintWriter
def writeWithPrintWriter(filePath: String, content: String): Unit = {
val writer = new PrintWriter(new File(filePath))
try {
writer.print(content)
writer.flush()
} finally {
writer.close()
}
}
// 2.1.2 Using FileWriter (append mode)
def appendToFile(filePath: String, content: String): Unit = {
val writer = new FileWriter(filePath, true) // the second argument enables append mode
try {
writer.write(content + System.lineSeparator())
writer.flush()
} finally {
writer.close()
}
}
// 2.1.3 Using BufferedWriter (better performance for many small writes)
def writeWithBuffering(filePath: String, lines: List[String]): Unit = {
val writer = new BufferedWriter(new FileWriter(filePath))
try {
lines.foreach { line =>
writer.write(line)
writer.newLine()
}
writer.flush()
} finally {
writer.close()
}
}
// 2.1.4 Using Java NIO Files
def writeWithNIO(filePath: String, content: String): Unit = {
Files.write(
Paths.get(filePath),
content.getBytes(StandardCharsets.UTF_8),
StandardOpenOption.CREATE,
StandardOpenOption.TRUNCATE_EXISTING,
StandardOpenOption.WRITE
)
}
// 2.1.5 Appending with NIO
def appendWithNIO(filePath: String, content: String): Unit = {
Files.write(
Paths.get(filePath),
content.getBytes(StandardCharsets.UTF_8),
StandardOpenOption.CREATE,
StandardOpenOption.APPEND
)
}
// 2.1.6 Writing multiple lines
def writeLines(filePath: String, lines: List[String]): Unit = {
import scala.jdk.CollectionConverters._
Files.write(
Paths.get(filePath),
lines.asJava,
StandardCharsets.UTF_8,
StandardOpenOption.CREATE,
StandardOpenOption.TRUNCATE_EXISTING
)
}
}
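A short round-trip sketch tying the writers together; output/demo.txt is a hypothetical path:
object FileWritingDemo {
  def main(args: Array[String]): Unit = {
    new java.io.File("output").mkdirs() // the NIO writes do not create parent directories
    val path = "output/demo.txt" // hypothetical path
    FileWritingExamples.writeWithNIO(path, "first line" + System.lineSeparator())
    FileWritingExamples.appendWithNIO(path, "second line" + System.lineSeparator())
    FileWritingExamples.writeLines("output/demo-lines.txt", List("alpha", "beta", "gamma"))
  }
}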
3. A Complete Word-Count Implementation
3.1 Basic Word Counter
import scala.util.matching.Regex
class BasicWordCounter {
// Regex that matches a word: one or more letters or apostrophes
private val wordPattern: Regex = """[\p{L}']+""".r
// Stop words to exclude from the counts
private val stopWords: Set[String] = Set(
"the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
"a", "an", "is", "are", "was", "were", "be", "been", "being"
)
/**
* Clean and normalize a single word
*/
def normalizeWord(word: String): String = {
word.toLowerCase
.replaceAll("^['\"]+|['\"]+$", "") // 移除开头结尾的引号
.replaceAll("['\"]s$", "") // 移除所有格's
}
/**
* Extract words from the text
*/
def extractWords(text: String): List[String] = {
wordPattern.findAllIn(text)
.map(normalizeWord)
.filter(_.length > 1) // drop single-character tokens (note: this also drops "I" and "a")
.filterNot(stopWords.contains)
.toList
}
/**
* Count word frequencies
*/
def countWords(text: String): Map[String, Int] = {
val words = extractWords(text)
// Approach 1: groupBy
words.groupBy(identity)
.view
.mapValues(_.size)
.toMap
// Approach 2: foldLeft (more explicit control)
// words.foldLeft(Map.empty[String, Int].withDefaultValue(0)) { (acc, word) =>
// acc.updated(word, acc(word) + 1)
// }
}
/**
* Read a file and count its words
*/
def countWordsInFile(filePath: String): Map[String, Int] = {
import scala.io.Source
val source = Source.fromFile(filePath)
try {
val text = source.mkString
countWords(text)
} finally {
source.close()
}
}
}
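A small self-check of the counter; the expected output is noted in the comments:
object BasicWordCounterDemo {
  def main(args: Array[String]): Unit = {
    val counter = new BasicWordCounter
    val text = "The quick brown fox jumps over the lazy dog. The dog sleeps."
    counter.countWords(text).toList.sortBy { case (w, c) => (-c, w) }.foreach {
      case (word, count) => println(f"$word%-10s $count%3d")
    }
    // "dog" -> 2 and the rest -> 1; "the" is dropped as a stop word
  }
}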
3.2 Advanced Word Counter (Multiple Features)
import scala.io.Source
import java.io.File
import scala.collection.parallel.CollectionConverters._ // needed for .par in analyzeDirectory (separate module since Scala 2.13)
case class WordStats(
word: String,
frequency: Int,
percentage: Double,
positions: List[Int] // character offsets at which the word occurs
)
class AdvancedWordCounter {
// Configuration options
case class Config(
ignoreCase: Boolean = true,
filterStopWords: Boolean = true,
minWordLength: Int = 2,
includeNumbers: Boolean = false,
language: String = "en"
)
// Stop words by language
private val stopWordsByLang: Map[String, Set[String]] = Map(
"en" -> Set("the", "be", "to", "of", "and", "a", "in", "that", "have", "i"),
"es" -> Set("el", "la", "de", "que", "y", "a", "en", "un", "ser", "se"),
"fr" -> Set("le", "de", "un", "à", "être", "et", "en", "avoir", "que", "pour")
)
/**
* Advanced word extraction
*/
def extractWordsAdvanced(
text: String,
config: Config = Config()
): List[(String, Int)] = { // (word, character offset)
// Build the word regex
val wordRegex = if (config.includeNumbers) {
"""[\p{L}\p{N}']+""".r
} else {
"""[\p{L}']+""".r
}
// Resolve the stop-word set
val stopWords = if (config.filterStopWords) {
stopWordsByLang.getOrElse(config.language, Set.empty[String])
} else {
Set.empty[String]
}
// Extract words together with their positions
wordRegex.findAllMatchIn(text)
.map { m =>
val word = m.matched
val normalized = if (config.ignoreCase) word.toLowerCase else word
(normalized, m.start)
}
.filter { case (word, _) =>
word.length >= config.minWordLength &&
!stopWords.contains(word)
}
.toList
}
/**
* Produce detailed per-word statistics
*/
def analyzeText(text: String, config: Config = Config()): List[WordStats] = {
val wordOccurrences = extractWordsAdvanced(text, config)
val totalWords = wordOccurrences.length
// Group occurrences by word
val grouped = wordOccurrences.groupBy(_._1)
// Compute the statistics
grouped.map { case (word, occurrences) =>
val freq = occurrences.length
val percentage = (freq.toDouble / totalWords) * 100
val positions = occurrences.map(_._2)
WordStats(word, freq, percentage, positions.sorted)
}.toList
}
/**
* Analyze a file and build a report
*/
def analyzeFile(filePath: String, config: Config = Config()): TextAnalysisReport = {
val source = Source.fromFile(filePath)
try {
val text = source.mkString
val wordStats = analyzeText(text, config)
TextAnalysisReport(
filePath = filePath,
totalWords = wordStats.map(_.frequency).sum,
uniqueWords = wordStats.length,
mostFrequent = wordStats.sortBy(-_.frequency).take(10),
leastFrequent = wordStats.filter(_.frequency == 1).take(10),
vocabularyDensity = wordStats.length.toDouble / wordStats.map(_.frequency).sum,
averageWordLength = wordStats.map(s => s.word.length * s.frequency).sum.toDouble /
wordStats.map(_.frequency).sum
)
} finally {
source.close()
}
}
/**
* Process every .txt file in a directory
*/
def analyzeDirectory(directoryPath: String, config: Config = Config()):
Map[String, TextAnalysisReport] = {
val dir = new File(directoryPath)
if (!dir.exists() || !dir.isDirectory) {
throw new IllegalArgumentException(s"$directoryPath is not a valid directory")
}
val textFiles = dir.listFiles()
.filter(_.isFile)
.filter(_.getName.endsWith(".txt"))
.map(_.getAbsolutePath)
textFiles.par // process files in parallel (uses the import at the top)
.map(filePath => filePath -> analyzeFile(filePath, config))
.seq
.toMap
}
}
case class TextAnalysisReport(
filePath: String,
totalWords: Int,
uniqueWords: Int,
mostFrequent: List[WordStats],
leastFrequent: List[WordStats],
vocabularyDensity: Double, // vocabulary density = unique words / total words
averageWordLength: Double
)
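A brief sketch exercising analyzeText with the default Config (case-insensitive, stop words filtered):
object AdvancedWordCounterDemo {
  def main(args: Array[String]): Unit = {
    val counter = new AdvancedWordCounter
    val stats = counter.analyzeText("to be or not to be, that is the question")
    // stop words like "to", "be", "that", "the" are filtered by the default Config
    stats.sortBy(-_.frequency).foreach { s =>
      println(f"${s.word}%-10s ${s.frequency}%3d ${s.percentage}%6.2f%% at offsets ${s.positions.mkString(", ")}")
    }
  }
}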
3.3 Streaming Processing (Large Files)
import scala.io.Source
import java.util.concurrent.atomic.AtomicLong
import scala.collection.concurrent.TrieMap
import scala.collection.parallel.CollectionConverters._ // needed for .par (separate module since Scala 2.13)
class StreamingWordCounter(bufferSize: Int = 8192) { // bufferSize is currently unused, reserved for tuning
private val wordPattern = """\b[\p{L}']+\b""".r
/**
* Stream through a large file
*/
def countWordsStreaming(filePath: String): TrieMap[String, AtomicLong] = {
val wordCounts = TrieMap.empty[String, AtomicLong]
val source = Source.fromFile(filePath)
val lines = source.getLines()
// process each batch of lines in parallel
lines.grouped(1000) // batches of 1000 lines
.foreach { batch =>
batch.par.foreach { line =>
processLine(line, wordCounts)
}
}
source.close()
wordCounts
}
private def processLine(
line: String,
wordCounts: TrieMap[String, AtomicLong]
): Unit = {
wordPattern.findAllIn(line.toLowerCase)
.foreach { word =>
// getOrElseUpdate is atomic on TrieMap, so concurrent batches are safe;
// the line is already lowercased above
wordCounts.getOrElseUpdate(word, new AtomicLong(0)).incrementAndGet()
}
}
/**
* Count words while reporting progress
*/
def countWordsWithProgress(filePath: String): Map[String, Long] = {
import scala.io.Codec
implicit val codec: Codec = Codec.UTF8
val wordCounts = TrieMap.empty[String, AtomicLong]
val totalLines = new AtomicLong(0)
val startTime = System.currentTimeMillis()
val source = Source.fromFile(filePath)
val lines = source.getLines()
// process lines, reporting progress as we go
lines.zipWithIndex.foreach { case (line, index) =>
processLine(line, wordCounts)
totalLines.incrementAndGet()
// report every 10,000 lines
if (index % 10000 == 0 && index > 0) {
val elapsed = (System.currentTimeMillis() - startTime) / 1000.0
println(f"Processed $index lines in $elapsed%.2f seconds")
}
}
source.close()
val endTime = System.currentTimeMillis()
val totalTime = (endTime - startTime) / 1000.0
println(f"Total processing time: $totalTime%.2f seconds")
println(s"Total lines: ${totalLines.get()}")
println(s"Unique words: ${wordCounts.size}")
// snapshot the atomic counters into an immutable Map
wordCounts.map { case (word, count) => word -> count.get() }.toMap
}
}
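Typical usage of the streaming counter; data/large.txt is a hypothetical input:
object StreamingWordCounterDemo {
  def main(args: Array[String]): Unit = {
    val counter = new StreamingWordCounter()
    val counts = counter.countWordsWithProgress("data/large.txt") // hypothetical path
    counts.toList.sortBy(-_._2).take(10).foreach {
      case (word, count) => println(f"$word%-15s $count%8d")
    }
  }
}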
3.4 Outputting and Visualizing the Results
import java.io.{PrintWriter, File}
import scala.util.{Try, Success, Failure}
class WordCountReporter {
/**
* Generate reports in several formats
*/
def generateReport(
wordCounts: Map[String, Int],
outputDir: String,
reportName: String
): Try[Unit] = Try {
// ensure the output directory exists
val dir = new File(outputDir)
if (!dir.exists()) dir.mkdirs()
// 1. Plain-text report
writeTextReport(wordCounts, s"$outputDir/$reportName.txt")
// 2. CSV report
writeCSVReport(wordCounts, s"$outputDir/$reportName.csv")
// 3. JSON report
writeJSONReport(wordCounts, s"$outputDir/$reportName.json")
// 4. HTML report
writeHTMLReport(wordCounts, s"$outputDir/$reportName.html")
// 5. Summary statistics
writeSummary(wordCounts, s"$outputDir/${reportName}_summary.txt")
}
private def writeTextReport(wordCounts: Map[String, Int], filePath: String): Unit = {
val writer = new PrintWriter(new File(filePath))
try {
writer.println("WORD FREQUENCY REPORT")
writer.println("=" * 50)
writer.println()
// sort by descending count, then alphabetically
val sorted = wordCounts.toList
.sortBy { case (word, count) => (-count, word) }
sorted.foreach { case (word, count) =>
writer.println(f"$word%-20s $count%6d")
}
} finally {
writer.close()
}
}
private def writeCSVReport(wordCounts: Map[String, Int], filePath: String): Unit = {
val writer = new PrintWriter(new File(filePath))
try {
writer.println("word,count,frequency_percentage")
val total = wordCounts.values.sum.toDouble
val sorted = wordCounts.toList.sortBy(-_._2)
sorted.foreach { case (word, count) =>
val percentage = (count / total) * 100
writer.println(f""""$word",$count,$percentage%.4f%%""")
}
} finally {
writer.close()
}
}
private def writeJSONReport(wordCounts: Map[String, Int], filePath: String): Unit = {
// json4s is a third-party library (org.json4s); it must be on the classpath
import org.json4s._
import org.json4s.jackson.Serialization.write
implicit val formats: DefaultFormats = DefaultFormats
val report = Map(
"metadata" -> Map(
"total_words" -> wordCounts.values.sum,
"unique_words" -> wordCounts.size,
"generated_at" -> java.time.LocalDateTime.now().toString
),
"word_counts" -> wordCounts,
"top_20_words" -> wordCounts.toList.sortBy(-_._2).take(20)
)
val json = write(report)
val writer = new PrintWriter(new File(filePath))
try {
writer.write(json)
} finally {
writer.close()
}
}
private def writeHTMLReport(wordCounts: Map[String, Int], filePath: String): Unit = {
val writer = new PrintWriter(new File(filePath))
try {
val topWords = wordCounts.toList.sortBy(-_._2).take(50)
writer.println("""<!DOCTYPE html>
<html>
<head>
<title>Word Frequency Report</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #4CAF50; color: white; }
tr:nth-child(even) { background-color: #f2f2f2; }
.bar { background-color: #4CAF50; height: 20px; }
</style>
</head>
<body>
<h1>Word Frequency Analysis</h1>
<p>Total unique words: """ + wordCounts.size + """</p>
<p>Total word occurrences: """ + wordCounts.values.sum + """</p>
<h2>Top 50 Words</h2>
<table>
<tr>
<th>Rank</th>
<th>Word</th>
<th>Count</th>
<th>Frequency</th>
<th>Visualization</th>
</tr>""")
val maxCount = topWords.headOption.map(_._2).getOrElse(1)
topWords.zipWithIndex.foreach { case ((word, count), index) =>
val percentage = (count.toDouble / wordCounts.values.sum) * 100
val barWidth = (count.toDouble / maxCount) * 100
writer.println(s"""
<tr>
<td>${index + 1}</td>
<td>$word</td>
<td>$count</td>
<td>${"%.2f".format(percentage)}%</td>
<td>
<div class="bar" style="width: ${barWidth}%"></div>
</td>
</tr>""")
}
writer.println("""
</table>
</body>
</html>""")
} finally {
writer.close()
}
}
private def writeSummary(wordCounts: Map[String, Int], filePath: String): Unit = {
val writer = new PrintWriter(new File(filePath))
try {
val sorted = wordCounts.toList.sortBy(-_._2)
val totalWords = wordCounts.values.sum
writer.println("TEXT ANALYSIS SUMMARY")
writer.println("=" * 50)
writer.println()
writer.println(s"Total word occurrences: $totalWords")
writer.println(s"Unique words: ${wordCounts.size}")
writer.println(f"Vocabulary density: ${wordCounts.size.toDouble / totalWords}%.4f")
writer.println()
writer.println("TOP 20 WORDS:")
writer.println("-" * 30)
sorted.take(20).foreach { case (word, count) =>
val percentage = (count.toDouble / totalWords) * 100
writer.println(f"$word%-15s $count%6d (${percentage}%.2f%%)")
}
writer.println()
writer.println("STATISTICS:")
writer.println("-" * 30)
writer.println(f"Average word length: ${calculateAverageLength(wordCounts)}%.2f")
writer.println(s"Most common word: ${sorted.headOption.map(_._1).getOrElse("N/A")}")
writer.println(s"Least common words: ${sorted.takeRight(5).map(_._1).mkString(", ")}")
writer.println(s"Words occurring once: ${sorted.count(_._2 == 1)}")
} finally {
writer.close()
}
}
private def calculateAverageLength(wordCounts: Map[String, Int]): Double = {
val totalLength = wordCounts.map { case (word, count) => word.length * count }.sum
val totalOccurrences = wordCounts.values.sum
totalLength.toDouble / totalOccurrences
}
}
3.5 Usage Example
import scala.util.{Success, Failure}
object WordCountApplication {
def main(args: Array[String]): Unit = {
if (args.length < 1) {
println("Usage: scala WordCountApplication <input-file> [output-dir]")
System.exit(1)
}
val inputFile = args(0)
val outputDir = if (args.length > 1) args(1) else "./output"
// create the counter and reporter instances
val counter = new AdvancedWordCounter()
val reporter = new WordCountReporter()
try {
// analyze the input file
val report = counter.analyzeFile(inputFile)
// print a summary to the console
println("=" * 60)
println("TEXT ANALYSIS RESULTS")
println("=" * 60)
println(s"File: ${report.filePath}")
println(s"Total words: ${report.totalWords}")
println(s"Unique words: ${report.uniqueWords}")
println(f"Vocabulary density: ${report.vocabularyDensity}%.4f")
println(f"Average word length: ${report.averageWordLength}%.2f")
println()
println("TOP 10 MOST FREQUENT WORDS:")
println("-" * 40)
report.mostFrequent.foreach { stats =>
println(f"${stats.word}%-15s ${stats.frequency}%6d (${stats.percentage}%.2f%%)")
}
// generate the detailed reports (note: only the top-10 words are exported here)
val wordCounts = report.mostFrequent.map(s => s.word -> s.frequency).toMap
reporter.generateReport(wordCounts, outputDir, "word_analysis") match {
case Success(_) =>
println(s"\nReports generated successfully in: $outputDir")
case Failure(exception) =>
println(s"\nError generating reports: ${exception.getMessage}")
}
} catch {
case e: Exception =>
println(s"Error processing file: ${e.getMessage}")
e.printStackTrace()
}
}
// Batch-processing example
def batchProcess(directory: String): Unit = {
val counter = new AdvancedWordCounter()
println(s"Processing all text files in: $directory")
println("=" * 60)
val reports = counter.analyzeDirectory(directory)
reports.foreach { case (filePath, report) =>
println(s"\nFile: ${new java.io.File(filePath).getName}")
println(s" Total words: ${report.totalWords}")
println(s" Unique words: ${report.uniqueWords}")
println(s" Top word: ${report.mostFrequent.headOption.map(_.word).getOrElse("N/A")}")
}
// aggregate a summary across all files
val summary = reports.values.foldLeft(SummaryStatistics()) { (acc, report) =>
acc.copy(
totalFiles = acc.totalFiles + 1,
totalWords = acc.totalWords + report.totalWords,
totalUniqueWords = acc.totalUniqueWords + report.uniqueWords
)
}
println("\n" + "=" * 60)
println("SUMMARY ACROSS ALL FILES:")
println(s"Total files processed: ${summary.totalFiles}")
println(s"Total words: ${summary.totalWords}")
println(s"Average words per file: ${summary.totalWords / summary.totalFiles}")
println(s"Total unique words across all files: ${summary.totalUniqueWords}")
}
}
case class SummaryStatistics(
totalFiles: Int = 0,
totalWords: Int = 0,
totalUniqueWords: Int = 0
)
4. Performance Optimization Tips
4.1 Memory Optimization
// Stream the file in chunks to avoid loading it all into memory
def processLargeFile(filePath: String): Unit = {
import scala.io.Source
val wordCounts = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
val source = Source.fromFile(filePath)
try {
// process 10,000 lines at a time
source.getLines()
.grouped(10000)
.foreach { chunk =>
chunk.foreach { line =>
line.toLowerCase.split("\\W+")
.filter(_.length > 1)
.foreach { word =>
wordCounts(word) += 1
}
}
// optional: periodically persist and clear partial results
if (wordCounts.size > 100000) {
// spill partial counts to disk (see the sketch after this block)
savePartialResults(wordCounts.toMap)
wordCounts.clear()
}
}
} finally {
source.close()
}
}
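savePartialResults is referenced above but not defined; a minimal placeholder might spill the partial counts to an append-only file, assuming the merge step happens elsewhere:
// Hypothetical placeholder: spill partial counts to disk so memory can be reclaimed.
// Merging the spill files back into a final result is left out of this sketch.
def savePartialResults(partial: Map[String, Int]): Unit = {
  import java.nio.file.{Files, Paths, StandardOpenOption}
  import java.nio.charset.StandardCharsets
  val lines = partial.map { case (word, count) => s"$word\t$count" }.mkString("", "\n", "\n")
  Files.write(
    Paths.get("output/partial_counts.tsv"), // hypothetical spill location; directory must exist
    lines.getBytes(StandardCharsets.UTF_8),
    StandardOpenOption.CREATE,
    StandardOpenOption.APPEND
  )
}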
4.2 Parallel Processing
import scala.collection.parallel.CollectionConverters._ // separate module since Scala 2.13; see the dependency note below
def parallelWordCount(filePath: String): Map[String, Int] = {
import scala.io.Source
val source = Source.fromFile(filePath)
val lines = try {
source.getLines().toVector // materialize as a Vector so it can be parallelized
} finally {
source.close()
}
lines.par // convert to a parallel collection
.flatMap(_.toLowerCase.split("\\W+"))
.filter(_.length > 1)
.groupBy(identity)
.map { case (word, occurrences) => word -> occurrences.size }
.seq // back to a sequential collection
.toMap
}
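Since Scala 2.13, the parallel collections live outside the standard library, so .par needs both the import above and a build dependency. An indicative sbt line (the version shown is an assumption and may differ):
// build.sbt
libraryDependencies += "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4"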
4.3 Caching Optimization
import scala.collection.mutable
class CachedWordCounter {
// cache keyed by path plus last-modified timestamp, so file edits invalidate stale entries
private val cache = mutable.Map[String, Map[String, Int]]()
def countWordsWithCache(filePath: String): Map[String, Int] = {
val lastModified = new java.io.File(filePath).lastModified()
cache.getOrElseUpdate(s"$filePath@$lastModified", {
// NOTE: only the report's top words are cached here
new AdvancedWordCounter().analyzeFile(filePath).mostFrequent
.map(s => s.word -> s.frequency)
.toMap
})
}
}
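Usage sketch: repeated calls for an unchanged file hit the cache, while touching the file invalidates the entry (the path is hypothetical):
object CachedWordCounterDemo {
  def main(args: Array[String]): Unit = {
    val cached = new CachedWordCounter
    val first = cached.countWordsWithCache("data/sample.txt")  // computed and cached
    val second = cached.countWordsWithCache("data/sample.txt") // served from the cache
    println(first == second) // true while the file is unchanged
  }
}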
This complete implementation provides:
- Multiple ways to read and write files, covering a wide range of scenarios
- Full word-count functionality: basic counting, advanced analysis, and streaming processing
- Rich output formats: plain text, CSV, JSON, and HTML
- Performance optimizations: memory management, parallel processing, and caching
- Error handling and resource management with Try, Using, and friends
- An extensible design that makes new features easy to add