Spark WordCount with Different Operators


Environment setup

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.{After, Before, Test}

import scala.collection.mutable

// SparkContext shared by all the tests below
var sc: SparkContext = _

@Before
def beforeTest(): Unit = {
  val sparkConf = new SparkConf().setMaster("local").setAppName("wc")
  sc = new SparkContext(sparkConf)
}

@After
def afterTest(): Unit = {
  sc.stop()
}
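
These tests assume spark-core and JUnit 4 are already on the classpath. A minimal sbt dependency sketch might look like the following (the version numbers are assumptions, not taken from the original post; match them to your environment):

// build.sbt sketch; versions are placeholders
scalaVersion := "2.12.18"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "3.3.2",
  "junit"            %  "junit"      % "4.13.2" % Test
)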

groupBy

Groups the data according to a user-specified rule. The number of partitions stays the same by default, but the data is shuffled and regrouped.

@Test
def wordCountGroupBy(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val group: RDD[(String, Iterable[String])] = words.groupBy(word=>word)
  val wordCount: RDD[(String, Int)] = group.mapValues(iter=>iter.size)
  wordCount.collect().foreach(println)
}
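
Because groupBy accepts an arbitrary grouping rule rather than requiring the element itself, the same words RDD could just as well be grouped by, say, its first letter. A minimal sketch (the byInitial name is made up for illustration):

// Hypothetical variation: group words by their first character instead of the word itself
val byInitial: RDD[(Char, Iterable[String])] = words.groupBy(word => word.charAt(0))
byInitial.collect().foreach(println)
// For the two input lines this prints something like:
// (H,CompactBuffer(Hello, Hello))
// (S,CompactBuffer(Scala, Spark))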

groupByKey

Groups the values of the source data by key.

@Test
def wordCountGroupByKey(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val wordOne = words.map((_,1))
  val group: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
  val wordCount: RDD[(String, Int)] = group.mapValues(iter=>iter.size)
  wordCount.collect().foreach(println)
}

reduceByKey

Aggregates the values of records that share the same key.

@Test
def wordCountReduceByKey(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val wordOne = words.map((_,1))
  val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_+_)
  wordCount.collect().foreach(println)
}

aggregateByKey

Aggregates the data using separate rules for intra-partition and inter-partition computation.

@Test
def wordCountAggregateByKey(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val wordOne = words.map((_,1))
  val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_+_, _+_)
  wordCount.collect().foreach(println)
}
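
The wordcount case happens to use the same function in both positions. To see why aggregateByKey takes two separate functions, here is a sketch that keeps the maximum value per key inside each partition and then sums those maxima across partitions (the method name, the pairs data, and the two-partition split are made up for illustration):

@Test
def aggregateByKeyMaxThenSum(): Unit = {
  // Hypothetical data spread over 2 partitions so the two rules behave differently
  val pairs = sc.makeRDD(List(("a", 1), ("a", 2), ("b", 3), ("a", 4), ("b", 5)), 2)
  val maxThenSum: RDD[(String, Int)] = pairs.aggregateByKey(0)(
    (acc, v) => math.max(acc, v), // intra-partition: keep the max per key
    (x, y) => x + y               // inter-partition: sum the per-partition maxima
  )
  maxThenSum.collect().foreach(println)
  // Partition 0 holds ("a",1),("a",2); partition 1 holds ("b",3),("a",4),("b",5),
  // so the result is (a, 2 + 4) = (a,6) and (b,5)
}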

foldByKey

When the intra-partition rule and the inter-partition rule are the same, aggregateByKey can be simplified to foldByKey.

@Test
def wordCountFoldByKey(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val wordOne = words.map((_,1))
  val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_+_)
  wordCount.collect().foreach(println)
}

combineByKey

The most general aggregation function for key-value RDDs.

@Test
def wordCountCombineByKey(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val wordOne = words.map((_,1))
  val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
    v => v,                    // createCombiner: the first value of a key becomes the combiner
    (x: Int, y) => x + y,      // mergeValue: fold further values into the combiner within a partition
    (x: Int, y: Int) => x + y  // mergeCombiners: merge combiners across partitions
  )
  wordCount.collect().foreach(println)
}
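
What makes combineByKey the most general of these operators is that the combiner type can differ from the value type. A classic sketch is the average value per key, carrying a (sum, count) pair as the combiner (the method name and the scores data are made up for illustration):

@Test
def combineByKeyAverage(): Unit = {
  // Hypothetical data: compute the average value per key
  val scores = sc.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("b", 93)))
  val avg: RDD[(String, Double)] = scores.combineByKey(
    v => (v, 1),                                                  // createCombiner: value -> (sum, count)
    (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1),             // mergeValue within a partition
    (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)  // mergeCombiners across partitions
  ).mapValues { case (sum, count) => sum.toDouble / count }
  avg.collect().foreach(println)
  // => (a,89.5) and (b,94.0)
}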

countByKey

Counts the number of occurrences of each key.

@Test
def wordCountCountByKey(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val wordOne = words.map((_,1))
  val wordCount: collection.Map[String, Long] = wordOne.countByKey()
  println(wordCount)
}

countByValue

Counts the number of occurrences of each value.

@Test
def wordCountCountByValue(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  val wordCount: collection.Map[String, Long] = words.countByValue()
  println(wordCount)
}

reduce

Aggregates all elements of the RDD, first within each partition and then across partitions.

@Test
def wordCountReduce(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  
  // Wrap each word in a single-entry mutable Map so the elements can be merged in reduce
  val mapWord = words.map(
    word => {
      mutable.Map[String, Long]((word,1))
    }
  )

  // Merge map2 into map1, summing the count of each word
  val wordCount = mapWord.reduce(
    (map1, map2) => {
      map2.foreach{
        case (word, count) => {
          val newCount = map1.getOrElse(word, 0L) + count
          map1.update(word, newCount)
        }
      }
      map1
    }
  )
  println(wordCount)
}

fold

A fold operation: a simplified version of aggregate.

@Test
def wordCountFold(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  
  val mapWord = words.map(
    word => {
      mutable.Map[String, Long]((word,1))
    }
  )

  // fold needs a zero value of the same type as the elements; this map just seeds "Hello" with 0
  val wordCount = mapWord.fold(mutable.Map("Hello"->0L))(
    (map1, map2) => {
      map2.foreach{
        case (word, count) => {
          val newCount = map1.getOrElse(word, 0L) + count
          map1.update(word, newCount)
        }
      }
      map1
    }
  )
  println(wordCount)
}

aggregate

Each partition's data is aggregated with the zero value within the partition, and the per-partition results are then aggregated across partitions, again starting from the zero value.

@Test
def wordCountAggregate(): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))

  val mapWord = words.map(
    word => {
      mutable.Map[String, Long]((word,1))
    }
  )

  val wordCount = mapWord.aggregate(mutable.Map("Hello"->0L))(
    (map1:mutable.Map[String,Long], map2:mutable.Map[String,Long]) => {
      map2.foreach{
        case (word, count) => {
          val newCount = map1.getOrElse(word, 0L) + count
          map1.update(word, newCount)
        }
      }
      map1
    },
    (map1:mutable.Map[String,Long], map2:mutable.Map[String,Long]) => {
      map2.foreach{
        case (word, count) => {
          val newCount = map1.getOrElse(word, 0L) + count
          map1.update(word, newCount)
        }
      }
      map1
    }
  )

  println(wordCount)
}
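
The point above, that the zero value takes part both in the intra-partition step and in the final inter-partition combine, is easiest to see with plain numbers (the method name and data are made up for illustration):

@Test
def aggregateZeroValue(): Unit = {
  val nums = sc.makeRDD(List(1, 2, 3, 4), 2)
  // intra-partition: 10 + 1 + 2 = 13 and 10 + 3 + 4 = 17
  // inter-partition: 10 + 13 + 17 = 40, so the zero value is applied numPartitions + 1 times
  val sum = nums.aggregate(10)(_ + _, _ + _)
  println(sum) // 40
}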