- 小知识,大挑战!本文正在参与“程序员必备小知识”创作活动。
环境准备
// Shared SparkContext for all tests; created in beforeTest, stopped in afterTest.
var sc:SparkContext = _
/** Builds a fresh local-mode SparkContext before each test. */
@Before
def beforeTest(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("wc")
  sc = new SparkContext(conf)
}
/** Releases the SparkContext created in beforeTest after each test. */
@After
def afterTest(): Unit ={
sc.stop()
}
groupBy
将数据根据指定的规则进行分组, 分区默认不变,但是数据会被打乱重新组合
/** Word count via groupBy: every word is keyed by its own value, then each group is sized. */
@Test
def wordCountGroupBy(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val tokens = lines.flatMap(line => line.split(" "))
  // groupBy(identity) yields (word, all occurrences of that word)
  val grouped: RDD[(String, Iterable[String])] = tokens.groupBy(identity)
  val counts: RDD[(String, Int)] = grouped.mapValues(_.size)
  counts.collect().foreach(println)
}
groupByKey
将数据源的数据根据 key 对 value 进行分组
/** Word count via groupByKey: values are grouped per key, then each group is sized. */
@Test
def wordCountGroupByKey(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
  // groupByKey collects every 1 emitted for a word into one Iterable
  val grouped: RDD[(String, Iterable[Int])] = pairs.groupByKey()
  val counts: RDD[(String, Int)] = grouped.mapValues(_.size)
  counts.collect().foreach(println)
}
reduceByKey
可以将数据按照相同的 Key 对 Value 进行聚合
/** Word count via reduceByKey: per-key values are summed with map-side combining. */
@Test
def wordCountReduceByKey(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
  val counts: RDD[(String, Int)] = pairs.reduceByKey((a, b) => a + b)
  counts.collect().foreach(println)
}
aggregateByKey
将数据根据不同的规则进行分区内计算和分区间计算
/** Word count via aggregateByKey: separate intra- and inter-partition functions (here identical sums). */
@Test
def wordCountAggregateByKey(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
  val counts: RDD[(String, Int)] = pairs.aggregateByKey(0)(
    (acc, v) => acc + v, // within a partition
    (a, b) => a + b      // across partitions
  )
  counts.collect().foreach(println)
}
foldByKey
当分区内计算规则和分区间计算规则相同时,aggregateByKey 就可以简化为 foldByKey
def wordCountFoldByKey(): Unit = {
val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
val words = rdd.flatMap(_.split(" "))
val wordOne = words.map((_,1))
val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_+_)
wordCount.collect().foreach(println)
}
combineByKey
最通用的对 key-value 型 rdd 进行聚集操作的聚集函数(aggregation function)
/**
 * Word count via combineByKey, the most general key-value aggregation:
 * createCombiner / mergeValue (within a partition) / mergeCombiners (across partitions).
 */
@Test
def wordCountCombineByKey(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
  val counts: RDD[(String, Int)] = pairs.combineByKey(
    (v: Int) => v,                 // first value for a key becomes the combiner
    (acc: Int, v: Int) => acc + v, // fold further values into the partition-local combiner
    (a: Int, b: Int) => a + b      // merge combiners from different partitions
  )
  counts.collect().foreach(println)
}
countByKey
统计每种 key 的个数
/** Word count via countByKey: counts occurrences of each key as a driver-side Map (action). */
@Test
def wordCountCountByKey(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
  val counts: collection.Map[String, Long] = pairs.countByKey()
  println(counts)
}
countByValue
统计每种 value 的个数
/** Word count via countByValue: counts occurrences of each element directly, no (word, 1) pairing needed. */
@Test
def wordCountCountByValue(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val tokens = lines.flatMap(_.split(" "))
  val counts: collection.Map[String, Long] = tokens.countByValue()
  println(counts)
}
reduce
聚集 RDD 中的所有元素,先聚合分区内数据,再聚合分区间数据
/**
 * Word count via the reduce action: each word becomes a one-entry mutable Map,
 * and reduce merges the maps pairwise (partition-local first, then across partitions).
 */
@Test
def wordCountReduce(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val singletons = lines
    .flatMap(_.split(" "))
    .map(word => mutable.Map[String, Long](word -> 1L))
  val wordCount = mapMerge(singletons)
  println(wordCount)
}

/** Reduces an RDD of per-word singleton maps into one combined count map (accumulates into the left map). */
private def mapMerge(singletons: RDD[mutable.Map[String, Long]]): mutable.Map[String, Long] =
  singletons.reduce { (acc, other) =>
    other.foreach { case (word, count) =>
      acc.update(word, acc.getOrElse(word, 0L) + count)
    }
    acc
  }
fold
折叠操作,aggregate 的简化版操作
/**
 * Word count via the fold action — aggregate's simplified form, with one merge
 * function and a zero value applied both per partition and across partitions.
 */
@Test
def wordCountFold(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val singletons = lines
    .flatMap(_.split(" "))
    .map(word => mutable.Map[String, Long](word -> 1L))
  // NOTE(review): the zero value is a mutable map that the merge function
  // updates in place; kept exactly as the original wrote it.
  val wordCount = singletons.fold(mutable.Map("Hello" -> 0L)) { (acc, other) =>
    other.foreach { case (word, count) =>
      acc.update(word, acc.getOrElse(word, 0L) + count)
    }
    acc
  }
  println(wordCount)
}
aggregate
分区的数据通过初始值和分区内的数据进行聚合,然后再和初始值进行分区间的数据聚合
/**
 * Word count via the aggregate action: a zero value plus an intra-partition
 * function and an inter-partition function. Both functions are the same map
 * merge here, so it is defined once and passed twice.
 */
@Test
def wordCountAggregate(): Unit = {
  val lines = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val singletons = lines
    .flatMap(_.split(" "))
    .map(word => mutable.Map[String, Long](word -> 1L))
  // Folds every entry of `other` into `acc` in place and returns `acc`.
  val merge = (acc: mutable.Map[String, Long], other: mutable.Map[String, Long]) => {
    other.foreach { case (word, count) =>
      acc.update(word, acc.getOrElse(word, 0L) + count)
    }
    acc
  }
  val wordCount = singletons.aggregate(mutable.Map("Hello" -> 0L))(merge, merge)
  println(wordCount)
}