WordCount简单案例
package scala.com.lqs.chapter02.day06
/**
 * Step-by-step word count over a small in-memory list of sentences.
 *
 * Every intermediate collection is bound to a name with an explicit type and
 * printed, so the effect of each stage (split, group, count, sort, take) can be
 * inspected. The shorter placeholder-syntax form of each stage is given in the
 * accompanying comment.
 *
 * Created 2022-01-22 09:31:56.
 *
 * @author lqs
 * @version 1.0.0
 */
object WordCount {

  /**
   * Runs the walkthrough and prints each intermediate result to standard output.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val stringList = List("Hello Scala Hbase kafka", "Hello Scala Hbase", "Hello Scala", "Hello")

    // Step 1: split every sentence on single spaces and flatten into one list of words.
    // `flatMap` expresses "map then flatten" directly instead of handing a function to
    // `flatten`'s implicit parameter.
    // Shorthand: stringList.flatMap(_.split(" "))
    val flatMapList: List[String] = stringList.flatMap((sentence: String) => sentence.split(" "))
    println(flatMapList)

    // Step 2: group equal words together.
    // Shorthand: flatMapList.groupBy(s => s)
    // The lambda cannot be shortened to a lone `_`: a placeholder that is the entire
    // argument does not expand to an identity function.
    val groupByList: Map[String, List[String]] = flatMapList.groupBy((word: String) => word)
    println(groupByList)

    // Step 3: replace each group by its size, rebuilding the (key, value) pair by hand.
    val mapList: Map[String, Int] =
      groupByList.map((entry: (String, List[String])) => (entry._1, entry._2.size))
    println(mapList)

    // When the key stays the same and only the value changes, a value-only transformation
    // is enough. `mapValues` would read similarly, but it produces a lazy view that
    // re-applies the function on every access, and from Scala 2.13 it is deprecated and no
    // longer returns a `Map`. `transform` is strict and yields a `Map` on both versions.
    // Shorthand: groupByList.transform((_, occurrences) => occurrences.size)
    val mapValuesList: Map[String, Int] =
      groupByList.transform((_: String, occurrences: List[String]) => occurrences.size)
    println(mapValuesList)

    // Step 4: order by count, highest first. A Map has no ordering, so convert to a List.
    // Shorthand: mapValuesList.toList.sortBy(_._2)(Ordering.Int.reverse)
    val tuples: List[(String, Int)] =
      mapValuesList.toList.sortBy((entry: (String, Int)) => entry._2)(Ordering.Int.reverse)
    println(tuples)

    // Step 5: keep the top three.
    val tuples1: List[(String, Int)] = tuples.take(3)
    println(tuples1)

    // Alternative to sortBy + reversed Ordering: sortWith and an explicit comparison.
    // Shorthand: mapValuesList.toList.sortWith(_._2 > _._2)
    val tuples2: List[(String, Int)] =
      mapValuesList.toList.sortWith((left: (String, Int), right: (String, Int)) => left._2 > right._2)
    println(tuples2)
    println(tuples2.take(3))

    // In day-to-day code the intermediate names are usually dropped as well and the
    // stages are chained into a single expression:
    //   stringList.flatMap(_.split(" "))
    //     .groupBy(s => s)
    //     .transform((_, occurrences) => occurrences.size)
    //     .toList
    //     .sortWith(_._2 > _._2)
    //     .take(3)
  }
}
WordCount简单复杂案例
package scala.com.lqs.chapter02.day06
/**
 * Word count over sentences that already carry an occurrence count.
 *
 * Each input element is a `(sentence, count)` pair. Two approaches are shown:
 * expanding the sentences back into raw text and recounting, and distributing
 * the existing count to every word and summing per word.
 *
 * Created 2022-01-22 11:31:11.
 *
 * @author lqs
 * @version 1.0.0
 */
object FCWC {

  /**
   * Runs both approaches and prints their intermediate and final results to
   * standard output.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val tupleList = List(("Hello Scala Spark World", 4), ("Hello Scala Spark", 3), ("Hello Scala", 2), ("Hello", 1))

    // Approach 1: reuse the plain word-count pipeline.
    // Not recommended: it undoes the aggregation that is already present in the input by
    // expanding the data and counting it again.
    // Repeat each sentence (followed by a separating space) `count` times.
    val strings: List[String] = tupleList.map { case (sentence, count) => (sentence + " ") * count }
    println(strings)

    // `transform` is used instead of `mapValues`: it is strict and returns a `Map` on both
    // Scala 2.12 and 2.13+, whereas `mapValues` is a lazy view and deprecated from 2.13.
    val result: List[(String, Int)] = strings.flatMap(_.split(" "))
      .groupBy(s => s)
      .transform((_, occurrences) => occurrences.size)
      .toList
      .sortWith(_._2 > _._2)
      .take(3)
    println(result)

    // Approach 2: split using the counts that are already known.
    // ("Hello Scala Spark World", 4) => List(("Hello",4),("Scala",4),("Spark",4),("World",4))

    // Step 1: turn each (sentence, count) pair into a list of (word, count) pairs.
    val list: List[List[(String, Int)]] = tupleList.map { case (sentence, count) =>
      val words: Array[String] = sentence.split(" ")
      words.toList.map(word => (word, count))
    }
    println(list)

    val flatten: List[(String, Int)] = list.flatten
    println(flatten)

    // `flatMap` replaces the map + flatten pair above.
    // When simplifying nested anonymous functions, simplify the inner one first and the
    // outer one afterwards.
    val flatMapList: List[(String, Int)] =
      tupleList.flatMap(tuple => tuple._1.split(" ").map((_, tuple._2)))

    // Step 2: group the pairs that share the same word.
    val groupByList: Map[String, List[(String, Int)]] = flatMapList.groupBy(_._1)
    println(groupByList)

    // Step 3: add up the counts inside each group.
    // Variant 1: project the counts, then sum them.
    val mapValuesList: Map[String, Int] = groupByList.transform((_, pairs) => pairs.map(_._2).sum)
    // Variant 2: fold the counts into an accumulator in a single pass.
    val mapValuesList2: Map[String, Int] = groupByList.transform((_, pairs) => pairs.foldLeft(0)(_ + _._2))
    // NOTE(review): the map is not sorted here, so these are just the first three entries
    // in iteration order, not necessarily the three highest counts.
    println(mapValuesList2.take(3))

    // The same approach written as one chained expression, sorted before taking three.
    val result2: List[(String, Int)] = tupleList.flatMap(tuple => tuple._1.split(" ").map((_, tuple._2)))
      .groupBy(_._1)
      .transform((_, pairs) => pairs.map(_._2).sum)
      .toList
      .sortWith(_._2 > _._2)
      .take(3)
    println(result2)

    // For reference, approach 2 with every lambda parameter type spelled out:
    //   tupleList.flatMap((tuple: (String, Int)) => tuple._1.split(" ").map((s: String) => (s, tuple._2)))
    //   flatMapList.groupBy((tuple: (String, Int)) => tuple._1)
    //   groupByList.transform((_: String, list: List[(String, Int)]) => list.map(_._2).sum)
    //   groupByList.transform((_: String, list: List[(String, Int)]) =>
    //     list.foldLeft(0)((res: Int, elem: (String, Int)) => res + elem._2))
  }
}