Scala WordCount 简单案例和较复杂案例

288 阅读3分钟

WordCount 简单案例

package scala.com.lqs.chapter02.day06

/**
 * Word-count walkthrough: tallies how often each word occurs in a few hard-coded
 * sentences and prints every intermediate collection, ending with the top three.
 *
 * @Author lqs
 * @Date 2022-01-22 09:31:56
 * @Version 1.0.0
 * @ClassName WordCount
 */
object WordCount {

  /**
   * Condensed form of the pipeline that `main` spells out step by step. Besides
   * shortening the lambdas, the intermediate names can be dropped and the calls chained.
   *
   * @param lines sentences whose words are separated by single spaces
   * @param n     maximum number of entries to return
   * @return at most `n` (word, count) pairs, highest count first
   */
  private def topWords(lines: List[String], n: Int): List[(String, Int)] =
    lines
      .flatMap(_.split(" "))
      .groupBy(word => word)
      .map { case (word, occurrences) => (word, occurrences.size) }
      .toList
      .sortWith(_._2 > _._2)
      .take(n)

  /**
   * Runs the walkthrough and prints each stage to standard output.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val stringList = List("Hello Scala Hbase kafka", "Hello Scala Hbase", "Hello Scala", "Hello")

    // Split every sentence and concatenate the pieces. flatMap is the operation meant for
    // this; flatten's only parameter is implicit evidence and should not carry the mapping.
    val words: List[String] = stringList.flatMap(_.split(" "))
    println(words)

    // `s => s` cannot be shortened to a lone `_`: once only the placeholder is left,
    // the lambda cannot be simplified any further.
    val groupByList: Map[String, List[String]] = words.groupBy(s => s)
    println(groupByList)

    // General form: rebuild each (key, value) pair with map.
    val mapList: Map[String, Int] =
      groupByList.map { case (word, occurrences) => (word, occurrences.size) }
    println(mapList)

    // When the key stays the same and only the value changes, a value-only transformation
    // is enough. `transform` builds the result eagerly and behaves the same on Scala 2.12
    // and 2.13, unlike `mapValues`, which yields a lazy view on 2.12 and a `MapView`
    // (no longer a `Map`) on 2.13.
    val mapValuesList: Map[String, Int] =
      groupByList.transform((_, occurrences) => occurrences.size)
    println(mapValuesList)

    // Sort the result and keep the first three.
    // A Map cannot be sorted, so it is converted to a List first.
    val tuples: List[(String, Int)] = mapValuesList.toList.sortBy(_._2)(Ordering.Int.reverse)
    println(tuples)

    // Take the top 3.
    val tuples1: List[(String, Int)] = tuples.take(3)
    println(tuples1)

    // Same descending sort, expressed as an explicit comparison.
    val tuples2: List[(String, Int)] = mapValuesList.toList.sortWith(_._2 > _._2)
    println(tuples2)

    // The whole pipeline again, written as one chained expression.
    println(topWords(stringList, 3))
  }
}

WordCount 较复杂案例

package scala.com.lqs.chapter02.day06

/**
 * Word count over pre-aggregated input: every entry pairs a sentence with the number of
 * times that sentence occurred. Two approaches are shown, each ending in the three most
 * frequent words.
 *
 * @Author lqs
 * @Date 2022-01-22 11:31:11
 * @Version 1.0.0
 * @ClassName FCWC
 */
object FCWC {

  /**
   * Runs both approaches and prints each stage to standard output.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val tupleList = List(("Hello Scala Spark World", 4), ("Hello Scala Spark", 3), ("Hello Scala", 2), ("Hello", 1))

    // Approach 1: reuse the plain word-count pipeline.
    // Not recommended: it works backwards by expanding data that is already aggregated,
    // only to count it all over again.
    // Each entry becomes one long string: the sentence repeated `count` times.
    val strings: List[String] = tupleList.map { case (sentence, count) => (sentence + " ") * count }
    println(strings)

    // Counting uses a strict `map` rather than `mapValues`, which is a lazy view on
    // Scala 2.12 and returns a `MapView` instead of a `Map` on 2.13.
    val result: List[(String, Int)] = strings
      .flatMap(_.split(" "))
      .groupBy(word => word)
      .map { case (word, occurrences) => (word, occurrences.size) }
      .toList
      .sortWith(_._2 > _._2)
      .take(3)
    println(result)

    // Approach 2: split each sentence and carry its pre-computed count along.
    // ("Hello Scala Spark World", 4) => List(("Hello",4),("Scala",4),("Spark",4),("World",4))
    // Step 1: turn every (sentence, count) tuple into a list of (word, count) pairs.
    val list: List[List[(String, Int)]] = tupleList.map { case (sentence, count) =>
      sentence.split(" ").toList.map(word => (word, count))
    }
    println(list)

    val flatten: List[(String, Int)] = list.flatten
    println(flatten)

    // flatMap replaces map + flatten.
    // When simplifying nested anonymous functions, simplify the inner one first,
    // then the outer one.
    val flatMapList: List[(String, Int)] =
      tupleList.flatMap(tuple => tuple._1.split(" ").map((_, tuple._2)))

    // Step 2: group the pairs that belong to the same word.
    val groupByList: Map[String, List[(String, Int)]] = flatMapList.groupBy(_._1)
    println(groupByList)

    // Step 3: add up the counts inside each group. `transform` keeps the keys and
    // rebuilds only the values, eagerly.
    // Variant 1: project the counts, then sum them.
    val countsBySum: Map[String, Int] =
      groupByList.transform((_, pairs) => pairs.map(_._2).sum)

    // Variant 2: accumulate the counts with a fold.
    val countsByFold: Map[String, Int] =
      groupByList.transform((_, pairs) => pairs.foldLeft(0)(_ + _._2))

    assert(countsBySum == countsByFold, "both aggregation variants must produce the same totals")

    // A Map has no meaningful order, so sort by count before keeping the first three.
    println(countsByFold.toList.sortWith(_._2 > _._2).take(3))

    // Approach 2 again, written as one chained expression.
    val result2: List[(String, Int)] = tupleList
      .flatMap(tuple => tuple._1.split(" ").map((_, tuple._2)))
      .groupBy(_._1)
      .map { case (word, pairs) => (word, pairs.map(_._2).sum) }
      .toList
      .sortWith(_._2 > _._2)
      .take(3)
    println(result2)
  }
}