搜狗搜索日志分析

196 阅读1分钟

分词工具:HanLP(在 pom.xml 中添加如下 Maven 依赖)

图片.png

<!-- HanLP dependency: provides the HanLP.segment call used by the Scala snippets for word segmentation. -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.7</version>
</dependency>

先写一个简单的 HanLP 分词示例(HanLPDemo):

/**
  * Minimal HanLP walkthrough: segments a bracketed sample string, the same string with the
  * brackets removed, and finally the query field extracted from one sample log line.
  * Every intermediate result is printed to stdout.
  *
  * @param args command-line arguments (unused)
  */
def main(args: Array[String]): Unit = {
  import scala.collection.JavaConverters._

  // Regex matching a literal '[' or ']'. The backslashes must be doubled: "\[" is not a
  // legal escape inside an ordinary Scala string literal and is rejected by the compiler.
  val brackets = "\\[|\\]"

  val word = "[HanLP入门案例]"
  val terms = HanLP.segment(word)
  println(terms)
  println(terms.asScala.map(_.word))

  val str = word.replaceAll(brackets, "")
  println(str)
  println(HanLP.segment(str).asScala.map(_.word))

  val log = """00:00:00 2982199073774412    [360安全卫士]   8 3 download.it.com.cn/softweb/software/firewall/antivirus/20067/17938.html"""

  // The third whitespace-separated field of the log line is the bracketed query.
  val logWord = log.split("\\s+")(2).replaceAll(brackets, "")
  println(logWord)
  println(HanLP.segment(logWord).asScala.map(_.word))
}

搜狗日志分析

package com.spark.day3

import com.hankcs.hanlp.HanLP
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Spark job that analyses a sample of the Sogou search log.
  *
  * Reads `data/input/SogouQ.sample`, parses every line into a [[sougouLogDemo.SouGouRecord]]
  * and prints three top-10 rankings: hot search words, hot search words per user, and the
  * number of searches per `HH:mm` time slot.
  *
  * @author yogurt
  * @note created 2022/11/18 23:19
  */
object sougouLogDemo {

  import scala.collection.JavaConverters._
  import scala.util.Try

  /** Number of entries printed for each ranking. */
  private val TopN = 10

  /** Tokens that are excluded from the word rankings. */
  private val IgnoredTokens = Set(".", "+")

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("spark-sougou").setMaster("local[*]")
    val context = new SparkContext(conf)
    context.setLogLevel("warn")

    try {
      // The parsed records feed three independent jobs, so cache them instead of
      // re-reading and re-parsing the input file for every ranking.
      val records = context
        .textFile("data/input/SogouQ.sample")
        .flatMap(parseRecord(_).toList)
        .cache()

      // 1. Hot search words.
      val hotWords = records
        .flatMap(record => segment(record.queryWords))
        .map((_, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(TopN)

      // 2. Hot search words per user: the key is (userId, word).
      //    Each segmented word is paired with the user, not the whole query string.
      val hotWordsPerUser = records
        .flatMap(record => segment(record.queryWords).map(word => (record.userId, word)))
        .map((_, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(TopN)

      // 3. Search volume per time slot: the first five characters of HH:mm:ss are HH:mm.
      //    `take` (unlike `substring`) does not throw on a value shorter than five characters.
      val busiestSlots = records
        .map(record => (record.queryTime.take(5), 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(TopN)

      println("-------------------热门搜索词---------------------")
      hotWords.foreach(println)

      println("-------------------用户热门搜索词(带上用户id)---------------------")
      hotWordsPerUser.foreach(println)

      println("-------------------各个时间段搜索热度---------------------")
      busiestSlots.foreach(println)
    } finally {
      // Always release the Spark resources, even when one of the jobs fails.
      context.stop()
    }
  }

  /**
    * Parses one whitespace-separated log line.
    *
    * @param line raw line of the log file
    * @return the parsed record, or `None` when the line does not have exactly six fields or
    *         one of the two rank fields is not an integer; such lines are skipped so that a
    *         single malformed line cannot fail the whole job
    */
  private def parseRecord(line: String): Option[SouGouRecord] =
    line.split("\\s+") match {
      case Array(queryTime, userId, queryWords, resultRank, clickRank, clickUrl) =>
        Try(SouGouRecord(queryTime, userId, queryWords, resultRank.toInt, clickRank.toInt, clickUrl)).toOption
      case _ => None
    }

  /**
    * Strips the square brackets from a query and segments it with HanLP.
    *
    * @param queryWords query text as stored in the log, e.g. `[some query]`
    * @return the segmented words, without the tokens listed in `IgnoredTokens`
    */
  private def segment(queryWords: String): List[String] = {
    // Backslashes are doubled: "\[" is not a legal escape in an ordinary Scala string literal.
    val cleaned = queryWords.replaceAll("\\[|\\]", "")
    HanLP.segment(cleaned).asScala.map(_.word).filterNot(IgnoredTokens.contains).toList
  }

  /**
    * One user search/click record of the log.
    *
    * @param queryTime  access time, formatted as HH:mm:ss
    * @param userId     user ID
    * @param queryWords query text
    * @param resultRank rank of the URL within the returned results
    * @param clickRank  sequence number of the user's click
    * @param clickUrl   URL the user clicked
    */
  case class SouGouRecord( queryTime: String,
                          userId: String,
                          queryWords: String,
                          resultRank: Int,
                          clickRank: Int,
                          clickUrl: String)
}

图片.png

完毕!