分词工具
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.7</version>
</dependency>
简单写个HanLPDemo
/**
 * Minimal HanLP demo: segments a bracketed sample string, the same string with the
 * square brackets stripped, and the query field extracted from one sample log line,
 * printing the result of each step.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  // Needed for `.asScala` on the java.util.List returned by HanLP.segment.
  import scala.collection.JavaConverters._

  val word = "[HanLP入门案例]"
  val terms = HanLP.segment(word)
  println(terms)
  println(terms.asScala.map(_.word))

  // Backslashes must be doubled: "\[" is not a valid escape in a Scala string literal.
  val str = word.replaceAll("\\[|\\]", "")
  println(str)
  println(HanLP.segment(str).asScala.map(_.word))

  val log = """00:00:00 2982199073774412 [360安全卫士] 8 3 download.it.com.cn/softweb/software/firewall/antivirus/20067/17938.html"""
  // The third whitespace-separated field is the bracketed query text.
  val logWord = log.split("\\s+")(2).replaceAll("\\[|\\]", "")
  println(logWord)
  println(HanLP.segment(logWord).asScala.map(_.word))
}
搜狗日志分析
package com.spark.day3
import com.hankcs.hanlp.HanLP
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Sogou search-log analysis on Spark.
 *
 * Reads `data/input/SogouQ.sample`, parses every line into a [[sougouLogDemo.SouGouRecord]]
 * and prints three top-10 reports: hot search words, hot search words per user, and
 * search volume per time bucket.
 *
 * @author yogurt
 * Created: 2022/11/18 23:19
 */
object sougouLogDemo {

  /**
   * Strips the square brackets from a query field and segments it with HanLP.
   *
   * @param query raw query field, e.g. `[360安全卫士]`
   * @return the segmented words in order
   */
  private def segmentQuery(query: String): List[String] = {
    import scala.collection.JavaConverters._
    // Backslashes must be doubled: "\[" is not a valid escape in a Scala string literal.
    val stripped = query.replaceAll("\\[|\\]", "")
    HanLP.segment(stripped).asScala.map(_.word).toList
  }

  /** Tokens "." and "+" are excluded from the word statistics. */
  private def isCountable(token: String): Boolean =
    token != "." && token != "+"

  /**
   * Runs the three statistics and prints their top-10 results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("spark-sougou").setMaster("local[*]")
    val context = new SparkContext(conf)
    context.setLogLevel("warn")

    try {
      val lines = context.textFile("data/input/SogouQ.sample")

      // Parse each whitespace-separated line into a record. The RDD feeds three separate
      // jobs below, so it is cached to avoid re-reading and re-parsing the file each time.
      val records = lines.map { line =>
        val fields = line.split("\\s+")
        SouGouRecord(fields(0), fields(1), fields(2), fields(3).toInt, fields(4).toInt, fields(5))
      }.cache()

      // 1. Hot search words: occurrences of each segmented word.
      val topWords = records
        .flatMap(record => segmentQuery(record.queryWords))
        .filter(isCountable)
        .map((_, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(10)

      // 2. Hot search words per user: occurrences of each (userId, word) pair.
      //    The pair must carry the segmented word, not the whole query string; otherwise
      //    every query is counted once per token and the "."/"+" filter never matches.
      val topUserWords = records
        .flatMap { record =>
          val userId = record.userId
          segmentQuery(record.queryWords).map(token => (userId, token))
        }
        .filter { case (_, token) => isCountable(token) }
        .map((_, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(10)

      // 3. Search volume per time bucket: the first five characters of the query time
      //    ("HH:mm" given the documented HH:mm:ss format).
      val topTimeBuckets = records
        .map(record => (record.queryTime.substring(0, 5), 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(10)

      println("-------------------热门搜索词---------------------")
      topWords.foreach(println)
      println("-------------------用户热门搜索词(带上用户id)---------------------")
      topUserWords.foreach(println)
      println("-------------------各个时间段搜索热度---------------------")
      topTimeBuckets.foreach(println)
    } finally {
      // Always release the Spark context, even when a job fails.
      context.stop()
    }
  }

  /**
   * One user search/click record from the Sogou query log.
   *
   * @param queryTime  access time, formatted as HH:mm:ss
   * @param userId     user ID
   * @param queryWords query words
   * @param resultRank rank of the URL in the returned results
   * @param clickRank  sequence number of the user's click
   * @param clickUrl   URL the user clicked
   */
  case class SouGouRecord(queryTime: String,
                          userId: String,
                          queryWords: String,
                          resultRank: Int,
                          clickRank: Int,
                          clickUrl: String)
}
完毕!