spark
1.实例介绍
基于电商用户对母婴类目的日常行为和基础用户信息,预测潜在购买人群,部署集群每日训练并预测结果存储至hive,主要用到了spark2.0的ml机器学习算法库,将训练特征分为连续型特征(注册天数,浏览母婴商品时长等)和离散特征(近1天是否加购,年龄分段,是否母婴会员等),对连续型特征进行GBDT离散化处理,再进行归一化,卡方检验特征选择,LR模型训练等,最后利用pipeline管道结合cv交叉验证评估,对几个模型中用到的参数进行调参。将最终的参数设置为每日训练用的参数,部署到生产环境。
2.数据预处理
hive表取得训练数据:包含user_info, non_features, features, label四个字段
user_info non_features features label
0100041979#00019#母婴#00019W867#爱他美(aptamil)#R8078#奶粉 4887#4370#1#1#1#21#21#21#0#0#0 2#4#1#2#0#1#0#1#0#0#0#0#0#7#0#0#0#0#15 0
0100213395#00019#母婴#00019636B#谷朗(Goonen)#R8089#洗护用品 4744#19.2#0#0#1#0#0#24#0#0#0 1#4#1#0#0#1#0#1#0#5#0#0#0#7#0#0#0#0#15 0
0100276577#00019#母婴#000198191#亨氏(Heinz)#R8080#婴儿辅食 4690#293#0#0#3#0#0#46#0#0#0 1#5#3#0#0#1#1#1#0#4#0#0#0#7#0#0#0#0#15 0
0100469133#00019#母婴#000195981#好奇(Huggies)#R8081#纸类用品 4566#363.7#0#1#1#0#10#10#0#0#0 1#6#3#0#0#1#1#1#0#3#0#0#0#7#0#0#0#0#15 0
复制代码
3.Spark实现数据处理
3.1 GBDT Model
package gbdt.featureprocess
import org.apache.spark.mllib.linalg.{DenseVector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, FeatureType}
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, Node}
import org.apache.spark.rdd.RDD
import scala.Array.range
/**
 * GBDT training and prediction, used to turn continuous features into discrete ones.
 *
 * A gradient-boosted tree classifier is trained; each sample is then re-encoded as the
 * concatenation, over all trees, of a one-hot vector marking the leaf the sample reaches.
 *
 * Originally created by wy on 2019/6/18 14:55.
 */
class GBTPreprocessor extends Serializable {

  /**
   * Lists the ids of all leaves below `node`, left subtree before right subtree.
   *
   * @param node root of the (sub)tree to walk
   * @return leaf node ids in left-to-right order
   */
  def getLeafNode(node: Node): Array[Int] =
    if (node.isLeaf) Array(node.id)
    else getLeafNode(node.leftNode.get) ++ getLeafNode(node.rightNode.get)

  /**
   * Routes a sample down the tree and returns the id of the leaf it ends up in.
   *
   * @param node     root of the (sub)tree
   * @param features feature vector of the sample
   * @return id of the reached leaf
   */
  def gbtPredict(node: Node, features: DenseVector): Int =
    if (node.isLeaf) {
      node.id
    } else {
      val rule = node.split.get
      // rule.feature is the index of the feature this node tests
      val value = features(rule.feature)
      val goesLeft =
        if (rule.featureType == FeatureType.Continuous) value <= rule.threshold
        else rule.categories.contains(value) // listed categories are sent to the left child
      val child = if (goesLeft) node.leftNode.get else node.rightNode.get
      gbtPredict(child, features)
    }

  /**
   * Trains a GBT classifier and records the leaf ids of each of its trees.
   *
   * @param gbtTrainData labelled training samples
   * @param numTrees     number of boosting iterations (one tree per iteration)
   * @return the trained model and, per tree, the array of its leaf node ids
   */
  def gbtTrain(gbtTrainData: RDD[LabeledPoint], numTrees: Int): (GradientBoostedTreesModel, Array[Array[Int]]) = {
    val strategy = BoostingStrategy.defaultParams("Classification")
    strategy.setNumIterations(numTrees)
    val model = GradientBoostedTrees.train(gbtTrainData, strategy)
    // leafIdsPerTree(i) holds the leaf ids of tree i; a leaf's position in that array
    // is later used as its one-hot slot
    val leafIdsPerTree = new Array[Array[Int]](numTrees)
    leafIdsPerTree.indices.foreach { i =>
      leafIdsPerTree(i) = getLeafNode(model.trees(i).topNode)
    }
    (model, leafIdsPerTree)
  }

  /**
   * Encodes every sample as the concatenated one-hot leaf indicators of all trees.
   *
   * @param gbtTestData   samples as (id, (label, features))
   * @param gbtModel      trained GBT model
   * @param treeLeafArray per-tree leaf ids as returned by [[gbtTrain]]
   * @param numTrees      number of trees to use
   * @return (id, LabeledPoint(label, encoded features))
   */
  def gbtFeaturePredict(gbtTestData: RDD[(String, (Double, DenseVector))], gbtModel: GradientBoostedTreesModel,
                        treeLeafArray: Array[Array[Int]], numTrees: Int): RDD[(String, LabeledPoint)] =
    gbtTestData.map { case (id, (label, features)) =>
      val encoded = (0 until numTrees).toArray.flatMap { i =>
        val tree = gbtModel.trees(i)
        val reachedLeaf = gbtPredict(tree.topNode, features)
        // a full binary tree has (total nodes / 2) + 1 leaves
        val oneHot = new Array[Double](tree.numNodes / 2 + 1)
        oneHot(treeLeafArray(i).indexOf(reachedLeaf)) = 1
        oneHot
      }
      (id, LabeledPoint(label, Vectors.dense(encoded)))
    }
}
复制代码
3.2 Pipeline Model
package potentialusers.pronunciation
import gbdt.featureprocess.GBTPreprocessor
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature._
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
/**
 * Data processing, model training and parameter tuning.
 *
 * Originally created by wy on 2019/9/26 9:34.
 */
class ModelProcess {

  /**
   * Parses locally read text lines into the record shape used by the rest of the pipeline.
   *
   * Each line is split on tabs into (user info, non_features, features, label) and then
   * handled exactly like a Hive row, see [[hiveDataProcess]].
   *
   * @param rdd text lines with four tab-separated fields
   * @return same record shape as [[hiveDataProcess]]
   */
  def localDataProcess(rdd: RDD[String]): RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))] =
    hiveDataProcess(rdd.map { line =>
      val fields = line.split("\t")
      (fields(0), fields(1), fields(2), fields(3))
    })

  /**
   * Parses rows read from Hive into LabeledPoints and dense vectors.
   *
   * @param rdd rows as (user info, '#'-separated non_features, '#'-separated features, label)
   * @return (user info,
   *         LabeledPoint(label, features),
   *         LabeledPoint(label, non_features) used to train the GBT model,
   *         (label, non_features) used as input for the GBT discretization)
   */
  def hiveDataProcess(rdd: RDD[(String, String, String, String)]): RDD[(String, LabeledPoint, LabeledPoint,
    (Double, DenseVector))] =
    rdd.map { case (userInfo, nonFeatureText, featureText, labelText) =>
      val nonFeatures = nonFeatureText.split("#").map(_.toDouble)
      val features = featureText.split("#").map(_.toDouble)
      val label = labelText.toDouble
      (userInfo,
        LabeledPoint(label, new DenseVector(features)),
        LabeledPoint(label, new DenseVector(nonFeatures)),
        (label, new DenseVector(nonFeatures)))
    }

  /**
   * Discretizes the continuous features with a GBT model and pairs them with the
   * original discrete features.
   *
   * The GBT model is trained on the training set only and then applied to both sets.
   *
   * @param train    training records as produced by [[hiveDataProcess]]
   * @param test     test records as produced by [[hiveDataProcess]]
   * @param sqc      SQLContext used to create the DataFrames
   * @param numTrees number of GBT trees (defaults to 10)
   * @return (train, test) DataFrames with columns userInfo, label, features1 (original
   *         discrete features) and features2 (GBT leaf encoding)
   */
  def gbtFeatureProcess(train: RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))],
                        test: RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))],
                        sqc: SQLContext,
                        numTrees: Int = 10): (DataFrame, DataFrame) = {
    val gbtProcessor = new GBTPreprocessor
    val (gbtModel, treeLeafArray) = gbtProcessor.gbtTrain(train.map(_._3), numTrees)

    // Builds the feature DataFrame for one data set; both sides are keyed by
    // (userInfo, label) so the discrete features and the leaf encoding can be joined.
    def toFeatureDF(records: RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))]): DataFrame = {
      val discreteRDD = records.map(x => ((x._1, x._2.label), x._2.features.asML))
      val leafRDD = gbtProcessor
        .gbtFeaturePredict(records.map(x => (x._1, x._4)), gbtModel, treeLeafArray, numTrees)
        .map(x => ((x._1, x._2.label), x._2.features.asML))
      sqc.createDataFrame(discreteRDD.join(leafRDD).map(x => (x._1._1, x._1._2, x._2._1, x._2._2)))
        .toDF("userInfo", "label", "features1", "features2")
    }

    (toFeatureDF(train), toFeatureDF(test))
  }

  /**
   * Creates the three-stage pipeline: min-max scaling, chi-square feature selection and
   * logistic regression, configured with the production parameters.
   *
   * @return the pipeline plus the selector and LR stages, whose params the tuning grid refers to
   */
  private def buildPipeline(): (Pipeline, ChiSqSelector, LogisticRegression) = {
    // stage 1: scaler
    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
    // stage 2: chi-square feature selection
    val chiSqSelector = new ChiSqSelector()
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")
      .setNumTopFeatures(80)
      .setOutputCol("selectedFeatures")
    // stage 3: logistic regression
    val lr = new LogisticRegression()
      .setMaxIter(200)
      .setElasticNetParam(1.0) // L1/L2 mixing: 0.0 is pure L2, 1.0 is pure L1
      .setRegParam(0.00075) // regularization strength
      .setThreshold(0.3) // positive-class probability threshold (library default is 0.5)
      .setLabelCol("label")
      .setFeaturesCol("selectedFeatures")
    val pipeline = new Pipeline()
      .setStages(Array(scaler, chiSqSelector, lr))
    (pipeline, chiSqSelector, lr)
  }

  /**
   * Fits the pipeline (scaling, chi-square feature selection, logistic regression) with the
   * fixed, already tuned parameters. This is the path used in production.
   *
   * @param data training set with columns "features" and "label"
   * @return fitted pipeline model containing the three fitted stages
   */
  def pipelineTrain(data: DataFrame): PipelineModel = {
    val (pipeline, _, _) = buildPipeline()
    data.persist() // fitting makes several passes over the data
    try pipeline.fit(data)
    finally data.unpersist() // release the cache even when fitting fails
  }

  /**
   * Tunes the pipeline with a grid search evaluated by 2-fold cross-validation and returns
   * the best model found. Intended for offline tuning; production uses [[pipelineTrain]].
   *
   * @param data training set with columns "features" and "label"
   * @return the best pipeline model found by cross-validation
   */
  def pipelineCrossValidate(data: DataFrame): PipelineModel = {
    val (pipeline, chiSqSelector, lr) = buildPipeline()
    // hyper-parameter grid
    val paramGrid = new ParamGridBuilder()
      .addGrid(chiSqSelector.numTopFeatures, Array(70, 80, 90))
      .addGrid(lr.maxIter, Array(100, 150, 200))
      .addGrid(lr.elasticNetParam, Array(1.0, 0.75, 0.5, 0.25, 0.0))
      .addGrid(lr.regParam, Array(0.001, 0.00075, 0.00125))
      .build()
    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
    val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(2)
    data.persist()
    try {
      cv.fit(data).bestModel match {
        case best: PipelineModel => best
        case other =>
          throw new IllegalStateException(s"Unexpected best model type: ${other.getClass.getName}")
      }
    } finally {
      data.unpersist()
    }
  }

  /**
   * Applies the three fitted pipeline stages one by one and prints their parameters.
   *
   * The returned DataFrames are lazy; nothing is cached here because no action runs inside
   * this method. Callers that evaluate several of the results should cache `data` themselves.
   *
   * @param data          data set to score, with a "features" column
   * @param pipelineModel pipeline model whose stages are, in order, MinMaxScalerModel,
   *                      ChiSqSelectorModel and LogisticRegressionModel
   * @return (scaled data, feature-selected data, LR predictions)
   */
  def pipelinePredict(data: DataFrame, pipelineModel: PipelineModel): (DataFrame, DataFrame, DataFrame) = {
    val scalerModel = pipelineModel.stages(0).asInstanceOf[MinMaxScalerModel]
    val chiSqSelectorModel = pipelineModel.stages(1).asInstanceOf[ChiSqSelectorModel]
    val lrModel = pipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
    // explicit tuples keep the "(label,value)" console format without argument auto-tupling
    println(("特征选择个数:", chiSqSelectorModel.explainParam(chiSqSelectorModel.numTopFeatures)))
    println(("LR迭代次数:", lrModel.explainParam(lrModel.maxIter)))
    println(("LR正则化系数:", lrModel.explainParam(lrModel.regParam)))
    println(("LR分类阈值:", lrModel.explainParam(lrModel.threshold)))
    println(("L1L2正则比例:", lrModel.explainParam(lrModel.elasticNetParam)))
    println(("LR特征个数:", lrModel.numFeatures))
    val scalerData = scalerModel.transform(data) // scaling
    val selectedData = chiSqSelectorModel.transform(scalerData) // feature selection
    val predictions = lrModel.transform(selectedData) // LR prediction
    (scalerData, selectedData, predictions)
  }

  /**
   * Merges the two feature columns into one.
   *
   * @param data DataFrame containing the columns features1 and features2
   * @return the input with an additional "features" column holding both vectors concatenated
   */
  def featureAssembler(data: DataFrame): DataFrame =
    new VectorAssembler()
      .setInputCols(Array("features1", "features2"))
      .setOutputCol("features")
      .transform(data)

  /**
   * Evaluates prediction quality.
   *
   * @param data pairs of predicted and original class, passed unchanged to MulticlassMetrics
   * @return (accuracy, weighted precision, weighted recall, weighted F-measure)
   */
  def multiClassEvaluate(data: RDD[(Double, Double)]): (Double, Double, Double, Double) = {
    val metrics = new MulticlassMetrics(data)
    (metrics.accuracy, metrics.weightedPrecision, metrics.weightedRecall, metrics.weightedFMeasure)
  }
}
复制代码
3.3 主函数
package potentialusers.pronunciation
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
/**
 * Production entry point of the mother-and-baby potential-customer model: reads training
 * and prediction data from Hive, trains the pipeline, scores the prediction set and writes
 * the result back to Hive.
 *
 * Originally created by wy on 2019/9/26 9:22.
 */
object HiveModelDept19 {
  Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
  Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

  private val TrainTable = "sospdm.sosp_ml_pronunciation_pot_dept19_02_spark"
  private val PredictTable = "sospdm.sosp_ml_pronunciation_pot_dept19_04_spark"
  private val ResultTable = "sospdm.sosp_ml_pronunciation_pot_dept19_05"

  /**
   * Reads (user_info, non_features, features, label) from a Hive table, every column as a string.
   *
   * @param hc    Hive context used to run the query
   * @param table fully qualified table name
   */
  private def readSamples(hc: HiveContext, table: String): org.apache.spark.rdd.RDD[(String, String, String, String)] =
    hc.sql(
      s"""
         |select user_info
         |,non_features
         |,features
         |,label
         |from $table
       """.stripMargin
    ).rdd.map(x => (x(0).toString, x(1).toString, x(2).toString, x(3).toString))

  def main(args: Array[String]): Unit = {
    // DataFrame rows never carry a header line, so no header-related Hive CLI setting is needed
    val conf = new SparkConf()
      .setAppName("pronunciation-dept19")
    val sc = new SparkContext(conf)
    try {
      val hc = new HiveContext(sc)
      val modelProcess = new ModelProcess
      val denseVectorTrainRDD = modelProcess.hiveDataProcess(readSamples(hc, TrainTable))
      val denseVectorTestRDD = modelProcess.hiveDataProcess(readSamples(hc, PredictTable))

      // GBT training: discretize the continuous features and merge them with the discrete ones
      val (gbtFeatureTrainDF, gbtFeatureTestDF) =
        modelProcess.gbtFeatureProcess(denseVectorTrainRDD, denseVectorTestRDD, hc)
      val unionTrainDF = modelProcess.featureAssembler(gbtFeatureTrainDF)
      val unionTestDF = modelProcess.featureAssembler(gbtFeatureTestDF)

      // Up-sampling hook: positives are currently used once (no replication). To oversample,
      // union positiveDF with itself as many times as needed before adding it to the negatives.
      val positiveDF = unionTrainDF.filter("label=1")
      val negativeDF = unionTrainDF.filter("label=0")
      val upSampleDF = negativeDF.union(positiveDF)

      // pipeline training and prediction
      val pipelineModel = modelProcess.pipelineTrain(upSampleDF)
      val (_, _, lrPredictions) = modelProcess.pipelinePredict(unionTestDF, pipelineModel)

      // split userInfo into its '#'-separated parts and keep the positive-class probability
      val predictionRDD = lrPredictions
        .select("userInfo", "probability").rdd.map { row =>
          val userInfo = row(0).toString.split("#")
          // index 1 of the probability vector is the probability of label 1
          val probabilities = row.getAs[org.apache.spark.ml.linalg.Vector](1)
          (userInfo(0), userInfo(1), userInfo(2), userInfo(3), userInfo(4), userInfo(5), userInfo(6),
            probabilities(1).toString)
        }
      val predictionDF = hc.createDataFrame(predictionRDD)
        .toDF("cust_num", "dept_cd", "dept_nm", "brand_cd", "brand_nm", "l2_gds_group_cd", "l2_gds_group_desc",
          "probability")
      predictionDF.createOrReplaceTempView("table_01")

      // write the result to Hive, replacing the previous run's table
      hc.sql(s"drop table if exists $ResultTable")
      hc.sql(s"create table if not exists $ResultTable as select * from table_01")
    } finally {
      sc.stop() // always release the cluster resources, also when the job fails
    }
  }
}
复制代码
4 本地训练和预测结果评估
4.1 正负样本
训练数据采样后正负样本数:(7200,60000,67200)
测试数据正负样本数:(3014,26979,29993)
复制代码
4.2 参数调优
(特征选择个数:,numTopFeatures: Number of features that selector will select, ordered by ascending p-value. If the number of features is < numTopFeatures, then this will select all features. (default: 50, current: 80))
(LR迭代次数:,maxIter: maximum number of iterations (>= 0) (default: 100, current: 200))
(LR正则化系数:,regParam: regularization parameter (>= 0) (default: 0.0, current: 7.5E-4))
(LR分类阈值:,threshold: threshold in binary classification prediction, in range [0, 1] (default: 0.5, current: 0.3))
(L1L2正则比例:,elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0, current: 1.0))
复制代码
4.3 特征处理,取一条展示
原特征:
(0.0,[2.0,4.0,4.0,0.0,0.0,1.0,1.0,1.0,0.0,5.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,15.0]),(0.0,[2364.0,237.7,0.0,0.0,1.0,0.0,0.0,11.0,0.0,0.0,0.0]),(0.0,[2364.0,237.7,0.0,0.0,1.0,0.0,0.0,11.0,0.0,0.0,0.0])
GBDT处理后的离散特征:
+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features1 |features2 |
+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,0.0,0.0,0.0]|[0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]|
+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row
两组特征合并:
+---------------------------------------------------------------------------------------------------------------------------------+
|features |
+---------------------------------------------------------------------------------------------------------------------------------+
|(99,[0,1,2,5,7,9,13,18,25,29,37,45,54,59,71,75,88,92],[1.0,6.0,3.0,1.0,1.0,4.0,7.0,15.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+---------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row
归一化:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaledFeatures |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.5,1.0,0.75,0.0,0.0,1.0,0.0,1.0,0.0,0.6666666666666666,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row
特征选择:
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|selectedFeatures |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[1.0,0.0,0.6666666666666666,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0]|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row
复制代码
4.4 测试集预测结果和评估
正样本预测结果:
+-----------------------------------------+----------+-----+--------------------------------------------------------+
|probability |prediction|label|userInfo |
+-----------------------------------------+----------+-----+--------------------------------------------------------+
|[0.84294043150981,0.15705956849019004] |0.0 |1.0 |6149278878#00019#母婴#00019Y201#美赞臣(Mead Johnson)#R8078#奶粉|
|[0.15246247452681652,0.8475375254731835] |1.0 |1.0 |2206978961#00019#母婴#00019C216#啓蒙(ENLIGHTEN)#R3000#玩具 |
|[0.014059524393215615,0.9859404756067843]|1.0 |1.0 |7131258996#00019#母婴#000190DPK#花王(MERRIES)#R8081#纸类用品 |
|[0.1781042946756356,0.8218957053243643] |1.0 |1.0 |7005441262#00019#母婴#0001909D0#巴拉巴拉(balabala)#R8085#童装童鞋 |
|[0.8464368177543155,0.15356318224568458] |0.0 |1.0 |6218853244#00019#母婴#00019X476#a2#R8078#奶粉 |
+-----------------------------------------+----------+-----+--------------------------------------------------------+
only showing top 5 rows
负样本预测结果:
+----------------------------------------+----------+-----+-------------------------------------------------------+
|probability |prediction|label|userInfo |
+----------------------------------------+----------+-----+-------------------------------------------------------+
|[0.8302631352405664,0.16973686475943353]|0.0 |0.0 |5202829782#00019#母婴#0001988C3#纤净#R8081#纸类用品 |
|[0.7633453272066048,0.23665467279339517]|0.0 |0.0 |5205485350#00019#母婴#000198191#亨氏(Heinz)#R8080#婴儿辅食 |
|[0.8066424188930162,0.19335758110698367]|0.0 |0.0 |5201970306#00019#母婴#000190632#凤凰(PHOENIX)#R8321#童车、儿童家具|
|[0.6722813751983735,0.3277186248016265] |1.0 |0.0 |5204473229#00019#母婴#000197H04#喜朗(Sharove)#R8089#洗护用品 |
|[0.8513533389731073,0.14864666102689283]|0.0 |0.0 |5200938736#00019#母婴#000196A71#开智#R3000#玩具 |
+----------------------------------------+----------+-----+-------------------------------------------------------+
only showing top 5 rows
复制代码
====================== lrModel 评估结果 =========================
分类准确率:0.4835128196579202
加权准确率:0.8260837884956335
加权召回率:0.4835128196579202
F1值:0.5785826982044591
复制代码
5 部署生产
每日更新预测结果,存hive
hive> select * from sosp_ml_pronunciation_pot_dept19 limit 10;
0100097670 00019 母婴 0.05970516382309314 1 R8078 奶粉 000198183 惠氏(Wyeth)
0100097670 00019 母婴 0.013896893483329929 2 R8078 奶粉 00019X476 a2
0100108980 00019 母婴 0.004950226280954919 1 R8080 婴儿辅食 000198191 亨氏(Heinz)
0100108980 00019 母婴 0.003184953199377607 2 R3000 玩具 0001959T8 乐婷美欣
0100108980 00019 母婴 0.003184953199377607 3 R8320 喂养用品 000190H36 奥秀(OXO)
0100045852 00019 母婴 0.039485927929147346 1 R8081 纸类用品 000195981 好奇(Huggies)
0100045852 00019 母婴 0.029986713331871417 2 R8081 纸类用品 000199286 大王(GOO.N)
0100045852 00019 母婴 0.015276411100067186 3 R8320 喂养用品 00019T128 贝亲(PIGEON)
0100045852 00019 母婴 0.0048930788360090575 4 R8089 洗护用品 000190HZV 哈罗闪(sanosan)
0100045852 00019 母婴 0.003925170659079256 5 R8081 纸类用品 000195952 帮宝适(Pampers)
复制代码