使用Spark构建聚类模型

阅读量 145 · 阅读时长约 4 分钟

将使用一个模型(推荐模型)的输出作为另外一个模型(聚类模型)的输入
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by biyuzhe on 16-1-28.
 * Uses the output of one model (an ALS recommendation model) as the input
 * to another model (a K-means clustering model), on the MovieLens 100k data.
 *
 * NOTE(review): the original post used typographic quotes and broken comment
 * markers that do not compile; they have been repaired below.
 */
object MovieClustering {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setSparkHome(System.getenv("SPARK_HOME"))
      .setAppName("Clustering")
      .set("spark.executor.memory", "4g")
    val sc = new SparkContext(conf)
    val path = "file:/home/raini/data/ml-100k"

    /** 1. Load and prepare the movie metadata. */

    // Movie data set: movie id | title | release date | ... | 19 genre flags.
    val movies = sc.textFile(path + "/u.item")
    // println(movies.first)

    // Genre labels, one "name|index" pair per line.
    val genres = sc.textFile(path + "/u.genre")
    // genres.take(5).foreach(println)

    // Map of genre index -> genre name; the filter drops the trailing empty
    // line of u.genre. split takes a regex, so '|' must be escaped.
    val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")).map(array => (array(1), array(0))).collectAsMap
    println(genreMap)

    // Extract (movie id, (title, assigned genre names)).
    val titlesAndGenres = movies.map(_.split("\\|")).map { array =>
      // Columns 5 onward are 0/1 flags, one per genre, e.g.
      // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
      val genreFlags = array.toSeq.slice(5, array.size)
      val genresAssigned = genreFlags.zipWithIndex.filter { case (g, idx) =>
        g == "1"
      }.map { case (g, idx) =>
        genreMap(idx.toString) // flag position -> genre name
      }
      (array(0).toInt, (array(1), genresAssigned))
    }
    println(titlesAndGenres.first)
    // (1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))

    /**
     * 2. Convert the raw ratings to Rating objects and train the ALS
     *    recommendation model.
     */
    import org.apache.spark.mllib.recommendation.{ALS, Rating}

    val rawData = sc.textFile(path + "/u.data") // user id | item id | rating | timestamp
    val rawRatings = rawData.map(_.split("\t").take(3)) // ignore the timestamp
    val ratings = rawRatings.map { case Array(user, movie, rating) =>
      Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    ratings.cache
    // rank = 50, iterations = 10, lambda = 0.1.
    // The trained model exposes two pair RDDs: userFeatures / productFeatures.
    val alsModel = ALS.train(ratings, 50, 10, 0.1)

    /** 3. Extract the latent factors into Vectors — the clustering input. */
    val movieFactors = alsModel.productFeatures.map { case (id, factor) =>
      (id, Vectors.dense(factor))
    }
    val movieVectors = movieFactors.map(_._2) // training input — ideally cached

    val userFactors = alsModel.userFeatures.map { case (id, factor) =>
      (id, Vectors.dense(factor))
    }
    val userVectors = userFactors.map(_._2)

    /**
     * 4. Inspect the distribution of the factor vectors to decide whether
     *    normalization is needed.
     */
    val movieMatrix = new RowMatrix(movieVectors)
    val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()

    val userMatrix = new RowMatrix(userVectors)
    val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()

    println("movie factor mean: " + movieMatrixSummary.mean)
    println("movie factor variance: " + movieMatrixSummary.variance)
    // Fixed: the original printed "movie factor ..." labels for the user stats.
    println("user factor mean: " + userMatrixSummary.mean)
    println("user factor variance: " + userMatrixSummary.variance)
    println("----由上看出数据很好-------不需要 归一化 --------: " + userMatrixSummary.max)

    /** 5. Train the K-means clustering models. */
    // Clustering needs no LabeledPoint labels — the input is just RDD[Vector].
    val numClusters = 5
    val numIterations = 10 // maximum number of iterations (library default is 100)
    // Number of runs from different random starts, executed concurrently;
    // the best result is kept, which helps find a near-optimal model.
    val numRuns = 3
    val initializationMode = "random" // method used to initialize cluster centres

    val movieClusterModel = KMeans.train(movieVectors, numClusters, numIterations, numRuns)

    // With enough runs the model converges before the iteration cap is hit;
    // the convergence messages are hidden when the log level is WARN.
    val movieClusterModelConverged = KMeans.train(movieVectors, numClusters, numIterations, 100)

    // Train K-means on the user factor vectors as well.
    val userClusterModel = KMeans.train(userVectors, numClusters, numIterations, numRuns)

    /** 6. Make predictions with the clustering models. */
    val movie1 = movieVectors.first()
    // val movieCluster1 = movieClusterModel.clusterCenters
    val movieCluster = movieClusterModel.predict(movie1)
    val userCluster = userClusterModel.predict(userVectors.first())
    // Predict a whole RDD of samples at once.
    val prediction = movieClusterModel.predict(movieVectors)
    println(prediction.take(10).mkString(","))

    /** 7. Interpret the cluster assignments using the data set. */
    import breeze.linalg._
    import breeze.numerics.pow // unlike scala.math, operates element-wise on vectors

    // Squared Euclidean distance between two vectors.
    def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]) = pow(v1 - v2, 2).sum

    // For every movie, compute the distance from its factor vector to the
    // centre of its assigned cluster. Join against movieFactors (a key-value
    // RDD), not movieVectors, because titlesAndGenres is keyed by movie id.
    val titlesWithFactors = titlesAndGenres.join(movieFactors)

    val moviesAssigned = titlesWithFactors.map { case (id, ((title, movieGenres), vector)) =>
      val pred = movieClusterModel.predict(vector)
      val clusterCentre = movieClusterModel.clusterCenters(pred)
      val dist = computeDistance(DenseVector(clusterCentre.toArray), DenseVector(vector.toArray)) // centre -> factor vector
      (id, title, movieGenres.mkString(" "), pred, dist)
    }
    // Key: cluster id; value: the movies (with metadata) assigned to it.
    val clusterAssignments = moviesAssigned.groupBy { case (id, title, g, cluster, dist) =>
      cluster
    }.collectAsMap()

    // For each cluster, print the 20 movies closest to its centre.
    for ((k, v) <- clusterAssignments.toSeq.sortBy(_._1)) {
      println(s"Cluster $k: ")
      val m = v.toSeq.sortBy(_._5) // sort by distance to centre
      println(m.take(20).map { case (_, title, g, _, d) =>
        (title, g, d)
      }.mkString("\n")
      )
      println("====\n") // fixed: the original printed a literal "/n"
    }

    /** Sample output:
    Cluster 0:
    (Angela (1995),Drama,0.26773755873555743)
    (Johns (1996),Drama,0.34759765601432857)
    (Moonlight and Valentino (1995),Drama Romance,0.3519137808401636)
    (Outlaw, The (1943),Western,0.3538873355808669)
    (Outbreak (1995),Action Drama Thriller,0.35671637771512654)
    (Blue Chips (1994),Drama,0.3614747325659568)
    (Mr. Wonderful (1993),Comedy Romance,0.4042728699499413)
    (River Wild, The (1994),Action Thriller,0.41305919411868147)
    (Intimate Relations (1996),Comedy,0.4172526695549295)
    (Mr. Jones (1993),Drama Romance,0.4244861306662781)
    (Target (1995),Action Drama,0.47049263870626357)
    (New Jersey Drive (1995),Crime Drama,0.477793194657522)
    (Next Step, The (1995),Drama,0.49127813483365024)
    (Wedding Bell Blues (1996),Comedy,0.49127813483365024)
    (Tainted (1998),Comedy Thriller,0.49127813483365024)
    (Prefontaine (1997),Drama,0.5250048654615782)
    (Air Up There, The (1994),Comedy,0.5443137459216897)
    (City of Angels (1998),Romance,0.5476353806150974)
    (Courage Under Fire (1996),Drama War,0.5655563050851464)
    (Maverick (1994),Action Comedy Western,0.5676057384432159)
    ====
    Cluster 1:
    ...
    ====
    With richer metadata (directors, actors, ...) each cluster could be
    characterised in much more detail. */

    /** Cluster the user factor vectors the same way. */
    // user id | age | gender | occupation | zip code
    val uUser = sc.textFile(path + "/u.user")
    val users = uUser.map(_.split("\\|").take(4)).map { a =>
      (a(0).toInt, (a(1), a(2), a(3)))
    }

    val usersWithFactors = users.join(userFactors)

    val usersAssigned = usersWithFactors.map { case (id, ((age, gender, occupation), vector)) =>
      val pred = userClusterModel.predict(vector)
      val clusterCentre = userClusterModel.clusterCenters(pred)
      val dist = computeDistance(DenseVector(clusterCentre.toArray), DenseVector(vector.toArray)) // centre -> factor vector
      (id, age, gender, occupation, pred, dist)
    }
    // Key: cluster id; value: the users (with metadata) assigned to it.
    val userClusterAssignments = usersAssigned.groupBy { case (id, age, gender, occupation, cluster, dist) =>
      cluster
    }.collectAsMap()

    // For each cluster, print the 10 users closest to its centre.
    for ((k, v) <- userClusterAssignments.toSeq.sortBy(_._1)) {
      println(s"Cluster $k: ")
      // Fixed: sort by distance (field 6). The original sorted by field 5,
      // the cluster id, which is constant within a group — the sample output
      // below shows the resulting unsorted distances.
      val m = v.toSeq.sortBy(_._6)
      println(m.take(10).map { case (_, age, gender, occupation, _, d) =>
        (age, gender, occupation, d)
      }.mkString("\n")
      )
      println("====\n") // fixed: the original printed a literal "/n"
    }

    /** Sample output (from the original, unsorted run):
    Cluster 0:
    (30,M,other,2.8966362356009294)
    (35,M,marketing,0.9507855659753555)
    (54,F,administrator,2.402631176842464)
    (57,M,retired,1.0673925854872153)
    (33,M,scientist,0.9984593990975481)
    (34,M,marketing,2.6646374636252417)
    (42,M,educator,1.8950723524585869)
    (39,M,educator,2.343260221685305)
    (37,M,librarian,0.4754024132820834)
    (22,M,other,2.8209046740692343)
    ====
    Cluster 1:
    ...
    Cluster 4:
    (52,M,programmer,0.79876734137952)
    (17,F,student,1.9171896461212914)
    (45,M,other,0.3537740354864482)
    (35,F,none,1.1590329725225672)
    (58,M,executive,0.7072765980582237)
    (22,F,student,1.780722258593975)
    (31,F,educator,1.4818500200639821)
    (30,M,salesman,1.9747640725633009)
    (40,F,writer,0.7005565965074791)
    (36,F,marketing,3.876324007484136)
    ====
    */

    /**
     * 8. Evaluating clustering model performance.
     *
     * Internal evaluation: uses the same data the model was trained on.
     *   Metrics (WCSS, Davies-Bouldin index, Dunn index, silhouette
     *   coefficient) all reward clusters whose members are close together
     *   while different clusters stay far apart.
     *
     * External evaluation: uses data not seen during training.
     *   Predict clusters for data that carries class labels, then score the
     *   match between predictions and true labels much as in classification
     *   (true/false positive and negative rates):
     *   Rand measure, F-measure, Jaccard index.
     */

    /** 9. Compute WCSS on the data sets (provided by MLlib). */
    val movieCost = movieClusterModel.computeCost(movieVectors)
    val userCost = userClusterModel.computeCost(userVectors)

    println("WCSS for movies: " + movieCost)
    println("WCSS for users: " + userCost)

    /** 10. Tuning the number of clusters via cross-validation (movies). */
    val trainTestSplitMovies = movieVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainMovies = trainTestSplitMovies(0)
    val testMovies = trainTestSplitMovies(1)

    // Train on the training split, score computeCost on the held-out split.
    val costsMovies = Seq(2, 3, 4, 5, 10, 20, 30, 50).map { k =>
      (k, KMeans.train(trainMovies, k, numIterations, numRuns).computeCost(testMovies))
    }
    println("Movie-clustering-cross-validation")
    costsMovies.foreach { case (k, cost) => println(f"WCSS-for-k=$k  -> $cost%2.2f") }

    /**
     * As the number of centres grows, WCSS falls and may then rise again.
     * Under cross-validation, WCSS keeps decreasing as K grows but at some
     * point the decrease flattens abruptly — that elbow is usually the best K.
     * Too many clusters also become hard to understand and interpret.
     */

    /** 11. User clustering under cross-validation. */
    val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainUsers = trainTestSplitUsers(0)
    val testUsers = trainTestSplitUsers(1)

    // Train on the training split, score computeCost on the held-out split.
    val costsUsers = Seq(2, 3, 4, 5, 10, 20, 30, 40, 60).map { k =>
      (k, KMeans.train(trainUsers, k, numIterations, numRuns).computeCost(testUsers))
    }
    println("User-clustering-cross-validation")
    costsUsers.foreach { case (k, cost) => println(f"WCSS-for-k=$k  -> $cost%2.2f") }
  }
}