package classification
import org.apache.spark.{SparkConf, SparkContext}
object Classifier {
def main(args: Array[String]): Unit = {
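// Configure Spark: local mode with two worker threads and 4 GB of executor memory.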
val conf = new SparkConf()
.setMaster("local[2]")
.setSparkHome(System.getenv("SPARK_HOME"))
.setAppName("Web Page Click Classification")
.set("spark.executor.memory", "4g")
val sc = new SparkContext(conf)
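// Load the raw training data (tab-separated, no header) and split each line into fields.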
val rawData = sc.textFile("file:///home/raini/data/train_noheader.tsv")
val records = rawData.map(line => line.split("\t"))
records.first()
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
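// Parse each record into a LabeledPoint: the label is the last column, the features
// are columns 4 onward, and missing values ("?") become 0.0.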
val data = records.map { r =>
val trimmed = r.map(_.replaceAll("\"",""))
val label = trimmed(r.size - 1).toInt
val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
LabeledPoint(label, Vectors.dense(features))
}
data.cache
val numData = data.count
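// Naive Bayes requires non-negative features, so additionally clamp negative values to zero.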
val nbData = records.map { r =>
val trimmed = r.map(_.replaceAll("\"",""))
val label = trimmed(r.size - 1).toInt
val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble).map(d => if (d < 0) 0.0 else d)
LabeledPoint(label, Vectors.dense(features))
}
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy
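// Train four classifiers on the same data: logistic regression, linear SVM, naive Bayes, and a decision tree.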
val numIteration = 10
val maxTreeDepth = 5
val lrModel = LogisticRegressionWithSGD.train(data, numIteration)
val svmModel = SVMWithSGD.train(data, numIteration)
val nbModel = NaiveBayes.train(nbData)
val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)
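// Sanity check: predict the first data point and compare against its true label.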
val dataPoint = data.first
val prediction = lrModel.predict(dataPoint.features)
val trueLabel = dataPoint.label
val predictions = lrModel.predict(data.map(lp => lp.features))
predictions.take(5)
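// Training-set accuracy: the fraction of points each model classifies correctly.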
val lrTotalCorrect = data.map { point =>
if (lrModel.predict(point.features)== point.label) 1 else 0
}.sum
val lrAccuracy = lrTotalCorrect / numData
val svmTotalCorrect = data.map { point =>
if (svmModel.predict(point.features) == point.label) 1 else 0
}.sum
val svmAccuracy = svmTotalCorrect / numData
val nbTotalCorrect = nbData.map { point =>
if (nbModel.predict(point.features) == point.label) 1 else 0
}.sum
val nbAccuracy = nbTotalCorrect / numData
val dtTotalCorrect = data.map { point =>
val score = dtModel.predict(point.features)
val predicted = if (score > 0.5) 1 else 0
if (predicted == point.label) 1 else 0
}.sum
val dtAccuracy = dtTotalCorrect / numData
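// Evaluate each model by area under the precision-recall (PR) and ROC curves. Naive Bayes
// and decision tree scores are thresholded at 0.5 to yield binary predictions.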
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
val metrics = Seq(lrModel, svmModel).map{ model =>
val scoreAndLabels = data.map{ point =>
(model.predict(point.features), point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(model.getClass.getSimpleName, metrics.areaUnderPR(), metrics.areaUnderROC())
}
val nbMetrics = Seq(nbModel).map { model =>
val scoreAndLabels = nbData.map { point =>
val score = model.predict(point.features)
(if (score > 0.5) 1.0 else 0.0, point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(model.getClass.getSimpleName, metrics.areaUnderPR(), metrics.areaUnderROC())
}
val dtMetrics = Seq(dtModel).map{ model =>
val scoreAndLabels = data.map{ point =>
val score = model.predict(point.features)
(if (score > 0.5) 1.0 else 0.0, point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(model.getClass.getSimpleName, metrics.areaUnderPR(), metrics.areaUnderROC())
}
val allMetrics = metrics ++ nbMetrics ++ dtMetrics
allMetrics.foreach{ case (m, pr, roc) =>
println(f"$m, Area under PR: ${pr * 100.0}%2.4f%%, Area under ROC: ${roc * 100.0}%2.4f%%")
}
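// Inspect column statistics of the feature matrix; large differences in scale motivate standardization.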
import org.apache.spark.mllib.linalg.distributed.RowMatrix
val vectors = data.map(lp => lp.features)
val matrix = new RowMatrix(vectors)
val matrixSummary = matrix.computeColumnSummaryStatistics()
println(matrixSummary.mean)
println(matrixSummary.max)
println(matrixSummary.variance)
println(matrixSummary.numNonzeros)
println(matrixSummary.normL2)
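// Standardize features to zero mean and unit variance; SGD-based models are sensitive to feature scaling.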
import org.apache.spark.mllib.feature.StandardScaler
val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)
val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))
println(data.first.features)
println(scaledData.first.features)
println((data.first.features(0) - matrixSummary.mean(0)) / math.sqrt(matrixSummary.variance(0)))
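// Retrain logistic regression on the standardized features and re-evaluate.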
val lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIteration)
val lrTotalCorrectScaled = scaledData.map{ point =>
if (lrModelScaled.predict(point.features) == point.label) 1 else 0
}.sum
val lrAccuracyScaled = lrTotalCorrectScaled / numData
val lrPredictionsVsTrue = scaledData.map{ point =>
(lrModelScaled.predict(point.features), point.label)
}
val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)
val lrPr = lrMetricsScaled.areaUnderPR()
val lrRoc = lrMetricsScaled.areaUnderROC()
println(f"${lrModelScaled.getClass.getSimpleName}\nAccuracy: " +
f"${lrAccuracyScaled * 100}%2.4f%%\nArea under PR: ${lrPr * 100}%2.4f%%\nArea under ROC: ${lrRoc * 100}%2.4f%%")
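// Add the category column (column 3) as a 1-of-k (one-hot) encoded feature alongside the numeric features.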
val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap
val numCategories = categories.size
val dataCategories = records.map{ r =>
val trimmed = r.map{ _.replaceAll("\"", "")}
val label = trimmed(r.size - 1).toInt
val categoryIdx = categories(r(3))
val categoryFeatures = Array.ofDim[Double](numCategories)
categoryFeatures(categoryIdx) = 1.0
val otherFeatures = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
val features = categoryFeatures ++ otherFeatures
LabeledPoint(label, Vectors.dense(features))
}
println(dataCategories.first)
val scalerCats = new StandardScaler(withMean = true, withStd = true).fit(dataCategories.map(lp => lp.features))
val scaledDataCats = dataCategories.map(lp =>
LabeledPoint(lp.label, scalerCats.transform(lp.features)))
println(dataCategories.first.features)
println(scaledDataCats.first.features)
val lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIteration)
val lrTotalCorrectScaledCats = scaledDataCats.map { point =>
if (lrModelScaledCats.predict(point.features) == point.label) 1 else 0
}.sum
val lrAccuracyScaledCats = lrTotalCorrectScaledCats / numData
val lrPredictionsVsTrueCats = scaledDataCats.map { point =>
(lrModelScaledCats.predict(point.features), point.label)
}
val lrMetricsScaledCats = new BinaryClassificationMetrics(lrPredictionsVsTrueCats)
val lrPrCats = lrMetricsScaledCats.areaUnderPR()
val lrRocCats = lrMetricsScaledCats.areaUnderROC()
println(f"${lrModelScaledCats.getClass.getSimpleName}\n" +
f"Accuracy: ${lrAccuracyScaledCats * 100}%2.4f%%\n" +
f"Area under PR: ${lrPrCats * 100}%2.4f%%\n" +
f"Area under ROC: ${lrRocCats * 100}%2.4f%%")
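// For naive Bayes, build a dataset from the 1-of-k category feature alone, which is naturally non-negative.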
val dataNB = records.map { r =>
val trimmed = r.map( _.replaceAll("\"",""))
val label = trimmed(r.size - 1).toInt
val categoryIdx = categories(r(3))
val categoryFeatures = Array.ofDim[Double](numCategories)
categoryFeatures(categoryIdx) = 1.0
LabeledPoint(label, Vectors.dense(categoryFeatures))
}
val nbModelCats = NaiveBayes.train(dataNB)
val nbTotalCorrectCats = dataNB.map { point =>
if (nbModelCats.predict(point.features) == point.label) 1 else 0
}.sum
val nbAccuracyCats = nbTotalCorrectCats / numData
val nbPredictionVsTrueCats = dataNB.map{ point =>
(nbModelCats.predict(point.features), point.label)
}
val nbMetricsCats = new BinaryClassificationMetrics(nbPredictionVsTrueCats)
val nbPrCats = nbMetricsCats.areaUnderPR()
val nbRocCats = nbMetricsCats.areaUnderROC()
println(f"${nbModelCats.getClass.getSimpleName}\n" +
f"Accuracy: ${nbAccuracyCats * 100}%2.4f%%\n" +
f"Area under PR: ${nbPrCats * 100}%2.4f%%\n" +
f"Area under ROC: ${nbRocCats * 100}%2.4f%%")
import org.apache.spark.mllib.optimization.{Updater,SimpleUpdater,L1Updater,SquaredL2Updater}
import org.apache.spark.mllib.classification.ClassificationModel
import org.apache.spark.rdd.RDD
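// Helper: train logistic regression with a given regularization parameter, iteration count, updater, and step size.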
def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int, updater: Updater, stepSize: Double) = {
val lr = new LogisticRegressionWithSGD()
lr.optimizer
.setNumIterations(numIterations)
.setUpdater(updater)
.setRegParam(regParam)
.setStepSize(stepSize)
lr.run(input)
}
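// Helper: compute the AUC (area under the ROC curve) for a model on the given data.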
def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
val scoreAndLabels = data.map { point =>
(model.predict(point.features),point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(label, metrics.areaUnderROC())
}
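// Cache the dataset: it is reused across all of the parameter sweeps below.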
scaledDataCats.cache()
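// Sweep the number of iterations.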
val iterResults = Seq(1, 5, 10, 50).map { param =>
val model = trainWithParams(scaledDataCats, 0.0, param, new SimpleUpdater, 1.0)
createMetrics(s"$param iterations", scaledDataCats, model)
}
iterResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
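// Sweep the SGD step size.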
val stepResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
val model = trainWithParams(scaledDataCats, 0.0, numIteration, new SimpleUpdater, param)
createMetrics(s"$param step size", scaledDataCats, model)
}
stepResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
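// Sweep the L2 regularization parameter.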
val regResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map{ param =>
val model = trainWithParams(scaledDataCats, param, numIteration, new SquaredL2Updater, 1.0)
createMetrics(s"$param L2 regularization parameter", scaledDataCats, model)
}
regResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
import org.apache.spark.mllib.tree.impurity.{Impurity, Gini}
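// Helper: train a decision tree with a given maximum depth and impurity measure.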
def trainDTWithParams(input: RDD[LabeledPoint], maxDepth: Int, impurity: Impurity) = {
DecisionTree.train(input, Algo.Classification, impurity, maxDepth)
}
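// Sweep tree depth using entropy impurity.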
val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20).map{ param =>
val model = trainDTWithParams(data, param, Entropy)
val scoreAndLabels = data.map{ point =>
val score = model.predict(point.features)
(if (score > 0.5) 1.0 else 0.0, point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(s"$param tree depth", metrics.areaUnderROC())
}
dtResultsEntropy.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
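// Sweep tree depth using Gini impurity.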
val dtResultsGini = Seq(1, 2, 3, 4, 5, 10, 20, 30).map{ param =>
val model = trainDTWithParams(data, param, Gini)
val scoreAndLabels = data.map{ point =>
val score = model.predict(point.features)
(if (score > 0.5) 1.0 else 0.0, point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(s"$param tree depth", metrics.areaUnderROC())
}
dtResultsGini.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
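// Helper: train naive Bayes with a given additive-smoothing parameter (lambda).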
def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double) = {
val nb = new NaiveBayes()
nb.setLambda(lambda)
nb.run(input)
}
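// Sweep the naive Bayes lambda (smoothing) parameter.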
val nbResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
val model = trainNBWithParams(dataNB, param)
val scoreAndLabels = dataNB.map{ point =>
(model.predict(point.features), point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(s"$param lambda", metrics.areaUnderROC())
}
nbResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
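// Split the data 60/40 into training and test sets (seed 123) for held-out evaluation.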
val trainTestSplit = scaledDataCats.randomSplit(Array(0.6, 0.4), 123)
val train = trainTestSplit(0)
val test = trainTestSplit(1)
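// Evaluate the regularization sweep on the held-out test set.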
val regResultsTest = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
val model = trainWithParams(train, param, numIteration, new SquaredL2Updater, 1.0)
createMetrics(s"$param L2 regularization parameter", test, model)
}
regResultsTest.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }
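// Repeat the sweep, evaluating on the training set, to gauge overfitting.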
val regResultsTrain = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
val model = trainWithParams(train, param, numIteration, new SquaredL2Updater, 1.0)
createMetrics(s"$param L2 regularization parameter", train, model)
}
regResultsTrain.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }
// Shut down the SparkContext cleanly.
sc.stop()
}
}