Learning notes on big data with Spark: the WordCount example.
WordCount dataflow
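Sketched on a made-up two-line input (the real input file is read in step 3), the dataflow looks like this:

// lines:                  "hello spark", "hello scala"
// flatMap(_.split(" ")) -> hello, spark, hello, scala
// map((_, 1))           -> (hello,1), (spark,1), (hello,1), (scala,1)
// reduceByKey(_ + _)    -> (hello,2), (spark,1), (scala,1)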
1. Start the Spark service
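Note: the examples below run with master local[*], so a running cluster is not strictly required. If you do want a standalone cluster, Spark ships start scripts, e.g. $SPARK_HOME/sbin/start-all.sh on the master node.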
2. Create the project and add the Maven dependencies (the _2.12 suffix on the Spark artifacts must match the Scala 2.12.x version declared in the properties)
<properties>
<encoding>UTF-8</encoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.12.11</scala.version>
<spark.version>3.0.1</spark.version>
<hadoop.version>2.7.5</hadoop.version>
</properties>
<dependencies>
<!-- Scala language dependency -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Spark Core dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark Streaming dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark Streaming + Kafka dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark SQL dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark SQL + Hive dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive-thriftserver_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Structured Streaming + Kafka dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Spark MLlib machine-learning module (includes the ALS recommendation algorithm) -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.7</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.2</version>
<scope>provided</scope>
</dependency>
</dependencies>
3. Write the code in Scala and Java (for this WordCount only scala-library and spark-core are strictly required; the remaining dependencies are not used here)
Scala version
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // 1. env: set up the SparkContext (the Spark execution environment)
    val conf = new SparkConf().setAppName("wc").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // 2. source: read the data
    // RDD (Resilient Distributed Dataset): a distributed collection that is
    // almost as easy to use as an ordinary Scala collection.
    // Here each element of the RDD is one line of the input file.
    val lines: RDD[String] = sc.textFile("data/input/wordcount.txt")
    // 3. transformation: split lines into words, pair each word with 1,
    // then sum the 1s per word
    val words = lines.flatMap(_.split(" "))
    val counts = words.map((_, 1)).reduceByKey(_ + _)
    // 4. sink: print and save the result
    counts.foreach(println)
    println(counts.collect().toBuffer)
    // Write to a single output file (repartition(1) collapses the result
    // into one partition; skip this step on large data sets)
    counts.repartition(1).saveAsTextFile("data/output/result")
    sc.stop()
  }
}
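If you don't have the input file yet, the same pipeline can be verified on an in-memory collection via sc.parallelize. A minimal sketch (the sample lines are made up):

import org.apache.spark.{SparkConf, SparkContext}

object WordCountInMemory {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wc-inmemory").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // Made-up sample data instead of reading a file
    val lines = sc.parallelize(Seq("hello spark", "hello scala"))
    val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    // Expected: (spark,1), (scala,1), (hello,2), in some order
    counts.collect().foreach(println)
    sc.stop()
  }
}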
Java version
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class WordCountJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordcount").setMaster("local[*]");
        // Use the Java-friendly JavaSparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.setLogLevel("WARN");
        // Load the data into a distributed dataset; each element is one line
        JavaRDD<String> lines = sc.textFile("data/input/wordcount.txt");
        // Split each line into words
        JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // Pair each word with the count 1
        JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
        // Sum the counts per word
        JavaPairRDD<String, Integer> counts = pairs.reduceByKey((x, y) -> x + y);
        // Write the result to the output directory
        counts.saveAsTextFile("data/output/result2");
        sc.stop();
    }
}
4. Check the output
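Both programs write Hadoop-style part files (part-00000, ...) plus a _SUCCESS marker under the output directory, with one (word,count) tuple per line. Note that saveAsTextFile fails if the output directory already exists, so delete data/output/result and data/output/result2 between runs.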
Done!