1. Spark Streaming
Spark Streaming is an extension of the Spark Core API for large-scale, high-throughput, fault-tolerant processing of real-time data streams. "Real-time" here means near real-time: the smallest batch interval it supports is on the order of seconds.
1.1 How Spark Streaming works
Spark Streaming receives a live input data stream and splits it into batches, for example packaging each second of collected data into one batch. Each batch is handed to the Spark compute engine for processing, and the result is again a data stream made up of batch after batch. In other words, Spark Streaming processes data one small batch at a time.
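As a minimal sketch of this idea (the class name is illustrative, not part of the original example), the Duration passed when creating the StreamingContext is what defines how the input stream is cut into batches:
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class BatchIntervalSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("BatchIntervalSketch");
        // The Duration passed here is the batch interval: every 1 second of received
        // data is packaged into one batch (one RDD) and handed to the Spark engine.
        JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));
    }
}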
2. Spark Streaming WordCount example
Java pom dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.13</artifactId>
    <version>3.3.1</version>
</dependency>
package com.strivelearn.sparkstreaming;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

/**
 * @author strivelearn
 * @version StreamWordCount.java, 2023-01-29
 */
public class StreamWordCount {
    public static void main(String[] args) throws InterruptedException {
        // local[*] leaves threads free for processing in addition to the socket receiver's thread
        SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("StreamWordCount");
        // Create the StreamingContext with a 3-second batch interval
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(3));
        // Read the live data stream from a socket
        JavaReceiverInputDStream<String> linesRdd = javaStreamingContext.socketTextStream("192.168.234.100", 9001);
        // Split each line into words
        JavaDStream<String> wordsRdd = linesRdd.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                System.out.println("Received data: " + s);
                String[] split = s.split("\\s+");
                return Arrays.asList(split).iterator();
            }
        });
        // Map each word to a Tuple2 of (word, 1)
        JavaPairDStream<String, Integer> pairRdd = wordsRdd.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // Sum the counts for each word with reduceByKey
        JavaPairDStream<String, Integer> wordCountRdd = pairRdd.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
        wordCountRdd.print();
        // Start the job
        javaStreamingContext.start();
        // Block until the job is stopped
        javaStreamingContext.awaitTermination();
    }
}
-
Start a socket listener on the Linux virtual machine
nc -l 9001
-
Type a few words into the nc session; the word counts for each batch then appear in the IDEA output console.
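For example, typing hello world hello into the nc session should produce batch output roughly like the following (an illustrative sketch of DStream.print() output, not captured from the original run; the timestamp will differ):
-------------------------------------------
Time: 1674979200000 ms
-------------------------------------------
(hello,2)
(world,1)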
3. Integrating Spark Streaming with Kafka
Java pom dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.13</artifactId>
    <version>3.3.1</version>
</dependency>
package com.strivelearn.sparkstreaming;

import java.util.ArrayList;
import java.util.HashMap;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;

/**
 * @author strivelearn
 * @version StreamKafka.java, 2023-01-29
 */
public class StreamKafka {
    public static void main(String[] args) throws InterruptedException {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("StreamKafka");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(3));
        // Kafka consumer configuration
        HashMap<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "192.168.234.100:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class.getName());
        kafkaParams.put("value.deserializer", StringDeserializer.class.getName());
        kafkaParams.put("group.id", "con_2");
        // Start from the latest offsets when this consumer group has no committed offset
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", true);
        // Kafka topics to subscribe to
        ArrayList<String> topics = new ArrayList<>();
        topics.add("sparkstreamdemo");
        // Create a direct (receiver-less) stream against the Kafka 0.10+ consumer API
        JavaInputDStream<ConsumerRecord<String, String>> directStream = KafkaUtils.createDirectStream(
                javaStreamingContext,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
        // Map each record to a (key, value) tuple and print each batch
        directStream.map(new Function<ConsumerRecord<String, String>, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> call(ConsumerRecord<String, String> record) throws Exception {
                return new Tuple2<>(record.key(), record.value());
            }
        }).print();
        javaStreamingContext.start();
        javaStreamingContext.awaitTermination();
    }
}
-
Start Kafka
bin/kafka-server-start.sh -daemon config/server.properties
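-
Optional check (not part of the original steps): since the broker runs as a JVM process, jps should now list a Kafka process
jps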
-
Create the topic
bin/kafka-topics.sh --bootstrap-server localhost:9092 --create --topic sparkstreamdemo --partitions 1
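-
Optional check (not part of the original steps): describe the topic to confirm it was created
bin/kafka-topics.sh --bootstrap-server localhost:9092 --describe --topic sparkstreamdemo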
-
Produce data with the console producer
bin/kafka-console-producer.sh --bootstrap-server localhost:9092 --topic sparkstreamdemo
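-
With the StreamKafka job running, typing a line such as hello spark into the producer console should make the job print that record as a (key, value) tuple for the batch, roughly like the following (illustrative output, not captured from the original run; the console producer sends a null key by default, hence the null):
-------------------------------------------
Time: 1674979203000 ms
-------------------------------------------
(null,hello spark)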