简介
Structured Streaming 是 Apache Spark 提供的一个用于流数据处理的高级抽象层,它允许您以类似于批处理的方式处理实时数据流。Kafka 是一个分布式的流数据平台,结合使用 Structured Streaming 和 Kafka 可以实现强大的实时数据处理应用。
数据准备
通过定义的schema,使用下载的Kafka软件包向Kafka topic 发送消息:
schema 定义
identity部分可以简单当作一笔交易,存储在child表,EventId作为主键,PublishedAt作为交易时间;对相同主键的数据先按主键去重,仅当新数据的PublishedAt大于等于旧数据的PublishedAt时才执行更新
index部分可以动态增减字段,作为用户账号,存储在parent表,TransactionId作为主键,每次会更新交易状态
{
"type": "record",
"name": "event",
"namespace": "com.test",
"fields": [
{
"name": "identity",
"type": {
"type": "record",
"name": "identity",
"fields": [
{
"name": "TransactionId",
"type": "string"
},
{
"name": "EventId",
"type": "string"
},
{
"name": "TradeStatus",
"type": "string"
},
{
"name": "PublishedAt",
"type": [
"null",
{
"type": "long",
"logicalType": "timestamp-micros"
}
]
}
]
}
},
{
"name": "Indexes",
"type": {
"type": "record",
"name": "Indexes",
"fields": [
{
"name": "index",
"type": {
"type": "array",
"items": [
{
"name": "index",
"type": "record",
"fields": [
{
"name": "key",
"type": "string"
},
{
"name": "value",
"type": "string"
}
]
}
]
}
}
]
}
}
]
}
sample message
{
"identity": {
"TransactionId": "trans5033367321545415837",
"EventId": "Eid8551651531071136126",
"TradeStatus": "L2 processed",
"PublishedAt": "2023-08-16T06:53:55.526074900"
},
"Indexes": {
"index": [
{
"key": "TransactionId",
"value": "trans5033367321545415837"
},
{
"key": "EntityId",
"value": "eid2935807702796558849"
},
{
"key": "ClientId",
"value": "cid-3039208996248690681"
}
]
}
}
通过Java模拟数据
public static void mockDate(String dataCount,String schemaFile,String outFile) throws Exception {
String schemaString = "{ "type": "record", "name": "event", "namespace": "com.test", "fields": [ { "name": "identity", "type": { "type": "record", "name": "identity", "fields": [ { "name": "TransactionId", "type": "string" }, { "name": "EventId", "type": "string" }, { "name": "TradeStatus", "type": "string" }, { "name": "PublishedAt", "type": [ "null", { "type": "long", "logicalType": "timestamp-micros" } ] } ] } }, { "name": "Indexes", "type": { "type": "record", "name": "Indexes", "fields": [ { "name": "index", "type": { "type": "array", "items": [ { "name": "index", "type": "record", "fields": [ { "name": "key", "type": "string" }, { "name": "value", "type": "string" } ] } ] } } ] } } ] }";
Schema schema = new Schema.Parser().parse(schemaString);
String indexSchemaString = "{ "type": "array", "items": [ { "name": "index", "type": "record", "fields": [ { "name": "key", "type": "string" }, { "name": "value", "type": "string" } ] } ] }";
Schema indexSchema = new Schema.Parser().parse(indexSchemaString);
String[] params = {schemaFile,outFile,dataCount};
// "Usage: RandomData <schemafile> <outputfile> <count>"
RandomData.main(params);
DataFileReader<GenericData.Record> records = new DataFileReader<GenericData.Record>(new File(outFile), new GenericDatumReader(schema));
ImmutableList.Builder<byte[]> dataBuilder = ImmutableList.builder();
for (GenericData.Record record : records) {
String TransactionId = "trans" + new Random().nextLong();
GenericData.Record record1 = (GenericData.Record) record.get(0);
record1.put("TransactionId",TransactionId);
record1.put("EventId","Eid"+ new Random().nextLong());
String[] status = {"L1 process","L2 processed","completed","unknown"};
record1.put("TradeStatus",status[new Random().nextInt(4)]);
record1.put("PublishedAt", LocalDateTime.now().toString());
GenericData.Record record2 = (GenericData.Record) record.get(1);
ImmutableList<Object> list = ImmutableList.of(
ImmutableMap.of("key","TransactionId","value",TransactionId),
ImmutableMap.of("key","EntityId","value","eid"+new Random().nextLong()),
ImmutableMap.of("key","ClientId","value","cid"+new Random().nextLong())
);
GenericData.Array<Map<String,String>> array = new GenericData.Array(indexSchema, list);
record2.put("index",array);
System.out.println(record);
}
}
启动Kafka客户端发送消息(以Windows为例)
启动 ZooKeeper(与 KRaft 模式二选一,若 Kafka 以 KRaft 模式运行可跳过此步)
$ bin\windows\zookeeper-server-start.bat config\zookeeper.properties
启动kafka
$ bin\windows\kafka-server-start.bat config\server.properties
创建topic
$ bin\windows\kafka-topics.bat --create --topic delta-events --bootstrap-server localhost:9092
发送上面Java代码控制台打印的消息
F:\kafka_2.13-3.5.0>bin\windows\kafka-console-producer.bat --topic delta-events --bootstrap-server localhost:9092
>{"identity": {"TransactionId": "trans5033367321545415837", "EventId": "Eid8551651531071136126", "TradeStatus": "L2 processed", "PublishedAt": "2023-08-16T06:53:55.526074900"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans5033367321545415837"}, {"key": "EntityId", "value": "eid2935807702796558849"}, {"key": "ClientId", "value": "cid-3039208996248690681"}]}}
>{"identity": {"TransactionId": "trans6804110721886204906", "EventId": "Eid7187653767342281884", "TradeStatus": "unknown", "PublishedAt": "2023-08-16T06:53:55.534081600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans6804110721886204906"}, {"key": "EntityId", "value": "eid4594699262391819122"}, {"key": "ClientId", "value": "cid3593932414708215869"}]}}
>{"identity": {"TransactionId": "trans-2613584887854207584", "EventId": "Eid-5208636462814666178", "TradeStatus": "completed", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans-2613584887854207584"}, {"key": "EntityId", "value": "eid242779643845656966"}, {"key": "ClientId", "value": "cid869591881191745565"}]}}
>{"identity": {"TransactionId": "trans8848688700188547799", "EventId": "Eid-3129431046447861530", "TradeStatus": "L1 process", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans8848688700188547799"}, {"key": "EntityId", "value": "eid7625058015262914146"}, {"key": "ClientId", "value": "cid-2936387515973894447"}]}}
>终止批处理操作吗(Y/N)? Y
查看发送的消息:
F:\kafka_2.13-3.5.0>bin\windows\kafka-console-consumer.bat --topic delta-events --from-beginning --bootstrap-server localhost:9092
{"identity": {"TransactionId": "trans3595507515135846609", "EventId": "Eid2681663058625171526", "TradeStatus": "completed", "PublishedAt": "2023-08-15T20:56:17.921655500"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid138892807697064717"}, {"key": "TransactionId", "value": "trans3595507515135846609"}, {"key": "EntityId", "value": "eid8169218527383631675"}, {"key": "ClientId", "value": "cid-4543090596963850881"}]}}
{"identity": {"TransactionId": "trans-1403368725302516575", "EventId": "Eid7428793617049238901", "TradeStatus": "unknown", "PublishedAt": "2023-08-15T20:56:17.930663800"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid3050798983810286279"}, {"key": "TransactionId", "value": "trans-1403368725302516575"}, {"key": "EntityId", "value": "eid-3165139663653732655"}, {"key": "ClientId", "value": "cid6568395019926951896"}]}}
{"identity": {"TransactionId": "trans2195587482335207576", "EventId": "Eid3741948813890065845", "TradeStatus": "unknown", "PublishedAt": "2023-08-15T21:57:27.491588200"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid-8354175370853567603"}, {"key": "TransactionId", "value": "trans2195587482335207576"}, {"key": "EntityId", "value": "eid-1010666287888311125"}, {"key": "ClientId", "value": "cid-7846753222644159503"}]}}
{"identity": {"TransactionId": "trans-307570308772678238", "EventId": "Eid-6145832259220094899", "TradeStatus": "L1 process", "PublishedAt": "2023-08-15T21:57:27.500595900"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid6912324314002883649"}, {"key": "TransactionId", "value": "trans-307570308772678238"}, {"key": "EntityId", "value": "eid-7035027898361688692"}, {"key": "ClientId", "value": "cid-275892622329168857"}]}}
{"identity": {"TransactionId": "trans5033367321545415837", "EventId": "Eid8551651531071136126", "TradeStatus": "L2 processed", "PublishedAt": "2023-08-16T06:53:55.526074900"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans5033367321545415837"}, {"key": "EntityId", "value": "eid2935807702796558849"}, {"key": "ClientId", "value": "cid-3039208996248690681"}]}}
{"identity": {"TransactionId": "trans6804110721886204906", "EventId": "Eid7187653767342281884", "TradeStatus": "unknown", "PublishedAt": "2023-08-16T06:53:55.534081600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans6804110721886204906"}, {"key": "EntityId", "value": "eid4594699262391819122"}, {"key": "ClientId", "value": "cid3593932414708215869"}]}}
{"identity": {"TransactionId": "trans-2613584887854207584", "EventId": "Eid-5208636462814666178", "TradeStatus": "completed", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans-2613584887854207584"}, {"key": "EntityId", "value": "eid242779643845656966"}, {"key": "ClientId", "value": "cid869591881191745565"}]}}
{"identity": {"TransactionId": "trans8848688700188547799", "EventId": "Eid-3129431046447861530", "TradeStatus": "L1 process", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans8848688700188547799"}, {"key": "EntityId", "value": "eid7625058015262914146"}, {"key": "ClientId", "value": "cid-2936387515973894447"}]}}

使用SparkSession建表
// Create the two Delta tables up front so the streaming MERGE has a target.
// Fix: the original locations mixed "\Users\..." backslashes with "/" — not a
// valid path on any platform; use consistent forward-slash absolute paths.
spark.sql("create or replace table child(PublishedAt timestamp,ConsumerDateTime timestamp,ProducerDateTime timestamp,TradeStatus string,TransactionId string,EventId string) using Delta location '/Users/aaron/Desktop/deltaWriter/test/child'")
// Fix: use "create or replace" for parent too, so both statements are
// consistent and the script is safely re-runnable.
spark.sql("create or replace table parent(PublishedAt timestamp,ConsumerDateTime timestamp,ProducerDateTime timestamp,TradeStatus string,ClientId string,TransactionId string,EntityId string) using Delta location '/Users/aaron/Desktop/deltaWriter/test/parent'")
在 Spark 中使用 Structured Streaming 进行 Kafka 集成
导入必要的库:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>spark-test</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Fix: all Spark artifacts must share ONE version. The original mixed
             3.4.1 (kafka, avro connectors) with 3.3.2 (core, sql), which causes
             binary-incompatibility errors at runtime. delta-core 2.2.0 is built
             for Spark 3.3.x, so 3.3.2 is the consistent choice. -->
        <spark.version>3.3.2</spark.version>
        <!-- Fix: the artifacts use the _2.12 suffix, so the Scala version must
             be a 2.12.x release (the original said 3.3.2, which is Scala 3). -->
        <scala.version>2.12.15</scala.version>
    </properties>
    <repositories>
        <repository>
            <id>central</id>
            <url>https://maven.aliyun.com/repository/central</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Delta Lake 2.2.0 targets Spark 3.3.x. -->
        <dependency>
            <groupId>io.delta</groupId>
            <artifactId>delta-core_2.12</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-avro_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark 3.3.x ships Avro 1.11.0; use the same version here so two
             incompatible Avro jars don't end up on the classpath (was 1.8.2). -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.11.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- Compiler plugin to set the appropriate source and target Java versions -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Add the Scala Maven Plugin -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.5.6</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
创建一个 SparkSession,用于操作 Spark 和 Spark SQL:
// Local SparkSession with Delta Lake SQL support enabled.
val spark = SparkSession.builder()
// Delta's SQL extensions (MERGE INTO, etc.).
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
// Route spark_catalog calls through Delta's catalog implementation.
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
// Adaptive query execution: runtime re-optimization of shuffle plans.
.config("spark.sql.adaptive.enabled","true")
// Raise the field-count limit for debug string output of wide schemas.
.config("spark.sql.debug.maxToStringFields","123")
// Duplicate map keys keep the LAST entry — needed because sample messages
// can carry the same index key (e.g. ClientId) more than once.
.config("spark.sql.mapKeyDedupPolicy","LAST_WIN")
.master("local[*]").getOrCreate()
数据读取
// readOptions: Kafka source configuration for spark.readStream.
val readConfig = new java.util.HashMap[String,String]()
readConfig.put("kafka.bootstrap.servers",servers)
readConfig.put("subscribe",topic)
// Read the topic from the beginning on first start (before a checkpoint exists).
readConfig.put("startingOffsets", "earliest")
// Reader implementation that builds a streaming DataFrame from a Kafka topic.
class KafkaReader extends Reader {
// Returns the raw Kafka stream (key/value/topic/partition/offset/timestamp columns).
override def read(spark: SparkSession, config: util.Map[String, String]): DataFrame = {
spark.readStream.format("kafka").options(config).load()
}
}
数据写入
// writer options:
// Fix: use a triple-quoted string so the embedded JSON quotes need no escaping —
// the original plain "..." literal with raw quotes does not compile.
val schemaString = """{ "type": "record", "name": "event", "namespace": "com.test", "fields": [ { "name": "identity", "type": { "type": "record", "name": "identity", "fields": [ { "name": "TransactionId", "type": "string" }, { "name": "EventId", "type": "string" }, { "name": "TradeStatus", "type": "string" }, { "name": "PublishedAt", "type": [ "null", { "type": "long", "logicalType": "timestamp-micros" } ] } ] } }, { "name": "Indexes", "type": { "type": "record", "name": "Indexes", "fields": [ { "name": "index", "type": { "type": "array", "items": [ { "name": "index", "type": "record", "fields": [ { "name": "key", "type": "string" }, { "name": "value", "type": "string" } ] } ] } } ] } } ] }"""
val writeConfig = new java.util.HashMap[String,String]()
writeConfig.put("schemaString",schemaString)
// Fix: forward slashes — "\U", "\D", "\t" etc. in plain double-quoted Scala
// strings are (at best) illegal escape sequences, not path separators.
writeConfig.put("checkPointLocation","C:/Users/xx/Desktop/deltaWriter/test/_checkpoint/")
writeConfig.put("childPath","C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersxxDesktopdeltaWriter/test/child")
writeConfig.put("parentPath","C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersxxDesktopdeltaWriter/test/parent")
// Index keys to lift out of the map column into top-level parent-table columns.
writeConfig.put("cols","ClientId,TransactionId,EntityId")
// Base class for streaming writers: parses the Avro schema from the config,
// then wires the stream to foreachBatch; subclasses implement batchWriter.
abstract class StreamWriter extends Writer {
// Spark StructType derived from the Avro "schemaString" config entry.
var schema: StructType = _
// Raw writer configuration, kept for use inside batchWriter.
var configs: util.Map[String, String] = _
// Starts the streaming query and blocks until it terminates.
override def write(df:DataFrame, config: util.Map[String, String]): Unit = {
val schemaString = config.get("schemaString")
if(StringUtils.isNotBlank(schemaString)) {
schema = parse(schemaString)
}
configs = config
df.writeStream
// NOTE(review): when foreachBatch is set, the sink format is not used for
// output — all writes happen inside batchWriter. Kept for config symmetry.
.format(config.getOrDefault("target","delta"))
.foreachBatch(batchWriter _)
// Checkpoint directory enables restart/recovery of the streaming query.
.option("checkpointLocation",config.get("checkPointLocation"))
.start()
// Blocks the calling thread until the query stops or fails.
.awaitTermination()
}
// Per-micro-batch hook; default implementation is a no-op.
def batchWriter(df:DataFrame,batchId:Long):Unit={
}
// Converts an Avro schema JSON string into a Spark SQL StructType.
def parse(ss:String):StructType={
SchemaConverters.toSqlType(new Schema.Parser().parse(ss)).dataType.asInstanceOf[StructType]
}
}
// Writes each micro-batch into two Delta tables (parent and child) via MERGE upserts.
class DeltaWriter extends StreamWriter {
// getOrCreate() returns the session already built by the driver if one exists;
// the configs below only take effect when no session exists yet in this JVM.
val spark = SparkSession.builder()
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
.master("local[*]").getOrCreate()
// Entry point per micro-batch: parse the Kafka value bytes as JSON into a
// struct column using the Avro-derived schema, then fan out to both tables.
override def batchWriter(df: DataFrame, batchId: Long): Unit = {
val rawDf = df.selectExpr("cast(value as string) as msg","timestamp as ProducerDateTime")
.withColumn("object",from_json(col("msg"), schema))
// NOTE(review): count() triggers an extra evaluation of the batch just for logging.
println("data count is "+ rawDf.count())
processParent(rawDf)
processChild(rawDf)
}
// Upserts the parent (account/index) table, keyed by TransactionId.
def processParent(df:DataFrame):Unit={
println(" start processParent 1....")
// index is an array of {key,value} entries; map_from_entries turns it into a
// map. Duplicate keys are resolved by spark.sql.mapKeyDedupPolicy (LAST_WIN).
var parent = df.selectExpr("map_from_entries(object.Indexes.index) as maps","ProducerDateTime","object.identity.TradeStatus","object.identity.PublishedAt")
// parent.write.save(configs.get("parentPath"))
// Promote each configured index key ("cols") to its own top-level column.
configs.get("cols").split(",").foreach(colName=>{
parent = parent.withColumn(colName,col("maps").getItem(colName))
})
parent = parent.drop("maps").withColumn("ConsumerDateTime",current_timestamp())
// In-batch dedup: max over the struct compares fields in declared order, so
// PublishedAt (the first field) decides which row wins per TransactionId.
parent = parent.selectExpr("TransactionId","struct(PublishedAt,ConsumerDateTime,ProducerDateTime,TradeStatus,ClientId,TransactionId,EntityId) as struct")
.groupBy("TransactionId")
.agg(functions.max("struct").alias("structMaxBy"))
.selectExpr("structMaxBy.*")
println(" start processParent 2....")
// parent.show()
// parent.write.mode("append").save(configs.get("parentPath"))
// Update only when the incoming row is at least as new as the stored one.
merge(parent,configs.get("parentPath"),"source.TransactionId = target.TransactionId","source.PublishedAt >= target.PublishedAt")
}
// Upserts the child (transaction/event) table, keyed by EventId.
def processChild(df:DataFrame):Unit={
println(" start processChild 1....")
var child = df.selectExpr("object.identity.*","ProducerDateTime")
.withColumn("ConsumerDateTime",current_timestamp())
// child.write.save(configs.get("parentPath"))
// Same struct-max trick as processParent: keep the latest PublishedAt per EventId.
child = child.selectExpr("EventId","struct(PublishedAt,ConsumerDateTime,ProducerDateTime,TradeStatus,TransactionId,EventId) as struct")
.groupBy("EventId")
.agg(functions.max("struct").alias("structMaxBy"))
.selectExpr("structMaxBy.*")
println(" start processChild 2....")
// child.show()
// child.write.mode("append").save(configs.get("childPath"))
merge(child,configs.get("childPath"),"source.EventId = target.EventId","source.PublishedAt >= target.PublishedAt")
}
// Generic Delta MERGE: update matched rows only when matchCondition holds
// (source at least as new), insert rows that have no match in the target.
def merge(df:DataFrame,targetLocation:String,mergeCondition:String,matchCondition:String):Unit={
DeltaTable
.forPath(spark,targetLocation)
.alias("target")
.merge(df.as("source"),mergeCondition)
.whenMatched(matchCondition)
.updateAll()
.whenNotMatched()
.insertAll()
.execute()
}
}
数据验证
// Verification: read both Delta tables back and display their contents.
val spark = SparkSession.builder()
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .master("local[*]").getOrCreate()
spark.sparkContext.setLogLevel("WARN")
// Fix: forward slashes — backslash sequences like "\U" are illegal escapes in
// plain double-quoted Scala string literals.
spark.read.load("C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersaaronDesktopdeltaWriter/test/child").show()
// Fix: the original loaded the child table twice; verify the parent table too.
spark.read.load("C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersaaronDesktopdeltaWriter/test/parent").show()
小结
以上是一个简单的示例,实现了数据插入和更新,演示了如何使用 Structured Streaming 进行 Kafka 集成。实际应用中,可以根据需求进行更复杂的数据处理和输出操作,还可以处理更多的配置和设置,例如检查点、容错性等