简介
Structured Streaming 是 Apache Spark 提供的一个用于流数据处理的高级抽象层,它允许您以类似于批处理的方式处理实时数据流。Kafka 是一个分布式的流数据平台,结合使用 Structured Streaming 和 Kafka 可以实现强大的实时数据处理应用。
数据准备
通过定义的schema,使用下载的Kafka软件包向Kafka topic 发送消息:
schema 定义
identity部分可以简单当作一笔交易,存储在child表,EventId作为主键,PublishedAt作为交易时间;对相同主键的数据先按主键去重,仅当新数据的PublishedAt大于等于旧数据的PublishedAt时才执行更新
index部分可以动态增减字段,作为用户账号,存储在parent表,TransactionId作为主键,每次会更新交易状态
{
"type": "record",
"name": "event",
"namespace": "com.test",
"fields": [
{
"name": "identity",
"type": {
"type": "record",
"name": "identity",
"fields": [
{
"name": "TransactionId",
"type": "string"
},
{
"name": "EventId",
"type": "string"
},
{
"name": "TradeStatus",
"type": "string"
},
{
"name": "PublishedAt",
"type": [
"null",
{
"type": "long",
"logicalType": "timestamp-micros"
}
]
}
]
}
},
{
"name": "Indexes",
"type": {
"type": "record",
"name": "Indexes",
"fields": [
{
"name": "index",
"type": {
"type": "array",
"items": [
{
"name": "index",
"type": "record",
"fields": [
{
"name": "key",
"type": "string"
},
{
"name": "value",
"type": "string"
}
]
}
]
}
}
]
}
}
]
}
sample message
{
"identity": {
"TransactionId": "trans5033367321545415837",
"EventId": "Eid8551651531071136126",
"TradeStatus": "L2 processed",
"PublishedAt": "2023-08-16T06:53:55.526074900"
},
"Indexes": {
"index": [
{
"key": "TransactionId",
"value": "trans5033367321545415837"
},
{
"key": "EntityId",
"value": "eid2935807702796558849"
},
{
"key": "ClientId",
"value": "cid-3039208996248690681"
}
]
}
}
通过Java模拟数据
public static void mockDate(String dataCount,String schemaFile,String outFile) throws Exception {
String schemaString = "{ "type": "record", "name": "event", "namespace": "com.test", "fields": [ { "name": "identity", "type": { "type": "record", "name": "identity", "fields": [ { "name": "TransactionId", "type": "string" }, { "name": "EventId", "type": "string" }, { "name": "TradeStatus", "type": "string" }, { "name": "PublishedAt", "type": [ "null", { "type": "long", "logicalType": "timestamp-micros" } ] } ] } }, { "name": "Indexes", "type": { "type": "record", "name": "Indexes", "fields": [ { "name": "index", "type": { "type": "array", "items": [ { "name": "index", "type": "record", "fields": [ { "name": "key", "type": "string" }, { "name": "value", "type": "string" } ] } ] } } ] } } ] }";
Schema schema = new Schema.Parser().parse(schemaString);
String indexSchemaString = "{ "type": "array", "items": [ { "name": "index", "type": "record", "fields": [ { "name": "key", "type": "string" }, { "name": "value", "type": "string" } ] } ] }";
Schema indexSchema = new Schema.Parser().parse(indexSchemaString);
String[] params = {schemaFile,outFile,dataCount};
// "Usage: RandomData <schemafile> <outputfile> <count>"
RandomData.main(params);
DataFileReader<GenericData.Record> records = new DataFileReader<GenericData.Record>(new File(outFile), new GenericDatumReader(schema));
ImmutableList.Builder<byte[]> dataBuilder = ImmutableList.builder();
for (GenericData.Record record : records) {
String TransactionId = "trans" + new Random().nextLong();
GenericData.Record record1 = (GenericData.Record) record.get(0);
record1.put("TransactionId",TransactionId);
record1.put("EventId","Eid"+ new Random().nextLong());
String[] status = {"L1 process","L2 processed","completed","unknown"};
record1.put("TradeStatus",status[new Random().nextInt(4)]);
record1.put("PublishedAt", LocalDateTime.now().toString());
GenericData.Record record2 = (GenericData.Record) record.get(1);
ImmutableList<Object> list = ImmutableList.of(
ImmutableMap.of("key","TransactionId","value",TransactionId),
ImmutableMap.of("key","EntityId","value","eid"+new Random().nextLong()),
ImmutableMap.of("key","ClientId","value","cid"+new Random().nextLong())
);
GenericData.Array<Map<String,String>> array = new GenericData.Array(indexSchema, list);
record2.put("index",array);
System.out.println(record);
}
}
启动Kafka客户端发送消息(以Windows为例)
启动 ZooKeeper(与 KRaft 模式二选一,若 Kafka 以 KRaft 模式运行可跳过此步)
$ bin\windows\zookeeper-server-start.bat config\zookeeper.properties
启动kafka
$ bin\windows\kafka-server-start.bat config\server.properties
创建topic
$ bin\windows\kafka-topics.bat --create --topic delta-events --bootstrap-server localhost:9092
发送上面Java代码控制台打印的消息
F:\kafka_2.13-3.5.0>bin\windows\kafka-console-producer.bat --topic delta-events --bootstrap-server localhost:9092
>{"identity": {"TransactionId": "trans5033367321545415837", "EventId": "Eid8551651531071136126", "TradeStatus": "L2 processed", "PublishedAt": "2023-08-16T06:53:55.526074900"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans5033367321545415837"}, {"key": "EntityId", "value": "eid2935807702796558849"}, {"key": "ClientId", "value": "cid-3039208996248690681"}]}}
>{"identity": {"TransactionId": "trans6804110721886204906", "EventId": "Eid7187653767342281884", "TradeStatus": "unknown", "PublishedAt": "2023-08-16T06:53:55.534081600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans6804110721886204906"}, {"key": "EntityId", "value": "eid4594699262391819122"}, {"key": "ClientId", "value": "cid3593932414708215869"}]}}
>{"identity": {"TransactionId": "trans-2613584887854207584", "EventId": "Eid-5208636462814666178", "TradeStatus": "completed", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans-2613584887854207584"}, {"key": "EntityId", "value": "eid242779643845656966"}, {"key": "ClientId", "value": "cid869591881191745565"}]}}
>{"identity": {"TransactionId": "trans8848688700188547799", "EventId": "Eid-3129431046447861530", "TradeStatus": "L1 process", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans8848688700188547799"}, {"key": "EntityId", "value": "eid7625058015262914146"}, {"key": "ClientId", "value": "cid-2936387515973894447"}]}}
>终止批处理操作吗(Y/N)? Y
查看发送的消息:
F:\kafka_2.13-3.5.0>bin\windows\kafka-console-consumer.bat --topic delta-events --from-beginning --bootstrap-server localhost:9092
{"identity": {"TransactionId": "trans3595507515135846609", "EventId": "Eid2681663058625171526", "TradeStatus": "completed", "PublishedAt": "2023-08-15T20:56:17.921655500"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid138892807697064717"}, {"key": "TransactionId", "value": "trans3595507515135846609"}, {"key": "EntityId", "value": "eid8169218527383631675"}, {"key": "ClientId", "value": "cid-4543090596963850881"}]}}
{"identity": {"TransactionId": "trans-1403368725302516575", "EventId": "Eid7428793617049238901", "TradeStatus": "unknown", "PublishedAt": "2023-08-15T20:56:17.930663800"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid3050798983810286279"}, {"key": "TransactionId", "value": "trans-1403368725302516575"}, {"key": "EntityId", "value": "eid-3165139663653732655"}, {"key": "ClientId", "value": "cid6568395019926951896"}]}}
{"identity": {"TransactionId": "trans2195587482335207576", "EventId": "Eid3741948813890065845", "TradeStatus": "unknown", "PublishedAt": "2023-08-15T21:57:27.491588200"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid-8354175370853567603"}, {"key": "TransactionId", "value": "trans2195587482335207576"}, {"key": "EntityId", "value": "eid-1010666287888311125"}, {"key": "ClientId", "value": "cid-7846753222644159503"}]}}
{"identity": {"TransactionId": "trans-307570308772678238", "EventId": "Eid-6145832259220094899", "TradeStatus": "L1 process", "PublishedAt": "2023-08-15T21:57:27.500595900"}, "Indexes": {"index": [{"key": "ClientId", "value": "cid6912324314002883649"}, {"key": "TransactionId", "value": "trans-307570308772678238"}, {"key": "EntityId", "value": "eid-7035027898361688692"}, {"key": "ClientId", "value": "cid-275892622329168857"}]}}
{"identity": {"TransactionId": "trans5033367321545415837", "EventId": "Eid8551651531071136126", "TradeStatus": "L2 processed", "PublishedAt": "2023-08-16T06:53:55.526074900"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans5033367321545415837"}, {"key": "EntityId", "value": "eid2935807702796558849"}, {"key": "ClientId", "value": "cid-3039208996248690681"}]}}
{"identity": {"TransactionId": "trans6804110721886204906", "EventId": "Eid7187653767342281884", "TradeStatus": "unknown", "PublishedAt": "2023-08-16T06:53:55.534081600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans6804110721886204906"}, {"key": "EntityId", "value": "eid4594699262391819122"}, {"key": "ClientId", "value": "cid3593932414708215869"}]}}
{"identity": {"TransactionId": "trans-2613584887854207584", "EventId": "Eid-5208636462814666178", "TradeStatus": "completed", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans-2613584887854207584"}, {"key": "EntityId", "value": "eid242779643845656966"}, {"key": "ClientId", "value": "cid869591881191745565"}]}}
{"identity": {"TransactionId": "trans8848688700188547799", "EventId": "Eid-3129431046447861530", "TradeStatus": "L1 process", "PublishedAt": "2023-08-16T06:53:55.535082600"}, "Indexes": {"index": [{"key": "TransactionId", "value": "trans8848688700188547799"}, {"key": "EntityId", "value": "eid7625058015262914146"}, {"key": "ClientId", "value": "cid-2936387515973894447"}]}}

使用SparkSession建表
// Create the two Delta tables up front so the streaming MERGE has a target.
// Fix: the original locations mixed "\Users\..." backslashes with "/" — not a
// valid path on any platform; use consistent forward-slash absolute paths.
spark.sql("create or replace table child(PublishedAt timestamp,ConsumerDateTime timestamp,ProducerDateTime timestamp,TradeStatus string,TransactionId string,EventId string) using Delta location '/Users/aaron/Desktop/deltaWriter/test/child'")
// Fix: use "create or replace" for parent too, so both statements are
// consistent and the script is safely re-runnable.
spark.sql("create or replace table parent(PublishedAt timestamp,ConsumerDateTime timestamp,ProducerDateTime timestamp,TradeStatus string,ClientId string,TransactionId string,EntityId string) using Delta location '/Users/aaron/Desktop/deltaWriter/test/parent'")
在 Spark 中使用 Structured Streaming 进行 Kafka 集成
导入必要的库:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>spark-test</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Fix: all Spark artifacts must share ONE version. The original mixed
             3.4.1 (kafka, avro connectors) with 3.3.2 (core, sql), which causes
             binary-incompatibility errors at runtime. delta-core 2.2.0 is built
             for Spark 3.3.x, so 3.3.2 is the consistent choice. -->
        <spark.version>3.3.2</spark.version>
        <!-- Fix: the artifacts use the _2.12 suffix, so the Scala version must
             be a 2.12.x release (the original said 3.3.2, which is Scala 3). -->
        <scala.version>2.12.15</scala.version>
    </properties>
    <repositories>
        <repository>
            <id>central</id>
            <url>https://maven.aliyun.com/repository/central</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Delta Lake 2.2.0 targets Spark 3.3.x. -->
        <dependency>
            <groupId>io.delta</groupId>
            <artifactId>delta-core_2.12</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-avro_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark 3.3.x ships Avro 1.11.0; use the same version here so two
             incompatible Avro jars don't end up on the classpath (was 1.8.2). -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.11.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- Compiler plugin to set the appropriate source and target Java versions -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Add the Scala Maven Plugin -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.5.6</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
创建一个 SparkSession,用于操作 Spark 和 Spark SQL:
// Local SparkSession with Delta Lake SQL support enabled.
val spark = SparkSession.builder()
// Delta's SQL extensions (MERGE INTO, etc.).
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
// Route spark_catalog calls through Delta's catalog implementation.
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
// Adaptive query execution: runtime re-optimization of shuffle plans.
.config("spark.sql.adaptive.enabled","true")
// Raise the field-count limit for debug string output of wide schemas.
.config("spark.sql.debug.maxToStringFields","123")
// Duplicate map keys keep the LAST entry — needed because sample messages
// can carry the same index key (e.g. ClientId) more than once.
.config("spark.sql.mapKeyDedupPolicy","LAST_WIN")
.master("local[*]").getOrCreate()
数据读取
// readOptions: Kafka source configuration for spark.readStream.
val readConfig = new java.util.HashMap[String,String]()
readConfig.put("kafka.bootstrap.servers",servers)
readConfig.put("subscribe",topic)
// Read the topic from the beginning on first start (before a checkpoint exists).
readConfig.put("startingOffsets", "earliest")
// Reader implementation that builds a streaming DataFrame from a Kafka topic.
class KafkaReader extends Reader {
// Returns the raw Kafka stream (key/value/topic/partition/offset/timestamp columns).
override def read(spark: SparkSession, config: util.Map[String, String]): DataFrame = {
spark.readStream.format("kafka").options(config).load()
}
}
数据写入
// writer options:
// Fix: use a triple-quoted string so the embedded JSON quotes need no escaping —
// the original plain "..." literal with raw quotes does not compile.
val schemaString = """{ "type": "record", "name": "event", "namespace": "com.test", "fields": [ { "name": "identity", "type": { "type": "record", "name": "identity", "fields": [ { "name": "TransactionId", "type": "string" }, { "name": "EventId", "type": "string" }, { "name": "TradeStatus", "type": "string" }, { "name": "PublishedAt", "type": [ "null", { "type": "long", "logicalType": "timestamp-micros" } ] } ] } }, { "name": "Indexes", "type": { "type": "record", "name": "Indexes", "fields": [ { "name": "index", "type": { "type": "array", "items": [ { "name": "index", "type": "record", "fields": [ { "name": "key", "type": "string" }, { "name": "value", "type": "string" } ] } ] } } ] } } ] }"""
val writeConfig = new java.util.HashMap[String,String]()
writeConfig.put("schemaString",schemaString)
// Fix: forward slashes — "\U", "\D", "\t" etc. in plain double-quoted Scala
// strings are (at best) illegal escape sequences, not path separators.
writeConfig.put("checkPointLocation","C:/Users/xx/Desktop/deltaWriter/test/_checkpoint/")
writeConfig.put("childPath","C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersxxDesktopdeltaWriter/test/child")
writeConfig.put("parentPath","C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersxxDesktopdeltaWriter/test/parent")
// Index keys to lift out of the map column into top-level parent-table columns.
writeConfig.put("cols","ClientId,TransactionId,EntityId")
// Base class for streaming writers: parses the Avro schema from the config,
// then wires the stream to foreachBatch; subclasses implement batchWriter.
abstract class StreamWriter extends Writer {
// Spark StructType derived from the Avro "schemaString" config entry.
var schema: StructType = _
// Raw writer configuration, kept for use inside batchWriter.
var configs: util.Map[String, String] = _
// Starts the streaming query and blocks until it terminates.
override def write(df:DataFrame, config: util.Map[String, String]): Unit = {
val schemaString = config.get("schemaString")
if(StringUtils.isNotBlank(schemaString)) {
schema = parse(schemaString)
}
configs = config
df.writeStream
// NOTE(review): when foreachBatch is set, the sink format is not used for
// output — all writes happen inside batchWriter. Kept for config symmetry.
.format(config.getOrDefault("target","delta"))
.foreachBatch(batchWriter _)
// Checkpoint directory enables restart/recovery of the streaming query.
.option("checkpointLocation",config.get("checkPointLocation"))
.start()
// Blocks the calling thread until the query stops or fails.
.awaitTermination()
}
// Per-micro-batch hook; default implementation is a no-op.
def batchWriter(df:DataFrame,batchId:Long):Unit={
}
// Converts an Avro schema JSON string into a Spark SQL StructType.
def parse(ss:String):StructType={
SchemaConverters.toSqlType(new Schema.Parser().parse(ss)).dataType.asInstanceOf[StructType]
}
}
// Writes each micro-batch into two Delta tables (parent and child) via MERGE upserts.
class DeltaWriter extends StreamWriter {
// getOrCreate() returns the session already built by the driver if one exists;
// the configs below only take effect when no session exists yet in this JVM.
val spark = SparkSession.builder()
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
.master("local[*]").getOrCreate()
// Entry point per micro-batch: parse the Kafka value bytes as JSON into a
// struct column using the Avro-derived schema, then fan out to both tables.
override def batchWriter(df: DataFrame, batchId: Long): Unit = {
val rawDf = df.selectExpr("cast(value as string) as msg","timestamp as ProducerDateTime")
.withColumn("object",from_json(col("msg"), schema))
// NOTE(review): count() triggers an extra evaluation of the batch just for logging.
println("data count is "+ rawDf.count())
processParent(rawDf)
processChild(rawDf)
}
// Upserts the parent (account/index) table, keyed by TransactionId.
def processParent(df:DataFrame):Unit={
println(" start processParent 1....")
// index is an array of {key,value} entries; map_from_entries turns it into a
// map. Duplicate keys are resolved by spark.sql.mapKeyDedupPolicy (LAST_WIN).
var parent = df.selectExpr("map_from_entries(object.Indexes.index) as maps","ProducerDateTime","object.identity.TradeStatus","object.identity.PublishedAt")
// parent.write.save(configs.get("parentPath"))
// Promote each configured index key ("cols") to its own top-level column.
configs.get("cols").split(",").foreach(colName=>{
parent = parent.withColumn(colName,col("maps").getItem(colName))
})
parent = parent.drop("maps").withColumn("ConsumerDateTime",current_timestamp())
// In-batch dedup: max over the struct compares fields in declared order, so
// PublishedAt (the first field) decides which row wins per TransactionId.
parent = parent.selectExpr("TransactionId","struct(PublishedAt,ConsumerDateTime,ProducerDateTime,TradeStatus,ClientId,TransactionId,EntityId) as struct")
.groupBy("TransactionId")
.agg(functions.max("struct").alias("structMaxBy"))
.selectExpr("structMaxBy.*")
println(" start processParent 2....")
// parent.show()
// parent.write.mode("append").save(configs.get("parentPath"))
// Update only when the incoming row is at least as new as the stored one.
merge(parent,configs.get("parentPath"),"source.TransactionId = target.TransactionId","source.PublishedAt >= target.PublishedAt")
}
// Upserts the child (transaction/event) table, keyed by EventId.
def processChild(df:DataFrame):Unit={
println(" start processChild 1....")
var child = df.selectExpr("object.identity.*","ProducerDateTime")
.withColumn("ConsumerDateTime",current_timestamp())
// child.write.save(configs.get("parentPath"))
// Same struct-max trick as processParent: keep the latest PublishedAt per EventId.
child = child.selectExpr("EventId","struct(PublishedAt,ConsumerDateTime,ProducerDateTime,TradeStatus,TransactionId,EventId) as struct")
.groupBy("EventId")
.agg(functions.max("struct").alias("structMaxBy"))
.selectExpr("structMaxBy.*")
println(" start processChild 2....")
// child.show()
// child.write.mode("append").save(configs.get("childPath"))
merge(child,configs.get("childPath"),"source.EventId = target.EventId","source.PublishedAt >= target.PublishedAt")
}
// Generic Delta MERGE: update matched rows only when matchCondition holds
// (source at least as new), insert rows that have no match in the target.
def merge(df:DataFrame,targetLocation:String,mergeCondition:String,matchCondition:String):Unit={
DeltaTable
.forPath(spark,targetLocation)
.alias("target")
.merge(df.as("source"),mergeCondition)
.whenMatched(matchCondition)
.updateAll()
.whenNotMatched()
.insertAll()
.execute()
}
}
数据验证
// Verification: read both Delta tables back and display their contents.
val spark = SparkSession.builder()
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .master("local[*]").getOrCreate()
spark.sparkContext.setLogLevel("WARN")
// Fix: forward slashes — backslash sequences like "\U" are illegal escapes in
// plain double-quoted Scala string literals.
spark.read.load("C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersaaronDesktopdeltaWriter/test/child").show()
// Fix: the original loaded the child table twice; verify the parent table too.
spark.read.load("C:/Users/xx/Downloads/tushare-sdk-master/work-conclusion/spark-warehouse/UsersaaronDesktopdeltaWriter/test/parent").show()
小结
以上是一个简单的示例,实现了数据插入和更新,演示了如何使用 Structured Streaming 进行 Kafka 集成。实际应用中,可以根据需求进行更复杂的数据处理和输出操作,还可以处理更多的配置和设置,例如检查点、容错性等