1 Introduction to Confluent Kafka
There are currently three mainstream Kafka distributions:
- Apache Kafka: open source with a fast release cadence; it ships with relatively few higher-level features, so more in-house development is needed, but everything stays under your control;
- CDP/HDP Kafka: Apache Kafka bundled by Cloudera/Hortonworks into a larger big-data platform; versions generally lag behind upstream, and the internals are a black box you cannot control;
- Confluent Kafka: built by Confluent on top of Apache Kafka with many higher-level features, such as a wide range of Connectors, a REST API, Schema Registry, and high availability; it is updated frequently and comes in a free community edition and a paid edition, the latter adding features such as Control Center.
2 Installing Confluent Kafka
Software compatibility and hardware requirements for each release are described in the official docs: docs.confluent.io/platform/7.…
There are two ways to install:
- zip and tar archives
- docker
For these tests we use the community-edition tar archive directly.
2.1 Download the Confluent Community components tar
curl -O https://packages.confluent.io/archive/7.4/confluent-community-7.4.1.tar.gz
Extract it:
tar -zxvf confluent-community-7.4.1.tar.gz
2.2 Start the services
cd confluent-7.4.1/
Start a local ZooKeeper instance on port 2181:
bin/zookeeper-server-start etc/kafka/zookeeper.properties
Start a single-node Kafka broker on port 9092:
bin/kafka-server-start etc/kafka/server.properties
Start the Schema Registry service on port 8081:
bin/schema-registry-start etc/schema-registry/schema-registry.properties
Start the REST Proxy service on port 8082:
bin/kafka-rest-start etc/kafka-rest/kafka-rest.properties
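Before moving on, it is worth checking that all four services are actually listening. A minimal sketch using only the Python standard library (the ports are the defaults configured above):
import socket

# service name -> default port started above
services = {"zookeeper": 2181, "kafka": 9092, "schema-registry": 8081, "kafka-rest": 8082}
for name, port in services.items():
    with socket.socket() as s:
        s.settimeout(2)
        ok = s.connect_ex(("localhost", port)) == 0  # 0 means the connection was accepted
        print(name, port, "up" if ok else "down")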
3 Feature tests (REST, Schema Registry, and Connectors)
Although the REST Proxy is already running and curl could be used to issue the HTTP requests, passing parameters is easier with Python's requests module, so the examples below use it to call the Kafka REST service. The official API reference is at docs.confluent.io/platform/7.…
The API comes in v2 and v3 versions; we use v3 throughout. All examples share these imports and the common request header:
import json
import requests

header = {"Content-Type": "application/json"}
3.1 REST feature tests
3.1.1 Get cluster information
def test_clusters():
    host = "http://localhost:8082/v3/clusters"
    r = requests.get(host, headers=header)
    print("=================")
    for data in json.loads(r.content.decode("utf-8"))["data"]:
        print(data['metadata']['self'])
Result:
http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ
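The id at the end of this URL is the cluster id that every v3 call below embeds in its path. Instead of copying it by hand, a small helper (a sketch; the function name is ours) can look it up, since each cluster entry in the v3 response carries a cluster_id field:
def get_cluster_id():
    # a single-broker REST Proxy exposes exactly one cluster in "data"
    r = requests.get("http://localhost:8082/v3/clusters", headers=header)
    return json.loads(r.content.decode("utf-8"))["data"][0]["cluster_id"]
The remaining examples keep the literal id from the output above for readability.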
3.1.2 List topics
def test_list_topics():
    host = "http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics"
    r = requests.get(host, headers=header)
    print("=================")
    for data in json.loads(r.content.decode("utf-8"))["data"]:
        print(data['metadata']['self'])
Result:
http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics/_schema_encoders
http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics/_schemas
http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics/wenkang_test_schema
3.1.3 Create a topic
def test_add_topic():
    host = "http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics"
    data = {"topic_name": "wenkang_test_schema_v3"}
    r = requests.post(host, data=json.dumps(data), headers=header)
    print("=================")
    print(r.content.decode("utf-8"))
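The create-topic body can also carry partition and replication settings; the v3 API documents partitions_count and replication_factor fields for this (worth verifying against your version's reference). A sketch with a hypothetical topic name:
def test_add_topic_with_config():
    host = "http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics"
    data = {"topic_name": "wenkang_test_partitions",  # hypothetical name
            "partitions_count": 3,
            "replication_factor": 1}  # single-node cluster, so replication cannot exceed 1
    r = requests.post(host, data=json.dumps(data), headers=header)
    print(r.content.decode("utf-8"))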
3.2 Schema Registry tests
schema_header = {"Content-Type": "application/vnd.schemaregistry.v1+json"}
3.2.1 List all subjects
A subject refers to the name under which the schema is registered. If you are using Schema Registry for Kafka, then a subject refers to either a “<topic>-value” or “<topic>-key”, depending on whether you are registering the value schema for that topic or the key schema.
def test_list_schema_subjects():
    host = "http://localhost:8081/subjects"
    r = requests.get(host, headers=schema_header)
    print("=================")
    print(r.content.decode('utf-8'))
Result:
["wenkang_test_schema","wenkang_test_schema-key","wenkang_test_schema-value"]
3.2.2 Register a new schema
def test_registry_new_schema():
    host = "http://localhost:8081/subjects/wenkang_test_schema_v3/versions"
    data = {"schema": json.dumps({
        "name": "User",
        "type": "record",
        "namespace": "io.confluent.examples.clients.basicavro",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "favorite_number", "type": "long"},
            {"name": "favorite_color", "type": "string"}
        ]})
    }
    data_str = json.dumps(data)
    r = requests.post(host, data=data_str, headers=schema_header)
    print("=================")
    print(r.content.decode('utf-8'))
Result: {"id":2}
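Before registering a changed version of a schema, Schema Registry can check it against the latest registered one via POST /compatibility/subjects/{subject}/versions/latest. A sketch; the candidate schema below (which drops favorite_color) is just an illustration:
def test_check_compatibility():
    host = "http://localhost:8081/compatibility/subjects/wenkang_test_schema_v3/versions/latest"
    candidate = {"schema": json.dumps({
        "name": "User",
        "type": "record",
        "namespace": "io.confluent.examples.clients.basicavro",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "favorite_number", "type": "long"}
        ]})}
    r = requests.post(host, data=json.dumps(candidate), headers=schema_header)
    # expect something like {"is_compatible":true} under the default BACKWARD level
    print(r.content.decode('utf-8'))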
3.2.3 Fetch the latest schema
def test_get_subject():
    host = "http://localhost:8081/subjects/wenkang_test_schema_v3/versions/latest"
    r = requests.get(host, headers=schema_header)
    print("=================")
    print(r.content.decode('utf-8'))
Result:
{"subject":"wenkang_test_schema_v3","version":1,"id":2,"schema":"{"type":"record","name":"User","namespace":"io.confluent.examples.clients.basicavro","fields":[{"name":"name","type":"string"},{"name":"favorite_number","type":"long"},{"name":"favorite_color","type":"string"}]}"}
3.2.4 Register the schema for topic wenkang_test_schema_v3
def test_register_schema_for_topic_v3():
    """
    The Kafka serializers and deserializers default to using <topicName>-key and <topicName>-value
    as the corresponding subject names when registering or retrieving a schema.
    """
    # fetch the latest schema registered under the plain subject
    host = "http://localhost:8081/subjects/wenkang_test_schema_v3/versions/latest"
    r = requests.get(host, headers=schema_header)
    v3_schema = json.loads(r.content.decode('utf-8'))["schema"]
    data = {"schema": v3_schema}
    data_str = json.dumps(data)
    # register it as both the topic's key schema and value schema
    key_host = "http://localhost:8081/subjects/wenkang_test_schema_v3-key/versions"
    r = requests.post(key_host, data=data_str, headers=schema_header)
    print("=================")
    print(r.content.decode('utf-8'))
    value_host = "http://localhost:8081/subjects/wenkang_test_schema_v3-value/versions"
    r = requests.post(value_host, data=data_str, headers=schema_header)
    print("=================")
    print(r.content.decode('utf-8'))
返回结果 {"id":2} {"id":2}
Running test_list_schema_subjects again now returns:
["wenkang_test_schema","wenkang_test_schema-key","wenkang_test_schema-value","wenkang_test_schema_v3","wenkang_test_schema_v3-key","wenkang_test_schema_v3-value"]
3.2.5 Produce a message through the REST Proxy
def test_produce_avro_topic_record():
    host = "http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics/wenkang_test_schema_v3/records"
    records = {"value": {"data": {"name": "xkk", "favorite_number": 8, "favorite_color": "green", "age": 4}}}
    records_str = json.dumps(records)
    r = requests.post(host, headers=header, data=records_str)
    print("=================")
    print(r.content.decode('utf-8'))
返回结果 {"error_code":200,"cluster_id":"gEulWkK9QcqtO7CGHwP0wQ","topic_name":"wenkang_test_schema_v3","partition_id":0,"offset":1,"timestamp":"2023-08-22T05:59:29.500Z","value":{"type":"AVRO","subject":"wenkang_test_schema_v3-value","schema_id":2,"schema_version":1,"size":16}}
Now test a record whose field names do not match the schema:
def test_produce_avro_topic_bad_record():
    host = "http://localhost:8082/v3/clusters/gEulWkK9QcqtO7CGHwP0wQ/topics/wenkang_test_schema_v3/records"
    records = {"value": {"data": {"name": "xkk", "favorite_number": 8, "favorite_color11": "green", "age11": 4}}}
    records_str = json.dumps(records)
    r = requests.post(host, headers=header, data=records_str)
    print("=================")
    print(r.content.decode('utf-8'))
Result:
{"error_code":400,"message":"Bad Request: Expected field name not found: favorite_color"}
3.2.6 Consume the data with AvroConsumer
from confluent_kafka.avro import AvroConsumer

if __name__ == '__main__':
    consumer_conf = {'bootstrap.servers': "localhost:9092",
                     'group.id': 'xkk1',
                     'enable.auto.commit': False,
                     'auto.offset.reset': "earliest",
                     'schema.registry.url': 'http://localhost:8081'}
    topic = "wenkang_test_schema_v3"
    consumer = AvroConsumer(consumer_conf)
    consumer.subscribe([topic])
    try:
        while True:
            try:
                msg = consumer.poll(timeout=1.0)
                if msg is None:
                    continue
                if msg.error():
                    print('Error occurred: {}'.format(msg.error()))
                    continue
                print(msg.value())
            except Exception as e:
                print(e)
    except KeyboardInterrupt:
        pass
    consumer.close()
Result:
{'name': 'xkk', 'favorite_number': 8, 'favorite_color': 'green'}
Note that there is no age field here: the record was serialized against the registered value schema, which does not define age, so the extra field was dropped at produce time.
If we use a plain (non-Avro) consumer instead, the value cannot be decoded correctly:
from kafka import KafkaConsumer

if __name__ == '__main__':
    consumer = KafkaConsumer(
        "wenkang_test_schema_v3",
        bootstrap_servers=["localhost:9092"],
        auto_offset_reset="earliest"
    )
    for msg in consumer:
        print(msg.value)
Result: b'\x00\x00\x00\x00\x02\x06xkk\x10\ngreen'
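Those leading bytes are the Confluent wire format: one magic byte (0) followed by the schema's global id as a 4-byte big-endian integer (here 2, matching the registration above), then the Avro-encoded body. A minimal sketch of peeling the header off with the standard library:
import struct

raw = b'\x00\x00\x00\x00\x02\x06xkk\x10\ngreen'   # value bytes from the plain consumer
magic, schema_id = struct.unpack('>bI', raw[:5])  # 1-byte magic + 4-byte big-endian schema id
print(magic, schema_id)                           # 0 2
payload = raw[5:]  # Avro binary body; decode it with the schema fetched from /schemas/ids/2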
3.3 Test the FileStream sink connector
cd confluent-7.4.1/
Edit the standalone worker config so the FileStream connector jars are on the plugin path:
vi etc/schema-registry/connect-avro-standalone.properties
plugin.path=share/java,{your path}/confluent-7.4.1/share/filestream-connectors
Then edit the sink connector config, setting the output file and the topic to drain:
vi etc/kafka/connect-file-sink.properties
file={your path}/test.sink.txt
topics={your topic}
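For reference, the stock connect-file-sink.properties shipped with the distribution looks roughly like this (exact contents may vary slightly by version):
name=local-file-sink
connector.class=FileStreamSink
tasks.max=1
file=test.sink.txt
topics=connect-test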
Then start the standalone Connect worker:
bin/connect-standalone etc/schema-registry/connect-avro-standalone.properties etc/kafka/connect-file-sink.properties
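Once the worker is running, records from the configured topic should be appended to test.sink.txt, one per line; with the Avro worker config above the values are deserialized into Connect structs, so expect lines like Struct{name=xkk,favorite_number=8,favorite_color=green} rather than raw JSON.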