I. Missed consumption and duplicate consumption
Whether offsets are committed synchronously or asynchronously, both missed consumption and duplicate consumption are possible. Committing the offset first and consuming afterwards can cause messages to be missed; consuming first and committing the offset afterwards can cause messages to be consumed twice.
Solutions to duplicate consumption:
(1) Deduplicate downstream. The drawback: Kafka already uses idempotence and transactions to guarantee that messages are not duplicated on the broker side, so deduplicating again downstream wastes the work Kafka has done.
(2) Make consuming the data and committing the offset a single atomic operation: either both succeed or both fail. The precondition for this atomic binding is that the commit must not go to the Kafka broker, because a commit sent to the broker cannot be bound atomically to the consume step. To achieve atomic binding we therefore have to store the offset ourselves instead of letting Kafka store it for us (see the sketch below).
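As an illustration of option (2), here is a minimal sketch that writes the consumed records and the corresponding offsets into the same relational database and commits them in one JDBC transaction. Everything database-related (the MySQL URL, credentials, and the message_store / consumer_offsets tables) is an assumption made up for this sketch, not part of the original example; it also assumes the MySQL JDBC driver is on the classpath.
package com.hike.consumer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Collections;
import java.util.Properties;
public class TransactionalOffsetConsumer {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "hadoop101:9092");   // assumed broker address
        props.put("group.id", "atomic-demo");                // assumed group id
        props.put("enable.auto.commit", "false");            // offsets are managed by hand
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(props);
        consumer.subscribe(Collections.singleton("hello"));
        // both tables live in the same database, so one transaction covers data and offsets
        Connection conn = DriverManager.getConnection("jdbc:mysql://hadoop101:3306/demo", "root", "root");
        conn.setAutoCommit(false);
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(2000);
            try {
                for (ConsumerRecord<String, String> record : records) {
                    // 1. write the business data
                    PreparedStatement data = conn.prepareStatement(
                            "INSERT INTO message_store(topic, pt, off, val) VALUES (?,?,?,?)");
                    data.setString(1, record.topic());
                    data.setInt(2, record.partition());
                    data.setLong(3, record.offset());
                    data.setString(4, record.value());
                    data.executeUpdate();
                    // 2. store the next offset to consume in the same transaction
                    PreparedStatement off = conn.prepareStatement(
                            "REPLACE INTO consumer_offsets(topic, pt, next_off) VALUES (?,?,?)");
                    off.setString(1, record.topic());
                    off.setInt(2, record.partition());
                    off.setLong(3, record.offset() + 1);
                    off.executeUpdate();
                }
                conn.commit();     // data and offsets become visible together
            } catch (Exception e) {
                conn.rollback();   // neither the data nor the offsets are persisted
            }
        }
    }
}
On a restart the consumer would additionally have to read consumer_offsets back and seek() to the stored positions, which is exactly the job of the ConsumerRebalanceListener in the example below.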
Saving offsets manually
package com.hike.consumer;
import org.apache.kafka.clients.consumer.ConsumerRebalanceListener;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import java.io.*;
import java.util.*;
/**
 * Save offsets in custom storage
 */
public class ConsumerManual {
    //in-memory cache of the offset reached in each partition
    private static Map<TopicPartition, Long> offset = new HashMap<TopicPartition, Long>();
    //file used to persist the offset map
    private static String file = "d:/offset";
    public static void main(String[] args) throws IOException, InterruptedException {
        //1 create a consumer
        Properties properties = new Properties();
        properties.load(ConsumerManual.class.getClassLoader().getResourceAsStream("consumer1.properties"));
        final KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(properties);
        //2 subscribe to the topic and pull messages
        consumer.subscribe(Collections.singleton("hello"),
                //partitions are reassigned whenever a new consumer joins the consumer group
                new ConsumerRebalanceListener() {
                    //a newly joined consumer should resume where the other consumers in the group stopped, not at 0
                    //that position used to be supplied by the Kafka broker; with custom offset storage the broker no longer knows it,
                    //so we have to tell the new consumer ourselves, which is what the two callbacks below do
                    //called before the partitions are revoked for reassignment
                    public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
                        //each consumer commits the offsets it has reached so far
                        commit();
                    }
                    //called after the partitions have been assigned
                    public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
                        //load the stored offsets of the newly assigned partitions into the cache
                        readOffset(partitions);
                        //walk the assigned partitions and seek to the stored offsets, so the consumer knows where to resume
                        for (TopicPartition partition : partitions) {
                            Long os = offset.get(partition);
                            if (os == null) {
                                consumer.seek(partition, 0);
                            } else {
                                consumer.seek(partition, os);
                            }
                        }
                    }
                });
        //consume messages
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(2000);
            //this block is the part that should be bound atomically (consume + commit)
            {
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record);
                    //cache the position to resume from: offset + 1, i.e. the next record to consume,
                    //otherwise the last record of each partition would be re-read after a restart
                    offset.put(
                            new TopicPartition(record.topic(), record.partition()),
                            record.offset() + 1);
                }
                //commit only after the records have been processed
                commit();
            }
        }
    }
    /**
     * Read offsets from the custom storage into the cache
     * @param partitions
     */
    private static void readOffset(Collection<TopicPartition> partitions) {
        ObjectInputStream objectInputStream = null;
        Map<TopicPartition, Long> temp;
        try {
            objectInputStream = new ObjectInputStream(new FileInputStream(file));
            temp = (Map<TopicPartition, Long>) objectInputStream.readObject();
        } catch (Exception e) {
            temp = new HashMap<TopicPartition, Long>();
        } finally {
            if (objectInputStream != null) {
                try {
                    objectInputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        //from all stored offsets, keep only those of the partitions assigned to us
        for (TopicPartition partition : partitions) {
            offset.put(partition, temp.get(partition));
        }
    }
    //commit the cached offsets to the custom storage
    private static void commit() {
        //first read all the old offsets from the file
        ObjectInputStream objectInputStream = null;
        Map<TopicPartition, Long> temp;
        try {
            objectInputStream = new ObjectInputStream(new FileInputStream(file));
            temp = (Map<TopicPartition, Long>) objectInputStream.readObject();
        } catch (Exception e) {
            temp = new HashMap<TopicPartition, Long>();
        } finally {
            if (objectInputStream != null) {
                try {
                    objectInputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        //merge the cached offsets in
        temp.putAll(offset);
        //write the merged offsets back out
        ObjectOutputStream objectOutputStream = null;
        try {
            objectOutputStream = new ObjectOutputStream(new FileOutputStream(file));
            objectOutputStream.writeObject(temp);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (objectOutputStream != null) {
                try {
                    objectOutputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
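The class above loads its connection settings from a consumer1.properties file on the classpath; that file is not shown in the original text. A minimal sketch with assumed values (group id and broker address are placeholders) could look like this; auto commit must stay disabled because offsets are managed by hand:
# consumer1.properties, placed under src/main/resources (assumed contents)
bootstrap.servers=hadoop101:9092
group.id=manual-offset-group
enable.auto.commit=false
key.deserializer=org.apache.kafka.common.serialization.StringDeserializer
value.deserializer=org.apache.kafka.common.serialization.StringDeserializer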
II. Custom Interceptor
1 Case requirements
Implement a simple chain of two interceptors. The first interceptor prepends a timestamp to the front of the message value before the message is sent; the second interceptor updates the counts of successfully and unsuccessfully sent messages after the message has been sent.
2 TimeInterceptor class
package com.hike.intercetor;
import org.apache.kafka.clients.producer.ProducerInterceptor;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import java.util.Map;
public class TimeInterceptor implements ProducerInterceptor<String, String> {
    /**
     * Rebuild the record before it is sent; this is where the value is modified
     * @param record the original record
     * @return the modified record
     */
    public ProducerRecord<String, String> onSend(ProducerRecord<String, String> record) {
        //build a new record whose value has the current timestamp prepended
        return new ProducerRecord<String, String>(
                record.topic(),
                record.partition(),
                record.timestamp(),
                record.key(),
                System.currentTimeMillis() + record.value(),
                record.headers()
        );
    }
    /**
     * Called after an ack (or an error) is received
     * @param metadata
     * @param exception
     */
    public void onAcknowledgement(RecordMetadata metadata, Exception exception) {
    }
    /**
     * Called when the producer is closed
     */
    public void close() {
    }
    /**
     * Called to configure the interceptor
     * @param configs
     */
    public void configure(Map<String, ?> configs) {
    }
}
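With this interceptor in place, a record produced with value "Value0" reaches the broker with a value such as 1653810000000Value0: the current time in milliseconds is concatenated directly in front of the original value (the timestamp shown here is only an illustrative figure).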
3 CountInterceptor class
package com.hike.intercetor;
import org.apache.kafka.clients.producer.ProducerInterceptor;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import java.util.Map;
/**
 * Counts how many messages were sent successfully and how many failed
 */
public class CountIinterceptor implements ProducerInterceptor<String, String> {
    private long success = 0;
    private long fail = 0;
    public ProducerRecord<String, String> onSend(ProducerRecord<String, String> record) {
        return record;
    }
    /**
     * Count after the ack is received
     * @param metadata
     * @param exception
     */
    public void onAcknowledgement(RecordMetadata metadata, Exception exception) {
        if (exception == null) {
            success++;
        } else {
            fail++;
        }
    }
    /**
     * Print the result when sending is finished
     */
    public void close() {
        System.out.println(success + " messages sent successfully");
        System.out.println(fail + " messages failed");
    }
    public void configure(Map<String, ?> configs) {
    }
}
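Note that close() above, and therefore the two counter printouts, only run when producer.close() is called, because closing the producer is what closes the interceptor chain; the counts are printed once at shutdown, not per message.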
4 Add a few lines to the producer
package com.hike.intercetor;
import org.apache.kafka.clients.producer.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
public class Producer {
    public static void main(String[] args) throws ExecutionException, InterruptedException, IOException {
        //1 create the producer (the client object for the Kafka cluster)
        Properties properties = new Properties();
        //configure via a properties file
        //properties.load(Producer.class.getClassLoader().getResourceAsStream("kafka.properties"));
        //or configure in code
        properties.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.setProperty("acks", "all");
        properties.setProperty("bootstrap.servers", "hadoop101:9092");
        List<String> interceptors = new ArrayList<String>();
        //add the interceptors; they run in list order, so the timestamp interceptor comes first
        interceptors.add("com.hike.intercetor.TimeInterceptor");
        interceptors.add("com.hike.intercetor.CountIinterceptor");
        //register the interceptor chain in the properties
        properties.put(ProducerConfig.INTERCEPTOR_CLASSES_CONFIG, interceptors);
        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(properties);
        //2 send data
        for (int i = 0; i < 10; i++) {
            Future<RecordMetadata> future = producer.send(
                    //2.1 wrap the data in a ProducerRecord
                    new ProducerRecord<String, String>(
                            "hello",
                            Integer.toString(i),
                            "Value" + i
                    ),
                    //2.2 callback
                    new Callback() {
                        //the sender thread calls onCompletion once it receives the ack from the broker
                        public void onCompletion(RecordMetadata recordMetadata, Exception e) {
                            if (e == null) {
                                System.out.println(recordMetadata);
                            }
                        }
                    });
            // RecordMetadata recordMetadata = future.get(); //uncomment to make the send synchronous
            System.out.println("Record " + i + " sent");
        }
        //3 close the producer; this also invokes the interceptors' close()
        producer.close();
    }
}
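One way to verify the whole chain, assuming the brokers and topic from the configuration above, is to watch the topic with the console consumer while the producer runs; every value should arrive with a millisecond timestamp prepended, and the producer prints the success/failure counts when it closes:
bin/kafka-console-consumer.sh --bootstrap-server hadoop101:9092 --topic hello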
III. Integrating Flume with Kafka
1 Configuration file
# define
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /opt/module/hive/logs/hive.log
# sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = hadoop101:9092,hadoop102:9092,hadoop103:9092
a1.sinks.k1.kafka.topic = hello
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.sinks.k1.kafka.producer.acks = all
a1.sinks.k1.kafka.producer.linger.ms = 1
# channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# bind
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
For the full list of options, see the Flume website: Documentation / Flume User Guide, then search for "kafka".
2 Hands-on steps
cd /opt/module/flume/job/
vim flume-exec-kafka.conf
Run the consumer in IDEA, then execute bin/flume-ng agent -n a1 -c conf -f job/flume-exec-kafka.conf -Dflume.root.logger=INFO,console on the server; the consumer console in IDEA should then start printing messages.
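Because the source tails /opt/module/hive/logs/hive.log, any new line written to that file is forwarded to the hello topic. If Hive is not writing logs at the moment, appending a test line by hand (a made-up example, not part of the original steps) is enough to see data flow through:
echo "flume-kafka test line" >> /opt/module/hive/logs/hive.log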
IV. Kafka monitoring (Eagle)
#stop the Kafka cluster
kafka-server-stop.sh
vim kafka-server-start.sh
#change the following block in kafka-server-start.sh from
if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
export KAFKA_HEAP_OPTS="-Xmx1G -Xms1G"
fi
#to
if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
export KAFKA_HEAP_OPTS="-server -Xms2G -Xmx2G -XX:PermSize=128m -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:ParallelGCThreads=8 -XX:ConcGCThreads=5 -XX:InitiatingHeapOccupancyPercent=70"
export JMX_PORT="9999"
#export KAFKA_HEAP_OPTS="-Xmx1G -Xms1G"
fi
#distribute the modified script to the other brokers
xsync kafka-server-start.sh
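Once the brokers have been started again (last step of this section), a quick way to confirm on each node that the JMX port opened by the modified script is actually listening is the check below; this is an optional extra step, assuming net-tools is installed:
netstat -tlnp | grep 9999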
#upload kafka-eagle-bin-1.4.5.tar.gz to the /opt/software directory on the cluster
#extract it locally
tar -zxvf kafka-eagle-bin-1.4.5.tar.gz
#enter the directory that was just extracted
#extract kafka-eagle-web-1.4.5-bin.tar.gz to /opt/module
tar -zxvf kafka-eagle-web-1.4.5-bin.tar.gz -C /opt/module/
mv kafka-eagle-web-1.4.5/ eagle
#make the startup script executable
cd bin/
chmod 777 ke.sh
#edit the configuration file conf/system-config.properties
######################################
# multi zookeeper & kafka cluster list
######################################
kafka.eagle.zk.cluster.alias=cluster1
cluster1.zk.list=hadoop101:2181,hadoop102:2181,hadoop103:2181
######################################
# broker size online list
######################################
cluster1.kafka.eagle.broker.size=20
######################################
# zk client thread limit
######################################
kafka.zk.limit.size=25
######################################
# kafka eagle webui port
######################################
kafka.eagle.webui.port=8048
######################################
# kafka offset storage
######################################
cluster1.kafka.eagle.offset.storage=kafka
######################################
# kafka metrics, 30 days by default
######################################
kafka.eagle.metrics.charts=true
kafka.eagle.metrics.retain=30
######################################
# kafka sql topic records max
kafka.eagle.sql.fix.error=false
######################################
# delete kafka topic token
######################################
kafka.eagle.topic.token=keadmin
######################################
# kafka sasl authenticate
######################################
cluster1.kafka.eagle.sasl.enable=false
cluster1.kafka.eagle.sasl.protocol=SASL_PLAINTEXT
cluster1.kafka.eagle.sasl.mechanism=SCRAM-SHA-256
cluster1.kafka.eagle.sasl.client.id=
cluster1.kafka.eagle.sasl.cgroup.enable=false
cluster2.kafka.eagle.sasl.enable=false
cluster2.kafka.eagle.sasl.protocol=SASL_PLAINTEXT
cluster2.kafka.eagle.sasl.mechanism=PLAIN
cluster2.kafka.eagle.sasl.jaas.config=org.apache.kafka.common.security.plain.PlainLoginModule required username="kafka" password="kafka-eagle";
cluster2.kafka.eagle.sasl.client.id=
cluster2.kafka.eagle.sasl.cgroup.enable=false
cluster2.kafka.eagle.sasl.cgroup.topics=
######################################
# kafka sqlite jdbc driver address
# (enable either these sqlite settings or the mysql settings below, not both)
######################################
#kafka.eagle.driver=org.sqlite.JDBC
#kafka.eagle.url=jdbc:sqlite:/hadoop/kafka-eagle/db/ke.db
#kafka.eagle.username=root
#kafka.eagle.password=www.kafka-eagle.org
######################################
# kafka mysql jdbc driver address
######################################
kafka.eagle.driver=com.mysql.jdbc.Driver
kafka.eagle.url=jdbc:mysql://hadoop101:3306/ke?useUnicode=true&characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull
kafka.eagle.username=root
kafka.eagle.password=hikehike
sudo vim /etc/profile.d/my_env.sh
#add the environment variables
export KE_HOME=/opt/module/eagle
export PATH=$PATH:$KE_HOME/bin
source /etc/profile
#start the Kafka cluster again, then start Kafka Eagle
kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties
bin/ke.sh start
#log in with the address and account printed by the startup output
If the login fails (you cannot get into the system), the configuration file is probably wrong; go back and check it.
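A few extra checks that may help here (hedged, since the exact output depends on the Eagle version): the ke.sh script also has a status subcommand, the Web UI listens on the port configured above (8048, with the context path printed in the startup output), and the default account for this version is usually admin with password 123456.
bin/ke.sh status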