How a Kafka Producer Sends Messages to the Broker: A Source-Level Walkthrough


1. Client Development

A typical producer workflow involves the following steps:

  1. Configure the producer client parameters and create a producer instance.

  2. Build the message to be sent.

  3. Send the message.

  4. Close the producer instance.

Producer client example code

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class KafkaProducerDemo {
    public static final String brokerList = "localhost:9092";
    public static final String topic = "topic-demo";

    public static Properties initConfig(){
        Properties props = new Properties();
        props.put("bootstrap.servers", brokerList);
        props.put("key.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        props.put("client.id", "producer.client.id.demo");
        return props;
    }

    public static void main(String[] args) {
        Properties props = initConfig();
        KafkaProducer<String, String> producer = new KafkaProducer<>(props);
        ProducerRecord<String, String> record =
                new ProducerRecord<>(topic, "Hello, Kafka!");
        try {
            producer.send(record);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the producer so buffered records are flushed and resources are released
            producer.close();
        }
    }
}

The producer API is fairly simple to use: create a ProducerRecord object (which contains the target topic and the payload to send, and can optionally carry a key and a partition), then call the send method to send the message. Before sending a ProducerRecord, the producer must serialize the key and value objects into byte arrays so that they can be transmitted over the network.
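For reference, a ProducerRecord can also carry a key and, if needed, an explicit partition. Below is a minimal sketch of those constructor overloads, reusing the topic constant from the demo (the key value is made up for illustration):

// A record with a key: records with the same key are routed to the same partition
ProducerRecord<String, String> keyedRecord =
        new ProducerRecord<>(topic, "demo-key", "Hello, Kafka!");

// A record that explicitly targets partition 0 and also carries a key
ProducerRecord<String, String> pinnedRecord =
        new ProducerRecord<>(topic, 0, "demo-key", "Hello, Kafka!");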

2. Source Code Analysis

What does new KafkaProducer<>(props) do?

   public KafkaProducer(Properties properties) {
        this(new ProducerConfig(properties), (Serializer)null, (Serializer)null, (Metadata)null, (KafkaClient)null);
    }


	KafkaProducer(ProducerConfig config, Serializer<K> keySerializer, Serializer<V> valueSerializer, Metadata metadata, KafkaClient kafkaClient) {
        try {
            Map<String, Object> userProvidedConfigs = config.originals();
            this.producerConfig = config;
            this.time = Time.SYSTEM;
            String clientId = config.getString("client.id");
            if (clientId.length() <= 0) {
                clientId = "producer-" + PRODUCER_CLIENT_ID_SEQUENCE.getAndIncrement();
            }

            this.clientId = clientId;
            String transactionalId = userProvidedConfigs.containsKey("transactional.id") ? (String)userProvidedConfigs.get("transactional.id") : null;
            LogContext logContext;
            if (transactionalId == null) {
                logContext = new LogContext(String.format("[Producer clientId=%s] ", clientId));
            } else {
                logContext = new LogContext(String.format("[Producer clientId=%s, transactionalId=%s] ", clientId, transactionalId));
            }

            this.log = logContext.logger(KafkaProducer.class);
            this.log.trace("Starting the Kafka producer");
            Map<String, String> metricTags = Collections.singletonMap("client-id", clientId);
            MetricConfig metricConfig = (new MetricConfig()).samples(config.getInt("metrics.num.samples")).timeWindow(config.getLong("metrics.sample.window.ms"), TimeUnit.MILLISECONDS).recordLevel(RecordingLevel.forName(config.getString("metrics.recording.level"))).tags(metricTags);
            List<MetricsReporter> reporters = config.getConfiguredInstances("metric.reporters", MetricsReporter.class);
            reporters.add(new JmxReporter("kafka.producer"));
            this.metrics = new Metrics(metricConfig, reporters, this.time);
            ProducerMetrics metricsRegistry = new ProducerMetrics(this.metrics);
            this.partitioner = (Partitioner)config.getConfiguredInstance("partitioner.class", Partitioner.class);
            long retryBackoffMs = config.getLong("retry.backoff.ms");
            if (keySerializer == null) {
                this.keySerializer = Wrapper.ensureExtended((Serializer)config.getConfiguredInstance("key.serializer", Serializer.class));
                this.keySerializer.configure(config.originals(), true);
            } else {
                config.ignore("key.serializer");
                this.keySerializer = Wrapper.ensureExtended(keySerializer);
            }

            if (valueSerializer == null) {
                this.valueSerializer = Wrapper.ensureExtended((Serializer)config.getConfiguredInstance("value.serializer", Serializer.class));
                this.valueSerializer.configure(config.originals(), false);
            } else {
                config.ignore("value.serializer");
                this.valueSerializer = Wrapper.ensureExtended(valueSerializer);
            }

            userProvidedConfigs.put("client.id", clientId);
            List<ProducerInterceptor<K, V>> interceptorList = (new ProducerConfig(userProvidedConfigs, false)).getConfiguredInstances("interceptor.classes", ProducerInterceptor.class);
            // Producer interceptors
            this.interceptors = new ProducerInterceptors(interceptorList);
            ClusterResourceListeners clusterResourceListeners = this.configureClusterResourceListeners(keySerializer, valueSerializer, interceptorList, reporters);
            this.maxRequestSize = config.getInt("max.request.size");
            this.totalMemorySize = config.getLong("buffer.memory");
            this.compressionType = CompressionType.forName(config.getString("compression.type"));
            this.maxBlockTimeMs = config.getLong("max.block.ms");
            this.requestTimeoutMs = config.getInt("request.timeout.ms");
            this.transactionManager = configureTransactionState(config, logContext, this.log);
            int retries = configureRetries(config, this.transactionManager != null, this.log);
            int maxInflightRequests = configureInflightRequests(config, this.transactionManager != null);
            short acks = configureAcks(config, this.transactionManager != null, this.log);
            this.apiVersions = new ApiVersions();
            // Create the RecordAccumulator.
            // RecordAccumulator: every producer maintains a fixed-size block of memory that is mainly used to
            // batch individual records for bulk sending, which improves throughput and reduces bandwidth usage.
            // Its size is configurable via buffer.memory; the default is 33554432 bytes (32 MB).

            // The RecordAccumulator's memory is split into two parts.
            // The first part is memory already in use, which mostly holds queues: one queue is created per
            // topic partition to hold the batches of records waiting to be sent for that partition.
            // The second part is unused memory, divided into pooled memory and the remaining non-pooled
            // memory (nonPooledAvailableMemory). The pooled memory consists of multiple ByteBuffers of
            // batch.size (default 16 KB) kept in a queue; all remaining space forms the non-pooled free memory.
            this.accumulator = new RecordAccumulator(logContext, config.getInt("batch.size"), this.totalMemorySize, this.compressionType, config.getLong("linger.ms"), retryBackoffMs, this.metrics, this.time, this.apiVersions, this.transactionManager);
            List<InetSocketAddress> addresses = ClientUtils.parseAndValidateAddresses(config.getList("bootstrap.servers"));
            if (metadata != null) {
                this.metadata = metadata;
            } else {
                this.metadata = new Metadata(retryBackoffMs, config.getLong("metadata.max.age.ms"), true, true, clusterResourceListeners);
                this.metadata.update(Cluster.bootstrap(addresses), Collections.emptySet(), this.time.milliseconds());
            }

            ChannelBuilder channelBuilder = ClientUtils.createChannelBuilder(config);
            Sensor throttleTimeSensor = Sender.throttleTimeSensor(metricsRegistry.senderMetrics);
            // Create a KafkaClient responsible for communicating with the brokers
            KafkaClient client = kafkaClient != null ? kafkaClient : new NetworkClient(new Selector(config.getLong("connections.max.idle.ms"), this.metrics, this.time, "producer", channelBuilder, logContext), this.metadata, clientId, maxInflightRequests, config.getLong("reconnect.backoff.ms"), config.getLong("reconnect.backoff.max.ms"), config.getInt("send.buffer.bytes"), config.getInt("receive.buffer.bytes"), this.requestTimeoutMs, this.time, true, this.apiVersions, throttleTimeSensor, logContext);

            // The following creates the Sender
            // The Sender class implements the Runnable interface
            this.sender = new Sender(logContext, (KafkaClient)client, this.metadata, this.accumulator, maxInflightRequests == 1, config.getInt("max.request.size"), acks, retries, metricsRegistry.senderMetrics, Time.SYSTEM, this.requestTimeoutMs, config.getLong("retry.backoff.ms"), this.transactionManager, this.apiVersions);
            String ioThreadName = "kafka-producer-network-thread | " + clientId;
            // Create a thread; the actual runnable is the Sender (KafkaThread is a wrapper around it), which scans the RecordAccumulator for records to send
            this.ioThread = new KafkaThread(ioThreadName, this.sender, true);
            // Start the Sender thread
            this.ioThread.start();
            this.errors = this.metrics.sensor("errors");
            config.logUnused();
            AppInfoParser.registerAppInfo("kafka.producer", clientId, this.metrics);
            this.log.debug("Kafka producer started");
        } catch (Throwable var26) {
            this.close(0L, TimeUnit.MILLISECONDS, true);
            throw new KafkaException("Failed to construct kafka producer", var26);
        }
    }

The code above is the core logic of constructing a KafkaProducer. It builds a KafkaClient responsible for talking to the brokers, constructs a Sender, and starts it on an asynchronous thread named kafka-producer-network-thread | ${clientId}. If you set client.id to myclient when creating the producer, the thread name will be kafka-producer-network-thread | myclient.

Once the producer instance has been created, the code that interacts with it is producer.send(record).
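For completeness, send() also returns a Future<RecordMetadata> and accepts an optional Callback, so both asynchronous and synchronous styles are possible. A minimal sketch, assuming the producer and record from the demo above (plus the usual java.util.concurrent imports):

// Asynchronous: the callback fires when the broker acknowledges the record or the send fails
producer.send(record, (metadata, exception) -> {
    if (exception != null) {
        exception.printStackTrace();
    } else {
        System.out.println("sent to " + metadata.topic() + "-" + metadata.partition()
                + " @ offset " + metadata.offset());
    }
});

// Synchronous: block on the returned Future until the result (or an exception) is available
try {
    RecordMetadata meta = producer.send(record).get();
} catch (InterruptedException | ExecutionException e) {
    e.printStackTrace();
}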

What does producer.send(record) do?

Following the call chain, the real logic turns out to live in the doSend() method.

    /**
     * Implementation of asynchronously send a record to a topic.
     */
    private Future<RecordMetadata> doSend(ProducerRecord<K, V> record, Callback callback) {
        TopicPartition tp = null;
        try {
            throwIfProducerClosed();
            // first make sure the metadata for the topic is available
            ClusterAndWaitTime clusterAndWaitTime;
            try {
                clusterAndWaitTime = waitOnMetadata(record.topic(), record.partition(), maxBlockTimeMs);
            } catch (KafkaException e) {
                if (metadata.isClosed())
                    throw new KafkaException("Producer closed while send in progress", e);
                throw e;
            }
            long remainingWaitMs = Math.max(0, maxBlockTimeMs - clusterAndWaitTime.waitedOnMetadataMs);
            Cluster cluster = clusterAndWaitTime.cluster;
            byte[] serializedKey;
            try {
                // Serialize the key
                serializedKey = keySerializer.serialize(record.topic(), record.headers(), record.key());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert key of class " + record.key().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in key.serializer", cce);
            }
            byte[] serializedValue;
            try {
                // Serialize the value
                serializedValue = valueSerializer.serialize(record.topic(), record.headers(), record.value());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert value of class " + record.value().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in value.serializer", cce);
            }
            // Determine the partition
            int partition = partition(record, serializedKey, serializedValue, cluster);
            // This is also important: note its structure,
            // because TopicPartition is used as the key of a Map
            tp = new TopicPartition(record.topic(), partition);

            setReadOnly(record.headers());
            Header[] headers = record.headers().toArray();

            int serializedSize = AbstractRecords.estimateSizeInBytesUpperBound(apiVersions.maxUsableProduceMagic(),
                    compressionType, serializedKey, serializedValue, headers);
            ensureValidRecordSize(serializedSize);
            long timestamp = record.timestamp() == null ? time.milliseconds() : record.timestamp();
            log.trace("Sending record {} with callback {} to topic {} partition {}", record, callback, record.topic(), partition);
            // producer callback will make sure to call both 'callback' and interceptor callback
            Callback interceptCallback = new InterceptorCallback<>(callback, this.interceptors, tp);

            if (transactionManager != null && transactionManager.isTransactional())
                transactionManager.maybeAddPartitionToTransaction(tp);

            // Core code:
            // "sending" a message actually means buffering it in the accumulator
            RecordAccumulator.RecordAppendResult result = accumulator.append(tp, timestamp, serializedKey,
                    serializedValue, headers, interceptCallback, remainingWaitMs);
            if (result.batchIsFull || result.newBatchCreated) {
                log.trace("Waking up the sender since topic {} partition {} is either full or getting a new batch", record.topic(), partition);
                this.sender.wakeup();
            }
            return result.future;
            // handling exceptions and record the errors;
            // for API exceptions return them in the future,
            // for other exceptions throw directly
        } catch (ApiException e) {
            log.debug("Exception occurred during message send:", e);
            if (callback != null)
                callback.onCompletion(null, e);
            this.errors.record();
            this.interceptors.onSendError(record, tp, e);
            return new FutureFailure(e);
        } catch (InterruptedException e) {
            this.errors.record();
            this.interceptors.onSendError(record, tp, e);
            throw new InterruptException(e);
        } catch (BufferExhaustedException e) {
            this.errors.record();
            this.metrics.sensor("buffer-exhausted-records").record();
            this.interceptors.onSendError(record, tp, e);
            throw e;
        } catch (KafkaException e) {
            this.errors.record();
            this.interceptors.onSendError(record, tp, e);
            throw e;
        } catch (Exception e) {
            // we notify interceptor about all exceptions, since onSend is called before anything else in this method
            this.interceptors.onSendError(record, tp, e);
            throw e;
        }
    }

The accumulator.append() call lands in RecordAccumulator, whose core data structure is a map from topic partition to a deque of batches:

	private final ConcurrentMap<TopicPartition, Deque<ProducerBatch>> batches;

	public RecordAppendResult append(TopicPartition tp,
                                     long timestamp,
                                     byte[] key,
                                     byte[] value,
                                     Header[] headers,
                                     Callback callback,
                                     long maxTimeToBlock) throws InterruptedException {
        // We keep track of the number of appending thread to make sure we do not miss batches in
        // abortIncompleteBatches().
        appendsInProgress.incrementAndGet();
        ByteBuffer buffer = null;
        if (headers == null) headers = Record.EMPTY_HEADERS;
        try {
            // dq is a double-ended queue.
            // Look up the deque for this topic partition in batches
            // (ConcurrentMap<TopicPartition, Deque<ProducerBatch>>); if none exists, a new ArrayDeque<> is returned.
            // check if we have an in-progress batch
            Deque<ProducerBatch> dq = getOrCreateDeque(tp);
            synchronized (dq) {
                if (closed)
                    throw new KafkaException("Producer closed while send in progress");
                RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
                if (appendResult != null)
                    return appendResult;
            }

            // we don't have an in-progress record batch try to allocate a new batch
            byte maxUsableMagic = apiVersions.maxUsableProduceMagic();
            // Compute the size a record batch will occupy; batchSize is determined by the batch.size parameter
            int size = Math.max(this.batchSize, AbstractRecords.estimateSizeInBytesUpperBound(maxUsableMagic, compression, key, value, headers));
            log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
            // Allocate a buffer for this topic partition; if there is not enough memory for the batch,
            // this blocks for up to maxTimeToBlock (the max.block.ms parameter)
            buffer = free.allocate(size, maxTimeToBlock);
            synchronized (dq) {
                // Need to check if producer is closed again after grabbing the dequeue lock.
                if (closed)
                    throw new KafkaException("Producer closed while send in progress");

                RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
                if (appendResult != null) {
                    // Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
                    return appendResult;
                }

                // Create a MemoryRecordsBuilder, whose appendStream (a DataOutputStream) is initialized from the buffer
                MemoryRecordsBuilder recordsBuilder = recordsBuilder(buffer, maxUsableMagic);
                ProducerBatch batch = new ProducerBatch(tp, recordsBuilder, time.milliseconds());
                // Write the key and value into the MemoryRecordsBuilder's appendStream (DataOutputStream)
                FutureRecordMetadata future = Utils.notNull(batch.tryAppend(timestamp, key, value, headers, callback, time.milliseconds()));

                // Add the batch to the deque
                dq.addLast(batch);
                incomplete.add(batch);

                // Don't deallocate this buffer in the finally block as it's being used in the record batch
                buffer = null;

                return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true);
            }
        } finally {
            if (buffer != null)
                free.deallocate(buffer);
            appendsInProgress.decrementAndGet();
        }
    }

Notice that every Kafka producer creates a background I/O thread when it is constructed; each producer has exactly one ioThread. The producer's send() does not push messages directly to the broker. Instead, send() stores the message in the RecordAccumulator, which is essentially a Map (ConcurrentMap<TopicPartition, Deque<ProducerBatch>>). The message is appended to a record batch (messages for the same topic and partition belong to the same batch), and all messages in that batch are sent to the same topic and partition.
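The batching behaviour described here is controlled by a handful of producer settings. A minimal sketch of how they could be set (the values are illustrative only, not recommendations):

Properties props = initConfig();
// Total memory available to the RecordAccumulator; default 33554432 (32 MB)
props.put("buffer.memory", 33554432);
// Target size of one batch / pooled ByteBuffer; default 16384 (16 KB)
props.put("batch.size", 16384);
// How long to wait for more records before sending a batch that is not yet full
props.put("linger.ms", 5);
// How long send() may block when the accumulator's memory is exhausted
props.put("max.block.ms", 60000);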

The background ioThread keeps scanning this buffer pool; it is the thread that actually sends the messages to the broker. Every producer therefore consists of a pool holding unsent records plus a background I/O thread that sends those records to the Kafka cluster. Failing to close the producer after use will leak these resources.
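Since KafkaProducer implements Closeable, a simple way to avoid that leak is try-with-resources; a minimal sketch reusing the demo's configuration:

try (KafkaProducer<String, String> producer = new KafkaProducer<>(initConfig())) {
    producer.send(new ProducerRecord<>(topic, "Hello, Kafka!"));
} // close() runs automatically, flushing buffered records and stopping the I/O thread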

Sending the data in the RecordAccumulator to the Kafka broker

The messages are now stored in the RecordAccumulator; next, let's look at how they are sent. As mentioned above, creating a KafkaProducer starts an asynchronous thread that takes messages from the RecordAccumulator and sends them to Kafka. The core sending code is in Sender.java, which implements Runnable and keeps running in the background, processing send requests and delivering messages to the appropriate nodes until the KafkaProducer is closed.

/**
* The background thread that handles the sending of produce requests to the Kafka cluster. This thread makes metadata
* requests to renew its view of the cluster and then sends produce requests to the appropriate nodes.
*/
public class Sender implements Runnable {
    public void run() {
    
        // Keep running until KafkaProducer.close() is called
        while (running) {
           run(time.milliseconds());
        }
        
        // Judging from the log message, this is where the post-close shutdown logic starts
        log.debug("Beginning shutdown of Kafka producer I/O thread, sending remaining records.");
    
        // On a non-forced close there may still be in-flight requests and undrained data in the accumulator; finish processing them first
        while (!forceClose && (this.accumulator.hasUndrained() || this.client.inFlightRequestCount() > 0)) {
           run(time.milliseconds());
        }
        if (forceClose) {
            // On a forced close, any unsent messages are aborted and fail with new KafkaException("Producer is closed forcefully.")
            this.accumulator.abortIncompleteBatches();
        }
        ...
      }

KafkaProducer has two close methods: close() and close(long timeout, TimeUnit timeUnit), where the timeout parameter is the maximum time to wait for the producer to complete any pending requests. The first form uses a timeout of Long.MAX_VALUE milliseconds. With the second form, timeout=0 means a forced close, which shuts down the Sender directly (setting running=false).
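A minimal sketch of the two variants (the 10-second timeout is arbitrary):

// Graceful close: wait up to 10 seconds for in-flight requests to complete
producer.close(10, TimeUnit.SECONDS);

// Forced close: a timeout of 0 shuts the Sender down immediately, aborting unsent batches
// producer.close(0, TimeUnit.MILLISECONDS);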

The run(long) method handles the transactionManager and then the message-sending flow.

The code is as follows:

    /**
     * Run a single iteration of sending
     *
     * @param now The current POSIX time in milliseconds
     */
    void run(long now) {
        // Handle the transactionManager
        if (transactionManager != null) {
            try {
                if (transactionManager.shouldResetProducerStateAfterResolvingSequences())
                    // Check if the previous run expired batches which requires a reset of the producer state.
                    transactionManager.resetProducerId();

                if (!transactionManager.isTransactional()) {
                    // this is an idempotent producer, so make sure we have a producer id
                    maybeWaitForProducerId();
                } else if (transactionManager.hasUnresolvedSequences() && !transactionManager.hasFatalError()) {
                    transactionManager.transitionToFatalError(new KafkaException("The client hasn't received acknowledgment for " +
                            "some previously sent messages and can no longer retry them. It isn't safe to continue."));
                } else if (transactionManager.hasInFlightTransactionalRequest() || maybeSendTransactionalRequest(now)) {
                    // as long as there are outstanding transactional requests, we simply wait for them to return
                    client.poll(retryBackoffMs, now);
                    return;
                }

                // do not continue sending if the transaction manager is in a failed state or if there
                // is no producer id (for the idempotent case).
                if (transactionManager.hasFatalError() || !transactionManager.hasProducerId()) {
                    RuntimeException lastError = transactionManager.lastError();
                    if (lastError != null)
                        maybeAbortBatches(lastError);
                    client.poll(retryBackoffMs, now);
                    return;
                } else if (transactionManager.hasAbortableError()) {
                    accumulator.abortUndrainedBatches(transactionManager.lastError());
                }
            } catch (AuthenticationException e) {
                // This is already logged as error, but propagated here to perform any clean ups.
                log.trace("Authentication exception while processing transactional request: {}", e);
                transactionManager.authenticationFailed(e);
            }
        }
        long pollTimeout = sendProducerData(now);
        client.poll(pollTimeout, now);
    }

}

Start with the sendProducerData() method; its core logic is in sendProduceRequest() (also in Sender.java):

   private void sendProduceRequest(long now, int destination, short acks, int timeout, List<ProducerBatch> batches) {
        if (batches.isEmpty())
            return;

        Map<TopicPartition, MemoryRecords> produceRecordsByPartition = new HashMap<>(batches.size());
        final Map<TopicPartition, ProducerBatch> recordsByPartition = new HashMap<>(batches.size());

        // find the minimum magic version used when creating the record sets
        byte minUsedMagic = apiVersions.maxUsableProduceMagic();
        for (ProducerBatch batch : batches) {
            if (batch.magic() < minUsedMagic)
                minUsedMagic = batch.magic();
        }

        for (ProducerBatch batch : batches) {
            TopicPartition tp = batch.topicPartition;
            // Convert the ProducerBatch's MemoryRecordsBuilder into MemoryRecords (this holds the data to be sent)
            MemoryRecords records = batch.records();

            // down convert if necessary to the minimum magic used. In general, there can be a delay between the time
            // that the producer starts building the batch and the time that we send the request, and we may have
            // chosen the message format based on out-dated metadata. In the worst case, we optimistically chose to use
            // the new message format, but found that the broker didn't support it, so we need to down-convert on the
            // client before sending. This is intended to handle edge cases around cluster upgrades where brokers may
            // not all support the same message format version. For example, if a partition migrates from a broker
            // which is supporting the new magic version to one which doesn't, then we will need to convert.
            if (!records.hasMatchingMagic(minUsedMagic))
                records = batch.records().downConvert(minUsedMagic, 0, time).records();
            produceRecordsByPartition.put(tp, records);
            recordsByPartition.put(tp, batch);
        }

        String transactionalId = null;
        if (transactionManager != null && transactionManager.isTransactional()) {
            transactionalId = transactionManager.transactionalId();
        }
        ProduceRequest.Builder requestBuilder = ProduceRequest.Builder.forMagic(minUsedMagic, acks, timeout,
                produceRecordsByPartition, transactionalId);
        // Callback invoked when the produce request completes
        RequestCompletionHandler callback = new RequestCompletionHandler() {
            public void onComplete(ClientResponse response) {
                // Handle the response
                handleProduceResponse(response, recordsByPartition, time.milliseconds());
            }
        };

        String nodeId = Integer.toString(destination);
        // Build a ClientRequest from these parameters; the records to send are inside requestBuilder
        ClientRequest clientRequest = client.newClientRequest(nodeId, requestBuilder, now, acks != 0,
                requestTimeoutMs, callback);
        // Convert the clientRequest into a Send object (Send.java, which wraps the buffer of data to send)
        // and set it on the KafkaChannel; the data has not actually been sent yet
        client.send(clientRequest, now);
        log.trace("Sent produce request to {}: {}", nodeId, requestBuilder);
    }

The client.send() call above eventually lands in NetworkClient.doSend(). All requests (whether produce requests or metadata requests) go through this method to set their corresponding Send object. The supported request types are defined in ApiKeys.java, where you can see the request and response data structures for each one.

So far we have only prepared what is needed to send the message. The actual sending happens in the main poll loop; the core code is in the pollSelectionKeys() method of Selector.java:

    void pollSelectionKeys(Set<SelectionKey> selectionKeys,
                           boolean isImmediatelyConnected,
                           long currentTimeNanos) {
        for (SelectionKey key : determineHandlingOrder(selectionKeys)) {
                //....
                /* if channel is ready write to any sockets that have space in their buffer and for which we have data */
                if (channel.ready() && key.isWritable()) {
                    Send send = null;
                    try {
                        // Under the hood this ends up calling write() on a java.nio.channels.GatheringByteChannel
                        send = channel.write();
                    } catch (Exception e) {
                        sendFailed = true;
                        throw e;
                    }
                    if (send != null) {
                        this.completedSends.add(send);
                        this.sensors.recordBytesSent(channel.id(), send.size());
                    }
                }
            ....
        }
    }

And with that, the message has reached the broker; the send path is complete. That is the happy path, but sends can fail (for example, the message is too large or there is no available leader). So where does the resend after a failure happen? Remember the callback above? Exactly, it is wired up there. Let's look at the callback's source first:

//org.apache.kafka.clients.producer.internals.Sender#handleProduceResponse
private void handleProduceResponse(ClientResponse response, Map<TopicPartition, ProducerBatch> batches, long now) {
  RequestHeader requestHeader = response.requestHeader();
  int correlationId = requestHeader.correlationId();

  if (response.wasDisconnected()) {
    // On a network disconnect, complete every batch with an Errors.NETWORK_EXCEPTION response
    for (ProducerBatch batch : batches.values())
        completeBatch(batch, new ProduceResponse.PartitionResponse(Errors.NETWORK_EXCEPTION), correlationId, now, 0L);

  } else if (response.versionMismatch() != null) {

    // On a version mismatch, complete every batch with an Errors.UNSUPPORTED_VERSION response
    for (ProducerBatch batch : batches.values())
        completeBatch(batch, new ProduceResponse.PartitionResponse(Errors.UNSUPPORTED_VERSION), correlationId, now, 0L);

  } else {

    if (response.hasResponse()) {
        // If there is a response, parse it and complete each batch with its partition response
        ...
    } else {
        // If acks=0 there is no response to wait for, so complete every batch with Errors.NONE
        for (ProducerBatch batch : batches.values()) {
            completeBatch(batch, new ProduceResponse.PartitionResponse(Errors.NONE), correlationId, now, 0L);
        }
    }
  }
}

In the completeBatch method we focus mainly on the failure-handling logic; the core source is as follows:

//org.apache.kafka.clients.producer.internals.Sender#completeBatch
private void completeBatch(ProducerBatch batch, ProduceResponse.PartitionResponse response, long correlationId,
                           long now, long throttleUntilTimeMs) {
  Errors error = response.error;

  // If the message is too large, the batch needs to be split and the pieces re-enqueued for sending
  if (error == Errors.MESSAGE_TOO_LARGE && batch.recordCount > 1 &&
        (batch.magic() >= RecordBatch.MAGIC_VALUE_V2 || batch.isCompressed())) {

    this.accumulator.splitAndReenqueue(batch);
    this.accumulator.deallocate(batch);
    this.sensors.recordBatchSplit();

  } else if (error != Errors.NONE) {

    // An error occurred; if the batch can still be retried (the retry limit has not been reached
    // and the error is a RetriableException) ...
    if (canRetry(batch, response)) {
        if (transactionManager == null) {
            // ... put the batch back at the head of its deque to wait for a retry,
            // which is effectively deque.addFirst(batch)
            reenqueueBatch(batch, now);
        }
    }
    ...
  }
}
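Whether a failed batch is retried at all is governed by producer configuration. A minimal sketch of the relevant settings (the values are illustrative, not recommendations):

Properties props = initConfig();
// How many times a retriable send failure is retried before the error is surfaced to the caller
props.put("retries", 3);
// How long to back off before re-attempting a failed request (the retryBackoffMs seen above)
props.put("retry.backoff.ms", 100);
// With retries > 0, keeping this at 1 preserves ordering within a partition
props.put("max.in.flight.requests.per.connection", 1);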

Partitioning Algorithm

The core of the default partitioner looks like this:

List<PartitionInfo> partitions = cluster.partitionsForTopic(topic);
int numPartitions = partitions.size();
if (keyBytes == null) {
    // If the key is null, use the round-robin algorithm
    int nextValue = nextValue(topic);
    List<PartitionInfo> availablePartitions = cluster.availablePartitionsForTopic(topic);
    if (availablePartitions.size() > 0) {
        int part = Utils.toPositive(nextValue) % availablePartitions.size();
        return availablePartitions.get(part).partition();
    } else {
        // no partitions are available, give a non-available partition
        return Utils.toPositive(nextValue) % numPartitions;
    }
} else {
    // Hash the key
    return Utils.toPositive(Utils.murmur2(keyBytes)) % numPartitions;
}

Kafka's partitioning logic covers two cases:

  1. If the key is null and the default partitioner is used, the record is sent to one of the topic's available partitions. The partitioner uses a round-robin algorithm to distribute messages evenly across those partitions.

  2. If the key is not null and the default partitioner is used, Kafka hashes the key (using its own hash algorithm, so the hash value does not change when the Java version is upgraded) and maps the message to a partition based on that hash. The same key is always mapped to the same partition (this no longer holds if the number of partitions changes). The mapping uses all partitions of the topic, not just the available ones, so if the target partition is unavailable the write fails; in practice this rarely happens. If you need a different routing rule, you can plug in a custom partitioner, as sketched below.
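For illustration, here is a minimal sketch of a custom Partitioner (the class name and the routing rule are made up); it is registered via the partitioner.class configuration:

import java.util.Map;

import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;
import org.apache.kafka.common.utils.Utils;

// Hypothetical example: send records with a null key to partition 0, hash everything else
public class DemoPartitioner implements Partitioner {

    @Override
    public int partition(String topic, Object key, byte[] keyBytes,
                         Object value, byte[] valueBytes, Cluster cluster) {
        int numPartitions = cluster.partitionsForTopic(topic).size();
        if (keyBytes == null) {
            return 0; // arbitrary rule for the sake of the example
        }
        // Same murmur2-based hashing as the default partitioner
        return Utils.toPositive(Utils.murmur2(keyBytes)) % numPartitions;
    }

    @Override
    public void close() { }

    @Override
    public void configure(Map<String, ?> configs) { }
}

It would then be enabled with props.put("partitioner.class", DemoPartitioner.class.getName()) before creating the producer.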