How Kafka sends a message

Kafka producer

Starting from one of the examples in the Kafka source tree, let's trace the full path a Kafka producer takes to send a single message.

Producer initialization

First, a producer is constructed:

public Producer(String topic, Boolean isAsync) {
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:9092");
    props.put("client.id", "DemoProducer");
    props.put("key.serializer", "org.apache.kafka.common.serialization.IntegerSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    producer = new KafkaProducer<Integer, String>(props);
    this.topic = topic;
    this.isAsync = isAsync;
  }
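
For context, here is a minimal sketch of how the examples module drives this producer; the message number and payload are assumed values, not the exact example code:

final int messageNo = 1;
final String messageStr = "Message_" + messageNo;
producer.send(new ProducerRecord<Integer, String>(topic, messageNo, messageStr),
        new Callback() {
            public void onCompletion(RecordMetadata metadata, Exception exception) {
                // fires on the I/O thread once the broker acks, or once the send fails
                if (exception != null)
                    exception.printStackTrace();
                else
                    System.out.println("message(" + messageNo + ", " + messageStr + ") sent to partition "
                            + metadata.partition() + ", offset " + metadata.offset());
            }
        });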

Step inside the KafkaProducer constructor:

private KafkaProducer(ProducerConfig config, Serializer<K> keySerializer, Serializer<V> valueSerializer) {
       try {
           log.trace("Starting the Kafka producer");
           Map<String, Object> userProvidedConfigs = config.originals();
           this.producerConfig = config;
           this.time = new SystemTime();

           MetricConfig metricConfig = new MetricConfig().samples(config.getInt(ProducerConfig.METRICS_NUM_SAMPLES_CONFIG))
                   .timeWindow(config.getLong(ProducerConfig.METRICS_SAMPLE_WINDOW_MS_CONFIG),
                           TimeUnit.MILLISECONDS);
           clientId = config.getString(ProducerConfig.CLIENT_ID_CONFIG);
           if (clientId.length() <= 0)
               clientId = "producer-" + PRODUCER_CLIENT_ID_SEQUENCE.getAndIncrement();
           List<MetricsReporter> reporters = config.getConfiguredInstances(ProducerConfig.METRIC_REPORTER_CLASSES_CONFIG,
                   MetricsReporter.class);
           reporters.add(new JmxReporter(JMX_PREFIX));
           this.metrics = new Metrics(metricConfig, reporters, time);
           this.partitioner = config.getConfiguredInstance(ProducerConfig.PARTITIONER_CLASS_CONFIG, Partitioner.class);
           long retryBackoffMs = config.getLong(ProducerConfig.RETRY_BACKOFF_MS_CONFIG);
           this.metadata = new Metadata(retryBackoffMs, config.getLong(ProducerConfig.METADATA_MAX_AGE_CONFIG));
           this.maxRequestSize = config.getInt(ProducerConfig.MAX_REQUEST_SIZE_CONFIG);
           this.totalMemorySize = config.getLong(ProducerConfig.BUFFER_MEMORY_CONFIG);
           this.compressionType = CompressionType.forName(config.getString(ProducerConfig.COMPRESSION_TYPE_CONFIG));
           /* check for user defined settings.
            * If the BLOCK_ON_BUFFER_FULL is set to true,we do not honor METADATA_FETCH_TIMEOUT_CONFIG.
            * This should be removed with release 0.9 when the deprecated configs are removed.
            */
           if (userProvidedConfigs.containsKey(ProducerConfig.BLOCK_ON_BUFFER_FULL_CONFIG)) {
               log.warn(ProducerConfig.BLOCK_ON_BUFFER_FULL_CONFIG + " config is deprecated and will be removed soon. " +
                       "Please use " + ProducerConfig.MAX_BLOCK_MS_CONFIG);
               boolean blockOnBufferFull = config.getBoolean(ProducerConfig.BLOCK_ON_BUFFER_FULL_CONFIG);
               if (blockOnBufferFull) {
                   this.maxBlockTimeMs = Long.MAX_VALUE;
               } else if (userProvidedConfigs.containsKey(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG)) {
                   log.warn(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG + " config is deprecated and will be removed soon. " +
                           "Please use " + ProducerConfig.MAX_BLOCK_MS_CONFIG);
                   this.maxBlockTimeMs = config.getLong(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG);
               } else {
                   this.maxBlockTimeMs = config.getLong(ProducerConfig.MAX_BLOCK_MS_CONFIG);
               }
           } else if (userProvidedConfigs.containsKey(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG)) {
               log.warn(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG + " config is deprecated and will be removed soon. " +
                       "Please use " + ProducerConfig.MAX_BLOCK_MS_CONFIG);
               this.maxBlockTimeMs = config.getLong(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG);
           } else {
               this.maxBlockTimeMs = config.getLong(ProducerConfig.MAX_BLOCK_MS_CONFIG);
           }

           /* check for user defined settings.
            * If the TIME_OUT config is set use that for request timeout.
            * This should be removed with release 0.9
            */
           if (userProvidedConfigs.containsKey(ProducerConfig.TIMEOUT_CONFIG)) {
               log.warn(ProducerConfig.TIMEOUT_CONFIG + " config is deprecated and will be removed soon. Please use " +
                       ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG);
               this.requestTimeoutMs = config.getInt(ProducerConfig.TIMEOUT_CONFIG);
           } else {
               this.requestTimeoutMs = config.getInt(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG);
           }

           Map<String, String> metricTags = new LinkedHashMap<String, String>();
           metricTags.put("client-id", clientId);
           // the RecordAccumulator buffers records that are about to be sent; more on it later
           this.accumulator = new RecordAccumulator(config.getInt(ProducerConfig.BATCH_SIZE_CONFIG),
                   this.totalMemorySize,
                   this.compressionType,
                   config.getLong(ProducerConfig.LINGER_MS_CONFIG),
                   retryBackoffMs,
                   metrics,
                   time,
                   metricTags);
           // bootstrap.servers is how the producer discovers the cluster
           List<InetSocketAddress> addresses = ClientUtils.parseAndValidateAddresses(config.getList(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG));
           // seed the metadata with a cluster built from the bootstrap servers
           this.metadata.update(Cluster.bootstrap(addresses), time.milliseconds());
           // an NIO-related component; covered later
           ChannelBuilder channelBuilder = ClientUtils.createChannelBuilder(config.values());
           NetworkClient client = new NetworkClient(
                   new Selector(config.getLong(ProducerConfig.CONNECTIONS_MAX_IDLE_MS_CONFIG), this.metrics, time, "producer", metricTags, channelBuilder),
                   this.metadata,
                   clientId,
                   config.getInt(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION),
                   config.getLong(ProducerConfig.RECONNECT_BACKOFF_MS_CONFIG),
                   config.getInt(ProducerConfig.SEND_BUFFER_CONFIG),
                   config.getInt(ProducerConfig.RECEIVE_BUFFER_CONFIG),
                   this.requestTimeoutMs, time);
           this.sender = new Sender(client,
                   this.metadata,
                   this.accumulator,
                   config.getInt(ProducerConfig.MAX_REQUEST_SIZE_CONFIG),
                   (short) parseAcks(config.getString(ProducerConfig.ACKS_CONFIG)),
                   config.getInt(ProducerConfig.RETRIES_CONFIG),
                   this.metrics,
                   new SystemTime(),
                   clientId,
                   this.requestTimeoutMs);
           String ioThreadName = "kafka-producer-network-thread" + (clientId.length() > 0 ? " | " + clientId : "");
           // Kafka starts a dedicated thread to run the sender task; analyzed later
           this.ioThread = new KafkaThread(ioThreadName, this.sender, true);
           this.ioThread.start();

           this.errors = this.metrics.sensor("errors");

           if (keySerializer == null) {
               this.keySerializer = config.getConfiguredInstance(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
                       Serializer.class);
               this.keySerializer.configure(config.originals(), true);
           } else {
               config.ignore(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG);
               this.keySerializer = keySerializer;
           }
           if (valueSerializer == null) {
               this.valueSerializer = config.getConfiguredInstance(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
                       Serializer.class);
               this.valueSerializer.configure(config.originals(), false);
           } else {
               config.ignore(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG);
               this.valueSerializer = valueSerializer;
           }
           config.logUnused();
           AppInfoParser.registerAppInfo(JMX_PREFIX, clientId);
           log.debug("Kafka producer started");
       } catch (Throwable t) {
           // call close methods if internal objects are already constructed
           // this is to prevent resource leak. see KAFKA-2121
           close(0, TimeUnit.MILLISECONDS, true);
           // now propagate the exception
           throw new KafkaException("Failed to construct kafka producer", t);
       }
   }

This constructor is very long, so I won't explain it line by line; the important parts are annotated in the code above.

Analyzing the send flow

Below is the entire send path; we'll break it into snippets and discuss each one.

public Future<RecordMetadata> send(ProducerRecord<K, V> record, Callback callback) {
        try {
            // first make sure the metadata for the topic is available
            // snippet 1: wait for the metadata first; waitedOnMetadataMs is how long that took
            long waitedOnMetadataMs = waitOnMetadata(record.topic(), this.maxBlockTimeMs);
            long remainingWaitMs = Math.max(0, this.maxBlockTimeMs - waitedOnMetadataMs);
            // serialize the record's key and value
            byte[] serializedKey;
            try {
                serializedKey = keySerializer.serialize(record.topic(), record.key());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert key of class " + record.key().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in key.serializer");
            }
            byte[] serializedValue;
            try {
                serializedValue = valueSerializer.serialize(record.topic(), record.value());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert value of class " + record.value().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in value.serializer");
            }
            // snippet 2: pick the target partition via the partitioner
            int partition = partition(record, serializedKey, serializedValue, metadata.fetch());
            // snippet 3: compute the serialized size of this record
            int serializedSize = Records.LOG_OVERHEAD + Record.recordSize(serializedKey, serializedValue);
            // snippet 4: validate the record size against the configured limits
            ensureValidRecordSize(serializedSize);
            TopicPartition tp = new TopicPartition(record.topic(), partition);
            log.trace("Sending record {} with callback {} to topic {} partition {}", record, callback, record.topic(), partition);
            // snippet 5: append the record to the buffer (accumulator)
            RecordAccumulator.RecordAppendResult result = accumulator.append(tp, serializedKey, serializedValue, callback, remainingWaitMs);
            if (result.batchIsFull || result.newBatchCreated) {
                log.trace("Waking up the sender since topic {} partition {} is either full or getting a new batch", record.topic(), partition);
                // snippet 6: if the batch is full (or a new one was created), wake the sender to transmit
                this.sender.wakeup();
            }
            return result.future;
            // handling exceptions and record the errors;
            // for API exceptions return them in the future,
            // for other exceptions throw directly
        } catch (ApiException e) {
            log.debug("Exception occurred during message send:", e);
            if (callback != null)
                callback.onCompletion(null, e);
            this.errors.record();
            return new FutureFailure(e);
        } catch (InterruptedException e) {
            this.errors.record();
            throw new InterruptException(e);
        } catch (BufferExhaustedException e) {
            this.errors.record();
            this.metrics.sensor("buffer-exhausted-records").record();
            throw e;
        } catch (KafkaException e) {
            this.errors.record();
            throw e;
        }
    }

Let's analyze each snippet in turn.

snippet 1: fetching metadata

Before going further, we need to know what the metadata contains:

private final long refreshBackoffMs;
    // the maximum age of the metadata before it must be refreshed
    private final long metadataExpireMs;
    private int version;
    private long lastRefreshMs;
    private long lastSuccessfulRefreshMs;
    // cluster holds the cluster's node information plus all topic and partition information
    private Cluster cluster;
    private boolean needUpdate;
    private final Set<String> topics;
    private final List<Listener> listeners;
    private boolean needMetadataForAllTopics;

During producer initialization we already updated the metadata once:

List<InetSocketAddress> addresses = ClientUtils.parseAndValidateAddresses(config.getList(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG));
            this.metadata.update(Cluster.bootstrap(addresses), time.milliseconds());

update() is implemented as:

public synchronized void update(Cluster cluster, long now) {
        this.needUpdate = false;
        this.lastRefreshMs = now;
        this.lastSuccessfulRefreshMs = now;
        this.version += 1;

        for (Listener listener: listeners)
            listener.onMetadataUpdate(cluster);

        // Do this after notifying listeners as subscribed topics' list can be changed by listeners
        this.cluster = this.needMetadataForAllTopics ? getClusterForCurrentTopics(cluster) : cluster;

        notifyAll();
        log.debug("Updated cluster metadata version {} to {}", this.version, this.cluster);
    }

update() takes a Cluster and uses it to initialize the metadata's internal cluster. At startup, though, that cluster only contains the bootstrap information, so the metadata is still incomplete. Also note that update() wakes up any threads waiting on the metadata via notifyAll() (we'll see the matching wait() call shortly). This is why, before sending a message, the producer has to fetch the metadata one more time.

Before a message can be sent, the producer makes sure the metadata is in place:

private long waitOnMetadata(String topic, long maxWaitMs) throws InterruptedException {
        // add topic to metadata topic list if it is not there already.
        if (!this.metadata.containsTopic(topic))
            // first record the topic in the metadata's topics set
            this.metadata.add(topic);

        // if the metadata already has partition info for this topic, we're done; fetch() returns the metadata's cluster
        if (metadata.fetch().partitionsForTopic(topic) != null)
            return 0;

        long begin = time.milliseconds();
        long remainingWaitMs = maxWaitMs;
        // loop until the topic's partitions show up
        while (metadata.fetch().partitionsForTopic(topic) == null) {
            log.trace("Requesting metadata update for topic {}.", topic);
            // mark the metadata as needing an update
            int version = metadata.requestUpdate();
            // wake the sender up
            sender.wakeup();
            // block on the metadata object until update() calls notifyAll()
            metadata.awaitUpdate(version, remainingWaitMs);
            long elapsed = time.milliseconds() - begin;
            if (elapsed >= maxWaitMs)
                throw new TimeoutException("Failed to update metadata after " + maxWaitMs + " ms.");
            if (metadata.fetch().unauthorizedTopics().contains(topic))
                throw new TopicAuthorizationException(topic);
            remainingWaitMs = maxWaitMs - elapsed;
        }
        return time.milliseconds() - begin;
    }
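
The awaitUpdate() used above is the wait() that pairs with the notifyAll() in update(). Quoting roughly from this version of the code (treat it as a sketch):

public synchronized void awaitUpdate(final int lastVersion, final long maxWaitMs) throws InterruptedException {
        if (maxWaitMs < 0) {
            throw new IllegalArgumentException("Max time to wait for metadata updates should not be < 0 milli seconds");
        }
        long begin = System.currentTimeMillis();
        long remainingWaitMs = maxWaitMs;
        // block until update() bumps the version past lastVersion, or we time out
        while (this.version <= lastVersion) {
            if (remainingWaitMs != 0)
                wait(remainingWaitMs);
            long elapsed = System.currentTimeMillis() - begin;
            if (elapsed >= maxWaitMs)
                throw new TimeoutException("Failed to update metadata after " + maxWaitMs + " ms.");
            remainingWaitMs = maxWaitMs - elapsed;
        }
    }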

The logic above is fairly straightforward, and it ultimately hands control to the sender. As we saw during producer initialization, Kafka starts a dedicated thread to run the sender; its run() method is:

public void run() {
        log.debug("Starting Kafka producer I/O thread.");

        // main loop, runs until close is called
        while (running) {
            try {
                run(time.milliseconds());
            } catch (Exception e) {
                log.error("Uncaught error in kafka producer I/O thread: ", e);
            }
        }

        log.debug("Beginning shutdown of Kafka producer I/O thread, sending remaining records.");

        // okay we stopped accepting requests but there may still be
        // requests in the accumulator or waiting for acknowledgment,
        // wait until these are completed.
        while (!forceClose && (this.accumulator.hasUnsent() || this.client.inFlightRequestCount() > 0)) {
            try {
                run(time.milliseconds());
            } catch (Exception e) {
                log.error("Uncaught error in kafka producer I/O thread: ", e);
            }
        }
        if (forceClose) {
            // We need to fail all the incomplete batches and wake up the threads waiting on
            // the futures.
            this.accumulator.abortIncompleteBatches();
        }
        try {
            this.client.close();
        } catch (Exception e) {
            log.error("Failed to close network client", e);
        }

        log.debug("Shutdown of Kafka producer I/O thread has completed.");
    }

The run(time.milliseconds()) call inside it is implemented as:

public void run(long now) {
        Cluster cluster = metadata.fetch();
        // get the list of partitions with data ready to send
        RecordAccumulator.ReadyCheckResult result = this.accumulator.ready(cluster, now);

        // if there are any partitions whose leaders are not known yet, force metadata update
        if (result.unknownLeadersExist)
            this.metadata.requestUpdate();

        // remove any nodes we aren't ready to send to
        Iterator<Node> iter = result.readyNodes.iterator();
        long notReadyTimeout = Long.MAX_VALUE;
        while (iter.hasNext()) {
            Node node = iter.next();
            if (!this.client.ready(node, now)) {
                iter.remove();
                notReadyTimeout = Math.min(notReadyTimeout, this.client.connectionDelay(node, now));
            }
        }

        // create produce requests
        Map<Integer, List<RecordBatch>> batches = this.accumulator.drain(cluster,
                                                                         result.readyNodes,
                                                                         this.maxRequestSize,
                                                                         now);

        List<RecordBatch> expiredBatches = this.accumulator.abortExpiredBatches(this.requestTimeout, cluster, now);
        // update sensors
        for (RecordBatch expiredBatch : expiredBatches)
            this.sensors.recordErrors(expiredBatch.topicPartition.topic(), expiredBatch.recordCount);

        sensors.updateProduceRequestMetrics(batches);
        List<ClientRequest> requests = createProduceRequests(batches, now);
        // If we have any nodes that are ready to send + have sendable data, poll with 0 timeout so this can immediately
        // loop and try sending more data. Otherwise, the timeout is determined by nodes that have partitions with data
        // that isn't yet sendable (e.g. lingering, backing off). Note that this specifically does not include nodes
        // with sendable data that aren't ready to send since they would cause busy looping.
        long pollTimeout = Math.min(result.nextReadyCheckDelayMs, notReadyTimeout);
        if (result.readyNodes.size() > 0) {
            log.trace("Nodes with data ready to send: {}", result.readyNodes);
            log.trace("Created {} produce requests: {}", requests.size(), requests);
            pollTimeout = 0;
        }
        for (ClientRequest request : requests)
            client.send(request, now);

        // if some partitions are already ready to be sent, the select time would be 0;
        // otherwise if some partition already has some data accumulated but not ready yet,
        // the select time will be the time difference between now and its linger expiry time;
        // otherwise the select time will be the time difference between now and the metadata expiry time;
        this.client.poll(pollTimeout, now);
    }

So what actually happens when sender.wakeup() is called? It certainly ends up refreshing the metadata, but we'll save the details for later.
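
The wakeup mechanism itself, though, is just a thin chain of delegation down to the NIO selector. A sketch of that chain (method bodies abridged from this version of the code):

// Sender
public void wakeup() {
    this.client.wakeup();
}

// NetworkClient
public void wakeup() {
    this.selector.wakeup();
}

// org.apache.kafka.common.network.Selector
public void wakeup() {
    // interrupts a blocking select() so the I/O thread loops again immediately
    this.nioSelector.wakeup();
}

In other words, wakeup() breaks the I/O thread out of its blocking poll so that run(long) starts a fresh iteration right away and can pick up the pending metadata request or the newly filled batch.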

snippet 2: partitioning

Before sending a Kafka message, the producer must determine which partition of the topic it should go to.

private int partition(ProducerRecord<K, V> record, byte[] serializedKey , byte[] serializedValue, Cluster cluster) {
        // if the record already specifies a partition, just validate that it's in range
        Integer partition = record.partition();
        if (partition != null) {
            List<PartitionInfo> partitions = cluster.partitionsForTopic(record.topic());
            int numPartitions = partitions.size();
            // they have given us a partition, use it
            if (partition < 0 || partition >= numPartitions)
                throw new IllegalArgumentException("Invalid partition given with record: " + partition
                                                   + " is not in the range [0..."
                                                   + numPartitions
                                                   + "].");
            return partition;
        }
        // otherwise, delegate to the configured partitioner
        return this.partitioner.partition(record.topic(), record.key(), serializedKey, record.value(), serializedValue,
            cluster);
    }

Kafka ships a default partitioner implementation:

public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        List<PartitionInfo> partitions = cluster.partitionsForTopic(topic);
        int numPartitions = partitions.size();
        // records without a key are spread round-robin
        if (keyBytes == null) {
            int nextValue = counter.getAndIncrement();
            List<PartitionInfo> availablePartitions = cluster.availablePartitionsForTopic(topic);
            if (availablePartitions.size() > 0) {
                int part = DefaultPartitioner.toPositive(nextValue) % availablePartitions.size();
                return availablePartitions.get(part).partition();
            } else {
                // no partitions are available, give a non-available partition
                return DefaultPartitioner.toPositive(nextValue) % numPartitions;
            }
        } else {
            // records with a key are assigned by hashing it
            // hash the keyBytes to choose a partition
            return DefaultPartitioner.toPositive(Utils.murmur2(keyBytes)) % numPartitions;
        }
    }
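
If the default policy doesn't fit, you can plug in your own partitioner through the partitioner.class config. A minimal sketch (the class name and bucketing rule are made up for illustration):

import java.util.Arrays;
import java.util.Map;

import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;

public class BucketPartitioner implements Partitioner {

    @Override
    public void configure(Map<String, ?> configs) {}

    @Override
    public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        int numPartitions = cluster.partitionsForTopic(topic).size();
        if (keyBytes == null)
            return 0; // this sketch pins keyless records to partition 0
        // mask off the sign bit so the modulo result is never negative
        return (Arrays.hashCode(keyBytes) & 0x7fffffff) % numPartitions;
    }

    @Override
    public void close() {}
}

It would be registered in the Properties we built earlier, e.g. props.put("partitioner.class", BucketPartitioner.class.getName()).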

snippets 3 & 4: record size

An actual Kafka record on the wire is somewhat larger than just its key and value:

public static int recordSize(int keySize, int valueSize) {
        return CRC_LENGTH + MAGIC_LENGTH + ATTRIBUTE_LENGTH + KEY_SIZE_LENGTH + keySize + VALUE_SIZE_LENGTH + valueSize;
    }
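
If the v0 format constants are as I recall (CRC_LENGTH = 4, MAGIC_LENGTH = 1, ATTRIBUTE_LENGTH = 1, KEY_SIZE_LENGTH = 4, VALUE_SIZE_LENGTH = 4), the fixed record overhead is 14 bytes, and Records.LOG_OVERHEAD adds another 12 (an 8-byte offset plus a 4-byte size field). Under those assumptions, a record with a 4-byte key and a 10-byte value actually occupies 12 + 14 + 4 + 10 = 40 bytes, not 14.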

The total size has to be bounded:

private void ensureValidRecordSize(int size) {
        if (size > this.maxRequestSize)
            throw new RecordTooLargeException("The message is " + size +
                                              " bytes when serialized which is larger than the maximum request size you have configured with the " +
                                              ProducerConfig.MAX_REQUEST_SIZE_CONFIG +
                                              " configuration.");
        if (size > this.totalMemorySize)
            throw new RecordTooLargeException("The message is " + size +
                                              " bytes when serialized which is larger than the total memory buffer you have configured with the " +
                                              ProducerConfig.BUFFER_MEMORY_CONFIG +
                                              " configuration.");
    }

We won't dig into this further.

snippet 5: putting the record into the buffer

What is this buffer? The RecordAccumulator has these fields:

private volatile boolean closed;
    private int drainIndex;
    private final AtomicInteger flushesInProgress;
    private final AtomicInteger appendsInProgress;
    private final int batchSize;
    private final CompressionType compression;
    private final long lingerMs;
    private final long retryBackoffMs;
    private final BufferPool free;
    private final Time time;
    private final ConcurrentMap<TopicPartition, Deque<RecordBatch>> batches;
    private final IncompleteRecordBatches incomplete;

The batches field shows that Kafka routes messages for different TopicPartitions into different queues. A Deque is a double-ended queue that supports insertion and removal at both ends; "putting a record into the buffer" simply means putting it into this deque. Next, look at the element type the Deque holds, RecordBatch:

public final class RecordBatch {

    private static final Logger log = LoggerFactory.getLogger(RecordBatch.class);

    public int recordCount = 0;
    public int maxRecordSize = 0;
    public volatile int attempts = 0;
    public final long createdMs;
    public long drainedMs;
    public long lastAttemptMs;
    // wraps the ByteBuffer that holds the actual record bytes
    public final MemoryRecords records;
    public final TopicPartition topicPartition;
    public final ProduceRequestResult produceFuture;
    public long lastAppendTime;
    private final List<Thunk> thunks;
    private boolean retry;
    
    ...

Kafka's high performance lives in many small implementation details, and the append() method below is a good example. Let's go straight to the code.

public RecordAppendResult append(TopicPartition tp, byte[] key, byte[] value, Callback callback, long maxTimeToBlock) throws InterruptedException {
        // We keep track of the number of appending thread to make sure we do not miss batches in
        // abortIncompleteBatches().
        // this counter implies append() may be called from multiple threads concurrently
        appendsInProgress.incrementAndGet();
        try {
            if (closed)
                throw new IllegalStateException("Cannot send after the producer is closed.");
            // check if we have an in-progress batch
            Deque<RecordBatch> dq = dequeFor(tp);
            synchronized (dq) {
                RecordBatch last = dq.peekLast();
                // if we found an in-progress batch, try to append right into it
                if (last != null) {
                    FutureRecordMetadata future = last.tryAppend(key, value, callback, time.milliseconds());
                    if (future != null)
                        return new RecordAppendResult(future, dq.size() > 1 || last.records.isFull(), false);
                }
            }

            // we don't have an in-progress record batch try to allocate a new batch
            int size = Math.max(this.batchSize, Records.LOG_OVERHEAD + Record.recordSize(key, value));
            log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
            ByteBuffer buffer = free.allocate(size, maxTimeToBlock);
            synchronized (dq) {
                // Need to check if producer is closed again after grabbing the dequeue lock.
                if (closed)
                    throw new IllegalStateException("Cannot send after the producer is closed.");
                RecordBatch last = dq.peekLast();
                if (last != null) {
                    FutureRecordMetadata future = last.tryAppend(key, value, callback, time.milliseconds());
                    if (future != null) {
                        // Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
                        free.deallocate(buffer);
                        return new RecordAppendResult(future, dq.size() > 1 || last.records.isFull(), false);
                    }
                }
                MemoryRecords records = MemoryRecords.emptyRecords(buffer, compression, this.batchSize);
                RecordBatch batch = new RecordBatch(tp, records, time.milliseconds());
                FutureRecordMetadata future = Utils.notNull(batch.tryAppend(key, value, callback, time.milliseconds()));

                dq.addLast(batch);
                incomplete.add(batch);
                return new RecordAppendResult(future, dq.size() > 1 || batch.records.isFull(), true);
            }
        } finally {
            appendsInProgress.decrementAndGet();
        }
    }
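
The dequeFor(tp) helper used above lazily creates one deque per partition; roughly, in this version:

private Deque<RecordBatch> dequeFor(TopicPartition tp) {
        Deque<RecordBatch> d = this.batches.get(tp);
        if (d != null)
            return d;
        d = new ArrayDeque<>();
        Deque<RecordBatch> previous = this.batches.putIfAbsent(tp, d);
        if (previous == null)
            return d;
        else
            // another thread won the race and installed a deque first; use theirs
            return previous;
    }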

The method that tries to append into a batch is:

public FutureRecordMetadata tryAppend(byte[] key, byte[] value, Callback callback, long now) {
        if (!this.records.hasRoomFor(key, value)) {
            return null;
        } else {
            this.records.append(0L, key, value);
            this.maxRecordSize = Math.max(this.maxRecordSize, Record.recordSize(key, value));
            this.lastAppendTime = now;
            FutureRecordMetadata future = new FutureRecordMetadata(this.produceFuture, this.recordCount);
            if (callback != null)
                thunks.add(new Thunk(callback, future));
            this.recordCount++;
            return future;
        }
    }

Before appending, it first checks whether there is enough room:

public boolean hasRoomFor(byte[] key, byte[] value) {
        if (!this.writable)
            return false;

        // if nothing has been written to this batch yet, check whether a whole batch can hold this record
        // (return false if not); otherwise check whether the remaining space is enough.
        // initialCapacity is the size of the initially allocated ByteBuffer; writeLimit is the batch size cap we impose
        return this.compressor.numRecordsWritten() == 0 ?
            this.initialCapacity >= Records.LOG_OVERHEAD + Record.recordSize(key, value) :
            this.writeLimit >= this.compressor.estimatedBytesWritten() + Records.LOG_OVERHEAD + Record.recordSize(key, value);
    }
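
To make this concrete, assume the default batch.size of 16384 bytes: small records keep landing in the same batch until estimatedBytesWritten() plus the next record would exceed writeLimit. A single oversized record (say 20 KB) still gets through, because append() allocates max(batchSize, recordSize) for a fresh batch, so the initialCapacity branch above passes and that batch holds just the one record.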

If the batch doesn't have enough room, tryAppend() returns null, meaning the append failed; a non-null result counts as success. Note that when the second tryAppend inside append() succeeds, this line runs:

// Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
free.deallocate(buffer);

Before landing in that already-existing batch, we had quietly pre-allocated a block of memory; it's no longer needed, so it is returned to the pool. The rest of the logic is easy to follow:

// first create the place where records are stored
MemoryRecords records = MemoryRecords.emptyRecords(buffer, compression, this.batchSize);
// then create the batch itself
RecordBatch batch = new RecordBatch(tp, records, time.milliseconds());
FutureRecordMetadata future = Utils.notNull(batch.tryAppend(key, value, callback, time.milliseconds()));
dq.addLast(batch);
incomplete.add(batch);
return new RecordAppendResult(future, dq.size() > 1 || batch.records.isFull(), true);

How Kafka manages this block of memory (the BufferPool behind free) deserves its own write-up; we'll come back to it another time.

snippet 6: sending

At last, the actual send. Once again we run into this call:

sender.wakeup()

What does it really mean here? Start from the last line of the sender thread's run(long) method:

// if some partitions are already ready to be sent, the select time would be 0;
        // otherwise if some partition already has some data accumulated but not ready yet,
        // the select time will be the time difference between now and its linger expiry time;
        // otherwise the select time will be the time difference between now and the metadata expiry time;
        this.client.poll(pollTimeout, now);

Here the client's implementation class is NetworkClient, and its poll() is:

public List<ClientResponse> poll(long timeout, long now) {
        // this may build and queue a metadata fetch request
        long metadataTimeout = metadataUpdater.maybeUpdate(now);
        try {
            this.selector.poll(Utils.min(timeout, metadataTimeout, requestTimeoutMs));
        } catch (IOException e) {
            log.error("Unexpected error during I/O", e);
        }

        // process completed actions
        long updatedNow = this.time.milliseconds();
        List<ClientResponse> responses = new ArrayList<>();
        handleCompletedSends(responses, updatedNow);
        handleCompletedReceives(responses, updatedNow);
        handleDisconnections(responses, updatedNow);
        handleConnections();
        handleTimedOutRequests(responses, updatedNow);

        // invoke callbacks
        for (ClientResponse response : responses) {
            if (response.request().hasCallback()) {
                try {
                    response.request().callback().onComplete(response);
                } catch (Exception e) {
                    log.error("Uncaught error in request completion:", e);
                }
            }
        }

        return responses;
    }

metadataUpdater.maybeUpdate(now) is where a metadata fetch request may be assembled:

public long maybeUpdate(long now) {
            // should we update our metadata?
            long timeToNextMetadataUpdate = metadata.timeToNextUpdate(now);
            long timeToNextReconnectAttempt = Math.max(this.lastNoNodeAvailableMs + metadata.refreshBackoff() - now, 0);
            long waitForMetadataFetch = this.metadataFetchInProgress ? Integer.MAX_VALUE : 0;
            // if there is no node available to connect, back off refreshing metadata
            long metadataTimeout = Math.max(Math.max(timeToNextMetadataUpdate, timeToNextReconnectAttempt),
                    waitForMetadataFetch);
 
            if (metadataTimeout == 0) {
                // Beware that the behavior of this method and the computation of timeouts for poll() are
                // highly dependent on the behavior of leastLoadedNode.
                // pick a node to send it to
                Node node = leastLoadedNode(now);
                maybeUpdate(now, node);
            }

            return metadataTimeout;
        }

leastLoadedNode() picks the node with the fewest in-flight requests at the moment:

public Node leastLoadedNode(long now) {
        List<Node> nodes = this.metadataUpdater.fetchNodes();
        int inflight = Integer.MAX_VALUE;
        Node found = null;

        int offset = this.randOffset.nextInt(nodes.size());
        for (int i = 0; i < nodes.size(); i++) {
            int idx = (offset + i) % nodes.size();
            Node node = nodes.get(idx);
            int currInflight = this.inFlightRequests.inFlightRequestCount(node.idString());
            if (currInflight == 0 && this.connectionStates.isConnected(node.idString())) {
                // if we find an established connection with no in-flight requests we can stop right away
                return node;
            } else if (!this.connectionStates.isBlackedOut(node.idString(), now) && currInflight < inflight) {
                // otherwise if this is the best we have found so far, record that
                inflight = currInflight;
                found = node;
            }
        }
        
        return found;
    }

The two-argument maybeUpdate() is implemented as:

private void maybeUpdate(long now, Node node) {
            if (node == null) {
                log.debug("Give up sending metadata request since no node is available");
                // mark the timestamp for no node available to connect
                this.lastNoNodeAvailableMs = now;
                return;
            }
            String nodeConnectionId = node.idString();

            if (canSendRequest(nodeConnectionId)) {
                Set<String> topics = metadata.needMetadataForAllTopics() ? new HashSet<String>() : metadata.topics();
                this.metadataFetchInProgress = true;
                ClientRequest metadataRequest = request(now, nodeConnectionId, topics);
                log.debug("Sending metadata request {} to node {}", metadataRequest, node.id());
                doSend(metadataRequest, now);
            } else if (connectionStates.canConnect(nodeConnectionId, now)) {
                // we don't have a connection to this node right now, make one
                log.debug("Initialize connection to node {} for sending metadata request", node.id());
                initiateConnect(node, now);
                // If initiateConnect failed immediately, this node will be put into blackout and we
                // should allow immediately retrying in case there is another candidate node. If it
                // is still connecting, the worst case is that we end up setting a longer timeout
                // on the next round and then wait for the response.
            } else { // connected, but can't send more OR connecting
                // In either case, we just need to wait for a network event to let us know the selected
                // connection might be usable again.
                this.lastNoNodeAvailableMs = now;
            }
        }

The request is constructed first:

private ClientRequest request(long now, String node, Set<String> topics) {
            MetadataRequest metadata = new MetadataRequest(new ArrayList<>(topics));
            RequestSend send = new RequestSend(node, nextRequestHeader(ApiKeys.METADATA), metadata.toStruct());
            return new ClientRequest(now, true, send, null, true);
        }

So the metadata request is driven by the topic set, which was seeded with the topic we passed in when we first tried to produce. Finally comes doSend():

private void doSend(ClientRequest request, long now) {
        request.setSendTimeMs(now);
        // track requests that are still awaiting a response
        this.inFlightRequests.add(request);
        selector.send(request.request());
    }
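
Note that selector.send() does not hit the network here; it only queues the send on the corresponding channel. The actual socket write, and the handling of the metadata response that eventually calls metadata.update() and wakes our blocked waitOnMetadata(), both happen inside the next selector.poll() iteration of the I/O thread.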