// append an entry to the index (if needed)
      if (bytesSinceLastIndexEntry > indexIntervalBytes) {
        offsetIndex.append(largestOffset, physicalPosition)
        timeIndex.maybeAppend(maxTimestampSoFar, offsetOfMaxTimestampSoFar)
        bytesSinceLastIndexEntry = 0
      }
      bytesSinceLastIndexEntry += records.sizeInBytes

read():

入参：

startOffset: Long：

读取的第一个消息的位移

maxSize: Int：

能读取的最大字节数

maxPosition: Long = size：

能读取日志中能读取的最大位置

minOneMessage: Boolean = false：

是否返回至少一条消息，当读取的消息大于maxSize时

流程：

1.根据startOffset通过translateOffset，找到要读取的其实文件位置。

2.计算要读取的文件大小。


    val adjustedMaxSize =
      if (minOneMessage) math.max(maxSize, startOffsetAndSize.size)
      else maxSize
    if (adjustedMaxSize == 0)
      return FetchDataInfo(offsetMetadata, MemoryRecords.EMPTY)
    val fetchSize: Int = min((maxPosition - startPosition).toInt, adjustedMaxSize)

3.通过FileRecords.slice读取指定位置指定大小的数据

recover():

参数：

producerStateManager: ProducerStateManager：
leaderEpochCache: Option[LeaderEpochFileCache]：

流程：

1.清空所有索引


    offsetIndex.reset()
    timeIndex.reset()
    txnIndex.reset()

2.遍历日志段中所有消息。

1.检查消息的位移是否合法。

2.更新日志段的最大时间戳及其所属消息的位移。

3.更新索引资源。

4.更新消息总字节数

5.更新producerStateManager和leaderEpochCache状态。

3.进行截断操作，将大于遍历所有消息获取的总大小以外的数据截取。

2.Log

变量：

保存了所有的需要使用的文件类型


 /** a log file */
  val LogFileSuffix = ".log"
  /** an index file */
  val IndexFileSuffix = ".index"
  /** a time index file */
  val TimeIndexFileSuffix = ".timeindex"
  val ProducerSnapshotFileSuffix = ".snapshot"
  /** an (aborted) txn index */
  val TxnIndexFileSuffix = ".txnindex"
  /** a file that is scheduled to be deleted */
  val DeletedFileSuffix = ".deleted"
  /** A temporary file that is being used for log cleaning */
  val CleanedFileSuffix = ".cleaned"
  /** A temporary file used when swapping files into the log */
  val SwapFileSuffix = ".swap"
  /** Clean shutdown file that indicates the broker was cleanly shutdown in 0.8 and higher.
   * This is used to avoid unnecessary recovery after a clean shutdown. In theory this could be
   * avoided by passing in the recovery point, however finding the correct position to do this
   * requires accessing the offset index which may not be safe in an unclean shutdown.
   * For more information see the discussion in PR#2104
   */
  val CleanShutdownFile = ".kafka_cleanshutdown"
  /** a directory that is scheduled to be deleted */
  val DeleteDirSuffix = "-delete"
  /** a directory that is used for future partition */
  val FutureDirSuffix = "-future"

初始化：


  locally {
    initializePartitionMetadata()
    updateLogStartOffset(logStartOffset)
    maybeIncrementFirstUnstableOffset()
    initializeTopicId()
  }

initializePartitionMetadata()：
        创建分区日志路径。
updateLogStartOffset(logStartOffset)：
        设置高水位和回复点。
maybeIncrementFirstUnstableOffset()：
        First Unstable Offset事务机制的一部分
initializeTopicId()：
        初始topic id

方法：

appendAsFollower():

Followert添加日志

appendAsLeader():

Leader添加日志

2.请求处理


KafkaServer::startup():
 
初始化SocketServer
socketServer = new SocketServer(config, metrics, time, credentialProvider, apiVersionManager)
socketServer.startup(startProcessingRequests = false)
 
初始化数据面KafkaRequestHandlerPool
 dataPlaneRequestHandlerPool = new KafkaRequestHandlerPool(config.brokerId, socketServer.dataPlaneRequestChannel, dataPlaneRequestProcessor, time,
          config.numIoThreads, s"${SocketServer.DataPlaneMetricPrefix}RequestHandlerAvgIdlePercent", SocketServer.DataPlaneThreadPrefix)
 
初始化控制面KafkaRequestHandlerPool
 controlPlaneRequestHandlerPool = new KafkaRequestHandlerPool(config.brokerId, socketServer.controlPlaneRequestChannelOpt.get, controlPlaneRequestProcessor, time,
            1, s"${SocketServer.ControlPlaneMetricPrefix}RequestHandlerAvgIdlePercent", SocketServer.ControlPlaneThreadPrefix)

1.SocketServer

方法：

startup():
启动服务。
1.创建控制面的acceptor和processor线程。
2.创建数据面的acceptor和processor线程。
3.启动数据面和控制面的线程。

1.Acceptor

作用：

负责监听新接入的连接，确认有新连接接入后，将连接传递给某个processor处理线程(currentProcessorIndex = currentProcessorIndex % processors.length)

2.Processor

作用：

负责接收acceptor监听到的连接，并进行数据接收，接收数据之后通过

RequestChannel发送给KafkaRequestHandler线程进行处理。

成员：

newConnections:ArrayBlockingQueue：

将acceptor分配的新连接保存起来，等待处理。防止processor处理过慢，影响acceptor分配新连接。

2.KafkaRequestHandlerPool

作用:

管理线程池，每个线程执行KafkaRequestHandler方法，处理实际请求信息。具体处理有KafkaApis.handle()进行处理。

方法：

KafkaRequestHandlerPool():根据传入的numThreads创建对应数量的KafkaRequestHandler线程。
数据面有numThreads个处理消息的线程，控制面有一个处理消息的线程。

3.KafkaApis

作用:

处理收到的实际请求内容。

处理的所有消息：


 case ApiKeys.PRODUCE => handleProduceRequest(request, requestLocal)
        case ApiKeys.FETCH => handleFetchRequest(request)
        case ApiKeys.LIST_OFFSETS => handleListOffsetRequest(request)
        case ApiKeys.METADATA => handleTopicMetadataRequest(request)
        case ApiKeys.LEADER_AND_ISR => handleLeaderAndIsrRequest(request)
        case ApiKeys.STOP_REPLICA => handleStopReplicaRequest(request)
        case ApiKeys.UPDATE_METADATA => handleUpdateMetadataRequest(request, requestLocal)
        case ApiKeys.CONTROLLED_SHUTDOWN => handleControlledShutdownRequest(request)
        case ApiKeys.OFFSET_COMMIT => handleOffsetCommitRequest(request, requestLocal)
        case ApiKeys.OFFSET_FETCH => handleOffsetFetchRequest(request)
        case ApiKeys.FIND_COORDINATOR => handleFindCoordinatorRequest(request)
        case ApiKeys.JOIN_GROUP => handleJoinGroupRequest(request, requestLocal)
        case ApiKeys.HEARTBEAT => handleHeartbeatRequest(request)
        case ApiKeys.LEAVE_GROUP => handleLeaveGroupRequest(request)
        case ApiKeys.SYNC_GROUP => handleSyncGroupRequest(request, requestLocal)
        case ApiKeys.DESCRIBE_GROUPS => handleDescribeGroupRequest(request)
        case ApiKeys.LIST_GROUPS => handleListGroupsRequest(request)
        case ApiKeys.SASL_HANDSHAKE => handleSaslHandshakeRequest(request)
        case ApiKeys.API_VERSIONS => handleApiVersionsRequest(request)
        case ApiKeys.CREATE_TOPICS => maybeForwardToController(request, handleCreateTopicsRequest)
        case ApiKeys.DELETE_TOPICS => maybeForwardToController(request, handleDeleteTopicsRequest)
        case ApiKeys.DELETE_RECORDS => handleDeleteRecordsRequest(request)
        case ApiKeys.INIT_PRODUCER_ID => handleInitProducerIdRequest(request, requestLocal)
        case ApiKeys.OFFSET_FOR_LEADER_EPOCH => handleOffsetForLeaderEpochRequest(request)
        case ApiKeys.ADD_PARTITIONS_TO_TXN => handleAddPartitionToTxnRequest(request, requestLocal)
        case ApiKeys.ADD_OFFSETS_TO_TXN => handleAddOffsetsToTxnRequest(request, requestLocal)
        case ApiKeys.END_TXN => handleEndTxnRequest(request, requestLocal)
        case ApiKeys.WRITE_TXN_MARKERS => handleWriteTxnMarkersRequest(request, requestLocal)
        case ApiKeys.TXN_OFFSET_COMMIT => handleTxnOffsetCommitRequest(request, requestLocal)
        case ApiKeys.DESCRIBE_ACLS => handleDescribeAcls(request)
        case ApiKeys.CREATE_ACLS => maybeForwardToController(request, handleCreateAcls)
        case ApiKeys.DELETE_ACLS => maybeForwardToController(request, handleDeleteAcls)
        case ApiKeys.ALTER_CONFIGS => maybeForwardToController(request, handleAlterConfigsRequest)
        case ApiKeys.DESCRIBE_CONFIGS => handleDescribeConfigsRequest(request)
        case ApiKeys.ALTER_REPLICA_LOG_DIRS => handleAlterReplicaLogDirsRequest(request)
        case ApiKeys.DESCRIBE_LOG_DIRS => handleDescribeLogDirsRequest(request)
        case ApiKeys.SASL_AUTHENTICATE => handleSaslAuthenticateRequest(request)
        case ApiKeys.CREATE_PARTITIONS => maybeForwardToController(request, handleCreatePartitionsRequest)
        case ApiKeys.CREATE_DELEGATION_TOKEN => maybeForwardToController(request, handleCreateTokenRequest)
        case ApiKeys.RENEW_DELEGATION_TOKEN => maybeForwardToController(request, handleRenewTokenRequest)
        case ApiKeys.EXPIRE_DELEGATION_TOKEN => maybeForwardToController(request, handleExpireTokenRequest)
        case ApiKeys.DESCRIBE_DELEGATION_TOKEN => handleDescribeTokensRequest(request)
        case ApiKeys.DELETE_GROUPS => handleDeleteGroupsRequest(request, requestLocal)
        case ApiKeys.ELECT_LEADERS => handleElectReplicaLeader(request)
        case ApiKeys.INCREMENTAL_ALTER_CONFIGS => maybeForwardToController(request, handleIncrementalAlterConfigsRequest)
        case ApiKeys.ALTER_PARTITION_REASSIGNMENTS => maybeForwardToController(request, handleAlterPartitionReassignmentsRequest)
        case ApiKeys.LIST_PARTITION_REASSIGNMENTS => maybeForwardToController(request, handleListPartitionReassignmentsRequest)
        case ApiKeys.OFFSET_DELETE => handleOffsetDeleteRequest(request, requestLocal)
        case ApiKeys.DESCRIBE_CLIENT_QUOTAS => handleDescribeClientQuotasRequest(request)
        case ApiKeys.ALTER_CLIENT_QUOTAS => maybeForwardToController(request, handleAlterClientQuotasRequest)
        case ApiKeys.DESCRIBE_USER_SCRAM_CREDENTIALS => handleDescribeUserScramCredentialsRequest(request)
        case ApiKeys.ALTER_USER_SCRAM_CREDENTIALS => maybeForwardToController(request, handleAlterUserScramCredentialsRequest)
        case ApiKeys.ALTER_ISR => handleAlterIsrRequest(request)
        case ApiKeys.UPDATE_FEATURES => maybeForwardToController(request, handleUpdateFeatures)
        case ApiKeys.ENVELOPE => handleEnvelope(request, requestLocal)
        case ApiKeys.DESCRIBE_CLUSTER => handleDescribeCluster(request)
        case ApiKeys.DESCRIBE_PRODUCERS => handleDescribeProducersRequest(request)
        case ApiKeys.DESCRIBE_TRANSACTIONS => handleDescribeTransactionsRequest(request)
        case ApiKeys.LIST_TRANSACTIONS => handleListTransactionsRequest(request)
        case ApiKeys.ALLOCATE_PRODUCER_IDS => handleAllocateProducerIdsRequest(request)
        case ApiKeys.DESCRIBE_QUORUM => forwardToControllerOrFail(request)

3.Controller

1.ControllerContext

作用：

元数据信息。

成员：


  val stats = new ControllerStats //controller统计信息
  var offlinePartitionCount = 0    //离线分区计数
  var preferredReplicaImbalanceCount = 0 //
  val shuttingDownBrokerIds = mutable.Set.empty[Int] //关闭中broker的id列表
  private val liveBrokers = mutable.Set.empty[Broker] //当前运行中broker对象列表
  private val liveBrokerEpochs = mutable.Map.empty[Int, Long] //运行中broker epoch列表
  var epoch: Int = KafkaController.InitialControllerEpoch //controller当前epoch值
  var epochZkVersion: Int = KafkaController.InitialControllerEpochZkVersion //controller对应zookeeper节点的epoch值
 
  val allTopics = mutable.Set.empty[String] //集群主题列表
  var topicIds = mutable.Map.empty[String, Uuid] 
  var topicNames = mutable.Map.empty[Uuid, String]
  val partitionAssignments = mutable.Map.empty[String, mutable.Map[Int, ReplicaAssignment]]//主题分区的副本列表
  private val partitionLeadershipInfo = mutable.Map.empty[TopicPartition, 
    LeaderIsrAndControllerEpoch]//主题分区的leader/ISR副本信息
  val partitionsBeingReassigned = mutable.Set.empty[TopicPartition]//处于副本重分配过程的主题分区列表
  val partitionStates = mutable.Map.empty[TopicPartition, PartitionState]//主题分区状态列表
  val replicaStates = mutable.Map.empty[PartitionAndReplica, ReplicaState]//主题分区副本列表
  val replicasOnOfflineDirs = mutable.Map.empty[Int, Set[TopicPartition]]//不可用磁盘路劲上的副本列表
 
  val topicsToBeDeleted = mutable.Set.empty[String]//待删除列表
 val topicsWithDeletionStarted = mutable.Set.empty[String]//已开启删除的主题列表
  val topicsIneligibleForDeletion = mutable.Set.empty[String]//暂时无法删除的主题列表

2.RequestSendThread

功能：

负责发送controller的消息给其他Broker.每个RequestSendThread对应一个Broker。

发送的消息：

LeaderAndIsrRequest:通知Broker相关主题各个分区的Leader副本在那个Broker上，ISR中的副本在呢些Broker上。
StopReplicaRequest:通知指定Broker停止其上面的副本对象。
UpdateMetadataRequest:通知更新Broker上的元数据。

3.ControllerChannelManager

功能：

管理controller和集群中Broker的连接。为每个Broker创建RequestSendThread来进行消息发送。

函数：

startup():启动所有Broker的发送线程
shutdown()：移除所有Broker，并关闭与Broker的发送线程和网络连接。

sendRequest():把对应Broker的消息放入对应RequestSendThread中的队列。

addBroker():添加新的Broker节点，addNewBroker()创建与Broker的连接和发送线程，startRequestSendThread()启动发送线程。

removeBroker()：移除Broker，并关闭与Broker的发送线程和网络连接。

4.Controller事件管理

1.ControllerEventManager

功能：

管理Controller事件。

2.ControllerEventThread

功能：

处理Controller事件的线程，所有事件由一个线程进行处理不涉及锁的操作。

3.ControllerEventProcessor

功能：

KafkaController的接口。

KafkaController实现的事件处理


     event match {
        case event: MockEvent =>
          // Used only in test cases
          event.process()
        case ShutdownEventThread =>
          error("Received a ShutdownEventThread event. This type of event is supposed to be handle by ControllerEventThread")
        case AutoPreferredReplicaLeaderElection =>
          processAutoPreferredReplicaLeaderElection()
        case ReplicaLeaderElection(partitions, electionType, electionTrigger, callback) =>
          processReplicaLeaderElection(partitions, electionType, electionTrigger, callback)
        case UncleanLeaderElectionEnable =>
          processUncleanLeaderElectionEnable()
        case TopicUncleanLeaderElectionEnable(topic) =>
          processTopicUncleanLeaderElectionEnable(topic)
        case ControlledShutdown(id, brokerEpoch, callback) =>
          processControlledShutdown(id, brokerEpoch, callback)
        case LeaderAndIsrResponseReceived(response, brokerId) =>
          processLeaderAndIsrResponseReceived(response, brokerId)
        case UpdateMetadataResponseReceived(response, brokerId) =>
          processUpdateMetadataResponseReceived(response, brokerId)
        case TopicDeletionStopReplicaResponseReceived(replicaId, requestError, partitionErrors) =>
          processTopicDeletionStopReplicaResponseReceived(replicaId, requestError, partitionErrors)
        case BrokerChange =>
          processBrokerChange()
        case BrokerModifications(brokerId) =>
          processBrokerModification(brokerId)
        case ControllerChange =>
          processControllerChange()
        case Reelect =>
          processReelect()
        case RegisterBrokerAndReelect =>
          processRegisterBrokerAndReelect()
        case Expire =>
          processExpire()
        case TopicChange =>
          processTopicChange()
        case LogDirEventNotification =>
          processLogDirEventNotification()
        case PartitionModifications(topic) =>
          processPartitionModifications(topic)
        case TopicDeletion =>
          processTopicDeletion()
        case ApiPartitionReassignment(reassignments, callback) =>
          processApiPartitionReassignment(reassignments, callback)
        case ZkPartitionReassignment =>
          processZkPartitionReassignment()
        case ListPartitionReassignments(partitions, callback) =>
          processListPartitionReassignments(partitions, callback)
        case UpdateFeatures(request, callback) =>
          processFeatureUpdates(request, callback)
        case PartitionReassignmentIsrChange(partition) =>
          processPartitionReassignmentIsrChange(partition)
        case IsrChangeNotification =>
          processIsrChangeNotification()
        case AlterIsrReceived(brokerId, brokerEpoch, isrsToAlter, callback) =>
          processAlterIsr(brokerId, brokerEpoch, isrsToAlter, callback)
        case AllocateProducerIds(brokerId, brokerEpoch, callback) =>
          processAllocateProducerIds(brokerId, brokerEpoch, callback)
        case Startup =>
          processStartup()
      }

4.副本管理

1.ReplicaFetcherThread

父类：

AbstractFetcherThread。

功能：

follower从leader拉取消息。

成员：

doWork():


 override def doWork(): Unit = {
    maybeTruncate()
    maybeFetch()
  }

负责执行副本截断，执行消息获取。

processPartitionData():

处理从leader拉取的消息，并通过LogSegment.append()写入日志。

truncate():

对指定分区进行日志截断。

buildFetch():

构建发送给leader副本的Broker的Fetch消息。

2.ReplicaManager

成员：
allPartitions：
        管理当前Broker上所有的分区数据。

replicaFetcherManager：
        创建管理follower副本向leader副本拉取消息的ReplicaFetcherThread，并把日志写入副本log。

appendRecords():
        写入日志：
        情况1：leader写入本地日志后返回成功。
        情况2:leader写入本地后，且其他follower也写入成功，此时才返回成功。

        使用场景：
        1.生产者向leader副本写入消息。
        2.消费者组写入信息。
        3.事务管理器写入事务信息。

fetchMessages():
        处理fetch请求。数据累积达到一定值的时候才会把数据返回给请求方。

4.消费者组管理

1.MemberMetadata

成员：

summary,summaryNoMetadata：
        创建返回MemberSummary实例。
        MemberSummary中的成员：
                memberId：
                        消费者组成员id，consumer-组ID-<序号>-
                groupInstanceId：
                        消费者组静态成员ID。
                clientId：
                        消费者组成员配置的client.id
                clientHost:
                        运行消费者组程序的主机名。
                metadata：
                        标识消费者组成员分区分配策略的字节数组。
                assignment：
                        保存分配给该成员的订阅分区。

rebalanceTimeoutMs：
        必须完成rebalance的时间。
sessionTimeoutMs：
        会话超时时间，超时时间内为有心跳产生，被认定成员下线，触发rebalance。
protocolType：
        协议类型。
supportedProtocols：
        多组多分区分配策略。
awaitingJoinCallback：
        表示组成员是否正在等待加入组。
awaitingSyncCallback：
        表示组成员组成员是否等待GroupCoordinator发送分配方案。
isNew：
        表示是否是消费者组的新成员。
heartbeatSatisfied：
        当心跳过期时设置为false，接收到心跳设置为true。

2.GroupMetadata

成员：

GroupState：

定义消费者组状态。PreparingRebalance，CompletingRebalance，Stable，Dead，Empty ，5种状态。

currentStateTimestamp：
        最近一次状态变更的时间。
generationId：
        每次执行rebalance值加1。
leaderId：
        消费者组中leader成员的Member Id。
members:
        保存消费者组下所有成员的元数据信息。
offsets：
        保存按照主题分区分组的位移主题消息的hashMap ，key是主题分区，value 是 CommitRecordMetadataAndOffset保存位移信息，保存了位移主题消息自己的位移值和位移提交消息中保存的消费者组的位移值。
subscribedTopics：
        保存消费者组订阅的列表用于帮助从offsets字段过滤订阅主题分区的位移值。
supportedProtocols：
        保存分区分配策略的支持票数。
transitionTo():
        设置消费者组状态为传入的状态。并更新状态变化时间currentStateTimestamp。
canRebalance():
        判断是否可以rebalance。
add():

        添加成员。


 def add(member: MemberMetadata, callback: JoinCallback = null): Unit = {
    member.groupInstanceId.foreach { instanceId =>
      if (staticMembers.contains(instanceId))
        throw new IllegalStateException(s"Static member with groupInstanceId=$instanceId " +
          s"cannot be added to group $groupId since it is already a member")
      staticMembers.put(instanceId, member.memberId)
    }
   // 如果成员组中没有其他成员
    if (members.isEmpty)
     //   把该成员的protocolType设置为消费组的protocolType
      this.protocolType = Some(member.protocolType)
 
    比较protocolType是否相同
    assert(this.protocolType.orNull == member.protocolType)
 //   确保成员的分区分配策略和组的分区分配策略匹配
    assert(supportsProtocols(member.protocolType, MemberMetadata.plainProtocolSet(member.supportedProtocols)))
    //检查是否没有lead成员
    if (leaderId.isEmpty)
    //    将成员设置成lead成员
      leaderId = Some(member.memberId)
//将成员加入members
    members.put(member.memberId, member)
   // 更新分区分配策略支持票数
    incSupportedProtocols(member)
    设置成员加入组后的回调函数
    member.awaitingJoinCallback = callback
 
   // 更新已加入组的成员数
    if (member.isAwaitingJoin)
      numMembersAwaitingJoin += 1
 
    pendingMembers.remove(member.memberId)
  }

remove():

移除成员。


 def remove(memberId: String): Unit = {
    //从members中移除成员
    members.remove(memberId).foreach { member =>
        更新分区分配策略支持票数
      decSupportedProtocols(member)
      if (member.isAwaitingJoin)
        已加入组成员数减1
        numMembersAwaitingJoin -= 1
 
      member.groupInstanceId.foreach(staticMembers.remove)
    }
 
    如果是lead成员
    if (isLeader(memberId))
      选择剩余成员列表中第一个
        leaderId = members.keys.headOption
 
    pendingMembers.remove(memberId)
    pendingSyncMembers.remove(memberId)
  }

initializeOffsets():
        添加位移值。
onOffsetCommitAppend():
        添加位移值。
completePendingTxnOffsetCommit():
         完成一个待决事物。
getExpiredOffsets():
        获取订阅过期分区。


def getExpiredOffsets(baseTimestamp: CommitRecordMetadataAndOffset => Long,
                          subscribedTopics: Set[String] = Set.empty): Map[TopicPartition, 
    //同时满足一下三个条件时可以移除
OffsetAndMetadata] = {
      offsets.filter {
        case (topicPartition, commitRecordMetadataAndOffset) =>
        //分区所属主题不在订阅主题列表中，不在subscribedTopics中
          !subscribedTopics.contains(topicPartition.topic()) &&
        //主题分期一完成提交不在pendingOffsetCommits中
          !pendingOffsetCommits.contains(topicPartition) && {
            //主题分区在位移主题中对应的消息的存在时间超过了阈值
            commitRecordMetadataAndOffset.offsetAndMetadata.expireTimestamp match {
              case None =>
                // current version with no per partition retention
                currentTimestamp - baseTimestamp(commitRecordMetadataAndOffset) >= offsetRetentionMs
              case Some(expireTimestamp) =>
                // older versions with explicit expire_timestamp field => old expiration semantics is used
                currentTimestamp >= expireTimestamp
            }
          }
      }.map {
        case (topicPartition, commitRecordOffsetAndMetadata) =>
          (topicPartition, commitRecordOffsetAndMetadata.offsetAndMetadata)
      }.toMap
    }

removeExpiredOffsets(): 
        移除过期分区。

selectProtocol():
        选取分区分配策略。


 def selectProtocol: String = {
    if (members.isEmpty)
      throw new IllegalStateException("Cannot select protocol for empty group")
 
    // select the protocol for this group which is supported by all members
    // 找出所有成员都支持的分区分片策略
    val candidates = candidateProtocols
 
    // let each member vote for one of the protocols and choose the one with the most votes
    //所有成员投票，选取票数多的策略
    val (protocol, _) = allMemberMetadata
      .map(_.vote(candidates))
      .groupBy(identity)
      .maxBy { case (_, votes) => votes.size }
 
    protocol
  }

2.GroupMetadataManager

成员：

brokerId：
        所在broker的id。
interBrokerProtocolVersion:
        broker间通信使用的版本。
config:
        OffsetConfig，定义位移管理的相关参数
replicaManager:
        副本管理区。
CompressionType：
        压缩器类型。
groupMetadataCache：
        Pool[String, GroupMetadata]，key是消费者组名称，value是GroupMetadata消费者组元数据。保存由当前broker上GroupCoordinator管理的消费者组元数据
loadingPartitions：
        位移主题下正在执行加载操作的分区号集合。
ownedPartitions：
        位移主题下完成加载操作的分区号集合。
groupMetadataTopicPartitionCount：
        位移主题分区数。
getGroup():
        获取消费者组元数据。
getOrMaybeCreateGroup():
        获取相应的消费者组原信息，如果不存在可以创建并添加消费者组。removeGroupsForPartition():
        移除消费者组信息，通过scheduler.schedule()创建异步任务。调用removeGroupsAndOffsets()，移除组信息和位移信息。


private [group] def removeGroupsAndOffsets(topicPartition: TopicPartition,
                                             coordinatorEpoch: Option[Int],
                                             onGroupUnloaded: GroupMetadata => Unit): Unit = {
    val offsetsPartition = topicPartition.partition
    if (maybeUpdateCoordinatorEpoch(offsetsPartition, coordinatorEpoch)) {
      var numOffsetsRemoved = 0
      var numGroupsRemoved = 0
 
      debug(s"Started unloading offsets and group metadata for $topicPartition for " +
        s"coordinator epoch $coordinatorEpoch")
      inLock(partitionLock) {
        // we need to guard the group removal in cache in the loading partition lock
        // to prevent coordinator's check-and-get-group race condition
        //从ownedPartitions和loadingPartitions移除特定位移主题分区。
        ownedPartitions.remove(offsetsPartition)
        loadingPartitions.remove(offsetsPartition)
        //遍历所有消费者组信息
        for (group <- groupMetadataCache.values) {
            //如果组信息是在给定位移主题分区下保存的
          if (partitionFor(group.groupId) == offsetsPartition) {
            //卸载组
            onGroupUnloaded(group)
            //将信息从组数据元中移除。
            groupMetadataCache.remove(group.groupId, group)
            //把消费者组从producer对应的组集合中移除
            removeGroupFromAllProducers(group.groupId)
            //更新已移除组计数器
            numGroupsRemoved += 1
            //更新已移除位移值计数器
            numOffsetsRemoved += group.numOffsets
          }
        }
      }
      info(s"Finished unloading $topicPartition for coordinator epoch $coordinatorEpoch. " +
        s"Removed $numOffsetsRemoved cached offsets and $numGroupsRemoved cached groups.")
    } else {
      info(s"Not removing offsets and group metadata for $topicPartition " +
        s"in epoch $coordinatorEpoch since current epoch is ${epochForPartitionId.get(topicPartition.partition)}")
    }
  }

addGroup():
        添加消费者组元数据。
loadGroup():
        加载消费者组元数据。


  private def loadGroup(group: GroupMetadata, offsets: Map[TopicPartition, CommitRecordMetadataAndOffset],
                        pendingTransactionalOffsets: Map[Long, mutable.Map[TopicPartition, CommitRecordMetadataAndOffset]]): Unit = {
    // offsets are initialized prior to loading the group into the cache to ensure that clients see a consistent
    // view of the group's offsets
    trace(s"Initialized offsets $offsets for group ${group.groupId}")
    //初始化消费者组的位移信息
    group.initializeOffsets(offsets, pendingTransactionalOffsets.toMap)
    //调动addGroup添加消费者组
    val currentGroup = addGroup(group)
    if (group != currentGroup)
      debug(s"Attempt to load group ${group.groupId} from log with generation ${group.generationId} failed " +
        s"because there is already a cached group with generation ${currentGroup.generationId}")
  }

storeOffsets():
        保存消费者组位移。


  def storeOffsets(group: GroupMetadata,
                   consumerId: String,
                   offsetMetadata: immutable.Map[TopicPartition, OffsetAndMetadata],
                   responseCallback: immutable.Map[TopicPartition, Errors] => Unit,
                   producerId: Long = RecordBatch.NO_PRODUCER_ID,
                   producerEpoch: Short = RecordBatch.NO_PRODUCER_EPOCH,
                   requestLocal: RequestLocal = RequestLocal.NoCaching): Unit = {
    // first filter out partitions with offset metadata size exceeding limit
    val filteredOffsetMetadata = offsetMetadata.filter { case (_, offsetAndMetadata) =>
    //位移消息要小于maxMetadataSize
      validateOffsetMetadataLength(offsetAndMetadata.metadata)
    }
 
    group.inLock {
      if (!group.hasReceivedConsistentOffsetCommits)
        warn(s"group: ${group.groupId} with leader: ${group.leaderOrNull} has received offset commits from consumers as well " +
          s"as transactional producers. Mixing both types of offset commits will generally result in surprises and " +
          s"should be avoided.")
    }
 
    val isTxnOffsetCommit = producerId != RecordBatch.NO_PRODUCER_ID
    // construct the message set to append
    //如果位移消息都过大，就返回错误信息
    if (filteredOffsetMetadata.isEmpty) {
      // compute the final error codes for the commit response
      val commitStatus = offsetMetadata.map { case (k, _) => k -> Errors.OFFSET_METADATA_TOO_LARGE }
      responseCallback(commitStatus)
    } else {
    //查看当前borker是否为指定消费者组的Coordinator
      getMagic(partitionFor(group.groupId)) match {
        case Some(magicValue) =>
          // We always use CREATE_TIME, like the producer. The conversion to LOG_APPEND_TIME (if necessary) happens automatically.
          val timestampType = TimestampType.CREATE_TIME
          val timestamp = time.milliseconds()
            //创建位移主题的位移提交消息
          val records = filteredOffsetMetadata.map { case (topicPartition, offsetAndMetadata) =>
            val key = GroupMetadataManager.offsetCommitKey(group.groupId, topicPartition)
            val value = GroupMetadataManager.offsetCommitValue(offsetAndMetadata, interBrokerProtocolVersion)
            new SimpleRecord(timestamp, key, value)
          }
          val offsetTopicPartition = new TopicPartition(Topic.GROUP_METADATA_TOPIC_NAME, partitionFor(group.groupId))
        //申请内存，等待将位移消息写入
          val buffer = ByteBuffer.allocate(AbstractRecords.estimateSizeInBytes(magicValue, compressionType, records.asJava))
 
          if (isTxnOffsetCommit && magicValue < RecordBatch.MAGIC_VALUE_V2)
            throw Errors.UNSUPPORTED_FOR_MESSAGE_FORMAT.exception("Attempting to make a transaction offset commit with an invalid magic: " + magicValue)
 
          val builder = MemoryRecords.builder(buffer, magicValue, compressionType, timestampType, 0L, time.milliseconds(),
            producerId, producerEpoch, 0, isTxnOffsetCommit, RecordBatch.NO_PARTITION_LEADER_EPOCH)
 
          records.foreach(builder.append)
          val entries = Map(offsetTopicPartition -> builder.build())
 
          // set the callback function to insert offsets into cache after log append completed
          def putCacheCallback(responseStatus: Map[TopicPartition, PartitionResponse]): Unit = {
            // the append response should only contain the topics partition
            //确保消息写入指定位移主题分区
            if (responseStatus.size != 1 || !responseStatus.contains(offsetTopicPartition))
              throw new IllegalStateException("Append status %s should only have one partition %s"
                .format(responseStatus, offsetTopicPartition))
 
            // construct the commit response status and insert
            // the offset and metadata to cache if the append status has no error
            val status = responseStatus(offsetTopicPartition)
 
            val responseError = group.inLock {
            //写入结果没有异常
              if (status.error == Errors.NONE) {
                //如果不是dead状态
                if (!group.is(Dead)) {
                  filteredOffsetMetadata.forKeyValue { (topicPartition, offsetAndMetadata) =>
                    if (isTxnOffsetCommit)
                      group.onTxnOffsetCommitAppend(producerId, topicPartition, CommitRecordMetadataAndOffset(Some(status.baseOffset), offsetAndMetadata))
                    else
                       //填充GroupMetadata中的元数据
                      group.onOffsetCommitAppend(topicPartition, CommitRecordMetadataAndOffset(Some(status.baseOffset), offsetAndMetadata))
                  }
                }
 
                // Record the number of offsets committed to the log
                offsetCommitsSensor.record(records.size)
 
                Errors.NONE
              } else {
                if (!group.is(Dead)) {
                  if (!group.hasPendingOffsetCommitsFromProducer(producerId))
                    removeProducerGroup(producerId, group.groupId)
                  filteredOffsetMetadata.forKeyValue { (topicPartition, offsetAndMetadata) =>
                    if (isTxnOffsetCommit)
                      group.failPendingTxnOffsetCommit(producerId, topicPartition)
                    else
                    //取消未完成的位移消息写入
                      group.failPendingOffsetWrite(topicPartition, offsetAndMetadata)
                  }
                }
 
                debug(s"Offset commit $filteredOffsetMetadata from group ${group.groupId}, consumer $consumerId " +
                  s"with generation ${group.generationId} failed when appending to log due to ${status.error.exceptionName}")
 
                // transform the log append error code to the corresponding the commit status error code
                status.error match {
                  case Errors.UNKNOWN_TOPIC_OR_PARTITION
                       | Errors.NOT_ENOUGH_REPLICAS
                       | Errors.NOT_ENOUGH_REPLICAS_AFTER_APPEND =>
                    Errors.COORDINATOR_NOT_AVAILABLE
 
                  case Errors.NOT_LEADER_OR_FOLLOWER
                       | Errors.KAFKA_STORAGE_ERROR =>
                    Errors.NOT_COORDINATOR
 
                  case Errors.MESSAGE_TOO_LARGE
                       | Errors.RECORD_LIST_TOO_LARGE
                       | Errors.INVALID_FETCH_SIZE =>
                    Errors.INVALID_COMMIT_OFFSET_SIZE
 
                  case other => other
                }
              }
            }
 
            // compute the final error codes for the commit response
            //获取最后的状态
            val commitStatus = offsetMetadata.map { case (topicPartition, offsetAndMetadata) =>
              if (validateOffsetMetadataLength(offsetAndMetadata.metadata))
                (topicPartition, responseError)
              else
                (topicPartition, Errors.OFFSET_METADATA_TOO_LARGE)
            }
 
            // finally trigger the callback logic passed from the API layer
            //最后调用回调函数
            responseCallback(commitStatus)
          }
 
          if (isTxnOffsetCommit) {
            group.inLock {
              addProducerGroup(producerId, group.groupId)
              group.prepareTxnOffsetCommit(producerId, offsetMetadata)
            }
          } else {
            group.inLock {
              group.prepareOffsetCommit(offsetMetadata)
            }
          }
           //将消息写入到位移主题，并把putCacheCallback作为回调函数执行，更新消费者元数据
          appendForGroup(group, entries, requestLocal, putCacheCallback)
 
        case None =>
          val commitStatus = offsetMetadata.map { case (topicPartition, _) =>
            (topicPartition, Errors.NOT_COORDINATOR)
          }
          responseCallback(commitStatus)
      }
    }
  }

getOffsets():

查询消费者组位移，从对应的GroupMetadata中获取。

groupMetadataKey():生成消费者组注册消息的key。
groupMetadataValue():生成消费者组注册消息的value。当注册消息的value为空的时候，表示可以将这个消费者组元数据从位移主题中删除。


 def groupMetadataValue(groupMetadata: GroupMetadata,
                         assignment: Map[String, Array[Byte]],
                         apiVersion: ApiVersion): Array[Byte] = {
 
    val version =
      if (apiVersion < KAFKA_0_10_1_IV0) 0.toShort
      else if (apiVersion < KAFKA_2_1_IV0) 1.toShort
      else if (apiVersion < KAFKA_2_3_IV0) 2.toShort
      else 3.toShort
 
    MessageUtil.toVersionPrefixedBytes(version, new GroupMetadataValue()
    //设置协议类型
      .setProtocolType(groupMetadata.protocolType.getOrElse(""))
    //设置generation Id。
      .setGeneration(groupMetadata.generationId)
    //设置分配策略。
      .setProtocol(groupMetadata.protocolName.orNull)
    //设置leader ID
      .setLeader(groupMetadata.leaderOrNull)
    //设置最近一次状态变更时间
      .setCurrentStateTimestamp(groupMetadata.currentStateTimestampOrDefault)
    //设置所有成员信息  
    .setMembers(groupMetadata.allMemberMetadata.map { memberMetadata =>
        new GroupMetadataValue.MemberMetadata()
          .setMemberId(memberMetadata.memberId)
          .setClientId(memberMetadata.clientId)
          .setClientHost(memberMetadata.clientHost)
          .setSessionTimeout(memberMetadata.sessionTimeoutMs)
          .setRebalanceTimeout(memberMetadata.rebalanceTimeoutMs)
          .setGroupInstanceId(memberMetadata.groupInstanceId.orNull)
          // The group is non-empty, so the current protocol must be defined
          .setSubscription(groupMetadata.protocolName.map(memberMetadata.metadata)
            .getOrElse(throw new IllegalStateException("Attempted to write non-empty group metadata with no defined protocol.")))
          .setAssignment(assignment.getOrElse(memberMetadata.memberId,
            throw new IllegalStateException(s"Attempted to write member ${memberMetadata.memberId} of group ${groupMetadata.groupId} with no assignment.")))
      }.asJava))
  }

offsetCommitKey():创建提交位移消息key。消费者名称+主题+分区号
offsetCommitValue():创建提交位移消息value。当位移消息的value为空的时候，表示可以将这个消费者组在的主题分区上的位移调教删除。


  def offsetCommitValue(offsetAndMetadata: OffsetAndMetadata,
                        apiVersion: ApiVersion): Array[Byte] = {
    val version =
      if (apiVersion < KAFKA_2_1_IV0 || offsetAndMetadata.expireTimestamp.nonEmpty) 1.toShort
      else if (apiVersion < KAFKA_2_1_IV1) 2.toShort
      else 3.toShort
    MessageUtil.toVersionPrefixedBytes(version, new OffsetCommitValue()
    //设置位移值
      .setOffset(offsetAndMetadata.offset)
    //元数据信息
      .setMetadata(offsetAndMetadata.metadata)
    //设置更新时间
      .setCommitTimestamp(offsetAndMetadata.commitTimestamp)
//设置leader的Epoch   
.setLeaderEpoch(offsetAndMetadata.leaderEpoch.orElse(RecordBatch.NO_PARTITION_LEADER_EPOCH))
      // version 1 has a non empty expireTimestamp field
      .setExpireTimestamp(offsetAndMetadata.expireTimestamp.getOrElse(OffsetCommitRequest.DEFAULT_TIMESTAMP))
    )
  }

storeGroup():
        向Coordinator中注册消费者组，putCacheCallback把消息存入内存中。Coordinator对应的Broker是组所在位移主题分区的leader的Broker。


def storeGroup(group: GroupMetadata,
                 groupAssignment: Map[String, Array[Byte]],
                 responseCallback: Errors => Unit,
                 requestLocal: RequestLocal = RequestLocal.NoCaching): Unit = {
    //判断当前broker是不是group的coordinator
    getMagic(partitionFor(group.groupId)) match {
      case Some(magicValue) =>
        // We always use CREATE_TIME, like the producer. The conversion to LOG_APPEND_TIME (if necessary) happens automatically.
        val timestampType = TimestampType.CREATE_TIME
        val timestamp = time.milliseconds()
        //构建注册消息的key和value
        val key = GroupMetadataManager.groupMetadataKey(group.groupId)
        val value = GroupMetadataManager.groupMetadataValue(group, groupAssignment, interBrokerProtocolVersion)
 
        val records = {
          val buffer = ByteBuffer.allocate(AbstractRecords.estimateSizeInBytes(magicValue, compressionType,
            Seq(new SimpleRecord(timestamp, key, value)).asJava))
          val builder = MemoryRecords.builder(buffer, magicValue, compressionType, timestampType, 0L)
          builder.append(timestamp, key, value)
          builder.build()
        }
        //计算要写入的目标分区
        val groupMetadataPartition = new TopicPartition(Topic.GROUP_METADATA_TOPIC_NAME, partitionFor(group.groupId))
        val groupMetadataRecords = Map(groupMetadataPartition -> records)
        val generationId = group.generationId
 
        // set the callback function to insert the created group into cache after log append completed
        //填充cache
        def putCacheCallback(responseStatus: Map[TopicPartition, PartitionResponse]): Unit = {
          // the append response should only contain the topics partition
          if (responseStatus.size != 1 || !responseStatus.contains(groupMetadataPartition))
            throw new IllegalStateException("Append status %s should only have one partition %s"
              .format(responseStatus, groupMetadataPartition))
 
          // construct the error status in the propagated assignment response in the cache
          val status = responseStatus(groupMetadataPartition)
 
          val responseError = if (status.error == Errors.NONE) {
            Errors.NONE
          } else {
            debug(s"Metadata from group ${group.groupId} with generation $generationId failed when appending to log " +
              s"due to ${status.error.exceptionName}")
 
            // transform the log append error code to the corresponding the commit status error code
            status.error match {
              case Errors.UNKNOWN_TOPIC_OR_PARTITION
                   | Errors.NOT_ENOUGH_REPLICAS
                   | Errors.NOT_ENOUGH_REPLICAS_AFTER_APPEND =>
                Errors.COORDINATOR_NOT_AVAILABLE
 
              case Errors.NOT_LEADER_OR_FOLLOWER
                   | Errors.KAFKA_STORAGE_ERROR =>
                Errors.NOT_COORDINATOR
 
              case Errors.REQUEST_TIMED_OUT =>
                Errors.REBALANCE_IN_PROGRESS
 
              case Errors.MESSAGE_TOO_LARGE
                   | Errors.RECORD_LIST_TOO_LARGE
                   | Errors.INVALID_FETCH_SIZE =>
 
                error(s"Appending metadata message for group ${group.groupId} generation $generationId failed due to " +
                  s"${status.error.exceptionName}, returning UNKNOWN error code to the client")
 
                Errors.UNKNOWN_SERVER_ERROR
 
              case other =>
                error(s"Appending metadata message for group ${group.groupId} generation $generationId failed " +
                  s"due to unexpected error: ${status.error.exceptionName}")
 
                other
            }
          }
       
          responseCallback(responseError)
        }
        //向位移主题写入消息
        appendForGroup(group, groupMetadataRecords, requestLocal, putCacheCallback)
 
      case None =>
        responseCallback(Errors.NOT_COORDINATOR)
        None
    }
  }

管理消费者组的coordinator所在的broker是group所使用的位移主题的分区leader的broker

loadGroupsAndOffsets():
doLoadGroupsAndOffsets():
        从位移主题中加载消费者组和消费者组的位移到内存中。当前Broker成为某个分区的Leader副本的时候执行。


private def doLoadGroupsAndOffsets(topicPartition: TopicPartition, onGroupLoaded: GroupMetadata => Unit): Unit = {
    def logEndOffset: Long = replicaManager.getLogEndOffset(topicPartition).getOrElse(-1L)
    //从副本管理中，查找到 主题分区的 日志对象
    replicaManager.getLog(topicPartition) match {
      case None =>
        warn(s"Attempted to load offsets and group metadata from $topicPartition, but found no log")
 
      case Some(log) =>
        //已完成位移值加载的分区列表。
        val loadedOffsets = mutable.Map[GroupTopicPartition, CommitRecordMetadataAndOffset]()
        //正在位移加载的分区列表
        val pendingOffsets = mutable.Map[Long, mutable.Map[GroupTopicPartition, CommitRecordMetadataAndOffset]]()
        //已完成组信息加载的消费者组列表
        val loadedGroups = mutable.Map[String, GroupMetadata]()
        //等待移除的消费者组列表
        val removedGroups = mutable.Set[String]()
 
        // buffer may not be needed if records are read from memory
        var buffer = ByteBuffer.allocate(0)
 
        // loop breaks if leader changes at any time during the load, since logEndOffset is -1
        //位移主题分区的起始位置
        var currOffset = log.logStartOffset
 
        // loop breaks if no records have been read, since the end of the log has been reached
        //至少读取一条消息
        var readAtLeastOneRecord = true
        //读取位移值小于日志LEO值，至少读取一次，GroupMetadataManager没有关闭
        while (currOffset < logEndOffset && readAtLeastOneRecord && !shuttingDown.get()) {
            //读取位移主题指定分区的数据
          val fetchDataInfo = log.read(currOffset,
            maxLength = config.loadBufferSize,
            isolation = FetchLogEnd,
            minOneMessage = true)
 
          readAtLeastOneRecord = fetchDataInfo.records.sizeInBytes > 0
            //创建消息集合
          val memRecords = (fetchDataInfo.records: @unchecked) match {
            case records: MemoryRecords => records
            //从FileRecords 转换到MemoryRecords 
            case fileRecords: FileRecords =>
              val sizeInBytes = fileRecords.sizeInBytes
              val bytesNeeded = Math.max(config.loadBufferSize, sizeInBytes)
 
              // minOneMessage = true in the above log.read means that the buffer may need to be grown to ensure progress can be made
              if (buffer.capacity < bytesNeeded) {
                if (config.loadBufferSize < bytesNeeded)
                  warn(s"Loaded offsets and group metadata from $topicPartition with buffer larger ($bytesNeeded bytes) than " +
                    s"configured offsets.load.buffer.size (${config.loadBufferSize} bytes)")
 
                buffer = ByteBuffer.allocate(bytesNeeded)
              } else {
                buffer.clear()
              }
 
              fileRecords.readInto(buffer, 0)
              MemoryRecords.readableRecords(buffer)
          }
 
          memRecords.batches.forEach { batch =>
            val isTxnOffsetCommit = batch.isTransactional
            //判断是否为控制类消息
            if (batch.isControlBatch) {
              val recordIterator = batch.iterator
              if (recordIterator.hasNext) {
                val record = recordIterator.next()
                val controlRecord = ControlRecordType.parse(record.key)
                if (controlRecord == ControlRecordType.COMMIT) {
                  pendingOffsets.getOrElse(batch.producerId, mutable.Map[GroupTopicPartition, CommitRecordMetadataAndOffset]())
                    .foreach {
                      case (groupTopicPartition, commitRecordMetadataAndOffset) =>
                        if (!loadedOffsets.contains(groupTopicPartition) || loadedOffsets(groupTopicPartition).olderThan(commitRecordMetadataAndOffset))
                          loadedOffsets.put(groupTopicPartition, commitRecordMetadataAndOffset)
                    }
                }
                pendingOffsets.remove(batch.producerId)
              }
            } else {
              var batchBaseOffset: Option[Long] = None
              for (record <- batch.asScala) {
                //检查消息有key
                require(record.hasKey, "Group metadata/offset entry key should not be null")
                if (batchBaseOffset.isEmpty)
                   //记录第一条消息位移值
                  batchBaseOffset = Some(record.offset)
                
                GroupMetadataManager.readMessageKey(record.key) match {
                    //判断是提交位移消息
                  case offsetKey: OffsetKey =>
                    if (isTxnOffsetCommit && !pendingOffsets.contains(batch.producerId))
                      pendingOffsets.put(batch.producerId, mutable.Map[GroupTopicPartition, CommitRecordMetadataAndOffset]())
 
                    // load offset
                    val groupTopicPartition = offsetKey.key
                    //判断是否有value
                    if (!record.hasValue) {
                        //没有value从列表中移除
                      if (isTxnOffsetCommit)
                        pendingOffsets(batch.producerId).remove(groupTopicPartition)
                      else
                        loadedOffsets.remove(groupTopicPartition)
                    } else {
                        //有value，放入列表中
                      val offsetAndMetadata = GroupMetadataManager.readOffsetMessageValue(record.value)
                      if (isTxnOffsetCommit)
                        pendingOffsets(batch.producerId).put(groupTopicPartition, CommitRecordMetadataAndOffset(batchBaseOffset, offsetAndMetadata))
                      else
                        loadedOffsets.put(groupTopicPartition, CommitRecordMetadataAndOffset(batchBaseOffset, offsetAndMetadata))
                    }
                    //消费者组注册信息
                  case groupMetadataKey: GroupMetadataKey =>
                    // load group metadata
                    val groupId = groupMetadataKey.key
                    val groupMetadata = GroupMetadataManager.readGroupMessageValue(groupId, record.value, time)
                    if (groupMetadata != null) {
                      removedGroups.remove(groupId)
                      loadedGroups.put(groupId, groupMetadata)
                    } else {
                      loadedGroups.remove(groupId)
                      removedGroups.add(groupId)
                    }
 
                  case unknownKey =>
                    throw new IllegalStateException(s"Unexpected message key $unknownKey while loading offsets and group metadata")
                }
              }
            }
            //更新读取位置到批次最后一条消息
            currOffset = batch.nextOffset
          }
        }
 
        val (groupOffsets, emptyGroupOffsets) = loadedOffsets
          .groupBy(_._1.group)
          .map { case (k, v) =>
            k -> v.map { case (groupTopicPartition, offset) => (groupTopicPartition.topicPartition, offset) }
          }.partition { case (group, _) => loadedGroups.contains(group) }
 
        val pendingOffsetsByGroup = mutable.Map[String, mutable.Map[Long, mutable.Map[TopicPartition, CommitRecordMetadataAndOffset]]]()
        pendingOffsets.forKeyValue { (producerId, producerOffsets) =>
          producerOffsets.keySet.map(_.group).foreach(addProducerGroup(producerId, _))
          producerOffsets
            .groupBy(_._1.group)
            .forKeyValue { (group, offsets) =>
              val groupPendingOffsets = pendingOffsetsByGroup.getOrElseUpdate(group, mutable.Map.empty[Long, mutable.Map[TopicPartition, CommitRecordMetadataAndOffset]])
              val groupProducerOffsets = groupPendingOffsets.getOrElseUpdate(producerId, mutable.Map.empty[TopicPartition, CommitRecordMetadataAndOffset])
              groupProducerOffsets ++= offsets.map { case (groupTopicPartition, offset) =>
                (groupTopicPartition.topicPartition, offset)
              }
            }
        }
 
        val (pendingGroupOffsets, pendingEmptyGroupOffsets) = pendingOffsetsByGroup
          .partition { case (group, _) => loadedGroups.contains(group)}
        //已完成加载的消费者组信息
        loadedGroups.values.foreach { group =>
          val offsets = groupOffsets.getOrElse(group.groupId, Map.empty[TopicPartition, CommitRecordMetadataAndOffset])
          val pendingOffsets = pendingGroupOffsets.getOrElse(group.groupId, Map.empty[Long, mutable.Map[TopicPartition, CommitRecordMetadataAndOffset]])
          debug(s"Loaded group metadata $group with offsets $offsets and pending offsets $pendingOffsets")
            //添加消费者组信息到cache中
          loadGroup(group, offsets, pendingOffsets)
            //外层传入方法，处理下分组组下所有成员心跳设置。
          onGroupLoaded(group)
        }
 
        // load groups which store offsets in kafka, but which have no active members and thus no group
        // metadata stored in the log
        //没有对应组信息的列表
        (emptyGroupOffsets.keySet ++ pendingEmptyGroupOffsets.keySet).foreach { groupId =>
            //创建全新的消费者组元数据
          val group = new GroupMetadata(groupId, Empty, time)
          val offsets = emptyGroupOffsets.getOrElse(groupId, Map.empty[TopicPartition, CommitRecordMetadataAndOffset])
          val pendingOffsets = pendingEmptyGroupOffsets.getOrElse(groupId, Map.empty[Long, mutable.Map[TopicPartition, CommitRecordMetadataAndOffset]])
          debug(s"Loaded group metadata $group with offsets $offsets and pending offsets $pendingOffsets")
          loadGroup(group, offsets, pendingOffsets)
          onGroupLoaded(group)
        }
 
        //检查removedGroups中的消费者组不会出现在cache中。
        removedGroups.foreach { groupId =>
          // if the cache already contains a group which should be removed, raise an error. Note that it
          // is possible (however unlikely) for a consumer group to be removed, and then to be used only for
          // offset storage (i.e. by "simple" consumers)
          if (groupMetadataCache.contains(groupId) && !emptyGroupOffsets.contains(groupId))
            throw new IllegalStateException(s"Unexpected unload of active group $groupId while " +
              s"loading partition $topicPartition")
        }
    }
  }

3.GroupCoordinator

函数：

handleJoinGroup():
        消费者组成员加入消费者组。


def handleJoinGroup(groupId: String,//消费者组id
                      memberId: String,//成员id
                      groupInstanceId: Option[String], //
                      requireKnownMemberId: Boolean,//
                      clientId: String,
                      clientHost: String,//消费者主机名
                      rebalanceTimeoutMs: Int, //reblance超时时间
                      sessionTimeoutMs: Int,    //会话超时时间
                      protocolType: String,    //协议类型
                      protocols: List[(String, Array[Byte])],
                      responseCallback: JoinCallback,//回调函数
                      requestLocal: RequestLocal = RequestLocal.NoCaching


//如果成员id有但是，消费者组没有则返回失败。
//其他情况返回现有消费者组或者创建新的消费者组
groupManager.getOrMaybeCreateGroup(groupId, isUnknownMember) match {
        case None =>
          responseCallback(JoinGroupResult(memberId, Errors.UNKNOWN_MEMBER_ID))
        case Some(group) =>
          group.inLock {
            //检查消费者组是否已经满了，无法再添加成员
            if (!acceptJoiningMember(group, memberId)) {
              group.remove(memberId)
              responseCallback(JoinGroupResult(JoinGroupRequest.UNKNOWN_MEMBER_ID, Errors.GROUP_MAX_SIZE_REACHED))
            } else if (isUnknownMember) {
            //空id成员加入组
              doNewMemberJoinGroup(
                group,
                groupInstanceId,
                requireKnownMemberId,
                clientId,
                clientHost,
                rebalanceTimeoutMs,
                sessionTimeoutMs,
                protocolType,
                protocols,
                responseCallback,
                requestLocal
              )
            } else {
                //非空id成员加入组
              doCurrentMemberJoinGroup(
                group,
                memberId,
                groupInstanceId,
                clientId,
                clientHost,
                rebalanceTimeoutMs,
                sessionTimeoutMs,
                protocolType,
                protocols,
                responseCallback
              )
            }
 
            // attempt to complete JoinGroup
             //如果消费者组正处于PreparingRebalance状态
            if (group.is(PreparingRebalance)) {
             //放入延迟队列，延迟处理
              rebalancePurgatory.checkAndComplete(GroupJoinKey(group.groupId))
            }
          }
      }

handleSyncGroup():


private def validateSyncGroup(
    group: GroupMetadata,
    generationId: Int,
    memberId: String,
    protocolType: Option[String],
    protocolName: Option[String],
    groupInstanceId: Option[String],
  ): Option[Errors] = {
    //消费者组是dead状态
    if (group.is(Dead)) {
 
      Some(Errors.COORDINATOR_NOT_AVAILABLE)
    } else {
      //成员是否属于group
      validateCurrentMember(
        group,
        memberId,
        groupInstanceId,
        operation = "sync-group"
      ).orElse {
        //generationId一致
        if (generationId != group.generationId) {
          Some(Errors.ILLEGAL_GENERATION)
        //协议类型是否一致
        } else if (protocolType.isDefined && !group.protocolType.contains(protocolType.get)) {
          Some(Errors.INCONSISTENT_GROUP_PROTOCOL)
        //分区策略是否一致
        } else if (protocolName.isDefined && !group.protocolName.contains(protocolName.get)) {
          Some(Errors.INCONSISTENT_GROUP_PROTOCOL)
        } else {
          None
        }
      }
    }
  }


    case None => group.currentState match {
        //ID异常
          case Empty =>
            responseCallback(SyncGroupResult(Errors.UNKNOWN_MEMBER_ID))
        //正在rebalance中
          case PreparingRebalance =>
            responseCallback(SyncGroupResult(Errors.REBALANCE_IN_PROGRESS))
 
          case CompletingRebalance =>
            group.get(memberId).awaitingSyncCallback = responseCallback
            removePendingSyncMember(group, memberId)
 
            //判断是否为leader
            if (group.isLeader(memberId)) {
              info(s"Assignment received from leader $memberId for group ${group.groupId} for generation ${group.generationId}. " +
                s"The group has ${group.size} members, ${group.allStaticMembers.size} of which are static.")
 
               //如果成员没有分配方案，创建空的方案
              val missing = group.allMembers.diff(groupAssignment.keySet)
              val assignment = groupAssignment ++ missing.map(_ -> Array.empty[Byte]).toMap
 
              if (missing.nonEmpty) {
                warn(s"Setting empty assignments for members $missing of ${group.groupId} for generation ${group.generationId}")
              }
                //消费者组信息保存到消费者组元数据，并写入内部位移
              groupManager.storeGroup(group, assignment, (error: Errors) => {
                group.inLock {
                    //如果CompletingRebalance且generationId 相同
                  if (group.is(CompletingRebalance) && generationId == group.generationId) {
                    if (error != Errors.NONE) {
                       //清空分配方案发送给所有成员
                      resetAndPropagateAssignmentError(group, error)
                        //准备开始新的Rebalance
                      maybePrepareRebalance(group, s"Error when storing group assignment during SyncGroup (member: $memberId)")
                    } else {
                        //保存分配方案到组元数据中
                      setAndPropagateAssignment(group, assignment)
                      //消费者组到Stable状态
                        group.transitionTo(Stable)
                    }
                  }
                }
              }, requestLocal)
              groupCompletedRebalanceSensor.record()
            }
 
          case Stable =>
            移除同步中的成员
            removePendingSyncMember(group, memberId)
 
            // if the group is stable, we just return the current assignment
            //获取成员元数据
            val memberMetadata = group.get(memberId)
            //封装协议类型，策略，分配方案
            responseCallback(SyncGroupResult(group.protocolType, group.protocolName, memberMetadata.assignment, Errors.NONE))
            completeAndScheduleNextHeartbeatExpiration(group, group.get(memberId))
 
          case Dead =>
            throw new IllegalStateException(s"Reached unexpected condition for Dead group ${group.groupId}")
        }

rebalance：

消费者成员发送心跳到coordinator，如果需要rebalance， coordinator会在心跳包的回复通知消费者成员。

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/花生_TL007/article/detail/346935