简介
本文用于记录阅读etcd transport源码过程,随着代码阅读深入本文的内容随时可能改变,当前理解不保证完全正确
整体架构图
transport接口定义
// Transporter is the interface the rafthttp transport exposes to the
// rest of etcd for exchanging raft messages with other cluster members.
type Transporter interface {
// Start starts the given Transporter.
// Start MUST be called before calling other functions in the interface.
Start() error
// Handler returns the HTTP handler of the transporter.
// A transporter HTTP handler handles the HTTP requests
// from remote peers.
// The handler MUST be used to handle RaftPrefix(/raft)
// endpoint.
Handler() http.Handler
// Send sends out the given messages to the remote peers.
// Each message has a To field, which is an id that maps
// to an existing peer in the transport.
// If the id cannot be found in the transport, the message
// will be ignored.
Send(m []raftpb.Message)
// SendSnapshot sends out the given snapshot message to a remote peer.
// The behavior of SendSnapshot is similar to Send.
SendSnapshot(m snap.Message)
// AddRemote adds a remote with given peer urls into the transport.
// A remote helps newly joined member to catch up the progress of cluster,
// and will not be used after that.
// It is the caller's responsibility to ensure the urls are all valid,
// or it panics.
AddRemote(id types.ID, urls []string)
// AddPeer adds a peer with given peer urls into the transport.
// It is the caller's responsibility to ensure the urls are all valid,
// or it panics.
// Peer urls are used to connect to the remote peer.
AddPeer(id types.ID, urls []string)
// RemovePeer removes the peer with given id.
RemovePeer(id types.ID)
// RemoveAllPeers removes all the existing peers in the transport.
RemoveAllPeers()
// UpdatePeer updates the peer urls of the peer with the given id.
// It is the caller's responsibility to ensure the urls are all valid,
// or it panics.
UpdatePeer(id types.ID, urls []string)
// ActiveSince returns the time that the connection with the peer
// of the given id becomes active.
// If the connection is active since peer was added, it returns the adding time.
// If the connection is currently inactive, it returns zero time.
ActiveSince(id types.ID) time.Time
// ActivePeers returns the number of active peers.
ActivePeers() int
// Stop closes the connections and stops the transporter.
Stop()
}
transport是如何定义的
// Transport implements Transporter. It owns the remotes/peers maps and
// the two shared HTTP round trippers used by streams and pipelines.
type Transport struct {
Logger *zap.Logger
// http.Transport related configuration; not the focus here, ignored for now.
DialTimeout time.Duration
TLSInfo transport.TLSInfo
// raft business fields below; raft logic itself is out of scope for these notes.
ID types.ID // local member ID
URLs types.URLs
ClusterID types.ID
Raft Raft
// reports status
Snapshotter *snap.Snapshotter
ServerStats *stats.ServerStats
LeaderStats *stats.LeaderStats
// Presumably used to surface fatal errors and stop the transport — TODO confirm.
ErrorC chan error
// A RoundTripper executes a single HTTP request and returns its response;
// think of these two as the HTTP clients.
streamRt http.RoundTripper // roundTripper used by streams
pipelineRt http.RoundTripper // roundTripper used by pipelines
mu sync.RWMutex // protect the remote and peer map
// Tracks pipeline-only connections; analyzed in detail later in these notes.
remotes map[types.ID]*remote
// Populated by AddPeer() after Start() during raft initialization.
peers map[types.ID]Peer // peers map
// Health probers that periodically call the probing (health) endpoint.
pipelineProber probing.Prober
streamProber probing.Prober
}
transport start又做了什么
目前看起来只是初始化了两个client,streamRt和pipelineRt。没什么可说的, 目前看起来两者唯一的区别是pipelineRt限制了最大连接数是1024,没有设置超时,streamRt没有限制连接数,但是默认5s连接会超时
transport handler做了什么
// Handler builds the HTTP mux serving all raft traffic: pipeline
// messages, the stream endpoints, snapshots, and the probing (health)
// endpoint used by the probers.
func (t *Transport) Handler() http.Handler {
pipelineHandler := newPipelineHandler(t, t.Raft, t.ClusterID)
streamHandler := newStreamHandler(t, t, t.Raft, t.ID, t.ClusterID)
snapHandler := newSnapshotHandler(t, t.Raft, t.Snapshotter, t.ClusterID)
mux := http.NewServeMux()
mux.Handle(RaftPrefix, pipelineHandler)
mux.Handle(RaftStreamPrefix+"/", streamHandler)
mux.Handle(RaftSnapshotPrefix, snapHandler)
mux.Handle(ProbingPrefix, probing.NewHandler())
return mux
}
pipelineHandler
// ServeHTTP handles one raft message per request: it reads the
// (size-limited) body, unmarshals it, and hands it to the raft state
// machine via h.r.Process.
func (h *pipelineHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
...
// Set up the reverse pipeline goroutine for the sender (if needed) and store it in remotes.
addRemoteFromRequest(h.tr, r)
// Limit how much is read per request so an oversized buffer cannot
// delay the response and cause the client to time out.
limitedr := pioutil.NewLimitedBufferReader(r.Body, connReadLimitByte)
b, err := io.ReadAll(limitedr)
if err != nil {
h.lg.Warn(
"failed to read Raft message",
zap.String("local-member-id", h.localID.String()),
zap.Error(err),
)
http.Error(w, "error reading raft message", http.StatusBadRequest)
recvFailures.WithLabelValues(r.RemoteAddr).Inc()
return
}
var m raftpb.Message
if err := m.Unmarshal(b); err != nil {
h.lg.Warn(
"failed to unmarshal Raft message",
zap.String("local-member-id", h.localID.String()),
zap.Error(err),
)
http.Error(w, "error unmarshalling raft message", http.StatusBadRequest)
recvFailures.WithLabelValues(r.RemoteAddr).Inc()
return
}
receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(len(b)))
if err := h.r.Process(context.TODO(), m); err != nil {
switch v := err.(type) {
case writerToResponse:
v.WriteTo(w)
default:
h.lg.Warn(
"failed to process Raft message",
zap.String("local-member-id", h.localID.String()),
zap.Error(err),
)
http.Error(w, "error processing raft message", http.StatusInternalServerError)
w.(http.Flusher).Flush()
// disconnect the http stream
panic(err)
}
return
}
// Write StatusNoContent header after the message has been processed by
// raft, which facilitates the client to report MsgSnap status.
w.WriteHeader(http.StatusNoContent)
}
streamHandler
在分析Handler之前,我们要先看下transport的peers是如何初始化的,初始化的代码是在raft启动的时候进行的
// startRaft (excerpt): after starting the transport, every other member
// is registered as a peer. Member IDs are 1-based here (i+1), so the
// local member itself is skipped.
func (rc *raftNode) startRaft() {
...
rc.transport.Start()
for i := range rc.peers {
if i+1 != rc.id {
rc.transport.AddPeer(types.ID(i+1), []string{rc.peers[i]})
}
}
...
}
AddPeer都做了些什么
// AddPeer creates and starts a peer for the given member id and urls,
// then registers it with both probers. Per the Transporter contract it
// panics (via the logger) if the urls are invalid.
func (t *Transport) AddPeer(id types.ID, us []string) {
t.mu.Lock()
defer t.mu.Unlock()
if t.peers == nil {
panic("transport stopped")
}
// Adding an already-known peer is a no-op.
if _, ok := t.peers[id]; ok {
return
}
urls, err := types.NewURLs(us)
if err != nil {
if t.Logger != nil {
t.Logger.Panic("failed NewURLs", zap.Strings("urls", us), zap.Error(err))
}
}
fs := t.LeaderStats.Follower(id.String())
t.peers[id] = startPeer(t, urls, id, fs)
// Add the peer to the probers for periodic health checking.
addPeerToProber(t.Logger, t.pipelineProber, id.String(), us, RoundTripperNameSnapshot, rttSec)
addPeerToProber(t.Logger, t.streamProber, id.String(), us, RoundTripperNameRaftMessage, rttSec)
}
peer的结构定义
// peer bundles every transport primitive used to talk to one remote
// member: two stream writers/readers (msgAppV2 and message), a
// pipeline, and a v3 snapshot sender.
type peer struct {
lg *zap.Logger
localID types.ID
// id of the remote raft peer node
id types.ID
r Raft
status *peerStatus
picker *urlPicker
msgAppV2Writer *streamWriter
writer *streamWriter
pipeline *pipeline
snapSender *snapshotSender // snapshot sender to send v3 snapshot messages
msgAppV2Reader *streamReader
msgAppReader *streamReader
// non-proposal messages received from this peer (see routing snippet below)
recvc chan raftpb.Message
// MsgProp messages, kept separate so a blocked Process cannot stall recvc
propc chan raftpb.Message
mu sync.Mutex
paused bool
cancel context.CancelFunc // cancel pending works in go routine created by peer.
stopc chan struct{}
}
startPeer做了什么
- 创建一个针对这个url的pipeline协程监控,该协程会监听pipeline的msgc channel,监听到message后使用之前transport初始化的pipelineRt发送信息
- 开启两个协程分别监听recvc、propc两个channel
- 创建两个streamReader(streamTypeMsgAppV2/streamTypeMessage)
- 创建两个streamWriter,用于后续发送消息
这个streamReader又是如何实现的呢
- 启动一个协程,使用transport的streamRt与指定的server建立连接
- 在这个协程中,等待接收server端的返回,根据message的type,将message发送到recvc/propc channel
这里就带来了另外一个疑问,recvc/propc channel是如何区分的 根据message的type 只有MsgProp类型会是propc,这样实现的原因是: 对于propc类型的信息raft.Process 可能会在没有leader节点时阻塞,必须将 propc 放入一个与 recvc 分开的协程中,以避免阻塞处理其他 raft 消息。(猜测主要是心跳和选举,尤其是选举)
// Inside streamReader's decode loop: proposals are routed to propc so
// that a Process call blocked on a missing leader cannot stall the
// handling of other raft messages (heartbeats, votes, ...).
recvc := cr.recvc
if m.Type == raftpb.MsgProp {
recvc = cr.propc
}
那么streamWriter又是如何实现的?
streamWriter结构定义
// streamWriter owns one long-lived outgoing stream to a peer. It
// receives the server-side connection via connc (from
// attachOutgoingConn) and the messages to encode via msgc.
type streamWriter struct {
lg *zap.Logger
localID types.ID
peerID types.ID
status *peerStatus
fs *stats.FollowerStats
r Raft
mu sync.Mutex // guard field working and closer
closer io.Closer
working bool
// messages queued for this stream
msgc chan raftpb.Message
// newly attached connections handed over by the stream handler
connc chan *outgoingConn
stopc chan struct{}
done chan struct{}
}
// outgoingConn wraps the http.ResponseWriter (writer, flusher, closer)
// of an accepted stream request so it can be handed to a streamWriter.
type outgoingConn struct {
t streamType
io.Writer
http.Flusher
io.Closer
localID types.ID
peerID types.ID
}
我们先忽略streamWriter是如何实现的,先来看下streamHandler是如何实现的
// ServeHTTP accepts a long-lived stream connection from a peer,
// validates it, and attaches the response writer to the matching
// streamWriter. It blocks until the connection is closed.
func (h *streamHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
...
// Determine the message (stream) type from the request path.
var t streamType
switch path.Dir(r.URL.Path) {
case streamTypeMsgAppV2.endpoint(h.lg):
t = streamTypeMsgAppV2
case streamTypeMessage.endpoint(h.lg):
t = streamTypeMessage
default:
http.Error(w, "invalid path", http.StatusNotFound)
return
}
// Validate the request origin.
...
p := h.peerGetter.Get(from)
if p == nil {
http.Error(w, "error sender not found", http.StatusNotFound)
return
}
wto := h.id.String()
if gto := r.Header.Get("X-Raft-To"); gto != wto {
h.lg.Warn(
"ignored streaming request; ID mismatch",
zap.String("local-member-id", h.tr.ID.String()),
zap.String("remote-peer-id-stream-handler", h.id.String()),
zap.String("remote-peer-id-header", gto),
zap.String("remote-peer-id-from", from.String()),
zap.String("cluster-id", h.cid.String()),
)
http.Error(w, "to field mismatch", http.StatusPreconditionFailed)
return
}
w.WriteHeader(http.StatusOK)
w.(http.Flusher).Flush()
c := newCloseNotifier()
conn := &outgoingConn{
t: t,
Writer: w,
Flusher: w.(http.Flusher),
Closer: c,
localID: h.tr.ID,
peerID: from,
}
// Binds the connection to the matching streamWriter; the carried
// streamType determines which encoder/decoder the writer will use.
p.attachOutgoingConn(conn)
<-c.closeNotify()
}
那么我们继续分析streamWriter
streamWriter如何接收到要发送的信息
先看下attachOutgoingConn方法,将io.Writer通过connc通道发送给streamWriter
// attachOutgoingConn hands the server-side connection to the
// streamWriter matching the connection's stream type. If the writer
// refuses it, the connection is closed.
func (p *peer) attachOutgoingConn(conn *outgoingConn) {
var ok bool
switch conn.t {
case streamTypeMsgAppV2:
ok = p.msgAppV2Writer.attach(conn)
case streamTypeMessage:
ok = p.writer.attach(conn)
default:
if p.lg != nil {
p.lg.Panic("unknown stream type", zap.String("type", conn.t.String()))
}
}
if !ok {
conn.Close()
}
}
那么streamWriter又是如何处理这个outgoingConn的呢
// run is the streamWriter event loop: it (re)binds incoming
// connections, encodes outgoing messages, and sends heartbeats on a
// ticker. (Excerpt; elided branches and closing brace omitted.)
func (cw *streamWriter) run() {
var (
msgc chan raftpb.Message
heartbeatc <-chan time.Time
t streamType
enc encoder
flusher http.Flusher
batched int
)
// Tick at a third of the read timeout so the remote reader is refreshed in time.
tickc := time.NewTicker(ConnReadTimeout / 3)
defer tickc.Stop()
unflushed := 0
for {
select {
case <-heartbeatc:
...
case m := <-msgc:
err := enc.encode(&m)
...
case conn := <-cw.connc:
// A new connection arrived via attachOutgoingConn: close the old
// one and select the encoder matching the stream type.
cw.mu.Lock()
closed := cw.closeUnlocked()
t = conn.t
switch conn.t {
case streamTypeMsgAppV2:
enc = newMsgAppV2Encoder(conn.Writer, cw.fs)
case streamTypeMessage:
enc = &messageEncoder{w: conn.Writer}
default:
if cw.lg != nil {
cw.lg.Panic("unhandled stream type", zap.String("stream-type", t.String()))
}
}
flusher = conn.Flusher
unflushed = 0
cw.status.activate()
cw.closer = conn.Closer
cw.working = true
cw.mu.Unlock()
// Heartbeats and message sends only start once a connection is attached:
// until then heartbeatc/msgc are nil and those cases never fire.
heartbeatc, msgc = tickc.C, cw.msgc
case <-cw.stopc:
...
}
}
这样这个writer就已经初始化完成了,它又是如何发送信息的呢
首先我们看下peer的send方法
// send delivers one message to this peer via the channel chosen by
// pick, without blocking: if that channel is full the message is
// dropped and the failure is reported back to raft.
func (p *peer) send(m raftpb.Message) {
p.mu.Lock()
paused := p.paused
p.mu.Unlock()
if paused {
return
}
writec, name := p.pick(m)
select {
case writec <- m:
default:
// Channel full: report the peer unreachable; a dropped snapshot
// additionally needs an explicit failure report.
p.r.ReportUnreachable(m.To)
if isMsgSnap(m) {
p.r.ReportSnapshot(m.To, raft.SnapshotFailure)
}
sentFailures.WithLabelValues(types.ID(m.To).String()).Inc()
}
}
实际streamWriter的调用其实隐藏在p.pick方法里,这里我们可以看到发送消息的优先级:
- 快照使用Pipeline
- 如果支持appV2则优先使用v2 没有则使用streamMsg
- 如果都没有 则降级使用pipeline
// pick selects the channel a message is written to: snapshots always go
// to the pipeline; MsgApp prefers the v2 stream; everything else uses
// the message stream, falling back to the pipeline when no stream is up.
func (p *peer) pick(m raftpb.Message) (writec chan<- raftpb.Message, picked string) {
var ok bool
// Considering MsgSnap may have a big size, e.g., 1G, and will block
// stream for a long time, only use one of the N pipelines to send MsgSnap.
if isMsgSnap(m) {
return p.pipeline.msgc, pipelineMsg
} else if writec, ok = p.msgAppV2Writer.writec(); ok && isMsgApp(m) {
return writec, streamAppV2
} else if writec, ok = p.writer.writec(); ok {
return writec, streamMsg
}
return p.pipeline.msgc, pipelineMsg
}
pipeline和stream差异
streamWriter 和 pipeline 都是 etcd 中用来发送数据的组件,但它们在设计和用途上有一些关键的区别。
pipeline: 在 etcd 的 Raft 实现中,pipeline主要负责将待发送的消息进行排队和缓存。当pipeline要发送消息时,它会先将这些消息打包成一个批次,然后通过RoundTrip接口发送出去。这种设计可以帮助减少网络 I/O,并确保在发送消息时能够最大限度地利用网络带宽。同时pipeline会启4个client去发送消息,确保在发送消息时能够最大限度地利用网络带宽。streamWriter: 在 etcd 的 rafthttp 流实现中(基于长连接的 HTTP 流,而非 gRPC 流),streamWriter主要负责管理和调度数据的异步写入。当streamWriter要发送数据时,它会先将数据写入到 HTTP 长连接流,然后在适当的时机调用Flush方法将数据真正发送出去。这种设计可以帮助处理大量的并发写入操作,并确保数据能尽快被发送出去。目前etcd的实现没有和pipeline一样启动多个client(如果我们每次写入的数据都很大,可能增加stream个数会是一个可选择的优化方向)
虽然 streamWriter 在底层可能也会使用 RoundTrip 接口来发送数据,但它与 pipeline 的主要区别在于数据的写入和发送方式。pipeline 是将多个消息打包成一个批次然后一次性发送,而 streamWriter 是将数据逐个写入到 HTTP 长连接流,然后在适当的时机调用 Flush 方法发送。
这种设计使得 streamWriter 可以更精细地控制数据的发送,例如,它可以在写入一定量的数据后立即调用 Flush,以确保数据能尽快被发送出去。而 pipeline 则主要关注于如何将多个消息打包成一个批次来减少网络 I/O。
所以,虽然 streamWriter 和 pipeline 在某种程度上都是通过 RoundTrip 接口来发送数据,但它们在设计和用途上有一些关键的区别。