Monitor的数据提交流程:
Leader选举完成后,Leader和Peon的角色已经明确。在进行读写操作之前,Monitor集群应当先进行phase1(阶段1):RECOVERY,以确保各成员的PN一致。
1、phase1(阶段1)处理流程如下:
Leader
void Paxos::collect(version_t oldpn)
{
#1 Leader设置Paxos的状态为STATE_RECOVERING:state = STATE_RECOVERING;
#2 清空上次收到的值:
// reset the number of lasts received
uncommitted_v = 0;
uncommitted_pn = 0;
uncommitted_value.clear();
peer_first_committed.clear();
peer_last_committed.clear();
#3 如果DBStore中存在pending的proposal,其version一定是last_committed+1
uncommitted_pn = pending_pn;
uncommitted_v = last_committed+1;
#4 否则,uncommitted_pn = accepted_pn;
#5 生成新的accepted_pn提案号,自己先accept:accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
#6 向quorum中,除自己外的其他成员发送OP_COLLECT消息:
MMonPaxos *collect = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COLLECT, ceph_clock_now());
collect->last_committed = last_committed;
collect->first_committed = first_committed;
collect->pn = accepted_pn; //新生成的提案号accepted_pn
mon.send_mon_message(collect, *p);//将自己的last_committed/first_committed/accepted_pn发送出去
#7 设置collect消息超时事件:collect_timeout();
}
Peon
收到OP_COLLECT消息,处理:
void Paxos::handle_collect(MonOpRequestRef op)
{
#1 Peon设置当前的状态STATE_RECOVERING:state = STATE_RECOVERING;
#2 重置lease超时事件:reset_lease_timeout();若超时则重新选举。
#3 如果Leader的第一个版本first_committed都大于Peon的last_committed(collect->first_committed > last_committed+1)
,那么需要重新bootstrap()。
#4 Peon回复消息LAST给Leader,携带自己的first_committed/last_committed
MMonPaxos *last = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LAST, ceph_clock_now());
last->last_committed = last_committed;
last->first_committed = first_committed;
#5 若Leader的pn大于Peon的accepted_pn (collect->pn > accepted_pn),则Peon接受pn:accepted_pn = collect->pn; 否则,就不接受,保持上次接受的pn
last->pn = accepted_pn;
#6 若Peon之前last_committed大于Leader的(collect->last_committed < last_committed),则Peon把自己commit的值分享给Leader
share_state(last, collect->first_committed, collect->last_committed);
#7 若存在pending_v=last_committed+1,则把pending_pn和pending_v对应的值value,放入Last消息中,通知Leader
last->uncommitted_pn = pending_pn;
#8 否则,使用处理本次collect之前已接受的accepted_pn(即previous_pn)
last->uncommitted_pn = previous_pn; //(version_t previous_pn = accepted_pn;)
#9 collect->get_connection()->send_message(last);//发送Last消息给Leader
}
Leader
收到OP_LAST消息,处理:
void Paxos::handle_last(MonOpRequestRef op)
{
#1 Leader的last_committed比Peon的first_committed都落后很多(last->first_committed > last_committed + 1),重新bootstrap
#2 调用store_state()处理peon分享的已经commit的数据:need_refresh = store_state(last);
#3 若peon版本太旧(peer_last_committed都小于leader的first_committed),重新bootstrap。
(peer_last_committed + 1 < first_committed && first_committed > 1)
#4 若peon版本旧,但还在可同步范围内(peer_last_committed大于leader的first_committed,小于last_committed),则发送COMMIT消息,share_state分享leader已commit的数据。
(peer_last_committed < last_committed)
MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
share_state(commit, peer_first_committed[p->first], p->second);
mon.send_mon_message(commit, p->first);
#5 若peon接受过的PN值比leader的大(last->pn > accepted_pn),那么用peon的PN值重新collect():collect(last->pn);
#6 若peon接受了自己的PN(last->pn == accepted_pn),记录下Peon同时发送来的uncommitted_pn/uncommitted_v/uncommitted_value
uncommitted_v = last->last_committed+1;
uncommitted_pn = last->uncommitted_pn;
uncommitted_value = last->values[uncommitted_v];
#7 若quorum中所有成员都接受了leader的PN:(num_last == mon.get_quorum().size())
#8 若存在未提交的值uncommitted_value:(uncommitted_v == last_committed+1 && uncommitted_value.length())
则设置Paxos状态为state = STATE_UPDATING_PREVIOUS;
开始propose过程:begin(uncommitted_value);将未提交的值uncommitted_value提交。
#9 否则,若没有未提交的值,leader延长自己的lease时间:extend_lease();,将lease_expire时间发送给所有的Peon成员:(first_committed、last_committed、lease_expire)
void Paxos::extend_lease()
{
MMonPaxos *lease = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE, ceph_clock_now());
lease->last_committed = last_committed;
lease->lease_timestamp = utime_t{lease_expire};
lease->first_committed = first_committed;
mon.send_mon_message(lease, *p);//Leader给Peon发送延长lease时间的消息OP_LEASE
}
#10 do_refresh():一个paxos过程完成之后,需要让上层的各个monitor service刷新状态
#11 finish_round():已经完成一轮的提议
}
Peon
收到OP_COMMIT消息,处理:
void Paxos::handle_commit(MonOpRequestRef op)
{
#1 调用store_state()处理leader分享的已经commit的数据:store_state(commit);将更新写到后端存储。
#2 刷新PaxosService:do_refresh();
}
收到OP_LEASE消息,处理:
void Paxos::handle_lease(MonOpRequestRef op)
{
#1 检查LEASE消息的时间戳和当前时间差,判断是否超出允许的mon_clock_drift_allowed
warn_on_future_time(lease->sent_timestamp, lease->get_source());
#2 peon更新自己的lease_expire:lease_expire = new_expire;
#3 设置Paxos状态为ACTIVE:state = STATE_ACTIVE;
#4 发送OP_LEASE_ACK消息回复给Leader:
MMonPaxos *ack = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE_ACK, ceph_clock_now());
ack->last_committed = last_committed;
ack->first_committed = first_committed;
ack->lease_timestamp = ceph_clock_now();
encode(mon.session_map.feature_map, ack->feature_map);
lease->get_connection()->send_message(ack);//发送OP_LEASE_ACK消息回复给Leader
#5 重置lease超时事件:reset_lease_timeout();
}
Leader
收到OP_LEASE_ACK消息,处理:
void Paxos::handle_lease_ack(MonOpRequestRef op)
{
#1 处理ack消息中的feature_map:
FeatureMap& t = mon.quorum_feature_map[from];
decode(t, p);
#2 等待quorum中所有成员都回复ACK:(acked_lease == mon.get_quorum())
#3 取消ack超时事件:mon.timer.cancel_event(lease_ack_timeout_event);
#4 warn_on_future_time(ack->sent_timestamp, ack->get_source());
}
2、phase2(阶段2)处理流程如下:
phase1(阶段1)之后,进入phase2(阶段2),阶段2是Monitor在正常工作时,提出提案、接受提案、提交提案流程。
处理流程如下:
Leader
void Paxos::begin(bufferlist& v)
{
#1 Leader自己先接受accept提案:accepted.insert(mon.rank);
#2 需要提交的新值,都是KV,如:[paxos:1869, value]:new_value = v;
#3 生成DB事务Transaction,以Paxos为前缀:last_committed+1组成key,将new_value写入DB中。
auto t(std::make_shared<MonitorDBStore::Transaction>());
t->put(get_name(), last_committed+1, new_value);
#4 同时更新pending_v和pending_pn:
t->put(get_name(), "pending_v", last_committed + 1);
t->put(get_name(), "pending_pn", accepted_pn);
#5 若quorum中只有自己一个成员(mon.get_quorum().size() == 1),直接commit:
commit_start();
#6 若quorum有多个成员,则向quorum中的所有成员发送OP_BEGIN消息:(new_value、last_committed、accepted_pn)
MMonPaxos *begin = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_BEGIN, ceph_clock_now());
begin->values[last_committed+1] = new_value;
begin->last_committed = last_committed;
begin->pn = accepted_pn;
mon.send_mon_message(begin, *p);
#7 设置accept消息超时事件:accept_timeout_event = mon.timer.add_event_after(g_conf()->mon_accept_timeout_factor * g_conf()->mon_lease, /* 超时回调,此处省略 */);
}
Peon
收到OP_BEGIN消息,处理:
void Paxos::handle_begin(MonOpRequestRef op)
{
#1 Leader发来的PN小于Peon自己的accepted_pn(begin->pn < accepted_pn),则忽略消息,认为这是旧一轮的决议。
#2 Leader发来的PN得等于Peon自己的accepted_pn才是一致有效的,不一致,直接assert。
ceph_assert(begin->pn == accepted_pn);
#3 Leader的last_committed得等于Peon自己的last_committed才是一致有效的,不一致,直接assert。
ceph_assert(begin->last_committed == last_committed);
#4 设置Peon的Paxos状态是STATE_UPDATING更新value:state = STATE_UPDATING;
#5 以paxos为前缀:last_committed+1组合为key,将value写入DB;同时更新pending_v和pending_pn。
t->put(get_name(), v, begin->values[v]);
t->put(get_name(), "pending_v", v);
t->put(get_name(), "pending_pn", accepted_pn);
#6 接受提案,将(accepted_pn、last_committed)在ACCEPT消息中给Leader回复reply:Peon接受该值。
MMonPaxos *accept = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_ACCEPT, ceph_clock_now());
accept->pn = accepted_pn;
accept->last_committed = last_committed;
begin->get_connection()->send_message(accept);
}
Leader
收到OP_ACCEPT消息,处理:
void Paxos::handle_accept(MonOpRequestRef op)
{
#1 如果Peon发来的PN跟自己的accepted_pn不相等(accept->pn != accepted_pn),则返回。
#2 若Peon发过来的last_committed,比last_committed-1还小,则被认为是旧一轮的决议,忽略。
(last_committed > 0 && accept->last_committed < last_committed-1)
#3 Peon发了两次accept消息,Leader直接assert。
ceph_assert(accepted.count(from) == 0);
#4 将Peon记录在accepted中,接受该Peon:accepted.insert(from);
#5 当accepted集合与quorum成员集合完全一致时(accepted == mon.get_quorum()),即:quorum中所有成员都同意了,则Leader提交决议commit_start()
void Paxos::commit_start()
{
#5.1 更新DB中的last_committed为last_committed + 1
t->put(get_name(), "last_committed", last_committed + 1);
decode_append_transaction(t, new_value);
#5.2 Leader完成本次提交Committed:new_value
get_store()->queue_transaction(t, new C_Committed(this));
#5.3 若更新uncommitted值is_updating_previous(),设置state=STATE_WRITING_PREVIOUS
state = STATE_WRITING_PREVIOUS;
#5.4 若更新值is_updating(),设置state=STATE_WRITING
state = STATE_WRITING;
}
#6 将new_value封装在Transaction中,写入DB中,插入回调commit_finish()。
#6.1 更新DB中的last_committed为last_committed + 1
t->put(get_name(), "last_committed", last_committed + 1);
#6.2 decode_append_transaction(t, new_value);
#6.3 Leader完成本次提交Committed:new_value
#6.4 get_store()->queue_transaction(t, new C_Committed(this));
#7 回调函数:void Paxos::commit_finish()
{
#7.1 更新内存中的last_committed(DB中的last_committed已在commit_start()的事务中写入):last_committed++;
#7.2 当Leader完成本地的提交后,向quorum中的所有Peon成员发送OP_COMMIT消息:(new_value、accepted_pn、last_committed)
MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
commit->values[last_committed] = new_value;
commit->pn = accepted_pn;
commit->last_committed = last_committed;
mon.send_mon_message(commit, *p);
}
#8 设置Leader的Paxos状态为STATE_REFRESH:state = STATE_REFRESH;
#9 Leader延长自己的lease时间,并将lease发送给所有Peon:extend_lease();
void Paxos::extend_lease()
{
MMonPaxos *lease = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE, ceph_clock_now());
lease->last_committed = last_committed;
lease->lease_timestamp = utime_t{lease_expire};
lease->first_committed = first_committed;
mon.send_mon_message(lease, *p);//Leader给Peon发送延长lease时间的消息OP_LEASE
}
#10 完成一轮提议:finish_round();
#10.1 复原Paxos的状态为STATE_ACTIVE:state = STATE_ACTIVE;
}
Peon
收到OP_COMMIT消息,处理:
void Paxos::handle_commit(MonOpRequestRef op)
{
#1 调用store_state()处理leader分享的已经commit的数据:store_state(commit);将更新写到后端存储。
#2 刷新PaxosService:do_refresh();
}
收到OP_LEASE消息,处理:
void Paxos::handle_lease(MonOpRequestRef op)
{
#1 检查LEASE消息的时间戳和当前时间差,判断是否超出允许的mon_clock_drift_allowed
warn_on_future_time(lease->sent_timestamp, lease->get_source());
#2 peon更新自己的lease_expire:lease_expire = new_expire;
#3 设置Paxos状态为ACTIVE:state = STATE_ACTIVE;
#4 发送OP_LEASE_ACK消息回复给Leader:
MMonPaxos *ack = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE_ACK, ceph_clock_now());
ack->last_committed = last_committed;
ack->first_committed = first_committed;
ack->lease_timestamp = ceph_clock_now();
encode(mon.session_map.feature_map, ack->feature_map);
lease->get_connection()->send_message(ack);//发送OP_LEASE_ACK消息回复给Leader
#5 重置lease超时事件:reset_lease_timeout();
}
Leader
收到OP_LEASE_ACK消息,处理:
void Paxos::handle_lease_ack(MonOpRequestRef op)
{
#1 处理ack消息中的feature_map:
FeatureMap& t = mon.quorum_feature_map[from];
decode(t, p);
#2 等待quorum中所有成员都回复ACK:(acked_lease == mon.get_quorum())
#3 取消ack超时事件:mon.timer.cancel_event(lease_ack_timeout_event);
#4 warn_on_future_time(ack->sent_timestamp, ack->get_source());
}