Ceph之Monitor分析4:数据提交流程

119 阅读5分钟

Monitor的数据提交流程:

Leader选举完成后,Leader和Peon角色已经清晰了。在读写操作之前,Monitor集群应当先进行phase1(阶段1):RECOVERY,以确保PN一致。

1、phase1(阶段1)处理流程如下:

Leader

void Paxos::collect(version_t oldpn)

{

#1 Leader设置Paxos的状态为STATE_RECOVERING:state = STATE_RECOVERING;

#2 清空上次收到的值:

// reset the number of lasts received
  uncommitted_v = 0;
  uncommitted_pn = 0;
  uncommitted_value.clear();
  peer_first_committed.clear();
  peer_last_committed.clear();

#3 如果DBStore中存在pending的proposal,其version一定是last_committed+1

uncommitted_pn = pending_pn;
uncommitted_v = last_committed+1;

#4 否则,uncommitted_pn = accepted_pn;

#5 生成新的accepted_pn提案号,自己先accept:accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));

#6 向quorum中,除自己外的其他成员发送OP_COLLECT消息:

    MMonPaxos *collect = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COLLECT, ceph_clock_now());
    collect->last_committed = last_committed;
    collect->first_committed = first_committed;
    collect->pn = accepted_pn; //新生成的提案号accepted_pn
    mon.send_mon_message(collect, *p);//将自己的last_committed/first_committed/accepted_pn发送出去

#7 设置collect消息超时事件:collect_timeout();

}

Peon

收到OP_COLLECT消息,处理:

void Paxos::handle_collect(MonOpRequestRef op)

{

#1 Peon设置当前的状态STATE_RECOVERING:state = STATE_RECOVERING;

#2 重置lease超时事件:reset_lease_timeout();超时则重新选举。

#3 如果Leader的第一个版本first_committed比Peon的last_committed+1还大(collect->first_committed > last_committed+1),说明Peon落后太多,那么需要重新bootstrap()。

#4 Peon回复消息LAST给Leader,携带自己的first_committed/last_committed

MMonPaxos *last = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LAST, ceph_clock_now());
last->last_committed = last_committed;
last->first_committed = first_committed;

#5 若Leader的pn大于Peon的accepted_pn (collect->pn > accepted_pn),则Peon接受pn:accepted_pn = collect->pn; 否则,就不接受,保持上次接受的pn

last->pn = accepted_pn;

#6 若Peon之前last_committed大于Leader的(collect->last_committed < last_committed),则Peon把自己commit的值分享给Leader

share_state(last, collect->first_committed, collect->last_committed);

#7 若存在pending_v=last_committed+1,则把pending_pn和pending_v对应的值value,放入Last消息中,通知Leader

last->uncommitted_pn = pending_pn;

#8 否则,保持上次接受的accepted_pn

last->uncommitted_pn = previous_pn; //(version_t previous_pn = accepted_pn;)

#9 collect->get_connection()->send_message(last);//发送Last消息给Leader

}

Leader

收到OP_LAST消息,处理:

void Paxos::handle_last(MonOpRequestRef op)

{

#1 Leader的last_committed比Peon的first_committed都落后很多(last->first_committed > last_committed + 1),重新bootstrap

#2 调用store_state()处理peon分享的已经commit的数据:need_refresh = store_state(last);

#3 若peon版本太旧(peer_last_committed都小于leader的first_committed),重新bootstrap。

(peer_last_committed + 1 < first_committed && first_committed > 1)

#4 若peon版本旧,但还在可同步范围内(peer_last_committed大于leader的first_committed,小于last_committed),则发送COMMIT消息,share_state分享leader已commit的数据。

(peer_last_committed < last_committed)

MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
share_state(commit, peer_first_committed[p->first], p->second);
mon.send_mon_message(commit, p->first);

#5 若peon接受过的PN值比leader的大(last->pn > accepted_pn),那么用peon的PN值重新collect():collect(last->pn);

#6 若peon接受了自己的PN(last->pn == accepted_pn),记录下Peon同时发送来的uncommitted_pn/uncommitted_v/uncommitted_value

uncommitted_v = last->last_committed+1;
uncommitted_pn = last->uncommitted_pn;
uncommitted_value = last->values[uncommitted_v];

#7 若quorum中所有成员都接受了leader的PN:(num_last == mon.get_quorum().size())

#8 若存在未提交的值uncommitted_value:(uncommitted_v == last_committed+1 && uncommitted_value.length())

则设置Paxos状态为state = STATE_UPDATING_PREVIOUS;

开始propose过程:begin(uncommitted_value);将未提交的值uncommitted_value提交。

#9 否则,若没有未提交的值,leader延长自己的lease时间:extend_lease();并将lease_expire时间发送给所有的Peon成员:(first_committed、last_committed、lease_expire)

void Paxos::extend_lease()
{
MMonPaxos *lease = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE, ceph_clock_now());
lease->last_committed = last_committed;
lease->lease_timestamp = utime_t{lease_expire};
lease->first_committed = first_committed;
mon.send_mon_message(lease, *p);//Leader给Peon发送延长lease时间的消息OP_LEASE
}

#10 do_refresh():一个paxos过程完成之后,需要让上层的各个monitor service刷新状态

#11 finish_round():已经完成一轮的提议

}

Peon

收到OP_COMMIT消息,处理:

void Paxos::handle_commit(MonOpRequestRef op)

{

#1 调用store_state()处理leader分享的已经commit的数据:store_state(commit);将更新写到后端存储。

#2 刷新PaxosService:do_refresh();

}

收到OP_LEASE消息,处理:

void Paxos::handle_lease(MonOpRequestRef op)

{

#1 检查LEASE消息的时间戳和当前时间差,判断是否超出允许的mon_clock_drift_allowed

warn_on_future_time(lease->sent_timestamp, lease->get_source());

#2 peon更新自己的lease_expire:lease_expire = new_expire;

#3 设置Paxos状态为ACTIVE:state = STATE_ACTIVE;

#4 发送OP_LEASE_ACK消息回复给Leader:

MMonPaxos *ack = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE_ACK, ceph_clock_now());
ack->last_committed = last_committed;
ack->first_committed = first_committed;
ack->lease_timestamp = ceph_clock_now();
encode(mon.session_map.feature_map, ack->feature_map);
lease->get_connection()->send_message(ack);//发送OP_LEASE_ACK消息回复给Leader

#5 重置lease超时事件:reset_lease_timeout();

}

Leader

收到OP_LEASE_ACK消息,处理:

void Paxos::handle_lease_ack(MonOpRequestRef op)

{

#1 处理ack消息中的feature_map:

FeatureMap& t = mon.quorum_feature_map[from];
decode(t, p);

#2 等待quorum中所有成员都回复ACK:(acked_lease == mon.get_quorum())

#3 取消ack超时事件:mon.timer.cancel_event(lease_ack_timeout_event);

#4 warn_on_future_time(ack->sent_timestamp, ack->get_source());

}

2、phase2(阶段2)处理流程如下:

phase1(阶段1)之后,进入phase2(阶段2),阶段2是Monitor在正常工作时,提出提案、接受提案、提交提案流程。

处理流程如下:

Leader

void Paxos::begin(bufferlist& v)

{

#1 Leader自己先接受accept提案:accepted.insert(mon.rank);

#2 需要提交的新值,都是KV,如:[paxos:1869, value]:new_value = v;

#3 生成DB事务Transaction,以Paxos为前缀、last_committed+1为版本号组成key,将new_value写入DB中。

auto t(std::make_shared<MonitorDBStore::Transaction>());
t->put(get_name(), last_committed+1, new_value);

#4 同时更新pending_v和pending_pn:

t->put(get_name(), "pending_v", last_committed + 1);
t->put(get_name(), "pending_pn", accepted_pn);

#5 若quorum中只有自己一个成员(mon.get_quorum().size() == 1),直接commit:

commit_start();

#6 若quorum有多个成员,则向quorum中的所有成员发送OP_BEGIN消息:(new_value、last_committed、accepted_pn)

MMonPaxos *begin = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_BEGIN, ceph_clock_now());
begin->values[last_committed+1] = new_value;
begin->last_committed = last_committed;
begin->pn = accepted_pn;
   
mon.send_mon_message(begin, *p);

#7 设置accept消息超时事件(超时时间为 mon_accept_timeout_factor × mon_lease):accept_timeout_event = mon.timer.add_event_after(g_conf()->mon_accept_timeout_factor * g_conf()->mon_lease, ...);

}

Peon

收到OP_BEGIN消息,处理:

void Paxos::handle_begin(MonOpRequestRef op)

{

#1 Leader发来的PN小于Peon自己的accepted_pn(begin->pn < accepted_pn),则忽略消息,认为这是旧一轮的决议。

#2 Leader发来的PN得等于Peon自己的accepted_pn才是一致有效的,不一致,直接assert。

ceph_assert(begin->pn == accepted_pn);

#3 Leader的last_committed得等于Peon自己的last_committed才是一致有效的,不一致,直接assert。

ceph_assert(begin->last_committed == last_committed);

#4 设置Peon的Paxos状态是STATE_UPDATING更新value:state = STATE_UPDATING;

#5 以paxos为前缀、last_committed+1为版本号组成key,将value写入DB;同时更新pending_v和pending_pn。

t->put(get_name(), v, begin->values[v]);
 
t->put(get_name(), "pending_v", v);
t->put(get_name(), "pending_pn", accepted_pn);

#6 接受提案,将(accepted_pn、last_committed)在ACCEPT消息中给Leader回复reply:Peon接受该值。

MMonPaxos *accept = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_ACCEPT, ceph_clock_now());
accept->pn = accepted_pn;
accept->last_committed = last_committed;
begin->get_connection()->send_message(accept);

}

Leader

收到OP_ACCEPT消息,处理:

void Paxos::handle_accept(MonOpRequestRef op)

{

#1 如果Peon发来的PN跟自己的accepted_pn不相等(accept->pn != accepted_pn),则返回。

#2 若Peon发过来的last_committed,比last_committed-1还小,则被认为是旧一轮的决议,忽略。

(last_committed > 0 && accept->last_committed < last_committed-1)

#3 Peon发了两次accept消息,Leader直接assert。

ceph_assert(accepted.count(from) == 0);

#4 将Peon记录在accepted中,接受该Peon:accepted.insert(from);

#5 当accepted集合与quorum成员集合完全一致时(accepted == mon.get_quorum()),即:quorum中所有成员都同意了,则Leader提交决议commit_start()

void Paxos::commit_start()

{

#5.1 更新DB中的last_committed为last_committed + 1

t->put(get_name(), "last_committed", last_committed + 1);
decode_append_transaction(t, new_value);

#5.2 Leader完成本次提交Committed:new_value

get_store()->queue_transaction(t, new C_Committed(this));

#5.3 若更新uncommitted值is_updating_previous(),设置state=STATE_WRITING_PREVIOUS

state = STATE_WRITING_PREVIOUS;

#5.4 若更新值is_updating(),设置state=STATE_WRITING

state = STATE_WRITING;

}

#6 将new_value封装在Transaction中,写入DB中,插入回调commit_finish()。

#6.1 更新DB中的last_committed为last_committed + 1

t->put(get_name(), "last_committed", last_committed + 1);

#6.2 decode_append_transaction(t, new_value);

#6.3 Leader完成本次提交Committed:new_value

#6.4 get_store()->queue_transaction(t, new C_Committed(this));

#7 回调函数:void Paxos::commit_finish()

{

#7.1 事务写入DB完成后,更新内存中的last_committed(last_committed=last_committed+1):last_committed++;

#7.2 当Leader完成本地的提交后,向quorum中的所有Peon成员发送OP_COMMIT消息:(new_value、accepted_pn、last_committed)

MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
commit->values[last_committed] = new_value;
commit->pn = accepted_pn;
commit->last_committed = last_committed;    
mon.send_mon_message(commit, *p);

}

#8 设置Leader的Paxos状态为STATE_REFRESH:state = STATE_REFRESH;

#9 Leader延长自己的lease时间,并将lease发送给所有Peon:extend_lease();

void Paxos::extend_lease()
{
MMonPaxos *lease = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE, ceph_clock_now());
lease->last_committed = last_committed;
lease->lease_timestamp = utime_t{lease_expire};
lease->first_committed = first_committed;
mon.send_mon_message(lease, *p);//Leader给Peon发送延长lease时间的消息OP_LEASE
}

#10 完成一轮提议:finish_round();

#10.1 复原Paxos的状态为STATE_ACTIVE:state = STATE_ACTIVE;

}

Peon

收到OP_COMMIT消息,处理:

void Paxos::handle_commit(MonOpRequestRef op)

{

#1 调用store_state()处理leader分享的已经commit的数据:store_state(commit);将更新写到后端存储。

#2 刷新PaxosService:do_refresh();

}

收到OP_LEASE消息,处理:

void Paxos::handle_lease(MonOpRequestRef op)

{

#1 检查LEASE消息的时间戳和当前时间差,判断是否超出允许的mon_clock_drift_allowed

warn_on_future_time(lease->sent_timestamp, lease->get_source());

#2 peon更新自己的lease_expire:lease_expire = new_expire;

#3 设置Paxos状态为ACTIVE:state = STATE_ACTIVE;

#4 发送OP_LEASE_ACK消息回复给Leader:

MMonPaxos *ack = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE_ACK, ceph_clock_now());
ack->last_committed = last_committed;
ack->first_committed = first_committed;
ack->lease_timestamp = ceph_clock_now();
encode(mon.session_map.feature_map, ack->feature_map);
lease->get_connection()->send_message(ack);//发送OP_LEASE_ACK消息回复给Leader

#5 重置lease超时事件:reset_lease_timeout();

}

Leader

收到OP_LEASE_ACK消息,处理:

void Paxos::handle_lease_ack(MonOpRequestRef op)

{

#1 处理ack消息中的feature_map:

FeatureMap& t = mon.quorum_feature_map[from];

decode(t, p);

#2 等待quorum中所有成员都回复ACK:(acked_lease == mon.get_quorum())

#3 取消ack超时事件:mon.timer.cancel_event(lease_ack_timeout_event);

#4 warn_on_future_time(ack->sent_timestamp, ack->get_source());

}