1 GetInfo
// Constructor of the GetInfo state (excerpt: lines marked "// ..." are parts
// of the original source that were left out of this listing).
// Rebuilds the prior set held by the parent Peering state, issues info
// queries through get_infos(), and posts IsDown or GotInfo right away when
// the corresponding condition already holds.
PeeringState::GetInfo::GetInfo(my_context ctx)
: my_base(ctx),
NamedState(context<PeeringMachine>().state_history, "Started/Primary/Peering/GetInfo")
{
// ... (elided in this excerpt)
// prior_set is a member of the object returned by context<Peering>();
// it is bound by reference, so the assignment below updates it in place.
PastIntervals::PriorSet &prior_set = context<Peering>().prior_set;
// ... (elided in this excerpt)
prior_set = ps->build_prior();
// Copy the prior set's down OSDs into the peering state.
ps->prior_readable_down_osds = prior_set.down;
if (ps->prior_readable_down_osds.empty())
{
psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
<< dendl;
ps->clear_prior_readable_until_ub();
}
ps->reset_min_peer_features();
// Send info queries to the probe targets that still need one.
get_infos();
// pg_down takes precedence over "nothing left to wait for".
if (prior_set.pg_down)
{
post_event(IsDown());
}
// No query is outstanding, so all needed info is already available.
else if (peer_info_requested.empty())
{
post_event(GotInfo());
}
}
在完成 ps->build_prior() 后,构造函数调用 get_infos(),向 prior_set.probe 中的 osd 查询 info。
get_infos() 返回后:若 prior_set.pg_down 为真,表示没有足够的 osd 进行 peering,抛出 IsDown 事件,进入 Down 状态;否则若 peer_info_requested 为空,表示 info 已获取完毕,抛出 GotInfo 事件,进入 GetLog。
prior_set其实是Peering这个状态的一个成员变量,GetInfo是Peering的子状态。GetInfo使用context<Peering>()获取指向父状态(即Peering)的引用,然后获取其成员prior_set的引用。
1.1 get_infos
void PeeringState::GetInfo::get_infos()
{
DECLARE_LOCALS;
PastIntervals::PriorSet &prior_set = context<Peering>().prior_set;
ps->blocked_by.clear();
for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it)
{
pg_shard_t peer = *it;
if (peer == ps->pg_whoami)
{
continue;
}
// 重复 peer,忽略
if (ps->peer_info.count(peer))
{
psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
continue;
}
// 已发过获取 info 请求,忽略
if (peer_info_requested.count(peer))
{
psdout(10) << " already requested info from osd." << peer << dendl;
ps->blocked_by.insert(peer.osd);
}
// osd down,忽略
else if (!ps->get_osdmap()->is_up(peer.osd))
{
psdout(10) << " not querying info from down osd." << peer << dendl;
}
else
{
psdout(10) << " querying info from osd." << peer << dendl;
context<PeeringMachine>().send_query(
peer.osd,
pg_query_t(pg_query_t::INFO,
it->shard, ps->pg_whoami.shard,
ps->info.history,
ps->get_osdmap_epoch()));
peer_info_requested.insert(peer);
ps->blocked_by.insert(peer.osd);
}
}
ps->check_prior_readable_down_osds(ps->get_osdmap());
pl->publish_stats_to_osd();
}
get_infos() 主要是遍历 build_prior() 计算得出的 probe 列表,跳过自身、已有 info 的 osd 以及 down 的 osd,向其余 up 的 osd 发送 info 查询,并将已发出查询(包括此前已请求过)的 osd 插入 blocked_by。
2 处理消息
// Handle an info notification (MNotifyRec) from a peer while in GetInfo.
// The sender is removed from the outstanding-request bookkeeping, its info
// is handed to proc_replica_info(), and GotInfo is posted once no request
// is outstanding and the prior set is not pg_down. The event itself is
// always discarded.
boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec &infoevt)
{
  DECLARE_LOCALS;

  // The sender has answered: it is no longer pending or blocking us.
  auto pending = peer_info_requested.find(infoevt.from);
  if (pending != peer_info_requested.end())
  {
    peer_info_requested.erase(pending);
    ps->blocked_by.erase(infoevt.from.osd);
  }

  // Remember last_epoch_started so we can tell whether processing this
  // info moved it forward.
  const epoch_t les_before = ps->info.history.last_epoch_started;
  const bool got_new_info = ps->proc_replica_info(
      infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent);
  if (!got_new_info)
  {
    return discard_event();
  }

  // we got something new ...
  PastIntervals::PriorSet &prior_set = context<Peering>().prior_set;
  if (les_before < ps->info.history.last_epoch_started)
  {
    psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
    prior_set = ps->build_prior();
    ps->prior_readable_down_osds = prior_set.down;

    // Forget outstanding requests to shards that fell out of the probe
    // set. This is cheaper than restarting peering, which would re-probe
    // everyone.
    for (auto req = peer_info_requested.begin(); req != peer_info_requested.end();)
    {
      if (prior_set.probe.count(*req) != 0)
      {
        ++req;
        continue;
      }
      psdout(20) << " dropping osd." << *req << " from info_requested, no longer in probe set" << dendl;
      peer_info_requested.erase(req++);
    }

    // Query whatever the rebuilt probe set still requires.
    get_infos();
  }

  psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
             << hex << infoevt.features << dec << dendl;
  ps->apply_peer_features(infoevt.features);

  // are we done getting everything?
  const bool all_infos_in = peer_info_requested.empty() && !prior_set.pg_down;
  if (all_infos_in)
  {
    psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
    psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
    psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
    post_event(GotInfo());
  }

  return discard_event();
}
该函数处理查询到的 info。收到 info 后,先从 peer_info_requested 和 blocked_by 中删除对应的 osd。接着处理 info 与副本相关的细节,详见 2.1 节。
如果 proc_replica_info 返回 true,表示获取到了新的 info。先比较调用 proc_replica_info 前后的 ps->info.history.last_epoch_started,如果该值变大了(即副本带来了更新的 last_epoch_started),就调用 build_prior 计算新的 probe 列表,并从 peer_info_requested 中删除不再出现在 prior_set.probe 里的 pg_shard_t。然后调用 get_infos(),按新的 probe 列表获取 info。
最后,当 peer_info_requested 为空(info 获取完毕)并且 prior_set.pg_down 为假时,抛出 GotInfo 事件,进入 GetLog。
2.1 proc_replica_info
// Record the pg_info_t reported by peer `from`.
//
// @param from        shard that sent the info
// @param oinfo       the info it reported
// @param send_epoch  epoch at which the info was sent
// @return false if the info was ignored (same last_update as what we already
//         hold for this peer, or the sender has not been up since
//         send_epoch); true once it has been stored.
bool PeeringState::proc_replica_info(
    pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  const auto known = peer_info.find(from);
  // Decided up front so the answer does not depend on the changes made to
  // peer_info further down.
  const bool first_info_from_peer = (known == peer_info.end());

  if (!first_info_from_peer && known->second.last_update == oinfo.last_update)
  {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch))
  {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());

  peer_info[from] = oinfo;
  might_have_unfound.insert(from);
  update_history(oinfo.history);

  // stray? (the sender is in neither the up set nor the acting set)
  if (!(is_up(from) || is_acting(from)))
  {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean())
    {
      purge_strays();
    }
  }

  // was this a new info? if so, update peers!
  if (first_info_from_peer)
  {
    update_heartbeat_peers();
  }

  return true;
}
proc_replica_info 主要是排除重复的 info 和来自 down osd 的 info,并处理 stray 的 osd。
2.2 update_history
// Merge `new_history` into info.history and update the state derived from
// it (dirty flags, past_intervals, prior_readable_until_ub). The listener's
// on_info_history_change() hook is invoked on every call, whether or not the
// merge changed anything.
void PeeringState::update_history(const pg_history_t &new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);

  // merge() is treated as returning true when it changed info.history.
  const bool history_advanced = info.history.merge(new_history);
  if (history_advanced)
  {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;

    // last_epoch_clean has reached the current interval, so the recorded
    // past intervals are dropped.
    const bool clean_in_current_interval =
        info.history.last_epoch_clean >= info.history.same_interval_since;
    if (clean_in_current_interval)
    {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }

    // NOTE(review): the log line below suggests the value is
    // mnow + info.history.prior_readable_until_ub -- confirm against
    // pg_history_t::get_prior_readable_until_ub().
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero())
    {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }

  // NOTE(review): the original note says this is where scrub gets
  // (re)registered -- confirm in the listener implementation.
  pl->on_info_history_change();
}
2.3 get_mnow
// Returns the value of osd->get_mnow(); this function only forwards the call.
ceph::signedspan PG::get_mnow()
{
return osd->get_mnow();
}
// Returns the current ceph::mono_clock time minus osd->startup_time,
// i.e. "now" expressed as an offset from the OSD's startup time.
ceph::signedspan OSDService::get_mnow()
{
return ceph::mono_clock::now() - osd->startup_time;
}
显然,get_mnow 用于获取当前时刻(now)相对于 osd 启动时刻(startup_time)的时间偏移。
2.4 refresh_prior_readable_until_ub
// Re-express the upper bound `ub` as a duration measured from `now` and
// store it in prior_readable_until_ub.
//
// @param now  now, relative to osd startup_time
// @param ub   ub, relative to osd startup_time
// @return zero if the bound has already been reached (now >= ub),
//         otherwise `ub` unchanged.
ceph::signedspan refresh_prior_readable_until_ub(
    ceph::signedspan now,
    ceph::signedspan ub)
{
  if (now >= ub)
  {
    // prior interval(s) are unreadable; we can zero the upper bound
    prior_readable_until_ub = ceph::signedspan::zero();
    return ceph::signedspan::zero();
  }

  // Bound still lies in the future: keep only the remaining duration.
  prior_readable_until_ub = ub - now;
  return ub;
}
这个函数的 commit message 如下:
Before we share pg_history_t, refresh the prior_readable_until_ub to be a simple duration from now, so that it is completely clock-independent. The receiver can interpret it based on the receive time for the message, which loses a bit of precision but is safe since this is an upper bound.
主要是将 prior_readable_until_ub 设置为距 now 的 duration(即 ub - now)。而当 now 大于等于 ub(上界已经过期)时,会将 prior_readable_until_ub 设为 ceph::signedspan::zero()。