[Ceph] Peering/GetInfo

350 阅读2分钟

前半部分见 [Ceph] Peering/GetInfo/build_prior

1 GetInfo

// Entry constructor of the GetInfo state (registered under the name
// "Started/Primary/Peering/GetInfo"). It rebuilds the prior set, issues info
// queries through get_infos(), and then posts IsDown or GotInfo if either
// condition already holds. Parts of the body are elided in this excerpt.
PeeringState::GetInfo::GetInfo(my_context ctx)
    : my_base(ctx),
      NamedState(context<PeeringMachine>().state_history, "Started/Primary/Peering/GetInfo")
{
  // ... (elided in this excerpt)
  // prior_set is stored on the Peering state; bind a reference to it.
  PastIntervals::PriorSet &prior_set = context<Peering>().prior_set;
  // ... (elided in this excerpt)
  prior_set = ps->build_prior();
  // Remember which OSDs the freshly built prior set lists as down.
  ps->prior_readable_down_osds = prior_set.down;
  if (ps->prior_readable_down_osds.empty())
  {
    psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
               << dendl;
    ps->clear_prior_readable_until_ub();
  }

  ps->reset_min_peer_features();
  // Send info queries to the probe targets; this fills peer_info_requested.
  get_infos();
  if (prior_set.pg_down)
  {
    // The prior set marks the PG as down: report it instead of continuing.
    post_event(IsDown());
  }
  else if (peer_info_requested.empty())
  {
    // No query is outstanding, so there is nothing to wait for.
    post_event(GotInfo());
  }
}

在完成 ps->build_prior() 后,调用 get_infos() 函数。
prior_set.pg_down 为真表示没有足够的 osd 进行 peering,进入 Down 状态;否则,如果 peer_info_requested 为空,表示 info 获取完毕,进入 GetLog。

prior_set 其实是 Peering 这个状态的一个成员变量,GetInfo 是 Peering 的子状态。GetInfo 使用 context<Peering>() 获取指向父状态(即 Peering)的引用,然后获取其成员 prior_set 的引用。

1.1 get_infos

void PeeringState::GetInfo::get_infos()
{
  DECLARE_LOCALS;
  PastIntervals::PriorSet &prior_set = context<Peering>().prior_set;

  ps->blocked_by.clear();
  for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it)
  {
    pg_shard_t peer = *it;
    if (peer == ps->pg_whoami)
    {
      continue;
    }
    // 重复 peer,忽略
    if (ps->peer_info.count(peer))
    {
      psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
      continue;
    }
    // 已发过获取 info 请求,忽略
    if (peer_info_requested.count(peer))
    {
      psdout(10) << " already requested info from osd." << peer << dendl;
      ps->blocked_by.insert(peer.osd);
    }
    // osd down,忽略
    else if (!ps->get_osdmap()->is_up(peer.osd))
    {
      psdout(10) << " not querying info from down osd." << peer << dendl;
    }
    else
    {
      psdout(10) << " querying info from osd." << peer << dendl;
      context<PeeringMachine>().send_query(
          peer.osd,
          pg_query_t(pg_query_t::INFO,
                     it->shard, ps->pg_whoami.shard,
                     ps->info.history,
                     ps->get_osdmap_epoch()));
      peer_info_requested.insert(peer);
      ps->blocked_by.insert(peer.osd);
    }
  }

  ps->check_prior_readable_down_osds(ps->get_osdmap());

  pl->publish_stats_to_osd();
}

get_infos() 主要是遍历 build_prior() 计算得出的 osd,筛选出处于 up 状态的 osd,向这些 osd 查询 info,并将这些 osd 插入 blocked_by。

2 处理消息

// Handles an info notification (MNotifyRec) from a peer while in GetInfo.
//
// The sender is removed from the outstanding-request bookkeeping, the info is
// handed to proc_replica_info(), and if that reports new information:
//   - when last_epoch_started advanced, the prior set is rebuilt, requests to
//     peers no longer in the probe set are dropped, and get_infos() is re-run;
//   - the sender's feature bits are applied;
//   - GotInfo is posted once nothing is outstanding and the PG is not down.
// The event is always discarded afterwards.
boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec &infoevt)
{
  DECLARE_LOCALS;

  // The reply arrived: stop tracking the sender as outstanding / blocking.
  auto pending = peer_info_requested.find(infoevt.from);
  if (pending != peer_info_requested.end())
  {
    peer_info_requested.erase(pending);
    ps->blocked_by.erase(infoevt.from.osd);
  }

  // Snapshot last_epoch_started so we can tell whether processing moved it.
  const epoch_t les_before = ps->info.history.last_epoch_started;
  const bool got_new_info = ps->proc_replica_info(
      infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent);
  if (!got_new_info)
  {
    return discard_event();
  }

  // we got something new ...
  PastIntervals::PriorSet &prior_set = context<Peering>().prior_set;
  if (les_before < ps->info.history.last_epoch_started)
  {
    psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
    prior_set = ps->build_prior();
    ps->prior_readable_down_osds = prior_set.down;

    // filter out any osds that got dropped from the probe set from
    // peer_info_requested.  this is less expensive than restarting
    // peering (which would re-probe everyone).
    for (auto it = peer_info_requested.begin(); it != peer_info_requested.end();)
    {
      if (prior_set.probe.count(*it) != 0)
      {
        ++it;
        continue;
      }
      psdout(20) << " dropping osd." << *it << " from info_requested, no longer in probe set" << dendl;
      it = peer_info_requested.erase(it);
    }
    get_infos();
  }

  psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
             << hex << infoevt.features << dec << dendl;
  ps->apply_peer_features(infoevt.features);

  // are we done getting everything?
  const bool all_infos_in = peer_info_requested.empty();
  if (all_infos_in && !prior_set.pg_down)
  {
    psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
    psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
    psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
    post_event(GotInfo());
  }

  return discard_event();
}

该函数处理查询到的 info。收到 info 后,先从 peer_info_requested 和 blocked_by 中删除对应的 osd。接着处理 info 与副本相关的细节,详见 2.1 节。

如果 proc_replica_info 返回 true,表示获取到了新的 info。先比较 proc_replica_info 前后的 ps->info.history.last_epoch_started,如果在此期间有副本更新了 ps->info.history.last_epoch_started,就调用 build_prior,计算新的 probe 列表,并删除 peer_info_requested 中不再出现在 prior_set.probe 里的 pg_shard_t。然后调用 get_infos(),按新的 probe 列表获取 info。

最后当 info 获取完毕(peer_info_requested 为空)并且 prior_set.pg_down 为假(即 pg 不处于 down 状态)时,抛出 GotInfo 事件,进入 GetLog。

2.1 proc_replica_info

// Records pg info received from a replica.
//
// @param from        shard the info came from
// @param oinfo       the info it sent
// @param send_epoch  epoch at which the sender sent it
// @return false if the info was discarded (same last_update as what we
//         already hold for that peer, or the sender has not been up since
//         send_epoch); true once the info has been stored in peer_info.
bool PeeringState::proc_replica_info(
    pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  // Decide up front whether this peer is new to us; the lookup result is
  // needed again after peer_info has been updated below.
  const auto known = peer_info.find(from);
  const bool first_info_from_peer = (known == peer_info.end());

  if (!first_info_from_peer && known->second.last_update == oinfo.last_update)
  {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  const bool sender_still_up = get_osdmap()->has_been_up_since(from.osd, send_epoch);
  if (!sender_still_up)
  {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?  (the sender is in neither the up set nor the acting set)
  const bool in_up_or_acting = is_up(from) || is_acting(from);
  if (!in_up_or_acting)
  {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean())
    {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (first_info_from_peer)
  {
    update_heartbeat_peers();
  }

  return true;
}

proc_replica_info 主要是排除重复的 info 和来自 down osd 的 info,并处理 stray 的 osd。

2.2 update_history

// Merges new_history into info.history and updates the state that depends on
// it (dirty flags, past_intervals, prior_readable_until_ub), then notifies the
// listener that the history may have changed.
void PeeringState::update_history(const pg_history_t &new_history)
{
  auto mnow = pl->get_mnow();
  // Refresh the upper bound stored in the history against the current time
  // before merging.
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);

  // merge() folds new_history into info.history; a true result is treated as
  // "our history advanced" (see the log message below).
  const bool history_advanced = info.history.merge(new_history);
  if (history_advanced)
  {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;

    // last_epoch_clean has reached the start of the current interval, so the
    // recorded past intervals are dropped.
    const bool clean_in_current_interval =
        info.history.last_epoch_clean >= info.history.same_interval_since;
    if (clean_in_current_interval)
    {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }

    // Re-read the upper bound from the merged history.
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    const bool have_upper_bound = prior_readable_until_ub != ceph::signedspan::zero();
    if (have_upper_bound)
    {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }

  // NOTE(review): the original note says this hook (re)registers scrub --
  // confirm against the listener implementation.
  pl->on_info_history_change();
}

2.3 get_mnow

// Returns the current time as reported by the owning OSD service; simply
// forwards to osd->get_mnow().
ceph::signedspan PG::get_mnow()
{
  return osd->get_mnow();
}

// Returns the time elapsed on ceph::mono_clock since osd->startup_time, i.e.
// "now" expressed as an offset from OSD startup.
ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

显然,get_mnow 用于获取 now 相对于 osd startup 的时间偏移。

2.4 refresh_prior_readable_until_ub

/// Re-expresses the upper bound `ub` as a duration measured from `now` and
/// stores that duration in prior_readable_until_ub.
///
/// @param now  current time, relative to osd startup_time
/// @param ub   upper bound, relative to osd startup_time
/// @return zero when `now` has reached or passed `ub` (the stored value is
///         zeroed as well); otherwise `ub` unchanged, with the stored value
///         set to `ub - now`.
ceph::signedspan refresh_prior_readable_until_ub(
    ceph::signedspan now,
    ceph::signedspan ub)
{
  const bool expired = now >= ub;
  if (expired)
  {
    // prior interval(s) are unreadable; we can zero the upper bound
    prior_readable_until_ub = ceph::signedspan::zero();
    return ceph::signedspan::zero();
  }

  // Still in the future: keep only the remaining duration.
  prior_readable_until_ub = ub - now;
  return ub;
}

这个函数的 commit message 如下:

Before we share pg_history_t, refresh the prior_readable_until_ub to be a simple duration from now, so that it is completely clock-independent. The receiver can interpret it based on the receive time for the message, which loses a bit of precision but is safe since this is an upper bound.

主要是将 prior_readable_until_ub 设置为距 now 的 duration。所以当 now 大于等于 ub(即上界已经过期)时,会将 prior_readable_until_ub 设为 ceph::signedspan::zero();否则设为 ub - now。