[Ceph] Peering::GetLog 阅读笔记

915 阅读7分钟

1 GetLog

// Entry into the Started/Primary/Peering/GetLog state.
//
// Chooses the shard holding the authoritative log via choose_acting(), then:
//  - posts NeedActingChange / IsIncomplete when no usable acting set exists,
//  - posts GotLog when we ourselves hold the authoritative log,
//  - otherwise sends a LOG query to the auth shard and blocks on its reply.
PeeringState::GetLog::GetLog(my_context ctx)
    : my_base(ctx),
      NamedState(
          context<PeeringMachine>().state_history,
          "Started/Primary/Peering/GetLog"),
      msg(0)
{
  context<PeeringMachine>().log_enter(state_name);

  DECLARE_LOCALS;

  ps->log_weirdness();

  // adjust acting?
  if (!ps->choose_acting(auth_log_shard, false,
                         &context<Peering>().history_les_bound))
  {
    // choose_acting failed: a non-empty want_acting means a pg_temp change
    // was requested (-> WaitActingChange); an empty one means peering
    // cannot make progress (-> Incomplete).
    if (!ps->want_acting.empty())
    {
      post_event(NeedActingChange());
    }
    else
    {
      post_event(IsIncomplete());
    }
    return;
  }

  // am i the best?
  if (auth_log_shard == ps->pg_whoami)
  {
    post_event(GotLog());
    return;
  }

  const pg_info_t &best = ps->peer_info[auth_log_shard];

  // am i broken?
  // Our log does not overlap the auth shard's log, so we cannot be
  // log-recovered from it.
  if (ps->info.last_update < best.log_tail)
  {
    psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
    post_event(IsIncomplete());
    return;
  }

  // how much log to request?
  eversion_t request_log_from = ps->info.last_update;
  ceph_assert(!ps->acting_recovery_backfill.empty());
  for (auto p = ps->acting_recovery_backfill.begin();
       p != ps->acting_recovery_backfill.end();
       ++p)
  {
    if (*p == ps->pg_whoami)
      continue;
    // Pick the smallest last_update that is still >= best.log_tail, so the
    // requested log reaches back far enough to log-recover every peer that
    // overlaps the authoritative log.
    pg_info_t &ri = ps->peer_info[*p];
    if (ri.last_update < ps->info.log_tail &&
        ri.last_update >= best.log_tail &&
        ri.last_update < request_log_from)
      request_log_from = ri.last_update;
  }

  // how much?
  psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
  context<PeeringMachine>().send_query(
      auth_log_shard.osd,
      pg_query_t(
          pg_query_t::LOG,
          auth_log_shard.shard, ps->pg_whoami.shard,
          request_log_from, ps->info.history,
          ps->get_osdmap_epoch()));

  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(auth_log_shard.osd);
  pl->publish_stats_to_osd();
}
  1. choose_acting 选出 want_acting,当没选出合适时,若 want_acting 为空,进入 Incomplete,否则进入 WaitActingChange。关于 choose_acting,见第 2 节。

2 choose_acting

/**
 * choose acting
 *
 * calculate the desired acting, and request a change with the monitor
 * if it differs from the current acting.
 *
 * if restrict_to_up_acting=true, we filter out anything that's not in
 * up/acting.  in order to lift this restriction, we need to
 *  1) check whether it's worth switching the acting set any time we get
 *     a new pg info (not just here, when recovery finishes)
 *  2) check whether anything in want_acting went down on each new map
 *     (and, if so, calculate a new want_acting)
 *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
 * TODO!
 */
bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
                                 bool restrict_to_up_acting,
                                 bool *history_les_bound,
                                 bool request_pg_temp_change_only)
{
  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
  all_info[pg_whoami] = info; // include our own info as well

  if (cct->_conf->subsys.should_gather<dout_subsys, 10>())
  {
    for (auto p = all_info.begin(); p != all_info.end(); ++p)
    {
      psdout(10) << __func__ << " all_info osd." << p->first << " "
                 << p->second << dendl;
    }
  }

  // find_best_info (see section 3) picks the shard whose info should
  // serve as the authoritative log.
  auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
                                       history_les_bound);

  // no shard was found that can serve as the authoritative log
  if (auth_log_shard == all_info.end())
  {
    // up != acting, so a pg_temp change is needed; once we return to
    // GetLog we will enter WaitActingChange
    if (up != acting)
    {
      psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
                 << " reverting to up" << dendl;
      want_acting = up;
      vector<int> empty;
      pl->queue_want_pg_temp(empty);
    }
    else
    {
      psdout(10) << __func__ << " failed" << dendl;
      // leave want_acting empty; once we return to GetLog we will
      // enter Incomplete
      ceph_assert(want_acting.empty());
    }
    return false;
  }

  ceph_assert(!auth_log_shard->second.is_incomplete());
  auth_log_shard_id = auth_log_shard->first;

  set<pg_shard_t> want_backfill, want_acting_backfill;
  vector<int> want;
  stringstream ss;
  if (pool.info.is_replicated())
  {
    // select_replicated_primary: see section 2.1
    auto [primary_shard, oldest_log] = select_replicated_primary(
        auth_log_shard,
        cct->_conf.get_val<uint64_t>(
            "osd_force_auth_primary_missing_objects"),
        up,
        up_primary,
        all_info,
        get_osdmap(),
        ss);
    if (pool.info.is_stretch_pool())
    {
      calc_replicated_acting_stretch(
          primary_shard,
          oldest_log,
          get_osdmap()->get_pg_size(info.pgid.pgid),
          acting,
          up,
          up_primary,
          all_info,
          restrict_to_up_acting,
          &want,
          &want_backfill,
          &want_acting_backfill,
          get_osdmap(),
          pool,
          ss);
    }
    else
    {
      calc_replicated_acting(
          primary_shard,
          oldest_log,
          get_osdmap()->get_pg_size(info.pgid.pgid),
          acting,
          up,
          up_primary,
          all_info,
          restrict_to_up_acting,
          &want,
          &want_backfill,
          &want_acting_backfill,
          get_osdmap(),
          pool,
          ss);
    }
  }
  else
  {
    calc_ec_acting(
        auth_log_shard,
        get_osdmap()->get_pg_size(info.pgid.pgid),
        acting,
        up,
        all_info,
        restrict_to_up_acting,
        &want,
        &want_backfill,
        &want_acting_backfill,
        ss);
  }
  psdout(10) << ss.str() << dendl;

  if (!recoverable(want))
  {
    // clear want_acting; once we return to GetLog we will enter Incomplete
    want_acting.clear();
    return false;
  }

  set<pg_shard_t> want_async_recovery;
  if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC))
  {
    if (pool.info.is_erasure())
    {
      choose_async_recovery_ec(
          all_info, auth_log_shard->second, &want, &want_async_recovery,
          get_osdmap());
    }
    else
    {
      choose_async_recovery_replicated(
          all_info, auth_log_shard->second, &want, &want_async_recovery,
          get_osdmap());
    }
  }
  while (want.size() > pool.info.size)
  {
    // async recovery should have taken out as many osds as it can.
    // if not, then always evict the last peer
    // (will get synchronously recovered later)
    psdout(10) << __func__ << " evicting osd." << want.back()
               << " from oversized want " << want << dendl;
    want.pop_back();
  }
  // want != acting: once we return to GetLog we will enter WaitActingChange
  if (want != acting)
  {
    psdout(10) << __func__ << " want " << want << " != acting " << acting
               << ", requesting pg_temp change" << dendl;
    want_acting = want;

    if (!cct->_conf->osd_debug_no_acting_change)
    {
      if (want_acting == up)
      {
        // There can't be any pending backfill if
        // want is the same as crush map up OSDs.
        ceph_assert(want_backfill.empty());
        vector<int> empty;
        pl->queue_want_pg_temp(empty);
      }
      else
        pl->queue_want_pg_temp(want);
    }
    return false;
  }
  // make sure we respect the stretch cluster rules -- and
  // didn't break them with earlier choices!
  const pg_pool_t &pg_pool = pool.info;
  if (pg_pool.is_stretch_pool())
  {
    stringstream ss;
    // stretch mode can block peering here
    if (!pg_pool.stretch_set_can_peer(want, *get_osdmap(), &ss))
    {
      psdout(5) << "peering blocked by stretch_can_peer: " << ss.str() << dendl;
      return false;
    }
  }

  if (request_pg_temp_change_only)
    return true;
  want_acting.clear();
  acting_recovery_backfill = want_acting_backfill;
  psdout(10) << "acting_recovery_backfill is "
             << acting_recovery_backfill << dendl;
  // store the computed results into the PeeringState members
  ceph_assert(
      backfill_targets.empty() ||
      backfill_targets == want_backfill);
  if (backfill_targets.empty())
  {
    // Caller is GetInfo
    backfill_targets = want_backfill;
  }
  // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
  ceph_assert(
      async_recovery_targets.empty() ||
      async_recovery_targets == want_async_recovery ||
      !needs_recovery());
  // needs_recovery(): returns true when num_missing() is non-zero
  if (async_recovery_targets.empty() || !needs_recovery())
  {
    async_recovery_targets = want_async_recovery;
  }
  // Will not change if already set because up would have had to change
  // Verify that nothing in backfill is in stray_set
  for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i)
  {
    ceph_assert(stray_set.find(*i) == stray_set.end());
  }
  psdout(10) << "choose_acting want=" << want << " backfill_targets="
             << want_backfill << " async_recovery_targets="
             << async_recovery_targets << dendl;
  return true;
}

2.1 select_replicated_primary

// Select the primary for a replicated pool: prefer up_primary unless it is
// incomplete, cannot be log-recovered from the authoritative log, or is
// missing too many objects (> force_auth_primary_missing_objects), in which
// case auth_log_shard is chosen instead.  Returns the chosen primary and the
// oldest log entry needed to log-recover the acting_backfill set.
std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
PeeringState::select_replicated_primary(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    uint64_t force_auth_primary_missing_objects,
    const std::vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    const OSDMapRef osdmap,
    ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      // i.e. the primary's data is at least as recent as the tail of the
      // authoritative log, so it can be log-recovered
      primary->second.last_update >=
          auth_log_shard->second.log_tail)
  {
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS))
    {
      auto approx_missing_objects =
          primary->second.stats.stats.sum.num_objects_missing;
      auto auth_version = auth_log_shard->second.last_update.version;
      auto primary_version = primary->second.last_update.version;
      if (auth_version > primary_version)
      {
        approx_missing_objects += auth_version - primary_version;
      }
      else
      {
        approx_missing_objects += primary_version - auth_version;
      }
      // approx_missing_objects += | auth_version - primary_version |
      if ((uint64_t)approx_missing_objects >
          force_auth_primary_missing_objects)
      {
        primary = auth_log_shard;
        ss << "up_primary: " << up_primary << ") has approximate "
           << approx_missing_objects
           << "(>" << force_auth_primary_missing_objects << ") "
           << "missing objects, osd." << auth_log_shard_id
           << " selected as primary instead"
           << std::endl;
      }
      else
      {
        ss << "up_primary: " << up_primary << ") selected as primary"
           << std::endl;
      }
    }
    else
    {
      ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
    }
  }
  else
  {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  // i.e. reach back far enough to cover any peer that can be
  // log-recovered from the authoritative log
  eversion_t oldest_auth_log_entry =
      std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  return std::make_pair(primary, oldest_auth_log_entry);
}

commit message:

osd: enforce the CRUSH bucket peering constraints on peering

We enforce the rules in choose_acting(), which is pretty simple.

More complicatedly, we add a calc_replicated_acting_stretch to try and extend the acting set with appropriate choices when we would otherwise block peering due to site constraints.

This implementation simply attempt to maximize the number of buckets at rank peering_crush_bucket_barrier -- see comments for more details.

I (Sam) think that this approach can be used to generalize all replicated pgs provided that we start setting peering_bucket_barrier to the failure domain -- TODO. This would fix a long standing deficiency during failure recovery where the selected temp mapping can fail to respect the user's failure domains.

这个函数是在这个 pr 添加的。关于 stretch mode,可以看 docs/rados/operations/stretch-mode.rst,主要是为了改善 ceph 在跨数据中心环境下的副本一致性问题。

参数 force_auth_primary_missing_objects 取自配置 osd_force_auth_primary_missing_objects

Option("osd_force_auth_primary_missing_objects", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(100)
// 超过该值便会强制使用 auth_log_shard 作为临时的 primary
.set_description("Approximate missing objects above which to force auth_log_shard to be primary temporarily"),

select_replicated_primary 根据 missing 的数量(approx_missing_objects += | auth_version - primary_version |),若其大于 osd_force_auth_primary_missing_objects,则强制选择 auth_log_shard 作为 primary;否则仍选择 up_primary。

2.2 calc_replicated_acting_stretch

/**
 * calc_replicated_acting_stretch
 *
 * Choose an acting set using as much of the up set as possible; filling 
 * in the remaining slots so as to maximize the number of crush buckets at
 * level pool.info.peering_crush_bucket_barrier represented.
 *
 * Stretch clusters are a bit special: while they have a "size" the
 * same way as normal pools, if we happen to lose a data center
 * (we call it a "stretch bucket", but really it'll be a data center or
 * a cloud availability zone), we don't actually want to shove
 * 2 DC's worth of replication into a single site -- it won't fit!
 * So we locally calculate a bucket_max, based
 * on the targeted number of stretch buckets for the pool and
 * its size. Then we won't pull more than bucket_max from any
 * given ancestor even if it leaves us undersized.

 * There are two distinct phases: (commented below)
 */
void PeeringState::calc_replicated_acting_stretch(
    map<pg_shard_t, pg_info_t>::const_iterator primary,
    eversion_t oldest_auth_log_entry,
    unsigned size,
    const vector<int> &acting,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    const OSDMapRef osdmap,
    const PGPool &pool,
    ostream &ss)
{
  ceph_assert(want);
  ceph_assert(acting_backfill);
  ceph_assert(backfill);
  ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
     << std::endl;

  // true if osd is already part of the want set
  auto used = [want](int osd) {
    return std::find(want->begin(), want->end(), osd) != want->end();
  };

  // usable == complete and log-recoverable from the authoritative log
  auto usable_info = [&](const auto &cur_info) mutable {
    return !(cur_info.is_incomplete() ||
             cur_info.last_update < oldest_auth_log_entry);
  };

  auto osd_info = [&](int osd) mutable -> const pg_info_t & {
    pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
    const pg_info_t &cur_info = all_info.find(cand)->second;
    return cur_info;
  };

  auto usable_osd = [&](int osd) mutable {
    return usable_info(osd_info(osd));
  };

  std::map<int, bucket_candidates_t> ancestors;
  auto get_ancestor = [&](int osd) mutable {
    // look up the ancestor bucket of type peering_crush_bucket_barrier
    int ancestor = osdmap->crush->get_parent_of_type(
        osd,
        pool.info.peering_crush_bucket_barrier,
        pool.info.crush_rule);
    return ancestors[ancestor];
  };

  // bucket_max caps how many OSDs we may pull from any single bucket
  // (ceil(size / peering_crush_bucket_target))
  unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
  if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size)
  {
    ++bucket_max;
  }

  /* 1) Select all usable osds from the up set as well as the primary
   * 
   * We also stash any unusable osds from up into backfill.
   */
  auto add_required = [&](int osd) {
    // only add the osd if it is not already in want
    if (!used(osd))
    {
      want->push_back(osd);
      acting_backfill->insert(
          pg_shard_t(osd, shard_id_t::NO_SHARD));
      // void inc_selected() { selected++; }
      get_ancestor(osd).inc_selected();
    }
  };
  add_required(primary->first.osd);
  ss << " osd " << primary->first.osd << " primary accepted "
     << osd_info(primary->first.osd) << std::endl;
  for (auto upcand : up)
  {
    auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD);
    auto &curinfo = osd_info(upcand);
    // a usable osd goes into want; an unusable one goes into backfill
    // (but both go into acting_backfill)
    if (usable_osd(upcand))
    {
      ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl;
      add_required(upcand);
    }
    else
    {
      ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl;
      backfill->insert(upshard);
      acting_backfill->insert(upshard);
    }
  }

  if (want->size() >= pool.info.size)
  {
    ss << " up set sufficient" << std::endl;
    return;
  }
  ss << " up set insufficient, considering remaining osds" << std::endl;

  /* 2) Fill out remaining slots from usable osds in all_info
   *    while maximizing the number of ancestor nodes at the
   *    barrier_id crush level.
   */
  {
    std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
    /* To do this, we first filter the set of usable osd into an ordered
     * list of usable osds
     */
    auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
      return std::make_tuple(
          !is_acting /* acting should sort first */,
          info.last_update);
    };
    for (auto &cand : acting)
    {
      auto &cand_info = osd_info(cand);
      if (!used(cand) && usable_info(cand_info))
      {
        ss << " acting candidate " << cand << " " << cand_info << std::endl;
        candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
      }
    }
    if (!restrict_to_up_acting)
    {
      for (auto &[cand, info] : all_info)
      {
        if (!used(cand.osd) && usable_info(info))
        {
          ss << " other candidate " << cand << " " << info << std::endl;
          candidates.push_back(
              std::make_pair(get_osd_ord(false, info), cand.osd));
        }
      }
    }
    // after sorting: acting members come first; within acting, osds with a
    // smaller last_update sort earlier
    std::sort(candidates.begin(), candidates.end());

    // We then filter these candidates by ancestor
    std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
      get_ancestor(cand.second).add_osd(cand.first, cand.second);
    });
  }

  // take the best remaining candidate out of this ancestor bucket and
  // append it to want / acting_backfill
  auto pop_ancestor = [&](auto &ancestor) {
    ceph_assert(!ancestor.is_empty());
    auto osd = ancestor.pop_osd();

    ss << " accepting candidate " << osd << std::endl;

    ceph_assert(!used(osd));
    ceph_assert(usable_osd(osd));

    want->push_back(osd);
    acting_backfill->insert(
        pg_shard_t(osd, shard_id_t::NO_SHARD));
    ancestor.inc_selected();
  };

  /* Next, we use the ancestors map to grab a descendant of the
   * peering_crush_mandatory_member if not already represented.
   *
   * TODO: using 0 here to match other users.  Prior to merge, I
   * expect that this and other users should instead check against
   * CRUSH_ITEM_NONE.
   */
  // peering_crush_mandatory_member: see section 2.2.1
  if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE)
  {
    auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
    if (aiter != ancestors.end() &&
        aiter->second.get_num_selected())
    {
      ss << " adding required ancestor " << aiter->first << std::endl;
      ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
      pop_ancestor(aiter->second);
    }
  }

  /* We then place the ancestors in a heap ordered by fewest selected
   * and then by the ordering token of the next osd */
  bucket_heap_t aheap;
  std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
    aheap.push_if_nonempty(anc.second);
  });

  /* and pull from this heap until it's empty or we have enough. */
  while (!aheap.is_empty() && want->size() < pool.info.size)
  {
    auto next = aheap.pop();
    pop_ancestor(next.get());
    if (next.get().get_num_selected() < bucket_max)
    {
      aheap.push_if_nonempty(next);
    }
  }

  /* The end result is that we should have as many buckets covered as
   * possible while respecting up, the primary selection,
   * the pool size (given bucket count constraints),
   * and the mandatory member.
   */
}
2.2.1 peering_crush_mandatory_member
// If non-zero, require OSDs in at least this many different instances...
uint32_t peering_crush_bucket_count = 0;
// of this bucket type...
uint32_t peering_crush_bucket_barrier = 0;
// including this one
int32_t peering_crush_mandatory_member = CRUSH_ITEM_NONE;

举个例子:

  • peering_crush_bucket_count = 3
  • peering_crush_bucket_barrier = host
  • peering_crush_mandatory_member = dc

osd 至少要分布在 3 个 host 上,且其中必须有一个位于指定的 dc 中。

2.3 calc_replicated_acting

/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
    map<pg_shard_t, pg_info_t>::const_iterator primary,
    eversion_t oldest_auth_log_entry,
    unsigned size,
    const vector<int> &acting,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    const OSDMapRef osdmap,
    const PGPool &pool,
    ostream &ss)
{
  ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
     << std::endl;

  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up)
  {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    // true means this osd is missing data: it goes into backfill,
    // not into want
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry)
    {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    }
    else
    {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size)
  {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting)
  {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    auto up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    // unlike the up case, acting osds that are missing data are
    // simply rejected
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry)
    {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    }
    else
    {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
                             const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  // prefer the osds that have received the most writes
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p : candidate_by_last_update)
  {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size)
    {
      return;
    }
  }

  if (restrict_to_up_acting)
  {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info)
  {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    auto up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    auto acting_it = find(
        acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry)
    {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    }
    else
    {
      candidate_by_last_update.emplace_back(
          i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty())
  {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p : candidate_by_last_update)
  {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size)
    {
      return;
    }
  }
}

计算 acting set,优先选择 up[0];如果 up[0] 是 incomplete,则选择拥有最新数据的 osd。

2.4 calc_ec_acting

// Calculate the desired acting set for an EC pool: for each positional
// shard, prefer up[i], then acting[i], then any stray info, accepting a
// candidate only when it is complete and log-recoverable from the
// authoritative log; unusable up[i] entries are placed in backfill.
void PeeringState::calc_ec_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    const vector<int> &up,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *_want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  // shard -> pg_shard_t mapping
  map<shard_id_t, set<pg_shard_t>> all_info_by_shard;
  for (auto i = all_info.begin();
       i != all_info.end();
       ++i)
  {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i)
  {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        // last_update >= the authoritative log's log_tail means this osd
        // can be recovered from the authoritative log
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
            auth_log_shard->second.log_tail)
    {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE)
    {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
            auth_log_shard->second.log_tail)
    {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    }
    else if (!restrict_to_up_acting)
    {
      for (auto j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j)
      {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
                auth_log_shard->second.log_tail)
        {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i)
  {
    if (want[i] != CRUSH_ITEM_NONE)
    {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

calc_ec_acting 与 calc_replicated_acting 的逻辑相似,主要是写法不同。但有一个区别,多副本模式中,所有对象在不同 osd 上的内容都是一样的,但对于 ec 而言,每个 osd 上的内容都是不同的。所以需要 all_info_by_shard 来按 shard 来归类 info,自然也不需要像 calc_replicated_acting 里那样都要判断是否选了重复的 osd 加进了 want。

2.5 choose_async_recovery_ec

// For an EC pool, move shards whose recovery cost (approximate missing
// objects) exceeds osd_async_recovery_min_cost out of want and into
// async_recovery, as long as the remaining want stays recoverable.
void PeeringState::choose_async_recovery_ec(
    const map<pg_shard_t, pg_info_t> &all_info,
    const pg_info_t &auth_info,
    vector<int> *want,
    set<pg_shard_t> *async_recovery,
    const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t>> candidates_by_cost;
  for (uint8_t i = 0; i < want->size(); ++i)
  {
    if ((*want)[i] == CRUSH_ITEM_NONE)
      continue;

    // Considering log entries to recover is accurate enough for
    // now. We could use minimum_to_decode_with_cost() later if
    // necessary.
    pg_shard_t shard_i((*want)[i], shard_id_t(i));
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // for ec pools we rollback all entries past the authoritative
    // last_update *before* activation. This is relatively inexpensive
    // compared to recovery, since it is purely local, so treat shards
    // past the authoritative last_update the same as those equal to it.
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS))
    {
      auto approx_missing_objects =
          shard_info.stats.stats.sum.num_objects_missing;
      // only the auth_version > candidate_version case is handled here:
      // auth_version < candidate_version means the osd holds data newer
      // than the authoritative log, but EC recovery rolls entries back
      // to the minimum (authoritative) version, so that case costs nothing
      if (auth_version > candidate_version)
      {
        approx_missing_objects += auth_version - candidate_version;
      }
      if (static_cast<uint64_t>(approx_missing_objects) >
          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost"))
      {
        candidates_by_cost.emplace(approx_missing_objects, shard_i);
      }
    }
    else
    {
      if (auth_version > candidate_version &&
          (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost"))
      {
        candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
      }
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;

  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit)
  {
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
    if (recoverable(candidate_want))
    {
      want->swap(candidate_want);
      async_recovery->insert(cur_shard);
    }
  }
  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}

2.5 choose_async_recovery_replicated

void PeeringState::choose_async_recovery_replicated(
    const map<pg_shard_t, pg_info_t> &all_info,
    const pg_info_t &auth_info,
    vector<int> *want,
    set<pg_shard_t> *async_recovery,
    const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t>> candidates_by_cost;
  for (auto osd_num : *want)
  {
    pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // use the approximate magnitude of the difference in length of
    // logs plus historical missing objects as the cost of recovery
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS))
    {
      auto approx_missing_objects =
          shard_info.stats.stats.sum.num_objects_missing;
      // 和 ec 的主要区别是在这里,副本模式下会尽可能选择有更新数据的 osd
      if (auth_version > candidate_version)
      {
        approx_missing_objects += auth_version - candidate_version;
      }
      else
      {
        approx_missing_objects += candidate_version - auth_version;
      }
      if (static_cast<uint64_t>(approx_missing_objects) >
          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost"))
      {
        candidates_by_cost.emplace(approx_missing_objects, shard_i);
      }
    }
    else
    {
      size_t approx_entries;
      if (auth_version > candidate_version)
      {
        approx_entries = auth_version - candidate_version;
      }
      else
      {
        approx_entries = candidate_version - auth_version;
      }
      if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost"))
      {
        candidates_by_cost.insert(make_pair(approx_entries, shard_i));
      }
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;
  // candidates_by_cost 按缺失数据的数量按升序排序,优先选择缺失最多的 shard
  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit)
  {
    if (want->size() <= pool.info.min_size)
    {
      break;
    }
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it)
    {
      if (*it == cur_shard.osd)
      {
        candidate_want.erase(it);
        want->swap(candidate_want);
        async_recovery->insert(cur_shard);
        break;
      }
    }
  }
  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}

3 find_best_info

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update (for ec pools: older, since ec rolls back)
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 *
 * Returns infos.end() when no acceptable authoritative info exists.
 * *history_les_bound is set to true when the les bound came only from
 * history (no complete peer confirmed it).
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
    const map<pg_shard_t, pg_info_t> &infos,
    bool restrict_to_up_acting,
    bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  // Pass 1: compute the maximum last_epoch_started across all infos.
  epoch_t max_last_epoch_started_found = 0;
  for (auto i = infos.begin(); i != infos.end(); ++i)
  {
    // osd_find_best_info_ignore_history_les is a developer-only escape
    // hatch (documented as "AN EXTREMELY DANGEROUS OPTION THAT SHOULD
    // ONLY BE USED AT THE DIRECTION OF A DEVELOPER") that skips the
    // history-derived bound entirely.
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started)
    {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started)
    {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  // Pass 2: smallest last_update among peers whose les reaches the bound.
  eversion_t min_last_update_acceptable = eversion_t::max();
  for (auto i = infos.begin(); i != infos.end(); ++i)
  {
    // an incomplete info may still have les >= max_last_epoch_started
    if (max_last_epoch_started_found <= i->second.last_epoch_started)
    {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  auto best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (auto p = infos.begin(); p != infos.end(); ++p)
  {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end())
    {
      best = p;
      continue;
    }
    // Prefer newer last_update
    // require_rollback() returns true for ec pools
    if (pool.info.require_rollback())
    {
      // ec pgs roll back divergent entries, so the SMALLEST last_update
      // is the authoritative one
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update)
      {
        best = p;
        continue;
      }
    }
    else
    {
      // replicated pgs are the opposite: prefer the LARGEST last_update
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update)
      {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail)
    {
      continue;
    }
    else if (p->second.log_tail < best->second.log_tail)
    {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing())
    {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    }
    else if (p->second.has_missing() && !best->second.has_missing())
    {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    }
    else
    {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami)
    {
      // use __func__ like the other trace lines in this function; the
      // previous hardcoded "calc_acting" prefix misattributed the message
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}

find_best_info 对副本的 pg 而言,主要是找出 last_update 最大的 info;但对 ec pg 则相反,选择 last_update 最小的。若有多个 last_update 相同的,就选择 log_tail 尽量小的,以获取更多的日志。若仍有多个候选,则当 best 有缺失数据而某候选没有缺失数据时,用该候选替代 best。最后,在其他条件都相同的情况下,若候选正是当前主(通常即调用者自身),则优先选择当前主。

4 处理收到的日志

/*
 * Handle an incoming log message while waiting in GetLog.
 *
 * Only the log we requested from auth_log_shard is accepted; a log from
 * any other peer is stale and simply dropped. On a match we stash the
 * message and queue GotLog so the merge happens in that reaction.
 */
boost::statechart::result PeeringState::GetLog::react(const MLogRec &logevt)
{
  ceph_assert(!msg);  // we only ever request (and hold) one master log
  if (logevt.from == auth_log_shard) {
    psdout(10) << "GetLog: received master log from osd."
               << logevt.from << dendl;
    msg = logevt.msg;  // consumed later by react(GotLog)
    post_event(GotLog());
  } else {
    psdout(10) << "GetLog: discarding log from "
               << "non-auth_log_shard osd." << logevt.from << dendl;
  }
  return discard_event();
}

// Leave the GetLog state: merge the authoritative log (if one was
// fetched from a peer), start a flush, and transition to GetMissing.
boost::statechart::result PeeringState::GetLog::react(const GotLog &)
{
  DECLARE_LOCALS;
  psdout(10) << "leaving GetLog" << dendl;
  // msg is still null when this primary itself was chosen as the
  // auth_log_shard (see GetLog's constructor); nothing to merge then.
  if (msg)
  {
    psdout(10) << "processing master log" << dendl;
    ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
                        msg->info, std::move(msg->log), std::move(msg->missing),
                        auth_log_shard);
  }
  // flush prior work before moving on to querying peers' missing sets
  ps->start_flush(context<PeeringMachine>().get_cur_transaction());
  return transit<GetMissing>();
}

/*
 * proc_master_log
 *
 * Incorporate the authoritative log received from shard `from`:
 * merge it into our own log, record the peer's info and missing set,
 * and adopt the peer's newer epoch markers if they exceed ours.
 * Only valid on the primary before the PG is peered.
 */
void PeeringState::proc_master_log(
    ObjectStore::Transaction &t, pg_info_t &oinfo,
    pg_log_t &&olog, pg_missing_t &&omissing, pg_shard_t from)
{
  psdout(10) << "proc_master_log for osd." << from << ": "
             << olog << " " << omissing << dendl;
  ceph_assert(!is_peered() && is_primary());

  // merge log into our own log to build master log.  no need to
  // make any adjustments to their missing map; we are taking their
  // log to be authoritative (i.e., their entries are by definitely
  // non-divergent).
  merge_log(t, oinfo, std::move(olog), from);
  // record the received info in peer_info so later stages see it
  peer_info[from] = oinfo;
  psdout(10) << " peer osd." << from << " now " << oinfo
             << " " << omissing << dendl;
  might_have_unfound.insert(from);

  // reconcile divergence between primary and the auth shard:
  // adopt the peer's epoch markers when they are newer than ours
  // See doc/dev/osd_internals/last_epoch_started
  if (oinfo.last_epoch_started > info.last_epoch_started)
  {
    info.last_epoch_started = oinfo.last_epoch_started;
    dirty_info = true;
  }
  if (oinfo.last_interval_started > info.last_interval_started)
  {
    info.last_interval_started = oinfo.last_interval_started;
    dirty_info = true;
  }
  update_history(oinfo.history);
  ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
              info.last_epoch_started >= info.history.last_epoch_started);

  // remember the peer's missing set for the recovery stages
  peer_missing[from].claim(std::move(omissing));
}

状态机收到日志后,主要是合并收到的日志,然后检查来自权威 osd 的 info 中的 last_epoch_started / last_interval_started 是否比自己的新,如果是,就更新本地 info 并把 dirty_info 标记为 true。