Calico 源码分析:CNI_COMMAND=ADD

825 阅读7分钟

根据CNI Spec,当环境变量CNI_COMMAND=ADD时,容器运行时(在Kubernetes中由kubelet触发)要求CNI插件把容器加入网络,也就是为容器添加网络设备。

Calico是如何实现为容器添加网络设备的呢?下面简单分析一下源码,记录自己的理解。

执行CNI_COMMAND=ADD操作时,需要传入一些环境变量,并通过stdin传入CNI配置。

假设传入下面的CNI的配置文件:

  • /etc/cni/net.d/10-calico.conflist
{
  "name": "k8s-pod-network",
  "cniVersion": "0.3.1",
  "plugins": [
    {
      "type": "calico",
      "log_level": "info",
      "log_file_path": "/var/log/calico/cni/cni.log",
      "etcd_endpoints": "https://192.168.10.8:2379",
      "etcd_key_file": "/etc/cni/net.d/calico-tls/etcd-key",
      "etcd_cert_file": "/etc/cni/net.d/calico-tls/etcd-cert",
      "etcd_ca_cert_file": "/etc/cni/net.d/calico-tls/etcd-ca",
      "mtu": 0,
      "ipam": {
          "type": "calico-ipam"
      },
      "policy": {
          "type": "k8s"
      },
      "kubernetes": {
          "kubeconfig": "/etc/cni/net.d/calico-kubeconfig"
      }
    },
    {
      "type": "portmap",
      "snat": true,
      "capabilities": {"portMappings": true}
    },
    {
      "type": "bandwidth",
      "capabilities": {"bandwidth": true}
    }
  ]
}

源码分析

代码入口点

  • cni-plugin/cmd/calico/calico.go
// main selects the routine to run from the base name of the executable
// (os.Args[0]) and panics when the name is not recognised.
func main() {
	// Use the name of the binary to determine which routine to run.
	_, filename := filepath.Split(os.Args[0])
	switch filename {
	case "calico", "calico.exe":
		plugin.Main(VERSION) // => CNI_COMMAND=ADD takes this branch
	
	default:
		panic("Unknown binary name: " + filename)
	}
}
  • cni-plugin/pkg/plugin/plugin.go
// Main is the plugin entry point called from main (abridged excerpt; earlier
// set-up is elided). It hands the command handlers, the list of supported CNI
// spec versions and an "about" string containing version to skel.PluginMain.
func Main(version string) {
        // ...
        
        // Register the CNI command handlers. Three handlers are passed
        // explicitly (cmdAdd, cmdDummyCheck, cmdDel); the supported-version
        // list is passed alongside them (presumably it serves the CNI VERSION
        // command — confirm against the skel package documentation).
	skel.PluginMain(cmdAdd, cmdDummyCheck, cmdDel,
		cniSpecVersion.PluginSupports("0.1.0", "0.2.0", "0.3.0", "0.3.1", "0.4.0", "1.0.0"),
		"Calico CNI plugin "+version)
}

这里只分析cmdAdd的实现,cmdAdd用于给容器创建网卡。这个函数非常长,下面的代码已省略不重要的部分。

// cmdAdd handles CNI_COMMAND=ADD (abridged excerpt). It decodes the network
// config from args.StdinData, builds the WEP identifiers, lists existing
// WorkloadEndpoints by name prefix, delegates to k8s.CmdAddK8s when the
// orchestrator is Kubernetes, and finally prints the result in the format of
// conf.CNIVersion.
//
// NOTE(review): ctx and nodename are used but never defined in this excerpt;
// their set-up is presumably among the elided parts.
func cmdAdd(args *skel.CmdArgs) (err error) {

        // Read the configuration from stdin. It usually resembles the content
        // of /etc/cni/net.d/10-calico.conflist: the format differs somewhat,
        // but the content is largely the same.
	// Unmarshal the network config, and perform validation
	conf := types.NetConf{}
	if err := json.Unmarshal(args.StdinData, &conf); err != nil {
		return fmt.Errorf("failed to load netconf: %v", err)
	}
        
        // WEP is short for "workload endpoint". This builds the coordinates
        // that identify a WEP: node, orchestrator (usually k8s), pod name, etc.
	// Extract WEP identifiers such as pod name, pod namespace (for k8s), containerID, IfName.
	// NOTE(review): the error returned here is not checked in this excerpt; it is
	// overwritten by the CreateClient call below (check presumably elided).
	wepIDs, err := utils.GetIdentifiers(args, nodename)



	calicoClient, err := utils.CreateClient(conf)
	if err != nil {
		err = fmt.Errorf("error creating calico client: %v", err)
		return
	}


	// Remove the endpoint field (IfName) from the wepIDs so we can get a WEP name prefix.
	// We use the WEP name prefix (e.g. prefix: "node1-k8s-mypod--1-", full name: "node1-k8s-mypod--1-eth0"
	// to list all the WEPs so if we have a WEP with a different IfName (e.g. "node1-k8s-mypod--1-eth1")
	// we could still get that.
	wepIDs.Endpoint = ""

	// Calculate the workload name prefix from the WEP specific identifiers
	// for the given orchestrator.
        // The prefix typically looks like "node1-k8s-mypod--1-".
	wepPrefix, err := wepIDs.CalculateWorkloadEndpointName(true)
	if err != nil {
		err = fmt.Errorf("error constructing WorkloadEndpoint prefix: %s", err)
		return
	}

        // List the WorkloadEndpoints through the Calico client.
        // NOTE(review): which datastore this request reaches depends on how
        // the client was built from conf — confirm before describing it as a
        // Kubernetes CRD query.
	// Check if there's an existing endpoint by listing the existing endpoints based on the WEP name prefix.
	endpoints, err := calicoClient.WorkloadEndpoints().List(ctx, options.ListOptions{Name: wepPrefix, Namespace: wepIDs.Namespace, Prefix: true})
	if err != nil {
		return
	}

        // The logic that follows fills in / updates fields of the
        // libapi.WorkloadEndpoint. The code that matches an entry from
        // endpoints is elided below.
	var endpoint *libapi.WorkloadEndpoint

        // ...
        
         // If we don't find a match from the existing WorkloadEndpoints then we calculate
	// the WEP name with the IfName passed in so we can create the WorkloadEndpoint later in the process.
	if endpoint == nil {
		wepIDs.Endpoint = args.IfName  // put the interface name into the WEP identifiers
		wepIDs.WEPName, err = wepIDs.CalculateWorkloadEndpointName(false)
		if err != nil {
			err = fmt.Errorf("error constructing WorkloadEndpoint name: %s", err)
			return
		}
	}



	// Collect the result in this variable - this is ultimately what gets "returned" by this function by printing
	// it to stdout.
	var result *cniv1.Result  // => cniv1.Result is the struct emitted as the final CNI result

	// If running under Kubernetes then branch off into the kubernetes code, otherwise handle everything in this
	// function.
	if wepIDs.Orchestrator == api.OrchestratorKubernetes {
                // The actual interface creation happens in this branch.
		if result, err = k8s.CmdAddK8s(ctx, args, conf, *wepIDs, calicoClient, endpoint); err != nil {
			return
		}
	} else {
            // ...
	}

        
	// Print result to stdout, in the format defined by the requested cniVersion.
	err = cnitypes.PrintResult(result, conf.CNIVersion)
	return
}

  • WEP ID

那么WEP ID(workload endpoint identifier)是什么?WEP ID可以唯一标识容器内一个网卡。

// WEPIdentifiers bundles a namespace and a computed WEP name with the embedded
// names.WorkloadEndpointIdentifiers.
type WEPIdentifiers struct {
	Namespace string
	WEPName   string  // <node>-k8s-<pod>-<endpoint>
	names.WorkloadEndpointIdentifiers
}
// WorkloadEndpointIdentifiers is a collection of identifiers that are used to uniquely
// identify a WorkloadEndpoint resource.  Since a resource is identified by a single
// name field, Calico requires the name to be constructed in a very specific format.
// The format is dependent on the Orchestrator type:
// -  k8s:  <node>-k8s-<pod>-<endpoint>
// -  cni:  <node>-cni-<containerID>-<endpoint>
// -  libnetwork:  <node>-libnetwork-libnetwork-<endpoint>
// -  (other):  <node>-<orchestrator>-<workload>-<endpoint>
//
// Each parameter cannot start or end with a dash (-), and dashes within the parameter
// will be escaped to a double-dash (--) in the constructed name.
//
// List queries allow for prefix lists (for non-KDD), the client should verify that
// the items returned in the list match the supplied identifiers using the
// NameMatches() method.  This is necessary because a prefix match may return endpoints
// that do not exactly match the required identifiers.  For example, suppose you are
// querying endpoints with node=node1, orch=k8s, pod=pod and endpoints is wild carded:
// -  The name prefix would be `node1-k8s-pod-`
// -  A list query using that prefix would also return endpoints with, for example,
//    a pod called "pod-1", because the name of the endpoint might be `node1-k8s-pod--1-eth0`
//    which matches the required name prefix.
//
// The Node and Orchestrator are always required for both prefix and non-prefix name
// construction.
type WorkloadEndpointIdentifiers struct {
	Node         string   // node
	Orchestrator string   // orchestration platform; the value is typically k8s
	Endpoint     string   // name of the interface inside the container's network namespace
	Workload     string   // ignored for now in this walkthrough
	Pod          string   // <pod> component of the k8s name format
	ContainerID  string   // <containerID> component of the cni name format
}

创建网卡是通过调用 result, err = k8s.CmdAddK8s(ctx, args, conf, *wepIDs, calicoClient, endpoint) 完成的:

  • cni-plugin/pkg/k8s/k8s.go
// CmdAddK8s performs the "ADD" operation on a kubernetes pod
// Having kubernetes code in its own file avoids polluting the mainline code. It's expected that the kubernetes case will
// have more special casing than the mainline code.
//
// As shown in this abridged excerpt it: obtains a dataplane, builds a k8s
// client, picks the routes, calls IPAM when no IP annotations are set,
// populates the endpoint, runs d.DoNetworking, and writes the endpoint with
// utils.CreateOrUpdate (releasing the IPAM allocation if that write fails).
//
// NOTE(review): logger, annot, labels, generateName, ports, serviceAccount and
// result are used but not defined here; their set-up is presumably elided.
// cmdAdd can pass a nil endpoint, so the code that allocates it is presumably
// elided as well.
func CmdAddK8s(ctx context.Context, args *skel.CmdArgs, conf types.NetConf, epIDs utils.WEPIdentifiers, calicoClient calicoclient.Interface, endpoint *libapi.WorkloadEndpoint) (*cniv1.Result, error) {
        // The dataplane is the key interface here: it knows how to create the
        // network interface for the container.
        // NOTE(review): the error from GetDataplane is not checked in this
        // excerpt; it is overwritten by NewK8sClient below.
	d, err := dataplane.GetDataplane(conf, logger)

	// Allocate the IP and update/create the endpoint. Do this even if the endpoint already exists and has an IP
	// allocation. The kubelet will send a DEL call for any old containers and we'll clean up the old IPs then.
	client, err := NewK8sClient(conf, logger)
	if err != nil {
		return nil, err
	}


	var routes []*net.IPNet

        // Choose the routes. As shown, routes is never filled before the check
        // below, so the default branch is always taken (the code that reads
        // routes from the CNI config is presumably elided).
	// Determine which routes to program within the container. If no routes were provided in the CNI config,
	// then use the Calico default routes. If routes were provided then program those instead.
	if len(routes) == 0 {
		logger.Debug("No routes specified in CNI configuration, using defaults.")
		routes = utils.DefaultRoutes // 0.0.0.0/0
	} else {
            // ...
	}

        // Both annotations are empty in the scenario analysed here.
	ipAddrsNoIpam := annot["cni.projectcalico.org/ipAddrsNoIpam"]
	ipAddrs := annot["cni.projectcalico.org/ipAddrs"]

	// Switch based on which annotations are passed or not passed.
	switch {
	case ipAddrs == "" && ipAddrsNoIpam == "":
		// Call the IPAM plugin.
                // This is where the IPAM plugin (calico-ipam in the sample
                // config) is asked to allocate the IP address.
		result, err = utils.AddIPAM(conf, args, logger)
		if err != nil {
			return nil, err
		}

                // ...
	}

        // Populate the endpoint; it is written to the datastore further down
        // via utils.CreateOrUpdate.
	endpoint.Name = epIDs.WEPName
	endpoint.Namespace = epIDs.Namespace
	endpoint.Labels = labels
	endpoint.GenerateName = generateName
	endpoint.Spec.Endpoint = epIDs.Endpoint
	endpoint.Spec.Node = epIDs.Node
	endpoint.Spec.Orchestrator = epIDs.Orchestrator
	endpoint.Spec.Pod = epIDs.Pod
	endpoint.Spec.Ports = ports
	endpoint.Spec.IPNetworks = []string{}
	endpoint.Spec.ServiceAccountName = serviceAccount



	// releaseIPAM cleans up any IPAM allocations on failure.
	releaseIPAM := func() {
		logger.WithField("endpointIPs", endpoint.Spec.IPNetworks).Info("Releasing IPAM allocation(s) after failure")
		utils.ReleaseIPAllocation(logger, conf, args)
	}
        
        // Derive the desired host-side veth name from the namespace and pod.
        // NOTE(review): the article describes this as a hash of
        // "namespace.pod" used for the cali-xxx name; VethNameForWorkload is
        // not shown here — confirm.
	// Whether the endpoint existed or not, the veth needs (re)creating.
	desiredVethName := k8sconversion.NewConverter().VethNameForWorkload(epIDs.Namespace, epIDs.Pod)
        
        // Perform the actual interface creation.
        // NOTE(review): the returned error and the two other results are not
        // used in this excerpt; their handling is presumably elided.
	hostVethName, contVethMac, err := d.DoNetworking(
		ctx, calicoClient, args, result, desiredVethName, routes, endpoint, annot)



        // Persist the WEP through the Calico client.
	// Write the endpoint object (either the newly created one, or the updated one)
	// Pass special-case flag through to KDD to let it know what kind of patch to apply to the underlying
	// Pod resource. (In Enterprise) Felix also modifies the pod through a patch and setting this avoids patching the
	// same fields as Felix so that we can't clobber Felix's updates.
	ctxPatchCNI := k8sresources.ContextWithPatchMode(ctx, k8sresources.PatchModeCNI)
	if _, err := utils.CreateOrUpdate(ctxPatchCNI, calicoClient, endpoint); err != nil {
		logger.WithError(err).Error("Error creating/updating endpoint in datastore.")
		releaseIPAM()
		return nil, err
	}



	return result, nil
}

Dataplane是封装数据面具体操作的接口:

// Dataplane abstracts the data-plane operations the CNI plugin performs for a
// container.
type Dataplane interface {
	// DoNetworking sets up networking for the container described by args and
	// returns the host-side veth name and the container-side veth MAC.
	DoNetworking(
		ctx context.Context,
		calicoClient calicoclient.Interface,
		args *skel.CmdArgs,
		result *cniv1.Result,
		desiredVethName string,
		routes []*net.IPNet,
		endpoint *api.WorkloadEndpoint,
		annotations map[string]string,
	) (hostVethName, contVethMAC string, err error)

	// CleanUpNamespace takes the CNI args and returns an error; no
	// implementation is shown in this article.
	CleanUpNamespace(args *skel.CmdArgs) error
}

默认实现是linuxDataplane,使用github.com/vishvananda/netlink库操作linux下的网络设备。

// DoNetworking creates the veth pair for a container (abridged excerpt). It
// deletes any existing link named desiredVethName, creates and configures the
// pair inside the network namespace at args.Netns, moves the host end into the
// host namespace, and then calls SetupRoutes for the host side.
// It returns the host-side veth name and the container-side veth MAC.
//
// NOTE(review): as quoted, hasIPv4/hasIPv6 are declared but never set, and
// hostVeth is referenced after the closure although it is declared inside it;
// the corresponding code is presumably elided from this excerpt.
func (d *linuxDataplane) DoNetworking(
	ctx context.Context,
	calicoClient calicoclient.Interface,
	args *skel.CmdArgs,
	result *cniv1.Result,
	desiredVethName string,
	routes []*net.IPNet,
	endpoint *api.WorkloadEndpoint,
	annotations map[string]string,
) (hostVethName, contVethMAC string, err error) {
	hostVethName = desiredVethName
	contVethName := args.IfName
	var hasIPv4, hasIPv6 bool

	d.logger.Infof("Setting the host side veth name to %s", hostVethName)

	// Clean up if hostVeth exists.
	if oldHostVeth, err := netlink.LinkByName(hostVethName); err == nil {
		if err = netlink.LinkDel(oldHostVeth); err != nil {
			return "", "", fmt.Errorf("failed to delete old hostVeth %v: %v", hostVethName, err)
		}
		d.logger.Infof("Cleaning old hostVeth: %v", hostVethName)
	}

        // The closure below runs inside the container's network namespace
        // (args.Netns); it creates the veth pair and configures it.
        // WithNetNSPath executes the passed closure under the given network
        // namespace, restoring the original namespace afterwards.
	err = ns.WithNetNSPath(args.Netns, func(hostNS ns.NetNS) error {
                // Describe the veth pair: this end takes the container
                // interface name, the peer takes the host-side name.
		veth := &netlink.Veth{
			LinkAttrs: netlink.LinkAttrs{
				Name:        contVethName,
				MTU:         d.mtu,
				NumTxQueues: d.queues,
				NumRxQueues: d.queues,
			},
			PeerName: hostVethName,
		}
                // Create the veth pair. This single LinkAdd creates both ends
                // (contVethName and its peer hostVethName), not only the
                // container end.
		if err := netlink.LinkAdd(veth); err != nil {
			d.logger.Errorf("Error adding veth %+v: %s", veth, err)
			return err
		}
                // Look up the host end by name; it already exists after the
                // LinkAdd above, nothing is created here.
		hostVeth, err := netlink.LinkByName(hostVethName)
		if err != nil {
			err = fmt.Errorf("failed to lookup %q: %v", hostVethName, err)
			return err
		}

            
                // Bring the host end up.
		// Explicitly set the veth to UP state; the veth won't get a link local address unless it's set to UP state.
		if err = netlink.LinkSetUp(hostVeth); err != nil {
			return fmt.Errorf("failed to set %q up: %w", hostVethName, err)
		}

		contVeth, err := netlink.LinkByName(contVethName)
		if err != nil {
			err = fmt.Errorf("failed to lookup %q: %v", contVethName, err)
			return err
		}
                
                // Bring the container end up.
		// Explicitly set the veth to UP state; the veth won't get a link local address unless it's set to UP state.
		if err = netlink.LinkSetUp(contVeth); err != nil {
			return fmt.Errorf("failed to set %q up: %w", contVethName, err)
		}

		// Fetch the MAC from the container Veth. This is needed by Calico.
		contVethMAC = contVeth.Attrs().HardwareAddr.String()
		d.logger.WithField("MAC", contVethMAC).Debug("Found MAC for container veth")


                // Add the gateway route and the routes that go through it.
		// At this point, the virtual ethernet pair has been created, and both ends have the right names.
		// Both ends of the veth are still in the container's network namespace.

		// Do the per-IP version set-up.  Add gateway routes etc.
		if hasIPv4 {
			// Add a connected route to a dummy next hop so that a default route can be set
			gw := net.IPv4(169, 254, 1, 1)
			gwNet := &net.IPNet{IP: gw, Mask: net.CIDRMask(32, 32)}
			err := netlink.RouteAdd(
				&netlink.Route{
					LinkIndex: contVeth.Attrs().Index,
					Scope:     netlink.SCOPE_LINK,
					Dst:       gwNet,
				},
			)

			if err != nil {
				return fmt.Errorf("failed to add route inside the container: %v", err)
			}

			// Program every IPv4 route via the dummy gateway on the container veth.
			for _, r := range routes {
				if r.IP.To4() == nil {
					d.logger.WithField("route", r).Debug("Skipping non-IPv4 route")
					continue
				}
				d.logger.WithField("route", r).Debug("Adding IPv4 route")
				if err = ip.AddRoute(r, gw, contVeth); err != nil {
					return fmt.Errorf("failed to add IPv4 route for %v via %v: %v", r, gw, err)
				}
			}
		}

                // Assign the IPs from result to the container end of the veth.
		// Now add the IPs to the container side of the veth.
		for _, addr := range result.IPs {
			if err = netlink.AddrAdd(contVeth, &netlink.Addr{IPNet: &addr.Address}); err != nil {
				return fmt.Errorf("failed to add IP addr to %q: %v", contVeth, err)
			}
		}
                
                // Configure sysctls for the container netns.
                // NOTE(review): the article says this covers IPv4 forwarding
                // etc.; configureContainerSysctls is not shown — confirm.
		if err = d.configureContainerSysctls(hasIPv4, hasIPv6); err != nil {
			return fmt.Errorf("error configuring sysctls for the container netns, error: %s", err)
		}

                // Move the host end of the veth into the host's network namespace.
		// Now that the everything has been successfully set up in the container, move the "host" end of the
		// veth into the host namespace.
		if err = netlink.LinkSetNsFd(hostVeth, int(hostNS.Fd())); err != nil {
			return fmt.Errorf("failed to move veth to host netns: %v", err)
		}

		return nil
	})

	if err != nil {
		d.logger.Errorf("Error creating veth: %s", err)
		return "", "", err
	}

        // Set up the host-side routes.
	// Now that the host side of the veth is moved, state set to UP, and configured with sysctls, we can add the routes to it in the host namespace.
	err = SetupRoutes(hostVeth, result)
	if err != nil {
		return "", "", fmt.Errorf("error adding host side routes for interface: %s, error: %s", hostVeth.Attrs().Name, err)
	}

	return hostVethName, contVethMAC, err
}

疑问

  1. 为什么默认路由的网关(下一跳)是169.254.1.1 ?
// Add a link-scoped /32 route to the dummy gateway 169.254.1.1 on the
// container veth, so the gateway is reachable as a next hop.
gw := net.IPv4(169, 254, 1, 1)
gwNet := &net.IPNet{IP: gw, Mask: net.CIDRMask(32, 32)}
err := netlink.RouteAdd(
	&netlink.Route{
		LinkIndex: contVeth.Attrs().Index,
                Scope:     netlink.SCOPE_LINK,
		Dst:       gwNet,
	},
)

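这是因为Calico使用了ARP代理(Proxy ARP)。169.254.1.1只是一个占位的下一跳地址,并不需要真实存在于某个网卡上:当容器协议栈不知道169.254.1.1对应的MAC地址时,会从容器端veth发出ARP请求;ARP请求的目标跨网段时,网关设备(这里是开启了proxy_arp的主机端veth)收到此ARP请求,会用自己的MAC地址应答请求者,这便是代理ARP。这样容器发出的流量都会经veth到达主机,再由主机上的路由转发。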

详情参考: zhuanlan.zhihu.com/p/75933393