根据CNI Spec,当环境变量CNI_COMMAND=ADD时,容器运行时(在Kubernetes中由kubelet通过CRI触发)会调用CNI插件,为容器添加网络设备。
Calico是如何实现容器添加网络设备的功能的呢?简单分析下源码记录下自己的理解。
CNI_COMMAND=ADD操作需要传入一些环境变量(如CNI_CONTAINERID、CNI_NETNS、CNI_IFNAME等),CNI配置则以JSON的形式通过stdin传入。
假设传入下面的CNI的配置文件:
- /etc/cni/net.d/10-calico.conflist
{
"name": "k8s-pod-network",
"cniVersion": "0.3.1",
"plugins": [
{
"type": "calico",
"log_level": "info",
"log_file_path": "/var/log/calico/cni/cni.log",
"etcd_endpoints": "https://192.168.10.8:2379",
"etcd_key_file": "/etc/cni/net.d/calico-tls/etcd-key",
"etcd_cert_file": "/etc/cni/net.d/calico-tls/etcd-cert",
"etcd_ca_cert_file": "/etc/cni/net.d/calico-tls/etcd-ca",
"mtu": 0,
"ipam": {
"type": "calico-ipam"
},
"policy": {
"type": "k8s"
},
"kubernetes": {
"kubeconfig": "/etc/cni/net.d/calico-kubeconfig"
}
},
{
"type": "portmap",
"snat": true,
"capabilities": {"portMappings": true}
},
{
"type": "bandwidth",
"capabilities": {"bandwidth": true}
}
]
}
源码分析
代码入口点
- cni-plugin/cmd/calico/calico.go
// main dispatches on the file name of the running binary (the last path element of
// os.Args[0]) and panics if the name is not one it recognises.
func main() {
// Use the name of the binary to determine which routine to run.
_, filename := filepath.Split(os.Args[0])
switch filename {
case "calico", "calico.exe":
plugin.Main(VERSION) // CNI plugin entry point; CNI_COMMAND=ADD goes through this branch
default:
panic("Unknown binary name: " + filename)
}
}
- cni-plugin/pkg/plugin/plugin.go
// Main hands the plugin's command handlers to skel.PluginMain.
// (Excerpt: the set-up code before the registration is elided.)
func Main(version string) {
// ...
// Register the CNI command handlers: cmdAdd, cmdDummyCheck and cmdDel, followed by the
// list of supported CNI spec versions and an "about" string that embeds the plugin version.
skel.PluginMain(cmdAdd, cmdDummyCheck, cmdDel,
cniSpecVersion.PluginSupports("0.1.0", "0.2.0", "0.3.0", "0.3.1", "0.4.0", "1.0.0"),
"Calico CNI plugin "+version)
}
只分析cmdAdd的实现,cmdAdd用于给容器创建网卡。这个函数巨长... 不重要的全都剪掉了
// cmdAdd handles the ADD command: it loads the network config, works out the
// WorkloadEndpoint (WEP) identifiers, looks for an existing WEP, delegates the actual
// networking to the orchestrator-specific code and prints the CNI result to stdout.
//
// This is an excerpt: parts the article author considered unimportant were removed, so
// names such as nodename and ctx are defined in code that is not shown here.
func cmdAdd(args *skel.CmdArgs) (err error) {
// Read the config from stdin. It is typically similar to the contents of
// /etc/cni/net.d/10-calico.conflist; the format differs, but the content is largely the same.
// Unmarshal the network config, and perform validation
conf := types.NetConf{}
if err := json.Unmarshal(args.StdinData, &conf); err != nil {
return fmt.Errorf("failed to load netconf: %v", err)
}
// WEP is short for "workload endpoint". Build the WEP identifiers here; they consist of
// the node, the orchestrator type (usually k8s), the pod name, etc.
// Extract WEP identifiers such as pod name, pod namespace (for k8s), containerID, IfName.
// NOTE(review): the err returned by GetIdentifiers is overwritten on the next line without
// being checked; the error handling is presumably elided from this excerpt.
wepIDs, err := utils.GetIdentifiers(args, nodename)
calicoClient, err := utils.CreateClient(conf)
if err != nil {
err = fmt.Errorf("error creating calico client: %v", err)
return
}
// Remove the endpoint field (IfName) from the wepIDs so we can get a WEP name prefix.
// We use the WEP name prefix (e.g. prefix: "node1-k8s-mypod--1-", full name: "node1-k8s-mypod--1-eth0")
// to list all the WEPs so if we have a WEP with a different IfName (e.g. "node1-k8s-mypod--1-eth1")
// we could still get that.
wepIDs.Endpoint = ""
// Calculate the workload name prefix from the WEP specific identifiers
// for the given orchestrator.
// The prefix typically looks like "node1-k8s-mypod--1-".
wepPrefix, err := wepIDs.CalculateWorkloadEndpointName(true)
if err != nil {
err = fmt.Errorf("error constructing WorkloadEndpoint prefix: %s", err)
return
}
// List the existing WorkloadEndpoints through calicoClient, i.e. from whichever datastore
// the client was created for from conf.
// Check if there's an existing endpoint by listing the existing endpoints based on the WEP name prefix.
endpoints, err := calicoClient.WorkloadEndpoints().List(ctx, options.ListOptions{Name: wepPrefix, Namespace: wepIDs.Namespace, Prefix: true})
if err != nil {
return
}
// The logic that follows fills in / updates the fields of this libapi.WorkloadEndpoint.
// NOTE(review): the code that matches an entry of endpoints and assigns endpoint is elided.
var endpoint *libapi.WorkloadEndpoint
// ...
// If we don't find a match from the existing WorkloadEndpoints then we calculate
// the WEP name with the IfName passed in so we can create the WorkloadEndpoint later in the process.
if endpoint == nil {
wepIDs.Endpoint = args.IfName // put the interface name into the WEP identifiers
wepIDs.WEPName, err = wepIDs.CalculateWorkloadEndpointName(false)
if err != nil {
err = fmt.Errorf("error constructing WorkloadEndpoint name: %s", err)
return
}
}
// Collect the result in this variable - this is ultimately what gets "returned" by this function by printing
// it to stdout.
var result *cniv1.Result // cniv1.Result is the struct that holds the final CNI result to be output
// If running under Kubernetes then branch off into the kubernetes code, otherwise handle everything in this
// function.
if wepIDs.Orchestrator == api.OrchestratorKubernetes {
// The actual interface creation happens in this branch.
if result, err = k8s.CmdAddK8s(ctx, args, conf, *wepIDs, calicoClient, endpoint); err != nil {
return
}
} else {
// ...
}
// Print result to stdout, in the format defined by the requested cniVersion.
err = cnitypes.PrintResult(result, conf.CNIVersion)
return
}
- WEP ID
那么WEP ID(workload endpoint identifier)是什么?WEP ID可以唯一标识容器内一个网卡。
// WEPIdentifiers bundles the embedded names.WorkloadEndpointIdentifiers with the
// namespace and the constructed WorkloadEndpoint name.
type WEPIdentifiers struct {
Namespace string
WEPName string // <node>-k8s-<pod>-<endpoint>
names.WorkloadEndpointIdentifiers
}
// WorkloadEndpointIdentifiers is a collection of identifiers that are used to uniquely
// identify a WorkloadEndpoint resource. Since a resource is identified by a single
// name field, Calico requires the name to be constructed in a very specific format.
// The format is dependent on the Orchestrator type:
// - k8s: <node>-k8s-<pod>-<endpoint>
// - cni: <node>-cni-<containerID>-<endpoint>
// - libnetwork: <node>-libnetwork-libnetwork-<endpoint>
// - (other): <node>-<orchestrator>-<workload>-<endpoint>
//
// Each parameter cannot start or end with a dash (-), and dashes within the parameter
// will be escaped to a double-dash (--) in the constructed name.
//
// List queries allow for prefix lists (for non-KDD), the client should verify that
// the items returned in the list match the supplied identifiers using the
// NameMatches() method. This is necessary because a prefix match may return endpoints
// that do not exactly match the required identifiers. For example, suppose you are
// querying endpoints with node=node1, orch=k8s, pod=pod and endpoints is wild carded:
// - The name prefix would be `node1-k8s-pod-`
// - A list query using that prefix would also return endpoints with, for example,
// a pod called "pod-1", because the name of the endpoint might be `node1-k8s-pod--1-eth0`
// which matches the required name prefix.
//
// The Node and Orchestrator are always required for both prefix and non-prefix name
// construction.
type WorkloadEndpointIdentifiers struct {
Node string // the node
Orchestrator string // the orchestrator; the value is usually k8s
Endpoint string // name of the interface inside the container's network namespace
Workload string // ignored for the purposes of this walkthrough
Pod string
ContainerID string
}
创建网卡是通过调用 k8s.CmdAddK8s(ctx, args, conf, *wepIDs, calicoClient, endpoint) 完成的:
- cni-plugin/pkg/k8s/k8s.go
// CmdAddK8s performs the "ADD" operation on a kubernetes pod
// Having kubernetes code in its own file avoids polluting the mainline code. It's expected that the kubernetes case will
// have more special casing than the mainline code.
//
// As shown here it obtains a Dataplane, allocates an IP through IPAM, fills in the
// WorkloadEndpoint, asks the dataplane to do the networking and finally writes the endpoint
// to the datastore, releasing the IPAM allocation if that write fails.
//
// This is an excerpt: names such as logger, annot, labels, generateName, ports,
// serviceAccount and result are defined in code that is not shown here.
func CmdAddK8s(ctx context.Context, args *skel.CmdArgs, conf types.NetConf, epIDs utils.WEPIdentifiers, calicoClient calicoclient.Interface, endpoint *libapi.WorkloadEndpoint) (*cniv1.Result, error) {
// The dataplane is the key interface here: it knows how to create the interface for the container.
// NOTE(review): the err returned by GetDataplane is overwritten below without being checked;
// the error handling is presumably elided from this excerpt.
d, err := dataplane.GetDataplane(conf, logger)
// Allocate the IP and update/create the endpoint. Do this even if the endpoint already exists and has an IP
// allocation. The kubelet will send a DEL call for any old containers and we'll clean up the old IPs then.
client, err := NewK8sClient(conf, logger)
if err != nil {
return nil, err
}
var routes []*net.IPNet
// NOTE(review): nothing in this excerpt populates routes, so as shown the default branch
// below is always taken; the code that reads routes from the CNI config is presumably elided.
// Determine which routes to program within the container. If no routes were provided in the CNI config,
// then use the Calico default routes. If routes were provided then program those instead.
if len(routes) == 0 {
logger.Debug("No routes specified in CNI configuration, using defaults.")
routes = utils.DefaultRoutes // 0.0.0.0/0 (presumably plus the IPv6 default route; verify against utils.DefaultRoutes)
} else {
// ...
}
// Both of these annotations are empty in the case analysed here.
ipAddrsNoIpam := annot["cni.projectcalico.org/ipAddrsNoIpam"]
ipAddrs := annot["cni.projectcalico.org/ipAddrs"]
// Switch based on which annotations are passed or not passed.
switch {
case ipAddrs == "" && ipAddrsNoIpam == "":
// Call the IPAM plugin.
// With the example config this is calico-ipam, which allocates the IP address.
result, err = utils.AddIPAM(conf, args, logger)
if err != nil {
return nil, err
}
// ...
}
// Populate the endpoint; it is written to the datastore further down.
// NOTE(review): the caller passes a nil endpoint when no existing WEP was found, so these
// assignments would dereference nil as shown; the allocation of a new endpoint is presumably elided.
endpoint.Name = epIDs.WEPName
endpoint.Namespace = epIDs.Namespace
endpoint.Labels = labels
endpoint.GenerateName = generateName
endpoint.Spec.Endpoint = epIDs.Endpoint
endpoint.Spec.Node = epIDs.Node
endpoint.Spec.Orchestrator = epIDs.Orchestrator
endpoint.Spec.Pod = epIDs.Pod
endpoint.Spec.Ports = ports
endpoint.Spec.IPNetworks = []string{}
endpoint.Spec.ServiceAccountName = serviceAccount
// releaseIPAM cleans up any IPAM allocations on failure.
releaseIPAM := func() {
logger.WithField("endpointIPs", endpoint.Spec.IPNetworks).Info("Releasing IPAM allocation(s) after failure")
utils.ReleaseIPAllocation(logger, conf, args)
}
// Derive the desired host-side veth name from the namespace and pod name (per the article
// author this is a hash of "namespace.pod" used for the cali-xxx interface name — TODO confirm).
// Whether the endpoint existed or not, the veth needs (re)creating.
desiredVethName := k8sconversion.NewConverter().VethNameForWorkload(epIDs.Namespace, epIDs.Pod)
// Perform the actual interface creation.
// NOTE(review): the error check and the use of hostVethName / contVethMac are elided here.
hostVethName, contVethMac, err := d.DoNetworking(
ctx, calicoClient, args, result, desiredVethName, routes, endpoint, annot)
// Create or update the WorkloadEndpoint in the datastore.
// Write the endpoint object (either the newly created one, or the updated one)
// Pass special-case flag through to KDD to let it know what kind of patch to apply to the underlying
// Pod resource. (In Enterprise) Felix also modifies the pod through a patch and setting this avoids patching the
// same fields as Felix so that we can't clobber Felix's updates.
ctxPatchCNI := k8sresources.ContextWithPatchMode(ctx, k8sresources.PatchModeCNI)
if _, err := utils.CreateOrUpdate(ctxPatchCNI, calicoClient, endpoint); err != nil {
logger.WithError(err).Error("Error creating/updating endpoint in datastore.")
releaseIPAM()
return nil, err
}
return result, nil
}
Dataplane是封装数据面具体操作的接口:
// Dataplane abstracts the data-plane operations the CNI plugin needs.
type Dataplane interface {
// DoNetworking sets up networking for the container described by args, using the IPs in
// result, the desired host-side veth name and the routes to program in the container.
// It returns the host-side veth name and the MAC address of the container-side veth.
DoNetworking(
ctx context.Context,
calicoClient calicoclient.Interface,
args *skel.CmdArgs,
result *cniv1.Result,
desiredVethName string,
routes []*net.IPNet,
endpoint *api.WorkloadEndpoint,
annotations map[string]string,
) (hostVethName, contVethMAC string, err error)
// CleanUpNamespace presumably tears down what DoNetworking created for the namespace
// referenced by args; no implementation is shown here — TODO confirm.
CleanUpNamespace(args *skel.CmdArgs) error
}
默认实现是linuxDataplane,使用github.com/vishvananda/netlink库操作linux下的网络设备。
// DoNetworking creates a veth pair inside the container's network namespace, brings both
// ends up, programs routes and addresses on the container end, moves the host end into the
// host namespace and then sets up the host-side routes.
//
// It returns the host-side veth name and the MAC address of the container-side veth, or an
// error if any step fails.
//
// This is an excerpt of the upstream function; see the NOTE(review) comments for places
// where elided code makes the listing incomplete.
func (d *linuxDataplane) DoNetworking(
ctx context.Context,
calicoClient calicoclient.Interface,
args *skel.CmdArgs,
result *cniv1.Result,
desiredVethName string,
routes []*net.IPNet,
endpoint *api.WorkloadEndpoint,
annotations map[string]string,
) (hostVethName, contVethMAC string, err error) {
hostVethName = desiredVethName
contVethName := args.IfName
// NOTE(review): nothing in this excerpt assigns hasIPv4 / hasIPv6, so as shown the
// "if hasIPv4" block below never runs; the code that derives them is presumably elided.
var hasIPv4, hasIPv6 bool
d.logger.Infof("Setting the host side veth name to %s", hostVethName)
// Clean up if hostVeth exists.
if oldHostVeth, err := netlink.LinkByName(hostVethName); err == nil {
if err = netlink.LinkDel(oldHostVeth); err != nil {
return "", "", fmt.Errorf("failed to delete old hostVeth %v: %v", hostVethName, err)
}
d.logger.Infof("Cleaning old hostVeth: %v", hostVethName)
}
// Everything inside this closure runs in the container's network namespace: the veth
// pair is created and configured there.
// WithNetNSPath executes the passed closure under the given network
// namespace, restoring the original namespace afterwards.
err = ns.WithNetNSPath(args.Netns, func(hostNS ns.NetNS) error {
// The veth pair: this end is named contVethName, the peer end hostVethName.
veth := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{
Name: contVethName,
MTU: d.mtu,
NumTxQueues: d.queues,
NumRxQueues: d.queues,
},
PeerName: hostVethName,
}
// Create the veth pair; this single LinkAdd call creates both ends.
if err := netlink.LinkAdd(veth); err != nil {
d.logger.Errorf("Error adding veth %+v: %s", veth, err)
return err
}
// Look up the host end of the pair (it already exists after the LinkAdd above).
hostVeth, err := netlink.LinkByName(hostVethName)
if err != nil {
err = fmt.Errorf("failed to lookup %q: %v", hostVethName, err)
return err
}
// Bring the host end up.
// Explicitly set the veth to UP state; the veth won't get a link local address unless it's set to UP state.
if err = netlink.LinkSetUp(hostVeth); err != nil {
return fmt.Errorf("failed to set %q up: %w", hostVethName, err)
}
contVeth, err := netlink.LinkByName(contVethName)
if err != nil {
err = fmt.Errorf("failed to lookup %q: %v", contVethName, err)
return err
}
// Bring the container end up.
// Explicitly set the veth to UP state; the veth won't get a link local address unless it's set to UP state.
if err = netlink.LinkSetUp(contVeth); err != nil {
return fmt.Errorf("failed to set %q up: %w", contVethName, err)
}
// Fetch the MAC from the container Veth. This is needed by Calico.
contVethMAC = contVeth.Attrs().HardwareAddr.String()
d.logger.WithField("MAC", contVethMAC).Debug("Found MAC for container veth")
// Add the routes inside the container, via the dummy gateway.
// At this point, the virtual ethernet pair has been created, and both ends have the right names.
// Both ends of the veth are still in the container's network namespace.
// Do the per-IP version set-up. Add gateway routes etc.
if hasIPv4 {
// Add a connected route to a dummy next hop so that a default route can be set
gw := net.IPv4(169, 254, 1, 1)
gwNet := &net.IPNet{IP: gw, Mask: net.CIDRMask(32, 32)}
err := netlink.RouteAdd(
&netlink.Route{
LinkIndex: contVeth.Attrs().Index,
Scope: netlink.SCOPE_LINK,
Dst: gwNet,
},
)
if err != nil {
return fmt.Errorf("failed to add route inside the container: %v", err)
}
for _, r := range routes {
if r.IP.To4() == nil {
d.logger.WithField("route", r).Debug("Skipping non-IPv4 route")
continue
}
d.logger.WithField("route", r).Debug("Adding IPv4 route")
if err = ip.AddRoute(r, gw, contVeth); err != nil {
return fmt.Errorf("failed to add IPv4 route for %v via %v: %v", r, gw, err)
}
}
}
// Assign the IP addresses to the container end of the veth.
// Now add the IPs to the container side of the veth.
for _, addr := range result.IPs {
if err = netlink.AddrAdd(contVeth, &netlink.Addr{IPNet: &addr.Address}); err != nil {
return fmt.Errorf("failed to add IP addr to %q: %v", contVeth, err)
}
}
// Configure sysctls in the container netns (which ones is not visible in this excerpt;
// the article author mentions IPv4 forwarding — TODO confirm).
if err = d.configureContainerSysctls(hasIPv4, hasIPv6); err != nil {
return fmt.Errorf("error configuring sysctls for the container netns, error: %s", err)
}
// Move the host end of the veth into the host's network namespace.
// Now that the everything has been successfully set up in the container, move the "host" end of the
// veth into the host namespace.
if err = netlink.LinkSetNsFd(hostVeth, int(hostNS.Fd())); err != nil {
return fmt.Errorf("failed to move veth to host netns: %v", err)
}
return nil
})
if err != nil {
d.logger.Errorf("Error creating veth: %s", err)
return "", "", err
}
// Set up the host-side routes.
// NOTE(review): hostVeth was declared with := inside the closure above and is not in scope
// here; the host-namespace code that looks the link up again (and, per the comment below,
// sets it UP and configures sysctls) is presumably elided from this excerpt.
// Now that the host side of the veth is moved, state set to UP, and configured with sysctls, we can add the routes to it in the host namespace.
err = SetupRoutes(hostVeth, result)
if err != nil {
return "", "", fmt.Errorf("error adding host side routes for interface: %s, error: %s", hostVeth.Attrs().Name, err)
}
return hostVethName, contVethMAC, err
}
疑问
- 为什么默认网关是 169.254.1.1?
// The dummy gateway address and its /32 network.
gw := net.IPv4(169, 254, 1, 1)
gwNet := &net.IPNet{IP: gw, Mask: net.CIDRMask(32, 32)}
// Add a link-scope route to the /32 gateway address on the container-side veth.
err := netlink.RouteAdd(
&netlink.Route{
LinkIndex: contVeth.Attrs().Index,
Scope: netlink.SCOPE_LINK,
Dst: gwNet,
},
)
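这是因为Calico使用了代理ARP(Proxy ARP)。169.254.1.1是一个链路本地地址,并没有配置在任何网卡上;容器内的默认路由以它作为下一跳,所以协议栈在发包前并不知道169.254.1.1对应的MAC地址,需要先发送ARP请求。一般来说,当ARP请求的目标跨网段时,网关设备收到此ARP请求,会用自己的MAC地址应答请求者,这便是代理ARP。在Calico中,主机端的veth(cali-xxx)开启了proxy_arp,因此会用自己的MAC地址应答容器的ARP请求,容器发出的流量都会先到达主机端veth,再由主机按路由表转发。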