Preface

Previous articles analyzed the Flannel network model. Flannel solves pod-to-pod communication inside a Kubernetes cluster, but it cannot provide direct communication between containers in the cluster and virtual machines or physical machines outside the cluster.

More precisely, a service outside the cluster cannot reach a container IP inside the cluster. In a service discovery and registration scenario such as Dubbo, this means that, at the network layer, consumers outside the Kubernetes cluster cannot connect directly to providers running inside it.

Why can't Flannel handle this situation?

The reason is that container IP addresses in the cluster are allocated independently by flanneld and do not belong to the VPC network segment. The routing tables of servers outside the cluster therefore have no entry for forwarding packets to those container IPs.

You may immediately think: “In that case, why not allocate the container IPs from the VPC network segment in the first place?”

Congratulations, you got it!!

In the VPC-CNI solution, container IP addresses are allocated from the VPC network segment, so from a routing point of view there is no difference between addresses inside and outside the cluster. Another advantage of this scheme is noticeably better network performance, because no VXLAN encapsulation is needed.

To migrate business systems to Kubernetes smoothly, especially microservice architectures based on RPC plus a registry center, direct communication between workloads inside and outside the cluster must be preserved. In this scenario, VPC-CNI is the preferred solution.

Principle

Main implementation logic:

ENI (Elastic Network Interface)

  • Each ENI is bound to one primary IP address and multiple secondary IP addresses

  • The local IP address manager (ipamd) runs on each worker node and adds the secondary IP addresses of all ENIs to a local IP address pool

  • When the CNI plugin receives a pod creation event, it requests an IP from ipamd over gRPC and sets up the pod network stack; when a pod deletion event is received, it notifies ipamd to release the IP and tears down the pod network stack (a minimal sketch of this contract follows)
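Before diving into the code, here is a minimal Go sketch of that CNI-to-ipamd contract. It uses plain structs instead of the real protobuf-generated client; the field names simply mirror the AddNetwork/DelNetwork calls quoted later in this article, so treat it as an illustration, not the actual generated API.

// A minimal sketch of the CNI <-> ipamd contract, written as plain Go types
// instead of the real protobuf-generated client. Field names mirror the
// AddNetwork/DelNetwork calls quoted below; the actual service is generated
// from the rpc definitions in the amazon-vpc-cni-k8s repository.
package main

import "fmt"

// AddNetworkRequest asks ipamd for an IP for a new pod sandbox.
type AddNetworkRequest struct {
	K8sPodName, K8sPodNamespace, ContainerID, Netns, IfName string
}

// AddNetworkReply carries the assigned address and the ENI device number,
// which the CNI plugin later uses to pick the policy-routing table.
type AddNetworkReply struct {
	IPv4Addr     string
	DeviceNumber int
}

// DelNetworkRequest tells ipamd to release the pod's IP.
type DelNetworkRequest struct {
	K8sPodName, K8sPodNamespace, ContainerID, IfName, Reason string
}

// CNIBackend is the rough shape of the gRPC service the plugin talks to.
type CNIBackend interface {
	AddNetwork(req AddNetworkRequest) (AddNetworkReply, error)
	DelNetwork(req DelNetworkRequest) (ip string, deviceNumber int, err error)
}

func main() {
	req := AddNetworkRequest{K8sPodName: "demo", K8sPodNamespace: "default",
		ContainerID: "cid-123", Netns: "/var/run/netns/demo", IfName: "eth0"}
	fmt.Printf("would call AddNetwork with %+v\n", req)
}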

CNI

The plugin complies with the Kubernetes CNI interface specification and mainly implements the cmdAdd and cmdDel handlers, which deal with pod network creation and destruction respectively.
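For context, this is roughly how cmdAdd and cmdDel get wired into a CNI binary. The sketch assumes the containernetworking/cni v0.8.x skel API (which also expects a CHECK handler); the real plugin's entry point may differ in detail, and the handler bodies here are empty stubs.

// A minimal sketch of how cmdAdd/cmdDel are wired into a CNI binary,
// assuming the containernetworking/cni v0.8.x skel API. The container
// runtime invokes the binary with ADD/DEL/CHECK commands, and skel
// dispatches them to these handlers.
package main

import (
	"github.com/containernetworking/cni/pkg/skel"
	"github.com/containernetworking/cni/pkg/version"
)

func cmdAdd(args *skel.CmdArgs) error   { return nil } // allocate IP, set up pod netns
func cmdCheck(args *skel.CmdArgs) error { return nil }
func cmdDel(args *skel.CmdArgs) error   { return nil } // release IP, tear down pod netns

func main() {
	skel.PluginMain(cmdAdd, cmdCheck, cmdDel, version.All, "vpc-cni sketch")
}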

  • cmdAdd

Code path: cmd/routed-eni-cni-plugin/cni.go

func cmdAdd(args *skel.CmdArgs) error {
	return add(args, typeswrapper.New(), grpcwrapper.New(), rpcwrapper.New(), driver.New())
}

func add(args *skel.CmdArgs, cniTypes typeswrapper.CNITYPES, grpcClient grpcwrapper.GRPC, rpcClient rpcwrapper.RPC, driverClient driver.NetworkAPIs) error {

	conf, log, err := LoadNetConf(args.StdinData)
    ...
	// Parse the k8s parameters
	var k8sArgs K8sArgs
	if err := cniTypes.LoadArgs(args.Args, &k8sArgs); err != nil {
		log.Errorf("Failed to load k8s config from arg: %v", err)
		return errors.Wrap(err, "add cmd: failed to load k8s config from arg")
	}
	...
	// Initiate a request to the ipamd server over gRPC
	conn, err := grpcClient.Dial(ipamdAddress, grpc.WithInsecure())
	...
	c := rpcClient.NewCNIBackendClient(conn)
    
        // Call ipAMD's AddNetwork interface to get the IP address
	r, err := c.AddNetwork(context.Background(),
		&pb.AddNetworkRequest{
			ClientVersion:              version,
			K8S_POD_NAME:               string(k8sArgs.K8S_POD_NAME),
			K8S_POD_NAMESPACE:          string(k8sArgs.K8S_POD_NAMESPACE),
			K8S_POD_INFRA_CONTAINER_ID: string(k8sArgs.K8S_POD_INFRA_CONTAINER_ID),
			Netns:                      args.Netns,
			ContainerID:                args.ContainerID,
			NetworkName:                conf.Name,
			IfName:                     args.IfName,
		})
    ...
	addr := &net.IPNet{
		IP:   net.ParseIP(r.IPv4Addr),
		Mask: net.IPv4Mask(255, 255, 255, 255),
	}
	...
	// After obtaining the IP address, call the driver module to configure the pod network namespace
	err = driverClient.SetupNS(hostVethName, args.IfName, args.Netns, addr, int(r.DeviceNumber), r.VPCcidrs, r.UseExternalSNAT, mtu, log)
	...
    ...
	ips := []*current.IPConfig{
		{
			Version: "4",
			Address: *addr,
		},
	}

	result := &current.Result{
		IPs: ips,
	}

	return cniTypes.PrintResult(result, conf.CNIVersion)
}

Summary: the CNI plugin requests an IP from the ipamd service over gRPC, then calls the driver module to set up the pod network environment once the IP is obtained.

  • cmdDel

Releases the pod IP and cleans up the pod network environment.

func cmdDel(args *skel.CmdArgs) error {
	return del(args, typeswrapper.New(), grpcwrapper.New(), rpcwrapper.New(), driver.New())
}

func del(args *skel.CmdArgs, cniTypes typeswrapper.CNITYPES, grpcClient grpcwrapper.GRPC, rpcClient rpcwrapper.RPC, driverClient driver.NetworkAPIs) error {

	conf, log, err := LoadNetConf(args.StdinData)
    ...
	var k8sArgs K8sArgs
	if err := cniTypes.LoadArgs(args.Args, &k8sArgs); err != nil {
		log.Errorf("Failed to load k8s config from args: %v", err)
		return errors.Wrap(err, "del cmd: failed to load k8s config from args")
	}
	// Initiate a gRPC request notifying ipamd to release the IP
	conn, err := grpcClient.Dial(ipamdAddress, grpc.WithInsecure())
	...
	c := rpcClient.NewCNIBackendClient(conn)

	r, err := c.DelNetwork(context.Background(), &pb.DelNetworkRequest{
		ClientVersion:              version,
		K8S_POD_NAME:               string(k8sArgs.K8S_POD_NAME),
		K8S_POD_NAMESPACE:          string(k8sArgs.K8S_POD_NAMESPACE),
		K8S_POD_INFRA_CONTAINER_ID: string(k8sArgs.K8S_POD_INFRA_CONTAINER_ID),
		NetworkName:                conf.Name,
		ContainerID:                args.ContainerID,
		IfName:                     args.IfName,
		Reason:                     "PodDeleted",
	})
	...
	deletedPodIP := net.ParseIP(r.IPv4Addr)
	if deletedPodIP != nil {
		addr := &net.IPNet{
			IP:   deletedPodIP,
			Mask: net.IPv4Mask(255, 255, 255, 255),
		}
		...
		// Call the driver module's TeardownNS interface to tear down the pod network stack
		err = driverClient.TeardownNS(addr, int(r.DeviceNumber), log)
		...
	}
	return nil
}

driver


This module provides the tooling to create and destroy the pod network stack. The main functions of the driver module are SetupNS and TeardownNS.

Code path: cmd/routed-eni-cni-plugin/driver/driver.go

Code logic:

  • SetupNS

This function configures the pod network stack, which includes preparing the pod network environment and configuring policy-based routing (PBR).

In the AWS VPC-CNI network model, each ENI on a node gets its own routing table for forwarding from-pod traffic. With policy-based routing, to-pod traffic is preferentially resolved through the main routing table, while from-pod traffic goes through the routing table of the ENI that owns the pod's IP. Policy rules therefore have to be installed whenever a pod network is set up; the sketch below shows how the rule priorities and table numbers are derived.
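As a quick illustration, here is a stdlib-only Go sketch that derives the policy rules for a pod. podPolicyRules is a made-up helper name; the priorities 512/1536 and the "table = deviceNumber + 1" mapping come from the code and the ip rule output shown below (where the secondary-ENI table also appears under its name, eni-1).

// A self-contained sketch of how the policy rules are derived for a pod.
// It only prints the equivalent "ip rule" commands; the real plugin installs
// them through netlink. Priorities 512/1536 and the "table = deviceNumber+1"
// mapping are taken from the code and "ip rule list" output in this article.
package main

import "fmt"

// podPolicyRules returns the ip rule commands needed for a pod with the given
// IP, hosted on the ENI identified by deviceNumber (0 means the primary ENI).
func podPolicyRules(podIP string, deviceNumber int) []string {
	rules := []string{
		// to-pod traffic is always resolved via the main routing table
		fmt.Sprintf("ip rule add pref 512 to %s/32 table main", podIP),
	}
	if deviceNumber > 0 {
		// from-pod traffic on a secondary ENI uses that ENI's own table
		table := deviceNumber + 1
		rules = append(rules,
			fmt.Sprintf("ip rule add pref 1536 from %s/32 table %d", podIP, table))
	}
	return rules
}

func main() {
	for _, r := range podPolicyRules("10.0.97.30", 1) {
		fmt.Println(r)
	}
}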

func (os *linuxNetwork) SetupNS(hostVethName string, contVethName string, netnsPath string, addr *net.IPNet, deviceNumber int, vpcCIDRs []string, useExternalSNAT bool, mtu int, log logger.Logger) error {
	log.Debugf("SetupNS: hostVethName=%s, contVethName=%s, netnsPath=%s, deviceNumber=%d, mtu=%d", hostVethName, contVethName, netnsPath, deviceNumber, mtu)
	return setupNS(hostVethName, contVethName, netnsPath, addr, deviceNumber, vpcCIDRs, useExternalSNAT, os.netLink, os.ns, mtu, log, os.procSys)
}


func setupNS(hostVethName string, contVethName string, netnsPath string, addr *net.IPNet, deviceNumber int, vpcCIDRs []string, useExternalSNAT bool,
	netLink netlinkwrapper.NetLink, ns nswrapper.NS, mtu int, log logger.Logger, procSys procsyswrapper.ProcSys) error {

        // Call setupVeth to set the POD network environment
	hostVeth, err := setupVeth(hostVethName, contVethName, netnsPath, addr, netLink, ns, mtu, procSys, log)
    ...
	addrHostAddr := &net.IPNet{
		IP:   addr.IP,
		Mask: net.CIDRMask(32, 32),
	}

	// Add a host route for the pod IP in the main routing table on the node: ip route add $podIP dev $hostVeth
	route := netlink.Route{
		LinkIndex: hostVeth.Attrs().Index,
		Scope:     netlink.SCOPE_LINK,
		Dst:       addrHostAddr}
   
	// The netLink wrapper encapsulates the equivalents of "ip link", "ip route", and "ip rule"
	if err := netLink.RouteReplace(&route); err != nil {
		return errors.Wrapf(err, "setupNS: unable to add or replace route entry for %s", route.Dst.IP.String())
	}
    
	// Add the to-pod policy rule, e.g. "512: from all to 10.0.97.30 lookup main"
	err = addContainerRule(netLink, true, addr, mainRouteTable)
       ...
    
	// deviceNumber 0 indicates the primary ENI.
	// For pods on a secondary ENI, also add the from-pod policy rule,
	// e.g. "1536: from 10.0.97.30 lookup eni-1"
	if deviceNumber > 0 {
		tableNumber := deviceNumber + 1
		err = addContainerRule(netLink, false, addr, tableNumber)
        ...
	}
	return nil
}

The final result:

# ip rule list
0:	from all lookup local 
512:	from all to 10.0.97.30 lookup main       <---------- to-pod traffic
1025:	not from all to 10.0.0.0/16 lookup main
1536:	from 10.0.97.30 lookup eni-1             <---------- from-pod traffic
  • createVethPairContext

The createVethPairContext structure holds the parameters required to create the veth pair. Its run method does the actual work of setupVeth: creating the veth pair, bringing both ends up, and configuring the pod's gateway, routes, and so on.

func newCreateVethPairContext(contVethName string, hostVethName string, addr *net.IPNet, mtu int) *createVethPairContext {
	return &createVethPairContext{
		contVethName: contVethName,
		hostVethName: hostVethName,
		addr:         addr,
		netLink:      netlinkwrapper.NewNetLink(),
		ip:           ipwrapper.NewIP(),
		mtu:          mtu,
	}
}

func (createVethContext *createVethPairContext) run(hostNS ns.NetNS) error {
	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{
			Name:  createVethContext.contVethName,
			Flags: net.FlagUp,
			MTU:   createVethContext.mtu,
		},
		PeerName: createVethContext.hostVethName,
	}
    
	// Create the veth pair for the pod (equivalent to "ip link add")
	if err := createVethContext.netLink.LinkAdd(veth); err != nil {
		return err
	}

	hostVeth, err := createVethContext.netLink.LinkByName(createVethContext.hostVethName)
	...
	// Bring up the host end of the veth pair (equivalent to "ip link set $link up")
	if err = createVethContext.netLink.LinkSetUp(hostVeth); err != nil {
		return errors.Wrapf(err, "setup NS network: failed to set link %q up", createVethContext.hostVethName)
	}

	contVeth, err := createVethContext.netLink.LinkByName(createVethContext.contVethName)
	if err != nil {
		return errors.Wrapf(err, "setup NS network: failed to find link %q", createVethContext.contVethName)
	}

	// Bring up the container end of the veth pair
	if err = createVethContext.netLink.LinkSetUp(contVeth); err != nil {
		return errors.Wrapf(err, "setup NS network: failed to set link %q up", createVethContext.contVethName)
	}

	// Add a link-scoped route to the gateway address 169.254.1.1 (gwNet)
	if err = createVethContext.netLink.RouteReplace(&netlink.Route{
		LinkIndex: contVeth.Attrs().Index,
		Scope:     netlink.SCOPE_LINK,
		Dst:       gwNet,
	}); err != nil {
		return errors.Wrap(err, "setup NS network: failed to add default gateway")
	}

	// Add the default route: "default via 169.254.1.1 dev eth0"
	if err = createVethContext.ip.AddDefaultRoute(gwNet.IP, contVeth); err != nil {
		return errors.Wrap(err, "setup NS network: failed to add default route")
	}

	// Assign the pod IP: "ip addr add $IP dev eth0"
	if err = createVethContext.netLink.AddrAdd(contVeth, &netlink.Addr{IPNet: createVethContext.addr}); err != nil {
		return errors.Wrapf(err, "setup NS network: failed to add IP addr to %q", createVethContext.contVethName)
	}

	// Add static ARP entries for the default gateway
	neigh := &netlink.Neigh{
		LinkIndex:    contVeth.Attrs().Index,
		State:        netlink.NUD_PERMANENT,
		IP:           gwNet.IP,
		HardwareAddr: hostVeth.Attrs().HardwareAddr,
	}

	if err = createVethContext.netLink.NeighAdd(neigh); err != nil {
		return errors.Wrap(err, "setup NS network: failed to add static ARP")
	}

	// Move the host end of the veth pair to the host network namespace
	if err = createVethContext.netLink.LinkSetNsFd(hostVeth, int(hostNS.Fd())); err != nil {
		return errors.Wrap(err, "setup NS network: failed to move veth to host netns")
	}
	return nil
}
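To summarize what run does, here is a stdlib-only Go sketch that prints rough "ip" command equivalents of the netlink calls above, in the same order. The interface names and MAC address passed in main are made-up example values, and the real code performs these steps through netlink, never by shelling out.

// A stdlib-only summary of what createVethPairContext.run amounts to inside
// the pod network namespace. It only prints rough "ip" command equivalents
// of the netlink calls shown above.
package main

import "fmt"

func podNetnsSetupCommands(podIP, contVeth, hostVeth, hostVethMAC string) []string {
	const gw = "169.254.1.1" // link-local gateway used for the pod's default route
	return []string{
		fmt.Sprintf("ip link add %s type veth peer name %s", contVeth, hostVeth),
		fmt.Sprintf("ip link set %s up", hostVeth),
		fmt.Sprintf("ip link set %s up", contVeth),
		fmt.Sprintf("ip route replace %s/32 dev %s scope link", gw, contVeth),
		fmt.Sprintf("ip route replace default via %s dev %s", gw, contVeth),
		fmt.Sprintf("ip addr add %s/32 dev %s", podIP, contVeth),
		fmt.Sprintf("ip neigh replace %s lladdr %s dev %s nud permanent", gw, hostVethMAC, contVeth),
		fmt.Sprintf("ip link set %s netns <host netns>", hostVeth),
	}
}

func main() {
	for _, cmd := range podNetnsSetupCommands("10.0.97.30", "eth0", "veth-pod-1", "aa:bb:cc:dd:ee:ff") {
		fmt.Println(cmd)
	}
}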
  • TeardownNS

Cleans up the pod network environment.

func (os *linuxNetwork) TeardownNS(addr *net.IPNet, deviceNumber int, log logger.Logger) error {
	log.Debugf("TeardownNS: addr %s, deviceNumber %d", addr.String(), deviceNumber)
	return tearDownNS(addr, deviceNumber, os.netLink, log)
}

func tearDownNS(addr *net.IPNet, deviceNumber int, netLink netlinkwrapper.NetLink, log logger.Logger) error {
	...
	// Delete the to-pod policy rule (equivalent to "ip rule del")
	toContainerRule := netLink.NewRule()
	toContainerRule.Dst = addr
	toContainerRule.Priority = toContainerRulePriority
	err := netLink.RuleDel(toContainerRule)
     ...
	// deviceNumber 0 indicates the primary ENI; for secondary ENIs, also delete the from-pod policy rule
	if deviceNumber > 0 {
		err := deleteRuleListBySrc(*addr)
      ...
	}
	addrHostAddr := &net.IPNet{
		IP:   addr.IP,
		Mask: net.CIDRMask(32, 32),
	}
	...
	return nil
}

IPAMD

The local IP address pool manager runs on each worker node as a DaemonSet and maintains all available IP addresses on the node. So where does the data in the IP address pool come from?

AWS EC2 exposes instance metadata, which includes all ENIs attached to the instance and all IP addresses on each ENI, and provides HTTP endpoints to query it:

curl http://169.254.169.254/latest/meta-data/network/interfaces/macs/

curl http://169.254.169.254/latest/meta-data/network/interfaces/macs/0a:da:9d:51:47:28/local-ipv4s
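For illustration, the same lookups can be done from Go with plain HTTP, assuming IMDSv1-style unauthenticated access is enabled on the instance; the real ipamd uses the AWS SDK's EC2 metadata client rather than raw requests like this sketch.

// A minimal sketch of querying the instance-metadata endpoints above from Go,
// assuming IMDSv1-style unauthenticated GETs are allowed.
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

const imdsBase = "http://169.254.169.254/latest/meta-data/network/interfaces/macs/"

func get(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	return string(body), err
}

func main() {
	// List the MAC addresses of all attached ENIs, then the private IPv4
	// addresses bound to each of them.
	macs, err := get(imdsBase)
	if err != nil {
		panic(err)
	}
	for _, mac := range strings.Fields(macs) {
		ips, err := get(imdsBase + mac + "local-ipv4s")
		if err != nil {
			continue
		}
		fmt.Printf("ENI %s -> IPs: %v\n", strings.TrimSuffix(mac, "/"), strings.Fields(ips))
	}
}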

During initialization, ipamd stores the ENI and IP information in the dataStore; this is implemented in nodeInit.

nodeInit

func (c *IPAMContext) nodeInit() error {
	...
	// Query the EC2 metadata interface to get all ENI data
	metadataResult, err := c.awsClient.DescribeAllENIs()
	...
	enis := c.filterUnmanagedENIs(metadataResult.ENIMetadata)
         ....
		// Add ENI information
		retry := 0
		for {
			retry++
			if err = c.setupENI(eni.ENIID, eni, isTrunkENI, isEFAENI); err == nil {
				log.Infof("ENI %s set up.", eni.ENIID)
				break
			}
			...
		}
	...
	return nil
}
  • setupENI

The main tasks of setupENI are to initialize the dataStore data for an ENI, including:

  • Add the ENI to the dataStore
  • For secondary ENIs, set up the ENI's host-side network via SetupENINetwork
  • Add all secondary IP addresses (and prefixes) of the ENI to the dataStore
func (c *IPAMContext) setupENI(eni string, eniMetadata awsutils.ENIMetadata, isTrunkENI, isEFAENI bool) error {
	primaryENI := c.awsClient.GetPrimaryENI()
    
	err := c.dataStore.AddENI(eni, eniMetadata.DeviceNumber, eni == primaryENI, isTrunkENI, isEFAENI)
	...
	c.primaryIP[eni] = eniMetadata.PrimaryIPv4Address()

	if eni != primaryENI {
		err = c.networkClient.SetupENINetwork(c.primaryIP[eni], eniMetadata.MAC, eniMetadata.DeviceNumber, eniMetadata.SubnetIPv4CIDR)
        ...
	}
    ...
	c.addENIsecondaryIPsToDataStore(eniMetadata.IPv4Addresses, eni)
	c.addENIprefixesToDataStore(eniMetadata.IPv4Prefixes, eni)

	return nil
}

dataStore

DataStore is an in-memory local database built from Go structs. It maintains the node's ENIs and all IP addresses bound to them. Each IP address is keyed by an IPAMKey: when an address is assigned, the key is composed of the network name, CNI_CONTAINERID, and CNI_IFNAME; when it is unassigned, the key is empty.

Code path: pkg/ipamd/datastore/data_store.go

type DataStore struct {
	total                    int 
	assigned                 int  
	allocatedPrefix          int
	eniPool                  ENIPool 
	lock                     sync.Mutex
	log                      logger.Logger
	CheckpointMigrationPhase int 
	backingStore             Checkpointer
	cri                      cri.APIs
	isPDEnabled              bool
}

type ENI struct {
	ID         string
	createTime time.Time
	IsPrimary bool
	IsTrunk bool
	IsEFA bool
	DeviceNumber int
	AvailableIPv4Cidrs map[string]*CidrInfo
}

type AddressInfo struct {
	IPAMKey        IPAMKey
	Address        string
	UnassignedTime time.Time
}

type CidrInfo struct {
	Cidr net.IPNet // e.g. 192.168.1.1/24
	IPv4Addresses map[string]*AddressInfo
	IsPrefix bool
}

type ENIPool map[string]*ENI // map[eniID]*ENI

DataStore exposes two main methods, AssignPodIPv4Address and UnassignPodIPv4Address, which ipamd calls (in response to the CNI plugin's AddNetwork and DelNetwork requests) to obtain and release an IP address, respectively.

  • AssignPodIPv4Address
// Assign IP to pod
func (ds *DataStore) AssignPodIPv4Address(ipamKey IPAMKey) (ipv4address string, deviceNumber int, err error) {
   // Add a mutex to dataStore operations
	ds.lock.Lock()
	defer ds.lock.Unlock()
      ...
	// Iterate over eniPool to find a free IP address
	for _, eni := range ds.eniPool {
		for _, availableCidr := range eni.AvailableIPv4Cidrs {
			var addr *AddressInfo
			var strPrivateIPv4 string
			var err error

			if (ds.isPDEnabled && availableCidr.IsPrefix) || (!ds.isPDEnabled && !availableCidr.IsPrefix) {
				strPrivateIPv4, err = ds.getFreeIPv4AddrfromCidr(availableCidr)
				if err != nil {
					ds.log.Debugf("Unable to get IP address from CIDR: %v", err)
					// Check the next CIDR
					continue
				}
			}
			...
			addr = availableCidr.IPv4Addresses[strPrivateIPv4]
			...
			availableCidr.IPv4Addresses[strPrivateIPv4] = addr

			// Set the ipamKey for the assigned IP address
			ds.assignPodIPv4AddressUnsafe(ipamKey, eni, addr)
			...
			return addr.Address, eni.DeviceNumber, nil
		}
	}
	...
}
  • UnassignPodIPv4Address
// Release the IP address
func (ds *DataStore) UnassignPodIPv4Address(ipamKey IPAMKey) (e *ENI, ip string, deviceNumber int, err error) {
	...
	// Use the ipamKey to find the corresponding pod IP address in eniPool
	eni, availableCidr, addr := ds.eniPool.FindAddressForSandbox(ipamKey)
    ...
	// Call unassignPodIPv4AddressUnsafe to mark the IP as unassigned and reset its ipamKey to empty
	ds.unassignPodIPv4AddressUnsafe(addr)
	...
        // Set the IP address release time to the current time
	addr.UnassignedTime = time.Now()
    ...
	return eni, addr.Address, eni.DeviceNumber, nil
}
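To make the assignment semantics above concrete, here is a deliberately simplified, self-contained model of the dataStore bookkeeping: a free address has a zero IPAMKey, and an assigned address is keyed by (networkName, containerID, ifName). It is an illustration only; the real dataStore additionally tracks ENIs, CIDRs, prefix delegation, and a checkpoint file.

// A simplified, self-contained model of the dataStore bookkeeping described
// above: a free address has a zero IPAMKey, an assigned address is keyed by
// (networkName, containerID, ifName). Illustration only, not the real code.
package main

import (
	"errors"
	"fmt"
	"sync"
)

type IPAMKey struct {
	NetworkName, ContainerID, IfName string
}

type addressInfo struct {
	key IPAMKey // zero value means the address is unassigned
}

type miniDataStore struct {
	mu        sync.Mutex
	addresses map[string]*addressInfo // pod IP -> assignment state
}

func (ds *miniDataStore) AssignPodIPv4Address(key IPAMKey) (string, error) {
	ds.mu.Lock()
	defer ds.mu.Unlock()
	for ip, info := range ds.addresses {
		if info.key == (IPAMKey{}) { // free address
			info.key = key
			return ip, nil
		}
	}
	return "", errors.New("no free IP in the pool")
}

func (ds *miniDataStore) UnassignPodIPv4Address(key IPAMKey) (string, error) {
	ds.mu.Lock()
	defer ds.mu.Unlock()
	for ip, info := range ds.addresses {
		if info.key == key {
			info.key = IPAMKey{} // return the address to the free pool
			return ip, nil
		}
	}
	return "", errors.New("address not found for this sandbox")
}

func main() {
	ds := &miniDataStore{addresses: map[string]*addressInfo{
		"10.0.97.30": {}, "10.0.97.31": {},
	}}
	key := IPAMKey{NetworkName: "aws-cni", ContainerID: "cid-123", IfName: "eth0"}
	ip, _ := ds.AssignPodIPv4Address(key)
	fmt.Println("assigned:", ip)
	released, _ := ds.UnassignPodIPv4Address(key)
	fmt.Println("released:", released)
}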

Follow Cloud Monkey Life for more content.