Why?
I am interested in the Linux kernel networking stack, and want to learn as much about it as I can. But one problem is that it is huge and pretty complicated. My solution to this problem is to focus on smaller pieces at a time. This time I've chosen the TUN virtual network driver. I hope that my summary of how TUN is implemented can help others in their journey towards understanding the inner workings of Linux networking.
TUN interface internals
The beginning
static int __init tun_init(void) { [...] ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } [...] return 0; [...] }
We can see here that the module registers a misc device. Once the device has been registered, a special file gets created at /dev/net/tun.
The TUN Miscellaneous Device
static const struct file_operations tun_fops = { [...] .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, [...] .unlocked_ioctl = tun_chr_ioctl,
[...] .open = tun_chr_open, [...] };
I'll be going through four of the file operations: open, ioctl, read and write.
Opening the TUN file
static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; DBG1(KERN_INFO, "tunX: tun_chr_open\n"); tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; [...] return 0; }
Here we see that the function allocates memory for a struct of type tun_file. This struct contains all information needed to connect one misc device file descriptor with a TUN virtual network interface.
So, now we have an open file. But we can't do much with it until we associate it with a network device. This is accomplished with the ioctl system call (the TUNSETIFF request), which is handled by tun_set_iff.
Creating a network device
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { [...] dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { [...] err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI); if (err < 0) return err; [...] } else { [...] dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; [...] tun = netdev_priv(dev); tun->dev = dev; [...] tun_net_init(dev); tun_flow_init(tun); [...] err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI); if (err < 0) goto err_free_flow; err = register_netdevice(tun->dev); if (err < 0) goto err_detach; } netif_carrier_on(tun->dev); [...] return 0; [...] }
Starting off, the function checks if a device with the provided name already exists, and if so, associates the opened file with the already existing device. If no existing device is found, a new one gets allocated, initialised, attached to the TUN file and registered.
This means that the virtual network device can now be seen and interacted with using userspace tools such as ifconfig or ip.
Let's have a look at one of the network device initialization functions, tun_net_init.
static void tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; [...] break; case IFF_TAP: [...] break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; }
The most notable thing in this function is the assignment of the tun_netdev_ops. We will get back to those in a little while.
Sending a packet
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; struct tun_pcpu_stats *stats; int good_linear; int copylen; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tun); if (!(tun->dev->flags & IFF_UP)) return -EIO; [...] if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { [...] } else { if (!zerocopy) { [...] } if (frags) { [...] } else { skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } if (IS_ERR(skb)) { [...] return PTR_ERR(skb); } if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { [...]
return -EFAULT; } } if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { [...] } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); return -EINVAL; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: [...] break; } [...] skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); if (skb_xdp) { [...] } [...] if (frags) { [...] } else if (tfile->napi_enabled) { [...] } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { [...] } else { netif_rx_ni(skb); } [...] return total_len; }
Basically what's happening is this:
1. An skb structure gets allocated (tun_alloc_skb)
2. The userspace data gets copied into the skb (skb_copy_datagram_from_iter)
3. The skb gets sent into the kernel network stack (netif_rx_ni)
Receiving a packet
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct tun_file *tfile; int len = skb->len; [...] skb_orphan(skb); nf_reset(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; [...] }
Here we can see that an skb sent through the TUN device actually gets queued (ptr_ring_produce), and processes waiting on a blocking read operation get notified. (tun_net_xmit is the transmit handler that the kernel reaches through the tun_netdev_ops assigned in tun_net_init.) The network packet is now ready to be read and processed by the userspace program. This leads us to the TUN file read functions. The most interesting function is probably tun_do_read:
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; tun_debug(KERN_INFO, tun, "tun_do_read\n"); [...] if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_buff(ptr)) { [...] } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; }
This function is basically the opposite of the write function we looked at earlier. An skb gets dequeued (tun_ring_recv) and sent to userspace (tun_put_user). Lastly, the skb gets released (consume_skb).
Summary
- A userspace program creates a virtual TUN network device by using the open and ioctl system calls on a special misc device located at /dev/net/tun
- The userspace program can then insert data into the network stack by writing to its opened TUN file, or
- It can read network data from the kernel by reading from the opened TUN file