本文整理下之前的学习笔记,基于DPDK17.11版本源码分析。
主要看一下DPDK探测网卡设备,并进行初始化的流程,用到了类似kernel中的总线-设备-驱动模型。
本文的重点之一是DPDK如何在用户态操作网卡寄存器,这里先给个答案: 想要操作网卡寄存器,需要用到网卡的基地址BAR,intel网卡一般使用BAR0就行,通过mmap此文件/sys/bus/pci/devices/'pci address'/resource'map_idx'就可以在用户态得到BAR0对应的虚拟地址,此虚拟地址加上寄存器的偏移即可读取/设置网卡寄存器。此文件是kernel为pci设备创建,提供了底层的mmap实现。
首先看两个宏定义,利用gcc的属性__attribute__((constructor))定义的函数,可以在main函数执行前运行,还可以通过__attribute__((constructor(prio)))指定优先级,prio从101开始(0~100为实现保留),值越小越早执行。不指定优先级的版本优先级更低,会在所有带优先级的构造函数之后执行。
/**
 * Run function before main() with low priority.
 *
 * The constructor will be run after prioritized constructors.
 *
 * Note: the "used" attribute prevents the compiler/linker from
 * discarding the constructor even though nothing references it
 * explicitly.
 *
 * @param func
 * Constructor function.
 */
#define RTE_INIT(func) \
static void __attribute__((constructor, used)) func(void)
/**
 * Run function before main() with high priority.
 *
 * Priorities 0-100 are reserved by the GCC implementation, hence the
 * "above 100" requirement below. Prioritized constructors run before
 * the plain RTE_INIT ones.
 *
 * @param func
 * Constructor function.
 * @param prio
 * Priority number must be above 100.
 * Lowest number is the first to run.
 */
#define RTE_INIT_PRIO(func, prio) \
static void __attribute__((constructor(prio), used)) func(void)
RTE_PMD_REGISTER_PCI
宏RTE_PMD_REGISTER_PCI使用上面的宏RTE_INIT注册驱动。
/** Helper for PCI device registration from driver (eth, crypto) instance.
 *
 * Expands to an unprioritized constructor (via RTE_INIT) that stamps the
 * driver's name and registers it on the PCI bus before main() runs.
 *
 * NOTE(review): the upstream macro continues past the closing brace
 * (export-name boilerplate elided here); the dangling trailing '\' has
 * been removed so this snippet stands alone and does not splice the
 * following definition into the macro.
 */
#define RTE_PMD_REGISTER_PCI(nm, pci_drv) \
RTE_INIT(pciinitfn_ ##nm); \
static void pciinitfn_ ##nm(void) \
{\
	(pci_drv).driver.name = RTE_STR(nm);\
	rte_pci_register(&pci_drv); \
}
/* ixgbe PMD descriptor; handed to the PCI bus via RTE_PMD_REGISTER_PCI. */
static struct rte_pci_driver rte_ixgbe_pmd = {
.id_table = pci_id_ixgbe_map, /* table of NIC ids (vendor/device) this driver supports */
.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC |
RTE_PCI_DRV_IOVA_AS_VA,
.probe = eth_ixgbe_pci_probe, /* called to initialize a device once it matches this driver */
.remove = eth_ixgbe_pci_remove,
};
//注册ixgbe网卡驱动(即上面定义的 rte_ixgbe_pmd;igb驱动同理: RTE_PMD_REGISTER_PCI(net_e1000_igb, rte_igb_pmd))
RTE_PMD_REGISTER_PCI(net_ixgbe, rte_ixgbe_pmd);
RTE_REGISTER_BUS
RTE_REGISTER_BUS 在main函数执行前注册bus,插入全局链表 rte_bus_list。
//优先级为101,rte_log_init在main函数之前第一个被执行
/* Logging should be first initializer (before drivers and bus) */
RTE_INIT_PRIO(rte_log_init, 101);
//优先级为110,用于注册bus
/**
 * Helper for Bus registration.
 * The constructor has higher priority than PMD constructors.
 *
 * Expands to a priority-110 constructor: it runs after the logging
 * init (priority 101) but before the unprioritized RTE_INIT PMD
 * constructors, so every bus is on rte_bus_list by the time drivers
 * register.
 *
 * @param nm  Bus name (stringified into bus.name).
 * @param bus Bus descriptor to register.
 */
#define RTE_REGISTER_BUS(nm, bus) \
RTE_INIT_PRIO(businitfn_ ##nm, 110); \
static void businitfn_ ##nm(void) \
{\
(bus).name = RTE_STR(nm);\
rte_bus_register(&bus); \
}
//注册 vdev 总线
RTE_REGISTER_BUS(vdev, rte_vdev_bus);
/* Global PCI bus descriptor; the generic rte_bus callbacks below are
 * invoked from rte_bus_scan()/rte_bus_probe() during rte_eal_init(). */
struct rte_pci_bus rte_pci_bus = {
.bus = {
.scan = rte_pci_scan, /* walk /sys/bus/pci/devices and fill device_list */
.probe = rte_pci_probe, /* match scanned devices against registered drivers */
.find_device = pci_find_device,
.plug = pci_plug,
.unplug = pci_unplug,
.parse = pci_parse,
.get_iommu_class = rte_pci_get_iommu_class,
},
.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list), /* devices found by scan */
.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list), /* drivers added by RTE_PMD_REGISTER_PCI */
};
//注册 pci 总线
RTE_REGISTER_BUS(pci, rte_pci_bus.bus);
rte_eal_init
rte_eal_init为DPDK程序的环境抽象层,主要进行CPU,内存和网卡等的初始化。本文重点介绍网卡相关。
int
rte_eal_init(int argc, char **argv)
//遍历全局链表rte_bus_list扫描总线,对于pci总线来说,调用 rte_pci_scan
rte_bus_scan()
//遍历全局链表rte_bus_list探测总线,对于pci总线来说,调用 rte_pci_probe
rte_bus_probe()
rte_pci_scan
/*
* Scan the content of the PCI bus, and the devices in the devices
* list
*/
int
rte_pci_scan(void)
struct dirent *e;
DIR *dir;
char dirname[PATH_MAX];
struct rte_pci_addr addr;
//打开目录/sys/bus/pci/devices, 扫描设备,这里用的是 struct rte_pci_device
dir = opendir(rte_pci_get_sysfs_path());
while ((e = readdir(dir)) != NULL) {
//此目录下以pci地址为子目录,所以通过解析子目录名字获取pci地址
parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr)
//组合成设备目录: /sys/bus/pci/devices/0000:81:00.0
snprintf(dirname, sizeof(dirname), "%s/%s", rte_pci_get_sysfs_path(), e->d_name);
pci_scan_one(dirname, &addr)
struct rte_pci_device *dev;
dev = malloc(sizeof(*dev));
dev->addr = *addr;
//读取网卡设备目录下的文件,获取 vendor id
/* get vendor id */
snprintf(filename, sizeof(filename), "%s/vendor", dirname);
eal_parse_sysfs_value(filename, &tmp);
dev->id.vendor_id = (uint16_t)tmp;
//同样的读取目录,获取 device id,subsystem_vendor id等信息,保存到 dev 中
/* get device id */
/* get subsystem_vendor id */
/* get subsystem_device id */
/* get class_id */
/* get max_vfs */
/* get numa node, default to 0 if not present */
//解析设备目录下的 /sys/bus/pci/devices/'pci address'/resource 文件,
//获取物理地址和长度,虚拟地址会在 rte_pci_probe 时进行映射,映射成功后就可以在用户态操作设备的寄存器
/* parse resources */
snprintf(filename, sizeof(filename), "%s/resource", dirname);
pci_parse_sysfs_resource(filename, dev)
for (i = 0; i<PCI_MAX_RESOURCE; i++) {
if (flags & IORESOURCE_MEM) { //只关注 mem 类型
dev->mem_resource[i].phys_addr = phys_addr;
dev->mem_resource[i].len = end_addr - phys_addr + 1;
/* not mapped for now */
dev->mem_resource[i].addr = NULL;
}
}
//解析当前网卡绑定的驱动名字
/* parse driver */
snprintf(filename, sizeof(filename), "%s/driver", dirname);
pci_get_kernel_driver_by_path(filename, driver);
if (!strcmp(driver, "vfio-pci"))
dev->kdrv = RTE_KDRV_VFIO;
else if (!strcmp(driver, "igb_uio"))
dev->kdrv = RTE_KDRV_IGB_UIO;
else if (!strcmp(driver, "uio_pci_generic"))
dev->kdrv = RTE_KDRV_UIO_GENERIC;
else
dev->kdrv = RTE_KDRV_UNKNOWN;
//将扫描到的设备 pci_dev 按pci地址顺序插入全局链表 rte_pci_bus.device_list,
//pci地址小的插入链表前面
/* device is valid, add in list (sorted) */
if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
rte_pci_add_device(dev);
//将扫描到的设备添加到链表 rte_pci_bus.device_list
TAILQ_INSERT_TAIL(&rte_pci_bus.device_list, pci_dev, next);
} else {
struct rte_pci_device *dev2;
int ret;
TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
//返回值大于0说明要插入的设备pci地址大于 dev2
ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
if (ret > 0)
continue;
//返回值小于0说明要插入的设备pci地址小于 dev2,则将dev插入到dev2的前面
if (ret < 0) {
rte_pci_insert_device(dev2, dev);
} else { /* already registered */
//返回值相等,说明是同一个设备,更新下参数即可。
dev2->kdrv = dev->kdrv;
dev2->max_vfs = dev->max_vfs;
pci_name_set(dev2);
memmove(dev2->mem_resource, dev->mem_resource, sizeof(dev->mem_resource));
free(dev);
}
return 0;
}
//遍历完链表,没找到比dev小的,则将dev插入链表最后
rte_pci_add_device(dev);
}
}
rte_pci_probe
/*
* Scan the content of the PCI bus, and call the probe() function for
* all registered drivers that have a matching entry in its id_table
* for discovered devices.
*/
int
rte_pci_probe(void)
struct rte_pci_device *dev = NULL;
//遍历设备链表rte_pci_bus.device_list,rte_pci_scan 阶段会把扫描到的设备插入此链表
//TAILQ_FOREACH(p, &(rte_pci_bus.device_list), next)
FOREACH_DEVICE_ON_PCIBUS(dev)
struct rte_pci_driver *dr = NULL;
//遍历驱动链表rte_pci_bus.driver_list,调用 RTE_PMD_REGISTER_PCI时将驱动插入此链表
//TAILQ_FOREACH(p, &(rte_pci_bus.driver_list), next)
FOREACH_DRIVER_ON_PCIBUS(dr)
rte_pci_probe_one_driver(dr, dev);
//vendor_id,device_id,subsystem_vendor_id,subsystem_device_id,class_id
//设备和驱动的这几项必须匹配
rte_pci_match(dr, dev);
const struct rte_pci_id *id_table;
//遍历驱动的 id_table,此table记录下驱动支持的网卡类型
for (id_table = pci_drv->id_table; id_table->vendor_id != 0; id_table++) {
/* check if device's identifiers match the driver's ones */
if (id_table->vendor_id != pci_dev->id.vendor_id &&
id_table->vendor_id != PCI_ANY_ID)
continue;
if (id_table->device_id != pci_dev->id.device_id &&
id_table->device_id != PCI_ANY_ID)
continue;
if (id_table->subsystem_vendor_id !=
pci_dev->id.subsystem_vendor_id &&
id_table->subsystem_vendor_id != PCI_ANY_ID)
continue;
if (id_table->subsystem_device_id !=
pci_dev->id.subsystem_device_id &&
id_table->subsystem_device_id != PCI_ANY_ID)
continue;
if (id_table->class_id != pci_dev->id.class_id &&
id_table->class_id != RTE_CLASS_ANY_ID)
continue;
return 1;
}
return 0;
//设备和驱动匹配上了。
//如果驱动设置了 RTE_PCI_DRV_NEED_MAPPING,则必须进行映射,
//即把设备物理地址映射成用户态的虚拟地址
if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING)
rte_pci_map_device(dev);
//如果网卡绑定到 vfio 驱动,则调用 pci_vfio_map_resource
pci_vfio_map_resource(dev);
//如果网卡绑定到 igb_uio 或者 uio_pci_generic,则调用 pci_uio_map_resource
/* map resources for devices that use igb_uio */
pci_uio_map_resource
struct mapped_pci_resource *uio_res = NULL;
struct mapped_pci_res_list *uio_res_list =
RTE_TAILQ_CAST(rte_uio_tailq.head, mapped_pci_res_list);
pci_uio_alloc_resource(dev, &uio_res);
uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1);
/* depending on kernel version, uio can be located in uio/uioX or uio:uioX */
//到目录下/sys/bus/pci/devices/'pci address'/,找到uio目录,
//获取uio number(网卡绑定到igb_uio驱动后,会创建此目录)
snprintf(dirname, sizeof(dirname),
"%s/" PCI_PRI_FMT "/uio", rte_pci_get_sysfs_path(),
loc->domain, loc->bus, loc->devid, loc->function);
//打开 /dev/uiox 设备,获取fd,并保存到dev->intr_handle.fd
snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
/* save fd if in primary process */
dev->intr_handle.fd = open(devname, O_RDWR);
//打开目录 /sys/class/uio/uio%u/device/config,获取fd,并保存
snprintf(cfgname, sizeof(cfgname), "/sys/class/uio/uio%u/device/config", uio_num);
dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
/* allocate the mapping details for secondary processes*/
*uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname);
memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr));
/* Map all BARs */
for (i = 0; i != PCI_MAX_RESOURCE; i++) {
/* skip empty BAR */
phaddr = dev->mem_resource[i].phys_addr;
if (phaddr == 0)
continue;
pci_uio_map_resource_by_index(dev, i, uio_res, map_idx);
int fd;
char devname[PATH_MAX];
void *mapaddr;
struct rte_pci_addr *loc;
struct pci_map *maps;
loc = &dev->addr;
maps = uio_res->maps;
//devname 为 /sys/bus/pci/devices/'pci address'/resource'map_idx'
/* update devname for mmap */
snprintf(devname, sizeof(devname),
"%s/" PCI_PRI_FMT "/resource%d",
rte_pci_get_sysfs_path(),
loc->domain, loc->bus, loc->devid,
loc->function, res_idx);
//分配内存用来保存path
maps[map_idx].path = rte_malloc(NULL, strlen(devname) + 1, 0);
//打开文件/sys/bus/pci/devices/'pci address'/resource'map_idx',进行mmap映射
fd = open(devname, O_RDWR);
/* try mapping somewhere close to the end of hugepages */
//pci_map_addr为全局变量,尝试映射到距离hugepages近的地方
if (pci_map_addr == NULL)
pci_map_addr = pci_find_max_end_va();
//mmap映射,返回映射成功的虚拟地址
mapaddr = pci_map_resource(pci_map_addr, fd, 0, (size_t)dev->mem_resource[res_idx].len, 0);
//映射成功后,将 pci_map_addr 指向 mapaddr+len 的虚拟地址,下次循环映射
pci_map_addr = RTE_PTR_ADD(mapaddr, (size_t)dev->mem_resource[res_idx].len);
//将映射后的虚拟地址保存下来,以后就可以使用此虚拟地址操作设备
maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
maps[map_idx].size = dev->mem_resource[res_idx].len;
maps[map_idx].addr = mapaddr;
maps[map_idx].offset = 0;
strcpy(maps[map_idx].path, devname);
//最后将虚拟地址保存到 dev 中
dev->mem_resource[res_idx].addr = mapaddr;
//将uio_res插入全局链表 uio_res_list,
//主要用于secondary进程遍历此链表映射相同的虚拟地址
TAILQ_INSERT_TAIL(uio_res_list, uio_res, next);
/* reference driver structure */
dev->driver = dr;
dev->device.driver = &dr->driver;
//调用驱动的probe函数,比如 eth_ixgbe_pci_probe
dr->probe(dr, dev);
//rte_eth_dev_pci_generic_probe是个通用的封装函数,用来申请 rte_eth_dev 内存,
//成功后,调用驱动特有的callback dev_init,比如 eth_ixgbe_dev_init
//sizeof(struct ixgbe_adapter)为驱动特有的私有数据
rte_eth_dev_pci_generic_probe(pci_dev, sizeof(struct ixgbe_adapter), eth_ixgbe_dev_init);
struct rte_eth_dev *eth_dev;
eth_dev = rte_eth_dev_pci_allocate(pci_dev, private_data_size);
name = dev->device.name;
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
eth_dev = rte_eth_dev_allocate(name);
//找到第一个可用的portid
port_id = rte_eth_dev_find_free_port();
//rte_eth_dev_data 是一个多进程共享变量
if (rte_eth_dev_data == NULL)
rte_eth_dev_data_alloc();
memset(&rte_eth_dev_data[port_id], 0, sizeof(struct rte_eth_dev_data));
eth_dev = eth_dev_get(port_id);
snprintf(eth_dev->data->name, sizeof(eth_dev->data->name), "%s", name);
eth_dev->data->port_id = port_id;
eth_dev->data->mtu = ETHER_MTU;
if (private_data_size) {
//分配驱动私有数据占用内存
eth_dev->data->dev_private = rte_zmalloc_socket(name,
private_data_size, RTE_CACHE_LINE_SIZE,
dev->device.numa_node);
}
} else {
eth_dev = rte_eth_dev_attach_secondary(name);
}
//调用 eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
dev_init(eth_dev);
struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
struct rte_intr_handle *intr_handle = &pci_dev->intr_handle;
struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
eth_dev->dev_ops = &ixgbe_eth_dev_ops;
eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
eth_dev->tx_pkt_prepare = &ixgbe_prep_pkts;
//将 bar0 的虚拟地址赋值到 hw->hw_addr,以后就可以通过 hw->hw_addr 读取/设置寄存器了
hw->hw_addr = (void *)pci_dev->mem_resource[0].addr;
//注册中断处理函数
rte_intr_callback_register(intr_handle, ixgbe_dev_interrupt_handler, eth_dev);
//使能网卡pci层的中断
/* enable uio/vfio intr/eventfd mapping */
rte_intr_enable(intr_handle);
//设置网卡寄存器,使能网卡中断
/* enable support intr */
ixgbe_enable_intr(eth_dev);