enum netdev_state_t {
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_NOCARRIER,
__LINK_STATE_LINKWATCH_PENDING,
__LINK_STATE_DORMANT,
};
An Ethernet frame consists of the following fields:
• Seven-byte preamble
• Start frame delimiter (SFD)
• Two address fields
• Length or type field
• Data field
• Frame check sequence (FCS, a CRC value)
The preamble and SFD are not counted as part of the actual packet; they are sent on the wire with every frame, but tcpdump or wireshark cannot capture them. The same goes for the FCS.
A pad of 0 to 46 bytes is appended when the payload is too short, to bring the frame up to the minimum length.
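As a quick check on the padding rule (a minimal sketch, not driver code; the function name is made up): the 802.3 minimum frame is 64 bytes including the 4-byte FCS, so with a 14-byte header the payload must be at least 46 bytes and anything shorter is padded:

static inline unsigned int eth_pad_len(unsigned int payload_len)
{
	/* 46 = 64-byte minimum frame - 14-byte header - 4-byte FCS */
	return payload_len >= 46 ? 0 : 46 - payload_len;
}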


VLAN frame

Pause frame: the receiving device generates a pause frame to signal congestion to the sender, which should then stop transmitting.
Pause frame format: the type field is 0x8808, followed by a 2-byte opcode, which should be 0x0001.
The pad of a pause frame is always 42 bytes.
P1 and P2 are the pause values, which decide whether transmission is paused or allowed to resume.
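Putting that together, the on-wire layout of a pause frame looks roughly like this (an illustrative sketch; the struct and field names are not from the driver). The destination is the reserved multicast address 01:80:C2:00:00:01, and since header + opcode + pause time add up to 18 bytes, 42 bytes of padding bring the frame to the 60-byte minimum (FCS excluded):

struct pause_frame {
	unsigned char  dest[6];      /* 01:80:C2:00:00:01, MAC control multicast address */
	unsigned char  src[6];       /* sender's MAC address */
	unsigned short type;         /* 0x8808: MAC control frame */
	unsigned short opcode;       /* 0x0001: PAUSE */
	unsigned short pause_time;   /* pause quanta (the P1/P2 values above) */
	unsigned char  pad[42];      /* always 42 bytes */
} __attribute__((packed));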

The MAC receive engine performs the following tasks:
• Check frame framing
• Remove frame preamble and frame SFD field
• Discard frame based on frame destination address field
• Terminate pause frames
• Check frame length
• Remove payload padding if it exists
• Calculate and verify CRC-32
• Write received frames in the core receive FIFO

The i.MX6ULL uses the table below to describe a buffer.
The data buffer pointer holds the address of the buffer; the buffer must be located in external memory.


The enhanced buffer descriptors correspond to struct bufdesc in the driver.
struct bufdesc {
	__fec16 cbd_sc;      /* Control and status info */
	__fec16 cbd_datlen;  /* Data length */
	__fec32 cbd_bufaddr; /* Buffer address */
};

When a packet is received, the NIC DMAs it into a ring buffer in memory and raises an interrupt to tell the driver that data has arrived; likewise, on transmit the driver writes the packet into the ring buffer and tells the NIC to copy it out. So both receive and transmit stage packets temporarily in a ring buffer. How, then, are packets laid out in the ring buffer?
The ring buffer is just a queue; a queue is first-in first-out, so the NIC and the driver form a producer/consumer pair.
On the i.MX6ULL, entries in the queue take the form of struct bufdesc, and the queue size is computed from that structure: one queue holds a certain number of struct bufdesc entries.
struct sk_buff is the kernel's structure for describing a packet; the i.MX6ULL driver presumably converts an sk_buff into a bufdesc.
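As a rough illustration of that conversion (a simplified sketch under the assumption that it mirrors what the driver's transmit path does; it is not a copy of fec_enet_txq_submit_skb), handing an sk_buff to the hardware boils down to DMA-mapping skb->data and recording the address, length and a ready flag in the descriptor:

static int sketch_fill_tx_bd(struct device *dev, struct bufdesc *bdp,
			     struct sk_buff *skb)
{
	dma_addr_t addr;

	/* streaming DMA mapping of the packet data, CPU -> device direction */
	addr = dma_map_single(dev, skb->data, skb->len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, addr))
		return -ENOMEM;

	bdp->cbd_bufaddr = cpu_to_fec32(addr);     /* where the MAC reads from */
	bdp->cbd_datlen  = cpu_to_fec16(skb->len); /* how many bytes to send */
	/* "ready" hands the descriptor over to the MAC; "last" marks the final fragment */
	bdp->cbd_sc = cpu_to_fec16(BD_ENET_TX_READY | BD_ENET_TX_LAST);
	return 0;
}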

static int fec_probe(struct platform_device *pdev)
{
struct fec_enet_private *fep;
struct fec_platform_data *pdata;
struct net_device *ndev;
int i, irq, ret = 0;
const struct of_device_id *of_id;
static int dev_id;
struct device_node *np = pdev->dev.of_node, *phy_node;
int num_tx_qs;
int num_rx_qs;
char irq_name[8];
int irq_cnt;
fec_enet_get_queue_num(pdev, &num_tx_qs, &num_rx_qs); //get the number of TX and RX queues from the device tree
/* Init network device */
ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private) + //allocate the netdev
FEC_STATS_SIZE, num_tx_qs, num_rx_qs);
if (!ndev)
return -ENOMEM;
SET_NETDEV_DEV(ndev, &pdev->dev); //set netdev->dev.parent = &pdev->dev
/* setup board info structure */
fep = netdev_priv(ndev); //private data = (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
of_id = of_match_device(fec_dt_ids, &pdev->dev); //platform-specific match data
if (of_id)
pdev->id_entry = of_id->data;
fep->quirks = pdev->id_entry->driver_data;
fep->netdev = ndev;
fep->num_rx_queues = num_rx_qs;
fep->num_tx_queues = num_tx_qs;
#if !defined(CONFIG_M5272)
/* default enable pause frame auto negotiation */
if (fep->quirks & FEC_QUIRK_HAS_GBIT)
fep->pause_flag |= FEC_PAUSE_FLAG_AUTONEG;
#endif
/* Select default pin state */
pinctrl_pm_select_default_state(&pdev->dev);
fep->hwp = devm_platform_ioremap_resource(pdev, 0); //get and ioremap the resource: the MAC register base address
if (IS_ERR(fep->hwp)) {
ret = PTR_ERR(fep->hwp);
goto failed_ioremap;
}
fep->pdev = pdev;
fep->dev_id = dev_id++;
platform_set_drvdata(pdev, ndev);
if ((of_machine_is_compatible("fsl,imx6q") ||
of_machine_is_compatible("fsl,imx6dl")) &&
!of_property_read_bool(np, "fsl,err006687-workaround-present"))
fep->quirks |= FEC_QUIRK_ERR006687;
fec_enet_of_parse_stop_mode(pdev); //parse the stop mode from the device tree: stop-mode = <&gpr 0x10 4>;
ret = fec_enet_ipc_handle_init(fep);
if (ret)
goto failed_ipc_init;
if (of_get_property(np, "fsl,magic-packet", NULL)) //魔术包
fep->wol_flag |= FEC_WOL_HAS_MAGIC_PACKET;
if (of_get_property(np, "fsl,rgmii_txc_dly", NULL))
fep->rgmii_txc_dly = true;
if (of_get_property(np, "fsl,rgmii_rxc_dly", NULL))
fep->rgmii_rxc_dly = true;
phy_node = of_parse_phandle(np, "phy-handle", 0); //parse the phy node from the device tree
if (!phy_node && of_phy_is_fixed_link(np)) {
ret = of_phy_register_fixed_link(np);
if (ret < 0) {
dev_err(&pdev->dev,
"broken fixed-link specification\n");
goto failed_phy;
}
phy_node = of_node_get(np);
}
fep->phy_node = phy_node;
ret = of_get_phy_mode(pdev->dev.of_node); //get the PHY interface mode (sgmii, rgmii, ...); returns an enum value
if (ret < 0) {
pdata = dev_get_platdata(&pdev->dev);
if (pdata)
fep->phy_interface = pdata->phy;
else
fep->phy_interface = PHY_INTERFACE_MODE_MII;
} else {
fep->phy_interface = ret;
}
request_bus_freq(BUS_FREQ_HIGH);
//a block of clock-related code is omitted here
fep->clk_ipg = devm_clk_get(&pdev->dev, "ipg");
if (IS_ERR(fep->clk_ipg)) {
ret = PTR_ERR(fep->clk_ipg);
goto failed_clk;
}
......
fep->reg_phy = devm_regulator_get_optional(&pdev->dev, "phy");
if (!IS_ERR(fep->reg_phy)) {
ret = regulator_enable(fep->reg_phy);
if (ret) {
dev_err(&pdev->dev,
"Failed to enable phy regulator: %d\n", ret);
goto failed_regulator;
}
} else {
if (PTR_ERR(fep->reg_phy) == -EPROBE_DEFER) {
ret = -EPROBE_DEFER;
goto failed_regulator;
}
fep->reg_phy = NULL;
}
......
ret = fec_reset_phy(pdev); //reset the PHY via a GPIO (hardware reset)
if (ret)
goto failed_reset;
irq_cnt = fec_enet_get_irq_cnt(pdev); //get the number of IRQs
if (fep->bufdesc_ex)
fec_ptp_init(pdev, irq_cnt);
ret = fec_enet_init(ndev); //DMA queue memory allocation, netdev_ops, ethtool_ops, NAPI poll setup, MAC reset and init
if (ret)
goto failed_init;
for (i = 0; i < irq_cnt; i++) { //get the IRQs
snprintf(irq_name, sizeof(irq_name), "int%d", i);
irq = platform_get_irq_byname_optional(pdev, irq_name); //look the IRQ up on the platform bus by name
if (irq < 0)
irq = platform_get_irq(pdev, i); //fall back to getting the IRQ from the device tree by index
if (irq < 0) {
ret = irq;
goto failed_irq;
}
ret = devm_request_irq(&pdev->dev, irq, fec_enet_interrupt,
0, pdev->name, ndev); //request the interrupt, handler fec_enet_interrupt
if (ret) //a new packet arriving, a completed transmit, etc. all raise this interrupt; the handler dispatches on the event type
goto failed_irq;
fep->irq[i] = irq;
}
/*
wake-up interrupt 0
interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>;
*/
/* get wake up irq */
ret = of_property_read_u32(np, "fsl,wakeup_irq", &irq);
if (!ret && irq < irq_cnt)
fep->wake_irq = fep->irq[irq];
else
fep->wake_irq = fep->irq[0];
init_completion(&fep->mdio_done);
/* board only enable one mii bus in default */
if (!of_get_property(np, "fsl,mii-exclusive", NULL))
fep->quirks |= FEC_QUIRK_SINGLE_MDIO;
//MDIO initialization
ret = fec_enet_mii_init(pdev);
if (ret)
goto failed_mii_init;
/* Carrier starts down, phylib will bring it up */
netif_carrier_off(ndev); //tell the kernel the carrier is down for now; phylib will bring the link up later
fec_enet_clk_enable(ndev, false);
pinctrl_pm_select_sleep_state(&pdev->dev);
ret = register_netdev(ndev); //register the netdev
if (ret)
goto failed_register;
device_init_wakeup(&ndev->dev, fep->wol_flag &
FEC_WOL_HAS_MAGIC_PACKET);
if (fep->bufdesc_ex && fep->ptp_clock)
netdev_info(ndev, "registered PHC device %d\n", fep->dev_id);
fep->rx_copybreak = COPYBREAK_DEFAULT;
INIT_WORK(&fep->tx_timeout_work, fec_enet_timeout_work);
pm_runtime_mark_last_busy(&pdev->dev);
pm_runtime_put_autosuspend(&pdev->dev);
return 0;
failed_register:
fec_enet_mii_remove(fep);
failed_mii_init:
failed_irq:
failed_init:
fec_ptp_stop(pdev);
if (fep->reg_phy)
regulator_disable(fep->reg_phy);
failed_reset:
pm_runtime_put_noidle(&pdev->dev);
pm_runtime_disable(&pdev->dev);
failed_regulator:
clk_disable_unprepare(fep->clk_ahb);
failed_clk_ahb:
clk_disable_unprepare(fep->clk_ipg);
failed_clk_ipg:
fec_enet_clk_enable(ndev, false);
failed_clk:
release_bus_freq(BUS_FREQ_HIGH);
if (of_phy_is_fixed_link(np))
of_phy_deregister_fixed_link(np);
of_node_put(phy_node);
failed_ipc_init:
failed_phy:
dev_id--;
failed_ioremap:
free_netdev(ndev);
return ret;
}
What happens here is the allocation of the netdev, some generic Ethernet setup in ether_setup(), the allocation of the private data area, and the allocation of the netdev TX/RX queue structures (the later fec_enet_init() then does the DMA allocation for the txq/rxq descriptors; that is the real ring buffer. What the queues allocated in alloc_etherdev_mqs() are for is not yet clear to me).
The netdev and the private area are allocated in one chunk of memory, with 32-byte alignment applied both before and after the netdev.
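The private area sits directly after the 32-byte-aligned net_device, which is why netdev_priv() is nothing but pointer arithmetic (this is the stock helper from include/linux/netdevice.h):

static inline void *netdev_priv(const struct net_device *dev)
{
	return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}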

//linux-5.4.47\net\ethernet\eth.c
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
unsigned int rxqs)
{
return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
ether_setup, txqs, rxqs); //ether_setup: sets a few generic netdev defaults
}
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
unsigned int alloc_size;
struct net_device *p;
BUG_ON(strlen(name) >= sizeof(dev->name)); //if the requested name is longer than the netdev->name buffer, BUG_ON fires
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
alloc_size = sizeof(struct net_device); //start with the size of the netdev itself
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); //round the netdev size up to 32 bytes
alloc_size += sizeof_priv; //then add the private data size
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1; //alloc_size + 32 - 1
p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!p)
return NULL;
dev = PTR_ALIGN(p, NETDEV_ALIGN); //align p up to 32 bytes and use that as the netdev address
dev->padded = (char *)dev - (char *)p; //padding between the start of the allocation and the netdev
dev->pcpu_refcnt = alloc_percpu(int); //per-cpu reference count of the netdev
if (!dev->pcpu_refcnt)
goto free_dev;
if (dev_addr_init(dev)) //initialize the MAC address list
goto free_pcpu;
dev_mc_init(dev); //initialize the multicast MAC address list
dev_uc_init(dev); //initialize the unicast MAC address list
dev_net_set(dev, &init_net);
netdev_register_lockdep_key(dev);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->upper_level = 1;
dev->lower_level = 1;
INIT_LIST_HEAD(&dev->napi_list); //initialize the list heads
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev)) //allocate the TX queues
goto free_all;
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev)) //allocate the RX queues
goto free_all;
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops; //set the default ethtool ops
nf_hook_ingress_init(dev);
return dev;
free_all:
free_netdev(dev);
return NULL;
free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
netdev_freemem(dev);
return NULL;
}
fec_enet_init() mainly allocates the DMA memory for the descriptor queues and sets up netdev_ops, ethtool_ops and the NAPI poll function.
struct bufdesc is the buffer descriptor; it describes a buffer that holds data. The memory for these struct bufdesc entries has to be allocated with a coherent DMA mapping.
The later open path allocates bufdesc->cbd_bufaddr; that pointer is what actually refers to the memory holding the data being transferred, and it uses a streaming DMA mapping.
static int fec_enet_init(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct bufdesc *cbd_base; //struct bufdesc: describes one buffer
dma_addr_t bd_dma;
int bd_size;
unsigned int i;
unsigned dsize = fep->bufdesc_ex ? sizeof(struct bufdesc_ex) :
sizeof(struct bufdesc); //size of one buffer descriptor
unsigned dsize_log2 = __fls(dsize);
int ret;
......
/* Check mask of the streaming and coherent API */
//DMA mask setup: the DMA addressing capability (e.g. 24 bits can address 16MB, 32 bits 4GB); tells the kernel how many address bits the DMA supports, usually 32
ret = dma_set_mask_and_coherent(&fep->pdev->dev, DMA_BIT_MASK(32));
if (ret < 0) {
dev_warn(&fep->pdev->dev, "No suitable DMA available\n");
return ret;
}
/*
TSO (TCP Segmentation Offload): the NIC segments large packets itself, lowering the CPU load.
GSO (Generic Segmentation Offload): before sending, check whether the NIC supports TSO; if it does, let the NIC segment,
otherwise the protocol stack segments the data and then hands it to the NIC to transmit.
*/
ret = fec_enet_alloc_queue(ndev); //set the TX and RX ring buffer sizes and allocate the TX TSO header memory
if (ret)
return ret;
//packets live in the ring buffer as struct bufdesc entries; total_tx_ring_size is the total number of struct bufdesc across the TX rings
bd_size = (fep->total_tx_ring_size + fep->total_rx_ring_size) * dsize;
/* Allocate memory for buffer descriptors. */
//allocate the txq/rxq buffer descriptors (bufdesc); these descriptors need a coherent DMA mapping
cbd_base = dmam_alloc_coherent(&fep->pdev->dev, bd_size, &bd_dma, //DMA allocation of the TX/RX descriptor memory
GFP_KERNEL);
if (!cbd_base) {
ret = -ENOMEM;
goto free_queue_mem;
}
/* Get the Ethernet address */
ret = fec_get_mac(ndev); //get the MAC address
if (ret)
goto free_queue_mem;
/* make sure MAC we just acquired is programmed into the hw */
fec_set_mac_address(ndev, NULL);
/* Set receive and transmit descriptor base. */
for (i = 0; i < fep->num_rx_queues; i++) {
struct fec_enet_priv_rx_q *rxq = fep->rx_queue[i];
unsigned size = dsize * rxq->bd.ring_size;
rxq->bd.qid = i; //queue id
rxq->bd.base = cbd_base; //start of the buffer descriptors: the kernel virtual address returned by the DMA allocation (used by the CPU)
rxq->bd.cur = cbd_base;
rxq->bd.dma = bd_dma; //the DMA address of the same allocation (used by the I/O device)
rxq->bd.dsize = dsize; //size of one buffer descriptor
rxq->bd.dsize_log2 = dsize_log2;
/* definition of offset_des_active_rxq: the "descriptor active" register offsets for the three queues
static const unsigned short offset_des_active_rxq[] = {
FEC_R_DES_ACTIVE_0, FEC_R_DES_ACTIVE_1, FEC_R_DES_ACTIVE_2
};
static const unsigned short offset_des_active_txq[] = {
FEC_X_DES_ACTIVE_0, FEC_X_DES_ACTIVE_1, FEC_X_DES_ACTIVE_2
};
*/
rxq->bd.reg_desc_active = fep->hwp + offset_des_active_rxq[i]; //register used to activate the queue
bd_dma += size;
cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
rxq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize); //last bufdesc of this receive queue
}
for (i = 0; i < fep->num_tx_queues; i++) {
struct fec_enet_priv_tx_q *txq = fep->tx_queue[i];
unsigned size = dsize * txq->bd.ring_size;
txq->bd.qid = i;
txq->bd.base = cbd_base; //after the cbd_base + size advance above, cbd_base now points at the first descriptor of this txq
txq->bd.cur = cbd_base;
txq->bd.dma = bd_dma;
txq->bd.dsize = dsize;
txq->bd.dsize_log2 = dsize_log2;
txq->bd.reg_desc_active = fep->hwp + offset_des_active_txq[i];
bd_dma += size;
cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
txq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize);
}
/* The FEC Ethernet specific entries in the device structure */
ndev->watchdog_timeo = TX_TIMEOUT; //transmit timeout
ndev->netdev_ops = &fec_netdev_ops; //netdev_ops
ndev->ethtool_ops = &fec_enet_ethtool_ops; //ethtool_ops
writel(FEC_RX_DISABLED_IMASK, fep->hwp + FEC_IMASK); //RX interrupt mask setup; the interrupt mask allows or masks interrupts
/* netif_napi_add
registers the NAPI poll callback: fec_enet_rx_napi
NAPI_POLL_WEIGHT: the NAPI weight; one NAPI poll may receive at most NAPI_POLL_WEIGHT packets, then it must return and leave
the rest for the next round, which keeps the driver's poll function from monopolising the softirq thread
*/
netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, NAPI_POLL_WEIGHT);
//set some NIC feature flags
if (fep->quirks & FEC_QUIRK_HAS_VLAN)
/* enable hw VLAN support */
ndev->features |= NETIF_F_HW_VLAN_CTAG_RX;
if (fep->quirks & FEC_QUIRK_HAS_CSUM) {
ndev->gso_max_segs = FEC_MAX_TSO_SEGS; //GSO segmentation limit
/* enable hw accelerator */
ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM
| NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_TSO);
fep->csum_flags |= FLAG_RX_CSUM_ENABLED;
}
if (fep->quirks & FEC_QUIRK_HAS_AVB) {
fep->tx_align = 0;
fep->rx_align = 0x3f;
}
ndev->hw_features = ndev->features;
fec_restart(ndev); //resets/configures some MAC registers and initializes the struct bufdesc entries in the txq and rxq
if (fep->quirks & FEC_QUIRK_MIB_CLEAR)
fec_enet_clear_ethtool_stats(ndev);
else
fec_enet_update_ethtool_stats(ndev);
return 0;
free_queue_mem:
fec_enet_free_queue(ndev);
return ret;
}
Memory allocated for DMA here must be physically contiguous, and it needs a coherent DMA allocation.
This function fixes the ring buffer size, i.e. how many struct bufdesc entries one ring can hold.
static int fec_enet_alloc_queue(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int i;
int ret = 0;
struct fec_enet_priv_tx_q *txq;
/*
vmalloc is generally for large, page-sized allocations; the virtual addresses are contiguous but the physical addresses may not be, and the size is not limited.
kmalloc allocates from the physically mapped region, so the memory is also physically contiguous; the size cannot exceed 128KB.
kzalloc is just kmalloc with the __GFP_ZERO flag added, so the memory is zeroed.
In general, memory only has to be physically contiguous when DMA will access it, but for performance the kernel mostly uses kmalloc(),
and vmalloc() only when a large block is needed. For example, when a module is loaded dynamically it is placed in memory allocated by vmalloc().
*/
for (i = 0; i < fep->num_tx_queues; i++) {
txq = kzalloc(sizeof(*txq), GFP_KERNEL);
if (!txq) {
ret = -ENOMEM;
goto alloc_failed;
}
fep->tx_queue[i] = txq; //describes one transmit queue
txq->bd.ring_size = TX_RING_SIZE; //size of one ring buffer
fep->total_tx_ring_size += fep->tx_queue[i]->bd.ring_size; //sum of all the ring buffer sizes
txq->tx_stop_threshold = FEC_MAX_SKB_DESCS;
txq->tx_wake_threshold =
(txq->bd.ring_size - txq->tx_stop_threshold) / 2;
txq->tso_hdrs = dma_alloc_coherent(&fep->pdev->dev, //DMA allocation for the TX TSO header buffer
txq->bd.ring_size * TSO_HEADER_SIZE,
&txq->tso_hdrs_dma,
GFP_KERNEL);
if (!txq->tso_hdrs) {
ret = -ENOMEM;
goto alloc_failed;
}
}
for (i = 0; i < fep->num_rx_queues; i++) {
fep->rx_queue[i] = kzalloc(sizeof(*fep->rx_queue[i]),
GFP_KERNEL);
if (!fep->rx_queue[i]) {
ret = -ENOMEM;
goto alloc_failed;
}
fep->rx_queue[i]->bd.ring_size = RX_RING_SIZE;
fep->total_rx_ring_size += fep->rx_queue[i]->bd.ring_size;
}
return ret;
alloc_failed:
fec_enet_free_queue(ndev);
return ret;
}
static void fec_restart(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
u32 val;
u32 temp_mac[2];
u32 rcntl = OPT_FRAME_SIZE | 0x04;
u32 ecntl = FEC_ENET_ETHEREN; /* ETHEREN */ //Ethernet enable
/* Whack a reset. We should wait for this.
* For i.MX6SX SOC, enet use AXI bus, we use disable MAC
* instead of reset MAC itself.
*/
if (fep->quirks & FEC_QUIRK_HAS_AVB) {
writel(0, fep->hwp + FEC_ECNTRL);
} else {
writel(1, fep->hwp + FEC_ECNTRL); //reset the MAC
udelay(10);
}
/*
* enet-mac reset will reset mac address registers too,
* so need to reconfigure it.
*/
memcpy(&temp_mac, ndev->dev_addr, ETH_ALEN); //temporary copy of the MAC address
writel((__force u32)cpu_to_be32(temp_mac[0]),
fep->hwp + FEC_ADDR_LOW);
writel((__force u32)cpu_to_be32(temp_mac[1]),
fep->hwp + FEC_ADDR_HIGH);
/* Clear any outstanding interrupt. */
writel(0xffffffff, fep->hwp + FEC_IEVENT); //interrupt event register, write 1 to clear
fec_enet_bd_init(ndev); //initialize the buffer descriptors
fec_enet_enable_ring(ndev); //enable the queues: write the queue addresses into the registers
/* Reset tx SKB buffers. */
fec_enet_reset_skb(ndev); //free the txq skbs
/* Enable MII mode */ //MDIO control related setup
if (fep->full_duplex == DUPLEX_FULL) {
/* FD enable */
writel(0x04, fep->hwp + FEC_X_CNTRL);
} else {
/* No Rcv on Xmit */
rcntl |= 0x02;
writel(0x0, fep->hwp + FEC_X_CNTRL);
}
/* Set MII speed */
writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED);
#if !defined(CONFIG_M5272)
if (fep->quirks & FEC_QUIRK_HAS_RACC) {
val = readl(fep->hwp + FEC_RACC);
/* align IP header */
val |= FEC_RACC_SHIFT16;
if (fep->csum_flags & FLAG_RX_CSUM_ENABLED)
/* set RX checksum */
val |= FEC_RACC_OPTIONS;
else
val &= ~FEC_RACC_OPTIONS;
writel(val, fep->hwp + FEC_RACC);
writel(PKT_MAXBUF_SIZE, fep->hwp + FEC_FTRL);
}
#endif
/*
* The phy interface and speed need to get configured
* differently on enet-mac: depending on the MAC controller, the PHY interface and speed have to be set up differently.
*/
if (fep->quirks & FEC_QUIRK_ENET_MAC) {
/* Enable flow control and length check */
rcntl |= 0x40000000 | 0x00000020;
/* RGMII, RMII or MII */
if (fep->phy_interface == PHY_INTERFACE_MODE_RGMII ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID)
rcntl |= (1 << 6);
else if (fep->phy_interface == PHY_INTERFACE_MODE_RMII)
rcntl |= (1 << 8);
else
rcntl &= ~(1 << 8);
/* 1G, 100M or 10M */
if (ndev->phydev) {
if (ndev->phydev->speed == SPEED_1000)
ecntl |= (1 << 5);
else if (ndev->phydev->speed == SPEED_100)
rcntl &= ~(1 << 9);
else
rcntl |= (1 << 9);
}
} else {
#ifdef FEC_MIIGSK_ENR
if (fep->quirks & FEC_QUIRK_USE_GASKET) {
u32 cfgr;
/* disable the gasket and wait */
writel(0, fep->hwp + FEC_MIIGSK_ENR);
while (readl(fep->hwp + FEC_MIIGSK_ENR) & 4)
udelay(1);
/*
* configure the gasket:
* RMII, 50 MHz, no loopback, no echo
* MII, 25 MHz, no loopback, no echo
*/
cfgr = (fep->phy_interface == PHY_INTERFACE_MODE_RMII)
? BM_MIIGSK_CFGR_RMII : BM_MIIGSK_CFGR_MII;
if (ndev->phydev && ndev->phydev->speed == SPEED_10)
cfgr |= BM_MIIGSK_CFGR_FRCONT_10M;
writel(cfgr, fep->hwp + FEC_MIIGSK_CFGR);
/* re-enable the gasket */
writel(2, fep->hwp + FEC_MIIGSK_ENR);
}
#endif
}
#if !defined(CONFIG_M5272)
/* enable pause frame
pause frames: when the ring buffer is full and the CPU cannot keep up, ifconfig shows rx overruns; the ring buffer has overflowed, so a pause frame is sent to tell the peer to stop sending to this device.
enable pause frame detection: when a pause frame is received, stop sending data to the peer.
*/
if ((fep->pause_flag & FEC_PAUSE_FLAG_ENABLE) ||
((fep->pause_flag & FEC_PAUSE_FLAG_AUTONEG) &&
ndev->phydev && ndev->phydev->pause)) {
rcntl |= FEC_ENET_FCE; //enable pause frame (flow control) handling on receive
/* set FIFO threshold parameter to reduce overrun */
writel(FEC_ENET_RSEM_V, fep->hwp + FEC_R_FIFO_RSEM);
writel(FEC_ENET_RSFL_V, fep->hwp + FEC_R_FIFO_RSFL);
writel(FEC_ENET_RAEM_V, fep->hwp + FEC_R_FIFO_RAEM);
writel(FEC_ENET_RAFL_V, fep->hwp + FEC_R_FIFO_RAFL);
/* OPD */
writel(FEC_ENET_OPD_V, fep->hwp + FEC_OPD);
} else {
rcntl &= ~FEC_ENET_FCE;
}
#endif /* !defined(CONFIG_M5272) */
writel(rcntl, fep->hwp + FEC_R_CNTRL);
/* Setup multicast filter. */
set_multicast_list(ndev);
#ifndef CONFIG_M5272
writel(0, fep->hwp + FEC_HASH_TABLE_HIGH);
writel(0, fep->hwp + FEC_HASH_TABLE_LOW);
#endif
if (fep->quirks & FEC_QUIRK_ENET_MAC) {
/* enable ENET endian swap */
ecntl |= (1 << 8);
/* enable ENET store and forward mode */
writel(1 << 8, fep->hwp + FEC_X_WMRK);
}
if (fep->bufdesc_ex)
ecntl |= (1 << 4);
if (fep->quirks & FEC_QUIRK_DELAYED_CLKS_SUPPORT &&
fep->rgmii_txc_dly)
ecntl |= FEC_ENET_TXC_DLY;
if (fep->quirks & FEC_QUIRK_DELAYED_CLKS_SUPPORT &&
fep->rgmii_rxc_dly)
ecntl |= FEC_ENET_RXC_DLY;
#ifndef CONFIG_M5272
/* Enable the MIB statistic event counters */
writel(0 << 31, fep->hwp + FEC_MIB_CTRLSTAT);
#endif
/* And last, enable the transmit and receive processing */
writel(ecntl, fep->hwp + FEC_ECNTRL);
fec_enet_active_rxring(ndev); //activate the receive queues
if (fep->bufdesc_ex)
fec_ptp_start_cyclecounter(ndev);
/* Enable interrupts we wish to service */
if (fep->link)
writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
else
writel(FEC_ENET_MII, fep->hwp + FEC_IMASK);
/* Init the interrupt coalescing */
fec_enet_itr_coal_init(ndev);
}
Initialization of the buffer descriptors in the queues:
static void fec_enet_bd_init(struct net_device *dev)
{
struct fec_enet_private *fep = netdev_priv(dev);
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
struct bufdesc *bdp;
unsigned int i;
unsigned int q;
for (q = 0; q < fep->num_rx_queues; q++) { //loop over all the queues
/* Initialize the receive buffer descriptors. */
rxq = fep->rx_queue[q];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) { //initialize all the buffer descriptors of one queue
/* Initialize the BD for every fragment in the page. */
/*
bdp->cbd_bufaddr points at the buffer that holds the data; at this point it has not been allocated yet, so it should be NULL and the else branch is taken.
fec_restart is also called from the open path; by then bdp->cbd_bufaddr has been allocated and the if branch is taken.
*/
if (bdp->cbd_bufaddr)
bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY); //mark the buffer as empty (no data yet)
else
bdp->cbd_sc = cpu_to_fec16(0); //clear the descriptor control and status info
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd); //move on to the next struct bufdesc
}
/* Set the last buffer to wrap */
bdp = fec_enet_get_prevdesc(bdp, &rxq->bd); //the last buffer descriptor of this queue
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP); //set the WRAP bit so the ring wraps back to its base descriptor
rxq->bd.cur = rxq->bd.base;
}
for (q = 0; q < fep->num_tx_queues; q++) {
/* ...and the same for transmit */
txq = fep->tx_queue[q];
bdp = txq->bd.base;
txq->bd.cur = bdp;
for (i = 0; i < txq->bd.ring_size; i++) {
/* Initialize the BD for every fragment in the page. */
bdp->cbd_sc = cpu_to_fec16(0); //clear the descriptor control and status info
if (bdp->cbd_bufaddr &&
!IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr))) //if cbd_bufaddr is set and is NOT inside the TSO header area, unmap it
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
fec16_to_cpu(bdp->cbd_datlen),
DMA_TO_DEVICE);
if (txq->tx_skbuff[i]) {
dev_kfree_skb_any(txq->tx_skbuff[i]); //free txq->tx_skbuff[i]
txq->tx_skbuff[i] = NULL;
}
bdp->cbd_bufaddr = cpu_to_fec32(0);
bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
}
/* Set the last buffer to wrap */
bdp = fec_enet_get_prevdesc(bdp, &txq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
txq->dirty_tx = bdp;
}
}
fec_enet_enable_ring enables the queues: it writes the base address of each queue into the ENETx_RDSR register.
The i.MX series supports up to three queues; the i.MX6ULL supports only one.

static void fec_enet_enable_ring(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
int i;
for (i = 0; i < fep->num_rx_queues; i++) { //the i.MX series supports up to 3 receive queues, the i.MX6ULL only 1; the base address of queue 0 lives in FEC_R_DES_START_0 (ENETx_RDSR)
rxq = fep->rx_queue[i];
writel(rxq->bd.dma, fep->hwp + FEC_R_DES_START(i)); //set the queue base address: write the ring buffer's DMA address into ENETx_RDSR
writel(PKT_MAXBUF_SIZE, fep->hwp + FEC_R_BUFF_SIZE(i)); //maximum receive buffer size
/* enable DMA1/2 */
if (i)
writel(RCMR_MATCHEN | RCMR_CMP(i), //with multiple queues, enable the other DMA channels
fep->hwp + FEC_RCMR(i));
}
for (i = 0; i < fep->num_tx_queues; i++) {
txq = fep->tx_queue[i];
writel(txq->bd.dma, fep->hwp + FEC_X_DES_START(i));
/* enable DMA1/2 */
if (i)
writel(DMA_CLASS_EN | IDLE_SLOPE(i),
fep->hwp + FEC_DMA_CFG(i));
}
}
MDIO controller initialization.
fec_enet_mii_init allocates a struct mii_bus and registers it with the kernel.
mii_bus describes an MDIO bus (this is the real, functional bus used to manage PHY devices and read/write PHY registers; struct bus_type mdio_bus_type only exists to match PHY devices with PHY drivers).
The most important members of mii_bus are mii_bus->read and mii_bus->write, which read and write the PHY registers.
These two functions are written by NXP, because every SoC's controller is different, so the vendor has to provide them.
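Once the bus is registered, phylib or the driver reaches those callbacks through the generic helpers; a minimal usage sketch (mdiobus_read and MII_PHYSID1/2 are stock kernel symbols, the wrapper function itself is made up):

/* Read the PHY ID of the PHY at address phy_addr; each mdiobus_read() ends up in fec_enet_mdio_read(). */
static u32 sketch_read_phy_id(struct mii_bus *bus, int phy_addr)
{
	int id1 = mdiobus_read(bus, phy_addr, MII_PHYSID1);
	int id2 = mdiobus_read(bus, phy_addr, MII_PHYSID2);

	if (id1 < 0 || id2 < 0)
		return 0xffffffff; /* read error */
	return ((u32)id1 << 16) | (u32)id2;
}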
static int fec_enet_mii_init(struct platform_device *pdev)
{
static struct mii_bus *fec0_mii_bus;
static bool *fec_mii_bus_share;
struct net_device *ndev = platform_get_drvdata(pdev);
struct fec_enet_private *fep = netdev_priv(ndev);
struct device_node *node;
int err = -ENXIO;
u32 mii_speed, holdtime;
...... some code omitted
fep->mii_bus = mdiobus_alloc();
if (fep->mii_bus == NULL) {
err = -ENOMEM;
goto err_out;
}
fep->mii_bus->name = "fec_enet_mii_bus";
fep->mii_bus->read = fec_enet_mdio_read; //MII read/write functions (they access the PHY chip registers)
fep->mii_bus->write = fec_enet_mdio_write;
snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
pdev->name, fep->dev_id + 1);
fep->mii_bus->priv = fep;
fep->mii_bus->parent = &pdev->dev;
node = of_get_child_by_name(pdev->dev.of_node, "mdio"); //get the mdio node
err = of_mdiobus_register(fep->mii_bus, node); //register the MDIO bus with the kernel
of_node_put(node);
if (err)
goto err_out_free_mdiobus;
mii_cnt++;
/* save fec0 mii_bus */
if (fep->quirks & FEC_QUIRK_SINGLE_MDIO) {
fec0_mii_bus = fep->mii_bus;
fec_mii_bus_share = &fep->mii_bus_share;
}
return 0;
err_out_free_mdiobus:
mdiobus_free(fep->mii_bus);
err_out:
return err;
}
mdiobus_register(mdio); registers the mii_bus with the kernel.
for_each_available_child_of_node(np, child) {} walks all child nodes of the mdio device-tree node, i.e. the phy nodes.
addr = of_mdio_parse_addr(&mdio->dev, child); parses a phy node and returns its reg value, the PHY address.
of_mdiobus_register_phy(mdio, child, addr); registers the PHY device.
//linux-5.4.47\drivers\of\of_mdio.c
int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
{
struct device_node *child;
bool scanphys = false;
int addr, rc;
if (!np)
return mdiobus_register(mdio);
/* Do not continue if the node is disabled */
if (!of_device_is_available(np))
return -ENODEV;
/*Register the MDIO bus */
rc = mdiobus_register(mdio);
if (rc)
return rc;
/* Loop over the child nodes and register a phy_device for each phy */
for_each_available_child_of_node(np, child) {
addr = of_mdio_parse_addr(&mdio->dev, child);
if (addr < 0) {
scanphys = true;
continue;
}
if (of_mdiobus_child_is_phy(child))
rc = of_mdiobus_register_phy(mdio, child, addr);
else
rc = of_mdiobus_register_device(mdio, child, addr);
if (rc == -ENODEV)
dev_err(&mdio->dev,
"MDIO device at address %d is missing.\n",
addr);
else if (rc)
goto unregister;
}
if (!scanphys)
return 0;
/* auto scan for PHYs with empty reg property */
for_each_available_child_of_node(np, child) {
/* Skip PHYs with reg property set */
if (of_find_property(child, "reg", NULL))
continue;
for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
/* skip already registered PHYs */
if (mdiobus_is_registered_device(mdio, addr))
continue;
/* be noisy to encourage people to set reg property */
dev_info(&mdio->dev, "scan phy %pOFn at address %i\n",
child, addr);
if (of_mdiobus_child_is_phy(child)) {
rc = of_mdiobus_register_phy(mdio, child, addr);
if (rc && rc != -ENODEV)
goto unregister;
break;
}
}
}
return 0;
unregister:
mdiobus_unregister(mdio);
return rc;
}
EXPORT_SYMBOL(of_mdiobus_register);
Registering the PHY device:
static int of_mdiobus_register_phy(struct mii_bus *mdio,
struct device_node *child, u32 addr)
{
struct phy_device *phy;
bool is_c45;
int rc;
u32 phy_id;
is_c45 = of_device_is_compatible(child,
"ethernet-phy-ieee802.3-c45");
if (!is_c45 && !of_get_phy_id(child, &phy_id))
phy = phy_device_create(mdio, addr, phy_id, 0, NULL);
else
phy = get_phy_device(mdio, addr, is_c45);
if (IS_ERR(phy))
return PTR_ERR(phy);
rc = of_irq_get(child, 0);
if (rc == -EPROBE_DEFER) {
phy_device_free(phy);
return rc;
}
if (rc > 0) {
phy->irq = rc;
mdio->irq[addr] = rc;
} else {
phy->irq = mdio->irq[addr];
}
if (of_property_read_bool(child, "broken-turn-around"))
mdio->phy_ignore_ta_mask |= 1 << addr;
of_property_read_u32(child, "reset-assert-us",
&phy->mdio.reset_assert_delay);
/* Associate the OF node with the device structure so it
* can be looked up later */
of_node_get(child);
phy->mdio.dev.of_node = child;
phy->mdio.dev.fwnode = of_fwnode_handle(child);
/* All data is now stored in the phy struct;
* register it */
rc = phy_device_register(phy);
if(rc) {
phy_device_free(phy);
of_node_put(child);
return rc;
}
dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n",
child, addr);
return 0;
}
phy = phy_device_create(mdio, addr, phy_id, 0, NULL); creates the PHY device and returns a struct phy_device; the Linux kernel uses struct phy_device to describe a PHY.
rc = phy_device_register(phy); registers the PHY device; during registration, mdio_bus_type->match runs to match the PHY with a PHY driver.
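The match is done by PHY ID: the ID read from the MII_PHYSID1/2 registers is compared against the id/mask pairs each PHY driver declares. A minimal sketch of such a declaration (the ID value and name here are made up for illustration):

static struct phy_driver sketch_phy_driver = {
	.phy_id		= 0x0007c0f0,	/* made-up PHY ID */
	.phy_id_mask	= 0xfffffff0,	/* which ID bits must match */
	.name		= "sketch-phy",
	/* .features, .config_init, .config_aneg, ... would follow in a real driver */
};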
The Linux Ethernet driver exposes net_device_ops to the upper layers so that the network core and user space can control the NIC.
static const struct net_device_ops fec_netdev_ops = {
.ndo_open = fec_enet_open,
.ndo_stop = fec_enet_close,
.ndo_start_xmit = fec_enet_start_xmit,
.ndo_select_queue = fec_enet_select_queue,
.ndo_set_rx_mode = set_multicast_list,
.ndo_validate_addr = eth_validate_addr,
.ndo_tx_timeout = fec_timeout,
.ndo_set_mac_address = fec_set_mac_address,
.ndo_do_ioctl = fec_enet_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = fec_poll_controller,
#endif
.ndo_set_features = fec_set_features,
};
fec_enet_open mainly does the following: enables the clocks and pins, allocates the RX/TX buffers, restarts the MAC (fec_restart), probes and connects the PHY, enables NAPI, starts the PHY, and starts all the TX queues:
static int fec_enet_open(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int ret;
bool reset_again;
ret = pm_runtime_get_sync(&fep->pdev->dev);
if (ret < 0)
return ret;
pinctrl_pm_select_default_state(&fep->pdev->dev);
ret = fec_enet_clk_enable(ndev, true);
if (ret)
goto clk_enable;
/*
fec_enet_clk_enable() calls phy_reset_after_clk_enable(), but at this point the phydev is not attached to the netdev yet,
so that PHY reset will fail.
To make sure the PHY works correctly, we check after the PHY has been probed whether it needs to be reset again.
*/
if (ndev->phydev && ndev->phydev->drv)
reset_again = false;
else
reset_again = true; //once the phydev has been attached to the netdev, the PHY must be reset again
/* I should reset the ring buffers here, but I don't yet know
* a simple way to do that.
*/
ret = fec_enet_alloc_buffers(ndev);
if (ret)
goto err_enet_alloc;
/* Init MAC prior to mii bus probe:
initialize the MAC before the MII bus probe; fec_restart contains part of the MDIO controller setup */
fec_restart(ndev);
/* Probe and connect to PHY when open the interface */
ret = fec_enet_mii_probe(ndev);
if (ret)
goto err_enet_mii_probe;
/* Call phy_reset_after_clk_enable() again if it failed during
* phy_reset_after_clk_enable() before because the PHY wasn't probed.
*/
if (reset_again)
phy_reset_after_clk_enable(ndev->phydev);
if (fep->quirks & FEC_QUIRK_ERR006687)
imx6q_cpuidle_fec_irqs_used();
if (fep->quirks & FEC_QUIRK_HAS_PMQOS)
pm_qos_add_request(&fep->pm_qos_req,
PM_QOS_CPU_DMA_LATENCY,
0);
napi_enable(&fep->napi); //enable NAPI
phy_start(ndev->phydev); //start the PHY
netif_tx_start_all_queues(ndev); //start all the transmit queues
device_set_wakeup_enable(&ndev->dev, fep->wol_flag &
FEC_WOL_FLAG_ENABLE);
return 0;
err_enet_mii_probe:
fec_enet_free_buffers(ndev);
err_enet_alloc:
fec_enet_clk_enable(ndev, false);
clk_enable:
pm_runtime_mark_last_busy(&fep->pdev->dev);
pm_runtime_put_autosuspend(&fep->pdev->dev);
if (!fep->mii_bus_share)
pinctrl_pm_select_sleep_state(&fep->pdev->dev);
return ret;
}
fec_enet_alloc_buffers(ndev); allocates the RX buffers and resets the struct bufdesc entries.
static int fec_enet_alloc_buffers(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
for (i = 0; i < fep->num_rx_queues; i++)
if (fec_enet_alloc_rxq_buffers(ndev, i)) //allocates every bufdesc->cbd_bufaddr under the rxq; those buffers hold the data being transferred
return -ENOMEM;
for (i = 0; i < fep->num_tx_queues; i++) //unlike the rxq, no cbd_bufaddr is allocated here; every txq bufdesc->cbd_bufaddr stays NULL; instead 2048 bytes are allocated and kept in txq->tx_bounce[i]
if (fec_enet_alloc_txq_buffers(ndev, i))
return -ENOMEM;
return 0;
}
fec_enet_alloc_rxq_buffers
Buffers that carry the packet data should use streaming DMA mappings; the NIC driver and the NIC's DMA engine usually communicate through descriptors kept in memory (forming a ring or a chain), and the memory holding those descriptors generally uses a coherent DMA mapping.
The streaming mapping is set up over skb->data, the buffer that will hold the received data; the resulting DMA address is stored in bufdesc->cbd_bufaddr for the device to use, while skb->data itself remains a kernel virtual address for the driver.
static int
fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct sk_buff *skb;
struct bufdesc *bdp;
struct fec_enet_priv_rx_q *rxq;
rxq = fep->rx_queue[queue];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) { //loop over all the bufdesc of one queue
skb = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE); //allocate an skb; FEC_ENET_RX_FRSIZE should be the maximum frame size, 2048
if (!skb)
goto err_alloc;
if (fec_enet_new_rxbdp(ndev, bdp, skb)) { //streaming DMA mapping of skb->data; the DMA address is stored in bufdesc->cbd_bufaddr, tying the bufdesc and the sk_buff together
dev_kfree_skb(skb); //on mapping failure, free the skb
goto err_alloc;
}
rxq->rx_skbuff[i] = skb;
bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY); //mark the buffer as empty
if (fep->bufdesc_ex) {
struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
}
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
}
/* Set the last buffer to wrap. */
bdp = fec_enet_get_prevdesc(bdp, &rxq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP); //mark the last bufdesc of this queue (wrap bit)
return 0;
err_alloc:
fec_enet_free_buffers(ndev);
return -ENOMEM;
}
fec_enet_new_rxbdp
static int
fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff *skb)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int off;
off = ((unsigned long)skb->data) & fep->rx_align; //the manual requires 64-byte alignment
if (off)
skb_reserve(skb, fep->rx_align + 1 - off); //adjust the skb headroom to fix the alignment
/*
dma_map_single: a streaming DMA mapping, normally used on buffers that carry data; it is dynamic and temporary, mapped when needed and unmapped when done.
A streaming mapping must state the transfer direction; DMA_FROM_DEVICE means device to memory, i.e. NIC -> memory, which is always the case for receive.
*/
/*
The streaming mapping covers the memory that holds the transferred data (skb->data).
skb->data is the kernel virtual address used by the driver; bdp->cbd_bufaddr stores the DMA address of the mapping, which the device uses.
This is where the skb is tied to the buffer descriptor.
*/
bdp->cbd_bufaddr = cpu_to_fec32(dma_map_single(&fep->pdev->dev, skb->data, FEC_ENET_RX_FRSIZE - fep->rx_align, DMA_FROM_DEVICE));
if (dma_mapping_error(&fep->pdev->dev, fec32_to_cpu(bdp->cbd_bufaddr))) {
if (net_ratelimit())
netdev_err(ndev, "Rx DMA memory map failed\n");
return -ENOMEM;
}
return 0;
}
fec_enet_alloc_txq_buffers
Unlike the receive side, no buffer memory is mapped for the txq here; I am not sure why, it may have to do with the TSO memory allocated earlier.
txq->tx_bounce is allocated; its exact purpose is not obvious here (it appears to be a bounce buffer for transmit data that is not aligned for DMA).
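Judging from how tx_bounce is used later in the transmit path, it looks like a bounce buffer for unaligned data; a hedged sketch of that pattern (not a copy of the driver code):

static void *sketch_pick_tx_buffer(struct fec_enet_priv_tx_q *txq, int index,
				   struct sk_buff *skb, unsigned long align_mask)
{
	void *bufaddr = skb->data;

	/* if skb->data does not meet the controller's alignment requirement,
	 * copy it into the pre-allocated bounce buffer and DMA-map that instead */
	if ((unsigned long)bufaddr & align_mask) {
		memcpy(txq->tx_bounce[index], skb->data, skb->len);
		bufaddr = txq->tx_bounce[index];
	}
	return bufaddr;
}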
static int
fec_enet_alloc_txq_buffers(struct net_device *ndev, unsigned int queue)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct bufdesc *bdp;
struct fec_enet_priv_tx_q *txq;
txq = fep->tx_queue[queue];
bdp = txq->bd.base;
for (i = 0; i < txq->bd.ring_size; i++) {
txq->tx_bounce[i] = kmalloc(FEC_ENET_TX_FRSIZE, GFP_KERNEL); //allocate the tx_bounce buffer
if (!txq->tx_bounce[i])
goto err_alloc;
bdp->cbd_sc = cpu_to_fec16(0); //clear the bufdesc control and status info
bdp->cbd_bufaddr = cpu_to_fec32(0); //the buffer pointed to by this txq descriptor is empty for now
if (fep->bufdesc_ex) {
struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
ebdp->cbd_esc = cpu_to_fec32(BD_ENET_TX_INT);
}
bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
}
/* Set the last buffer to wrap. */
bdp = fec_enet_get_prevdesc(bdp, &txq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
return 0;
err_alloc:
fec_enet_free_buffers(ndev);
return -ENOMEM;
}
The call into of_phy_connect
of_phy_connect ends up in phy_attach_direct, which binds the phydev to the netdev, and phy_init_hw in turn calls into the PHY driver.
static int fec_enet_mii_probe(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct phy_device *phy_dev = NULL;
char mdio_bus_id[MII_BUS_ID_SIZE];
char phy_name[MII_BUS_ID_SIZE + 3];
int phy_id;
int dev_id = fep->dev_id;
if (fep->phy_node) {
phy_dev = of_phy_connect(ndev, fep->phy_node, //bind the netdev and the PHY
&fec_enet_adjust_link, 0, //fec_enet_adjust_link is the link-state adjustment callback
fep->phy_interface);
if (!phy_dev) {
netdev_err(ndev, "Unable to connect to phy\n");
return -ENODEV;
}
} else {
/* check for attached phy */
for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
if (!mdiobus_is_registered_device(fep->mii_bus, phy_id))
continue;
if (dev_id--)
continue;
strlcpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
break;
}
if (phy_id >= PHY_MAX_ADDR) {
netdev_info(ndev, "no PHY, assuming direct connection to switch\n");
strlcpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
phy_id = 0;
}
snprintf(phy_name, sizeof(phy_name),
PHY_ID_FMT, mdio_bus_id, phy_id);
phy_dev = phy_connect(ndev, phy_name, &fec_enet_adjust_link,
fep->phy_interface);
}
if (IS_ERR(phy_dev)) {
netdev_err(ndev, "could not attach to PHY\n");
return PTR_ERR(phy_dev);
}
/* mask with MAC supported features */
if (fep->quirks & FEC_QUIRK_HAS_GBIT) {
phy_set_max_speed(phy_dev, 1000);
phy_remove_link_mode(phy_dev,
ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
#if !defined(CONFIG_M5272)
phy_support_sym_pause(phy_dev);
#endif
}
else
phy_set_max_speed(phy_dev, 100);
fep->link = 0;
fep->full_duplex = 0;
phy_attached_info(phy_dev);
return 0;
}
void phy_start(struct phy_device *phydev)
{
mutex_lock(&phydev->lock);
if (phydev->state != PHY_READY && phydev->state != PHY_HALTED) {
WARN(1, "called from state %s\n",
phy_state_to_str(phydev->state));
goto out;
}
/* if phy was suspended, bring the physical link up again*/
__phy_resume(phydev); //resume the PHY
phydev->state = PHY_UP; //set the PHY state to UP
phy_start_machine(phydev); //start the PHY state machine, which periodically polls the PHY link state
out:
mutex_unlock(&phydev->lock);
}
fec_enet_close is the reverse of open:
static int
fec_enet_close(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
phy_stop(ndev->phydev); //stop the PHY: change the PHY state and stop the PHY state machine
if (netif_device_present(ndev)) { //check the __LINK_STATE_PRESENT flag; register_netdevice sets it, and without it the netdev is not usable
napi_disable(&fep->napi); //disable NAPI
netif_tx_disable(ndev);
fec_stop(ndev);
}
phy_disconnect(ndev->phydev);
ndev->phydev = NULL;
if (fep->quirks & FEC_QUIRK_ERR006687)
imx6q_cpuidle_fec_irqs_unused();
fec_enet_update_ethtool_stats(ndev);
fec_enet_clk_enable(ndev, false);
if (fep->quirks & FEC_QUIRK_HAS_PMQOS)
pm_qos_remove_request(&fep->pm_qos_req);
if (!fep->mii_bus_share)
pinctrl_pm_select_sleep_state(&fep->pdev->dev);
pm_runtime_mark_last_busy(&fep->pdev->dev);
pm_runtime_put_autosuspend(&fep->pdev->dev);
fec_enet_free_buffers(ndev); //free the txq/rxq memory that held the transferred data
return 0;
}
static inline void netif_tx_disable(struct net_device *dev)
{
unsigned int i;
int cpu;
local_bh_disable();
cpu = smp_processor_id();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
__netif_tx_lock(txq, cpu);
netif_tx_stop_queue(txq);
__netif_tx_unlock(txq);
}
local_bh_enable();
}
static void fec_enet_free_buffers(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct sk_buff *skb;
struct bufdesc *bdp;
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
unsigned int q;
for (q = 0; q < fep->num_rx_queues; q++) {
rxq = fep->rx_queue[q];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) {
skb = rxq->rx_skbuff[i];
rxq->rx_skbuff[i] = NULL;
if (skb) {
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
FEC_ENET_RX_FRSIZE - fep->rx_align,
DMA_FROM_DEVICE);
dev_kfree_skb(skb);
}
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
}
}
for (q = 0; q < fep->num_tx_queues; q++) {
txq = fep->tx_queue[q];
bdp = txq->bd.base;
for (i = 0; i < txq->bd.ring_size; i++) {
kfree(txq->tx_bounce[i]);
txq->tx_bounce[i] = NULL;
skb = txq->tx_skbuff[i];
txq->tx_skbuff[i] = NULL;
dev_kfree_skb(skb);
}
}
}
Linux receives packets with NAPI, which combines interrupts and polling.
After the NIC has moved the data into the ring buffer in memory, it raises a hardware interrupt to tell the CPU that data has arrived. The CPU enters the interrupt handler and dispatches on the event type; for a receive event it schedules NAPI, which adds the driver's napi poll function (every NIC design differs, so every driver has its own receive routine) to the CPU's network softirq per-CPU data and marks the network-receive softirq as pending.
The softirq thread keeps checking whether any softirq bit is set; every bit corresponds to a handler (bound to that bit when the kernel boots). Once it finds a set bit, the softirq thread jumps into that handler, and it is there that the driver's napi poll is called and the real receive path begins.

NAPI call flow
//the NIC driver registers the interrupt and NAPI (usually at init time, in the probe function)
ret = devm_request_irq(&pdev->dev, irq, fec_enet_interrupt,
0, pdev->name, ndev);
netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, NAPI_POLL_WEIGHT);
/*
What the interrupt handler is responsible for:
1. read the interrupt events
2. clear the interrupt flags
3. mask the NAPI interrupts
4. schedule NAPI
*/
static irqreturn_t fec_enet_interrupt(int irq, void *dev_id)
{
struct net_device *ndev = dev_id;
struct fec_enet_private *fep = netdev_priv(ndev);
uint int_events;
irqreturn_t ret = IRQ_NONE;
int_events = readl(fep->hwp + FEC_IEVENT); //read the interrupt events
writel(int_events, fep->hwp + FEC_IEVENT); //clear the interrupt flags
fec_enet_collect_events(fep, int_events); //marks fep->work_rx or fep->work_tx according to the events
if ((fep->work_tx || fep->work_rx) && fep->link) {
ret = IRQ_HANDLED;
if (napi_schedule_prep(&fep->napi)) { //check whether NAPI can be scheduled
/* Disable the NAPI interrupts */
writel(FEC_NAPI_IMASK, fep->hwp + FEC_IMASK); //mask the NAPI interrupts
__napi_schedule(&fep->napi); //start NAPI scheduling
}
}
if (int_events & FEC_ENET_MII) { //an MDIO transfer has completed
ret = IRQ_HANDLED;
complete(&fep->mdio_done);
}
return ret;
}
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n); //this_cpu_ptr(&softnet_data) gets the current CPU's network softirq per-CPU data
local_irq_restore(flags);
}
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list); //add the driver's napi to the list in the CPU's network softirq per-CPU data
__raise_softirq_irqoff(NET_RX_SOFTIRQ); //mark the softirq pending bit: NET_RX_SOFTIRQ
}
//__raise_softirq_irqoff simply ORs a 1 into the corresponding pending bit
void __raise_softirq_irqoff(unsigned int nr)
{
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
Softirqs: on a multi-core CPU every core runs a softirq thread, which handles the work the hardware interrupt did not have time for; the hardware interrupt handler is what marks the softirq. The softirq is handled on the same CPU that handled the hardware interrupt; a single-core CPU has just one softirq thread.
Why NAPI helps: it raises receive efficiency. Early kernel versions received packets purely through interrupts, so under a heavy packet stream the CPU spent a long time stuck in the interrupt handler, other work could not be scheduled, and performance dropped. Pure polling, on the other hand, is inefficient when there are lots of small packets.
At boot the kernel calls smpboot_register_percpu_thread from spawn_ksoftirqd to create one softirq thread per CPU.
//file: kernel/softirq.c
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
static __init int spawn_ksoftirqd(void)
{
register_cpu_notifier(&cpu_nfb);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
}
early_initcall(spawn_ksoftirqd);
smpboot_register_percpu_thread is defined in kernel/smpboot.c.
//kernel/smpboot.c
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
unsigned int cpu;
int ret = 0;
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
goto out;
}
smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
return ret;
}
When the Linux kernel boots it calls net_dev_init, the network subsystem's init function,
which registers the network transmit and receive actions and binds them to their softirq numbers.
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
//file: net/core/dev.c
static int __init net_dev_init(void)
{
......
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
sd->completion_queue = NULL;
INIT_LIST_HEAD(&sd->poll_list);
......
}
......
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
}
subsys_initcall(net_dev_init);
The kernel has more softirq handlers than just the network ones; there are other types as well.
//file: include/linux/interrupt.h
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
BLOCK_IOPOLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
softirq_threads describes a softirq thread. Once the softirq thread is registered it keeps calling ksoftirqd_should_run to check whether a softirq is pending.
When the hard interrupt scheduled NAPI, or_softirq_pending set the softirq bit, so local_softirq_pending can now see that a softirq is pending, and run_ksoftirqd is called.
#define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref))
#define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x)))
#define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x)))
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
Hardware interrupts are disabled before entering the softirq handling (I am not sure why; in Wei Dongshan's tutorials, softirqs can be interrupted by hardware interrupts. It looks like local_irq_disable() here only protects the check of the pending flags, since __do_softirq() re-enables interrupts while the actions run).
static void run_ksoftirqd(unsigned int cpu)
{
local_irq_disable(); //disable hardware interrupts
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
local_irq_enable();
cond_resched();
return;
}
local_irq_enable();
}
__do_softirq
h->action(h); runs the registered action; in our case that is net_rx_action, the network receive softirq handler.
asmlinkage void __do_softirq(void)
{
do {
if (pending & 1) {
unsigned int vec_nr = h - softirq_vec;
int prev_count = preempt_count();
...
trace_softirq_entry(vec_nr);
h->action(h);
trace_softirq_exit(vec_nr);
...
}
h++;
pending >>= 1;
} while (pending);
}
The time_limit and budget at the top of the function make net_rx_action exit voluntarily, so that packet reception does not hog the CPU; the remaining packets are handled the next time the NIC raises a hard interrupt. budget can be tuned through a kernel parameter. The rest of the core logic fetches the per-CPU variable softnet_data, walks its poll_list, and calls the poll function the NIC driver registered. On the i.MX6ULL that poll function is fec_enet_rx_napi.
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&sd->poll_list)) {
......
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
trace_napi_poll(n);
}
budget -= work;
}
}
static int fec_enet_rx_napi(struct napi_struct *napi, int budget)
{
struct net_device *ndev = napi->dev;
struct fec_enet_private *fep = netdev_priv(ndev);
int pkts; //number of packets
pkts = fec_enet_rx(ndev, budget); //budget makes the function bail out voluntarily so that packet reception does not monopolise the CPU
fec_enet_tx(ndev);
if (pkts < budget) {
napi_complete_done(napi, pkts);
writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
}
return pkts;
}
The call chain then continues as
fec_enet_rx -> fec_enet_rx_queue
fec_enet_rx_queue is fairly involved; I will look at it later.
To sum up, fec_probe mainly: reads the TX/RX queue counts from the device tree, allocates the netdev together with its private data, maps the MAC registers, parses the device tree (stop mode, magic packet, PHY node, PHY mode), gets the clocks and the PHY regulator, resets the PHY, allocates the queues and descriptor rings and sets netdev_ops/ethtool_ops/NAPI in fec_enet_init, requests the interrupts, initializes the MDIO bus, and finally registers the netdev.