enum netdev_state_t {
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_NOCARRIER,
__LINK_STATE_LINKWATCH_PENDING,
__LINK_STATE_DORMANT,
};
An Ethernet frame consists of the following fields:
• Seven-byte preamble
• Start frame delimiter (SFD)
• Two address fields
• Length or type field
• Data field
• Frame check sequence (FCS, a CRC value)
The preamble and SFD are not counted as part of the actual packet; they are sent on the wire with every frame, but tcpdump or wireshark cannot capture them. The same goes for the FCS.
A pad of 0 to 46 bytes is appended when the payload is too short, to bring the frame up to the minimum length.
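As a quick check on the padding rule (a minimal sketch, not driver code; the function name is made up): the 802.3 minimum frame is 64 bytes including the 4-byte FCS, so with a 14-byte header the payload must be at least 46 bytes and anything shorter is padded:

static inline unsigned int eth_pad_len(unsigned int payload_len)
{
	/* 46 = 64-byte minimum frame - 14-byte header - 4-byte FCS */
	return payload_len >= 46 ? 0 : 46 - payload_len;
}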


VLAN frame

Pause frame: the receiving device generates a pause frame to signal congestion to the sender, which should then stop transmitting.
Pause frame format: the type field is 0x8808, followed by a 2-byte opcode, which should be 0x0001.
The pad of a pause frame is always 42 bytes.
P1 and P2 are the pause values, which decide whether transmission is paused or allowed to resume.
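Putting that together, the on-wire layout of a pause frame looks roughly like this (an illustrative sketch; the struct and field names are not from the driver). The destination is the reserved multicast address 01:80:C2:00:00:01, and since header + opcode + pause time add up to 18 bytes, 42 bytes of padding bring the frame to the 60-byte minimum (FCS excluded):

struct pause_frame {
	unsigned char  dest[6];      /* 01:80:C2:00:00:01, MAC control multicast address */
	unsigned char  src[6];       /* sender's MAC address */
	unsigned short type;         /* 0x8808: MAC control frame */
	unsigned short opcode;       /* 0x0001: PAUSE */
	unsigned short pause_time;   /* pause quanta (the P1/P2 values above) */
	unsigned char  pad[42];      /* always 42 bytes */
} __attribute__((packed));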

The MAC receive engine performs the following tasks:
• Check frame framing
• Remove frame preamble and frame SFD field
• Discard frame based on frame destination address field
• Terminate pause frames
• Check frame length
• Remove payload padding if it exists
• Calculate and verify CRC-32
• Write received frames in the core receive FIFO

The i.MX6ULL uses the table below to describe a buffer.
The data buffer pointer holds the address of the buffer; the buffer must be located in external memory.


The enhanced buffer descriptors correspond to struct bufdesc in the driver.
struct bufdesc {
	__fec16 cbd_sc;      /* Control and status info */
	__fec16 cbd_datlen;  /* Data length */
	__fec32 cbd_bufaddr; /* Buffer address */
};

When a packet is received, the NIC DMAs it into a ring buffer in memory and raises an interrupt to tell the driver that data has arrived; likewise, on transmit the driver writes the packet into the ring buffer and tells the NIC to copy it out. So both receive and transmit stage packets temporarily in a ring buffer. How, then, are packets laid out in the ring buffer?
The ring buffer is just a queue; a queue is first-in first-out, so the NIC and the driver form a producer/consumer pair.
On the i.MX6ULL, entries in the queue take the form of struct bufdesc, and the queue size is computed from that structure: one queue holds a certain number of struct bufdesc entries.
struct sk_buff is the kernel's structure for describing a packet; the i.MX6ULL driver presumably converts an sk_buff into a bufdesc.
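As a rough illustration of that conversion (a simplified sketch under the assumption that it mirrors what the driver's transmit path does; it is not a copy of fec_enet_txq_submit_skb), handing an sk_buff to the hardware boils down to DMA-mapping skb->data and recording the address, length and a ready flag in the descriptor:

static int sketch_fill_tx_bd(struct device *dev, struct bufdesc *bdp,
			     struct sk_buff *skb)
{
	dma_addr_t addr;

	/* streaming DMA mapping of the packet data, CPU -> device direction */
	addr = dma_map_single(dev, skb->data, skb->len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, addr))
		return -ENOMEM;

	bdp->cbd_bufaddr = cpu_to_fec32(addr);     /* where the MAC reads from */
	bdp->cbd_datlen  = cpu_to_fec16(skb->len); /* how many bytes to send */
	/* "ready" hands the descriptor over to the MAC; "last" marks the final fragment */
	bdp->cbd_sc = cpu_to_fec16(BD_ENET_TX_READY | BD_ENET_TX_LAST);
	return 0;
}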

static int fec_probe(struct platform_device *pdev)
{
struct fec_enet_private *fep;
struct fec_platform_data *pdata;
struct net_device *ndev;
int i, irq, ret = 0;
const struct of_device_id *of_id;
static int dev_id;
struct device_node *np = pdev->dev.of_node, *phy_node;
int num_tx_qs;
int num_rx_qs;
char irq_name[8];
int irq_cnt;
fec_enet_get_queue_num(pdev, &num_tx_qs, &num_rx_qs); //get the number of TX and RX queues from the device tree
/* Init network device */
ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private) + //allocate the netdev
FEC_STATS_SIZE, num_tx_qs, num_rx_qs);
if (!ndev)
return -ENOMEM;
SET_NETDEV_DEV(ndev, &pdev->dev); //set netdev->dev.parent = &pdev->dev
/* setup board info structure */
fep = netdev_priv(ndev); //private data = (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
of_id = of_match_device(fec_dt_ids, &pdev->dev); //platform-specific match data
if (of_id)
pdev->id_entry = of_id->data;
fep->quirks = pdev->id_entry->driver_data;
fep->netdev = ndev;
fep->num_rx_queues = num_rx_qs;
fep->num_tx_queues = num_tx_qs;
#if !defined(CONFIG_M5272)
/* default enable pause frame auto negotiation */
if (fep->quirks & FEC_QUIRK_HAS_GBIT)
fep->pause_flag |= FEC_PAUSE_FLAG_AUTONEG;
#endif
/* Select default pin state */
pinctrl_pm_select_default_state(&pdev->dev);
fep->hwp = devm_platform_ioremap_resource(pdev, 0); //get and ioremap the resource: the MAC register base address
if (IS_ERR(fep->hwp)) {
ret = PTR_ERR(fep->hwp);
goto failed_ioremap;
}
fep->pdev = pdev;
fep->dev_id = dev_id++;
platform_set_drvdata(pdev, ndev);
if ((of_machine_is_compatible("fsl,imx6q") ||
of_machine_is_compatible("fsl,imx6dl")) &&
!of_property_read_bool(np, "fsl,err006687-workaround-present"))
fep->quirks |= FEC_QUIRK_ERR006687;
fec_enet_of_parse_stop_mode(pdev); //parse the stop mode from the device tree: stop-mode = <&gpr 0x10 4>;
ret = fec_enet_ipc_handle_init(fep);
if (ret)
goto failed_ipc_init;
if (of_get_property(np, "fsl,magic-packet", NULL)) //魔术包
fep->wol_flag |= FEC_WOL_HAS_MAGIC_PACKET;
if (of_get_property(np, "fsl,rgmii_txc_dly", NULL))
fep->rgmii_txc_dly = true;
if (of_get_property(np, "fsl,rgmii_rxc_dly", NULL))
fep->rgmii_rxc_dly = true;
phy_node = of_parse_phandle(np, "phy-handle", 0); //parse the phy node from the device tree
if (!phy_node && of_phy_is_fixed_link(np)) {
ret = of_phy_register_fixed_link(np);
if (ret < 0) {
dev_err(&pdev->dev,
"broken fixed-link specification\n");
goto failed_phy;
}
phy_node = of_node_get(np);
}
fep->phy_node = phy_node;
ret = of_get_phy_mode(pdev->dev.of_node); //get the PHY interface mode (sgmii, rgmii, ...); returns an enum value
if (ret < 0) {
pdata = dev_get_platdata(&pdev->dev);
if (pdata)
fep->phy_interface = pdata->phy;
else
fep->phy_interface = PHY_INTERFACE_MODE_MII;
} else {
fep->phy_interface = ret;
}
request_bus_freq(BUS_FREQ_HIGH);
//a block of clock-related code is omitted here
fep->clk_ipg = devm_clk_get(&pdev->dev, "ipg");
if (IS_ERR(fep->clk_ipg)) {
ret = PTR_ERR(fep->clk_ipg);
goto failed_clk;
}
......
fep->reg_phy = devm_regulator_get_optional(&pdev->dev, "phy");
if (!IS_ERR(fep->reg_phy)) {
ret = regulator_enable(fep->reg_phy);
if (ret) {
dev_err(&pdev->dev,
"Failed to enable phy regulator: %d\n", ret);
goto failed_regulator;
}
} else {
if (PTR_ERR(fep->reg_phy) == -EPROBE_DEFER) {
ret = -EPROBE_DEFER;
goto failed_regulator;
}
fep->reg_phy = NULL;
}
......
ret = fec_reset_phy(pdev); //reset the PHY via a GPIO (hardware reset)
if (ret)
goto failed_reset;
irq_cnt = fec_enet_get_irq_cnt(pdev); //get the number of IRQs
if (fep->bufdesc_ex)
fec_ptp_init(pdev, irq_cnt);
ret = fec_enet_init(ndev); //DMA queue memory allocation, netdev_ops, ethtool_ops, NAPI poll setup, MAC reset and init
if (ret)
goto failed_init;
for (i = 0; i < irq_cnt; i++) { //get the IRQs
snprintf(irq_name, sizeof(irq_name), "int%d", i);
irq = platform_get_irq_byname_optional(pdev, irq_name); //look the IRQ up on the platform bus by name
if (irq < 0)
irq = platform_get_irq(pdev, i); //fall back to getting the IRQ from the device tree by index
if (irq < 0) {
ret = irq;
goto failed_irq;
}
ret = devm_request_irq(&pdev->dev, irq, fec_enet_interrupt,
0, pdev->name, ndev); //request the interrupt, handler fec_enet_interrupt
if (ret) //a new packet arriving, a completed transmit, etc. all raise this interrupt; the handler dispatches on the event type
goto failed_irq;
fep->irq[i] = irq;
}
/*
wake-up interrupt 0
interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>;
*/
/* get wake up irq */
ret = of_property_read_u32(np, "fsl,wakeup_irq", &irq);
if (!ret && irq < irq_cnt)
fep->wake_irq = fep->irq[irq];
else
fep->wake_irq = fep->irq[0];
init_completion(&fep->mdio_done);
/* board only enable one mii bus in default */
if (!of_get_property(np, "fsl,mii-exclusive", NULL))
fep->quirks |= FEC_QUIRK_SINGLE_MDIO;
//MDIO initialization
ret = fec_enet_mii_init(pdev);
if (ret)
goto failed_mii_init;
/* Carrier starts down, phylib will bring it up */
netif_carrier_off(ndev); //tell the kernel the carrier is down for now; phylib will bring the link up later
fec_enet_clk_enable(ndev, false);
pinctrl_pm_select_sleep_state(&pdev->dev);
ret = register_netdev(ndev); //register the netdev
if (ret)
goto failed_register;
device_init_wakeup(&ndev->dev, fep->wol_flag &
FEC_WOL_HAS_MAGIC_PACKET);
if (fep->bufdesc_ex && fep->ptp_clock)
netdev_info(ndev, "registered PHC device %d\n", fep->dev_id);
fep->rx_copybreak = COPYBREAK_DEFAULT;
INIT_WORK(&fep->tx_timeout_work, fec_enet_timeout_work);
pm_runtime_mark_last_busy(&pdev->dev);
pm_runtime_put_autosuspend(&pdev->dev);
return 0;
failed_register:
fec_enet_mii_remove(fep);
failed_mii_init:
failed_irq:
failed_init:
fec_ptp_stop(pdev);
if (fep->reg_phy)
regulator_disable(fep->reg_phy);
failed_reset:
pm_runtime_put_noidle(&pdev->dev);
pm_runtime_disable(&pdev->dev);
failed_regulator:
clk_disable_unprepare(fep->clk_ahb);
failed_clk_ahb:
clk_disable_unprepare(fep->clk_ipg);
failed_clk_ipg:
fec_enet_clk_enable(ndev, false);
failed_clk:
release_bus_freq(BUS_FREQ_HIGH);
if (of_phy_is_fixed_link(np))
of_phy_deregister_fixed_link(np);
of_node_put(phy_node);
failed_ipc_init:
failed_phy:
dev_id--;
failed_ioremap:
free_netdev(ndev);
return ret;
}
What happens here is the allocation of the netdev, some generic Ethernet setup in ether_setup(), the allocation of the private data area, and the allocation of the netdev TX/RX queue structures (the later fec_enet_init() then does the DMA allocation for the txq/rxq descriptors; that is the real ring buffer. What the queues allocated in alloc_etherdev_mqs() are for is not yet clear to me).
The netdev and the private area are allocated in one chunk of memory, with 32-byte alignment applied both before and after the netdev.
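The private area sits directly after the 32-byte-aligned net_device, which is why netdev_priv() is nothing but pointer arithmetic (this is the stock helper from include/linux/netdevice.h):

static inline void *netdev_priv(const struct net_device *dev)
{
	return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}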

//linux-5.4.47\net\ethernet\eth.c
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
unsigned int rxqs)
{
return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
ether_setup, txqs, rxqs); //ether_setup: sets a few generic netdev defaults
}
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
unsigned int alloc_size;
struct net_device *p;
BUG_ON(strlen(name) >= sizeof(dev->name)); //if the requested name is longer than the netdev->name buffer, BUG_ON fires
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
alloc_size = sizeof(struct net_device); //start with the size of the netdev itself
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); //round the netdev size up to 32 bytes
alloc_size += sizeof_priv; //then add the private data size
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1; //alloc_size + 32 - 1
p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!p)
return NULL;
dev = PTR_ALIGN(p, NETDEV_ALIGN); //align p up to 32 bytes and use that as the netdev address
dev->padded = (char *)dev - (char *)p; //padding between the start of the allocation and the netdev
dev->pcpu_refcnt = alloc_percpu(int); //per-cpu reference count of the netdev
if (!dev->pcpu_refcnt)
goto free_dev;
if (dev_addr_init(dev)) //initialize the MAC address list
goto free_pcpu;
dev_mc_init(dev); //initialize the multicast MAC address list
dev_uc_init(dev); //initialize the unicast MAC address list
dev_net_set(dev, &init_net);
netdev_register_lockdep_key(dev);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->upper_level = 1;
dev->lower_level = 1;
INIT_LIST_HEAD(&dev->napi_list); //initialize the list heads
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev)) //allocate the TX queues
goto free_all;
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev)) //allocate the RX queues
goto free_all;
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops; //set the default ethtool ops
nf_hook_ingress_init(dev);
return dev;
free_all:
free_netdev(dev);
return NULL;
free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
netdev_freemem(dev);
return NULL;
}
fec_enet_init() mainly allocates the DMA memory for the descriptor queues and sets up netdev_ops, ethtool_ops and the NAPI poll function.
struct bufdesc is the buffer descriptor; it describes a buffer that holds data. The memory for these struct bufdesc entries has to be allocated with a coherent DMA mapping.
The later open path allocates bufdesc->cbd_bufaddr; that pointer is what actually refers to the memory holding the data being transferred, and it uses a streaming DMA mapping.
static int fec_enet_init(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct bufdesc *cbd_base; //struct bufdesc: describes one buffer
dma_addr_t bd_dma;
int bd_size;
unsigned int i;
unsigned dsize = fep->bufdesc_ex ? sizeof(struct bufdesc_ex) :
sizeof(struct bufdesc); //size of one buffer descriptor
unsigned dsize_log2 = __fls(dsize);
int ret;
......
/* Check mask of the streaming and coherent API */
//DMA mask setup: the DMA addressing capability (e.g. 24 bits can address 16MB, 32 bits 4GB); tells the kernel how many address bits the DMA supports, usually 32
ret = dma_set_mask_and_coherent(&fep->pdev->dev, DMA_BIT_MASK(32));
if (ret < 0) {
dev_warn(&fep->pdev->dev, "No suitable DMA available\n");
return ret;
}
/*
TSO (TCP Segmentation Offload): the NIC segments large packets itself, lowering the CPU load.
GSO (Generic Segmentation Offload): before sending, check whether the NIC supports TSO; if it does, let the NIC segment,
otherwise the protocol stack segments the data and then hands it to the NIC to transmit.
*/
ret = fec_enet_alloc_queue(ndev); //set the TX and RX ring buffer sizes and allocate the TX TSO header memory
if (ret)
return ret;
//packets live in the ring buffer as struct bufdesc entries; total_tx_ring_size is the total number of struct bufdesc across the TX rings
bd_size = (fep->total_tx_ring_size + fep->total_rx_ring_size) * dsize;
/* Allocate memory for buffer descriptors. */
//allocate the txq/rxq buffer descriptors (bufdesc); these descriptors need a coherent DMA mapping
cbd_base = dmam_alloc_coherent(&fep->pdev->dev, bd_size, &bd_dma, //DMA allocation of the TX/RX descriptor memory
GFP_KERNEL);
if (!cbd_base) {
ret = -ENOMEM;
goto free_queue_mem;
}
/* Get the Ethernet address */
ret = fec_get_mac(ndev); //get the MAC address
if (ret)
goto free_queue_mem;
/* make sure MAC we just acquired is programmed into the hw */
fec_set_mac_address(ndev, NULL);
/* Set receive and transmit descriptor base. */
for (i = 0; i < fep->num_rx_queues; i++) {
struct fec_enet_priv_rx_q *rxq = fep->rx_queue[i];
unsigned size = dsize * rxq->bd.ring_size;
rxq->bd.qid = i; //queue id
rxq->bd.base = cbd_base; //start of the buffer descriptors: the kernel virtual address returned by the DMA allocation (used by the CPU)
rxq->bd.cur = cbd_base;
rxq->bd.dma = bd_dma; //the DMA address of the same allocation (used by the I/O device)
rxq->bd.dsize = dsize; //size of one buffer descriptor
rxq->bd.dsize_log2 = dsize_log2;
/* definition of offset_des_active_rxq: the "descriptor active" register offsets for the three queues
static const unsigned short offset_des_active_rxq[] = {
FEC_R_DES_ACTIVE_0, FEC_R_DES_ACTIVE_1, FEC_R_DES_ACTIVE_2
};
static const unsigned short offset_des_active_txq[] = {
FEC_X_DES_ACTIVE_0, FEC_X_DES_ACTIVE_1, FEC_X_DES_ACTIVE_2
};
*/
rxq->bd.reg_desc_active = fep->hwp + offset_des_active_rxq[i]; //register used to activate the queue
bd_dma += size;
cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
rxq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize); //last bufdesc of this receive queue
}
for (i = 0; i < fep->num_tx_queues; i++) {
struct fec_enet_priv_tx_q *txq = fep->tx_queue[i];
unsigned size = dsize * txq->bd.ring_size;
txq->bd.qid = i;
txq->bd.base = cbd_base; //after the cbd_base + size advance above, cbd_base now points at the first descriptor of this txq
txq->bd.cur = cbd_base;
txq->bd.dma = bd_dma;
txq->bd.dsize = dsize;
txq->bd.dsize_log2 = dsize_log2;
txq->bd.reg_desc_active = fep->hwp + offset_des_active_txq[i];
bd_dma += size;
cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
txq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize);
}
/* The FEC Ethernet specific entries in the device structure */
ndev->watchdog_timeo = TX_TIMEOUT; //transmit timeout
ndev->netdev_ops = &fec_netdev_ops; //netdev_ops
ndev->ethtool_ops = &fec_enet_ethtool_ops; //ethtool_ops
writel(FEC_RX_DISABLED_IMASK, fep->hwp + FEC_IMASK); //RX interrupt mask setup; the interrupt mask allows or masks interrupts
/* netif_napi_add
registers the NAPI poll callback: fec_enet_rx_napi
NAPI_POLL_WEIGHT: the NAPI weight; one NAPI poll may receive at most NAPI_POLL_WEIGHT packets, then it must return and leave
the rest for the next round, which keeps the driver's poll function from monopolising the softirq thread
*/
netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, NAPI_POLL_WEIGHT);
//set some NIC feature flags
if (fep->quirks & FEC_QUIRK_HAS_VLAN)
/* enable hw VLAN support */
ndev->features |= NETIF_F_HW_VLAN_CTAG_RX;
if (fep->quirks & FEC_QUIRK_HAS_CSUM) {
ndev->gso_max_segs = FEC_MAX_TSO_SEGS; //GSO segmentation limit
/* enable hw accelerator */
ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM
| NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_TSO);
fep->csum_flags |= FLAG_RX_CSUM_ENABLED;
}
if (fep->quirks & FEC_QUIRK_HAS_AVB) {
fep->tx_align = 0;
fep->rx_align = 0x3f;
}
ndev->hw_features = ndev->features;
fec_restart(ndev); //resets/configures some MAC registers and initializes the struct bufdesc entries in the txq and rxq
if (fep->quirks & FEC_QUIRK_MIB_CLEAR)
fec_enet_clear_ethtool_stats(ndev);
else
fec_enet_update_ethtool_stats(ndev);
return 0;
free_queue_mem:
fec_enet_free_queue(ndev);
return ret;
}
Memory allocated for DMA here must be physically contiguous, and it needs a coherent DMA allocation.
This function fixes the ring buffer size, i.e. how many struct bufdesc entries one ring can hold.
static int fec_enet_alloc_queue(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int i;
int ret = 0;
struct fec_enet_priv_tx_q *txq;
/*
vmalloc is generally for large, page-sized allocations; the virtual addresses are contiguous but the physical addresses may not be, and the size is not limited.
kmalloc allocates from the physically mapped region, so the memory is also physically contiguous; the size cannot exceed 128KB.
kzalloc is just kmalloc with the __GFP_ZERO flag added, so the memory is zeroed.
In general, memory only has to be physically contiguous when DMA will access it, but for performance the kernel mostly uses kmalloc(),
and vmalloc() only when a large block is needed. For example, when a module is loaded dynamically it is placed in memory allocated by vmalloc().
*/
for (i = 0; i < fep->num_tx_queues; i++) {
txq = kzalloc(sizeof(*txq), GFP_KERNEL);
if (!txq) {
ret = -ENOMEM;
goto alloc_failed;
}
fep->tx_queue[i] = txq; //describes one transmit queue
txq->bd.ring_size = TX_RING_SIZE; //size of one ring buffer
fep->total_tx_ring_size += fep->tx_queue[i]->bd.ring_size; //sum of all the ring buffer sizes
txq->tx_stop_threshold = FEC_MAX_SKB_DESCS;
txq->tx_wake_threshold =
(txq->bd.ring_size - txq->tx_stop_threshold) / 2;
txq->tso_hdrs = dma_alloc_coherent(&fep->pdev->dev, //DMA allocation for the TX TSO header buffer
txq->bd.ring_size * TSO_HEADER_SIZE,
&txq->tso_hdrs_dma,
GFP_KERNEL);
if (!txq->tso_hdrs) {
ret = -ENOMEM;
goto alloc_failed;
}
}
for (i = 0; i < fep->num_rx_queues; i++) {
fep->rx_queue[i] = kzalloc(sizeof(*fep->rx_queue[i]),
GFP_KERNEL);
if (!fep->rx_queue[i]) {
ret = -ENOMEM;
goto alloc_failed;
}
fep->rx_queue[i]->bd.ring_size = RX_RING_SIZE;
fep->total_rx_ring_size += fep->rx_queue[i]->bd.ring_size;
}
return ret;
alloc_failed:
fec_enet_free_queue(ndev);
return ret;
}
static void fec_restart(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
u32 val;
u32 temp_mac[2];
u32 rcntl = OPT_FRAME_SIZE | 0x04;
u32 ecntl = FEC_ENET_ETHEREN; /* ETHEREN */ //Ethernet enable
/* Whack a reset. We should wait for this.
* For i.MX6SX SOC, enet use AXI bus, we use disable MAC
* instead of reset MAC itself.
*/
if (fep->quirks & FEC_QUIRK_HAS_AVB) {
writel(0, fep->hwp + FEC_ECNTRL);
} else {
writel(1, fep->hwp + FEC_ECNTRL); //reset the MAC
udelay(10);
}
/*
* enet-mac reset will reset mac address registers too,
* so need to reconfigure it.
*/
memcpy(&temp_mac, ndev->dev_addr, ETH_ALEN); //temporary copy of the MAC address
writel((__force u32)cpu_to_be32(temp_mac[0]),
fep->hwp + FEC_ADDR_LOW);
writel((__force u32)cpu_to_be32(temp_mac[1]),
fep->hwp + FEC_ADDR_HIGH);
/* Clear any outstanding interrupt. */
writel(0xffffffff, fep->hwp + FEC_IEVENT); //interrupt event register, write 1 to clear
fec_enet_bd_init(ndev); //initialize the buffer descriptors
fec_enet_enable_ring(ndev); //enable the queues: write the queue addresses into the registers
/* Reset tx SKB buffers. */
fec_enet_reset_skb(ndev); //free the txq skbs
/* Enable MII mode */ //MDIO control related setup
if (fep->full_duplex == DUPLEX_FULL) {
/* FD enable */
writel(0x04, fep->hwp + FEC_X_CNTRL);
} else {
/* No Rcv on Xmit */
rcntl |= 0x02;
writel(0x0, fep->hwp + FEC_X_CNTRL);
}
/* Set MII speed */
writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED);
#if !defined(CONFIG_M5272)
if (fep->quirks & FEC_QUIRK_HAS_RACC) {
val = readl(fep->hwp + FEC_RACC);
/* align IP header */
val |= FEC_RACC_SHIFT16;
if (fep->csum_flags & FLAG_RX_CSUM_ENABLED)
/* set RX checksum */
val |= FEC_RACC_OPTIONS;
else
val &= ~FEC_RACC_OPTIONS;
writel(val, fep->hwp + FEC_RACC);
writel(PKT_MAXBUF_SIZE, fep->hwp + FEC_FTRL);
}
#endif
/*
* The phy interface and speed need to get configured
* differently on enet-mac: depending on the MAC controller, the PHY interface and speed have to be set up differently.
*/
if (fep->quirks & FEC_QUIRK_ENET_MAC) {
/* Enable flow control and length check */
rcntl |= 0x40000000 | 0x00000020;
/* RGMII, RMII or MII */
if (fep->phy_interface == PHY_INTERFACE_MODE_RGMII ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID)
rcntl |= (1 << 6);
else if (fep->phy_interface == PHY_INTERFACE_MODE_RMII)
rcntl |= (1 << 8);
else
rcntl &= ~(1 << 8);
/* 1G, 100M or 10M */
if (ndev->phydev) {
if (ndev->phydev->speed == SPEED_1000)
ecntl |= (1 << 5);
else if (ndev->phydev->speed == SPEED_100)
rcntl &= ~(1 << 9);
else
rcntl |= (1 << 9);
}
} else {
#ifdef FEC_MIIGSK_ENR
if (fep->quirks & FEC_QUIRK_USE_GASKET) {
u32 cfgr;
/* disable the gasket and wait */
writel(0, fep->hwp + FEC_MIIGSK_ENR);
while (readl(fep->hwp + FEC_MIIGSK_ENR) & 4)
udelay(1);
/*
* configure the gasket:
* RMII, 50 MHz, no loopback, no echo
* MII, 25 MHz, no loopback, no echo
*/
cfgr = (fep->phy_interface == PHY_INTERFACE_MODE_RMII)
? BM_MIIGSK_CFGR_RMII : BM_MIIGSK_CFGR_MII;
if (ndev->phydev && ndev->phydev->speed == SPEED_10)
cfgr |= BM_MIIGSK_CFGR_FRCONT_10M;
writel(cfgr, fep->hwp + FEC_MIIGSK_CFGR);
/* re-enable the gasket */
writel(2, fep->hwp + FEC_MIIGSK_ENR);
}
#endif
}
#if !defined(CONFIG_M5272)
/* enable pause frame
pause frames: when the ring buffer is full and the CPU cannot keep up, ifconfig shows rx overruns; the ring buffer has overflowed, so a pause frame is sent to tell the peer to stop sending to this device.
enable pause frame detection: when a pause frame is received, stop sending data to the peer.
*/
if ((fep->pause_flag & FEC_PAUSE_FLAG_ENABLE) ||
((fep->pause_flag & FEC_PAUSE_FLAG_AUTONEG) &&
ndev->phydev && ndev->phydev->pause)) {
rcntl |= FEC_ENET_FCE; //enable pause frame (flow control) handling on receive
/* set FIFO threshold parameter to reduce overrun */
writel(FEC_ENET_RSEM_V, fep->hwp + FEC_R_FIFO_RSEM);
writel(FEC_ENET_RSFL_V, fep->hwp + FEC_R_FIFO_RSFL);
writel(FEC_ENET_RAEM_V, fep->hwp + FEC_R_FIFO_RAEM);
writel(FEC_ENET_RAFL_V, fep->hwp + FEC_R_FIFO_RAFL);
/* OPD */
writel(FEC_ENET_OPD_V, fep->hwp + FEC_OPD);
} else {
rcntl &= ~FEC_ENET_FCE;
}
#endif /* !defined(CONFIG_M5272) */
writel(rcntl, fep->hwp + FEC_R_CNTRL);
/* Setup multicast filter. */
set_multicast_list(ndev);
#ifndef CONFIG_M5272
writel(0, fep->hwp + FEC_HASH_TABLE_HIGH);
writel(0, fep->hwp + FEC_HASH_TABLE_LOW);
#endif
if (fep->quirks & FEC_QUIRK_ENET_MAC) {
/* enable ENET endian swap */
ecntl |= (1 << 8);
/* enable ENET store and forward mode */
writel(1 << 8, fep->hwp + FEC_X_WMRK);
}
if (fep->bufdesc_ex)
ecntl |= (1 << 4);
if (fep->quirks & FEC_QUIRK_DELAYED_CLKS_SUPPORT &&
fep->rgmii_txc_dly)
ecntl |= FEC_ENET_TXC_DLY;
if (fep->quirks & FEC_QUIRK_DELAYED_CLKS_SUPPORT &&
fep->rgmii_rxc_dly)
ecntl |= FEC_ENET_RXC_DLY;
#ifndef CONFIG_M5272
/* Enable the MIB statistic event counters */
writel(0 << 31, fep->hwp + FEC_MIB_CTRLSTAT);
#endif
/* And last, enable the transmit and receive processing */
writel(ecntl, fep->hwp + FEC_ECNTRL);
fec_enet_active_rxring(ndev); //activate the receive queues
if (fep->bufdesc_ex)
fec_ptp_start_cyclecounter(ndev);
/* Enable interrupts we wish to service */
if (fep->link)
writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
else
writel(FEC_ENET_MII, fep->hwp + FEC_IMASK);
/* Init the interrupt coalescing */
fec_enet_itr_coal_init(ndev);
}
Initialization of the buffer descriptors in the queues:
static void fec_enet_bd_init(struct net_device *dev)
{
struct fec_enet_private *fep = netdev_priv(dev);
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
struct bufdesc *bdp;
unsigned int i;
unsigned int q;
for (q = 0; q < fep->num_rx_queues; q++) { //loop over all the queues
/* Initialize the receive buffer descriptors. */
rxq = fep->rx_queue[q];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) { //initialize all the buffer descriptors of one queue
/* Initialize the BD for every fragment in the page. */
/*
bdp->cbd_bufaddr points at the buffer that holds the data; at this point it has not been allocated yet, so it should be NULL and the else branch is taken.
fec_restart is also called from the open path; by then bdp->cbd_bufaddr has been allocated and the if branch is taken.
*/
if (bdp->cbd_bufaddr)
bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY); //mark the buffer as empty (no data yet)
else
bdp->cbd_sc = cpu_to_fec16(0); //clear the descriptor control and status info
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd); //move on to the next struct bufdesc
}
/* Set the last buffer to wrap */
bdp = fec_enet_get_prevdesc(bdp, &rxq->bd); //the last buffer descriptor of this queue
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP); //set the WRAP bit so the ring wraps back to its base descriptor
rxq->bd.cur = rxq->bd.base;
}
for (q = 0; q < fep->num_tx_queues; q++) {
/* ...and the same for transmit */
txq = fep->tx_queue[q];
bdp = txq->bd.base;
txq->bd.cur = bdp;
for (i = 0; i < txq->bd.ring_size; i++) {
/* Initialize the BD for every fragment in the page. */
bdp->cbd_sc = cpu_to_fec16(0); //clear the descriptor control and status info
if (bdp->cbd_bufaddr &&
!IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr))) //if cbd_bufaddr is set and is NOT inside the TSO header area, unmap it
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
fec16_to_cpu(bdp->cbd_datlen),
DMA_TO_DEVICE);
if (txq->tx_skbuff[i]) {
dev_kfree_skb_any(txq->tx_skbuff[i]); //free txq->tx_skbuff[i]
txq->tx_skbuff[i] = NULL;
}
bdp->cbd_bufaddr = cpu_to_fec32(0);
bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
}
/* Set the last buffer to wrap */
bdp = fec_enet_get_prevdesc(bdp, &txq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
txq->dirty_tx = bdp;
}
}
fec_enet_enable_ring enables the queues: it writes the base address of each queue into the ENETx_RDSR register.
The i.MX series supports up to three queues; the i.MX6ULL supports only one.

static void fec_enet_enable_ring(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
int i;
for (i = 0; i < fep->num_rx_queues; i++) { //the i.MX series supports up to 3 receive queues, the i.MX6ULL only 1; the base address of queue 0 lives in FEC_R_DES_START_0 (ENETx_RDSR)
rxq = fep->rx_queue[i];
writel(rxq->bd.dma, fep->hwp + FEC_R_DES_START(i)); //set the queue base address: write the ring buffer's DMA address into ENETx_RDSR
writel(PKT_MAXBUF_SIZE, fep->hwp + FEC_R_BUFF_SIZE(i)); //maximum receive buffer size
/* enable DMA1/2 */
if (i)
writel(RCMR_MATCHEN | RCMR_CMP(i), //with multiple queues, enable the other DMA channels
fep->hwp + FEC_RCMR(i));
}
for (i = 0; i < fep->num_tx_queues; i++) {
txq = fep->tx_queue[i];
writel(txq->bd.dma, fep->hwp + FEC_X_DES_START(i));
/* enable DMA1/2 */
if (i)
writel(DMA_CLASS_EN | IDLE_SLOPE(i),
fep->hwp + FEC_DMA_CFG(i));
}
}
MDIO controller initialization.
fec_enet_mii_init allocates a struct mii_bus and registers it with the kernel.
mii_bus describes an MDIO bus (this is the real, functional bus used to manage PHY devices and read/write PHY registers; struct bus_type mdio_bus_type only exists to match PHY devices with PHY drivers).
The most important members of mii_bus are mii_bus->read and mii_bus->write, which read and write the PHY registers.
These two functions are written by NXP, because every SoC's controller is different, so the vendor has to provide them.
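Once the bus is registered, phylib or the driver reaches those callbacks through the generic helpers; a minimal usage sketch (mdiobus_read and MII_PHYSID1/2 are stock kernel symbols, the wrapper function itself is made up):

/* Read the PHY ID of the PHY at address phy_addr; each mdiobus_read() ends up in fec_enet_mdio_read(). */
static u32 sketch_read_phy_id(struct mii_bus *bus, int phy_addr)
{
	int id1 = mdiobus_read(bus, phy_addr, MII_PHYSID1);
	int id2 = mdiobus_read(bus, phy_addr, MII_PHYSID2);

	if (id1 < 0 || id2 < 0)
		return 0xffffffff; /* read error */
	return ((u32)id1 << 16) | (u32)id2;
}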
static int fec_enet_mii_init(struct platform_device *pdev)
{
static struct mii_bus *fec0_mii_bus;
static bool *fec_mii_bus_share;
struct net_device *ndev = platform_get_drvdata(pdev);
struct fec_enet_private *fep = netdev_priv(ndev);
struct device_node *node;
int err = -ENXIO;
u32 mii_speed, holdtime;
...... some code omitted
fep->mii_bus = mdiobus_alloc();
if (fep->mii_bus == NULL) {
err = -ENOMEM;
goto err_out;
}
fep->mii_bus->name = "fec_enet_mii_bus";
fep->mii_bus->read = fec_enet_mdio_read; //MII read/write functions (they access the PHY chip registers)
fep->mii_bus->write = fec_enet_mdio_write;
snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
pdev->name, fep->dev_id + 1);
fep->mii_bus->priv = fep;
fep->mii_bus->parent = &pdev->dev;
node = of_get_child_by_name(pdev->dev.of_node, "mdio"); //get the mdio node
err = of_mdiobus_register(fep->mii_bus, node); //register the MDIO bus with the kernel
of_node_put(node);
if (err)
goto err_out_free_mdiobus;
mii_cnt++;
/* save fec0 mii_bus */
if (fep->quirks & FEC_QUIRK_SINGLE_MDIO) {
fec0_mii_bus = fep->mii_bus;
fec_mii_bus_share = &fep->mii_bus_share;
}
return 0;
err_out_free_mdiobus:
mdiobus_free(fep->mii_bus);
err_out:
return err;
}
mdiobus_register(mdio); registers the mii_bus with the kernel.
for_each_available_child_of_node(np, child) {} walks all child nodes of the mdio device-tree node, i.e. the phy nodes.
addr = of_mdio_parse_addr(&mdio->dev, child); parses a phy node and returns its reg value, the PHY address.
of_mdiobus_register_phy(mdio, child, addr); registers the PHY device.
//linux-5.4.47\drivers\of\of_mdio.c
int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
{
struct device_node *child;
bool scanphys = false;
int addr, rc;
if (!np)
return mdiobus_register(mdio);
/* Do not continue if the node is disabled */
if (!of_device_is_available(np))
return -ENODEV;
/*Register the MDIO bus */
rc = mdiobus_register(mdio);
if (rc)
return rc;
/* Loop over the child nodes and register a phy_device for each phy */
for_each_available_child_of_node(np, child) {
addr = of_mdio_parse_addr(&mdio->dev, child);
if (addr < 0) {
scanphys = true;
continue;
}
if (of_mdiobus_child_is_phy(child))
rc = of_mdiobus_register_phy(mdio, child, addr);
else
rc = of_mdiobus_register_device(mdio, child, addr);
if (rc == -ENODEV)
dev_err(&mdio->dev,
"MDIO device at address %d is missing.\n",
addr);
else if (rc)
goto unregister;
}
if (!scanphys)
return 0;
/* auto scan for PHYs with empty reg property */
for_each_available_child_of_node(np, child) {
/* Skip PHYs with reg property set */
if (of_find_property(child, "reg", NULL))
continue;
for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
/* skip already registered PHYs */
if (mdiobus_is_registered_device(mdio, addr))
continue;
/* be noisy to encourage people to set reg property */
dev_info(&mdio->dev, "scan phy %pOFn at address %i\n",
child, addr);
if (of_mdiobus_child_is_phy(child)) {
rc = of_mdiobus_register_phy(mdio, child, addr);
if (rc && rc != -ENODEV)
goto unregister;
break;
}
}
}
return 0;
unregister:
mdiobus_unregister(mdio);
return rc;
}
EXPORT_SYMBOL(of_mdiobus_register);
Registering the PHY device:
static int of_mdiobus_register_phy(struct mii_bus *mdio,
struct device_node *child, u32 addr)
{
struct phy_device *phy;
bool is_c45;
int rc;
u32 phy_id;
is_c45 = of_device_is_compatible(child,
"ethernet-phy-ieee802.3-c45");
if (!is_c45 && !of_get_phy_id(child, &phy_id))
phy = phy_device_create(mdio, addr, phy_id, 0, NULL);
else
phy = get_phy_device(mdio, addr, is_c45);
if (IS_ERR(phy))
return PTR_ERR(phy);
rc = of_irq_get(child, 0);
if (rc == -EPROBE_DEFER) {
phy_device_free(phy);
return rc;
}
if (rc > 0) {
phy->irq = rc;
mdio->irq[addr] = rc;
} else {
phy->irq = mdio->irq[addr];
}
if (of_property_read_bool(child, "broken-turn-around"))
mdio->phy_ignore_ta_mask |= 1 << addr;
of_property_read_u32(child, "reset-assert-us",
&phy->mdio.reset_assert_delay);
/* Associate the OF node with the device structure so it
* can be looked up later */
of_node_get(child);
phy->mdio.dev.of_node = child;
phy->mdio.dev.fwnode = of_fwnode_handle(child);
/* All data is now stored in the phy struct;
* register it */
rc = phy_device_register(phy);
if(rc) {
phy_device_free(phy);
of_node_put(child);
return rc;
}
dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n",
child, addr);
return 0;
}
phy = phy_device_create(mdio, addr, phy_id, 0, NULL); creates the PHY device and returns a struct phy_device; the Linux kernel uses struct phy_device to describe a PHY.
rc = phy_device_register(phy); registers the PHY device; during registration, mdio_bus_type->match runs to match the PHY with a PHY driver.
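The match is done by PHY ID: the ID read from the MII_PHYSID1/2 registers is compared against the id/mask pairs each PHY driver declares. A minimal sketch of such a declaration (the ID value and name here are made up for illustration):

static struct phy_driver sketch_phy_driver = {
	.phy_id		= 0x0007c0f0,	/* made-up PHY ID */
	.phy_id_mask	= 0xfffffff0,	/* which ID bits must match */
	.name		= "sketch-phy",
	/* .features, .config_init, .config_aneg, ... would follow in a real driver */
};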
The Linux Ethernet driver exposes net_device_ops to the upper layers so that the network core and user space can control the NIC.
static const struct net_device_ops fec_netdev_ops = {
.ndo_open = fec_enet_open,
.ndo_stop = fec_enet_close,
.ndo_start_xmit = fec_enet_start_xmit,
.ndo_select_queue = fec_enet_select_queue,
.ndo_set_rx_mode = set_multicast_list,
.ndo_validate_addr = eth_validate_addr,
.ndo_tx_timeout = fec_timeout,
.ndo_set_mac_address = fec_set_mac_address,
.ndo_do_ioctl = fec_enet_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = fec_poll_controller,
#endif
.ndo_set_features = fec_set_features,
};
fec_enet_open mainly does the following: enables the clocks and pins, allocates the RX/TX buffers, restarts the MAC (fec_restart), probes and connects the PHY, enables NAPI, starts the PHY, and starts all the TX queues:
static int fec_enet_open(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int ret;
bool reset_again;
ret = pm_runtime_get_sync(&fep->pdev->dev);
if (ret < 0)
return ret;
pinctrl_pm_select_default_state(&fep->pdev->dev);
ret = fec_enet_clk_enable(ndev, true);
if (ret)
goto clk_enable;
/*
fec_enet_clk_enable() calls phy_reset_after_clk_enable(), but at this point the phydev is not attached to the netdev yet,
so that PHY reset will fail.
To make sure the PHY works correctly, we check after the PHY has been probed whether it needs to be reset again.
*/
if (ndev->phydev && ndev->phydev->drv)
reset_again = false;
else
reset_again = true; //once the phydev has been attached to the netdev, the PHY must be reset again
/* I should reset the ring buffers here, but I don't yet know
* a simple way to do that.
*/
ret = fec_enet_alloc_buffers(ndev);
if (ret)
goto err_enet_alloc;
/* Init MAC prior to mii bus probe:
initialize the MAC before the MII bus probe; fec_restart contains part of the MDIO controller setup */
fec_restart(ndev);
/* Probe and connect to PHY when open the interface */
ret = fec_enet_mii_probe(ndev);
if (ret)
goto err_enet_mii_probe;
/* Call phy_reset_after_clk_enable() again if it failed during
* phy_reset_after_clk_enable() before because the PHY wasn't probed.
*/
if (reset_again)
phy_reset_after_clk_enable(ndev->phydev);
if (fep->quirks & FEC_QUIRK_ERR006687)
imx6q_cpuidle_fec_irqs_used();
if (fep->quirks & FEC_QUIRK_HAS_PMQOS)
pm_qos_add_request(&fep->pm_qos_req,
PM_QOS_CPU_DMA_LATENCY,
0);
napi_enable(&fep->napi); //enable NAPI
phy_start(ndev->phydev); //start the PHY
netif_tx_start_all_queues(ndev); //start all the transmit queues
device_set_wakeup_enable(&ndev->dev, fep->wol_flag &
FEC_WOL_FLAG_ENABLE);
return 0;
err_enet_mii_probe:
fec_enet_free_buffers(ndev);
err_enet_alloc:
fec_enet_clk_enable(ndev, false);
clk_enable:
pm_runtime_mark_last_busy(&fep->pdev->dev);
pm_runtime_put_autosuspend(&fep->pdev->dev);
if (!fep->mii_bus_share)
pinctrl_pm_select_sleep_state(&fep->pdev->dev);
return ret;
}
fec_enet_alloc_buffers(ndev); allocates the RX buffers and resets the struct bufdesc entries.
static int fec_enet_alloc_buffers(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
for (i = 0; i < fep->num_rx_queues; i++)
if (fec_enet_alloc_rxq_buffers(ndev, i)) //allocates every bufdesc->cbd_bufaddr under the rxq; those buffers hold the data being transferred
return -ENOMEM;
for (i = 0; i < fep->num_tx_queues; i++) //unlike the rxq, no cbd_bufaddr is allocated here; every txq bufdesc->cbd_bufaddr stays NULL; instead 2048 bytes are allocated and kept in txq->tx_bounce[i]
if (fec_enet_alloc_txq_buffers(ndev, i))
return -ENOMEM;
return 0;
}
fec_enet_alloc_rxq_buffers
Buffers that carry the packet data should use streaming DMA mappings; the NIC driver and the NIC's DMA engine usually communicate through descriptors kept in memory (forming a ring or a chain), and the memory holding those descriptors generally uses a coherent DMA mapping.
The streaming mapping is set up over skb->data, the buffer that will hold the received data; the resulting DMA address is stored in bufdesc->cbd_bufaddr for the device to use, while skb->data itself remains a kernel virtual address for the driver.
static int
fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct sk_buff *skb;
struct bufdesc *bdp;
struct fec_enet_priv_rx_q *rxq;
rxq = fep->rx_queue[queue];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) { //loop over all the bufdesc of one queue
skb = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE); //allocate an skb; FEC_ENET_RX_FRSIZE should be the maximum frame size, 2048
if (!skb)
goto err_alloc;
if (fec_enet_new_rxbdp(ndev, bdp, skb)) { //streaming DMA mapping of skb->data; the DMA address is stored in bufdesc->cbd_bufaddr, tying the bufdesc and the sk_buff together
dev_kfree_skb(skb); //on mapping failure, free the skb
goto err_alloc;
}
rxq->rx_skbuff[i] = skb;
bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY); //mark the buffer as empty
if (fep->bufdesc_ex) {
struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
}
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
}
/* Set the last buffer to wrap. */
bdp = fec_enet_get_prevdesc(bdp, &rxq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP); //mark the last bufdesc of this queue (wrap bit)
return 0;
err_alloc:
fec_enet_free_buffers(ndev);
return -ENOMEM;
}
fec_enet_new_rxbdp
static int
fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff *skb)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int off;
off = ((unsigned long)skb->data) & fep->rx_align; //the manual requires 64-byte alignment
if (off)
skb_reserve(skb, fep->rx_align + 1 - off); //adjust the skb headroom to fix the alignment
/*
dma_map_single: a streaming DMA mapping, normally used on buffers that carry data; it is dynamic and temporary, mapped when needed and unmapped when done.
A streaming mapping must state the transfer direction; DMA_FROM_DEVICE means device to memory, i.e. NIC -> memory, which is always the case for receive.
*/
/*
The streaming mapping covers the memory that holds the transferred data (skb->data).
skb->data is the kernel virtual address used by the driver; bdp->cbd_bufaddr stores the DMA address of the mapping, which the device uses.
This is where the skb is tied to the buffer descriptor.
*/
bdp->cbd_bufaddr = cpu_to_fec32(dma_map_single(&fep->pdev->dev, skb->data, FEC_ENET_RX_FRSIZE - fep->rx_align, DMA_FROM_DEVICE));
if (dma_mapping_error(&fep->pdev->dev, fec32_to_cpu(bdp->cbd_bufaddr))) {
if (net_ratelimit())
netdev_err(ndev, "Rx DMA memory map failed\n");
return -ENOMEM;
}
return 0;
}
fec_enet_alloc_txq_buffers
Unlike the receive side, no buffer memory is mapped for the txq here; I am not sure why, it may have to do with the TSO memory allocated earlier.
txq->tx_bounce is allocated; its exact purpose is not obvious here (it appears to be a bounce buffer for transmit data that is not aligned for DMA).
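Judging from how tx_bounce is used later in the transmit path, it looks like a bounce buffer for unaligned data; a hedged sketch of that pattern (not a copy of the driver code):

static void *sketch_pick_tx_buffer(struct fec_enet_priv_tx_q *txq, int index,
				   struct sk_buff *skb, unsigned long align_mask)
{
	void *bufaddr = skb->data;

	/* if skb->data does not meet the controller's alignment requirement,
	 * copy it into the pre-allocated bounce buffer and DMA-map that instead */
	if ((unsigned long)bufaddr & align_mask) {
		memcpy(txq->tx_bounce[index], skb->data, skb->len);
		bufaddr = txq->tx_bounce[index];
	}
	return bufaddr;
}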
static int
fec_enet_alloc_txq_buffers(struct net_device *ndev, unsigned int queue)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct bufdesc *bdp;
struct fec_enet_priv_tx_q *txq;
txq = fep->tx_queue[queue];
bdp = txq->bd.base;
for (i = 0; i < txq->bd.ring_size; i++) {
txq->tx_bounce[i] = kmalloc(FEC_ENET_TX_FRSIZE, GFP_KERNEL); //allocate the tx_bounce buffer
if (!txq->tx_bounce[i])
goto err_alloc;
bdp->cbd_sc = cpu_to_fec16(0); //clear the bufdesc control and status info
bdp->cbd_bufaddr = cpu_to_fec32(0); //the buffer pointed to by this txq descriptor is empty for now
if (fep->bufdesc_ex) {
struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
ebdp->cbd_esc = cpu_to_fec32(BD_ENET_TX_INT);
}
bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
}
/* Set the last buffer to wrap. */
bdp = fec_enet_get_prevdesc(bdp, &txq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
return 0;
err_alloc:
fec_enet_free_buffers(ndev);
return -ENOMEM;
}
The call into of_phy_connect
of_phy_connect ends up in phy_attach_direct, which binds the phydev to the netdev, and phy_init_hw in turn calls into the PHY driver.
static int fec_enet_mii_probe(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct phy_device *phy_dev = NULL;
char mdio_bus_id[MII_BUS_ID_SIZE];
char phy_name[MII_BUS_ID_SIZE + 3];
int phy_id;
int dev_id = fep->dev_id;
if (fep->phy_node) {
phy_dev = of_phy_connect(ndev, fep->phy_node, //bind the netdev and the PHY
&fec_enet_adjust_link, 0, //fec_enet_adjust_link is the link-state adjustment callback
fep->phy_interface);
if (!phy_dev) {
netdev_err(ndev, "Unable to connect to phy\n");
return -ENODEV;
}
} else {
/* check for attached phy */
for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
if (!mdiobus_is_registered_device(fep->mii_bus, phy_id))
continue;
if (dev_id--)
continue;
strlcpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
break;
}
if (phy_id >= PHY_MAX_ADDR) {
netdev_info(ndev, "no PHY, assuming direct connection to switch\n");
strlcpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
phy_id = 0;
}
snprintf(phy_name, sizeof(phy_name),
PHY_ID_FMT, mdio_bus_id, phy_id);
phy_dev = phy_connect(ndev, phy_name, &fec_enet_adjust_link,
fep->phy_interface);
}
if (IS_ERR(phy_dev)) {
netdev_err(ndev, "could not attach to PHY\n");
return PTR_ERR(phy_dev);
}
/* mask with MAC supported features */
if (fep->quirks & FEC_QUIRK_HAS_GBIT) {
phy_set_max_speed(phy_dev, 1000);
phy_remove_link_mode(phy_dev,
ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
#if !defined(CONFIG_M5272)
phy_support_sym_pause(phy_dev);
#endif
}
else
phy_set_max_speed(phy_dev, 100);
fep->link = 0;
fep->full_duplex = 0;
phy_attached_info(phy_dev);
return 0;
}
void phy_start(struct phy_device *phydev)
{
mutex_lock(&phydev->lock);
if (phydev->state != PHY_READY && phydev->state != PHY_HALTED) {
WARN(1, "called from state %s\n",
phy_state_to_str(phydev->state));
goto out;
}
/* if phy was suspended, bring the physical link up again*/
__phy_resume(phydev); //resume the PHY
phydev->state = PHY_UP; //set the PHY state to UP
phy_start_machine(phydev); //start the PHY state machine, which periodically polls the PHY link state
out:
mutex_unlock(&phydev->lock);
}
fec_enet_close is the reverse of open:
static int
fec_enet_close(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
phy_stop(ndev->phydev); //stop the PHY: change the PHY state and stop the PHY state machine
if (netif_device_present(ndev)) { //check the __LINK_STATE_PRESENT flag; register_netdevice sets it, and without it the netdev is not usable
napi_disable(&fep->napi); //disable NAPI
netif_tx_disable(ndev);
fec_stop(ndev);
}
phy_disconnect(ndev->phydev);
ndev->phydev = NULL;
if (fep->quirks & FEC_QUIRK_ERR006687)
imx6q_cpuidle_fec_irqs_unused();
fec_enet_update_ethtool_stats(ndev);
fec_enet_clk_enable(ndev, false);
if (fep->quirks & FEC_QUIRK_HAS_PMQOS)
pm_qos_remove_request(&fep->pm_qos_req);
if (!fep->mii_bus_share)
pinctrl_pm_select_sleep_state(&fep->pdev->dev);
pm_runtime_mark_last_busy(&fep->pdev->dev);
pm_runtime_put_autosuspend(&fep->pdev->dev);
fec_enet_free_buffers(ndev); //free the txq/rxq memory that held the transferred data
return 0;
}
static inline void netif_tx_disable(struct net_device *dev)
{
unsigned int i;
int cpu;
local_bh_disable();
cpu = smp_processor_id();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
__netif_tx_lock(txq, cpu);
netif_tx_stop_queue(txq);
__netif_tx_unlock(txq);
}
local_bh_enable();
}
static void fec_enet_free_buffers(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct sk_buff *skb;
struct bufdesc *bdp;
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
unsigned int q;
for (q = 0; q < fep->num_rx_queues; q++) {
rxq = fep->rx_queue[q];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) {
skb = rxq->rx_skbuff[i];
rxq->rx_skbuff[i] = NULL;
if (skb) {
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
FEC_ENET_RX_FRSIZE - fep->rx_align,
DMA_FROM_DEVICE);
dev_kfree_skb(skb);
}
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
}
}
for (q = 0; q < fep->num_tx_queues; q++) {
txq = fep->tx_queue[q];
bdp = txq->bd.base;
for (i = 0; i < txq->bd.ring_size; i++) {
kfree(txq->tx_bounce[i]);
txq->tx_bounce[i] = NULL;
skb = txq->tx_skbuff[i];
txq->tx_skbuff[i] = NULL;
dev_kfree_skb(skb);
}
}
}
Linux receives packets with NAPI, which combines interrupts and polling.
After the NIC has moved the data into the ring buffer in memory, it raises a hardware interrupt to tell the CPU that data has arrived. The CPU enters the interrupt handler and dispatches on the event type; for a receive event it schedules NAPI, which adds the driver's napi poll function (every NIC design differs, so every driver has its own receive routine) to the CPU's network softirq per-CPU data and marks the network-receive softirq as pending.
The softirq thread keeps checking whether any softirq bit is set; every bit corresponds to a handler (bound to that bit when the kernel boots). Once it finds a set bit, the softirq thread jumps into that handler, and it is there that the driver's napi poll is called and the real receive path begins.

NAPI call flow
//the NIC driver registers the interrupt and NAPI (usually at init time, in the probe function)
ret = devm_request_irq(&pdev->dev, irq, fec_enet_interrupt,
0, pdev->name, ndev);
netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, NAPI_POLL_WEIGHT);
/*
What the interrupt handler is responsible for:
1. read the interrupt events
2. clear the interrupt flags
3. mask the NAPI interrupts
4. schedule NAPI
*/
static irqreturn_t fec_enet_interrupt(int irq, void *dev_id)
{
struct net_device *ndev = dev_id;
struct fec_enet_private *fep = netdev_priv(ndev);
uint int_events;
irqreturn_t ret = IRQ_NONE;
int_events = readl(fep->hwp + FEC_IEVENT); //read the interrupt events
writel(int_events, fep->hwp + FEC_IEVENT); //clear the interrupt flags
fec_enet_collect_events(fep, int_events); //marks fep->work_rx or fep->work_tx according to the events
if ((fep->work_tx || fep->work_rx) && fep->link) {
ret = IRQ_HANDLED;
if (napi_schedule_prep(&fep->napi)) { //check whether NAPI can be scheduled
/* Disable the NAPI interrupts */
writel(FEC_NAPI_IMASK, fep->hwp + FEC_IMASK); //mask the NAPI interrupts
__napi_schedule(&fep->napi); //start NAPI scheduling
}
}
if (int_events & FEC_ENET_MII) { //an MDIO transfer has completed
ret = IRQ_HANDLED;
complete(&fep->mdio_done);
}
return ret;
}
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n); //this_cpu_ptr(&softnet_data) gets the current CPU's network softirq per-CPU data
local_irq_restore(flags);
}
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list); //add the driver's napi to the list in the CPU's network softirq per-CPU data
__raise_softirq_irqoff(NET_RX_SOFTIRQ); //mark the softirq pending bit: NET_RX_SOFTIRQ
}
//__raise_softirq_irqoff simply ORs a 1 into the corresponding pending bit
void __raise_softirq_irqoff(unsigned int nr)
{
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
Softirqs: on a multi-core CPU every core runs a softirq thread, which handles the work the hardware interrupt did not have time for; the hardware interrupt handler is what marks the softirq. The softirq is handled on the same CPU that handled the hardware interrupt; a single-core CPU has just one softirq thread.
Why NAPI helps: it raises receive efficiency. Early kernel versions received packets purely through interrupts, so under a heavy packet stream the CPU spent a long time stuck in the interrupt handler, other work could not be scheduled, and performance dropped. Pure polling, on the other hand, is inefficient when there are lots of small packets.
At boot the kernel calls smpboot_register_percpu_thread from spawn_ksoftirqd to create one softirq thread per CPU.
//file: kernel/softirq.c
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
static __init int spawn_ksoftirqd(void)
{
register_cpu_notifier(&cpu_nfb);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
}
early_initcall(spawn_ksoftirqd);
smpboot_register_percpu_thread is defined in kernel/smpboot.c.
//kernel/smpboot.c
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
unsigned int cpu;
int ret = 0;
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
goto out;
}
smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
return ret;
}
When the Linux kernel boots it calls net_dev_init, the network subsystem's init function,
which registers the network transmit and receive actions and binds them to their softirq numbers.
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
//file: net/core/dev.c
static int __init net_dev_init(void)
{
......
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
sd->completion_queue = NULL;
INIT_LIST_HEAD(&sd->poll_list);
......
}
......
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
}
subsys_initcall(net_dev_init);
The kernel has more softirq handlers than just the network ones; there are other types as well.
//file: include/linux/interrupt.h
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
BLOCK_IOPOLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
softirq_threads describes a softirq thread. Once the softirq thread is registered it keeps calling ksoftirqd_should_run to check whether a softirq is pending.
When the hard interrupt scheduled NAPI, or_softirq_pending set the softirq bit, so local_softirq_pending can now see that a softirq is pending, and run_ksoftirqd is called.
#define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref))
#define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x)))
#define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x)))
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
Hardware interrupts are disabled before entering the softirq handling (I am not sure why; in Wei Dongshan's tutorials, softirqs can be interrupted by hardware interrupts. It looks like local_irq_disable() here only protects the check of the pending flags, since __do_softirq() re-enables interrupts while the actions run).
static void run_ksoftirqd(unsigned int cpu)
{
local_irq_disable(); //disable hardware interrupts
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
local_irq_enable();
cond_resched();
return;
}
local_irq_enable();
}
__do_softirq
h->action(h); runs the registered action; in our case that is net_rx_action, the network receive softirq handler.
asmlinkage void __do_softirq(void)
{
do {
if (pending & 1) {
unsigned int vec_nr = h - softirq_vec;
int prev_count = preempt_count();
...
trace_softirq_entry(vec_nr);
h->action(h);
trace_softirq_exit(vec_nr);
...
}
h++;
pending >>= 1;
} while (pending);
}
The time_limit and budget at the top of the function make net_rx_action exit voluntarily, so that packet reception does not hog the CPU; the remaining packets are handled the next time the NIC raises a hard interrupt. budget can be tuned through a kernel parameter. The rest of the core logic fetches the per-CPU variable softnet_data, walks its poll_list, and calls the poll function the NIC driver registered. On the i.MX6ULL that poll function is fec_enet_rx_napi.
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&sd->poll_list)) {
......
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
trace_napi_poll(n);
}
budget -= work;
}
}
static int fec_enet_rx_napi(struct napi_struct *napi, int budget)
{
struct net_device *ndev = napi->dev;
struct fec_enet_private *fep = netdev_priv(ndev);
int pkts; //number of packets
pkts = fec_enet_rx(ndev, budget); //budget makes the function bail out voluntarily so that packet reception does not monopolise the CPU
fec_enet_tx(ndev);
if (pkts < budget) {
napi_complete_done(napi, pkts);
writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
}
return pkts;
}
The call chain then continues as
fec_enet_rx -> fec_enet_rx_queue
fec_enet_rx_queue is fairly involved; I will look at it later.
To sum up, fec_probe mainly: reads the TX/RX queue counts from the device tree, allocates the netdev together with its private data, maps the MAC registers, parses the device tree (stop mode, magic packet, PHY node, PHY mode), gets the clocks and the PHY regulator, resets the PHY, allocates the queues and descriptor rings and sets netdev_ops/ethtool_ops/NAPI in fec_enet_init, requests the interrupts, initializes the MDIO bus, and finally registers the netdev.