
更详细的介绍参考《TCP/IP详解卷 1:协议》第3章 IP:网际协议。
- struct iphdr {
- #if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 ihl:4,
- version:4;
- #elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 version:4,
- ihl:4;
- #else
- #error "Please fix
" - #endif
- __u8 tos;
- __be16 tot_len;
- __be16 id;
- __be16 frag_off;
- __u8 ttl;
- __u8 protocol;
- __sum16 check;
- __be32 saddr;
- __be32 daddr;
- /*The options start here. */
- };
3位标志位:
- /* IP flags: host-byte-order masks for the 16-bit frag_off field (convert with htons() before storing in the header). */
- #define IP_CE 0x8000 /* Flag: "Congestion" (top bit of frag_off) */
- #define IP_DF 0x4000 /* Flag: "Don't Fragment" */
- #define IP_MF 0x2000 /* Flag: "More Fragments" */
- #define IP_OFFSET 0x1FFF /* "Fragment Offset" part (low 13 bits) */
(参考资料《Linux Kernel Networking - Implementation and Theory》、机械工业出版社《Linux内核源码剖析:TCP/IP实现(上册)》)
__ip_route_output_key_hash返回路由rtable,rtable里面包含路由缓存项dst_entry,对于输出路由,主要用到了路由及路由缓存项的输出函数指针output、网卡设备net_device、网关rt_gateway,__ip_route_output_key_hash调用栈:

dst_output调用路由缓存的输出函数指针output,然后调用ip_output、ip_finish_output、ip_finish_output2,ip_finish_output2找到路由缓存的下一跳(网关或者局域网内的其他主机),然后调用邻居子系统的dst_neigh_output发送报文,ip_finish_output2邻居查找及报文发送代码实现如下:

ip_finish_output2函数调用栈如下:

ip_route_input_slow调用skb_dst_set_noref设置skb的路由缓存,路由缓存dst包含一个输入函数指针,最终调用该输入函数处理输入报文:

ip_route_input_slow调用栈:
查找到路由缓存之后,调用dst_input函数,dst_input函数调用路由缓存的input函数处理报文,dst_input调用栈:

ip_queue_xmit主要检查skb是否已经设置路由,如果没有就查找路由,如果找不到路由就丢弃报文;有路由就设置IP首部(IP首部的16位总长度、16位首部检验和在下一级函数设置),调用ip_local_out、rt->dst.output输出IP报文。
ip_queue_xmit代码实现如下:
- int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) // Route skb if it is not routed yet, build its IPv4 header and pass it to ip_local_out(); returns that call's result, or -EHOSTUNREACH (skb freed) when no usable route exists
- {
- struct inet_sock *inet = inet_sk(sk);
- struct net *net = sock_net(sk);
- struct ip_options_rcu *inet_opt;
- struct flowi4 *fl4;
- struct rtable *rt;
- struct iphdr *iph;
- int res;
-
- /* Skip all of this if the packet is already routed,
- * f.e. by something like SCTP.
- */
- rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- fl4 = &fl->u.ip4;
- rt = skb_rtable(skb); // route already attached to the skb, if any (output device, next hop, ...)
- if (rt)
- goto packet_routed; // already routed: reuse the attached route
-
- /* Make sure we can route this packet. */
- rt = (struct rtable *)__sk_dst_check(sk, 0); // look up the route cached on the socket (sk_dst_cache)
- if (!rt) {
- __be32 daddr;
-
- /* Use correct destination address if we have options. */
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
-
- /* If this fails, retransmit mechanism of transport layer will
- * keep trying until route appears or the connection times
- * itself out.
- */
- rt = ip_route_output_ports(net, fl4, sk,
- daddr, inet->inet_saddr,
- inet->inet_dport,
- inet->inet_sport,
- sk->sk_protocol,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if); // no cached route: query the output route for this flow (per the original note this goes through ip_route_output_flow; background: "Linux Kernel Source Analysis: TCP/IP Implementation", vol. 2, ch. 20 "Route Cache")
- if (IS_ERR(rt))
- goto no_route; // lookup failed: jump to no_route and drop the packet
- sk_setup_caps(sk, &rt->dst);
- }
- skb_dst_set_noref(skb, &rt->dst); // attach the route to the skb (sets skb->_skb_refdst)
-
- packet_routed:
- if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
- goto no_route;
-
- /* OK, we know where to send it, allocate and build IP header. */
- skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); // reserve room in front of the transport data for the IP header plus the options (0 option bytes when there are none)
- skb_reset_network_header(skb);
- iph = ip_hdr(skb); // pointer to the IP header just reserved
- *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); // first 16 bits of the header: (4 << 12) is the version (IPv4), (5 << 8) is the header length of 5 words = 20 bytes (options not counted yet), (inet->tos & 0xff) is the 8-bit TOS
- if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
- iph->frag_off = htons(IP_DF); // set the Don't Fragment flag
- else
- iph->frag_off = 0;
- iph->ttl = ip_select_ttl(inet, &rt->dst); // 8-bit time to live
- iph->protocol = sk->sk_protocol; // 8-bit protocol (e.g. IPPROTO_TCP / IPPROTO_UDP)
- ip_copy_addrs(iph, fl4); // fill in the header addresses from the flow (presumably both saddr and daddr; helper not shown here - confirm)
-
- /* Transport layer set skb->h.foo itself. */
-
- if (inet_opt && inet_opt->opt.optlen) { // options present
- iph->ihl += inet_opt->opt.optlen >> 2; // add the option length, in 32-bit words, to the header length
- ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); // copy the options into the header
- }
-
- ip_select_ident_segs(net, skb, sk,
- skb_shinfo(skb)->gso_segs ?: 1); // 16-bit identification (lets the receiver tell which fragments belong to the same datagram)
-
- /* TODO : should we use skb->sk here instead of sk ? */
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
-
- res = ip_local_out(net, sk, skb); // send the packet; per the original note, tot_len is filled in by __ip_local_out() and the header checksum by ip_send_check()
- rcu_read_unlock();
- return res;
-
- no_route:
- rcu_read_unlock();
- IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
- return -EHOSTUNREACH;
- }
ip_queue_xmit调用栈如下:


ip_packet_type定义ETH_P_IP报文输入函数:
- static struct packet_type ip_packet_type __read_mostly = { // packet-type entry that ties IPv4 frames to their receive function
- .type = cpu_to_be16(ETH_P_IP), // EtherType to match, stored big-endian
- .func = ip_rcv, // input function invoked for matching packets
- };
以太网首部:
- struct ethhdr { /* Ethernet frame header; packed so no padding is inserted between fields */
- unsigned char h_dest[ETH_ALEN]; /* destination eth addr */
- unsigned char h_source[ETH_ALEN]; /* source ether addr */
- __be16 h_proto; /* packet type ID field (big-endian; e.g. ETH_P_IP for IPv4) */
- } __attribute__((packed));
smsc911x_poll调用eth_type_trans读取以太网首部类型字段并设置接收报文类型skb->protocol:

__netif_receive_skb_core找到ip_packet_type,并调用输入函数ip_rcv处理输入报文:

ip_rcv函数调用栈:

ip_rcv校验IP首部各字段、校验和、总长度等,校验通过之后调用ip_rcv_finish继续处理IP报文。
ip_rcv函数代码实现:
- int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) // IPv4 receive entry point: validates the header and lengths, then passes the skb through the NF_INET_PRE_ROUTING hook to ip_rcv_finish(); returns NET_RX_DROP on any failure
- {
- const struct iphdr *iph;
- struct net *net;
- u32 len;
-
- /* When the interface is in promisc. mode, drop all the crap
- * that it receives, do not try to analyse it.
- */
- if (skb->pkt_type == PACKET_OTHERHOST) // per the original note, eth_type_trans() sets PACKET_OTHERHOST when the Ethernet destination address is not the receiving interface's address
- goto drop; // drop packets addressed to other hosts
-
-
- net = dev_net(dev);
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);
-
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (!skb) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
- goto out;
- }
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr))) // length check: at least a basic IP header (no options) must be available
- goto inhdr_error; // shorter than a basic IP header (incomplete packet) or another error
-
- iph = ip_hdr(skb); // pointer to the IP header
-
- /*
- * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
- *
- * Is the datagram acceptable?
- *
- * 1. Length at least the size of an ip header
- * 2. Version of 4
- * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
- * 4. Doesn't have a bogus length
- */
-
- if (iph->ihl < 5 || iph->version != 4) // header length must be at least 5 words (5*4 = 20 bytes without options) and the version must be 4 (IPv4)
- goto inhdr_error;
-
- BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
- BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
- BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
- IP_ADD_STATS_BH(net,
- IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
- max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
-
- if (!pskb_may_pull(skb, iph->ihl*4)) // length check: the full header, options included, must be available
- goto inhdr_error; // not enough data for the full header (with options) or another error
-
- iph = ip_hdr(skb); // re-fetch the IP header pointer after pskb_may_pull()
-
- if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) // verify the 16-bit header checksum over ihl words (options included)
- goto csum_error; // header checksum mismatch
-
- len = ntohs(iph->tot_len); // 16-bit total length in bytes
- if (skb->len < len) { // buffer holds fewer bytes than the header's total length
- IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
- goto drop; // truncated packet: drop it
- } else if (len < (iph->ihl*4)) // total length is smaller than the header length (with options)
- goto inhdr_error; // bogus total length: treat as a header error
-
- /* Our transport medium may have padded the buffer out. Now we know it
- * is IP we can trim to the true length of the frame.
- * Note this now means skb->len holds ntohs(iph->tot_len).
- */
- if (pskb_trim_rcsum(skb, len)) { // trim the data that lies beyond len at the tail of the skb
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
-
- skb->transport_header = skb->network_header + iph->ihl*4; // transport header starts right after the IP header
-
- /* Remove any debris in the socket control block */
- memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-
- /* Must drop socket now because of tproxy. */
- skb_orphan(skb);
-
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
- net, NULL, skb, dev, NULL,
- ip_rcv_finish); // continue in ip_rcv_finish()
-
- csum_error:
- IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS);
- inhdr_error:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
- drop:
- kfree_skb(skb);
- out:
- return NET_RX_DROP;
- }
链路层只管物理(MAC)地址不管IP地址,IP报文校验完成之后还得调用ip_route_input_noref查找输入路由,看看是否有输入路由(发往本地或者转发),如果没有输入路由则丢弃报文,否则调用dst_input根据路由处理报文(发往本地或者转发)。
ip_rcv_finish函数实现代码如下:
- static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) // Second half of IPv4 receive: finds the input route if none is attached, processes IP options, then dispatches via dst_input(); frees the skb and returns NET_RX_DROP on failure
- {
- const struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt;
-
- if (sysctl_ip_early_demux &&
- !skb_dst(skb) &&
- !skb->sk &&
- !ip_is_fragment(iph)) {
- const struct net_protocol *ipprot;
- int protocol = iph->protocol;
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot && ipprot->early_demux) {
- ipprot->early_demux(skb);
- /* must reload iph, skb->head might have changed */
- iph = ip_hdr(skb);
- }
- }
-
- /*
- * Initialise the virtual path cache for the packet. It describes
- * how the packet travels inside Linux networking.
- */
- if (!skb_valid_dst(skb)) { // no valid input route attached to the skb yet
- int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev); // look up the input route
- if (unlikely(err)) { // no input route found
- if (err == -EXDEV)
- NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER);
- goto drop; // jump to drop and discard the packet
- }
- }
-
- #ifdef CONFIG_IP_ROUTE_CLASSID
- if (unlikely(skb_dst(skb)->tclassid)) {
- struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
- u32 idx = skb_dst(skb)->tclassid;
- st[idx&0xFF].o_packets++;
- st[idx&0xFF].o_bytes += skb->len;
- st[(idx>>16)&0xFF].i_packets++;
- st[(idx>>16)&0xFF].i_bytes += skb->len;
- }
- #endif
-
- if (iph->ihl > 5 && ip_rcv_options(skb)) // header longer than 5 words means options are present: process them
- goto drop; // option processing failed: discard the packet
-
- rt = skb_rtable(skb);
- if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
- } else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
-
- return dst_input(skb); // hand the packet to dst_input() for processing according to the route
-
- drop:
- kfree_skb(skb);
- return NET_RX_DROP;
- }
ip_local_deliver调用栈:

对于发往本地的报文,调用ip_local_deliver、ip_local_deliver_finish,ip_local_deliver_finish根据IP首部的8位协议找到传输层对应协议的输入处理函数,调用传输层的处理函数处理传输层报文,对于IPv4的TCP协议就是tcp_v4_rcv。
ip_local_deliver_finish函数代码实现如下:
- static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) // Deliver a locally destined packet to raw sockets and to the transport handler selected by the IP protocol field; always returns 0
- {
- __skb_pull(skb, skb_network_header_len(skb));
-
- rcu_read_lock();
- {
- int protocol = ip_hdr(skb)->protocol; // 8-bit protocol field (IPPROTO_TCP / IPPROTO_UDP / IPPROTO_RAW ...)
- const struct net_protocol *ipprot;
- int raw;
-
- resubmit:
- raw = raw_local_deliver(skb, protocol);
-
- ipprot = rcu_dereference(inet_protos[protocol]); // transport-layer net_protocol registered for this protocol number (e.g. tcp_protocol / udp_protocol)
- if (ipprot) {
- int ret;
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
- }
- ret = ipprot->handler(skb); // call the transport-layer receive handler (e.g. tcp_v4_rcv / udp_rcv)
- if (ret < 0) {
- protocol = -ret; // a negative return asks for redelivery under protocol number -ret
- goto resubmit;
- }
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
- } else {
- if (!raw) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- kfree_skb(skb);
- } else {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
- consume_skb(skb);
- }
- }
- }
- out:
- rcu_read_unlock();
-
- return 0;
- }
ip_local_deliver_finish调用栈:
