• ipv4: inet初始化过程


    Internet Protocol version 4,简称IPv4。内核的ipv4模块在启动时,先初始化IP分片(fragment)专用的工作队列,然后依次执行cipso初始化、inet初始化等。

    目录


    1. 源码流程

    1.1 inet_frag_wq_init

    1.2 inet_init

    2. 源码结构

    3. 部分结构定义

    4. 扩展函数


    内容


    1. 源码流程

    1.1 inet_frag_wq_init

      ipv4模块首先创建专用工作队列inet_frag_wq(参考内核补丁:https://lore.kernel.org/lkml/20201211112405.31158-1-sjpark@amazon.com/ )。

    /*
     * Init-time hook: create the workqueue named "inet_frag_wq" and publish it
     * through the global inet_frag_wq.  Returns 0 on success; if the workqueue
     * cannot be created the kernel panics.
     */
    static int __init inet_frag_wq_init(void)
    {
            inet_frag_wq = create_workqueue("inet_frag_wq");
            if (inet_frag_wq)
                    return 0;

            /* Without the queue there is nowhere to run fqdir teardown work. */
            panic("Could not create inet frag workq");
            return 0;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8

      fqdir_exit函数会把fqdir的销毁工作(destroy_work)加入inet_frag_wq队列,由该工作队列异步执行(见下文fqdir_exit)。


    1.2 inet_init

    /*
     * inet_init - boot-time initialisation of the AF_INET (IPv4) stack.
     *
     * Registers the core struct proto objects, the socket family, the base
     * network protocols and the socket switch table, then initialises the
     * individual sub-systems (ARP, IP, TCP, UDP, ...).  Returns 0 on success or
     * the error from a failed proto_register().
     */
    static int __init inet_init(void)
    {
            struct inet_protosw *q;
            struct list_head *r;
            int rc;
    
            /*
             * sock_skb_cb lives at the tail of skb->cb[], so protocol families
             * that use skb->cb[] directly keep their layout and alignment
             * guarantees.  This is a build-time check (BUILD_BUG_ON) that
             * struct inet_skb_parm fits in front of it: per the original note
             * the limit is 44 bytes (48-byte cb[] minus the 4-byte u32).
             */
            sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
    
            /*
             * Register the TCP struct proto; the second argument (1) requests
             * slab-cache creation.  tcp_prot sets both .rsk_prot and
             * .twsk_prot, so the request_sock and timewait caches are created
             * as well.  Per the original note the proto is also added to
             * proto_list and given a slot in proto_inuse_idx
             * (PROTO_INUSE_NR = 64; registration fails once slots run out).
             */
            rc = proto_register(&tcp_prot, 1);
            if (rc)
                    goto out;       /* nothing registered yet, nothing to undo */
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

      tcp_prot分析,继续看inet_init:

            rc = proto_register(&udp_prot, 1); // 注册udp协议结构 
            // 创建UDP缓存,request_sock_UDP缓存,tw_sock_UDP缓存 
    
    • 1
    • 2

      udp_prot分析,继续看inet_init:

            rc = proto_register(&raw_prot, 1);  
            // 创建RAW缓存,request_sock_RAW缓存,tw_sock_RAW缓存  
            // raw_prot...
    
            rc = proto_register(&ping_prot, 1);
            // 创建PING缓存,request_sock_PING缓存,tw_sock_PING缓存  
            // ping_prot...
    
            (void)sock_register(&inet_family_ops);
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

      sock_register分析,继续看inet_init:

    #ifdef CONFIG_SYSCTL
            /*
             * Register the static IPv4 sysctl entries (only when sysctl
             * support is configured).  Per the original note, the ctl_table is
             * attached to a ctl_table_header and inserted into a ctl_dir whose
             * nodes are kept in a red-black tree.
             * NOTE(review): the callee is not shown here - confirm.
             */
            ip_static_sysctl_init(); 
    #endif
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6

      ip_static_sysctl_init分析,继续看inet_init:

    /*
     * Add all the base protocols.
     * IPPROTO_ICMP = 1 (Internet Control Message Protocol); icmp_protocol is
     * the struct net_protocol carrying its receive and error handlers.
     * A failure is only logged - initialisation carries on.
     */
    if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
                    pr_crit("%s: Cannot add ICMP protocol\n", __func__);
    
    • 1
    • 2
    • 3
    • 4
    • 5

      内核定义了网络协议处理结构的指针数组inet_protos(元素类型为struct net_protocol *),数组长度为256,按IP协议号(0~255)索引;inet_add_protocol就是把协议结构登记到数组中对应协议号的槽位。

    /* IPPROTO_UDP = 17 (User Datagram Protocol); failure is only logged. */
    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
            pr_crit("%s: Cannot add UDP protocol\n", __func__);
    
    /* IPPROTO_TCP = 6 (Transmission Control Protocol); failure is only logged. */
    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
            pr_crit("%s: Cannot add TCP protocol\n", __func__);
    
    #ifdef CONFIG_IP_MULTICAST
            /* IPPROTO_IGMP = 2 (Internet Group Management Protocol). */
            if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
                    pr_crit("%s: Cannot add IGMP protocol\n", __func__);
    #endif
    
    /*
     * Register the socket-side information for inet_create().
     * inetsw is "static struct list_head inetsw[SOCK_MAX]" with
     * SOCK_MAX = SOCK_PACKET (10) + 1; start with every list empty.
     */
    for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
            INIT_LIST_HEAD(r);
    
    /*
     * Register each built-in struct inet_protosw (the IP socket-interface
     * registration record).  Per the original note, an entry that is not yet
     * on the inetsw list for its socket type is linked into that list.
     */
    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
            inet_register_protosw(q);
    
    /* Initialise ARP (Address Resolution Protocol). */
    arp_init();
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26

      arp_init分析,继续看inet_init:

    /*
     * Initialise the IP layer.
     * NOTE(review): upstream ip_init() calls ip_rt_init() (routing - "rt" is
     * not "runtime"), inet_initpeers() (inet peer storage) and, when multicast
     * and procfs are configured, igmp_mc_init(); the callee is not shown here,
     * so confirm against the kernel version being described.
     */
    ip_init();
    
    • 1
    • 2

      ip_init分析,继续看inet_init:

    /*
     * Set up the IPv4 MIBs; failure is fatal.
     * NOTE(review): MIB is the SNMP "Management Information Base", i.e. the
     * per-protocol statistics counters, set up per network namespace through
     * ipv4_mib_ops.  The earlier description of MIBs as tunnel management
     * appears to be a mix-up - confirm.
     */
    if (init_ipv4_mibs())
                    panic("%s: Cannot init ipv4 mibs\n", __func__);
    
    /* Initialise TCP. */
    tcp_init();
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6

      tcp_init分析,继续看inet_init:

    /*
     * Initialise UDP.  Per the original notes this sets up the "UDP" table,
     * the sysctl_udp_mem[] limits ([0] normal range, [1] pressure threshold,
     * [2] hard limit above which allocation is refused) and registers
     * udp_sysctl_ops.
     * NOTE(review): the final step presumably registers the BPF iterator
     * described by udp_reg_info (a struct bpf_iter_reg) rather than a packet
     * filter - confirm against udp_init().
     */
    udp_init();
    
    /* Register UDP-Lite (Lightweight User Datagram Protocol). */
    udplite4_register();
    
    /*
     * Initialise raw sockets.  NOTE(review): "raw" presumably refers to raw IP
     * sockets (SOCK_RAW / raw_prot in inetsw_array), not a printing protocol.
     */
    raw_init();
    
    /* Initialise the ping table. */
    ping_init();
    
    /*
     * Set up the ICMP layer.  ICMP is a connectionless protocol used to carry
     * error reports and control information.  Failure is fatal.
     */
    if (icmp_init() < 0)
                    panic("Failed to create the ICMP control socket.\n");
    
    #if defined(CONFIG_IP_MROUTE)
            /* Initialise IPv4 multicast routing; failure is only logged. */
            if (ip_mr_init())
                    pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
    #endif
    
    /* Register the inet per-network-namespace operations; failure is only logged. */
    if (init_inet_pernet_ops())
                    pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
    
    /* Create the IPv4 proc entries (per the original note: raw, tcp4, udp4, ping, ip). */
    ipv4_proc_init();
    
    /*
     * Set up IP fragment reassembly
     * (per the original note: static struct inet_frags ip4_frags).
     */
    ipfrag_init();
    
    /*
     * Add the IPv4 packet handler to the network stack: ip_packet_type routes
     * ETH_P_IP frames to ip_rcv()/ip_list_rcv().
     */
    dev_add_pack(&ip_packet_type);
    
    /*
     * IP tunnel core setup.  NOTE(review): upstream this registers the
     * lightweight-tunnel encap ops (ip_tun_lwt_ops / ip6_tun_lwt_ops) rather
     * than creating a tunnel - confirm.
     */
    ip_tunnel_core_init();
    
            rc = 0;
    out:
            return rc;
    /*
     * Error unwinding: each label unregisters one struct proto and falls
     * through to the next, so entering at a given label undoes that proto and
     * every one registered before it, in reverse order; rc is then returned
     * unchanged through "out".
     */
    out_unregister_raw_proto:
            proto_unregister(&raw_prot);
    out_unregister_udp_proto:
            proto_unregister(&udp_prot);
    out_unregister_tcp_proto:
            proto_unregister(&tcp_prot);
            goto out;
    }
    
    /* Run inet_init() at the fs_initcall level. */
    fs_initcall(inet_init);
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51

    fqdir_exit

    /*
     * Schedule deferred teardown of @fqdir: initialise its destroy_work item
     * with fqdir_work_fn as the handler and queue it on inet_frag_wq, so the
     * cleanup itself runs later from that workqueue rather than in the caller.
     */
    void fqdir_exit(struct fqdir *fqdir)
    {
            INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
            queue_work(inet_frag_wq, &fqdir->destroy_work);
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5

    2. 源码结构

    tcp_prot

    /*
     * struct proto for IPv4 TCP: the table of socket-level operations and
     * memory-accounting hooks that inet_init() registers with
     * proto_register(&tcp_prot, 1).  Exported for use by other modules.
     */
    struct proto tcp_prot = {
            .name                   = "TCP",
            .owner                  = THIS_MODULE,
            .close                  = tcp_close,
            .pre_connect            = tcp_v4_pre_connect,
            .connect                = tcp_v4_connect,
            .disconnect             = tcp_disconnect,
            .accept                 = inet_csk_accept,
            .ioctl                  = tcp_ioctl,
            .init                   = tcp_v4_init_sock,
            .destroy                = tcp_v4_destroy_sock,
            .shutdown               = tcp_shutdown,
            .setsockopt             = tcp_setsockopt,
            .getsockopt             = tcp_getsockopt,
            .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
            .keepalive              = tcp_set_keepalive,
            .recvmsg                = tcp_recvmsg,
            .sendmsg                = tcp_sendmsg,
            .sendpage               = tcp_sendpage,
            .backlog_rcv            = tcp_v4_do_rcv,
            .release_cb             = tcp_release_cb,
            .hash                   = inet_hash,
            .unhash                 = inet_unhash,
            .get_port               = inet_csk_get_port,
            .put_port               = inet_put_port,
    #ifdef CONFIG_BPF_SYSCALL
            /* Only present when the BPF syscall is configured. */
            .psock_update_sk_prot   = tcp_bpf_update_proto,
    #endif
            /* Memory-pressure hooks and the shared accounting counters. */
            .enter_memory_pressure  = tcp_enter_memory_pressure,
            .leave_memory_pressure  = tcp_leave_memory_pressure,
            .stream_memory_free     = tcp_stream_memory_free,
            .sockets_allocated      = &tcp_sockets_allocated,
            .orphan_count           = &tcp_orphan_count,
            .memory_allocated       = &tcp_memory_allocated,
            .memory_pressure        = &tcp_memory_pressure,
            .sysctl_mem             = sysctl_tcp_mem,
            /* Offsets of the per-netns wmem/rmem sysctls inside struct net. */
            .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
            .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
            .max_header             = MAX_TCP_HEADER,
            /* Size of the object allocated per socket. */
            .obj_size               = sizeof(struct tcp_sock),
            .slab_flags             = SLAB_TYPESAFE_BY_RCU,
            /* Timewait and request-sock descriptors (TCP sets both). */
            .twsk_prot              = &tcp_timewait_sock_ops,
            .rsk_prot               = &tcp_request_sock_ops,
            .h.hashinfo             = &tcp_hashinfo,
            .no_autobind            = true,
            .diag_destroy           = tcp_abort,
    };
    EXPORT_SYMBOL(tcp_prot);
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48

    udp_prot

    /*
     * struct proto for IPv4 UDP, registered by inet_init() with
     * proto_register(&udp_prot, 1).  Unlike tcp_prot it sets no .twsk_prot or
     * .rsk_prot.  Exported for use by other modules.
     */
    struct proto udp_prot = {
            .name                   = "UDP",
            .owner                  = THIS_MODULE,
            .close                  = udp_lib_close,
            .pre_connect            = udp_pre_connect,
            .connect                = ip4_datagram_connect,
            .disconnect             = udp_disconnect,
            .ioctl                  = udp_ioctl,
            .init                   = udp_init_sock,
            .destroy                = udp_destroy_sock,
            .setsockopt             = udp_setsockopt,
            .getsockopt             = udp_getsockopt,
            .sendmsg                = udp_sendmsg,
            .recvmsg                = udp_recvmsg,
            .sendpage               = udp_sendpage,
            .release_cb             = ip4_datagram_release_cb,
            .hash                   = udp_lib_hash,
            .unhash                 = udp_lib_unhash,
            .rehash                 = udp_v4_rehash,
            .get_port               = udp_v4_get_port,
            /* Releasing the port reuses the unhash routine. */
            .put_port               = udp_lib_unhash,
            #ifdef CONFIG_BPF_SYSCALL
            /* Only present when the BPF syscall is configured. */
            .psock_update_sk_prot   = udp_bpf_update_proto,
    #endif
            .memory_allocated       = &udp_memory_allocated,
            .sysctl_mem             = sysctl_udp_mem,
            /* Offsets of the per-netns minimum wmem/rmem sysctls inside struct net. */
            .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
            .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
            .obj_size               = sizeof(struct udp_sock),
            .h.udp_table            = &udp_table,
            .diag_destroy           = udp_abort,
    };
    EXPORT_SYMBOL(udp_prot);
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33

      ipv4_route_table定义:

    /*
     * sysctl table of IPv4 routing tunables.  Every entry exposes one int
     * variable (maxlen = sizeof(int)) under the given procname with mode 0644;
     * the proc_handler decides the unit seen by user space (plain integer,
     * jiffies, or milliseconds converted to jiffies).  The empty entry at the
     * end terminates the table.
     */
    static struct ctl_table ipv4_route_table[] = {
            {
                    .procname       = "gc_thresh",
                    .data           = &ipv4_dst_ops.gc_thresh,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            {
                    .procname       = "max_size",
                    .data           = &ip_rt_max_size,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            {
                    /*  Deprecated. Use gc_min_interval_ms */
    
                    .procname       = "gc_min_interval",
                    .data           = &ip_rt_gc_min_interval,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec_jiffies,
            },
            {
                    /* Same variable as gc_min_interval, via the ms handler. */
                    .procname       = "gc_min_interval_ms",
                    .data           = &ip_rt_gc_min_interval,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec_ms_jiffies,
            },
            {
                    .procname       = "gc_timeout",
                    .data           = &ip_rt_gc_timeout,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec_jiffies,
            },
            {
                    .procname       = "gc_interval",
                    .data           = &ip_rt_gc_interval,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec_jiffies,
            },
            {
                    .procname       = "redirect_load",
                    .data           = &ip_rt_redirect_load,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            {
                    .procname       = "redirect_number",
                    .data           = &ip_rt_redirect_number,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            {
                    .procname       = "redirect_silence",
                    .data           = &ip_rt_redirect_silence,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            {
                    .procname       = "error_cost",
                    .data           = &ip_rt_error_cost,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            {
                    .procname       = "error_burst",
                    .data           = &ip_rt_error_burst,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            {
                    .procname       = "gc_elasticity",
                    .data           = &ip_rt_gc_elasticity,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            { }
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89

      icmp_protocol:

    /*
     * struct net_protocol for ICMP, added by inet_init() under IPPROTO_ICMP:
     * icmp_rcv handles incoming packets and icmp_err handles errors.
     */
    static const struct net_protocol icmp_protocol = {
            .handler =      icmp_rcv,
            .err_handler =  icmp_err,
            .no_policy =    1,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5

      inetsw_array:

    /*
     * Built-in (socket type, protocol) combinations for AF_INET.  inet_init()
     * walks this array and hands every entry to inet_register_protosw().
     * Each entry binds a socket type and IP protocol number to the struct
     * proto (.prot) and the socket-level ops (.ops) that implement it.
     */
    static struct inet_protosw inetsw_array[] =
    {
            /* TCP stream sockets. */
            {
                    .type =       SOCK_STREAM,
                    .protocol =   IPPROTO_TCP,
                    .prot =       &tcp_prot,
                    .ops =        &inet_stream_ops,
                    .flags =      INET_PROTOSW_PERMANENT |
                                  INET_PROTOSW_ICSK,
            },
    
            /* UDP datagram sockets. */
            {
                    .type =       SOCK_DGRAM,
                    .protocol =   IPPROTO_UDP,
                    .prot =       &udp_prot,
                    .ops =        &inet_dgram_ops,
                    .flags =      INET_PROTOSW_PERMANENT,
           },
    
                   /* ICMP datagram ("ping") sockets. */
                   {
                    .type =       SOCK_DGRAM,
                    .protocol =   IPPROTO_ICMP,
                    .prot =       &ping_prot,
                    .ops =        &inet_sockraw_ops,
                    .flags =      INET_PROTOSW_REUSE,
           },
    
           /* Raw sockets; IPPROTO_IP acts as a wild card here. */
           {
                   .type =       SOCK_RAW,
                   .protocol =   IPPROTO_IP,        /* wild card */
                   .prot =       &raw_prot,
                   .ops =        &inet_sockraw_ops,
                   .flags =      INET_PROTOSW_REUSE,
           }
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35

      arp_tbl:

    /*
     * Neighbour table for ARP (family AF_INET, protocol ETH_P_IP, 4-byte
     * keys).  Holds the ARP-specific callbacks, the default per-neighbour
     * parameters (.parms) and the garbage-collection settings.  Values
     * multiplied by HZ are time intervals expressed in jiffies.
     */
    struct neigh_table arp_tbl = {
            .family         = AF_INET,
            .key_len        = 4,
            .protocol       = cpu_to_be16(ETH_P_IP),
            .hash           = arp_hash,
            .key_eq         = arp_key_eq,
            .constructor    = arp_constructor,
            .proxy_redo     = parp_redo,
            .is_multicast   = arp_is_multicast,
            .id             = "arp_cache",
            .parms          = {
                    /* Back-pointer from the default parameters to this table. */
                    .tbl                    = &arp_tbl,
                    .reachable_time         = 30 * HZ,
                    /* Defaults, indexed by NEIGH_VAR_*. */
                    .data   = {
                            [NEIGH_VAR_MCAST_PROBES] = 3,
                            [NEIGH_VAR_UCAST_PROBES] = 3,
                            [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
                            [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
                            [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
                            [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
                            [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
                            [NEIGH_VAR_PROXY_QLEN] = 64,
                            [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
                            [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
                            [NEIGH_VAR_LOCKTIME] = 1 * HZ,
                    },
            },
            /* Garbage-collection interval and the three size thresholds. */
            .gc_interval    = 30 * HZ,
            .gc_thresh1     = 128,
            .gc_thresh2     = 512,
            .gc_thresh3     = 1024,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32

      arp_packet_type

    /*
     * Packet-type handler for ARP: frames of type ETH_P_ARP are delivered to
     * arp_rcv().
     */
    static struct packet_type arp_packet_type __read_mostly = {
            .type = cpu_to_be16(ETH_P_ARP),
            .func = arp_rcv,
            /* Receives ARP packets coming up from the device layer. */
    };
    #define ETH_P_ARP       0x0806 /* Address Resolution packet */
    
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7

      arp_net_ops

    /* Per-network-namespace hooks for ARP: arp_net_init on setup, arp_net_exit on teardown. */
    static struct pernet_operations arp_net_ops = {
            .init = arp_net_init,
            .exit = arp_net_exit,
    };
    
    • 1
    • 2
    • 3
    • 4

      neigh_sysctl_template

    /*
     * Template for the neighbour sysctl entries: a ctl_table_header pointer
     * plus one ctl_table slot per NEIGH_VAR_* (and a terminating empty slot).
     * The NEIGH_SYSCTL_*_ENTRY macros fill in the per-neighbour variables; the
     * *_REUSED_ENTRY variants expose an existing variable under a second name
     * or unit.  The gc_* entries are spelled out by hand and carry no .data
     * pointer in this template.
     */
    static struct neigh_sysctl_table {
            struct ctl_table_header *sysctl_header;
            struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
    } neigh_sysctl_template __read_mostly = {
            .neigh_vars = {
                    NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
                    NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
                    NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
                    NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"),
                    NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
                    NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
                    NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
                    NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
                    NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
                    NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
                    NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
                    NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
                    NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
                    NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
                    NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
                    NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
                    [NEIGH_VAR_GC_INTERVAL] = {
                            .procname       = "gc_interval",
                            .maxlen         = sizeof(int),
                            .mode           = 0644,
                            .proc_handler   = proc_dointvec_jiffies,
                    },
                    /* The three thresholds are range-checked to [0, INT_MAX]. */
                    [NEIGH_VAR_GC_THRESH1] = {
                            .procname       = "gc_thresh1",
                            .maxlen         = sizeof(int),
                            .mode           = 0644,
                            .extra1         = SYSCTL_ZERO,
                            .extra2         = SYSCTL_INT_MAX,
                            .proc_handler   = proc_dointvec_minmax,
                            },
                    [NEIGH_VAR_GC_THRESH2] = {
                            .procname       = "gc_thresh2",
                            .maxlen         = sizeof(int),
                            .mode           = 0644,
                            .extra1         = SYSCTL_ZERO,
                            .extra2         = SYSCTL_INT_MAX,
                            .proc_handler   = proc_dointvec_minmax,
                    },
                    [NEIGH_VAR_GC_THRESH3] = {
                            .procname       = "gc_thresh3",
                            .maxlen         = sizeof(int),
                            .mode           = 0644,
                            .extra1         = SYSCTL_ZERO,
                            .extra2         = SYSCTL_INT_MAX,
                            .proc_handler   = proc_dointvec_minmax,
                    },
                    {},
            },
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54

      arp_netdev_notifier

    /* Notifier block whose callback, arp_netdev_event, handles events for ARP. */
    static struct notifier_block arp_netdev_notifier = {
            .notifier_call = arp_netdev_event,
    };
    
    • 1
    • 2
    • 3

      ipv4_dst_ops 路由函数操作结构

    /*
     * Destination-cache (dst) operations for regular IPv4 routes: validity
     * check, MSS/MTU queries, metrics copy-on-write, destruction, PMTU and
     * redirect handling, local output and neighbour lookup/confirmation.
     */
    static struct dst_ops ipv4_dst_ops = {
            .family =               AF_INET,
            .check =                ipv4_dst_check,
            .default_advmss =       ipv4_default_advmss,
            .mtu =                  ipv4_mtu,
            .cow_metrics =          ipv4_cow_metrics,
            .destroy =              ipv4_dst_destroy,
            .negative_advice =      ipv4_negative_advice,
            .link_failure =         ipv4_link_failure,
            .update_pmtu =          ip_rt_update_pmtu,
            .redirect =             ip_do_redirect,
            .local_out =            __ip_local_out,
            .neigh_lookup =         ipv4_neigh_lookup,
            .confirm_neigh =        ipv4_confirm_neigh,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

      ipv4_dst_blackhole_ops 黑洞(blackhole)路由使用的dst操作结构

    /*
     * dst operations for IPv4 blackhole routes.  MSS and neighbour lookup
     * reuse the regular IPv4 helpers; every other hook is the generic
     * dst_blackhole_* implementation.
     */
    static struct dst_ops ipv4_dst_blackhole_ops = {
            .family                 = AF_INET,
            .default_advmss         = ipv4_default_advmss,
            .neigh_lookup           = ipv4_neigh_lookup,
            .check                  = dst_blackhole_check,
            .cow_metrics            = dst_blackhole_cow_metrics,
            .update_pmtu            = dst_blackhole_update_pmtu,
            .redirect               = dst_blackhole_redirect,
            .mtu                    = dst_blackhole_mtu,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10

      devinet_ops

    /* Per-network-namespace hooks for devinet: devinet_init_net / devinet_exit_net. */
    static __net_initdata struct pernet_operations devinet_ops = {
            .init = devinet_init_net,
            .exit = devinet_exit_net,
    };
    
    • 1
    • 2
    • 3
    • 4

      ip_netdev_notifier

    /* Notifier block whose callback is inetdev_event. */
    static struct notifier_block ip_netdev_notifier = {
            .notifier_call = inetdev_event,
    };
    
    • 1
    • 2
    • 3

      inet_af_ops

    /*
     * rtnetlink address-family operations for AF_INET: fill, size, validate
     * and set the per-link AF-specific attributes.
     */
    static struct rtnl_af_ops inet_af_ops __read_mostly = {
            .family           = AF_INET,
            .fill_link_af     = inet_fill_link_af,
            .get_link_af_size = inet_get_link_af_size,
            .validate_link_af = inet_validate_link_af,
            .set_link_af      = inet_set_link_af,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7

      fib_net_ops

    /*
     * Per-network-namespace hooks for the FIB: init, exit, and a batched exit
     * variant (fib_net_exit_batch).
     */
    static struct pernet_operations fib_net_ops = {
            .init = fib_net_init,
            .exit = fib_net_exit,
            .exit_batch = fib_net_exit_batch,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5

      fib_netdev_notifier

    /* Notifier block whose callback is fib_netdev_event. */
    static struct notifier_block fib_netdev_notifier = {
            .notifier_call = fib_netdev_event,
    };
    
    • 1
    • 2
    • 3

      fib_inetaddr_notifier

    /* Notifier block whose callback is fib_inetaddr_event. */
    static struct notifier_block fib_inetaddr_notifier = {
            .notifier_call = fib_inetaddr_event,
    };
    
    • 1
    • 2
    • 3

      inetaddr_chain 网络地址通知链

    /* Head of the blocking notifier chain named inetaddr_chain (file-local). */
    static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
    
    • 1

      xfrm_net_ops

    /* Per-network-namespace hooks for xfrm: xfrm_net_init / xfrm_net_exit. */
    static struct pernet_operations __net_initdata xfrm_net_ops = {
            .init = xfrm_net_init,
            .exit = xfrm_net_exit,
    };
    
    • 1
    • 2
    • 3
    • 4

      xfrm_dev_notifier transform(转换)设备通知

    /* Notifier block whose callback is xfrm_dev_event. */
    static struct notifier_block xfrm_dev_notifier = {
            .notifier_call  = xfrm_dev_event,
    };
    
    • 1
    • 2
    • 3

      xfrm4_state_afinfo

    /*
     * IPv4 (AF_INET) address-family hooks for xfrm state handling: output,
     * transport-mode completion and local error reporting, with IPPROTO_IPIP
     * as the protocol.
     */
    static struct xfrm_state_afinfo xfrm4_state_afinfo = {
            .family                 = AF_INET,
            .proto                  = IPPROTO_IPIP,
            .output                 = xfrm4_output,
            .transport_finish       = xfrm4_transport_finish,
            .local_error            = xfrm4_local_error,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7

      igmp_net_ops

    /* Per-network-namespace hooks for IGMP: igmp_net_init / igmp_net_exit. */
    static struct pernet_operations igmp_net_ops = {
            .init = igmp_net_init,
            .exit = igmp_net_exit,
    };
    
    • 1
    • 2
    • 3
    • 4

      igmp_notifier

    /* Notifier block whose callback is igmp_netdev_event. */
    static struct notifier_block igmp_notifier = {
            .notifier_call = igmp_netdev_event,
    };
    
    • 1
    • 2
    • 3

      ipv4_mib_ops

    /* Per-network-namespace hooks for the IPv4 MIBs: ipv4_mib_init_net / ipv4_mib_exit_net. */
    static __net_initdata struct pernet_operations ipv4_mib_ops = {
            .init = ipv4_mib_init_net,
            .exit = ipv4_mib_exit_net,
    };
    
    • 1
    • 2
    • 3
    • 4

      udp_reg_info

    /*
     * BPF iterator registration record for the "udp" target.  It declares one
     * context argument, the udp_sk field of struct bpf_iter__udp, typed
     * PTR_TO_BTF_ID_OR_NULL, and points at udp_seq_info for the seq_file side.
     */
    static struct bpf_iter_reg udp_reg_info = {
            .target                 = "udp",
            .ctx_arg_info_size      = 1,
            .ctx_arg_info           = {
                    { offsetof(struct bpf_iter__udp, udp_sk),
                      PTR_TO_BTF_ID_OR_NULL },
            },
            .seq_info               = &udp_seq_info,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

      udplite_prot

    /*
     * struct proto for UDP-Lite.  Nearly every operation is the shared UDP
     * implementation (udp_* / udp_lib_*); the UDP-Lite specific parts are the
     * name, the init hook (udplite_sk_init) and the separate hash table
     * (udplite_table).  Memory accounting is shared with UDP.
     */
    struct proto    udplite_prot = {
            .name              = "UDP-Lite",
            .owner             = THIS_MODULE,
            .close             = udp_lib_close,
            .connect           = ip4_datagram_connect,
            .disconnect        = udp_disconnect,
            .ioctl             = udp_ioctl,
            .init              = udplite_sk_init,
            .destroy           = udp_destroy_sock,
            .setsockopt        = udp_setsockopt,
            .getsockopt        = udp_getsockopt,
            .sendmsg           = udp_sendmsg,
            .recvmsg           = udp_recvmsg,
            .sendpage          = udp_sendpage,
            .hash              = udp_lib_hash,
            .unhash            = udp_lib_unhash,
            .rehash            = udp_v4_rehash,
            .get_port          = udp_v4_get_port,
    
            /* Accounting counters shared with UDP. */
            .memory_allocated  = &udp_memory_allocated,
            .per_cpu_fw_alloc  = &udp_memory_per_cpu_fw_alloc,
    
            .sysctl_mem        = sysctl_udp_mem,
            .obj_size          = sizeof(struct udp_sock),
            .h.udp_table       = &udplite_table,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26

      af_inet_ops

    /* Per-network-namespace hooks for AF_INET: only an init hook (inet_init_net), no exit. */
    static __net_initdata struct pernet_operations af_inet_ops = {
            .init = inet_init_net,
    };
    
    • 1
    • 2
    • 3

      ip_packet_type

    /*
     * Packet-type handler for IPv4, installed by inet_init() through
     * dev_add_pack(): frames of type ETH_P_IP go to ip_rcv(), or to
     * ip_list_rcv() when delivered as a list.
     */
    static struct packet_type ip_packet_type __read_mostly = {
            .type = cpu_to_be16(ETH_P_IP),
            .func = ip_rcv,
            .list_func = ip_list_rcv,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5

      ip_tun_lwt_ops

    /*
     * Lightweight-tunnel encapsulation operations for IPv4 tunnels: build and
     * destroy the encap state, dump it (fill_encap / get_encap_size) and
     * compare two encaps.
     */
    static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
            .build_state = ip_tun_build_state,
            .destroy_state = ip_tun_destroy_state,
            .fill_encap = ip_tun_fill_encap_info,
            .get_encap_size = ip_tun_encap_nlsize,
            .cmp_encap = ip_tun_cmp_encap,
            .owner = THIS_MODULE,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8

      ip6_tun_lwt_ops

    /*
     * Lightweight-tunnel encapsulation operations for IPv6 tunnels.  Compared
     * with ip_tun_lwt_ops there is no .destroy_state hook, and .cmp_encap
     * reuses the IPv4 comparison helper ip_tun_cmp_encap.
     */
    static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
            .build_state = ip6_tun_build_state,
            .fill_encap = ip6_tun_fill_encap_info,
            .get_encap_size = ip6_tun_encap_nlsize,
            .cmp_encap = ip_tun_cmp_encap,
            .owner = THIS_MODULE,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7

      tcp_metrics_nl_policy

    /*
     * Netlink attribute policy for TCP metrics requests, indexed by
     * TCP_METRICS_ATTR_*.  Only the two address attributes are active: an
     * IPv4 address as a u32 and an IPv6 address as a binary blob of
     * sizeof(struct in6_addr) bytes.  The entries under "#if 0" are compiled
     * out and kept purely as documentation.
     */
    static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
            [TCP_METRICS_ATTR_ADDR_IPV4]    = { .type = NLA_U32, },
            [TCP_METRICS_ATTR_ADDR_IPV6]    = { .type = NLA_BINARY,
                                                .len = sizeof(struct in6_addr), },
            /* Following attributes are not received for GET/DEL,
             * we keep them for reference
             */
    #if 0
            [TCP_METRICS_ATTR_AGE]          = { .type = NLA_MSECS, },
            [TCP_METRICS_ATTR_TW_TSVAL]     = { .type = NLA_U32, },
            [TCP_METRICS_ATTR_TW_TS_STAMP]  = { .type = NLA_S32, },
            [TCP_METRICS_ATTR_VALS]         = { .type = NLA_NESTED, },
            [TCP_METRICS_ATTR_FOPEN_MSS]    = { .type = NLA_U16, },
            [TCP_METRICS_ATTR_FOPEN_SYN_DROPS]      = { .type = NLA_U16, },
            [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]    = { .type = NLA_MSECS, },
            [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
                                                .len = TCP_FASTOPEN_COOKIE_MAX, },
    #endif
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19

      ipv4_specific

    /*
     * Address-family specific operations used by connection-oriented sockets
     * over IPv4: transmit, checksum, header rebuild, connection request handling,
     * socket options and sockaddr conversion. The header and sockaddr sizes are
     * those of struct iphdr and struct sockaddr_in.
     */
    const struct inet_connection_sock_af_ops ipv4_specific = {
            .queue_xmit        = ip_queue_xmit,
            .send_check        = tcp_v4_send_check,
            .rebuild_header    = inet_sk_rebuild_header,
            .sk_rx_dst_set     = inet_sk_rx_dst_set,
            .conn_request      = tcp_v4_conn_request,
            .syn_recv_sock     = tcp_v4_syn_recv_sock,
            .net_header_len    = sizeof(struct iphdr),
            .setsockopt        = ip_setsockopt,
            .getsockopt        = ip_getsockopt,
            .addr2sockaddr     = inet_csk_addr2sockaddr,
            .sockaddr_len      = sizeof(struct sockaddr_in),
            .mtu_reduced       = tcp_v4_mtu_reduced,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14

    3. 部分结构定义

      possible_net_t

    /*
     * Holder for a network-namespace pointer. The 'net' member only exists
     * when CONFIG_NET_NS is enabled; otherwise the struct has no members.
     */
    typedef struct {
    #ifdef CONFIG_NET_NS
    	struct net *net;
    #endif
    } possible_net_t;
    
    • 1
    • 2
    • 3
    • 4
    • 5

      neigh_table

    /*
     * Neighbour table, one per neighbour protocol (arp_tbl is the instance that
     * arp_init passes to neigh_table_init). Groups:
     *   - identity/sizing: family, entry_size, key_len, protocol, id
     *   - per-protocol callbacks: hash, key_eq, constructor, pconstructor, ...
     *   - parameters: parms (default parameters) and parms_list
     *   - garbage-collection settings: gc_interval, gc_thresh1..3
     *   - deferred work and timers: gc_work, managed_work, proxy_timer
     *   - bookkeeping: entries/gc_entries counters, gc_list, managed_list, lock
     *   - storage: per-CPU stats, RCU-protected hash table nht, phash_buckets
     */
    struct neigh_table {
    	int			family;
    	unsigned int		entry_size;
    	unsigned int		key_len;
    	__be16			protocol;
    	__u32			(*hash)(const void *pkey,
    					const struct net_device *dev,
    					__u32 *hash_rnd);
    	bool			(*key_eq)(const struct neighbour *, const void *pkey);
    	int			(*constructor)(struct neighbour *);
    	int			(*pconstructor)(struct pneigh_entry *);
    	void			(*pdestructor)(struct pneigh_entry *);
    	void			(*proxy_redo)(struct sk_buff *skb);
    	int			(*is_multicast)(const void *pkey);
    	bool			(*allow_add)(const struct net_device *dev,
    					     struct netlink_ext_ack *extack);
    	char			*id;
    	struct neigh_parms	parms;
    	struct list_head	parms_list;
    	int			gc_interval;
    	int			gc_thresh1;
    	int			gc_thresh2;
    	int			gc_thresh3;
    	unsigned long		last_flush;
    	struct delayed_work	gc_work;
    	struct delayed_work	managed_work;
    	struct timer_list 	proxy_timer;
    	struct sk_buff_head	proxy_queue;
    	atomic_t		entries;
    	atomic_t		gc_entries;
    	struct list_head	gc_list;
    	struct list_head	managed_list;
    	rwlock_t		lock;
    	unsigned long		last_rand;
    	struct neigh_statistics	__percpu *stats;
    	struct neigh_hash_table __rcu *nht;
    	struct pneigh_entry	**phash_buckets;
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38

      ip_rt_acct

    /* Per-CPU pointer to the IP route accounting data; only built with CONFIG_IP_ROUTE_CLASSID. */
    #ifdef CONFIG_IP_ROUTE_CLASSID
    struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
    #endif /* CONFIG_IP_ROUTE_CLASSID */
    
    • 1
    • 2
    • 3

      udp_table

    /*
     * UDP socket hash table: two arrays of udp_hslot (hash and hash2) plus the
     * table's mask and log values.
     * NOTE(review): presumably mask = size - 1 and log = log2(size) — confirm.
     */
    struct udp_table {
            struct udp_hslot        *hash;
            struct udp_hslot        *hash2;
            unsigned int            mask;
            unsigned int            log;
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6

      lwtunnel_encap_types ip隧道类型

    /*
     * Lightweight-tunnel encapsulation types. Values are assigned implicitly
     * starting at 0 (LWTUNNEL_ENCAP_NONE); __LWTUNNEL_ENCAP_MAX is the
     * trailing sentinel, one past the last real type.
     */
    enum lwtunnel_encap_types {
            LWTUNNEL_ENCAP_NONE,
            LWTUNNEL_ENCAP_MPLS,
            LWTUNNEL_ENCAP_IP,
            LWTUNNEL_ENCAP_ILA,
            LWTUNNEL_ENCAP_IP6,
            LWTUNNEL_ENCAP_SEG6,
            LWTUNNEL_ENCAP_BPF,
            LWTUNNEL_ENCAP_SEG6_LOCAL,
            LWTUNNEL_ENCAP_RPL,
            LWTUNNEL_ENCAP_IOAM6,
            __LWTUNNEL_ENCAP_MAX,
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13

    4. 扩展函数

    sock_register

      sock_register函数用于添加套接字协议到协议族结构中

    /* Protocol-family descriptor for PF_INET; sockets of this family are created by inet_create. */
    static const struct net_proto_family inet_family_ops = {
            .family = PF_INET, // protocol family number
            .create = inet_create,
            .owner  = THIS_MODULE,
    };
    
    /*
     * Register a protocol family in the net_families[] table.
     *
     * Returns -ENOBUFS if ops->family is out of range (>= NPROTO),
     * -EEXIST if the slot for that family is already occupied, 0 on success.
     * The slot is checked and written while holding net_family_lock.
     */
    int sock_register(const struct net_proto_family *ops)
    {
            int err;
    
            if (ops->family >= NPROTO) {
            // #define NPROTO          AF_MAX
            // #define AF_MAX          46
    
                    pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
                    return -ENOBUFS;
            }
    
            spin_lock(&net_family_lock);
            if (rcu_dereference_protected(net_families[ops->family],
                                          lockdep_is_held(&net_family_lock))) // slot is non-NULL: this family is already registered, so do not overwrite it
            // PF_INET 2 /* Internet IP Protocol */
    
                    err = -EEXIST;
            else {  
                    rcu_assign_pointer(net_families[ops->family], ops); // publish the ops pointer into the table
                    err = 0;
            }
            spin_unlock(&net_family_lock);
    
            // NOTE: this message is printed on the -EEXIST path as well
            pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]);
            return err;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33

      内核中定义了网络协议族指针数组net_families(static const struct net_proto_family __rcu *net_families[NPROTO]),数组成员数量为NPROTO(即AF_MAX,46),可以容纳目前所有的网络协议族。

      inet_create分析:


      ip_static_sysctl_init

      register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
      // struct net init_net;
      
      ||
      \/
      /*
       * Register a sysctl table under 'path' for the given network namespace.
       * For any namespace other than init_net the table is first passed through
       * ensure_safe_net_sysctl(); the result of __register_sysctl_table() is returned.
       */
      struct ctl_table_header *register_net_sysctl(struct net *net,
              const char *path, struct ctl_table *table)
      {
              if (!net_eq(net, &init_net)) // in the call shown above net == &init_net, so this branch is skipped
                      /* Verify the sysctls are safe for a non-init netns, i.e. each entry either
                      1) is read-only, or
                      2) has a data pointer that points outside the global kernel/module data segment,
                      into the heap where the per-net object was allocated */
                      ensure_safe_net_sysctl(net, path, table); // NOTE(review): presumably forces unsafe entries read-only — confirm in net/sysctl_net.c
              
              return __register_sysctl_table(&net->sysctls, path, table); // per the author's notes: adds the ctl_table to a ctl_table_header and inserts it into the ctl_dir (red-black) tree
      }
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17

      arp_init

      /* ARP initialisation (first part; the function continues after neigh_table_init is discussed). */
      void __init arp_init(void)
      {
              neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); // initialise the ARP neighbour table and store it in the neigh_tables[] array
              // also creates a proc entry named tbl->id backed by neigh_stat_seq_ops (under init_net.proc_net_stat)
              // NEIGH_ARP_TABLE = 0
      
      • 1
      • 2
      • 3
      • 4
      • 5

        neigh_table_init分析,继续看arp_init:

      dev_add_pack(&arp_packet_type);
      // register the ARP packet type
      // so ARP packets are received from the device layer
      
      register_pernet_subsys(&arp_net_ops);
      // register a network-namespace subsystem:
      // a subsystem with init and exit functions that are called
      // when network namespaces are created and destroyed respectively.
      // On registration, the init function is called
      // for every existing network namespace,
      // giving kernel modules a race-free view of the set of network namespaces
      // first_device <- ops
       
      #ifdef CONFIG_SYSCTL
              neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
              // register the neighbour sysctl table
              // (adds a ctl_table to a ctl_table_header)
      #endif
              register_netdevice_notifier(&arp_netdev_notifier); // register the netdevice notifier block
      }
      
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
      • 21

      neigh_table_init

      /*
       * Initialise neighbour table 'tbl' and store it at neigh_tables[index].
       * Sets up the list heads, default parameters, per-CPU statistics, the proc
       * entry, the hash table, the deferred GC/managed work items and the proxy
       * timer and queue. The "..." line marks code elided by the article's author.
       */
      void neigh_table_init(int index, struct neigh_table *tbl)
      {
              unsigned long now = jiffies;
              unsigned long phsize;
      
              INIT_LIST_HEAD(&tbl->parms_list);
              INIT_LIST_HEAD(&tbl->gc_list);
              INIT_LIST_HEAD(&tbl->managed_list);
      
              list_add(&tbl->parms.list, &tbl->parms_list);
              // link the table's default parameters into its parms_list
              write_pnet(&tbl->parms.net, &init_net);
              // default parameters belong to init_net (->net = &init_net)
              refcount_set(&tbl->parms.refcnt, 1);
              // reference count starts at 1
      
              /* It is a random distribution in the interval (1/2)*base...(3/2)*base.
               It corresponds to the default IPv6 settings and is not overridable,
               because it is a really reasonable choice */
              tbl->parms.reachable_time =
                                neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));             
      
              tbl->stats = alloc_percpu(struct neigh_statistics); // allocate the per-CPU statistics
      
      #ifdef CONFIG_PROC_FS
              if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat,
                                    &neigh_stat_seq_ops, tbl))
              // create a proc entry named tbl->id under init_net.proc_net_stat, backed by neigh_stat_seq_ops
                      panic("cannot create neighbour proc dir entry"); 
      #endif
      
              RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); // allocate the initial hash table (NOTE(review): the argument is presumably a shift, i.e. 8 buckets — confirm)
      
              ...
              INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
              queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
                              tbl->parms.reachable_time);
              // gc_work runs neigh_periodic_work; first run is delayed by reachable_time
      
              INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work);
              queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0);
              // managed_work runs neigh_managed_work; queued with zero delay
      
              timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0);
              // proxy timer; its callback is neigh_proxy_process
              skb_queue_head_init_class(&tbl->proxy_queue,
                              &neigh_table_proxy_queue_class);
              // initialise the proxy skb queue with its own lockdep class
      
              tbl->last_flush = now; // record the time of the last flush
              tbl->last_rand  = now + tbl->parms.reachable_time * 20;
              // last_rand starts 20 * reachable_time in the future
              // NOTE(review): presumably controls when reachable_time is next re-randomised, not a gc_work run time — confirm in neigh_periodic_work
      
              neigh_tables[index] = tbl; // store the table in the neighbour-table array
      }
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
      • 21
      • 22
      • 23
      • 24
      • 25
      • 26
      • 27
      • 28
      • 29
      • 30
      • 31
      • 32
      • 33
      • 34
      • 35
      • 36
      • 37
      • 38
      • 39
      • 40
      • 41
      • 42
      • 43
      • 44
      • 45
      • 46
      • 47
      • 48
      • 49
      • 50
      • 51
      • 52
      • 53
      • 54
      • 55

      ip_init

      /* IP layer initialisation (first part; continues after ip_rt_init is discussed). */
      void __init ip_init(void)
      {               
              ip_rt_init(); // initialise the IPv4 routing subsystem ("rt" stands for route, not runtime)
              // allocates the large "IP idents" hash table and
              // fills the whole ip_idents array with random bytes
      
      • 1
      • 2
      • 3
      • 4
      • 5

        ip_rt_init分析,继续看ip_init:

      inet_initpeers(); // initialise the inet peer subsystem
      // per the author's notes: computes the peer memory limit
      // and creates the inet_peer_cache cache
      
      #if defined(CONFIG_IP_MULTICAST)
              igmp_mc_init(); // initialise IGMP (Internet Group Management Protocol)
              // per the author's notes: registers the pernet subsystem igmp_net_ops
              // and the notifier block igmp_notifier
      #endif 
      }
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10

      ip_rt_init

      /*
       * Initialise the IPv4 routing subsystem: the IP ident/timestamp hash,
       * the per-CPU uncached route lists, optional route accounting, the dst
       * slab cache and counters, then devinet, FIB, XFRM, the RTM_GETROUTE
       * handler and the routing-related pernet subsystems. Always returns 0;
       * the allocation failures checked here panic instead.
       */
      int __init ip_rt_init(void)
      {       
              void *idents_hash;
              int cpu;        
              
              /* Allocate the large system hash used for IP ident generation;
                 each bucket holds one ip_idents entry plus one ip_tstamps entry */
              idents_hash = alloc_large_system_hash("IP idents",
               sizeof(*ip_idents) + sizeof(*ip_tstamps),         
                                                    0,
                                                    16, /* one bucket per 64 KB */
                                                    HASH_ZERO,
                                                    NULL,
                                                    &ip_idents_mask,
                                                    2048,
                                                    256*1024);
              
              ip_idents = idents_hash;
      
              prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
              // fill the whole ip_idents array ((ip_idents_mask + 1) entries) with random bytes
      
              ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
              // ip_tstamps starts right after the ip_idents array in the same allocation
      
              for_each_possible_cpu(cpu) {
                      struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
                      // take this CPU's rt_uncached_list (nothing is allocated here) and initialise it
      
                      INIT_LIST_HEAD(&ul->head);
                      INIT_LIST_HEAD(&ul->quarantine);
                      spin_lock_init(&ul->lock);
              }
      
      #ifdef CONFIG_IP_ROUTE_CLASSID
              ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
              // route accounting: per-CPU array of 256 struct ip_rt_acct, aligned for that struct
              if (!ip_rt_acct)
                      panic("IP: failed to allocate ip_rt_acct\n");
      #endif
      
              ipv4_dst_ops.kmem_cachep =
                      kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
              // slab cache for struct rtable objects
      
              ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
      
              if (dst_entries_init(&ipv4_dst_ops) < 0)
              // initialise the dst entries counter
                      panic("IP: failed to allocate ipv4_dst_ops counter\n");
      
              if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                      panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
      
              ipv4_dst_ops.gc_thresh = ~0;
              ip_rt_max_size = INT_MAX;
      
              devinet_init();
              // Per the author's notes on devinet_init (body not shown here):
              // registers devinet_ops
              // registers the netdevice notifier block ip_netdev_notifier
              // queues work that checks lifetimes (NOTE(review): the original note is vague — confirm)
              // registers the rtnetlink (routing netlink, used for kernel <-> subsystem communication) structure rtnl_af_ops
              // registers rtnetlink message RTM_NEWADDR -> handler inet_rtm_newaddr
              // RTM_DELADDR -> handler inet_rtm_deladdr
              // RTM_GETADDR -> dump function inet_dump_ifaddr
              // RTM_GETNETCONF -> handler inet_netconf_get_devconf, dump function inet_netconf_dump_devconf
      
              ip_fib_init();
              // Per the author's notes on ip_fib_init (body not shown here):
              // creates the ip_fib_alias and ip_fib_trie caches
              // registers the pernet subsystem fib_net_ops
              // registers the netdevice notifier block fib_netdev_notifier
              // registers fib_inetaddr_notifier on the inetaddr notifier chain
              // RTM_NEWROUTE -> handler inet_rtm_newroute
              // RTM_DELROUTE -> handler inet_rtm_delroute
              // RTM_GETROUTE -> dump function inet_dump_fib
      
      #ifdef CONFIG_XFRM
              xfrm_init();
              // Per the author's notes (body not shown here):
              // registers the pernet subsystem xfrm_net_ops
              // registers the device notifier block xfrm_dev_notifier
              // for each CPU's xfrm_trans_tasklet: initialises its queue and binds the tasklet function
      
              xfrm4_init();  
              // initialise the xfrm4 module
      #endif
      
              rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                            RTNL_FLAG_DOIT_UNLOCKED);
              // RTM_GETROUTE -> handler inet_rtm_getroute
      
      #ifdef CONFIG_SYSCTL
              register_pernet_subsys(&sysctl_route_ops);
      #endif
              register_pernet_subsys(&ip_rt_ops);
              register_pernet_subsys(&rt_genid_ops);
              register_pernet_subsys(&ipv4_inetpeer_ops);
              return 0;
      }
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
      • 21
      • 22
      • 23
      • 24
      • 25
      • 26
      • 27
      • 28
      • 29
      • 30
      • 31
      • 32
      • 33
      • 34
      • 35
      • 36
      • 37
      • 38
      • 39
      • 40
      • 41
      • 42
      • 43
      • 44
      • 45
      • 46
      • 47
      • 48
      • 49
      • 50
      • 51
      • 52
      • 53
      • 54
      • 55
      • 56
      • 57
      • 58
      • 59
      • 60
      • 61
      • 62
      • 63
      • 64
      • 65
      • 66
      • 67
      • 68
      • 69
      • 70
      • 71
      • 72
      • 73
      • 74
      • 75
      • 76
      • 77
      • 78
      • 79
      • 80
      • 81
      • 82
      • 83
      • 84
      • 85
      • 86
      • 87
      • 88
      • 89
      • 90
      • 91
      • 92
      • 93
      • 94
      • 95
      • 96
      • 97

      tcp_init

      /*
       * TCP initialisation: compile-time sanity checks, socket counter and orphan
       * timer, hash tables and slab caches, memory limits and default buffer
       * sizes for init_net, then the IPv4, metrics, congestion-control, tasklet
       * and MPTCP sub-initialisers. The "..." lines mark code elided by the
       * article's author.
       */
      void __init tcp_init(void)
      {
              int max_rshare, max_wshare, cnt;
              unsigned long limit;
              unsigned int i;
      
              BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
              // #define TCP_MIN_SND_MSS         48
              // #define MAX_TCP_OPTION_SPACE 40
      
              BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
                           sizeof_field(struct sk_buff, cb));
      
              percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); // initialise the per-CPU counter of allocated TCP sockets to 0
      
              timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
              // per the author's notes: periodically updates the count of orphaned TCP sockets (not attached to any process);
              // each orphan can consume up to 64K of unswappable memory
              
              inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
                                  thash_entries, 21,  /* one slot per 2 MB*/
                                  0, 64 * 1024);
              // allocate the "tcp_listen_portaddr_hash" table in tcp_hashinfo
      
              tcp_hashinfo.bind_bucket_cachep =
                      kmem_cache_create("tcp_bind_bucket",
                                        sizeof(struct inet_bind_bucket), 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                        SLAB_ACCOUNT,
                                        NULL);
              // slab cache for struct inet_bind_bucket
      
              ...
              tcp_init_mem(); // set the TCP memory limits
              // per the author's notes (tcp_init_mem body not shown):
              // sysctl_tcp_mem[0]: TCP memory considered normal below this value
              // sysctl_tcp_mem[1]: memory-pressure threshold
              // sysctl_tcp_mem[2]: hard limit; allocation is refused above it
      
              /* Set per-socket limits to no more than 1/128 the pressure threshold */
              limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
              max_wshare = min(4UL*1024*1024, limit);
              max_rshare = min(6UL*1024*1024, limit);
      
              init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
              init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
              init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
              // TCP write-buffer sizes for init_net
      
              init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
              init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
              init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
              // TCP read-buffer sizes for init_net
      
              ...
              tcp_v4_init(); // per the author's notes: creates a TCP-over-IPv4 socket for each CPU
      
              tcp_metrics_init(); // per the author's notes: sets up the TCP metrics subsystem and registers its netlink family
      
              /* Attach the new congestion control algorithm to the list of available options */
              BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
      
              tcp_tasklet_init(); // per the author's notes: sets up the TCP tasklet for each CPU
      
              mptcp_init(); // initialise MPTCP
              // MPTCP = MultiPath TCP; it lets a TCP connection use multiple paths to make the most of the available links
      }
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
      • 21
      • 22
      • 23
      • 24
      • 25
      • 26
      • 27
      • 28
      • 29
      • 30
      • 31
      • 32
      • 33
      • 34
      • 35
      • 36
      • 37
      • 38
      • 39
      • 40
      • 41
      • 42
      • 43
      • 44
      • 45
      • 46
      • 47
      • 48
      • 49
      • 50
      • 51
      • 52
      • 53
      • 54
      • 55
      • 56
      • 57
      • 58
      • 59
      • 60
      • 61
      • 62
      • 63
      • 64
      • 65
      • 66
    • 相关阅读:
      六氟化硫SF6断路器的运行维护、泄漏处理及气体在线监测
      【C/C++】指针常量、常量指针、指向常量的常指针
      基于Spring Boot垂钓服务系统的设计与实现毕业设计源码071739
      认识和使用容器
      企业网络流量分析监控软件
      【Python】Hook函数相关知识点
      《白话强化学习与python》笔记——第六章深度学习(二)
      muduo网络库编程
      js 监听a标签跳转页面,并携带自定义参数
      基于架构软件设计方法及应用
    • 原文地址:https://blog.csdn.net/a29562268/article/details/126165352