poll/select源码分析

在linux中，很多东西都需要通过fd(file descriptor，文件描述符)来操作，比如字符设备，文件和socket，而在kernel中，都会给fd关联一个结构体 struct file_operations，其提供了很多函数指针用来操作fd对象。
对于poll和select实现来说，它可以poll等待fd的数据，就是因为fd的struct file_operations提供了poll函数指针的实现，比如对于socket来说，提供了sock_poll。


/*
 * File operations table used for socket files (excerpt: "..." marks
 * members omitted by the article). The .poll member, sock_poll, is the
 * hook that poll()/select() call to query a socket fd.
 */
static const struct file_operations socket_file_ops = {
    .owner =    THIS_MODULE,
    ...
    .poll =     sock_poll,
    ...
};
 
/*
 * Condensed call trace of the socket() system call (not compilable C):
 * deeper indentation means "called from the line above". It shows where
 * socket_file_ops gets attached to the file behind the new fd.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
      sock_create(family, type, protocol, &sock);
      sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
           /* body of sock_map_fd: obtain an fd number, build the file, bind them */
           int fd = get_unused_fd_flags(flags);
           newfile = sock_alloc_file(sock, flags, NULL);
                  /* inside sock_alloc_file: the file is created with socket_file_ops */
                  file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops);
           fd_install(fd, newfile);

下面分析下poll和select的源码实现，至于怎么使用它们，网上可以搜到很多例子。

poll

poll系统调用在kernel中的定义，源码在 linux/fs/select.c中


/*
 * poll() system call entry point (excerpt; the tail of the function is
 * omitted by the article). Converts the millisecond timeout into a
 * struct timespec and hands off to do_sys_poll().
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs)
{
    struct timespec end_time, *to = NULL;
    int ret;
    /*
     * Convert timeout_msecs into a struct timespec. A negative timeout
     * leaves `to` as NULL, so do_sys_poll() receives no end time.
     */
    if (timeout_msecs >= 0) {
        to = &end_time;
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }
 
    ret = do_sys_poll(ufds, nfds, to);


/*
 * do_sys_poll - core of poll(): copy the user's pollfd array into the
 * kernel, run do_poll() over it, then copy each revents back to user space.
 *
 * NOTE: this is an annotated excerpt, not compilable C. Type and macro
 * definitions are pasted inline, and local declarations (len, size, err,
 * fdcount), the function braces and the out_fds label are omitted.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec *end_time)
    /* nfds must not exceed the process's RLIMIT_NOFILE limit. */
    if (nfds > rlimit(RLIMIT_NOFILE))
        return -EINVAL;
 
    /*
     * The user's ufds array is copied into kernel memory chunk by chunk;
     * the chunks are linked together as a list of struct poll_list.
     */
    struct poll_list {
        struct poll_list *next;
        int len;
        struct pollfd entries[0];
    };
    struct pollfd {
        int fd;
        short events;
        short revents;
    };
    /* Number of struct pollfd that fit in stack_pps after the poll_list header. */
    #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / sizeof(struct pollfd))
    #define FRONTEND_STACK_ALLOC    256
    #define POLL_STACK_ALLOC    FRONTEND_STACK_ALLOC
    /* Number of struct pollfd that fit in one PAGE_SIZE chunk after the header. */
    #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
 
    /* Allocate small arguments on the stack to save memory and
    be faster - use long to make sure the buffer is aligned properly
    on 64 bit archs to avoid unaligned access */
    /*
     * POLL_STACK_ALLOC is 256 (bytes), so stack_pps has
     * 256 / sizeof(long) elements, i.e. 32 when long is 8 bytes.
     */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
    /* head points at the start of the on-stack buffer. */
    struct poll_list *const head = (struct poll_list *)stack_pps;
    struct poll_list *walk = head;
    unsigned long todo = nfds;
    /* The first chunk holds at most N_STACK_PPS entries. */
    len = min_t(unsigned int, nfds, N_STACK_PPS);
    /*
     * Strategy: the on-stack stack_pps buffer is filled first. If nfds is
     * no larger than N_STACK_PPS that buffer is all that is needed;
     * otherwise further chunks (at most POLLFD_PER_PAGE entries each) are
     * allocated with kmalloc and appended to the poll_list chain.
     */
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len)
            break;
 
        /* nfds - todo is the number of entries already copied. */
        if (copy_from_user(walk->entries, ufds + nfds-todo,
                    sizeof(struct pollfd) * walk->len))
            goto out_fds;
 
        todo -= walk->len;
        if (!todo)
            break;
 
        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }
    /* Initialise the poll_wqueues table; see note "a. poll_initwait" below. */
    struct poll_wqueues table;
    poll_initwait(&table);
 
    /*
     * Call the poll function of every monitored file; see note
     * "b. do_poll" below. fdcount is do_poll()'s return value: the number
     * of fds with events, 0 on timeout, or a negative error.
     */
    fdcount = do_poll(nfds, head, &table, end_time);
 
    /*
     * Per the article: frees the memory allocated while waiting and
     * removes the wait entries from the target files' wait queues
     * (poll_freewait itself is not shown in this excerpt).
     */
    poll_freewait(&table);
 
    /*
     * Copy revents of every monitored fd back to user space, whether or
     * not an event occurred on it.
     */
    for (walk = head; walk; walk = walk->next) { 
                struct pollfd *fds = walk->entries;
        int j;
        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
    }
err = fdcount;
return err;

a. poll_initwait


/*
 * poll_table: handed to each file's poll function.
 *   _qproc - callback invoked by poll_wait() to register on a wait queue;
 *            set to __pollwait by poll_initwait(), NULL means "do not register".
 *   _key   - event mask of interest (~0UL means all events enabled).
 */
typedef struct poll_table_struct {
    poll_queue_proc _qproc;
    unsigned long _key;
} poll_table;
/*
 * poll_wqueues: per-call bookkeeping for one poll()/select() invocation.
 * It embeds the poll_table (pt), records the calling task (polling_task)
 * and holds the wait-queue entries, either inline (inline_entries /
 * inline_index) or through `table`.
 */
struct poll_wqueues {
    poll_table pt;
    struct poll_table_page *table;
    struct task_struct *polling_task;
    int triggered;
    int error;
    int inline_index;
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};
/*
 * poll_initwait - initialise a poll_wqueues before polling starts.
 * It only sets fields up; nothing is added to any wait queue here.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
    /*
     * Install __pollwait as the registration callback: it is what runs
     * when a target file's poll function calls poll_wait().
     */
    init_poll_funcptr(&pwq->pt, __pollwait);
        /* The two lines below are the body of init_poll_funcptr(pt, qproc), shown inline. */
        pt->_qproc = qproc;
        pt->_key   = ~0UL; /* all events enabled */
    /* Remember the calling task so that it can be woken up later. */
    pwq->polling_task = current;
    pwq->triggered = 0;
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}

b. do_poll
遍历所有fd，并调用fd提供的poll函数，如果有事件发生，即do_pollfd返回值非0，则将count加一，
do_poll有三层循环，内两层循环遍历所有fd，如果遍历完所有fd，没有事件发生，即count为0，则在最外层循环启动定时器，调度到其他进程执行，直到事件发生或者超时到达。


/*
 * do_poll - poll every fd in the list, sleeping between passes until an
 * event, an error/signal, or the timeout.
 *
 * NOTE: annotated pseudo-code, not compilable C. The function's opening
 * brace, local declarations (pt, count, timed_out, to, slack, walk) and
 * the "for (;;)" header are abbreviated by the article.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
           struct poll_wqueues *wait, struct timespec *end_time)
    /*
     * A zero timeout sets timed_out to 1: the call never blocks and
     * returns right after the first pass, events or not. _qproc is
     * cleared as well, so no wait-queue registration is requested.
     */
    /* Optimise the no-wait case */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        pt->_qproc = NULL;
        timed_out = 1;
    }
    for {
        /*
         * The fds live in a linked list of chunks, each holding `len`
         * entries: the first inner loop walks the chunks, the second
         * walks the entries of the current chunk. Together they visit
         * every fd once.
         */
        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;
            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * A non-zero return from do_pollfd means this fd has an
                 * event. As soon as one fd has an event, _qproc is
                 * cleared: this poll call is going to return to user
                 * space anyway, so the remaining poll functions only need
                 * to report current events and need not add wait entries
                 * to their wait queues.
                 */
                if (do_pollfd(pfd, pt)) {
                    count++;
                    pt->_qproc = NULL;
                }
            }
        }
        /*
         * Every fd has been polled once; with _qproc cleared, later
         * passes do not request registration again.
         */
        pt->_qproc = NULL;
        /* NOTE(review): the closing brace of this `if` is missing in the excerpt. */
        if (!count) {
            count = wait->error;
        /* No event, but the current task has a pending signal. */
        if (signal_pending(current))
            count = -EINTR;
 
        /*
         * The outer loop is left when:
         *  a. some fd has an event (count > 0),
         *  b. an error or pending signal made count negative, or
         *  c. the timeout has expired (timed_out).
         */
        if (count || timed_out)
            break;
        /*
         * No fd has an event: sleep until woken or until the timeout
         * expires. Per the article, a negative poll() timeout means
         * waiting indefinitely. A zero return from
         * poll_schedule_timeout() marks the timeout as expired; one
         * more pass over the fds is made before returning.
         */
        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) {
            timed_out = 1;
        }
    }
    /*
     * Return value:
     *   0   - timeout expired and no fd has an event
     *   > 0 - number of fds with events
     *   < 0 - -EINTR (pending signal) or the error stored in wait->error
     */
    return count;
}

do_pollfd中调用fd提供的poll函数，poll函数将已发生的事件赋给mask，过滤掉用户不关心的事件后再将mask赋给pollfd->revents，后面会传递给用户侧，函数最后将mask返回上层调用。


/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 *
 * Result by case:
 *   fd < 0               -> 0 (entry ignored)
 *   fd has no open file  -> POLLNVAL
 *   file has no ->poll   -> DEFAULT_POLLMASK filtered by the requested events
 *   otherwise            -> ->poll() result filtered by the requested events
 * POLLERR and POLLHUP always pass the filter.
 *
 * NOTE(review): this signature takes can_busy_poll/busy_flag, while the
 * do_poll excerpt in this article calls do_pollfd(pfd, pt) with two
 * arguments; the two excerpts appear to come from different kernel
 * versions - confirm.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll,
                     unsigned int busy_flag)
{
    unsigned int mask;
    int fd;
 
    mask = 0;
    fd = pollfd->fd;
    if (fd >= 0) {
        struct fd f = fdget(fd);
        /* Reported if the fd turns out not to refer to an open file. */
        mask = POLLNVAL;
        if (f.file) {
            #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
            /* Used as the result when the file provides no poll method. */
            mask = DEFAULT_POLLMASK;
            if (f.file->f_op->poll) {
                /* Tell the poll method which events the caller cares about. */
                pwait->_key = pollfd->events|POLLERR|POLLHUP;
                pwait->_key |= busy_flag;
                /* Call the target file's poll function. */
                mask = f.file->f_op->poll(f.file, pwait);
                if (mask & busy_flag)
                    *can_busy_poll = true;
            }
            /* Mask out unneeded events. */
            mask &= pollfd->events | POLLERR | POLLHUP;
            fdput(f);
        }
    }
    /* Stored here so that do_sys_poll can copy it back to user space. */
    pollfd->revents = mask;
 
    return mask;
}

比如uio模块的poll函数为uio_poll，此函数会调用poll_wait将等待队列项添加到设备的等待队列(idev->wait)中。设备结构通过file的私有数据找到：file->private_data指向listener，listener->dev指向设备。(在uio_open中建立file私有数据和listener的关系)


/*
 * uio_poll - poll method of the uio device file.
 * Registers the caller on the device wait queue (idev->wait) through
 * poll_wait(), then reports POLLIN | POLLRDNORM if the device's event
 * counter differs from the value this listener last recorded, else 0.
 */
static unsigned int uio_poll(struct file *filep, poll_table *wait)
{
    struct uio_listener *listener = filep->private_data;
    struct uio_device *idev = listener->dev;
 
    if (!idev->info->irq)
        return -EIO;
 
    poll_wait(filep, &idev->wait, wait);
        /*
         * The two lines below are the body of poll_wait(filp, wait_address, p),
         * shown inline. p->_qproc is __pollwait unless do_poll cleared it.
         */
        if (p && p->_qproc && wait_address)
            p->_qproc(filp, wait_address, p);
    if (listener->event_count != atomic_read(&idev->event))
        return POLLIN | POLLRDNORM;
    return 0;
}
/* Add a new entry */
/*
 * __pollwait - registration callback installed by poll_initwait().
 * Takes a poll_table_entry from the caller's poll_wqueues, fills it in and
 * adds its wait-queue entry to wait_address, with pollwake as the function
 * to run on wake-up.
 *
 * NOTE: the bodies of init_waitqueue_func_entry() and add_wait_queue() are
 * pasted inline below their calls, so this excerpt is not compilable C.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
    /* The poll_table is embedded in the poll_wqueues as member pt. */
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    entry->filp = get_file(filp);
    entry->wait_address = wait_address;
    entry->key = p->_key;
    /* When the target file has an event, pollwake is called to wake the polling task. */
    init_waitqueue_func_entry(&entry->wait, pollwake);
    /* Body of init_waitqueue_func_entry(q, func), shown inline: */
    q->flags    = 0;
    q->private  = NULL;
    q->func     = func;
    /* __pollwake later reads the poll_wqueues back from wait->private. */
    entry->wait.private = pwq;
    add_wait_queue(wait_address, &entry->wait);
    /* Body of add_wait_queue(q, wait), shown inline: a non-exclusive add under q->lock. */
    unsigned long flags;
    wait->flags &= ~WQ_FLAG_EXCLUSIVE;
    spin_lock_irqsave(&q->lock, flags);
    __add_wait_queue(q, wait);
    spin_unlock_irqrestore(&q->lock, flags);
}

事件触发，唤醒进程
当有中断发生时，调用uio_event_notify


igbuio_pci_irqhandler 中调用 uio_event_notify
/*
 * uio_event_notify - signal that an interrupt event occurred on a uio device.
 * Bumps the device's event counter, wakes the waiters on idev->wait and
 * sends SIGIO to asynchronous listeners.
 */
void uio_event_notify(struct uio_info *info)
{
    struct uio_device *idev = info->uio_dev;
    /* Increment the interrupt event counter. */
    atomic_inc(&idev->event);
    /*
     * idev->wait is the device wait queue; this runs the wake functions
     * registered on it (pollwake for entries added by __pollwait).
     */
    wake_up_interruptible(&idev->wait);
    kill_fasync(&idev->async_queue, SIGIO, POLL_IN);
}
 
/*
 * nr_exclusive is 1 and key is NULL. As __wake_up_common shows, only
 * entries flagged WQ_FLAG_EXCLUSIVE count against nr_exclusive, so at
 * most one exclusive waiter is woken while every non-exclusive entry
 * walked before it is woken too.
 */
#define wake_up_interruptible(x)    __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
/*
 * __wake_up - run __wake_up_common() on the wait queue while holding
 * q->lock with interrupts saved and disabled.
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, void *key)
{
    unsigned long flags;
 
    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, 0, key);
    spin_unlock_irqrestore(&q->lock, flags);
}
/*
 * __wake_up_common - walk the wait queue and call each entry's wake function.
 * The walk stops once nr_exclusive entries that are flagged
 * WQ_FLAG_EXCLUSIVE have been woken successfully; entries without that
 * flag never stop the walk.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;
 
    /* Iterate over the list q->task_list. */
    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        /* Read the flags before calling func. */
        unsigned flags = curr->flags;
        /*
         * For entries added by __pollwait, func is pollwake, which wakes
         * the polling task; per the article it returns 1 on success.
         */
        if (curr->func(curr, mode, wake_flags, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}
/*
 * pollwake - wake function installed by __pollwait.
 * When a key is supplied and it shares no bit with the events this entry
 * is interested in, the wake-up is ignored (returns 0); otherwise the
 * call is forwarded to __pollwake().
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *entry;
 
    entry = container_of(wait, struct poll_table_entry, wait);
    /* On the wake_up_interruptible() path key is NULL, so this filter is skipped. */
    if (key && !((unsigned long)key & entry->key))
        return 0;
    return __pollwake(wait, mode, sync, key);
}
 
/*
 * __pollwake - wake the task that is blocked in poll()/select().
 * Returns the result of default_wake_function().
 */
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    /*
     * wait->private holds the poll_wqueues (stored by __pollwait);
     * its polling_task is the task that called poll (set in poll_initwait).
     */
    struct poll_wqueues *pwq = wait->private;
    /* Temporary wait entry that refers to the polling task. */
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
 
    /* Wake up the task that called poll. */
    return default_wake_function(&dummy_wait, mode, sync, key);
}

poll进程被唤醒后，将发生的事件返回给用户程序，用户程序就可以根据发生的事件调用read或者write读取/写入数据，以uio_read为例


/*
 * uio_read - read the interrupt event counter of a uio device.
 *
 * count must be exactly sizeof(s32). If the device counter differs from
 * the value this listener last saw, the counter is copied to buf and
 * count is returned. Otherwise the call blocks until that happens, or
 * returns -EAGAIN (O_NONBLOCK) or -ERESTARTSYS (pending signal).
 * Returns -EIO if the device has no irq and -EFAULT if the copy fails.
 */
static ssize_t uio_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos)
{
    struct uio_listener *listener = filep->private_data;
    struct uio_device *idev = listener->dev;
    DECLARE_WAITQUEUE(wait, current);
    ssize_t retval;
    s32 event_count;
 
    if (!idev->info->irq)
        return -EIO;
 
    if (count != sizeof(s32))
        return -EINVAL;
 
    add_wait_queue(&idev->wait, &wait);
 
    do {
        /* The task state is set before the counter is checked below. */
        set_current_state(TASK_INTERRUPTIBLE);
        /*
         * idev->event is the live interrupt count; listener->event_count
         * is the count seen by the previous read. If they differ, a new
         * interrupt has occurred. When this read follows a poll() that
         * reported an event, this branch is normally taken right away.
         */
        event_count = atomic_read(&idev->event);
        if (event_count != listener->event_count) {
            /*
             * Copy the event count to the user buffer; on success the
             * return value is count.
             */
            if (copy_to_user(buf, &event_count, count))
                retval = -EFAULT;
            else {
                listener->event_count = event_count;
                retval = count;
            }
            break;
        }
        /*
         * No new interrupt event: a blocking read goes on to sleep,
         * a non-blocking read returns -EAGAIN.
         */
        if (filep->f_flags & O_NONBLOCK) {
            retval = -EAGAIN;
            break;
        }
 
        if (signal_pending(current)) {
            retval = -ERESTARTSYS;
            break;
        }
        schedule();
    } while (1);
 
    __set_current_state(TASK_RUNNING);
    remove_wait_queue(&idev->wait, &wait);
 
    return retval;
}

select

select只能监听1024个文件描述符的原因？
在用户侧代码中，会将需要监听的fd放入结构体fd_set中，其在kernel中定义如下。数组fds_bits的每一位(bit)代表一个文件描述符，而数组fds_bits的总位数由宏__FD_SETSIZE决定，此宏定义就是1024，所以fd_set最多只能表示1024个文件描述符；如果想支持更多文件描述符，需要重新编译内核将此宏改大。


/*
 * fd_set is a bitmap with one bit per file descriptor. __FD_SETSIZE is
 * the total number of bits (1024), so fds_bits has
 * __FD_SETSIZE / (8 * sizeof(long)) elements of type unsigned long.
 */
#undef __FD_SETSIZE
#define __FD_SETSIZE    1024
typedef struct {
    unsigned long fds_bits[__FD_SETSIZE / (8 * sizeof(long))];
} __kernel_fd_set;
typedef __kernel_fd_set     fd_set;

select系统调用在kernel中的定义


/*
 * select() system call entry point, heavily abbreviated by the article:
 * the opening brace, the handling of tvp and the declaration of `to` are
 * omitted. The work is delegated to core_sys_select().
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp)
    core_sys_select(n, inp, outp, exp, to);
}
 
/*
 * core_sys_select - copy the three user fd sets into one kernel buffer,
 * run do_select(), and copy the three result sets back to user space.
 *
 * NOTE: annotated excerpt, not compilable C. Local declarations (fdt,
 * max_fds, size, fds, ret), the kmalloc failure check, the out: label and
 * the final return are not shown.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timespec *end_time)
{
    /* max_fds can increase, so grab it once to avoid race */
    rcu_read_lock();
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();
    /* If n exceeds the fd table's max_fds, clamp it to max_fds. */
    if (n > max_fds)
        n = max_fds;
 
    #define FDS_BITPERLONG  (8*sizeof(long))
    #define FDS_LONGS(nr)   (((nr)+FDS_BITPERLONG-  1)/FDS_BITPERLONG)
    #define FDS_BYTES(nr)   (FDS_LONGS(nr)*sizeof(long))
    /* size = bytes needed for a bitmap of n fds, rounded up to whole longs. */
    size = FDS_BYTES(n);
 
    #define FRONTEND_STACK_ALLOC    256
    #define SELECT_STACK_ALLOC  FRONTEND_STACK_ALLOC
    /* Allocate small arguments on the stack to save memory and be faster */
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
    void *bits;
    bits = stack_fds;
    /*
     * Six bitmaps of `size` bytes are needed (in/out/ex plus three
     * result sets). If they do not fit in stack_fds, allocate the
     * buffer with kmalloc instead.
     */
    if (size > sizeof(stack_fds) / 6) {
        bits = kmalloc(6 * size, GFP_KERNEL);
    }   
    /*
     * Carve the buffer into six consecutive bitmaps of `size` bytes:
     *   fds.in  at bits          - fds watched for input
     *   fds.out at bits + size   - fds watched for output
     *   fds.ex, fds.res_in, fds.res_out and fds.res_ex follow in order.
     */
    fds.in      = bits;
    fds.out     = bits +   size;
    fds.ex      = bits + 2*size;
    fds.res_in  = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex  = bits + 5*size;
 
    /* Copy the user's inp, outp and exp sets into fds.in, fds.out and fds.ex. */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
 
    /* Clear the three result sets; they will receive the fds that have events. */
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);
    /* The core function. */
    ret = do_select(n, &fds, end_time);
    /* Copy the result sets back to the user's variables. */
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;
}
 
/*
 * do_select - core loop of select() (only its beginning is shown; the
 * rest is summarised in the comment at the end of this excerpt).
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
    retval = max_select_fd(n, fds);
    poll_initwait(&table);
    wait = &table.pt;
    /*
     * A zero timeout sets timed_out to 1 and clears _qproc: the call
     * returns after one pass whether or not any event occurred.
     */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        wait->_qproc = NULL;
        timed_out = 1;
    }
    /*
     * Per the article, the remaining logic mirrors poll: walk all fds
     * and call each fd's poll function, which adds the current task to
     * the file's wait queue.
     *  - If events occurred, return the number of events.
     *  - If none occurred and timed_out is 1, return 0.
     *  - If none occurred and timed_out is 0, schedule other tasks and
     *    return once an event occurs or the timeout expires.
     */

缺点：

1.每次调用 select()，都需要把 fd 集合从用户态拷贝到内核态，这个开销在 fd 很多时会很大，同时每次调用 select() 都需要在内核遍历传递进来的所有 fd，这个开销在 fd 很多时也很大。
2.单个进程能够监视的文件描述符的数量存在最大限制，在 Linux 上一般为 1024，可以通过修改宏定义甚至重新编译内核的方式提升这一限制，但是这样也会造成效率的降低。

select每次调用都会线性扫描全部的集合，导致效率呈现线性下降。
每次调用select都会先调用poll_initwait初始化等待结构，再通过各fd的poll函数(经poll_wait调用__pollwait)将调用进程添加到目标文件的等待队列中，select函数返回前再调用poll_freewait将其从等待队列中删除。

select() 和 poll() 系统调用在本质上没有多大差别：poll() 管理多个描述符时同样是进行轮询，并根据描述符的状态进行处理；不同之处在于 poll() 没有最大文件描述符数量的限制（但是数量过大后性能也是会下降）。
poll() 和 select() 同样存在一个缺点就是，包含大量文件描述符的数组被整体复制于用户态和内核的地址空间之间，而不论这些文件描述符是否就绪，它的开销随着文件描述符数量的增加而线性增大。

也可参考：https://www.jianshu.com/p/2332b777f84f

相关阅读:
ME60单板加载故障维护经验
 玩转 gpgpu sim 01记 —— try it
如何写出高性能代码(三)优化内存回收(GC)
2023-10学习笔记
 Revit中为房间添加填充图例和“构件快速上色”
JS和TS的异同
 Web前端开发面试题汇总
 C语言：二叉树的遍历以及遇到的问题
 【数据结构-树】哈夫曼树及其应用
 分析Jetpack Compose动画内部是如何实现的
原文地址：https://blog.csdn.net/fengcai_ke/article/details/126563983