Epoll 是 IO 多路复用机制在 Linux 中的实现,相比 select
,在性能上有很大的提升。
- 文章由本人实践探究所得,如有错误,恳请指正,不胜感谢。
- 转载请于文首标明出处:【Linux】epoll 解析 - 掘金 (juejin.cn)
- 文章仍未完工,请见谅
头文件定义
所有代码只用于验证和考究,如果是学习了解,看文字即可。
文件位置:/usr/include/x86_64-linux-gnu/sys/epoll.h
#ifndef _SYS_EPOLL_H
#define _SYS_EPOLL_H 1
#include ...
enum EPOLL_EVENTS { // 以下列的 Epoll 事件,非常多
EPOLLIN = 0x001,
EPOLLPRI = 0x002,
...
};
/* Valid opcodes ( "op" parameter ) to issue to epoll_ctl(). */
#define EPOLL_CTL_ADD 1 /* Add a file descriptor to the interface. */
#define EPOLL_CTL_DEL 2 /* Remove a file descriptor from the interface. */
#define EPOLL_CTL_MOD 3 /* Change file descriptor epoll_event structure. */
typedef union epoll_data { // union 类型,
void *ptr; // 用户自定义消息指针
int fd; // 文件描述符
uint32_t u32; //
uint64_t u64; //
} epoll_data_t;
struct epoll_event { // Epoll 事件,告知内核应该监听的事件
uint32_t events; /* Epoll events */
epoll_data_t data; /* User data variable */
} __EPOLL_PACKED;
__BEGIN_DECLS
/* Creates an epoll instance. Returns an fd for the new instance.
The "size" parameter is a hint specifying the number of file
descriptors to be associated with the new instance. The fd
returned by epoll_create() should be closed with close(). */
extern int epoll_create (int __size) __THROW;
/* Same as epoll_create but with an FLAGS parameter. The unused SIZE
parameter has been dropped. */
extern int epoll_create1 (int __flags) __THROW;
/* Manipulate an epoll instance "epfd". Returns 0 in case of success,
-1 in case of error ( the "errno" variable will contain the
specific error code ) The "op" parameter is one of the EPOLL_CTL_*
constants defined above. The "fd" parameter is the target of the
operation. The "event" parameter describes which events the caller
is interested in and any associated user data. */
extern int epoll_ctl (int __epfd, int __op, int __fd,
struct epoll_event *__event) __THROW;
/* Wait for events on an epoll instance "epfd". Returns the number of
triggered events returned in "events" buffer. Or -1 in case of
error with the "errno" variable set to the specific error code. The
"events" parameter is a buffer that will contain triggered
events. The "maxevents" is the maximum number of events to be
returned ( usually size of "events" ). The "timeout" parameter
specifies the maximum wait time in milliseconds (-1 == infinite).
This function is a cancellation point and therefore not marked with
__THROW. */
extern int epoll_wait (int __epfd, struct epoll_event *__events,
int __maxevents, int __timeout);
/* Same as epoll_wait, but the thread's signal mask is temporarily
and atomically replaced with the one provided as parameter.
This function is a cancellation point and therefore not marked with
__THROW. */
extern int epoll_pwait (int __epfd, struct epoll_event *__events,
int __maxevents, int __timeout,
const __sigset_t *__ss);
__END_DECLS
#endif /* sys/epoll.h */
eventpoll
epoll 实例,重点关注 epoll_wait
、rdllist
字段。
struct eventpoll {
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
struct mutex mtx;
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* List of ready file descriptors */
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
rwlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
* holding ->lock.
*/
struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
struct file *file;
/* used to optimize loop detection check */
u64 gen;
struct hlist_head refs;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* tracks wakeup nests for lockdep validation */
u8 nests;
#endif
};
epoll_create
int epoll_create (int __size) __THROW;
即创建 epoll 实例(Create Epoll)。为新实例返回一个 fd 值,表示文件描述符(file descriptor),是存在于 /proc/进程id/fd/
路径下的文件,如:
zohar@Debian-bullseye:/proc/20/fd$ tree
.
├── 0 -> /dev/null
├── 1 -> /home/zohar/nohup.out
├── 10 -> /home/zohar/Application/clion-2021.2.1/bin/clion.sh
└── 2 -> /home/zohar/nohup.out
In Unix and Unix-like computer operating systems, a file descriptor (FD, less frequently fildes) is a unique identifier (handle) for a file or other input/output resource, such as a pipe or network socket. -- File descriptor - Wikipedia
文件描述符(FD)是 Unix 和类 Unix 系统中,文件或者其他 IO 资源的唯一标识符(句柄)。
size
参数是一个提示,指定与新实例关联的文件描述符的数量,但如今已被废弃。epoll_create()
返回的 fd 应该用 close()
关闭,否则会由于文件描述符被不断占用而耗尽。
// fs/eventpoll.c
SYSCALL_DEFINE1(epoll_create, int, size) {
if (size <= 0) return -EINVAL; // 从这里可以看出 size 是无意义的,当必须大于等于 0,以兼容旧版本
return do_epoll_create(0);
}
static int do_epoll_create(int flags)
{
int error, fd;
struct eventpoll *ep = NULL;
struct file *file;
/* ... */
error = ep_alloc(&ep); // 1. 为 Epoll 实例分配内存
if (error < 0)
return error;
/*
* 构造 Epoll 实例和文件描述符
*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); // 2. 获取一个未使用的文件描述符
if (fd < 0) {
error = fd;
goto out_free_ep;
}
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC)); // 3. 创建一个 file 实例,就是 fd 文件
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
ep->file = file;
fd_install(fd, file); // 4. 绑定文件描述符到 file 实例上
return fd;
out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_free(ep);
return error;
}
epoll_wait
int epoll_wait (int __epfd, struct epoll_event *__events,
int __maxevents, int __timeout);
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to) {
int error;
struct fd f;
struct eventpoll *ep;
/* ... */
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd); // 根据文件描述符找到文件
if (!f.file)
return -EBADF;
/* ... */
/*
* private_data 字段存储的是 Epoll 实例
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = f.file->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, to);
error_fput:
fdput(f);
return error;
}
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout) {
int res, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled();
if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
slack = select_estimate_accuracy(timeout);
to = &expires;
*to = timespec64_to_ktime(*timeout);
} else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation.
*/
timed_out = 1;
}
/*
* This call is racy: We may or may not see events that are being added
* to the ready list under the lock (e.g., in IRQ callbacks). For cases
* with a non-zero timeout, this thread will check the ready list under
* lock and will add to the wait queue. For cases with a zero
* timeout, the user by definition should not care and will have to
* recheck again.
*/
eavail = ep_events_available(ep);
while (1) {
if (eavail) { // 如果有事件就绪,将就绪对象拷贝到 events 中
/*
* Try to transfer events to user space. In case we get
* 0 events and there's still timeout left over, we go
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
if (res)
return res;
}
if (timed_out) // 如果超时,返回 0
return 0;
eavail = ep_busy_loop(ep, timed_out);
if (eavail)
continue;
if (signal_pending(current))
return -EINTR;
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
* wakeup. Why it is important? In case of several waiters
* each new wakeup will hit the next waiter, giving it the
* chance to harvest new event. Otherwise wakeup can be
* lost. This is also good performance-wise, because on
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
*/
init_wait(&wait);
write_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
* is safe to avoid an explicit barrier.
*/
__set_current_state(TASK_INTERRUPTIBLE);
/*
* Do the final check under the lock. ep_scan_ready_list()
* plays with two lists (->rdllist and ->ovflist) and there
* is always a race when both lists are empty for short
* period of time although events are pending, so lock is
* important.
*/
eavail = ep_events_available(ep);
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !schedule_hrtimeout_range(to, slack,
HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
* We were woken up, thus go and try to harvest some events.
* If timed out and still on the wait queue, recheck eavail
* carefully under lock, below.
*/
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
* timeout expired before it could reacquire the lock.
* Thus, when wait.entry is empty, it needs to harvest
* events.
*/
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
}
}
epoll_ctl
int epoll_ctl (int __epfd, int __op, int __fd,
struct epoll_event *__event) __THROW;
操作 epoll 实例 epfd
。如果成功返回 0,op
参数是上面定义的 EPOLL_CTL_*
常量之一。fd
参数是操作的目标。event
参数描述调用者感兴趣的事件和任何相关的用户数据。
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
error = -EBADF;
f = fdget(epfd);
if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd);
if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
* epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = f.file->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
* descriptor, there is the chance of creating closed loops, which are
* better be handled here, than in more critical paths. While we are
* checking for loops we also determine the list of files reachable
* and hang them on the tfile_check_list, so we can check that we
* haven't created too many possible wakeup paths.
*
* We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
* the epoll file descriptor is attaching directly to a wakeup source,
* unless the epoll file descriptor is nested. The purpose of taking the
* 'epmutex' on add is to prevent complex toplogies such as loops and
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
}
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
}
}
/*
* Try to lookup the file inside our RB tree. Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
break;
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
break;
}
mutex_unlock(&ep->mtx);
error_tgt_fput:
if (full_check) {
clear_tfile_check_list();
loop_check_gen++;
mutex_unlock(&epmutex);
}
fdput(tf);
error_fput:
fdput(f);
error_return:
return error;
}