從內核看IO_Uring的實現（一）

作者：theanarkh 2021-07-07 23:38:05

最近研究了一下Linux的高性能異步IO框架io_uring，并嘗試引入Node.js中應用起來。所以本文打算介紹一下io_uring在內核的實現，因為io_uring實現代碼量大，邏輯復雜，所以只能慢慢分析。這一篇介紹io_uring初始化接口io_uring_setup的實現。

[[410006]]

前言：最近研究了一下Linux的高性能異步IO框架io_uring，并嘗試引入Node.js中應用起來。所以本文打算介紹一下io_uring在內核的實現，因為io_uring實現代碼量大，邏輯復雜，所以只能慢慢分析。這一篇介紹io_uring初始化接口io_uring_setup的實現。

/*
 * io_uring_setup - validate the caller's parameter block and hand it to
 * io_uring_create().
 *
 * @entries: requested submission queue length, forwarded unchanged.
 * @params:  user-space pointer to the parameter block; it is copied into a
 *           kernel-side struct before any field is inspected.
 *
 * Returns -EFAULT if the parameter block cannot be copied from user space,
 * -EINVAL if a flag bit outside the supported set is present, otherwise the
 * result of io_uring_create().
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params){
    struct io_uring_params p;

    if (copy_from_user(&p, params, sizeof(p)))
        return -EFAULT;
    // Reject any flag bit that is not in the supported set.
    if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
            IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
            IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
        return -EINVAL;

    // The user pointer is passed along as well so the result can be copied back.
    return io_uring_create(entries, &p, params);
}

io_uring_setup是對io_uring_create的封裝。第一個參數entries指定請求隊列的長度，第二個參數params是用于調用方和內核通信的結構體。我們看一下定義。

// Parameter block exchanged between the caller and the kernel by io_uring_setup.
struct io_uring_params { 
    // Submission queue length; io_uring_create stores roundup_pow_of_two(entries) here 
    __u32 sq_entries; 
    // Completion queue length; defaults to 2 * sq_entries unless IORING_SETUP_CQSIZE is set 
    __u32 cq_entries; 
    // Flags that control kernel behavior (IORING_SETUP_*) 
    __u32 flags; 
    // CPU that the kernel thread started in poll mode is bound to 
    __u32 sq_thread_cpu; 
    // Idle time of the poll-mode kernel thread; it is suspended afterwards 
    __u32 sq_thread_idle; 
    // Capabilities supported by the running kernel; filled in by the kernel 
    __u32 features; 
    __u32 wq_fd; 
    __u32 resv[3]; 
    // Offsets describing kernel data; the caller needs them for the later mmap calls 
    struct io_sqring_offsets sq_off; 
    struct io_cqring_offsets cq_off; 
};

我們接著看io_uring_create。

/*
 * io_uring_create (first excerpt): sizes the submission and completion
 * queues, allocates the per-instance context, allocates the rings and starts
 * the poll-mode handling. The rest of the function is elided in this excerpt.
 */
static int io_uring_create(unsigned entries, struct io_uring_params *p, 
               struct io_uring_params __user *params){ 
    struct user_struct *user = NULL; 
    struct io_ring_ctx *ctx; 
    bool limit_mem; 
    int ret; 
 
    p->sq_entries = roundup_pow_of_two(entries); 
    // The caller supplied its own completion queue length 
    if (p->flags & IORING_SETUP_CQSIZE) { 
        p->cq_entries = roundup_pow_of_two(p->cq_entries); 
        // The completion queue must not be smaller than the submission queue 
        if (p->cq_entries < p->sq_entries) 
            return -EINVAL; 
        // Above the limit the size is clamped, but only if IORING_SETUP_CLAMP is set 
        if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { 
            if (!(p->flags & IORING_SETUP_CLAMP)) 
                return -EINVAL; 
            p->cq_entries = IORING_MAX_CQ_ENTRIES; 
        } 
    } else { 
        // Default: twice the submission queue length 
        p->cq_entries = 2 * p->sq_entries; 
    } 
    // User information of the caller 
    user = get_uid(current_user()); 
    // Allocate a ctx that records the instance state: the caller only holds an fd, 
    // and later operations on that fd look up the associated context 
    // NOTE(review): ctx is dereferenced below without a NULL check in this excerpt 
    ctx = io_ring_ctx_alloc(p); 
    ctx->user = user; 
    // Data used by poll mode 
    ctx->sqo_task = get_task_struct(current); 
    // Allocate the io_rings 
    // NOTE(review): this ret is overwritten by the next call before being examined 
    ret = io_allocate_scq_urings(ctx, p); 
    // Poll-mode handling 
    ret = io_sq_offload_start(ctx, p); 
    // Much more follows; it is analyzed later 
}

io_uring_create代碼比較多，我們分步分析。首先分配了一個io_ring_ctx結構體，這是核心的數據結構，用于記錄io_uring實例的上下文，不過我們暫時不需要了解它具體的定義，因為實在太多，只關注本文相關的字段。

1 分配一個io_rings結構體

接著調用io_allocate_scq_urings分配一個io_rings結構體，這是非常核心的邏輯，我們看一下io_rings的定義。

struct io_rings { 
    struct io_uring     sq, cq; 
    u32         sq_ring_mask, cq_ring_mask; 
    u32         sq_ring_entries, cq_ring_entries; 
    u32         sq_dropped; 
    u32         sq_flags; 
    u32         cq_flags; 
    u32         cq_overflow; 
    struct io_uring_cqe cqes[]; 
};

io_rings主要用于記錄請求和完成隊列的信息。我們繼續看io_allocate_scq_urings。

/*
 * io_allocate_scq_urings (first excerpt): records the queue sizes in ctx,
 * computes how much memory the rings need and allocates it. The remainder of
 * the function is elided in this excerpt.
 */
static int io_allocate_scq_urings(struct io_ring_ctx *ctx, 
                  struct io_uring_params *p){ 
    struct io_rings *rings; 
    size_t size, sq_array_offset; 
    // Record the submission and completion queue sizes in ctx 
    ctx->sq_entries = p->sq_entries; 
    ctx->cq_entries = p->cq_entries; 
    /*  
        Compute the size of the struct and its extra arrays: sq_array_offset 
        receives the size of the struct plus its trailing cqes[] array, and 
        size receives that plus the size of the additional u32 array 
    */ 
    size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 
    // Allocate the memory 
    rings = io_mem_alloc(size); 
    // ... 
}

io_allocate_scq_urings細節比較多，我們分開分析，我們看一下rings_size的邏輯。

static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, 
                size_t *sq_offset){ 
    struct io_rings *rings; 
    size_t off, sq_array_size; 
    // 計算結構體和格外數組的大小，見io_rings定義 
    off = struct_size(rings, cqes, cq_entries); 
    // sq_offset記錄結構體大小 
    if (sq_offset) 
        *sq_offset = off; 
    // 計算多個u32元素的數組的大小 
    sq_array_size = array_size(sizeof(u32), sq_entries); 
    // 計算結構體大小 + sq_array_size的大小保存到off 
    if (check_add_overflow(off, sq_array_size, &off)) 
        return SIZE_MAX; 
    return off; 
}

struct_size是計算結構體和額外字段大小的宏。我們剛才看到io_rings結構體的定義中，最後一個字段是struct io_uring_cqe cqes[]，看起來是個空數組，其實它的內存是緊跟在結構體後面分配的，結構如下。

下面我們看struct_size是如何計算的。

/*
 * struct_size - size in bytes of the structure *p followed by `count`
 * elements of its trailing array `member`, computed through __ab_c_size()
 * as count * sizeof(element) + sizeof(*p).
 * __must_be_array() is added into the element-size operand; presumably it
 * evaluates to 0 and exists only to reject a non-array member - confirm
 * against its definition.
 */
#define struct_size(p, member, count)                   \
    __ab_c_size(count,                      \
            sizeof(*(p)->member) + __must_be_array((p)->member),\
            sizeof(*(p)))
 
/*
 * __ab_c_size - compute a * b + c.
 *
 * Returns the result, or SIZE_MAX if either the multiplication or the
 * addition overflows.
 */
static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c){
    size_t product;
    size_t total;

    // Short-circuit: the addition is attempted only if the product fits.
    if (check_mul_overflow(a, b, &product) ||
        check_add_overflow(product, c, &total))
        return SIZE_MAX;

    return total;
}

我們看到計算方式就是數組元素大小*元素個數+結構體本身的大小。計算完結構體大小后又通過array_size計算了另一個數組的大小并加起來，所以io_rings的結構體如下所示。

分配了io_rings之后我們繼續看接下來的邏輯。

/*
 * io_allocate_scq_urings (second excerpt): wires the freshly allocated rings
 * into ctx, initializes masks and lengths, and allocates the array of
 * submission queue entries. rings, size and sq_array_offset come from the
 * part of the function that is elided at the top of this excerpt.
 */
static int io_allocate_scq_urings(struct io_ring_ctx *ctx, 
                  struct io_uring_params *p){ 
    // ... 
    // Record the rings in ctx 
    ctx->rings = rings; 
    // sq_array points at the start of the u32 array inside the rings allocation 
    ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 
    // Masks used for index wrap-around 
    rings->sq_ring_mask = p->sq_entries - 1; 
    rings->cq_ring_mask = p->cq_entries - 1; 
    // Queue lengths 
    rings->sq_ring_entries = p->sq_entries; 
    rings->cq_ring_entries = p->cq_entries; 
    ctx->sq_mask = rings->sq_ring_mask; 
    ctx->cq_mask = rings->cq_ring_mask; 
    // Size of the array of submission queue entries 
    size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 
    // Allocate it and record it in sq_sqes 
    // NOTE(review): the allocation result is not checked in this excerpt 
    ctx->sq_sqes = io_mem_alloc(size); 
    return 0; 
}

進行了一系列設置后，架構如下。

創建完io_rings結構體后，我們繼續回到io_uring_create中。

2 設置io_uring_params

內核申請完系列結構體后，需要通過io_uring_params結構體返回給調用方。

/*
 * io_uring_create (second excerpt): after the rings exist, fill the
 * parameter block with the field offsets and feature bits the caller needs,
 * copy it back to user space and return a file descriptor for the instance.
 * ctx and ret are declared in the part of the function that this excerpt
 * leaves out.
 *
 * Returns the new file descriptor (or whatever io_uring_get_fd() returns),
 * or -EFAULT if the parameter block cannot be copied back to the caller.
 */
static int io_uring_create(unsigned entries, struct io_uring_params *p,
               struct io_uring_params __user *params) {

    ret = io_allocate_scq_urings(ctx, p);
    // Set up the poll-mode machinery, if it was requested
    ret = io_sq_offload_start(ctx, p);
    memset(&p->sq_off, 0, sizeof(p->sq_off));
    // Record where each submission-side field sits inside struct io_rings
    p->sq_off.head = offsetof(struct io_rings, sq.head);
    p->sq_off.tail = offsetof(struct io_rings, sq.tail);
    p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
    p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
    p->sq_off.flags = offsetof(struct io_rings, sq_flags);
    p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
    // The u32 array is not a struct member, so its offset is a pointer difference
    p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

    // Same for the completion side
    memset(&p->cq_off, 0, sizeof(p->cq_off));
    p->cq_off.head = offsetof(struct io_rings, cq.head);
    p->cq_off.tail = offsetof(struct io_rings, cq.tail);
    p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
    p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
    p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
    p->cq_off.cqes = offsetof(struct io_rings, cqes);
    p->cq_off.flags = offsetof(struct io_rings, cq_flags);
    // Features supported by the kernel
    p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
            IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
            IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
            IORING_FEAT_POLL_32BITS;

    // Hand the filled-in parameter block back to the caller. Without it the
    // caller cannot locate any ring field, so a failed copy is an error.
    // NOTE(review): this excerpt shows no cleanup path; a complete
    // implementation has to release ctx before returning here.
    if (copy_to_user(params, p, sizeof(*p)))
        return -EFAULT;
    // Obtain the fd
    ret = io_uring_get_fd(ctx);
    return ret;
}

io_uring_create繼續進行了一系列賦值，賦值完后架構如下。

3 獲取文件描述符

內核通過io_uring_get_fd獲取文件描述符返回給調用方。

/*
 * io_uring_get_fd - create the file that represents an io_uring instance and
 * bind it to a new file descriptor.
 *
 * @ctx: instance context; stored with the file so later operations on the
 *       fd can find it.
 *
 * Returns the new descriptor, or a negative error code if either the
 * descriptor or the file could not be obtained.
 */
static int io_uring_get_fd(struct io_ring_ctx *ctx){
    struct file *file;
    int ret;

    // Reserve an unused fd
    ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
    if (ret < 0)
        return ret;

    // Allocate a file whose operations are io_uring_fops and attach ctx to it
    file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                    O_RDWR | O_CLOEXEC);
    if (IS_ERR(file)) {
        // Give the reserved fd back; an error pointer must never be installed
        put_unused_fd(ret);
        return PTR_ERR(file);
    }

    // Bind the fd to the file
    fd_install(ret, file);
    return ret;
}

io_uring_get_fd申請了一個fd和file，這是遵循vfs的設計。最重要的是把io_uring的函數集掛載到file上，後續通過fd操作io_uring實例的時候，經過vfs後就會執行對應的函數。另外還需要把ctx和file關聯起來，因為後續通過fd操作io_uring時，需要拿到fd對應的io_uring上下文。

至此，io_uring_setup就分析完了，但是還不能使用。io_uring在設計中，為了減少系統調用和用戶、內核數據通信的成本，實現了用戶、內核共享數據結構的方式，這樣用戶和內核就可以操作同一份數據結構達到通信目的，而不用通過系統調用，更不需要涉及來回複製。為了達到這個目的，用戶拿到io_uring實例後，還需要調用mmap獲取對應的內存映射。我們通過liburing庫的邏輯來分析。

4 從liburing庫看io_uring的使用

int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, 
                   struct io_uring_params *p){ 
    int fd, ret; 
    // 調用io_uring_setup，拿到fd 
    fd = __sys_io_uring_setup(entries, p); 
    if (fd < 0) 
        return -errno; 
    // 內存映射 
    ret = io_uring_queue_mmap(fd, p, ring); 
    // 保存系統支持的屬性 
    ring->features = p->features; 
    return 0; 
}

我們重點看一下io_uring_queue_mmap。

int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring){ 
    int ret; 
 
    memset(ring, 0, sizeof(*ring)); 
    ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); 
    // 記錄flags和fd 
    if (!ret) { 
        ring->flags = p->flags; 
        ring->ring_fd = fd; 
    } 
    return ret; 
}

繼續看io_uring_mmap。

static int io_uring_mmap(int fd, struct io_uring_params *p, 
             struct io_uring_sq *sq, struct io_uring_cq *cq){ 
    size_t size; 
    int ret; 
    // 請求隊列需要映射的內存大小，即整個結構體struct io_rings結構體的大小 
    sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); 
    // 請求隊列和完成隊列映射的內存大小一樣，等于請求隊列的 
    cq->ring_sz = sq->ring_sz; 
    // 映射并拿到虛擬地址，大小是sq->ring_sz 
    sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, 
            MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); 
    cq->ring_ptr = sq->ring_ptr; 
    // 通過首地址和偏移拿到對應字段的地址 
    sq->khead = sq->ring_ptr + p->sq_off.head; 
    sq->ktail = sq->ring_ptr + p->sq_off.tail; 
    sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask; 
    sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries; 
    sq->kflags = sq->ring_ptr + p->sq_off.flags; 
    sq->kdropped = sq->ring_ptr + p->sq_off.dropped; 
    sq->array = sq->ring_ptr + p->sq_off.array; 
    // 映射保存請求隊列節點的內存 
    size = p->sq_entries * sizeof(struct io_uring_sqe); 
    sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, 
                MAP_SHARED | MAP_POPULATE, fd, 
                IORING_OFF_SQES); 
    // 同上 
    cq->khead = cq->ring_ptr + p->cq_off.head; 
    cq->ktail = cq->ring_ptr + p->cq_off.tail; 
    cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask; 
    cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries; 
    cq->koverflow = cq->ring_ptr + p->cq_off.overflow; 
    cq->cqes = cq->ring_ptr + p->cq_off.cqes; 
    if (p->cq_off.flags) 
        cq->kflags = cq->ring_ptr + p->cq_off.flags; 
    return 0; 
}

io_uring_mmap除了保存一些常用的字段信息外，最重要的是做了內存映射。我們看看mmap的最后一個參數分別是IORING_OFF_SQ_RING和IORING_OFF_SQES，接下來我們看看io_uring的mmap鉤子的實現。

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma){ 
    size_t sz = vma->vm_end - vma->vm_start; 
    unsigned long pfn; 
    void *ptr; 
 
    ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 
 
    pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 
    return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);}static void *io_uring_validate_mmap_request(struct file *file, 
                        loff_t pgoff, size_t sz){ 
    struct io_ring_ctx *ctx = file->private_data; 
    loff_t offset = pgoff << PAGE_SHIFT; 
    struct page *page; 
    void *ptr; 
 
    switch (offset) { 
    case IORING_OFF_SQ_RING: 
    case IORING_OFF_CQ_RING: 
        ptr = ctx->rings; 
        break; 
    case IORING_OFF_SQES: 
        ptr = ctx->sq_sqes; 
        break; 
    default: 
        return ERR_PTR(-EINVAL); 
    } 
 
    page = virt_to_head_page(ptr); 
    if (sz > page_size(page)) 
        return ERR_PTR(-EINVAL); 
 
    return ptr; 
}

這裡的內容涉及到了複雜的內存管理。從代碼中我們大概知道，返回的地址分別是ctx->rings和ctx->sq_sqes，即我們操作mmap返回的虛擬地址時，映射到的內核數據結構是ctx的字段。這樣就完成了數據共享。最後形成的架構圖如下。

至此，分析就告一段落，io_uring的實現實在是復雜，需要反復閱讀和思考，才能慢慢理解和了解它的原理。

后記：io_uring作為新一代IO框架，未來應該會在各大軟件中使用，尤其是對性能有極高要求的服務器，所以是非常值得關注和學習的。

責任編輯：武曉燕來源：編程雜技

內核 IO Linux

成人免费xxxxx在线视频软件_久久精品久久久_亚洲国产精品久久久_天天色天天色_亚洲人成一区_欧美一级欧美三级在线观看

從內核看IO_Uring的實現（一）

1 分配一個io_rings結構體

2 設置io_uring_params

3 獲取文件描述符

4 從liburing庫看io_uring的使用