From 7882d3fbaaa722e5659621513a41d93fe4c04031 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 2 Mar 2022 10:45:53 -0500 Subject: [PATCH 1/7] Fix indentation in syscall_64.tbl --- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9618381a6f2447..4355a28f597160 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -366,7 +366,7 @@ 442 common mount_setattr sys_mount_setattr 444 common print_xrp_stats sys_print_xrp_stats -445 common read_xrp sys_read_xrp +445 common read_xrp sys_read_xrp 446 common test_xrp sys_test_xrp # From 190db942a261d3569b2c67cd7b605bc1cecf95b2 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 2 Mar 2022 10:46:02 -0500 Subject: [PATCH 2/7] Add syscalls to syscall_32.tbl --- arch/x86/entry/syscalls/syscall_32.tbl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index a1c9f496fca6a2..9e8c310266a91f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,3 +447,6 @@ 440 i386 process_madvise sys_process_madvise 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 i386 mount_setattr sys_mount_setattr +444 i386 print_xrp_stats sys_print_xrp_stats +445 i386 read_xrp sys_read_xrp +446 i386 test_xrp sys_test_xrp From 7d491a16bfc097e8e8715af58d1677ed68f11fce Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 2 Mar 2022 10:47:27 -0500 Subject: [PATCH 3/7] Move syscall asmlinkage declarations in syscalls.h - Group them together - Put them within the CONFIG_ARCH_HAS_SYSCALL_WRAPPER block --- include/linux/syscalls.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e3786408601c50..2280879ee6e4ed 100644 --- a/include/linux/syscalls.h +++ 
b/include/linux/syscalls.h @@ -507,8 +507,6 @@ asmlinkage long sys_writev(unsigned long fd, unsigned long vlen); asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos); -asmlinkage long sys_read_xrp(unsigned int fd, char __user *buf, - size_t count, loff_t pos, unsigned int bpf_fd, char __user *scratch_buf); asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, @@ -1258,6 +1256,10 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long fd, unsigned long pgoff); asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); +asmlinkage long sys_print_xrp_stats(struct xrp_stats __user *buf); +asmlinkage long sys_read_xrp(unsigned int fd, char __user *buf, + size_t count, loff_t pos, unsigned int bpf_fd, char __user *scratch_buf); +asmlinkage long sys_test_xrp(char __user *data_buf, char __user *scratch_buf, unsigned int bpf_fd); /* * Not a real system call, but a placeholder for syscalls which are @@ -1367,9 +1369,6 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout); -asmlinkage long sys_print_xrp_stats(struct xrp_stats __user *buf); -asmlinkage long sys_test_xrp(char __user *data_buf, char __user *scratch_buf, unsigned int bpf_fd); - int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen); int __sys_setsockopt(int fd, int level, int optname, char __user *optval, From 8f0ab33c222d8edc2e4dce1cae34d545d9d1af47 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 2 Mar 2022 10:50:41 -0500 Subject: [PATCH 4/7] Add syscalls to unistd.h and update __NR_syscalls - This is required to support ARM64 --- include/uapi/asm-generic/unistd.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h index ce58cff99b6653..c35381e983219e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -864,8 +864,15 @@ __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) +#define __NR_print_xrp_stats 444 +__SYSCALL(__NR_print_xrp_stats, sys_print_xrp_stats) +#define __NR_read_xrp 445 +__SYSCALL(__NR_read_xrp, sys_read_xrp) +#define __NR_test_xrp 446 +__SYSCALL(__NR_test_xrp, sys_test_xrp) + #undef __NR_syscalls -#define __NR_syscalls 443 +#define __NR_syscalls 447 /* * 32 bit systems traditionally used different From e16cf631df6fe582560b8998c967ca4a689752ea Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 4 May 2022 14:05:37 -0400 Subject: [PATCH 5/7] Initial XRP multi-file support --- drivers/nvme/host/pci.c | 49 +++++++++++++++++++++++++++++++++++---- fs/block_dev.c | 8 +++++-- fs/io_uring.c | 2 ++ fs/ioctl.c | 15 ++++++++++++ fs/iomap/direct-io.c | 4 +++- fs/read_write.c | 11 +++++---- include/linux/blk_types.h | 3 ++- include/linux/filter.h | 3 +++ include/linux/fs.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ 10 files changed, 88 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 431234643cc102..db1bdc2834f5f9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include #include "trace.h" #include "nvme.h" @@ -1066,6 +1069,11 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) struct bpf_xrp_kern ebpf_context; u32 ebpf_return; loff_t file_offset, data_len; + struct files_struct *files_struct; + struct file *file; + struct inode *inode; + s32 fd; + struct fdtable *fdt; u64 disk_offset; ktime_t ebpf_start; ktime_t resubmit_start = ktime_get(); @@ -1073,14 +1081,29 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) struct 
xrp_mapping mapping; ktime_t extent_lookup_start; + fd = req->bio->xrp_cur_fd; + files_struct = req->bio->xrp_fdtable; + fdt = files_fdtable(files_struct); + + if (!fd_is_open(fd, fdt)) { + printk("nvme_handle_cqe: bad file descriptor given %d, dump context\n", fd); + ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096); + if (!nvme_try_complete_req(req, cqe->status, cqe->result)) + nvme_pci_complete_rq(req); + return; + } + + file = get_file(files_lookup_fd_rcu(files_struct, fd)); + inode = file->f_inode; + /* verify version number */ if (req->bio->xrp_count > 1 - && req->bio->xrp_inode->i_op == &ext4_file_inode_operations) { + && inode->i_op == &ext4_file_inode_operations) { file_offset = req->bio->xrp_file_offset; data_len = 512; extent_lookup_start = ktime_get(); - xrp_retrieve_mapping(req->bio->xrp_inode, file_offset, data_len, &mapping); + xrp_retrieve_mapping(inode, file_offset, data_len, &mapping); atomic_long_add(ktime_sub(ktime_get(), extent_lookup_start), &xrp_extent_lookup_time); atomic_long_inc(&xrp_extent_lookup_count); if (!mapping.exist || mapping.len < data_len || mapping.address & 0x1ff) { @@ -1088,6 +1111,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096); if (!nvme_try_complete_req(req, cqe->status, cqe->result)) nvme_pci_complete_rq(req); + fput(file); return; } else if (mapping.version != req->bio->xrp_extent_version) { printk("nvme_handle_cqe: version mismatch with logical address 0x%llx (expected %lld, got %lld), dump context\n", @@ -1095,9 +1119,11 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096); if (!nvme_try_complete_req(req, cqe->status, cqe->result)) nvme_pci_complete_rq(req); + fput(file); return; } } + fput(file); memset(&ebpf_context, 0, sizeof(struct bpf_xrp_kern)); ebpf_context.data = page_address(bio_page(req->bio)); @@ -1132,13 +1158,26 @@ static inline void 
nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) return; } /* address mapping */ + fd = ebpf_context.fd_arr[0]; file_offset = ebpf_context.next_addr[0]; data_len = 512; // FIXME: support variable data_len and more than one next_addr req->bio->xrp_file_offset = file_offset; - if (req->bio->xrp_inode->i_op == &ext4_file_inode_operations) { + + if (!fd_is_open(fd, fdt)) { + printk("nvme_handle_cqe: bad file descriptor given %d, dump context\n", fd); + ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096); + if (!nvme_try_complete_req(req, cqe->status, cqe->result)) + nvme_pci_complete_rq(req); + return; + } + + file = get_file(files_lookup_fd_rcu(files_struct, fd)); + inode = file->f_inode; + + if (inode->i_op == &ext4_file_inode_operations) { extent_lookup_start = ktime_get(); - xrp_retrieve_mapping(req->bio->xrp_inode, file_offset, data_len, &mapping); + xrp_retrieve_mapping(inode, file_offset, data_len, &mapping); atomic_long_add(ktime_sub(ktime_get(), extent_lookup_start), &xrp_extent_lookup_time); atomic_long_inc(&xrp_extent_lookup_count); if (!mapping.exist || mapping.len < data_len || mapping.address & 0x1ff) { @@ -1146,6 +1185,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096); if (!nvme_try_complete_req(req, cqe->status, cqe->result)) nvme_pci_complete_rq(req); + fput(file); return; } else { req->bio->xrp_extent_version = mapping.version; @@ -1155,6 +1195,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) /* no address translation, use direct map */ disk_offset = file_offset; } + fput(file); nvme_req(req)->cmd = req->xrp_command; req->bio->xrp_count += 1; req->bio->bi_iter.bi_sector = (disk_offset >> 9) + req->bio->xrp_partition_start_sector; diff --git a/fs/block_dev.c b/fs/block_dev.c index 17ec21e375eebf..48e748f9715ef4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -268,9 +268,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter 
*iter, ret = bio.bi_iter.bi_size; bio.xrp_enabled = iocb->xrp_enabled; - bio.xrp_inode = file->f_inode; bio.xrp_partition_start_sector = 0; bio.xrp_count = 1; + bio.xrp_fdtable = current->files; // TODO: investigate locking required + bio.xrp_cur_fd = iocb->xrp_cur_fd; + bio.xrp_file_offset = iocb->xrp_file_offset; if (bio.xrp_enabled) { if (get_user_pages_fast(iocb->xrp_scratch_buf, 1, FOLL_WRITE, &bio.xrp_scratch_page) != 1) { printk("__blkdev_direct_IO_simple: failed to get scratch page\n"); @@ -458,9 +460,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } bio->xrp_enabled = iocb->xrp_enabled; - bio->xrp_inode = file->f_inode; bio->xrp_partition_start_sector = 0; bio->xrp_count = 1; + bio->xrp_fdtable = current->files; + bio->xrp_cur_fd = iocb->xrp_cur_fd; + bio->xrp_file_offset = iocb->xrp_file_offset; if (bio->xrp_enabled) { if (get_user_pages_fast(iocb->xrp_scratch_buf, 1, FOLL_WRITE, &bio->xrp_scratch_page) != 1) { printk("__blkdev_direct_IO: failed to get scratch page\n"); diff --git a/fs/io_uring.c b/fs/io_uring.c index 79ba34044a21ac..55b73ebf37f21e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2743,6 +2743,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) kiocb->xrp_enabled = true; kiocb->xrp_scratch_buf = (char __user *) sqe->scratch; kiocb->xrp_bpf_fd = (unsigned int) sqe->bpf_fd; + kiocb->xrp_cur_fd = (unsigned int) sqe->fd; + kiocb->xrp_file_offset = sqe->off; } req->rw.addr = READ_ONCE(sqe->addr); diff --git a/fs/ioctl.c b/fs/ioctl.c index 8dae8fb02bddfe..ca26c05ec5a36b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -949,6 +949,21 @@ static bool xrp_is_valid_access(int off, int size, enum bpf_access_type type, co info->reg_type = PTR_TO_MEM; info->mem_size = PAGE_SIZE; break; + case bpf_ctx_range(struct bpf_xrp, fd_arr): + size_of_field = sizeof_field(struct bpf_xrp, fd_arr); + if (!bpf_ctx_narrow_access_ok(off, size, size_of_field)) + return false; + break; + case 
bpf_ctx_range(struct bpf_xrp, cur_addr): + size_of_field = sizeof_field(struct bpf_xrp, cur_addr); + if (type != BPF_READ || !bpf_ctx_narrow_access_ok(off, size, size_of_field)) + return false; + break; + case bpf_ctx_range(struct bpf_xrp, cur_fd): + size_of_field = sizeof_field(struct bpf_xrp, cur_fd); + if (type != BPF_READ || !bpf_ctx_narrow_access_ok(off, size, size_of_field)) + return false; + break; default: return false; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index d6ce8118562491..4a26dbda9fa513 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -334,9 +334,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } bio->xrp_enabled = dio->iocb->xrp_enabled; - bio->xrp_inode = dio->iocb->ki_filp->f_inode; bio->xrp_partition_start_sector = 0; bio->xrp_count = 1; + bio->xrp_fdtable = current->files; + bio->xrp_cur_fd = dio->iocb->xrp_cur_fd; + bio->xrp_file_offset = dio->iocb->xrp_file_offset; if (bio->xrp_enabled) { if (get_user_pages_fast(dio->iocb->xrp_scratch_buf, 1, FOLL_WRITE, &bio->xrp_scratch_page) != 1) { printk("iomap_dio_bio_actor: failed to get scratch page\n"); diff --git a/fs/read_write.c b/fs/read_write.c index 75a538138b5e7d..d0e61addf59121 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -419,7 +419,8 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo return ret; } -static ssize_t new_sync_read_xrp(struct file *filp, char __user *data_buf, size_t len, loff_t *ppos, unsigned int bpf_fd, char __user *scratch_buf) +static ssize_t new_sync_read_xrp(struct file *filp, unsigned int fd, char __user *data_buf, size_t len, loff_t *ppos, + unsigned int bpf_fd, char __user *scratch_buf) { struct iovec iov = { .iov_base = data_buf, .iov_len = len }; struct kiocb kiocb; @@ -431,6 +432,8 @@ static ssize_t new_sync_read_xrp(struct file *filp, char __user *data_buf, size_ kiocb.xrp_enabled = true; kiocb.xrp_scratch_buf = scratch_buf; kiocb.xrp_bpf_fd = bpf_fd; + 
kiocb.xrp_cur_fd = fd; + kiocb.xrp_file_offset = *ppos; iov_iter_init(&iter, READ, &iov, 1, len); ret = call_read_iter(filp, &kiocb, &iter); @@ -525,7 +528,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return ret; } -ssize_t vfs_read_xrp(struct file *file, char __user *data_buf, size_t count, loff_t *pos, unsigned int bpf_fd, char __user *scratch_buf) +ssize_t vfs_read_xrp(struct file *file, unsigned int fd, char __user *data_buf, size_t count, loff_t *pos, unsigned int bpf_fd, char __user *scratch_buf) { ssize_t ret; @@ -547,7 +550,7 @@ ssize_t vfs_read_xrp(struct file *file, char __user *data_buf, size_t count, lof if (file->f_op->read) ret = file->f_op->read(file, data_buf, count, pos); else if (file->f_op->read_iter) - ret = new_sync_read_xrp(file, data_buf, count, pos, bpf_fd, scratch_buf); + ret = new_sync_read_xrp(file, fd, data_buf, count, pos, bpf_fd, scratch_buf); else ret = -EINVAL; if (ret > 0) { @@ -757,7 +760,7 @@ ssize_t ksys_read_xrp(unsigned int fd, char __user *data_buf, if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) - ret = vfs_read_xrp(f.file, data_buf, count, &pos, bpf_fd, scratch_buf); + ret = vfs_read_xrp(f.file, fd, data_buf, count, &pos, bpf_fd, scratch_buf); fdput(f); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 66057266501358..6d8e87278af37b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -277,13 +277,14 @@ struct bio { struct bio_set *bi_pool; bool xrp_enabled; - struct inode *xrp_inode; u64 xrp_partition_start_sector; int xrp_count; struct page *xrp_scratch_page; struct bpf_prog *xrp_bpf_prog; u64 xrp_extent_version; loff_t xrp_file_offset; + struct files_struct *xrp_fdtable; + s32 xrp_cur_fd; /* * We can inline a number of vecs at the end of the bio, to avoid diff --git a/include/linux/filter.h b/include/linux/filter.h index a9270137f1168a..24200632a58bf0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ 
-1478,6 +1478,9 @@ struct bpf_xrp_kern { uint64_t size[16]; char *data; char *scratch; + __s32 fd_arr[16]; + __u64 cur_addr; + __s32 cur_fd; }; struct xrp_stats { diff --git a/include/linux/fs.h b/include/linux/fs.h index 4c6971af0ad156..1bac9b327c435e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -324,6 +324,8 @@ struct kiocb { bool xrp_enabled; char __user *xrp_scratch_buf; unsigned int xrp_bpf_fd; + unsigned int xrp_cur_fd; + unsigned long xrp_file_offset; /* The 'ki_filp' pointer is shared in a union for aio */ randomized_struct_fields_start diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8e54088c6d662e..f6279ce8fd0de8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5261,6 +5261,9 @@ struct bpf_xrp { __u64 size[16]; char *data; char *scratch; + __s32 fd_arr[16]; + __u64 cur_addr; + __s32 cur_fd; }; #endif /* _UAPI__LINUX_BPF_H__ */ From 88bacc45beaf52b74019fb70af5d4d2309939556 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 4 May 2022 14:08:49 -0400 Subject: [PATCH 6/7] Update cur_fd after bpf program runs --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index db1bdc2834f5f9..74c575d9f109de 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1163,6 +1163,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) data_len = 512; // FIXME: support variable data_len and more than one next_addr req->bio->xrp_file_offset = file_offset; + req->bio->xrp_cur_fd = fd; if (!fd_is_open(fd, fdt)) { printk("nvme_handle_cqe: bad file descriptor given %d, dump context\n", fd); From 2eca5ef61f80cabe98837520e2d75e55457d10cf Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Tue, 17 May 2022 23:16:02 -0400 Subject: [PATCH 7/7] Initialize cur_addr and cur_fd in nvme_handle_cqe --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c 
b/drivers/nvme/host/pci.c index 74c575d9f109de..ee1262a10ed326 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1128,6 +1128,8 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) memset(&ebpf_context, 0, sizeof(struct bpf_xrp_kern)); ebpf_context.data = page_address(bio_page(req->bio)); ebpf_context.scratch = page_address(req->bio->xrp_scratch_page); + ebpf_context.cur_addr = req->bio->xrp_file_offset; + ebpf_context.cur_fd = req->bio->xrp_cur_fd; ebpf_start = ktime_get(); ebpf_prog = req->bio->xrp_bpf_prog; ebpf_return = BPF_PROG_RUN(ebpf_prog, &ebpf_context);