aboutsummaryrefslogtreecommitdiffstats
path: root/executor/common_linux.h
diff options
context:
space:
mode:
authorNecip Fazil Yildiran <necip@google.com>2020-07-24 19:51:04 +0300
committerGitHub <noreply@github.com>2020-07-24 18:51:04 +0200
commit0a13649c058c68a7707b22beb08b0806ce3a1d42 (patch)
treeb53ffe0c21213c04ec48ffb6d8c549a7e52d5bbe /executor/common_linux.h
parent83892305a53be0b3d5a1352d5329989265a5ba0b (diff)
sys/linux: enhanced descs for io_uring
* sys/linux: enhanced descs for io_uring Introduced pseudo-call "syz_io_uring_put_sqes_on_ring()" for writing submission queue entries (sqes) on sq_ring, which was obtained by mmap'ping the offsets obtained from io_uring_setup(). Added descriptions for io_ring_register operations that were missing earlier. Did misc changes to adapt the descriptions for the updates on the io_uring subsystem. * pkg/host: add io_uring pseudo-syscall * executor/common_linux.h: fix issues with io_uring pseudo-syscall * executor: fixed io_uring offset computation * executor: fixes and refactorings in syz_io_uring_submit() * executor: added syz_io_uring_complete() pseudo-syscall for io_uring * sys/linux: added descriptions for io_uring operations Each operation requires a different struct io_uring_sqe set up. Those are described to be submitted to the sq ring. * executor: use uint32 instead of uint32_t * executor: remove nonfailing from pseudo-calls * sys/linux: fix io_uring epoll_ctl sqe * prog: fix TestTransitivelyEnabledCallsLinux() The newly introduced syscall, syz_io_uring_submit$IORING_OP_EPOLL_CTL, uses fd_epoll. Adapt TestTransitivelyEnabledCallsLinux() to account for this. * sys/linux: add IORING_OP_PROVIDE_BUFFERS and IORING_OP_REMOVE_BUFFERS * sys/linux: fix IORING_OP_WRITE_FIXED and IORING_OP_READ_FIXED addr and len are for the buffer located at buf_index * sys/linux: io_uring: use reg. bufs for READ, READV, RECV, RECVMSG As a result, IOSQE_BUFFER_SELECT_BIT is included in the iosqe_flags. * sys/linux: io_uring: misc fixes * sys/linux: io_uring: add IORING_SETUP_ATTACH_WQ * executor: refactorings on io_uring pseudo syscalls * sys/linux: io_uring: fix desc for params.cq_entries * executor: fix SQ_ARRAY_OFFSET computation This is required with the fix in io_uring kernel code. 
https://lore.kernel.org/io-uring/CACT4Y+bgTCMXi3eU7xV+W0ZZNceZFUWRTkngojdr0G_yuY8w9w@mail.gmail.com/T/#t * executor: added pseudosyscall syz_io_uring_cq_eventfd_toggle() The usage of cq_ring->flags is only for manipulating the IORING_CQ_EVENTFD_DISABLED bit. This is achieved by a pseudo-syscall, which toggles the bit. * executor: added pseudocall syz_io_uring_put_ring_metadata Removed syz_io_uring_cq_eventfd_toggle() and introduced syz_io_uring_put_ring_metadata() instead. We have many pieces of metadata for both sq_ring and cq_ring, for which we are given the offsets, and some of them are not supposed to be manipulated by the application. Among them, both sq and cq flags can be changed. Both valid and invalid cases might cause interesting outcomes. Use the newly introduced pseudo syscall to manipulate them randomly while also manipulating the flags to their special values. * executor: added pseudo-syscall syz_memcpy_off Removed syz_io_uring_put_ring_metadata() and instead added a much more generic pseudo syscall to achieve the task. This should benefit other subsystems as well. * sys/linux: refactored io_uring descriptions syz_io_uring_submit() is called with a union of sqes to reduce duplication of other parameters of the function. io_uring_sqe is templated with io_uring_sqe_t, and this template type is used to describe sqes for different ops. The organization of io_uring.txt is changed. * sys/linux: io_uring: improved descs to utilize registered files The files are registered using io_uring_register$IORING_REGISTER_FILES(). When IOSQE_FIXED_FILE_BIT is enabled in iosqe_flags in sqe, a variety of operations can use those registered files using the index of the file instead of fd. Changed the sqe descriptions for the eligible operations to utilize this. * sys/linux: io_uring: improved the descs to utilize personality_id in sqes A personality_id can be registered for an io_uring fd using io_uring_register$IORING_REGISTER_PERSONALITY(). This id can be utilized within sqes.
This commit improves the descs for io_uring to utilize it. In addition, the descriptions for the misc field in io_uring_sqe_t are refactored as most are shared among sqes. * sys/linux: io_uring: utilized cqe.res io_uring_cqe.res is used to carry the return value of operations achieved through io_uring. The only operations with meaningful return values (in terms of their possible usage) are openat and openat2. The pseudo-syscall syz_io_uring_complete() is modified to account for this and return those fds. The description for sqe_user_data is split into two to identify openat and non-openat io_uring ops. IORING_OP_IOCTL was suggested but never supported in io_uring. Thus, the note on this is removed in the descriptions. tee() expects pipefds, thus, IORING_OP_TEE. The descriptions for the pipe r/w fds are written as ordinary fd. Thus, in the description for IORING_OP_TEE, which is io_uring_sqe_tee, fd is used in the place where pipefds are expected. The note on this is removed in the descriptions. * sys/linux/test: added test for io_uring This is not tested yet. * sys/linux/test: fixed the test for io_uring The changes successfully pass the sys/linux/test/io_uring test. sys/linux/io_uring.txt: sq_ring_ptr and cq_ring_ptr are really the same. Thus, they are replaced with ring_ptr. executor/common_linux.h: thanks to the io_uring test, a bug was found where the sq_array's address is computed in syz_io_uring_submit(). Fixed. In addition, similar to the descriptions, the naming for the ring_ptr is changed from {sq,cq}_ring_ptr to ring_ptr. * sys/linux: io_uring: misc fixes * sys/linux: io_uring: changed the sqe_user_data enum Used a smaller range to ease the collisions. Used comparatively unique and magic numbers for openat user_data to avoid thinking as if the cqe belongs to openat while the user_data is coming from some random location. * pkg/host: added checks for io_uring syscall * pkg/host: fixed checks for io_uring syscall * sys/linux: fixed io_uring test
Diffstat (limited to 'executor/common_linux.h')
-rw-r--r--executor/common_linux.h147
1 files changed, 147 insertions, 0 deletions
diff --git a/executor/common_linux.h b/executor/common_linux.h
index 324391196..546e60b79 100644
--- a/executor/common_linux.h
+++ b/executor/common_linux.h
@@ -1355,6 +1355,153 @@ static long syz_emit_ethernet(volatile long a0, volatile long a1, volatile long
}
#endif
+#if SYZ_EXECUTOR || __NR_syz_io_uring_submit || __NR_syz_io_uring_complete
+
// Sizes of the kernel's sqe/cqe structs (linux/io_uring.h) as laid out on
// the mmap'ed rings; hard-coded so no kernel headers are needed here.
#define SIZEOF_IO_URING_SQE 64
#define SIZEOF_IO_URING_CQE 16

// Once a io_uring is set up by calling io_uring_setup, the offsets to the member fields
// to be used on the mmap'ed area are set in structs io_sqring_offsets and io_cqring_offsets.
// Except io_sqring_offsets.array, the offsets are static while all depend on how struct io_rings
// is organized in code. The offsets can be marked as resources in syzkaller descriptions but
// this makes it difficult to generate correct programs by the fuzzer. Thus, the offsets are
// hard-coded here (and in the descriptions), and array offset is later computed once the number
// of entries is available. Another way to obtain the offsets is to setup another io_uring here
// and use what it returns. It is slower but might be more maintainable.
#define SQ_HEAD_OFFSET 0
#define SQ_TAIL_OFFSET 64
#define SQ_RING_MASK_OFFSET 256
#define SQ_RING_ENTRIES_OFFSET 264
#define SQ_FLAGS_OFFSET 276
#define SQ_DROPPED_OFFSET 272
#define CQ_HEAD_OFFSET 128
#define CQ_TAIL_OFFSET 192
#define CQ_RING_MASK_OFFSET 260
#define CQ_RING_ENTRIES_OFFSET 268
#define CQ_RING_OVERFLOW_OFFSET 284
#define CQ_FLAGS_OFFSET 280
#define CQ_CQES_OFFSET 320
// The sq_array follows the cqes area, rounded up to 64-byte alignment.
// sq_entries is currently unused but kept so call sites document both ring
// sizes. cq_entries is parenthesized so the macro stays correct if an
// expression (rather than a plain variable) is ever passed in.
#define SQ_ARRAY_OFFSET(sq_entries, cq_entries) (round_up(CQ_CQES_OFFSET + (cq_entries) * SIZEOF_IO_URING_CQE, 64))
+
+uint32 round_up(uint32 x, uint32 a)
+{
+ return (x + a - 1) & ~(a - 1);
+}
+
+#if SYZ_EXECUTOR || __NR_syz_io_uring_complete
+
// From linux/io_uring.h
// Layout of one completion queue entry on the cq ring; must match the
// kernel struct (its size is hard-coded above as SIZEOF_IO_URING_CQE).
struct io_uring_cqe {
	uint64 user_data; // sqe.user_data of the request this completion belongs to
	uint32 res; // result of the completed operation (e.g. the fd for openat/openat2)
	uint32 flags;
};
+
+static long syz_io_uring_complete(volatile long a0)
+{
+ // syzlang: syz_io_uring_complete(ring_ptr ring_ptr)
+ // C: syz_io_uring_complete(char* ring_ptr)
+
+ // It is not checked if the ring is empty
+
+ // Cast to original
+ char* ring_ptr = (char*)a0;
+
+ // Compute the head index and the next head value
+ uint32 cq_ring_mask = *(uint32*)(ring_ptr + CQ_RING_MASK_OFFSET);
+ uint32* cq_head_ptr = (uint32*)(ring_ptr + CQ_HEAD_OFFSET);
+ uint32 cq_head = *cq_head_ptr & cq_ring_mask;
+ uint32 cq_head_next = *cq_head_ptr + 1;
+
+ // Compute the ptr to the src cq entry on the ring
+ char* cqe_src = ring_ptr + CQ_CQES_OFFSET + cq_head * SIZEOF_IO_URING_CQE;
+
+ // Get the cq entry from the ring
+ struct io_uring_cqe cqe;
+ memcpy(&cqe, cqe_src, sizeof(cqe));
+
+ // Advance the head. Head is a free-flowing integer and relies on natural wrapping.
+ // Ensure that the kernel will never see a head update without the preceeding CQE
+ // stores being done.
+ __atomic_store_n(cq_head_ptr, cq_head_next, __ATOMIC_RELEASE);
+
+ // In the descriptions (sys/linux/io_uring.txt), openat and openat2 are passed
+ // with a unique range of sqe.user_data (0x12345 and 0x23456) to identify the operations
+ // which produces an fd instance. Check cqe.user_data, which should be the same
+ // as sqe.user_data for that operation. If it falls in that unique range, return
+ // cqe.res as fd. Otherwise, just return an invalid fd.
+ return (cqe.user_data == 0x12345 || cqe.user_data == 0x23456) ? (long)cqe.res : (long)-1;
+}
+
+#endif
+
+#if SYZ_EXECUTOR || __NR_syz_io_uring_submit
+
+static long syz_io_uring_submit(volatile long a0, volatile long a1, volatile long a2, volatile long a3)
+{
+ // syzlang: syz_io_uring_submit(ring_ptr ring_ptr, sqes_ptr sqes_ptr, sqe ptr[in, io_uring_sqe], sqes_index int32)
+ // C: syz_io_uring_submit(char* ring_ptr, io_uring_sqe* sqes_ptr, io_uring_sqe* sqe, uint32 sqes_index)
+
+ // It is not checked if the ring is full
+
+ // Cast to original
+ char* ring_ptr = (char*)a0; // This will be exposed to offsets in bytes
+ char* sqes_ptr = (char*)a1;
+ char* sqe = (char*)a2;
+ uint32 sqes_index = (uint32)a3;
+
+ uint32 sq_ring_entries = *(uint32*)(ring_ptr + SQ_RING_ENTRIES_OFFSET);
+ uint32 cq_ring_entries = *(uint32*)(ring_ptr + CQ_RING_ENTRIES_OFFSET);
+
+ // Compute the sq_array offset
+ uint32 sq_array_off = SQ_ARRAY_OFFSET(sq_ring_entries, cq_ring_entries);
+
+ // Get the ptr to the destination for the sqe
+ if (sq_ring_entries)
+ sqes_index %= sq_ring_entries;
+ char* sqe_dest = sqes_ptr + sqes_index * SIZEOF_IO_URING_SQE;
+
+ // Write the sqe entry to its destination in sqes
+ memcpy(sqe_dest, sqe, SIZEOF_IO_URING_SQE);
+
+ // Write the index to the sqe array
+ uint32 sq_ring_mask = *(uint32*)(ring_ptr + SQ_RING_MASK_OFFSET);
+ uint32* sq_tail_ptr = (uint32*)(ring_ptr + SQ_TAIL_OFFSET);
+ uint32 sq_tail = *sq_tail_ptr & sq_ring_mask;
+ uint32 sq_tail_next = *sq_tail_ptr + 1;
+ uint32* sq_array = (uint32*)(ring_ptr + sq_array_off);
+ *(sq_array + sq_tail) = sqes_index;
+
+ // Advance the tail. Tail is a free-flowing integer and relies on natural wrapping.
+ // Ensure that the kernel will never see a tail update without the preceeding SQE
+ // stores being done.
+ __atomic_store_n(sq_tail_ptr, sq_tail_next, __ATOMIC_RELEASE);
+
+ // Now the application is free to call io_uring_enter() to submit the sqe
+ return 0;
+}
+
+#endif
+
+#endif
+
+// Same as memcpy except that it accepts offset to dest and src.
+#if SYZ_EXECUTOR || __NR_syz_memcpy_off
+static long syz_memcpy_off(volatile long a0, volatile long a1, volatile long a2, volatile long a3, volatile long a4)
+{
+ // C: syz_memcpy_off(void* dest, uint32 dest_off, void* src, uint32 src_off, size_t n)
+
+ // Cast to original
+ char* dest = (char*)a0;
+ uint32 dest_off = (uint32)a1;
+ char* src = (char*)a2;
+ uint32 src_off = (uint32)a3;
+ size_t n = (size_t)a4;
+
+ return (long)memcpy(dest + dest_off, src + src_off, n);
+}
+#endif
+
#if SYZ_EXECUTOR || SYZ_REPEAT && SYZ_NET_INJECTION
static void flush_tun()
{