kernel小记

版本相关

过时

prepare_kernel_cred(NULL)

过时版本:6.2

自从内核版本 6.2 起,prepare_kernel_cred(NULL)不再拷贝 init_cred,而是将其视为一个运行时错误并返回 NULL,这使得这种提权方法无法再应用于 6.2 及更高版本的内核:

1
2
3
4
5
6
7
8
9
10
11
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
const struct cred *old;
struct cred *new;

if (WARN_ON_ONCE(!daemon))
return NULL;

new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;

UAF cred_jar

过时版本:4.5

从4.5版本开始,我们已无法直接分配到 cred_jar 中的 object,这是因为 cred_jar 在创建时设置了 SLAB_ACCOUNT 标记,在 CONFIG_MEMCG_KMEM=y 时(默认开启)cred_jar 不会再与相同大小的 kmalloc-192 进行合并

1
2
3
4
5
6
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}

pt_regs

过时版本:5.13

在5.13,内核入栈时添加了一个偏移值,这意味着 pt_regs 与我们触发劫持内核执行流时的栈间偏移值不再是固定值

1
2
3
4
5
6
7
8
9
10
11
12
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 4efd39aacb9f2..7b2542b13ebd9 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -38,6 +38,7 @@
#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
+ add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);

instrumentation_begin();

userfaultfd

过时版本:5.11

在大概5.11版本前后,由于变量sysctl_unprivileged_userfaultfd不再被初识赋值为1,使得userfaultfd不再能被非root用户使用

1
2
3
4
//before
static int sysctl_unprivileged_userfaultfd __read_mostly=1;
//now:
static int sysctl_unprivileged_userfaultfd __read_mostly;

引入

KPTI

Linux 4.15 中引入了 KPTI 机制,并且该机制被反向移植到了 Linux 4.14.11,4.9.75,4.4.110

结构体

tty_struct

申请obj大小:1k

GFP_KERNEL_ACCOUNT:是

功能:.text泄露|直接映射区泄露|劫持流


打开一个 tty 设备文件时,内核最终会调用 alloc_tty_struct() 来分配一个 tty_struct 结构体

一般情况下,我们选择打开/dev/ptmx文件来分配一个tty_struct结构体

关闭该文件即可释放

1
2
fd=open("/dev/ptmx");//分配
close(fd);//释放

该结构体定义于 include/linux/tty.h 中,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/**
* struct tty_struct - state associated with a tty while open
*
* @flow.lock: lock for flow members
* @flow.stopped: tty stopped/started by tty_stop/tty_start
* @flow.tco_stopped: tty stopped/started by TCOOFF/TCOON ioctls (it has
* precedense over @flow.stopped)
* @flow.unused: alignment for Alpha, so that no members other than @flow.* are
* modified by the same 64b word store. The @flow's __aligned is
* there for the very same reason.
* @ctrl.lock: lock for ctrl members
* @ctrl.pgrp: process group of this tty (setpgrp(2))
* @ctrl.session: session of this tty (setsid(2)). Writes are protected by both
* @ctrl.lock and legacy mutex, readers must use at least one of
* them.
* @ctrl.pktstatus: packet mode status (bitwise OR of TIOCPKT_* constants)
* @ctrl.packet: packet mode enabled
*
* All of the state associated with a tty while the tty is open. Persistent
* storage for tty devices is referenced here as @port in struct tty_port.
*/
struct tty_struct {
int magic;//掩码0x5401
struct kref kref;
struct device *dev; /* class device or NULL (e.g. ptys, serdev) */
struct tty_driver *driver;
const struct tty_operations *ops;
int index;

/* Protects ldisc changes: Lock tty not pty */
struct ld_semaphore ldisc_sem;
struct tty_ldisc *ldisc;

struct mutex atomic_write_lock;
struct mutex legacy_mutex;
struct mutex throttle_mutex;
struct rw_semaphore termios_rwsem;
struct mutex winsize_mutex;
/* Termios values are protected by the termios rwsem */
struct ktermios termios, termios_locked;
char name[64];
unsigned long flags;
int count;
struct winsize winsize; /* winsize_mutex */

struct {
spinlock_t lock;
bool stopped;
bool tco_stopped;
unsigned long unused[0];
} __aligned(sizeof(unsigned long)) flow;

struct {
spinlock_t lock;
struct pid *pgrp;
struct pid *session;
unsigned char pktstatus;
bool packet;
unsigned long unused[0];
} __aligned(sizeof(unsigned long)) ctrl;

int hw_stopped;
unsigned int receive_room; /* Bytes free for queue */
int flow_change;

struct tty_struct *link;
struct fasync_struct *fasync;
wait_queue_head_t write_wait;
wait_queue_head_t read_wait;
struct work_struct hangup_work;
void *disc_data;
void *driver_data;
spinlock_t files_lock; /* protects tty_files list */
struct list_head tty_files;

#define N_TTY_BUF_SIZE 4096

int closing;
unsigned char *write_buf;
int write_cnt;
/* If the tty has a pending do_SAK, queue it here - akpm */
struct work_struct SAK_work;
struct tty_port *port;
} __randomize_layout;

/* Each of a tty's open files has private_data pointing to tty_file_private */
struct tty_file_private {
struct tty_struct *tty;
struct file *file;
struct list_head list;
};

/* tty magic number */
#define TTY_MAGIC 0x5401

tty_struct中ops对应的tty_operations结构体,定义于/include/linux/tty_driver.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
struct tty_operations {
struct tty_struct * (*lookup)(struct tty_driver *driver,
struct file *filp, int idx);
int (*install)(struct tty_driver *driver, struct tty_struct *tty);
void (*remove)(struct tty_driver *driver, struct tty_struct *tty);
int (*open)(struct tty_struct * tty, struct file * filp);
void (*close)(struct tty_struct * tty, struct file * filp);
void (*shutdown)(struct tty_struct *tty);
void (*cleanup)(struct tty_struct *tty);
int (*write)(struct tty_struct * tty,
const unsigned char *buf, int count);
int (*put_char)(struct tty_struct *tty, unsigned char ch);
void (*flush_chars)(struct tty_struct *tty);
unsigned int (*write_room)(struct tty_struct *tty);
unsigned int (*chars_in_buffer)(struct tty_struct *tty);
int (*ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
long (*compat_ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
void (*throttle)(struct tty_struct * tty);
void (*unthrottle)(struct tty_struct * tty);
void (*stop)(struct tty_struct *tty);
void (*start)(struct tty_struct *tty);
void (*hangup)(struct tty_struct *tty);
int (*break_ctl)(struct tty_struct *tty, int state);
void (*flush_buffer)(struct tty_struct *tty);
void (*set_ldisc)(struct tty_struct *tty);
void (*wait_until_sent)(struct tty_struct *tty, int timeout);
void (*send_xchar)(struct tty_struct *tty, char ch);
int (*tiocmget)(struct tty_struct *tty);
int (*tiocmset)(struct tty_struct *tty,
unsigned int set, unsigned int clear);
int (*resize)(struct tty_struct *tty, struct winsize *ws);
int (*get_icount)(struct tty_struct *tty,
struct serial_icounter_struct *icount);
int (*get_serial)(struct tty_struct *tty, struct serial_struct *p);
int (*set_serial)(struct tty_struct *tty, struct serial_struct *p);
void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m);
#ifdef CONFIG_CONSOLE_POLL
int (*poll_init)(struct tty_driver *driver, int line, char *options);
int (*poll_get_char)(struct tty_driver *driver, int line);
void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
#endif
int (*proc_show)(struct seq_file *, void *);
} __randomize_layout;

seq_operations

申请obj大小:32

GFP_KERNEL_ACCOUNT:是

功能:.text泄露|劫持流

当打开/proc/id/stat文件时,会分配一个seq_operations结构体,定义如下

1
2
3
4
5
6
struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};

只包含四个函数指针,显然可以用来劫持执行流和泄露地址

一般选择打开/proc/self/stat分配该结构体,同样关闭即可释放

1
2
fd=open("/proc/self/stat");//分配
close(fd);//释放

当我们 read 一个 stat 文件时,内核会调用其 proc_ops 的 proc_read_iter 指针,其默认值为 seq_read_iter() 函数,定义于 fs/seq_file.c 中,注意到有如下逻辑:

1
2
3
4
5
6
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct seq_file *m = iocb->ki_filp->private_data;
//...
p = m->op->start(m, &m->index);
//...

即其会调用 seq_operations 中的 start 函数指针,那么我们只需要控制 seq_operations->start 后再读取对应 stat 文件便能控制内核执行流

ldt_struct

申请obj大小:32[slab]||16[slub]

GFP_KERNEL_ACCOUNT:否

功能:任意读|任意写

ldt 即局部段描述符表(Local Descriptor Table),其中存放着进程的段描述符,段寄存器当中存放着的段选择子便是段描述符表中段描述符的索引

定义如下(/arch/x86/include/asm/mmu_context.h)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
* ldt_structs can be allocated, used, and freed, but they are never
* modified while live.
*/
struct ldt_struct {
/*
* Xen requires page-aligned LDTs with special permissions. This is
* needed to prevent us from installing evil descriptors such as
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int nr_entries;

/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the addressed
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
};

结构体大小是0x10,slub中会在kmalloc-16申请,slab则会在kmalloc-32申请

entries指向一个数组

nr_entries记录着数组的数量

struct desc_struct即使段描述符,定义如下(/arch/x86/include/asm/desc_defs.h),暂时不管他

1
2
3
4
5
6
7
/* 8 byte segment descriptor */
struct desc_struct {
u16 limit0;
u16 base0;
u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1;
u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
} __attribute__((packed));

Linux 提供 modify_ldt 系统调用,通过该系统调用可以获取或修改当前进程的 LDT

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
unsigned long , bytecount)
{
int ret = -ENOSYS;

switch (func) {
case 0:
ret = read_ldt(ptr, bytecount);
break;
case 1:
ret = write_ldt(ptr, bytecount, 1);
break;
case 2:
ret = read_default_ldt(ptr, bytecount);
break;
case 0x11:
ret = write_ldt(ptr, bytecount, 0);
break;
}
/*
* The SYSCALL_DEFINE() macros give us an 'unsigned long'
* return type, but tht ABI for sys_modify_ldt() expects
* 'int'. This cast gives us an int-sized value in %rax
* for the return code. The 'unsigned' is necessary so
* the compiler does not try to sign-extend the negative
* return codes into the high half of the register when
* taking the value from int->long.
*/
return (unsigned int)ret;
}

read_ldt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
unsigned long entries_size;
int retval;

down_read(&mm->context.ldt_usr_sem);

if (!mm->context.ldt) {
retval = 0;
goto out_unlock;
}

if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;

entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
if (entries_size > bytecount)
entries_size = bytecount;

if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}

if (entries_size != bytecount) {
/* Zero-fill the rest and pretend we read bytecount bytes. */
if (clear_user(ptr + entries_size, bytecount - entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
}
retval = bytecount;

out_unlock:
up_read(&mm->context.ldt_usr_sem);
return retval;
}

其中两个常量宏的定义如下

1
2
3
4
/* Maximum number of LDT entries supported. */
#define LDT_ENTRIES 8192
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8

重点看

1
2
3
4
if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}

如果我们能够修改ldt结构的entries结构,便能够做到任意读

并且copy_from_user和copy_to_user的返回值均是未成功copy的数量,可以以此判断是否命中

大范围搜索内存

不过就算read_ldt能够帮助我们搜索内存,但是仍然无法完全避免hardened usercopy的影响

但观察 fork 系统调用的源码,我们可以发现如下执行链:

1
2
3
4
5
6
7
8
sys_fork()
kernel_clone()
copy_process()
copy_mm()
dup_mm()
dup_mmap()
arch_dup_mmap()
ldt_dup_context()

ldt_dup_context() 定义于 arch/x86/kernel/ldt.c 中,逻辑如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
* Called on fork from arch_dup_mmap(). Just copy the current LDT state,
* the new task is not running, so nothing can be installed.
*/
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
//...

memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->nr_entries * LDT_ENTRY_SIZE);

//...
}

其中new_ldt->nr_entriesold_ldt->nr_entries赋值

在这里会通过 memcpy 将父进程的 ldt->entries 拷贝给子进程,是完全处在内核中的操作,因此能够绕过hardened usercopy的检查

当父进程设置目标地址后,再打开子进程,便会将目标地址处的内容复制到子进程的ldt中,之后再使用read_ldt便能够直接读取

write_ldt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct ldt_struct *new_ldt, *old_ldt;
unsigned int old_nr_entries, new_nr_entries;
struct user_desc ldt_info;
struct desc_struct ldt;
int error;

error = -EINVAL;
if (bytecount != sizeof(ldt_info))
goto out;
error = -EFAULT;
if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
goto out;

error = -EINVAL;
if (ldt_info.entry_number >= LDT_ENTRIES)
goto out;
if (ldt_info.contents == 3) {
if (oldmode)
goto out;
if (ldt_info.seg_not_present == 0)
goto out;
}

if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
LDT_empty(&ldt_info)) {
/* The user wants to clear the entry. */
memset(&ldt, 0, sizeof(ldt));
} else {
if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
error = -EINVAL;
goto out;
}

fill_ldt(&ldt, &ldt_info);
if (oldmode)
ldt.avl = 0;
}

if (down_write_killable(&mm->context.ldt_usr_sem))
return -EINTR;

old_ldt = mm->context.ldt;
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);

error = -ENOMEM;
new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt)
goto out_unlock;

if (old_ldt)
memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);

new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);

/*
* If we are using PTI, map the new LDT into the userspace pagetables.
* If there is already an LDT, use the other slot so that other CPUs
* will continue to use the old LDT until install_ldt() switches
* them over to the new LDT.
*/
error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
if (error) {
/*
* This only can fail for the first LDT setup. If an LDT is
* already installed then the PTE page is already
* populated. Mop up a half populated page table.
*/
if (!WARN_ON_ONCE(old_ldt))
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}

install_ldt(mm, new_ldt);
unmap_ldt_struct(mm, old_ldt);
free_ldt_struct(old_ldt);
error = 0;

out_unlock:
up_write(&mm->context.ldt_usr_sem);
out:
return error;
}

我们主要关注

1
2
3
4
5
6
new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt)
goto out_unlock;
if (old_ldt)
memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
new_ldt->entries[ldt_info.entry_number] = ldt;

new_ldt是新申请出来的object,在alloc之后memcpy之前有一个窗口期,若是我么能够在这期间竞争修改new_ldt->entries,那么便能够做到任意写,不过这个窗口期比较短,实际运用成功率较低

但我们还可以注意到new_ldt->entries[ldt_info.entry_number] = ldt;这一句,memcpy函数拷贝的长度是old_nr_entries * LDT_ENTRY_SIZE

这个数据量相对较大,那么如果能够在memcpy函数执行的过程中通过竞争修改new_ldt->entries,也能够做到小范围的任意写

至于任意写的值其实也并不是完全受我们控制,不过可以根据我们传入的结构体,在一定程度上进行控制(具体可以看这个函数的完整流程)

我们需要传入的结构体的定义如下,新的ldt一定程度上受这个结构体控制,可以根据要求更改

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
* Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
* not to the default values if you still want to do syscalls. This
* call is more for 32bit mode therefore.
*/
struct user_desc {
unsigned int entry_number;
unsigned int base_addr;
unsigned int limit;
unsigned int seg_32bit:1;
unsigned int contents:2;
unsigned int read_exec_only:1;
unsigned int limit_in_pages:1;
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
/*
* Because this bit is not present in 32-bit user code, user
* programs can pass uninitialized values here. Therefore, in
* any context in which a user_desc comes from a 32-bit program,
* the kernel must act as though lm == 0, regardless of the
* actual value.
*/
unsigned int lm:1;
#endif
};

setxattr

申请obj大小:任意

GFP_KERNEL_ACCOUNT:否

功能:堆占位

观察 setxattr 源码,发现如下调用链:

1
2
3
SYS_setxattr()
path_setxattr()
setxattr()

setxattr() 函数中有如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
static long
setxattr(struct dentry *d, const char __user *name, const void __user *value,
size_t size, int flags)
{
//...
kvalue = kvmalloc(size, GFP_KERNEL);
if (!kvalue)
return -ENOMEM;
if (copy_from_user(kvalue, value, size)) {

//,..

kvfree(kvalue);

return error;
}

这里的 value 和 size 都是由我们来指定的,即我们可以分配任意大小的 object 并向其中写入内容,之后该对象会被释放掉

使用时按照以下格式,其中第一个字符串需要是本进程的文件路径,第二个字符串任意

1
setxattr("/tmp/exp", "abcdefg", &buf,len,0);

通常需要配合 userfaultfd 系统调用完成进一步的利用

shm_file_data

申请obj大小:32

GFP_KERNEL_ACCOUNT:否

功能:.text泄露

这个结构体主要是用于泄露内核基址的

1
2
3
4
5
6
struct shm_file_data {
int id;
struct ipc_namespace *ns;
struct file *file;
const struct vm_operations_struct *vm_ops;
};

大小为0x20,从kmalloc-32中分配

其中的 ns字段和vm_ops字段皆指向内核的.text段中

file字段位于内核线性映射区,能够泄露内核堆地址

有四个相关函数shmgetshmatshmctlshmdt

分配

使用 shmget 系统调用可以获得一个共享内存对象,随后要使用 shmat 系统调用将共享内存对象映射到进程的地址空间

shmget

1
2
3
4
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
return ksys_shmget(key, size, shmflg);
}

一般这样调用,key可以是任意整型,返回一个shmid

shm_id = shmget(114514, 0x1000, SHM_R | SHM_W | IPC_CREAT);

shmat

1
2
3
4
5
6
7
8
9
10
11
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
unsigned long ret;
long err;

err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
if (err)
return err;
force_successful_syscall_return();
return (long)ret;
}

在do_shmat中会分配一个shm_file_data结构体

1
2
3
4
5
6
7
8
9
10
11
12
long do_shmat(int shmid, char __user *shmaddr, int shmflg,
ulong *raddr, unsigned long shmlba)
{
//...

struct shm_file_data *sfd;

//...

sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
//...
file->private_data = sfd;

释放

shmdt 系统调用用以断开与共享内存对象的连接,观察其源码,发现其会调用 ksys_shmdt() 函数,注意到如下调用链:

1
2
3
4
5
SYS_shmdt()
ksys_shmdt()
do_munmap()
remove_vma_list()
remove_vma()

其中有着这样一条代码:

1
2
3
4
5
6
7
8
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *next = vma->vm_next;

might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
//...

在这里调用了该 vma 的 vm_ops 对应的 close 函数,我们将目光重新放回共享内存对应的 vma 的初始化的流程当中,在 shmat() 中注意到如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
long do_shmat(int shmid, char __user *shmaddr, int shmflg,
ulong *raddr, unsigned long shmlba)
{
//...
sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
if (!sfd) {
fput(base);
goto out_nattch;
}

file = alloc_file_clone(base, f_flags,
is_file_hugepages(base) ?
&shm_file_operations_huge :
&shm_file_operations);

在这里调用了 alloc_file_clone() 函数,其会调用 alloc_file() 函数将第三个参数赋值给新的 file 结构体的 f_op 域,在这里是 shm_file_operationsshm_file_operations_huge,定义于 /ipc/shm.c 中,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
static const struct file_operations shm_file_operations = {
.mmap = shm_mmap,
.fsync = shm_fsync,
.release = shm_release,
.get_unmapped_area = shm_get_unmapped_area,
.llseek = noop_llseek,
.fallocate = shm_fallocate,
};

/*
* shm_file_operations_huge is now identical to shm_file_operations,
* but we keep it distinct for the sake of is_file_shm_hugepages().
*/
static const struct file_operations shm_file_operations_huge = {
.mmap = shm_mmap,
.fsync = shm_fsync,
.release = shm_release,
.get_unmapped_area = shm_get_unmapped_area,
.llseek = noop_llseek,
.fallocate = shm_fallocate,
};

在这里对于关闭 shm 文件,对应的是 shm_release 函数,如下:

1
2
3
4
5
6
7
8
9
10
static int shm_release(struct inode *ino, struct file *file)
{
struct shm_file_data *sfd = shm_file_data(file);

put_ipc_ns(sfd->ns);
fput(sfd->file);
shm_file_data(file) = NULL;
kfree(sfd);
return 0;
}

即当我们进行 shmdt 系统调用时便可以释放 shm_file_data 结构体

msg

申请obj大小:64-4k

GFP_KERNEL_ACCOUNT:是

功能:任意读|任意写|堆喷

在 Linux kernel 中有着一组 system V 消息队列相关的系统调用:

  • msgget:创建一个消息队列
  • msgsnd:向指定消息队列发送消息
  • msgrcv:从指定消息队列接接收消息

msgget

其中msgget用于创建一个消息队列时

1
int msgget(key_t key, int msgflag)
  • key:值为函数ftok的返回值或IPC_PRIVATE,若为IPC_PRIVATE则直接创建新的消息队列
  • msgflag:IPC_CREAT:创建新的消息队列。 IPC_EXCL:与IPC_CREAT一同使用,表示如果要创建的消息队列已经存在,则返回错误。(IPC_EXCL没有什么实质性的意义,但是可以帮我们确定是新建了消息队列而不是返回已经存在的消息队列) IPC_NOWAIT:读写消息队列要求无法满足时,不阻塞。返回值: 调用成功返回队列标识符,否则返回-1. 其中该参数需要配合权限控制符,例如0666|IPC_CREAT

在内核空间中会创建一个 msg_queue 结构体,其表示一个消息队列:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/* one msq_queue structure for each present queue on the system */
struct msg_queue {
struct kern_ipc_perm q_perm;
time64_t q_stime; /* last msgsnd time */
time64_t q_rtime; /* last msgrcv time */
time64_t q_ctime; /* last change time */
unsigned long q_cbytes; /* current number of bytes on queue */
unsigned long q_qnum; /* number of messages in queue */
unsigned long q_qbytes; /* max number of bytes on queue */
struct pid *q_lspid; /* pid of last msgsnd */
struct pid *q_lrpid; /* last receive pid */

struct list_head q_messages;
struct list_head q_receivers;
struct list_head q_senders;
} __randomize_layout;

msgsnd

1
int msgsnd(int msqid, const void *msgp, size_t msgsz, int msgflg)
  • msqid:队列标识符
  • msgp:指向发送的消息,消息的前八个字节必须是msgtyp(值可以自定义),后面跟真正的消息
  • msgsz:真正的消息长度
  • msgflg:标志位
    • 0:消息队列满时,msgsnd阻塞直到消息能够写入消息队列
    • IPC_NOWAIT:消息队列满时不等待立即返回
    • IPC_NOERROR:若发送的消息长度大于msgsz,则截断

msgsnd 系统调用在指定消息队列上发送一条指定大小的 message 时,会建立msg_msg结构体。

1
2
3
4
5
6
7
8
9
/* one msg_msg structure for each message */
struct msg_msg {
struct list_head m_list;
long m_type;
size_t m_ts; /* message text size */
struct msg_msgseg *next;
void *security;
/* the actual message follows immediately */
};

在内核当中这两个结构体形成一个如下结构的循环双向链表:

若是消息队列中只有一个消息则是这样:

虽然 msg_queue 的大小基本上是固定的,但是 msg_msg 作为承载消息的本体其大小是可以随着消息大小的改变而进行变动的,去除掉 msg_msg 结构体本身的 0x30 字节的部分(或许可以称之为 header)剩余的部分都用来存放用户数据,因此内核分配的 object 的大小是跟随着我们发送的 message 的大小进行变动的

而当我们单次发送大于【一个页面大小 - header size】大小的消息时,内核会额外补充添加 msg_msgseg 结构体,其与 msg_msg 之间形成如下单向链表结构:

同样地,单个 msg_msgseg 的大小最大为一个页面大小,因此超出这个范围的消息内核会额外补充上更多的 msg_msgseg 结构体

msgrcv

1
ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp,int msgflg)
  • msgqid:消息队列标识符
  • msgp:存放消息的结构体,消息类型msgtyp也会放到这里
  • msgsz:接收消息长度,不包含消息类型
  • msgtyp:
    • 0:接收第一个消息
    • >0:接收消息类型等于msgtyp的第一个函数
    • <0:返回队列中消息类型值小于或等于 msgtyp 绝对值的消息,如果这种消息有若干个,则取类型值最小的消息
  • msgflg:
    • 0:msgrcv() 调用阻塞直到接收消息成功为止
    • MSG_NOERROR:若返回的消息字节数比 nbytes 字节数多,则消息就会截短到 nbytes 字节,且不通知消息发送进程
    • MSG_COPY:读取但不释放,当我们在调用 msgrcv 接收消息时,相应的 msg_msg 链表便会被释放,当我们在调用 msgrcv 时若设置了 MSG_COPY 标志位,则内核会将 message 拷贝一份后再拷贝到用户空间,原双向链表中的 message 并不会被 unlink,从而我们便可以多次重复地读取同一个 msg_msg 链条中数据
    • IPC_NOWAIT:调用进程会立即返回,若没有收到消息则立即返回 -1

MSG_COPY

MSG_COPY位为1的时候,在find_msg中会返回msg_msg双向循环链表中,第msgtypmsg_msg,也就是返回第msgtyp条消息,而不是根据msgtyp去和msg->m_type进行匹配

此外必须同时搭配IPC_NOWAIT标志

对于 MSG_COPY 而言,数据的拷贝使用的是 copy_msg() 函数,其会比对源消息的 m_ts 是否大于存储拷贝的消息的 m_ts ,若大于则拷贝失败,而后者则为我们传入 msgrcv()msgsz,因此若我们仅读取单条消息则需要保证后者大于相等前者

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
{
struct msg_msgseg *dst_pseg, *src_pseg;
size_t len = src->m_ts;
size_t alen;

if (src->m_ts > dst->m_ts)// 有个 size 检查
return ERR_PTR(-EINVAL);

alen = min(len, DATALEN_MSG);
memcpy(dst + 1, src + 1, alen);

for (dst_pseg = dst->next, src_pseg = src->next;
src_pseg != NULL;//以源 msg 链表尾为终止
dst_pseg = dst_pseg->next, src_pseg = src_pseg->next) {

len -= alen;
alen = min(len, DATALEN_SEG);
memcpy(dst_pseg + 1, src_pseg + 1, alen);
}

dst->m_type = src->m_type;
dst->m_ts = src->m_ts;

return dst;
}

内存越界读

在拷贝数据时对长度的判断主要依靠的是 msg_msg->m_ts,我们不难想到的是:若是我们能够控制一个 msg_msg 的 header,将其 m_sz 成员改为一个较大的数,我们就能够越界读取出最多将近一张内存页大小的数据

任意地址读

对于大于一张内存页的数据而言内核会在 msg_msg 的基础上再补充加上 msg_msgseg 结构体,形成一个单向链表,我们不难想到的是:若是我们能够同时劫持 msg_msg->m_tsmsg_msg->next,我们便能够完成内核空间中的任意地址读

但这个方法有一个缺陷,无论是 MSG_COPY 还是常规的接收消息,其拷贝消息的过程的判断主要依据还是单向链表的 next 指针,因此若我们需要完成对特定地址向后的一块区域的读取,我们需要保证该地址上的数据为 NULL

任意地址写

当我们调用 msgsnd 系统调用时,其会调用 load_msg() 将用户空间数据拷贝到内核空间中,首先是调用 alloc_msg() 分配 msg_msg 单向链表,之后才是正式的拷贝过程,即空间的分配与数据的拷贝是分开进行的

我们不难想到的是,在拷贝时利用 userfaultfd/FUSE 将拷贝停下来,在子进程中篡改 msg_msg 的 next 指针,在恢复拷贝之后便会向我们篡改后的目标地址上写入数据,从而实现任意地址写

pipe

pipe_inode_info

申请obj大小:192

GFP_KERNEL_ACCOUNT:是

功能:泄露直接映射区地址

在内核中,管道本质上是创建了一个虚拟的 inode 来表示的,对应的就是一个 pipe_inode_info 结构体(inode->i_pipe),其中包含了一个管道的所有信息,当我们创建一个管道时,内核会创建一个 VFS inode 与一个 pipe_inode_info 结构体:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};

pipe_inode_info->bufs 为一个动态分配的结构体数组,因此我们可以利用他来泄露出内核的“堆”上地址

pipe_buffer

申请obj大小:任意(默认1k)

GFP_KERNEL_ACCOUNT:是

功能:泄露直接映射区地址

当创建一个管道时,会自动创建PIPE_DEF_BUFFERS(16)pipe_buffer,(只有在使用时才会按需分配初始化)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
* struct pipe_buffer - a linux kernel pipe buffer
* @page: the page containing the data for the pipe buffer
* @offset: offset of data inside the @page
* @len: length of data inside the @page
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
**/
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

offset是读取的偏移,len是可读的长度,写入的偏移是offset+len

一个管道的pipe_buffer数量是可以更改的

存在一个系统调用fcntl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe;
long ret;

pipe = get_pipe_info(file, false);
if (!pipe)
return -EBADF;

__pipe_lock(pipe);

switch (cmd) {
case F_SETPIPE_SZ:
ret = pipe_set_size(pipe, arg);
//...

static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
struct pipe_buffer *bufs;
unsigned int size, nr_pages;
unsigned long user_bufs;
long ret = 0;

size = round_pipe_size(arg);
nr_pages = size >> PAGE_SHIFT;

if (!nr_pages)
return -EINVAL;

/*
* If trying to increase the pipe capacity, check that an
* unprivileged user is not trying to exceed various limits
* (soft limit check here, hard limit check just below).
* Decreasing the pipe capacity is always permitted, even
* if the user is currently over a limit.
*/
if (nr_pages > pipe->buffers &&
size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
return -EPERM;

user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);

if (nr_pages > pipe->buffers &&
(too_many_pipe_buffers_hard(user_bufs) ||
too_many_pipe_buffers_soft(user_bufs)) &&
is_unprivileged_user()) {
ret = -EPERM;
goto out_revert_acct;
}

/*
* We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
* expect a lot of shrink+grow operations, just free and allocate
* again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
if (nr_pages < pipe->nrbufs) {
ret = -EBUSY;
goto out_revert_acct;
}

bufs = kcalloc(nr_pages, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs)) {
ret = -ENOMEM;
goto out_revert_acct;
}

/*
* The pipe array wraps around, so just start the new one at zero
* and adjust the indexes.
*/
if (pipe->nrbufs) {
unsigned int tail;
unsigned int head;

tail = pipe->curbuf + pipe->nrbufs;
if (tail < pipe->buffers)
tail = 0;
else
tail &= (pipe->buffers - 1);

head = pipe->nrbufs - tail;
if (head)
memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
if (tail)
memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}

pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->buffers = nr_pages;
return nr_pages * PAGE_SIZE;

out_revert_acct:
(void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
return ret;
}

当选项为F_SETPIPE_SZ

其会修改当前pipe的bufs数组大小为第三个参数(arg>>12)*sizeof(*bufs)

例如当我们使用fcntl(pipe_fd, F_SETPIPE_SZ, 0x1000 * 64)

就会更改bufs从kmalloc-cg-2k进行分配

不过注意arg>>12必须是2的幂次方


劫持执行流

当我们关闭了管道的两端时,会触发 pipe_buffer->pipe_buffer_operations->release 这一指针

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

当执行到该指针时 rsi 寄存器刚好指向对应的 pipe_buffer,因此我们可以将函数表劫持到 pipe_buffer 上,找到一条合适的 gadget 将栈迁移到该处,从而更顺利地完成 ROP

任意地址读写

管道的读写通过 pipe_buffer[i].page 确定读写的内存,因此若我们能够修改 page 指针,则我们便能完成对整个物理内存区域的读操作,以及对直接映射区上有写权限的内存区域的写操作

sk_buff

申请obj大小:>=512

GFP_KERNEL_ACCOUNT:否

功能:堆喷

sk_buff 是 Linux kernel 网络协议栈中一个重要的基础结构体,其用以表示在网络协议栈中传输的一个「包」,但其结构体本身不包含一个包的数据部分,而是包含该包的各种属性,数据包的本体数据则使用一个单独的 object 储存

这个结构体成员比较多,我们主要关注核心部分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
struct sk_buff {
union {
struct {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;

// ...
};

// ...

/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;
refcount_t users;

#ifdef CONFIG_SKB_EXTENSIONS
/* only useable after checking ->active_extensions != 0 */
struct skb_ext *extensions;
#endif
};

sk_buff 结构体与其所表示的数据包形成如下结构,其中:

  • head :一个数据包实际的起始处(也就是为该数据包分配的 object 的首地址)
  • end :一个数据包实际的末尾(为该数据包分配的 object 的末尾地址)
  • data当前所在 layer 的数据包对应的起始地址
  • tail当前所在 layer 的数据包对应的末尾地址

data 和 tail 可以这么理解:数据包每经过网络层次模型中的一层都会被添加/删除一个 header (有时还有一个 tail),data 与 tail 便是用以对此进行标识的

多个 sk_buff 之间形成双向链表结构,类似于 msg_queue,这里同样有一个 sk_buff_head 结构作为哨兵节点

在内核网络协议栈中很多地方都会用到该结构体,例如读写 socket 一类的操作都会造成包的创建,其最终都会调用到 alloc_skb() 来分配该结构体,而这个函数又是 __alloc_skb() 的 wrapper,不过需要注意的是其会从独立的 skbuff_fclone_cache/ skbuff_head_cache 取 object

sk_buff 虽然是从独立的 kmem_cache 中分配的,但其对应的数据包不是

因此我们仍然可以实现近乎任意大小 object 的分配与释放

因此 sk_buffmsg_msg 一样常被用来完成堆喷的工作,不同的是 msg_msg 带了一个 header,而 sk_buff 的数据包则带一个 tail——skb_shared_info 结构体

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
struct skb_shared_info {
__u8 flags;
__u8 meta_len;
__u8 nr_frags;
__u8 tx_flags;
unsigned short gso_size;
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
unsigned int gso_type;
u32 tskey;

/*
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref;

/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
void * destructor_arg;

/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS];
};

skb_shared_info 结构体的大小为 320 字节,这意味着我们能够利用分配的 object 最小的大小也得是 512 字节,这无疑为我们的利用增添了几分难度,但不可否认的是 sk_buff 仍为我们提供了较大对象的任意分配写入与释放

正所谓有发必有收,我们只需要沿着发送的路径接收该包就能将其释放掉,例如若是我们通过向套接字中写入数据创建了一个包,则从套接字中读出该包便能将其释放

key

申请obj大小:任意

GFP_KERNEL_ACCOUNT:否

功能:泄露.text|泄露直接映射区地址

在linux内核中有一套专门用于管理密钥的子系统,负责密钥的创建,读取,更新以及销毁等功能

内核提供了两个系统调用来负责这些操作

add_key()keyctl()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
        #include <sys/types.h>
#include <keyutils.h>

key_serial_t add_key(const char *type, const char *description,
const void *payload, size_t plen,
key_serial_t keyring);
//...
#include <asm/unistd.h>
#include <linux/keyctl.h>
#include <unistd.h>

long syscall(__NR_keyctl, int operation, __kernel_ulong_t arg2,
__kernel_ulong_t arg3, __kernel_ulong_t arg4,
__kernel_ulong_t arg5);

当调用add_key()创建带有 description 字符串的、类型为 "user" 的、长度为 plen 的内容为 payload 的密钥时

  • 首先会在内核空间中分配 obj 1 与 obj2,分配 flag 为 GFP_KERNEL,用以保存 description (字符串,最大大小为 4096)、payload (普通数据,大小无限制)
  • 分配 obj3 保存 description ,分配 obj4 保存 payload,分配 flag 皆为 GFP_KERNEL
  • 释放 obj1 与 obj2,返回密钥 id

其中 obj4 为一个 user_key_payload 结构体,定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
struct user_key_payload {
struct rcu_head rcu; /* RCU destructor */
unsigned short datalen; /* length of this data */
char data[] __aligned(__alignof__(u64)); /* actual data */
};

//...

struct callback_head {
struct callback_head *next;
void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head

由于decription和payload的长度都是可控的,那么obj1-4的大小其实也是可控的

类似于 msg_msguser_key_payload 结构体有着一个固定大小的头部,其余空间用来存储来自用户空间的数据(密钥内容)。

keyctl() 系统调用为我们提供了读取、更新(分配新对象,释放旧对象)、销毁密钥(释放 payload)的功能,其中读取的最大长度由 user_key_payload->datalen 决定

读取 key 时的 len 应当不小于 user_key_payload->datalen,否则会读取失败,但是可以大于datalen。

当释放一个密钥时

内核中,对应的处理流程是_x64_sys_keyctl() -> keyctl_revoke_key() -> key_revoke() -> user_revoke() -> call_rcu(),在call_rcu()函数中将 user_key_payload 结构体的 rcu.func 设置成user_free_payload_rcu()函数的地址

io_uring

申请obj大小:任意

GFP_KERNEL_ACCOUNT:否

功能:泄露.text|泄露直接映射区地址

io_uring是自内核版本 5.1 引入的全新的高性能异步 I/O 框架,

方法

userfaultfd

userfaultfd配合copy_from_user或者copy_to_user在条件竞争中有着十分强大的作用

其作用就是监控在某一段内存中发生的页错误,并且可以由用户指定如何处理

一般情况下我们选择使用sleep()或者pause()等函数使该线程停下,这样让我们完成条件竞争构造uaf等,提供了十分便利的条件

不过在大概5.11版本前后,由于变量sysctl_unprivileged_userfaultfd不再被初识赋值为1,使得userfaultfd不再能被非root用户使用

一个模板:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
void* userfaultfd_stuck_handler(void* arg)
{
struct uffd_msg msg;
unsigned long uffd = (unsigned long) arg;
puts("[+] stuck handler created");
int nready;
struct pollfd pollfd;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
puts("[+] stuck handler unblocked");
pause();
if (nready != 1)
{
ErrExit("[-] Wrong poll return val");
}
nready = read(uffd, &msg, sizeof(msg));
if (nready <= 0)
{
ErrExit("[-] msg err");
}

char* page = (char*) mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED)
{
ErrExit("[-] mmap err");
}
struct uffdio_copy uc;
// init page
//这部分是缺页异常处理得到的页的内容填充
memset(page, 0, sizeof(page));
uc.src = (unsigned long) page;
uc.dst = (unsigned long) msg.arg.pagefault.address & ~(PAGE_SIZE - 1);
uc.len = PAGE_SIZE;
uc.mode = 0;
uc.copy = 0;
ioctl(uffd, UFFDIO_COPY, &uc);
puts("[+] stuck handler done");
return NULL;
}

void RegisterUserfault(void *fault_page, void* handler)
{
pthread_t thr;
struct uffdio_api ua;
struct uffdio_register ur;
uint64_t uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
ua.api = UFFD_API;
ua.features = 0;
if (ioctl(uffd, UFFDIO_API, &ua) == -1)
ErrExit("[-] ioctl-UFFDIO_API");

ur.range.start = (unsigned long)fault_page; //我们要监视的区域
ur.range.len = PAGE_SIZE;
ur.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &ur) == -1) //注册缺页错误处理,当发生缺页时,程序会阻塞,此时,我们在另一个线程里操作
ErrExit("[-] ioctl-UFFDIO_REGISTER");
//开一个线程,接收错误的信号,然后处理
int s = pthread_create(&thr, NULL,handler, (void*)uffd);
if (s!=0)
ErrExit("[-] pthread_create");
}

int main(){
...
RegisterUserfault(stuck_mapped_memory, userfaultfd_stuck_handler);
...
}

pgv页级分配

setsockopt

当创建一个协议为PF_PACKET的socket时,如下

socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);

如果再先调用 setsockopt()PACKET_VERSION 设为 TPACKET_V1或者TPACKET_V2

1
2
3
4
5
enum tpacket_versions {
TPACKET_V1,
TPACKET_V2,
TPACKET_V3,
};

1
2
int version = TPACKET_V1;
setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));

那么之后当我们创建如下这样一个结构体时

1
2
3
4
5
6
struct tpacket_req {
unsigned int tp_block_size;
unsigned int tp_block_nr;
unsigned int tp_frame_size;
unsigned int tp_frame_nr;
};

再调用PACKET_TX_RING或者PACKET_RX_RING

1
2
3
4
5
6
7
8
struct tpacket_req req;

req.tp_block_size = size;
req.tp_block_nr = nr;
req.tp_frame_size = 0x1000;
req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));

之后便会分配tp_frame_nr个页框,并且是直接从伙伴系统中提取

并且当关闭socket时,会将这些页框释放

最后需要注意的是

需要注意的是低权限用户无法创建一个类型为 SOCK_RAW 协议为 PF_PACKET 的 socket,但是我们可以通过开辟新的命名空间来绕过该限制,不过这样也有一定的缺陷:我们的进程也被隔离到该进程里了,无法获得“真正的 root 权限”

因此我们最好的做法便是开辟一个子进程,在该子进程中开辟新命名空间专门进行堆喷,父进程/其他子进程用于提权,通过管道与该子进程进行交互

modprobe_path

Gadget

work_for_cpu_fn

一个十分神奇的gadget

源码如下

1
2
3
4
5
6
7
8
9
10
11
12
13
struct work_for_cpu {
struct work_struct work;
long (*fn)(void *);
void *arg;
long ret;
};

static void work_for_cpu_fn(struct work_struct *work)
{
struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);

wfc->ret = wfc->fn(wfc->arg);
}

可以看到其会有如下调用

*(arg+0x30)=*(arg+0x20)(*(arg+0x28))

也就是说这个函数不经能够劫持实现任意函数调用,同时还能存储返回值(nb嘞)

该函数位于 workqueue 机制的实现中,只要是开启了多核支持的内核 (CONFIG_SMP)都会包含这个函数的代码。不难注意到,这个函数非常好用,只要能控制第一个参数指向的内存,即可实现带一个任意参数调用任意函数,并把返回值存回第一个参数指向的内存的功能,且该 “gadget” 能干净的返回,执行的过程中完全不用管 SMAP、SMEP 的事情。由于内核中大量的 read / write / ioctl 之类的实现的第一个参数也都恰好是对应的对象本身,可谓是非常的适合这种场景了。考虑到我们提权需要做的事情只是 commit_creds(prepare_kernel_cred(0)),完全可以用两次上述的函数调用原语实现。(如果还需要禁用 SELinux 之类的,再找一个任意地址写 0 的 gadget 即可,很容易找)

奇技淫巧

QEMU逃逸

当qemu启动脚本中没有重定向monitor时,可以直接ctrl+A C逃逸,解压rootfs.img读flag

1
2
3
4
5
6
7
8
9
10
migrate "exec:cp rootfs.img /tmp "
migrate "exec:cd /tmp;zcat rootfs.img | cpio -idmv 1>&2"
migrate "exec:cat /tmp/flag 1>&2"
(qemu) migrate "exec:cat /tmp/flag 1>&2"
flag{test_flag}qemu-system-x86_64: failed to save SaveStateEntry with id(name):)
qemu-system-x86_64: Unable to write to command: Broken pipe
qemu-system-x86_64: Unable to write to command: Broken pipe


zcat rootfs.cpio | cpio -idmv 1>&2

bin目录不为ROOT

这样可以修改bin里面的命令,而init脚本在退出时,通常包含poweroff命令,或者umount命令,而init运行时是以root权限运行的,所以我们可以修改这些命令从而在输入exit命令调用init中在setsid剩下的命令时来直接cat flag或者获得shell

根目录不为ROOT

那么在根目录下,虽然bin的所有者为root,但是可以对bin进行改名,然后我们伪造一个bin目录,里面放上我们伪造的命令,那么就可以以root权限调用这个伪造的命令了,如下为所示例子。

1
2
3
4
5
mv bin evil_bin
/evil_bin/mkdir bin
echo "#!/evil_bin/sh" > /bin/power
echo "/evil_bin/sh" >> /bin/power
exit

密码设置

如果root账号的密码没有设置的话,直接su即可登录到root,非预期的。

又或者是默认的密码#,当然其实很多内核并没有提供su

例如linectf2022-encrypt

1
2
3
4
5
6
7
8
9
10
11
12
13
First, connect via netcat.
We got a shell, let's look around:
/ $ ls
=> We see the flag file.
/ $ cat flag
=> Not enough permissions.
Are there many other users?
/ $ cat /etc/passwd
=> Only root seems to be available. Let's try switching to it:
/ $ su
/ #
=> See the #? This worked! We got root!
/ # cat flag

suid

带有suid的可执行文件允许我们拥有文件所有者的权限

还是lincectf2022的encrypt

find / -perm -u=s -type f 2>/dev/null

打印的字符串告诉我们busybox具有这个权限位

Run busybox and it will show us all the configurations which are avialable

发现其中具有su于是

busybox su root

linux支持

direct_map_addr_to/from_page_addr

直接映射区的虚拟地址可以通过线性对应关系与vmemmap段的page结构相互转化

而每一个page结构体又对应着唯一一个PGN

又因为直接映射区上的地址减去page_offset_base就是物理内存地址,所以这些之间存在直观的直接线性关系

关系如下

direct_map_addr_to_page_addr

1
2
3
4
5
6
7
8
size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
size_t page_count;

page_count = ((direct_map_addr & (~0xfff)) - page_offset_base) / 0x1000;

return vmemmap_base + page_count * 0x40;
}

page_addr_to_direct_map_addr

1
2
3
4
5
6
7
8
size_t page_addr_to_direct_map_addr(size_t vmem_map_addr)
{
size_t page_count;

page_count = ((vmem_map_addr & (~0x40)) - vmemmap_base) / 0x40;

return direct_map_addr + page_count * 0x1000;
}

setreuid/setregid

内核判断用户的依据是当前进程cred结构体中uideuid两个字段

特别的当我们需要提权为root

必须要uideuid两个字段都为0,linux才会认为我们是root用户(在做过一些尝试后,可以确认至少较高版本是这样的,至于低版本尚未确认)

如果仅有其中一个字段为0,内核并不会将我们视作特权用户

提权时当然最好能够同时将这uideuid都修改为0,但若是条件不允许,仅仅能修改其中的一个,我们就可以利用setreuid(0,0)来进一步的提权

当执行setreuid(0,0)时,只要uideuid中任意一个为0,便能够将另一个也修改为0,进而成为被操作系统认可的root用户

此外:suid并不具备这样的能力

看内核源码:(/kernel/sys.c)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/*
* Unprivileged users may change the real uid to the effective uid
* or vice versa. (BSD-style)
*
* If you set the real uid at all, or set the effective uid to a value not
* equal to the real uid, then the saved uid is set to the new effective uid.
*
* This makes it possible for a setuid program to completely drop its
* privileges, which is often a useful assertion to make when you are doing
* a security audit over a program.
*
* The general idea is that a program which uses just setreuid() will be
* 100% compatible with BSD. A program which uses just setuid() will be
* 100% compatible with POSIX with saved IDs.
*/
long __sys_setreuid(uid_t ruid, uid_t euid)
{
struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
kuid_t kruid, keuid;

kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);

if ((ruid != (uid_t) -1) && !uid_valid(kruid))
return -EINVAL;
if ((euid != (uid_t) -1) && !uid_valid(keuid))
return -EINVAL;

new = prepare_creds();
if (!new)
return -ENOMEM;
old = current_cred();

retval = -EPERM;
if (ruid != (uid_t) -1) {
new->uid = kruid;
if (!uid_eq(old->uid, kruid) &&
!uid_eq(old->euid, kruid) &&
!ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}

if (euid != (uid_t) -1) {
new->euid = keuid;
if (!uid_eq(old->uid, keuid) &&
!uid_eq(old->euid, keuid) &&
!uid_eq(old->suid, keuid) &&
!ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}

if (!uid_eq(new->uid, old->uid)) {
retval = set_user(new);
if (retval < 0)
goto error;
}
if (ruid != (uid_t) -1 ||
(euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
new->suid = new->euid;
new->fsuid = new->euid;

retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
if (retval < 0)
goto error;

retval = set_cred_ucounts(new);
if (retval < 0)
goto error;

flag_nproc_exceeded(new);
return commit_creds(new);

error:
abort_creds(new);
return retval;
}

SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
return __sys_setreuid(ruid, euid);
}

可以看到,不考虑特权用户的情况下

要设置uid,必须要uid,euid中至少一个为目标id

要设置euid,必须要uid,euid,suid中至少一个为目标id

secondary_startup_64

一般情况下

在直接映射区direct_mapping_area+0x9d000的位置

会有一个函数指针secondary_startup_64

且其就在kernel text开头的部分,一般等于kernel text+0x70

内存大小对kernelpwn的影响

This file implements KASLR memory randomization for x86_64. It randomizes the virtual address space of kernel memory regions (physical memory mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates exploits relying on predictable kernel addresses.

kaslr主要针对以下三个区域

  • direct mapping area
  • vmalloc area
  • virtual memory map area

更多相关可以在/arch/x86/mm/kaslr.c中查看

kaslr的随机化粒度是256m,粒度即说明kaslr随机化范围的最小单位

当分配的内存大小小于256M时,因为加载的内存达不到一个粒度

我们可以简单的得到:

page_offset_base = heap_leak & 0xffffffffff0000000

但在 MEM > 256M 的机器上,这并不总是准确的,我们只能得到本粒度的基址

因此要获得确切的基址就需要办法获得一些能够大致确定其范围的能力,不过好在一般内核pwn题内存都不会大于256m,而就算大于256m,也会优先使用较低的

当 MEM > 16G时,vmemmap_base也不能使用上面的代码实现,不过好在一般也不会有这种情况