Introduction
eBPF stands for extended Berkeley Packet Filter
, but can do more
than just filter packets.
The idea is the same as when you use the tcpdump
command:
$ tcpdump ip host helios and port 80
the ip host helios and port 80
filter/program is compiled and run in a virtual machine.
With eBPF, you program in C (a restricted version of it), you compile it to the target VM then load this code inside the kernel and run it inside the kernel. It also provides access to eBPF maps to store and share data across several eBPF programs.
This is a efficient way to manipulate and trace what is happening in the kernel.
Maps are data structure that you can access from the user-land. So it’s a convenient way to get data from the kernel-land.
eBPF programs are executed is triggered by events, meaning that it’s executed only when, for example, a network packet is received or a specific kernel function is executed.
Requirements
- A 4.X kernel with BPF support enabled:
$ grep _BPF /boot/config-4.16.0-1-amd64
CONFIG_CGROUP_BPF=y
CONFIG_BPF=y
CONFIG_BPF_SYSCALL=y
# CONFIG_BPF_JIT_ALWAYS_ON is not set
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NET_CLS_BPF=m
CONFIG_NET_ACT_BPF=m
CONFIG_BPF_JIT=y
# CONFIG_BPF_STREAM_PARSER is not set
CONFIG_LWTUNNEL_BPF=y
CONFIG_BPF_EVENTS=y
# CONFIG_BPF_KPROBE_OVERRIDE is not set
CONFIG_TEST_BPF=m
The Linux kernel source code, to test examples
An eBPF compiler: LLVM have an eBPF backend
$ llc --version LLVM (http://llvm.org/): LLVM version 4.0.1 Optimized build. Default target: x86_64-pc-linux-gnu Host CPU: penryn Registered Targets: aarch64 - AArch64 (little endian) aarch64_be - AArch64 (big endian) amdgcn - AMD GCN GPUs arm - ARM arm64 - ARM64 (little endian) armeb - ARM (big endian) bpf - BPF (host endian) bpfeb - BPF (big endian) bpfel - BPF (little endian) hexagon - Hexagon [ ... ]
The bpf(2) system call
NAME bpf - perform a command on an extended BPF map or program SYNOPSIS #include <linux/bpf.h> int bpf(int cmd, union bpf_attr *attr, unsigned int size);
There is only one system call. cmd
is the operation to perform and
argument attr
is a union of structures. for the corresponding
command.
BPF_MAP_CREATE
, BPF_MAP_LOOKUP_ELEM
, BPF_MAP_UPDATE_ELEM
, BPF_MAP_DELETE_ELEM
, BPF_MAP_GET_NEXT_KEY
: Commands to use maps.
BPF_PROG_LOAD
: Verify and load an eBPF program
Other commands exist but are not yet documented in the man page. Look
at the linux/bpf.h
header file if your are curious.
At the time of writing this article, the union is:
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* BPF_MAP_CREATE related
* flags defined above.
*/
__u32 inner_map_fd; /* fd pointing to the inner map */
__u32 numa_node; /* numa node (effective only if
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
__u32 map_fd;
__aligned_u64 key;
union {
__aligned_u64 value;
__aligned_u64 next_key;
};
__u64 flags;
};
struct { /* anonymous struct used by BPF_PROG_LOAD command */
__u32 prog_type; /* one of enum bpf_prog_type */
__u32 insn_cnt;
__aligned_u64 insns;
__aligned_u64 license;
__u32 log_level; /* verbosity level of verifier */
__u32 log_size; /* size of user buffer */
__aligned_u64 log_buf; /* user supplied buffer */
__u32 kern_version; /* checked when prog_type=kprobe */
__u32 prog_flags;
char prog_name[BPF_OBJ_NAME_LEN];
__u32 prog_ifindex; /* ifindex of netdev to prep for */
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
__aligned_u64 pathname;
__u32 bpf_fd;
__u32 file_flags;
};
struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
__u32 target_fd; /* container object to attach to */
__u32 attach_bpf_fd; /* eBPF program to attach */
__u32 attach_type;
__u32 attach_flags;
};
struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
__u32 prog_fd;
__u32 retval;
__u32 data_size_in;
__u32 data_size_out;
__aligned_u64 data_in;
__aligned_u64 data_out;
__u32 repeat;
__u32 duration;
} test;
struct { /* anonymous struct used by BPF_*_GET_*_ID */
union {
__u32 start_id;
__u32 prog_id;
__u32 map_id;
};
__u32 next_id;
__u32 open_flags;
};
struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
__u32 bpf_fd;
__u32 info_len;
__aligned_u64 info;
} info;
struct { /* anonymous struct used by BPF_PROG_QUERY command */
__u32 target_fd; /* container object to query */
__u32 attach_type;
__u32 query_flags;
__u32 attach_flags;
__aligned_u64 prog_ids;
__u32 prog_cnt;
} query;
} __attribute__((aligned(8)));
Running an example
First, let’s create a simple eBPF program by just creating a maps, write a value in it, and read its content back, from a user-land program.
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <asm/unistd.h>
#include <linux/bpf.h>
#include <inttypes.h>
#define min(x, y) ((x) < (y) ? (x) : (y))
static inline __u64 ptr_to_u64(const void *ptr) {
return (__u64) (unsigned long) ptr;
}
static inline int sys_bpf(enum bpf_cmd cmd,
union bpf_attr *attr,
unsigned int size) {
return syscall(__NR_bpf, cmd, attr, size);
}
int bpf_create_map(enum bpf_map_type map_type,
unsigned int key_size,
unsigned int value_size,
unsigned int max_entries) {
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
int bpf_lookup_elem(int fd, const void *key, void *value) {
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
};
return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
int bpf_update_elem(int fd, const void *key, const void *value,
uint64_t flags) {
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
.flags = flags,
};
return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
void dump_map(int fd) {
int rc;
int key, next_key;
uint64_t value;
key = 0;
while ((rc = bpf_get_next_key(fd, &key, &next_key)) == 0) {
bpf_lookup_elem(fd, &next_key, &value);
printf("Element: %d -> %ld\n", next_key, value);
key = next_key;
}
}
sys_bpf()
: this function is needed to call thebpf
syscallbpf_create_map()
,bpf_lookup_elem()
,bpf_update_elem()
: helper functions, those function are take from the IOVisor project
int main(int argc, char **argv) {
int map_fd, key;
int max_entries = 1024;
uint64_t value = 0;
map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key),
sizeof(value), max_entries);
if (map_fd < 0) {
printf("failed to create map '%s'\n", strerror(errno));
/* likely not run as root */
return 1;
}
printf("We just created an eBPF map of %d entries\n", max_entries);
if (bpf_lookup_elem(map_fd, "MY_KEY", &value) == -1) {
printf("BEFORE: There is no element with key 'MY_KEY'\n");
} else {
printf("BEFORE: There is an element with with key 'MY_KEY': %ld\n", value);
}
value = 42;
if (bpf_update_elem(map_fd, "MY_KEY", &value, BPF_ANY) == 0) {
printf("New element updated\n");
} else {
printf("Error while updating the map: %s\n", strerror(errno));
return 1;
}
value = 51;
if (bpf_lookup_elem(map_fd, "MY_KEY", &value) == -1) {
printf("AFTER: There is no element with key 'MY_KEY'\n");
} else {
printf("AFTER: There is an element with with key 'MY_KEY': %ld\n", value);
}
printf("### Dump of table ###\n");
dump_map(map_fd);
while(1) {
sleep(1);
}
return 0;
}
Here we have a simple user-land program, which creates an in-kernel BPF map and add an element to it. As soon as the user-land program is terminated, the map is deleted.
User-land program to load the eBPF code
Now that the map is created we can create and load a BPF program to update the map.
We will attach our program to one kind of event: kprobe. Each time this event will be triggered, our BPF program will be executed. In our case, it will increment the value in the BPF map having key 100.
Many type of eBPF program can be created:
enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
BPF_PROG_TYPE_KPROBE,
BPF_PROG_TYPE_SCHED_CLS,
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
BPF_PROG_TYPE_CGROUP_SKB,
BPF_PROG_TYPE_CGROUP_SOCK,
BPF_PROG_TYPE_LWT_IN,
BPF_PROG_TYPE_LWT_OUT,
BPF_PROG_TYPE_LWT_XMIT,
BPF_PROG_TYPE_SOCK_OPS,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_CGROUP_DEVICE,
};
Add a new helper function to load the BPF program into the kernel:
#include <linux/version.h>
#define LOG_BUF_SIZE 1024
char bpf_log_buf[LOG_BUF_SIZE];
int bpf_prog_load(enum bpf_prog_type type,
const struct bpf_insn *insns, int insn_cnt,
const char *license) {
union bpf_attr attr = {
.prog_type = type,
.insns = ptr_to_u64(insns),
.insn_cnt = insn_cnt,
.license = ptr_to_u64(license),
.log_buf = ptr_to_u64(bpf_log_buf),
.log_size = LOG_BUF_SIZE,
.log_level = 1,
.kern_version = LINUX_VERSION_CODE
};
return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}
This helper function is using a global bpf_log_buf
variable to
eventually store the compilation errors messages.
- We will attach to a kprobe event, so, the type of the program is
BPF_PROG_TYPE_KPROBE
. insns
: the actual BPF code to executeinsn_cnt
: number of instructions in the BPF program
The eBPF program itself
Each BPF instruction looks like this:
struct bpf_insn {
__u8 code; /* opcode */
__u8 dst_reg:4; /* dest register */
__u8 src_reg:4; /* source register */
__s16 off; /* signed offset */
__s32 imm; /* signed immediate constant */
};
So, for example, the exit instruction will be like this:
(struct bpf_insn) {
.code = BPF_JMP | BPF_EXIT,
.dst_reg = 0,
.src_reg = 0,
.off = 0,
.imm = 0},
The BPF_EXIT
instruction is of class BPF_JMP
.
But before doing an exit:
The eBPF program needs to store return value into register R0 before doing a BPF_EXIT.
(struct bpf_insn) {
.code = BPF_ALU64 | BPF_MOV | BPF_K,
.dst_reg = BPF_REG_0,
.src_reg = 0,
.off = 0,
.imm = 0
}
The BPF_MOV
instruction is of class BPF_ALU64
.
.imm
is the immediate value (0) to store into the BPF_REF_0
register.
struct bpf_insn prog[] = {
(struct bpf_insn) {
.code = BPF_ALU64 | BPF_MOV | BPF_K,
.dst_reg = BPF_REG_0,
.src_reg = 0,
.off = 0,
.imm = 0},
(struct bpf_insn) {
.code = BPF_JMP | BPF_EXIT,
.dst_reg = 0,
.src_reg = 0,
.off = 0,
.imm = 0},
}
if ((prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, prog, 2, "GPL")) < 0) {
printf("failed to load BPF program: %s\n", strerror(errno));
printf("%s", bpf_log_buf);
return 1;
}
printf("The eBPF program has been loaded\n");
Now it’s time to attach this program to a kprobe event.
Attach the program to a kprobe
We will use perf_event_open(2)
to create a performance monitoring
probe, then use an iotcl(2)
call with the PERF_EVENT_IOC_SET_BPF
request argument on the eBPF program fd.
The /proc/kallsyms
file provides all of the symbols of the kernel
where you can put probes on. Let’s take sys_listen
which correspond
to the listen()
syscall.
$ echo "p sys_listen" >> /sys/kernel/debug/tracing/kprobe_events
$ cat /sys/kernel/debug/tracing/events/kprobes/p_sys_listen_0/id
1827
First, one helper function to call the perf_event_open
syscall:
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags) {
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
Then create the event and attach the eBPF program to it:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = 1827
};
int event_fd = perf_event_open(&attr, -1, 0, -1, 0);
if (event_fd == -1) {
printf("Error opening event: %s\n", strerror(errno));
return 1;
}
printf("Event file descriptor created\n");
if (ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd) != 0) {
printf("Error while attaching BPF program: %s\n", strerror(errno));
return 1;
}
printf("eBPF program attached to event\n");
Update the eBPF program
Now, we need to update our BPF program which currently does nothing to
something that increments an entry in the map each time this event is
triggered, so each time the listen()
syscall is called.
High-level, the eBPF program needs to:
- take the previous map value for key 0
- increment it by one
- update the map value for key 0
Instead of directly using struct bpf_insn
structures to program,
let’s use helper headers in the kernel source
code. tools/include/linux/filter.h
provides usefull #define
s. Our
previous simple exit eBFP program becomes:
struct bpf_insn prog[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN()
};
Now let’s use an example eBPF program which increments the map entry
each time the listen()
syscall is called:
struct bpf_insn prog[] = {
/* Put 1 (the map key) on the stack */
BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 1),
/* Put frame pointer into R2 */
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
/* Decrement pointer by four */
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
/* Put map_fd into R1 */
BPF_LD_MAP_FD(BPF_REG_1, map_fd),
/* Load current count from map into R0 */
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
/* If returned value NULL, skip two instructions and return */
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
/* Put 1 into R1 */
BPF_MOV64_IMM(BPF_REG_1, 1),
/* Increment value by 1 */
BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0),
/* lock *(u64 *) r0 += r1 */
BPF_MOV64_IMM(BPF_REG_0, 0),
/* Return from program */
BPF_EXIT_INSN()
};
At the end of the user-land program we can update the loop to display the map at regular intervals:
while(1) {
printf("### Dump of table ###\n");
dump_map(map_fd);
sleep(1);
}
Let’s test it !
gcc -Ipath/to/kernel/source eBPF.c -o eBPF
sudo eBPF
he eBPF program has been loaded
Event file descriptor created
eBPF program attached to event
### Dump of table ###
Element: 1 -> 0
### Dump of table ###
Element: 1 -> 0
### Dump of table ###
Element: 1 -> 0
### Dump of table ###
Element: 1 -> 0
[...]
And now, launch something that does a listen()
call (nc -l -p 8080
).
### Dump of table ###
Element: 1 -> 1
### Dump of table ###
Element: 1 -> 1
### Dump of table ###
Element: 1 -> 1
### Dump of table ###
Element: 1 -> 2
### Dump of table ###
Element: 1 -> 2
### Dump of table ###
Summary
So, what we did until now is:
- load a eBPF program into the kernel, written in plain full assembly-like code
- attach this program to a kprobe event
- display the content of the updated eBPF map
- observe that the counter is incremented each time the eBPF program is executed
Obviously, writing a full complex program with eBPF instruction is not very fun. Fortunatly, LLVM has a BPF backend and can compile a subset of a C source code into a BPF bytecode.
LLVM eBPF backend
Instead of writing the BPF program with BPF instructions, we will write a C program that will be compiled into BPF instructions. We will then load the object compiled file into the kernel with the same method.
Test LLVM compilation
Create the C program
#include <linux/ptrace.h>
int bpf_prog1(struct pt_regs *ctx)
{
return 'A';
}
CLANG = clang
LLC = llc
KERNELSOURCE = ./linux-source-4.16
KERNELBUILD = /lib/modules/4.16.0-1-amd64/build/
ARCH = x86
LINUXINCLUDE += -I$(KERNELBUILD)/arch/$(ARCH)/include/generated
LINUXINCLUDE += -I$(KERNELBUILD)/arch/$(ARCH)/include/generated/uapi
LINUXINCLUDE += -I$(KERNELBUILD)/include
##LINUXINCLUDE += -I$(KERNELBUILD)/include/uapi
##LINUXINCLUDE += -I$(KERNELBUILD)/include/generated/uapi
##LINUXINCLUDE += -I$(KERNELBUILD)/arch/$(ARCH)/include
##LINUXINCLUDE += -I$(KERNELBUILD)/arch/$(ARCH)/include/uapi
LINUXINCLUDE += -I$(KERNELSOURCE)/arch/$(ARCH)/include
LINUXINCLUDE += -I$(KERNELSOURCE)/arch/$(ARCH)/include/uapi
LINUXINCLUDE += -I$(KERNELSOURCE)/include
LINUXINCLUDE += -I$(KERNELSOURCE)/include/uapi
LINUXINCLUDE += -include $(KERNELSOURCE)/include/linux/kconfig.h
##LINUXINCLUDE += -I$(KERNELSOURCE)/arch/$(ARCH)/include/generated/uapi
##LINUXINCLUDE += -I$(KERNELSOURCE)/arch/$(ARCH)/include/generated
##LINUXINCLUDE += -I$(KERNELSOURCE)/include/generated/uapi
# LLVM_INCLUDES = -I/usr/lib/clang/4.0.1/include/
LLVM_INCLUDES = -I/usr/lib/gcc/x86_64-linux-gnu/7/include
llvm_simple: llvm_simple.c
$(CLANG) -S -nostdinc $(LINUXINCLUDE) $(LLVM_INCLUDES) \
-D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
-Wno-compare-distinct-pointer-types \
-Wno-gnu-variable-sized-type-not-at-end \
-Wno-tautological-compare \
-O2 -emit-llvm -c $<
$(LLC) -march=bpf -filetype=obj -o $@ $(<:.c=.ll)
readelf -x .text llvm_simple
Hex dump of section '.text':
0x00000000 b7000000 41000000 95000000 00000000 ....A...........
objcopy -O binary -I elf32-little --only-section=.text llvm_simple llvm_simple.bin
hexdump -C llvm_simple.bin
00000000 b7 00 00 00 41 00 00 00 95 00 00 00 00 00 00 00 |....A...........|
00000010
Load the compiled bytecode into the kernel
Now that we extracted the eBPF bytecode generated by LLVM, we can load it into the kernel the same way:
char buf[] = {0xb7, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
printf("There are %d instructions\n", sizeof(buf) / sizeof(struct bpf_insn));
if ((prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, buf, sizeof(buf) / sizeof(struct bpf_insn), "GPL")) < 0) {
printf("failed to load BPF program: %s\n", strerror(errno));
printf("%s", bpf_log_buf);
return 1;
}
A more complex C/eBPF code
Using bcc
ply
Read latency
kprobe:SyS_read {
@start[tid()] = nsecs();
}
kretprobe:SyS_read /@start[tid()]/ {
delta = nsecs() - @start[tid()];
@ns.quantize(delta);
if (delta > 10000) @longproc[comm()].count();
@start[tid()] = nil;
}
Language and VM description
The VM has 11 register, BPF_REG_0
to BPF_REG_10
Their role is (copied from kernel source):
- R0 - return value from in-kernel function, and exit value for eBPF program
- R1-R5 - arguments from eBPF program to in-kernel function
- R6-R9 - callee saved registers that in-kernel function will preserve
- R10 - read-only frame pointer to access stack