bpf: introduce BPF BATCH File Format Signed-off-by: Alexei Starovoitov <ast@kernel.org>
diff --git a/Documentation/bpf/bpf_batch.rst b/Documentation/bpf/bpf_batch.rst new file mode 100644 index 0000000..683ef41 --- /dev/null +++ b/Documentation/bpf/bpf_batch.rst
@@ -0,0 +1,284 @@ +================ +BPF BATCH Design +================ + +Present state +============= +Currently libbpf and golang ebpf libraries require sophisticated ELF file +parsers. ELF parsing followed by relocation processing (for maps, BTF.ext, +CO-RE, functions) is 90% of the work that libbpf and other libraries do. The following +design allows abstracting this work as a new file format. + +BPF BATCH Goals +=============== + +- minimize the amount of work libbpf needs to do to load programs. +- remove the libelf dependency from various program loaders. +- enable feature parity between libbpf, libbcc and other libraries (golang). +- enable signed BPF programs. +- enable a batched BPF syscall to improve performance. + The batched map_lookup/update/delete can be generalized for all sys_bpf + commands. For example: attaching multiple programs to different points will + be possible with one syscall. + +Overview +======== + +Represent the program loading as a sequence of sys_bpf commands with trivial +chaining. For example: to load a program that uses a map, the map should be +created first with the BPF_MAP_CREATE command. Then its result (map_fd) should be +used to adjust the program instructions. Then the BPF_PROG_LOAD command can be +executed. In more complex cases BTF needs to be loaded first. Then maps +created. And then programs loaded. + +BPF BATCH File Format (BBFF) or Bare Bone File Format +===================================================== + + cmd_0 cmd_1 ... cmd_N signature + +Where the signature is a kernel-module-like signature that covers [cmd_1 .. cmd_N]. + +Every command is a variable-length structure:: + + struct { + u8 opcode; + // arguments + }; + +cmd_0 always has opcode == BBFF_ARGS and is not included in the signature. +It serves as storage for input arguments (max_entries for maps, attach +locations for progs, rodata, log_level) and output arguments +(log_buf, map/prog FDs).
+ +The opcodes are:: + + enum bpf_batch { + BBFF_EXIT_ON_ERR = 0, + BBFF_ARGS, // array of byte arguments + BBFF_APPLY_ARGS, // instruction to apply N bytes from argument + // array to a specified location + BBFF_DATA, // array of data (set of bpf_insn, bpf_line_info, BTF) + BBFF_CMD, // execute sys_bpf command + BBFF_APPLY, // apply the result of the previous command + // (64-bit pointer or 32-bit file descriptor) + // to a specified location + BBFF_COPY_USER, // copy the result of the previous command + // into user memory + }; + +The pseudo code for the sys_bpf() BPF_BATCH command:: + + int bpf_batch(void __user * user_cmds, int size) { + copy_from_user(cmd, user_cmds, size); + verify_signature(cmd[1..N]); + for_each(cmd) + switch (cmd->op) { + case BBFF_EXIT_ON_ERR: + struct bbff_exit_on_err { + u8 opcode; + } * cmd; + if ((s64)last_result < 0) return last_result; + + case BBFF_ARGS: + struct bbff_args { + u8 opcode; + u32 size; + u8 args[]; + } * cmd; + args = cmd->args; + + case BBFF_APPLY_ARGS: + struct bbff_apply_args { + u8 opcode; + u8 size; + u32 arg_offset; + u32 tgt_offset; + } * cmd; + memcpy((void * )cmd + cmd->tgt_offset, args + cmd->arg_offset, cmd->size); + + case BBFF_DATA: + struct bbff_data { + u8 opcode; + u32 size; + u8 data[]; + } * cmd; + last_result = (u64) cmd->data; + + case BBFF_CMD: + struct bbff_cmd { + u8 opcode; + u8 cmd; + u8 size; + u8 attr[]; + } * cmd; + last_result = (s64) (s32) sys_bpf(cmd->cmd, cmd->attr, cmd->size); + + case BBFF_APPLY: + struct bbff_apply { + u8 opcode; + u8 size; // 4 or 8 + u32 tgt_offset; + } * cmd; + memcpy((void * )cmd + cmd->tgt_offset, &last_result, cmd->size); + + case BBFF_COPY_USER: + struct bbff_copy_user { + u8 opcode; + u8 size; // always 4 + u32 arg_offset; + } * cmd; + copy_to_user(((struct bbff_args * )user_cmds)->args + cmd->arg_offset, + &last_result, cmd->size); + } + +Every BBFF operation is a trivial single-liner. 
+ +The following BPF program:: + + struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1024); + __type(key, __u32); + __type(value, __u64); + } my_array SEC(".maps"); + + SEC("cgroup_skb/ingress") + int my_bpf_prog(struct __sk_buff *skb) + { + int key = 0; + bpf_map_lookup_elem(&my_array, &key); + return 0; + } + +represented as BBFF will look like:: + + (struct bbff_args) { + .opcode = ARGS, .size = 12, + .args = { + (u32) 1024 /* max_entries */, + (u32) 0 /* map_fd will be stored here */, + (u32) 0 /* prog_fd will be stored here */, + }, + }, + (struct bbff_apply_args) { // store max_entries into map_create's bpf_attr + .opcode = APPLY_ARGS, .size = 4, .arg_offset = 0, .tgt_offset = &LABEL3, + }, + (struct bbff_data) { + .opcode = DATA, .size = ..., + .data = { BTF blob copied from ELF }, + }, + (struct bbff_apply) { // store btf pointer into btf_load's bpf_attr + .opcode = APPLY, .size = 8, .tgt_offset = &LABEL1, + }, + (struct bbff_cmd) { // load BTF + .opcode = CMD, .cmd = BPF_BTF_LOAD, .size = 40, + .attr = (union bpf_attr) { + .btf = .., // LABEL1. 
btf pointer will be stored into here + .btf_size = .., // same as bbff_data.size above + }, + }, + (struct bbff_apply) { // store btf_fd into map_create's bpf_attr + .opcode = APPLY, .size = 4, .tgt_offset = &LABEL2, + }, + (struct bbff_cmd) { // create a map + .opcode = CMD, .cmd = BPF_MAP_CREATE, .size = 60, + .attr = (union bpf_attr) { + .map_type = BPF_MAP_TYPE_ARRAY, + .btf_fd = .., // LABEL2 + .key_size = 4, + .btf_key_type_id = .., + .max_entries = .., // LABEL3 + }, + }, + (struct bbff_apply) { // store map_fd somewhere inside bpf_insn stream + .opcode = APPLY, .size = 4, .tgt_offset = &LABEL4, + }, + (struct bbff_copy_user) { // store map_fd back to user space + .opcode = COPY_USER, .size = 4, .arg_offset = 4, + }, + (struct bbff_exit_on_err) { .opcode = EXIT_ON_ERR }, + (struct bbff_data) { + .opcode = DATA, .size = .., + .data = { // bpf_insn array + BPF_LDX_MEM(), + .., + BPF_LD_MAP_FD(), // LABEL4 + .., + BPF_EXIT_INSN(), + }, + }, + (struct bbff_apply) { // store bpf_insn pointer into prog_load's bpf_attr + .opcode = APPLY, .size = 8, .tgt_offset = &LABEL5, + }, + (struct bbff_cmd) { // load program + .opcode = CMD, .cmd = BPF_PROG_LOAD, .size = 80, + .attr = (union bpf_attr) { + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .expected_attach_type = BPF_CGROUP_INET_INGRESS, + .insns = .., // LABEL5 + }, + }, + (struct bbff_copy_user) { // store prog_fd back to user space + .opcode = COPY_USER, .size = 4, .arg_offset = 8, + }, + +This is how the user space will use BBFF to load this program:: + + u8 bbff_blob[] = { ... }; + struct bbff_args *bbff_args = bbff_blob; + struct my_args { + u32 my_array__max_entries; // input + int my_array__fd; // output + int my_bpf_prog__fd; // output + } *args = bbff_args->args; + + // adjust args->my_array__max_entries from 1024 to other value if necessary + sys_bpf(BPF_BATCH, bbff_blob, sizeof(bbff_blob)); + // args->my_array__fd + +The bpftool will generate .c, .h, and .bbff files out of .o file. 
+The .bbff file will contain the BBFF in binary format. +The .c file will contain the same BBFF in .c form, like the example above. +The .h will contain a single struct that describes the layout of the arguments, +like struct my_args above. + +By default the bpftool will generate the .h with the following args to be +customized by the user:: + + struct args { + /* inputs */ + struct { + u32 max_entries; + } map_name1, map_nameN; + struct { + u32 attach_prog_fd; + u32 attach_btf_id; + } prog_name1, prog_nameN; + u32 log_level; + u32 log_size; + u64 log_buf; /* verifier log stored here */ + /* outputs */ + int map_name1__fd; + int map_nameN__fd; + int prog_name1__fd; + int prog_nameN__fd; + }; + +If rodata or global variables are used by the program, they will be generated +with their actual names as part of 'struct args' as well. + +BBFF can be seen as a replacement for the "skeleton" that doesn't rely on ELF +and needs minimal libbpf support. These few libbpf functions can be split +into libbpf-mini or libbbff. + +Handling CO-RE +============== + +1. CO-RE can be supported as bbff_args, but then a non-trivial part of libbpf +would have to run to compute those args before the BPF_BATCH command. +Hence it's cleaner to: +2. refactor libbpf's CO-RE handling code to run in the kernel and let the kernel +handle CO-RE relocations as part of the BPF_PROG_LOAD command. It has the important +additional benefit that the complex verifier.c:btf_struct_access() and +btf_struct_walk() logic will be deprecated. The field access will be precise. +The skb->dev->ifindex dereferences will no longer be ambiguous.