diff options
-rw-r--r-- | etc/iproute2/bpf_pinning | 6 | ||||
-rw-r--r-- | examples/bpf/README | 13 | ||||
-rw-r--r-- | examples/bpf/bpf_cyclic.c | 30 | ||||
-rw-r--r-- | examples/bpf/bpf_funcs.h | 58 | ||||
-rw-r--r-- | examples/bpf/bpf_graft.c | 67 | ||||
-rw-r--r-- | examples/bpf/bpf_prog.c | 33 | ||||
-rw-r--r-- | examples/bpf/bpf_shared.c | 48 | ||||
-rw-r--r-- | examples/bpf/bpf_shared.h | 6 | ||||
-rw-r--r-- | examples/bpf/bpf_tailcall.c | 99 | ||||
-rw-r--r-- | include/bpf_api.h | 225 | ||||
-rw-r--r-- | include/bpf_elf.h | 6 | ||||
-rw-r--r-- | include/utils.h | 7 | ||||
-rw-r--r-- | lib/rt_names.c | 5 | ||||
-rw-r--r-- | tc/e_bpf.c | 46 | ||||
-rw-r--r-- | tc/f_bpf.c | 131 | ||||
-rw-r--r-- | tc/m_bpf.c | 158 | ||||
-rw-r--r-- | tc/tc_bpf.c | 1619 | ||||
-rw-r--r-- | tc/tc_bpf.h | 74 |
18 files changed, 1947 insertions, 684 deletions
diff --git a/etc/iproute2/bpf_pinning b/etc/iproute2/bpf_pinning new file mode 100644 index 0000000..2b39c70 --- /dev/null +++ b/etc/iproute2/bpf_pinning @@ -0,0 +1,6 @@ +# +# subpath mappings from mount point for pinning +# +#3 tracing +#4 foo/bar +#5 tc/cls1 diff --git a/examples/bpf/README b/examples/bpf/README new file mode 100644 index 0000000..4247257 --- /dev/null +++ b/examples/bpf/README @@ -0,0 +1,13 @@ +eBPF toy code examples (running in kernel) to familiarize yourself +with syntax and features: + + - bpf_prog.c -> Classifier examples using maps + - bpf_shared.c -> Ingress/egress map sharing example + - bpf_tailcall.c -> Using tail call chains + - bpf_cyclic.c -> Simple cycle as tail calls + - bpf_graft.c -> Demo on altering runtime behaviour + +User space code example: + + - bpf_agent.c -> Counterpart to bpf_prog.c for user + space to transfer/read out map data diff --git a/examples/bpf/bpf_cyclic.c b/examples/bpf/bpf_cyclic.c new file mode 100644 index 0000000..c66cbec --- /dev/null +++ b/examples/bpf/bpf_cyclic.c @@ -0,0 +1,30 @@ +#include "../../include/bpf_api.h" + +/* Cyclic dependency example to test the kernel's runtime upper + * bound on loops. Also demonstrates how to use direct-actions, + * loaded as: tc filter add [...] bpf da obj [...] 
+ */ +#define JMP_MAP_ID 0xabccba + +BPF_PROG_ARRAY(jmp_tc, JMP_MAP_ID, PIN_OBJECT_NS, 1); + +__section_tail(JMP_MAP_ID, 0) +int cls_loop(struct __sk_buff *skb) +{ + char fmt[] = "cb: %u\n"; + + trace_printk(fmt, sizeof(fmt), skb->cb[0]++); + tail_call(skb, &jmp_tc, 0); + + skb->tc_classid = TC_H_MAKE(1, 42); + return TC_ACT_OK; +} + +__section_cls_entry +int cls_entry(struct __sk_buff *skb) +{ + tail_call(skb, &jmp_tc, 0); + return TC_ACT_SHOT; +} + +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_funcs.h b/examples/bpf/bpf_funcs.h deleted file mode 100644 index 1545fa9..0000000 --- a/examples/bpf/bpf_funcs.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef __BPF_FUNCS__ -#define __BPF_FUNCS__ - -/* Misc macros. */ -#ifndef __maybe_unused -# define __maybe_unused __attribute__ ((__unused__)) -#endif - -#ifndef __section -# define __section(NAME) __attribute__((section(NAME), used)) -#endif - -#ifndef offsetof -# define offsetof __builtin_offsetof -#endif - -#ifndef htons -# define htons(x) __constant_htons((x)) -#endif - -#ifndef likely -# define likely(x) __builtin_expect(!!(x), 1) -#endif - -#ifndef unlikely -# define unlikely(x) __builtin_expect(!!(x), 0) -#endif - -/* The verifier will translate them to actual function calls. */ -static void *(*bpf_map_lookup_elem)(void *map, void *key) __maybe_unused = - (void *) BPF_FUNC_map_lookup_elem; - -static int (*bpf_map_update_elem)(void *map, void *key, void *value, - unsigned long long flags) __maybe_unused = - (void *) BPF_FUNC_map_update_elem; - -static int (*bpf_map_delete_elem)(void *map, void *key) __maybe_unused = - (void *) BPF_FUNC_map_delete_elem; - -static unsigned int (*get_smp_processor_id)(void) __maybe_unused = - (void *) BPF_FUNC_get_smp_processor_id; - -static unsigned int (*get_prandom_u32)(void) __maybe_unused = - (void *) BPF_FUNC_get_prandom_u32; - -/* LLVM built-in functions that an eBPF C program may use to emit - * BPF_LD_ABS and BPF_LD_IND instructions. 
- */ -unsigned long long load_byte(void *skb, unsigned long long off) - asm ("llvm.bpf.load.byte"); - -unsigned long long load_half(void *skb, unsigned long long off) - asm ("llvm.bpf.load.half"); - -unsigned long long load_word(void *skb, unsigned long long off) - asm ("llvm.bpf.load.word"); - -#endif /* __BPF_FUNCS__ */ diff --git a/examples/bpf/bpf_graft.c b/examples/bpf/bpf_graft.c new file mode 100644 index 0000000..f48fd02 --- /dev/null +++ b/examples/bpf/bpf_graft.c @@ -0,0 +1,67 @@ +#include "../../include/bpf_api.h" + +/* This example demonstrates how classifier run-time behaviour + * can be altered with tail calls. We start out with an empty + * jmp_tc array, then add section aaa to the array slot 0, and + * later on atomically replace it with section bbb. Note that + * as shown in other examples, the tc loader can prepopulate + * tail called sections, here we start out with an empty one + * on purpose to show it can also be done this way. + * + * tc filter add dev foo parent ffff: bpf obj graft.o + * tc exec bpf dbg + * [...] + * Socket Thread-20229 [001] ..s. 138993.003923: : fallthrough + * <idle>-0 [001] ..s. 138993.202265: : fallthrough + * Socket Thread-20229 [001] ..s. 138994.004149: : fallthrough + * [...] + * + * tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec aaa + * tc exec bpf dbg + * [...] + * Socket Thread-19818 [002] ..s. 139012.053587: : aaa + * <idle>-0 [002] ..s. 139012.172359: : aaa + * Socket Thread-19818 [001] ..s. 139012.173556: : aaa + * [...] + * + * tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec bbb + * tc exec bpf dbg + * [...] + * Socket Thread-19818 [002] ..s. 139022.102967: : bbb + * <idle>-0 [002] ..s. 139022.155640: : bbb + * Socket Thread-19818 [001] ..s. 139022.156730: : bbb + * [...] 
+ */ + +BPF_PROG_ARRAY(jmp_tc, 0, PIN_GLOBAL_NS, 1); + +__section("aaa") +int cls_aaa(struct __sk_buff *skb) +{ + char fmt[] = "aaa\n"; + + trace_printk(fmt, sizeof(fmt)); + return TC_H_MAKE(1, 42); +} + +__section("bbb") +int cls_bbb(struct __sk_buff *skb) +{ + char fmt[] = "bbb\n"; + + trace_printk(fmt, sizeof(fmt)); + return TC_H_MAKE(1, 43); +} + +__section_cls_entry +int cls_entry(struct __sk_buff *skb) +{ + char fmt[] = "fallthrough\n"; + + tail_call(skb, &jmp_tc, 0); + trace_printk(fmt, sizeof(fmt)); + + return BPF_H_DEFAULT; +} + +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_prog.c b/examples/bpf/bpf_prog.c index 009febd..4728049 100644 --- a/examples/bpf/bpf_prog.c +++ b/examples/bpf/bpf_prog.c @@ -168,8 +168,8 @@ /* Common, shared definitions with ebpf_agent.c. */ #include "bpf_shared.h" -/* Selection of BPF helper functions for our example. */ -#include "bpf_funcs.h" +/* BPF helper functions for our example. */ +#include "../../include/bpf_api.h" /* Could be defined here as well, or included from the header. 
*/ #define TC_ACT_UNSPEC (-1) @@ -387,10 +387,10 @@ static inline void cls_update_proto_map(const struct __sk_buff *skb, uint8_t proto = flow->ip_proto; struct count_tuple *ct, _ct; - ct = bpf_map_lookup_elem(&map_proto, &proto); + ct = map_lookup_elem(&map_proto, &proto); if (likely(ct)) { - __sync_fetch_and_add(&ct->packets, 1); - __sync_fetch_and_add(&ct->bytes, skb->len); + lock_xadd(&ct->packets, 1); + lock_xadd(&ct->bytes, skb->len); return; } @@ -398,7 +398,7 @@ static inline void cls_update_proto_map(const struct __sk_buff *skb, _ct.packets = 1; _ct.bytes = skb->len; - bpf_map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); + map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); } static inline void cls_update_queue_map(const struct __sk_buff *skb) @@ -409,11 +409,11 @@ static inline void cls_update_queue_map(const struct __sk_buff *skb) mismatch = skb->queue_mapping != get_smp_processor_id(); - cq = bpf_map_lookup_elem(&map_queue, &queue); + cq = map_lookup_elem(&map_queue, &queue); if (likely(cq)) { - __sync_fetch_and_add(&cq->total, 1); + lock_xadd(&cq->total, 1); if (mismatch) - __sync_fetch_and_add(&cq->mismatch, 1); + lock_xadd(&cq->mismatch, 1); return; } @@ -421,7 +421,7 @@ static inline void cls_update_queue_map(const struct __sk_buff *skb) _cq.total = 1; _cq.mismatch = mismatch ? 1 : 0; - bpf_map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); + map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); } /* eBPF program definitions, placed in various sections, which can @@ -439,7 +439,8 @@ static inline void cls_update_queue_map(const struct __sk_buff *skb) * It is however not required to have multiple programs sharing * a file. 
*/ -__section("classifier") int cls_main(struct __sk_buff *skb) +__section("classifier") +int cls_main(struct __sk_buff *skb) { struct flow_keys flow; @@ -456,13 +457,14 @@ static inline void act_update_drop_map(void) { uint32_t *count, cpu = get_smp_processor_id(); - count = bpf_map_lookup_elem(&map_drops, &cpu); + count = map_lookup_elem(&map_drops, &cpu); if (count) /* Only this cpu is accessing this element. */ (*count)++; } -__section("action-mark") int act_mark_main(struct __sk_buff *skb) +__section("action-mark") +int act_mark_main(struct __sk_buff *skb) { /* You could also mangle skb data here with the helper function * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could @@ -479,7 +481,8 @@ __section("action-mark") int act_mark_main(struct __sk_buff *skb) return TC_ACT_UNSPEC; } -__section("action-rand") int act_rand_main(struct __sk_buff *skb) +__section("action-rand") +int act_rand_main(struct __sk_buff *skb) { /* Sorry, we're near event horizon ... */ if ((get_prandom_u32() & 3) == 0) { @@ -493,4 +496,4 @@ __section("action-rand") int act_rand_main(struct __sk_buff *skb) /* Last but not least, the file contains a license. Some future helper * functions may only be available with a GPL license. */ -char __license[] __section("license") = "GPL"; +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_shared.c b/examples/bpf/bpf_shared.c new file mode 100644 index 0000000..accc0ad --- /dev/null +++ b/examples/bpf/bpf_shared.c @@ -0,0 +1,48 @@ +#include "../../include/bpf_api.h" + +/* Minimal, stand-alone toy map pinning example: + * + * clang -target bpf -O2 [...] -o bpf_shared.o -c bpf_shared.c + * tc filter add dev foo parent 1: bpf obj bpf_shared.o sec egress + * tc filter add dev foo parent ffff: bpf obj bpf_shared.o sec ingress + * + * Both classifier will share the very same map instance in this example, + * so map content can be accessed from ingress *and* egress side! 
+ * + * This example has a pinning of PIN_OBJECT_NS, so it's private and + * thus shared among various program sections within the object. + * + * A setting of PIN_GLOBAL_NS would place it into a global namespace, + * so that it can be shared among different object files. A setting + * of PIN_NONE (= 0) means no sharing, so on each tc invocation a new + * map instance is created. + */ + +BPF_ARRAY4(map_sh, 0, PIN_OBJECT_NS, 1); /* or PIN_GLOBAL_NS, or PIN_NONE */ + +__section("egress") +int emain(struct __sk_buff *skb) +{ + int key = 0, *val; + + val = map_lookup_elem(&map_sh, &key); + if (val) + lock_xadd(val, 1); + + return BPF_H_DEFAULT; +} + +__section("ingress") +int imain(struct __sk_buff *skb) +{ + char fmt[] = "map val: %d\n"; + int key = 0, *val; + + val = map_lookup_elem(&map_sh, &key); + if (val) + trace_printk(fmt, sizeof(fmt), *val); + + return BPF_H_DEFAULT; +} + +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_shared.h b/examples/bpf/bpf_shared.h index 46423ec..a24038d 100644 --- a/examples/bpf/bpf_shared.h +++ b/examples/bpf/bpf_shared.h @@ -1,10 +1,6 @@ #ifndef __BPF_SHARED__ #define __BPF_SHARED__ -#include <stdint.h> - -#include "../../include/bpf_elf.h" - enum { BPF_MAP_ID_PROTO, BPF_MAP_ID_QUEUE, @@ -14,7 +10,7 @@ enum { }; struct count_tuple { - long packets; /* type long for __sync_fetch_and_add() */ + long packets; /* type long for lock_xadd() */ long bytes; }; diff --git a/examples/bpf/bpf_tailcall.c b/examples/bpf/bpf_tailcall.c new file mode 100644 index 0000000..040790d --- /dev/null +++ b/examples/bpf/bpf_tailcall.c @@ -0,0 +1,99 @@ +#include "../../include/bpf_api.h" + +#define ENTRY_INIT 3 +#define ENTRY_0 0 +#define ENTRY_1 1 +#define MAX_JMP_SIZE 2 + +#define FOO 42 +#define BAR 43 + +/* This example doesn't really do anything useful, but its purpose is to + * demonstrate eBPF tail calls on a very simple example. 
+ * + * cls_entry() is our classifier entry point, from there we jump based on + * skb->hash into cls_case1() or cls_case2(). They are both part of the + * program array jmp_tc. Indicated via __section_tail(), the tc loader + * populates the program arrays with the loaded file descriptors already. + * + * To demonstrate nested jumps, cls_case2() jumps within the same jmp_tc + * array to cls_case1(). And whenever we arrive at cls_case1(), we jump + * into cls_exit(), part of the jump array jmp_ex. + * + * Also, to show it's possible, all programs share map_sh and dump the value + * that the entry point incremented. The sections that are loaded into a + * program array can be atomically replaced during run-time, e.g. to change + * classifier behaviour. + */ + +BPF_PROG_ARRAY(jmp_tc, FOO, PIN_OBJECT_NS, MAX_JMP_SIZE); +BPF_PROG_ARRAY(jmp_ex, BAR, PIN_OBJECT_NS, 1); + +BPF_ARRAY4(map_sh, 0, PIN_OBJECT_NS, 1); + +__section_tail(FOO, ENTRY_0) +int cls_case1(struct __sk_buff *skb) +{ + char fmt[] = "case1: map-val: %d from:%u\n"; + int key = 0, *val; + + val = map_lookup_elem(&map_sh, &key); + if (val) + trace_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + + skb->cb[0] = ENTRY_0; + tail_call(skb, &jmp_ex, ENTRY_0); + + return BPF_H_DEFAULT; +} + +__section_tail(FOO, ENTRY_1) +int cls_case2(struct __sk_buff *skb) +{ + char fmt[] = "case2: map-val: %d from:%u\n"; + int key = 0, *val; + + val = map_lookup_elem(&map_sh, &key); + if (val) + trace_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + + skb->cb[0] = ENTRY_1; + tail_call(skb, &jmp_tc, ENTRY_0); + + return BPF_H_DEFAULT; +} + +__section_tail(BAR, ENTRY_0) +int cls_exit(struct __sk_buff *skb) +{ + char fmt[] = "exit: map-val: %d from:%u\n"; + int key = 0, *val; + + val = map_lookup_elem(&map_sh, &key); + if (val) + trace_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + + /* Termination point. 
*/ + return BPF_H_DEFAULT; +} + +__section_cls_entry +int cls_entry(struct __sk_buff *skb) +{ + char fmt[] = "fallthrough\n"; + int key = 0, *val; + + /* For transferring state, we can use skb->cb[0] ... skb->cb[4]. */ + val = map_lookup_elem(&map_sh, &key); + if (val) { + lock_xadd(val, 1); + + skb->cb[0] = ENTRY_INIT; + tail_call(skb, &jmp_tc, skb->hash & (MAX_JMP_SIZE - 1)); + } + + trace_printk(fmt, sizeof(fmt)); + return BPF_H_DEFAULT; +} + +BPF_LICENSE("GPL"); diff --git a/include/bpf_api.h b/include/bpf_api.h new file mode 100644 index 0000000..8503b9a --- /dev/null +++ b/include/bpf_api.h @@ -0,0 +1,225 @@ +#ifndef __BPF_API__ +#define __BPF_API__ + +/* Note: + * + * This file can be included into eBPF kernel programs. It contains + * a couple of useful helper functions, map/section ABI (bpf_elf.h), + * misc macros and some eBPF specific LLVM built-ins. + */ + +#include <stdint.h> + +#include <linux/pkt_cls.h> +#include <linux/bpf.h> +#include <linux/filter.h> + +#include <asm/byteorder.h> + +#include "bpf_elf.h" + +/** Misc macros. */ + +#ifndef __stringify +# define __stringify(X) #X +#endif + +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#endif + +#ifndef likely +# define likely(X) __builtin_expect(!!(X), 1) +#endif + +#ifndef unlikely +# define unlikely(X) __builtin_expect(!!(X), 0) +#endif + +#ifndef htons +# define htons(X) __constant_htons((X)) +#endif + +#ifndef ntohs +# define ntohs(X) __constant_ntohs((X)) +#endif + +#ifndef htonl +# define htonl(X) __constant_htonl((X)) +#endif + +#ifndef ntohl +# define ntohl(X) __constant_ntohl((X)) +#endif + +/** Section helper macros. 
*/ + +#ifndef __section +# define __section(NAME) \ + __attribute__((section(NAME), used)) +#endif + +#ifndef __section_tail +# define __section_tail(ID, KEY) \ + __section(__stringify(ID) "/" __stringify(KEY)) +#endif + +#ifndef __section_cls_entry +# define __section_cls_entry \ + __section(ELF_SECTION_CLASSIFIER) +#endif + +#ifndef __section_act_entry +# define __section_act_entry \ + __section(ELF_SECTION_ACTION) +#endif + +#ifndef __section_license +# define __section_license \ + __section(ELF_SECTION_LICENSE) +#endif + +#ifndef __section_maps +# define __section_maps \ + __section(ELF_SECTION_MAPS) +#endif + +/** Declaration helper macros. */ + +#ifndef BPF_LICENSE +# define BPF_LICENSE(NAME) \ + char ____license[] __section_license = NAME +#endif + +#ifndef __BPF_MAP +# define __BPF_MAP(NAME, TYPE, ID, SIZE_KEY, SIZE_VALUE, PIN, MAX_ELEM) \ + struct bpf_elf_map __section_maps NAME = { \ + .type = (TYPE), \ + .id = (ID), \ + .size_key = (SIZE_KEY), \ + .size_value = (SIZE_VALUE), \ + .pinning = (PIN), \ + .max_elem = (MAX_ELEM), \ + } +#endif + +#ifndef BPF_HASH +# define BPF_HASH(NAME, ID, SIZE_KEY, SIZE_VALUE, PIN, MAX_ELEM) \ + __BPF_MAP(NAME, BPF_MAP_TYPE_HASH, ID, SIZE_KEY, SIZE_VALUE, \ + PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY +# define BPF_ARRAY(NAME, ID, SIZE_VALUE, PIN, MAX_ELEM) \ + __BPF_MAP(NAME, BPF_MAP_TYPE_ARRAY, ID, sizeof(uint32_t), \ + SIZE_VALUE, PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY2 +# define BPF_ARRAY2(NAME, ID, PIN, MAX_ELEM) \ + BPF_ARRAY(NAME, ID, sizeof(uint16_t), PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY4 +# define BPF_ARRAY4(NAME, ID, PIN, MAX_ELEM) \ + BPF_ARRAY(NAME, ID, sizeof(uint32_t), PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY8 +# define BPF_ARRAY8(NAME, ID, PIN, MAX_ELEM) \ + BPF_ARRAY(NAME, ID, sizeof(uint64_t), PIN, MAX_ELEM) +#endif + +#ifndef BPF_PROG_ARRAY +# define BPF_PROG_ARRAY(NAME, ID, PIN, MAX_ELEM) \ + __BPF_MAP(NAME, BPF_MAP_TYPE_PROG_ARRAY, ID, sizeof(uint32_t), \ + sizeof(uint32_t), PIN, 
MAX_ELEM) +#endif + +/** Classifier helper */ + +#ifndef BPF_H_DEFAULT +# define BPF_H_DEFAULT -1 +#endif + +/** BPF helper functions for tc. */ + +#ifndef BPF_FUNC +# define BPF_FUNC(NAME, ...) \ + (* NAME)(__VA_ARGS__) __maybe_unused = (void *) BPF_FUNC_##NAME +#endif + +/* Map access/manipulation */ +static void *BPF_FUNC(map_lookup_elem, void *map, const void *key); +static int BPF_FUNC(map_update_elem, void *map, const void *key, + const void *value, uint32_t flags); +static int BPF_FUNC(map_delete_elem, void *map, const void *key); + +/* Time access */ +static uint64_t BPF_FUNC(ktime_get_ns); + +/* Debugging */ +static void BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...); + +/* Random numbers */ +static uint32_t BPF_FUNC(get_prandom_u32); + +/* Tail calls */ +static void BPF_FUNC(tail_call, struct __sk_buff *skb, void *map, + uint32_t index); + +/* System helpers */ +static uint32_t BPF_FUNC(get_smp_processor_id); + +/* Packet misc meta data */ +static uint32_t BPF_FUNC(get_cgroup_classid, struct __sk_buff *skb); +static uint32_t BPF_FUNC(get_route_realm, struct __sk_buff *skb); + +/* Packet redirection */ +static int BPF_FUNC(redirect, int ifindex, uint32_t flags); +static int BPF_FUNC(clone_redirect, struct __sk_buff *skb, int ifindex, + uint32_t flags); + +/* Packet manipulation */ +#define BPF_PSEUDO_HDR 0x10 +#define BPF_HAS_PSEUDO_HDR(flags) ((flags) & BPF_PSEUDO_HDR) +#define BPF_HDR_FIELD_SIZE(flags) ((flags) & 0x0f) + +static int BPF_FUNC(skb_store_bytes, struct __sk_buff *skb, uint32_t off, + void *from, uint32_t len, uint32_t flags); +static int BPF_FUNC(l3_csum_replace, struct __sk_buff *skb, uint32_t off, + uint32_t from, uint32_t to, uint32_t flags); +static int BPF_FUNC(l4_csum_replace, struct __sk_buff *skb, uint32_t off, + uint32_t from, uint32_t to, uint32_t flags); + +/* Packet vlan encap/decap */ +static int BPF_FUNC(skb_vlan_push, struct __sk_buff *skb, uint16_t proto, + uint16_t vlan_tci); +static int BPF_FUNC(skb_vlan_pop, 
struct __sk_buff *skb); + +/* Packet tunnel encap/decap */ +static int BPF_FUNC(skb_get_tunnel_key, struct __sk_buff *skb, + struct bpf_tunnel_key *to, uint32_t size, uint32_t flags); +static int BPF_FUNC(skb_set_tunnel_key, struct __sk_buff *skb, + struct bpf_tunnel_key *from, uint32_t size, uint32_t flags); + +/** LLVM built-ins */ + +#ifndef lock_xadd +# define lock_xadd(ptr, val) ((void) __sync_fetch_and_add(ptr, val)) +#endif + +unsigned long long load_byte(void *skb, unsigned long long off) + asm ("llvm.bpf.load.byte"); + +unsigned long long load_half(void *skb, unsigned long long off) + asm ("llvm.bpf.load.half"); + +unsigned long long load_word(void *skb, unsigned long long off) + asm ("llvm.bpf.load.word"); + +#endif /* __BPF_API__ */ diff --git a/include/bpf_elf.h b/include/bpf_elf.h index 4bd6bb0..31a8974 100644 --- a/include/bpf_elf.h +++ b/include/bpf_elf.h @@ -21,6 +21,11 @@ #define ELF_MAX_MAPS 64 #define ELF_MAX_LICENSE_LEN 128 +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + /* ELF map definition */ struct bpf_elf_map { __u32 type; @@ -28,6 +33,7 @@ struct bpf_elf_map { __u32 size_value; __u32 max_elem; __u32 id; + __u32 pinning; }; #endif /* __BPF_ELF__ */ diff --git a/include/utils.h b/include/utils.h index cc821e8..7310f4e 100644 --- a/include/utils.h +++ b/include/utils.h @@ -40,6 +40,10 @@ extern bool do_all; #define IPSEC_PROTO_ANY 255 #endif +#ifndef CONFDIR +#define CONFDIR "/etc/iproute2" +#endif + #define SPRINT_BSIZE 64 #define SPRINT_BUF(x) char x[SPRINT_BSIZE] @@ -196,6 +200,9 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n); __attribute__ ((format (printf, (pos_str), (pos_args)))) #endif +#define _textify(x) #x +#define textify(x) _textify(x) + #define htonll(x) ((1==htonl(1)) ? (x) : ((uint64_t)htonl((x) & 0xFFFFFFFF) << 32) | htonl((x) >> 32)) #define ntohll(x) ((1==ntohl(1)) ? 
(x) : ((uint64_t)ntohl((x) & 0xFFFFFFFF) << 32) | ntohl((x) >> 32)) diff --git a/lib/rt_names.c b/lib/rt_names.c index 1071a93..f6d17c0 100644 --- a/lib/rt_names.c +++ b/lib/rt_names.c @@ -23,10 +23,7 @@ #include <linux/rtnetlink.h> #include "rt_names.h" - -#ifndef CONFDIR -#define CONFDIR "/etc/iproute2" -#endif +#include "utils.h" #define NAME_MAX_LEN 512 @@ -26,10 +26,19 @@ static char *argv_default[] = { BPF_DEFAULT_CMD, NULL }; static void explain(void) { - fprintf(stderr, "Usage: ... bpf [ import UDS_FILE ] [ run CMD ]\n\n"); + fprintf(stderr, "Usage: ... bpf [ import UDS_FILE ] [ run CMD ]\n"); + fprintf(stderr, " ... bpf [ debug ]\n"); + fprintf(stderr, " ... bpf [ graft MAP_FILE ] [ key KEY ]\n"); + fprintf(stderr, " `... [ object-file OBJ_FILE ] [ type TYPE ] [ section NAME ] [ verbose ]\n"); + fprintf(stderr, " `... [ object-pinned PROG_FILE ]\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Where UDS_FILE provides the name of a unix domain socket file\n"); fprintf(stderr, "to import eBPF maps and the optional CMD denotes the command\n"); fprintf(stderr, "to be executed (default: \'%s\').\n", BPF_DEFAULT_CMD); + fprintf(stderr, "Where MAP_FILE points to a pinned map, OBJ_FILE to an object file\n"); + fprintf(stderr, "and PROG_FILE to a pinned program. TYPE can be {cls, act}, where\n"); + fprintf(stderr, "\'cls\' is default. 
KEY is optional and can be inferred from the\n"); + fprintf(stderr, "section name, otherwise it needs to be provided.\n"); } static int bpf_num_env_entries(void) @@ -58,17 +67,40 @@ static int parse_bpf(struct exec_util *eu, int argc, char **argv) NEXT_ARG(); argv_run = argv; break; - } else if (matches(*argv, "import") == 0 || - matches(*argv, "imp") == 0) { + } else if (matches(*argv, "import") == 0) { NEXT_ARG(); bpf_uds_name = *argv; + } else if (matches(*argv, "debug") == 0 || + matches(*argv, "dbg") == 0) { + if (bpf_trace_pipe()) + fprintf(stderr, + "No trace pipe, tracefs not mounted?\n"); + return -1; + } else if (matches(*argv, "graft") == 0) { + const char *bpf_map_path; + bool has_key = false; + uint32_t key; + + NEXT_ARG(); + bpf_map_path = *argv; + NEXT_ARG(); + if (matches(*argv, "key") == 0) { + NEXT_ARG(); + if (get_unsigned(&key, *argv, 0)) { + fprintf(stderr, "Illegal \"key\"\n"); + return -1; + } + has_key = true; + NEXT_ARG(); + } + return bpf_graft_map(bpf_map_path, has_key ? 
+ &key : NULL, argc, argv); } else { explain(); return -1; } - argc--; - argv++; + NEXT_ARG_FWD(); } if (!bpf_uds_name) { @@ -142,6 +174,6 @@ err: } struct exec_util bpf_exec_util = { - .id = "bpf", - .parse_eopt = parse_bpf, + .id = "bpf", + .parse_eopt = parse_bpf, }; @@ -11,19 +11,8 @@ #include <stdio.h> #include <stdlib.h> -#include <unistd.h> -#include <syslog.h> -#include <fcntl.h> -#include <libgen.h> -#include <sys/socket.h> -#include <netinet/in.h> -#include <arpa/inet.h> -#include <string.h> -#include <stdbool.h> -#include <errno.h> -#include <limits.h> -#include <linux/filter.h> -#include <linux/if.h> + +#include <linux/bpf.h> #include "utils.h" #include "tc_util.h" @@ -31,6 +20,13 @@ static const enum bpf_prog_type bpf_type = BPF_PROG_TYPE_SCHED_CLS; +static const int nla_tbl[BPF_NLA_MAX] = { + [BPF_NLA_OPS_LEN] = TCA_BPF_OPS_LEN, + [BPF_NLA_OPS] = TCA_BPF_OPS, + [BPF_NLA_FD] = TCA_BPF_FD, + [BPF_NLA_NAME] = TCA_BPF_NAME, +}; + static void explain(void) { fprintf(stderr, "Usage: ... 
bpf ...\n"); @@ -42,6 +38,7 @@ static void explain(void) fprintf(stderr, "eBPF use case:\n"); fprintf(stderr, " object-file FILE [ section CLS_NAME ] [ export UDS_FILE ]"); fprintf(stderr, " [ verbose ] [ direct-action ]\n"); + fprintf(stderr, " object-pinned FILE [ direct-action ]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Common remaining options:\n"); fprintf(stderr, " [ action ACTION_SPEC ]\n"); @@ -51,7 +48,8 @@ static void explain(void) fprintf(stderr, "c,t,f,k and s are decimals; s denotes number of 4-tuples\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string,\n"); - fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode.\n"); + fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode, or a\n"); + fprintf(stderr, "pinned eBPF program.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where CLS_NAME refers to the section name containing the\n"); fprintf(stderr, "classifier (default \'%s\').\n", bpf_default_section(bpf_type)); @@ -66,119 +64,38 @@ static void explain(void) static int bpf_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n) { + const char *bpf_obj = NULL, *bpf_uds_name = NULL; struct tcmsg *t = NLMSG_DATA(n); - const char *bpf_uds_name = NULL; - const char *bpf_sec_name = NULL; unsigned int bpf_flags = 0; - char *bpf_obj = NULL; - struct rtattr *tail; bool seen_run = false; - long h = 0; + struct rtattr *tail; int ret = 0; if (argc == 0) return 0; if (handle) { - h = strtol(handle, NULL, 0); - if (h == LONG_MIN || h == LONG_MAX) { - fprintf(stderr, "Illegal handle \"%s\", must be " - "numeric.\n", handle); + if (get_u32(&t->tcm_handle, handle, 0)) { + fprintf(stderr, "Illegal \"handle\"\n"); return -1; } } - t->tcm_handle = h; - tail = (struct rtattr *)(((void *)n) + NLMSG_ALIGN(n->nlmsg_len)); addattr_l(n, MAX_MSG, TCA_OPTIONS, NULL, 0); while (argc > 0) { if (matches(*argv, "run") == 0) { - struct sock_filter 
bpf_ops[BPF_MAXINSNS]; - bool from_file, ebpf, bpf_verbose; - int ret; - NEXT_ARG(); opt_bpf: - bpf_sec_name = bpf_default_section(bpf_type); - bpf_verbose = false; - ebpf = false; seen_run = true; - - if (strcmp(*argv, "bytecode-file") == 0 || - strcmp(*argv, "bcf") == 0) { - from_file = true; - } else if (strcmp(*argv, "bytecode") == 0 || - strcmp(*argv, "bc") == 0) { - from_file = false; - } else if (strcmp(*argv, "object-file") == 0 || - strcmp(*argv, "obj") == 0) { - ebpf = true; - } else { - fprintf(stderr, "What is \"%s\"?\n", *argv); - explain(); - return -1; - } - - NEXT_ARG(); - if (ebpf) { - bpf_uds_name = getenv(BPF_ENV_UDS); - bpf_obj = *argv; - - NEXT_ARG_FWD(); - - if (argc > 0 && - (strcmp(*argv, "section") == 0 || - strcmp(*argv, "sec") == 0)) { - NEXT_ARG(); - bpf_sec_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && !bpf_uds_name && - (strcmp(*argv, "export") == 0 || - strcmp(*argv, "exp") == 0)) { - NEXT_ARG(); - bpf_uds_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && - (strcmp(*argv, "verbose") == 0 || - strcmp(*argv, "verb") == 0)) { - bpf_verbose = true; - NEXT_ARG_FWD(); - } - - PREV_ARG(); - } - - ret = ebpf ? bpf_open_object(bpf_obj, bpf_type, bpf_sec_name, - bpf_verbose) : - bpf_parse_ops(argc, argv, bpf_ops, from_file); - if (ret < 0) { - fprintf(stderr, "%s\n", ebpf ? 
- "Could not load object" : - "Illegal \"bytecode\""); + if (bpf_parse_common(&argc, &argv, nla_tbl, bpf_type, + &bpf_obj, &bpf_uds_name, n)) { + fprintf(stderr, "Failed to retrieve (e)BPF data!\n"); return -1; } - - if (ebpf) { - char bpf_name[256]; - - bpf_obj = basename(bpf_obj); - - snprintf(bpf_name, sizeof(bpf_name), "%s:[%s]", - bpf_obj, bpf_sec_name); - - addattr32(n, MAX_MSG, TCA_BPF_FD, ret); - addattrstrz(n, MAX_MSG, TCA_BPF_NAME, bpf_name); - } else { - addattr16(n, MAX_MSG, TCA_BPF_OPS_LEN, ret); - addattr_l(n, MAX_MSG, TCA_BPF_OPS, &bpf_ops, - ret * sizeof(struct sock_filter)); - } } else if (matches(*argv, "classid") == 0 || - strcmp(*argv, "flowid") == 0) { + matches(*argv, "flowid") == 0) { unsigned int handle; NEXT_ARG(); @@ -204,7 +121,7 @@ opt_bpf: return -1; } continue; - } else if (strcmp(*argv, "help") == 0) { + } else if (matches(*argv, "help") == 0) { explain(); return -1; } else { @@ -280,7 +197,7 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f, } struct filter_util bpf_filter_util = { - .id = "bpf", - .parse_fopt = bpf_parse_opt, - .print_fopt = bpf_print_opt, + .id = "bpf", + .parse_fopt = bpf_parse_opt, + .print_fopt = bpf_print_opt, }; @@ -12,20 +12,23 @@ #include <stdio.h> #include <stdlib.h> -#include <unistd.h> -#include <string.h> -#include <stdbool.h> -#include <libgen.h> + #include <linux/bpf.h> #include <linux/tc_act/tc_bpf.h> #include "utils.h" -#include "rt_names.h" #include "tc_util.h" #include "tc_bpf.h" static const enum bpf_prog_type bpf_type = BPF_PROG_TYPE_SCHED_ACT; +static const int nla_tbl[BPF_NLA_MAX] = { + [BPF_NLA_OPS_LEN] = TCA_ACT_BPF_OPS_LEN, + [BPF_NLA_OPS] = TCA_ACT_BPF_OPS, + [BPF_NLA_FD] = TCA_ACT_BPF_FD, + [BPF_NLA_NAME] = TCA_ACT_BPF_NAME, +}; + static void explain(void) { fprintf(stderr, "Usage: ... bpf ... 
[ index INDEX ]\n"); @@ -37,12 +40,14 @@ static void explain(void) fprintf(stderr, "eBPF use case:\n"); fprintf(stderr, " object-file FILE [ section ACT_NAME ] [ export UDS_FILE ]"); fprintf(stderr, " [ verbose ]\n"); + fprintf(stderr, " object-pinned FILE\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where BPF_BYTECODE := \'s,c t f k,c t f k,c t f k,...\'\n"); fprintf(stderr, "c,t,f,k and s are decimals; s denotes number of 4-tuples\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string,\n"); - fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode.\n"); + fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode, or a\n"); + fprintf(stderr, "pinned eBPF program.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where ACT_NAME refers to the section name containing the\n"); fprintf(stderr, "action (default \'%s\').\n", bpf_default_section(bpf_type)); @@ -54,114 +59,40 @@ static void explain(void) fprintf(stderr, "explicitly specifies an action index upon creation.\n"); } -static void usage(void) +static int bpf_parse_opt(struct action_util *a, int *ptr_argc, char ***ptr_argv, + int tca_id, struct nlmsghdr *n) { - explain(); - exit(-1); -} - -static int parse_bpf(struct action_util *a, int *argc_p, char ***argv_p, - int tca_id, struct nlmsghdr *n) -{ - char **argv = *argv_p, bpf_name[256]; + const char *bpf_obj = NULL, *bpf_uds_name = NULL; + struct tc_act_bpf parm; + bool seen_run = false; struct rtattr *tail; - struct tc_act_bpf parm = { 0 }; - struct sock_filter bpf_ops[BPF_MAXINSNS]; - bool ebpf_fill = false, bpf_fill = false; - bool ebpf = false, seen_run = false; - const char *bpf_uds_name = NULL; - const char *bpf_sec_name = NULL; - char *bpf_obj = NULL; - int argc = *argc_p, ret = 0; - __u16 bpf_len = 0; - __u32 bpf_fd = 0; + int argc, ret = 0; + char **argv; + + argv = *ptr_argv; + argc = *ptr_argc; if (matches(*argv, "bpf") != 0) return -1; NEXT_ARG(); + tail = NLMSG_TAIL(n); 
+ addattr_l(n, MAX_MSG, tca_id, NULL, 0); + while (argc > 0) { if (matches(*argv, "run") == 0) { - bool from_file, bpf_verbose; - int ret; - NEXT_ARG(); opt_bpf: - bpf_sec_name = bpf_default_section(bpf_type); - bpf_verbose = false; seen_run = true; - - if (strcmp(*argv, "bytecode-file") == 0 || - strcmp(*argv, "bcf") == 0) { - from_file = true; - } else if (strcmp(*argv, "bytecode") == 0 || - strcmp(*argv, "bc") == 0) { - from_file = false; - } else if (strcmp(*argv, "object-file") == 0 || - strcmp(*argv, "obj") == 0) { - ebpf = true; - } else { - fprintf(stderr, "unexpected \"%s\"\n", *argv); - explain(); + if (bpf_parse_common(&argc, &argv, nla_tbl, bpf_type, + &bpf_obj, &bpf_uds_name, n)) { + fprintf(stderr, "Failed to retrieve (e)BPF data!\n"); return -1; } - - NEXT_ARG(); - if (ebpf) { - bpf_uds_name = getenv(BPF_ENV_UDS); - bpf_obj = *argv; - - NEXT_ARG_FWD(); - - if (argc > 0 && - (strcmp(*argv, "section") == 0 || - strcmp(*argv, "sec") == 0)) { - NEXT_ARG(); - bpf_sec_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && !bpf_uds_name && - (strcmp(*argv, "export") == 0 || - strcmp(*argv, "exp") == 0)) { - NEXT_ARG(); - bpf_uds_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && - (strcmp(*argv, "verbose") == 0 || - strcmp(*argv, "verb") == 0)) { - bpf_verbose = true; - NEXT_ARG_FWD(); - } - - PREV_ARG(); - } - - ret = ebpf ? bpf_open_object(bpf_obj, bpf_type, bpf_sec_name, - bpf_verbose) : - bpf_parse_ops(argc, argv, bpf_ops, from_file); - if (ret < 0) { - fprintf(stderr, "%s\n", ebpf ? 
- "Could not load object" : - "Illegal \"bytecode\""); - return -1; - } - - if (ebpf) { - bpf_obj = basename(bpf_obj); - - snprintf(bpf_name, sizeof(bpf_name), "%s:[%s]", - bpf_obj, bpf_sec_name); - - bpf_fd = ret; - ebpf_fill = true; - } else { - bpf_len = ret; - bpf_fill = true; - } } else if (matches(*argv, "help") == 0) { - usage(); + explain(); + return -1; } else if (matches(*argv, "index") == 0) { break; } else { @@ -173,7 +104,9 @@ opt_bpf: NEXT_ARG_FWD(); } + memset(&parm, 0, sizeof(parm)); parm.action = TC_ACT_PIPE; + if (argc) { if (matches(*argv, "reclassify") == 0) { parm.action = TC_ACT_RECLASSIFY; @@ -207,32 +140,19 @@ opt_bpf: } } - tail = NLMSG_TAIL(n); - - addattr_l(n, MAX_MSG, tca_id, NULL, 0); addattr_l(n, MAX_MSG, TCA_ACT_BPF_PARMS, &parm, sizeof(parm)); - - if (ebpf_fill) { - addattr32(n, MAX_MSG, TCA_ACT_BPF_FD, bpf_fd); - addattrstrz(n, MAX_MSG, TCA_ACT_BPF_NAME, bpf_name); - } else if (bpf_fill) { - addattr16(n, MAX_MSG, TCA_ACT_BPF_OPS_LEN, bpf_len); - addattr_l(n, MAX_MSG, TCA_ACT_BPF_OPS, &bpf_ops, - bpf_len * sizeof(struct sock_filter)); - } - tail->rta_len = (char *)NLMSG_TAIL(n) - (char *)tail; - *argc_p = argc; - *argv_p = argv; - if (bpf_uds_name) ret = bpf_send_map_fds(bpf_uds_name, bpf_obj); + *ptr_argc = argc; + *ptr_argv = argv; + return ret; } -static int print_bpf(struct action_util *au, FILE *f, struct rtattr *arg) +static int bpf_print_opt(struct action_util *au, FILE *f, struct rtattr *arg) { struct rtattr *tb[TCA_ACT_BPF_MAX + 1]; struct tc_act_bpf *parm; @@ -249,7 +169,6 @@ static int print_bpf(struct action_util *au, FILE *f, struct rtattr *arg) } parm = RTA_DATA(tb[TCA_ACT_BPF_PARMS]); - fprintf(f, "bpf "); if (tb[TCA_ACT_BPF_NAME]) @@ -276,12 +195,11 @@ static int print_bpf(struct action_util *au, FILE *f, struct rtattr *arg) } fprintf(f, "\n "); - return 0; } struct action_util bpf_action_util = { - .id = "bpf", - .parse_aopt = parse_bpf, - .print_aopt = print_bpf, + .id = "bpf", + .parse_aopt = bpf_parse_opt, + 
.print_aopt = bpf_print_opt, }; diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index 276871a..beb74be 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -20,18 +20,25 @@ #include <errno.h> #include <fcntl.h> #include <stdarg.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/un.h> -#include <linux/filter.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> #ifdef HAVE_ELF #include <libelf.h> #include <gelf.h> #endif +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/un.h> +#include <sys/vfs.h> +#include <sys/mount.h> +#include <sys/syscall.h> +#include <sys/sendfile.h> +#include <sys/resource.h> + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/if_alg.h> + #include "utils.h" #include "bpf_elf.h" @@ -40,9 +47,51 @@ #include "tc_util.h" #include "tc_bpf.h" -int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, - char **bpf_string, bool *need_release, - const char separator) +#ifdef HAVE_ELF +static int bpf_obj_open(const char *path, enum bpf_prog_type type, + const char *sec, bool verbose); +#else +static int bpf_obj_open(const char *path, enum bpf_prog_type type, + const char *sec, bool verbose) +{ + fprintf(stderr, "No ELF library support compiled in.\n"); + errno = ENOSYS; + return -1; +} +#endif + +static inline __u64 bpf_ptr_to_u64(const void *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static int bpf(int cmd, union bpf_attr *attr, unsigned int size) +{ +#ifdef __NR_bpf + return syscall(__NR_bpf, cmd, attr, size); +#else + fprintf(stderr, "No bpf syscall, kernel headers too old?\n"); + errno = ENOSYS; + return -1; +#endif +} + +static int bpf_map_update(int fd, const void *key, const void *value, + uint64_t flags) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = bpf_ptr_to_u64(key), + .value = bpf_ptr_to_u64(value), + .flags = flags, + }; + + return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + +static int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, + char **bpf_string, bool 
*need_release, + const char separator) { char sp; @@ -90,8 +139,8 @@ int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, return 0; } -int bpf_parse_ops(int argc, char **argv, struct sock_filter *bpf_ops, - bool from_file) +static int bpf_ops_parse(int argc, char **argv, struct sock_filter *bpf_ops, + bool from_file) { char *bpf_string, *token, separator = ','; int ret = 0, i = 0; @@ -135,7 +184,6 @@ int bpf_parse_ops(int argc, char **argv, struct sock_filter *bpf_ops, goto out; } ret = bpf_len; - out: if (need_release) free(bpf_string); @@ -161,6 +209,246 @@ void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len) ops[i].jf, ops[i].k); } +static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map, + int length) +{ + char file[PATH_MAX], buff[4096]; + struct bpf_elf_map tmp, zero; + unsigned int val; + FILE *fp; + + snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); + + fp = fopen(file, "r"); + if (!fp) { + fprintf(stderr, "No procfs support?!\n"); + return -EIO; + } + + memset(&tmp, 0, sizeof(tmp)); + while (fgets(buff, sizeof(buff), fp)) { + if (sscanf(buff, "map_type:\t%u", &val) == 1) + tmp.type = val; + else if (sscanf(buff, "key_size:\t%u", &val) == 1) + tmp.size_key = val; + else if (sscanf(buff, "value_size:\t%u", &val) == 1) + tmp.size_value = val; + else if (sscanf(buff, "max_entries:\t%u", &val) == 1) + tmp.max_elem = val; + } + + fclose(fp); + + if (!memcmp(&tmp, map, length)) { + return 0; + } else { + memset(&zero, 0, sizeof(zero)); + /* If kernel doesn't have eBPF-related fdinfo, we cannot do much, + * so just accept it. We know we do have an eBPF fd and in this + * case, everything is 0. It is guaranteed that no such map exists + * since map type of 0 is unloadable BPF_MAP_TYPE_UNSPEC. 
+ */ + if (!memcmp(&tmp, &zero, length)) + return 0; + + fprintf(stderr, "Map specs from pinned file differ!\n"); + return -EINVAL; + } +} + +static int bpf_mnt_fs(const char *target) +{ + bool bind_done = false; + + while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) { + if (errno != EINVAL || bind_done) { + fprintf(stderr, "mount --make-private %s failed: %s\n", + target, strerror(errno)); + return -1; + } + + if (mount(target, target, "none", MS_BIND, NULL)) { + fprintf(stderr, "mount --bind %s %s failed: %s\n", + target, target, strerror(errno)); + return -1; + } + + bind_done = true; + } + + if (mount("bpf", target, "bpf", 0, NULL)) { + fprintf(stderr, "mount -t bpf bpf %s failed: %s\n", + target, strerror(errno)); + return -1; + } + + return 0; +} + +static int bpf_valid_mntpt(const char *mnt, unsigned long magic) +{ + struct statfs st_fs; + + if (statfs(mnt, &st_fs) < 0) + return -ENOENT; + if ((unsigned long)st_fs.f_type != magic) + return -ENOENT; + + return 0; +} + +static const char *bpf_find_mntpt(const char *fstype, unsigned long magic, + char *mnt, int len, + const char * const *known_mnts) +{ + const char * const *ptr; + char type[100]; + FILE *fp; + + if (known_mnts) { + ptr = known_mnts; + while (*ptr) { + if (bpf_valid_mntpt(*ptr, magic) == 0) { + strncpy(mnt, *ptr, len - 1); + mnt[len - 1] = 0; + return mnt; + } + ptr++; + } + } + + fp = fopen("/proc/mounts", "r"); + if (fp == NULL || len != PATH_MAX) + return NULL; + + while (fscanf(fp, "%*s %" textify(PATH_MAX) "s %99s %*s %*d %*d\n", + mnt, type) == 2) { + if (strcmp(type, fstype) == 0) + break; + } + + fclose(fp); + if (strcmp(type, fstype) != 0) + return NULL; + + return mnt; +} + +int bpf_trace_pipe(void) +{ + char tracefs_mnt[PATH_MAX] = TRACE_DIR_MNT; + static const char * const tracefs_known_mnts[] = { + TRACE_DIR_MNT, + "/sys/kernel/debug/tracing", + "/tracing", + "/trace", + 0, + }; + char tpipe[PATH_MAX]; + const char *mnt; + int fd; + + mnt = bpf_find_mntpt("tracefs", 
TRACEFS_MAGIC, tracefs_mnt, + sizeof(tracefs_mnt), tracefs_known_mnts); + if (!mnt) { + fprintf(stderr, "tracefs not mounted?\n"); + return -1; + } + + snprintf(tpipe, sizeof(tpipe), "%s/trace_pipe", mnt); + + fd = open(tpipe, O_RDONLY); + if (fd < 0) + return -1; + + fprintf(stderr, "Running! Hang up with ^C!\n\n"); + while (1) { + static char buff[4096]; + ssize_t ret; + + ret = read(fd, buff, sizeof(buff) - 1); + if (ret > 0) { + write(2, buff, ret); + fflush(stderr); + } + } + + return 0; +} + +static const char *bpf_get_tc_dir(void) +{ + static bool bpf_mnt_cached = false; + static char bpf_tc_dir[PATH_MAX]; + static const char *mnt; + static const char * const bpf_known_mnts[] = { + BPF_DIR_MNT, + 0, + }; + char bpf_mnt[PATH_MAX] = BPF_DIR_MNT; + char bpf_glo_dir[PATH_MAX]; + int ret; + + if (bpf_mnt_cached) + goto done; + + mnt = bpf_find_mntpt("bpf", BPF_FS_MAGIC, bpf_mnt, sizeof(bpf_mnt), + bpf_known_mnts); + if (!mnt) { + mnt = getenv(BPF_ENV_MNT); + if (!mnt) + mnt = BPF_DIR_MNT; + ret = bpf_mnt_fs(mnt); + if (ret) { + mnt = NULL; + goto out; + } + } + + snprintf(bpf_tc_dir, sizeof(bpf_tc_dir), "%s/%s", mnt, BPF_DIR_TC); + ret = mkdir(bpf_tc_dir, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", bpf_tc_dir, + strerror(errno)); + mnt = NULL; + goto out; + } + + snprintf(bpf_glo_dir, sizeof(bpf_glo_dir), "%s/%s", + bpf_tc_dir, BPF_DIR_GLOBALS); + ret = mkdir(bpf_glo_dir, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", bpf_glo_dir, + strerror(errno)); + mnt = NULL; + goto out; + } + + mnt = bpf_tc_dir; +out: + bpf_mnt_cached = true; +done: + return mnt; +} + +static int bpf_obj_get(const char *pathname) +{ + union bpf_attr attr; + char tmp[PATH_MAX]; + + if (strlen(pathname) > 2 && pathname[0] == 'm' && + pathname[1] == ':' && bpf_get_tc_dir()) { + snprintf(tmp, sizeof(tmp), "%s/%s", + bpf_get_tc_dir(), pathname + 2); + pathname = tmp; + } + + memset(&attr, 0, sizeof(attr)); + 
attr.pathname = bpf_ptr_to_u64(pathname); + + return bpf(BPF_OBJ_GET, &attr, sizeof(attr)); +} + const char *bpf_default_section(const enum bpf_prog_type type) { switch (type) { @@ -173,18 +461,262 @@ const char *bpf_default_section(const enum bpf_prog_type type) } } +enum bpf_mode { + CBPF_BYTECODE = 0, + CBPF_FILE, + EBPF_OBJECT, + EBPF_PINNED, + __BPF_MODE_MAX, +#define BPF_MODE_MAX __BPF_MODE_MAX +}; + +static int bpf_parse(int *ptr_argc, char ***ptr_argv, const bool *opt_tbl, + enum bpf_prog_type *type, enum bpf_mode *mode, + const char **ptr_object, const char **ptr_section, + const char **ptr_uds_name, struct sock_filter *opcodes) +{ + const char *file, *section, *uds_name; + bool verbose = false; + int ret, argc; + char **argv; + + argv = *ptr_argv; + argc = *ptr_argc; + + if (opt_tbl[CBPF_BYTECODE] && + (matches(*argv, "bytecode") == 0 || + strcmp(*argv, "bc") == 0)) { + *mode = CBPF_BYTECODE; + } else if (opt_tbl[CBPF_FILE] && + (matches(*argv, "bytecode-file") == 0 || + strcmp(*argv, "bcf") == 0)) { + *mode = CBPF_FILE; + } else if (opt_tbl[EBPF_OBJECT] && + (matches(*argv, "object-file") == 0 || + strcmp(*argv, "obj") == 0)) { + *mode = EBPF_OBJECT; + } else if (opt_tbl[EBPF_PINNED] && + (matches(*argv, "object-pinned") == 0 || + matches(*argv, "pinned") == 0 || + matches(*argv, "fd") == 0)) { + *mode = EBPF_PINNED; + } else { + fprintf(stderr, "What mode is \"%s\"?\n", *argv); + return -1; + } + + NEXT_ARG(); + file = section = uds_name = NULL; + if (*mode == EBPF_OBJECT || *mode == EBPF_PINNED) { + file = *argv; + NEXT_ARG_FWD(); + + if (*type == BPF_PROG_TYPE_UNSPEC) { + if (argc > 0 && matches(*argv, "type") == 0) { + NEXT_ARG(); + if (matches(*argv, "cls") == 0) { + *type = BPF_PROG_TYPE_SCHED_CLS; + } else if (matches(*argv, "act") == 0) { + *type = BPF_PROG_TYPE_SCHED_ACT; + } else { + fprintf(stderr, "What type is \"%s\"?\n", + *argv); + return -1; + } + NEXT_ARG_FWD(); + } else { + *type = BPF_PROG_TYPE_SCHED_CLS; + } + } + + section = 
bpf_default_section(*type); + if (argc > 0 && matches(*argv, "section") == 0) { + NEXT_ARG(); + section = *argv; + NEXT_ARG_FWD(); + } + + uds_name = getenv(BPF_ENV_UDS); + if (argc > 0 && !uds_name && + matches(*argv, "export") == 0) { + NEXT_ARG(); + uds_name = *argv; + NEXT_ARG_FWD(); + } + + if (argc > 0 && matches(*argv, "verbose") == 0) { + verbose = true; + NEXT_ARG_FWD(); + } + + PREV_ARG(); + } + + if (*mode == CBPF_BYTECODE || *mode == CBPF_FILE) + ret = bpf_ops_parse(argc, argv, opcodes, *mode == CBPF_FILE); + else if (*mode == EBPF_OBJECT) + ret = bpf_obj_open(file, *type, section, verbose); + else if (*mode == EBPF_PINNED) + ret = bpf_obj_get(file); + else + return -1; + + if (ptr_object) + *ptr_object = file; + if (ptr_section) + *ptr_section = section; + if (ptr_uds_name) + *ptr_uds_name = uds_name; + + *ptr_argc = argc; + *ptr_argv = argv; + + return ret; +} + +int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, + enum bpf_prog_type type, const char **ptr_object, + const char **ptr_uds_name, struct nlmsghdr *n) +{ + struct sock_filter opcodes[BPF_MAXINSNS]; + const bool opt_tbl[BPF_MODE_MAX] = { + [CBPF_BYTECODE] = true, + [CBPF_FILE] = true, + [EBPF_OBJECT] = true, + [EBPF_PINNED] = true, + }; + char annotation[256]; + const char *section; + enum bpf_mode mode; + int ret; + + ret = bpf_parse(ptr_argc, ptr_argv, opt_tbl, &type, &mode, + ptr_object, §ion, ptr_uds_name, opcodes); + if (ret < 0) + return ret; + + if (mode == CBPF_BYTECODE || mode == CBPF_FILE) { + addattr16(n, MAX_MSG, nla_tbl[BPF_NLA_OPS_LEN], ret); + addattr_l(n, MAX_MSG, nla_tbl[BPF_NLA_OPS], opcodes, + ret * sizeof(struct sock_filter)); + } + + if (mode == EBPF_OBJECT || mode == EBPF_PINNED) { + snprintf(annotation, sizeof(annotation), "%s:[%s]", + basename(*ptr_object), mode == EBPF_PINNED ? 
+ "*fsobj" : section); + + addattr32(n, MAX_MSG, nla_tbl[BPF_NLA_FD], ret); + addattrstrz(n, MAX_MSG, nla_tbl[BPF_NLA_NAME], annotation); + } + + return 0; +} + +int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv) +{ + enum bpf_prog_type type = BPF_PROG_TYPE_UNSPEC; + const bool opt_tbl[BPF_MODE_MAX] = { + [CBPF_BYTECODE] = false, + [CBPF_FILE] = false, + [EBPF_OBJECT] = true, + [EBPF_PINNED] = true, + }; + const struct bpf_elf_map test = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + }; + int ret, prog_fd, map_fd; + const char *section; + enum bpf_mode mode; + uint32_t map_key; + + prog_fd = bpf_parse(&argc, &argv, opt_tbl, &type, &mode, + NULL, §ion, NULL, NULL); + if (prog_fd < 0) + return prog_fd; + if (key) { + map_key = *key; + } else { + ret = sscanf(section, "%*i/%i", &map_key); + if (ret != 1) { + fprintf(stderr, "Couldn\'t infer map key from section " + "name! Please provide \'key\' argument!\n"); + ret = -EINVAL; + goto out_prog; + } + } + + map_fd = bpf_obj_get(map_path); + if (map_fd < 0) { + fprintf(stderr, "Couldn\'t retrieve pinned map \'%s\': %s\n", + map_path, strerror(errno)); + ret = map_fd; + goto out_prog; + } + + ret = bpf_map_selfcheck_pinned(map_fd, &test, + offsetof(struct bpf_elf_map, max_elem)); + if (ret < 0) { + fprintf(stderr, "Map \'%s\' self-check failed!\n", map_path); + goto out_map; + } + + ret = bpf_map_update(map_fd, &map_key, &prog_fd, BPF_ANY); + if (ret < 0) + fprintf(stderr, "Map update failed: %s\n", strerror(errno)); +out_map: + close(map_fd); +out_prog: + close(prog_fd); + return ret; +} + #ifdef HAVE_ELF +struct bpf_elf_prog { + enum bpf_prog_type type; + const struct bpf_insn *insns; + size_t size; + const char *license; +}; + +struct bpf_hash_entry { + unsigned int pinning; + const char *subpath; + struct bpf_hash_entry *next; +}; + +struct bpf_elf_ctx { + Elf *elf_fd; + GElf_Ehdr elf_hdr; + Elf_Data *sym_tab; + Elf_Data *str_tab; + int obj_fd; 
+ int map_fds[ELF_MAX_MAPS]; + struct bpf_elf_map maps[ELF_MAX_MAPS]; + int sym_num; + int map_num; + bool *sec_done; + int sec_maps; + char license[ELF_MAX_LICENSE_LEN]; + enum bpf_prog_type type; + bool verbose; + struct bpf_elf_st stat; + struct bpf_hash_entry *ht[256]; +}; + struct bpf_elf_sec_data { - GElf_Shdr sec_hdr; - char *sec_name; - Elf_Data *sec_data; + GElf_Shdr sec_hdr; + Elf_Data *sec_data; + const char *sec_name; }; struct bpf_map_data { - int *fds; - const char *obj; - struct bpf_elf_st *st; - struct bpf_elf_map *ent; + int *fds; + const char *obj; + struct bpf_elf_st *st; + struct bpf_elf_map *ent; }; /* If we provide a small buffer with log level enabled, the kernel @@ -193,15 +725,8 @@ struct bpf_map_data { * verifier we still want to hand something descriptive to the user. */ static char bpf_log_buf[65536]; -static bool bpf_verbose; -static struct bpf_elf_st bpf_st; - -static int map_fds[ELF_MAX_MAPS]; -static struct bpf_elf_map map_ent[ELF_MAX_MAPS]; - -static void bpf_dump_error(const char *format, ...) __check_format_string(1, 2); -static void bpf_dump_error(const char *format, ...) +static __check_format_string(1, 2) void bpf_dump_error(const char *format, ...) { va_list vl; @@ -215,187 +740,431 @@ static void bpf_dump_error(const char *format, ...) 
} } -static void bpf_save_finfo(int file_fd) +static int bpf_map_create(enum bpf_map_type type, unsigned int size_key, + unsigned int size_value, unsigned int max_elem) { - struct stat st; - int ret; + union bpf_attr attr = { + .map_type = type, + .key_size = size_key, + .value_size = size_value, + .max_entries = max_elem, + }; - memset(&bpf_st, 0, sizeof(bpf_st)); + return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); +} - ret = fstat(file_fd, &st); - if (ret < 0) { - fprintf(stderr, "Stat of elf file failed: %s\n", - strerror(errno)); - return; +static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, + size_t size, const char *license) +{ + union bpf_attr attr = { + .prog_type = type, + .insns = bpf_ptr_to_u64(insns), + .insn_cnt = size / sizeof(struct bpf_insn), + .license = bpf_ptr_to_u64(license), + .log_buf = bpf_ptr_to_u64(bpf_log_buf), + .log_size = sizeof(bpf_log_buf), + .log_level = 1, + }; + + if (getenv(BPF_ENV_NOLOG)) { + attr.log_buf = 0; + attr.log_size = 0; + attr.log_level = 0; } - bpf_st.st_dev = st.st_dev; - bpf_st.st_ino = st.st_ino; + return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } -static void bpf_clear_finfo(void) +static int bpf_obj_pin(int fd, const char *pathname) { - memset(&bpf_st, 0, sizeof(bpf_st)); + union bpf_attr attr = { + .pathname = bpf_ptr_to_u64(pathname), + .bpf_fd = fd, + }; + + return bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); } -static bool bpf_may_skip_map_creation(int file_fd) +static int bpf_obj_hash(const char *object, uint8_t *out, size_t len) { - struct stat st; - int ret; + struct sockaddr_alg alg = { + .salg_family = AF_ALG, + .salg_type = "hash", + .salg_name = "sha1", + }; + int ret, cfd, ofd, ffd; + struct stat stbuff; + ssize_t size; + + if (!object || len != 20) + return -EINVAL; - ret = fstat(file_fd, &st); + cfd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (cfd < 0) { + fprintf(stderr, "Cannot get AF_ALG socket: %s\n", + strerror(errno)); + return cfd; + } + + ret = bind(cfd, (struct sockaddr 
*)&alg, sizeof(alg)); if (ret < 0) { - fprintf(stderr, "Stat of elf file failed: %s\n", + fprintf(stderr, "Error binding socket: %s\n", strerror(errno)); + goto out_cfd; + } + + ofd = accept(cfd, NULL, 0); + if (ofd < 0) { + fprintf(stderr, "Error accepting socket: %s\n", strerror(errno)); - return false; + ret = ofd; + goto out_cfd; + } + + ffd = open(object, O_RDONLY); + if (ffd < 0) { + fprintf(stderr, "Error opening object %s: %s\n", + object, strerror(errno)); + ret = ffd; + goto out_ofd; + } + + ret = fstat(ffd, &stbuff); + if (ret < 0) { + fprintf(stderr, "Error doing fstat: %s\n", + strerror(errno)); + goto out_ffd; } - return (bpf_st.st_dev == st.st_dev) && - (bpf_st.st_ino == st.st_ino); + size = sendfile(ofd, ffd, NULL, stbuff.st_size); + if (size != stbuff.st_size) { + fprintf(stderr, "Error from sendfile (%zd vs %zu bytes): %s\n", + size, stbuff.st_size, strerror(errno)); + ret = -1; + goto out_ffd; + } + + size = read(ofd, out, len); + if (size != len) { + fprintf(stderr, "Error from read (%zd vs %zu bytes): %s\n", + size, len, strerror(errno)); + ret = -1; + } else { + ret = 0; + } +out_ffd: + close(ffd); +out_ofd: + close(ofd); +out_cfd: + close(cfd); + return ret; } -static int bpf_create_map(enum bpf_map_type type, unsigned int size_key, - unsigned int size_value, unsigned int max_elem) +static const char *bpf_get_obj_uid(const char *pathname) { - union bpf_attr attr = { - .map_type = type, - .key_size = size_key, - .value_size = size_value, - .max_entries = max_elem, - }; + static bool bpf_uid_cached = false; + static char bpf_uid[64]; + uint8_t tmp[20]; + int ret; - return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); + if (bpf_uid_cached) + goto done; + + ret = bpf_obj_hash(pathname, tmp, sizeof(tmp)); + if (ret) { + fprintf(stderr, "Object hashing failed!\n"); + return NULL; + } + + hexstring_n2a(tmp, sizeof(tmp), bpf_uid, sizeof(bpf_uid)); + bpf_uid_cached = true; +done: + return bpf_uid; } -static int bpf_update_map(int fd, const void *key, 
const void *value, - uint64_t flags) +static int bpf_init_env(const char *pathname) { - union bpf_attr attr = { - .map_fd = fd, - .key = bpf_ptr_to_u64(key), - .value = bpf_ptr_to_u64(value), - .flags = flags, + struct rlimit limit = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, }; - return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); + /* Don't bother in case we fail! */ + setrlimit(RLIMIT_MEMLOCK, &limit); + + if (!bpf_get_tc_dir()) { + fprintf(stderr, "Continuing without mounted eBPF fs. " + "Too old kernel?\n"); + return 0; + } + + if (!bpf_get_obj_uid(pathname)) + return -1; + + return 0; } -static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, - unsigned int len, const char *license) +static const char *bpf_custom_pinning(const struct bpf_elf_ctx *ctx, + uint32_t pinning) { - union bpf_attr attr = { - .prog_type = type, - .insns = bpf_ptr_to_u64(insns), - .insn_cnt = len / sizeof(struct bpf_insn), - .license = bpf_ptr_to_u64(license), - .log_buf = bpf_ptr_to_u64(bpf_log_buf), - .log_size = sizeof(bpf_log_buf), - .log_level = 1, - }; + struct bpf_hash_entry *entry; - return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + entry = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)]; + while (entry && entry->pinning != pinning) + entry = entry->next; + + return entry ? entry->subpath : NULL; } -static int bpf_prog_attach(enum bpf_prog_type type, const char *sec, - const struct bpf_insn *insns, unsigned int size, - const char *license) +static bool bpf_no_pinning(const struct bpf_elf_ctx *ctx, + uint32_t pinning) { - int prog_fd = bpf_prog_load(type, insns, size, license); + switch (pinning) { + case PIN_OBJECT_NS: + case PIN_GLOBAL_NS: + return false; + case PIN_NONE: + return true; + default: + return !bpf_custom_pinning(ctx, pinning); + } +} - if (prog_fd < 0 || bpf_verbose) { - bpf_dump_error("%s (section \'%s\'): %s\n", prog_fd < 0 ? 
- "BPF program rejected" : - "BPF program verification", - sec, strerror(errno)); +static void bpf_make_pathname(char *pathname, size_t len, const char *name, + const struct bpf_elf_ctx *ctx, uint32_t pinning) +{ + switch (pinning) { + case PIN_OBJECT_NS: + snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(), + bpf_get_obj_uid(NULL), name); + break; + case PIN_GLOBAL_NS: + snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(), + BPF_DIR_GLOBALS, name); + break; + default: + snprintf(pathname, len, "%s/../%s/%s", bpf_get_tc_dir(), + bpf_custom_pinning(ctx, pinning), name); + break; } +} + +static int bpf_probe_pinned(const char *name, const struct bpf_elf_ctx *ctx, + uint32_t pinning) +{ + char pathname[PATH_MAX]; - return prog_fd; + if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir()) + return 0; + + bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning); + return bpf_obj_get(pathname); } -static int bpf_map_attach(enum bpf_map_type type, unsigned int size_key, - unsigned int size_value, unsigned int max_elem) +static int bpf_make_obj_path(void) { - int map_fd = bpf_create_map(type, size_key, size_value, max_elem); + char tmp[PATH_MAX]; + int ret; - if (map_fd < 0) - bpf_dump_error("BPF map rejected: %s\n", strerror(errno)); + snprintf(tmp, sizeof(tmp), "%s/%s", bpf_get_tc_dir(), + bpf_get_obj_uid(NULL)); - return map_fd; + ret = mkdir(tmp, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", tmp, strerror(errno)); + return ret; + } + + return 0; } -static void bpf_maps_init(void) +static int bpf_make_custom_path(const char *todo) { - int i; + char tmp[PATH_MAX], rem[PATH_MAX], *sub; + int ret; + + snprintf(tmp, sizeof(tmp), "%s/../", bpf_get_tc_dir()); + snprintf(rem, sizeof(rem), "%s/", todo); + sub = strtok(rem, "/"); - memset(map_ent, 0, sizeof(map_ent)); - for (i = 0; i < ARRAY_SIZE(map_fds); i++) - map_fds[i] = -1; + while (sub) { + if (strlen(tmp) + strlen(sub) + 2 > PATH_MAX) + return -EINVAL; + + strcat(tmp, 
sub); + strcat(tmp, "/"); + + ret = mkdir(tmp, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", tmp, + strerror(errno)); + return ret; + } + + sub = strtok(NULL, "/"); + } + + return 0; } -static int bpf_maps_count(void) +static int bpf_place_pinned(int fd, const char *name, + const struct bpf_elf_ctx *ctx, uint32_t pinning) { - int i, count = 0; + char pathname[PATH_MAX]; + const char *tmp; + int ret = 0; - for (i = 0; i < ARRAY_SIZE(map_fds); i++) { - if (map_fds[i] < 0) - break; - count++; + if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir()) + return 0; + + if (pinning == PIN_OBJECT_NS) + ret = bpf_make_obj_path(); + else if ((tmp = bpf_custom_pinning(ctx, pinning))) + ret = bpf_make_custom_path(tmp); + if (ret < 0) + return ret; + + bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning); + return bpf_obj_pin(fd, pathname); +} + +static int bpf_prog_attach(const char *section, + const struct bpf_elf_prog *prog, bool verbose) +{ + int fd; + + /* We can add pinning here later as well, same as bpf_map_attach(). */ + errno = 0; + fd = bpf_prog_load(prog->type, prog->insns, prog->size, + prog->license); + if (fd < 0 || verbose) { + bpf_dump_error("Prog section \'%s\' (type:%u insns:%zu " + "license:\'%s\') %s%s (%d)!\n\n", + section, prog->type, + prog->size / sizeof(struct bpf_insn), + prog->license, fd < 0 ? "rejected :" : + "loaded", fd < 0 ? strerror(errno) : "", + fd < 0 ? 
errno : fd); } - return count; + return fd; } -static void bpf_maps_destroy(void) +static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, + const struct bpf_elf_ctx *ctx, bool verbose) { + int fd, ret; + + fd = bpf_probe_pinned(name, ctx, map->pinning); + if (fd > 0) { + ret = bpf_map_selfcheck_pinned(fd, map, + offsetof(struct bpf_elf_map, + id)); + if (ret < 0) { + close(fd); + fprintf(stderr, "Map \'%s\' self-check failed!\n", + name); + return ret; + } + if (verbose) + fprintf(stderr, "Map \'%s\' loaded as pinned!\n", + name); + return fd; + } + + errno = 0; + fd = bpf_map_create(map->type, map->size_key, map->size_value, + map->max_elem); + if (fd < 0 || verbose) { + bpf_dump_error("Map \'%s\' (type:%u id:%u pinning:%u " + "ksize:%u vsize:%u max-elems:%u) %s%s (%d)!\n", + name, map->type, map->id, map->pinning, + map->size_key, map->size_value, map->max_elem, + fd < 0 ? "rejected: " : "loaded", fd < 0 ? + strerror(errno) : "", fd < 0 ? errno : fd); + if (fd < 0) + return fd; + } + + ret = bpf_place_pinned(fd, name, ctx, map->pinning); + if (ret < 0 && errno != EEXIST) { + fprintf(stderr, "Could not pin %s map: %s\n", name, + strerror(errno)); + close(fd); + return ret; + } + + return fd; +} + +#define __ELF_ST_BIND(x) ((x) >> 4) +#define __ELF_ST_TYPE(x) (((unsigned int) x) & 0xf) + +static const char *bpf_str_tab_name(const struct bpf_elf_ctx *ctx, + const GElf_Sym *sym) +{ + return ctx->str_tab->d_buf + sym->st_name; +} + +static const char *bpf_map_fetch_name(struct bpf_elf_ctx *ctx, int which) +{ + GElf_Sym sym; int i; - memset(map_ent, 0, sizeof(map_ent)); - for (i = 0; i < ARRAY_SIZE(map_fds); i++) { - if (map_fds[i] >= 0) - close(map_fds[i]); + for (i = 0; i < ctx->sym_num; i++) { + if (gelf_getsym(ctx->sym_tab, i, &sym) != &sym) + continue; + + if (__ELF_ST_BIND(sym.st_info) != STB_GLOBAL || + __ELF_ST_TYPE(sym.st_info) != STT_NOTYPE || + sym.st_shndx != ctx->sec_maps || + sym.st_value / sizeof(struct bpf_elf_map) != which) + 
continue; + + return bpf_str_tab_name(ctx, &sym); } + + return NULL; } -static int bpf_maps_attach(struct bpf_elf_map *maps, unsigned int num_maps) +static int bpf_maps_attach_all(struct bpf_elf_ctx *ctx) { - int i, ret; + const char *map_name; + int i, fd; - for (i = 0; (i < num_maps) && (num_maps <= ARRAY_SIZE(map_fds)); i++) { - struct bpf_elf_map *map = &maps[i]; + for (i = 0; i < ctx->map_num; i++) { + map_name = bpf_map_fetch_name(ctx, i); + if (!map_name) + return -EIO; - ret = bpf_map_attach(map->type, map->size_key, - map->size_value, map->max_elem); - if (ret < 0) - goto err_unwind; + fd = bpf_map_attach(map_name, &ctx->maps[i], ctx, + ctx->verbose); + if (fd < 0) + return fd; - map_fds[i] = ret; + ctx->map_fds[i] = fd; } return 0; - -err_unwind: - bpf_maps_destroy(); - return ret; } -static int bpf_fill_section_data(Elf *elf_fd, GElf_Ehdr *elf_hdr, int sec_index, - struct bpf_elf_sec_data *sec_data) +static int bpf_fill_section_data(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) { + Elf_Data *sec_edata; GElf_Shdr sec_hdr; Elf_Scn *sec_fd; - Elf_Data *sec_edata; char *sec_name; - memset(sec_data, 0, sizeof(*sec_data)); + memset(data, 0, sizeof(*data)); - sec_fd = elf_getscn(elf_fd, sec_index); + sec_fd = elf_getscn(ctx->elf_fd, section); if (!sec_fd) return -EINVAL; - if (gelf_getshdr(sec_fd, &sec_hdr) != &sec_hdr) return -EIO; - sec_name = elf_strptr(elf_fd, elf_hdr->e_shstrndx, + sec_name = elf_strptr(ctx->elf_fd, ctx->elf_hdr.e_shstrndx, sec_hdr.sh_name); if (!sec_name || !sec_hdr.sh_size) return -ENOENT; @@ -404,16 +1173,131 @@ static int bpf_fill_section_data(Elf *elf_fd, GElf_Ehdr *elf_hdr, int sec_index, if (!sec_edata || elf_getdata(sec_fd, sec_edata)) return -EIO; - memcpy(&sec_data->sec_hdr, &sec_hdr, sizeof(sec_hdr)); - sec_data->sec_name = sec_name; - sec_data->sec_data = sec_edata; + memcpy(&data->sec_hdr, &sec_hdr, sizeof(sec_hdr)); + + data->sec_name = sec_name; + data->sec_data = sec_edata; + return 0; +} + +static 
int bpf_fetch_maps(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + if (data->sec_data->d_size % sizeof(struct bpf_elf_map) != 0) + return -EINVAL; + + ctx->map_num = data->sec_data->d_size / sizeof(struct bpf_elf_map); + ctx->sec_maps = section; + ctx->sec_done[section] = true; + + if (ctx->map_num > ARRAY_SIZE(ctx->map_fds)) { + fprintf(stderr, "Too many BPF maps in ELF section!\n"); + return -ENOMEM; + } + + memcpy(ctx->maps, data->sec_data->d_buf, data->sec_data->d_size); + return 0; +} + +static int bpf_fetch_license(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + if (data->sec_data->d_size > sizeof(ctx->license)) + return -ENOMEM; + + memcpy(ctx->license, data->sec_data->d_buf, data->sec_data->d_size); + ctx->sec_done[section] = true; + return 0; +} +static int bpf_fetch_symtab(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + ctx->sym_tab = data->sec_data; + ctx->sym_num = data->sec_hdr.sh_size / data->sec_hdr.sh_entsize; + ctx->sec_done[section] = true; return 0; } -static int bpf_apply_relo_data(struct bpf_elf_sec_data *data_relo, - struct bpf_elf_sec_data *data_insn, - Elf_Data *sym_tab) +static int bpf_fetch_strtab(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + ctx->str_tab = data->sec_data; + ctx->sec_done[section] = true; + return 0; +} + +static int bpf_fetch_ancillary(struct bpf_elf_ctx *ctx) +{ + struct bpf_elf_sec_data data; + int i, ret = -1; + + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + ret = bpf_fill_section_data(ctx, i, &data); + if (ret < 0) + continue; + + if (!strcmp(data.sec_name, ELF_SECTION_MAPS)) + ret = bpf_fetch_maps(ctx, i, &data); + else if (!strcmp(data.sec_name, ELF_SECTION_LICENSE)) + ret = bpf_fetch_license(ctx, i, &data); + else if (data.sec_hdr.sh_type == SHT_SYMTAB) + ret = bpf_fetch_symtab(ctx, i, &data); + else if (data.sec_hdr.sh_type == SHT_STRTAB && + i != ctx->elf_hdr.e_shstrndx) + ret = 
bpf_fetch_strtab(ctx, i, &data); + if (ret < 0) { + fprintf(stderr, "Error parsing section %d! Perhaps" + "check with readelf -a?\n", i); + break; + } + } + + if (ctx->sym_tab && ctx->str_tab && ctx->sec_maps) { + ret = bpf_maps_attach_all(ctx); + if (ret < 0) { + fprintf(stderr, "Error loading maps into kernel!\n"); + return ret; + } + } + + return ret; +} + +static int bpf_fetch_prog(struct bpf_elf_ctx *ctx, const char *section) +{ + struct bpf_elf_sec_data data; + struct bpf_elf_prog prog; + int ret, i, fd = -1; + + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + if (ctx->sec_done[i]) + continue; + + ret = bpf_fill_section_data(ctx, i, &data); + if (ret < 0 || strcmp(data.sec_name, section)) + continue; + + memset(&prog, 0, sizeof(prog)); + prog.type = ctx->type; + prog.insns = data.sec_data->d_buf; + prog.size = data.sec_data->d_size; + prog.license = ctx->license; + + fd = bpf_prog_attach(section, &prog, ctx->verbose); + if (fd < 0) + continue; + + ctx->sec_done[i] = true; + break; + } + + return fd; +} + +static int bpf_apply_relo_data(struct bpf_elf_ctx *ctx, + struct bpf_elf_sec_data *data_relo, + struct bpf_elf_sec_data *data_insn) { Elf_Data *idata = data_insn->sec_data; GElf_Shdr *rhdr = &data_relo->sec_hdr; @@ -422,7 +1306,7 @@ static int bpf_apply_relo_data(struct bpf_elf_sec_data *data_relo, unsigned int num_insns = idata->d_size / sizeof(*insns); for (relo_ent = 0; relo_ent < relo_num; relo_ent++) { - unsigned int ioff, fnum; + unsigned int ioff, rmap; GElf_Rel relo; GElf_Sym sym; @@ -430,291 +1314,367 @@ static int bpf_apply_relo_data(struct bpf_elf_sec_data *data_relo, return -EIO; ioff = relo.r_offset / sizeof(struct bpf_insn); - if (ioff >= num_insns) - return -EINVAL; - if (insns[ioff].code != (BPF_LD | BPF_IMM | BPF_DW)) + if (ioff >= num_insns || + insns[ioff].code != (BPF_LD | BPF_IMM | BPF_DW)) return -EINVAL; - if (gelf_getsym(sym_tab, GELF_R_SYM(relo.r_info), &sym) != &sym) + if (gelf_getsym(ctx->sym_tab, GELF_R_SYM(relo.r_info), &sym) != 
&sym) return -EIO; - fnum = sym.st_value / sizeof(struct bpf_elf_map); - if (fnum >= ARRAY_SIZE(map_fds)) + rmap = sym.st_value / sizeof(struct bpf_elf_map); + if (rmap >= ARRAY_SIZE(ctx->map_fds)) return -EINVAL; - if (map_fds[fnum] < 0) + if (!ctx->map_fds[rmap]) return -EINVAL; + if (ctx->verbose) + fprintf(stderr, "Map \'%s\' (%d) injected into prog " + "section \'%s\' at offset %u!\n", + bpf_str_tab_name(ctx, &sym), ctx->map_fds[rmap], + data_insn->sec_name, ioff); + insns[ioff].src_reg = BPF_PSEUDO_MAP_FD; - insns[ioff].imm = map_fds[fnum]; + insns[ioff].imm = ctx->map_fds[rmap]; } return 0; } -static int bpf_fetch_ancillary(int file_fd, Elf *elf_fd, GElf_Ehdr *elf_hdr, - bool *sec_done, char *license, unsigned int lic_len, - Elf_Data **sym_tab) +static int bpf_fetch_prog_relo(struct bpf_elf_ctx *ctx, const char *section) { - int sec_index, ret = -1; + struct bpf_elf_sec_data data_relo, data_insn; + struct bpf_elf_prog prog; + int ret, idx, i, fd = -1; - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_anc; + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + ret = bpf_fill_section_data(ctx, i, &data_relo); + if (ret < 0 || data_relo.sec_hdr.sh_type != SHT_REL) + continue; - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_anc); + idx = data_relo.sec_hdr.sh_info; + ret = bpf_fill_section_data(ctx, idx, &data_insn); + if (ret < 0 || strcmp(data_insn.sec_name, section)) + continue; + + ret = bpf_apply_relo_data(ctx, &data_relo, &data_insn); if (ret < 0) continue; - /* Extract and load eBPF map fds. 
*/ - if (!strcmp(data_anc.sec_name, ELF_SECTION_MAPS) && - !bpf_may_skip_map_creation(file_fd)) { - struct bpf_elf_map *maps; - unsigned int maps_num; + memset(&prog, 0, sizeof(prog)); + prog.type = ctx->type; + prog.insns = data_insn.sec_data->d_buf; + prog.size = data_insn.sec_data->d_size; + prog.license = ctx->license; - if (data_anc.sec_data->d_size % sizeof(*maps) != 0) - return -EINVAL; + fd = bpf_prog_attach(section, &prog, ctx->verbose); + if (fd < 0) + continue; - maps = data_anc.sec_data->d_buf; - maps_num = data_anc.sec_data->d_size / sizeof(*maps); - memcpy(map_ent, maps, data_anc.sec_data->d_size); + ctx->sec_done[i] = true; + ctx->sec_done[idx] = true; + break; + } - ret = bpf_maps_attach(maps, maps_num); - if (ret < 0) - return ret; + return fd; +} - sec_done[sec_index] = true; - } - /* Extract eBPF license. */ - else if (!strcmp(data_anc.sec_name, ELF_SECTION_LICENSE)) { - if (data_anc.sec_data->d_size > lic_len) - return -ENOMEM; - - sec_done[sec_index] = true; - memcpy(license, data_anc.sec_data->d_buf, - data_anc.sec_data->d_size); - } - /* Extract symbol table for relocations (map fd fixups). 
*/ - else if (data_anc.sec_hdr.sh_type == SHT_SYMTAB) { - sec_done[sec_index] = true; - *sym_tab = data_anc.sec_data; - } - } +static int bpf_fetch_prog_sec(struct bpf_elf_ctx *ctx, const char *section) +{ + int ret = -1; + + if (ctx->sym_tab) + ret = bpf_fetch_prog_relo(ctx, section); + if (ret < 0) + ret = bpf_fetch_prog(ctx, section); return ret; } -static int bpf_fetch_prog_relo(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *sec, - const char *license, Elf_Data *sym_tab) +static int bpf_find_map_by_id(struct bpf_elf_ctx *ctx, uint32_t id) { - int sec_index, prog_fd = -1; + int i; - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_relo, data_insn; - int ins_index, ret; + for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) + if (ctx->map_fds[i] && ctx->maps[i].id == id && + ctx->maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) + return i; + return -1; +} - /* Attach eBPF programs with relocation data (maps). */ - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_relo); - if (ret < 0 || data_relo.sec_hdr.sh_type != SHT_REL) - continue; +static int bpf_fill_prog_arrays(struct bpf_elf_ctx *ctx) +{ + struct bpf_elf_sec_data data; + uint32_t map_id, key_id; + int fd, i, ret, idx; - ins_index = data_relo.sec_hdr.sh_info; + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + if (ctx->sec_done[i]) + continue; - ret = bpf_fill_section_data(elf_fd, elf_hdr, ins_index, - &data_insn); + ret = bpf_fill_section_data(ctx, i, &data); if (ret < 0) continue; - if (strcmp(data_insn.sec_name, sec)) - continue; - ret = bpf_apply_relo_data(&data_relo, &data_insn, sym_tab); - if (ret < 0) + ret = sscanf(data.sec_name, "%i/%i", &map_id, &key_id); + if (ret != 2) continue; - prog_fd = bpf_prog_attach(type, sec, data_insn.sec_data->d_buf, - data_insn.sec_data->d_size, license); - if (prog_fd < 0) + idx = bpf_find_map_by_id(ctx, map_id); + if (idx < 0) continue; - sec_done[sec_index] = true; - 
sec_done[ins_index] = true; - break; + fd = bpf_fetch_prog_sec(ctx, data.sec_name); + if (fd < 0) + return -EIO; + + ret = bpf_map_update(ctx->map_fds[idx], &key_id, + &fd, BPF_ANY); + if (ret < 0) + return -ENOENT; + + ctx->sec_done[i] = true; } - return prog_fd; + return 0; } -static int bpf_fetch_prog(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *sec, - const char *license) +static void bpf_save_finfo(struct bpf_elf_ctx *ctx) { - int sec_index, prog_fd = -1; + struct stat st; + int ret; - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_insn; - int ret; + memset(&ctx->stat, 0, sizeof(ctx->stat)); - /* Attach eBPF programs without relocation data. */ - if (sec_done[sec_index]) - continue; + ret = fstat(ctx->obj_fd, &st); + if (ret < 0) { + fprintf(stderr, "Stat of elf file failed: %s\n", + strerror(errno)); + return; + } - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_insn); - if (ret < 0) - continue; - if (strcmp(data_insn.sec_name, sec)) - continue; + ctx->stat.st_dev = st.st_dev; + ctx->stat.st_ino = st.st_ino; +} - prog_fd = bpf_prog_attach(type, sec, data_insn.sec_data->d_buf, - data_insn.sec_data->d_size, license); - if (prog_fd < 0) +static int bpf_read_pin_mapping(FILE *fp, uint32_t *id, char *path) +{ + char buff[PATH_MAX]; + + while (fgets(buff, sizeof(buff), fp)) { + char *ptr = buff; + + while (*ptr == ' ' || *ptr == '\t') + ptr++; + + if (*ptr == '#' || *ptr == '\n' || *ptr == 0) continue; - sec_done[sec_index] = true; - break; + if (sscanf(ptr, "%i %s\n", id, path) != 2 && + sscanf(ptr, "%i %s #", id, path) != 2) { + strcpy(path, ptr); + return -1; + } + + return 1; } - return prog_fd; + return 0; } -static int bpf_fetch_prog_sec(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *sec, - const char *license, Elf_Data *sym_tab) +static bool bpf_pinning_reserved(uint32_t pinning) { - int ret = -1; - - if 
(sym_tab) - ret = bpf_fetch_prog_relo(elf_fd, elf_hdr, sec_done, type, - sec, license, sym_tab); - if (ret < 0) - ret = bpf_fetch_prog(elf_fd, elf_hdr, sec_done, type, sec, - license); - return ret; + switch (pinning) { + case PIN_NONE: + case PIN_OBJECT_NS: + case PIN_GLOBAL_NS: + return true; + default: + return false; + } } -static int bpf_fill_prog_arrays(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *license, - Elf_Data *sym_tab) +static void bpf_hash_init(struct bpf_elf_ctx *ctx, const char *db_file) { - int sec_index; + struct bpf_hash_entry *entry; + char subpath[PATH_MAX]; + uint32_t pinning; + FILE *fp; + int ret; - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_insn; - int ret, map_id, key_id, prog_fd; + fp = fopen(db_file, "r"); + if (!fp) + return; - if (sec_done[sec_index]) + memset(subpath, 0, sizeof(subpath)); + while ((ret = bpf_read_pin_mapping(fp, &pinning, subpath))) { + if (ret == -1) { + fprintf(stderr, "Database %s is corrupted at: %s\n", + db_file, subpath); + fclose(fp); + return; + } + + if (bpf_pinning_reserved(pinning)) { + fprintf(stderr, "Database %s, id %u is reserved - " + "ignoring!\n", db_file, pinning); continue; + } - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_insn); - if (ret < 0) + entry = malloc(sizeof(*entry)); + if (!entry) { + fprintf(stderr, "No memory left for db entry!\n"); continue; + } - ret = sscanf(data_insn.sec_name, "%i/%i", &map_id, &key_id); - if (ret != 2) + entry->pinning = pinning; + entry->subpath = strdup(subpath); + if (!entry->subpath) { + fprintf(stderr, "No memory left for db entry!\n"); + free(entry); continue; + } - if (map_id >= ARRAY_SIZE(map_fds) || map_fds[map_id] < 0) - return -ENOENT; - if (map_ent[map_id].type != BPF_MAP_TYPE_PROG_ARRAY || - map_ent[map_id].max_elem <= key_id) - return -EINVAL; + entry->next = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)]; + ctx->ht[pinning & 
(ARRAY_SIZE(ctx->ht) - 1)] = entry; + } - prog_fd = bpf_fetch_prog_sec(elf_fd, elf_hdr, sec_done, - type, data_insn.sec_name, - license, sym_tab); - if (prog_fd < 0) - return -EIO; + fclose(fp); +} - ret = bpf_update_map(map_fds[map_id], &key_id, &prog_fd, - BPF_ANY); - if (ret < 0) - return -ENOENT; +static void bpf_hash_destroy(struct bpf_elf_ctx *ctx) +{ + struct bpf_hash_entry *entry; + int i; - sec_done[sec_index] = true; + for (i = 0; i < ARRAY_SIZE(ctx->ht); i++) { + while ((entry = ctx->ht[i]) != NULL) { + ctx->ht[i] = entry->next; + free((char *)entry->subpath); + free(entry); + } } - - return 0; } -int bpf_open_object(const char *path, enum bpf_prog_type type, - const char *sec, bool verbose) +static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname, + enum bpf_prog_type type, bool verbose) { - char license[ELF_MAX_LICENSE_LEN]; - int file_fd, prog_fd = -1, ret; - Elf_Data *sym_tab = NULL; - GElf_Ehdr elf_hdr; - bool *sec_done; - Elf *elf_fd; + int ret = -EINVAL; - if (elf_version(EV_CURRENT) == EV_NONE) - return -EINVAL; + if (elf_version(EV_CURRENT) == EV_NONE || + bpf_init_env(pathname)) + return ret; + + memset(ctx, 0, sizeof(*ctx)); + ctx->verbose = verbose; + ctx->type = type; - file_fd = open(path, O_RDONLY, 0); - if (file_fd < 0) - return -errno; + ctx->obj_fd = open(pathname, O_RDONLY); + if (ctx->obj_fd < 0) + return ctx->obj_fd; - elf_fd = elf_begin(file_fd, ELF_C_READ, NULL); - if (!elf_fd) { + ctx->elf_fd = elf_begin(ctx->obj_fd, ELF_C_READ, NULL); + if (!ctx->elf_fd) { ret = -EINVAL; - goto out; + goto out_fd; } - if (gelf_getehdr(elf_fd, &elf_hdr) != &elf_hdr) { + if (gelf_getehdr(ctx->elf_fd, &ctx->elf_hdr) != + &ctx->elf_hdr) { ret = -EIO; goto out_elf; } - sec_done = calloc(elf_hdr.e_shnum, sizeof(*sec_done)); - if (!sec_done) { + ctx->sec_done = calloc(ctx->elf_hdr.e_shnum, + sizeof(*(ctx->sec_done))); + if (!ctx->sec_done) { ret = -ENOMEM; goto out_elf; } - memset(license, 0, sizeof(license)); - bpf_verbose = verbose; 
+ bpf_save_finfo(ctx); + bpf_hash_init(ctx, CONFDIR "/bpf_pinning"); - if (!bpf_may_skip_map_creation(file_fd)) - bpf_maps_init(); + return 0; +out_elf: + elf_end(ctx->elf_fd); +out_fd: + close(ctx->obj_fd); + return ret; +} - ret = bpf_fetch_ancillary(file_fd, elf_fd, &elf_hdr, sec_done, - license, sizeof(license), &sym_tab); - if (ret < 0) - goto out_maps; +static int bpf_maps_count(struct bpf_elf_ctx *ctx) +{ + int i, count = 0; - prog_fd = bpf_fetch_prog_sec(elf_fd, &elf_hdr, sec_done, type, - sec, license, sym_tab); - if (prog_fd < 0) - goto out_maps; + for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) { + if (!ctx->map_fds[i]) + break; + count++; + } - if (!bpf_may_skip_map_creation(file_fd)) { - ret = bpf_fill_prog_arrays(elf_fd, &elf_hdr, sec_done, - type, license, sym_tab); - if (ret < 0) - goto out_prog; + return count; +} + +static void bpf_maps_teardown(struct bpf_elf_ctx *ctx) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) { + if (ctx->map_fds[i]) + close(ctx->map_fds[i]); } +} - bpf_save_finfo(file_fd); +static void bpf_elf_ctx_destroy(struct bpf_elf_ctx *ctx, bool failure) +{ + if (failure) + bpf_maps_teardown(ctx); - free(sec_done); + bpf_hash_destroy(ctx); + free(ctx->sec_done); + elf_end(ctx->elf_fd); + close(ctx->obj_fd); +} - elf_end(elf_fd); - close(file_fd); +static struct bpf_elf_ctx __ctx; - return prog_fd; +static int bpf_obj_open(const char *pathname, enum bpf_prog_type type, + const char *section, bool verbose) +{ + struct bpf_elf_ctx *ctx = &__ctx; + int fd = 0, ret; -out_prog: - close(prog_fd); -out_maps: - bpf_maps_destroy(); - free(sec_done); -out_elf: - elf_end(elf_fd); + ret = bpf_elf_ctx_init(ctx, pathname, type, verbose); + if (ret < 0) { + fprintf(stderr, "Cannot initialize ELF context!\n"); + return ret; + } + + ret = bpf_fetch_ancillary(ctx); + if (ret < 0) { + fprintf(stderr, "Error fetching ELF ancillary data!\n"); + goto out; + } + + fd = bpf_fetch_prog_sec(ctx, section); + if (fd < 0) { + fprintf(stderr, "Error 
fetching program/map!\n"); + ret = fd; + goto out; + } + + ret = bpf_fill_prog_arrays(ctx); + if (ret < 0) + fprintf(stderr, "Error filling program arrays!\n"); out: - close(file_fd); - bpf_clear_finfo(); - return prog_fd; + bpf_elf_ctx_destroy(ctx, ret < 0); + if (ret < 0) { + if (fd) + close(fd); + return ret; + } + + return fd; } static int @@ -803,6 +1763,7 @@ bpf_map_set_recv(int fd, int *fds, struct bpf_map_aux *aux, int bpf_send_map_fds(const char *path, const char *obj) { + struct bpf_elf_ctx *ctx = &__ctx; struct sockaddr_un addr; struct bpf_map_data bpf_aux; int fd, ret; @@ -827,18 +1788,18 @@ int bpf_send_map_fds(const char *path, const char *obj) memset(&bpf_aux, 0, sizeof(bpf_aux)); - bpf_aux.fds = map_fds; - bpf_aux.ent = map_ent; - + bpf_aux.fds = ctx->map_fds; + bpf_aux.ent = ctx->maps; + bpf_aux.st = &ctx->stat; bpf_aux.obj = obj; - bpf_aux.st = &bpf_st; ret = bpf_map_set_send(fd, &addr, sizeof(addr), &bpf_aux, - bpf_maps_count()); + bpf_maps_count(ctx)); if (ret < 0) fprintf(stderr, "Cannot send fds to %s: %s\n", path, strerror(errno)); + bpf_maps_teardown(ctx); close(fd); return ret; } diff --git a/tc/tc_bpf.h b/tc/tc_bpf.h index 2ad8812..526d0b1 100644 --- a/tc/tc_bpf.h +++ b/tc/tc_bpf.h @@ -13,61 +13,57 @@ #ifndef _TC_BPF_H_ #define _TC_BPF_H_ 1 -#include <linux/filter.h> #include <linux/netlink.h> -#include <linux/rtnetlink.h> #include <linux/bpf.h> -#include <sys/syscall.h> -#include <errno.h> -#include <stdio.h> -#include <stdint.h> +#include <linux/magic.h> #include "utils.h" #include "bpf_scm.h" +enum { + BPF_NLA_OPS_LEN = 0, + BPF_NLA_OPS, + BPF_NLA_FD, + BPF_NLA_NAME, + __BPF_NLA_MAX, +}; + +#define BPF_NLA_MAX __BPF_NLA_MAX + #define BPF_ENV_UDS "TC_BPF_UDS" +#define BPF_ENV_MNT "TC_BPF_MNT" +#define BPF_ENV_NOLOG "TC_BPF_NOLOG" -int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, - char **bpf_string, bool *need_release, - const char separator); -int bpf_parse_ops(int argc, char **argv, struct sock_filter *bpf_ops, - bool 
from_file); -void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len); +#ifndef BPF_FS_MAGIC +# define BPF_FS_MAGIC 0xcafe4a11 +#endif +#define BPF_DIR_MNT "/sys/fs/bpf" + +#define BPF_DIR_TC "tc" +#define BPF_DIR_GLOBALS "globals" + +#ifndef TRACEFS_MAGIC +# define TRACEFS_MAGIC 0x74726163 +#endif + +#define TRACE_DIR_MNT "/sys/kernel/tracing" + +int bpf_trace_pipe(void); const char *bpf_default_section(const enum bpf_prog_type type); -#ifdef HAVE_ELF -int bpf_open_object(const char *path, enum bpf_prog_type type, - const char *sec, bool verbose); +int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, + enum bpf_prog_type type, const char **ptr_object, + const char **ptr_uds_name, struct nlmsghdr *n); +int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv); +void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len); + +#ifdef HAVE_ELF int bpf_send_map_fds(const char *path, const char *obj); int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux, unsigned int entries); - -static inline __u64 bpf_ptr_to_u64(const void *ptr) -{ - return (__u64) (unsigned long) ptr; -} - -static inline int bpf(int cmd, union bpf_attr *attr, unsigned int size) -{ -#ifdef __NR_bpf - return syscall(__NR_bpf, cmd, attr, size); #else - fprintf(stderr, "No bpf syscall, kernel headers too old?\n"); - errno = ENOSYS; - return -1; -#endif -} -#else -static inline int bpf_open_object(const char *path, enum bpf_prog_type type, - const char *sec, bool verbose) -{ - fprintf(stderr, "No ELF library support compiled in.\n"); - errno = ENOSYS; - return -1; -} - static inline int bpf_send_map_fds(const char *path, const char *obj) { return 0; |