From d13ea934c2b2049d69e717d90b0b993309da6ed0 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Wed, 23 Oct 2024 10:12:39 +0800 Subject: [PATCH] Misc fixes - rx: Flush rx fq when group free - tx: Add lock for txch sharing - kern: Add access check for ipv6 in xudp_hash() - bpf: Use lazy logging when prog loading - ip pkt: init pseudo_ip_id as random - udp cksum: update algorithm according to kernel - cksum: fix wrong byte order check in do_csum() - udp: add checksum for ipv4 udp - ip: add pseudo random ip id with global variable Signed-off-by: Philo Lu --- bpf/bpf.c | 44 ++++++++++++++++++++++++++++++++++++-------- group/channel.c | 12 ++++++++++++ group/group.c | 11 +++++++++-- include/channel.h | 2 ++ kern/kern_core.c | 4 ++++ xudp/checksum.h | 30 +++++++++++++++++++++++++----- xudp/packet.c | 34 +++++++++++++++++++++++++++++++++- xudp/packet.h | 1 + xudp/tx.c | 21 ++++++++++++++------- xudp/xsk.c | 1 + xudp/xudp.c | 1 + xudp/xudp_types.h | 6 ++++++ 12 files changed, 144 insertions(+), 23 deletions(-) diff --git a/bpf/bpf.c b/bpf/bpf.c index 2b42fe1..79bfa7d 100644 --- a/bpf/bpf.c +++ b/bpf/bpf.c @@ -33,7 +33,7 @@ struct map{ static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, int insn_cnt, - const char *license, char *log, int log_size) + const char *license, char *log, int log_size, int log_level) { union bpf_attr attr = { .prog_type = type, @@ -42,7 +42,7 @@ static int bpf_prog_load(enum bpf_prog_type type, .license = ptr_to_u64(license), .log_buf = ptr_to_u64(log), .log_size = log_size, - .log_level = 1, + .log_level = log_level, }; return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); @@ -465,12 +465,13 @@ static int bpf_elf_collect_scn(struct bpf *b) return 0; } - +/* verifier maximum in kernels */ +#define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) static int _bpf_load(struct bpf *b, char *e, int size) { Elf *elf; - int ret; - char log[1024 * 500]; + int ret, log_level, log_size; + char *log_buf; elf = elf_memory(e, size); if (!elf) @@ -500,17 +501,44 @@ static int _bpf_load(struct bpf *b, char *e, int size) goto err; } + /* Bpf prog loading may fail due to insufficient log buffer. To avoid this, we firstly try + * loading with log_level == 0. If it fails, try again with log_level == 1 to collect more + * error messages. + */ + log_level = 0; +retry_load: + if (log_level) { + log_size = BPF_LOG_BUF_SIZE; + log_buf = malloc(log_size); + if (!log_buf) { + ret = -ENOMEM; + goto err; + } + + } else { + log_size = 0; + log_buf = NULL; + } + ret = bpf_prog_load(b->type, (const struct bpf_insn *)b->ins.p, b->ins.size/sizeof(struct bpf_insn), b->license.p, - log, - sizeof(log)); + log_buf, log_size, log_level); if (ret < 0) { - printf("==== bpf load log ====\n%s\n", log); + + if (!log_level) { + log_level = 1; + goto retry_load; + } + + printf("==== bpf load log ====\n%s\n", log_buf); printf("bpf load err: %s\n", strerror(errno)); } + if (log_buf) + free(log_buf); + b->prog_fd = ret; err: diff --git a/group/channel.c b/group/channel.c index d4ae051..597e620 100644 --- a/group/channel.c +++ b/group/channel.c @@ -294,4 +294,16 @@ int xudp_recycle(xudp_msghdr *hdr) return 0; } +void xudp_xsk_flush_fq(struct xdpsock *xsk) +{ + struct xdp_desc descs[BATCH_SIZE]; + unsigned int n; + while (1) { + n = xq_deq(&xsk->ring, descs, BATCH_SIZE); + if (!n) + break; + + umem_fq_put(xsk, descs, n); + } +} diff --git a/group/group.c b/group/group.c index 072a129..9438198 100644 --- a/group/group.c +++ b/group/group.c @@ -189,8 +189,15 @@ void xudp_group_free(struct xudp_group *g) rxch = gnic->rxch + i; xsk = &rxch->xsk; - if (xsk->x && xsk->sfd != -1) - __xudp_xsk_free(xsk); + if (!xsk->x || xsk->sfd == -1) + continue; + + /* There could still be pending packets using fq. Clear them. */ + close(xsk->sfd); + xsk->sfd = -1; + xudp_xsk_flush_fq(xsk); + + __xudp_xsk_free(xsk); } free(gnic); diff --git a/include/channel.h b/include/channel.h index 84ebcce..a0dfbde 100644 --- a/include/channel.h +++ b/include/channel.h @@ -105,6 +105,8 @@ static inline void dump_check(xudp *x, struct xudp_group *g, char *pkt, int len) dump_free(d, g); } } + +void xudp_xsk_flush_fq(struct xdpsock *xsk); #endif diff --git a/kern/kern_core.c b/kern/kern_core.c index 015d675..a87cb31 100644 --- a/kern/kern_core.c +++ b/kern/kern_core.c @@ -182,6 +182,10 @@ static int xudp_hash(struct xudp_ctx *ctx) hash = ctx->hdrs.udp->source; + /* This cannot happen actually, but is necessary to pass verifier. */ + if (!access_ok(ctx, &ctx->hdrs.iph6->saddr)) + return hash; + #pragma unroll for (i = 0; i < sizeof(ctx->hdrs.iph6->saddr)/4; ++i) hash += *((int *)&(ctx->hdrs.iph6->saddr) + i); diff --git a/xudp/checksum.h b/xudp/checksum.h index fd31be0..99d9950 100644 --- a/xudp/checksum.h +++ b/xudp/checksum.h @@ -141,11 +141,28 @@ static inline u16 udp_checksum(u8 *u, __be32 saddr, __be32 daddr, u16 size) #define sum32(sum, val) { \ int carry; \ - sum += val; \ - carry = sum < val; \ + sum += (val); \ + carry = sum < (val); \ sum += carry; \ } +/* Calculate checksum for pseudo header including ipv4 and ipv6. + * Codes are derived from csum_tcpudp_nofold() in kernel (lib/checksum.c). + */ +static inline u32 udp_hdr_csum(u32 sum, struct in_addr *saddr, + struct in_addr *daddr, u32 size) +{ + sum32(sum, saddr->s_addr); + sum32(sum, daddr->s_addr); +#if __BYTE_ORDER == __BIG_ENDIAN + sum32(sum, IPPROTO_UDP + size); +#else + sum32(sum, (IPPROTO_UDP + size) << 8); +#endif + + return sum; +} + static inline u32 udp6_hdr_csum(u32 sum, struct in6_addr *saddr, struct in6_addr *daddr, u32 size) { @@ -159,8 +176,11 @@ static inline u32 udp6_hdr_csum(u32 sum, struct in6_addr *saddr, sum32(sum, daddr->s6_addr32[2]); sum32(sum, daddr->s6_addr32[3]); - sum32(sum, htonl(size)); - sum32(sum, htonl(IPPROTO_UDP)); +#if __BYTE_ORDER == __BIG_ENDIAN + sum32(sum, IPPROTO_UDP + size); +#else + sum32(sum, (IPPROTO_UDP + size) << 8); +#endif return sum; } @@ -183,7 +203,7 @@ static inline u32 do_csum(unsigned char *buf, u32 size) } if (size & 1) { -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN sum += *buf; #else sum += (*buf << 8); diff --git a/xudp/packet.c b/xudp/packet.c index c6f4ada..08708ab 100644 --- a/xudp/packet.c +++ b/xudp/packet.c @@ -12,6 +12,8 @@ */ #include +#include +#include #include "packet.h" #include "checksum.h" @@ -22,6 +24,11 @@ #define CSUM_MANGLED_0 ((u16)0xffff) +/* Used to generate iph.id. All packets are marked with Don't Fragment so + * theoretically iph.id does matter. We just init it randomly and keep + * it changing as ip packets generating to mimic kernel behavior. */ +static unsigned short pseudo_ip_id; + typedef unsigned char u8; //#ifdef __x86_64__ @@ -40,6 +47,13 @@ typedef unsigned char u8; //#define memcpy __movsb //#endif +/* set pseudo_ip_id as a random number during xudp_init */ +void packet_build_init(void) +{ + srand(time(0)); + pseudo_ip_id = rand(); +} + static void xudp_checksum_half(struct iphdr *iph) { #define IP_HALF_SUM (ntohs(IP_VIT) + \ @@ -51,6 +65,7 @@ static void xudp_checksum_half(struct iphdr *iph) sum = IP_HALF_SUM; sum += iph->tot_len; + sum += iph->id; p = (u16 *)&iph->saddr; @@ -73,7 +88,7 @@ static void iph_build(struct iphdr *iph, u32 size, *((__be16 *)iph) = htons(IP_VIT); iph->tot_len = htons(sizeof(*iph) + size); - iph->id = 0; + iph->id = pseudo_ip_id++; iph->frag_off = htons(IP_DF); iph->ttl = IP_XUDP_TTL; iph->protocol = IPPROTO_UDP; @@ -102,6 +117,20 @@ static void iph_build6(struct ipv6hdr *iph6, u32 size, iph6->daddr = dst->sin6_addr; } +static void udp_csum(struct udphdr *udp, u32 size, + struct in_addr *saddr, struct in_addr *daddr) +{ + u32 sum; + + sum = do_csum((unsigned char *)udp, size); + sum = udp_hdr_csum(sum, saddr, daddr, size); + + udp->check = csum_fold(sum); + + if (udp->check == 0) + udp->check = CSUM_MANGLED_0; +} + static void udp_csum6(struct udphdr *udp, u32 size, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -172,6 +201,9 @@ void xudp_packet_udp(struct packet_info *info) iph_build(iph, size, info->from, info->to); udp_build(udp, size, info->from->sin_port, info->to->sin_port); + /* udp checksum */ + udp_csum(udp, size, &info->from->sin_addr, &info->to->sin_addr); + info->len = info->payload_size + IPV4_HEADROOM; } else { eth = (void *)(info->data - IPV6_HEADROOM); diff --git a/xudp/packet.h b/xudp/packet.h index 6828534..3d0de8b 100644 --- a/xudp/packet.h +++ b/xudp/packet.h @@ -52,6 +52,7 @@ struct packet_info { int len; }; +void packet_build_init(void); void xudp_packet_udp(struct packet_info *info); void xudp_packet_udp_payload(struct packet_info *info); diff --git a/xudp/tx.c b/xudp/tx.c index 86e40d8..6c17043 100644 --- a/xudp/tx.c +++ b/xudp/tx.c @@ -624,7 +624,9 @@ int xudp_send_channel(struct xdpsock *xsk, char *buf, u32 size, info.payload = buf; info.payload_size = size; + pthread_spin_lock(&txch->lock); ret = xudp_xsk_send_one(txch, &info, g, flags); + pthread_spin_unlock(&txch->lock); end: return ret; @@ -798,22 +800,27 @@ err: int xudp_commit_channel(xudp_channel *ch) { struct txch *txch; + int ret = 0; ch = ch->tx_xsk; txch = (struct txch *)ch; - kick_xsk(txch); + pthread_spin_lock(&txch->lock); + kick_xsk(txch); logdebug(ch->x->log, "commit channel. complete: %s\n", txch->need_commit ? "no" : "yes"); - if (txch->need_commit) { - errno = EAGAIN; - return -XUDP_ERR_COMMIT_AGAIN; - } - errno = EXIT_SUCCESS; - return 0; + if (txch->need_commit) { + errno = EAGAIN; + ret = -XUDP_ERR_COMMIT_AGAIN; + goto out; + } + errno = EXIT_SUCCESS; +out: + pthread_spin_unlock(&txch->lock); + return ret; } void xudp_tx_set_frame_size(int size) diff --git a/xudp/xsk.c b/xudp/xsk.c index 5b46401..2672739 100644 --- a/xudp/xsk.c +++ b/xudp/xsk.c @@ -626,6 +626,7 @@ static int xsk_tx_init(xudp *x, struct txch *txch) if (err) goto err; + pthread_spin_init(&txch->lock, PTHREAD_PROCESS_SHARED); return 0; err: xudp_xsk_free(&txch->xsk); diff --git a/xudp/xudp.c b/xudp/xudp.c index 3c9ab09..2381357 100644 --- a/xudp/xudp.c +++ b/xudp/xudp.c @@ -222,6 +222,7 @@ xudp *xudp_init(struct xudp_conf *c, u32 c_size) log->prefix_len = strlen(log->prefix); neigh_init(); + packet_build_init(); x->log = log; diff --git a/xudp/xudp_types.h b/xudp/xudp_types.h index 03ce3bc..7049369 100644 --- a/xudp/xudp_types.h +++ b/xudp/xudp_types.h @@ -95,6 +95,12 @@ struct txch { /* fast link */ struct xudp_nic *nic; + + /* txch may be shared by multiple groups with the same group id. + * For example, when nginx reloading the new worker and old worker share the same group id, + * thus sharing the same txch. + */ + pthread_spinlock_t lock; }; struct xudp_nicxsk{ -- Gitee