author     Ola Liljedahl <ola.liljedahl@arm.com>    2020-02-27 15:10:03 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-02-27 17:05:51 +0000
commit     6a988f6813aa5d929ec7f8aa17946f2d9cf511ce (patch)
tree       44d03c453732bf5f47323e08cae711ea0fcc7e8f /networking
parent     6f41cff0221e245745b39a246f1129050e8e0c6d (diff)
networking: New subproject.
Add scalar and NEON ones' complement checksumming implementations for AArch64 and Armv7-A.
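For illustration, a minimal caller of the new public entry point declared in networking/include/networking.h could look like the sketch below (not part of this patch; the buffer contents are arbitrary). __chksum returns the ones' complement sum itself, so a protocol checksum field would store its bitwise complement.

#include <stdio.h>
#include "networking.h"

int main(void)
{
    /* Arbitrary 20-byte example buffer (IPv4-header sized) */
    unsigned char buf[20] = { 0x45, 0x00, 0x00, 0x14 };
    /* Ones' complement sum of all 16-bit words in buf */
    unsigned short sum = __chksum(buf, sizeof buf);
    printf("sum=0x%04x checksum=0x%04x\n", sum, (unsigned short) ~sum);
    return 0;
}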
Diffstat (limited to 'networking')
-rw-r--r--  networking/Dir.mk                  76
-rw-r--r--  networking/aarch64/chksum_simd.c  146
-rw-r--r--  networking/arm/chksum_simd.c      149
-rw-r--r--  networking/chksum.c                81
-rw-r--r--  networking/chksum_common.h        132
-rw-r--r--  networking/include/networking.h    14
-rw-r--r--  networking/test/chksum.c          381
7 files changed, 979 insertions, 0 deletions
diff --git a/networking/Dir.mk b/networking/Dir.mk
new file mode 100644
index 0000000..b496103
--- /dev/null
+++ b/networking/Dir.mk
@@ -0,0 +1,76 @@
+# Makefile fragment - requires GNU make
+#
+# Copyright (c) 2019-2020, Arm Limited.
+# SPDX-License-Identifier: MIT
+
+S := $(srcdir)/networking
+B := build/networking
+
+ifeq ($(ARCH),)
+all-networking check-networking install-networking clean-networking:
+ @echo "*** Please set ARCH in config.mk. ***"
+ @exit 1
+else
+
+networking-lib-srcs := $(wildcard $(S)/*.[cS]) $(wildcard $(S)/$(ARCH)/*.[cS])
+networking-test-srcs := $(wildcard $(S)/test/*.c)
+
+networking-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+
+networking-libs := \
+ build/lib/libnetworkinglib.so \
+ build/lib/libnetworkinglib.a \
+
+networking-tools := \
+ build/bin/test/chksum
+
+networking-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-lib-srcs)))
+networking-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-test-srcs)))
+
+networking-objs := \
+ $(networking-lib-objs) \
+ $(networking-lib-objs:%.o=%.os) \
+ $(networking-test-objs) \
+
+networking-files := \
+ $(networking-objs) \
+ $(networking-libs) \
+ $(networking-tools) \
+ $(networking-includes) \
+
+all-networking: $(networking-libs) $(networking-tools) $(networking-includes)
+
+$(networking-objs): $(networking-includes)
+$(networking-objs): CFLAGS_ALL += $(networking-cflags)
+
+build/lib/libnetworkinglib.so: $(networking-lib-objs:%.o=%.os)
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
+
+build/lib/libnetworkinglib.a: $(networking-lib-objs)
+ rm -f $@
+ $(AR) rc $@ $^
+ $(RANLIB) $@
+
+build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/include/%.h: $(S)/include/%.h
+ cp $< $@
+
+build/bin/%.sh: $(S)/test/%.sh
+ cp $< $@
+
+check-networking: $(networking-tools)
+ $(EMULATOR) build/bin/test/chksum -i simple
+ $(EMULATOR) build/bin/test/chksum -i scalar
+ $(EMULATOR) build/bin/test/chksum -i simd || true # simd is not always available
+
+install-networking: \
+ $(networking-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
+ $(networking-includes:build/include/%=$(DESTDIR)$(includedir)/%)
+
+clean-networking:
+ rm -f $(networking-files)
+endif
+
+.PHONY: all-networking check-networking install-networking clean-networking
diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c
new file mode 100644
index 0000000..6d5be58
--- /dev/null
+++ b/networking/aarch64/chksum_simd.c
@@ -0,0 +1,146 @@
+/*
+ * AArch64-specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+always_inline
+static inline uint64_t
+slurp_head64(const void **pptr, uint32_t *nbytes)
+{
+ Assert(*nbytes >= 8);
+ uint64_t sum = 0;
+ uint32_t off = (uintptr_t) *pptr % 8;
+ if (likely(off != 0))
+ {
+ /* Get rid of bytes 0..off-1 */
+ const unsigned char *ptr64 = align_ptr(*pptr, 8);
+ uint64_t mask = ALL_ONES << (CHAR_BIT * off);
+ uint64_t val = load64(ptr64) & mask;
+ /* Fold 64-bit sum to 33 bits */
+ sum = val >> 32;
+ sum += (uint32_t) val;
+ *pptr = ptr64 + 8;
+ *nbytes -= 8 - off;
+ }
+ return sum;
+}
+
+always_inline
+static inline uint64_t
+slurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes)
+{
+ Assert(nbytes < 8);
+ if (likely(nbytes != 0))
+ {
+ /* Get rid of bytes 7..nbytes */
+ uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes));
+ Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes);
+ uint64_t val = load64(ptr) & mask;
+ sum += val >> 32;
+ sum += (uint32_t) val;
+ nbytes = 0;
+ }
+ Assert(nbytes == 0);
+ return sum;
+}
+
+unsigned short
+__chksum_aarch64_simd(const void *ptr, unsigned int nbytes)
+{
+ bool swap = (uintptr_t) ptr & 1;
+ uint64_t sum;
+
+ if (unlikely(nbytes < 50))
+ {
+ sum = slurp_small(ptr, nbytes);
+ swap = false;
+ goto fold;
+ }
+
+ /* 8-byte align pointer */
+ Assert(nbytes >= 8);
+ sum = slurp_head64(&ptr, &nbytes);
+ Assert(((uintptr_t) ptr & 7) == 0);
+
+ const uint32_t *may_alias ptr32 = ptr;
+
+ uint64x2_t vsum0 = { 0, 0 };
+ uint64x2_t vsum1 = { 0, 0 };
+ uint64x2_t vsum2 = { 0, 0 };
+ uint64x2_t vsum3 = { 0, 0 };
+
+ /* Sum groups of 64 bytes */
+ for (uint32_t i = 0; i < nbytes / 64; i++)
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+ uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+ uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ vsum1 = vpadalq_u32(vsum1, vtmp1);
+ vsum2 = vpadalq_u32(vsum2, vtmp2);
+ vsum3 = vpadalq_u32(vsum3, vtmp3);
+ ptr32 += 16;
+ }
+ nbytes %= 64;
+
+ /* Fold vsum2 and vsum3 into vsum0 and vsum1 */
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+ vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+
+ /* Add any trailing group of 32 bytes */
+ if (nbytes & 32)
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ vsum1 = vpadalq_u32(vsum1, vtmp1);
+ ptr32 += 8;
+ nbytes -= 32;
+ }
+ Assert(nbytes < 32);
+
+ /* Fold vsum1 into vsum0 */
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+ /* Add any trailing group of 16 bytes */
+ if (nbytes & 16)
+ {
+ uint32x4_t vtmp = vld1q_u32(ptr32);
+ vsum0 = vpadalq_u32(vsum0, vtmp);
+ ptr32 += 4;
+ nbytes -= 16;
+ }
+ Assert(nbytes < 16);
+
+ /* Add any trailing group of 8 bytes */
+ if (nbytes & 8)
+ {
+ uint32x2_t vtmp = vld1_u32(ptr32);
+ vsum0 = vaddw_u32(vsum0, vtmp);
+ ptr32 += 2;
+ nbytes -= 8;
+ }
+ Assert(nbytes < 8);
+
+ uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0));
+ sum += val >> 32;
+ sum += (uint32_t) val;
+
+ /* Handle any trailing 0..7 bytes */
+ sum = slurp_tail64(sum, ptr32, nbytes);
+
+fold:
+ return fold_and_swap(sum, swap);
+}
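For readers without the NEON reference handy: vpadalq_u32 pairwise-adds the four 32-bit lanes of its second operand and accumulates the two results into the 64-bit lanes of the first, so carries collect in 64-bit lanes instead of being lost inside the 64-byte loop. A scalar model of one such step (illustrative only, not part of the patch):

#include <stdint.h>

/* Scalar equivalent of vsum = vpadalq_u32(vsum, vtmp) */
static void padal_u32_model(uint64_t acc[2], const uint32_t v[4])
{
    acc[0] += (uint64_t) v[0] + v[1];
    acc[1] += (uint64_t) v[2] + v[3];
}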
diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c
new file mode 100644
index 0000000..7f69adf
--- /dev/null
+++ b/networking/arm/chksum_simd.c
@@ -0,0 +1,149 @@
+/*
+ * Armv7-A specific checksum implementation using NEON
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "../chksum_common.h"
+
+#ifndef __ARM_NEON
+#pragma GCC target("+simd")
+#endif
+
+#include <arm_neon.h>
+
+unsigned short
+__chksum_arm_simd(const void *ptr, unsigned int nbytes)
+{
+ bool swap = (uintptr_t) ptr & 1;
+ uint64x1_t vsum = { 0 };
+
+ if (unlikely(nbytes < 40))
+ {
+ uint64_t sum = slurp_small(ptr, nbytes);
+ return fold_and_swap(sum, false);
+ }
+
+ /* 8-byte align pointer */
+ /* Inline slurp_head-like code since we use NEON here */
+ Assert(nbytes >= 8);
+ uint32_t off = (uintptr_t) ptr & 7;
+ if (likely(off != 0))
+ {
+ const uint64_t *may_alias ptr64 = align_ptr(ptr, 8);
+ uint64x1_t vword64 = vld1_u64(ptr64);
+ /* Get rid of bytes 0..off-1 */
+ uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+ int64x1_t vshiftl = vdup_n_s64(CHAR_BIT * off);
+ vmask = vshl_u64(vmask, vshiftl);
+ vword64 = vand_u64(vword64, vmask);
+ uint32x2_t vtmp = vreinterpret_u32_u64(vword64);
+ /* Set accumulator */
+ vsum = vpaddl_u32(vtmp);
+ /* Update pointer and remaining size */
+ ptr = (char *) ptr64 + 8;
+ nbytes -= 8 - off;
+ }
+ Assert(((uintptr_t) ptr & 7) == 0);
+
+ /* Sum groups of 64 bytes */
+ uint64x2_t vsum0 = { 0, 0 };
+ uint64x2_t vsum1 = { 0, 0 };
+ uint64x2_t vsum2 = { 0, 0 };
+ uint64x2_t vsum3 = { 0, 0 };
+ const uint32_t *may_alias ptr32 = ptr;
+ for (uint32_t i = 0; i < nbytes / 64; i++)
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
+ uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
+ uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ vsum1 = vpadalq_u32(vsum1, vtmp1);
+ vsum2 = vpadalq_u32(vsum2, vtmp2);
+ vsum3 = vpadalq_u32(vsum3, vtmp3);
+ ptr32 += 16;
+ }
+ nbytes %= 64;
+
+ /* Fold vsum1/vsum2/vsum3 into vsum0 */
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
+ vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
+ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
+
+ /* Add any trailing 16-byte groups */
+ while (likely(nbytes >= 16))
+ {
+ uint32x4_t vtmp0 = vld1q_u32(ptr32);
+ vsum0 = vpadalq_u32(vsum0, vtmp0);
+ ptr32 += 4;
+ nbytes -= 16;
+ }
+ Assert(nbytes < 16);
+
+ /* Fold vsum0 into vsum */
+ {
+ /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
+ vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+ /* 4xu32 (2x(1b+32b)) -> 2xu64 (2x(0b+32b)) */
+ vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
+ /* 2xu64 (2x(0b+32b)) -> 2xu32 (2x32b) */
+ Assert((vgetq_lane_u64(vsum0, 0) >> 32) == 0);
+ Assert((vgetq_lane_u64(vsum0, 1) >> 32) == 0);
+ uint32x2_t vtmp = vmovn_u64(vsum0);
+ /* Add to accumulator */
+ vsum = vpadal_u32(vsum, vtmp);
+ }
+
+ /* Add any trailing group of 8 bytes */
+ if (nbytes & 8)
+ {
+ uint32x2_t vtmp = vld1_u32(ptr32);
+ /* Add to accumulator */
+ vsum = vpadal_u32(vsum, vtmp);
+ ptr32 += 2;
+ nbytes -= 8;
+ }
+ Assert(nbytes < 8);
+
+ /* Handle any trailing 1..7 bytes */
+ if (likely(nbytes != 0))
+ {
+ Assert(((uintptr_t) ptr32 & 7) == 0);
+ Assert(nbytes < 8);
+ uint64x1_t vword64 = vld1_u64((const uint64_t *) ptr32);
+ /* Get rid of bytes 7..nbytes */
+ uint64x1_t vmask = vdup_n_u64(ALL_ONES);
+ int64x1_t vshiftr = vdup_n_s64(-CHAR_BIT * (8 - nbytes));
+ vmask = vshl_u64(vmask, vshiftr); /* Shift right */
+ vword64 = vand_u64(vword64, vmask);
+ /* Fold 64-bit sum to 33 bits */
+ vword64 = vpaddl_u32(vreinterpret_u32_u64(vword64));
+ /* Add to accumulator */
+ vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));
+ }
+
+ /* Fold 64-bit vsum to 32 bits */
+ vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+ vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
+ Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);
+
+ /* Fold 32-bit vsum to 16 bits */
+ uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);
+ vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+ vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
+ Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 1) == 0);
+ Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 2) == 0);
+ Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 3) == 0);
+
+ /* Convert to 16-bit scalar */
+ uint16_t sum = vget_lane_u16(vreinterpret_u16_u32(vsum32), 0);
+
+ if (unlikely(swap)) /* Odd base pointer is unexpected */
+ {
+ sum = bswap16(sum);
+ }
+ return sum;
+}
diff --git a/networking/chksum.c b/networking/chksum.c
new file mode 100644
index 0000000..95ce5ba
--- /dev/null
+++ b/networking/chksum.c
@@ -0,0 +1,81 @@
+/*
+ * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
+ * This sum is often used as a simple checksum in networking.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "networking.h"
+#include "chksum_common.h"
+
+always_inline
+static inline uint32_t
+slurp_head32(const void **pptr, uint32_t *nbytes)
+{
+ uint32_t sum = 0;
+ Assert(*nbytes >= 4);
+ uint32_t off = (uintptr_t) *pptr % 4;
+ if (likely(off != 0))
+ {
+ /* Get rid of bytes 0..off-1 */
+ const unsigned char *ptr32 = align_ptr(*pptr, 4);
+ uint32_t mask = ~0U << (CHAR_BIT * off);
+ sum = load32(ptr32) & mask;
+ *pptr = ptr32 + 4;
+ *nbytes -= 4 - off;
+ }
+ return sum;
+}
+
+/* Additional loop unrolling would help when not auto-vectorizing */
+unsigned short
+__chksum(const void *ptr, unsigned int nbytes)
+{
+ bool swap = false;
+ uint64_t sum = 0;
+
+ if (nbytes > 300)
+ {
+ /* 4-byte align pointer */
+ swap = (uintptr_t) ptr & 1;
+ sum = slurp_head32(&ptr, &nbytes);
+ }
+ /* Else benefit of aligning not worth the overhead */
+
+ /* Sum all 16-byte chunks */
+ const char *cptr = ptr;
+ for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
+ {
+ uint64_t h0 = load32(cptr + 0);
+ uint64_t h1 = load32(cptr + 4);
+ uint64_t h2 = load32(cptr + 8);
+ uint64_t h3 = load32(cptr + 12);
+ sum += h0 + h1 + h2 + h3;
+ cptr += 16;
+ }
+ nbytes %= 16;
+ Assert(nbytes < 16);
+
+ /* Handle any trailing 4-byte chunks */
+ while (nbytes >= 4)
+ {
+ sum += load32(cptr);
+ cptr += 4;
+ nbytes -= 4;
+ }
+ Assert(nbytes < 4);
+
+ if (nbytes & 2)
+ {
+ sum += load16(cptr);
+ cptr += 2;
+ }
+
+ if (nbytes & 1)
+ {
+ sum += *(uint8_t *)cptr;
+ }
+
+ return fold_and_swap(sum, swap);
+}
diff --git a/networking/chksum_common.h b/networking/chksum_common.h
new file mode 100644
index 0000000..958c8cc
--- /dev/null
+++ b/networking/chksum_common.h
@@ -0,0 +1,132 @@
+/*
+ * Common code for checksum implementations
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef CHKSUM_COMMON_H
+#define CHKSUM_COMMON_H
+
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+#error Only little endian supported
+#endif
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Assertions must be explicitly enabled */
+#if WANT_ASSERT
+#undef NDEBUG
+#include <assert.h>
+#define Assert(exp) assert(exp)
+#else
+#define Assert(exp) (void) (exp)
+#endif
+
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define may_alias __attribute__((__may_alias__))
+#define always_inline __attribute__((always_inline))
+#ifdef __clang__
+#define no_unroll_loops
+#else
+#define no_unroll_loops __attribute__((optimize("no-unroll-loops")))
+#endif
+#define bswap16(x) __builtin_bswap16((x))
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#define may_alias
+#define always_inline
+#define no_unroll_loops
+#define bswap16(x) ((uint8_t)((x) >> 8) | ((uint8_t)(x) << 8))
+#endif
+
+#define ALL_ONES ~UINT64_C(0)
+
+static inline
+uint64_t load64(const void *ptr)
+{
+ /* GCC will optimise this to a normal load instruction */
+ uint64_t v;
+ memcpy(&v, ptr, sizeof v);
+ return v;
+}
+
+static inline
+uint32_t load32(const void *ptr)
+{
+ /* GCC will optimise this to a normal load instruction */
+ uint32_t v;
+ memcpy(&v, ptr, sizeof v);
+ return v;
+}
+
+static inline
+uint16_t load16(const void *ptr)
+{
+ /* GCC will optimise this to a normal load instruction */
+ uint16_t v;
+ memcpy(&v, ptr, sizeof v);
+ return v;
+}
+
+/* slurp_small() is for small buffers, don't waste cycles on alignment */
+no_unroll_loops
+always_inline
+static inline uint64_t
+slurp_small(const void *ptr, uint32_t nbytes)
+{
+ const unsigned char *cptr = ptr;
+ uint64_t sum = 0;
+ while (nbytes >= 4)
+ {
+ sum += load32(cptr);
+ cptr += 4;
+ nbytes -= 4;
+ }
+ if (nbytes & 2)
+ {
+ sum += load16(cptr);
+ cptr += 2;
+ }
+ if (nbytes & 1)
+ {
+ sum += (uint8_t) *cptr;
+ }
+ return sum;
+}
+
+static inline const void *
+align_ptr(const void *ptr, size_t bytes)
+{
+ return (void *) ((uintptr_t) ptr & -(uintptr_t) bytes);
+}
+
+always_inline
+static inline uint16_t
+fold_and_swap(uint64_t sum, bool swap)
+{
+ /* Fold 64-bit sum to 32 bits */
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ Assert(sum == (uint32_t) sum);
+
+ /* Fold 32-bit sum to 16 bits */
+ sum = (sum & 0xffff) + (sum >> 16);
+ sum = (sum & 0xffff) + (sum >> 16);
+ Assert(sum == (uint16_t) sum);
+
+ if (unlikely(swap)) /* Odd base pointer is unexpected */
+ {
+ sum = bswap16(sum);
+ }
+
+ return (uint16_t) sum;
+}
+
+#endif
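As a worked example of the folding performed by fold_and_swap() above: the 64-bit accumulator is reduced by repeatedly adding the high half into the low half, which implements the ones' complement end-around carry. A standalone sketch with arbitrary input values (not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Same fold as fold_and_swap(), without the byte swap */
static uint16_t fold64(uint64_t sum)
{
    sum = (sum & 0xffffffff) + (sum >> 32); /* 64 -> 33 bits */
    sum = (sum & 0xffffffff) + (sum >> 32); /* 33 -> 32 bits */
    sum = (sum & 0xffff) + (sum >> 16);     /* 32 -> 17 bits */
    sum = (sum & 0xffff) + (sum >> 16);     /* 17 -> 16 bits */
    return (uint16_t) sum;
}

int main(void)
{
    /* 0xffff + 0xffff = 0x1fffe; the carry wraps around: 0xfffe + 1 = 0xffff */
    assert(fold64(UINT64_C(0x1fffe)) == 0xffff);
    /* A full 32-bit accumulator also folds with end-around carry */
    assert(fold64(UINT64_C(0xffffffff)) == 0xffff);
    return 0;
}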
diff --git a/networking/include/networking.h b/networking/include/networking.h
new file mode 100644
index 0000000..a88feff
--- /dev/null
+++ b/networking/include/networking.h
@@ -0,0 +1,14 @@
+/*
+ * Public API.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+unsigned short __chksum (const void *, unsigned int);
+#if __aarch64__ && __ARM_NEON
+unsigned short __chksum_aarch64_simd (const void *, unsigned int);
+#endif
+#if __arm__ && __ARM_NEON
+unsigned short __chksum_arm_simd (const void *, unsigned int);
+#endif
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
new file mode 100644
index 0000000..50722a4
--- /dev/null
+++ b/networking/test/chksum.c
@@ -0,0 +1,381 @@
+/*
+ * Ones' complement checksum test & benchmark
+ *
+ * Copyright 2016-2020 ARM Limited
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include "../include/networking.h"
+
+#if WANT_ASSERT
+#undef NDEBUG
+#include <assert.h>
+#define Assert(exp) assert(exp)
+#else
+#define Assert(exp) (void) (exp)
+#endif
+
+#ifdef __GNUC__
+#define may_alias __attribute__((__may_alias__))
+#else
+#define may_alias
+#endif
+
+#define CACHE_LINE 64
+#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
+
+/* Reference implementation - do not modify! */
+static uint16_t
+checksum_simple(const void *ptr, uint32_t nbytes)
+{
+ const uint16_t *may_alias hptr = ptr;
+ uint64_t sum = 0; /* Need 64-bit accumulator when nbytes > 64K */
+
+ /* Sum all halfwords, assume misaligned accesses are handled in HW */
+ for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
+ {
+ sum += *hptr++;
+ }
+
+ /* Add any trailing odd byte */
+ if ((nbytes & 0x01) != 0)
+ {
+ sum += *(uint8_t *) hptr;
+ }
+
+ /* Fold 64-bit sum to 32 bits */
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ Assert(sum == (uint32_t) sum);
+
+ /* Fold 32-bit sum to 16 bits */
+ sum = (sum & 0xffff) + (sum >> 16);
+ sum = (sum & 0xffff) + (sum >> 16);
+ Assert(sum == (uint16_t) sum);
+
+ return (uint16_t) sum;
+}
+
+static struct
+{
+ uint16_t (*cksum_fp)(const void *, uint32_t);
+ const char *name;
+} implementations[] =
+{
+ { checksum_simple, "simple"},
+ { __chksum, "scalar"},
+#if __arm__
+ { __chksum_arm_simd, "simd" },
+#elif __aarch64__
+ { __chksum_aarch64_simd, "simd" },
+#endif
+ { NULL, NULL}
+};
+
+static int
+find_impl(const char *name)
+{
+ for (int i = 0; implementations[i].name != NULL; i++)
+ {
+ if (strcmp(implementations[i].name, name) == 0)
+ {
+ return i;
+ }
+ }
+ return -1;
+}
+
+static uint16_t (*CKSUM_FP)(const void *, uint32_t);
+static volatile uint16_t SINK;
+
+static bool
+verify(const void *data, uint32_t offset, uint32_t size)
+{
+
+ uint16_t csum_expected = checksum_simple(data, size);
+ uint16_t csum_actual = CKSUM_FP(data, size);
+ if (csum_actual != csum_expected)
+ {
+ fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
+ "actual %04x expected %04x (valid)",
+ offset, size, csum_actual, csum_expected);
+ if (size < 65536)
+ {
+ /* Fatal error */
+ exit(EXIT_FAILURE);
+ }
+ /* Else some implementations only support sizes up to 2^16 */
+ return false;
+ }
+ return true;
+}
+
+static uint64_t
+clock_get_ns(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
+}
+
+static void
+benchmark(const uint8_t *base,
+ size_t poolsize,
+ uint32_t blksize,
+ uint32_t numops,
+ uint64_t cpufreq)
+{
+ printf("%11u ", (unsigned int) blksize); fflush(stdout);
+
+ uint64_t start = clock_get_ns();
+ for (uint32_t i = 0; i < numops; i ++)
+ {
+ /* Read a random value from the pool */
+ uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
+ /* Generate a random starting address */
+ const void *data = &base[random % (poolsize - blksize)];
+ SINK = CKSUM_FP(data, blksize);
+ }
+ uint64_t end = clock_get_ns();
+
+#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
+ uint64_t elapsed_ns = end - start;
+ uint64_t elapsed_ms = elapsed_ns / 1000000;
+ uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
+ uint64_t accbytes = (uint64_t) numops * blksize;
+ printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
+ unsigned int cyc_per_blk = cpufreq / blks_per_s;
+ printf("%11u ", cyc_per_blk);
+ if (blksize != 0)
+ {
+ unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
+ printf("%7u.%03u ",
+ cyc_per_byte / 1000, cyc_per_byte % 1000);
+ }
+ printf("\n");
+}
+
+int main(int argc, char *argv[])
+{
+ int c;
+ bool DUMP = false;
+ uint32_t IMPL = 0; /* Simple implementation */
+ uint64_t CPUFREQ = 0;
+ uint32_t BLKSIZE = 0;
+ uint32_t NUMOPS = 1000000;
+ uint32_t POOLSIZE = 512 * 1024; /* Typical ARM L2 cache size */
+
+ setvbuf(stdout, NULL, _IOLBF, 160);
+ while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
+ {
+ switch (c)
+ {
+ case 'b' :
+ {
+ int blksize = atoi(optarg);
+ if (blksize < 1 || blksize > POOLSIZE / 2)
+ {
+ fprintf(stderr, "Invalid block size %d\n", blksize);
+ exit(EXIT_FAILURE);
+ }
+ BLKSIZE = (unsigned) blksize;
+ break;
+ }
+ case 'd' :
+ DUMP = true;
+ break;
+ case 'f' :
+ {
+ int64_t cpufreq = atoll(optarg);
+ if (cpufreq < 1)
+ {
+ fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
+ cpufreq);
+ exit(EXIT_FAILURE);
+ }
+ CPUFREQ = cpufreq;
+ break;
+ }
+ case 'i' :
+ {
+ int impl = find_impl(optarg);
+ if (impl < 0)
+ {
+ fprintf(stderr, "Invalid implementation %s\n", optarg);
+ goto usage;
+ }
+ IMPL = (unsigned) impl;
+ break;
+ }
+ case 'n' :
+ {
+ int numops = atoi(optarg);
+ if (numops < 1)
+ {
+ fprintf(stderr, "Invalid number of operations %d\n", numops);
+ exit(EXIT_FAILURE);
+ }
+ NUMOPS = (unsigned) numops;
+ break;
+ }
+ case 'p' :
+ {
+ int poolsize = atoi(optarg);
+ if (poolsize < 4096)
+ {
+ fprintf(stderr, "Invalid pool size %d\n", poolsize);
+ exit(EXIT_FAILURE);
+ }
+ char c = optarg[strlen(optarg) - 1];
+ if (c == 'M')
+ {
+ POOLSIZE = (unsigned) poolsize * 1024 * 1024;
+ }
+ else if (c == 'K')
+ {
+ POOLSIZE = (unsigned) poolsize * 1024;
+ }
+ else
+ {
+ POOLSIZE = (unsigned) poolsize;
+ }
+ break;
+ }
+ default :
+usage :
+ fprintf(stderr, "Usage: checksum <options>\n"
+ "-b <blksize> Block size\n"
+ "-d Dump first 96 bytes of data\n"
+ "-f <cpufreq> CPU frequency (Hz)\n"
+ "-i <impl> Implementation\n"
+ "-n <numops> Number of operations\n"
+ "-p <poolsize> Pool size (K or M suffix)\n"
+ );
+ printf("Implementations:");
+ for (int i = 0; implementations[i].name != NULL; i++)
+ {
+ printf(" %s", implementations[i].name);
+ }
+ printf("\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (optind > argc)
+ {
+ goto usage;
+ }
+
+ CKSUM_FP = implementations[IMPL].cksum_fp;
+ POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
+ uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (base == MAP_FAILED)
+ {
+ perror("mmap"), exit(EXIT_FAILURE);
+ }
+ for (size_t i = 0; i < POOLSIZE / 4; i++)
+ {
+ ((uint32_t *) base)[i] = rand();
+ }
+
+ printf("Implementation: %s\n", implementations[IMPL].name);
+ printf("numops %u, poolsize ", NUMOPS);
+ if (POOLSIZE % (1024 * 1024) == 0)
+ {
+ printf("%uMiB", POOLSIZE / (1024 * 1024));
+ }
+ else if (POOLSIZE % 1024 == 0)
+ {
+ printf("%uKiB", POOLSIZE / 1024);
+ }
+ else
+ {
+ printf("%uB", POOLSIZE);
+ }
+ printf(", blocksize %u, CPU frequency %juMHz\n",
+ BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
+#if WANT_ASSERT
+ printf("Warning: assertions are enabled\n");
+#endif
+
+ if (DUMP)
+ {
+ /* Print out first 96 bytes of data for human debugging */
+ for (int i = 0; i < 96; i++)
+ {
+ if (i % 8 == 0)
+ printf("%2u:", i);
+ printf(" %02x", base[i]);
+ if (i % 8 == 7)
+ printf("\n");
+ }
+ }
+
+ /* Verify that chosen algorithm handles all combinations of offsets and sizes */
+ printf("Verifying..."); fflush(stdout);
+ bool success = true;
+ /* Check all (relevant) combinations of size and offset */
+ for (int size = 0; size <= 256; size++)
+ {
+ for (int offset = 0; offset < 255; offset++)
+ {
+ /* Check at start of mapped memory */
+ success &= verify(&base[offset], offset, size);
+ /* Check at end of mapped memory */
+ uint8_t *p = base + POOLSIZE - (size + offset);
+ success &= verify(p, (uintptr_t) p % 64, size);
+ }
+ }
+ /* Check increasingly larger sizes */
+ for (size_t size = 1; size < POOLSIZE; size *= 2)
+ {
+ success &= verify(base, 0, size);
+ }
+ /* Check the full size, this can detect accumulator overflows */
+ success &= verify(base, 0, POOLSIZE);
+ printf("%s\n", success ? "OK" : "failure");
+
+ /* Print throughput in decimal megabyte (1000000B) per second */
+ if (CPUFREQ != 0)
+ {
+ printf("%11s %11s %11s %11s\n",
+ "block size", "MB/s", "cycles/blk", "cycles/byte");
+ }
+ else
+ {
+ printf("%11s %11s %11s %11s\n",
+ "block size", "MB/s", "ns/blk", "ns/byte");
+ CPUFREQ = 1000000000;
+ }
+ if (BLKSIZE != 0)
+ {
+ benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
+ }
+ else
+ {
+ static const uint16_t sizes[] =
+ { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
+ for (int i = 0; sizes[i] != 0; i++)
+ {
+ uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
+ benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
+ }
+ }
+
+ if (munmap(base, POOLSIZE) != 0)
+ {
+ perror("munmap"), exit(EXIT_FAILURE);
+ }
+
+ return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}