diff --git a/binaries/mips32r1-msb/ip2net b/binaries/mips32r1-msb/ip2net index a4481ac..2a73123 100755 Binary files a/binaries/mips32r1-msb/ip2net and b/binaries/mips32r1-msb/ip2net differ diff --git a/binaries/mips64r2-msb/ip2net b/binaries/mips64r2-msb/ip2net index 28f63ce..14d9331 100755 Binary files a/binaries/mips64r2-msb/ip2net and b/binaries/mips64r2-msb/ip2net differ diff --git a/binaries/ppc/ip2net b/binaries/ppc/ip2net index 7225e3b..12780f5 100755 Binary files a/binaries/ppc/ip2net and b/binaries/ppc/ip2net differ diff --git a/ip2net/ip2net.c b/ip2net/ip2net.c index 60316bd..861c6c9 100644 --- a/ip2net/ip2net.c +++ b/ip2net/ip2net.c @@ -57,8 +57,41 @@ static uint32_t unique(uint32_t *pu, uint32_t ct) +#if defined(__GNUC__) && !defined(__llvm__) +__attribute__((optimize ("no-strict-aliasing"))) +#endif static int cmp6(const void * a, const void * b, void *arg) { + // this function is critical to sort performance + // on big endian systems cpu byte order is equal to network byte order + // no conversion required. it's possible to improve speed by using wide integer compares + // assume that a and b are properly aligned for the widest integer type dereferenced below +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__==__ORDER_BIG_ENDIAN__) +#ifdef __SIZEOF_INT128__ + // the fastest possible way (MIPS64/PPC64 only ?) + const unsigned __int128 *pa = (unsigned __int128*)((struct in6_addr *)a)->s6_addr; + const unsigned __int128 *pb = (unsigned __int128*)((struct in6_addr *)b)->s6_addr; + return *pa < *pb ? -1 : *pa == *pb ? 0 : 1; +#else + const uint64_t *pa = (uint64_t*)((struct in6_addr *)a)->s6_addr; + const uint64_t *pb = (uint64_t*)((struct in6_addr *)b)->s6_addr; + + if (pa[0] < pb[0]) + return -1; + else if (pa[0] == pb[0]) + { + if (pa[1] < pb[1]) + return -1; + else if (pa[1] > pb[1]) + return 1; + else + return 0; + } + else + return 1; // pa[0] > pb[0] +#endif +#else + // little endian or unknown. reversing byte order voids performance improvement. 
so do byte-by-byte comparison for (uint8_t i = 0; i < sizeof(((struct in6_addr *)0)->s6_addr); i++) { if (((struct in6_addr *)a)->s6_addr[i] < ((struct in6_addr *)b)->s6_addr[i]) @@ -67,6 +100,7 @@ static int cmp6(const void * a, const void * b, void *arg) return 1; } return 0; +#endif } // make presorted array unique. return number of unique items. static uint32_t unique6(struct in6_addr *pu, uint32_t ct) @@ -124,6 +158,7 @@ static void ip6_and(const struct in6_addr *a, const struct in6_addr *b, struct i // YES, from my point of view C should work as a portable assembler. It must do what I instruct it to do. // that's why I disable strict aliasing for this function. I observed gcc can miscompile with O2/O3 setting if inlined and not coded "correct" // result = a & b +// assume that a and b are properly aligned #if defined(__GNUC__) && !defined(__llvm__) __attribute__((optimize ("no-strict-aliasing"))) #endif