[PATCH] __div64_32: implement division by multiplication for 32-bit arches
Randy Dunlap
rdunlap at infradead.org
Thu Oct 29 10:09:18 PDT 2015
typos (spellos):
On 10/28/15 15:47, Alexey Brodkin wrote:
> ---
> lib/div64.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 128 insertions(+), 25 deletions(-)
>
> diff --git a/lib/div64.c b/lib/div64.c
> index 62a698a..3055328 100644
> --- a/lib/div64.c
> +++ b/lib/div64.c
> @@ -23,37 +23,140 @@
> /* Not needed on 64bit architectures */
> #if BITS_PER_LONG == 32
>
> +
> +/*
> + * If the divisor happens to be constant, we determine the appropriate
> + * inverse at compile time to turn the division into a few inline
> + * multiplications instead which is much faster.
> + */
> uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
> {
> - uint64_t rem = *n;
> - uint64_t b = base;
> - uint64_t res, d = 1;
> - uint32_t high = rem >> 32;
> -
> - /* Reduce the thing a bit first */
> - res = 0;
> - if (high >= base) {
> - high /= base;
> - res = (uint64_t) high << 32;
> - rem -= (uint64_t) (high*base) << 32;
> - }
> + unsigned int __r, __b = base;
>
> - while ((int64_t)b > 0 && b < rem) {
> - b = b+b;
> - d = d+d;
> - }
> + if (!__builtin_constant_p(__b) || __b == 0) {
> + /* non-constant divisor (or zero): slow path */
> + uint64_t rem = *n;
> + uint64_t b = base;
> + uint64_t res, d = 1;
> + uint32_t high = rem >> 32;
> +
> + /* Reduce the thing a bit first */
> + res = 0;
> + if (high >= base) {
> + high /= base;
> + res = (uint64_t) high << 32;
> + rem -= (uint64_t) (high*base) << 32;
> + }
> +
> + while ((int64_t)b > 0 && b < rem) {
> + b = b+b;
> + d = d+d;
> + }
> +
> + do {
> + if (rem >= b) {
> + rem -= b;
> + res += d;
> + }
> + b >>= 1;
> + d >>= 1;
> + } while (d);
>
> - do {
> - if (rem >= b) {
> - rem -= b;
> - res += d;
> + *n = res;
> + __r = rem;
> + } else if ((__b & (__b - 1)) == 0) {
> + /*
> + * Trivial: __b is constant and a power of 2
> + * gcc does the right thing with this code.
> + * Even though code is the same as above but
> + * we make it visually as a separate path.
> + * Still only one of these branches will survive
> + * pre-processor stage, so let's leave it here.
> + */
> + __r = *n;
> + __r &= (__b - 1);
> + *n /= __b;
> + } else {
> + /* Start of preprocessor calculations */
> +
> + /*
> + * Multiply by inverse of __b: *n/b = *n*(p/b)/p
> + * We rely on the fact that most of this code gets
> + * optimized away at compile time due to constant
> + * propagation and only a couple inline assembly
> + * instructions should remain. Better avoid any
> + * code construct that might prevent that.
> + */
> + unsigned long long __res, __x, __t, __m, __n = *n;
> + unsigned int __p;
> + /* preserve low part of *n for reminder computation */
remainder
> + __r = __n;
> + /* determine number of bits to represent __b */
> + __p = 1 << __div64_fls(__b);
> + /* compute __m = ((__p << 64) + __b - 1) / __b */
> + __m = (~0ULL / __b) * __p;
> + __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;
> + /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */
> + __x = ~0ULL / __b * __b - 1;
> + __res = (__m & 0xffffffff) * (__x & 0xffffffff);
> + __res >>= 32;
> + __res += (__m & 0xffffffff) * (__x >> 32);
> + __t = __res;
> + __res += (__x & 0xffffffff) * (__m >> 32);
> + __t = (__res < __t) ? (1ULL << 32) : 0;
> + __res = (__res >> 32) + __t;
> + __res += (__m >> 32) * (__x >> 32);
> + __res /= __p;
> + /* End of preprocessor calculations */
> +
> + /* Start of run-time calculations */
> + __res = (unsigned int)__m * (unsigned int)__n;
> + __res >>= 32;
> + __res += (unsigned int)__m * (__n >> 32);
> + __t = __res;
> + __res += (unsigned int)__n * (__m >> 32);
> + __t = (__res < __t) ? (1ULL << 32) : 0;
> + __res = (__res >> 32) + __t;
> + __res += (__m >> 32) * (__n >> 32);
> + __res /= __p;
> +
> + /*
> + * The reminder can be computed with 32-bit regs
remainder
> + * only, and gcc is good at that.
> + */
> + {
> + unsigned int __res0 = __res;
> + unsigned int __b0 = __b;
> +
> + __r -= __res0 * __b0;
> }
> - b >>= 1;
> - d >>= 1;
> - } while (d);
> + /* End of run-time calculations */
>
> - *n = res;
> - return rem;
> + *n = __res;
> + }
> + return __r;
> }
>
> EXPORT_SYMBOL(__div64_32);
--
~Randy
More information about the linux-snps-arc
mailing list