Commit 162c65f2, authored by Nicolas Pitre, committed by Russell King

[ARM PATCH] 1674/1: better ARM division routines

Patch from Nicolas Pitre

Here's my rewrite of the ARM division routines:

 - take advantage of the clz instruction on ARMv5+
 - simpler and faster code for modulo routines
 - fast path for power of 2 divisors
 - other miscellaneous optimizations

This code has already been merged into gcc-3.4.
parent 157e3d70
@ libgcc1 routines for ARM cpu. /*
@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines
*
* Author: Nicolas Pitre <nico@cam.org>
* - contributed to gcc-3.4 on Sep 30, 2003
* - adapted for the Linux kernel on Oct 2, 2003
*/
/* Copyright (C) 1995, 1996, 1998 Free Software Foundation, Inc. /* Copyright 1995, 1996, 1998, 1999, 2000, 2003 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the under the terms of the GNU General Public License as published by the
...@@ -10,11 +15,12 @@ later version. ...@@ -10,11 +15,12 @@ later version.
In addition to the permissions in the GNU General Public License, the In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the Free Software Foundation gives you unlimited permission to link the
compiled version of this file with other programs, and to distribute compiled version of this file into combinations with other programs,
those programs without any restriction coming from the use of this and to distribute those combinations without any restriction coming
file. (The General Public License restrictions do apply in other from the use of this file. (The General Public License restrictions
respects; for example, they cover modification of the file, and do apply in other respects; for example, they cover modification of
distribution when not linked into another program.) the file, and distribution when not linked into a combine
executable.)
This file is distributed in the hope that it will be useful, but This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of WITHOUT ANY WARRANTY; without even the implied warranty of
...@@ -26,287 +32,283 @@ along with this program; see the file COPYING. If not, write to ...@@ -26,287 +32,283 @@ along with this program; see the file COPYING. If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330, the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */ Boston, MA 02111-1307, USA. */
/* As a special exception, if you link this library with other files,
some of which are compiled with GCC, to produce an executable,
this library does not by itself cause the resulting executable
to be covered by the GNU General Public License.
This exception does not however invalidate any other reasons why
the executable file might be covered by the GNU General Public License.
*/
/* This code is derived from gcc 2.95.3
* 29/07/01 Adapted for linux
* 27/03/03 Ian Molton Clean up CONFIG_CPU
*/
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#include <asm/hardware.h>
#include <linux/config.h>
.macro ARM_DIV_BODY dividend, divisor, result, curbit
#define RET mov
#define RETc(x) mov##x #if __LINUX_ARM_ARCH__ >= 5
#define RETCOND
clz \curbit, \divisor
dividend .req r0 clz \result, \dividend
divisor .req r1 sub \result, \curbit, \result
result .req r2 mov \curbit, #1
overdone .req r2 mov \divisor, \divisor, lsl \result
curbit .req r3 mov \curbit, \curbit, lsl \result
mov \result, #0
ENTRY(__udivsi3) #else
cmp divisor, #0
beq Ldiv0 @ Initially shift the divisor left 3 bits if possible,
mov curbit, #1 @ set curbit accordingly. This allows for curbit to be located
mov result, #0 @ at the left end of each 4 bit nibbles in the division loop
cmp dividend, divisor @ to save one loop in most cases.
bcc Lgot_result_udivsi3 tst \divisor, #0xe0000000
1: moveq \divisor, \divisor, lsl #3
moveq \curbit, #8
movne \curbit, #1
@ Unless the divisor is very big, shift it up in multiples of @ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main @ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is @ division loop. Continue shifting until the divisor is
@ larger than the dividend. @ larger than the dividend.
cmp divisor, #0x10000000 1: cmp \divisor, #0x10000000
cmpcc divisor, dividend cmplo \divisor, \dividend
movcc divisor, divisor, lsl #4 movlo \divisor, \divisor, lsl #4
movcc curbit, curbit, lsl #4 movlo \curbit, \curbit, lsl #4
bcc 1b blo 1b
2:
@ For very big divisors, we must shift it a bit at a time, or @ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing. @ we will be in danger of overflowing.
cmp divisor, #0x80000000 1: cmp \divisor, #0x80000000
cmpcc divisor, dividend cmplo \divisor, \dividend
movcc divisor, divisor, lsl #1 movlo \divisor, \divisor, lsl #1
movcc curbit, curbit, lsl #1 movlo \curbit, \curbit, lsl #1
bcc 2b blo 1b
3:
@ Test for possible subtractions, and note which bits
@ are done in the result. On the final pass, this may subtract
@ too much from the dividend, but the result will be ok, since the
@ "bit" will have been shifted out at the bottom.
cmp dividend, divisor
subcs dividend, dividend, divisor
orrcs result, result, curbit
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs result, result, curbit, lsr #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs result, result, curbit, lsr #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs result, result, curbit, lsr #3
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne 3b
Lgot_result_udivsi3:
mov r0, result
RET pc, lr
Ldiv0: mov \result, #0
str lr, [sp, #-4]!
bl __div0
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
/* __umodsi3 ----------------------- */ #endif
@ Division loop
1: cmp \dividend, \divisor
subhs \dividend, \dividend, \divisor
orrhs \result, \result, \curbit
cmp \dividend, \divisor, lsr #1
subhs \dividend, \dividend, \divisor, lsr #1
orrhs \result, \result, \curbit, lsr #1
cmp \dividend, \divisor, lsr #2
subhs \dividend, \dividend, \divisor, lsr #2
orrhs \result, \result, \curbit, lsr #2
cmp \dividend, \divisor, lsr #3
subhs \dividend, \dividend, \divisor, lsr #3
orrhs \result, \result, \curbit, lsr #3
cmp \dividend, #0 @ Early termination?
movnes \curbit, \curbit, lsr #4 @ No, any more bits to do?
movne \divisor, \divisor, lsr #4
bne 1b
.endm
@ ARM_DIV2_ORDER divisor, order
@
@ Sets "order" to the bit index of the divisor, so that a logical right
@ shift by "order" divides by it.  The callers in this file expand this
@ macro only after the divisor has tested as a power of 2.
@ The pre-ARMv5 path shifts "divisor" right while searching, so the
@ register passed as "divisor" is modified on that path.
.macro ARM_DIV2_ORDER divisor, order
#if __LINUX_ARM_ARCH__ >= 5
@ order = 31 - (count of leading zero bits) = index of the highest set bit
clz \order, \divisor
rsb \order, \order, #31
#else
@ No clz on this path: binary search, narrowing the range at each step.
cmp \divisor, #(1 << 16) @ anything at bit 16 or above?
movhs \divisor, \divisor, lsr #16 @ yes: keep only the upper halfword
movhs \order, #16 @ and start the count at 16
movlo \order, #0 @ no: start the count at 0
cmp \divisor, #(1 << 8) @ anything at bit 8 or above?
movhs \divisor, \divisor, lsr #8
addhs \order, \order, #8
cmp \divisor, #(1 << 4) @ anything at bit 4 or above?
movhs \divisor, \divisor, lsr #4
addhs \order, \order, #4
@ divisor is now below 16.  The last step is exact only for the values
@ 1, 2, 4 and 8: 8 adds 3, while 1, 2 and 4 add divisor >> 1 (0, 1, 2).
cmp \divisor, #(1 << 2)
addhi \order, \order, #3
addls \order, \order, \divisor, lsr #1
#endif
.endm
.macro ARM_MOD_BODY dividend, divisor, order, spare
#if __LINUX_ARM_ARCH__ >= 5
clz \order, \divisor
clz \spare, \dividend
sub \order, \order, \spare
mov \divisor, \divisor, lsl \order
#else
mov \order, #0
ENTRY(__umodsi3)
cmp divisor, #0
beq Ldiv0
mov curbit, #1
cmp dividend, divisor
RETc(cc) pc, lr
1:
@ Unless the divisor is very big, shift it up in multiples of @ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main @ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is @ division loop. Continue shifting until the divisor is
@ larger than the dividend. @ larger than the dividend.
cmp divisor, #0x10000000 1: cmp \divisor, #0x10000000
cmpcc divisor, dividend cmplo \divisor, \dividend
movcc divisor, divisor, lsl #4 movlo \divisor, \divisor, lsl #4
movcc curbit, curbit, lsl #4 addlo \order, \order, #4
bcc 1b blo 1b
2:
@ For very big divisors, we must shift it a bit at a time, or @ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing. @ we will be in danger of overflowing.
cmp divisor, #0x80000000 1: cmp \divisor, #0x80000000
cmpcc divisor, dividend cmplo \divisor, \dividend
movcc divisor, divisor, lsl #1 movlo \divisor, \divisor, lsl #1
movcc curbit, curbit, lsl #1 addlo \order, \order, #1
bcc 2b blo 1b
3: #endif
@ Test for possible subtractions. On the final pass, this may
@ subtract too much from the dividend, so keep track of which @ Perform all needed subtractions to keep only the remainder.
@ subtractions are done, we can fix them up afterwards... @ Do comparisons in batches of 4 first.
mov overdone, #0 subs \order, \order, #3 @ yes, 3 is intended here
cmp dividend, divisor blt 2f
subcs dividend, dividend, divisor
cmp dividend, divisor, lsr #1 1: cmp \dividend, \divisor
subcs dividend, dividend, divisor, lsr #1 subhs \dividend, \dividend, \divisor
orrcs overdone, overdone, curbit, ror #1 cmp \dividend, \divisor, lsr #1
cmp dividend, divisor, lsr #2 subhs \dividend, \dividend, \divisor, lsr #1
subcs dividend, dividend, divisor, lsr #2 cmp \dividend, \divisor, lsr #2
orrcs overdone, overdone, curbit, ror #2 subhs \dividend, \dividend, \divisor, lsr #2
cmp dividend, divisor, lsr #3 cmp \dividend, \divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3 subhs \dividend, \dividend, \divisor, lsr #3
orrcs overdone, overdone, curbit, ror #3 cmp \dividend, #1
mov ip, curbit mov \divisor, \divisor, lsr #4
cmp dividend, #0 @ Early termination? subges \order, \order, #4
movnes curbit, curbit, lsr #4 @ No, any more bits to do? bge 1b
movne divisor, divisor, lsr #4
bne 3b tst \order, #3
teqne \dividend, #0
@ Any subtractions that we should not have done will be recorded in beq 5f
@ the top three bits of "overdone". Exactly which were not needed
@ are governed by the position of the bit, stored in ip. @ Either 1, 2 or 3 comparison/subtractions are left.
@ If we terminated early, because dividend became zero, 2: cmn \order, #2
@ then none of the below will match, since the bit in ip will not be blt 4f
@ in the bottom nibble. beq 3f
ands overdone, overdone, #0xe0000000 cmp \dividend, \divisor
RETc(eq) pc, lr @ No fixups needed subhs \dividend, \dividend, \divisor
tst overdone, ip, ror #3 mov \divisor, \divisor, lsr #1
addne dividend, dividend, divisor, lsr #3 3: cmp \dividend, \divisor
tst overdone, ip, ror #2 subhs \dividend, \dividend, \divisor
addne dividend, dividend, divisor, lsr #2 mov \divisor, \divisor, lsr #1
tst overdone, ip, ror #1 4: cmp \dividend, \divisor
addne dividend, dividend, divisor, lsr #1 subhs \dividend, \dividend, \divisor
RET pc, lr 5:
.endm
@ __udivsi3: unsigned 32-bit division.
@ In:  r0 = dividend, r1 = divisor
@ Out: r0 = quotient
@ Uses r2 and r3 as scratch; r1 may be modified as well.
@ A zero divisor branches to Ldiv0.
ENTRY(__udivsi3)
subs r2, r1, #1 @ r2 = divisor - 1, flags describe divisor vs 1
moveq pc, lr @ divisor == 1: quotient is the dividend, already in r0
bcc Ldiv0 @ borrow means divisor == 0
cmp r0, r1
bls 11f @ dividend <= divisor: quotient is 1 or 0
tst r1, r2 @ divisor & (divisor - 1) == 0 means power of 2
beq 12f
@ General case: the macro leaves the quotient in r2.
ARM_DIV_BODY r0, r1, r2, r3
mov r0, r2
mov pc, lr
@ Flags here still come from the cmp above.
11: moveq r0, #1 @ dividend == divisor
movne r0, #0 @ dividend < divisor
mov pc, lr
@ Power of 2 divisor: shift right by its bit index, computed into r2.
12: ARM_DIV2_ORDER r1, r2
mov r0, r0, lsr r2
mov pc, lr
@ __umodsi3: unsigned 32-bit modulo.
@ In:  r0 = dividend, r1 = divisor
@ Out: r0 = remainder
@ Uses r2 and r3 as scratch; r1 may be modified by ARM_MOD_BODY.
@ A zero divisor branches to Ldiv0.
@ The fast paths below are one chain of conditionally executed
@ instructions; each condition relies on the flags left by the previous
@ flag-setting instruction, so their order must not change.
ENTRY(__umodsi3)
subs r2, r1, #1 @ compare divisor with 1
bcc Ldiv0 @ borrow means divisor == 0
cmpne r0, r1 @ compare dividend with divisor
moveq r0, #0 @ divisor == 1 or dividend == divisor: remainder is 0
tsthi r1, r2 @ see if divisor is power of 2
andeq r0, r0, r2 @ power of 2: remainder = dividend & (divisor - 1); on the other eq paths r0 is already 0
movls pc, lr @ return for every case handled above, including dividend < divisor (r0 unchanged)
@ General case: dividend > divisor and divisor is not a power of 2.
ARM_MOD_BODY r0, r1, r2, r3
mov pc, lr
ENTRY(__divsi3) ENTRY(__divsi3)
eor ip, dividend, divisor @ Save the sign of the result.
mov curbit, #1 cmp r1, #0
mov result, #0 eor ip, r0, r1 @ save the sign of the result.
cmp divisor, #0
rsbmi divisor, divisor, #0 @ Loops below use unsigned.
beq Ldiv0 beq Ldiv0
cmp dividend, #0 rsbmi r1, r1, #0 @ loops below use unsigned.
rsbmi dividend, dividend, #0 subs r2, r1, #1 @ division by 1 or -1 ?
cmp dividend, divisor beq 10f
bcc Lgot_result_divsi3 movs r3, r0
rsbmi r3, r0, #0 @ positive dividend value
cmp r3, r1
bls 11f
tst r1, r2 @ divisor is power of 2 ?
beq 12f
1: ARM_DIV_BODY r3, r1, r0, r2
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main cmp ip, #0
@ division loop. Continue shifting until the divisor is rsbmi r0, r0, #0
@ larger than the dividend. mov pc, lr
cmp divisor, #0x10000000
cmpcc divisor, dividend 10: teq ip, r0 @ same sign ?
movcc divisor, divisor, lsl #4 rsbmi r0, r0, #0
movcc curbit, curbit, lsl #4 mov pc, lr
bcc 1b
11: movlo r0, #0
moveq r0, ip, asr #31
orreq r0, r0, #1
mov pc, lr
12: ARM_DIV2_ORDER r1, r2
2:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc 2b
3:
@ Test for possible subtractions, and note which bits
@ are done in the result. On the final pass, this may subtract
@ too much from the dividend, but the result will be ok, since the
@ "bit" will have been shifted out at the bottom.
cmp dividend, divisor
subcs dividend, dividend, divisor
orrcs result, result, curbit
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs result, result, curbit, lsr #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs result, result, curbit, lsr #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs result, result, curbit, lsr #3
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne 3b
Lgot_result_divsi3:
mov r0, result
cmp ip, #0 cmp ip, #0
mov r0, r3, lsr r2
rsbmi r0, r0, #0 rsbmi r0, r0, #0
RET pc, lr mov pc, lr
ENTRY(__modsi3) ENTRY(__modsi3)
mov curbit, #1
cmp divisor, #0 cmp r1, #0
rsbmi divisor, divisor, #0 @ Loops below use unsigned.
beq Ldiv0 beq Ldiv0
@ Need to save the sign of the dividend, unfortunately, we need rsbmi r1, r1, #0 @ loops below use unsigned.
@ ip later on; this is faster than pushing lr and using that. movs ip, r0 @ preserve sign of dividend
str dividend, [sp, #-4]! rsbmi r0, r0, #0 @ if negative make positive
cmp dividend, #0 subs r2, r1, #1 @ compare divisor with 1
rsbmi dividend, dividend, #0 cmpne r0, r1 @ compare dividend with divisor
cmp dividend, divisor moveq r0, #0
bcc Lgot_result_modsi3 tsthi r1, r2 @ see if divisor is power of 2
andeq r0, r0, r2
1: bls 10f
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main ARM_MOD_BODY r0, r1, r2, r3
@ division loop. Continue shifting until the divisor is
@ larger than the dividend. 10: cmp ip, #0
cmp divisor, #0x10000000 rsbmi r0, r0, #0
cmpcc divisor, dividend mov pc, lr
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc 1b Ldiv0:
str lr, [sp, #-4]!
bl __div0
mov r0, #0 @ About as wrong as it could be.
ldr pc, [sp], #4
2:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc 2b
3:
@ Test for possible subtractions. On the final pass, this may
@ subtract too much from the dividend, so keep track of which
@ subtractions are done, we can fix them up afterwards...
mov overdone, #0
cmp dividend, divisor
subcs dividend, dividend, divisor
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs overdone, overdone, curbit, ror #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs overdone, overdone, curbit, ror #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs overdone, overdone, curbit, ror #3
mov ip, curbit
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne 3b
@ Any subtractions that we should not have done will be recorded in
@ the top three bits of "overdone". Exactly which were not needed
@ are governed by the position of the bit, stored in ip.
@ If we terminated early, because dividend became zero,
@ then none of the below will match, since the bit in ip will not be
@ in the bottom nibble.
ands overdone, overdone, #0xe0000000
beq Lgot_result_modsi3
tst overdone, ip, ror #3
addne dividend, dividend, divisor, lsr #3
tst overdone, ip, ror #2
addne dividend, dividend, divisor, lsr #2
tst overdone, ip, ror #1
addne dividend, dividend, divisor, lsr #1
Lgot_result_modsi3:
ldr ip, [sp], #4
cmp ip, #0
rsbmi dividend, dividend, #0
RET pc, lr
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment