This sort of follows on from having helped with the divmod10() thread: divmod10() : a fast replacement for /10 and %10 (unsigned) - #34 by TCWORLD - Libraries - Arduino Forum
I have a program which does a large number of division by 3 and division by 5. The built in functions for division are fairly slow, so a faster way was needed.
The two methods below achieve the same result but much faster by making use of reciprocal multiplication.
Both of these only 30 32 clock cycles including function call/ret overhead, compared with over 200 cycles for the built in functions.
Example useage:
unsigned int x = 300;
x = divu5(x); //instead of: x = x / 5;
if (x == 60){
Serial.print("It Works :D");
}
For unsigned integer division by 3 [32 Clocks including 'call' and 'ret']
unsigned int divu3(unsigned int n) __attribute__((noinline));
unsigned int divu3(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %A1, %3 \n\t"
"ldi %B1, 0xFF \n\t"
"ldi %C1, 0x00 \n\t"
"ldi %D1, 0x00 \n\t"
"cpi %A0, 0xFF \n\t"
"cpc %B0, %B1 \n\t"
"brlo .+4 \n\t"
"ldi %C1, 0x01 \n\t" //final answer++
"rjmp .+4 \n\t"
"subi %A0, 0xFF \n\t" //n++
"sbci %B0, 0xFF \n\t"
"mul %A0, %A1 \n\t" //A * X
"mov %B1, r1 \n\t"
"add %B1, r0 \n\t" //A* X'
"adc %C1, r1 \n\t"
"mul %B0, %A1 \n\t"
"add %B1, r0 \n\t" //A* X'
"adc %C1, r1 \n\t"
"adc %D1, %D1 \n\t" //D0 is known 0, we need to grab the carry
"add %C1, r0 \n\t" //A* X'
"adc %D1, r1 \n\t"
"movw %A0, %C1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "M" (0x55)
: "r1", "r0"
);
return n;
}
For unsigned integer division by 5 [32 Clocks including 'call' and 'ret']
unsigned int divu5(unsigned int n) __attribute__((noinline));
unsigned int divu5(unsigned int n) {
unsigned long working;
asm volatile(
"ldi %A1, %3 \n\t"
"ldi %B1, 0xFF \n\t"
"ldi %C1, 0x00 \n\t"
"ldi %D1, 0x00 \n\t"
"cpi %A0, 0xFF \n\t"
"cpc %B0, %B1 \n\t"
"brlo .+4 \n\t"
"ldi %C1, 0x01 \n\t" //final answer++
"rjmp .+4 \n\t"
"subi %A0, 0xFF \n\t" //n++
"sbci %B0, 0xFF \n\t"
"mul %A0, %A1 \n\t" //A * X
"mov %B1, r1 \n\t"
"add %B1, r0 \n\t" //A* X'
"adc %C1, r1 \n\t"
"mul %B0, %A1 \n\t"
"add %B1, r0 \n\t" //A* X'
"adc %C1, r1 \n\t"
"adc %D1, %D1 \n\t" //D1 is known 0, we need to grab the carry
"add %C1, r0 \n\t" //A* X'
"adc %D1, r1 \n\t"
"movw %A0, %C1 \n\t" // >> 16
"eor r1, r1 \n\t"
: "=r" (n), "=r" (working)
: "0" (n), "M" (0x33)
: "r1", "r0"
);
return n;
}
EDIT: Updated code to correct a minor glitch (correction costs 2 clock cycles).