Go Down

Topic: divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15 (Read 3 times) previous topic - next topic

Tom Carpenter

This sort of follows on from having helped with the divmod10() thread: http://forum.arduino.cc/index.php?topic=167414.msg1280458#msg1280458

I have a program which does a large number of division by 3 and division by 5. The built in functions for division are fairly slow, so a faster way was needed.

The two methods below achieve the same result but much faster by making use of reciprocal multiplication.
Both of these only 30 32 clock cycles including function call/ret overhead, compared with over 200 cycles for the built in functions.

Example useage:
Code: [Select]

 unsigned int x = 300;
 x = divu5(x); //instead of: x = x / 5;
 if (x == 60){
   Serial.print("It Works :D");
 }


For unsigned integer division by 3      [32 Clocks including 'call' and 'ret']
Code: [Select]
unsigned int divu3(unsigned int n) __attribute__((noinline));
unsigned int divu3(unsigned int n) {
 unsigned long working;
 asm volatile(
   "ldi  %A1, %3   \n\t"
   "ldi  %B1, 0xFF \n\t"
   "ldi  %C1, 0x00 \n\t"
   "ldi  %D1, 0x00 \n\t"
   "cpi  %A0, 0xFF \n\t"
   "cpc  %B0, %B1  \n\t"
   "brlo .+4       \n\t"
   "ldi  %C1, 0x01 \n\t"  //final answer++
   "rjmp .+4       \n\t"
   "subi %A0, 0xFF \n\t"  //n++
   "sbci %B0, 0xFF \n\t"
   "mul  %A0, %A1  \n\t"  //A * X
   "mov  %B1,  r1  \n\t"
   
   "add  %B1,  r0  \n\t"  //A* X'
   "adc  %C1,  r1  \n\t"
   
   "mul  %B0, %A1  \n\t"
   "add  %B1,  r0  \n\t"  //A* X'
   "adc  %C1,  r1  \n\t"
   "adc  %D1, %D1  \n\t"  //D0 is known 0, we need to grab the carry
   
   "add  %C1,  r0  \n\t"  //A* X'
   "adc  %D1,  r1  \n\t"
   
   "movw %A0, %C1  \n\t"  // >> 16
   
   "eor   r1,  r1  \n\t"
   
   : "=r" (n), "=r" (working)
   : "0" (n), "M" (0x55)
   : "r1", "r0"
 );
 return n;
}


For unsigned integer division by 5      [32 Clocks including 'call' and 'ret']
Code: [Select]

unsigned int divu5(unsigned int n) __attribute__((noinline));
unsigned int divu5(unsigned int n) {
 unsigned long working;
 asm volatile(
   "ldi  %A1, %3   \n\t"
   "ldi  %B1, 0xFF \n\t"
   "ldi  %C1, 0x00 \n\t"
   "ldi  %D1, 0x00 \n\t"
   "cpi  %A0, 0xFF \n\t"
   "cpc  %B0, %B1  \n\t"
   "brlo .+4       \n\t"
   "ldi  %C1, 0x01 \n\t"  //final answer++
   "rjmp .+4       \n\t"
   "subi %A0, 0xFF \n\t"  //n++
   "sbci %B0, 0xFF \n\t"
   "mul  %A0, %A1  \n\t"  //A * X
   "mov  %B1,  r1  \n\t"
   
   "add  %B1,  r0  \n\t"  //A* X'
   "adc  %C1,  r1  \n\t"
   
   "mul  %B0, %A1  \n\t"
   "add  %B1,  r0  \n\t"  //A* X'
   "adc  %C1,  r1  \n\t"
   "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
   "add  %C1,  r0  \n\t"  //A* X'
   "adc  %D1,  r1  \n\t"
   
   "movw %A0, %C1  \n\t"  // >> 16
   
   "eor   r1,  r1  \n\t"
   
   : "=r" (n), "=r" (working)
   : "0" (n), "M" (0x33)
   : "r1", "r0"
 );
 return n;
}


EDIT: Updated code to correct a minor glitch (correction costs 2 clock cycles).
~Tom~

Tom Carpenter

#1
Jun 18, 2013, 12:17 am Last Edit: Jun 18, 2013, 08:29 pm by Tom Carpenter Reason: 1
It can be extended to others such as 60:      [31 Clocks including 'call' and 'ret']
Code: [Select]
unsigned int divu60(unsigned int n) __attribute__((noinline));
unsigned int divu60(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"
    "ror  %A0       \n\t"
    "lsr  %B0       \n\t"  // we have divided by 4, now divide by 15.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}


30:      [29 Clocks including 'call' and 'ret']
Code: [Select]
unsigned int divu30(unsigned int n) __attribute__((noinline));
unsigned int divu30(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 15.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}


15:      [32 Clocks including 'call' and 'ret']
Code: [Select]
unsigned int divu15(unsigned int n) __attribute__((noinline));
unsigned int divu15(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %B1, 0xFF \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    "cpi  %A0, 0xFF \n\t"
    "cpc  %B0, %B1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %C1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}


10:      [29 Clocks including 'call' and 'ret']
Code: [Select]
unsigned int divu10(unsigned int n) __attribute__((noinline));
unsigned int divu10(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 5.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
   
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
       
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x33)
    : "r1", "r0"
  );
  return n;
}


And 6:      [29 Clocks including 'call' and 'ret']
Code: [Select]
unsigned int divu6(unsigned int n) __attribute__((noinline));
unsigned int divu6(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 3.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x55)
    : "r1", "r0"
  );
  return n;
}


EDIT: A minor correction to each function to remove a glitch with n = 0xFFFF
~Tom~

drjiohnsmith

Fantastic

SO,

Any way you can make them into a single lib,
   so that we can all easily call them up ?

might even get them added to the 'standard' code.
   or even the compiler modified to substitute directly.

but a lib for now would be great.

Tom Carpenter

One library for thee.

EDIT:
Fixed a glitch with n=0xFFFF
~Tom~


Tom Carpenter

There was a minor glitch in all of the functions which is now correct. The fix only costs 2 clock cycles so they are still very fast. I have updated the attachment in my previous post with the new version.

I have also added divu7() for division by 7. This takes slightly longer at on average 40.5 clock cycles (numbers <=0x7FFF are 40, those > are 41).
~Tom~

drjiohnsmith

do 'we' require to wrap this a little.

I've not done any of this, but dosn't arduino need a word file , and be put into a folder ?

something like this ?
#######################################
# Syntax Coloring Map For fastdivide
#######################################

#######################################
# Datatypes (KEYWORD1)
#######################################

fastdivde KEYWORD1

#######################################
# Methods and Functions (KEYWORD2)
#######################################

divu3   KEYWORD2
divu5   KEYWORD2
divu30   KEYWORD2
divu60   KEYWORD2
divu15   KEYWORD2
divu10   KEYWORD2
divu6   KEYWORD2
divu7   KEYWORD2
divmod10   KEYWORD2


#######################################
# Constants (LITERAL1)
#######################################

Tom Carpenter

While not an absolute requirement, I suppose it can't hurt. I have added a keywords.txt file to this new version.


In the new version, a couple of the divide functions were further optimised to shave off a clock cycle or two. I have also added additional divisions, a list of which can be found in the header file. All of /1 to /15 are included (divu1() is just a #define as it is clearly a pointless division to do.).


If any more are needed, let me know and I will have a look to see if they can be done. The trouble is that I have all but run out of resolution as they are only 16bit numbers. 9 and 11 were a challenge, and 13 proved to be impossible without using additional registers which I don't want to do.
~Tom~

robtillaart

Nice work Tom!

I have some functions prototyped some time ago. can be used as starting point but can still be optimized further.

divu24, divu12, divu3600, for clock math

Code: [Select]

uint32_t div24(uint32_t in)
{
  return div3(in) >> 3;
}

uint32_t div12(uint32_t in)
{
  return div3(in) >> 2;
}

uint32_t div86400(uint32_t in)  // seconds to days
{
  return ((in >> 1) + (in >>2) + (in>>7) + (in>>11) + (in>>12) - (in>>15)) >> 16;
}

uint32_t div3600(uint32_t in)
{
  return div60(div60(in));
}



divu1000 - for millis/micros, millimeters to meters etc

some experimental code (to be optimized)
Code: [Select]

uint32_t div100(uint32_t in)  // cm to meters
{
  return div10(div10(in));
}

uint32_t div1000(uint32_t in)
{
  return ((in>>2) + (in>>8) + ((in>>1) + (in>>5) + (in>>8) + (in>>11) + (in>>12) + (in >>14) + (in >> 15)) >> 8) >> 8 ;
}



div29 (for ultrasonic sensors) see - http://forum.arduino.cc/index.php?topic=165860.0 -

some gonio related experiments (to be optimized)
Code: [Select]

uint32_t divPI(uint32_t in)
{
  uint32_t x = (in>>2) + (in>>4) + (in>>8) + (in>>9) - (in>>15) + (in >>17); // minus is correct
  return x;
}

uint32_t mulPI(uint32_t in)
{
  uint32_t x = (in<<1) + in + (in>>3) + (in>>6) + (in>>10);
  return x;
}

uint32_t degrees2radians(uint32_t in)
{
  uint32_t x = (in>>1) + (in>>5);
  x += (( (in>>1) + (in>>2) - (in>>3) + (in >>6) ) >>5);
  return x >> 5;
}

uint32_t radians2degrees(uint32_t in)
{
  uint32_t x = (in << 5) + (in<<4) + (in<<3) + in + (in>>2) + (in>>5) + (in>>7) + (in>>8) + (in>>9) + (in >>10);
  return x;
}


my div 13 code (not tested in detail)
Code: [Select]

uint32_t div13(uint32_t in)
{
  // powers 4,  7,8,9,11,12,16,  19,20,21,23,24,28
  uint32 t = in>>7 + in>>8 + in>>9 + in>>11 + in>>12 + in>>16;
  t = t + t>>12;
  return (in>>4) + t;
}


Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Tom Carpenter

The following attachment has the same divisions as before (along with divu24), but has been optimised to take up less flash when more than one divxx() function is being used. The modification has no impact on execution time - all functions take the same amount of clock cycles as before.

Its experimental because I have only tested it in one sketch and I am not sure if I have told the compiler everything it needs to know to be able to reproduce the same code in all situations (register choices).
~Tom~

drjiohnsmith

This is all getting rather good,

is it time it was on the play ground  as a project ?

robtillaart

@Tom
idea, why not making a divmod_N() version of them all ?  OK if one wants only the div and max speed.

It takes only a few extra instructions to derive the mod value, and div and mod are often used together?

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

drjiohnsmith

on this forum we seem to have two threads running on ways to implement very efficient division and mod operations.

http://forum.arduino.cc/index.php?topic=172635.0

http://forum.arduino.cc/index.php?topic=167414.0

I'd hate to loose these over time,
  which might well happen if they just reside on the forum.

is there any posibility that these could be put together, and put on the playground or something so they are not lost .

please

robtillaart

Quote
is there any posibility that these could be put together, and put on the playground or something so they are not lost

Yes, we could but as long as the discussion moves fast it makes little sense to consolidate.
I will make a place holder to extreme optimization threads.

to be continued...
Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

robtillaart

@drjiohnsmith
Made an entry on the playground:

Check - http://playground.arduino.cc//Main/GeneralCodeLibrary - ==> Snippets & Sketches ==> Perfomance Snippets

give it a try...
Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Go Up