divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15

This sort of follows on from having helped with the divmod10() thread: divmod10() : a fast replacement for /10 and %10 (unsigned) - #34 by TCWORLD - Libraries - Arduino Forum

I have a program which does a large number of division by 3 and division by 5. The built in functions for division are fairly slow, so a faster way was needed.

The two methods below achieve the same result but much faster by making use of reciprocal multiplication.
Both of these only 30 32 clock cycles including function call/ret overhead, compared with over 200 cycles for the built in functions.

Example useage:

  unsigned int x = 300;
  x = divu5(x); //instead of: x = x / 5;
  if (x == 60){
    Serial.print("It Works :D");
  }

For unsigned integer division by 3 [32 Clocks including 'call' and 'ret']

unsigned int divu3(unsigned int n) __attribute__((noinline));
unsigned int divu3(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %B1, 0xFF \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    "cpi  %A0, 0xFF \n\t"
    "cpc  %B0, %B1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %C1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D0 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x55)
    : "r1", "r0"
  );
  return n;
}

For unsigned integer division by 5 [32 Clocks including 'call' and 'ret']

unsigned int divu5(unsigned int n) __attribute__((noinline));
unsigned int divu5(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %B1, 0xFF \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    "cpi  %A0, 0xFF \n\t"
    "cpc  %B0, %B1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %C1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x33)
    : "r1", "r0"
  );
  return n;
}

EDIT: Updated code to correct a minor glitch (correction costs 2 clock cycles).

It can be extended to others such as 60: [31 Clocks including 'call' and 'ret']

unsigned int divu60(unsigned int n) __attribute__((noinline));
unsigned int divu60(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    
    "lsr  %B0       \n\t" 
    "ror  %A0       \n\t"
    "lsr  %B0       \n\t"  // we have divided by 4, now divide by 15.
    "ror  %A0       \n\t"
    
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}

30: [29 Clocks including 'call' and 'ret']

unsigned int divu30(unsigned int n) __attribute__((noinline));
unsigned int divu30(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 15.
    "ror  %A0       \n\t"
    
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}

15: [32 Clocks including 'call' and 'ret']

unsigned int divu15(unsigned int n) __attribute__((noinline));
unsigned int divu15(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %B1, 0xFF \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    "cpi  %A0, 0xFF \n\t"
    "cpc  %B0, %B1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %C1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}

10: [29 Clocks including 'call' and 'ret']

unsigned int divu10(unsigned int n) __attribute__((noinline));
unsigned int divu10(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 5.
    "ror  %A0       \n\t"
    
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
        
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x33)
    : "r1", "r0"
  );
  return n;
}

And 6: [29 Clocks including 'call' and 'ret']

unsigned int divu6(unsigned int n) __attribute__((noinline));
unsigned int divu6(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 3.
    "ror  %A0       \n\t"
    
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x55)
    : "r1", "r0"
  );
  return n;
}

EDIT: A minor correction to each function to remove a glitch with n = 0xFFFF

Fantastic

SO,

Any way you can make them into a single lib,
so that we can all easily call them up ?

might even get them added to the 'standard' code.
or even the compiler modified to substitute directly.

but a lib for now would be great.

One library for thee.

EDIT:
Fixed a glitch with n=0xFFFF

FastDivision.zip (1.63 KB)

thank you

There was a minor glitch in all of the functions which is now correct. The fix only costs 2 clock cycles so they are still very fast. I have updated the attachment in my previous post with the new version.

I have also added divu7() for division by 7. This takes slightly longer at on average 40.5 clock cycles (numbers <=0x7FFF are 40, those > are 41).

do 'we' require to wrap this a little.

I've not done any of this, but dosn't arduino need a word file , and be put into a folder ?

something like this ?
#######################################

Syntax Coloring Map For fastdivide

#######################################

#######################################

Datatypes (KEYWORD1)

#######################################

fastdivde KEYWORD1

#######################################

Methods and Functions (KEYWORD2)

#######################################

divu3 KEYWORD2
divu5 KEYWORD2
divu30 KEYWORD2
divu60 KEYWORD2
divu15 KEYWORD2
divu10 KEYWORD2
divu6 KEYWORD2
divu7 KEYWORD2
divmod10 KEYWORD2

#######################################

Constants (LITERAL1)

#######################################

While not an absolute requirement, I suppose it can't hurt. I have added a keywords.txt file to this new version.

In the new version, a couple of the divide functions were further optimised to shave off a clock cycle or two. I have also added additional divisions, a list of which can be found in the header file. All of /1 to /15 are included (divu1() is just a #define as it is clearly a pointless division to do.).

If any more are needed, let me know and I will have a look to see if they can be done. The trouble is that I have all but run out of resolution as they are only 16bit numbers. 9 and 11 were a challenge, and 13 proved to be impossible without using additional registers which I don't want to do.

FastDivision.zip (2.36 KB)

Nice work Tom!

I have some functions prototyped some time ago. can be used as starting point but can still be optimized further.

divu24, divu12, divu3600, for clock math

uint32_t div24(uint32_t in)
{
  return div3(in) >> 3;
}

uint32_t div12(uint32_t in)
{
  return div3(in) >> 2;
}

uint32_t div86400(uint32_t in)  // seconds to days
{
  return ((in >> 1) + (in >>2) + (in>>7) + (in>>11) + (in>>12) - (in>>15)) >> 16;
}

uint32_t div3600(uint32_t in)
{
  return div60(div60(in));
}

divu1000 - for millis/micros, millimeters to meters etc

some experimental code (to be optimized)

uint32_t div100(uint32_t in)  // cm to meters
{
  return div10(div10(in));
}

uint32_t div1000(uint32_t in)
{
  return ((in>>2) + (in>>8) + ((in>>1) + (in>>5) + (in>>8) + (in>>11) + (in>>12) + (in >>14) + (in >> 15)) >> 8) >> 8 ;
}

div29 (for ultrasonic sensors) see - PING duration to cm division / 29 optimized - Libraries - Arduino Forum -

some gonio related experiments (to be optimized)

uint32_t divPI(uint32_t in)
{
  uint32_t x = (in>>2) + (in>>4) + (in>>8) + (in>>9) - (in>>15) + (in >>17); // minus is correct
  return x;
}

uint32_t mulPI(uint32_t in)
{
  uint32_t x = (in<<1) + in + (in>>3) + (in>>6) + (in>>10);
  return x;
}

uint32_t degrees2radians(uint32_t in)
{
  uint32_t x = (in>>1) + (in>>5);
  x += (( (in>>1) + (in>>2) - (in>>3) + (in >>6) ) >>5);
  return x >> 5;
}

uint32_t radians2degrees(uint32_t in)
{
  uint32_t x = (in << 5) + (in<<4) + (in<<3) + in + (in>>2) + (in>>5) + (in>>7) + (in>>8) + (in>>9) + (in >>10);
  return x;
}

my div 13 code (not tested in detail)

uint32_t div13(uint32_t in)
{
  // powers 4,  7,8,9,11,12,16,  19,20,21,23,24,28
  uint32 t = in>>7 + in>>8 + in>>9 + in>>11 + in>>12 + in>>16;
  t = t + t>>12;
  return (in>>4) + t;
}

The following attachment has the same divisions as before (along with divu24), but has been optimised to take up less flash when more than one divxx() function is being used. The modification has no impact on execution time - all functions take the same amount of clock cycles as before.

Its experimental because I have only tested it in one sketch and I am not sure if I have told the compiler everything it needs to know to be able to reproduce the same code in all situations (register choices).

FastDivision (Experimental).zip (2.18 KB)

This is all getting rather good,

is it time it was on the play ground as a project ?

@Tom
idea, why not making a divmod_N() version of them all ? OK if one wants only the div and max speed.

It takes only a few extra instructions to derive the mod value, and div and mod are often used together?

on this forum we seem to have two threads running on ways to implement very efficient division and mod operations.

http://forum.arduino.cc/index.php?topic=172635.0

http://forum.arduino.cc/index.php?topic=167414.0

I'd hate to loose these over time,
which might well happen if they just reside on the forum.

is there any posibility that these could be put together, and put on the playground or something so they are not lost .

please

is there any posibility that these could be put together, and put on the playground or something so they are not lost

Yes, we could but as long as the discussion moves fast it makes little sense to consolidate.
I will make a place holder to extreme optimization threads.

to be continued...

@drjiohnsmith
Made an entry on the playground:

Check - Arduino Playground - HomePage - ==> Snippets & Sketches ==> Perfomance Snippets

give it a try...

well done and thank you

I am sure div60() and div7() would come in very handy for timekeeping.