Pages: [1] 2   Go Down
Author Topic: divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15  (Read 3073 times)
0 Members and 1 Guest are viewing this topic.
Leeds, UK
Online Online
Edison Member
*
Karma: 78
Posts: 1719
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

This sort of follows on from having helped with the divmod10() thread: http://forum.arduino.cc/index.php?topic=167414.msg1280458#msg1280458

I have a program which does a large number of division by 3 and division by 5. The built in functions for division are fairly slow, so a faster way was needed.

The two methods below achieve the same result but much faster by making use of reciprocal multiplication.
Both of these only 30 32 clock cycles including function call/ret overhead, compared with over 200 cycles for the built in functions.

Example useage:
Code:
 unsigned int x = 300;
  x = divu5(x); //instead of: x = x / 5;
  if (x == 60){
    Serial.print("It Works :D");
  }

For unsigned integer division by 3      [32 Clocks including 'call' and 'ret']
Code:
unsigned int divu3(unsigned int n) __attribute__((noinline));
unsigned int divu3(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %B1, 0xFF \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    "cpi  %A0, 0xFF \n\t"
    "cpc  %B0, %B1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %C1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D0 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x55)
    : "r1", "r0"
  );
  return n;
}

For unsigned integer division by 5      [32 Clocks including 'call' and 'ret']
Code:
unsigned int divu5(unsigned int n) __attribute__((noinline));
unsigned int divu5(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %B1, 0xFF \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    "cpi  %A0, 0xFF \n\t"
    "cpc  %B0, %B1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %C1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
    
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
    
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
    
    "movw %A0, %C1  \n\t"  // >> 16
    
    "eor   r1,  r1  \n\t"
    
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x33)
    : "r1", "r0"
  );
  return n;
}

EDIT: Updated code to correct a minor glitch (correction costs 2 clock cycles).
« Last Edit: June 18, 2013, 01:26:56 pm by Tom Carpenter » Logged

~Tom~

Leeds, UK
Online Online
Edison Member
*
Karma: 78
Posts: 1719
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

It can be extended to others such as 60:      [31 Clocks including 'call' and 'ret']
Code:
unsigned int divu60(unsigned int n) __attribute__((noinline));
unsigned int divu60(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"
    "ror  %A0       \n\t"
    "lsr  %B0       \n\t"  // we have divided by 4, now divide by 15.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}

30:      [29 Clocks including 'call' and 'ret']
Code:
unsigned int divu30(unsigned int n) __attribute__((noinline));
unsigned int divu30(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 15.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}

15:      [32 Clocks including 'call' and 'ret']
Code:
unsigned int divu15(unsigned int n) __attribute__((noinline));
unsigned int divu15(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %B1, 0xFF \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
    "cpi  %A0, 0xFF \n\t"
    "cpc  %B0, %B1  \n\t"
    "brlo .+4       \n\t"
    "ldi  %C1, 0x01 \n\t"  //final answer++
    "rjmp .+4       \n\t"
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x11)
    : "r1", "r0"
  );
  return n;
}

10:      [29 Clocks including 'call' and 'ret']
Code:
unsigned int divu10(unsigned int n) __attribute__((noinline));
unsigned int divu10(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 5.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
   
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
       
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x33)
    : "r1", "r0"
  );
  return n;
}

And 6:      [29 Clocks including 'call' and 'ret']
Code:
unsigned int divu6(unsigned int n) __attribute__((noinline));
unsigned int divu6(unsigned int n) {
  unsigned long working;
  asm volatile(
    "ldi  %A1, %3   \n\t"
    "ldi  %C1, 0x00 \n\t"
    "ldi  %D1, 0x00 \n\t"
   
    "lsr  %B0       \n\t"  // we have divided by 2, now divide by 3.
    "ror  %A0       \n\t"
   
    "subi %A0, 0xFF \n\t"  //n++
    "sbci %B0, 0xFF \n\t"
    "mul  %A0, %A1  \n\t"  //A * X
    "mov  %B1,  r1  \n\t"
   
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
   
    "mul  %B0, %A1  \n\t"
    "add  %B1,  r0  \n\t"  //A* X'
    "adc  %C1,  r1  \n\t"
    "adc  %D1, %D1  \n\t"  //D1 is known 0, we need to grab the carry
   
    "add  %C1,  r0  \n\t"  //A* X'
    "adc  %D1,  r1  \n\t"
   
    "movw %A0, %C1  \n\t"  // >> 16
   
    "eor   r1,  r1  \n\t"
   
    : "=r" (n), "=r" (working)
    : "0" (n), "M" (0x55)
    : "r1", "r0"
  );
  return n;
}

EDIT: A minor correction to each function to remove a glitch with n = 0xFFFF
« Last Edit: June 18, 2013, 01:29:22 pm by Tom Carpenter » Logged

~Tom~

Offline Offline
Sr. Member
****
Karma: 4
Posts: 327
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

Fantastic

SO,

Any way you can make them into a single lib,
   so that we can all easily call them up ?

might even get them added to the 'standard' code.
   or even the compiler modified to substitute directly.

but a lib for now would be great.
Logged

Leeds, UK
Online Online
Edison Member
*
Karma: 78
Posts: 1719
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

One library for thee.

EDIT:
Fixed a glitch with n=0xFFFF

* FastDivision.zip (1.63 KB - downloaded 28 times.)
« Last Edit: June 18, 2013, 01:36:08 pm by Tom Carpenter » Logged

~Tom~

Offline Offline
Sr. Member
****
Karma: 4
Posts: 327
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

thank you

Logged

Leeds, UK
Online Online
Edison Member
*
Karma: 78
Posts: 1719
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

There was a minor glitch in all of the functions which is now correct. The fix only costs 2 clock cycles so they are still very fast. I have updated the attachment in my previous post with the new version.

I have also added divu7() for division by 7. This takes slightly longer at on average 40.5 clock cycles (numbers <=0x7FFF are 40, those > are 41).
Logged

~Tom~

Offline Offline
Sr. Member
****
Karma: 4
Posts: 327
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

do 'we' require to wrap this a little.

I've not done any of this, but dosn't arduino need a word file , and be put into a folder ?

something like this ?
#######################################
# Syntax Coloring Map For fastdivide
#######################################

#######################################
# Datatypes (KEYWORD1)
#######################################

fastdivde KEYWORD1

#######################################
# Methods and Functions (KEYWORD2)
#######################################

divu3   KEYWORD2
divu5   KEYWORD2
divu30   KEYWORD2
divu60   KEYWORD2
divu15   KEYWORD2
divu10   KEYWORD2
divu6   KEYWORD2
divu7   KEYWORD2
divmod10   KEYWORD2


#######################################
# Constants (LITERAL1)
#######################################
Logged

Leeds, UK
Online Online
Edison Member
*
Karma: 78
Posts: 1719
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

While not an absolute requirement, I suppose it can't hurt. I have added a keywords.txt file to this new version.


In the new version, a couple of the divide functions were further optimised to shave off a clock cycle or two. I have also added additional divisions, a list of which can be found in the header file. All of /1 to /15 are included (divu1() is just a #define as it is clearly a pointless division to do.).


If any more are needed, let me know and I will have a look to see if they can be done. The trouble is that I have all but run out of resolution as they are only 16bit numbers. 9 and 11 were a challenge, and 13 proved to be impossible without using additional registers which I don't want to do.

* FastDivision.zip (2.36 KB - downloaded 30 times.)
« Last Edit: June 18, 2013, 04:04:19 pm by Tom Carpenter » Logged

~Tom~

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 211
Posts: 13478
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

Nice work Tom!

I have some functions prototyped some time ago. can be used as starting point but can still be optimized further.

divu24, divu12, divu3600, for clock math

Code:
uint32_t div24(uint32_t in)
{
  return div3(in) >> 3;
}

uint32_t div12(uint32_t in)
{
  return div3(in) >> 2;
}

uint32_t div86400(uint32_t in)  // seconds to days
{
  return ((in >> 1) + (in >>2) + (in>>7) + (in>>11) + (in>>12) - (in>>15)) >> 16;
}

uint32_t div3600(uint32_t in)
{
  return div60(div60(in));
}


divu1000 - for millis/micros, millimeters to meters etc

some experimental code (to be optimized)
Code:
uint32_t div100(uint32_t in)  // cm to meters
{
  return div10(div10(in));
}

uint32_t div1000(uint32_t in)
{
  return ((in>>2) + (in>>8) + ((in>>1) + (in>>5) + (in>>8) + (in>>11) + (in>>12) + (in >>14) + (in >> 15)) >> 8) >> 8 ;
}


 div29 (for ultrasonic sensors) see - http://forum.arduino.cc/index.php?topic=165860.0 -

some gonio related experiments (to be optimized)
Code:
uint32_t divPI(uint32_t in)
{
  uint32_t x = (in>>2) + (in>>4) + (in>>8) + (in>>9) - (in>>15) + (in >>17); // minus is correct
  return x;
}

uint32_t mulPI(uint32_t in)
{
  uint32_t x = (in<<1) + in + (in>>3) + (in>>6) + (in>>10);
  return x;
}

uint32_t degrees2radians(uint32_t in)
{
  uint32_t x = (in>>1) + (in>>5);
  x += (( (in>>1) + (in>>2) - (in>>3) + (in >>6) ) >>5);
  return x >> 5;
}

uint32_t radians2degrees(uint32_t in)
{
  uint32_t x = (in << 5) + (in<<4) + (in<<3) + in + (in>>2) + (in>>5) + (in>>7) + (in>>8) + (in>>9) + (in >>10);
  return x;
}

my div 13 code (not tested in detail)
Code:
uint32_t div13(uint32_t in)
{
  // powers 4,  7,8,9,11,12,16,  19,20,21,23,24,28
  uint32 t = in>>7 + in>>8 + in>>9 + in>>11 + in>>12 + in>>16;
  t = t + t>>12;
  return (in>>4) + t;
}

Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Leeds, UK
Online Online
Edison Member
*
Karma: 78
Posts: 1719
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

The following attachment has the same divisions as before (along with divu24), but has been optimised to take up less flash when more than one divxx() function is being used. The modification has no impact on execution time - all functions take the same amount of clock cycles as before.

Its experimental because I have only tested it in one sketch and I am not sure if I have told the compiler everything it needs to know to be able to reproduce the same code in all situations (register choices).

* FastDivision (Experimental).zip (2.18 KB - downloaded 34 times.)
Logged

~Tom~

Offline Offline
Sr. Member
****
Karma: 4
Posts: 327
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

This is all getting rather good,

is it time it was on the play ground  as a project ?
Logged

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 211
Posts: 13478
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

@Tom
idea, why not making a divmod_N() version of them all ?  OK if one wants only the div and max speed.

It takes only a few extra instructions to derive the mod value, and div and mod are often used together?

Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Offline Offline
Sr. Member
****
Karma: 4
Posts: 327
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

on this forum we seem to have two threads running on ways to implement very efficient division and mod operations.

http://forum.arduino.cc/index.php?topic=172635.0

http://forum.arduino.cc/index.php?topic=167414.0

I'd hate to loose these over time,
   which might well happen if they just reside on the forum.

is there any posibility that these could be put together, and put on the playground or something so they are not lost .

please
« Last Edit: June 24, 2013, 04:52:14 am by drjiohnsmith » Logged

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 211
Posts: 13478
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

Quote
is there any posibility that these could be put together, and put on the playground or something so they are not lost
Yes, we could but as long as the discussion moves fast it makes little sense to consolidate.
I will make a place holder to extreme optimization threads.

to be continued...
« Last Edit: June 24, 2013, 01:09:45 pm by robtillaart » Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 211
Posts: 13478
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

@drjiohnsmith
Made an entry on the playground:

Check - http://playground.arduino.cc//Main/GeneralCodeLibrary - ==> Snippets & Sketches ==> Perfomance Snippets

give it a try...
« Last Edit: June 24, 2013, 01:09:25 pm by robtillaart » Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Pages: [1] 2   Go Up
Jump to: