Pages: 1 2 [3] 4 5 ... 8   Go Down
Author Topic: divmod10() : a fast replacement for /10 and %10 (unsigned)  (Read 20003 times)
0 Members and 1 Guest are viewing this topic.
Leeds, UK
Offline Offline
Edison Member
*
Karma: 80
Posts: 1726
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

There is one more MASSIVE optimisation which can be acheived, but I can't make gcc do it.

The solution... write the entire function manually in assembler.

Code:
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod) __attribute__((noinline));
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod)
{
  //assumes that div/mod pointers arrive in r18:r19 and r20:r21 pairs (doesn't matter which way around)
  //and that in arrives in r22:r25 quad
   asm volatile(
    "movw r30, %2   \n\t"  //uint32_t* divPtr = ÷
    "movw r26, %1    \n\t" //uint32_t* modPtr = &mod;
    
    "mov   r0, %A0  \n\t"  //byte temp = in
    "movw r18, %A0  \n\t"  //uint32_t q = in;
    "movw r20, %C0  \n\t"  
    "ori  r18, 0x01 \n\t"  //q |= 1;
    
    "lsr  r25       \n\t"  //x = in >> 2 //note: x reuses registers of 'in', as 'in' was backed up in r0
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"  
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    
    "sub  r18, r22  \n\t" //q = q - x;
    "sbc  r19, r23  \n\t"
    "sbc  r20, r24  \n\t"
    "sbc  r21, r25  \n\t"
    
    "movw r22, r18  \n\t" //x = q;
    "movw r24, r20  \n\t"
    "lsr  r25       \n\t" //x = x >> 4;
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    
    "add  r22, r18  \n\t" //x = x + q
    "adc  r23, r19  \n\t"
    "adc  r24, r20  \n\t"
    "adc  r25, r21  \n\t"
   
    "movw r18, r22  \n\t" //q = x
    "movw r20, r24  \n\t"
    "add  r18, r23  \n\t" //q = q + (x >> 8)
    "adc  r19, r24  \n\t"
    "adc  r20, r25  \n\t"
    "adc  r21, r1   \n\t"

    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"

    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"
    
    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"
    
    "andi r18, 0xF8 \n\t" //q = q & ~0x7
    
    "sub   r0, r18  \n\t" //in = in - q
    
    "lsr  r21       \n\t" //q = q >> 2
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    "lsr  r21       \n\t"
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    
    "sub  r0, r18   \n\t" //in = in - q
    "st    X, r0    \n\t" //mod = in;
    
    "lsr  r21       \n\t" //q = q >> 1
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    
    "st    Z, r18  \n\t" //div = q
    "std  Z+1, r19  \n\t"
    "std  Z+2, r20  \n\t"
    "std  Z+3, r21  \n\t"
    
    :
    : "r" (in), "r" (&mod), "r" (&div)
    : "r0", "r26", "r27", "r31", "r31"
  );  
}

And the results speak for themselves:
Quote

i/10   38.8360
i%10   38.7400
--------------
divmod10   7.8800
--------------
test div 0..65535
test mod 0..65535
test div 0..maxlong (20K random samples)
test mod 0..maxlong (20K random samples)
done
« Last Edit: June 15, 2013, 10:11:33 am by Tom Carpenter » Logged

~Tom~

Global Moderator
Dallas
Offline Offline
Shannon Member
*****
Karma: 208
Posts: 12936
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

Quote
divmod10   7.8800

Including the call and ret it takes 101 clock cycles to execute the code which comes out to 101 cycles / 16 cycles per microsecond = 6.3125 microseconds.  The measurement code needs some work.  Excellent job on the assembly!

Code:
call 2
movw r30, 2% 1
movw r26, 1% 1
mov r0, %A0 1
movw r18, %A0 1
movw r20, %C0 1
ori r18, 0x01 1
lsr r25 1
ror r24 1
ror r23 1
ror r22 1
lsr r25 1
ror r24 1
ror r23 1
ror r22 1
sub r18, r22 1
sbc r19, r23 1
sbc r20, r24 1
sbc r21, r25 1
movw r22, r18 1
movw r24, r20 1
lsr r25 1
ror r24 1
ror r23 1
ror r22 1
lsr r25 1
ror r24 1
ror r23 1
ror r22 1
lsr r25 1
ror r24 1
ror r23 1
ror r22 1
lsr r25 1
ror r24 1
ror r23 1
ror r22 1
add r22, r18 1
adc r23, r19 1
adc r24, r20 1
adc r25, r21 1
movw r18, r22 1
movw r20, r24 1
add r18, r23 1
adc r19, r24 1
adc r20, r25 1
adc r21, r1 1
mov r18, r19 1
mov r19, r20 1
mov r20, r21 1
eor r21, r21 1
add r18, r22 1
adc r19, r23 1
adc r20, r24 1
adc r21, r25 1
mov r18, r19 1
mov r19, r20 1
mov r20, r21 1
eor r21, r21 1
add r18, r22 1
adc r19, r23 1
adc r20, r24 1
adc r21, r25 1
mov r18, r19 1
mov r19, r20 1
mov r20, r21 1
eor r21, r21 1
add r18, r22 1
adc r19, r23 1
adc r20, r24 1
adc r21, r25 1
andi r18, 0xF8 1
sub r0, r18 1
lsr r21 1
ror r20 1
ror r19 1
ror r18 1
lsr r21 1
ror r20 1
ror r19 1
ror r18 1
sub r0, r18 1
st X, r0 2
lsr r21 1
ror r20 1
ror r19 1
ror r18 1
st Z, r18 2
std Z+1, r19 2
std Z+2, r20 2
std Z+3, r21 2
ret 4
Logged

Offline Offline
Newbie
*
Karma: 2
Posts: 12
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

I wonder why Tom Carpenter didn't took the chance to implement my suggested algorithmic improvement given in reply #28?

So, I coded it myself into the given assembler template and indeed found the expected performance gain. (I don't want to bother you with this solution because I found an even better one, see below.)

Meanwhile I found that:
Code:
q= (q>>8) + x;
q= (q>>16) + (x>>8) + x;
q= (q>>8) + x;
is a valid substitute for this:
Code:
q= (q>>8) + x;
q= (q>>8) + x;
q= (q>>8) + x;
q= (q>>8) + x;
as well.

This last substitute offers even more optimization choices at assembler level and I (temporary) end up with this:
Code:
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod) __attribute__((noinline));
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod)
{
  //assumes that div/mod pointers arrive in r18:r19 and r20:r21 pairs (doesn't matter which way around)
  //and that in arrives in r22:r25 quad
   asm volatile(
    "movw r30, %2   \n\t"  //uint32_t* divPtr = ÷
    "movw r26, %1    \n\t" //uint32_t* modPtr = &mod;

    "mov   r0, %A0  \n\t"  //byte temp = in
    "movw r18, %A0  \n\t"  //uint32_t q = in;
    "movw r20, %C0  \n\t"
    "ori  r18, 0x01 \n\t"  //q |= 1;

    "lsr  r25       \n\t"  //x = in >> 2 //note: x reuses registers of 'in', as 'in' was backed up in r0
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"

    "sub  r18, r22  \n\t" //q = q - x;
    "sbc  r19, r23  \n\t"
    "sbc  r20, r24  \n\t"
    "sbc  r21, r25  \n\t"

    "movw r22, r18  \n\t" //x = q;
    "movw r24, r20  \n\t"
    "lsr  r25       \n\t" //x = x >> 4;
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"

    "add  r22, r18  \n\t" //x = x + q
    "adc  r23, r19  \n\t"
    "adc  r24, r20  \n\t"
    "adc  r25, r21  \n\t"

    //q= x + (x>>8);
    "movw r18, r22  \n\t" //q = x
    "movw r20, r24  \n\t"
    "add  r18, r23  \n\t" //q = q + (x >> 8)
    "adc  r19, r24  \n\t"
    "adc  r20, r25  \n\t"
    "adc  r21, r1   \n\t"

   //q= (q>>16) + (x>>8) + x;
   "movw r18, r20  \n\t" //q= (q>>16);
   "clr  r20   \n\t"
   "clr  r21   \n\t"
   "add  r18, r23  \n\t" //q += (x >> 8)
   "adc  r19, r24  \n\t"
   "adc  r20, r25  \n\t"
   "adc  r21, r21  \n\t" // we need the carry only
   "add  r18, r22  \n\t" //q += x
   "adc  r19, r23  \n\t"
   "adc  r20, r24  \n\t"
   "adc  r21, r25  \n\t"


    //q= (q>>8) + x;
    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "clr  r21       \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"

    "andi r18, 0xF8 \n\t" //q = q & ~0x7

    "sub   r0, r18  \n\t" //in = in - q

    "lsr  r21       \n\t" //q = q >> 2
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    "lsr  r21       \n\t"
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"

    "sub  r0, r18   \n\t" //in = in - q
    "st    X, r0    \n\t" //mod = in;

    "lsr  r21       \n\t" //q = q >> 1
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"

    "st    Z, r18  \n\t" //div = q
    "std  Z+1, r19  \n\t"
    "std  Z+2, r20  \n\t"
    "std  Z+3, r21  \n\t"

    :
    : "r" (in), "r" (&mod), "r" (&div)
    : "r0", "r26", "r27", "r31", "r31"
  );
}
With another slice of ~4% off.


« Last Edit: June 16, 2013, 09:17:23 am by genom2 » Logged

Leeds, UK
Offline Offline
Edison Member
*
Karma: 80
Posts: 1726
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

I wonder why Tom Carpenter didn't took the chance to implement my suggested algorithmic improvement given in reply #28?

See reply #29 for why i didn't. I thought that what you suggested in reply #28 was not a valid substitute. But actually having tested further, the discrepancy I found seems to self correct.

Your second seems a better optimisation. It saves 5 clock cycles.

Tidied up a bit:
Code:
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod) __attribute__((noinline));
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod)
{
  //assumes that div/mod pointers arrive in r18:r19 and r20:r21 pairs (doesn't matter which way around)
  //and that in arrives in r22:r25 quad
   asm volatile(
    "movw r30, %2   \n\t"  //uint32_t* divPtr = ÷
    "movw r26, %1    \n\t" //uint32_t* modPtr = &mod;
    
    "mov   r0, %A0  \n\t"  //byte temp = in
    "movw r18, %A0  \n\t"  //uint32_t q = in;
    "movw r20, %C0  \n\t"  
    "ori  r18, 0x01 \n\t"  //q |= 1;
    
    "lsr  r25       \n\t"  //x = in >> 2 //note: x reuses registers of 'in', as 'in' was backed up in r0
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"  
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    
    "sub  r18, r22  \n\t" //q = q - x;
    "sbc  r19, r23  \n\t"
    "sbc  r20, r24  \n\t"
    "sbc  r21, r25  \n\t"
    
    "movw r22, r18  \n\t" //x = q;
    "movw r24, r20  \n\t"
    "lsr  r25       \n\t" //x = x >> 4;
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    
    "add  r22, r18  \n\t" //x = x + q
    "adc  r23, r19  \n\t"
    "adc  r24, r20  \n\t"
    "adc  r25, r21  \n\t"
    
    "movw r18, r22  \n\t" //q = x
    "movw r20, r24  \n\t"
    "add  r18, r23  \n\t" //q = q + (x >> 8)
    "adc  r19, r24  \n\t"
    "adc  r20, r25  \n\t"
    "adc  r21, r1   \n\t"

    "movw r18, r20  \n\t" //q = q >> 16
    "eor  r20, r20  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r23  \n\t" //q = q + (x>>8)
    "adc  r19, r24  \n\t"
    "adc  r20, r25  \n\t"
    "adc  r21, r1   \n\t" //NOTE: r1 is a known 0.
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"
    
    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"
    
    "andi r18, 0xF8 \n\t" //q = q & ~0x7
    
    "sub   r0, r18  \n\t" //in = in - q
    
    "lsr  r21       \n\t" //q = q >> 2
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    "lsr  r21       \n\t"
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    
    "sub  r0, r18   \n\t" //in = in - q
    "st    X, r0    \n\t" //mod = in;
    
    "lsr  r21       \n\t" //q = q >> 1
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    
    "st    Z, r18  \n\t" //div = q
    "std  Z+1, r19  \n\t"
    "std  Z+2, r20  \n\t"
    "std  Z+3, r21  \n\t"
    
    :
    : "r" (in), "r" (&mod), "r" (&div)
    : "r0", "r26", "r27", "r31", "r31"
  );
}
« Last Edit: June 16, 2013, 10:47:54 am by Tom Carpenter » Logged

~Tom~

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 217
Posts: 13739
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

in one word - respect!

It becomes almost an assembler course! - I can read it but almost no experience writing avr asm.

@Tom & genom2
A question arises:  is porting the fastest C to asm automatically the fastest asm version?
I mean, the previous C versions that had less statements, can they be more efficient when hand-coded to asm?


Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Leeds, UK
Offline Offline
Edison Member
*
Karma: 80
Posts: 1726
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

@Tom & genom2
A question arises:  is porting the fastest C to asm automatically the fastest asm version?
I mean, the previous C versions that had less statements, can they be more efficient when hand-coded to asm?

Most things can be optimised by manually writing the assembly, at least in part.

A lot of the asm that gcc generated for the C versions of divmod10 was fine and very well optimised, in fact much I've just used. The trouble is that there are optimisations get missed, partly because to see them requires a bit of creativity.
The optimiser is very good, there are some times I look at what it comes up with and think, "wow... that was clever!". But there are other times I think, "why don't you just do this, it would be far quicker".

Really it is about trying to write the C in just the right order and manner that the compiler can optimise well. That is something which takes practice, patience and observing how different code ordering affects the compiler output.  

Most of the time now I am using a lot of inline assembler in time critical areas, such as this interrupt routine:
Code:
ISR(TIMER3_CAPT_vect) {
  byte timeSegment = distributionSegment(DC);
  byte index = (distributionWidth[DC] << 1) & timeSegment;
  interruptOVFCount(DC) = *(int*)((byte*)timerOVF[DC] + index); //move to next pwm period
  distributionSegment(DC) = timeSegment + 1;
  
  unsigned int count = interruptCount(DC)-1; //OCR1B is used as it is an unused register which affords us quick access.
  if (count == 0) {
    register byte port asm("r18") = stepPort(DC);
    register unsigned int startVal asm("r24") = currentMotorSpeed(DC);
    //byte port = STEP0PORT;
    //unsigned int startVal = currentMotorSpeed(DC);
    interruptCount(DC) = startVal; //start val is used for specifying how many interrupts between steps. This tends to IVal after acceleration
    if (port & stepHigh(DC)){
      stepPort(DC) = port & stepLow(DC);
      unsigned long jVal = synta.cmd.jVal[DC];
      jVal = jVal + synta.cmd.stepDir[DC];
      synta.cmd.jVal[DC] = jVal;
      if(synta.cmd.gotoEn[DC]){
        if (gotoPosn[DC] == jVal){ //reached the decelleration marker. Note that this line loads gotoPosn[] into r24:r27
          //will decellerate the rest of the way. So first move gotoPosn[] to end point.
          if (decelerationSteps(DC) & _BV(7)) {
            /*
            unsigned long gotoPos = gotoPosn[DC];
            if (synta.cmd.stepDir[DC] < 0){
              gotoPosn[DC] = gotoPos + decelerationSteps(DC);
            } else {
              gotoPosn[DC] = gotoPos - decelerationSteps(DC);
            }
            */
            //--- This is a heavily optimised version of the code commented out just above ------------
            //During compare of jVal and gotoPosn[], gotoPosn[] was loaded into r24 to r27
            register char stepDir asm("r19") = synta.cmd.stepDir[DC];
            asm volatile(
              "in   %A0, %2  \n\t"
              "cp   %1, __zero_reg__  \n\t"
              "brlt .+4      \n\t"
              "neg  %A0      \n\t"
              "eor  %B0, %B0   \n\t"
              "mov  %C0, %B0   \n\t"
              "mov  %D0, %B0   \n\t"
              "add  %A0, r24  \n\t" //add previously loaded gotoPosn[] registers to new gotoPosn[] registers.
              "adc  %B0, r25  \n\t"
              "adc  %C0, r26  \n\t"
              "adc  %D0, r27  \n\t"
              : "=a" (gotoPosn[DC]) //goto selects r18:r21. This adds sts commands for all 4 bytes
              : "r" (stepDir),"I" (_SFR_IO_ADDR(decelerationSteps(DC)))       //stepDir is in r19 to match second byte of goto.
              :
            );
            //-----------------------------------------------------------------------------------------
            decelerationSteps(DC) = 0; //say we are stopping
            synta.cmd.IVal[DC] = stopSpeed[DC]; //decellerate to min speed. This is possible in at most 80 steps.
          } else {
            goto stopMotorISR3;
          }
        }
      }
      if (currentMotorSpeed(DC) > stopSpeed[DC]) {
stopMotorISR3:
        synta.cmd.gotoEn[DC] = 0; //Not in goto mode.
        synta.cmd.stopped[DC] = 1; //mark as stopped
        timerDisable(DC);
      }
    } else {
      stepPort(DC) = port | stepHigh(DC);
      register unsigned int iVal asm("r20") = synta.cmd.IVal[DC];
      //unsigned int iVal = synta.cmd.IVal[RA];
      register byte increment asm("r0") = synta.cmd.stepIncrement[DC];
      asm volatile(
        "movw r18, %0   \n\t"
        "sub  %A0, %2    \n\t"
        "sbc  %B0, __zero_reg__    \n\t"
        "cp   %A0, %A1   \n\t"
        "cpc  %B0, %B1   \n\t"
        "brge 1f         \n\t" //branch if iVal <= (startVal-increment)
        "movw  %0, r18   \n\t"
        "add  %A0, %2    \n\t"
        "adc  %B0, __zero_reg__    \n\t"
        "cp   %A1, %A0   \n\t"
        "cpc  %B1, %B0   \n\t"
        "brge 1f         \n\t" //branch if (startVal+increment) <= iVal
        "movw  %0, %1   \n\t"  //copy iVal to startVal
        "1:             \n\t"
        : "=&w" (startVal) //startVal is in r24:25
        : "a" (iVal), "t" (increment) //iVal is in r20:21
        :
      );
      currentMotorSpeed(DC) = startVal; //store startVal
      /*
      if (startVal - increment >= iVal) { // if (iVal <= startVal-increment)
        startVal = startVal - increment;
      } else if (startVal + increment <= iVal){
        startVal = startVal + increment;
      } else {
        startVal = iVal;
      }
      currentMotorSpeed(DC) = startVal;
      */
    }
  } else {
    interruptCount(DC) = count;
  }
}
There is much that needs to be done in as short a space of time as possible. There are places where looking at the assembler I can see areas which I can see a better way. Sometimes that is just arranging if statements in a different way, others changing the use of variables.
Notice for example this line:
Code:
 interruptOVFCount(DC) = *(int*)((byte*)timerOVF[DC] + index); //move to next pwm period
I could have written:
Code:
 interruptOVFCount(DC) = timerOVF[DC][index]; //move to next pwm period
Which makes it a lot more readable, but results in 2 extra clock cycles.
Over the whole interrupt, two days of optimisation brought it down from 500+ clock cycles to a little over 170.
« Last Edit: June 16, 2013, 03:24:25 pm by Tom Carpenter » Logged

~Tom~

Offline Offline
Newbie
*
Karma: 2
Posts: 12
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

I'm not able to make a profound statement, because I have only little experience with assembler. So far I don't like assembler because its always specific to CPUs. (What about divmod10(..) for ARM based Arduinos?). On the other side I like programming with JAVA (which is - in theory - an interpreter). All my algorithmic improvements made for this thread are originally developed with JAVA on a PC (using '>>>' for unsigned right shift) and manually "compiled" to C, to make the proof on my Duemilanove.

In the past I used C to implement my ideas, just because there was no JAVA for microcontrollers like Arduinos. But there is light at the end of the tunnel: I'm the author of HaikuVM, a full featured open source JAVA VM for AVRs (works even for an ATmega8). Hoping HaikuVM becomes more popular in the future.
Logged

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 217
Posts: 13739
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

[off topic]
Quote
Hoping HaikuVM becomes more popular in the future.
You can always discuss it in the exhibition gallery when it runs on an Arduino !
Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

0
Offline Offline
God Member
*****
Karma: 26
Posts: 610
Always making something...
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

Wow, pretty amazing work!

This may be a dumb or overly obvious question, but would it be ok to include this in Arduino's Print.cpp file, which is LGLP licensed?

Currently that file only has attribution to David Mellis.  If I make a copy including this amazing code, who should I add to the attribution and how should your names appear?
Logged

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 217
Posts: 13739
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

Quote
This may be a dumb or overly obvious question, but would it be ok to include this in Arduino's Print.cpp file, which is LGLP licensed?
Questions are never dumb smiley-wink

Print.cpp is one I had in mind too, printing numbers could be (estimate) 10-30% faster.
imho the divmod10() should be usable from user code too, so encapsulate in a lib fastmath.h or arduino.h (?).

I do not know the legal details of the LGPL (can you explain it simply?), but my version of the code can be used.

Still todo: test on several platforms: 168, 268, tiny, teensy, due etc.

Quote
If I make a copy including this amazing code, who should I add to the attribution and how should your names appear?
imho: reference to this thread and optionally the forum names.
Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 217
Posts: 13739
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

Quote
divmod10   7.8800

Including the call and ret it takes 101 clock cycles to execute the code which comes out to 101 cycles / 16 cycles per microsecond = 6.3125 microseconds.  The measurement code needs some work.

Here a redo of the measurement code
The first loop tests 100.000 x 1 call to divmod. (includes loop overhead)
The second loop tests 100.000 x 2 calls to divmod. (includes loop overhead)

By subtracting the found values one get a better approximation of the time per divmod().

Code:
unsigned long start = 0;
unsigned long stop = 0;
volatile unsigned long q;

void setup()
{

  Serial.begin(115200);
  Serial.println("testing divmod10()\n");

  Serial.print("divmod10\t");
  start = micros();
  uint8_t m = 0;
  uint32_t d = 0;
  for (unsigned long i = 0; i < 100000; i++)
  {
    divmod10(i, d, m);
  }
  stop = micros();
  float f1 = (stop - start)/100000.0;
  Serial.println(f1, 4);
  Serial.println("--------------");


  Serial.print("divmod10\t");
  start = micros();
  m = 0;
  d = 0;
  for (unsigned long i = 0; i < 100000; i++)
  {
    divmod10(i, d, m);
    divmod10(i, d, m);
  }
  stop = micros();
  float f2 = (stop - start)/100000.0;
  Serial.println(f2, 4);
  Serial.println("--------------");
  Serial.print("one run takes: ");
  Serial.println(f2-f1, 4);
  Serial.println("--------------");

  Serial.println("done");
}

void loop()
{
}

void divmod10(uint32_t in, uint32_t &div, uint8_t &mod) __attribute__((noinline));
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod)
{
  //assumes that div/mod pointers arrive in r18:r19 and r20:r21 pairs (doesn't matter which way around)
  //and that in arrives in r22:r25 quad
  asm volatile(
  "movw r30, %2   \n\t"  //uint32_t* divPtr = &div;
  "movw r26, %1    \n\t" //uint32_t* modPtr = &mod;

  "mov   r0, %A0  \n\t"  //byte temp = in
  "movw r18, %A0  \n\t"  //uint32_t q = in;
  "movw r20, %C0  \n\t" 
    "ori  r18, 0x01 \n\t"  //q |= 1;

  "lsr  r25       \n\t"  //x = in >> 2 //note: x reuses registers of 'in', as 'in' was backed up in r0
  "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t" 
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"

    "sub  r18, r22  \n\t" //q = q - x;
  "sbc  r19, r23  \n\t"
    "sbc  r20, r24  \n\t"
    "sbc  r21, r25  \n\t"

    "movw r22, r18  \n\t" //x = q;
  "movw r24, r20  \n\t"
    "lsr  r25       \n\t" //x = x >> 4;
  "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"

    "add  r22, r18  \n\t" //x = x + q
  "adc  r23, r19  \n\t"
    "adc  r24, r20  \n\t"
    "adc  r25, r21  \n\t"

    "movw r18, r22  \n\t" //q = x
  "movw r20, r24  \n\t"
    "add  r18, r23  \n\t" //q = q + (x >> 8)
  "adc  r19, r24  \n\t"
    "adc  r20, r25  \n\t"
    "adc  r21, r1   \n\t"

    "mov  r18, r19  \n\t" //q = q >> 8
  "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
  "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"

    "mov  r18, r19  \n\t" //q = q >> 8
  "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
  "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"

    "mov  r18, r19  \n\t" //q = q >> 8
  "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
  "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"

    "andi r18, 0xF8 \n\t" //q = q & ~0x7

  "sub   r0, r18  \n\t" //in = in - q

  "lsr  r21       \n\t" //q = q >> 2
  "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    "lsr  r21       \n\t"
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"

    "sub  r0, r18   \n\t" //in = in - q
  "st    X, r0    \n\t" //mod = in;

  "lsr  r21       \n\t" //q = q >> 1
  "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"

    "st     Z, r18  \n\t" //div = q
  "std  Z+1, r19  \n\t"
    "std  Z+2, r20  \n\t"
    "std  Z+3, r21  \n\t"

:
:
    "r" (in), "r" (&mod), "r" (&div)
:
    "r0", "r26", "r27", "r31", "r31"
    ); 
}

The output:
divmod10   7.6716
--------------
divmod10   14.4000
--------------
one run takes: 6.7284

6.3125 and 6.7284 differ 6.6%  where the original estimate 7.7360 differs 22.5%  (accuracy improved by factor 3+)

The difference of 0.4 uSec can be due to parameter passing?

in conclusion:
1) time measurement can be improved.
2) loop overhead is about 1.1 uSec
3) previous measurements are all including the loop overhead so ~1.1 uSec too high
Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Leeds, UK
Offline Offline
Edison Member
*
Karma: 80
Posts: 1726
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

The 0.4uS will be the for loop. You have to increment 'i', which takes 4 clock cycles, plus a compare is needed to check if 'i' is < 100000 which will take 4 as well.
Logged

~Tom~

Global Moderator
Netherlands
Offline Offline
Shannon Member
*****
Karma: 217
Posts: 13739
In theory there is no difference between theory and practice, however in practice there are many...
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

but both loop 1 and 2 have the same loop overhead
=> so the difference cannot include the loop overhead, so the time can only be the 100K calls to divmod()

I can only think of param handling, placing i and the addresses of d and m on the stack.

yes there is the call to millis() to but that is done only once for 100K calls so neglectible
Logged

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

Leeds, UK
Offline Offline
Edison Member
*
Karma: 80
Posts: 1726
Once the magic blue smoke is released, it won't go back in!
View Profile
WWW
 Bigger Bigger  Smaller Smaller  Reset Reset

True.

Looking at the assembly, there are 4 clock cycles to copy values into registers in the loop:
Code:
    330: c8 01       movw r24, r16
     332: b7 01       movw r22, r14
     334: a5 01       movw r20, r10
     336: 96 01       movw r18, r12
     338: 04 df       rcall .-504     ; 0x142 <_Z8divmod10mRmRh> //first call
     33a: c8 01       movw r24, r16  //each of these 4 are a single clock cycle
     33c: b7 01       movw r22, r14
     33e: a5 01       movw r20, r10
     340: 96 01       movw r18, r12
     342: ff de       rcall .-514     ; 0x142 <_Z8divmod10mRmRh> //second call

That will account for about half of the discrepancy.

Beyond that there is no difference in loop overhead. I only looked at the code you put quickly so had missed what you were trying to do.

One thing to remember is that micros() is not that accurate and there may be some error in it. Especially as in the longer loop (the one which calls divmod10() twice), there will be more micros() interrupts occuring which will add additional overhead to that loop.

EDIT:

Interestingly with the latest version should be 103 clock cycles long including ret and call (104 on a mega as ret takes an aditional clock cycle). So the target is really only 6.4375us (6.6875us if you include the 4 movw clock cycles used to load the registers).

To improve measurement, i used Timer1 to count how many clock cycles are used, and disabled millis(), which removes any error from interrupts.
Code:
unsigned long start = 0;
unsigned long stop = 0;
volatile unsigned long q;

void setup()
{

  Serial.begin(115200);
  Serial.println("testing divmod10()\n");
  
  byte backup = TIMSK0;
  
  Serial.print("divmod10\t");
  
  TCCR1A = 0;
  TCCR1B = 5;
  
  uint8_t m = 0;
  uint32_t d = 0;
  Serial.flush(); //wait for serial buffer to clear
  TIMSK0 = 0; //disable millis;
  TCNT1 = 0;
  for (unsigned long i = 0; i < 100000; i++)
  {
    divmod10(i, d, m);
  }
  stop = TCNT1; //how many clock cycles.
  TIMSK0 = backup; //renable millis;
  float f1 = stop*64;//There are 64us per clock cycle with a 1:1024 prescaler.
  f1 /= 100000.0;
  Serial.println(f1, 4);
  Serial.println("--------------");


  Serial.print("divmod10\t");
  Serial.flush(); //wait for serial buffer to clear
  m = 0;
  d = 0;
  TIMSK0 = 0; //disable millis;
  TCNT1 = 0;
  for (unsigned long i = 0; i < 100000; i++)
  {
    divmod10(i, d, m);
    divmod10(i, d, m);
  }
  stop = TCNT1;
  TIMSK0 = backup; //renable millis;
  float f2 = stop * 64;//There are 64us per clock cycle with a 1:1024 prescaler.
  f2 /= 100000.0;
  Serial.println(f2, 4);
  Serial.println("--------------");
  Serial.print("one run takes: ");
  Serial.println(f2-f1, 4);
  Serial.println("--------------");

  Serial.println("done");
}

void loop()
{
}

void divmod10(uint32_t in, uint32_t &div, uint8_t &mod) __attribute__((noinline));
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod)
{
  //assumes that div/mod pointers arrive in r18:r19 and r20:r21 pairs (doesn't matter which way around)
  //and that in arrives in r22:r25 quad
  asm volatile(
    "movw r30, %2   \n\t"  //uint32_t* divPtr = &div;
    "movw r26, %1    \n\t" //uint32_t* modPtr = &mod;
  
    "mov   r0, %A0  \n\t"  //byte temp = in
    "movw r18, %A0  \n\t"  //uint32_t q = in;
    "movw r20, %C0  \n\t"  
    "ori  r18, 0x01 \n\t"  //q |= 1;
  
    "lsr  r25       \n\t"  //x = in >> 2 //note: x reuses registers of 'in', as 'in' was backed up in r0
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"  
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
  
    "sub  r18, r22  \n\t" //q = q - x;
    "sbc  r19, r23  \n\t"
    "sbc  r20, r24  \n\t"
    "sbc  r21, r25  \n\t"
  
    "movw r22, r18  \n\t" //x = q;
    "movw r24, r20  \n\t"
    "lsr  r25       \n\t" //x = x >> 4;
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
    "lsr  r25       \n\t"
    "ror  r24       \n\t"
    "ror  r23       \n\t"
    "ror  r22       \n\t"
  
    "add  r22, r18  \n\t" //x = x + q
    "adc  r23, r19  \n\t"
    "adc  r24, r20  \n\t"
    "adc  r25, r21  \n\t"
  
    "movw r18, r22  \n\t" //q = x
    "movw r20, r24  \n\t"
    "add  r18, r23  \n\t" //q = q + (x >> 8)
    "adc  r19, r24  \n\t"
    "adc  r20, r25  \n\t"
    "adc  r21, r1   \n\t"
  
    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"
  
    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"
  
    "mov  r18, r19  \n\t" //q = q >> 8
    "mov  r19, r20  \n\t"
    "mov  r20, r21  \n\t"
    "eor  r21, r21  \n\t"
    "add  r18, r22  \n\t" //q = q + x
    "adc  r19, r23  \n\t"
    "adc  r20, r24  \n\t"
    "adc  r21, r25  \n\t"
  
    "andi r18, 0xF8 \n\t" //q = q & ~0x7
  
    "sub   r0, r18  \n\t" //in = in - q
  
    "lsr  r21       \n\t" //q = q >> 2
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
    "lsr  r21       \n\t"
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
  
    "sub  r0, r18   \n\t" //in = in - q
    "st    X, r0    \n\t" //mod = in;
  
    "lsr  r21       \n\t" //q = q >> 1
    "ror  r20       \n\t"
    "ror  r19       \n\t"
    "ror  r18       \n\t"
  
    "st    Z, r18  \n\t" //div = q
    "std  Z+1, r19  \n\t"
    "std  Z+2, r20  \n\t"
    "std  Z+3, r21  \n\t"
  
    :
    :  "r" (in), "r" (&mod), "r" (&div)
    : "r0", "r26", "r27", "r31", "r31"
  );  
}

On an Arduino Mega, this yeilds:
Quote
testing divmod10()

divmod10   7.6877
--------------
divmod10   14.4378
--------------
one run takes: 6.7501
--------------
done

On a mega there are 104clocks plus the 4 for movw, so 108 in total. So the time required should be:
1000000 * 108 / 16000000 = 6.75us, so bang on.

On an Arduino Uno, it yeilds:
Quote
testing divmod10()

divmod10   7.6250
--------------
divmod10   14.3123
--------------
one run takes: 6.6874
--------------
done

On an Uno, there are 103clocks plus the 4 for movw, so 107 in total. So the time required should be:
1000000 * 107 / 16000000 = 6.6875us, so again, the test is bang on.
« Last Edit: June 22, 2013, 05:36:35 pm by Tom Carpenter » Logged

~Tom~

Offline Offline
Sr. Member
****
Karma: 4
Posts: 327
View Profile
 Bigger Bigger  Smaller Smaller  Reset Reset

on this forum we seem to have two threads running on ways to implement very efficient division and mod operations.

http://forum.arduino.cc/index.php?topic=172635.0

http://forum.arduino.cc/index.php?topic=167414.0

I'd hate to loose these over time,
   which might well happen if they just reside on the forum.

is there any posibility that these could be put together, and put on the playground or something so they are not lost .

please
« Last Edit: June 24, 2013, 04:51:33 am by drjiohnsmith » Logged

Pages: 1 2 [3] 4 5 ... 8   Go Up
Jump to: