divmod10() : a fast replacement for /10 and %10 (unsigned)

I put divmod10 into Print.cpp. I did some quick testing and it seems to work quite well.

This is Teensy's implementation of Print, which differs slightly from Arduino's. It has several other optimizations....

Print.cpp (10.1 KB)

Print.h (6.69 KB)

Ideally, divmod10 could be an inline asm macro that modifies "n" in place and returns the remainder in a register. That would eliminate the function call+return, the pointer overhead, and the compiler having to reallocate registers during the loop because a function is called.

It'll probably require a 32 bit temporary, if I understand the existing asm code?

Hi Paul,

I have done the same mod to the standard Arduino Print class, which is attached.

There are a couple of improvements I have made:
(1)
The no base print calls can just call printNumberDec() directly rather than going through printNumber().

    size_t print(unsigned char b)				{ return printNumberDec(b, 0); }
    size_t print(int n)				{ return print((long)n); }
    size_t print(unsigned int n)			{ return printNumberDec(n, 0); }
    size_t print(long n)      {if (n < 0) {return printNumberDec(-n, 1);} else {return printNumberDec(n, 0);} }
    size_t print(unsigned long n)			{ return printNumberDec(n, 0); }

(2)
As suggested, I have made an inline version of divmod10 for the printNumberDec() function. You should be able to merge this back into the Teensy version fairly readily.

Print.h (4.43 KB)

Print.cpp (7.39 KB)

@Paul Stoffregen

a quick look at print.cpp

in the generic printNumber you have this code

	} else if (base == 1) {
		base = 10;
	}

could be

	} else if (base == 1) {
		printNumberDEC(n, sign);
	}

count += printNumber(int_part, sign, 10);
==>
count += printNumberDEC(int_part, sign);

Did you measure timing improvements?

I converted it to inline asm too, slightly differently but very similar.

#define divmod10_asm(in32, tmp32, mod8)                         \
asm volatile (                                                  \
        "mov    %2, %A0         \n\t" /* mod = in */            \
        "ori    %A0, 1          \n\t" /* q = in | 1 */          \
        "movw   %A1, %A0        \n\t" /* x = q */               \
        "movw   %C1, %C0        \n\t"                           \
        "lsr    %D1             \n\t"  /* x = x >> 2 */         \
        "ror    %C1             \n\t"                           \
        "ror    %B1             \n\t"                           \
        "ror    %A1             \n\t"                           \
        "lsr    %D1             \n\t"                           \
        "ror    %C1             \n\t"                           \
        "ror    %B1             \n\t"                           \
        "ror    %A1             \n\t"                           \
        "sub    %A0, %A1        \n\t" /* q = q - x  */          \
        "sbc    %B0, %B1        \n\t"                           \
        "sbc    %C0, %C1        \n\t"                           \
        "sbc    %D0, %D1        \n\t"                           \
        "movw   %A1, %A0        \n\t" /* x = q  */              \
        "movw   %C1, %C0        \n\t"                           \
        "lsr    %D1             \n\t" /* x = x >> 4  */         \
        "ror    %C1             \n\t"                           \
        "ror    %B1             \n\t"                           \
        "ror    %A1             \n\t"                           \
        "lsr    %D1             \n\t"                           \
        "ror    %C1             \n\t"                           \
        "ror    %B1             \n\t"                           \
        "ror    %A1             \n\t"                           \
        "lsr    %D1             \n\t"                           \
        "ror    %C1             \n\t"                           \
        "ror    %B1             \n\t"                           \
        "ror    %A1             \n\t"                           \
        "lsr    %D1             \n\t"                           \
        "ror    %C1             \n\t"                           \
        "ror    %B1             \n\t"                           \
        "ror    %A1             \n\t"                           \
        "add    %A1, %A0        \n\t" /* x = x + q */           \
        "adc    %B1, %B0        \n\t"                           \
        "adc    %C1, %C0        \n\t"                           \
        "adc    %D1, %D0        \n\t"                           \
        "movw   %A0, %A1        \n\t" /* q = x */               \
        "movw   %C0, %C1        \n\t"                           \
        "add    %A0, %B1        \n\t" /* q = q + (x >> 8) */            \
        "adc    %B0, %C1        \n\t"                           \
        "adc    %C0, %D1        \n\t"                           \
        "adc    %D0, r1         \n\t"                           \
        "mov    %A0, %B0        \n\t" /* q = q >> 8 */          \
        "mov    %B0, %C0        \n\t"                           \
        "mov    %C0, %D0        \n\t"                           \
        "eor    %D0, %D0        \n\t"                           \
        "add    %A0, %A1        \n\t" /* q = q + x */           \
        "adc    %B0, %B1        \n\t"                           \
        "adc    %C0, %C1        \n\t"                           \
        "adc    %D0, %D1        \n\t"                           \
        "mov    %A0, %B0        \n\t" /* q = q >> 8 */          \
        "mov    %B0, %C0        \n\t"                           \
        "mov    %C0, %D0        \n\t"                           \
        "eor    %D0, %D0        \n\t"                           \
        "add    %A0, %A1        \n\t" /* q = q + x */           \
        "adc    %B0, %B1        \n\t"                           \
        "adc    %C0, %C1        \n\t"                           \
        "adc    %D0, %D1        \n\t"                           \
        "mov    %A0, %B0        \n\t" /* q = q >> 8 */          \
        "mov    %B0, %C0        \n\t"                           \
        "mov    %C0, %D0        \n\t"                           \
        "eor    %D0, %D0        \n\t"                           \
        "add    %A0, %A1        \n\t" /* q = q + x */           \
        "adc    %B0, %B1        \n\t"                           \
        "adc    %C0, %C1        \n\t"                           \
        "adc    %D0, %D1        \n\t"                           \
        "andi   %A0, 0xF8       \n\t" /* q = q & ~0x7 */        \
        "sub    %2, %A0         \n\t" /* mod = mod - q */       \
        "lsr    %D0             \n\t" /* q = q >> 2  */         \
        "ror    %C0             \n\t"                           \
        "ror    %B0             \n\t"                           \
        "ror    %A0             \n\t"                           \
        "lsr    %D0             \n\t"                           \
        "ror    %C0             \n\t"                           \
        "ror    %B0             \n\t"                           \
        "ror    %A0             \n\t"                           \
        "sub    %2, %A0         \n\t" /* mod = mod - q */       \
        "lsr    %D0             \n\t" /* q = q >> 1 */          \
        "ror    %C0             \n\t"                           \
        "ror    %B0             \n\t"                           \
        "ror    %A0             \n\t"                           \
        :  "+d" (in32), "=r" (tmp32), "=r" (mod8) : :           \
)

robtillaart:
Did you measure timing improvements?

Yes, just now, printing several integers with micros() read inside the printNumberXXX functions.

There are a couple caveats, but basically:

With printNumberAny(): 1412 us -- single call to C library divide
With printNumberDec(): 218 us -- using divmod10_asm

Caveat #1: Adding the storage of micro() causes the compiler to allocate many more registers. I didn't analyze the generated printNumberDec code in detail, but it is clear from a casual look at the assembly listing that just adding the test causes the whole function to be slower.

Caveat #2: The timer0 interrupt is at play, so the measurement include extra time and sometimes changes slightly from run to run.

Here's the (very arbitrary) test code:

extern uint32_t usec_print;

void setup()
{
  for (int i=0; i<100; i++) {
    Serial.println(12345678l);
    Serial.println(0);
    Serial.println(792942ul);
    Serial.println(12345);
    Serial.println(2586173071ul);
    Serial.println(234);
  }
  Serial.print("usec = ");
  Serial.println((float)usec_print / 100.0);
}

void loop()
{
}

Here are the Print files. These have the test code inside and printNumberAny is enabled...

Print.cpp (8.62 KB)

Print.h (6.77 KB)

As a simple comparison, take this code:

unsigned long start = 0;
unsigned long stop = 0;
volatile unsigned long q;

void setup()
{

  Serial.begin(115200);
  Serial.println("testing");
  
  byte backup = TIMSK0;
  
  TCCR1A = 0;
  TCCR1B = 4; 
  TIMSK1 |= _BV(TOIE1);
  
  Serial.flush(); //wait for serial buffer to clear
  TIMSK0 = 0; //disable millis;
  TCNT1 = 0;
  
  Serial.println(1073741824ul);
  Serial.println(101819584ul);
  Serial.println(10737);
  
  stop = TCNT1; //how many clock cycles.
  TIMSK0 = backup; //renable millis;
  stop*=16;//There are 16us per clock cycle with a 1:256 prescaler.
  Serial.print("Time=");
  Serial.println(stop);
  Serial.println("done");
  TIMSK1 &= ~_BV(TOIE1);
}

void loop()
{
}

ISR(TIMER1_OVF_vect) {
  Serial.println("I Overflowed!"); //just to make sure we can tell if this happens.
}

The old print functions:

testing
1073741824
101819584
10737
Time=1408
done

The new functions:

testing
1073741824
101819584
10737
Time=480
done

A rather arbitrary test I know, but there is a clear difference, almost 1ms is knocked off the time it takes to fill the Serial buffer.

I ran this on Teensy 2.0. It takes 224... so a good portion of that 480 on normal Arduino is probably the slow hardware serial code which uses a vtable function call and circular buffer manipulation overhead for each individual byte.

for long numbers:

original = 1408/24 = ~59us / digit to print
"tom" = 480/24 = ~20 us/digit to print (33%)
"paul" = 224/24 = ~ 9.5uS /digit to print (16%)

A quick arbitrary float test (same prog as above, slightly different printnumber)

  Serial.println(10737.41824, 4);
  Serial.println(1.01819584, 4);
  Serial.println(107.37, 2);

original printnumber
testing
10737.4179
1.0182
107.37
Time=2144
done

divmod10 printnumber
testing
10737.4179
1.0182
107.37
Time=1472
done

so for floats there is also serious gain ~30% gain

(note this gain is only due to the integral part, the remainder part is a loop using a float mul per digit

  // Extract digits from the remainder one at a time
  while (digits-- > 0)
  {
    remainder *= 10.0;
    int toPrint = int(remainder);
    n += print(toPrint);
    remainder -= toPrint; 
  }

this can be optimized to something like:

  // Extract digits from the remainder one at a time
  long t = 1;
  while (digits-- > 0) t *= 10;
  remainder *= t;
  n += print(long(remainder));

a quick test exposes the flaw in the proposed code but also indicates there is room for improvement.

testing
10737.4179
1.182 <<<<<< FAIL !!!
107.37
Time=1152 <<< (another 25%)
done

robtillaart:
"tom" = 480/24 = ~20 us/digit to print (33%)
"paul" = 224/24 = ~ 9.5uS /digit to print (16%)

At 16 MHz, the divmod10_asm() macro is 5.2 us.

It's exactly 83 instructions, all of them single-cycle. There's no call-return, no pointers, no load-store instructions. It operates entirely on values in only 9 registers, which are chosen by the compiler's register allocator based on the needs of the surrounding code. Unless there's more algorithm magic, I believe it's safe to say this 83 cycle version is the fastest possible implementation.

Obviously there's some room for improvement in actually moving the data....

fast.

and all are much of an improvement on the original aruino,

'all' we need to do is document up and make available in a form every one can use easily.

thats that hard part,

now I wonder if a pre parser could notice the opportunity to use these in 'normal' arduino code and substitute !

OK OK OK

thats another task..

well done to all

drjiohnsmith:
'all' we need to do is document up and make available in a form every one can use easily.
thats that hard part,

How about the Print.cpp and Print.h Tom posted on reply #51 or I posted on #49 and #54 ?

All you have to do to use these is drop them into your Arduino's hardware/arduino/cores/arduino folder (or hardware/teensy/cores/teensy), replacing the Print.cpp and Print.h that are already there.

On a Mac, you'll need to control-click on Arduino and choose "show package contents", then navigate or search until you find the folder with Print.cpp and Print.h. I believe it's Contents/Resources/Java/hardware/arduino/cores/arduino, but I don't have a Mac handy at the moment to check.

Admittedly, the copy I posted on #54 had test code enabled. Here's a "final" version with all the test stuff commented, as ready-to-use as it gets. These are tested with Teensy but probably will work with official Arduino boards.

Print.cpp (8.64 KB)

Print.h (6.77 KB)

When Arduino release a new IDE, etc, those libraries you have upgraded are potentially going to get blown away, and require work to reinstall / overwrite.

One potential solution would be if you could set up pathing so modified libraries were linked if they existed in rather than default libraries.

I restarted my exhaustive test, using the asm macro version. In a few days I should be able to confirm if it matches the C library version for all 2^32 possible inputs....

Unless there's more algorithm magic, I believe it's safe to say this 83 cycle version is the fastest possible implementation.

That's why I asked several posts back if one of the earlier C version - shorter in terms of C - could be faster is asm as it might be less statements.

In a few days I should be able to confirm if it matches the C library version for all 2^32 possible inputs....

how do you do this test? Generate strings and compare them?

robtillaart:
how do you do this test? Generate strings and compare them?

Compute the quotient and modulus using both and check if either is different. Basically, like this:

                q1 = n / 10;
                m1 = n - q1 * 10;
                q2 = n;
                divmod10_asm(q2, tmp, m2);
                if (q1 != q2 || m1 != m2) {
                        errorcount++;
                }

Here is the complete code. It should run on any AVR-based Arduino compatible board.

divmod10_asm_test.zip (1.68 KB)

Opps, forgot this. To run it on non-Teensy boards, you'll also have to add this .h file. Or you could comment out the stuff that prints dots and blinks the LED, but that would not be any fun.

elapsedMillis.h (4.2 KB)

would it not be faster to

for(unsigned long(n = 0; n < 0xFFFFFFFF; n++)
{
  divmod10_asm(n, d, m); 
  if (n != (10*d + m)) Serial.println(i);
}

as division is quite expensive as we know :wink:

That would be much faster, aside from a couple minor code problems. But does only comparing the output to itself really form a valid test?

The slow way will complete in a few days, which really isn't very long in the grand scheme of things.

But does only comparing the output to itself really form a valid test?

no there is a small chance in theory that there exist a (d, m) pair that would also solve the n = 10*d + m equation (e.g. n = 100, d = 0, m = 100)
but given the fact that the code is a white box and we know how the code works this chance is minimal.

To prevent this theoretical issue one could add a test for m to be smaller than 10.

for(unsigned long(n = 0; n < 0xFFFFFFFF; n++)
{
  divmod10_asm(n, d, m); 
  if (n != (10*d + m) || m >= 10 ) Serial.println(n);
}