I wonder why Tom Carpenter didn't take the chance to implement the algorithmic improvement I suggested in reply #28?
So I coded it into the given assembler template myself, and indeed found the expected performance gain. (I won't bother you with that solution, because I found an even better one; see below.)
Meanwhile I found that:
q= (q>>8) + x;
q= (q>>16) + (x>>8) + x;
q= (q>>8) + x;
is a valid substitute for this:
q= (q>>8) + x;
q= (q>>8) + x;
q= (q>>8) + x;
q= (q>>8) + x;
as well.
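
For anyone who wants to double-check the substitution on a PC, here is a quick brute-force sketch (plain C, nothing Arduino-specific). The names refine3/refine4/divmod10_c and the sampling stride are mine, and it assumes both sequences start from q = x, exactly as the assembler version below does; set the step to 1 for an exhaustive 32-bit sweep.

#include <stdint.h>
#include <stdio.h>

//original four-step refinement, starting from q = x
static uint32_t refine4(uint32_t x)
{
    uint32_t q = (x >> 8) + x;
    q = (q >> 8) + x;
    q = (q >> 8) + x;
    q = (q >> 8) + x;
    return q;
}

//three-step substitute
static uint32_t refine3(uint32_t x)
{
    uint32_t q = (x >> 8) + x;
    q = (q >> 16) + (x >> 8) + x;
    q = (q >> 8) + x;
    return q;
}

static void divmod10_c(uint32_t in, uint32_t *div, uint8_t *mod, int three)
{
    uint32_t q = (in | 1) - (in >> 2);   //q ~ 0.75*in
    uint32_t x = q + (q >> 4);           //x ~ 0.796875*in
    q = three ? refine3(x) : refine4(x); //q ~ 0.8*in
    q &= ~(uint32_t)7;                   //q = 8*(q>>3)
    in -= q;                             //in -= 8*(q>>3)
    q >>= 2;
    in -= q;                             //in -= 2*(q>>3), i.e. 10*(q>>3) in total
    *mod = (uint8_t)in;
    *div = q >> 1;                       //div = q>>3
}

int main(void)
{
    for (uint64_t i = 0; i <= 0xFFFFFFFFull; i += 9973) { //step 1 = exhaustive, slower
        uint32_t in = (uint32_t)i, d3, d4;
        uint8_t m3, m4;
        divmod10_c(in, &d4, &m4, 0);
        divmod10_c(in, &d3, &m3, 1);
        if (d3 != d4 || m3 != m4 || d3 != in / 10 || m3 != in % 10)
            printf("mismatch at %lu\n", (unsigned long)in);
    }
    puts("done");
    return 0;
}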
This last substitute offers even more optimization choices at assembler level, and I (temporarily) ended up with this:
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod) __attribute__((noinline));
void divmod10(uint32_t in, uint32_t &div, uint8_t &mod)
{
//assumes that div/mod pointers arrive in r18:r19 and r20:r21 pairs (doesn't matter which way around)
//and that in arrives in r22:r25 quad
asm volatile(
"movw r30, %2 \n\t" //uint32_t* divPtr = ÷
"movw r26, %1 \n\t" //uint32_t* modPtr = &mod;
"mov r0, %A0 \n\t" //byte temp = in
"movw r18, %A0 \n\t" //uint32_t q = in;
"movw r20, %C0 \n\t"
"ori r18, 0x01 \n\t" //q |= 1;
"lsr r25 \n\t" //x = in >> 2 //note: x reuses registers of 'in', as 'in' was backed up in r0
"ror r24 \n\t"
"ror r23 \n\t"
"ror r22 \n\t"
"lsr r25 \n\t"
"ror r24 \n\t"
"ror r23 \n\t"
"ror r22 \n\t"
"sub r18, r22 \n\t" //q = q - x;
"sbc r19, r23 \n\t"
"sbc r20, r24 \n\t"
"sbc r21, r25 \n\t"
"movw r22, r18 \n\t" //x = q;
"movw r24, r20 \n\t"
"lsr r25 \n\t" //x = x >> 4;
"ror r24 \n\t"
"ror r23 \n\t"
"ror r22 \n\t"
"lsr r25 \n\t"
"ror r24 \n\t"
"ror r23 \n\t"
"ror r22 \n\t"
"lsr r25 \n\t"
"ror r24 \n\t"
"ror r23 \n\t"
"ror r22 \n\t"
"lsr r25 \n\t"
"ror r24 \n\t"
"ror r23 \n\t"
"ror r22 \n\t"
"add r22, r18 \n\t" //x = x + q
"adc r23, r19 \n\t"
"adc r24, r20 \n\t"
"adc r25, r21 \n\t"
//q= x + (x>>8);
"movw r18, r22 \n\t" //q = x
"movw r20, r24 \n\t"
"add r18, r23 \n\t" //q = q + (x >> 8)
"adc r19, r24 \n\t"
"adc r20, r25 \n\t"
"adc r21, r1 \n\t"
//q= (q>>16) + (x>>8) + x;
"movw r18, r20 \n\t" //q= (q>>16);
"clr r20 \n\t"
"clr r21 \n\t"
"add r18, r23 \n\t" //q += (x >> 8)
"adc r19, r24 \n\t"
"adc r20, r25 \n\t"
"adc r21, r21 \n\t" // we need the carry only
"add r18, r22 \n\t" //q += x
"adc r19, r23 \n\t"
"adc r20, r24 \n\t"
"adc r21, r25 \n\t"
//q= (q>>8) + x;
"mov r18, r19 \n\t" //q = q >> 8
"mov r19, r20 \n\t"
"mov r20, r21 \n\t"
"clr r21 \n\t"
"add r18, r22 \n\t" //q = q + x
"adc r19, r23 \n\t"
"adc r20, r24 \n\t"
"adc r21, r25 \n\t"
"andi r18, 0xF8 \n\t" //q = q & ~0x7
"sub r0, r18 \n\t" //in = in - q
"lsr r21 \n\t" //q = q >> 2
"ror r20 \n\t"
"ror r19 \n\t"
"ror r18 \n\t"
"lsr r21 \n\t"
"ror r20 \n\t"
"ror r19 \n\t"
"ror r18 \n\t"
"sub r0, r18 \n\t" //in = in - q
"st X, r0 \n\t" //mod = in;
"lsr r21 \n\t" //q = q >> 1
"ror r20 \n\t"
"ror r19 \n\t"
"ror r18 \n\t"
"st Z, r18 \n\t" //div = q
"std Z+1, r19 \n\t"
"std Z+2, r20 \n\t"
"std Z+3, r21 \n\t"
:
: "r" (in), "r" (&mod), "r" (&div)
: "r0", "r26", "r27", "r31", "r31"
);
}
That shaves off another ~4%.
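
In case someone wants to drop it into a sketch, the usual digit-peeling loop works unchanged; print10 is just an illustrative name, and Serial.print stands in for whatever output you use. Passing n for both in and div is fine because in is taken by value:

void print10(uint32_t n)
{
    char buf[11];                  //10 digits max for a uint32_t, plus terminator
    char *p = buf + sizeof(buf) - 1;
    *p = '\0';
    uint8_t m;
    do {
        divmod10(n, n, m);         //n = n / 10, m = n % 10
        *--p = '0' + m;
    } while (n);
    Serial.print(p);
}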