Hi Nick,
I don't suppose we can talk you out of using assembler?
I don't think so... What I'm trying to do is to multiply two signed 0.31 fixed-point variables stored as int32_t (long int). The best I can do in C is to write
// Reference C version of the 0.31 x 0.31 multiply.
// The ?: selects on the sign of x: the x>0 arm uses x directly, the other arm uses -x and
// negates the shifted result, so the value cast to unsigned long and shifted left by one
// is x or -x respectively. The 64-bit product gets 0x80000000 added and is then shifted
// right by 32.
// NOTE(review): -x.value overflows when x.value is the most negative long - confirm whether
// that input can occur.
z.value=(x.value>0) ? ((long long)(y.value)*((unsigned long)(x.value)<<1)+0x80000000)>>32 : -(((long long)(y.value)*((unsigned long)(-x.value)<<1)+0x80000000)>>32);
The 0x80000000 is just for rounding.
As you can see, it's far from optimal as the compiler has to perform a 64x64->64 bit multiplication and then drop the lower 31 bits (actually I'm first shifting one bit to the left and then dropping the lower 32 bits, because AVR can't shift more than one bit per cycle, while I believe the compiler will shift 32 bits for me for free).
Using assembler, I could do it much faster by performing 32X32->64 bit multiplication and shifting at the same time utilizing fmul* instructions like this:
// 32x32 fractional multiply built from sixteen 8x8 partial products.
//
// Notation used below: X0..X3 / Y0..Y3 are the bytes %A..%D of the inputs (X3/Y3 are the
// most significant bytes and are the only ones multiplied as signed). p0..p7 are the byte
// positions of the accumulated 64-bit sum, p0 being the least significant:
//   p4..p7 = %A[R]..%D[R]  (the result written to z.value)
//   p1..p3 = %B[T]..%D[T]  (scratch, kept only so carries can ripple into R)
//   p0     = never stored (%A[T] is not written anywhere in this block)
//
// fmul / fmuls / fmulsu leave (Rd * Rr) << 1 in r1:r0 and put the bit that was shifted out
// into the carry flag. That bit belongs one byte above r1, so every multiply is followed
// immediately by a chain that folds it in before the carry flag is overwritten:
//   - after fmul   (unsigned)        : adc ..., %[Z]  adds the carry
//   - after fmulsu (signed*unsigned) : sbc ..., %[Z]  subtracts it (it is the sign bit)
// %[Z] is cleared once at the top and only ever used as the constant-zero operand.
// The instruction order is therefore significant; do not reorder within a group.
//
// NOTE(review): no rounding term is added anywhere in this sequence.
// NOTE(review): the carry produced by the first fmuls (X3*Y3) is not consumed; presumably
// this is the overflow case where both inputs are the most negative value - confirm.
asm volatile (
"clr %[Z] \n\t"
// X3*Y3 (signed*signed) -> p6:p7; movw copies r1:r0 into the %D[R]:%C[R] pair.
"fmuls %D[X], %D[Y] \n\t"
"movw %C[R], r0 \n\t"
// Y3*X1 -> p4:p5; sign borrow goes into p6:p7 first, then r1:r0 initialises %B[R]:%A[R].
"fmulsu %D[Y], %B[X] \n\t"
"sbc %C[R], %[Z] \n\t"
"sbc %D[R], %[Z] \n\t"
"movw %A[R], r0 \n\t"
// Y3*X2 -> p5:p6; sign borrow into p7.
"fmulsu %D[Y], %C[X] \n\t"
"sbc %D[R], %[Z] \n\t"
"add %B[R], r0 \n\t"
"adc %C[R], r1 \n\t"
"adc %D[R], %[Z] \n\t"
// Y3*X0 -> p3:p4; sign borrow into p5..p7. First write to %D[T] (p3).
"fmulsu %D[Y], %A[X] \n\t"
"sbc %B[R], %[Z] \n\t"
"sbc %C[R], %[Z] \n\t"
"sbc %D[R], %[Z] \n\t"
"mov %D[T], r0 \n\t"
"add %A[R], r1 \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X3*Y2 -> p5:p6; sign borrow into p7.
"fmulsu %D[X], %C[Y] \n\t"
"sbc %D[R], %[Z] \n\t"
"add %B[R], r0 \n\t"
"adc %C[R], r1 \n\t"
"adc %D[R], %[Z] \n\t"
// X2*Y2 (unsigned) -> p4:p5; shifted-out carry added into p6..p7.
"fmul %C[X], %C[Y] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"add %A[R], r0 \n\t"
"adc %B[R], r1 \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X1*Y2 -> p3:p4; shifted-out carry added into p5..p7.
"fmul %B[X], %C[Y] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"add %D[T], r0 \n\t"
"adc %A[R], r1 \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X0*Y2 -> p2:p3; shifted-out carry added into p4..p7. First write to %C[T] (p2).
"fmul %A[X], %C[Y] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"mov %C[T], r0 \n\t"
"add %D[T], r1 \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X3*Y1 -> p4:p5; sign borrow into p6..p7.
"fmulsu %D[X], %B[Y] \n\t"
"sbc %C[R], %[Z] \n\t"
"sbc %D[R], %[Z] \n\t"
"add %A[R], r0 \n\t"
"adc %B[R], r1 \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X2*Y1 -> p3:p4; shifted-out carry added into p5..p7.
"fmul %C[X], %B[Y] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"add %D[T], r0 \n\t"
"adc %A[R], r1 \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X1*Y1 -> p2:p3; shifted-out carry added into p4..p7.
"fmul %B[X], %B[Y] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"add %C[T], r0 \n\t"
"adc %D[T], r1 \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X0*Y1 -> p1:p2; shifted-out carry added into p3..p7. First write to %B[T] (p1).
"fmul %A[X], %B[Y] \n\t"
"adc %D[T], %[Z] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"mov %B[T], r0 \n\t"
"add %C[T], r1 \n\t"
"adc %D[T], %[Z] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X3*Y0 -> p3:p4; sign borrow into p5..p7.
"fmulsu %D[X], %A[Y] \n\t"
"sbc %B[R], %[Z] \n\t"
"sbc %C[R], %[Z] \n\t"
"sbc %D[R], %[Z] \n\t"
"add %D[T], r0 \n\t"
"adc %A[R], r1 \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X2*Y0 -> p2:p3; shifted-out carry added into p4..p7.
"fmul %C[X], %A[Y] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"add %C[T], r0 \n\t"
"adc %D[T], r1 \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X1*Y0 -> p1:p2; shifted-out carry added into p3..p7.
"fmul %B[X], %A[Y] \n\t"
"adc %D[T], %[Z] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"add %B[T], r0 \n\t"
"adc %C[T], r1 \n\t"
"adc %D[T], %[Z] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// X0*Y0 -> p0:p1; shifted-out carry added into p2..p7. Only the high byte (r1) is
// accumulated, into p1; the low byte r0 (p0) is discarded.
"fmul %A[X], %A[Y] \n\t"
"adc %C[T], %[Z] \n\t"
"adc %D[T], %[Z] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
"add %B[T], r1 \n\t"
"adc %C[T], %[Z] \n\t"
"adc %D[T], %[Z] \n\t"
"adc %A[R], %[Z] \n\t"
"adc %B[R], %[Z] \n\t"
"adc %C[R], %[Z] \n\t"
"adc %D[R], %[Z] \n\t"
// The multiplies overwrote r1, which avr-gcc expects to hold zero; restore it before leaving.
"clr r1 \n\t"
// Outputs are all early-clobber ("&") because they are written while X and Y are still
// being read: z.value receives the result, tmp is scratch for the lower product bytes,
// zero is the register cleared at the top and used as the constant 0.
: [R]"=&r"(z.value), [T]"=&r"(tmp), [Z]"=&r"(zero) // (long int)z.value is the result, (long int)tmp is a temporary holding lower bytes of the product, zero is a scratch byte kept at 0
// Constraint "a" restricts both inputs to r16..r23, the only registers the fmul* instructions accept.
: [X]"a"(x.value), [Y]"a"(y.value) // (long int)x.value and (long int)y.value are the input variables
);
If the above code compiles, it would (provided it's bug-free) do the fixed-point multiplication in less than 160 cycles. It could even do it about twice as fast, at the cost of losing 1-2 bits of precision, by doing a 32x32->32 multiplication and not calculating the lower 4 bytes internally.
I don't believe that compiler-writers can do it much better, to be honest.
You see, 10 pushes and pops are nothing compared to the overhead of performing a 64x64 multiplication.