Another optimization question - can I speed up this 32 bit multiply?

So I was looking at that asm function that multiplies an 8-bit value by a 16-bit value (8 × 16 → 24 bits)... I've got it partially figured out:

/*
 * MultiU8X16to24(longRes, charIn1, intIn2)
 *
 * AVR (avr-gcc inline asm) unsigned multiply: longRes = charIn1 * intIn2.
 *
 *   longRes  - 32-bit lvalue that receives the product (all four bytes are
 *              written).
 *   charIn1  - 8-bit unsigned factor (only byte 0 of the operand is used).
 *   intIn2   - 16-bit unsigned factor (bytes 0 and 1 of the operand are used).
 *
 * An 8-bit times 16-bit product always fits in 24 bits
 * (0xFF * 0xFFFF = 0xFEFF01), so byte 3 of longRes always ends up 0.
 *
 * Operand notation: %0 is longRes, %1 is charIn1, %2 is intIn2.  The letter
 * picks one byte of a multi-byte operand: %A = byte 0 (least significant),
 * %B = byte 1, %C = byte 2, %D = byte 3.
 *
 * Constraints and clobbers:
 *   "=&r"  longRes is write-only ('=') in general registers; '&' marks it
 *          early-clobber, i.e. it is written before the inputs have been
 *          read for the last time, so it must not share registers with them.
 *   "a"    each input is placed in the simple upper registers (r16..r23).
 *   "r16"  the code overwrites r16, so the compiler must not keep a live
 *          value (or an operand) there.
 *
 * r0/r1: MUL always leaves its 16-bit result in r1:r0.  avr-gcc uses r0 as a
 * scratch register and expects r1 to hold zero, which is why r1 is cleared
 * again at the end and why a separate register (r16) serves as the zero
 * source while the multiplies are in flight.
 *
 * NOTE: comments inside the macro body must be block comments placed BEFORE
 * the trailing backslash; anything after the backslash (or a line comment)
 * terminates or swallows the continuation and breaks the macro.
 */
#define MultiU8X16to24(longRes, charIn1, intIn2) \
asm volatile ( \
"clr r16 \n\t"        /* r16 = 0: zero source that survives MUL (r1 does not) */ \
"mul %A1, %A2 \n\t"   /* r1:r0 = charIn1 * low byte of intIn2 */ \
"movw %A0, r0 \n\t"   /* copy the register PAIR r1:r0 into result bytes 1:0 */ \
"mov %C0, r16 \n\t"   /* result byte 2 = 0 */ \
"mov %D0, r16 \n\t"   /* result byte 3 = 0 */ \
"mul %A1, %B2 \n\t"   /* r1:r0 = charIn1 * high byte of intIn2 */ \
"add %B0, r0 \n\t"    /* byte 1 += low half of that product; sets carry on overflow */ \
"adc %C0, r1 \n\t"    /* byte 2 += high half + carry out of the ADD above */ \
"adc %D0, r16 \n\t"   /* byte 3 += 0 + carry out of the previous ADC */ \
"clr r1 \n\t"         /* restore r1 = 0, as compiler-generated code expects */ \
: \
"=&r" (longRes) \
: \
"a" (charIn1), \
"a" (intIn2) \
: \
"r16" \
)