Another optimization question - can I speed up this 32 bit multiply?

What the hell is going on here?

#define MultiU16X16to32(longRes, intIn1, intIn2) \
asm volatile ( \
"clr r26 \n\t" \
"mul %A1, %A2 \n\t" \
"movw %A0, r0 \n\t" \
"mul %B1, %B2 \n\t" \
"movw %C0, r0 \n\t" \
"mul %B2, %A1 \n\t" \
"add %B0, r0 \n\t" \
"adc %C0, r1 \n\t" \
"adc %D0, r26 \n\t" \
"mul %B1, %A2 \n\t" \
"add %B0, r0 \n\t" \
"adc %C0, r1 \n\t" \
"adc %D0, r26 \n\t" \
"clr r1 \n\t" \
: \
"=&r" (longRes) \
: \
"a" (intIn1), \
"a" (intIn2) \
: \
"r26" \
) 



#define MultiU8X16to24(longRes, charIn1, intIn2) \ 
asm volatile ( \
"clr r16 \n\t" \       
"mul %A1, %A2 \n\t" \   
"movw %A0, r0 \n\t" \   
"mov %C0, r16 \n\t" \   
"mov %D0, r16 \n\t" \   
"mul %A1, %B2 \n\t" \  
"add %B0, r0 \n\t" \   
"adc %C0, r1 \n\t" \   
"adc %D0, r16 \n\t" \  
"clr r1 \n\t" \
: \                  
"=&r" (longRes) \  
: \                
"a" (charIn1), \ 
"a" (intIn2) \
: \                
"r16" \            
)

I was getting an error in the second define: "expected ) before : token". To narrow it down, I copied another, nearly identical example assembly macro and pasted it before the broken one. The copied macro compiles without errors, but the second one still produces the same error.