Another optimization question - can I speed up this 32 bit multiply?

New and improved hardware SPI DAC update code for WaveHC, with fine volume control:

(The `volume` member of the wave class needs to be a uint8_t; change it if it isn't. I forget what type it defaulted to.)

// MultiU8X16to24(longRes, charIn1, intIn2)
//
// AVR inline-assembly multiply: longRes = charIn1 * intIn2, where the low byte
// of charIn1 is treated as an unsigned 8-bit factor and the low two bytes of
// intIn2 as an unsigned 16-bit factor. longRes is written as a 4-byte value;
// since 255 * 65535 < 2^24 its top byte always ends up 0.
//
// How it works:
//   1. r16 is cleared and used as a zero constant. (r1, the register avr-gcc
//      normally keeps at zero, cannot be used because MUL writes its 16-bit
//      product to r1:r0.)
//   2. charIn1 * (low byte of intIn2) is copied into result bytes A:B, and
//      result bytes C and D are zeroed from r16.
//   3. charIn1 * (high byte of intIn2) is added into result bytes B:C, with
//      the final carry folded into byte D.
//   4. r1 is cleared again so the compiler's "r1 == 0" assumption holds after
//      the macro.
//
// Operand notes:
//   - "=&r": the output is early-clobber, so it never shares registers with
//     the inputs (the output is written before the inputs are last read).
//   - "a": both inputs are placed in avr-gcc's "simple upper" register class.
//   - r16 is listed as clobbered because the code overwrites it.
//
// Usage notes:
//   - Beware of hidden whitespace after the line-continuation backslashes in
//     the define: a backslash must be the very last character on its line.
//   - The macro body has no trailing ';' because the call site supplies one:
//     MultiU8X16to24(result, a, b);

#define MultiU8X16to24(longRes, charIn1, intIn2) \
asm volatile ( \
"clr r16 \n\t" \
"mul %A1, %A2 \n\t" \
"movw %A0, r0 \n\t" \
"mov %C0, r16 \n\t" \
"mov %D0, r16 \n\t" \
"mul %A1, %B2 \n\t" \
"add %B0, r0 \n\t" \
"adc %C0, r1 \n\t" \
"adc %D0, r16 \n\t" \
"clr r1 \n\t" \
: \
"=&r" (longRes) \
: \
"a" (charIn1), \
"a" (intIn2) \
: \
"r16" \
)


//------------------------------------------------------------------------------
// timer interrupt for DAC
ISR(TIMER1_COMPA_vect) {
  if (!playing) return;

  if (playpos >= playend) { // If we've reached the end of the buffer...
    if (sdstatus == SD_READY) {
    
      // swap double buffers
      playpos = sdbuff;
      playend = sdend;
      sdbuff = sdbuff != buffer1 ? buffer1 : buffer2;
      
      sdstatus = SD_FILLING;
      // interrupt to call SD reader
	    TIMSK1 |= _BV(OCIE1B);
    }
    else if (sdstatus == SD_END_FILE) {
      playing->stop();
      return;
    }
    else {
      // count overrun error if not at end of file
      if (playing->remainingBytesInChunk) {
        playing->errors++;
      }
      return;
    }
  }

  //uint8_t dh, dl;

  uint16_t sample;

  if (playing->BitsPerSample == 16) {
  
    // 16-bit is signed
    //dh = 0X80 ^ playpos[1]; // Flip most significant bit. ^ = XOR, and XOR sets bits that are the same to be 0, and different to be 1.  So xoring 101 with 000 gives 101.  While xoring 101 with 100 gives 001.  I guess fipping this one bit is all that is needed to convert from signed to unsigned! 
    //dl = playpos[0];
    
	// Calculate 16-bit sample:

		sample = 0X80 ^ playpos[1];
		sample = sample << 8;
		sample = sample | playpos[0];

	playpos += 2;

  }
  else {
  
    // 8-bit is unsigned
    //dh = playpos[0];
    //dl = 0;

	// Calculate 16-bit sample:
		sample = playpos[0]<<8;   
	
    playpos++;

  }
  
#if DVOLUME

	// dh, dl -> 16 bit tmp:
	//uint16_t tmp = (dh << 8) | dl;
	//tmp >>= playing->volume;
	
	// dh, dl -> 12 bit tmp:
 	//uint32_t tmp = (dh << 4) | (dl >> 4);
	//tmp = (tmp * playing->volume) >> 10; // Volume is 10 bit, so 0..1023.  4095*1023 / 1024 = 4091 max. 

	// Adjust sample by volume:
		uint32_t tmp;
		MultiU8X16to24(tmp, playing->volume, sample); // (uint32_t) tmp = (uint16_t) sample * (uint8_t) volume
	
		sample = tmp >> 8; // Divide by 256 to finish adusting sample by volume.   	
	sample = sample >> 4;	// Convert 16 bit sample to 12 bit.

		// It is 3x faster to shift by 8 and then a shift by 4 than to let the compiler figure out it can do the same with a shift by 12!

#endif //DVOLUME

     // Take the SS pin low to select the DAC. (Bit 4 on port B is the hardware SPI slave select pin on the Atmega1284p)
     		PORTB &= ~0b10000; 

     // Send 16 bits to the DAC.
	// First 4 bits are configuration bits. 0011 = (DAC A, unbuffered, 1x gain, not muted)
	// Last 12 bits specify the output voltage. (MSB first)  
	
		//SPI.transfer((tmp >> 8)|0b00110000);
		//SPI.transfer(tmp);

#if DVOLUME 

    		SPDR = (sample >> 8)|0b00110000; // sample is 12-bit
    		while (!(SPSR & _BV(SPIF)));
		
    		SPDR = sample; //SPDR = tmp;
    		while (!(SPSR & _BV(SPIF)));

#else

		// This code is broken because I no longer store dh and dl seperately.

    		SPDR = 0b00110000 | (dh >> 4);
    		while (!(SPSR & _BV(SPIF)));
		
    		SPDR = (dh << 4) | (dl >> 4);
    		while (!(SPSR & _BV(SPIF)));

#endif

     // Take the SS pin high to de-select the DAC:
      	PORTB |= 0b10000;

}

This could probably be optimized quite a bit more if pure assembler were used starting from the point where the sample data is loaded into the sample variable. If some of those shifts could be removed, that would save a lot of cycles.