Another optimization question - can I speed up this 32 bit multiply?

Hm... I tried to get rid of one assembly instruction by having the OR operation done directly on r19, instead of copying r19 to another register and ORing that copy. But the compiler instead added a movw instruction earlier in the code to move the data into new registers anyway:

	sample = tmp >> 8; // For 12-bit sample input.  This should be optimized to just copying some bytes.		
    1e9c:	89 2f       	mov	r24, r25
    1e9e:	9a 2f       	mov	r25, r26
    1ea0:	ab 2f       	mov	r26, r27
    1ea2:	bb 27       	eor	r27, r27
    1ea4:	9c 01       	movw	r18, r24
	//sample = tmp >> 12; // For 16-bit sample input.  (output 12 bit sample)
	sample = sample >> 4;	// For 16-bit sample input.  (output 12 bit sample)
    1ea6:	84 e0       	ldi	r24, 0x04	; 4
    1ea8:	36 95       	lsr	r19
    1eaa:	27 95       	ror	r18
    1eac:	8a 95       	dec	r24
    1eae:	e1 f7       	brne	.-8      	; 0x1ea8 <__vector_13+0x136>

#endif //DVOLUME

     // Take the SS pin low to select the DAC. (Bit 4 on port B is the hardware SPI slave select pin on the Atmega1284p)
     		PORTB &= ~0b10000; 
    1eb0:	2c 98       	cbi	0x05, 4	; 5
		//SPI.transfer(tmp);

#if DVOLUME 

		//SPDR = (tmp >> 8)|0b00110000; // tmp = 12-bit.  Does the C compiler optimize out the shift here?
    		SPDR = (sample >> 8)|0b00110000; // sample = 12-bit
    1eb2:	83 2f       	mov	r24, r19
    1eb4:	80 63       	ori	r24, 0x30	; 48
    1eb6:	8e bd       	out	0x2e, r24	; 46
    		while (!(SPSR & _BV(SPIF)));
    1eb8:	0d b4       	in	r0, 0x2d	; 45
    1eba:	07 fe       	sbrs	r0, 7
    1ebc:	fd cf       	rjmp	.-6      	; 0x1eb8 <__vector_13+0x146>
		sample = tmp >> 8; // Divide by 256 to finish adjusting sample by volume.
    1e9c:	89 2f       	mov	r24, r25
    1e9e:	9a 2f       	mov	r25, r26
    1ea0:	ab 2f       	mov	r26, r27
    1ea2:	bb 27       	eor	r27, r27
	sample = sample >> 4;	// Convert 16 bit sample to 12 bit.
    1ea4:	24 e0       	ldi	r18, 0x04	; 4
    1ea6:	96 95       	lsr	r25
    1ea8:	87 95       	ror	r24
    1eaa:	2a 95       	dec	r18
    1eac:	e1 f7       	brne	.-8      	; 0x1ea6 <__vector_13+0x134>
		// It is 3x faster to shift by 8 and then by 4 than to let the compiler figure out that it can do the same for a single shift by 12!

#endif //DVOLUME

     // Take the SS pin low to select the DAC. (Bit 4 on port B is the hardware SPI slave select pin on the Atmega1284p)
     		PORTB &= ~0b10000; 
    1eae:	2c 98       	cbi	0x05, 4	; 5
		//SPI.transfer((tmp >> 8)|0b00110000);
		//SPI.transfer(tmp);

#if DVOLUME 
		
		sample = sample | 0b0011000000000000;
    1eb0:	90 63       	ori	r25, 0x30	; 48
		SPDR = sample >> 8;
    1eb2:	9e bd       	out	0x2e, r25	; 46
    		//SPDR = (sample >> 8)|0b00110000; // sample is 12-bit
    		while (!(SPSR & _BV(SPIF)));
    1eb4:	0d b4       	in	r0, 0x2d	; 45
    1eb6:	07 fe       	sbrs	r0, 7
    1eb8:	fd cf       	rjmp	.-6      	; 0x1eb4 <__vector_13+0x142>