Hm... I tried to get rid of one assembly instruction by having the OR operation done directly on r19, instead of copying it to another register and doing the OR there, but the compiler instead added a movw instruction earlier in the code to move the data into new registers anyway:
sample = tmp >> 8; // For 12-bit sample input. This should be optimized to just copying some bytes.
1e9c: 89 2f mov r24, r25
1e9e: 9a 2f mov r25, r26
1ea0: ab 2f mov r26, r27
1ea2: bb 27 eor r27, r27
1ea4: 9c 01 movw r18, r24
//sample = tmp >> 12; // For 16-bit sample input. (output 12 bit sample)
sample = sample >> 4; // For 16-bit sample input. (output 12 bit sample)
1ea6: 84 e0 ldi r24, 0x04 ; 4
1ea8: 36 95 lsr r19
1eaa: 27 95 ror r18
1eac: 8a 95 dec r24
1eae: e1 f7 brne .-8 ; 0x1ea8 <__vector_13+0x136>
#endif //DVOLUME
// Take the SS pin low to select the DAC. (Bit 4 on port B is the hardware SPI slave select pin on the Atmega1284p)
PORTB &= ~0b10000;
1eb0: 2c 98 cbi 0x05, 4 ; 5
//SPI.transfer(tmp);
#if DVOLUME
//SPDR = (tmp >> 8)|0b00110000; // tmp = 12-bit. Does the C compiler optimize out the shift here?
SPDR = (sample >> 8)|0b00110000; // sample = 12-bit
1eb2: 83 2f mov r24, r19
1eb4: 80 63 ori r24, 0x30 ; 48
1eb6: 8e bd out 0x2e, r24 ; 46
while (!(SPSR & _BV(SPIF)));
1eb8: 0d b4 in r0, 0x2d ; 45
1eba: 07 fe sbrs r0, 7
1ebc: fd cf rjmp .-6 ; 0x1eb8 <__vector_13+0x146>
sample = tmp >> 8; // Divide by 256 to finish adjusting sample by volume.
1e9c: 89 2f mov r24, r25
1e9e: 9a 2f mov r25, r26
1ea0: ab 2f mov r26, r27
1ea2: bb 27 eor r27, r27
sample = sample >> 4; // Convert 16 bit sample to 12 bit.
1ea4: 24 e0 ldi r18, 0x04 ; 4
1ea6: 96 95 lsr r25
1ea8: 87 95 ror r24
1eaa: 2a 95 dec r18
1eac: e1 f7 brne .-8 ; 0x1ea6 <__vector_13+0x134>
// It is 3x faster to shift by 8 and then shift by 4 than to let the compiler figure out it can do the same with a shift by 12!
#endif //DVOLUME
// Take the SS pin low to select the DAC. (Bit 4 on port B is the hardware SPI slave select pin on the Atmega1284p)
PORTB &= ~0b10000;
1eae: 2c 98 cbi 0x05, 4 ; 5
//SPI.transfer((tmp >> 8)|0b00110000);
//SPI.transfer(tmp);
#if DVOLUME
sample = sample | 0b0011000000000000;
1eb0: 90 63 ori r25, 0x30 ; 48
SPDR = sample >> 8;
1eb2: 9e bd out 0x2e, r25 ; 46
//SPDR = (sample >> 8)|0b00110000; // sample is 12-bit
while (!(SPSR & _BV(SPIF)));
1eb4: 0d b4 in r0, 0x2d ; 45
1eb6: 07 fe sbrs r0, 7
1eb8: fd cf rjmp .-6 ; 0x1eb4 <__vector_13+0x142>