And here's the code if I generate a 16-bit sample instead:
sample = 0X80 ^ playpos[1];
1e64: 81 81 ldd r24, Z+1 ; 0x01
sample = sample << 8;
1e66: 38 2f mov r19, r24
1e68: 30 58 subi r19, 0x80 ; 128
1e6a: 20 e0 ldi r18, 0x00 ; 0
sample = sample | playpos[0];
1e6c: 80 81 ld r24, Z
1e6e: 48 2f mov r20, r24
1e70: 50 e0 ldi r21, 0x00 ; 0
1e72: 42 2b or r20, r18
1e74: 53 2b or r21, r19
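Now, why generate a 16-bit sample if I only need a 12-bit sample? Well, I need to do a shift later for the volume anyway, and the 8x16->24 multiply routine can handle a full 16-bit sample without overflowing (a 16-bit value times an 8-bit value always fits in 24 bits)... So maybe if I shift the product right by 12 instead of by 8, I end up with the 12-bit result directly...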
uint32_t tmp;
MultiU8X16to24(tmp, playing->volume, sample); // (uint32_t) tmp = (uint16_t) sample * (uint8_t) volume
1e86: 2f 85 ldd r18, Y+15 ; 0x0f
1e88: 00 27 eor r16, r16
1e8a: 24 9f mul r18, r20
1e8c: c0 01 movw r24, r0
1e8e: a0 2f mov r26, r16
1e90: b0 2f mov r27, r16
1e92: 25 9f mul r18, r21
1e94: 90 0d add r25, r0
1e96: a1 1d adc r26, r1
1e98: b0 1f adc r27, r16
1e9a: 11 24 eor r1, r1
//sample = tmp >> 8; // For 12-bit sample input. This should be optimized to just copying some bytes.
sample = tmp >> 12; // For 16-bit sample input. (output 12 bit sample)
1e9c: 2c e0 ldi r18, 0x0C ; 12
1e9e: b6 95 lsr r27
1ea0: a7 95 ror r26
1ea2: 97 95 ror r25
1ea4: 87 95 ror r24
1ea6: 2a 95 dec r18
1ea8: d1 f7 brne .-12 ; 0x1e9e <__vector_13+0x12c>
1eaa: 28 2f mov r18, r24
Hm, well, how does that compare to when I shifted the sample down to 12 bits before I did the volume adjustment?
// Adjust 12-bit sample by volume:
uint32_t tmp;
MultiU8X16to24(tmp, playing->volume, sample); // (uint32_t) tmp = (uint16_t) sample * (uint8_t) volume
1e9c: 2f 85 ldd r18, Y+15 ; 0x0f
1e9e: 00 27 eor r16, r16
1ea0: 24 9f mul r18, r20
1ea2: c0 01 movw r24, r0
1ea4: a0 2f mov r26, r16
1ea6: b0 2f mov r27, r16
1ea8: 25 9f mul r18, r21
1eaa: 90 0d add r25, r0
1eac: a1 1d adc r26, r1
1eae: b0 1f adc r27, r16
1eb0: 11 24 eor r1, r1
sample = tmp >> 8; // This should be optimized to just copying some bytes.
1eb2: 89 2f mov r24, r25
1eb4: 9a 2f mov r25, r26
1eb6: ab 2f mov r26, r27
1eb8: bb 27 eor r27, r27
1eba: 28 2f mov r18, r24
Hm...
I wonder how the single shift by 12 compares to doing a shift by 8 followed by a shift by 4?
// Adjust sample by volume:
uint32_t tmp;
MultiU8X16to24(tmp, playing->volume, sample); // (uint32_t) tmp = (uint16_t) sample * (uint8_t) volume
1e86: 2f 85 ldd r18, Y+15 ; 0x0f
1e88: 00 27 eor r16, r16
1e8a: 24 9f mul r18, r20
1e8c: c0 01 movw r24, r0
1e8e: a0 2f mov r26, r16
1e90: b0 2f mov r27, r16
1e92: 25 9f mul r18, r21
1e94: 90 0d add r25, r0
1e96: a1 1d adc r26, r1
1e98: b0 1f adc r27, r16
1e9a: 11 24 eor r1, r1
sample = tmp >> 8; // For 12-bit sample input. This should be optimized to just copying some bytes.
1e9c: 89 2f mov r24, r25
1e9e: 9a 2f mov r25, r26
1ea0: ab 2f mov r26, r27
1ea2: bb 27 eor r27, r27
1ea4: 9c 01 movw r18, r24
//sample = tmp >> 12; // For 16-bit sample input. (output 12 bit sample)
sample = sample >> 4; // For 16-bit sample input. (output 12 bit sample)
1ea6: 84 e0 ldi r24, 0x04 ; 4
1ea8: 36 95 lsr r19
1eaa: 27 95 ror r18
1eac: 8a 95 dec r24
1eae: e1 f7 brne .-8 ; 0x1ea8 <__vector_13+0x136>