Another optimization question: can I speed up this 32-bit multiply?

And here's the code if I generate a 16-bit sample instead:

	sample = 0X80 ^ playpos[1];
    1e64:	81 81       	ldd	r24, Z+1	; 0x01
	sample = sample << 8;
    1e66:	38 2f       	mov	r19, r24
    1e68:	30 58       	subi	r19, 0x80	; 128
    1e6a:	20 e0       	ldi	r18, 0x00	; 0
	sample = sample | playpos[0];
    1e6c:	80 81       	ld	r24, Z
    1e6e:	48 2f       	mov	r20, r24
    1e70:	50 e0       	ldi	r21, 0x00	; 0
    1e72:	42 2b       	or	r20, r18
    1e74:	53 2b       	or	r21, r19

Now, why generate a 16-bit sample if I only need a 12-bit sample? Well, I need to do a shift later for the volume anyway, and the 8x16->24 multiply routine can handle a full 16-bit sample without overflowing (a 16-bit value times an 8-bit value always fits in 24 bits). So maybe I can shift the product right by 12 instead of by 8...

	uint32_t tmp;
	MultiU8X16to24(tmp, playing->volume, sample); // (uint32_t) tmp = (uint16_t) sample * (uint8_t) volume
    1e86:	2f 85       	ldd	r18, Y+15	; 0x0f
    1e88:	00 27       	eor	r16, r16
    1e8a:	24 9f       	mul	r18, r20
    1e8c:	c0 01       	movw	r24, r0
    1e8e:	a0 2f       	mov	r26, r16
    1e90:	b0 2f       	mov	r27, r16
    1e92:	25 9f       	mul	r18, r21
    1e94:	90 0d       	add	r25, r0
    1e96:	a1 1d       	adc	r26, r1
    1e98:	b0 1f       	adc	r27, r16
    1e9a:	11 24       	eor	r1, r1
	
	//sample = tmp >> 8; // For 12-bit sample input.  This should be optimized to just copying some bytes.		
	sample = tmp >> 12; // For 16-bit sample input.  (output 12 bit sample)
    1e9c:	2c e0       	ldi	r18, 0x0C	; 12
    1e9e:	b6 95       	lsr	r27
    1ea0:	a7 95       	ror	r26
    1ea2:	97 95       	ror	r25
    1ea4:	87 95       	ror	r24
    1ea6:	2a 95       	dec	r18
    1ea8:	d1 f7       	brne	.-12     	; 0x1e9e <__vector_13+0x12c>
    1eaa:	28 2f       	mov	r18, r24

Hm, well, how does that compare to the version where I reduced the sample to 12 bits before doing the volume adjustment?

	// Adjust 12-bit sample by volume:
	uint32_t tmp;
	MultiU8X16to24(tmp, playing->volume, sample); // (uint32_t) tmp = (uint16_t) sample * (uint8_t) volume
    1e9c:	2f 85       	ldd	r18, Y+15	; 0x0f
    1e9e:	00 27       	eor	r16, r16
    1ea0:	24 9f       	mul	r18, r20
    1ea2:	c0 01       	movw	r24, r0
    1ea4:	a0 2f       	mov	r26, r16
    1ea6:	b0 2f       	mov	r27, r16
    1ea8:	25 9f       	mul	r18, r21
    1eaa:	90 0d       	add	r25, r0
    1eac:	a1 1d       	adc	r26, r1
    1eae:	b0 1f       	adc	r27, r16
    1eb0:	11 24       	eor	r1, r1
	sample = tmp >> 8; // This should be optimized to just copying some bytes.
    1eb2:	89 2f       	mov	r24, r25
    1eb4:	9a 2f       	mov	r25, r26
    1eb6:	ab 2f       	mov	r26, r27
    1eb8:	bb 27       	eor	r27, r27
    1eba:	28 2f       	mov	r18, r24

Hm...

I wonder how the shift by 12 compares to doing a shift by 8 (on the 32-bit product) followed by a shift by 4 (on the 16-bit sample).

	// Adjust sample by volume:
	uint32_t tmp;
	MultiU8X16to24(tmp, playing->volume, sample); // (uint32_t) tmp = (uint16_t) sample * (uint8_t) volume
    1e86:	2f 85       	ldd	r18, Y+15	; 0x0f
    1e88:	00 27       	eor	r16, r16
    1e8a:	24 9f       	mul	r18, r20
    1e8c:	c0 01       	movw	r24, r0
    1e8e:	a0 2f       	mov	r26, r16
    1e90:	b0 2f       	mov	r27, r16
    1e92:	25 9f       	mul	r18, r21
    1e94:	90 0d       	add	r25, r0
    1e96:	a1 1d       	adc	r26, r1
    1e98:	b0 1f       	adc	r27, r16
    1e9a:	11 24       	eor	r1, r1
	
	sample = tmp >> 8; // For 12-bit sample input.  This should be optimized to just copying some bytes.		
    1e9c:	89 2f       	mov	r24, r25
    1e9e:	9a 2f       	mov	r25, r26
    1ea0:	ab 2f       	mov	r26, r27
    1ea2:	bb 27       	eor	r27, r27
    1ea4:	9c 01       	movw	r18, r24
	//sample = tmp >> 12; // For 16-bit sample input.  (output 12 bit sample)
	sample = sample >> 4;	// For 16-bit sample input.  (output 12 bit sample)
    1ea6:	84 e0       	ldi	r24, 0x04	; 4
    1ea8:	36 95       	lsr	r19
    1eaa:	27 95       	ror	r18
    1eac:	8a 95       	dec	r24
    1eae:	e1 f7       	brne	.-8      	; 0x1ea8 <__vector_13+0x136>