Like PetriH said, this is more of an optimization challenge/exercise for the 8-bit AVR, trying to push the Uno to its limits (I've got a Teensy 3.0 I could use, but that's beside the point). GCC isn't really able to squeeze the cycles out of the C++ code: even the latest AVR version, 4.7.2, takes 94 cycles for the inner loop, while my hand-optimized asm is currently at 55 cycles, although there are still some bugs.
Something I have been thinking of doing is changing the loop to mix one channel at a time into a 16-bit buffer instead of mixing all channels at once into an 8-bit buffer, which would help keep the channel data in registers without needing to constantly fetch it from memory. It'll consume twice the buffer memory, but I think I should be able to handle it (e.g. halve the buffer length and call the mixing loop twice as frequently).
Regarding EEPROM: using it would be no improvement over reading the data from RAM, as I'm doing now. These audio channel constants are not constant over the lifetime of the entire program, but only for the 1/50th of a second while the mixing loop is running. I used this SMC technique for this purpose a long time ago on the 286, but it has a different memory model, which made it a feasible solution on that platform.
If anyone is curious, here's the mixer loop I'm optimizing (the optimized asm and the original C++ implementations):
void mod_player::mix_buffer_batch()
{
// Hand-written AVR mixer. Fills one batch (buffer_batch_size bytes) of m_buffer
// with the sum of all channels and advances every channel's sample position.
//
// The asm walks each audio_channel through Y with the following byte layout
// (as implied by the load sequence below):
//   +0..+2   sample_pos   (16.8 fixed point; +0 is the fraction byte)
//   +3       loaded and discarded (top byte of sample_pos)
//   +4..+5   sample address in program memory
//   +6       volume
//   +7..+8   sample_speed (8.8 fixed point)
//   +9..+10  sample_end
//   +11..+12 loop_len
//
// NOTE(review): the labels below are plain assembler symbols; if the compiler
// ever emits this body more than once (inlining/cloning) assembly will fail
// with duplicate-label errors.

// select the batch to write and flip the index for the next call
uint8_t *buf=m_buffer+(m_buffer_batch_write_idx?buffer_batch_size:0), *buf_end=buf+buffer_batch_size;
m_buffer_batch_write_idx^=1;
audio_channel *channel_begin=m_channels;
asm volatile
(
"sample_mix: \n\t"
"ldi r16, %[center_lo] \n\t" // r16-17 = res (starts at 0x80*num_channels)
"ldi r17, %[center_hi] \n\t"
"ldi r18, %[num_channels] \n\t" // r18 = channels left to mix
"movw r28, %[channel_begin] \n\t" // Y=[channel_begin]
"channel_mix: \n\t"
"ld r19, Y+ \n\t" // r19-21 = sample_pos (16.8fp)
"ld r20, Y+ \n\t"
"ld r21, Y+ \n\t"
"ld r0, Y+ \n\t" // skip the 4th sample_pos byte (value unused)
"ld r30, Y+ \n\t" // Z(r30-31) = sample_addr
"ld r31, Y+ \n\t"
"add r30, r20 \n\t" // Z = sample_addr + sample_pos>>8
"adc r31, r21 \n\t"
"lpm r22, Z\n\t" // r22 = smp (read from program memory)
"ld r23, Y+ \n\t" // r23 = vol
"mulsu r22, r23 \n\t" // r0-r1 = smp*vol (signed*unsigned)
"mov r0, r1 \n\t" // res+=(smp*vol)>>8
"lsl r1 \n\t" // ...sign bit -> carry
"sbc r1, r1 \n\t" // ...r1 = sign extension (0x00 or 0xff)
"add r16, r0 \n\t" // ...
"adc r17, r1 \n\t" // ...
"eor r0, r0 \n\t" // r0 = 0 (eor leaves the carry flag alone)
"ld r22, Y+ \n\t" // r22-r23 = sample_speed (8.8fp)
"ld r23, Y+ \n\t"
"add r19, r22 \n\t" // sample_pos+=sample_speed
"adc r20, r23 \n\t" // ...
"adc r21, r0 \n\t" // ...
"ld r22, Y+ \n\t" // r22-r23 = sample_end (ld does not touch the flags)
"ld r23, Y+ \n\t" // ...
"brcs sample_end \n\t" // sample_pos overflowed 16 bits => past the end
"cp r20, r22 \n\t" // if((sample_pos>>8)>=sample_end) goto sample_end;
"cpc r21, r23 \n\t" // ...carry is set only when pos<end
"brcc sample_end \n\t" // ...so branch on carry clear (was inverted: brcs)
"sbiw r28, 11 \n\t" // rewind Y to the start of the channel
"next_channel: \n\t"
"st Y+, r19 \n\t" // store sample pos back to memory
"st Y+, r20 \n\t" // ...
"st Y+, r21 \n\t" // ...
"adiw r28, %[channel_size]-3 \n\t" // proceed to the next channel
"dec r18 \n\t"
"brne channel_mix \n\t" // ...
"asr r17 \n\t" // res>>=2; (hard-coded for 4 channels; the C++ reference divides by modplayer_max_channels)
"ror r16 \n\t"
"asr r17 \n\t"
"brne clamp_res \n\t" // high byte still non-zero => result outside 0..255
"ror r16 \n\t"
"st X+, r16 \n\t" // *buf++=res
"cp r26, %A[buf_end] \n\t" // if(buf!=buf_end) goto sample_mix;
"cpc r27, %B[buf_end] \n\t"
"brne sample_mix \n\t"
"jmp mix_end \n\t"
"clamp_res: \n\t" // res=res<0?0:255;
"lsl r17 \n\t" // ...sign bit -> carry
"sbc r16, r16 \n\t" // ...r16 = negative ? 0xff : 0x00
"com r16 \n\t" // ...invert => negative ? 0x00 : 0xff
"st X+, r16 \n\t" // *buf++=res
"cp r26, %A[buf_end] \n\t" // if(buf!=buf_end) goto sample_mix;
"cpc r27, %B[buf_end] \n\t"
"brne sample_mix \n\t"
"jmp mix_end \n\t"
"sample_end: \n\t"
"ld r22, Y+ \n\t" // r22-23 = loop_len
"ld r23, Y+ \n\t" // ...
"sbiw r28, 13 \n\t" // rewind Y to the start of the channel
"sub r20, r22 \n\t" // sample_pos-=loop_len;
"sbc r21, r23 \n\t" // ...
"or r22, r23 \n\t" // if(loop_len) goto next_channel;
"brne next_channel \n\t"
"clr r19 \n\t" // no loop: stop the channel, sample_pos = 0
"clr r20 \n\t"
"clr r21 \n\t"
"std Y+6, r0 \n\t" // volume = 0 (r0 is still zero here)
"std Y+7, r0 \n\t" // sample_speed = 0
"std Y+8, r0 \n\t" // ...
"rjmp next_channel \n\t"
"mix_end: \n\t"
"clr r1 \n\t" // restore avr-gcc's zero register (clobbered by mulsu/sbc above)
// X is advanced by "st X+", so buf must be a read-write operand
:[buf] "+x" (buf)
:[channel_begin] "r" (channel_begin)
,[buf_end] "r" (buf_end)
// "M" = 8-bit constant (0..255), the full range ldi accepts
,[num_channels] "M" (modplayer_max_channels)
,[center_lo] "M" ((modplayer_max_channels*0x80)&0xff)
,[center_hi] "M" ((modplayer_max_channels*0x80)>>8)
// "I" = 0..63, the range adiw accepts
,[channel_size] "I" (sizeof(audio_channel))
// "memory": the asm reads and writes channel state and the output buffer
:"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r28", "r29", "r30", "r31", "memory", "cc"
);
}
//----
#if 0
void mod_player::mix_buffer_batch()
{
// mix batch of samples
uint8_t *buf=m_buffer+(m_buffer_batch_write_idx?buffer_batch_size:0), *buf_end=buf+buffer_batch_size;
m_buffer_batch_write_idx^=1;
audio_channel *channel_begin=m_channels, *channel_end=m_channels+modplayer_max_channels;
do
{
// mix sample and advance all channels
int16_t res=0x80*modplayer_max_channels;
audio_channel *channel=channel_begin;
do
{
// mix channel sample and advance sample position
int8_t smp=(int8_t)pgm_read_byte(channel->sample+(channel->sample_pos>>8));
uint8_t vol=channel->volume;
res+=(smp*vol)>>8;
channel->sample_pos+=channel->sample_speed;
if((channel->sample_pos>>8)>=channel->sample_end)
{
channel->sample_pos-=long(channel->loop_len)<<8;
if(!channel->loop_len)
{
channel->sample_pos=0;
channel->sample_speed=0;
channel->volume=0;
}
}
} while(++channel!=channel_end);
// clip sample and write it to the buffer
res/=modplayer_max_channels;
*buf++=res<0?0:res>0xff?0xff:res;
} while(buf!=buf_end);
}
//----
#endif