Who cares about discussions from long ago? I learned this many years ago from my mentor, a great Nobel Prize-winning physicist:
One experimentally verified fact is worth more than a thousand opinions.
Here are three ways to copy memory; the copy loop takes 8 cycles per iteration in all three. This proves that the advantage of using an 8-bit index to access memory is mostly a myth.
My favorite, the first one, is the smallest.
void mem(uint8_t* dst, uint8_t* src, uint16_t n) {
0: fc 01 movw r30, r24
uint8_t* limit = src + n;
2: 46 0f add r20, r22
4: 57 1f adc r21, r23
6: db 01 movw r26, r22
while(src < limit) *dst++ = *src++;
8: 00 c0 rjmp .+0 ; 0xa <_Z3memPhS_j+0xa>
a: 8d 91 ld r24, X+
c: 81 93 st Z+, r24
e: a4 17 cp r26, r20
10: b5 07 cpc r27, r21
12: 00 f0 brcs .+0 ; 0x14 <_Z3memPhS_j+0x14>
}
14: 08 95 ret
Using an 8-bit count doesn't help:
void mem(uint8_t* dst, uint8_t* src, uint8_t n) {
0: fc 01 movw r30, r24
uint8_t* limit = src + n;
2: cb 01 movw r24, r22
4: 84 0f add r24, r20
6: 91 1d adc r25, r1
8: db 01 movw r26, r22
while(src < limit) *dst++ = *src++;
a: 00 c0 rjmp .+0 ; 0xc <_Z3memPhS_h+0xc>
c: 2d 91 ld r18, X+
e: 21 93 st Z+, r18
10: a8 17 cp r26, r24
12: b9 07 cpc r27, r25
14: 00 f0 brcs .+0 ; 0x16 <_Z3memPhS_h+0x16>
16: 08 95 ret
}
Trying hard to take advantage of the 8-bit index makes matters worse:
void mem1(uint8_t* dst, uint8_t* src, uint8_t n) {
for (uint8_t i = 0; i < n; i++) dst[i] = src[i];
0: 26 2f mov r18, r22
2: 37 2f mov r19, r23
4: f9 01 movw r30, r18
6: 28 2f mov r18, r24
8: 39 2f mov r19, r25
a: d9 01 movw r26, r18
c: 80 e0 ldi r24, 0x00 ; 0
e: 00 c0 rjmp .+0 ; 0x10 <_Z4mem1PhS_h+0x10>
10: 91 91 ld r25, Z+
12: 9d 93 st X+, r25
14: 8f 5f subi r24, 0xFF ; 255
16: 84 17 cp r24, r20
18: 00 f0 brcs .+0 ; 0x1a <_Z4mem1PhS_h+0x1a>
1a: 08 95 ret
}
Edit: Suppose after seeing this you try to be really clever and do two bytes per loop like this:
/*
 * Copy n bytes from src to dst, moving two bytes per loop iteration.
 *
 * dst - destination pointer, advanced as bytes are written
 * src - source pointer, advanced as bytes are read
 * n   - number of bytes to copy
 *
 * Bytes are copied in ascending address order.
 */
void mem(uint8_t* dst, uint8_t* src, uint16_t n) {
/* One-past-the-end address of the source range; the loop stops here. */
uint8_t* limit = src + n;
/* Odd count: copy a single byte first so the remaining count is even. */
if (n & 1) *dst++ = *src++;
/* The remaining count is even, so copying in pairs lands exactly on limit. */
while(src < limit) {
*dst++ = *src++;
*dst++ = *src++;
}
}
Not a good idea after all:
void mem(uint8_t* dst, uint8_t* src, uint16_t n) {
0: fc 01 movw r30, r24
2: db 01 movw r26, r22
uint8_t* limit = src + n;
4: cb 01 movw r24, r22
6: 84 0f add r24, r20
8: 95 1f adc r25, r21
if (n & 1) *dst++ = *src++;
a: 40 ff sbrs r20, 0
c: 00 c0 rjmp .+0 ; 0xe <_Z3memPhS_j+0xe>
e: 2d 91 ld r18, X+
10: 21 93 st Z+, r18
12: 00 c0 rjmp .+0 ; 0x14 <_Z3memPhS_j+0x14>
while(src < limit) {
*dst++ = *src++;
14: 2c 91 ld r18, X
16: 20 83 st Z, r18
*dst++ = *src++;
18: 11 96 adiw r26, 0x01 ; 1
1a: 2c 91 ld r18, X
1c: 11 97 sbiw r26, 0x01 ; 1
1e: 21 83 std Z+1, r18 ; 0x01
20: 32 96 adiw r30, 0x02 ; 2
22: 12 96 adiw r26, 0x02 ; 2
24: a8 17 cp r26, r24
26: b9 07 cpc r27, r25
28: 00 f0 brcs .+0 ; 0x2a <_Z3memPhS_j+0x2a>
}
}
2a: 08 95 ret