Most of the "savings" from using an 8-bit index come from the preliminary math BEFORE the copy (after all, the copy itself used to be only one byte).
00000036 <memx_o>:
uint8_t buffer[BUFSIZE];
uint16_t bufd16;
uint8_t bufd8;
void memx_o(uint8_t *src, uint16_t n)
{
36: dc 01 movw r26, r24
uint8_t *limit = src + n;
38: 68 0f add r22, r24
3a: 79 1f adc r23, r25
uint8_t *dst = &buffer[++bufd16 % BUFSIZE];
3c: 80 91 00 00 lds r24, 0x0000
40: 90 91 00 00 lds r25, 0x0000
44: 01 96 adiw r24, 0x01 ; 1
46: 90 93 00 00 sts 0x0000, r25
4a: 80 93 00 00 sts 0x0000, r24
4e: fc 01 movw r30, r24
50: ef 73 andi r30, 0x3F ; 63
52: f0 70 andi r31, 0x00 ; 0
54: e0 50 subi r30, 0x00 ; 0
56: f0 40 sbci r31, 0x00 ; 0
58: 00 c0 rjmp .+0 ; 0x5a <memx_o+0x24>
while (src < limit) {
*dst++ = *src++;
5a: 8d 91 ld r24, X+
5c: 81 93 st Z+, r24
5e: a6 17 cp r26, r22
60: b7 07 cpc r27, r23
62: 00 f0 brcs .+0 ; 0x64 <memx_o+0x2e>
*dst++ = *src++;
}
}
64: 08 95 ret
vs. the version that uses an 8-bit index and an 8-bit count:
00000066 <memx>:
uint8_t bufs;
void memx(uint8_t *src, uint8_t n)
{
66: dc 01 movw r26, r24
uint8_t i = ++bufd8 % BUFSIZE;
68: 80 91 00 00 lds r24, 0x0000
6c: 8f 5f subi r24, 0xFF ; 255
6e: 80 93 00 00 sts 0x0000, r24
uint8_t *dst = &buffer[i];
72: 8f 73 andi r24, 0x3F ; 63
74: e8 2f mov r30, r24
76: f0 e0 ldi r31, 0x00 ; 0
78: e0 50 subi r30, 0x00 ; 0
7a: f0 40 sbci r31, 0x00 ; 0
do {
*dst++ = *src++;
7c: 8d 91 ld r24, X+
7e: 81 93 st Z+, r24
} while (--n);
80: 61 50 subi r22, 0x01 ; 1
82: 01 f4 brne .+0 ; 0x84 <memx+0x1e>
}
84: 08 95 ret