This produced a time of 212 ± 4 us:
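I couldn't line up more than 48 stores in a row while still having the count divide 1536 evenly, most likely because of the relative branch at the end: `brcs` can only reach 64 words backwards, and the next divisor of 1536 (64 stores, plus the two compare instructions) would put the loop label out of range.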
// Time how long it takes to zero the 1536 bytes starting at `screen` using an
// unrolled run of post-increment stores through the Z pointer (r31:r30).
start = micros();
asm volatile(
// r1 is the store source. It is avr-gcc's zero register, so clearing it here
// leaves it at the value compiled code expects.
"eor r1, r1 \n\t"
// Z = address of screen
"ldi r30, lo8(screen) \n\t"
"ldi r31, hi8(screen) \n\t"
// r24 = high byte of the end address; cpc only takes register operands, so
// the high half of the 16-bit compare below needs this in a register.
"ldi r24, hi8(screen+1536) \n\t"
"1: \n\t"
// 48 stores per pass (1536 / 48 = 32 passes). They are written out one
// statement per line instead of using an assembler repeat directive so that
// GCC's statement-count-based size estimate of this asm stays accurate.
// stores 1-8
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
// stores 9-16
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
// stores 17-24
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
// stores 25-32
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
// stores 33-40
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
// stores 41-48
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
"st Z+, r1 \n\t"
// 16-bit unsigned compare of Z against screen+1536 (low byte via cpi, high
// byte with carry via cpc); keep looping while Z is still below the end.
"cpi r30, lo8(screen+1536) \n\t"
"cpc r31, r24 \n\t"
"brcs 1b \n\t"
// Clobbers: the registers written above, plus
//   "cc"     - eor/cpi/cpc modify the status flags;
//   "memory" - the stores write memory that is not listed as an operand, so
//              the compiler must not keep values from it cached in registers
//              across this statement or move memory accesses across it.
::: "r1", "r24", "r30", "r31", "cc", "memory"
);
stop = micros();