ASSEMBLER HELP : Fast clear of an array

Hello all

I have dabbled with assembler on arduino (uno etc) but actually its more luck than judgement, trial and error of adjusting other assembly code I have seen others do

What I need is a very simple piece of code that will clear an array that has 1536 elements in it

for instance

my array is screen[..]

in C it would just be

for (uint16_t x; x<1536; x++) {
   screen[x]=0;
}

But this just isnt fast enough

Can anyone help PLEASE ??

I don't have access to my system, but does:

  memset(screen, 0, sizeof(screen));

work any faster? Often the mem*() functions are hand-tweaked assembler, but I don't know if that's true for the C++ compiler. They could be macros that won't help, but it's worth a try.

the proof is in the pudding test

uint32_t start;
uint32_t stop;

byte screen[1536];

void setup() 
{
  Serial.begin(115200);
  Serial.println("Start ");

  start = micros();
  for (uint16_t x = 0; x<1536; x++)   // updated, see post el_supremo (Thanks)
  {
    screen[x]=0;
  }
  stop = micros();
  Serial.println(stop-start);

  start = micros();
  memset(screen, 0, sizeof(screen));
  stop = micros();
  Serial.println(stop-start);
}

void loop() 
{
}

Start
704
596

so memset is 15.6% faster (tested on UNO)

Does this

  for (uint16_t x; x<1536; x++)

implicitly set x to zero? If not, you aren't necessarily measuring the right thing.

Pete

@el_supremo (you are right - I'll fix that code - although the timing is right)

extended test with pointer math + including check if all is really set to zero.

uint32_t start;
uint32_t stop;

byte screen[1536];

void setup() 
{
  Serial.begin(115200);
  Serial.println("Start ");

  start = micros();
  for (uint16_t x=0; x<1536; x++) 
  {
    screen[x] = 0;
  }
  stop = micros();
  Serial.print("reference:\t");
  Serial.println(stop-start);
  for (uint16_t x=0; x<1536; x++) 
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] = 1;
  }
  Serial.println();


  start = micros();
  memset(screen, 0, sizeof(screen));
  stop = micros();
  Serial.print("memset:\t\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++) 
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] = 1;
  }
  Serial.println();


  start = micros();
  byte *r =  &screen[0] ;
  while(r < screen+1536) { 
    *r = 0;
    r++;
  } 
  stop = micros();
  Serial.print("08 bit ptr:\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++) 
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] == 1;
  }
  Serial.println();


  start = micros();
  uint16_t *p = (uint16_t *) &screen[0] ;
  while(p < (uint16_t *)(screen+1536)) { 
    *p = (uint16_t)0;
    p++;
  } 
  stop = micros();
  Serial.print("16 bit ptr:\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++) 
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] == 1;
  }
  Serial.println();

  start = micros();
  uint32_t *q = (uint32_t *) &screen[0] ;
  while(q < (uint32_t *)(screen+1536)) { 
    *q = (uint32_t)0;
    q++;
  } 
  stop = micros();
  Serial.print("32 bit ptr:\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++) 
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] = 1;
  }
  Serial.println();
}

void loop() 
{
}

Start
reference: 708

memset: 588

08 bit ptr: 684

16 bit ptr: 448

32 bit ptr: 316

So optimized 32 bits pointer version is more than twice as fast the reference.
This is partly because the addresses of screen and screen+1536 are multiples of 4.

32 bit ptr code wrapped into a function.

void memclr(byte * addr, uint16_t size)
{
  uint32_t *p = (uint32_t *)addr;
  while( p < (uint32_t *)(addr + size) ) *p++ = (uint32_t)0;
}

result: 392 (slower than the hard coded 32bit ptr code but at least reusable)

Note it does only handle multiples of 4!

The fastest version:

  uint32_t *q = (uint32_t *) &screen[0] ;
  while(q < (uint32_t *)(screen+1536)) {
    *q = (uint32_t)0;
    q++;
  }

generated this code:

	ldi	r30, 0x5C	; get pointer to screen[]
	ldi	r31, 0x01	; 
	rjmp	test      	; test to see if we're already done


loop:	st	Z+, r1		; set 4 sequential bytes to R1 (=0)
	st	Z+, r1
	st	Z+, r1
	st	Z+, r1
test:	ldi	r24, 0x07	; get high byte of last location in screen[]
	cpi	r30, 0x5C	; compare low byte to index Z low byte
	cpc	r31, r24	; compare high byte to index Z
	brcs	loop		;

I think you could squeeze more if you were willing to hand code in assembly.

*q++ = 0UL; surely?

Typically, counting down to zero is faster than counting up.

void memclr(byte * addr, uint16_t size)
{
  uint32_t *p = (uint32_t *)addr;
  uint32_t *q = (uint32_t *)(addr + size);
  while( --q >= p ) *q = 0UL;
}

result ==> 364 (same order)

 start = micros();
  uint64_t *t = (uint64_t *)screen;
  while(t < (uint64_t *)(screen+1536)) { 
    *t++ = 0ULL;
  } 
  stop = micros();
  Serial.print(F("64 bit+ptr:\t"));
  Serial.println(stop-start);

64 bit+ptr: 284

// works because 1536 is a multiple of 64 (24x64)...
// in fact this is loop unrolling...

284/708 = 40% of the reference time

mcnobby:
What I need is a very simple piece of code that will clear an array that has 1536 elements in it

Why?

This produced a time of 212 ± 4 us:

I couldn't line up more than 48 stores in a row while still being a multiple of 1536, maybe because of the relative jump at the end.

  start = micros();
  
  asm(
    "eor r1, r1 \n\t"
    "ldi r30, lo8(screen) \n\t"
    "ldi r31, hi8(screen) \n\t"
    "ldi r24, hi8(screen+1536) \n\t"
 "1:                           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"

    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "cpi r30, lo8(screen+1536) \n\t"
    "cpc r31, r24 \n\t"
    "brcs 1b      \n\t"
     ::: "r1", "r24", "r30", "r31"
  );

  stop = micros();

Loop unrolling, eh?

uint32_t *foo = (uint32_t *) &screen[0] ;

memset ((void *) screen, 1, sizeof (screen));

start = micros();

foo [0] = 0;
foo [1] = 0;
foo [2] = 0;
foo [3] = 0;
foo [4] = 0;
foo [5] = 0;
foo [6] = 0;
foo [7] = 0;
foo [8] = 0;
foo [9] = 0;
foo [10] = 0;
foo [11] = 0;
foo [12] = 0;
foo [13] = 0;
foo [14] = 0;
foo [15] = 0;
foo [16] = 0;
foo [17] = 0;
foo [18] = 0;
foo [19] = 0;
foo [20] = 0;
foo [21] = 0;
foo [22] = 0;
foo [23] = 0;
foo [24] = 0;
foo [25] = 0;
foo [26] = 0;
foo [27] = 0;
foo [28] = 0;
foo [29] = 0;
foo [30] = 0;
foo [31] = 0;
foo [32] = 0;
foo [33] = 0;
foo [34] = 0;
foo [35] = 0;
foo [36] = 0;
foo [37] = 0;
foo [38] = 0;
foo [39] = 0;
foo [40] = 0;
foo [41] = 0;
foo [42] = 0;
foo [43] = 0;
foo [44] = 0;
foo [45] = 0;
foo [46] = 0;
foo [47] = 0;
foo [48] = 0;
foo [49] = 0;
foo [50] = 0;
foo [51] = 0;
foo [52] = 0;
foo [53] = 0;
foo [54] = 0;
foo [55] = 0;
foo [56] = 0;
foo [57] = 0;
foo [58] = 0;
foo [59] = 0;
foo [60] = 0;
foo [61] = 0;
foo [62] = 0;
foo [63] = 0;
foo [64] = 0;
foo [65] = 0;
foo [66] = 0;
foo [67] = 0;
foo [68] = 0;
foo [69] = 0;
foo [70] = 0;
foo [71] = 0;
foo [72] = 0;
foo [73] = 0;
foo [74] = 0;
foo [75] = 0;
foo [76] = 0;
foo [77] = 0;
foo [78] = 0;
foo [79] = 0;
foo [80] = 0;
foo [81] = 0;
foo [82] = 0;
foo [83] = 0;
foo [84] = 0;
foo [85] = 0;
foo [86] = 0;
foo [87] = 0;
foo [88] = 0;
foo [89] = 0;
foo [90] = 0;
foo [91] = 0;
foo [92] = 0;
foo [93] = 0;
foo [94] = 0;
foo [95] = 0;
foo [96] = 0;
foo [97] = 0;
foo [98] = 0;
foo [99] = 0;
foo [100] = 0;
foo [101] = 0;
foo [102] = 0;
foo [103] = 0;
foo [104] = 0;
foo [105] = 0;
foo [106] = 0;
foo [107] = 0;
foo [108] = 0;
foo [109] = 0;
foo [110] = 0;
foo [111] = 0;
foo [112] = 0;
foo [113] = 0;
foo [114] = 0;
foo [115] = 0;
foo [116] = 0;
foo [117] = 0;
foo [118] = 0;
foo [119] = 0;
foo [120] = 0;
foo [121] = 0;
foo [122] = 0;
foo [123] = 0;
foo [124] = 0;
foo [125] = 0;
foo [126] = 0;
foo [127] = 0;
foo [128] = 0;
foo [129] = 0;
foo [130] = 0;
foo [131] = 0;
foo [132] = 0;
foo [133] = 0;
foo [134] = 0;
foo [135] = 0;
foo [136] = 0;
foo [137] = 0;
foo [138] = 0;
foo [139] = 0;
foo [140] = 0;
foo [141] = 0;
foo [142] = 0;
foo [143] = 0;
foo [144] = 0;
foo [145] = 0;
foo [146] = 0;
foo [147] = 0;
foo [148] = 0;
foo [149] = 0;
foo [150] = 0;
foo [151] = 0;
foo [152] = 0;
foo [153] = 0;
foo [154] = 0;
foo [155] = 0;
foo [156] = 0;
foo [157] = 0;
foo [158] = 0;
foo [159] = 0;
foo [160] = 0;
foo [161] = 0;
foo [162] = 0;
foo [163] = 0;
foo [164] = 0;
foo [165] = 0;
foo [166] = 0;
foo [167] = 0;
foo [168] = 0;
foo [169] = 0;
foo [170] = 0;
foo [171] = 0;
foo [172] = 0;
foo [173] = 0;
foo [174] = 0;
foo [175] = 0;
foo [176] = 0;
foo [177] = 0;
foo [178] = 0;
foo [179] = 0;
foo [180] = 0;
foo [181] = 0;
foo [182] = 0;
foo [183] = 0;
foo [184] = 0;
foo [185] = 0;
foo [186] = 0;
foo [187] = 0;
foo [188] = 0;
foo [189] = 0;
foo [190] = 0;
foo [191] = 0;
foo [192] = 0;
foo [193] = 0;
foo [194] = 0;
foo [195] = 0;
foo [196] = 0;
foo [197] = 0;
foo [198] = 0;
foo [199] = 0;
foo [200] = 0;
foo [201] = 0;
foo [202] = 0;
foo [203] = 0;
foo [204] = 0;
foo [205] = 0;
foo [206] = 0;
foo [207] = 0;
foo [208] = 0;
foo [209] = 0;
foo [210] = 0;
foo [211] = 0;
foo [212] = 0;
foo [213] = 0;
foo [214] = 0;
foo [215] = 0;
foo [216] = 0;
foo [217] = 0;
foo [218] = 0;
foo [219] = 0;
foo [220] = 0;
foo [221] = 0;
foo [222] = 0;
foo [223] = 0;
foo [224] = 0;
foo [225] = 0;
foo [226] = 0;
foo [227] = 0;
foo [228] = 0;
foo [229] = 0;
foo [230] = 0;
foo [231] = 0;
foo [232] = 0;
foo [233] = 0;
foo [234] = 0;
foo [235] = 0;
foo [236] = 0;
foo [237] = 0;
foo [238] = 0;
foo [239] = 0;
foo [240] = 0;
foo [241] = 0;
foo [242] = 0;
foo [243] = 0;
foo [244] = 0;
foo [245] = 0;
foo [246] = 0;
foo [247] = 0;
foo [248] = 0;
foo [249] = 0;
foo [250] = 0;
foo [251] = 0;
foo [252] = 0;
foo [253] = 0;
foo [254] = 0;
foo [255] = 0;
foo [256] = 0;
foo [257] = 0;
foo [258] = 0;
foo [259] = 0;
foo [260] = 0;
foo [261] = 0;
foo [262] = 0;
foo [263] = 0;
foo [264] = 0;
foo [265] = 0;
foo [266] = 0;
foo [267] = 0;
foo [268] = 0;
foo [269] = 0;
foo [270] = 0;
foo [271] = 0;
foo [272] = 0;
foo [273] = 0;
foo [274] = 0;
foo [275] = 0;
foo [276] = 0;
foo [277] = 0;
foo [278] = 0;
foo [279] = 0;
foo [280] = 0;
foo [281] = 0;
foo [282] = 0;
foo [283] = 0;
foo [284] = 0;
foo [285] = 0;
foo [286] = 0;
foo [287] = 0;
foo [288] = 0;
foo [289] = 0;
foo [290] = 0;
foo [291] = 0;
foo [292] = 0;
foo [293] = 0;
foo [294] = 0;
foo [295] = 0;
foo [296] = 0;
foo [297] = 0;
foo [298] = 0;
foo [299] = 0;
foo [300] = 0;
foo [301] = 0;
foo [302] = 0;
foo [303] = 0;
foo [304] = 0;
foo [305] = 0;
foo [306] = 0;
foo [307] = 0;
foo [308] = 0;
foo [309] = 0;
foo [310] = 0;
foo [311] = 0;
foo [312] = 0;
foo [313] = 0;
foo [314] = 0;
foo [315] = 0;
foo [316] = 0;
foo [317] = 0;
foo [318] = 0;
foo [319] = 0;
foo [320] = 0;
foo [321] = 0;
foo [322] = 0;
foo [323] = 0;
foo [324] = 0;
foo [325] = 0;
foo [326] = 0;
foo [327] = 0;
foo [328] = 0;
foo [329] = 0;
foo [330] = 0;
foo [331] = 0;
foo [332] = 0;
foo [333] = 0;
foo [334] = 0;
foo [335] = 0;
foo [336] = 0;
foo [337] = 0;
foo [338] = 0;
foo [339] = 0;
foo [340] = 0;
foo [341] = 0;
foo [342] = 0;
foo [343] = 0;
foo [344] = 0;
foo [345] = 0;
foo [346] = 0;
foo [347] = 0;
foo [348] = 0;
foo [349] = 0;
foo [350] = 0;
foo [351] = 0;
foo [352] = 0;
foo [353] = 0;
foo [354] = 0;
foo [355] = 0;
foo [356] = 0;
foo [357] = 0;
foo [358] = 0;
foo [359] = 0;
foo [360] = 0;
foo [361] = 0;
foo [362] = 0;
foo [363] = 0;
foo [364] = 0;
foo [365] = 0;
foo [366] = 0;
foo [367] = 0;
foo [368] = 0;
foo [369] = 0;
foo [370] = 0;
foo [371] = 0;
foo [372] = 0;
foo [373] = 0;
foo [374] = 0;
foo [375] = 0;
foo [376] = 0;
foo [377] = 0;
foo [378] = 0;
foo [379] = 0;
foo [380] = 0;
foo [381] = 0;
foo [382] = 0;
foo [383] = 0;

  stop = micros();
  Serial.print("unrolled loop:\t");
  Serial.println(stop-start);

Gives:

unrolled loop:	200

Diminishing returns to keep unrolling as the cost of the compare and jump isn't that high, relatively, as the number increases.

With 48 ST Z+, r1 instructions the time was 212 us.
With 24 it was 220.
With 8 it was 260
With 4 it was 308.

It may turn out that this isn't necessary anyway. Consider circular buffers.

Also http://xyproblem.info/

Yes, of course. But it's still an interesting question even if unnecessary in this instance.

I just want to point out that testing with this:

#include "memdebug.h"

void setup ()
  {
  Serial.begin (115200);
  Serial.println ();
  Serial.print (F("Free memory = "));
  Serial.println (getFreeMemory ());
  }  // end of setup

void loop () { }

I get:

Free memory = 1702

So with an array of 1536 bytes, you now have 166 free bytes, including for things like nested function calls. So this sketch better not have many other variables.

LOL :slight_smile:

48 in one loop

void memclr48(byte * addr, uint16_t size)
{
  uint64_t *p = (uint64_t *)addr;
  uint64_t *q = (uint64_t *)(addr + size);
  while( p < q ) 
  {
    *p++ = 0ULL;
    *p++ = 0ULL;
    *p++ = 0ULL;
    *p++ = 0ULL;
    *p++ = 0ULL;
    *p++ = 0ULL;
  }
}

also 212 but less typing :slight_smile:

So with an array of 1536 bytes, you now have 166 free bytes, including for things like nested function calls. So this sketch better not have many other variables.

that's true for an UNO, but for a MEGA etc it is an interesting exercise ...