Go Down

Topic: ASSEMBLER HELP : Fast clear of an array (Read 9469 times) previous topic - next topic

mcnobby

Hello all

I have dabbled with assembler on arduino (uno etc) but actually its more luck than judgement, trial and error of adjusting other assembly code I have seen others do

What I need is a very simple piece of code that will clear an array that has 1536 elements in it

for instance

my array is
Code: [Select]
screen[..]

in C it would just be

Code: [Select]

for (uint16_t x; x<1536; x++) {
   screen[x]=0;
}


But this just isnt fast enough

Can anyone help PLEASE ??
http://www.youtube.com/user/Recovered
http://www.smartshow.lighting

econjack

I don't have access to my system, but does:

Code: [Select]

  memset(screen, 0, sizeof(screen));


work any faster? Often the mem*() functions are hand-tweaked assembler, but I don't know if that's true for the C++ compiler. They could be macros that won't help, but it's worth a try.

robtillaart

#2
Oct 27, 2014, 06:19 pm Last Edit: Oct 27, 2014, 06:54 pm by robtillaart
the proof is in the pudding test

Code: [Select]
uint32_t start;
uint32_t stop;

byte screen[1536];

void setup()
{
  Serial.begin(115200);
  Serial.println("Start ");

  start = micros();
  for (uint16_t x = 0; x<1536; x++)   // updated, see post el_supremo (Thanks)
  {
    screen[x]=0;
  }
  stop = micros();
  Serial.println(stop-start);

  start = micros();
  memset(screen, 0, sizeof(screen));
  stop = micros();
  Serial.println(stop-start);
}

void loop()
{
}


Start
704
596

so memset is 15.6% faster (tested on UNO)


Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

el_supremo

Does this
Code: [Select]
  for (uint16_t x; x<1536; x++)
implicitly set x to zero? If not, you aren't necessarily measuring the right thing.

Pete
Don't send me technical questions via Private Message.

robtillaart

@el_supremo (you are right - I'll fix that code - although the timing is right)


extended test with pointer math + including check if all is really set to zero.

Code: [Select]

uint32_t start;
uint32_t stop;

byte screen[1536];

void setup()
{
  Serial.begin(115200);
  Serial.println("Start ");

  start = micros();
  for (uint16_t x=0; x<1536; x++)
  {
    screen[x] = 0;
  }
  stop = micros();
  Serial.print("reference:\t");
  Serial.println(stop-start);
  for (uint16_t x=0; x<1536; x++)
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] = 1;
  }
  Serial.println();


  start = micros();
  memset(screen, 0, sizeof(screen));
  stop = micros();
  Serial.print("memset:\t\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++)
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] = 1;
  }
  Serial.println();


  start = micros();
  byte *r =  &screen[0] ;
  while(r < screen+1536) {
    *r = 0;
    r++;
  }
  stop = micros();
  Serial.print("08 bit ptr:\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++)
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] == 1;
  }
  Serial.println();


  start = micros();
  uint16_t *p = (uint16_t *) &screen[0] ;
  while(p < (uint16_t *)(screen+1536)) {
    *p = (uint16_t)0;
    p++;
  }
  stop = micros();
  Serial.print("16 bit ptr:\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++)
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] == 1;
  }
  Serial.println();

  start = micros();
  uint32_t *q = (uint32_t *) &screen[0] ;
  while(q < (uint32_t *)(screen+1536)) {
    *q = (uint32_t)0;
    q++;
  }
  stop = micros();
  Serial.print("32 bit ptr:\t");
  Serial.println(stop-start);

  for (uint16_t x=0; x<1536; x++)
  {
    if (screen[x] != 0) Serial.print(x);
    screen[x] = 1;
  }
  Serial.println();
}

void loop()
{
}


Start
reference:   708

memset:   588

08 bit ptr:   684

16 bit ptr:   448

32 bit ptr:   316


So optimized 32 bits pointer version is more than twice as fast the reference.
This is partly because the addresses of screen and screen+1536 are multiples of 4.

Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

robtillaart

32 bit ptr code wrapped into a function.
Code: [Select]

void memclr(byte * addr, uint16_t size)
{
  uint32_t *p = (uint32_t *)addr;
  while( p < (uint32_t *)(addr + size) ) *p++ = (uint32_t)0;
}


result:  392  (slower than the hard coded 32bit ptr code but at least reusable)

Note it does only handle multiples of 4!
Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

jboyton

The fastest version:

Code: [Select]
  uint32_t *q = (uint32_t *) &screen[0] ;
  while(q < (uint32_t *)(screen+1536)) {
    *q = (uint32_t)0;
    q++;
  }


generated this code:

Code: [Select]
ldi r30, 0x5C ; get pointer to screen[]
ldi r31, 0x01 ;
rjmp test      ; test to see if we're already done


loop: st Z+, r1 ; set 4 sequential bytes to R1 (=0)
st Z+, r1
st Z+, r1
st Z+, r1
test: ldi r24, 0x07 ; get high byte of last location in screen[]
cpi r30, 0x5C ; compare low byte to index Z low byte
cpc r31, r24 ; compare high byte to index Z
brcs loop ;


I think you could squeeze more if you were willing to hand code in assembly.

AWOL

"Pete, it's a fool (who) looks for logic in the chambers of the human heart." Ulysses Everett McGill.
Do not send technical questions via personal messaging - they will be ignored.
I speak for myself, not Arduino.

Coding Badly


Typically, counting down to zero is faster than counting up.

robtillaart

Typically, counting down to zero is faster than counting up.

Code: [Select]

void memclr(byte * addr, uint16_t size)
{
  uint32_t *p = (uint32_t *)addr;
  uint32_t *q = (uint32_t *)(addr + size);
  while( --q >= p ) *q = 0UL;
}


result ==> 364  (same order)
Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

robtillaart

#10
Oct 27, 2014, 09:14 pm Last Edit: Oct 27, 2014, 09:25 pm by robtillaart
Code: [Select]

 start = micros();
  uint64_t *t = (uint64_t *)screen;
  while(t < (uint64_t *)(screen+1536)) {
    *t++ = 0ULL;
  }
  stop = micros();
  Serial.print(F("64 bit+ptr:\t"));
  Serial.println(stop-start);

64 bit+ptr: 284      

// works because 1536 is a multiple of 64 (24x64)...
// in fact this is loop unrolling...


284/708 = 40% of the reference time


Rob Tillaart

Nederlandse sectie - http://arduino.cc/forum/index.php/board,77.0.html -
(Please do not PM for private consultancy)

nickgammon

What I need is a very simple piece of code that will clear an array that has 1536 elements in it
Why?
Please post technical questions on the forum, not by personal message. Thanks!

More info: http://www.gammon.com.au/electronics

jboyton

#12
Oct 27, 2014, 09:21 pm Last Edit: Oct 27, 2014, 09:22 pm by jboyton
This produced a time of 212 ± 4 us:

I couldn't line up more than 48 stores in a row while still being a multiple of 1536, maybe because of the relative jump at the end.

Code: [Select]
  start = micros();
 
  asm(
    "eor r1, r1 \n\t"
    "ldi r30, lo8(screen) \n\t"
    "ldi r31, hi8(screen) \n\t"
    "ldi r24, hi8(screen+1536) \n\t"
 "1:                           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"

    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
    "st Z+, r1           \n\t"
 
    "cpi r30, lo8(screen+1536) \n\t"
    "cpc r31, r24 \n\t"
    "brcs 1b      \n\t"
     ::: "r1", "r24", "r30", "r31"
  );

  stop = micros();

nickgammon

Loop unrolling, eh?

Code: [Select]


uint32_t *foo = (uint32_t *) &screen[0] ;

memset ((void *) screen, 1, sizeof (screen));

start = micros();

foo [0] = 0;
foo [1] = 0;
foo [2] = 0;
foo [3] = 0;
foo [4] = 0;
foo [5] = 0;
foo [6] = 0;
foo [7] = 0;
foo [8] = 0;
foo [9] = 0;
foo [10] = 0;
foo [11] = 0;
foo [12] = 0;
foo [13] = 0;
foo [14] = 0;
foo [15] = 0;
foo [16] = 0;
foo [17] = 0;
foo [18] = 0;
foo [19] = 0;
foo [20] = 0;
foo [21] = 0;
foo [22] = 0;
foo [23] = 0;
foo [24] = 0;
foo [25] = 0;
foo [26] = 0;
foo [27] = 0;
foo [28] = 0;
foo [29] = 0;
foo [30] = 0;
foo [31] = 0;
foo [32] = 0;
foo [33] = 0;
foo [34] = 0;
foo [35] = 0;
foo [36] = 0;
foo [37] = 0;
foo [38] = 0;
foo [39] = 0;
foo [40] = 0;
foo [41] = 0;
foo [42] = 0;
foo [43] = 0;
foo [44] = 0;
foo [45] = 0;
foo [46] = 0;
foo [47] = 0;
foo [48] = 0;
foo [49] = 0;
foo [50] = 0;
foo [51] = 0;
foo [52] = 0;
foo [53] = 0;
foo [54] = 0;
foo [55] = 0;
foo [56] = 0;
foo [57] = 0;
foo [58] = 0;
foo [59] = 0;
foo [60] = 0;
foo [61] = 0;
foo [62] = 0;
foo [63] = 0;
foo [64] = 0;
foo [65] = 0;
foo [66] = 0;
foo [67] = 0;
foo [68] = 0;
foo [69] = 0;
foo [70] = 0;
foo [71] = 0;
foo [72] = 0;
foo [73] = 0;
foo [74] = 0;
foo [75] = 0;
foo [76] = 0;
foo [77] = 0;
foo [78] = 0;
foo [79] = 0;
foo [80] = 0;
foo [81] = 0;
foo [82] = 0;
foo [83] = 0;
foo [84] = 0;
foo [85] = 0;
foo [86] = 0;
foo [87] = 0;
foo [88] = 0;
foo [89] = 0;
foo [90] = 0;
foo [91] = 0;
foo [92] = 0;
foo [93] = 0;
foo [94] = 0;
foo [95] = 0;
foo [96] = 0;
foo [97] = 0;
foo [98] = 0;
foo [99] = 0;
foo [100] = 0;
foo [101] = 0;
foo [102] = 0;
foo [103] = 0;
foo [104] = 0;
foo [105] = 0;
foo [106] = 0;
foo [107] = 0;
foo [108] = 0;
foo [109] = 0;
foo [110] = 0;
foo [111] = 0;
foo [112] = 0;
foo [113] = 0;
foo [114] = 0;
foo [115] = 0;
foo [116] = 0;
foo [117] = 0;
foo [118] = 0;
foo [119] = 0;
foo [120] = 0;
foo [121] = 0;
foo [122] = 0;
foo [123] = 0;
foo [124] = 0;
foo [125] = 0;
foo [126] = 0;
foo [127] = 0;
foo [128] = 0;
foo [129] = 0;
foo [130] = 0;
foo [131] = 0;
foo [132] = 0;
foo [133] = 0;
foo [134] = 0;
foo [135] = 0;
foo [136] = 0;
foo [137] = 0;
foo [138] = 0;
foo [139] = 0;
foo [140] = 0;
foo [141] = 0;
foo [142] = 0;
foo [143] = 0;
foo [144] = 0;
foo [145] = 0;
foo [146] = 0;
foo [147] = 0;
foo [148] = 0;
foo [149] = 0;
foo [150] = 0;
foo [151] = 0;
foo [152] = 0;
foo [153] = 0;
foo [154] = 0;
foo [155] = 0;
foo [156] = 0;
foo [157] = 0;
foo [158] = 0;
foo [159] = 0;
foo [160] = 0;
foo [161] = 0;
foo [162] = 0;
foo [163] = 0;
foo [164] = 0;
foo [165] = 0;
foo [166] = 0;
foo [167] = 0;
foo [168] = 0;
foo [169] = 0;
foo [170] = 0;
foo [171] = 0;
foo [172] = 0;
foo [173] = 0;
foo [174] = 0;
foo [175] = 0;
foo [176] = 0;
foo [177] = 0;
foo [178] = 0;
foo [179] = 0;
foo [180] = 0;
foo [181] = 0;
foo [182] = 0;
foo [183] = 0;
foo [184] = 0;
foo [185] = 0;
foo [186] = 0;
foo [187] = 0;
foo [188] = 0;
foo [189] = 0;
foo [190] = 0;
foo [191] = 0;
foo [192] = 0;
foo [193] = 0;
foo [194] = 0;
foo [195] = 0;
foo [196] = 0;
foo [197] = 0;
foo [198] = 0;
foo [199] = 0;
foo [200] = 0;
foo [201] = 0;
foo [202] = 0;
foo [203] = 0;
foo [204] = 0;
foo [205] = 0;
foo [206] = 0;
foo [207] = 0;
foo [208] = 0;
foo [209] = 0;
foo [210] = 0;
foo [211] = 0;
foo [212] = 0;
foo [213] = 0;
foo [214] = 0;
foo [215] = 0;
foo [216] = 0;
foo [217] = 0;
foo [218] = 0;
foo [219] = 0;
foo [220] = 0;
foo [221] = 0;
foo [222] = 0;
foo [223] = 0;
foo [224] = 0;
foo [225] = 0;
foo [226] = 0;
foo [227] = 0;
foo [228] = 0;
foo [229] = 0;
foo [230] = 0;
foo [231] = 0;
foo [232] = 0;
foo [233] = 0;
foo [234] = 0;
foo [235] = 0;
foo [236] = 0;
foo [237] = 0;
foo [238] = 0;
foo [239] = 0;
foo [240] = 0;
foo [241] = 0;
foo [242] = 0;
foo [243] = 0;
foo [244] = 0;
foo [245] = 0;
foo [246] = 0;
foo [247] = 0;
foo [248] = 0;
foo [249] = 0;
foo [250] = 0;
foo [251] = 0;
foo [252] = 0;
foo [253] = 0;
foo [254] = 0;
foo [255] = 0;
foo [256] = 0;
foo [257] = 0;
foo [258] = 0;
foo [259] = 0;
foo [260] = 0;
foo [261] = 0;
foo [262] = 0;
foo [263] = 0;
foo [264] = 0;
foo [265] = 0;
foo [266] = 0;
foo [267] = 0;
foo [268] = 0;
foo [269] = 0;
foo [270] = 0;
foo [271] = 0;
foo [272] = 0;
foo [273] = 0;
foo [274] = 0;
foo [275] = 0;
foo [276] = 0;
foo [277] = 0;
foo [278] = 0;
foo [279] = 0;
foo [280] = 0;
foo [281] = 0;
foo [282] = 0;
foo [283] = 0;
foo [284] = 0;
foo [285] = 0;
foo [286] = 0;
foo [287] = 0;
foo [288] = 0;
foo [289] = 0;
foo [290] = 0;
foo [291] = 0;
foo [292] = 0;
foo [293] = 0;
foo [294] = 0;
foo [295] = 0;
foo [296] = 0;
foo [297] = 0;
foo [298] = 0;
foo [299] = 0;
foo [300] = 0;
foo [301] = 0;
foo [302] = 0;
foo [303] = 0;
foo [304] = 0;
foo [305] = 0;
foo [306] = 0;
foo [307] = 0;
foo [308] = 0;
foo [309] = 0;
foo [310] = 0;
foo [311] = 0;
foo [312] = 0;
foo [313] = 0;
foo [314] = 0;
foo [315] = 0;
foo [316] = 0;
foo [317] = 0;
foo [318] = 0;
foo [319] = 0;
foo [320] = 0;
foo [321] = 0;
foo [322] = 0;
foo [323] = 0;
foo [324] = 0;
foo [325] = 0;
foo [326] = 0;
foo [327] = 0;
foo [328] = 0;
foo [329] = 0;
foo [330] = 0;
foo [331] = 0;
foo [332] = 0;
foo [333] = 0;
foo [334] = 0;
foo [335] = 0;
foo [336] = 0;
foo [337] = 0;
foo [338] = 0;
foo [339] = 0;
foo [340] = 0;
foo [341] = 0;
foo [342] = 0;
foo [343] = 0;
foo [344] = 0;
foo [345] = 0;
foo [346] = 0;
foo [347] = 0;
foo [348] = 0;
foo [349] = 0;
foo [350] = 0;
foo [351] = 0;
foo [352] = 0;
foo [353] = 0;
foo [354] = 0;
foo [355] = 0;
foo [356] = 0;
foo [357] = 0;
foo [358] = 0;
foo [359] = 0;
foo [360] = 0;
foo [361] = 0;
foo [362] = 0;
foo [363] = 0;
foo [364] = 0;
foo [365] = 0;
foo [366] = 0;
foo [367] = 0;
foo [368] = 0;
foo [369] = 0;
foo [370] = 0;
foo [371] = 0;
foo [372] = 0;
foo [373] = 0;
foo [374] = 0;
foo [375] = 0;
foo [376] = 0;
foo [377] = 0;
foo [378] = 0;
foo [379] = 0;
foo [380] = 0;
foo [381] = 0;
foo [382] = 0;
foo [383] = 0;

  stop = micros();
  Serial.print("unrolled loop:\t");
  Serial.println(stop-start);


Gives:

Code: [Select]

unrolled loop: 200
Please post technical questions on the forum, not by personal message. Thanks!

More info: http://www.gammon.com.au/electronics

jboyton

Diminishing returns to keep unrolling as the cost of the compare and jump isn't that high, relatively, as the number increases.

With 48 ST Z+, r1 instructions the time was 212 us.
With 24 it was 220.
With 8 it was 260
With 4 it was 308.

Go Up