'memcpy' quicker or slower than using loop to copy variable ?

As one may understand, i was going from the point of view that memcpy would be quicker than using something like for(i = 0; i<nl; i++)  larr[i] = array[l+i]; but the results i was getting were showing the opposite. From the time i was programming the Z80, one of it’s most powerful command would be ‘block’ copying, which was quite a new feature at the time. So i was expecting that memcpy would just translate into a single ‘block-copy’ assembler command, now what could possibly be quicker than that ?
For completions sake i was comparing sorting algorithms, bubble-sort vs merge-sort and was wondering if the merge-sort code could be optimized since at 18 entries it was slower than bubble-sort (cross-over point seems to be near 30 entries)

#define NR_INTS 29
#define RANGE 1024

int array[NR_INTS];




void setup() {
  Serial.begin(9600);
  randomSeed(analogRead(A0));

  Serial.println("Merge Sort");
  FillRandom();
  uint32_t moment=micros();
  mergeSort(array, 0, NR_INTS-1); 
  moment=micros()-moment;
  Serial.println("Array after sorting :");
  DisplayArray();
  Serial.print("Sorting time : ");
  Serial.print(moment,DEC);
  Serial.println(" us");
  Serial.println("\nBubble Sort");
  FillRandom();
  moment=micros();
  BubbleSort(array, NR_INTS); 
  moment=micros()-moment;
  Serial.println("Array after sorting :");
  DisplayArray();
  Serial.print("Sorting time : ");
  Serial.print(moment,DEC);
  Serial.println(" us");
}

void loop() {
}

void FillRandom() {
  
  Serial.println("Array before sorting :");
  for (uint8_t i=0; i<NR_INTS; i++) {
    array[i]=random(0,RANGE);
  }
  DisplayArray();
}

void DisplayArray() {
  for (uint8_t i=0; i<NR_INTS; i++) {
    Serial.print(array[i],DEC);
    if (i<NR_INTS-1) Serial.print(", ");
  }
  Serial.println(".");
}

void swapping(int &a, int &b) {     //swap the content of a and b
   int temp;
   temp = a;
   a = b;
   b = temp;
}

void merge(int *array, int l, int m, int r) {
   int i, j, k, nl, nr;
   //size of left and right sub-arrays
   nl = m-l+1; 
   nr = r-m;
   int larr[nl], rarr[nr];
   //fill left and right sub-arrays
   for(i = 0; i<nl; i++)   larr[i] = array[l+i];
   //memcpy(larr, array+l, nl*2);
   for(j = 0; j<nr; j++)   rarr[j] = array[m+1+j];
   //memcpy(rarr, array+m+1, nr*2);
   i = 0; j = 0; k = l;
   //marge temp arrays to real array
   while(i < nl && j<nr) {
      if(larr[i] <= rarr[j]) {
         array[k] = larr[i];
         i++;
      }
      else{
         array[k] = rarr[j];
         j++;
      }
      k++;
   }
   while(i<nl) {       //extra element in left array
      array[k] = larr[i];
      i++; k++;
   }
   while(j<nr) {     //extra element in right array
      array[k] = rarr[j];
      j++; k++;
   }
}

void mergeSort(int *array, int l, int r) {
   int m;
   if(l < r) {
      int m = l+(r-l)/2;
      // Sort first and second arrays
      mergeSort(array, l, m);
      mergeSort(array, m+1, r);
      merge(array, l, m, r);
   }
}

void BubbleSort(int *array, uint8_t elements) {
  for (uint8_t c = 0; c < elements-1; c++) { // do a bubble sort
    for (uint8_t d = 0; d < elements - c - 1; d++) {
      if (array[d] > array[d + 1]) { // compare
         swapping(array[d],array[d+1]);    
      }
    }
  }
}

Sketch is provided for motivation of the question.

As usual, it depends. My company produced a C compiler for MSDOS back in the 1980’s. Virtually all of the standard library functions were written in assembler, tweaked to squeeze out the last bit of performance. I don’t know if that the case for the Arduino libraries, as it seems many are in C source.

As to the Bubble Sort time, it also depends. The Bubble Sort gets a lot of bad press with respect to sort times, but if you have a sorted list and are simply inserting a new entry into the list, the Bubble Sort may be the fastest sort you can use. Also, non-trivial sorting algorithms often can be coded multiple ways, and that may impact the results, too. For example, I made just a few minor changes in your code and shaved about 20% off the run time for the Merge sort, and I’m a long way from being the sharpest knife in the drawer.

#define NR_INTS 29
#define RANGE 1024

int array[NR_INTS];

void setup() {
  Serial.begin(115200);
  randomSeed(analogRead(A0));

  Serial.println("Merge Sort");
  FillRandom();
  uint32_t moment = micros();
  mergeSort(array, 0, NR_INTS - 1);
  moment = micros() - moment;
  Serial.println("Array after sorting :");
  DisplayArray();
  Serial.print("Sorting time : ");
  Serial.print(moment, DEC);
  Serial.println(" us");
  Serial.println("\nBubble Sort");
  FillRandom();
  moment = micros();
  BubbleSort(array, NR_INTS);
  moment = micros() - moment;
  Serial.println("Array after sorting :");
  DisplayArray();
  Serial.print("Sorting time : ");
  Serial.print(moment, DEC);
  Serial.println(" us");
}

void loop() {
}

void FillRandom() {

  Serial.println("Array before sorting :");
  for (uint8_t i = 0; i < NR_INTS; i++) {
    array[i] = random(0, RANGE);
  }
  DisplayArray();
}

void DisplayArray() {
  for (uint8_t i = 0; i < NR_INTS; i++) {
    Serial.print(array[i], DEC);
    if (i < NR_INTS - 1) Serial.print(", ");
  }
  Serial.println(".");
}

void swapping(int &a, int &b) {     //swap the content of a and b
  int temp;
  temp = a;
  a = b;
  b = temp;
}

void merge(int *array, int left, int mid, int right) {
  int tempArray[right - left + 1];
  int pos = 0, lpos = left, rpos = mid + 1;
  while (lpos <= mid && rpos <= right)
  {
    if (array[lpos] < array[rpos])
    {
      tempArray[pos++] = array[lpos++];
    }
    else
    {
      tempArray[pos++] = array[rpos++];
    }
  }
  while (lpos <= mid)  tempArray[pos++] = array[lpos++];
  while (rpos <= right)tempArray[pos++] = array[rpos++];
  int iter;
  /* Copy back the sorted array to the original array */
  for (iter = 0; iter < pos; iter++)
  {
    array[iter + left] = tempArray[iter];
  }
  return;
}

void mergeSort(int *array, int l, int r) {
  int m = (l + r) / 2;
  if (l < r) {
    // Sort first and second arrays
    mergeSort(array, l, m);
    mergeSort(array, m + 1, r);
    merge(array, l, m, r);
  }
}

void BubbleSort(int *array, uint8_t elements) {
  for (uint8_t c = 0; c < elements - 1; c++) { // do a bubble sort
    for (uint8_t d = 0; d < elements - c - 1; d++) {
      if (array[d] > array[d + 1]) { // compare
        swapping(array[d], array[d + 1]);
      }
    }
  }
}

Deva_Rishi:
...
So i was expecting that memcpy would just translate into a single 'block-copy' assembler command
...

I must have missed that instruction.

Yeah i was figuring the merge-sort was not the quickest programatically (this is just the one i found), mind you there is something to be said for just running the loop until there is no change in bubble sort (rather then doing a complete compare as it is now)
Actually i was considering doing a re-sort, from previously sorted values, but that would mean extra complications in keeping track where values are so they can be updated. Still back to the original question, could it be that it is just the multiplication in the amount of bytes (or even that there is a function for copying words ?) and one more thing, how come

memcpy(rarr, array+m+1, nr*2);

this ends up to being the correct source address ? shouldn't the 'm+1' be multiplied by 2 as well ? at least that's what i thought... probably both wrong :wink:

JimEli:
I must have missed that instruction.

of the top of my head, you set the HL & BC registers with source & target addresses and the do INIR ? or INDR ? one decreases, hmm there must have been a counter set as well, probably the E register, Ok, it was not quite a single instruction, but it was rather powerful (great for copying onto the screen !)

sorry LDIR & LDDR i guess.. something like that.

Deva_Rishi:
and one more thing, how come

memcpy(rarr, array+m+1, nr*2);

this ends up to being the correct source address ? shouldn't the 'm+1' be multiplied by 2 as well ? at least that's what i thought...

If I understand the question correctly, it's because the C / C++ language implicitly multiplies all scalars used in pointer arithmetic by the size of the object being pointed to, So:

ObjectType arrayObject[10]
ObjectType singleObject;
uint8_t k;
.
.
.
singleObject = *(arrayObject + k);

will always return the object at index k of the array regardless of ObjectType's size.

On which machine are you doing this?

The AVR (Arduino Uno) does not have any special block transfer instructions (such a thing is not very "RISC") Any memory copy has to go through 8bit registers, so a bunch of the usual "memcpy" optimizations (transferring more than one byte at a time) are also not applicable.

I'd expect the AVR memcpy() to be quite highly optimized, but with no available "tricks" to use, I'd also expect an in-line loop to compile to about the same code. Since memcpy() is a pre-defined library function, it will (probably?) incur the overhead of moving arguments to and from the ABI-defined registers, while the in-line loop can be further optimized to use any registers that are "convenient." This would make the loop "slightly faster" for your particular application, I think.

To do this level of analysis, you really need to look at the assembly/machine code that is produced...

gfvalvo:
If I understand the question correctly, it's because the C / C++ language implicitly multiplies all scalars used in pointer arithmetic by the size of the object being pointed to,

Ah, Clear i was suspecting something like that, though i would be OK about the pointer being 'just an address'

On which machine are you doing this?

I am using an UNO atm, but was planning to use a Pro-mini, but that is the same processor though..
When i did a test with a 6-byte struct that slowed things down considerably, in the end the sorting will be done in the background, but efficiency does increase the update rate. Now i am considering to actually just swap the location of an array of pointers, but of course that slows the retrieving and comparing of values (how much though) anyway there are many trade-offs and an update on the merge-sort code to look at. 20% ! that would make merge-sort quicker at 18 entries already. (maybe at 14 already)

Interesting thread.

The compiler optimization flags can probably be optimized for the many loops.
In my test the extra fast optimization -O3 fails, and memset() is slower.

I have added a delay of 500ms before the timing starts, to be sure that no serial interrupts are running.

For a SAMD21 (Arduino Zero, M0, MKR), I get these numbers:

Deva_Rishi        : 244, 185
with -O           : 227, 160
with -O2          : 225, 162
with -O3          : 347, 163
memcpy            : 282, 185
memcpy with -O    : 267, 157
memcpy with -O2   : 272, 161
memcpy with -O3   : 286, 158
econjack          : 237, 186
econjack with -O  : 204, 163
econjack with -O2 : 209, 160
econjack with -O3 : 313, 160

The first number is the Merge Sort, the second number is the Bubble Sort.
Using memset() is slower, except when -O3 is used, then the failing -O3 is somewhat compensated by the use of memset.
The -O and -O2 are of course better for speed than the default Arduino -Os for size.

For the compiler flags, I used the #pragma

#pragma GCC optimize("-O3")

These are the number of a Arduino Uno.
I was not expecting this, both -O and -O3 fail, only -O2 is good.
The memcpy() is once again slower.
The econjack version is better with a SAMD21, but that appears not to work very well with the 8-bit AVR Arduino Uno.

Deva_Rishi        :  868, 852
with -O           : 1084, 888
with -O2          :  768, 792
with -O3          :  928, 800
memcpy            :  908, 852
memcpy with -O    : 1184, 904
memcpy with -O2   :  884, 788
memcpy with -O3   :  928, 800
econjack          : 1016, 844
econjack with -O  : 1032, 900
econjack with -O2 :  892, 788
econjack with -O3 :  964, 812

Deva_Rishi:
Ah, Clear i was suspecting something like that, though i would be OK about the pointer being 'just an address'

It turns out that the reverse is also true. When you subtract 2 pointers (producing a scalar), the result is divided by the size of the object being pointed to. This makes perfect sense, but I never thought about it before:

void setup() {
  uint32_t dataArray[10];
  uint32_t startAddress, endAddress, offset;

  Serial.begin(115200);
  delay(1000);
  startAddress = (uint32_t) & (dataArray[1]);
  endAddress = (uint32_t) & (dataArray[5]);
  offset = &(dataArray[5]) - &(dataArray[1]);
  Serial.print("Address Difference = ");
  Serial.println(endAddress - startAddress);
  Serial.print("Offset = ");
  Serial.println(offset);
}

void loop() {
}

Serial Output:

Address Difference = 16
Offset = 4

memcpy() simple source code looks usually like this

void *
memcpy (void *dest, const void *src, size_t len)
{
  char *d = dest;
  const char *s = src;
  while (len--)
    *d++ = *s++;
  return dest;
}

so if you inline and do not return anything, use 8 bit for length because you know it’s enough for your array, or move more than 1 byte at a time if your architecture is optimized for this, then you’ll do better than generic code

I have added a delay of 500ms before the timing starts, to be sure that no serial interrupts are running.

Oh what point to make ! Cheers.

#if FORLOOP
// Using for loops.
  for (i = 0; i < nl; i++)   larr[i] = array[l + i];
     872:       80 e0           ldi     r24, 0x00       ; 0
     874:       90 e0           ldi     r25, 0x00       ; 0
     876:       82 17           cp      r24, r18
     878:       93 07           cpc     r25, r19
     87a:       54 f4           brge    .+20            ; 0x890
     87c:       d6 01           movw    r26, r12
     87e:       8d 90           ld      r8, X+
     880:       9d 90           ld      r9, X+
     882:       6d 01           movw    r12, r26
     884:       da 01           movw    r26, r20
     886:       8d 92           st      X+, r8
     888:       9d 92           st      X+, r9
     88a:       ad 01           movw    r20, r26
     88c:       01 96           adiw    r24, 0x01       ; 1
     88e:       f3 cf           rjmp    .-26            ; 0x876
     890:       c7 01           movw    r24, r14
     892:       88 0f           add     r24, r24
     894:       99 1f           adc     r25, r25
     896:       86 5e           subi    r24, 0xE6       ; 230
     898:       97 4d           sbci    r25, 0xD7       ; 215
     89a:       75 01           movw    r14, r10
     89c:       40 e0           ldi     r20, 0x00       ; 0
     89e:       50 e0           ldi     r21, 0x00       ; 0
  for (j = 0; j < nr; j++)   rarr[j] = array[m + 1 + j];
     8a0:       46 17           cp      r20, r22
     8a2:       57 07           cpc     r21, r23
     8a4:       5c f4           brge    .+22            ; 0x8bc
     8a6:       dc 01           movw    r26, r24
     8a8:       cd 90           ld      r12, X+
     8aa:       dd 90           ld      r13, X+
     8ac:       cd 01           movw    r24, r26
     8ae:       d7 01           movw    r26, r14
     8b0:       cd 92           st      X+, r12
     8b2:       dd 92           st      X+, r13
     8b4:       7d 01           movw    r14, r26
     8b6:       4f 5f           subi    r20, 0xFF       ; 255
     8b8:       5f 4f           sbci    r21, 0xFF       ; 255
     8ba:       f2 cf           rjmp    .-28            ; 0x8a0
     8bc:       80 e0           ldi     r24, 0x00       ; 0
     8be:       90 e0           ldi     r25, 0x00       ; 0
     8c0:       40 e0           ldi     r20, 0x00       ; 0
     8c2:       50 e0           ldi     r21, 0x00       ; 0

#else
// Using memcpy
  memcpy(larr, array + l, nl * 2);
     868:       c8 01           movw    r24, r16
     86a:       88 0f           add     r24, r24
     86c:       99 1f           adc     r25, r25
     86e:       9c 01           movw    r18, r24
     870:       26 5e           subi    r18, 0xE6       ; 230
     872:       37 4d           sbci    r19, 0xD7       ; 215
     874:       39 01           movw    r6, r18
     876:       b9 01           movw    r22, r18
     878:       c2 01           movw    r24, r4
     87a:       0e 94 67 08     call    0x10ce  ; 0x10ce <memcpy>
  memcpy(rarr, array + m + 1, nr * 2);
     87e:       b5 01           movw    r22, r10
     880:       66 0f           add     r22, r22
     882:       77 1f           adc     r23, r23
     884:       66 5e           subi    r22, 0xE6       ; 230
     886:       77 4d           sbci    r23, 0xD7       ; 215
     888:       a4 01           movw    r20, r8
     88a:       c1 01           movw    r24, r2
     88c:       0e 94 67 08     call    0x10ce  ; 0x10ce <memcpy>
     890:       f3 01           movw    r30, r6
#endif

000010ce <memcpy>:
    10ce:       fb 01           movw    r30, r22
    10d0:       dc 01           movw    r26, r24
    10d2:       02 c0           rjmp    .+4             ; 0x10d8 <memcpy+0xa>
    10d4:       01 90           ld      r0, Z+
    10d6:       0d 92           st      X+, r0
    10d8:       41 50           subi    r20, 0x01       ; 1
    10da:       50 40           sbci    r21, 0x00       ; 0
    10dc:       d8 f7           brcc    .-10            ; 0x10d4 <memcpy+0x6>
    10de:       08 95           ret