Byte mirroring

b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;

This requires 3 long multiplications plus some 32-bit logical ops. On AVR I would stand either of my solutions up against this. The optimal solution is likely a lookup table which would do the job in something like 4 instructions.

b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;

This requires 3 long multiplications plus some 32-bit logical ops. On AVR I would stand either of my solutions up against this. The optimal solution is likely a lookup table which would do the job in something like 4 instructions.

You are probably right, optimal allways depends on context, that might be size or speed or something in between.

  • a lookuptable would take 256 bytes and addressing it is (beginaddress + index) so that is optimized for speed. For the Arduino 256 bytes is 13% of the RAM (or store it in prog_mem). Which version is optimized for size I don't know, I think the loop enrolled version is a good compromise of speed and size.

Once again thank you all for the help provided

I have found an easier way to resolve my problem by generating an rotated + mirrored font file

this is a great toll
http://www.min.at/prinz/software/pixelfont/

A small test for different methods, (IDE 21, Atmel 328, windows 7)

void setup()
{
  Serial.begin(115200);
}

void loop()
{
  unsigned long start = micros();
  for (int i=0; i<256; i++)
  {
    volatile uint8_t b = i;
    b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16; 
    b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16; 
  }
  Serial.println(micros() - start);
  while (1);
}

Size : 2758
Time: 6408 usec (512 flips) => 12.5 usec = 200 clockcycles / flip

void setup()
{
  Serial.begin(115200);
}

void loop()
{
  unsigned long start = micros();
  for (int i=0; i<256; i++)
  {
    volatile uint8_t b = i;
    b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
    b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
  }
  Serial.println(micros() - start);
  while (1);
}

Size : 6872
Time: 151804 usec (512 flips) => 296.5 usec per flip = 4744 clockcycles / flip

Although the formula is shorter the use of ULL datatype in combination with modulo makes this far bigger and slower

void setup()
{
  Serial.begin(115200);
}

// big lookup table
uint8_t x[256] = { 0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,8,136,72,200,40,168,104,232,24,152,88,216,56,184,120,248,4,132,68,196,36,164,100,228,20,148,84,212,52,180,116,244,12,140,76,204,44,172,108,236,28,156,92,220,60,188,124,252,2,130,66,194,34,162,98,226,18,146,82,210,50,178,114,242,10,138,74,202,42,170,106,234,26,154,90,218,58,186,122,250,6,134,70,198,38,166,102,230,22,150,86,214,54,182,118,246,14,142,78,206,46,174,110,238,30,158,94,222,62,190,126,254,1,129,65,193,33,161,97,225,17,145,81,209,49,177,113,241,9,137,73,201,41,169,105,233,25,153,89,217,57,185,121,249,5,133,69,197,37,165,101,229,21,149,85,213,53,181,117,245,13,141,77,205,45,173,109,237,29,157,93,221,61,189,125,253,3,131,67,195,35,163,99,227,19,147,83,211,51,179,115,243,11,139,75,203,43,171,107,235,27,155,91,219,59,187,123,251,7,135,71,199,39,167,103,231,23,151,87,215,55,183,119,247,15,143,79,207,47,175,111,239,31,159,95,223,63,191,127,255};


void loop()
{
  unsigned long start = micros();
  for (int i=0; i<256; i++)
  {
    volatile uint8_t b = i;
    b = x[b];
    b = x[b];
  }
  Serial.println(micros() - start);
  while (1);
}

Size : 2804
Time: 436 usec (512 flips) => 0.85 usec per flip = 13.6 clockcycles / flip

Really fast as expected, but not the smallest footprint so far

void setup()
{
  Serial.begin(115200);
}

// small lookup table - 4bit level iso 8 bit
// notice the pattern between the first 8 and the latter 8 in the table
uint8_t xx[16] = { 0,8,4,12,  2,10,6,14,  1,9,5,13,  3,11,7,15};

void loop()
{
  unsigned long start = micros();
  for (int i=0; i<256; i++)
  {
    volatile uint8_t b = i;
    b = xx[b & 0x0F] << 4 | xx[b>>4];
    b = xx[b & 0x0F] << 4 | xx[b>>4];
  }
  Serial.println(micros() - start);
  while (1);
}

Size : 2624
Time: 1468 usec (512 flips) => 2.86 usec per flip = 46 clockcycles / flip

Three times slower than the fastest, but still second fastest.
With 2624 the smallest in footprint so far.

void setup()
{
  Serial.begin(115200);
}

void loop()
{
  unsigned long start = micros();
  for (int j=0; j<256; j++)
  {
    volatile uint8_t value = j;
    volatile uint8_t ret = 0;
    for (int i = 0; i < 8; i++)
      if ((value & (uint8_t)(1 << i)) != 0) ret += (uint8_t)(1 << (7 - i));
    value = ret;
    for (int i = 0; i < 8; i++)
      if ((value & (uint8_t)(1 << i)) != 0) ret += (uint8_t)(1 << (7 - i));
  }
  Serial.println(micros() - start);
  while (1);
}

Size : 2676
Time: 14008 usec (512 flips) => 27.4 usec per flip = 438 clockcycles / flip

Good footprint, but relative slow.

void setup()
{
  Serial.begin(115200);
}

void loop()
{
  unsigned long start = micros();
  for (int j=0; j<256; j++)
  {
    volatile uint8_t in = j;
    uint8_t out = 0;
    if (in & 0x01) out |= 0x80;
    if (in & 0x02) out |= 0x40;
    if (in & 0x04) out |= 0x20;
    if (in & 0x08) out |= 0x10;
    if (in & 0x10) out |= 0x08;
    if (in & 0x20) out |= 0x04;
    if (in & 0x40) out |= 0x02;
    if (in & 0x80) out |= 0x01;
    in = out;
    out = 0;
    if (in & 0x01) out |= 0x80;
    if (in & 0x02) out |= 0x40;
    if (in & 0x04) out |= 0x20;
    if (in & 0x08) out |= 0x10;
    if (in & 0x10) out |= 0x08;
    if (in & 0x20) out |= 0x04;
    if (in & 0x40) out |= 0x02;
    if (in & 0x80) out |= 0x01;
  }
  Serial.println(micros() - start);
  while (1);
}

Size : 2596
Time: 996 usec (512 flips) => 1.95 usec per flip = 32 clockcycles / flip

Smallest footprint, second in performance

So the 256 byte lookup table is fastest but the enrolled loop is very fast too and has a 200byte smaller footprint.

All disclaimers apply :slight_smile:

You need a control in which you calculate the computational complexity of the loop overhead and then factor that out of the other measurements. Your figure of 13.6 clockcycles/flip for the table lookup includes the overhead of incrementing and comparing the loop variable, the conditional branch at the top and the branch back from the bottom. That's, what? 4 instructions -- comparable to the loop contents.

And the table lookup should be to progmem -- making it at least one instruction longer. The way you have it, it takes 256 bytes of ROM and another 256 bytes of RAM. It is probably the worst approach since RAM is so scarce.

Anyhow, nice work and thank you for backing up my claims!

You're welcome, was kinda fun

You are 100% right that these tests are only indicative as overhead is included, but that said, they are indicative enough :slight_smile:

You could try this
byte data = ???;
byte tmp = 0; // temporary variable to hold mirror of data
for ( int i =0; i <8; i++ ) bitWrite( tmp, 7-i, bitRead( data, i ) );

data = tmp;

Hello. I made a program for this problem, so you can turn right90-left90 and make horisontal and vertical mirroring.
Screenshot:
http://kepfeltoltes.hu/a/Screenshot_2014-08-22_16.24.09_www.kepfeltoltes.hu_.png
http://kecsotamas.atw.hu/ss.png

Links:
v1.1
http://kecsotamas.atw.hu/For8x8FontByVK.rar

If you want to mirror an arbritrary length of bytes, then the following approach may be an idea:

byte source = ...; // e.g. 1011
byte mirror = 0;

unsigned int mask = 1;
byte tmp = source;

for( int i= 0; i < length; i++){
mirror <<=1;
if( tmp & mask ){
mirror += 1;
}
tmp >>=1;
}

It's a bit of a draft, and so it may need some refinement, but the generla idea is that you use countershifting and testing of bit 0 to mirror the source

We had a large discussion about this a while ago.

From my test sketch I have pulled a few different versions out.

These are listed fastest first for IDE 1.5.7. versions 1.5.6 and below, including 1.0.x series will have different results because of the older compiler. ( CodingBadly's versions wins on these older platforms )

Of course robtillaarts lookup tables were the fastest, however I haven't included tese. There were many other great versions posted by others, but here is some of the fastest ( non lookup table versions ) from the collection.

Nilton61:

byte MirrorByte(byte x)
  {
    x = (((x&0xaa)>>1) + ((x&0x55)<<1));
    x = (((x&0xcc)>>2) + ((x& 0x33)<<2));
    return((x>>4) + (x<<4));
  }

Coding Badly:

byte MirrorByte( byte v )
  {
    uint8_t t1;
    uint8_t t2;
  
    asm volatile
    (
      "mov   %[t1], %[v]"                               "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      "rol   %[t1]"                                     "\n\t"
      "ror   %[t2]"                                     "\n\t"
      :
        // Outputs
        [t1] "=&r" ( t1 ),
        [t2] "=&r" ( t2 )
      :
        // Inputs
        [v] "r" ( v )
      :
        // Clobbers
    );
    return( t2 );
  }

Westfw

byte MirrorByte( byte arg )
  {
    byte result=0;
    if (arg & (1<<0)) result |= (0x80>>0);
    if (arg & (1<<1)) result |= (0x80>>1);
    if (arg & (1<<2)) result |= (0x80>>2);
    if (arg & (1<<3)) result |= (0x80>>3);
    if (arg & (1<<4)) result |= (0x80>>4);
    if (arg & (1<<5)) result |= (0x80>>5);
    if (arg & (1<<6)) result |= (0x80>>6);
    if (arg & (1<<7)) result |= (0x80>>7);
    return result;
  }

And my attempt:

byte MirrorByte( byte c )
  {  
    struct BitByte{
     int A : 1; int B : 1; int C : 1; int D : 1; 
     int E : 1; int F : 1; int G : 1; int H : 1; 
    };
    const BitByte *z = ( BitByte* ) &c;
    union{ BitByte g; char h; } i = {{ z->H, z->G, z->F, z->E, z->D, z->C, z->B, z->A }};
    return i.h;
  }

Definitely some interesting approaches to choose from.