Help with optimizing code

sketch_sep17a:52: error: assignment of read-only parameter ‘b’
Why did you turn all functions const?
Is it optimized better if they are const?

newhobby: sketch_sep17a:52: error: assignment of read-only parameter 'b'

Hmm, ok, remove const in that function.

Why did you turn all functions const? Is it optimized better if they are const?

Yes.

nope. Not really much change.. a couple milliseconds.

newhobby: It's one of those small Nokia LCD screens.

One of those screens that don't have a part number?

It was used on nokia 6100 phones and has phillips driver pcf8833.

I don’t like the use of #define everywhere. It confuses things, and in the case of the code above, makes an “if” not work properly, eg.

static void ShiftBits(byte b)
{
  for (byte bit = 0; bit != 8; ++bit, b <<= 1)
  {
    CLK0;
    if(b & 0x80)
      SDA1;
    else
      SDA0;
    CLK1;
  }
}

Gives:

sketch_sep18c.ino: In function 'void ShiftBits(byte)':
sketch_sep18c:58: error: 'else' without a previous 'if'

This is because the #define also has a semicolon, so it translates to:

if(b & 0x80)
      sbi(PORTE, 0x08);;
    else

Fixing that up (and the const) and changing from millis timing to micros, I got this:

364944
415320

Now, revamping to use function calls instead of #define, like this:

void SDA0 ()
  {
  cbi(PORTE, 0x08);
  }
  
void SDA1 ()
  {
  sbi(PORTE, 0x08);
  }

void CLK0 ()
  {
  cbi(PORTG, 0x20);
  }

void CLK1 ()
  {
  sbi(PORTG, 0x20);
  }

...

static void SendData(const byte data)
{
  CLK0 ();
  SDA1 ();
  CLK1 ();
  ShiftBits(data);
}

static void SendCMD(const byte data)
{
  CLK0 ();
  SDA0 ();
  CLK1 ();
  ShiftBits(data);
}

static void ShiftBits(byte b)
{
  for (byte bit = 0; bit != 8; ++bit, b <<= 1)
  {
    CLK0 ();
    if(b & 0x80)
      SDA1 ();
    else
      SDA0 ();
    CLK1 ();
  }
}

Same code size. Same execution size. Less chance of bugs due to the semicolon in the define.

364944
415320

Remember, it’s an optimizing compiler. You don’t have to second-guess it by in-lining stuff and trying to help it. It doesn’t need your help. It’s smart. Of course it helps to use appropriate data types, but apart from that …

I found someone with a library here:

http://forum.arduino.cc/index.php?topic=95864.0

Linking to:

http://code.google.com/p/ardgrafix6100/

He used hardware SPI. I presume it worked:

void ArdGrafix6100_Philips::WriteCommand(uint8_t command)
{
    PORTB &= ~(_BV(PIN_CS_PORTB));      //Pull the Chip Select pin low to enable chip communication
    PORTB &= ~(_BV(PIN_MOSI_PORTB));    //THis is a command - so first D/C data bit of 9 bits must be LOW - pull MOSI low
      
    PORTB |= _BV(PIN_CLK_PORTB);        //Pulse the clock HIGH
    asm("nop"); asm("nop");         //Wait for 4 clock cycles by nop assembly instruction
    asm("nop"); asm("nop");
    PORTB &= ~(_BV(PIN_CLK_PORTB));     //THen pulse the clock back to LOW to complete transfer of first D/C bit
      
    SPI_Transfer(command);              //Transfer the actual command with hardware SPI
    PORTB |= _BV(PIN_CS_PORTB);         //Pull the Chip Select pin bach HIGH to disable chip communication
}
    
void ArdGrafix6100_Philips::WriteData(uint8_t data)
{
    PORTB &= ~(_BV(PIN_CS_PORTB));      //Pull the Chip Select pin low to enable chip communication
    PORTB |= _BV(PIN_MOSI_PORTB);       //THis is a data - so first D/C data bit of 9 bits must be HIGH - pull MOSI HIGH 

    PORTB |= _BV(PIN_CLK_PORTB);        //Pulse the clock HIGH
    asm("nop"); asm("nop");         //Wait for 4 clock cycles by nop assembly instruction
    asm("nop"); asm("nop");
    PORTB &= ~(_BV(PIN_CLK_PORTB));     //THen pulse the clock back to LOW to complete transfer of first D/C bit

    SPI_Transfer(data);                 //Transfer the actual data with hardware SPI 
    PORTB |= _BV(PIN_CS_PORTB);         //Pull the Chip Select pin bach HIGH to disable chip communication
}

Here’s an example using hardware SPI:

#include <SPI.h>

void CS0 ()
  {
  PORTB &= ~ bit (0);
  }  // end of CS0
  
void CS1 ()
  {
  PORTB |=  bit (0);
  }  // end of CS1
  

void CLK0 ()
  {
  PORTB &= ~ bit (1);
  }  // end of CLK0

void CLK1 ()
  {
  PORTB |=  bit (1);
  } // end of CLK1
  
void SDA0 ()
  {
  PORTB &= ~ bit (2);
  }  // end of SDA0
  
void SDA1 ()
  {
  PORTB |=  bit (2);
  } // end of SDA1

void setup()
{
  Serial.begin(115200);
  SPI.begin ();
  Serial.println ("Starting ...");
  unsigned long start = micros();
  for (byte i=0;i<132;i++)
    for (byte j=0;j<132;j++)
      SendColor12BitOptimized(0xff);
  Serial.println(micros()-start);
  start = micros();
  for (byte i=0;i<132;i++)
    for (byte j=0;j<132;j++)
      SendColor12Bit(0xff);
  Serial.println(micros()-start);
  
}  // end of setup

void loop()
{
}

void SendData(const byte data)
{
  // select chip
  CS0 ();
  // turn hardware SPI off to get at clock and data pins
  SPCR &= ~ bit (SPE);
  CLK1 ();
  SDA1 ();  // data mode
  CLK0 ();
  // turn hardware SPI back on
  SPCR |= bit (SPE);
  SPI.transfer (data);
  // deselect chip
  CS1 ();
}  // end of SendData

static void SendCMD(const byte data)
{
  // select chip
  CS0 ();
  // turn hardware SPI off to get at clock and data pins
  SPCR &= ~ bit (SPE);
  CLK1 ();
  SDA0 ();  // command mode
  CLK0 ();
  // turn hardware SPI back on
  SPCR |= bit (SPE);
  SPI.transfer (data);
  // deselect chip
  CS1 ();
}  // end of SendCMD

void SendColor12BitOptimized(const byte &color)
{
  byte RR = (color & 0xE0) >> 4;
  if (RR >= 8)
    ++RR;
  SendData(RR);

  RR = (color & 0x1C) << 3;
  if (RR >= 0x70)
    RR += 16;

  RR += (color & 0x03) * 5;
  SendData(RR);
}

void SendColor12Bit(byte color)
{
  int R=(color&0xE0)>>5;
  if (R>=4) R=((R*2)+1); else R=R<<1;
  int G=(color&0x1C)>>2;
  if (G>=4) G=((G*2)+1)<<4; else G=G<<5;
  int B=(color&0x03)*5;
  SendData(R);
  SendData(G+B);
}

Timings (microseconds):

202772
253112

Proof that it works:

Of course, you have to use the hardware SPI pins, not the ones you were using.

Oh, I had no doubts that it worked. Like I said, a while back I did try using hardware spi, but hadn’t seen much change, but I cut some traces on my board and did a couple air wires to get the hardware spi traces to make a comparison.
It did improve quite substantially :slight_smile:

newhobby: Thanks all for the help!!! Unfortunately, I should have done my homework before I asked the question :( I noticed after I edited the code that the bottleneck isn't really on the color calculation. Although it helped a bit, it didn't help really substantially.

I did a quick comparison and it took 420ms to display 131x131 pixels with the original code and 391ms with Rob's optmized using byte size.

132 x 132 bytes send makes 17424 (x2) = 34848 bytes send.

gaining ~2us per byte expanded wins about 34 millisec so that is quite in line with the measurements.

small change - add inline

inline void SendColor12BitOptimized(const byte &color)
{
  byte RR = (color & 0xE0) >> 4;
  if (RR >= 8)
    ++RR;
  SendData(RR);

  RR = (color & 0x1C) << 3;
  if (RR >= 0x70)
    RR += 16;

  RR += (color & 0x03) * 5;
  SendData(RR);
}

output changes too (my MEGA2560)

312
415

note: without inline I got 363/415

comparing with zero is faster and if that is done 8*34848 times (about quarter of a million times) ==> gain 20uS

  Serial.begin(115200);
  unsigned long start = millis();
  for (byte i=132; i>0; i--)
    for (byte j=132; j>0; j--)
      SendColor12BitOptimized(0xff);    
  Serial.println(millis() - start);

...

static void ShiftBits(byte b)
{
  for (byte bit = 8; bit >0; --bit, b <<= 1)
  {
    CLK0;
    if (b & 0x80)
    {
      SDA1;
    }
    else
    {
      SDA0;
    }
    CLK1;
  }
}

output:
293
399

SQUEEEZE!

static void ShiftBits(byte b)
{
  byte dat1 = PORTE | 0x08;
  byte dat0 = PORTE & ~0x08;
  byte clk1 = PORTG | 0x20;
  byte clk0 = PORTG & ~0x20;

  for (byte bit = 8; bit >0; --bit, b <<= 1)
  {
    PORTG = clk0;
    if (b & 0x80)
    {
      PORTE = dat1;
    }
    else
    {
      PORTE = dat0;
    }
    PORTG = clk1;
  }
}

output:
259
362

last droplet… :wink:

// merge shiftBits and sendData as sendData is called 34848 times
// note this optimization assumes there is no interrupt that sets PORTE or PORTG !!

static void SendData(byte data)
{
  byte dat1 = PORTE | 0x08;
  byte dat0 = PORTE & ~0x08;
  byte clk1 = PORTG | 0x20;
  byte clk0 = PORTG & ~0x20;

  PORTG = clk0;
  PORTE = dat1;
  PORTG = clk1;
  for (byte bit = 8; bit >0; --bit, data <<= 1)
  {
    PORTG = clk0;
    if (data & 0x80)
    {
      PORTE = dat1;
    }
    else
    {
      PORTE = dat0;
    }
    PORTG = clk1;
  }
  //  CLK0;
  //  SDA1;
  //  CLK1;
  //  ShiftBits(data);
}

output:
232
317

only optimization I see left needs a rewrite so all the color info in one uint16_t. Note that every second byte send has 4 zero bits. that means 4 unneeded tests.

would be something like snippet below

...
  // BYTE 1
  data <<= 4; // do 4 zero bytes
  for (byte bit = 4; bit >0; --bit)
  {
    PORTG = clk0;
    PORTE = dat0;
    PORTG = clk1;
  }
  // rest of byte 1
  for (byte bit = 4; bit >0; --bit, data <<= 1)
  {
    PORTG = clk0;
    if (data & 0x80)
    {
      PORTE = dat1;
    }
    else
    {
      PORTE = dat0;
    }
    PORTG = clk1;
  }
  ...
  // BYTE 2
  for (byte bit = 8; bit >0; --bit, data <<= 1)
  {
    PORTG = clk0;
    if (data & 0x80)
    {
      PORTE = dat1;
    }
    else
    {
      PORTE = dat0;
    }
    PORTG = clk1;
  }

Wow!! Thanks :) I'll try that one.

Too bad I don't have an Arduino with me and can't test right now. However, I think it's loop unrolling time :)

#define bitOut(cond) PORTG = clk0; PORTE = (cond ? dat1 : dat0); PORTG = clk1

static void SendData(byte data)
{
  byte dat1 = PORTE | 0x08;
  byte dat0 = PORTE & ~0x08;
  byte clk1 = PORTG | 0x20;
  byte clk0 = PORTG & ~0x20;

  bitOut(1);

  bitOut(data & 0x80);
  bitOut(data & 0x40);
  bitOut(data & 0x20);
  bitOut(data & 0x10);
  bitOut(data & 0x08);
  bitOut(data & 0x04);
  bitOut(data & 0x02);
  bitOut(data & 0x01);

  //  CLK0;
  //  SDA1;
  //  CLK1;
  //  ShiftBits(data);
}

@int2str loop unrolling gives definitely more code. A quick test gives very nice results:

172 274

The code I posted yesterday with hardware SPI gave (with millis rather than micros for a direct comparison):

202
253

Inlining the two colour functions (SendColor12BitOptimized, SendColor12Bit) as suggested by Rob:

150
151

Inlining SendData and SendCMD gives:

129
130

I don’t think you need to worry too much about unrolling the loop if you use hardware SPI.

Code to reproduce last test:

#include <SPI.h>

void CS0 ()
  {
  PORTB &= ~ bit (0);
  }  // end of CS0
  
void CS1 ()
  {
  PORTB |=  bit (0);
  }  // end of CS1
  

void CLK0 ()
  {
  PORTB &= ~ bit (1);
  }  // end of CLK0

void CLK1 ()
  {
  PORTB |=  bit (1);
  } // end of CLK1
  
void SDA0 ()
  {
  PORTB &= ~ bit (2);
  }  // end of SDA0
  
void SDA1 ()
  {
  PORTB |=  bit (2);
  } // end of SDA1

void setup()
{
  Serial.begin(115200);
  SPI.begin ();
  Serial.println ("Starting ...");
  unsigned long start = millis();
  for (byte i=0;i<132;i++)
    for (byte j=0;j<132;j++)
      SendColor12BitOptimized(0xff);
  Serial.println(millis()-start);
  start = millis();
  for (byte i=0;i<132;i++)
    for (byte j=0;j<132;j++)
      SendColor12Bit(0xff);
  Serial.println(millis()-start);
  
}  // end of setup

void loop()
{
}

inline void SendData(const byte data)
{
  // select chip
  CS0 ();
  // turn hardware SPI off to get at clock and data pins
  SPCR &= ~ bit (SPE);
  CLK1 ();
  SDA1 ();  // data mode
  CLK0 ();
  // turn hardware SPI back on
  SPCR |= bit (SPE);
  SPI.transfer (data);
  // deselect chip
  CS1 ();
}  // end of SendData

inline void SendCMD(const byte data)
{
  // select chip
  CS0 ();
  // turn hardware SPI off to get at clock and data pins
  SPCR &= ~ bit (SPE);
  CLK1 ();
  SDA0 ();  // command mode
  CLK0 ();
  // turn hardware SPI back on
  SPCR |= bit (SPE);
  SPI.transfer (data);
  // deselect chip
  CS1 ();
}  // end of SendCMD

inline void SendColor12BitOptimized(const byte &color)
{
  byte RR = (color & 0xE0) >> 4;
  if (RR >= 8)
    ++RR;
  SendData(RR);

  RR = (color & 0x1C) << 3;
  if (RR >= 0x70)
    RR += 16;

  RR += (color & 0x03) * 5;
  SendData(RR);
}

inline void SendColor12Bit(byte color)
{
  int R=(color&0xE0)>>5;
  if (R>=4) R=((R*2)+1); else R=R<<1;
  int G=(color&0x1C)>>2;
  if (G>=4) G=((G*2)+1)<<4; else G=G<<5;
  int B=(color&0x03)*5;
  SendData(R);
  SendData(G+B);
}

@int2str Definitely the winner :)