faster digitalWrite - SOLVED

Hi everybody,
I have a question regarding Arduino code that controls the rotary speed of 26 stepper motors attached to an Arduino Mega 2560. I generate a different rotary frequency for each stepper using the TimerOne library.

Since the stepper motors rotate in audio frequencies, the conventional digitalWrite() function is not fast enough for that purpose. I have tried several options (like digitalWritefast(), digitalWrite2()), but until now all my attempts have failed.

Here is the code, which is an excerpt from a much longer sketch, so please don't get confused.
The crucial part is the three digitalWrite() calls in the togglePin() function.
How could I speed them up?

Any help is highly appreciated!

#include <TimerOne.h>
#define RESOLUTION 40               // Resolution of our timer 

const byte firstPin = 2;
const byte lastPin = 53;

//Setup pins (Even-odd pairs for step control and direction)
 void setup(){
    for (byte x = firstPin; x <= lastPin; x = x + 1) {
        pinMode(x, OUTPUT); 
        delay(2); }

 Timer1.initialize(RESOLUTION); // Set up a timer at the defined resolution
 Timer1.attachInterrupt(tick); // Attach the tick function
 Serial.begin(57600);
}

// Function called by the timer inturrupt at the specified resolution.
void tick()  {
 /* If there is a period set for a certain pin, count the number of
  ticks that pass, and toggle the pin if the current period is reached.  */
 byte i;
  for(i = firstPin ; i <= fddAmount; i=i+2) {
    if (period[i]>0){
        currentTick[i]++;
      if (currentTick[i] >= period[i]){
        togglePin(i,i+1);
        currentTick[i]=0;
      }
    }
  }
} 

// Performs one step on `pin`: reverses `direction_pin` when the tracked
// position reaches MAX_POSITION[pin] or zero, updates the position counter,
// then writes the stored level to the step pin and inverts the stored level.
void togglePin(byte pin, byte direction_pin) {
  // Reverse direction once either end has been reached.
  const bool atUpperEnd = currentPosition[pin] >= MAX_POSITION[pin];
  const bool atLowerEnd = !atUpperEnd && (currentPosition[pin] <= 0);
  if (atUpperEnd || atLowerEnd) {
    const byte newDirection = atUpperEnd ? HIGH : LOW;
    currentState[direction_pin] = newDirection;
    digitalWrite(direction_pin, newDirection);
  }

  // Track the position: direction HIGH counts down, anything else counts up.
  if (currentState[direction_pin] != HIGH) {
    currentPosition[pin]++;
  } else {
    currentPosition[pin]--;
  }

  // Pulse the control pin with the stored level, then invert that level.
  // NOTE(review): `~` is a bitwise complement, so how cleanly this toggles
  // depends on the element type of currentState (declared outside this
  // excerpt) - confirm.
  digitalWrite(pin, currentState[pin]);
  currentState[pin] = ~currentState[pin];
}

arduino_forum.ino (1.56 KB)

There is a digitalWriteFast library which is getting close to the speed of port manipulation without the complexity.

...R

Thank you for the hint. I already knew and tried this library, but can’t get it to work in my patch.
I would love to use digitalWriteFast, but when I try to use it I get the following error message during compilation:

call to 'NonConstantUsed' declared with attribute error:

I understand that the pins used by digitalWriteFast have to be either #define or constants. As you can see in the modified code below, I created an array that holds all possible digital pin numbers as const byte, but nevertheless it still does not work. This code gives me the above mentioned error message.

What am I doing wrong here? Any suggestions?

#include <digitalWriteFast.h>
#include <TimerOne.h>
#define RESOLUTION 40               // Resolution of our timer 

const byte firstPin = 2;
const byte lastPin = 53;

// Lookup table of pin numbers: pins[n] == n for n = 2..53; entries 0 and 1 are
// placeholders holding 0. Indexing it with a variable still yields a run-time
// value, even though every element is const.
const byte pins[] = {0,0,
                     2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
                     21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
                     41,42,43,44,45,46,47,48,49,50,51,52,53};

//Setup pins (Even-odd pairs for step control and direction)
 void setup(){
    for (byte x = firstPin; x <= lastPin; x = x + 1) {
        pinMode(x, OUTPUT); 
        delay(2); }

 Timer1.initialize(RESOLUTION); // Set up a timer at the defined resolution
 Timer1.attachInterrupt(tick); // Attach the tick function
 Serial.begin(57600);
}

// Function called by the timer inturrupt at the specified resolution.
void tick()  {
 /* If there is a period set for a certain pin, count the number of
  ticks that pass, and toggle the pin if the current period is reached.  */
 byte i;
  for(i = firstPin ; i <= fddAmount; i=i+2) {
    if (period[i]>0){
        currentTick[i]++;
      if (currentTick[i] >= period[i]){
        togglePin(i,i+1);
        currentTick[i]=0;
      }
    }
  }
} 

// Performs one step on `pin`: reverses `direction_pin` when the tracked
// position reaches MAX_POSITION[pin] or zero, updates the position counter,
// then writes the stored level to the step pin and inverts the stored level.
//
// NOTE: every digitalWriteFast() call below receives pins[...] indexed by a
// function parameter, i.e. a run-time value. digitalWriteFast requires the pin
// argument to be a compile-time constant, which is why this version fails to
// build with "call to 'NonConstantUsed' declared with attribute error".
void togglePin(byte pin, byte direction_pin) {
  
  //Switch directions if end has been reached
  if (currentPosition[pin] >= MAX_POSITION[pin]) {
    currentState[direction_pin] = HIGH;
    digitalWriteFast(pins[direction_pin],HIGH);
  } 
  else if (currentPosition[pin] <= 0) {
    currentState[direction_pin] = LOW;
    digitalWriteFast(pins[direction_pin],LOW);
  }
  
  //Update currentPosition
  if (currentState[direction_pin] == HIGH){
    currentPosition[pin]--;
  } 
  else {
    currentPosition[pin]++;
  }
  
  //Pulse the control pin
  // Both the pin (pins[pin]) and the value (currentState[pin]) are run-time values here.
  digitalWriteFast(pins[pin],currentState[pin]);
  currentState[pin] = ~currentState[pin];
}

DigitalWritefast only works with constant pin and constant value, but you’re looping through a bunch of pins, which means the PIN number is variable.

You could unroll the loop and write a whole bunch of digitalwritefast statements. After all, on a mega, you should have plenty of code space...

In order to engage the AVR gcc compiler kludge that generates bit set and bit clear instructions, the pin argument passed to digitalWriteFast() must be compile time constant.

In this call:

digitalWriteFast(pins[direction_pin],HIGH);

pins[direction_pin] is not a compile time constant. It is a runtime value that varies depending on the value of direction_pin.

--- bill

bperrybap:
In order to engage the AVR gcc compiler kludge that generates bit set and bit clear instructions, the pin argument passed to digitalWriteFast() must be compile time constant.

And the OP will be faced with the same constraint (or some complex work-around code) if s/he wants to use Port Manipulation.

It may be instructive for the OP to study the code in the digitalWriteFast library - it is not very long.

...R

Thanks for all the info. I understand now the logical error I made.

Now that I know what I cannot do, it would be great to solve the problem of how to loop through a bunch of pins using digitalWriteFast or something equivalent. Speed is really an issue here, since this code constantly checks 52 possible pins that sometimes play all at the same time.

Regarding the following idea: would it be possible to see a small code example or snippet that I could then use and refine myself? Since code space is really not an issue at the moment, I still have plenty... :slight_smile:

westfw:
You could unroll the loop and write a whole bunch of digitalwritefast statements. After all, on a mega, you should have plenty of code space...

In the meantime I had a look at:

Robin2:
It may be instructive for the OP to study the code in the digitalWriteFast library - it is not very long.

Mamma mia, that looks like rocket science to me. At least right now...but I'm on it!

You could create a separate function for each digitalWriteFast() to each pin and then your code could iterate through your pin array and call the appropriate function.

With a little more theoretical complexity you could create an array of function pointers to each of the digitalWriteFast functions which would allow for neater code. I have used function pointers but I don't recall the syntax so I won't attempt an example.

...R

Why go through all those gyrations just to use a library you don't need. Just learn how to do Direct Port Manipulation. It's not that hard and you'll never have to ask "how can I do it faster?". Because you can't.

Seriously, it doesn't get much simpler than this. Tested on an Uno, I don't have a Mega.

void directWrite(uint8_t pin, uint8_t value);

// Per-pin record used by directWrite(): which port register the pin lives in
// and the mask applied to that register.
struct PinInfo {
  volatile uint8_t *reg;  // port register that directWrite() reads and writes
  uint8_t mask;           // ANDed into *reg to write 0; its complement is ORed in otherwise
};

// Table indexed by the pin number passed to directWrite(): entries 0-7 use
// PORTD, 8-13 use PORTB and 14-19 use PORTC.
//
// NOTE(review): the mask literals are written with the hexadecimal prefix 0x,
// so each is a 32-bit value and the (uint8_t) cast keeps only its low byte
// (e.g. 0x11111110 becomes 0x10, 0x11011111 becomes 0x11). They therefore are
// not the "all ones except one bit" masks their digits suggest; binary
// literals (0b...) appear to be what was intended.
const PinInfo unoPins[] = {
  {&PORTD, (uint8_t) 0x11111110},
  {&PORTD, (uint8_t) 0x11111101},
  {&PORTD, (uint8_t) 0x11111011},
  {&PORTD, (uint8_t) 0x11110111},
  {&PORTD, (uint8_t) 0x11101111},
  {&PORTD, (uint8_t) 0x11011111},
  {&PORTD, (uint8_t) 0x10111111},
  {&PORTD, (uint8_t) 0x01111111},
  {&PORTB, (uint8_t) 0x11111110},
  {&PORTB, (uint8_t) 0x11111101},
  {&PORTB, (uint8_t) 0x11111011},
  {&PORTB, (uint8_t) 0x11110111},
  {&PORTB, (uint8_t) 0x11101111},
  {&PORTB, (uint8_t) 0x11011111},
  {&PORTC, (uint8_t) 0x11111110},
  {&PORTC, (uint8_t) 0x11111101},
  {&PORTC, (uint8_t) 0x11111011},
  {&PORTC, (uint8_t) 0x11110111},
  {&PORTC, (uint8_t) 0x11101111},
  {&PORTC, (uint8_t) 0x11011111},
};

void setup() {
  Serial.begin(115200);
  delay(1000);
  Serial.println("Starting");
  pinMode(13, OUTPUT);
}

void loop() {
  // Blink the built-in LED at ~10 Hz
  directWrite(13, 1);
  delay(50);
  directWrite(13, 0);
  delay(50);
}

// Writes `value` to `pin` through the unoPins table.
//   value == 0 : ANDs the pin's mask into its port register
//   otherwise  : ORs the complement of the mask into its port register
// `pin` is used directly as an index into unoPins; no range check is made.
void directWrite(uint8_t pin, uint8_t value) {
  volatile uint8_t *const port = unoPins[pin].reg;
  const uint8_t keepMask = unoPins[pin].mask;

  if (value != 0) {
    *port |= ~keepMask;
  } else {
    *port &= keepMask;
  }
}

gfvalvo:
Seriously, it doesn't get much simpler than this. Tested on an Uno, I don't have a Mega.

Very good.

It is always wise to stop over-thinking things.

...R

Thank you all very much. That is most helpful information and highly appreciated.
I decided to go with direct port manipulation, as this will be the fastest possible solution.
However, since I have never worked with direct port manipulation before, it is unfortunately not that simple for me yet.

Could anybody briefly explain that line of code to me?

gfvalvo:

  {&PORTD, (uint8_t) 0x11111110},

I assume that once I understand that line, I can improve my code using direct port manipulation.

Meanwhile I continue studying bitmath and the registers of the arduino mega.

Could anybody briefly explain that line of code to me?

Two constants, the first being the memory address of the port D output register, the second relating to a specific bit.

gfvalvo:
Seriously, it doesn't get much simpler than this. Tested on an Uno, I don't have a Mega.

That code uses what I call "indirect port I/O". It is much faster than the AVR digital I/O routines supplied by arduino.cc, such as digitalWrite() and digitalRead(), because it avoids the slow flash data table lookups, but it does not trigger the avr-gcc kludge that recognizes register bit manipulation and optimizes it into bit set (sbi) and bit clear (cbi) instructions.
This means that the code will use multiple instructions to set and clear bits which is not atomic and is interruptible.
This will cause register corruption if registers are being modified in ISR functions.

Here is what the loop() code actually generates:

 672:   85 b1           in      r24, 0x05       ; 5
 674:   8e 6e           ori     r24, 0xEE       ; 238
 676:   85 b9           out     0x05, r24       ; 5
}

void loop() {
  // Blink the built-in LED at ~10 Hz
  directWrite(13, 1);
  delay(50);
 678:   62 e3           ldi     r22, 0x32       ; 50
 67a:   70 e0           ldi     r23, 0x00       ; 0
 67c:   80 e0           ldi     r24, 0x00       ; 0
 67e:   90 e0           ldi     r25, 0x00       ; 0
 680:   0e 94 a2 00     call    0x144   ; 0x144 <delay>
  {&PORTC, (uint8_t) 0x11011111},
};

void directWrite(uint8_t pin, uint8_t value) {
  if (value == 0) {
    *(unoPins[pin].reg) &= unoPins[pin].mask;
 684:   85 b1           in      r24, 0x05       ; 5
 686:   81 71           andi    r24, 0x11       ; 17
 688:   85 b9           out     0x05, r24       ; 5
void loop() {
  // Blink the built-in LED at ~10 Hz
  directWrite(13, 1);
  delay(50);
  directWrite(13, 0);
  delay(50)

Note the in/or/out and in/and/out instruction sequences vs. single sbi and cbi instructions.
i.e. it is 3 instructions to alter a bit in the register vs just one.
This means that the sequence of instructions to update the register is not atomic and can be interrupted by an ISR.
If that ISR is messing with the same register, during the middle of this operation, the register will be corrupted by the foreground (non ISR code).
The Arduino supplied pin manipulation routines mask interrupts to ensure atomicity.

Also, my preference when using indirect port I/O is to build up the register pointer(s) and bitmask(s) at run time using the Arduino-supplied macros digitalPinToPort(), digitalPinToBitMask(), and portOutputRegister().
That way, the mapping tables will always match what is in the variant file so it works no matter which board is being used.
If that isn't used, then it takes quite a bit of time to accurately & correctly create the pin mapping data that matches the desired variant pin mapping data - like what is in the PinInfo table in this example.

IMO,
If speed is that critical, then it is time to throw the Arduino pin manipulation APIs out the window and use direct register manipulation.
When using direct register manipulation, there may be some further optimizations that can be done since you can read / write 8 bits a time.

But, as always, if there are ISRs involved then care must be taken to ensure atomicity.

--- bill

Direct port io with variable port/value isn’t all that trivial.
Gfvalvo’s constants need to be “0b0001000”, not “0x...”
Three instructions per pin change isn’t bad - sbi would be two cycles anyway, and some of the mega pins aren’t reachable with sbi anyway...

(Sorry, can’t do an unrolling example right now. I’m “afk”)

westfw:
Gfvalvo’s constants need to be “0b0001000”, not “0x...”

Doooh!!!!

Corrected:

void directWrite(uint8_t pin, uint8_t value);

// Per-pin record used by directWrite(): which port register the pin lives in
// and the mask applied to that register.
struct PinInfo {
  volatile uint8_t *reg;  // port register that directWrite() reads and writes
  uint8_t mask;           // ANDed into *reg to write 0; its complement is ORed in otherwise
};

// Table indexed by the pin number passed to directWrite(). Each mask is a
// binary literal with exactly one bit cleared: the bit that directWrite()
// clears (AND) or sets (OR of the complement) in `reg`.
const PinInfo unoPins[] = {
  // Entries 0-7: PORTD, bits 0-7
  {&PORTD, (uint8_t) 0b11111110},
  {&PORTD, (uint8_t) 0b11111101},
  {&PORTD, (uint8_t) 0b11111011},
  {&PORTD, (uint8_t) 0b11110111},
  {&PORTD, (uint8_t) 0b11101111},
  {&PORTD, (uint8_t) 0b11011111},
  {&PORTD, (uint8_t) 0b10111111},
  {&PORTD, (uint8_t) 0b01111111},
  // Entries 8-13: PORTB, bits 0-5
  {&PORTB, (uint8_t) 0b11111110},
  {&PORTB, (uint8_t) 0b11111101},
  {&PORTB, (uint8_t) 0b11111011},
  {&PORTB, (uint8_t) 0b11110111},
  {&PORTB, (uint8_t) 0b11101111},
  {&PORTB, (uint8_t) 0b11011111},
  // Entries 14-19: PORTC, bits 0-5
  {&PORTC, (uint8_t) 0b11111110},
  {&PORTC, (uint8_t) 0b11111101},
  {&PORTC, (uint8_t) 0b11111011},
  {&PORTC, (uint8_t) 0b11110111},
  {&PORTC, (uint8_t) 0b11101111},
  {&PORTC, (uint8_t) 0b11011111},
};

void setup() {
  Serial.begin(115200);
  delay(1000);
  Serial.println("Starting");
  pinMode(13, OUTPUT);
}

void loop() {
  // Blink the built-in LED at ~10 Hz
  directWrite(13, 1);
  delay(50);
  directWrite(13, 0);
  delay(50);
}

// Writes `value` to `pin` through the unoPins table.
//   value == 0 : ANDs the pin's mask into its port register (clears the bit)
//   otherwise  : ORs the complement of the mask into its port register (sets the bit)
// `pin` is used directly as an index into unoPins; no range check is made.
void directWrite(uint8_t pin, uint8_t value) {
  volatile uint8_t *const port = unoPins[pin].reg;
  const uint8_t keepMask = unoPins[pin].mask;

  if (value != 0) {
    *port |= ~keepMask;
  } else {
    *port &= keepMask;
  }
}

gfvalvo:
Doooh!!!!

Wow, I missed as well.
The corrected version does generate sbi/cbi instructions as the optimizer is able to look down into the data table at compile time and figure out that it is constant and of the proper form to cause the trigger.

So it is very nice.

MANY years ago I tried to get the Arduino.cc team to update the core library to handle digital I/O in a similar manner, using a const data table like this, and they didn't want to do it. There was a lengthy discussion on the developers list, but both Paul Stoffregen and I failed to convince them to make any changes to the AVR digital core I/O functions.
I think they simply did not really understand the benefits.
So today, the AVR code for Uno/Mega is slow, while the AVR code for Teensy AVR products is much faster.

Back then you had to declare the data table as static const and use a wrapper macro if you wanted to put the data into a table and get the compiler to trigger the AVR kludge to get cbi/sbi instructions.
The digitalWriteFast() code (which I absolutely abhor) took the approach of using a ternary.
The reason I hate it is it could have named the wrappers the same as the functions to get a totally transparent "it just works, but faster" implementation by simply including the header with no modifications to any of the other user sketch code.

Now it looks like the compiler is a bit smarter and can see constant data in a table, even when a function is used, since it inlines the function during compilation of the module so that it behaves like a macro, preserving the const knowledge through the function.

A full implementation, as would be needed in the Arduino IDE AVR core, isn't quite this simple, as it needs to efficiently address both const and non-const pins and data, but that can be handled and accounted for with some clever wrapper macros.

--- bill

So, out of idle curiosity, does the directWrite "function" in this code compile into fewer assembly instructions?

void set00() {
  PORTD |= 0b00000001;
}

void reset00() {
  PORTD &= 0b11111110;
}

void set01() {
  PORTD |= 0b00000010;
}

void reset01() {
  PORTD &= 0b11111101;
}

void set02() {
  PORTD |= 0b00000100;
}

void reset02() {
  PORTD &= 0b11111011;
}

void set03() {
  PORTD |= 0b00001000;
}

void reset03() {
  PORTD &= 0b11110111;
}

void set04() {
  PORTD |= 0b00010000;
}

void reset04() {
  PORTD &= 0b11101111;
}

void set05() {
  PORTD |= 0b00100000;
}

void reset05() {
  PORTD &= 0b11011111;
}

void set06() {
  PORTD |= 0b01000000;
}

void reset06() {
  PORTD &= 0b10111111;
}

void set07() {
  PORTD |= 0b10000000;
}

void reset07() {
  PORTD &= 0b01111111;
}

void set08() {
  PORTB |= 0b00000001;
}

void reset08() {
  PORTB &= 0b11111110;
}

void set09() {
  PORTB |= 0b00000010;
}

void reset09() {
  PORTB &= 0b11111101;
}

void set10() {
  PORTB |= 0b00000100;
}

void reset10() {
  PORTB &= 0b11111011;
}

void set11() {
  PORTB |= 0b00001000;
}

void reset11() {
  PORTB &= 0b11110111;
}

void set12() {
  PORTB |= 0b00010000;
}

void reset12() {
  PORTB &= 0b11101111;
}

void set13() {
  PORTB |= 0b00100000;
}

void reset13() {
  PORTB &= 0b11011111;
}

void set14() {
  PORTC |= 0b00000001;
}

void reset14() {
  PORTC &= 0b11111110;
}

void set15() {
  PORTC |= 0b00000010;
}

void reset15() {
  PORTC &= 0b11111101;
}

void set16() {
  PORTC |= 0b00000100;
}

void reset16() {
  PORTC &= 0b11111011;
}

void set17() {
  PORTC |= 0b00001000;
}

void reset17() {
  PORTC &= 0b11110111;
}

void set18() {
  PORTC |= 0b00010000;
}

void reset18() {
  PORTC &= 0b11101111;
}

void set19() {
  PORTC |= 0b00100000;
}

void reset19() {
  PORTC &= 0b11011111;
}

void (*functArray[][2])() {
  {reset00, set00},
  {reset01, set01},
  {reset02, set02},
  {reset03, set03},
  {reset04, set04},
  {reset05, set05},
  {reset06, set06},
  {reset07, set07},
  {reset08, set08},
  {reset09, set09},
  {reset10, set10},
  {reset11, set11},
  {reset12, set12},
  {reset13, set13},
  {reset14, set14},
  {reset15, set15},
  {reset16, set16},
  {reset17, set17},
  {reset18, set18},
  {reset19, set19}
};

// Calls the helper stored at functArray[pin][value] (column 0 = reset, column 1 = set).
// The expansion deliberately carries no trailing semicolon, so that
// `directWrite(p, v);` is exactly one statement and remains valid inside an
// un-braced if/else. The arguments are parenthesised so expressions can be
// passed safely.
#define directWrite(pin, value) functArray[(pin)][(value)]()

void setup() {
  Serial.begin(115200);
  delay(1000);
  Serial.println("Starting");
  pinMode(13, OUTPUT);
}


void loop() {
  // Blink the built-in LED at ~10 Hz
  directWrite(13, 1);
  delay(50);
  directWrite(13, 0);
  delay(50);
}

Yes on AVR processors. avrgcc takes advantage of the bit set and clear instructions, which work for the port registers and certain other low memory addresses.

gfvalvo:
So, out of idle curiosity, does the directWrite "function" in this code compile into fewer assembly instructions?

I should have asked my question more clearly. Between the code in Reply #15 and #17, which produces the more efficient assembly language?