ISR sporadically failing to trigger

Hello,

I am trying to create a PID cooling system for a work project. The details of the hardware are as follows:

  • RocketScream Mini Ultra Pro v3 (Arduino Zero ATSAMD21G18A-AU)
  • Zero onboard RTC
  • Adafruit thermocouple amplifier module MAX31856
  • Two coil relays switched by D7 and D8 simultaneously (two solenoids for failure mitigation) through low-side transistor switches.

I am using the SAMD_timerinterrupt library for millisecond interrupt control and the AutoPID library for interval calculations.

The timing of solenoid opening and closing is controlled by two ISRs - first, an ISR attached to the onboard RTC fires every 10 seconds that the clock ticks, then resets that alarm 10 seconds into the future. The RTC ISR checks the current injection interval requested by the PID controller. If the interval is above a minimum value, the RTC ISR sets another ISR with the shutSolenoids() function attached on a hardware timer which triggers after the interval has elapsed. The ISR then runs shootSolenoids() if the ISR was successfully attached.

My problem arises during frequent cases when the post-interval ISR does not fire. Or, at least, none of the flags it should be changing are changed.

I have written a very undesirable "overshootManager" function which checks the state of the output pins after they are supposed to have closed. Semi-regularly, this overshoot manager will find that the digital pins are still HIGH when they should be LOW. This is not a problem with my coil relays sticking shut, as they are not connected to any load, and the digitalRead() confirms they are still internally assigned HIGH. The LED on the relay indicates that they are still receiving power.

#define TIMER_INTERRUPT_DEBUG         0
#define _TIMERINTERRUPT_LOGLEVEL_     0

#define USING_TIMER_TC3         false
#define USING_TIMER_TC4         false    
#define USING_TIMER_TC5         false
#define USING_TIMER_TCC         true    // Handles solenoid closing
#define USING_TIMER_TCC1        false     
#define USING_TIMER_TCC2        false

#include "SAMDTimerInterrupt.h"
#include <Wire.h>                   // for I2C communication
#include <RTCZero.h>                // for RTC
#include <Adafruit_MAX31856.h>      // for thermocouple amplifier
#include <AutoPID.h>                // PID lib

#define TC1_CS_PIN 3    // Gas temperature

#define PRIME_PIN 7     // Primary solenoid 
#define INTLK_PIN 8     // Interlock solenoid

#define OUTPUT_MIN -255
#define OUTPUT_MAX 0 
#define KP 8         
#define KI 0           
#define KD 0.3   

double temp1, setPoint, outputVal, interval;
int TStart = 20;        // Initial target degrees celsius
int dT = -1;            // Change in C per hour
int holdDelay = 0;      // Minutes to hold initial temperature

#define pwmPeriod 10             // injection interval; seconds
uint32_t start, now; 

SAMDTimer ITimer0(TIMER_TCC); 
AutoPID PID(&temp1, &setPoint, &outputVal, OUTPUT_MIN, OUTPUT_MAX, KP, KI, KD); 
Adafruit_MAX31856 maxthermo1 = Adafruit_MAX31856(TC1_CS_PIN); 
RTCZero rtc;

volatile bool flagOpen = false;
volatile bool flagClose = false;
volatile uint32_t lastInterval;
volatile uint32_t openTime;
volatile uint32_t closeTime;

void setup() {
  SerialUSB.begin(9600);
  delay(1000);
  SerialUSB.println("Connected");

  pinMode(PRIME_PIN, OUTPUT);
  pinMode(INTLK_PIN, OUTPUT);

  rtc.begin();

  maxthermo1.begin();
  maxthermo1.setThermocoupleType(MAX31856_TCTYPE_T);
  maxthermo1.setConversionMode(MAX31856_CONTINUOUS);

  PID.setTimeStep(500);
  
  now = rtc.getEpoch();
  start = now;

  rtc.setAlarmEpoch(now + (pwmPeriod - 1));
  rtc.enableAlarm(rtc.MATCH_SS);

  rtc.attachInterrupt(manageSolenoids);

}

void loop() {
  updateTemperature();
  now = rtc.getEpoch();  
  updateSetpoint();       // Update setpoint based on temp profile and current time
  PID.run();              // Update PID evaluation based on setpoint
  interval = pwmPeriod * 0.7 * outputVal/OUTPUT_MIN; // Max inject interval is 70% of cycle
  overshootManager();  // Check and correct failed solenoid close

  // Debugging ISR timing
  if (flagOpen) {
    SerialUSB.print("Inject processed at "); SerialUSB.print(openTime);
    SerialUSB.print(", For duration "); SerialUSB.println(lastInterval);
    flagOpen = false;
  }
  if (flagClose) {
    SerialUSB.print("Close processed at "); SerialUSB.print(closeTime);
    SerialUSB.print(", real duration "); SerialUSB.println(closeTime - openTime);
    flagClose = false;
  }
}

void updateSetpoint() {
  uint32_t elapsed = (((now - start) >= holdDelay*60) ? (now - (start + holdDelay*60)) : 0);
  double TTarget = TStart + ((double)elapsed * dT / 3600);
  setPoint = ((TTarget > -195) ? TTarget : -195); // Clamp setPoint to -195 if TTarget is lower
}

void updateTemperature() {
  if (maxthermo1.conversionComplete()) {
    temp1 = maxthermo1.readThermocoupleTemperature();
  } 
  // We might handle failure to read temperature here. Not a problem currently.
}

void manageSolenoids() {
  // Interrupt called by RTC alarm every PWM_PERIOD seconds. 
  // Sets new RTC interrupt for next evaluation period.
  uint32_t epoch = rtc.getEpoch();
  rtc.setAlarmEpoch(epoch + (pwmPeriod - 1));
  uint32_t intervalMS = interval*1000;

  // Open solenoids if inject interval >50ms (filter short injections)
  // and set SAMD hardware timer interrupt to close after inject interval milliseconds.
  if (intervalMS > 50) {
    if (ITimer0.attachInterruptInterval_MS(intervalMS, shutSolenoids)) {
      lastInterval = intervalMS;
      shootSolenoids();
    }
  }
}

void shootSolenoids() {
  digitalWrite(PRIME_PIN, HIGH);
  digitalWrite(INTLK_PIN, HIGH);
  flagOpen = true;
  openTime = millis();
}

void shutSolenoids() {
  digitalWrite(PRIME_PIN, LOW);
  digitalWrite(INTLK_PIN, LOW);
  ITimer0.disableTimer();
  flagClose = true;
  closeTime = millis();
}

void overshootManager() {
  // Weird problem with interrupt failing to close solenoids...
  uint32_t getNow = millis();
  if ((getNow - lastInterval - 10) > openTime) { // Output pins should have been shut by now
    if (digitalRead(PRIME_PIN) == HIGH) { // But in some cases, they're not
      SerialUSB.println("Warning - overshoot detected");
      shutSolenoids();
    }
  }
}

Here is a sample of the Serial output:

12:28:51.667 -> Inject processed at 86811, For duration 1254
12:28:52.890 -> Close processed at 88066, real duration 1255
12:29:01.569 -> Inject processed at 96773, For duration 1260
12:29:02.834 -> Warning - overshoot detected
12:29:02.834 -> Close processed at 98184, real duration 1411
12:29:11.561 -> Inject processed at 106736, For duration 1273
12:29:12.914 -> Close processed at 108010, real duration 1274

As you can see, the fourth line indicates the shutSolenoids function had not fired some time after it should have, so the overshootManager detected a failure to close. This is also shown by the discrepancy between the target duration of 1260ms but the real duration of 1411ms.

I'm not sure if I've created a race condition somehow, but the "Inject processed" message cannot appear unless the system confirms the ISR has been attached to ITimer0. Since shutSolenoids() is directly attached to this timer, I can't imagine why it would be failing to fire.

Why do you need interrupts?

The inject time sensitivity is very high, as the system is very responsive to temperature fluctuations. Liquid nitrogen is injected into a vacuum Dewar flask, and an excess injection of 50-100ms can induce huge overcooling. So, I need reliable timing in the dozens of milliseconds range to avoid overshooting.

I wrote another version of the code that relied on interval calculations. Unfortunately, there are additional tasks going on that may take a lot of time, which aren’t reflected in my code. Primarily, updating a SHARP memory display with metrics for the user. The screen isn’t critical to the problem being described so I removed that portion of code.

When using interval-based timing, I would see liquid nitrogen injections occurring 100-150ms longer than they should have. Hence, I want an interrupt so the system has to drop everything and shut the solenoids at the correct time.

Without an annotated schematic I will take a SWAG. Your rise/fall time of the interrupt could be to long or you have not gotten out of the ISR when it occurs again. As far as flow rate can you slow that down, that will give you more control. You would probably better writing non blocking code without interrupts.

Do not confuse the timing of "WHEN" you command the solenoid with when it becomes fully open and when it becomes fully closed.

Do not confuse the timing of "WHEN" you command the solenoid with when it becomes fully open and when it becomes fully closed.

I am not, this problem has nothing to do with solenoid response time. The solenoids aren't even connected. This is regarding an apparent failure of the system to execute a scheduled ISR, or a failure to schedule the ISR even when the system claims it has been scheduled.

Without an annotated schematic I will take a SWAG. Your rise/fall time of the interrupt could be to long or you have not gotten out of the ISR when it occurs again. As far as flow rate can you slow that down, that will give you more control. You would probably better writing non blocking code without interrupts.

The solenoid interrupt is guaranteed to only happen at most once per ten seconds, and at least 50ms after the RTC-attached ISR. I have specifically made the ISRs as small as possible so they do not take significant time. Unfortunately I cannot control flow rate as this hardware is meant to control already-deployed hardware that I cannot modify. My schematic really has nothing to do with it - this issue could be reproduced with a bare chip with nothing attached.

I would write non-blocking code, but the SHARP display library is the culprit for my long loop times, and it's not negotiable to get rid of it - nor am I going to start rewriting libraries.

OK, can we try that? That is to ask, can you write a small program that does the barest of bones treatment of the timing?

As I understand it

Every ten seconds, an RTC interrupt happens that shoots the solenoids (!) and also schedules another interrupt for some time in the future. When that time arrives, the solenoids are shut. If overshootManager() thinks it should, it shuts the solenoids.

Drop all the baggages of the PID choosing the interval, the thermocouples, everything and just turn on and off an LED, like on every ten seconds for 1250 milliseconds.

Your algorithm seems reasonable. I have flown about your code and it looks like plausible implementation. I did see that

  now = rtc.getEpoch();
  start = now;

start is always now which makes updateSetpoint() less than sensical. Should not be the current issue, just a peephole observation.

I'd strip it and test it here, but I have no board (or simuator) that would run the software, stripped or not.

FWIW everything looks plausible, I am thinking there may be a flaw or subtlety in the SAMDTimerInterrupt library.

Or, but your work so far seems to rule it out, there is something that crops up in your calculations that is not exposed in the debug printing you have placed in the loop().

I do also see you using your grandfather's baud rate, I don't see it as the problem, but you can crank that up to something 21st century like 115200 and save dozens of milliseconds that your debug printation is taking.

Only ideas, this hole thing is above my pay grade. Sounds like you are having fun even now.

a7

1 Like

I just noticed that the ten second intervals are 9962 and 9963 milliseconds. Any reason they should not be closer to if not exactly 10000?

Also I see there is never a detach corresponding to multiple attachments of the timer interrupt. This may not be a problem, but it does remind me that I wondered if that timer can be once attached, and then ignored, if necessary, and only reconfigured (and subsequently not ignored) when you shoot the solenoids.

a7

1 Like

Thanks for your responses. Your understanding is correct, let me try to answer your comments in order.

That is to ask, can you write a small program that does the barest of bones treatment of the timing?

I suppose what I meant to say when I claimed the schematic was irrelevant, is that the only external sensing device is the MAX31856 thermocouple amplifier. When I disable the updateTemperature() routine and simply force the temperature to be, say, 16.5, the issue still occurs.

start is always now which makes updateSetpoint () less than sensical.

now gets updated each iteration of loop(), the setpoint calculation and hold delay are confirmed working.

just noticed that the ten second intervals are 9962 and 9963 milliseconds. Any reason they should not be closer to if not exactly 10000?

The system clock should be accurate to a few ppm relative to the RTC, so I'm not sure. Is it possible that, between the start of manageSolenoids() and shootSolenoids(), where millis() is sampled, something is taking a long time to occur? In this time, the RTC is being sampled for the UNIX epoch, and the timer is being attached. I can't imagine this taking a whole 30-40ms, though?

Also I see there is never a detach corresponding to multiple attachments of the timer interrupt. This may not be a problem, but it does remind me that I wondered if that timer can be once attached, and then ignored, if necessary, and only reconfigured (and subsequently not ignored) when you shoot the solenoids.

Every example test I've run from the SAMD_timerinterrupt library has run for 10-20 minutes without missing a single interrupt, including an example that runs 6 different timers simultaneously with different durations for each that would cause periodic simultaneous triggering of multiple timers. Also, another that routinely changes the duration of the interrupts.

But, I am thinking there is some subtle issue with how I'm disabling and re-enabling the timer, as that approach isn't implemented in any of their examples. The interrupt is not being detached and reattached, the timer is being disabled and then initiated again in the RTC-fired ISR.

I may try, instead of disabling the timer, setting it to a long time (say, 11s into the future) and then again setting it to the inject interval upon RTC-ISR firing. The reason I'm not doing that currently is because the interrupt will repeatedly fire every interval unless it's turned off. However, the approach of setting the timer to a new interval time rather than disabling it is demonstrated working in their examples.

Unfortunately I left the device at work so I have to go back and get it before trying this.

No, that's good. Give it a rest. :expressionless:

I misread the code on the start == now point, sry.

I appreciate that you can test things by jamming in the temperature and so forth. I was thinking that a simpler program might turn up the issue and pin it squarely on not you.

I look forward to seeing what this is happening.

a7

Here's something that compiles and I believe is the least you need to demonstrate any problem with the idea you are exploiting.

# define TIMER_INTERRUPT_DEBUG         0
# define _TIMERINTERRUPT_LOGLEVEL_     0

# define USING_TIMER_TC3         false
# define USING_TIMER_TC4         false
# define USING_TIMER_TC5         false
# define USING_TIMER_TCC         true    // Handles solenoid closing
# define USING_TIMER_TCC1        false
# define USING_TIMER_TCC2        false

# include "SAMDTimerInterrupt.h"
# include <RTCZero.h>

# define PRIME_PIN 7     // Primary solenoid 
# define INTLK_PIN 8     // Interlock solenoid

double interval;

# define pwmPeriod 10             // injection interval; seconds
uint32_t start, now;

SAMDTimer ITimer0(TIMER_TCC);
RTCZero rtc;

volatile bool flagOpen = false;
volatile bool flagClose = false;
volatile uint32_t lastInterval;
volatile uint32_t openTime;
volatile uint32_t closeTime;

void setup()
{
  SerialUSB.begin(9600);   // make this 115200, see what diffs
  delay(1000);
  SerialUSB.println("Connected");

  pinMode(PRIME_PIN, OUTPUT);
  pinMode(INTLK_PIN, OUTPUT);

  rtc.begin();

  now = rtc.getEpoch();
  start = now;

  rtc.setAlarmEpoch(now + (pwmPeriod - 1));
  rtc.enableAlarm(rtc.MATCH_SS);

  rtc.attachInterrupt(manageSolenoids);
}

void loop()
{
  interval = 1.234;   // is this inseconds right?

  overshootManager();

  // Debugging ISR timing
  if (flagOpen) {
    SerialUSB.print("Inject processed at "); SerialUSB.print(openTime);
    SerialUSB.print(", For duration "); SerialUSB.println(lastInterval);
    flagOpen = false;
  }
  if (flagClose) {
    SerialUSB.print("Close processed at "); SerialUSB.print(closeTime);
    SerialUSB.print(", real duration "); SerialUSB.println(closeTime - openTime);
    flagClose = false;
  }
}

void manageSolenoids() {
  uint32_t epoch = rtc.getEpoch();
  rtc.setAlarmEpoch(epoch + (pwmPeriod - 1));
  uint32_t intervalMS = interval * 1000;

  if (intervalMS > 50) {
    if (ITimer0.attachInterruptInterval_MS(intervalMS, shutSolenoids)) {
      lastInterval = intervalMS;
      shootSolenoids();
    }
  }
}

void shootSolenoids() {
  digitalWrite(PRIME_PIN, HIGH);
  digitalWrite(INTLK_PIN, HIGH);
  flagOpen = true;
  openTime = millis();
}

void shutSolenoids() {
  digitalWrite(PRIME_PIN, LOW);
  digitalWrite(INTLK_PIN, LOW);
  ITimer0.disableTimer();
  flagClose = true;
  closeTime = millis();
}

// Weird problem with interrupt failing to close solenoids...
void overshootManager() {
  uint32_t getNow = millis();
  if ((getNow - lastInterval - 10) > openTime) { // Output pins should have been shut by now
    if (digitalRead(PRIME_PIN) == HIGH) { // But in some cases, they're not
      SerialUSB.println("Warning - overshoot detected");
      shutSolenoids();
    }
  }
}

I left the 9600 in there as the first thing to change if this reduced sketch also malfunctions.

I may have made some errors stripping this, but this should be a jump start. It is easier for me to read code without having to blind myself to distractions that though coded out are still in the source.

The overshootManager() looks like it panics when the shutdown is but 10 ms late. But the report shows the panic shutdown happening > 100 ms late. This seems odd, so maybe again I am misreading the code. You tell me.

HTH

a7

Bear in mind that touching the RTCZero inside an interrupt service routine is very likely a serious mistake. As far as I can tell the RTCZero code is not re-entrant.

This topic was automatically closed 180 days after the last reply. New replies are no longer allowed.