Ethernet2 (UDP) SPI transfers have a lot of dead time

what's it doing with the other 123 clock cycles (84MHz)?

The UDP write code isn't what It'd call "tight" (it calls SPI::transfer() N types instead of SPI::transfer(), for example (but I guess that's so it doesn't overwrite with "input")), but it doesn't look like it should be 123 clocks worth...

Udp.write(buf, size)
  Socket::bufferData(... buf, size)
    W5500Class::send_data_processing_offset(... buf, len)
       W5500Class::write(... buf, len)
         for (len)
           SPI.transfer(buf[i])
	      byte SPIClass::transfer(byte _pin, uint8_t _data, SPITransferMode _mode) {
		      uint32_t ch = BOARD_PIN_TO_SPI_CHANNEL(_pin);
		 819d0:	290a      	cmp	r1, #10
		      // SPI_CSR_DLYBCT(1) keeps CS enabled for 32 MCLK after a completed
		      // transfer. Some device needs that for working properly.
		      SPI_ConfigureNPCS(spi, ch, mode[ch] | SPI_CSR_SCBR(divider[ch]) | SPI_CSR_DLYBCT(1));
	      }

	      byte SPIClass::transfer(byte _pin, uint8_t _data, SPITransferMode _mode) {
		 819d2:	b430      	push	{r4, r5}
		      uint32_t ch = BOARD_PIN_TO_SPI_CHANNEL(_pin);
		 819d4:	d027      	beq.n	81a26 <SPIClass::transfer(unsigned char, unsigned char, SPITransferMode)+0x56>
		 819d6:	2904      	cmp	r1, #4
		 819d8:	d029      	beq.n	81a2e <SPIClass::transfer(unsigned char, unsigned char, SPITransferMode)+0x5e>
		 819da:	2934      	cmp	r1, #52	; 0x34
		 819dc:	bf14      	ite	ne
		 819de:	f44f 25e0 	movne.w	r5, #458752	; 0x70000
		 819e2:	f44f 2530 	moveq.w	r5, #720896	; 0xb0000
		 819e6:	bf14      	ite	ne
		 819e8:	2103      	movne	r1, #3
		 819ea:	2102      	moveq	r1, #2
		      // Reverse bit order
		      if (bitOrder[ch] == LSBFIRST)
		 819ec:	4401      	add	r1, r0
		 819ee:	7a0c      	ldrb	r4, [r1, #8]
		 819f0:	b91c      	cbnz	r4, 819fa <SPIClass::transfer(unsigned char, unsigned char, SPITransferMode)+0x2a>
	       */
	      __attribute__( ( always_inline ) ) static __INLINE uint32_t __RBIT(uint32_t value)
	      {
		uint32_t result;

		 __ASM volatile ("rbit %0, %1" : "=r" (result) : "r" (value) );
		 819f2:	fa92 f2a2 	rbit	r2, r2
	       */
	      __attribute__( ( always_inline ) ) static __INLINE uint32_t __REV(uint32_t value)
	      {
		uint32_t result;

		__ASM volatile ("rev %0, %1" : "=r" (result) : "r" (value) );
		 819f6:	ba12      	rev	r2, r2
			      _data = __REV(__RBIT(_data));
		 819f8:	b2d2      	uxtb	r2, r2
		      uint32_t d = _data | SPI_PCS(ch);
		      if (_mode == SPI_LAST)
		 819fa:	2b01      	cmp	r3, #1
	      byte SPIClass::transfer(byte _pin, uint8_t _data, SPITransferMode _mode) {
		      uint32_t ch = BOARD_PIN_TO_SPI_CHANNEL(_pin);
		      // Reverse bit order
		      if (bitOrder[ch] == LSBFIRST)
			      _data = __REV(__RBIT(_data));
		      uint32_t d = _data | SPI_PCS(ch);
		 819fc:	ea42 0205 	orr.w	r2, r2, r5
		 81a00:	6803      	ldr	r3, [r0, #0]
		      if (_mode == SPI_LAST)
			      d |= SPI_TDR_LASTXFER;
		 81a02:	bf08      	it	eq
		 81a04:	f042 7280 	orreq.w	r2, r2, #16777216	; 0x1000000

		      // SPI_Write(spi, _channel, _data);
		      while ((spi->SPI_SR & SPI_SR_TDRE) == 0)
		 81a08:	6919      	ldr	r1, [r3, #16]
		 81a0a:	0789      	lsls	r1, r1, #30
		 81a0c:	d5fc      	bpl.n	81a08 <SPIClass::transfer(unsigned char, unsigned char, SPITransferMode)+0x38>
			      ;
		      spi->SPI_TDR = d;
		 81a0e:	60da      	str	r2, [r3, #12]

		      // return SPI_Read(spi);
		      while ((spi->SPI_SR & SPI_SR_RDRF) == 0)
		 81a10:	691a      	ldr	r2, [r3, #16]
		 81a12:	07d2      	lsls	r2, r2, #31
		 81a14:	d5fc      	bpl.n	81a10 <SPIClass::transfer(unsigned char, unsigned char, SPITransferMode)+0x40>
			      ;
		      d = spi->SPI_RDR;
		 81a16:	6898      	ldr	r0, [r3, #8]
		      // Reverse bit order
		      if (bitOrder[ch] == LSBFIRST)
		 81a18:	b914      	cbnz	r4, 81a20 <SPIClass::transfer(unsigned char, unsigned char, SPITransferMode)+0x50>
	       */
	      __attribute__( ( always_inline ) ) static __INLINE uint32_t __RBIT(uint32_t value)
	      {
		uint32_t result;

		 __ASM volatile ("rbit %0, %1" : "=r" (result) : "r" (value) );
		 81a1a:	fa90 f0a0 	rbit	r0, r0
	       */
	      __attribute__( ( always_inline ) ) static __INLINE uint32_t __REV(uint32_t value)
	      {
		uint32_t result;

		__ASM volatile ("rev %0, %1" : "=r" (result) : "r" (value) );
		 81a1e:	ba00      	rev	r0, r0
			      d = __REV(__RBIT(d));
		      return d & 0xFF;
	      }
		 81a20:	b2c0      	uxtb	r0, r0
		 81a22:	bc30      	pop	{r4, r5}
		 81a24:	4770      	bx	lr

I tried the following snippet of code and it has the same deadtime:

SPI.beginTransaction(SPISettings(28000000, MSBFIRST, SPI_MODE0));
digitalWrite(FPGA_SPIpin, LOW);
for (int i = 0; i<EthTxBuf_Size; i++)
{
  EthRxBuf[i] = SPI.transfer (byte (EthTxBuf[i]));
}
digitalWrite(FPGA_SPIpin, HIGH);
SPI.endTransaction();

I've attached a scope plot showing the SPI clock (it doesn't resolve particularly well, but it is obvious), there's oodles of deadtime!

try editing SPI.cpp and changing the occurances of SPI_CSR_DLYBCT(1) to SPI_CSR_DLYBCT(0)

I believe the (1) will cause 32 clocks between transfers, and I believe that the comment about CS is wrong since CS is manipulated manually (SPI_CSR_CSAAT - although I'm not sure why that's missing from setClockDivider()

Doesn't seem to have made any difference, it should have knocked off 381ns (3/4 of a graticule division on the scope plot) but it hasn't.
The location of SPI.cpp I edited was:
C:\Users[USERNAME]\AppData\Local\Arduino15\packages\arduino\hardware\sam\1.6.9\libraries\SPI\src\SPI.cpp

The scope plot is set to trigger on the falling edge of the /CS pin, there's about 1.2us before the first SPI transaction, I wonder if all (most) of the delay is before the transaction rather than after?

It looks like this may be a suspect:

uint32_t ch = BOARD_PIN_TO_SPI_CHANNEL(_pin);

I have been unable to locate this function to see what it is doing.

I have been unable to locate [BOARD_PIN_TO_SPI_CHANNEL] to see what it is doing

It's a macro defined in variant.h:

#define BOARD_PIN_TO_SPI_CHANNEL(x) \
    (x==BOARD_SPI_SS0 ? 0 : \
    (x==BOARD_SPI_SS1 ? 1 : \
    (x==BOARD_SPI_SS2 ? 2 : 3)))

I can get faster using the multi-byte SPI transfer, but even that leaves almost a byte-time worth of gap between bytes, which is really strange. There's a byte of buffering (a full byte transmission time get get the next byte ready); it should be able to output back-to-back, I would think.

Current test program:

#include <SPI.h>

Spi *myspi;

void setup()
{
  pinMode(13, OUTPUT);
  SPI.begin();
  Serial.begin(115200);
  myspi = SPI.spi;
}

#define DATSIZE 128

byte data[DATSIZE];

void loop() {
  Serial.println("Begin SPI test");
  SPI.beginTransaction(SPISettings(28000000, MSBFIRST, SPI_MODE0));
  digitalWrite(13, LOW);
#if 1
  for (byte i = 0; i < DATSIZE; i++)
  {
    data[i] = i;
  }
  SPI.transfer(data, sizeof(data), SPI_CONTINUE);
#else
  for (byte i = 0; i < DATSIZE; i++)
  {
    SPI.transfer (i, SPI_CONTINUE);
  }
#endif
  digitalWrite(13, HIGH);
  delay(100);
  SPI.endTransaction();
  for (int j = 0; j < 4; j++) {
    Serial.println(myspi->SPI_CSR[j], HEX);
  }
  delay(5000);

}

Thanks for telling me where that macro was, I was struggling to find it!

I was considering using the buffer transfer but I couldn't figure out from the documentation if it's the pointer that should be passed such that it overwrites the buffer with incoming data, else how do you receive? (I have wider concerns than just the Ethernet SPI, I want to use it for transfer with another device too, full duplex style).
This thread:
http://forum.arduino.cc/index.php?topic=407288.0
Suggests the pointer is passed to the function.
Looking at line 218 of SPI.cpp

void SPIClass::transfer(byte _pin, void *_buf, size_t _count, SPITransferMode _mode) {
...skipping to line 222
uint8_t *buffer = (uint8_t *)_buf;
...skipping to line 253
// Save read byte
	if (reverse)
		r = __REV(__RBIT(r));
	*buffer = r;
	buffer++;

My Cpp isn't strong but it looks like this will do both sending and receiving. Bit of a shame it overwrites the transmit buffer to do it tho :frowning:
There's quite a bunch of commands in there, which I guess is accounting for the gap between bytes. I'm seeing a gap about a bit longer than a byte transmission time (about 1.4 times).
I'm also seeing what I guess is an interrupt occurring in the middle of the transmission (I've attached a screengrab of it), it's not a huge problem in the grand scheme of things but is annoying :slight_smile:
(ignore the ends of the waveform, I've got other code running in there too. The first 2 transfers are a transfer16, just for comparison)

If this works, that's an increase in speed of about 3x for a 100 byte SPI transfer, things are starting to look up :slight_smile:
Now I just need the Ethernet2 library to make use of it :frowning:

it looks like this will do both sending and receiving. Bit of a shame it overwrites the transmit buffer to do it tho

Yes, it does, and it is :frowning: It seems that it would be nice if the SPI library had more features.

I've been trying to get some "bare" SPI code to work, so I can figure out whether this is inherent slowness, or some hardware/configuration issue. But I'm having troubles getting it to work :frowning:

Ahh, there is goes... This code produces a nice tight loop, and I'd really expect it to keep that SPI clock going pretty much continuously. It doesn't :frowning: Therefore, something "interesting" is happening! (I had to modify SPI.h to make SPI.spi a publicly accessible value, but otherwise it has no modifications.)

#include <SPI.h>

Spi *myspi;

void setup()
{
  SPI.begin();
  Serial.begin(115200);
  myspi = SPI.spi;
}

void loop() {
  Serial.println("Begin bare metal SPI test");

  SPI.beginTransaction(SPISettings(28000000, MSBFIRST, SPI_MODE0));

  for (byte i = 0; i < 100; i++) {
    while ((myspi->SPI_SR & SPI_SR_TDRE) == 0)
      ; // spin
    myspi->SPI_TDR = SPI_PCS(3) | i;
  }
  SPI.endTransaction();

  Serial.println("Bare metal finished");
  delay(1000);
}

Hmmm, My C might not be good enough.
This line

 myspi->SPI_TDR = SPI_PCS(3) | i;

similar to:

spi->SPI_TDR = d | SPI_PCS(ch);

in SPI.cpp, how did you get 3? And why is SPI_PCS(3) bitwise Or'd with the data to be sent? I couldn't work out what SPI_PCS(3) means.

What does the creation of myspi give us? Is this the only way to gain access to SPI_SR etc..?

I just about managed to alter SPI.h to make it work.... (tho I had to run your demo, I couldn't get it to work within my code).

Not much speed difference than the library, it looks like it's the while loop waiting for the clear signal

in SPI.cpp, how did you get 3? And why is SPI_PCS(3) bitwise Or'd with the data to be sent? I couldn't work out what SPI_PCS(3) means.

The SAM3 SPI controller can drive multiple chip-select (SS) pins as part of the SPI transactions, and the Due SPI library allows 4 separate devices with separate configuration, one one "spi port." PCS is "peripheral Chip Select"
"3" is the default, backward-compatible with the AVR single-SS (I could have used BOARD_PIN_TO_SPI_CHANNEL(BOARD_SPI_DEFAULT_SS), from variant.h, which is what I did manually.)

The write to the SPI data register has data in the low bits and chipselect info in the high bits (apparently the upper bits get copied directly to pins, so they're active-low, which was what was causing my problems. the SPI_PCS macro fixes that.)

myspi is just a pointer to the register set having to do with that SPI port. By copying it from the library object, I avoided having to figure out whether there was more than one, or which one was used, and I could be sure that I could mix the library code with the bare metal code...

Not much speed difference than the library

To be honest, I don't know if we'll get any actual speedup at the UDP level. 8 bits every 1750ns (from your first message) is almost 5Mbps, which isn't bad (although the WizNet site says it should do 15Mbps.) At the moment this is a puzzle in SPI behavior (for me.)

changing the occurances of SPI_CSR_DLYBCT(1) to SPI_CSR_DLYBCT(0)

This does help significantly for the bare-metal SPI case...

With DLYBCT(1):
Du_SPI_1.jpg
With DLYBCT set to zero after starttransaction:
DUe_SPI_2.jpg

Ah hah! I get it! Since the SPI single-byte "transaction" routines are all "return the read value", they MUST wait for the complete transmission to happen, so you get no benefit from the potential overlap of the transmission with other code. We might as well be bit-banging :frowning:
The counted version of write() does a little better, but it's still not taking advantage of both the shift-register AND the TX buffer register.

This is a longshot, but you might try my optimized Ethernet library.

Hopefully if you just put it into Documents/Arduino/libraries it'll override Arduino's version. Pay attention to the messages about duplicate libraries and which one the Arduino IDE is really using.

This doesn't fix the slowness of Due's SPI library, but it does eliminate the redundant accesses to those Wiznet index registers. It also uses transactions at the socket level, rather than needlessly starting and stopping the SPI transaction over and over again at the W5500 read/write level.

I have been mostly testing with TCP, and there are still many unsolved mysteries of slowness. I'm waiting for delivery of a network tap this weekend before I continue work on this... so I can see what the Wiznet chip is really doing with the packets.

It seems there is a SPI_CSR_DLYBCT(1) hiding in SPI.h, I had only changed the ones in SPI.cpp, so that explains it a bit more :slight_smile:
Having made this change, buffer transfers have improved as well, 100 bytes is now about 48us. It's also knocked 50us off the UDP time (450 down to 400us)
I shall try the wiznet library (this was Ethernet2 from the library manager) and see if there's an improvment.
Currently, I can see it is doing byte by byte transfers for all of it, so there's massive gaps

I notice that the wiznet site says 15Mbps, but the datasheet says 80 for SPI and 15 for the Ethernet link, which makes sense given the wiznet library sets the SPI speed to 42MHz :slight_smile:

Paul, you posted while I was still writing this post :slight_smile:
I did try your library yesterday (I posted in this thread: How can I speed up the transfer rate (Ethernet Shield 2, Arduino Mega) - Networking, Protocols, and Devices - Arduino Forum)
I couldn't get the retry and timeout to work, I really need these as I need to 'give up early' and carry on, I'm trying to get my looptime down

I've tried the Wiznet library (at 28MHz) and it does seem to be faster, the UDP packet is down to about 270us. Clearly the wiznet library isn't using the faster buffer transfer :frowning:

For testing, I've got a 100 byte SPI buffer transfer (not to the W5500) followed by the UDP transfer, just so I can compare on the fly, here's the really odd thing, the wiznet library is causing a minor slowdown in the SPI time! A 100 byte transfer over SPI takes 48.6us with the Ethernet2 library in use and 52us when I use the Wiznet library. It's not a huge amount, but it is very noticeable when you are looking at 10us/div on the scope, it crosses the 5 div boundary! Looking at the waveform, it is a small increase in deadtime between the SPI bursts, very strange!

I've tidied up my test code so it can be posted here.
It's currently setup to use Pauls library (and wiznet by folder renaming), there are commented out includes at the start for changing to Ethernet2, along with lines 42-45 for the timeout and retry configuration.

The wiznet library seems the fastest of the 3, tho there is the oddity of it increasing other SPI deadtime. None of the libraries seem to be using "transfer (buffer, size);", this would boost the speed enormously without resorting to bare-metal code.

#include <SPI.h>
//#include <Ethernet2.h>
#include "Ethernet.h"
#include "w5100.h"
//#include <utility/w5500.h>
// Edit the SPI speed in C:\Users\[USERNAME]\Documents\Arduino\libraries\Ethernet2\src\utility\w5500.cpp
//  to 28000000, 28MHz, line 25: (copy/paste the following is easiest)
//  SPISettings wiznet_SPI_settings(28000000, MSBFIRST, SPI_MODE0);
//
// Edit occurances of SPI_CSR_DLYBCT(1) to SPI_CSR_DLYBCT(0) in
//  C:\Users\[USERNAME]\AppData\Local\Arduino15\packages\arduino\hardware\sam\1.6.9\libraries\SPI\src\SPI.cpp
// and C:\Users\[USERNAME]\AppData\Local\Arduino15\packages\arduino\hardware\sam\1.6.9\libraries\SPI\src\SPI.h
//

byte mac[] = { 0xDE, 0xAD, 0xBE, 0xEF, 0xFE, 0xED };
const IPAddress MyIP( 192, 168, 0, 10 );
const IPAddress Cab1_IP (192, 168, 0, 2); //this address exists
const IPAddress Cab2_IP (192, 168, 0, 1); //this doesn't, for testing retry and timeout
unsigned int local_Port = 12345;
unsigned int Cab1_Port = 12345;
unsigned int Cab2_Port = 12345;
EthernetUDP Udp;

const int EthTxBuf_Size = 100;
byte EthTxBuf[EthTxBuf_Size];
byte EthRxBuf[EthTxBuf_Size];
unsigned long current_micros;
const unsigned long looptime = 2000;
const int Testpin = 52;
const int Errorpin = 22;
const int Framepin = 32;
const int FPGA_SPIpin = 4;

void setup()
{
  Ethernet.begin(mac,MyIP);
  Udp.begin(local_Port);
  pinMode(Testpin, OUTPUT);
  pinMode(Errorpin, OUTPUT);
  pinMode(Framepin, OUTPUT);
  pinMode(FPGA_SPIpin, OUTPUT);
  //w5500.setRetransmissionCount(1);
  //w5500.setRetransmissionTime(1);
  W5100.setRetransmissionCount(uint8_t(1));
  W5100.setRetransmissionTime(uint16_t(1));
}

void loop()
{
  current_micros = micros();
  for (byte j=0; j<EthTxBuf_Size; j++)
  {
    EthTxBuf[j] = j;
  }
  digitalWrite(Framepin, HIGH);
  digitalWrite(Framepin, LOW);
  digitalWrite(Testpin, HIGH);

  Udp.beginPacket(Cab1_IP, Cab1_Port);
  Udp.write(EthTxBuf, EthTxBuf_Size);
  if (Udp.endPacket() == 0)
  {
    digitalWrite(Errorpin, HIGH);
    digitalWrite(Errorpin, LOW);
  }
  
  digitalWrite(Testpin, LOW);

  for (byte j=0; j<EthTxBuf_Size; j++)
  {
    EthTxBuf[j] = j;
  }

  SPI.beginTransaction(SPISettings(28000000, MSBFIRST, SPI_MODE0));
  digitalWrite(FPGA_SPIpin, LOW);
  SPI.transfer (&EthTxBuf, EthTxBuf_Size);
  digitalWrite(FPGA_SPIpin, HIGH);
  SPI.endTransaction();
  
  digitalWrite(Testpin, HIGH);
  Udp.beginPacket(Cab2_IP, Cab2_Port);
  Udp.write(EthTxBuf, EthTxBuf_Size);
  if (Udp.endPacket() == 0)
  {
    digitalWrite(Errorpin, HIGH);
    digitalWrite(Errorpin, LOW);
  }
  digitalWrite(Testpin, LOW);
  while ((micros()- current_micros)<looptime)
  {
    
  }
}

weird_dave:
None of the libraries seem to be using "transfer (buffer, size);", this would boost the speed enormously without resorting to bare-metal code.

I decided to try a quick sanity check for this theory. I ran this code on an Arduino Due:

#include <SPI.h>

void setup() {
  SPI.begin();
  pinMode(10, OUTPUT);
}

void loop() {
  uint8_t data[5] = {0x55, 0x5A, 0x49, 0xAA, 0x96};
  digitalWrite(10, LOW);
  SPI.beginTransaction(SPISettings(25000000, MSBFIRST, SPI_MODE0));
  SPI.transfer(data, 5);
  SPI.endTransaction();
  digitalWrite(10, HIGH);
  delay(100);
}

Here is the rather disappointing result:

Then again, those 50% dead times between bytes are a LOT better than the overhead of calling SPI.transfer(byte) five times.

Here's how bad that is:

For comparison, here is Arduino Uno running the five SPI.transfer(byte) sketch:

Even with only an 8 bit CPU running at one fifth the clock speed, Uno manages to transfer the 5 bytes at only 8 Mbit/sec in approximately the total time Due does at 21 Mbit/sec.

For completeness, here is how Uno performs with SPI.transfer(buf, 5):

Your results with the Due match what I'm seeing. The Uno results put the Due to shame really given the core speed. I don't own an Uno to play with unfortunately, so thanks for sharing that research.

Does your library buffer the transfers? My results suggest they don't, but I recall reading they did (or were supposed to). I suspect it's possible to get a 100 bytes transmission done well under 100us with a buffer transfer, that's the whole UDP SPI transfer, at 28MHz.
Also, could you confirm if the timeout and retry count work with your library? They didn't seem to work for me :frowning: