It depends on how you write your software loops. On an AVR, it takes as long to control the loop as to copy the memory. A common trick is to copy blocks of 16 bytes at a time, then copy any partial block. An ARM is a lot more clever. And different compilers and optimisation settings can benefit from caching. For something like a memory transfer it is only going to be nanoseconds.
For a UART at 9600 baud, a single byte is going to take 1040us however you do it. If you are doing something useful while the hardware peripheral is busy, you will hardly notice the load on the CPU.
If you sit in a busy-wait, you are tied up for the whole transmission.
Just to demonstrate the difference:
// DMA memory to memory ZERO
// ch 18 beat burst block
// xdk sam0/drivers/dma/dma.c
// packages/arduino/tools/CMSIS/4.0.0-atmel/Device/ATMEL/samd21/include/component/dmac.h
// Print an expression as "<expression text> 0x<value in hex>" on Serial.
// Wrapped in do { } while (0) so that both statements stay together when the
// macro is used as the body of an unbraced if/else or loop.
#define PRREG(x) do { Serial.print(#x" 0x"); Serial.println(x,HEX); } while (0)
// Size of each test buffer, in bytes.
#define BYTES 1024
// Size of each test buffer, in int-sized words. Parenthesised so the macro
// expands as a single value inside larger expressions (e.g. x / WORDS).
#define WORDS (BYTES/sizeof(int))
// Source and destination buffers for the copy benchmarks, 8-byte aligned.
char src[BYTES] __attribute__ ((aligned (8)));
char dst[BYTES] __attribute__ ((aligned (8)));
// Word-typed views of the same two buffers.
int *srcw = (int *) src, *dstw = (int *)dst;
// Print one benchmark result line on Serial:
//   "<rate> mbs <elapsed> us <label>"
//
// lbl  - label printed at the end of the line. Taken as const char* because
//        every caller passes a string literal, which must not be bound to a
//        mutable char*.
// us   - elapsed time in microseconds.
// bits - number of bits attributed to the measured operation.
//
// The rate is bits / us, i.e. bits per microsecond. The division is done in
// floating point, so us == 0 yields an infinite rate rather than a trap.
void prmbs(const char *lbl, unsigned long us, int bits) {
  const float mbs = (float)bits / us;
  Serial.print(mbs, 2); Serial.print(" mbs ");
  Serial.print(us); Serial.print(" us ");
  Serial.println(lbl);
}
// DMA 12 channels
// In-memory transfer descriptor that this sketch hands to the DMAC.
// The field comments describe how memcpy32() fills them in; the exact
// hardware meaning of each field should be confirmed against the SAMD21
// datasheet (DMAC chapter).
struct dmacdescriptor {
uint16_t btctrl; // control flags; memcpy32() sets DSTINC | SRCINC | VALID
uint16_t btcnt; // transfer count; memcpy32() stores n here (16-bit field)
uint32_t srcaddr; // memcpy32() stores src + n, i.e. the address just past the source
uint32_t dstaddr; // memcpy32() stores dst + n, i.e. the address just past the destination
uint32_t descaddr; // memcpy32() stores 0 (presumably "no further descriptor" - confirm)
} ;
// Area whose address dma_init() writes to DMAC->WRBADDR; wait_dma() polls
// wrb[0].btctrl, hence volatile.
volatile struct dmacdescriptor wrb[12] __attribute__ ((aligned (16)));
// Table whose address dma_init() writes to DMAC->BASEADDR; memcpy32() copies
// the descriptor it builds into entry 0.
struct dmacdescriptor descriptor_section[12] __attribute__ ((aligned (16)));
// Scratch descriptor that memcpy32() fills in before copying it into
// descriptor_section[0].
struct dmacdescriptor descriptor __attribute__ ((aligned (16)));
// DMAC interrupt handler: selects channel 0 and writes the transfer-complete
// mask to that channel's interrupt flag register to clear it.
// NOTE(review): the mask used is the CHINTENCLR one, written to CHINTFLAG;
// presumably the TCMPL bit sits at the same position in both - confirm
// against dmac.h.
void DMAC_Handler() {
  // interrupts DMAC_CHINTENCLR_TERR DMAC_CHINTENCLR_TCMPL DMAC_CHINTENCLR_SUSP
  // disable irqs ?
  const uint8_t channel = 0;
  DMAC->CHID.reg = DMAC_CHID_ID(channel);       // pick the channel first
  DMAC->CHINTFLAG.reg = DMAC_CHINTENCLR_TCMPL;  // clear
}
// One-time DMAC setup used by this sketch: sets the DMAC bits in the power
// manager masks, registers the descriptor and write-back tables, enables the
// controller and resets channel 0. Called once from setup().
void dma_init() {
// probably on by default
// Set the DMAC bits in the AHB and APBB masks (presumably the bus clock
// enables for the DMAC - confirm against the datasheet).
PM->AHBMASK.reg |= PM_AHBMASK_DMAC ;
PM->APBBMASK.reg |= PM_APBBMASK_DMAC ;
// The DMAC interrupt is left disabled here; completion is polled in wait_dma().
// NVIC_EnableIRQ( DMAC_IRQn ) ;
// wrb[0].descaddr = (uint32_t)&descriptor; // or not, memcpy below?
// Register the descriptor table and the write-back table with the DMAC.
DMAC->BASEADDR.reg = (uint32_t)descriptor_section;
DMAC->WRBADDR.reg = (uint32_t)wrb;
// Enable the controller with the LVLEN field set to 0xf
// (presumably all priority levels enabled - confirm).
DMAC->CTRL.reg = DMAC_CTRL_DMAENABLE | DMAC_CTRL_LVLEN(0xf);
// Select channel 0, clear its ENABLE bit, then request a software reset of it.
DMAC->CHID.reg = DMAC_CHID_ID(0); // channel 0
DMAC->CHCTRLA.reg &= ~DMAC_CHCTRLA_ENABLE;
DMAC->CHCTRLA.reg = DMAC_CHCTRLA_SWRST;
}
// Copy n bytes from src to dst using DMAC channel 0.
//
// For 1 <= n <= 65535 the transfer is only started here: the function returns
// right after the software trigger, so the caller must call wait_dma() before
// relying on the contents of dst.
//
// dst - destination buffer
// src - source buffer
// n   - number of bytes to copy (btctrl selects no beat size, so presumably
//       the default byte-sized beat applies - confirm against the datasheet)
//
// Edge cases:
//   n == 0      - returns without touching the DMAC.
//   n > 65535   - btcnt is a 16-bit field, so the count would be silently
//                 truncated while the end addresses below still used the full
//                 n, making the DMAC copy the wrong region. Such requests are
//                 copied synchronously by the CPU instead.
void memcpy32(void *dst, const void *src, size_t n) {
  if (n == 0) return;          // nothing to move; avoids a zero-count descriptor
  if (n > 0xFFFFu) {           // does not fit the 16-bit btcnt field
    memcpy(dst, src, n);
    return;
  }

  DMAC->CHID.reg = DMAC_CHID_ID(0); // channel 0
  // DMAC->CHINTENSET.reg = (DMAC_CHINTENSET_MASK & 2); // bits err0 complete1 suspend2

  // Build the descriptor. With SRCINC/DSTINC set, the addresses programmed are
  // the ones just past the end of each buffer (start + n).
  descriptor.descaddr = 0;
  descriptor.dstaddr = (uint32_t)dst + n;
  descriptor.srcaddr = (uint32_t)src + n;
  descriptor.btcnt = n;
  descriptor.btctrl = DMAC_BTCTRL_DSTINC | DMAC_BTCTRL_SRCINC | DMAC_BTCTRL_VALID;

  // Publish the descriptor in slot 0 of the table registered with BASEADDR.
  // sizeof(descriptor) keeps the copy size tied to the struct actually copied.
  memcpy(&descriptor_section[0], &descriptor, sizeof(descriptor));

  DMAC->CHCTRLA.reg |= DMAC_CHCTRLA_ENABLE;
  DMAC->SWTRIGCTRL.reg |= (1 << 0); // trigger channel 0
  // while (! (DMAC->CHINTFLAG.reg & DMAC_CHINTENCLR_TCMPL)); // spin wait
  // while (wrb[0].btctrl & DMAC_BTCTRL_VALID) ; // spin wait
}
// Busy-wait for as long as the VALID flag is set in the btctrl word of
// write-back entry 0.
// NOTE(review): this relies on the DMAC updating wrb[0] while a transfer is
// in flight; if VALID is never set there, the loop returns immediately -
// confirm against the datasheet's write-back description.
void wait_dma(void)
{
  while ((wrb[0].btctrl & DMAC_BTCTRL_VALID) != 0) {
    // spin wait
  }
}
// Burn CPU time: runs n rounds, each counting a local counter down from 100.
// The counter is volatile, so every decrement is a real read and write and
// the compiler cannot remove the empty loop.
void do_short_steps(uint32_t n)
{
  volatile uint32_t counter;
  for (uint32_t round = n; round != 0; --round) {
    counter = 100;
    while (counter--) {
      // empty on purpose: the volatile decrement is the work
    }
  }
}
// Burn CPU time with memory traffic: runs n rounds, each toggling the low 16
// bits of a static volatile word 58 times. Being volatile, every toggle is a
// real read-modify-write of that word.
void do_shovels(uint32_t n)
{
  static volatile uint32_t memory; //should force access to memory bus
  for (uint32_t round = n; round != 0; --round) {
    for (uint32_t shovel = 0; shovel < 58; ++shovel) {
      memory ^= 0xFFFF;
    }
  }
}
// Arduino entry point, run once: opens the serial port used for the result
// lines, then prepares the DMA controller.
void setup() {
  const unsigned long baud = 9600;
  Serial.begin(baud);
  dma_init();
}
// Arduino main loop: runs the timing experiments once, prints one result line
// per experiment through prmbs(), verifies the last DMA copy and then pauses
// for three seconds.
//
// Every experiment is timed with micros() and reported against BYTES * 8
// bits, including the ones that only burn CPU time, so that all lines share
// the same scale.
void loop() {
  int i, errs = 0;
  unsigned long t1;

  // Fill the source with a repeating 0..255 pattern and clear the destination.
  for (i = 0; i < BYTES; i++) src[i] = i;
  memset(dst, 0, BYTES);

  // CPU busy work only.
  t1 = micros();
  do_short_steps(10);
  t1 = micros() - t1;
  prmbs("long walk", t1, BYTES * 8);

  // DMA copy only: start it and wait.
  t1 = micros();
  memcpy32(dst, src, BYTES);
  wait_dma();
  t1 = micros() - t1;
  prmbs("chew gum", t1, BYTES * 8);

  // DMA copy started first, CPU busy work done before waiting for it.
  t1 = micros();
  memcpy32(dst, src, BYTES);
  do_short_steps(10);
  wait_dma();
  t1 = micros() - t1;
  prmbs("chew gum and walk", t1, BYTES * 8);

  // CPU work that repeatedly reads and writes a volatile word.
  t1 = micros();
  do_shovels(10);
  t1 = micros() - t1;
  prmbs("dig hole", t1, BYTES * 8);

  // DMA copy started first, volatile-memory work done before waiting for it.
  t1 = micros();
  memcpy32(dst, src, BYTES);
  do_shovels(10);
  wait_dma();
  t1 = micros() - t1;
  prmbs("chew gum and dig hole", t1, BYTES * 8);

  // Library memcpy for comparison.
  t1 = micros();
  memcpy(dst, src, BYTES);
  t1 = micros() - t1;
  prmbs("memcpy", t1, BYTES * 8);

  // Final DMA copy into a freshly cleared destination; this is the copy that
  // gets verified below.
  memset(dst, 0, BYTES);
  t1 = micros();
  memcpy32(dst, src, BYTES);
  wait_dma();
  t1 = micros() - t1;
  prmbs("dma", t1, BYTES * 8);

  // Compare as unsigned char: plain char may be signed, in which case the
  // bytes 128..255 would read back negative and never equal i % 256.
  for (i = 0; i < BYTES; i++) if ((unsigned char)dst[i] != i % 256) errs++;
  Serial.print("errs "); Serial.println(errs);
  delay(3000);
}
I have pulled out the wait for completion into a separate function, wait_dma(). You can see that the DMA operation has only added an extra 5us to the work done, i.e. you can either do something useful or waste time waiting for completion.
I have made no attempt to see what code is generated. It appears that do_shovels() and do_short_steps() don't make much difference to the DMA. It obviously finds cycles to access the memory bus.
David.