If you don't want to use the assembly-coded library or the fixed point fft library, here is the floating point code I've been using for some time. It came out of "Numerical Recipes" and was implemented using WinAVR. This version is just a test example with a square wave input and the answer is correct to about 5 decimal places. Note the unusual Fortran-like indexing of the data array (begins with element 1), so to be safe allocate N+1 elements for data.
/*
REAL FFT tests.
atmega64 @ 11.059 MHz with 32k external sram
910 milliseconds for 2048 point floating point transform (100 ms/256 points)
// relocated heap to external SRAM via linker command
// -Wl,--defsym=__heap_start=0x801100,--defsym=__heap_end=0x807fff
// also must include floating point printf and lib:
// -Wl,-u,vfprintf -lprintf_flt -lm
*/
#include <avr/io.h>
#include <avr/interrupt.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <avr/pgmspace.h>
#define F_CPU 11059200UL
#include <util/delay.h>
#define SWAP(a,b) tempr=(a);(a)=(b);(b)=tempr
#define NP 256
///////////////////////////////////////////////////////
// Global Variables
float *data; //allocated at runtime in main
int ndata=NP; //size of transform buffer
// initialization sequence in .init3
void init_extSRAM(void) __attribute__((naked,section(".init3"))) ;
void init_extSRAM(void) {
//Enable external SRAM: set MCUCR, XMCRA and XMCRB for
// External SRAM page configuration: 0000h - 7FFFh
// Lower page wait state(s): None
// Upper page wait state(s): None
MCUCR=0x80;
XMCRA=0x00;
XMCRB &=~7 ; //Make bits 0-2 of XMCRB low to enable all of external SRAM
}
volatile unsigned short ticks=0; //0.01 second clock ticks updated in ISR
// delay for time_ms milliseconds by looping
// _delay_ms() has max delay about 13 ms @ 20 MHz
void delay_ms( unsigned short time_ms )
{
unsigned int i;
for ( i = 0; i < time_ms; i++ ) _delay_ms( 1 );
}
// global COM=0 or 1 specifies UART0 or UART1
int COM=1;
#include "uputget.c"
int uart_putchar(char c, FILE *stream) {
/*
* Send character c to the LCD for use with printf
*/
uputchar(c);
return 0;
}
//setup outpuf file for printf
FILE uart_str = FDEV_SETUP_STREAM(uart_putchar, NULL, _FDEV_SETUP_WRITE);
/*
* Timer0 overflow interrupt handler.
* provides global ticks of 10 ms
*/
ISR(TIMER0_OVF_vect)
{
TCNT0=256-108;
ticks++;
}
void timer_init(void) {
// set up Timer0 to provide 100 tick/sec
TCCR0 = (7<<CS00); //clock/1024
TIMSK |= (1<<TOIE0); // enable int on overflow
TCNT0=256-108; //exact divisor for 11.059 MHz
}
void get_data(void)
{
int j;
for (j=0;j<NP/2;j++) {
data[j]= -1.0;
data[j+NP/2]= 1.0;
}
}
void four1(int nn, int isign)
{
int n,mmax,m,j,istep,i;
/*double*/
float wtemp,wr,wpr,wpi,wi,theta;
float tempr,tempi;
n=nn << 1;
j=1;
for (i=1;i<n;i+=2) {
if (j > i) {
SWAP(data[j],data[i]);
SWAP(data[j+1],data[i+1]);
}
m=n >> 1;
while (m >= 2 && j > m) {
j -= m;
m >>= 1;
}
j += m;
}
mmax=2;
while (n > mmax) {
istep=mmax << 1;
theta=isign*(6.2831853/mmax);
wtemp=sin(0.5*theta);
wpr = -2.0*wtemp*wtemp;
wpi=sin(theta);
wr=1.0;
wi=0.0;
for (m=1;m<mmax;m+=2) {
for (i=m;i<=n;i+=istep) {
j=i+mmax;
tempr=wr*data[j]-wi*data[j+1];
tempi=wr*data[j+1]+wi*data[j];
data[j]=data[i]-tempr;
data[j+1]=data[i+1]-tempi;
data[i] += tempr;
data[i+1] += tempi;
}
wr=(wtemp=wr)*wpr-wi*wpi+wr;
wi=wi*wpr+wtemp*wpi+wi;
}
mmax=istep;
}
}
void realft(int n, int isign)
{
int i,i1,i2,i3,i4,np3;
float c1=0.5,c2,h1r,h1i,h2r,h2i;
/*double*/
float wr,wi,wpr,wpi,wtemp,theta;
theta=3.141592654/(float) (n>>1);
if (isign == 1) {
c2 = -0.5;
four1(n>>1,1);
} else {
c2=0.5;
theta = -theta;
}
wtemp=sin(0.5*theta);
wpr = -2.0*wtemp*wtemp;
wpi=sin(theta);
wr=1.0+wpr;
wi=wpi;
np3=n+3;
for (i=2;i<=(n>>2);i++) {
i4=1+(i3=np3-(i2=1+(i1=i+i-1)));
h1r=c1*(data[i1]+data[i3]);
h1i=c1*(data[i2]-data[i4]);
h2r = -c2*(data[i2]+data[i4]);
h2i=c2*(data[i1]-data[i3]);
data[i1]=h1r+wr*h2r-wi*h2i;
data[i2]=h1i+wr*h2i+wi*h2r;
data[i3]=h1r-wr*h2r+wi*h2i;
data[i4] = -h1i+wr*h2i+wi*h2r;
wr=(wtemp=wr)*wpr-wi*wpi+wr;
wi=wi*wpr+wtemp*wpi+wi;
}
if (isign == 1) {
data[1] = (h1r=data[1])+data[2];
data[2] = h1r-data[2];
} else {
data[1]=c1*((h1r=data[1])+data[2]);
data[2]=c1*(h1r-data[2]);
four1(n>>1,-1);
}
}
////////////////////////////////////////////////////////////////////////
int main ()
{
int i;
DDRD |= 1; //set PORTD.0 = output
PORTD &= ~1; //led on
delay_ms(200);
PORTD |= 1; //led off
uart_init();
stdout = &uart_str; // associate stream with stdout
timer_init();
sei();
printf("Real FFT 0.1\n\r"); // announce string
if ( (data = (float *)malloc( (ndata+1)*sizeof(float)) ) == NULL )
printf("Unable to allocate data array\r\n");
else
printf("Successfully allocated data array\r\n");
get_data();
for(i=0; i<32; i+=2)
printf( "%7.2f %7.2f \r\n",(double) data[i],(double) data[i+1]);
ticks=0;
realft(NP,1); //transform
i=ticks;
printf(" realfft in %dx10 ms \r\n",i);
printf(" %d data points \r\n",NP);
for(i=0; i<32; i+=2)
printf( "%7.2f %7.2f \r\n",(double) data[i],(double) data[i+1]);
while(1);
}