use FHT library, free up RAM used by "fht_input" after getting "fht_log_out"

Is there any easy way to free up RAM used by "fht_input" after getting "fht_log_out"?

Thank you.

What are you babbling about? We expect to see links to libraries AND your code, so your question has some context.

OK, the problem is the amount of memory that this library needs. The buffers are global and stay allocated for the whole run time. Because of the assembly code they must be global ... I tried using a parameter or a local variable, but that does not work. This, together with adding an SD card module with the SdFat library, leaves very little free RAM.

Thank you.

Extract of FHT library:

/*
FHT for arduino - hartley transform
guest openmusiclabs.com 9.1.12
this is a speed optimized program
for calculating an N point FHT on a block of data
please read the read_me file for more info

modded 7.7.14 - fixed progmem for new avr-gcc (thanks to forum user kirill9617)
*/

...

#if (LOG_OUT == 1)
  uint8_t fht_log_out[(FHT_N/2)]; // FHT log output magintude buffer
#endif

...

int fht_input[(FHT_N)]; // FHT input data buffer

...

// Applies the window function to fht_input in place.
//
// Each pass of the loop reads one 16-bit coefficient from the _window_func
// table with lpm (Z pointer), reads one 16-bit sample from fht_input
// (Y pointer), multiplies the two and stores the 16-bit result back over the
// sample.
//
// The buffer address is loaded with "ldi ..., lo8(fht_input)/hi8(fht_input)",
// i.e. as the fixed address of the symbol fht_input itself.
// NOTE(review): this therefore only addresses the data when fht_input is the
// array declared by the library; if fht_input is turned into a pointer these
// ldi instructions load the address of the pointer variable instead.
static inline void fht_window(void) {
  // save registers that are getting clobbered
  // avr-gcc requires r2:r17,r28:r29, and r1 cleared
  asm volatile (
  "push r2 \n"
  "push r3 \n"
  "push r4 \n"
  "push r5 \n"
  "push r15 \n"
  "push r16 \n"
  "push r17 \n"
  "push r28 \n"
  "push r29 \n"
  );

  // this applies a window to the data for better frequency resolution
  asm volatile (
  "ldi r28, lo8(fht_input) \n" // set to beginning of data space
  "ldi r29, hi8(fht_input) \n"
  "ldi r30, lo8(_window_func) \n" // set to beginning of lookup table
  "ldi r31, hi8(_window_func) \n"
  "clr r15 \n" // prep null register
  // loop counter is the low byte of FHT_N; with FHT_N = 256 that byte is 0,
  // and the dec/brne pair below then wraps through 255 for 256 passes
  "ldi r20, "STRINGIFY(((FHT_N)&(0xff)))" \n"

  "1: \n"
  "lpm r22,z+ \n" // fetch window value (low byte, then high byte)
  "lpm r23,z+ \n"
  "ld r16,y \n" // fetch data (low byte, then high byte); Y is not advanced yet
  "ldd r17,y+1 \n"

  // multiply by window
  // the product is accumulated in r2..r5 from four 8x8 fractional multiplies
  // (high*high, low*low and the two cross terms); r15 (zero) is used to
  // propagate the carry between bytes
  "fmuls r17,r23 \n"
  "movw r4,r0 \n"
  "fmul r16,r22 \n"
  "adc r4,r15 \n"
  "movw r2,r0 \n"
  "fmulsu r17,r22 \n"
  "sbc r5,r15 \n"
  "add r3,r0 \n"
  "adc r4,r1 \n"
  "adc r5,r15 \n"
  "fmulsu r23,r16 \n"
  "sbc r5,r15 \n"
  "add r3,r0 \n"
  "adc r4,r1 \n"
  "adc r5,r15 \n"

  "st y+,r4 \n" // write r4:r5 of the product back over the sample, advancing Y
  "st y+,r5 \n"
  "dec r20 \n" // check if done
  "brne 1b \n"
  : :
  : "r0", "r1", "r2", "r3", "r4", "r5", "r15", "r16", "r17", "r20", "r30", "r31",
   "r22", "r23", "r28", "r29"
  );

  // get the clobbers off the stack (reverse order of the pushes above)
  asm volatile (
  "pop r29 \n"
  "pop r28 \n"
  "pop r17 \n"
  "pop r16 \n"
  "pop r15 \n"
  "pop r5 \n"
  "pop r4 \n"
  "pop r3 \n"
  "pop r2 \n"
  "clr r1 \n" // reset c compiler null register (the multiplies overwrote r1)
  );
}

...

#endif // end include guard

well the best way then is to merge the library code in your own specific project and not use it as a library - or modify the library.

When you bring the library into your code - you can do what you want to it. for example change

#if (LOG_OUT == 1)
  uint8_t fht_log_out[(FHT_N/2)]; // FHT log output magintude buffer
#endif

into

#if (LOG_OUT == 1)
  uint8_t*  fht_log_out = (uint8_t  *)  0;; // FHT log output magintude buffer
#endif

and add in your code before you call the FHT

#if (LOG_OUT == 1)
fht_log_out = (uint8_t  *) malloc( sizeof( uint8_t ) * (FHT_N/2) );
#endif

and once you are done with it

#if (LOG_OUT == 1)
free(fht_log_out);
fht_log_out = (uint8_t  *)  0;
#endif

do the same with the other statically allocated arrays

Thank you ... I will try.

It does not work: execution hangs when calling the window function. Beforehand I use "malloc" for "fht_input" and "fht_log_out", which leaves 842 bytes of RAM free. I do not know if I am doing something wrong. Without "malloc" the code worked.

  ...
  fht_input = (int  *) malloc( sizeof( int ) * (FHT_N/2) );
  fht_log_out = (uint8_t  *) malloc( sizeof( uint8_t ) * (FHT_N/2) );
  byte tADCSRA = ADCSRA;
  byte tADMUX = ADMUX;
  ADCSRA = micADCSRA;
  ADMUX = micMux;
  for (int i=0;i<FHT_N;i++) {
    while(!(ADCSRA & 0x10));
    ADCSRA = micADCRst;
    byte m = ADCL;
    byte j = ADCH;
    int q = (j << 8) | m;
    q -= 0x0200;
    q <<= 6;
    fht_input[i] = q;
  }
  fht_window();
  fht_reorder();
  fht_run();
  fht_mag_log();
  sei();
  ADCSRA = tADCSRA;
  ADMUX = tADMUX;
  for (byte iBin = 1; iBin < FHT_N/2; iBin++) {
    ... copy out ...
  }
  free(fht_input);fht_input = (int  *)  0;
  free(fht_log_out);fht_log_out = (uint8_t  *)  0;
  ...

are you sure the arrays are defined in the right way:

int * fht_input;
uint8_t * fht_log_out;

are you sure you removed the #include for the FHT library and just added the modified files to your project?

Yes, I think so. I show a example code.

If you want to test, download the FHT library from "attachment:ArduinoFHT2.zip of ArduinoFHT - Open Music Labs Wiki" decompresses and copies the files:

256_reorder.inc
cas_lookup_256.inc
decibel.inc
hann_256.inc

into the project folder, then change the paths to these files in the code.

// Replace the path "/home/ismael/Arduino/sketch_jun24a/" in the #include lines
// below with the folder of your own sketch.

#define micMux 0x46 // value written to ADMUX: 0x46 = A6 for the microphone (with the IDE and a Pro Mini use 0x46; the name A6 does not work)
#define micADCRst 0xf7 // value written to ADCSRA to restart the ADC for the microphone
#define micADCSRA 0xe7 // value written to ADCSRA for the microphone; 7 => ADC prescaler 128

// FHT library
#define LOG_OUT 1 // declares the log_out array
#define FHT_N 256 // sets the FHT to 256 points
#define STRINGIFY_(a) #a
#define STRINGIFY(a) STRINGIFY_(a)
#define SCALE 1
#define WINDOW 1
#define OCT_NORM 1
#define REORDER 1
#define OCTAVE 0
#define LOG_N 8
#define _R_V 8 // reorder value - used for reorder list
#include <avr/pgmspace.h>
// Lookup tables kept in program memory (PROGMEM); their contents come from the
// .inc files shipped with the FHT library.
// NOTE(review): presumably declared extern so the symbols keep external
// linkage and stay visible by name to the inline assembly - confirm.
extern const int16_t _cas_constants[] PROGMEM = {
  #include "/home/ismael/Arduino/sketch_jun24a/cas_lookup_256.inc"
};
extern const uint8_t _reorder_table[] PROGMEM = {
  #include "/home/ismael/Arduino/sketch_jun24a/256_reorder.inc"
};
extern const uint8_t _log_table[] PROGMEM = {
  #include "/home/ismael/Arduino/sketch_jun24a/decibel.inc"
};
extern const int16_t _window_func[] PROGMEM = {
  #include "/home/ismael/Arduino/sketch_jun24a/hann_256.inc"
};

// Work buffers as heap pointers, allocated and freed inside Spectrum().
// NOTE(review): library assembly that still does "ldi rX, lo8(fht_input)" /
// "hi8(fht_input)" loads the address of these pointer variables, not of the
// heap buffers they point to.
uint8_t * fht_log_out = (uint8_t  *)  0;;
int * fht_input = (int  *)  0;;
// Original static buffers from the library, replaced by the pointers above:
//uint8_t fht_log_out[(FHT_N/2)];
//int fht_input[(FHT_N)];

static inline void fht_run(void) { ... copy from FHT library ... }
static inline void fht_reorder(void) { ... copy from FHT library ... }
static inline void fht_mag_log(void) { ... copy from FHT library ... }
static inline void fht_mag_lin(void) { ... copy from FHT library ... }
static inline void fht_mag_lin8(void) { ... copy from FHT library ... }
static inline void fht_window(void) { ... copy from FHT library ... }
static inline void fht_mag_octave(void) { ... copy from FHT library ... }
void Spectrum() {
  Serial.println(" Memory...");
  fht_input = (int  *) malloc( sizeof( int ) * (FHT_N/2) );
  fht_log_out = (uint8_t  *) malloc( sizeof( uint8_t ) * (FHT_N/2) );
  byte tADCSRA = ADCSRA;byte tADMUX = ADMUX;ADCSRA = micADCSRA;ADMUX = micMux;
  Serial.println(" Captures...");
  for (int i=0;i<FHT_N;i++) {
    while(!(ADCSRA & 0x10));
    ADCSRA = micADCRst;byte m = ADCL;byte j = ADCH;int q = (j << 8) | m;
    q -= 0x0200;q <<= 6;fht_input[i] = q;
  }
  Serial.println(" Windows...");
  fht_window();
  Serial.println(" Reorder...");
  fht_reorder();fht_run();fht_mag_log();sei();
  ADCSRA = tADCSRA;ADMUX = tADMUX;
  Serial.println(" Result...");
  for (byte iBin = 1; iBin < FHT_N/2; iBin++) {
    Serial.print("  ");Serial.print(iBin);Serial.print(": ");Serial.println(fht_log_out[iBin]);
  }
  free(fht_log_out);fht_log_out = (uint8_t  *)  0;
  free(fht_input);fht_input = (int  *)  0;
}
void setup() {
  pinMode(7, OUTPUT);digitalWrite(7, LOW);delay(100); // enciende microfono
  Serial.begin(115200);
  Serial.println("Setup...");
}
void loop() {
  Serial.println("Spectrum:");
  Spectrum();
  delay(5000);
}

Hi

semi bad news... (more work)

I had a look at the library source code and they do assembly code like below for example in fht_mag_log

  asm volatile (
  "ldi r26, lo8(fht_input) \n" // set to beginning of data space
  "ldi r27, hi8(fht_input) \n"
  "ldi r28, lo8(fht_log_out) \n" // set to beginning of result space
  "ldi r29, hi8(fht_log_out) \n"
  "ldi r30, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n" // set to end of data space
  "ldi r31, hi8(fht_input + "STRINGIFY(FHT_N*2)") \n"

and they do that in a static inline function

The challenge with this is that the assembly language command ldi r26, lo8(fht_input) is calculated at compile time.

The register pairs R26:R27, R28:R29 and R30:R31 play a special role in the AVR architecture. The role is so important that these pairs have extra names in assembler: X, Y and Z. These pairs are 16-bit pointer registers, able to hold addresses of up to 16 bits pointing into SRAM (X, Y or Z) or into program memory (Z).

So by loading R26:R27 with fht_input, they actually set the X register pointing to the input buffer; register Y points to the fht_log_out buffer, and register Z points just past the end of fht_input (fht_input + FHT_N*2 bytes), which marks the end of the data.

The challenge with that is that ldi loads an 8-bit constant directly into a register, so the bits stored in register r26 are the least significant 8 bits of the 16-bit address of 'fht_input', which at compile time is the address of our pointer variable (which still points to nothing), not the address of the buffer...

So it looks like they optimized the assembly language to directly point to a static address of the statically created array...

So if you want to fix this, then you need to go through all the assembly language and instead of doing ldi commands you should replace those with an instruction that gets what is at run time in the fht_input variable (because this is the pointer to the beginning of the data) and not the address of our pointer.

it's not un-doable, there are probably less than 20 occurrences of these in fht.h - just need to look at the ldi instructions referencing either fht_input or fht_log_out which are the 2 we messed around with and replace ldi with another code.

for example

 "ldi r26, lo8(fht_input) \n" // set to beginning of data space
  "ldi r27, hi8(fht_input) \n"

means

take the less significant 8 bits of fht_input address and store in register r26
take the most significant 8 bits of fht_input address and store in register r27

then the combo register X (r26:r27) points at the beginning of the fht_input array.

what we want to do is :

take the less significant 8 bits of what is held by the fht_input variable and store in register r26
take the most significant 8 bits of what is held by the fht_input variable and store in register r27

in assembly language, there is the command lds, you can use like this:

lds r2,$FF00 ; Load r2 with the contents of data space location $FF00

I have not tried but probably

  "lds r26, fht_input \n" // set to beginning of data space
  "lds r27, fht_input+1 \n"

would do what we want. go dynamically look into what is held in the fht_input pointer and use that as least significant part of the address vector, and then the next byte for the most significant...

same applies to the fht_log_out actions

want to give it a go?

(I've not checked if all the other files that are dynamically included hold any similar code).

to hell with encapsulation, hiding and others. Why assembler? arrrrrrr ...

First, many thanks for your help.

I have tried what you comment me. But I've stumbled on ...

...
  "cpi r28, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n" // check if at end of dataspace
  "cpc r29, r9 \n"
...

and then ...

...
  "subi r26, lo8(-(fht_input)) \n" // pointer to offset
  "sbci r27, hi8(-(fht_input)) \n"
....

I suppose this could be solved by loading "fht_input" into a register with "lds" and operating on that register ... But I do not even know which register I could use (I know no assembler).

Desperately I searched a development of FHT in "C" language and found wff_fht but I see that applies to Arduino.

well assembly is fast which is key for returning quickly and not loose too many samples of the next window...
They developed it with static arrays in mind but indeed does not make it easy to modify... esp. if you don't understand assembly language...

doing math inline as they can afford to do because their array is static is OK for them but won't work for you.

My recommendation would be to use some other registers (which you need to save and restore and ensure they are not used anywhere else in their code) where you would do some math to store the weird things they need such as lo8(-(fht_input)) or hi8(-(fht_input)) or lo8(fht_input + "STRINGIFY(FHT_N*2)") and then when they need this, use an operation with the register instead of static values..

quite ambitious as you would need to read and understand most of the assembly they wrote...

if you find something in C, yes there are chances it will work better as all will likely be resolved at run time, not compilation time.

Hello, I finally learn some assembler for the microcontroller.
I hope you can help me something.
So far I have this ...

  "lpm r26,z+ \n" // fetch source address
  "clr r27 \n" // these next 3 lines could be optimized out
  "lsl r26 \n" // by chaging the lookup table
  "rol r27 \n" // only works for FHT_N <= 128
  "subi r26, lo8(-(fht_input)) \n" // add pointer to offset
  "sbci r27, hi8(-(fht_input)) \n"
  "ld r2,x+ \n" // fetch source
  "ld r3,x \n"

It could be done this way ...

 asm (
  "ldi r26,240 \n"
  "clr r27 \n" //
  "lsl r26 \n" //
  "rol r27 \n" // Multiplica por 2 el índice del array (2byte por cada int)
  "lds r24,(a) \n"
  "lds r25,(a+1) \n"  // almacena dirección de inicio del array
  "add r26, r24 \n"
  "adc r27, r25 \n"
  "sts (m),r26 \n"      // almacena en m para comprobar
  "sts (m+1),r27 \n"
  : : : "r24", "r25", "r26", "r27"
  );

I do not understand what is the meaning of subtracting a negative number, instead of adding.

And I wish that might correct me if you see something that is not right, thank you.

Can you comment on

"ldi r26,240 \n" ??

so I suppose you question is about this

  "subi r26, lo8(-(fht_input)) \n" // add pointer to offset
  "sbci r27, hi8(-(fht_input)) \n"

When you wan to do math on 16 bits, you do that in 2 steps first on the least significant byte and then on the most significant byte by taking into account the carry flag that you might have generated in the first operation.

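If the operation you want to do is an addition, then bad luck :(... The AVR has no general 'add immediate' or 'add immediate with carry' instructions for a single register (there is only adiw, which adds a small constant, 0 to 63, to a register pair).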

But engineers are smart :) and in order to do an addition you use the subtract immediate and subtract immediate with carry instructions with the negated constant.

The operation is done as follows:

  1. Subtract immediate Low byte of negated number from register Low byte.
  2. Subtract immediate with carry High byte of negated number from register High byte.

As we want to do this with a value, for that you use the SUBI and SBCI instructions

SUBI Rx,K ; Subtract the constant K from the content of register Rx and store the result in register Rx

SBCI Rx,K ; Subtract the constant K and the current value of the carry flag from the content of register Rx and store the result in register Rx

The lo8 and hi8 are what they call Relocatable Expression Modifiers which let you extract the low or high part of your 16 bits parameter

That's what happening in that code...

"ldi r26,240 \n"

This is just to have a value I can work with, so that I do not have to load the constant from PROGMEM via the include.

With this test I believe I am doing it the right way to apply it to the FHT library code. It is my first time and I am working blind. I will keep developing it and try to test it. If you see any error, please tell me.

Thank you.

I keep on ...

  // reset for next pass
  "movw r24,r10 \n" // make a quarter stride
  "lsr r25 \n"
  "ror r24 \n"
  "add r28,r24 \n" // lower is incremented by quarter stride to get to the next butterfly
  "adc r29,r25 \n"
  "cpi r28, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n" // check if at end of dataspace
  "cpc r29, r9 \n"
  "brsh 10f \n"
  "movw r26,r28 \n" // bottom is now top
  "add r28,r10 \n" // bottom is moved down half a stride
  "adc r29,r11 \n"
  "mov r14,r8 \n" // reset inner loop counter
  "sub r30,r10 \n" // reset Wk lookup table pointer
  "sbc r31,r11 \n"
  "adiw r30,0x04 \n" // number of butterflies minus 1 for the ones not done
  "rjmp 6b \n" // keep going

It could be done this way ...

  "ldi r24,1 \n"
  "ldi r25,0 \n"    // almacena uno como incremento para trabajar
  "lds r28,(a) \n"
  "lds r29,(a+1) \n"
  "sts (i),r24 \n"      // almacena en i para comprobar
  "sts (i+1),r25 \n"
  "sts (im),r28 \n"      // almacena en im para comprobar
  "sts (im+1),r29 \n"
  "9: \n"
  "ldi r24,1 \n"
  "ldi r25,0 \n"
  "add r28, r24 \n"
  "adc r29, r25 \n" // incrementa el índice que apunta al array
  "lds r24,(a) \n"
  "lds r25,(a+1) \n"
  "subi r24,lo8(-("STRINGIFY(nMuestras*2)")) \n"
  "sbci r25,hi8(-("STRINGIFY(nMuestras*2)")) \n" // almacena final array
  "sts (m),r24 \n"      // almacena en m para comprobar
  "sts (m+1),r25 \n"
  "sts (d),r28 \n"      // almacena en d para comprobar
  "sts (d+1),r29 \n"
  "cp r28,r24 \n"
  "cpc r29,r25 \n"  // compara si llego al final del array
  "brsh 10f \n" 
  "rjmp 9b \n"
  "10: \n"

Assembly madness

All right, 772 bytes of RAM (with lin_out) are now used and then freed. Now to see how to free up program memory. ;)

  uint8_t*  fht_log_out = (uint8_t  *)  0;; // NUEVO // uint8_t fht_log_out[(FHT_N/2)]; // FHT log output magintude buffer
  uint16_t* fht_lin_out = (uint16_t  *)  0;; // NUEVO // uint16_t fht_lin_out[(FHT_N/2)]; // FHT linear output magintude buffer
  uint8_t* fht_lin_out8 = (uint8_t  *)  0;; // NUEVO // uint8_t fht_lin_out8[(FHT_N/2)]; // FHT linear output magintude buffer
int* fht_input = (int  *)  0;; // NUEVO // int fht_input[(FHT_N)]; // FHT input data buffer

  "lds r28, (fht_input) \n" // NUEVO "ldi r28, lo8(fht_input) \n" //set to beginning of data space
  "lds r29, (fht_input+1) \n" // NUEVO "ldi r29, hi8(fht_input) \n"

  "lds r25, (fht_input+1) \n"  // NUEVO
  "sbci r25, hi8(-(" STRINGIFY(FHT_N*2) ")) \n" //NUEVO "ldi r16, hi8((fht_input + " STRINGIFY(FHT_N*2) ")) \n"
  "mov r16, r25 \n"  // NUEVO prep end of dataspace register

  "lds r26, (fht_input) \n"  // NUEVO "ldi r26, lo8(fht_input) \n" //set top pointer to beginning of data space
  "lds r27, (fht_input+1) \n"  // NUEVO "ldi r27, hi8(fht_input) \n"

  "lds r24,(fht_input) \n"  // NUEVO
  "lds r25,(fht_input+1) \n"  // NUEVO Almaceno inicio array
  "subi r24,lo8(-(" STRINGIFY(FHT_N*2) ")) \n"  // NUEVO
  "sbci r25,hi8(-(" STRINGIFY(FHT_N*2) ")) \n"  // NUEVO almacena final array
  "cp r28, R24 \n" // NUEVO "cpi r28, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n"
  "cpc r29, r25 \n" // NUEVO  "cpc r29, r9 \n" // check if at end of dataspace

  "lds r24,(fht_input) \n"  // NUEVO carga direccion alamcendada en puntero
  "lds r25,(fht_input+1) \n"  // NUEVO carga direccion alamcendada en puntero antes del bucle '1b'

  "add r26, r24 \n"  // NUEVO "subi r26, lo8(-(fht_input)) \n" // add pointer to offset
  "adc r27, r25 \n"  // NUEVO "sbci r27, hi8(-(fht_input)) \n"

  "add r28, r24 \n"  // NUEVO "subi r28, lo8(-(fht_input)) \n" // add pointer to offset
  "adc r29, r25 \n"  // NUEVO "sbci r29, hi8(-(fht_input)) \n"

  "lds r26, (fht_input) \n" // NUEVO "ldi r26, lo8(fht_input) \n" // set to beginning of data space
  "lds r27, (fht_input+1) \n" // NUEVO "ldi r27, hi8(fht_input) \n"
  "lds r28, (fht_log_out) \n" // NUEVO "ldi r28, lo8(fht_log_out) \n" // set to beginning of result space
  "lds r29, (fht_log_out+1) \n" // NUEVO "ldi r29, hi8(fht_log_out) \n"
  "lds r30,(fht_input) \n"  // NUEVO
  "lds r31,(fht_input+1) \n"  // NUEVO
  "subi r30,lo8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r30, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n" // set to end of data space
  "subi r31,hi8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r31, hi8(fht_input + "STRINGIFY(FHT_N*2)") \n"

  "lds r26, (fht_input) \n" // NUEVO "ldi r26, lo8(fht_input) \n" // set to beginning of data space
  "lds r27, (fht_input+1) \n" // NUEVO "ldi r27, hi8(fht_input) \n"
  "lds r28, (fht_lin_out) \n" // NUEVO "ldi r28, lo8(fht_lin_out) \n" // set to beginning of result space
  "lds r29, (fht_lin_out+1) \n" // NUEVO "ldi r29, hi8(fht_lin_out) \n"
  "lds r30,(fht_input) \n"  // NUEVO
  "lds r31,(fht_input+1) \n"  // NUEVO
  "subi r30,lo8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r30, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n" // set to end of data space
  "subi r31,hi8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r31, hi8(fht_input + "STRINGIFY(FHT_N*2)") \n"

  "lds r26, (fht_input) \n" // NUEVO "ldi r26, lo8(fht_input) \n" // set to beginning of data space
  "lds r27, (fht_input+1) \n" // NUEVO "ldi r27, hi8(fht_input) \n"
  "lds r28, (fht_lin_out8) \n" // NUEVO "ldi r28, lo8(fht_lin_out8) \n" // set to beginning of result space
  "lds r29, (fht_lin_out8+1) \n" // NUEVO "ldi r29, hi8(fht_lin_out8) \n"
  "lds r30,(fht_input) \n"  // NUEVO
  "lds r31,(fht_input+1) \n"  // NUEVO
  "subi r30,lo8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r30, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n" // set to end of data space
  "subi r31,hi8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r31, hi8(fht_input + "STRINGIFY(FHT_N*2)") \n"

  "lds r28, (fht_input) \n" // NUEVO "ldi r28, lo8(fht_input) \n" // set to beginning of data space
  "lds r29, (fht_input+1) \n" // NUEVO "ldi r29, hi8(fht_input) \n"

  "lds r26, (fht_input) \n" // NUEVO "ldi r26, lo8(fht_input) \n" // set to beginning of data space
  "lds r27, (fht_input+1) \n" // NUEVO "ldi r27, hi8(fht_input) \n"
  "lds r28, (fht_oct_out) \n" // NUEVO "ldi r28, lo8(fht_oct_out) \n" // set to beginning of result space
  "lds r29, (fht_oct_out+1) \n" // NUEVO "ldi r29, hi8(fht_oct_out) \n"
  "lds r30,(fht_input) \n"  // NUEVO
  "lds r31,(fht_input+1) \n"  // NUEVO
  "subi r30,lo8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r30, lo8(fht_input + "STRINGIFY(FHT_N*2)") \n" // set to end of data space
  "subi r31,hi8(-(" STRINGIFY(FHT_N*2) ")) \n" // NUEVO "ldi r31, hi8(fht_input + "STRINGIFY(FHT_N*2)") \n"

That's All Folks

Thank you J-M-L

Well done!!!