Replicating hardware SPI reads and writes with a software SPI library

Hello,

I recently came across a software SPI library, and I want to use it on the Due. Previously I used the hardware SPI like this:

while((SPI0->SPI_SR & SPI_SR_TDRE)==0)
 {;}
  
  while ((SPI0->SPI_SR & SPI_SR_TXEMPTY) == 0);
  digitalWriteDirect(12, HIGH);
 SPI0->SPI_TDR = rout;
   //SPI0->SPI_TDR = lout;
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
  __asm__("nop");
 
 digitalWriteDirect(12, LOW);
 
 while((SPI0->SPI_SR & SPI_SR_TDRE)==0)
 {;}
 
 while((SPI0->SPI_SR & SPI_SR_RDRF) == 0)
 {;}
 lin = SPI0->SPI_RDR;
 delayMicroseconds(1);
  while((SPI0->SPI_SR & SPI_SR_RDRF) == 0)
 {;}
 rin = SPI0->SPI_RDR;

I am doing 16-bit SPI transfers, and I want to do the same with the software SPI library from here:
http://www.iesensor.com/blog/2013/07/09/software-spi-library-for-arduino-tested-on-mega2560/

 #include <stdint.h>
    // Pin numbers used by the bit-banged SPI routines below.
    // NOTE(review): names starting with '_' followed by an uppercase letter are
    // reserved identifiers in C/C++; consider renaming them file-wide.
    #define _MISO  40
    #define _MOSI  41
    #define _SCK   42
    #define _SS  12
    
    // Clock polarity / phase selection, evaluated with #if in spi_setup() and
    // spi_transfer(): _CPOL picks the idle level of SCK, _CPHA picks which
    // branch of the transfer loop is compiled.
    #define _CPOL 0
    #define _CPHA 1  
  
    // Bit-order flag. The #if that would test it is commented out further
    // down, so the MSB-first mask table is always the one compiled.
    #define SPI_MSBFIRST 1
    
    // Prototypes for the two entry points defined later in this file.
    uint8_t spi_transfer(uint8_t b) ;
    void spi_setup();
    
    //#if  1//defined(__AVR_ATmega2560__)
      //for example 8cycles as one SPI period, for Arduino mega2560 16MHz, SPI 2MHz,  tested passed for AD7730!
      //for DUE 84MHz,  need longer nop cycles to keep waveform!  -> multiply each delay by 6 
      // All values below are passed to delayCycles() as iteration counts.
      #define DELAY_CYCLES  2     //more precise than a microsecond delay, 1/4 of the SPI bus period, depends on the MCU master clock
      #define DELAY_CYCLES_P0  1  //propagation pre  (clock edge -> MOSI change)
      #define DELAY_CYCLES_P1  3  //propagation post (MOSI change -> next clock edge)
      #define DELAY_CYCLES_C0 1   //capture pre (SCK edge -> capture) usually smaller delay
      #define DELAY_CYCLES_C1 3   //capture post ( capture -> SCK edge)  usually bigger delay 
    //#else
    //  #error "check your board and MCU main frequency and set proper delays for software_SPI bit-banging"
    //#endif
  
 // #endif

  
  /*
  The timing diagram is shown to the right. The timing is further described below and applies to both the master and the slave device.
  
      At CPOL=0 the base value of the clock is zero
          For CPHA=0, data is captured on the clock's rising edge (low→high transition) and data is propagated on a falling edge (high→low clock transition).
          For CPHA=1, data is captured on the clock's falling edge and data is propagated on a rising edge.  ->  MODE1 for Arduino SPI lib
      At CPOL=1 the base value of the clock is one (inversion of CPOL=0)
          For CPHA=0, data is captured on clock's falling edge and data is propagated on a rising edge.
          For CPHA=1, data is captured on clock's rising edge and data is propagated on a falling edge.
  */

//#endif // header

//if SOFTWARE_SPI

  static inline void delayCycles(int cycles)
  {
      for (int i=0; i++; i<cycles)
          __asm__("nop\n\t");
         // __nop();
  }
  
  
  // Bit-order tables. The #if/#else selecting between them is commented out,
  // so the MSB-first pair is always the one in effect.
  //#if SPI_MSBFIRST
    // msk[i] is the mask applied to the outgoing byte on loop iteration i:
    // iteration 0 sends bit 7, iteration 7 sends bit 0.
    unsigned char msk[] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
    // PCK() packs eight sampled levels into one byte, bits[0] landing in bit 7.
    // It takes no arguments: it expands to an expression over an array named
    // `bits` that must be in scope at the point of use (spi_transfer's local).
    // Each element is expected to be 0 or 1.
    #define PCK() (bits[0] << 7 | bits[1] << 6 | bits[2] << 5 | bits[3] << 4 | bits[4] << 3 | bits[5] << 2 | bits[6] << 1 | bits[7])
  //#else
    //unsigned char msk[] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
    //#define PCK() (bits[0]  | bits[1] << 1 | bits[2] << 2 | bits[3] << 3 | bits[4] << 4 | bits[5] << 5 | bits[6] << 6 | bits[7]<<7)
 // #endif
  
  
  
  void spi_setup()
  {
    pinMode(_SS, OUTPUT);
    pinMode(_MISO, INPUT);
    pinMode(_MOSI, OUTPUT);
    pinMode(_SCK, OUTPUT);
    //
     digitalWrite(_SS, HIGH);
    #if _CPOL
     digitalWrite(_SCK, HIGH);
    #else
     digitalWrite(_SCK, LOW);
    #endif
  }
  
  // Chip select is driven by the caller, outside spi_transfer(), using these two helpers.
  inline void spi_select() {digitalWrite(_SS, LOW);}
  inline void spi_unselect() {digitalWrite(_SS, HIGH);}
  
  //mode 0: SCK idle low, phase: reading at middle of SCK HIGH pulse
  //mode 1: SCK idle low, phase: reading at middle of SCK LOW pulse
  //this big-bang should work for both  CPHA=1  and CPHA=0
  uint8_t spi_transfer(uint8_t b) 
  {
    uint8_t reply=0;
    char bits[8] = {0, 0, 0, 0, 0, 0, 0, 0};  //reading buffer
  
    /*  hardware SPI
    SPDR=b;
    spi_wait();
    reply = SPDR;
    return reply;*/
  
      //cli();  it will cause error on Arduino, most of SPI should be interrupt tolerable
      //spi_select(); // should be called outside, may be required by one transition
      delayCycles(DELAY_CYCLES);   // checking timing characteristics, need delay from CS to rising edge?
      // here, delay is added, to make CPHA=1 and CPHA=0 both work!
      
      for(uint8_t _bit = 0;_bit < 8;_bit++)
      {
      #if _CPHA
        #if _CPOL 
          digitalWrite(_SCK, LOW);  //  propagation at rising edge
       #else
          digitalWrite(_SCK, HIGH);  // change this to LOW for CPOL=1
       #endif
          digitalWrite(_SCK, HIGH);  // change this to LOW for CPOL=1
          delayCycles(DELAY_CYCLES_P0);    
      
          digitalWrite(_MOSI, !!(b & msk[_bit]));
          delayCycles(DELAY_CYCLES_P1); //  propagation
       #if _CPOL 
         digitalWrite(_SCK, HIGH);  // data will be captured at falling edge
       #else
          digitalWrite(_SCK, LOW);  
       #endif
         delayCycles(DELAY_CYCLES_C0); // holding low, so there is enough time for data preparation and changing
      
         bits[_bit] = digitalRead(_MISO); // reading at the middle of SCK pulse
         delayCycles(DELAY_CYCLES_C1);  // wait until data is fetched by slave device,  while SCK low, checking DATAsheet for this interval 
      
      #else
          // changing MOSI big while SCK low, propogation 
          digitalWrite(_MOSI, !!(b & msk[_bit]));
          delayCycles(DELAY_CYCLES_P1); // there is a requirement of LOW and HIGH have identical interval!
  
        #if _CPOL
         digitalWrite(_SCK, LOW); 
        #else
         digitalWrite(_SCK, HIGH);
        #endif
         delayCycles(DELAY_CYCLES_C0);    // 
      
         bits[_bit] = digitalRead(_MISO); // reading at the middle of SCK pulse
         delayCycles(DELAY_CYCLES_C1);  // wait until data is fetched by slave device,  while SCK high, checking DATAsheet for this interval 
         
        #if _CPOL
          digitalWrite(_SCK, HIGH);
        #else
         digitalWrite(_SCK, LOW);  // data will change at falling edge
        #endif
         delayCycles(DELAY_CYCLES_P0); // holding low, so there is enough time for data preparation and changing
      
      #endif
  
      }
     delayCycles(DELAY_CYCLES);  // checking timing characteristics, it is no needed by AD7730, from CS to rising edge
     // spi_unselect();  
  
      //sei();  // it will cause error on Arduino, most of SPI should be interrupt tolerable
  
     reply = PCK();
  
    return reply;
   
  }

//#endif

/* Byte handed to spi_transfer() on every pass of loop(); nothing in this
   sketch assigns it, so it keeps its zero initial value. */
uint8_t temp1;

/* Arduino entry point, run once: prepares the software-SPI pins. */
void setup()
{
  spi_setup();
}

/* Arduino main loop: clocks temp1 out over the software SPI on every pass.
   The received byte is discarded. */
void loop()
{
  (void)spi_transfer(temp1);
}

How shall I do it?