fastest way to right shift an array of 4 bytes

After spending a while on this and continually getting wrong results from trying to shift a 'long'(not what should be expected/compiler bug?)

byte workingstorage[] = { 0xA0, 0xA8, 0x05, 0x01 };

void generateMask (uint8_t* workingstorage,int bitshift){
......
}

Basically I want to do a 32 bit shift right by a given number of bits, any Ideas

I'm not fully understanding, if you right shift 32 bits from a size of 4 bytes, you have nothing left, all bits are shifted out.
Just clear the array.

Post the code you have.

EDIT: are you saying that code like this isn't working how you expect

long l_Data = 0x00FF0000;
l_Data = l_Data >> 8;

Note that the results of shifting of signed integers (especially RIGHT shifting) are poorly defined (or explicitly NOT defined.)
Make sure you make your "long" is unsigned!

unsigned long rightshift(unsigned long a, char bits)
{
    return(a>>bits);
}

Generates exactly the code I'd expect; lsr/ror/ror/ror inside a loop...

Try using casts to make your intention clear to the compiler:

long data,value;

/* assign values to data and nbits */

value = (long)((unsigned long)data) >> nbits;

Normally I avoid casts, but this is one of the few situations where they make sense.

If you need to shift an unsigned long very often (> 1000's times per second) and you need it to be as fast as possible and you have flash to spare,
then you should know it can be optimized especially if you do an inplace shift and even more when the shift factor is constant.

A test sketch shows how the technique works, moving blocks of 8 bits first and then shift the last few bits.
This testcode is not faster for shift values smaller than 8 but faster for larger values.

If the shift factor is fixed you can write a dedicated shift e.g. inline unsigned long shift17(unsigned long l) - see testcode -

//
// FILE: shiftOptimize.pde
//   BY: Rob Tillaart
// DATE: 2011-jul-19
//
union t
{
  byte b[4];
  unsigned long l;
}

volatile temp;
volatile unsigned long y = 0;
unsigned long start;
unsigned long times = 1000L;

void setup()
{  
  Serial.begin(115200);
  Serial.println("shiftOptimize 0.1");

  unsigned long sum1 = test1();
  unsigned long sum2 = test2();
  Serial.print("average speedup factor: ");
  Serial.println(1.0* sum1/sum2, 2);
   
  shift17();

}

unsigned long test1()
{
  Serial.println("#===================#");
  Serial.println("# Normal shift 0-31 #");
  Serial.println("#===================#");

  unsigned long sum = 0;
  unsigned long min = 9999;
  unsigned long max = 0;
  unsigned long duration;

  for (int a=0; a <32; a++)
  {
    temp.l = 0xAA555555;
    start = micros();
    for (long  i=0; i< times; i++)
    {
      y = temp.l >> a;
    }
    duration = micros() - start;
    sum += duration;
    if (duration < min) min = duration;
    if (duration > max) max = duration;
    Serial.print(a);
    Serial.print(", ");
    Serial.println(duration);
  }
  Serial.print("average: ");
  Serial.println(sum/32);
  Serial.print("min: ");
  Serial.println(min);
  Serial.print("max: ");
  Serial.println(max);
  return sum;
}

unsigned long test2()
{ 
  Serial.println("#============================#");
  Serial.println("# Speed optimized shift 0-31 #");
  Serial.println("#============================#");

  unsigned long sum = 0;
  unsigned long min = 9999;
  unsigned long max = 0;
  unsigned long duration;
  for (int a=0; a <32; a++)
  {
    temp.l = 0xAA555555;
    int t=a;  // keep copy of a
    start = micros();
    for (long  i=0; i< times; i++)
    {
      if (a < 8);
      else if (a < 16)
      {
        temp.b[0] = temp.b[1];
        temp.b[1] = temp.b[2];
        temp.b[2] = temp.b[3];
        temp.b[3] = 0;
        a -= 8;
      }  
      else if (a < 24)
      {
        temp.b[0] = temp.b[2];
        temp.b[1] = temp.b[3];
        temp.b[2] = 0;
        temp.b[3] = 0;
        a -= 16;
      }   
      else
      {
        temp.b[0] = temp.b[3];
        temp.b[1] = 0;
        temp.b[2] = 0;
        temp.b[3] = 0;
        a -= 24;
      }
      y = temp.l >> a;
      // if (y != 0xAA555555 >> t) Serial.print('.');
    }
    duration = micros() - start;
    sum += duration;
    if (duration < min) min = duration;
    if (duration > max) max = duration;
    a = t;
    Serial.print(a);
    Serial.print(", ");
    Serial.println(duration);
  }
  Serial.print("average: ");
  Serial.println(sum/32);
  Serial.print("min: ");
  Serial.println(min);
  Serial.print("max: ");
  Serial.println(max);

  return sum;
}

unsigned long shift17()
{
  Serial.println("#==============#");
  Serial.println("# Shift17 test #");
  Serial.println("#==============#");

  unsigned long sum = 0;
  unsigned long min = 9999;
  unsigned long max = 0;
  unsigned long duration;

  temp.l = 0xAA555555;
  int a = 17;

  start = micros();
  for (long  i=0; i< times; i++)
  {
    temp.b[0] = temp.b[2];
    temp.b[1] = temp.b[3];
    temp.b[2] = 0;
    temp.b[3] = 0;
    y = temp.l >> 1;
  } 
  duration = micros() - start;
  sum += duration;
  if (duration < min) min = duration;
  if (duration > max) max = duration;

  Serial.print(a);
  Serial.print(", ");
  Serial.println(duration);

  Serial.print("average: ");
  Serial.println(sum/32);
  Serial.print("min: ");
  Serial.println(min);
  Serial.print("max: ");
  Serial.println(max);

  return sum;
}

void loop()
{
}

pYro_65:
I'm not fully understanding, if you right shift 32 bits from a size of 4 bytes, you have nothing left, all bits are shifted out.
Just clear the array.

Post the code you have.

EDIT: are you saying that code like this isn't working how you expect

long l_Data = 0x00FF0000;

l_Data = l_Data >> 8;

Yep exactly, I'm getting some stupidity when the top bits are manipulated.
I started out with something simple like this

#include <Flash.h>
#include <LiquidCrystal.h>
#include <ctype.h>
#include <limits.h>

// select the pins used on the LCD panel
LiquidCrystal lcd(8, 9, 4, 5, 6, 7);


byte workingstorage[] = { 
  0xFF, 0xFF, 0xFF, 0xFF };

void setup()
{
  Serial.begin(115200);
  lcd.begin(16, 2);      // Start the  LCD library
  lcd.clear();           // Clear LCD  
}

void loop(){
  shifter(workingstorage,false,1);
  lcd.setCursor(0,1);  //set the LCD to the second line pos 0
  mask_to_str(workingstorage);
  delay(1000);
}


void shifter(uint8_t* mask, boolean right, int count){
  unsigned long ulong;
  ulong = ( (mask[0] << 24) 
    + (mask[1] << 16) 
    + (mask[2] << 8) 
    + (mask[3] ) );

  if (right)
  {
    ulong = ulong >>count;
  }
  else{
    ulong = ulong <<count; 
  }

  mask[0] = (int)((ulong >> 24) & 0xff) ;
  mask[1] = (int)((ulong >> 16) & 0xff) ;
  mask[2] = (int)((ulong >> 8) & 0xff);
  mask[3] = (int)((ulong & 0xff));
}

void mask_to_str(const uint8_t* mask)
{
  static char buf[16];
  // This chews up code size!! adding 1.5k
  sprintf(buf,"%03d-%03d-%03d-%03d",mask[0],mask[1],mask[2],mask[3]);
  lcd.print(buf);
}

But as the mask travels left or right , and I print out the 'store' array the values are jumping or sometimes appearing in the wrong place
I thought it was something stupid like sign extension

Sorted it out...,with the use of some satanic pointer diddling and a loop to do each byte separately.
Which is what I should have done instead of trying to shortcut by shifting unsigned longs.

Casting the byte array to an unsigned long may not work as you expect or may not be portable because of endian issues.
This may or may not be a problem for you.

Just being nosy, could you explain what problem you're solving that calls for high speed bit shifting?

high speed bit shifting can be usefull in a library that reads from a sensor. Every single bit (or group of bits) can have a particular meaning. shifting is one way to get these fields.
(another way would be an union wih bitfields)
Another application is encryption and hash functions in which shifts are used. Of course most PRNG's also uses shifts (PseudoRandomNumberGenerators).

And the faster things are done the more time is left for other things ...

Ah, endianness. I forgot about that. Between sign and endianness, I expect we cam explain all of the original poster's problems.

{0x2, 0x5, 0x10, 0x80} cast to a long would give you 0x80100502, and right-shifting as signed would give you something like
0xC0080281 or {0x81, 0x2, 0x8, 0xc0}