One more tweak. This one doesn't use BigNumber, and it runs nearly twice as fast.
We used BigNumber to get around the fact that intermediate results can overflow an unsigned long long in the Miller-Rabin test. That test requires calculating ab % n for large numbers. Using numbers that are less than 1016 - 54 bits - the results always fit into a long long, but the intermediate products that generate the exponentials can be bigger than 264. BigNumber gets around the restriction, but at some cost in speed and storage.
This code uses function "mulmodll()," that produces the product of two unsigned long long's, modulo a third unsigned long long, without overflowing on intermediate products. It generates the product internally as two unsigned long long's, and calculates the modulo by repeated shifts and mods. It has its own overhead - it has to take % as many as seven times. But, it's working in native number formats, rather than in BCD, and it goes faster.
This code depends heavily on the fact that numbers bigger than 1016 aren't investigated. That fact simplifies the 128-bit multiplication, in that certain intermediate results can't overflow, and it simplifies the mod operations, too. The code isn't general. It's assumptions are that the lower half of any argument times the upper half of any argument will be less than 263, and that the modulus will always be less than 256. With 1016 = 253.15, the problem fits, with a bit or two to spare.
Results: Running on the Uno, 100000 primes starting at 961,748,929, and comparing to the fiftieth million from this list - The first fifty million primes - found an exact match, with calculation time averaging about 170 ms per prime. Using the tests of reply #80, calculation time for 16-digit numbers added up to 134,578 ms, for an improvement of a factor of a little better than 5 over using BigNumber, and about 1.9 times as fast as using Numbler4. For 12-digit numbers, calculation time was 52,472 ms, for a similar speed improvement. For numbers near the top of the range, starting at 9,999,999,999,990,001, the average calculation time for 100 primes was around 1.5 seconds - about three times what Nick was looking for.
The Due was ferocious. It averaged 10.6 ms for 1,000 12-digit primes starting at 100000000001; 25.8 ms for 16-digit starting at 1000000000000001, and 33.4 starting at 9900000000036791. My Due ran these tests, and then stopped working - I'm hoping that it's just bored.
There's room for improvement here. I think there's a sweet spot for the number of trial divisions, I think it depends to some extent on the size of the numbers under test, and I don't think I've found it here. There may be a way to reduce the number of modulo calls in function powmodll(), but I can't think of any yet.
Here's the code. The included file is attached at reply #70:
PROGMEM prog_uchar primes[] = {
3, 5, 7, 11, 13, 17, 19, 23, 29, 31,
37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79,
83, 89, 97,101,103,107,109,113,127,131,137,
139,149,151,157,163,167,173,179,181,191,193,
197,199,211,223,227,229,233,239,241,251
};
const byte nPrimes = sizeof(primes)/sizeof(primes[0]);
#include "Pseudoprimes.h";
const uint16_t nPSP = (sizeof(psps)/sizeof(psps[0])) >> 3;
uint64_t candidate;
unsigned long start;
long found = 0; // Number of primes found
void setup()
{
Serial.begin(115200);
Serial.println ("Starting.");
candidate = 961748927ULL;
} // end setup()
void loop()
{
uint64_t nextPSP;
int nextPSPindex;
for (int i=0;i<nPSP;i++) {
nextPSP = fetchPSP(i);
if ((nextPSP == candidate) || (nextPSP > candidate)) {
nextPSPindex = i;
break;
}
nextPSP = 0;
nextPSPindex = 0;
}
start = millis();
for ( ; candidate < 9999999999999999ULL; candidate += 2) {
if (nextPSP != candidate) {
if (isPrime(candidate)) {
found++;
showPrime (candidate);
}
}
else {
nextPSPindex++;
if (nextPSPindex < nPSP) {
nextPSP = fetchPSP(nextPSPindex);
}
}
}
found = 1;
showPrime(1);
found++;
showPrime(2);
candidate = 3; // back to start
} // end loop()
bool Miller(uint64_t source)
{
if(source <= 3) {
return true;
}
if((source & 1) == 0) {
return false;
}
uint64_t d = source - 1;
int s = 0;
while((d & 1) == 0)
{
d >>= 1;
s ++;
}
uint8_t a;
const byte bases[] = {2, 3, 5, 7,};// 11, 13, 17, 19, 23 };
const byte nbases = sizeof(bases)/sizeof(bases[0]);
for(int i = 0; i < nbases; i++)
{
a = bases[i];
uint64_t x = powmodll(a, d, source);
if((x == 1) || (x == (source - 1))) {
continue;
}
bool stillOK = false;
for(int r = 1; r < s; r++)
{
x = mulmodll(x, x, source);
if(x == 1)
return false;
if(x == (source - 1)) {
stillOK = true;
break;
}
}
if(!stillOK)
return false;
}
return true;
} // end Miller()
bool isPrime(uint64_t n) {
if (!check(n)) {
return false;
}
if (n > 0x10000) {
if (!Miller (n)) {
return false;
}
}
return true;
}
void showPrime (uint64_t n)
{
printll(n);
Serial.print(" is prime. Took ");
Serial.print(millis () - start);
Serial.print (" mS. Found ");
Serial.print (found);
Serial.println (" primes.");
start = millis ();
} // end showIfPrime()
bool check(uint64_t n) {
if (n <= 3) {
return true;
}
if (!(n & 1)) {
return false;
}
uint8_t i = 1;
uint8_t f = pgm_read_byte_near(primes);
while ((i < nPrimes) && (f < n)) {
if ((n % f) == 0) {
return false;
}
f = pgm_read_byte_near(primes+i);
i++;
}
return true;
} // end check()
uint64_t fetchPSP(int i) {
i <<= 3;
uint64_t n = 0;
for (int8_t k=0;k<sizeof(n);k++) {
n = n <<= 8;
n |= pgm_read_byte_near(psps+i+k);
}
return n;
} // end fetchPSP()
void printll(uint64_t n) {
// print an unsigned long long
if (n == 0) {
Serial.print('0');
return;
}
const byte DIGITS = 21;
char s[DIGITS+1];
for (byte i=0;i<DIGITS;i++) {
s[i] = 'X';
}
s[DIGITS] = 0;
byte i = DIGITS-1;
while (n > 0) {
s[i] = (n % 10) | '0';
n = n / 10;
i--;
}
i++;
char *z = &s[i];
Serial.print(z);
} // end printll()
uint64_t powmodll(uint8_t a, uint64_t b, uint64_t c) {
// Calculate a**b % c,
uint64_t x = 1;
uint64_t y = a;
while(b > 0) {
if(b & 1) {
x = mulmodll(x, y, c);
}
y = mulmodll(y, y, c);
b >>= 1;
}
return x;
} // end powmodll()
uint64_t mulmodll(uint64_t a, uint64_t b, uint64_t c) {
// Calculate a*b %c
uint64_t imh; // Intermediate results for multiplication
uint64_t iml;
uint64_t imm;
// Multiply a and b
// Routine is not general:
// relies on fact that MSB of inputs is zero
// Product stored as two unsigned long long's
imm = (a & 0xffffffffUL) * (b >> 32);
iml = (a >> 32) * (b & 0xffffffffUL);
imm += iml; // overflow for general inputs
iml = (a & 0xffffffffUL) * (b & 0xffffffffUL);
imm += iml >> 32;
iml = (iml & 0xffffffffUL) + (imm << 32);
imh = (a >> 32) * (b >> 32);
imh += imm >> 32;
if (imh == 0) {
return (iml % c);
}
for (uint8_t i=0;i<8;i++) {
imh = imh << 8;
imh |= iml >> 56;
iml = iml << 8;
if ((imh >> 56) != 0) {
if (imh > c) {
imh %= c;
}
else {
if (imh == c) {
imh = 0;
}
}
}
}
if (imh > c) {
imh %= c;
}
return imh;
} // end mulmodll()
Here's Uno output for comparison to reply #72, with clock speed the standard 16 MHz:
1000000000054141 is prime. Took 1433 mS. Found 1 primes.
1000000000054147 is prime. Took 778 mS. Found 2 primes.
1000000000054259 is prime. Took 1594 mS. Found 3 primes.