i have implemented in assembly Dik T. Winter sin333/cos333 routines
(with some change).
Do you see any error? (i'm not expert in FPU)
In your opinion what is the better and why?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <errno.h>
#include <float.h>
#include <time.h>
#define R return
#define P printf
#define F for
#define W while
#define U unsigned
long double
K2pi=6.28318530717958647692528676655900576839433879875021L;
long double kp[]=
{0.0L,
3.14159265358979323846264338327950288419716939937510L, // pi/1
1.57079632679489661923132169163975144209858469968755L, // pi/2
0.0L, // pi/3
0.78539816339744830961566084581987572104929234984377L, // pi/4
0.62831853071795864769252867665590057683943387987502L, // pi/5
0.52359877559829887307710723054658381403286156656251L, // pi/6
0.44879895051282760549466334046850041202816705705358L, // pi/7
0.39269908169872415480783042290993786052464617492188L, // pi/8
0.34906585039886591538473815369772254268857437770834L, // pi/9
0.31415926535897932384626433832795028841971693993751L // pi/10
};
long double kps[]=
{0.0L, 0.0L, 0.0L, 0.0L, // 0, pi, pi/2 pi/3
0.70710678118654752440084436210484903928483593768847L, // pi/4
0.58778525229247312916870595463907276859765243764314L, // pi/5
0.50000000000000000000000000000000000000000000000000L, // pi/6
0.43388373911755812047576833284835875460999072778745L, // pi/7
0.38268343236508977172845998403039886676134456248562L, // pi/8
0.34202014332566873304409961468225958076308336751416L, // pi/9
0.30901699437494742410229341718281905886015458990288L // pi/10
};
long double kpc[]=
{0.0L, 0.0L, 0.0L, 1.0L, // 0, pi, pi/2 pi/3
0.70710678118654752440084436210484903928483593768847L, // pi/4
0.80901699437494742410229341718281905886015458990288L, // pi/5
0.86602540378443864676372317075293618347140262690519L, // pi/6
0.90096886790241912623610231950744505116591916213185L, // pi/7
0.92387953251128675612818318939678828682241662586364L, // pi/8
0.93969262078590838405410927732473146993620813426446L, // pi/9
0.95105651629515357211643933337938214340569863412575L // pi/10
};
static double eps = 32*DBL_EPSILON;
double cos7(double x)
{double y;
static double b[] =
{-0.00000000001136800186, 0.00000000208758867982,
-0.00000027557315566764, 0.00002480158729369483,
-0.00138888888888807796, 0.04166666666666662966,
-0.5 };
//////////
asm{ fld x // st = x
fld x // st = x, x
mov ecx, 8
fmulp st(1), st(0) // st = x*x
mov esi, offset b
fld qword ptr [esi] // st = a[0], x2
l2:
fmul st(0), st(1) // st = a[c]*x2, x2
fld qword ptr [esi+ecx] // st = b[c+1], a[c]*x2, x2
add ecx, 8
faddp st(1), st(0) // st = b[c+1]+a[c]*x2, x2
cmp ecx, 48
jbe l2
fmulp st(1), st(0) // st = (b[6]+a[5]*x2)*x2
fld1 // st = 1, (b[6]+a[5]*x2)*x2
faddp st(1), st(0) // st = (b[6]+a[5]*x2)*x2+1
}
}
double sin7(double x)
{double y;
static double b[] =
{0.00000000015918135277, -0.00000002505113193827,
0.00000275573161030613, -0.00019841269836759707,
0.00833333333333094971, -0.16666666666666662966,
1.0 };
////////////
asm{ fld x // st = x
fld x // st = x, x
mov ecx, 8
fmul st(0), st(1) // st = x*x, x
mov esi, offset b
fld qword ptr [esi] // st = a[0], x2, x
l2:
fmul st(0), st(1) // st = a[c]*x2, x2, x
fld qword ptr [esi+ecx] // st = b[c+1], a[c]*x2, x2, x
add ecx, 8
faddp st(1), st(0) // st = b[c+1]+a[c]*x2, x2, x
cmp ecx, 48
jbe l2
fmulp st(2), st(0) // st = x2, (b[6]+a[5]*x2)*x
fstp y // st = (b[6]+a[5]*x2)*x
}
}
double cos333(double x) {
double y;
y = x * x;
return ((((((-0.00000000001136800186 * y +
0.00000000208758867982) * y -
0.00000027557315566764) * y +
0.00002480158729369483) * y -
0.00138888888888807796) * y +
0.04166666666666662966) * y -
0.5) * y + 1.0;
}
double sin333(double x) {
double y;
y = x * x;
return ((((((0.00000000015918135277 * y -
0.00000002505113193827) * y +
0.00000275573161030613) * y -
0.00019841269836759707) * y +
0.00833333333333094971) * y -
0.16666666666666662966) * y +
1.0) * x;
}
double cos3(double x)
{double y;
static double b[] =
{-0.00000000001136800186, 0.00000000208758867982,
-0.00000027557315566764, 0.00002480158729369483,
-0.00138888888888807796, 0.04166666666666662966,
-0.5 };
//////////
asm{ mov ecx, 30
mov esi, offset kp
fld x // st = x
fld eps // st = eps, x
l0:
fld tbyte ptr [esi+ecx]
// st = kp[c], eps, x
fsub st(0), st(2)
// st = kp[c]-x, eps, x
fabs // st = |kp[c]-x|, eps, x
fcomp st(1) // st = eps, x
fstsw ax
sahf
ja l1
fstp y // st = x
fstp y // st =
mov esi, offset kpc
fld tbyte ptr [esi+ecx]
jmp end
l1:
add ecx, 10
cmp ecx, 100
jbe l0
// st = eps, x
fstp y // st = x
fld x // st = x, x
mov ecx, 8
fmulp st(1), st(0) // st = x*x
mov esi, offset b
fld qword ptr [esi] // st = a[0], x2
l2:
fmul st(0), st(1) // st = a[c]*x2, x2
fld qword ptr [esi+ecx] // st = b[c+1], a[c]*x2, x2
add ecx, 8
faddp st(1), st(0) // st = b[c+1]+a[c]*x2, x2
cmp ecx, 48
jbe l2
fmulp st(1), st(0) // st = (b[6]+a[5]*x2)*x2
fld1 // st = 1, (b[6]+a[5]*x2)*x2
faddp st(1), st(0) // st = (b[6]+a[5]*x2)*x2+1
end:
}
}
double sin3(double x)
{double y;
static double b[] =
{0.00000000015918135277, -0.00000002505113193827,
0.00000275573161030613, -0.00019841269836759707,
0.00833333333333094971, -0.16666666666666662966,
1.0 };
////////////
asm{ mov ecx, 30
mov esi, offset kp
fld x // st = x
fld eps // st = eps, x
l0:
fld tbyte ptr [esi+ecx]
// st = kp[c], eps, x
fsub st(0), st(2)
// st = kp[c]-x, eps, x
fabs // st = |kp[c]-x|, eps, x
fcomp st(1) // st = eps, x
fstsw ax
sahf
ja l1
fstp y // st = x
fstp y // st =
mov esi, offset kps
fld tbyte ptr [esi+ecx]
jmp end
l1:
add ecx, 10
cmp ecx, 100
jbe l0
// st = eps, x
fstp y // st = x
fld x // st = x, x
mov ecx, 8
fmul st(0), st(1) // st = x*x, x
mov esi, offset b
fld qword ptr [esi] // st = a[0], x2, x
l2:
fmul st(0), st(1) // st = a[c]*x2, x2, x
fld qword ptr [esi+ecx] // st = b[c+1], a[c]*x2, x2, x
add ecx, 8
faddp st(1), st(0) // st = b[c+1]+a[c]*x2, x2, x
cmp ecx, 48
jbe l2
fmulp st(2), st(0) // st = x2, (b[6]+a[5]*x2)*x
fstp y // st = (b[6]+a[5]*x2)*x
end:
}
}
double cos33(double x)
{double y;
static double h[] = { 1.0/2.0, 1.0/24.0, 1.0/720.0, 1.0/40320.0,
1.0/3628800.0, 1.0/479001600.0, 1.0/87178291200.0};
///////////////////////////
asm{ mov ecx, 30
mov esi, offset kp
fld eps // st = eps
fld x // st = x, eps
l0:
fld tbyte ptr [esi+ecx]
// st = kp[c], x, eps
fsub st(0), st(1)
// st = kp[c]-x, x, eps
fabs // st = |kp[c]-x|, x, eps
fcomp st(2) // st = x, eps
fstsw ax
sahf
ja l1
fstp y // st = eps
fstp y // st =
mov esi, offset kpc
fld tbyte ptr [esi+ecx]
jmp end
l1:
add ecx, 10
cmp ecx, 100
jbe l0
// st = x, eps
fstp y
fstp y // st =
fld1 // st = 1
fld1 // st = 1, 1
fld x // st = x, 1, 1
fld x // st = x, x, 1, 1
fmulp st(1), st(0)
// st = x*x, 1, 1
fchs // st = -x*x, 1, 1
// x, y, z
mov ecx, 0
mov esi, offset h
l3:
fmul st(1), st(0) // st = x, y*x, z
fld qword ptr [esi+ecx]
// st = h[c], x, y, z
fmul st(0), st(2)
// st = y*h[c], x, y, z
faddp st(3), st(0)
// st = x, y, z+y*h[c]
add ecx, 8
cmp ecx, 56
jb l3
fstp y
fstp y // st = z 38
end:
}
}
double sin33(double x)
{double y;
static double h[] = {1.0/6.0, 1.0/120.0, 1.0/5040.0, 1.0/362880.0,
1.0/39916800.0, 1.0/6227020800.0, 1.0/1307674368000.0 };
///////////////////////////
asm{ mov ecx, 30
mov esi, offset kp
fld eps // st = eps
fld x // st = x, eps
l0:
fld tbyte ptr [esi+ecx]
// st = kp[c], x, eps
fsub st(0), st(1)
// st = kp[c]-x, x, eps
fabs // st = |kp[c]-x|, x, eps
fcomp st(2) // st = x, eps
fstsw ax
sahf
ja l1
fstp y // st = eps
fstp y // st =
mov esi, offset kps
fld tbyte ptr [esi+ecx]
jmp end
l1:
add ecx, 10
cmp ecx, 100
jbe l0
// st = x, eps
fstp y
fstp y // st =
fld x // st = x
fld x // st = x, x
fld x // st = x, x, x
fld x // st = x, x, x, x
fmulp st(1), st(0)
// st = x*x, x, x
fchs // st = -x*x, x, x
// x, y, z
mov ecx, 0
mov esi, offset h
l3:
fmul st(1), st(0) // st = x, y*x, z
fld qword ptr [esi+ecx]
// st = h[c], x, y, z
fmul st(0), st(2)
// st = y*h[c], x, y, z
faddp st(3), st(0)
// st = x, y, z+y*h[c]
add ecx, 8
cmp ecx, 56
jb l3
fstp y
fstp y // st = z 38
end:
}
}
double cosr(const double x);
double sinr(const double x);
double cosr(const double x)
{U sign=0;
double t, y, z, k;
////////////////////
t = x;
if(t<0) t = -t; // cos(t)=cos(-t)
if(t>=K2pi)
{y = t / K2pi;
modf(y, &z); // z=[y] con lo stesso segno
t = t - z*K2pi;
}
if(t>=kp[1]) // cos(t)= -cos(pi-t)= -cos(t-pi)
{ t= t-kp[1]; sign^=1;}
if(t>=kp[2]) // cos(t)= -cos(pi-t)
{ t= kp[1]-t; sign^=1;}
if(t>=kp[4]) // cos(t)= sin(pi/2 - t)
{ t=kp[2]-t;
y = sin7(t);
R sign && y!=0.0 ? -y: y;
}
y = cos7(t);
R sign && y!=0.0 ? -y: y;
}
double sinr(const double x)
{U sign=0;
double t, y, z, k;
////////////////////
t = x;
if(t<0) // sin(t)=-sin(-t)
{ t= -t; sign^=1; }
if(t>=K2pi)
{y = t / K2pi;
modf(y, &z); // z=[y] con lo stesso segno
t = t - z*K2pi;
}
if(t>=kp[1]) // sin(t)= sin(pi-t)= -sin(t-pi)
{ t= t-kp[1]; sign^=1;}
if(t>=kp[2]) // sin(t)= sin(pi-t)
t= kp[1]-t;
if(t>kp[4]) // sin(t)= cos(pi/2 - t)
{ t=kp[2]-t;
y = cos7(t);
R sign && y!=0.0 ? -y: y;
}
y = sin7(t);
R sign && y!=0.0 ? -y: y;
}
int main( void )
{int i, j;
U h;
double tmp, x, y, z0, z1, x0, delta;
time_t ti, tf;
F(i=-120; i>-134 ; --i)
{ P("(s,I)(%i*pi6)= %+.15f ", i, cos(i*kp[6]) );
P("%+.15f\n", cosr(i*kp[6]) );
}
srand((U)time(0));
F(i=0, x0=delta=0.0; i<500000; ++i)
{j= rand(); j=j&2;
x=rand(); y=rand();
if(y!=0) x/=y;
x+=rand();
if(j) x=-x;
z0 = cos(x); z1 = cosr(x);
if( z0 >= z1)
{y = z0-z1;
if(y>delta) { x0=x; delta=y;}
}
else { y = z1 - z0;
if(y>delta) { x0=x; delta=y;}
}
}
P("Delta=%e x0=%.15f\n cosr(x0)=%.15f\n",
delta, x0, cosr(x0));
P("cos(x0)=%.15f %.15f \n", cos(x0), 32.0*DBL_EPSILON);
ti=time(0);
F(tmp=0.00000001; tmp<kp[4]; tmp+=0.00000001)
if(0.0==cos3(tmp)) {P("NO ");break;}
tf=time(0);
P("tmp=%g cos3_delta=%g \n", tmp, difftime(tf, ti));
ti=time(0);
F(tmp=0.00000001; tmp<kp[4]; tmp+=0.00000001)
if(0.0==cos33(tmp)) {P("NO ");break;}
tf=time(0);
P("tmp=%g cos33_delta=%g \n", tmp, difftime(tf, ti));
ti=time(0);
F(tmp=0.00000001, h=0; tmp<kp[4]; tmp+=0.00000001, ++h)
if(0.0==cos333(tmp)) {P("NO ");break;}
tf=time(0);
P("tmp=%g cos333_delta=%g giri=%u\n", tmp, difftime(tf, ti), h);
ti=time(0);
F(tmp=0.00000001, h=0; tmp<kp[4]; tmp+=0.00000001, ++h)
if(0.0==cos7(tmp)) {P("NO ");break;}
tf=time(0);
P("tmp=%g cos7_delta=%g giri=%u\n", tmp, difftime(tf, ti), h);
R 0;
}
////////////////////
(s,I)(-120*pi6)= +1.000000000000000 +1.000000000000000
(s,I)(-121*pi6)= +0.866025403784438 +0.866025403784438
(s,I)(-122*pi6)= +0.499999999999997 +0.499999999999997
(s,I)(-123*pi6)= -0.000000000000006 -0.000000000000006
(s,I)(-124*pi6)= -0.499999999999996 -0.499999999999996
(s,I)(-125*pi6)= -0.866025403784438 -0.866025403784438
(s,I)(-126*pi6)= -1.000000000000000 -1.000000000000000
(s,I)(-127*pi6)= -0.866025403784437 -0.866025403784437
(s,I)(-128*pi6)= -0.499999999999994 -0.499999999999994
(s,I)(-129*pi6)= -0.000000000000004 -0.000000000000005
(s,I)(-130*pi6)= +0.499999999999999 +0.499999999999999
(s,I)(-131*pi6)= +0.866025403784439 +0.866025403784439
(s,I)(-132*pi6)= +1.000000000000000 +1.000000000000000
(s,I)(-133*pi6)= +0.866025403784442 +0.866025403784442
Delta=6.217249e-15 x0=43202.000000000000000
cosr(x0)=0.378915528769058
cos(x0)=0.378915528769052 0.000000000000007
tmp=0.785398 cos3_delta=14
tmp=0.785398 cos33_delta=15
tmp=0.785398 cos333_delta=9 giri=78539816
tmp=0.785398 cos7_delta=8 giri=78539816