[LIBM] Import win-libm from AMD

Source: https://github.com/amd/win-libm
This commit is contained in:
Timo Kreuzer
2022-06-12 12:02:01 +02:00
parent 081c637c06
commit 4afb647c78
82 changed files with 22392 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;;
;; Defines __L_2_by_pi_bits array
;; Used in trigonometric argument reduction
;;
; Everything below is emitted into the read-only constant segment.
.const
; Start the table on a 16-byte boundary.
ALIGN 16
; Export the label so that other modules can link against the table.
PUBLIC __L_2_by_pi_bits
; Byte table, 158 bytes in total: 19 rows of 8 bytes plus a final row of 6.
; NOTE(review): going by the symbol name these are presumably the bits of
; 2/pi consumed by the trigonometric argument-reduction code; the bit/byte
; ordering is not evident from this file -- confirm against the consumer.
__L_2_by_pi_bits DB 224, 241, 27, 193, 12, 88, 33, 116
DB 53, 126, 196, 126, 237, 175, 169, 75
DB 74, 41, 222, 231, 28, 244, 236, 197
DB 151, 175, 31, 235, 158, 212, 181, 168
DB 127, 121, 154, 253, 24, 61, 221, 38
DB 44, 159, 60, 251, 217, 180, 125, 180
DB 41, 104, 45, 70, 188, 188, 63, 96
DB 22, 120, 255, 95, 226, 127, 236, 160
DB 228, 247, 46, 126, 17, 114, 210, 231
DB 76, 13, 230, 88, 71, 230, 4, 249
DB 125, 209, 154, 192, 113, 166, 19, 18
DB 237, 186, 212, 215, 8, 162, 251, 156
DB 166, 196, 114, 172, 119, 248, 115, 72
DB 70, 39, 168, 187, 36, 25, 128, 75
DB 55, 9, 233, 184, 145, 220, 134, 21
DB 239, 122, 175, 142, 69, 249, 7, 65
DB 14, 241, 100, 86, 138, 109, 3, 119
DB 211, 212, 71, 95, 157, 240, 167, 84
; The table ends with the byte 2 followed by seven zero bytes.
DB 16, 57, 185, 13, 230, 139, 2, 0
DB 0, 0, 0, 0, 0, 0
; End of this assembly module.
END

View File

@@ -0,0 +1,62 @@
;;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __Lcosarray and __Lsinarray arrays.
;; Used in sin.asm and cos.asm
;;
; Both coefficient tables live in the read-only constant segment.
.const
; Layout of both tables: every coefficient is a 64-bit IEEE double (DQ)
; immediately followed by a zero quadword, so each entry fills one 16-byte
; slot with the coefficient in the low half and zero in the high half.
; NOTE(review): presumably this lets the consumer fetch an entry with one
; aligned 128-bit load -- confirm against sin.asm / cos.asm.
ALIGN 16
; Cosine polynomial coefficients c1..c6 (6 slots, 96 bytes).
PUBLIC __Lcosarray
__Lcosarray DQ 03fa5555555555555h ; 0.0416667 c1
DQ 0
DQ 0bf56c16c16c16967h ; -0.00138889 c2
DQ 0
DQ 03EFA01A019F4EC91h ; 2.48016e-005 c3
DQ 0
DQ 0bE927E4FA17F667Bh ; -2.75573e-007 c4
DQ 0
DQ 03E21EEB690382EECh ; 2.08761e-009 c5
DQ 0
DQ 0bDA907DB47258AA7h ; -1.13826e-011 c6
DQ 0
ALIGN 16
; Sine polynomial coefficients s1..s6 (6 slots, 96 bytes), same slot layout.
PUBLIC __Lsinarray
__Lsinarray DQ 0bfc5555555555555h ; -0.166667 s1
DQ 0
DQ 03f81111111110bb3h ; 0.00833333 s2
DQ 0
DQ 0bf2a01a019e83e5ch ; -0.000198413 s3
DQ 0
DQ 03ec71de3796cde01h ; 2.75573e-006 s4
DQ 0
DQ 0be5ae600b42fdfa7h ; -2.50511e-008 s5
DQ 0
DQ 03de5e0b2f9a43bb8h ; 1.59181e-010 s6
DQ 0
; End of this assembly module.
END

View File

@@ -0,0 +1,48 @@
;;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __Lcosfarray and __Lsinfarray arrays.
;; Used in sin.asm and cos.asm
;; These coefficients are actually from Taylor series.
;;
; Both coefficient tables live in the read-only constant segment.
.const
; Unlike a padded layout, the entries here are stored back to back as
; consecutive 64-bit IEEE doubles (DQ); only the start of each table is
; aligned to 16 bytes.
ALIGN 16
; Cosine coefficients c0..c4 (5 doubles, 40 bytes).
PUBLIC __Lcosfarray
__Lcosfarray DQ 0bfe0000000000000h ; -0.5 c0
DQ 03fa5555555555555h ; 0.0416667 c1
DQ 0bf56c16c16c16c16h ; -0.00138889 c2
DQ 03EFA01A01A01A019h ; 2.48016e-005 c3
DQ 0be927e4fb7789f5ch ; -2.75573e-007 c4
ALIGN 16
; Sine coefficients s1..s4 (4 doubles, 32 bytes).
PUBLIC __Lsinfarray
__Lsinfarray DQ 0bfc5555555555555h ; -0.166667 s1
DQ 03f81111111111111h ; 0.00833333 s2
DQ 0bf2a01a01a01a01ah ; -0.000198413 s3
DQ 03ec71de3a556c734h ; 2.75573e-006 s4
; End of this assembly module.
END

View File

@@ -0,0 +1,41 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
double FN_PROTOTYPE(_chgsign)(double x)
{
    /* Returns x with its sign bit inverted.
       NaNs are not treated specially: their sign bit is flipped exactly
       like that of any other value.

       The raw bit pattern of a double needs a 64-bit integer.  Plain
       unsigned long is only 32 bits wide on LLP64 targets such as Windows,
       which would silently drop the upper half of the pattern (including
       the sign bit), so unsigned long long is used instead. */
    unsigned long long u;

    GET_BITS_DP64(x, u);    /* u = bit pattern of x      */
    u ^= SIGNBIT_DP64;      /* toggle the sign bit       */
    PUT_BITS_DP64(u, x);    /* x = value with those bits */
    return x;
}

View File

@@ -0,0 +1,40 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
float FN_PROTOTYPE(_chgsignf)(float x)
{
    /* Produce x with the opposite sign by toggling the sign bit of its
       32-bit pattern.  No value is special-cased: a NaN has its sign bit
       toggled just like any ordinary number. */
    unsigned int bits;
    float flipped;

    GET_BITS_SP32(x, bits);
    PUT_BITS_SP32(bits ^ SIGNBIT_SP32, flipped);
    return flipped;
}

View File

@@ -0,0 +1,44 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* Returns the magnitude of x combined with the sign of y.
   NaNs are not treated specially: their sign bits are handled exactly like
   those of any other value. */
double FN_PROTOTYPE(_copysign)(double x, double y)
{
    /* The bit patterns must be held in a 64-bit integer.  Plain unsigned
       long is only 32 bits wide on LLP64 targets such as Windows; with it
       the upper halves (and therefore the sign bits) would be truncated
       away and the test below could never fire. */
    unsigned long long ux, uy;

    GET_BITS_DP64(x, ux);
    GET_BITS_DP64(y, uy);

    /* The XOR of the two patterns has the sign bit set exactly when the
       signs differ; only then does x need its sign flipped. */
    if ((ux ^ uy) & SIGNBIT_DP64)
        PUT_BITS_DP64(ux ^ SIGNBIT_DP64, x);

    return x;
}

View File

@@ -0,0 +1,42 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* Returns the absolute value of x with the sign of y.
NaNs are not considered special; their sign bits are handled
the same as for any other number. */
float FN_PROTOTYPE(_copysignf)(float x, float y)
{
unsigned int ux, uy;
GET_BITS_SP32(x, ux);
GET_BITS_SP32(y, uy);
if ((ux ^ uy) & SIGNBIT_SP32)
PUT_BITS_SP32(ux ^ SIGNBIT_SP32, x);
return x;
}

View File

@@ -0,0 +1,39 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* Returns 0 if x is infinite or NaN, otherwise returns 1 */
int FN_PROTOTYPE(_finite)(double x)
{
    /* A 64-bit integer is required for the bit pattern of a double.  Plain
       unsigned long is only 32 bits wide on LLP64 targets such as Windows
       and would lose the exponent field before it can be examined. */
    unsigned long long ux;

    GET_BITS_DP64(x, ux);

    /* Clear the sign, then subtract the +infinity pattern.  The unsigned
       subtraction wraps around (setting bit 63) exactly when the magnitude
       pattern is below that of infinity, i.e. when x is finite; infinities
       and NaNs leave bit 63 clear. */
    return (int)(((ux & ~SIGNBIT_DP64) - PINFBITPATT_DP64) >> 63);
}

View File

@@ -0,0 +1,40 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* Returns 0 if x is infinite or NaN, otherwise returns 1 */
int FN_PROTOTYPE(_finitef)(float x)
{
unsigned int ux;
GET_BITS_SP32(x, ux);
return (int)(((ux & ~SIGNBIT_SP32) - PINFBITPATT_SP32) >> 31);
}

View File

@@ -0,0 +1,145 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VAL_WITH_FLAGS
#define USE_NAN_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_NAN_WITH_FLAGS
#undef USE_VAL_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
#pragma function(acos)
double FN_PROTOTYPE(acos)(double x)
{
    /* Computes arccos(x).

       The argument is first reduced by noting that arccos(x)
       is invalid for abs(x) > 1. For denormal and small
       arguments arccos(x) = pi/2 to machine accuracy.
       Remaining argument ranges are handled as follows.

       For abs(x) < 0.5 use
         arccos(x) = pi/2 - arcsin(x)
                   = pi/2 - (x + x^3*R(x^2))
       where R(x^2) is a rational minimax approximation to
       (arcsin(x) - x)/x^3.

       For abs(x) >= 0.5 exploit the identities
         arccos(x) =      2*arcsin(sqrt((1-x)/2))   for x > 0
         arccos(x) = pi - 2*arcsin(sqrt((1+x)/2))   for x < 0
       together with the above rational approximation, and
       reconstruct the terms carefully.

       NaN inputs and abs(x) > 1 are reported through _handle_error
       with _DOMAIN / EDOM.
    */

    /* Some constants and split constants. */
    static const double
        pi         = 3.1415926535897933e+00,     /* 0x400921fb54442d18 */
        piby2      = 1.5707963267948965580e+00,  /* 0x3ff921fb54442d18 */
        piby2_head = 1.5707963267948965580e+00,  /* 0x3ff921fb54442d18 */
        piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */

    double u, y, s = 0.0, r;
    int xexp, xnan, transform = 0;
    /* Bit patterns of doubles need a 64-bit integer; plain unsigned long
       is only 32 bits wide on LLP64 targets such as Windows and would
       truncate away the sign and exponent fields. */
    unsigned long long ux, aux, xneg;

    GET_BITS_DP64(x, ux);
    aux = ux & ~SIGNBIT_DP64;           /* pattern of abs(x)          */
    xneg = (ux & SIGNBIT_DP64);         /* non-zero when x is negative */
    xnan = (aux > PINFBITPATT_DP64);    /* above +inf pattern => NaN   */
    xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;

    /* Special cases */

    if (xnan)
    {
        /* Hand the NaN back with an extra mantissa bit ORed in */
        return _handle_error("acos", OP_ACOS, ux|0x0008000000000000, _DOMAIN,
                             0, EDOM, x, 0.0, 1);
    }
    else if (xexp < -56)
    {   /* x small enough that arccos(x) = pi/2 */
        return val_with_flags(piby2, AMD_F_INEXACT);
    }
    else if (xexp >= 0)
    {   /* abs(x) >= 1.0 */
        if (x == 1.0)
            return 0.0;
        else if (x == -1.0)
            return val_with_flags(pi, AMD_F_INEXACT);
        else
            return _handle_error("acos", OP_ACOS, INDEFBITPATT_DP64, _DOMAIN,
                                 AMD_F_INVALID, EDOM, x, 0.0, 1);
    }

    if (xneg) y = -x;
    else y = x;

    transform = (xexp >= -1); /* abs(x) >= 0.5 */

    if (transform)
    {   /* Transform y into the range [0,0.5) */
        r = 0.5*(1.0 - y);
        /* VC++ intrinsic call */
        _mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));
        y = s;
    }
    else
        r = y*y;

    /* Use a rational approximation for [0.0, 0.5] */

    u = r*(0.227485835556935010735943483075 +
           (-0.445017216867635649900123110649 +
            (0.275558175256937652532686256258 +
             (-0.0549989809235685841612020091328 +
              (0.00109242697235074662306043804220 +
               0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/
        (1.36491501334161032038194214209 +
         (-3.28431505720958658909889444194 +
          (2.76568859157270989520376345954 +
           (-0.943639137032492685763471240072 +
            0.105869422087204370341222318533*r)*r)*r)*r);

    if (transform)
    {   /* Reconstruct acos carefully in transformed region */
        if (xneg) return pi - 2.0*(s+(y*u - piby2_tail));
        else
        {
            double c, s1;
            unsigned long long us;
            GET_BITS_DP64(s, us);
            /* s1 = s with the low 32 mantissa bits cleared (head part);
               c recovers the remainder so that s1 + c approximates s */
            PUT_BITS_DP64(0xffffffff00000000 & us, s1);
            c = (r-s1*s1)/(s+s1);
            return 2.0*s1 + (2.0*c+2.0*y*u);
        }
    }
    else
        return piby2_head - (x - (piby2_tail - x*u));
}

View File

@@ -0,0 +1,146 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VALF_WITH_FLAGS
#define USE_NANF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_NANF_WITH_FLAGS
#undef USE_VALF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(acosf)
float FN_PROTOTYPE(acosf)(float x)
{
/* Computes arccos(x).
The argument is first reduced by noting that arccos(x)
is invalid for abs(x) > 1. For denormal and small
arguments arccos(x) = pi/2 to machine accuracy.
Remaining argument ranges are handled as follows.
For abs(x) <= 0.5 use
arccos(x) = pi/2 - arcsin(x)
= pi/2 - (x + x^3*R(x^2))
where R(x^2) is a rational minimax approximation to
(arcsin(x) - x)/x^3.
For abs(x) > 0.5 exploit the identity:
arccos(x) = pi - 2*arcsin(sqrt(1-x)/2)
together with the above rational approximation, and
reconstruct the terms carefully.
*/
/* Some constants and split constants. */
static const float
piby2 = 1.5707963705e+00F; /* 0x3fc90fdb */
static const double
pi = 3.1415926535897933e+00, /* 0x400921fb54442d18 */
piby2_head = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */
piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
float u, y, s = 0.0F, r;
int xexp, xnan, transform = 0;
unsigned int ux, aux, xneg;
GET_BITS_SP32(x, ux);
aux = ux & ~SIGNBIT_SP32;
xneg = (ux & SIGNBIT_SP32);
xnan = (aux > PINFBITPATT_SP32);
xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
/* Special cases */
if (xnan)
{
return _handle_errorf("acosf", OP_ACOS, ux|0x00400000, _DOMAIN, 0,
EDOM, x, 0.0F, 1);
}
else if (xexp < -26)
/* y small enough that arccos(x) = pi/2 */
return valf_with_flags(piby2, AMD_F_INEXACT);
else if (xexp >= 0)
{ /* abs(x) >= 1.0 */
if (x == 1.0F)
return 0.0F;
else if (x == -1.0F)
return valf_with_flags((float)pi, AMD_F_INEXACT);
else
return _handle_errorf("acosf", OP_ACOS, INDEFBITPATT_SP32, _DOMAIN,
AMD_F_INVALID, EDOM, x, 0.0F, 1);
}
if (xneg) y = -x;
else y = x;
transform = (xexp >= -1); /* abs(x) >= 0.5 */
if (transform)
{ /* Transform y into the range [0,0.5) */
r = 0.5F*(1.0F - y);
/* VC++ intrinsic call */
_mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r)));
y = s;
}
else
r = y*y;
/* Use a rational approximation for [0.0, 0.5] */
u=r*(0.184161606965100694821398249421F +
(-0.0565298683201845211985026327361F +
(-0.0133819288943925804214011424456F -
0.00396137437848476485201154797087F*r)*r)*r)/
(1.10496961524520294485512696706F -
0.836411276854206731913362287293F*r);
if (transform)
{
/* Reconstruct acos carefully in transformed region */
if (xneg)
return (float)(pi - 2.0*(s+(y*u - piby2_tail)));
else
{
float c, s1;
unsigned int us;
GET_BITS_SP32(s, us);
PUT_BITS_SP32(0xffff0000 & us, s1);
c = (r-s1*s1)/(s+s1);
return 2.0F*s1 + (2.0F*c+2.0F*y*u);
}
}
else
return (float)(piby2_head - (x - (piby2_tail - x*u)));
}

View File

@@ -0,0 +1,153 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VAL_WITH_FLAGS
#define USE_NAN_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_NAN_WITH_FLAGS
#undef USE_VAL_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
#pragma function(asin)
double FN_PROTOTYPE(asin)(double x)
{
    /* Computes arcsin(x).

       The argument is first reduced by noting that arcsin(x)
       is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
       For denormal and small arguments arcsin(x) = x to machine
       accuracy. Remaining argument ranges are handled as follows.

       For abs(x) < 0.5 use
         arcsin(x) = x + x^3*R(x^2)
       where R(x^2) is a rational minimax approximation to
       (arcsin(x) - x)/x^3.

       For abs(x) >= 0.5 exploit the identity:
         arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2))
       together with the above rational approximation, and
       reconstruct the terms carefully.

       NaN inputs and abs(x) > 1 are reported through _handle_error
       with _DOMAIN / EDOM.
    */

    /* Some constants and split constants. */
    static const double
        piby2_tail  = 6.1232339957367660e-17, /* 0x3c91a62633145c07 */
        hpiby2_head = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
        piby2       = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */

    double u, v, y, s = 0.0, r;
    int xexp, xnan, transform = 0;
    /* Bit patterns of doubles need a 64-bit integer; plain unsigned long
       is only 32 bits wide on LLP64 targets such as Windows and would
       truncate away the sign and exponent fields. */
    unsigned long long ux, aux, xneg;

    GET_BITS_DP64(x, ux);
    aux = ux & ~SIGNBIT_DP64;           /* pattern of abs(x)          */
    xneg = (ux & SIGNBIT_DP64);         /* non-zero when x is negative */
    xnan = (aux > PINFBITPATT_DP64);    /* above +inf pattern => NaN   */
    xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;

    /* Special cases */

    if (xnan)
    {
        /* Hand the NaN back with an extra mantissa bit ORed in */
        return _handle_error("asin", OP_ASIN, ux|0x0008000000000000, _DOMAIN,
                             0, EDOM, x, 0.0, 1);
    }
    else if (xexp < -28)
    {   /* x small enough that arcsin(x) = x */
        return val_with_flags(x, AMD_F_INEXACT);
    }
    else if (xexp >= 0)
    {   /* abs(x) >= 1.0 */
        if (x == 1.0)
            return val_with_flags(piby2, AMD_F_INEXACT);
        else if (x == -1.0)
            return val_with_flags(-piby2, AMD_F_INEXACT);
        else
            return _handle_error("asin", OP_ASIN, INDEFBITPATT_DP64, _DOMAIN,
                                 AMD_F_INVALID, EDOM, x, 0.0, 1);
    }

    if (xneg) y = -x;
    else y = x;

    transform = (xexp >= -1); /* abs(x) >= 0.5 */

    if (transform)
    {   /* Transform y into the range [0,0.5) */
        r = 0.5*(1.0 - y);
        /* VC++ intrinsic call */
        _mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));
        y = s;
    }
    else
        r = y*y;

    /* Use a rational approximation for [0.0, 0.5] */

    u = r*(0.227485835556935010735943483075 +
           (-0.445017216867635649900123110649 +
            (0.275558175256937652532686256258 +
             (-0.0549989809235685841612020091328 +
              (0.00109242697235074662306043804220 +
               0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/
        (1.36491501334161032038194214209 +
         (-3.28431505720958658909889444194 +
          (2.76568859157270989520376345954 +
           (-0.943639137032492685763471240072 +
            0.105869422087204370341222318533*r)*r)*r)*r);

    if (transform)
    {   /* Reconstruct asin carefully in transformed region */
        double c, s1, p, q;
        unsigned long long us;
        GET_BITS_DP64(s, us);
        /* s1 = s with the low 32 mantissa bits cleared (head part);
           c recovers the remainder so that s1 + c approximates s */
        PUT_BITS_DP64(0xffffffff00000000 & us, s1);
        c = (r-s1*s1)/(s+s1);
        p = 2.0*s*u - (piby2_tail-2.0*c);
        q = hpiby2_head - 2.0*s1;
        v = hpiby2_head - (p-q);
    }
    else
    {
        /* Use a temporary variable to prevent VC++ rearranging
             y + y*u
           into
             y * (1 + u)
           and getting an incorrectly rounded result */
        double tmp;
        tmp = y * u;
        v = y + tmp;
    }

    /* arcsin is odd: restore the sign of the input */
    if (xneg) return -v;
    else return v;
}

View File

@@ -0,0 +1,151 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VALF_WITH_FLAGS
#define USE_NANF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_NANF_WITH_FLAGS
#undef USE_VALF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(asinf)
float FN_PROTOTYPE(asinf)(float x)
{
/* Computes arcsin(x).
The argument is first reduced by noting that arcsin(x)
is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
For denormal and small arguments arcsin(x) = x to machine
accuracy. Remaining argument ranges are handled as follows.
For abs(x) <= 0.5 use
arcsin(x) = x + x^3*R(x^2)
where R(x^2) is a rational minimax approximation to
(arcsin(x) - x)/x^3.
For abs(x) > 0.5 exploit the identity:
arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2)
together with the above rational approximation, and
reconstruct the terms carefully.
*/
/* Some constants and split constants. */
static const float
piby2_tail = 7.5497894159e-08F, /* 0x33a22168 */
hpiby2_head = 7.8539812565e-01F, /* 0x3f490fda */
piby2 = 1.5707963705e+00F; /* 0x3fc90fdb */
float u, v, y, s = 0.0F, r;
int xexp, xnan, transform = 0;
unsigned int ux, aux, xneg;
GET_BITS_SP32(x, ux);
aux = ux & ~SIGNBIT_SP32;
xneg = (ux & SIGNBIT_SP32);
xnan = (aux > PINFBITPATT_SP32);
xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
/* Special cases */
if (xnan)
{
return _handle_errorf("asinf", OP_ASIN, ux|0x00400000, _DOMAIN, 0,
EDOM, x, 0.0F, 1);
}
else if (xexp < -14)
/* y small enough that arcsin(x) = x */
return valf_with_flags(x, AMD_F_INEXACT);
else if (xexp >= 0)
{
/* abs(x) >= 1.0 */
if (x == 1.0F)
return valf_with_flags(piby2, AMD_F_INEXACT);
else if (x == -1.0F)
return valf_with_flags(-piby2, AMD_F_INEXACT);
else
return _handle_errorf("asinf", OP_ASIN, INDEFBITPATT_SP32, _DOMAIN,
AMD_F_INVALID, EDOM, x, 0.0F, 1);
}
if (xneg) y = -x;
else y = x;
transform = (xexp >= -1); /* abs(x) >= 0.5 */
if (transform)
{ /* Transform y into the range [0,0.5) */
r = 0.5F*(1.0F - y);
/* VC++ intrinsic call */
_mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r)));
y = s;
}
else
r = y*y;
/* Use a rational approximation for [0.0, 0.5] */
u=r*(0.184161606965100694821398249421F +
(-0.0565298683201845211985026327361F +
(-0.0133819288943925804214011424456F -
0.00396137437848476485201154797087F*r)*r)*r)/
(1.10496961524520294485512696706F -
0.836411276854206731913362287293F*r);
if (transform)
{
/* Reconstruct asin carefully in transformed region */
float c, s1, p, q;
unsigned int us;
GET_BITS_SP32(s, us);
PUT_BITS_SP32(0xffff0000 & us, s1);
c = (r-s1*s1)/(s+s1);
p = 2.0F*s*u - (piby2_tail-2.0F*c);
q = hpiby2_head - 2.0F*s1;
v = hpiby2_head - (p-q);
}
else
{
/* Use a temporary variable to prevent VC++ rearranging
y + y*u
into
y * (1 + u)
and getting an incorrectly rounded result */
float tmp;
tmp = y * u;
v = y + tmp;
}
if (xneg) return -v;
else return v;
}

View File

@@ -0,0 +1,132 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VAL_WITH_FLAGS
#define USE_NAN_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_VAL_WITH_FLAGS
#undef USE_NAN_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
#pragma function(atan)
/*
 * Double-precision arctangent.
 *
 * Parameters:
 *   x - argument.
 *
 * Returns:
 *   An approximation of arctan(x).  The sign of x is stripped up front,
 *   the computation is carried out on v = |x|, and the sign is put back
 *   on the result at the end.
 *
 * Special cases visible in this function:
 *   - NaN input: the call is forwarded to _handle_error with the input
 *     bit pattern OR-ed with bit 51 (0x0008000000000000), _DOMAIN and
 *     EDOM; whatever _handle_error returns is returned.
 *   - |x| > 2^56 (including +-infinity): returns +-piby2 through
 *     val_with_flags(..., AMD_F_INEXACT).
 *
 * Method:
 *   |x| is reduced to an argument in [-7/16, 7/16] using
 *       arctan(v) = arctan(c) + arctan((v - c) / (1 + c*v))
 *   with c chosen from {0, 0.5, 1, 1.5, infinity}.  arctan(c) is kept as
 *   a head/tail pair (chi + clo).  A rational function in s = x*x
 *   approximates q = x - arctan(x), and the pieces are recombined as
 *       chi - ((q - clo) - x)  ==  chi + clo + x - q.
 */
double FN_PROTOTYPE(atan)(double x)
{
  /* Some constants and split constants. */
  static double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
  double chi, clo, v, s, q, z;

  /* Find properties of argument x.
     These variables hold the full 64-bit image of a double and are
     compared against 64-bit constants below, so they must be at least
     64 bits wide on every data model; plain 'unsigned long' is only
     32 bits under the Windows (LLP64) model. */
  unsigned long long ux, aux, xneg;

  GET_BITS_DP64(x, ux);
  aux = ux & ~SIGNBIT_DP64;     /* ux with the SIGNBIT_DP64 bits cleared */
  xneg = (ux != aux);           /* nonzero iff clearing them changed ux */
  if (xneg)
    v = -x;
  else
    v = x;

  /* Argument reduction to range [-7/16,7/16] */
  if (aux > 0x4003800000000000) /* v > 39./16. */
    {
      if (aux > PINFBITPATT_DP64)
        {
          /* x is NaN */
          return _handle_error("atan", OP_ATAN, ux|0x0008000000000000, _DOMAIN, 0,
                               EDOM, x, 0.0, 1);
        }
      else if (aux > 0x4370000000000000)
        { /* abs(x) > 2^56 => arctan(1/x) is
             insignificant compared to piby2.
             The constant is the bit pattern of 2^56, so it has to be
             compared against the bit pattern aux, not against the
             floating-point value v (as a number the literal is roughly
             4.86e18, far above 2^56). */
          if (xneg)
            return val_with_flags(-piby2, AMD_F_INEXACT);
          else
            return val_with_flags(piby2, AMD_F_INEXACT);
        }

      /* arctan(v) = pi/2 + arctan(-1/v) */
      x = -1.0/v;
      /* (chi + clo) = arctan(infinity) */
      chi = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
      clo = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */
    }
  else if (aux > 0x3ff3000000000000) /* 39./16. > v > 19./16. */
    {
      x = (v-1.5)/(1.0+1.5*v);
      /* (chi + clo) = arctan(1.5) */
      chi = 9.82793723247329054082e-01; /* 0x3fef730bd281f69b */
      clo = 1.39033110312309953701e-17; /* 0x3c7007887af0cbbc */
    }
  else if (aux > 0x3fe6000000000000) /* 19./16. > v > 11./16. */
    {
      x = (v-1.0)/(1.0+v);
      /* (chi + clo) = arctan(1.) */
      chi = 7.85398163397448278999e-01; /* 0x3fe921fb54442d18 */
      clo = 3.06161699786838240164e-17; /* 0x3c81a62633145c06 */
    }
  else if (aux > 0x3fdc000000000000) /* 11./16. > v > 7./16. */
    {
      /* Same as (v-0.5)/(1.0+0.5*v) with numerator and denominator
         multiplied by 2. */
      x = (2.0*v-1.0)/(2.0+v);
      /* (chi + clo) = arctan(0.5) */
      chi = 4.63647609000806093515e-01; /* 0x3fddac670561bb4f */
      clo = 2.26987774529616809294e-17; /* 0x3c7a2b7f222f65e0 */
    }
  else /* v < 7./16. */
    {
      /* Already inside the core interval: no reduction needed. */
      x = v;
      chi = 0.0;
      clo = 0.0;
    }

  /* Core approximation: Remez(4,4) on [-7/16,7/16].
     q approximates x - arctan(x); its leading term is x*s/3, since the
     ratio of the two constant coefficients is 1/3. */
  s = x*x;
  q = x*s*
       (0.268297920532545909e0 +
        (0.447677206805497472e0 +
         (0.220638780716667420e0 +
          (0.304455919504853031e-1 +
            0.142316903342317766e-3*s)*s)*s)*s)/
       (0.804893761597637733e0 +
        (0.182596787737507063e1 +
         (0.141254259931958921e1 +
          (0.424602594203847109e0 +
            0.389525873944742195e-1*s)*s)*s)*s);

  /* Recombine: chi + clo + x - q.  The small terms (q, clo) are grouped
     together first, and the large head chi is applied last. */
  z = chi - ((q - clo) - x);

  if (xneg) z = -z;
  return z;
}

View File

@@ -0,0 +1,750 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VAL_WITH_FLAGS
#define USE_NAN_WITH_FLAGS
#define USE_SCALEDOUBLE_1
#define USE_SCALEDOUBLE_2
#define USE_SCALEUPDOUBLE1024
#define USE_SCALEDOWNDOUBLE
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_VAL_WITH_FLAGS
#undef USE_NAN_WITH_FLAGS
#undef USE_SCALEDOUBLE_1
#undef USE_SCALEDOUBLE_2
#undef USE_SCALEUPDOUBLE1024
#undef USE_SCALEDOWNDOUBLE
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
#pragma function(atan2)
double FN_PROTOTYPE(atan2)(double y, double x)
{
/* Arrays atan_jby256_lead and atan_jby256_tail contain
leading and trailing parts respectively of precomputed
values of atan(j/256), for j = 16, 17, ..., 256.
atan_jby256_lead contains the first 21 bits of precision,
and atan_jby256_tail contains a further 53 bits precision. */
static const double atan_jby256_lead[ 241] = {
6.24187886714935302734e-02, /* 0x3faff55b00000000 */
6.63088560104370117188e-02, /* 0x3fb0f99e00000000 */
7.01969265937805175781e-02, /* 0x3fb1f86d00000000 */
7.40829110145568847656e-02, /* 0x3fb2f71900000000 */
7.79666304588317871094e-02, /* 0x3fb3f59f00000000 */
8.18479657173156738281e-02, /* 0x3fb4f3fd00000000 */
8.57268571853637695312e-02, /* 0x3fb5f23200000000 */
8.96031260490417480469e-02, /* 0x3fb6f03b00000000 */
9.34767723083496093750e-02, /* 0x3fb7ee1800000000 */
9.73475575447082519531e-02, /* 0x3fb8ebc500000000 */
1.01215422153472900391e-01, /* 0x3fb9e94100000000 */
1.05080246925354003906e-01, /* 0x3fbae68a00000000 */
1.08941912651062011719e-01, /* 0x3fbbe39e00000000 */
1.12800359725952148438e-01, /* 0x3fbce07c00000000 */
1.16655409336090087891e-01, /* 0x3fbddd2100000000 */
1.20507001876831054688e-01, /* 0x3fbed98c00000000 */
1.24354958534240722656e-01, /* 0x3fbfd5ba00000000 */
1.28199219703674316406e-01, /* 0x3fc068d500000000 */
1.32039666175842285156e-01, /* 0x3fc0e6ad00000000 */
1.35876297950744628906e-01, /* 0x3fc1646500000000 */
1.39708757400512695312e-01, /* 0x3fc1e1fa00000000 */
1.43537282943725585938e-01, /* 0x3fc25f6e00000000 */
1.47361397743225097656e-01, /* 0x3fc2dcbd00000000 */
1.51181221008300781250e-01, /* 0x3fc359e800000000 */
1.54996633529663085938e-01, /* 0x3fc3d6ee00000000 */
1.58807516098022460938e-01, /* 0x3fc453ce00000000 */
1.62613749504089355469e-01, /* 0x3fc4d08700000000 */
1.66415214538574218750e-01, /* 0x3fc54d1800000000 */
1.70211911201477050781e-01, /* 0x3fc5c98100000000 */
1.74003481864929199219e-01, /* 0x3fc645bf00000000 */
1.77790164947509765625e-01, /* 0x3fc6c1d400000000 */
1.81571602821350097656e-01, /* 0x3fc73dbd00000000 */
1.85347914695739746094e-01, /* 0x3fc7b97b00000000 */
1.89118742942810058594e-01, /* 0x3fc8350b00000000 */
1.92884206771850585938e-01, /* 0x3fc8b06e00000000 */
1.96644186973571777344e-01, /* 0x3fc92ba300000000 */
2.00398445129394531250e-01, /* 0x3fc9a6a800000000 */
2.04147100448608398438e-01, /* 0x3fca217e00000000 */
2.07889914512634277344e-01, /* 0x3fca9c2300000000 */
2.11626768112182617188e-01, /* 0x3fcb169600000000 */
2.15357661247253417969e-01, /* 0x3fcb90d700000000 */
2.19082474708557128906e-01, /* 0x3fcc0ae500000000 */
2.22801089286804199219e-01, /* 0x3fcc84bf00000000 */
2.26513504981994628906e-01, /* 0x3fccfe6500000000 */
2.30219483375549316406e-01, /* 0x3fcd77d500000000 */
2.33919143676757812500e-01, /* 0x3fcdf11000000000 */
2.37612247467041015625e-01, /* 0x3fce6a1400000000 */
2.41298794746398925781e-01, /* 0x3fcee2e100000000 */
2.44978547096252441406e-01, /* 0x3fcf5b7500000000 */
2.48651623725891113281e-01, /* 0x3fcfd3d100000000 */
2.52317905426025390625e-01, /* 0x3fd025fa00000000 */
2.55977153778076171875e-01, /* 0x3fd061ee00000000 */
2.59629487991333007812e-01, /* 0x3fd09dc500000000 */
2.63274669647216796875e-01, /* 0x3fd0d97e00000000 */
2.66912937164306640625e-01, /* 0x3fd1151a00000000 */
2.70543813705444335938e-01, /* 0x3fd1509700000000 */
2.74167299270629882812e-01, /* 0x3fd18bf500000000 */
2.77783632278442382812e-01, /* 0x3fd1c73500000000 */
2.81392335891723632812e-01, /* 0x3fd2025500000000 */
2.84993648529052734375e-01, /* 0x3fd23d5600000000 */
2.88587331771850585938e-01, /* 0x3fd2783700000000 */
2.92173147201538085938e-01, /* 0x3fd2b2f700000000 */
2.95751571655273437500e-01, /* 0x3fd2ed9800000000 */
2.99322128295898437500e-01, /* 0x3fd3281800000000 */
3.02884817123413085938e-01, /* 0x3fd3627700000000 */
3.06439399719238281250e-01, /* 0x3fd39cb400000000 */
3.09986352920532226562e-01, /* 0x3fd3d6d100000000 */
3.13524961471557617188e-01, /* 0x3fd410cb00000000 */
3.17055702209472656250e-01, /* 0x3fd44aa400000000 */
3.20578098297119140625e-01, /* 0x3fd4845a00000000 */
3.24092388153076171875e-01, /* 0x3fd4bdee00000000 */
3.27598333358764648438e-01, /* 0x3fd4f75f00000000 */
3.31095933914184570312e-01, /* 0x3fd530ad00000000 */
3.34585189819335937500e-01, /* 0x3fd569d800000000 */
3.38066101074218750000e-01, /* 0x3fd5a2e000000000 */
3.41538190841674804688e-01, /* 0x3fd5dbc300000000 */
3.45002174377441406250e-01, /* 0x3fd6148400000000 */
3.48457098007202148438e-01, /* 0x3fd64d1f00000000 */
3.51903676986694335938e-01, /* 0x3fd6859700000000 */
3.55341434478759765625e-01, /* 0x3fd6bdea00000000 */
3.58770608901977539062e-01, /* 0x3fd6f61900000000 */
3.62190723419189453125e-01, /* 0x3fd72e2200000000 */
3.65602254867553710938e-01, /* 0x3fd7660700000000 */
3.69004726409912109375e-01, /* 0x3fd79dc600000000 */
3.72398376464843750000e-01, /* 0x3fd7d56000000000 */
3.75782966613769531250e-01, /* 0x3fd80cd400000000 */
3.79158496856689453125e-01, /* 0x3fd8442200000000 */
3.82525205612182617188e-01, /* 0x3fd87b4b00000000 */
3.85882616043090820312e-01, /* 0x3fd8b24d00000000 */
3.89230966567993164062e-01, /* 0x3fd8e92900000000 */
3.92570018768310546875e-01, /* 0x3fd91fde00000000 */
3.95900011062622070312e-01, /* 0x3fd9566d00000000 */
3.99220705032348632812e-01, /* 0x3fd98cd500000000 */
4.02532100677490234375e-01, /* 0x3fd9c31600000000 */
4.05834197998046875000e-01, /* 0x3fd9f93000000000 */
4.09126996994018554688e-01, /* 0x3fda2f2300000000 */
4.12410259246826171875e-01, /* 0x3fda64ee00000000 */
4.15684223175048828125e-01, /* 0x3fda9a9200000000 */
4.18948888778686523438e-01, /* 0x3fdad00f00000000 */
4.22204017639160156250e-01, /* 0x3fdb056400000000 */
4.25449609756469726562e-01, /* 0x3fdb3a9100000000 */
4.28685665130615234375e-01, /* 0x3fdb6f9600000000 */
4.31912183761596679688e-01, /* 0x3fdba47300000000 */
4.35129165649414062500e-01, /* 0x3fdbd92800000000 */
4.38336372375488281250e-01, /* 0x3fdc0db400000000 */
4.41534280776977539062e-01, /* 0x3fdc421900000000 */
4.44722414016723632812e-01, /* 0x3fdc765500000000 */
4.47900772094726562500e-01, /* 0x3fdcaa6800000000 */
4.51069593429565429688e-01, /* 0x3fdcde5300000000 */
4.54228639602661132812e-01, /* 0x3fdd121500000000 */
4.57377910614013671875e-01, /* 0x3fdd45ae00000000 */
4.60517644882202148438e-01, /* 0x3fdd791f00000000 */
4.63647603988647460938e-01, /* 0x3fddac6700000000 */
4.66767549514770507812e-01, /* 0x3fdddf8500000000 */
4.69877958297729492188e-01, /* 0x3fde127b00000000 */
4.72978591918945312500e-01, /* 0x3fde454800000000 */
4.76069211959838867188e-01, /* 0x3fde77eb00000000 */
4.79150056838989257812e-01, /* 0x3fdeaa6500000000 */
4.82221126556396484375e-01, /* 0x3fdedcb600000000 */
4.85282421112060546875e-01, /* 0x3fdf0ede00000000 */
4.88333940505981445312e-01, /* 0x3fdf40dd00000000 */
4.91375446319580078125e-01, /* 0x3fdf72b200000000 */
4.94406938552856445312e-01, /* 0x3fdfa45d00000000 */
4.97428894042968750000e-01, /* 0x3fdfd5e000000000 */
5.00440597534179687500e-01, /* 0x3fe0039c00000000 */
5.03442764282226562500e-01, /* 0x3fe01c3400000000 */
5.06434917449951171875e-01, /* 0x3fe034b700000000 */
5.09417057037353515625e-01, /* 0x3fe04d2500000000 */
5.12389183044433593750e-01, /* 0x3fe0657e00000000 */
5.15351772308349609375e-01, /* 0x3fe07dc300000000 */
5.18304347991943359375e-01, /* 0x3fe095f300000000 */
5.21246910095214843750e-01, /* 0x3fe0ae0e00000000 */
5.24179458618164062500e-01, /* 0x3fe0c61400000000 */
5.27101993560791015625e-01, /* 0x3fe0de0500000000 */
5.30014991760253906250e-01, /* 0x3fe0f5e200000000 */
5.32917976379394531250e-01, /* 0x3fe10daa00000000 */
5.35810947418212890625e-01, /* 0x3fe1255d00000000 */
5.38693904876708984375e-01, /* 0x3fe13cfb00000000 */
5.41567325592041015625e-01, /* 0x3fe1548500000000 */
5.44430732727050781250e-01, /* 0x3fe16bfa00000000 */
5.47284126281738281250e-01, /* 0x3fe1835a00000000 */
5.50127506256103515625e-01, /* 0x3fe19aa500000000 */
5.52961349487304687500e-01, /* 0x3fe1b1dc00000000 */
5.55785179138183593750e-01, /* 0x3fe1c8fe00000000 */
5.58598995208740234375e-01, /* 0x3fe1e00b00000000 */
5.61403274536132812500e-01, /* 0x3fe1f70400000000 */
5.64197540283203125000e-01, /* 0x3fe20de800000000 */
5.66981792449951171875e-01, /* 0x3fe224b700000000 */
5.69756031036376953125e-01, /* 0x3fe23b7100000000 */
5.72520732879638671875e-01, /* 0x3fe2521700000000 */
5.75275897979736328125e-01, /* 0x3fe268a900000000 */
5.78021049499511718750e-01, /* 0x3fe27f2600000000 */
5.80756187438964843750e-01, /* 0x3fe2958e00000000 */
5.83481788635253906250e-01, /* 0x3fe2abe200000000 */
5.86197376251220703125e-01, /* 0x3fe2c22100000000 */
5.88903427124023437500e-01, /* 0x3fe2d84c00000000 */
5.91599464416503906250e-01, /* 0x3fe2ee6200000000 */
5.94285964965820312500e-01, /* 0x3fe3046400000000 */
5.96962928771972656250e-01, /* 0x3fe31a5200000000 */
5.99629878997802734375e-01, /* 0x3fe3302b00000000 */
6.02287292480468750000e-01, /* 0x3fe345f000000000 */
6.04934692382812500000e-01, /* 0x3fe35ba000000000 */
6.07573032379150390625e-01, /* 0x3fe3713d00000000 */
6.10201358795166015625e-01, /* 0x3fe386c500000000 */
6.12820148468017578125e-01, /* 0x3fe39c3900000000 */
6.15428924560546875000e-01, /* 0x3fe3b19800000000 */
6.18028640747070312500e-01, /* 0x3fe3c6e400000000 */
6.20618820190429687500e-01, /* 0x3fe3dc1c00000000 */
6.23198986053466796875e-01, /* 0x3fe3f13f00000000 */
6.25770092010498046875e-01, /* 0x3fe4064f00000000 */
6.28331184387207031250e-01, /* 0x3fe41b4a00000000 */
6.30883216857910156250e-01, /* 0x3fe4303200000000 */
6.33425712585449218750e-01, /* 0x3fe4450600000000 */
6.35958671569824218750e-01, /* 0x3fe459c600000000 */
6.38482093811035156250e-01, /* 0x3fe46e7200000000 */
6.40995979309082031250e-01, /* 0x3fe4830a00000000 */
6.43500804901123046875e-01, /* 0x3fe4978f00000000 */
6.45996093750000000000e-01, /* 0x3fe4ac0000000000 */
6.48482322692871093750e-01, /* 0x3fe4c05e00000000 */
6.50959014892578125000e-01, /* 0x3fe4d4a800000000 */
6.53426170349121093750e-01, /* 0x3fe4e8de00000000 */
6.55884265899658203125e-01, /* 0x3fe4fd0100000000 */
6.58332824707031250000e-01, /* 0x3fe5111000000000 */
6.60772323608398437500e-01, /* 0x3fe5250c00000000 */
6.63202762603759765625e-01, /* 0x3fe538f500000000 */
6.65623664855957031250e-01, /* 0x3fe54cca00000000 */
6.68035984039306640625e-01, /* 0x3fe5608d00000000 */
6.70438766479492187500e-01, /* 0x3fe5743c00000000 */
6.72832489013671875000e-01, /* 0x3fe587d800000000 */
6.75216674804687500000e-01, /* 0x3fe59b6000000000 */
6.77592277526855468750e-01, /* 0x3fe5aed600000000 */
6.79958820343017578125e-01, /* 0x3fe5c23900000000 */
6.82316303253173828125e-01, /* 0x3fe5d58900000000 */
6.84664726257324218750e-01, /* 0x3fe5e8c600000000 */
6.87004089355468750000e-01, /* 0x3fe5fbf000000000 */
6.89334869384765625000e-01, /* 0x3fe60f0800000000 */
6.91656589508056640625e-01, /* 0x3fe6220d00000000 */
6.93969249725341796875e-01, /* 0x3fe634ff00000000 */
6.96272850036621093750e-01, /* 0x3fe647de00000000 */
6.98567867279052734375e-01, /* 0x3fe65aab00000000 */
7.00854301452636718750e-01, /* 0x3fe66d6600000000 */
7.03131675720214843750e-01, /* 0x3fe6800e00000000 */
7.05400466918945312500e-01, /* 0x3fe692a400000000 */
7.07660198211669921875e-01, /* 0x3fe6a52700000000 */
7.09911346435546875000e-01, /* 0x3fe6b79800000000 */
7.12153911590576171875e-01, /* 0x3fe6c9f700000000 */
7.14387893676757812500e-01, /* 0x3fe6dc4400000000 */
7.16613292694091796875e-01, /* 0x3fe6ee7f00000000 */
7.18829631805419921875e-01, /* 0x3fe700a700000000 */
7.21037864685058593750e-01, /* 0x3fe712be00000000 */
7.23237514495849609375e-01, /* 0x3fe724c300000000 */
7.25428581237792968750e-01, /* 0x3fe736b600000000 */
7.27611064910888671875e-01, /* 0x3fe7489700000000 */
7.29785442352294921875e-01, /* 0x3fe75a6700000000 */
7.31950759887695312500e-01, /* 0x3fe76c2400000000 */
7.34108448028564453125e-01, /* 0x3fe77dd100000000 */
7.36257076263427734375e-01, /* 0x3fe78f6b00000000 */
7.38397598266601562500e-01, /* 0x3fe7a0f400000000 */
7.40530014038085937500e-01, /* 0x3fe7b26c00000000 */
7.42654323577880859375e-01, /* 0x3fe7c3d300000000 */
7.44770050048828125000e-01, /* 0x3fe7d52800000000 */
7.46877670288085937500e-01, /* 0x3fe7e66c00000000 */
7.48976707458496093750e-01, /* 0x3fe7f79e00000000 */
7.51068115234375000000e-01, /* 0x3fe808c000000000 */
7.53150939941406250000e-01, /* 0x3fe819d000000000 */
7.55226135253906250000e-01, /* 0x3fe82ad000000000 */
7.57292747497558593750e-01, /* 0x3fe83bbe00000000 */
7.59351730346679687500e-01, /* 0x3fe84c9c00000000 */
7.61402606964111328125e-01, /* 0x3fe85d6900000000 */
7.63445377349853515625e-01, /* 0x3fe86e2500000000 */
7.65480041503906250000e-01, /* 0x3fe87ed000000000 */
7.67507076263427734375e-01, /* 0x3fe88f6b00000000 */
7.69526004791259765625e-01, /* 0x3fe89ff500000000 */
7.71537303924560546875e-01, /* 0x3fe8b06f00000000 */
7.73540973663330078125e-01, /* 0x3fe8c0d900000000 */
7.75536537170410156250e-01, /* 0x3fe8d13200000000 */
7.77523994445800781250e-01, /* 0x3fe8e17a00000000 */
7.79504299163818359375e-01, /* 0x3fe8f1b300000000 */
7.81476497650146484375e-01, /* 0x3fe901db00000000 */
7.83441066741943359375e-01, /* 0x3fe911f300000000 */
7.85398006439208984375e-01}; /* 0x3fe921fb00000000 */
static const double atan_jby256_tail[ 241] = {
2.13244638182005395671e-08, /* 0x3e56e59fbd38db2c */
3.89093864761712760656e-08, /* 0x3e64e3aa54dedf96 */
4.44780900009437454576e-08, /* 0x3e67e105ab1bda88 */
1.15344768460112754160e-08, /* 0x3e48c5254d013fd0 */
3.37271051945395312705e-09, /* 0x3e2cf8ab3ad62670 */
2.40857608736109859459e-08, /* 0x3e59dca4bec80468 */
1.85853810450623807768e-08, /* 0x3e53f4b5ec98a8da */
5.14358299969225078306e-08, /* 0x3e6b9d49619d81fe */
8.85023985412952486748e-09, /* 0x3e43017887460934 */
1.59425154214358432060e-08, /* 0x3e511e3eca0b9944 */
1.95139937737755753164e-08, /* 0x3e54f3f73c5a332e */
2.64909755273544319715e-08, /* 0x3e5c71c8ae0e00a6 */
4.43388037881231070144e-08, /* 0x3e67cde0f86fbdc7 */
2.14757072421821274557e-08, /* 0x3e570f328c889c72 */
2.61049792670754218852e-08, /* 0x3e5c07ae9b994efe */
7.81439350674466302231e-09, /* 0x3e40c8021d7b1698 */
3.60125207123751024094e-08, /* 0x3e635585edb8cb22 */
6.15276238179343767917e-08, /* 0x3e70842567b30e96 */
9.54387964641184285058e-08, /* 0x3e799e811031472e */
3.02789566851502754129e-08, /* 0x3e6041821416bcee */
1.16888650949870856331e-07, /* 0x3e7f6086e4dc96f4 */
1.07580956468653338863e-08, /* 0x3e471a535c5f1b58 */
8.33454265379535427653e-08, /* 0x3e765f743fe63ca1 */
1.10790279272629526068e-07, /* 0x3e7dbd733472d014 */
1.08394277896366207424e-07, /* 0x3e7d18cc4d8b0d1d */
9.22176086126841098800e-08, /* 0x3e78c12553c8fb29 */
7.90938592199048786990e-08, /* 0x3e753b49e2e8f991 */
8.66445407164293125637e-08, /* 0x3e77422ae148c141 */
1.40839973537092438671e-08, /* 0x3e4e3ec269df56a8 */
1.19070438507307600689e-07, /* 0x3e7ff6754e7e0ac9 */
6.40451663051716197071e-08, /* 0x3e7131267b1b5aad */
1.08338682076343674522e-07, /* 0x3e7d14fa403a94bc */
3.52999550187922736222e-08, /* 0x3e62f396c089a3d8 */
1.05983273930043077202e-07, /* 0x3e7c731d78fa95bb */
1.05486124078259553339e-07, /* 0x3e7c50f385177399 */
5.82167732281776477773e-08, /* 0x3e6f41409c6f2c20 */
1.08696483983403942633e-07, /* 0x3e7d2d90c4c39ec0 */
4.47335086122377542835e-08, /* 0x3e680420696f2106 */
1.26896287162615723528e-08, /* 0x3e4b40327943a2e8 */
4.06534471589151404531e-08, /* 0x3e65d35e02f3d2a2 */
3.84504846300557026690e-08, /* 0x3e64a498288117b0 */
3.60715006404807269080e-08, /* 0x3e635da119afb324 */
6.44725903165522722801e-08, /* 0x3e714e85cdb9a908 */
3.63749249976409461305e-08, /* 0x3e638754e5547b9a */
1.03901294413833913794e-07, /* 0x3e7be40ae6ce3246 */
6.25379756302167880580e-08, /* 0x3e70c993b3bea7e7 */
6.63984302368488828029e-08, /* 0x3e71d2dd89ac3359 */
3.21844598971548278059e-08, /* 0x3e61476603332c46 */
1.16030611712765830905e-07, /* 0x3e7f25901bac55b7 */
1.17464622142347730134e-07, /* 0x3e7f881b7c826e28 */
7.54604017965808996596e-08, /* 0x3e7441996d698d20 */
1.49234929356206556899e-07, /* 0x3e8407ac521ea089 */
1.41416924523217430259e-07, /* 0x3e82fb0c6c4b1723 */
2.13308065617483489011e-07, /* 0x3e8ca135966a3e18 */
5.04230937933302320146e-08, /* 0x3e6b1218e4d646e4 */
5.45874922281655519035e-08, /* 0x3e6d4e72a350d288 */
1.51849028914786868886e-07, /* 0x3e84617e2f04c329 */
3.09004308703769273010e-08, /* 0x3e6096ec41e82650 */
9.67574548184738317664e-08, /* 0x3e79f91f25773e6e */
4.02508285529322212824e-08, /* 0x3e659c0820f1d674 */
3.01222268096861091157e-08, /* 0x3e602bf7a2df1064 */
2.36189860670079288680e-07, /* 0x3e8fb36bfc40508f */
1.14095158111080887695e-07, /* 0x3e7ea08f3f8dc892 */
7.42349089746573467487e-08, /* 0x3e73ed6254656a0e */
5.12515583196230380184e-08, /* 0x3e6b83f5e5e69c58 */
2.19290391828763918102e-07, /* 0x3e8d6ec2af768592 */
3.83263512187553886471e-08, /* 0x3e6493889a226f94 */
1.61513486284090523855e-07, /* 0x3e85ad8fa65279ba */
5.09996743535589922261e-08, /* 0x3e6b615784d45434 */
1.23694037861246766534e-07, /* 0x3e809a184368f145 */
8.23367955351123783984e-08, /* 0x3e761a2439b0d91c */
1.07591766213053694014e-07, /* 0x3e7ce1a65e39a978 */
1.42789947524631815640e-07, /* 0x3e832a39a93b6a66 */
1.32347123024711878538e-07, /* 0x3e81c3699af804e7 */
2.17626067316598149229e-08, /* 0x3e575e0f4e44ede8 */
2.34454866923044288656e-07, /* 0x3e8f77ced1a7a83b */
2.82966370261766916053e-09, /* 0x3e284e7f0cb1b500 */
2.29300919890907632975e-07, /* 0x3e8ec6b838b02dfe */
1.48428270450261284915e-07, /* 0x3e83ebf4dfbeda87 */
1.87937408574313982512e-07, /* 0x3e89397aed9cb475 */
6.13685946813334055347e-08, /* 0x3e707937bc239c54 */
1.98585022733583817493e-07, /* 0x3e8aa754553131b6 */
7.68394131623752961662e-08, /* 0x3e74a05d407c45dc */
1.28119052312436745644e-07, /* 0x3e8132231a206dd0 */
7.02119104719236502733e-08, /* 0x3e72d8ecfdd69c88 */
9.87954793820636301943e-08, /* 0x3e7a852c74218606 */
1.72176752381034986217e-07, /* 0x3e871bf2baeebb50 */
1.12877225146169704119e-08, /* 0x3e483d7db7491820 */
5.33549829555851737993e-08, /* 0x3e6ca50d92b6da14 */
2.13833275710816521345e-08, /* 0x3e56f5cde8530298 */
1.16243518048290556393e-07, /* 0x3e7f343198910740 */
6.29926408369055877943e-08, /* 0x3e70e8d241ccd80a */
6.45429039328021963791e-08, /* 0x3e71535ac619e6c8 */
8.64001922814281933403e-08, /* 0x3e77316041c36cd2 */
9.50767572202325800240e-08, /* 0x3e7985a000637d8e */
5.80851497508121135975e-08, /* 0x3e6f2f29858c0a68 */
1.82350561135024766232e-07, /* 0x3e8879847f96d909 */
1.98948680587390608655e-07, /* 0x3e8ab3d319e12e42 */
7.83548663450197659846e-08, /* 0x3e75088162dfc4c2 */
3.04374234486798594427e-08, /* 0x3e605749a1cd9d8c */
2.76135725629797411787e-08, /* 0x3e5da65c6c6b8618 */
4.32610105454203065470e-08, /* 0x3e6739bf7df1ad64 */
5.17107515324127256994e-08, /* 0x3e6bc31252aa3340 */
2.82398327875841444660e-08, /* 0x3e5e528191ad3aa8 */
1.87482469524195595399e-07, /* 0x3e8929d93df19f18 */
2.97481891662714096139e-08, /* 0x3e5ff11eb693a080 */
9.94421570843584316402e-09, /* 0x3e455ae3f145a3a0 */
1.07056210730391848428e-07, /* 0x3e7cbcd8c6c0ca82 */
6.25589580466881163081e-08, /* 0x3e70cb04d425d304 */
9.56641013869464593803e-08, /* 0x3e79adfcab5be678 */
1.88056307148355440276e-07, /* 0x3e893d90c5662508 */
8.38850689379557880950e-08, /* 0x3e768489bd35ff40 */
5.01215865527674122924e-09, /* 0x3e3586ed3da2b7e0 */
1.74166095998522089762e-07, /* 0x3e87604d2e850eee */
9.96779574395363585849e-08, /* 0x3e7ac1d12bfb53d8 */
5.98432026368321460686e-09, /* 0x3e39b3d468274740 */
1.18362922366887577169e-07, /* 0x3e7fc5d68d10e53c */
1.86086833284154215946e-07, /* 0x3e88f9e51884becb */
1.97671457251348941011e-07, /* 0x3e8a87f0869c06d1 */
1.42447160717199237159e-07, /* 0x3e831e7279f685fa */
1.05504240785546574184e-08, /* 0x3e46a8282f9719b0 */
3.13335218371639189324e-08, /* 0x3e60d2724a8a44e0 */
1.96518418901914535399e-07, /* 0x3e8a60524b11ad4e */
2.17692035039173536059e-08, /* 0x3e575fdf832750f0 */
2.15613114426529981675e-07, /* 0x3e8cf06902e4cd36 */
5.68271098300441214948e-08, /* 0x3e6e82422d4f6d10 */
1.70331455823369124256e-08, /* 0x3e524a091063e6c0 */
9.17590028095709583247e-08, /* 0x3e78a1a172dc6f38 */
2.77266304112916566247e-07, /* 0x3e929b6619f8a92d */
9.37041937614656939690e-08, /* 0x3e79274d9c1b70c8 */
1.56116346368316796511e-08, /* 0x3e50c34b1fbb7930 */
4.13967433808382727413e-08, /* 0x3e6639866c20eb50 */
1.70164749185821616276e-07, /* 0x3e86d6d0f6832e9e */
4.01708788545600086008e-07, /* 0x3e9af54def99f25e */
2.59663539226050551563e-07, /* 0x3e916cfc52a00262 */
2.22007487655027469542e-07, /* 0x3e8dcc1e83569c32 */
2.90542250809644081369e-07, /* 0x3e937f7a551ed425 */
4.67720537666628903341e-07, /* 0x3e9f6360adc98887 */
2.79799803956772554802e-07, /* 0x3e92c6ec8d35a2c1 */
2.07344552327432547723e-07, /* 0x3e8bd44df84cb036 */
2.54705698692735196368e-07, /* 0x3e9117cf826e310e */
4.26848589539548450728e-07, /* 0x3e9ca533f332cfc9 */
2.52506723633552216197e-07, /* 0x3e90f208509dbc2e */
2.14684129933849704964e-07, /* 0x3e8cd07d93c945de */
3.20134822201596505431e-07, /* 0x3e957bdfd67e6d72 */
9.93537565749855712134e-08, /* 0x3e7aab89c516c658 */
3.70792944827917252327e-08, /* 0x3e63e823b1a1b8a0 */
1.41772749369083698972e-07, /* 0x3e8307464a9d6d3c */
4.22446601490198804306e-07, /* 0x3e9c5993cd438843 */
4.11818433724801511540e-07, /* 0x3e9ba2fca02ab554 */
1.19976381502605310519e-07, /* 0x3e801a5b6983a268 */
3.43703078571520905265e-08, /* 0x3e6273d1b350efc8 */
1.66128705555453270379e-07, /* 0x3e864c238c37b0c6 */
5.00499610023283006540e-08, /* 0x3e6aded07370a300 */
1.75105139941208062123e-07, /* 0x3e878091197eb47e */
7.70807146729030327334e-08, /* 0x3e74b0f245e0dabc */
2.45918607526895836121e-07, /* 0x3e9080d9794e2eaf */
2.18359020958626199345e-07, /* 0x3e8d4ec242b60c76 */
8.44342887976445333569e-09, /* 0x3e4221d2f940caa0 */
1.07506148687888629299e-07, /* 0x3e7cdbc42b2bba5c */
5.36544954316820904572e-08, /* 0x3e6cce37bb440840 */
3.39109101518396596341e-07, /* 0x3e96c1d999cf1dd0 */
2.60098720293920613340e-08, /* 0x3e5bed8a07eb0870 */
8.42678991664621455827e-08, /* 0x3e769ed88f490e3c */
5.36972237470183633197e-08, /* 0x3e6cd41719b73ef0 */
4.28192558171921681288e-07, /* 0x3e9cbc4ac95b41b7 */
2.71535491483955143294e-07, /* 0x3e9238f1b890f5d7 */
7.84094998145075780203e-08, /* 0x3e750c4282259cc4 */
3.43880599134117431863e-07, /* 0x3e9713d2de87b3e2 */
1.32878065060366481043e-07, /* 0x3e81d5a7d2255276 */
4.18046802627967629428e-07, /* 0x3e9c0dfd48227ac1 */
2.65042411765766019424e-07, /* 0x3e91c964dab76753 */
1.70383695347518643694e-07, /* 0x3e86de56d5704496 */
1.54096497259613515678e-07, /* 0x3e84aeb71fd19968 */
2.36543402412459813461e-07, /* 0x3e8fbf91c57b1918 */
4.38416350106876736790e-07, /* 0x3e9d6bef7fbe5d9a */
3.03892161339927775731e-07, /* 0x3e9464d3dc249066 */
3.31136771605664899240e-07, /* 0x3e9638e2ec4d9073 */
6.49494294526590682218e-08, /* 0x3e716f4a7247ea7c */
4.10423429887181345747e-09, /* 0x3e31a0a740f1d440 */
1.70831640869113847224e-07, /* 0x3e86edbb0114a33c */
1.10811512657909180966e-07, /* 0x3e7dbee8bf1d513c */
3.23677724749783611964e-07, /* 0x3e95b8bdb0248f73 */
3.55662734259192678528e-07, /* 0x3e97de3d3f5eac64 */
2.30102333489738219140e-07, /* 0x3e8ee24187ae448a */
4.47429004000738629714e-07, /* 0x3e9e06c591ec5192 */
7.78167135617329598659e-08, /* 0x3e74e3861a332738 */
9.90345291908535415737e-08, /* 0x3e7a9599dcc2bfe4 */
5.85800913143113728314e-08, /* 0x3e6f732fbad43468 */
4.57859062410871843857e-07, /* 0x3e9eb9f573b727d9 */
3.67993069723390929794e-07, /* 0x3e98b212a2eb9897 */
2.90836464322977276043e-07, /* 0x3e9384884c167215 */
2.51621574250131388318e-07, /* 0x3e90e2d363020051 */
2.75789824740652815545e-07, /* 0x3e92820879fbd022 */
3.88985776250314403593e-07, /* 0x3e9a1ab9893e4b30 */
1.40214080183768019611e-07, /* 0x3e82d1b817a24478 */
3.23451432223550478373e-08, /* 0x3e615d7b8ded4878 */
9.15979180730608444470e-08, /* 0x3e78968f9db3a5e4 */
3.44371402498640470421e-07, /* 0x3e971c4171fe135f */
3.40401897215059498077e-07, /* 0x3e96d80f605d0d8c */
1.06431813453707950243e-07, /* 0x3e7c91f043691590 */
1.46204238932338846248e-07, /* 0x3e839f8a15fce2b2 */
9.94610376972039046878e-09, /* 0x3e455beda9d94b80 */
2.01711528092681771039e-07, /* 0x3e8b12c15d60949a */
2.72027977986191568296e-07, /* 0x3e924167b312bfe3 */
2.48402602511693757964e-07, /* 0x3e90ab8633070277 */
1.58480011219249621715e-07, /* 0x3e854554ebbc80ee */
3.00372828113368713281e-08, /* 0x3e60204aef5a4bb8 */
3.67816204583541976394e-07, /* 0x3e98af08c679cf2c */
2.46169793032343824291e-07, /* 0x3e90852a330ae6c8 */
1.70080468270204253247e-07, /* 0x3e86d3eb9ec32916 */
1.67806717763872914315e-07, /* 0x3e8685cb7fcbbafe */
2.67715622006907942620e-07, /* 0x3e91f751c1e0bd95 */
2.14411342550299170574e-08, /* 0x3e5705b1b0f72560 */
4.11228221283669073277e-07, /* 0x3e9b98d8d808ca92 */
3.52311752396749662260e-08, /* 0x3e62ea22c75cc980 */
3.52718000397367821054e-07, /* 0x3e97aba62bca0350 */
4.38857387992911129814e-07, /* 0x3e9d73833442278c */
3.22574606753482540743e-07, /* 0x3e95a5ca1fb18bf9 */
3.28730371182804296828e-08, /* 0x3e61a6092b6ecf28 */
7.56672470607639279700e-08, /* 0x3e744fd049aac104 */
3.26750155316369681821e-09, /* 0x3e2c114fd8df5180 */
3.21724445362095284743e-07, /* 0x3e95972f130feae5 */
1.06639427371776571151e-07, /* 0x3e7ca034a55fe198 */
3.41020788139524715063e-07, /* 0x3e96e2b149990227 */
1.00582838631232552824e-07, /* 0x3e7b00000294592c */
3.68439433859276640065e-07, /* 0x3e98b9bdc442620e */
2.20403078342388012027e-07, /* 0x3e8d94fdfabf3e4e */
1.62841467098298142534e-07, /* 0x3e85db30b145ad9a */
2.25325348296680733838e-07, /* 0x3e8e3e1eb95022b0 */
4.37462238226421614339e-07, /* 0x3e9d5b8b45442bd6 */
3.52055880555040706500e-07, /* 0x3e97a046231ecd2e */
4.75614398494781776825e-07, /* 0x3e9feafe3ef55232 */
3.60998399033215317516e-07, /* 0x3e9839e7bfd78267 */
3.79292434611513945954e-08, /* 0x3e645cf49d6fa900 */
1.29859015528549300061e-08, /* 0x3e4be3132b27f380 */
3.15927546985474913188e-07, /* 0x3e9533980bb84f9f */
2.28533679887379668031e-08, /* 0x3e5889e2ce3ba390 */
1.17222541823553133877e-07, /* 0x3e7f7778c3ad0cc8 */
1.51991208405464415857e-07, /* 0x3e846660cec4eba2 */
1.56958239325240655564e-07}; /* 0x3e85110b4611a626 */
/* Some constants and split constants. */
static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */
piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */
piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
three_piby4 = 2.3561944901923449e+00, /* 0x4002d97c7f3321d2 */
pi_head = 3.1415926218032836e+00, /* 0x400921fb50000000 */
pi_tail = 3.1786509547056392e-08, /* 0x3e6110b4611a6263 */
piby2_head = 1.5707963267948965e+00, /* 0x3ff921fb54442d18 */
piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */
double u, v, vbyu, q1, q2, s, u1, vu1, u2, vu2, uu, c, r;
unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf;
int m, xexp, yexp, diffexp;
/* Find properties of arguments x and y. */
unsigned long ux, ui, aux, xneg, uy, auy, yneg;
GET_BITS_DP64(x, ux);
GET_BITS_DP64(y, uy);
aux = ux & ~SIGNBIT_DP64;
auy = uy & ~SIGNBIT_DP64;
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
xneg = ux & SIGNBIT_DP64;
yneg = uy & SIGNBIT_DP64;
xzero = (aux == 0);
yzero = (auy == 0);
xnan = (aux > PINFBITPATT_DP64);
ynan = (auy > PINFBITPATT_DP64);
xinf = (aux == PINFBITPATT_DP64);
yinf = (auy == PINFBITPATT_DP64);
diffexp = yexp - xexp;
/* Special cases */
if (xnan)
return _handle_error("atan2", OP_ATAN2, ux|0x0008000000000000, _DOMAIN, 0,
EDOM, x, y, 2);
else if (ynan)
return _handle_error("atan2", OP_ATAN2, uy|0x0008000000000000, _DOMAIN, 0,
EDOM, x, y, 2);
else if (yzero)
{ /* Zero y gives +-0 for positive x
and +-pi for negative x */
if (xneg)
{
if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
else return val_with_flags(pi,AMD_F_INEXACT);
}
else return y;
}
else if (xzero)
{ /* Zero x gives +- pi/2
depending on sign of y */
if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
else val_with_flags(piby2,AMD_F_INEXACT);
}
/* Scale up both x and y if they are both below 1/4.
This avoids any possible later denormalised arithmetic. */
if ((xexp < 1021 && yexp < 1021))
{
scaleUpDouble1024(ux, &ux);
scaleUpDouble1024(uy, &uy);
PUT_BITS_DP64(ux, x);
PUT_BITS_DP64(uy, y);
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
diffexp = yexp - xexp;
}
if (diffexp > 56)
{ /* abs(y)/abs(x) > 2^56 => arctan(x/y)
is insignificant compared to piby2 */
if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
else return val_with_flags(piby2,AMD_F_INEXACT);
}
else if (diffexp < -28 && (!xneg))
{ /* x positive and dominant over y by a factor of 2^28.
In this case atan(y/x) is y/x to machine accuracy. */
if (diffexp < -1074) /* Result underflows */
{
if (yneg)
return val_with_flags(-0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
else
return val_with_flags(0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
}
else
{
if (diffexp < -1022)
{
/* Result will likely be denormalized */
y = scaleDouble_1(y, 100);
y /= x;
/* Now y is 2^100 times the true result. Scale it back down. */
GET_BITS_DP64(y, uy);
scaleDownDouble(uy, 100, &uy);
PUT_BITS_DP64(uy, y);
if ((uy & EXPBITS_DP64) == 0)
return val_with_flags(y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
else
return y;
}
else
return y / x;
}
}
else if (diffexp < -56 && xneg)
{ /* abs(x)/abs(y) > 2^56 and x < 0 => arctan(y/x)
is insignificant compared to pi */
if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
else return val_with_flags(pi,AMD_F_INEXACT);
}
else if (yinf && xinf)
{ /* If abs(x) and abs(y) are both infinity
return +-pi/4 or +- 3pi/4 according to
signs. */
if (xneg)
{
if (yneg) return val_with_flags(-three_piby4,AMD_F_INEXACT);
else return val_with_flags(three_piby4,AMD_F_INEXACT);
}
else
{
if (yneg) return val_with_flags(-piby4,AMD_F_INEXACT);
else return val_with_flags(piby4,AMD_F_INEXACT);
}
}
/* General case: take absolute values of arguments */
u = x; v = y;
if (xneg) u = -x;
if (yneg) v = -y;
/* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
swap_vu = (u < v);
if (swap_vu) { uu = u; u = v; v = uu; }
vbyu = v/u;
if (vbyu > 0.0625)
{ /* General values of v/u. Use a look-up
table and series expansion. */
index = (int)(256*vbyu + 0.5);
q1 = atan_jby256_lead[index-16];
q2 = atan_jby256_tail[index-16];
c = index*1./256;
GET_BITS_DP64(u, ui);
m = (int)((ui & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
u = scaleDouble_2(u,-m);
v = scaleDouble_2(v,-m);
GET_BITS_DP64(u, ui);
PUT_BITS_DP64(0xfffffffff8000000 & ui, u1); /* 26 leading bits of u */
u2 = u - u1;
r = ((v-c*u1)-c*u2)/(u+c*v);
/* Polynomial approximation to atan(r) */
s = r*r;
q2 = q2 + r - r*(s * (0.33333333333224095522 - s*(0.19999918038989143496)));
}
else if (vbyu < 1.e-8)
{ /* v/u is small enough that atan(v/u) = v/u */
q1 = 0.0;
q2 = vbyu;
}
else /* vbyu <= 0.0625 */
{
/* Small values of v/u. Use a series expansion
computed carefully to minimise cancellation */
GET_BITS_DP64(u, ui);
PUT_BITS_DP64(0xffffffff00000000 & ui, u1);
GET_BITS_DP64(vbyu, ui);
PUT_BITS_DP64(0xffffffff00000000 & ui, vu1);
u2 = u - u1;
vu2 = vbyu - vu1;
q1 = 0.0;
s = vbyu*vbyu;
q2 = vbyu +
((((v - u1*vu1) - u2*vu1) - u*vu2)/u -
(vbyu*s*(0.33333333333333170500 -
s*(0.19999999999393223405 -
s*(0.14285713561807169030 -
s*(0.11110736283514525407 -
s*(0.90029810285449784439E-01)))))));
}
/* Tidy-up according to which quadrant the arguments lie in */
if (swap_vu) {q1 = piby2_head - q1; q2 = piby2_tail - q2;}
if (xneg) {q1 = pi_head - q1; q2 = pi_tail - q2;}
q1 = q1 + q2;
if (yneg) q1 = - q1;
return q1;
}

View File

@@ -0,0 +1,469 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VALF_WITH_FLAGS
#define USE_NAN_WITH_FLAGS
#define USE_SCALEDOUBLE_1
#define USE_SCALEDOWNDOUBLE
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_VALF_WITH_FLAGS
#undef USE_NAN_WITH_FLAGS
#undef USE_SCALEDOUBLE_1
#undef USE_SCALEDOWNDOUBLE
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(atan2f)
/*
 * atan2f: single-precision two-argument arctangent.
 *
 * Parameters:
 *   fy - ordinate (first argument)
 *   fx - abscissa (second argument)
 *
 * Returns the angle of the point (fx, fy), i.e. atan(fy/fx) adjusted for the
 * quadrant given by the signs of both arguments.  Both inputs are widened to
 * double and all arithmetic is done in double; the result is rounded to float
 * on return.
 *
 * Special cases handled below:
 *   - either argument NaN: returns through _handle_errorf with bit 0x00400000
 *     of the offending single-precision pattern set, _DOMAIN / EDOM.
 *   - y == 0:            y itself for x >= 0 (sign preserved), +-pi for x < 0.
 *   - x == 0, y != 0:    +-pi/2 with the sign of y.
 *   - |y| >> |x|:        +-pi/2.
 *   - x > 0, |x| >> |y|: y/x (with underflow handling).
 *   - x < 0, |x| >> |y|: +-pi.
 *   - both infinite:     +-pi/4 or +-3pi/4.
 */
float FN_PROTOTYPE(atan2f)(float fy, float fx)
{
    /* Array atan_jby256 contains precomputed values of atan(j/256),
       for j = 16, 17, ..., 256. */
    static const double atan_jby256[ 241] = {
    6.24188099959573430842e-02, /* 0x3faff55bb72cfde9 */
    6.63088949198234745008e-02, /* 0x3fb0f99ea71d52a6 */
    7.01969710718705064423e-02, /* 0x3fb1f86dbf082d58 */
    7.40829225490337306415e-02, /* 0x3fb2f719318a4a9a */
    7.79666338315423007588e-02, /* 0x3fb3f59f0e7c559d */
    8.18479898030765457007e-02, /* 0x3fb4f3fd677292fb */
    8.57268757707448092464e-02, /* 0x3fb5f2324fd2d7b2 */
    8.96031774848717321724e-02, /* 0x3fb6f03bdcea4b0c */
    9.34767811585894559112e-02, /* 0x3fb7ee182602f10e */
    9.73475734872236708739e-02, /* 0x3fb8ebc54478fb28 */
    1.01215441667466668485e-01, /* 0x3fb9e94153cfdcf1 */
    1.05080273416329528224e-01, /* 0x3fbae68a71c722b8 */
    1.08941956989865793015e-01, /* 0x3fbbe39ebe6f07c3 */
    1.12800381201659388752e-01, /* 0x3fbce07c5c3cca32 */
    1.16655435441069349478e-01, /* 0x3fbddd21701eba6e */
    1.20507009691224548087e-01, /* 0x3fbed98c2190043a */
    1.24354994546761424279e-01, /* 0x3fbfd5ba9aac2f6d */
    1.28199281231298117811e-01, /* 0x3fc068d584212b3d */
    1.32039761614638734288e-01, /* 0x3fc0e6adccf40881 */
    1.35876328229701304195e-01, /* 0x3fc1646541060850 */
    1.39708874289163620386e-01, /* 0x3fc1e1fafb043726 */
    1.43537293701821222491e-01, /* 0x3fc25f6e171a535c */
    1.47361481088651630200e-01, /* 0x3fc2dcbdb2fba1ff */
    1.51181331798580037562e-01, /* 0x3fc359e8edeb99a3 */
    1.54996741923940972718e-01, /* 0x3fc3d6eee8c6626c */
    1.58807608315631065832e-01, /* 0x3fc453cec6092a9e */
    1.62613828597948567589e-01, /* 0x3fc4d087a9da4f17 */
    1.66415301183114927586e-01, /* 0x3fc54d18ba11570a */
    1.70211925285474380276e-01, /* 0x3fc5c9811e3ec269 */
    1.74003600935367680469e-01, /* 0x3fc645bfffb3aa73 */
    1.77790228992676047071e-01, /* 0x3fc6c1d4898933d8 */
    1.81571711160032150945e-01, /* 0x3fc73dbde8a7d201 */
    1.85347949995694760705e-01, /* 0x3fc7b97b4bce5b02 */
    1.89118848926083965578e-01, /* 0x3fc8350be398ebc7 */
    1.92884312257974643856e-01, /* 0x3fc8b06ee2879c28 */
    1.96644245190344985064e-01, /* 0x3fc92ba37d050271 */
    2.00398553825878511514e-01, /* 0x3fc9a6a8e96c8626 */
    2.04147145182116990236e-01, /* 0x3fca217e601081a5 */
    2.07889927202262986272e-01, /* 0x3fca9c231b403279 */
    2.11626808765629753628e-01, /* 0x3fcb1696574d780b */
    2.15357699697738047551e-01, /* 0x3fcb90d7529260a2 */
    2.19082510780057748701e-01, /* 0x3fcc0ae54d768466 */
    2.22801153759394493514e-01, /* 0x3fcc84bf8a742e6d */
    2.26513541356919617664e-01, /* 0x3fccfe654e1d5395 */
    2.30219587276843717927e-01, /* 0x3fcd77d5df205736 */
    2.33919206214733416127e-01, /* 0x3fcdf110864c9d9d */
    2.37612313865471241892e-01, /* 0x3fce6a148e96ec4d */
    2.41298826930858800743e-01, /* 0x3fcee2e1451d980c */
    2.44978663126864143473e-01, /* 0x3fcf5b75f92c80dd */
    2.48651741190513253521e-01, /* 0x3fcfd3d1fc40dbe4 */
    2.52317980886427151166e-01, /* 0x3fd025fa510665b5 */
    2.55977303013005474952e-01, /* 0x3fd061eea03d6290 */
    2.59629629408257511791e-01, /* 0x3fd09dc597d86362 */
    2.63274882955282396590e-01, /* 0x3fd0d97ee509acb3 */
    2.66912987587400396539e-01, /* 0x3fd1151a362431c9 */
    2.70543868292936529052e-01, /* 0x3fd150973a9ce546 */
    2.74167451119658789338e-01, /* 0x3fd18bf5a30bf178 */
    2.77783663178873208022e-01, /* 0x3fd1c735212dd883 */
    2.81392432649178403370e-01, /* 0x3fd2025567e47c95 */
    2.84993688779881237938e-01, /* 0x3fd23d562b381041 */
    2.88587361894077354396e-01, /* 0x3fd278372057ef45 */
    2.92173383391398755471e-01, /* 0x3fd2b2f7fd9b5fe2 */
    2.95751685750431536626e-01, /* 0x3fd2ed987a823cfe */
    2.99322202530807379706e-01, /* 0x3fd328184fb58951 */
    3.02884868374971361060e-01, /* 0x3fd362773707ebcb */
    3.06439619009630070945e-01, /* 0x3fd39cb4eb76157b */
    3.09986391246883430384e-01, /* 0x3fd3d6d129271134 */
    3.13525122985043869228e-01, /* 0x3fd410cbad6c7d32 */
    3.17055753209146973237e-01, /* 0x3fd44aa436c2af09 */
    3.20578221991156986359e-01, /* 0x3fd4845a84d0c21b */
    3.24092470489871664618e-01, /* 0x3fd4bdee586890e6 */
    3.27598440950530811477e-01, /* 0x3fd4f75f73869978 */
    3.31096076704132047386e-01, /* 0x3fd530ad9951cd49 */
    3.34585322166458920545e-01, /* 0x3fd569d88e1b4cd7 */
    3.38066122836825466713e-01, /* 0x3fd5a2e0175e0f4e */
    3.41538425296541714449e-01, /* 0x3fd5dbc3fbbe768d */
    3.45002177207105076295e-01, /* 0x3fd614840309cfe1 */
    3.48457327308122011278e-01, /* 0x3fd64d1ff635c1c5 */
    3.51903825414964732676e-01, /* 0x3fd685979f5fa6fd */
    3.55341622416168290144e-01, /* 0x3fd6bdeac9cbd76c */
    3.58770670270572189509e-01, /* 0x3fd6f61941e4def0 */
    3.62190922004212156882e-01, /* 0x3fd72e22d53aa2a9 */
    3.65602331706966821034e-01, /* 0x3fd7660752817501 */
    3.69004854528964421068e-01, /* 0x3fd79dc6899118d1 */
    3.72398446676754202311e-01, /* 0x3fd7d5604b63b3f7 */
    3.75783065409248884237e-01, /* 0x3fd80cd46a14b1d0 */
    3.79158669033441808605e-01, /* 0x3fd84422b8df95d7 */
    3.82525216899905096124e-01, /* 0x3fd87b4b0c1ebedb */
    3.85882669398073752109e-01, /* 0x3fd8b24d394a1b25 */
    3.89230987951320717144e-01, /* 0x3fd8e92916f5cde8 */
    3.92570135011828580396e-01, /* 0x3fd91fde7cd0c662 */
    3.95900074055262896078e-01, /* 0x3fd9566d43a34907 */
    3.99220769575252543149e-01, /* 0x3fd98cd5454d6b18 */
    4.02532187077682512832e-01, /* 0x3fd9c3165cc58107 */
    4.05834293074804064450e-01, /* 0x3fd9f93066168001 */
    4.09127055079168300278e-01, /* 0x3fda2f233e5e530b */
    4.12410441597387267265e-01, /* 0x3fda64eec3cc23fc */
    4.15684422123729413467e-01, /* 0x3fda9a92d59e98cf */
    4.18948967133552840902e-01, /* 0x3fdad00f5422058b */
    4.22204048076583571270e-01, /* 0x3fdb056420ae9343 */
    4.25449637370042266227e-01, /* 0x3fdb3a911da65c6c */
    4.28685708391625730496e-01, /* 0x3fdb6f962e737efb */
    4.31912235472348193799e-01, /* 0x3fdba473378624a5 */
    4.35129193889246812521e-01, /* 0x3fdbd9281e528191 */
    4.38336559857957774877e-01, /* 0x3fdc0db4c94ec9ef */
    4.41534310525166673322e-01, /* 0x3fdc42191ff11eb6 */
    4.44722423960939305942e-01, /* 0x3fdc76550aad71f8 */
    4.47900879150937292206e-01, /* 0x3fdcaa6872f3631b */
    4.51069655988523443568e-01, /* 0x3fdcde53432c1350 */
    4.54228735266762495559e-01, /* 0x3fdd121566b7f2ad */
    4.57378098670320809571e-01, /* 0x3fdd45aec9ec862b */
    4.60517728767271039558e-01, /* 0x3fdd791f5a1226f4 */
    4.63647609000806093515e-01, /* 0x3fddac670561bb4f */
    4.66767723680866497560e-01, /* 0x3fdddf85bb026974 */
    4.69878057975686880265e-01, /* 0x3fde127b6b0744af */
    4.72978597903265574054e-01, /* 0x3fde4548066cf51a */
    4.76069330322761219421e-01, /* 0x3fde77eb7f175a34 */
    4.79150242925822533735e-01, /* 0x3fdeaa65c7cf28c4 */
    4.82221324227853687105e-01, /* 0x3fdedcb6d43f8434 */
    4.85282563559221225002e-01, /* 0x3fdf0ede98f393cf */
    4.88333951056405479729e-01, /* 0x3fdf40dd0b541417 */
    4.91375477653101910835e-01, /* 0x3fdf72b221a4e495 */
    4.94407135071275316562e-01, /* 0x3fdfa45dd3029258 */
    4.97428915812172245392e-01, /* 0x3fdfd5e0175fdf83 */
    5.00440813147294050189e-01, /* 0x3fe0039c73c1a40b */
    5.03442821109336358099e-01, /* 0x3fe01c341e82422d */
    5.06434934483096732549e-01, /* 0x3fe034b709250488 */
    5.09417148796356245022e-01, /* 0x3fe04d25314342e5 */
    5.12389460310737621107e-01, /* 0x3fe0657e94db30cf */
    5.15351866012543347040e-01, /* 0x3fe07dc3324e9b38 */
    5.18304363603577900044e-01, /* 0x3fe095f30861a58f */
    5.21246951491958210312e-01, /* 0x3fe0ae0e1639866c */
    5.24179628782913242802e-01, /* 0x3fe0c6145b5b43da */
    5.27102395269579471204e-01, /* 0x3fe0de05d7aa6f7c */
    5.30015251423793132268e-01, /* 0x3fe0f5e28b67e295 */
    5.32918198386882147055e-01, /* 0x3fe10daa77307a0d */
    5.35811237960463593311e-01, /* 0x3fe1255d9bfbd2a8 */
    5.38694372597246617929e-01, /* 0x3fe13cfbfb1b056e */
    5.41567605391844897333e-01, /* 0x3fe1548596376469 */
    5.44430940071603086672e-01, /* 0x3fe16bfa6f5137e1 */
    5.47284380987436924748e-01, /* 0x3fe1835a88be7c13 */
    5.50127933104692989907e-01, /* 0x3fe19aa5e5299f99 */
    5.52961601994028217888e-01, /* 0x3fe1b1dc87904284 */
    5.55785393822313511514e-01, /* 0x3fe1c8fe7341f64f */
    5.58599315343562330405e-01, /* 0x3fe1e00babdefeb3 */
    5.61403373889889367732e-01, /* 0x3fe1f7043557138a */
    5.64197577362497537656e-01, /* 0x3fe20de813e823b1 */
    5.66981934222700489912e-01, /* 0x3fe224b74c1d192a */
    5.69756453482978431069e-01, /* 0x3fe23b71e2cc9e6a */
    5.72521144698072359525e-01, /* 0x3fe25217dd17e501 */
    5.75276017956117824426e-01, /* 0x3fe268a940696da6 */
    5.78021083869819540801e-01, /* 0x3fe27f261273d1b3 */
    5.80756353567670302596e-01, /* 0x3fe2958e59308e30 */
    5.83481838685214859730e-01, /* 0x3fe2abe21aded073 */
    5.86197551356360535557e-01, /* 0x3fe2c2215e024465 */
    5.88903504204738026395e-01, /* 0x3fe2d84c2961e48b */
    5.91599710335111383941e-01, /* 0x3fe2ee628406cbca */
    5.94286183324841177367e-01, /* 0x3fe30464753b090a */
    5.96962937215401501234e-01, /* 0x3fe31a52048874be */
    5.99629986503951384336e-01, /* 0x3fe3302b39b78856 */
    6.02287346134964152178e-01, /* 0x3fe345f01cce37bb */
    6.04935031491913965951e-01, /* 0x3fe35ba0b60eccce */
    6.07573058389022313541e-01, /* 0x3fe3713d0df6c503 */
    6.10201443063065118722e-01, /* 0x3fe386c52d3db11e */
    6.12820202165241245673e-01, /* 0x3fe39c391cd41719 */
    6.15429352753104952356e-01, /* 0x3fe3b198e5e2564a */
    6.18028912282561737612e-01, /* 0x3fe3c6e491c78dc4 */
    6.20618898599929469384e-01, /* 0x3fe3dc1c2a188504 */
    6.23199329934065904268e-01, /* 0x3fe3f13fb89e96f4 */
    6.25770224888563042498e-01, /* 0x3fe4064f47569f48 */
    6.28331602434009650615e-01, /* 0x3fe41b4ae06fea41 */
    6.30883481900321840818e-01, /* 0x3fe430328e4b26d5 */
    6.33425882969144482537e-01, /* 0x3fe445065b795b55 */
    6.35958825666321447834e-01, /* 0x3fe459c652badc7f */
    6.38482330354437466191e-01, /* 0x3fe46e727efe4715 */
    6.40996417725432032775e-01, /* 0x3fe4830aeb5f7bfd */
    6.43501108793284370968e-01, /* 0x3fe4978fa3269ee1 */
    6.45996424886771558604e-01, /* 0x3fe4ac00b1c71762 */
    6.48482387642300484032e-01, /* 0x3fe4c05e22de94e4 */
    6.50959018996812410762e-01, /* 0x3fe4d4a8023414e8 */
    6.53426341180761927063e-01, /* 0x3fe4e8de5bb6ec04 */
    6.55884376711170835605e-01, /* 0x3fe4fd013b7dd17e */
    6.58333148384755983962e-01, /* 0x3fe51110adc5ed81 */
    6.60772679271132590273e-01, /* 0x3fe5250cbef1e9fa */
    6.63202992706093175102e-01, /* 0x3fe538f57b89061e */
    6.65624112284960989250e-01, /* 0x3fe54ccaf0362c8f */
    6.68036061856020157990e-01, /* 0x3fe5608d29c70c34 */
    6.70438865514021320458e-01, /* 0x3fe5743c352b33b9 */
    6.72832547593763097282e-01, /* 0x3fe587d81f732fba */
    6.75217132663749830535e-01, /* 0x3fe59b60f5cfab9d */
    6.77592645519925151909e-01, /* 0x3fe5aed6c5909517 */
    6.79959111179481823228e-01, /* 0x3fe5c2399c244260 */
    6.82316554874748071313e-01, /* 0x3fe5d58987169b18 */
    6.84665002047148862907e-01, /* 0x3fe5e8c6941043cf */
    6.87004478341244895212e-01, /* 0x3fe5fbf0d0d5cc49 */
    6.89335009598845749323e-01, /* 0x3fe60f084b46e05e */
    6.91656621853199760075e-01, /* 0x3fe6220d115d7b8d */
    6.93969341323259825138e-01, /* 0x3fe634ff312d1f3b */
    6.96273194408023488045e-01, /* 0x3fe647deb8e20b8f */
    6.98568207680949848637e-01, /* 0x3fe65aabb6c07b02 */
    7.00854407884450081312e-01, /* 0x3fe66d663923e086 */
    7.03131821924453670469e-01, /* 0x3fe6800e4e7e2857 */
    7.05400476865049030906e-01, /* 0x3fe692a40556fb6a */
    7.07660399923197958039e-01, /* 0x3fe6a5276c4b0575 */
    7.09911618463524796141e-01, /* 0x3fe6b798920b3d98 */
    7.12154159993178659249e-01, /* 0x3fe6c9f7855c3198 */
    7.14388052156768926793e-01, /* 0x3fe6dc44551553ae */
    7.16613322731374569052e-01, /* 0x3fe6ee7f10204aef */
    7.18829999621624415873e-01, /* 0x3fe700a7c5784633 */
    7.21038110854851588272e-01, /* 0x3fe712be84295198 */
    7.23237684576317874097e-01, /* 0x3fe724c35b4fae7b */
    7.25428749044510712274e-01, /* 0x3fe736b65a172dff */
    7.27611332626510676214e-01, /* 0x3fe748978fba8e0f */
    7.29785463793429123314e-01, /* 0x3fe75a670b82d8d8 */
    7.31951171115916565668e-01, /* 0x3fe76c24dcc6c6c0 */
    7.34108483259739652560e-01, /* 0x3fe77dd112ea22c7 */
    7.36257428981428097003e-01, /* 0x3fe78f6bbd5d315e */
    7.38398037123989547936e-01, /* 0x3fe7a0f4eb9c19a2 */
    7.40530336612692630105e-01, /* 0x3fe7b26cad2e50fd */
    7.42654356450917929600e-01, /* 0x3fe7c3d311a6092b */
    7.44770125716075148681e-01, /* 0x3fe7d528289fa093 */
    7.46877673555587429099e-01, /* 0x3fe7e66c01c114fd */
    7.48977029182941400620e-01, /* 0x3fe7f79eacb97898 */
    7.51068221873802288613e-01, /* 0x3fe808c03940694a */
    7.53151280962194302759e-01, /* 0x3fe819d0b7158a4c */
    7.55226235836744863583e-01, /* 0x3fe82ad036000005 */
    7.57293115936992444759e-01, /* 0x3fe83bbec5cdee22 */
    7.59351950749757920178e-01, /* 0x3fe84c9c7653f7ea */
    7.61402769805578416573e-01, /* 0x3fe85d69576cc2c5 */
    7.63445602675201784315e-01, /* 0x3fe86e2578f87ae5 */
    7.65480478966144461950e-01, /* 0x3fe87ed0eadc5a2a */
    7.67507428319308182552e-01, /* 0x3fe88f6bbd023118 */
    7.69526480405658186434e-01, /* 0x3fe89ff5ff57f1f7 */
    7.71537664922959498526e-01, /* 0x3fe8b06fc1cf3dfe */
    7.73541011592573490852e-01, /* 0x3fe8c0d9145cf49d */
    7.75536550156311621507e-01, /* 0x3fe8d13206f8c4ca */
    7.77524310373347682379e-01, /* 0x3fe8e17aa99cc05d */
    7.79504322017186335181e-01, /* 0x3fe8f1b30c44f167 */
    7.81476614872688268854e-01, /* 0x3fe901db3eeef187 */
    7.83441218733151756304e-01, /* 0x3fe911f35199833b */
    7.85398163397448278999e-01}; /* 0x3fe921fb54442d18 */
    /* Some constants. */
    static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */
        piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */
        piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
        three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */
    double u, v, vbyu, q, s, uu, r;
    unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf;
    int xexp, yexp, diffexp;
    /* All work is done on the double-precision images of the arguments. */
    double x = fx;
    double y = fy;
    /* Find properties of arguments x and y. */
    /* NOTE(review): these variables receive 64-bit patterns from
       GET_BITS_DP64 -- confirm `unsigned long` is 64 bits wide on every
       target this file is built for. */
    unsigned long ux, aux, xneg, uy, auy, yneg;
    GET_BITS_DP64(x, ux);
    GET_BITS_DP64(y, uy);
    aux = ux & ~SIGNBIT_DP64;                 /* |x| bit pattern */
    auy = uy & ~SIGNBIT_DP64;                 /* |y| bit pattern */
    xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    xneg = ux & SIGNBIT_DP64;                 /* non-zero iff sign bit set */
    yneg = uy & SIGNBIT_DP64;
    xzero = (aux == 0);
    yzero = (auy == 0);
    xnan = (aux > PINFBITPATT_DP64);
    ynan = (auy > PINFBITPATT_DP64);
    xinf = (aux == PINFBITPATT_DP64);
    yinf = (auy == PINFBITPATT_DP64);
    /* Difference of biased exponents: a cheap estimate of log2(|y|/|x|). */
    diffexp = yexp - xexp;

    /* Special cases */
    if (xnan)
    {
        unsigned int ufx;
        GET_BITS_SP32(fx, ufx);
        return _handle_errorf("atan2f", OP_ATAN2, ufx|0x00400000, _DOMAIN, 0,
                              EDOM, fx, fy, 2);
    }
    else if (ynan)
    {
        unsigned int ufy;
        GET_BITS_SP32(fy, ufy);
        return _handle_errorf("atan2f", OP_ATAN2, ufy|0x00400000, _DOMAIN, 0,
                              EDOM, fx, fy, 2);
    }
    else if (yzero)
    { /* Zero y gives +-0 for positive x
         and +-pi for negative x */
        if (xneg)
        {
            if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
            else return valf_with_flags((float)pi, AMD_F_INEXACT);
        }
        else return (float)y;
    }
    else if (xzero)
    { /* Zero x gives +- pi/2
         depending on sign of y */
        if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
        /* The original omitted this `return`, discarding the value and
           falling through to the diffexp tests below. */
        else return valf_with_flags((float)piby2, AMD_F_INEXACT);
    }

    if (diffexp > 26)
    { /* abs(y)/abs(x) > 2^26 => arctan(x/y)
         is insignificant compared to piby2 */
        if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
        else return valf_with_flags((float)piby2, AMD_F_INEXACT);
    }
    else if (diffexp < -13 && (!xneg))
    { /* x positive and dominant over y by a factor of 2^13.
         In this case atan(y/x) is y/x to machine accuracy. */
        if (diffexp < -150) /* Result underflows */
        {
            if (yneg)
                return valf_with_flags(-0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
            else
                return valf_with_flags(0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
        }
        else
        {
            if (diffexp < -126)
            {
                /* Result will likely be denormalized */
                y = scaleDouble_1(y, 100);
                y /= x;
                /* Now y is 2^100 times the true result. Scale it back down. */
                GET_BITS_DP64(y, uy);
                scaleDownDouble(uy, 100, &uy);
                PUT_BITS_DP64(uy, y);
                if ((uy & EXPBITS_DP64) == 0)
                    return valf_with_flags((float)y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
                else
                    return (float)y;
            }
            else
                return (float)(y / x);
        }
    }
    else if (diffexp < -26 && xneg)
    { /* abs(x)/abs(y) > 2^26 and x < 0 => arctan(y/x)
         is insignificant compared to pi */
        if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
        else return valf_with_flags((float)pi, AMD_F_INEXACT);
    }
    else if (yinf && xinf)
    { /* If abs(x) and abs(y) are both infinity
         return +-pi/4 or +- 3pi/4 according to
         signs. */
        if (xneg)
        {
            if (yneg) return valf_with_flags((float)-three_piby4, AMD_F_INEXACT);
            else return valf_with_flags((float)three_piby4, AMD_F_INEXACT);
        }
        else
        {
            if (yneg) return valf_with_flags((float)-piby4, AMD_F_INEXACT);
            else return valf_with_flags((float)piby4, AMD_F_INEXACT);
        }
    }

    /* General case: take absolute values of arguments */
    u = x; v = y;
    if (xneg) u = -x;
    if (yneg) v = -y;

    /* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
    swap_vu = (u < v);
    if (swap_vu) { uu = u; u = v; v = uu; }
    vbyu = v/u;

    if (vbyu > 0.0625)
    { /* General values of v/u. Use a look-up
         table and series expansion. */
        /* index = nearest j such that j/256 ~ v/u (16 <= j <= 256 here). */
        index = (int)(256*vbyu + 0.5);
        /* r = tan(atan(v/u) - atan(index/256)) by the tangent subtraction
           formula, so atan(v/u) = atan_jby256[index-16] + atan(r). */
        r = (256*v-index*u)/(256*u+index*v);
        /* Polynomial approximation to atan(r) */
        s = r*r;
        q = atan_jby256[index-16] + r - r*s*0.33333333333224095522;
    }
    else if (vbyu < 1.e-4)
    { /* v/u is small enough that atan(v/u) = v/u */
        q = vbyu;
    }
    else /* vbyu <= 0.0625 */
    {
        /* Small values of v/u. Use a series expansion */
        s = vbyu*vbyu;
        q = vbyu -
            vbyu*s*(0.33333333333333170500 -
                    s*(0.19999999999393223405 -
                       s*0.14285713561807169030));
    }

    /* Tidy-up according to which quadrant the arguments lie in */
    if (swap_vu) {q = piby2 - q;}
    if (xneg) {q = pi - q;}
    if (yneg) q = - q;
    return (float)q;
}

View File

@@ -0,0 +1,135 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_VALF_WITH_FLAGS
#define USE_NAN_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_VALF_WITH_FLAGS
#undef USE_NAN_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(atanf)
/*
 * atanf: single-precision arctangent.
 *
 * Parameters:
 *   fx - argument
 *
 * Returns atan(fx) rounded to float.  The argument is widened to double,
 * |x| is reduced to a small residual via
 *     atan(v) = atan(k) + atan((v - k) / (1 + k*v)),   k in {0, 0.5, 1, 1.5},
 * or atan(v) = pi/2 + atan(-1/v) for large v, and the residual is evaluated
 * with a rational approximation.  The sign is restored at the end.
 *
 * A NaN argument returns through _handle_errorf with bit 0x00400000 of the
 * single-precision pattern set (_DOMAIN / EDOM).
 */
float FN_PROTOTYPE(atanf)(float fx)
{
    /* Some constants and split constants. */
    static double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
    double c, v, s, q, z;
    unsigned int xnan;
    double x = fx;
    /* Find properties of argument fx. */
    /* NOTE(review): these variables receive 64-bit patterns from
       GET_BITS_DP64 and are compared against 64-bit literals below --
       confirm `unsigned long` is 64 bits wide on every target. */
    unsigned long ux, aux, xneg;
    GET_BITS_DP64(x, ux);
    aux = ux & ~SIGNBIT_DP64;   /* |x| bit pattern */
    xneg = ux & SIGNBIT_DP64;   /* non-zero iff sign bit set */
    v = x;
    if (xneg) v = -x;           /* v = |x| */

    /* Argument reduction to range [-7/16,7/16] */
    if (aux < 0x3fdc000000000000) /* v < 7./16. */
    {
        x = v;
        c = 0.0;
    }
    else if (aux < 0x3fe6000000000000) /* v < 11./16. */
    {
        x = (2.0*v-1.0)/(2.0+v);
        /* c = arctan(0.5) */
        c = 4.63647609000806093515e-01; /* 0x3fddac670561bb4f */
    }
    else if (aux < 0x3ff3000000000000) /* v < 19./16. */
    {
        x = (v-1.0)/(1.0+v);
        /* c = arctan(1.) */
        c = 7.85398163397448278999e-01; /* 0x3fe921fb54442d18 */
    }
    else if (aux < 0x4003800000000000) /* v < 39./16. */
    {
        x = (v-1.5)/(1.0+1.5*v);
        /* c = arctan(1.5) */
        c = 9.82793723247329054082e-01; /* 0x3fef730bd281f69b */
    }
    else
    {
        xnan = (aux > PINFBITPATT_DP64);
        if (xnan)
        {
            /* x is NaN */
            unsigned int uhx;
            GET_BITS_SP32(fx, uhx);
            return _handle_errorf("atanf", OP_ATAN, uhx|0x00400000, _DOMAIN,
                                  0, EDOM, fx, 0.0F, 1);
        }
        /* The original compared v against the integer literal
           0x4c80000000000000 (about 5.5e18), which is not 2^26; use the
           intended magnitude so the shortcut fires at the documented
           cut-off. */
        else if (v > 67108864.0 /* 2^26 */)
        { /* abs(x) > 2^26 => arctan(1/x) is
             insignificant compared to piby2 */
            if (xneg)
                return valf_with_flags((float)-piby2, AMD_F_INEXACT);
            else
                return valf_with_flags((float)piby2, AMD_F_INEXACT);
        }
        x = -1.0/v;
        /* c = arctan(infinity) */
        c = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
    }

    /* Core approximation: Remez(2,2) on [-7/16,7/16] */
    s = x*x;
    q = x*s*
        (0.296528598819239217902158651186e0 +
         (0.192324546402108583211697690500e0 +
          0.470677934286149214138357545549e-2*s)*s)/
        (0.889585796862432286486651434570e0 +
         (0.111072499995399550138837673349e1 +
          0.299309699959659728404442796915e0*s)*s);

    /* atan(x) ~ x - q; add the reduction constant and restore the sign. */
    z = c - (q - x);
    if (xneg) z = -z;
    return (float)z;
}

View File

@@ -0,0 +1,34 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
double __cdecl _cabs(COMPLEX z)
{
    /* Magnitude of the complex number z: forwards the two components
       (z.x and z.y) to _hypot and returns its result unchanged. */
    double magnitude = _hypot(z.x, z.y);
    return magnitude;
}

View File

@@ -0,0 +1,35 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
float _hypotf(float,float);
float _cabsf(COMPLEX z)
{
    /* Single-precision magnitude of the complex number z: both components
       (z.x and z.y) are narrowed to float before being handed to _hypotf,
       whose result is returned unchanged. */
    float first = (float)z.x;
    float second = (float)z.y;
    return _hypotf(first, second);
}

View File

@@ -0,0 +1,88 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#include "libm_errno.h"
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_HANDLE_ERROR
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(ceil)
/*
 * ceil(x): smallest integral value that is not less than x, as a double.
 *
 * Works directly on the IEEE-754 bit pattern of x:
 *   - NaN                 -> routed through _handle_error; the pattern passed
 *                            as the result has bit 0x0008000000000000 (the
 *                            quiet bit) set
 *   - Inf or |x| >= 2^53  -> x itself (already integral)
 *   - |x| < 1             -> the same zero, -0.0 for negative x, 1.0 otherwise
 *   - everything else     -> fraction bits masked off, plus 1.0 when x was
 *                            positive and not already integral
 *
 * The bit patterns are 64 bits wide, so they are held in (unsigned) long long.
 * Plain long is only 32 bits on LLP64 targets such as 64-bit Windows, where
 * the 64-bit comparisons below would otherwise see a truncated value.
 */
double FN_PROTOTYPE(ceil)(double x)
{
  double r;
  long long rexp;
  int xneg;
  unsigned long long ux, ax, ur, mask;

  GET_BITS_DP64(x, ux);
  ax = ux & (~SIGNBIT_DP64);   /* bit pattern of |x| */
  xneg = (ux != ax);           /* nonzero when the sign bit was set */

  if (ax >= 0x4340000000000000)
    {
      /* abs(x) is either NaN, infinity, or >= 2^53 */
      if (ax > 0x7ff0000000000000)
        /* x is NaN */
        return _handle_error("ceil", OP_CEIL, ux|0x0008000000000000, _DOMAIN, 0,
                             EDOM, x, 0.0, 1);
      else
        return x;
    }
  else if (ax < 0x3ff0000000000000) /* abs(x) < 1.0 */
    {
      if (ax == 0x0000000000000000)
        /* x is +zero or -zero; return the same zero */
        return x;
      else if (xneg) /* x < 0.0 */
        {
          PUT_BITS_DP64(SIGNBIT_DP64, r); /* return -0.0 */
          return r;
        }
      else
        return 1.0;
    }
  else
    {
      /* Unbiased exponent; the two range checks above leave 0 <= rexp <= 52 */
      rexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
      /* Mask out the bits of r that we don't want (the fraction bits) */
      mask = 1;
      mask = (mask << (EXPSHIFTBITS_DP64 - rexp)) - 1;
      ur = (ux & ~mask);
      PUT_BITS_DP64(ur, r);
      if (xneg || (ur == ux))
        /* Truncation toward zero is already the ceiling for negative x,
           and nothing was discarded when x was integral */
        return r;
      else
        /* We threw some bits away and x was positive */
        return r + 1.0;
    }
}

View File

@@ -0,0 +1,86 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#include "libm_errno.h"
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_HANDLE_ERRORF
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(ceilf)
/*
 * ceilf(x): smallest integral value that is not less than x, as a float.
 *
 * Works directly on the IEEE-754 single-precision bit pattern of x:
 *   - NaN                 -> routed through _handle_errorf; the pattern passed
 *                            as the result has the quiet bit (0x00400000) set,
 *                            mirroring what the double-precision ceil does
 *                            with 0x0008000000000000
 *   - Inf or |x| >= 2^24  -> x itself (already integral)
 *   - |x| < 1             -> the same zero, -0.0 for negative x, 1.0F otherwise
 *   - everything else     -> fraction bits masked off, plus 1.0F when x was
 *                            positive and not already integral
 */
float FN_PROTOTYPE(ceilf)(float x)
{
  float r;
  int rexp, xneg;
  unsigned int ux, ax, ur, mask;

  GET_BITS_SP32(x, ux);
  ax = ux & (~SIGNBIT_SP32);   /* bit pattern of |x| */
  xneg = (ux != ax);           /* nonzero when the sign bit was set */

  if (ax >= 0x4b800000)
    {
      /* abs(x) is either NaN, infinity, or >= 2^24 */
      if (ax > 0x7f800000)
        /* x is NaN; hand back a quiet NaN rather than the raw input bits */
        return _handle_errorf("ceilf", OP_CEIL, ux|0x00400000, _DOMAIN, 0, EDOM, x,
                              0.0F, 1);
      else
        return x;
    }
  else if (ax < 0x3f800000) /* abs(x) < 1.0 */
    {
      if (ax == 0x00000000)
        /* x is +zero or -zero; return the same zero */
        return x;
      else if (xneg) /* x < 0.0 */
        {
          PUT_BITS_SP32(SIGNBIT_SP32, r); /* return -0.0 */
          return r;
        }
      else
        return 1.0F;
    }
  else
    {
      /* Unbiased exponent; the two range checks above leave 0 <= rexp <= 23 */
      rexp = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
      /* Mask out the bits of r that we don't want (the fraction bits).
         The mask is built from an unsigned literal to match its type. */
      mask = (1U << (EXPSHIFTBITS_SP32 - rexp)) - 1;
      ur = (ux & ~mask);
      PUT_BITS_SP32(ur, r);
      if (xneg || (ux == ur))
        /* Truncation toward zero is already the ceiling for negative x,
           and nothing was discarded when x was integral */
        return r;
      else
        /* We threw some bits away and x was positive */
        return r + 1.0F;
    }
}

View File

@@ -0,0 +1,533 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the cos function.
;
; Prototype:
;
; double cos(double x);
;
; Computes cos(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
; If FMA3 hardware is available, an FMA3 implementation of cos will be used.
; Read-only data, external references and stack-layout equates used by cos.
.const
ALIGN 16
; pi/2 split into a leading part and progressively smaller tails.  The
; "_1"/"_2" leading parts have their low-order mantissa bits zeroed.
; Each value is followed by a zero qword, making every entry 16 bytes.
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
DQ 0
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
DQ 0
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
DQ 0
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
DQ 0
ALIGN 16
L_one DQ 03FF0000000000000h, 03FF0000000000000h ; 1.0 in both lanes
L_signbit DQ 08000000000000000h, 00000000000000000h ; sign-bit mask (low lane only)
; Integer 1 and 2 in the low qword.  Read as a double, L_int_one is the
; smallest subnormal; the SSE2 tiny-argument path adds it to 1.0.
L_int_one DQ 00000000000000001h, 00000000000000000h
L_int_two DQ 00000000000000002h, 00000000000000000h
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
L_one_half DQ 03FE0000000000000h ; .5
L_neg_one_half DQ 0bfe0000000000000h ; - 0.5
; Thresholds below are compared against the integer bit pattern of |x|.
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
L_piby4 DQ 03FE921FB54442D18h ; pi/4
; Upper bound for the inline two-stage reduction on the SSE2 path
; ("CW" presumably stands for Cody-Waite -- TODO confirm).
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
; Upper bound for __remainder_piby2_fma3_bdl on the FMA3 path.
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
L_sign_mask DQ 07FFFFFFFFFFFFFFFh ; clears the sign bit when ANDed
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
; Polynomial coefficient tables (defined elsewhere); the code reads them at
; 10h-byte strides from offset 0 through 50h.
EXTRN __Lcosarray:QWORD
EXTRN __Lsinarray:QWORD
; Run-time switch: nonzero selects the FMA3 code path.
EXTRN __use_fma3_lib:DWORD
; local storage offsets
; (rsp-relative, valid after the prologue has allocated stack_size bytes)
p_temp EQU 020h ; temporary for get/put bits operation
p_temp1 EQU 030h ; temporary for get/put bits operation
dummy_space EQU 040h
stack_size EQU 068h
; fm.inc presumably supplies the StackAllocate / StackDeallocate macros used
; by the procedure -- it is not part of this file.
include fm.inc
fname TEXTEQU <cos>
fname_special TEXTEQU <_cos_special>
;Define name and any external functions being called
EXTERN __remainder_piby2_forAsm : PROC
EXTERN __remainder_piby2_fma3 : PROC
EXTERN __remainder_piby2_fma3_bdl : PROC
EXTERN fname_special : PROC
.code
PUBLIC fname
;-----------------------------------------------------------------------
; double cos(double x)
;
; In:   xmm0 = x
; Out:  xmm0 = cos(x)
;
; Two implementations share this procedure; __use_fma3_lib selects one at
; run time:
;   Lcos_sse2  - SSE2 code path
;   L_cos_fma3 - FMA3/AVX code path
; Both follow the same plan:
;   1. |x| below pi/4 is evaluated directly (with shortcuts for very
;      small |x|).
;   2. Otherwise |x| is reduced to r + rr plus a region number in eax/rax.
;   3. The low bit of the region picks the cosine (0) or sine (1)
;      polynomial, with coefficients from __Lcosarray / __Lsinarray.
;   4. The result is negated when (region + 1) & 2 is nonzero.
; NaN and infinity are handed to fname_special (_cos_special).
;
; Stack: the frame is set up and torn down with the StackAllocate /
; StackDeallocate macros (from fm.inc, not shown here).  p_temp and
; p_temp1 are rsp-relative scratch slots used by the SSE2 path.
;-----------------------------------------------------------------------
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0 ; nonzero selects the FMA3 implementation
jne L_cos_fma3
Lcos_sse2:
movd rdx, xmm0 ; rdx <-- bit pattern of x
xorpd xmm2, xmm2 ; zeroed out for later use
mov r10, rdx
btr r10, 63 ; r10 <-- |x|
cmp r10, L_piby4
jb Lcos_sse2_absx_lt_piby4 ; |x| < pi/4: no reduction needed
Lcos_absx_nlt_piby4: ; common case
; Here rdx has x, r10 has |x|
movd xmm0, r10 ; xmm0 <-- |x|
cmp r10, QWORD PTR L_small_arg_cw
jae Lcos_reduce_precise ; Note NaN/Inf will branch
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
; Note that if |x| were too large, conversion of npi2 to integer would fail.
; We reduce the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
movapd xmm2, xmm0
mulsd xmm2, L_2_by_pi
movapd xmm4, xmm0
; xexp = ax >> EXPSHIFTBITS_DP64;
mov r9, r10
shr r9, 52 ; >>EXPSHIFTBITS_DP64
; How many pi/2 is |x| a multiple of?
; npi2 = (int)(x * twobypi + 0.5);
addsd xmm2, L_one_half ; npi2
movsd xmm3, L_real_piby2_1
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer
movsd xmm1, L_real_piby2_1tail
cvtdq2pd xmm2, xmm0 ; and back to double.
; Subtract the multiple from x to get an extra-precision remainder
; rhead = x - npi2 * piby2_1;
mulsd xmm3, xmm2
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_1tail;
mulsd xmm1, xmm2 ; rtail
movd eax, xmm0 ; eax <-- npi2
; GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
movapd xmm0, xmm4
subsd xmm0, xmm1
movsd xmm3, L_real_piby2_2
movd rcx, xmm0 ; rcx <-- rhead - rtail
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
; xmm4=rhead xmm5= temp for calc
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
; expdiff measures how close rhead - rtail is to |x|
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
shl rcx, 1 ; strip any sign bit
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
sub r9, rcx ; expdiff
;; if (expdiff > 15)
cmp r9, 15
jle Lcos_sse2_cw_reduction_done
; Here the remainder is pretty small compared with x, which
; implies that x is a near multiple of pi/2
; (x matches the multiple to at least 15 bits)
; So we do another stage of argument reduction.
; t = rhead;
movapd xmm1, xmm4
; rtail = npi2 * piby2_2;
mulsd xmm3, xmm2
; rhead = t - rtail;
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subsd xmm1, xmm4 ; t - rhead
subsd xmm1, xmm3 ; -rtail
subsd xmm5, xmm1 ; rtail
; r = rhead - rtail;
movapd xmm0, xmm4
;HARSHA
;xmm1=rtail
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
subsd xmm0, xmm5
; xmm0=r, xmm4=rhead, xmm1=rtail
Lcos_sse2_cw_reduction_done:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; if the input was close to a pi/2 multiple
; The original NAG code missed this trick.
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
; then the cos is either ~ 1.0 or ~r, to within 53 bits.
; NOTE: Unfortunately, this introduces two jcc instructions close to each
; other and to other branches. As r < 2^-13 should be rather uncommon,
; the problems for branch prediction outweigh the computational savings. - WAT
; (No such small-r shortcut is present in the code that follows; only the
; computation of rr remains.)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region = npi2 & 3;
subsd xmm4, xmm0 ; rhead-r
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
Lcos_piby4:
; Entry state: xmm0 = r, xmm4 = rr, eax = region.
; perform taylor series to calc sinx or cosx
; x2 = r * r;
;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the cos path
;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
movapd xmm3, xmm0
movapd xmm2, xmm0
mulsd xmm2, xmm0 ;x2
bt eax,0 ; CF <-- low bit of region
jnc Lcos_sse2_calc_cos ; even region: cosine polynomial
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 1 or 3 do a sin calculation
; (x and xx are spilled to p_temp1 / p_temp because xmm0 and xmm4 are
; reused as temporaries below)
movsd xmm3, __Lsinarray+50h ; s6
mulsd xmm3, xmm2 ; x2s6
movsd xmm5, __Lsinarray+20h ; s3
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
movapd xmm1, xmm2 ; move for x4
mulsd xmm1, xmm2 ; x4
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
mulsd xmm5, xmm2 ; x2s3
movapd xmm4, xmm0 ; move for x3
addsd xmm3, __Lsinarray+40h ; s5+x2s6
mulsd xmm1, xmm2 ; x6
mulsd xmm3, xmm2 ; x2(s5+x2s6)
mulsd xmm4, xmm2 ; x3
addsd xmm5, __Lsinarray+10h ; s2+x2s3
mulsd xmm5, xmm2 ; x2(s2+x2s3)
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
mulsd xmm2, L_one_half ; 0.5 *x2
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
addsd xmm3, xmm5 ; zs
mulsd xmm4, xmm3 ; *x3
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
addsd xmm0, xmm4 ; +xx
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
jmp Lcos_sse2_adjust_region
ALIGN 16
Lcos_sse2_calc_cos:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 or 2 - do a cos calculation
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
; Entry state: xmm0 = x, xmm2 = x2, xmm4 = xx.
mulsd xmm4, xmm0 ; x*xx
movsd xmm5, L_one_half
movsd xmm1, __Lcosarray+50h ; c6
movsd xmm0, __Lcosarray+20h ; c3
mulsd xmm5, xmm2 ; r = 0.5 *x2
movapd xmm3, xmm2 ; copy of x2
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
mulsd xmm1, xmm2 ; c6*x2
mulsd xmm0, xmm2 ; c3*x2
subsd xmm5, L_one ; -t=r-1.0, trash r
mulsd xmm3, xmm2 ; x4
addsd xmm1, __Lcosarray+40h ; c5+x2c6
addsd xmm0, __Lcosarray+10h ; c2+x2C3
addsd xmm5, L_one ; 1 + (-t), trash t
mulsd xmm3, xmm2 ; x6
mulsd xmm1, xmm2 ; x2(c5+x2c6)
mulsd xmm0, xmm2 ; x2(c2+x2C3)
movapd xmm4, xmm2 ; copy of x2
mulsd xmm4, L_one_half ; r recalculate
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
mulsd xmm2, xmm2 ; x4 recalculate
subsd xmm5, xmm4 ; (1 + (-t)) - r
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
addsd xmm0, xmm1 ; zc
subsd xmm4, L_one ; t recalculate (xmm4 <-- r - 1.0 = -t)
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
mulsd xmm0, xmm2 ; x4 * zc
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
subsd xmm0, xmm4 ; result - (-t)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lcos_sse2_adjust_region:
; switch (region)
; (eax + 1) & 2 is nonzero exactly when the low two bits of eax are 1 or 2
add eax, 1
and eax, 2
jz Lcos_sse2_cleanup
;; if the original region 1 or 2 then we negate the result.
movapd xmm2, xmm0
xorpd xmm0, xmm0
subsd xmm0, xmm2 ; xmm0 <-- 0.0 - result
ALIGN 16
Lcos_sse2_cleanup:
StackDeallocate stack_size
ret
ALIGN 16
Lcos_sse2_absx_lt_piby4:
; Reached with xmm0 = x (unmodified) and r10 = |x| < pi/4.
; cos = cos_piby4(x, 0.0);
; x2 = r * r;
cmp r10, L_two_to_neg_13
jb Lcos_sse2_x_small
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; x2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 - do a cos calculation
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
movsd xmm1, __Lcosarray+10h ; c2
movapd xmm4, xmm2 ; move for x4
mulsd xmm4, xmm2 ; x4
movsd xmm3, __Lcosarray+30h ; c4
mulsd xmm1, xmm2 ; c2x2
movsd xmm5, __Lcosarray+50h ; c6
mulsd xmm3, xmm2 ; c4x2
movapd xmm0, xmm4 ; move for x8
mulsd xmm5, xmm2 ; c6x2
mulsd xmm0, xmm4 ; x8
addsd xmm1, __Lcosarray ; c1 + c2x2
mulsd xmm1, xmm4 ; c1x4 + c2x6
addsd xmm3, __Lcosarray+20h ; c3 + c4x2
mulsd xmm2, L_neg_one_half ; -0.5x2, destroy xmm2
addsd xmm5, __Lcosarray+40h ; c5 + c6x2
mulsd xmm3, xmm0 ; c3x8 + c4x10
mulsd xmm4, xmm0 ; x12
mulsd xmm4, xmm5 ; c5x12 + c6x14
movsd xmm0, L_one
addsd xmm1, xmm3 ; c1x4 + c2x6 + c3x8 + c4x10
movapd xmm3, xmm2 ; preserve -0.5x2
addsd xmm2, xmm0 ; t = 1 - 0.5x2
subsd xmm0, xmm2 ; 1-t
addsd xmm0, xmm3 ; (1-t) - r
addsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
addsd xmm0, xmm1 ; (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
addsd xmm0, xmm2 ; 1 - 0.5x2 + above
StackDeallocate stack_size
ret
ALIGN 16
Lcos_sse2_x_small:
; |x| < 2^-13: cos(x) is taken as 1 - 0.5*x*x, or as 1 for |x| < 2^-27
movsd xmm2, xmm0 ; xmm2 <-- x
movsd xmm0, L_one
cmp r10, L_two_to_neg_27
jb Lcos_sse2_x_smaller
mulsd xmm2, xmm2 ; x*x
mulsd xmm2, L_one_half ; 0.5*x*x
subsd xmm0, xmm2 ; 1.0 - 0.5*x*x
StackDeallocate stack_size
ret
ALIGN 16
Lcos_sse2_x_smaller:
movsd xmm0, L_one
addsd xmm0, L_int_one ; really adding smallest subnormal; set inexact
StackDeallocate stack_size
ret
ALIGN 16
Lcos_reduce_precise:
; Reached with xmm0 = |x| and r10 = bit pattern of |x| >= L_small_arg_cw.
; Reduce x into range [-pi/4, pi/4]
cmp r10, L__inf_mask_64
jae Lcos_x_naninf
call __remainder_piby2_forAsm
; At this point xmm0 has r, xmm1 has rr, rax has region
movapd xmm4, xmm1 ; xmm4 <-- rr
jmp Lcos_piby4
; xmm0 = x, xmm4 = xx, eax= region
ALIGN 16
Lcos_x_naninf:
; Shared by both code paths.  When reached from the SSE2 path xmm0 already
; holds |x|; when reached from the FMA3 path xmm0 is still the original x.
call fname_special
StackDeallocate stack_size
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; From this point we assume that FMA3 and AVX hardware are present.
ALIGN 16
L_cos_fma3:
vmovq r9,xmm0 ; r9 <-- bit pattern of x
mov rax,r9 ; rax keeps the unmasked bits for the NaN/Inf test
and r9,L_sign_mask ; clear sign
Lcos_early_exit_s_1: ;; unused label
; r9 has its sign bit cleared, so the signed compares below behave like
; unsigned compares on |x|.
cmp r9,L_piby4
jg Lcos_early_exit_s ; Note that NaN will branch
cmp r9,L_two_to_neg_13
jge Lcompute_cos_pyby_4
cmp r9,L_two_to_neg_27
jge Lcompute_1_xx_5
vmovq xmm0,L_one ; for tiniest args, cos is 1
jmp Lreturn_no_restore
Lcompute_1_xx_5:
vmulsd xmm1,xmm0,L_one_half ; xmm1l <-- .5*x
vfnmadd213sd xmm0,xmm1,L_one ; xmm0l <-- 1.0 - (.5*x)*x
jmp Lreturn_no_restore
Lcompute_cos_pyby_4:
; make sure this is accurate enough
; note that x^2 can't be all that close to 1 here
; Horner evaluation in xx; each vfmadd213sd computes xmm0 <-- xmm0*xx + next.
; NOTE: the coefficient at +050h is labelled c5 here but c6 in the SSE2
; path; the two paths number the same table entries differently.
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- xx = x*x
vmovapd xmm0,__Lcosarray+050h ; xmm0 <-- c5
vfmadd213sd xmm0,xmm3,__Lcosarray+040h ; xmm0 <-- c5*xx + c4
vfmadd213sd xmm0,xmm3,__Lcosarray+030h ; xmm0 <-- (c5*xx + c4)*xx + c3
vfmadd213sd xmm0,xmm3,__Lcosarray+020h
vfmadd213sd xmm0,xmm3,__Lcosarray+010h
vfmadd213sd xmm0,xmm3,__Lcosarray
vfmsub213sd xmm0,xmm3,L_one_half ; xmm0 <-- xmm0*xx - 0.5
vfmadd213sd xmm0,xmm3,L_one ; xmm0 <-- xmm0*xx + 1.0
jmp Lreturn_no_restore
Lcos_early_exit_s:
; |x| > pi/4 (or NaN): separate NaN/Inf (all exponent bits set) from
; finite arguments
mov r8,L__inf_mask_64
and rax,r8
cmp rax, r8
jz Lcos_x_naninf
Lrange_reduce:
vmovq xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_small_arg_bdl
jae Lcos_remainder_piby2
; For __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
; on input
; x is in xmm0
; on output
; r is in xmm0
; rr is in xmm1
; region is in rax
; Boldo-Daumas-Li reduction for reasonably small |x|
call __remainder_piby2_fma3_bdl
;; if region is 0 or 2 do a cos calc.
;; if region is 1 or 3 do a sin calc.
Lcos_exit_s:
bt rax,0 ; CF <-- low bit of region
jc Lsin_piby4_compute
Lcos_piby4_compute: ;; unused label
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
; (xmm0 = x = r, xmm1 = xx = rr)
vmovapd xmm2,L_one
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x * x
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
vsubsd xmm2,xmm2,xmm4 ; 1-t
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
vmovapd xmm5,__Lcosarray+040h
vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2
vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1
vfmadd231sd xmm5,xmm3,__Lcosarray+050h ; start Horner: [+40h] + x2*[+50h]
vfmadd213sd xmm5,xmm3,__Lcosarray+030h
vfmadd213sd xmm5,xmm3,__Lcosarray+020h
vfmadd213sd xmm5,xmm3,__Lcosarray+010h
vfmadd213sd xmm5,xmm3,__Lcosarray
vfmadd213sd xmm5,xmm1,xmm2 ; xmm5 <-- poly*x4 + (((1-t) - r) - x*xx)
vaddsd xmm0,xmm5,xmm4 ; xmm0 <-- above + t
jmp Lcos_exit_s_1
ALIGN 16
Lsin_piby4_compute:
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
; (xmm0 = x = r, xmm1 = xx = rr)
vmovapd xmm5,__Lsinarray+040h
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
vfmadd231sd xmm5,xmm3,__Lsinarray+050h
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*xx
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*xx - x*x*x*r
vmulsd xmm2,xmm3,xmm2 ; xmm2 <-- x2*(.5*xx - x3*r)
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- x2*(.5*xx - x3*r) - xx
vfnmadd231sd xmm2, xmm4,__Lsinarray ; xmm2 <-- xmm2 - x3*[__Lsinarray]
vsubsd xmm0,xmm0,xmm2 ; xmm0 <-- x - xmm2
Lcos_exit_s_1:
; Flip the sign of the result when (region + 1) & 2 is nonzero.
xor r8,r8
add eax, 1
and eax, 2
cmovnz r8, L_signbit ; r8 <-- sign-bit mask, or stays 0
vmovq xmm3,r8
vxorpd xmm0,xmm0,xmm3
; Both return labels below run the same epilogue.
Lreturn_restore_regs:
StackDeallocate stack_size
ret
Lreturn_no_restore:
StackDeallocate stack_size
ret
ALIGN 16
Lcos_remainder_piby2:
; argument reduction for general x
call __remainder_piby2_fma3
jmp Lcos_exit_s
fname endp
END

View File

@@ -0,0 +1,525 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the cosf function.
;
; Prototype:
;
; float cosf(float x);
;
; Computes cosf(x).
; Based on the NAG C implementation.
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Original Author: Harsha Jagasia
; Read-only data, external references and stack-layout equates used by cosf.
; The computation is carried out in double precision, so the constants are
; doubles; most are padded to 16 bytes.
.const
ALIGN 16
L_real_one DQ 03ff0000000000000h ; 1.0
DQ 0 ; for alignment
L_one_half DQ 03fe0000000000000h ; 0.5
DQ 0
L_2bypi DQ 03fe45f306dc9c883h ; 2./pi
DQ 0
L_one_sixth DQ 03fc5555555555555h ; 0.166666666666
DQ 0
; NOTE: despite the name, this bit pattern is pi/4 (pi/2 would be
; 03ff921fb54442d18h).  The FMA3 path uses it as its "no reduction needed"
; threshold.
L_piby2 DQ 03fe921fb54442d18h
DQ 0
; pi/2 split into a leading part and progressively smaller tails.  The
; "_1"/"_2" leading parts have their low-order mantissa bits zeroed.
L_piby2_1 DQ 03ff921fb54400000h ; piby2_1
DQ 0
L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
DQ 0
L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
DQ 0
L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
DQ 0
; Upper bounds (as double bit patterns of |x|) for the inline reductions.
L_large_x_sse2 DQ 0411E848000000000h ; 5e5
DQ 0
L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9
DQ 0
L_sign_mask DQ 07FFFFFFFFFFFFFFFh ; clears the sign bit when ANDed
DQ 07FFFFFFFFFFFFFFFh
L__int_three DQ 00000000000000003h ; mask for region = npi2 & 3
DQ 00000000000000003h
; Smallest normal double; the FMA3 tiny-argument path adds it to 1.0.
L__min_norm_double DQ 00010000000000000h
DQ 00010000000000000h
L_two_to_neg_7 DQ 03f80000000000000h ; 2^-7
DQ 0
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
DQ 0
L_inf_mask_32 DD 07F800000h ; single-precision exponent mask (+Inf)
DQ 0
fname TEXTEQU <cosf>
fname_special TEXTEQU <_cosf_special>
;Define name and any external functions being called
EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR
EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR
EXTERN __remainder_piby2_fma3 : PROC ; NEAR
EXTERN fname_special : PROC
EXTERN _set_statfp : PROC ; declared but not called by the code in this file
; Polynomial coefficient tables (defined elsewhere); the code reads them at
; 8-byte strides (offsets 0 through 18h for sin, 0 through 20h for cos).
EXTRN __Lcosfarray:QWORD
EXTRN __Lsinfarray:QWORD
; Run-time switch: nonzero selects the FMA3 code path.
EXTRN __use_fma3_lib:DWORD
; define local variable storage offsets
; (rsp-relative, valid after the prologue has allocated stack_size bytes)
p_temp equ 020h ; temporary for get/put bits operation
p_temp1 equ 030h ; temporary for get/put bits operation
dummy_space EQU 040h
stack_size EQU 068h
; fm.inc presumably supplies the StackAllocate / StackDeallocate macros used
; by the procedure -- it is not part of this file.
include fm.inc
.code
ALIGN 16
PUBLIC fname
;-----------------------------------------------------------------------
; float cosf(float x)
;
; In:   xmm0 = x (single precision)
; Out:  xmm0 = cosf(x) (single precision)
;
; Two implementations share this procedure; __use_fma3_lib selects one at
; run time:
;   Lcosf_sse2 - SSE2 code path
;   Lcosf_fma3 - FMA3/AVX code path
; Both follow the same plan:
;   1. NaN/Inf is detected on the raw float bits and handed to
;      fname_special (_cosf_special).
;   2. x is widened to double and all arithmetic is done in double.
;   3. |x| above pi/4 is reduced to r plus a region number in eax/rax.
;   4. The low bit of the region picks the cosine (0) or sine (1)
;      polynomial, with coefficients from __Lcosfarray / __Lsinfarray.
;   5. The result is negated when (region + 1) & 2 is nonzero, then
;      narrowed back to float.
;
; Stack: the frame is set up and torn down with the StackAllocate /
; StackDeallocate macros (from fm.inc, not shown here).  p_temp and
; p_temp1 are rsp-relative scratch slots.
;-----------------------------------------------------------------------
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0 ; nonzero selects the FMA3 implementation
jne Lcosf_fma3
Lcosf_sse2:
xorpd xmm2, xmm2 ; zeroed out for later use
;; if NaN or inf
movd edx, xmm0 ; edx <-- raw float bits of x
mov eax, 07f800000h ; single-precision exponent mask
mov r10d, eax
and r10d, edx
cmp r10d, eax ; all exponent bits set?
jz Lcosf_sse2_naninf
cvtss2sd xmm0, xmm0 ; widen x to double
movd rdx, xmm0 ; rdx <-- double bit pattern of x
; ax = (ux & ~SIGNBIT_DP64);
mov r10, rdx
btr r10, 63 ; r10 <-- |x|
mov r8d, 1 ; for determining region later on
movapd xmm1, xmm0 ; xmm1 <-- copy of x
;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
mov rax, 03fe921fb54442d18h
cmp r10, rax
jg Lcosf_sse2_absx_gt_piby4
; *c = cos_piby4(x, 0.0);
movapd xmm2, xmm0
mulsd xmm2, xmm2 ;x^2
xor eax, eax ; region 0: the cosine polynomial is selected below
mov rdx, r10
movsd xmm5, QWORD PTR L_one_half
jmp Lcosf_sse2_calc_sincosf_piby4 ; done
ALIGN 16
Lcosf_sse2_absx_gt_piby4:
; reduce the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
; xneg = (ax != ux);
movd xmm0, r10 ; xmm0 <-- |x|
cmp r10, QWORD PTR L_large_x_sse2
jae Lcosf_sse2_reduce_precise
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; xmm0=abs(x), xmm1=x
;/* How many pi/2 is x a multiple of? */
movapd xmm2, xmm0
movsd xmm3, QWORD PTR L_2bypi
movapd xmm4, xmm0
movsd xmm5, QWORD PTR L_one_half
mulsd xmm2, xmm3
; movsd xmm5, QWORD PTR L_one_half
; movapd xmm2, xmm0
; mulsd xmm2, QWORD PTR L_2bypi
; movapd xmm4, xmm0
mov r9, r10
shr r9, 52 ; r9 <-- biased exponent of x
; npi2 = (int)(x * twobypi + 0.5);
addsd xmm2, xmm5 ; npi2
movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1
cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2
movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail
cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2
; Subtract the multiple from x to get an extra-precision remainder
; rhead = x - npi2 * piby2_1;
mulsd xmm3, xmm2 ; use piby2_1
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_1tail;
mulsd xmm1, xmm2 ; rtail
movd eax, xmm0 ; eax <-- npi2 (region before masking)
; GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
movapd xmm0, xmm4
subsd xmm0, xmm1
movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2
movd rcx, xmm0 ; rcx <-- rhead-rtail
movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail
; region = npi2 & 3;
; and eax, 3
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
shl rcx, 1 ; strip any sign bit
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
sub r9, rcx ; expdiff
;; if (expdiff > 15)
cmp r9, 15
jle Lcosf_sse2_expdiff_le_15
; The remainder is pretty small compared with x, which
; implies that x is a near multiple of pi/2
; (x matches the multiple to at least 15 bits)
; t = rhead;
movapd xmm1, xmm4
; rtail = npi2 * piby2_2;
mulsd xmm3, xmm2
; rhead = t - rtail;
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subsd xmm1, xmm4 ; t - rhead
subsd xmm1, xmm3 ; -rtail
subsd xmm5, xmm1 ; rtail
; r = rhead - rtail;
movapd xmm0, xmm4
;HARSHA
;xmm1=rtail
movapd xmm1, xmm5
subsd xmm0, xmm5
; xmm0=r, xmm4=rhead, xmm1=rtail
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lcosf_sse2_expdiff_le_15:
; rcx still holds the biased exponent of the first-stage rhead-rtail; it is
; not recomputed after the optional second reduction stage above.
cmp rcx, 03f2h ; is r < 2^-13 ?
jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not
cmp rcx, 03deh ; is r < 2^-33 ?
jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; xmm2 <-- x^2
;; if region is 1 or 3 do a sinf calc.
and r8d, eax ; r8d <-- region & 1
jz Lcosf_sse2_r_small_calc_sin
; NOTE: the two label names below are swapped relative to what the code under
; them computes: "..._calc_cos" evaluates the sine approximation and
; "..._calc_sin" evaluates the cosine approximation.
Lcosf_sse2_r_small_calc_cos:
; region 1 or 3
; use simply polynomial
; *s = x - x*x*x*0.166666666666666666;
movsd xmm3, QWORD PTR L_one_sixth
mulsd xmm3, xmm0 ; * x
mulsd xmm3, xmm2 ; * x^2
subsd xmm0, xmm3 ; xs
jmp Lcosf_sse2_adjust_region
ALIGN 16
Lcosf_sse2_r_small_calc_sin:
; region 0 or 2
; cos = 1.0 - x*x*0.5;
movsd xmm0, QWORD PTR L_real_one ; 1.0
mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2
subsd xmm0, xmm2
jmp Lcosf_sse2_adjust_region
ALIGN 16
Lcosf_sse2_r_very_small:
; then sin(r) = r
; if region is 1 or 3 do a sin calc.
and r8d, eax ; r8d <-- region & 1
jnz Lcosf_sse2_adjust_region ; odd region: result is r, already in xmm0
movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1
; By this point, calculations should already have set inexact
jmp Lcosf_sse2_adjust_region
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lcosf_sse2_reduce_precise:
; Reduce abs(x) into range [-pi/4, pi/4]
; remainder_piby2d2f(ax, &r, &region);
; (reached with xmm0 = |x| as a double)
mov QWORD PTR p_temp[rsp], rdx ; save ux for use later
mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later
call __remainder_piby2d2f_forAsm
mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later
mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later
mov r8d, 1 ; for determining region later
; Reduced argument is in xmm0. No second word; after all, we started in
; single precision. Region is in rax.
movapd xmm1, xmm0
movsd xmm5, QWORD PTR L_one_half
jmp Lcosf_sse2_calc_sincosf_piby4
; done with reducing the argument. Now perform the sin/cos calculations.
ALIGN 16
Lcosf_sse2_calc_sincosf_piby4:
; Entry state: xmm0 = reduced argument, eax = region, r8d = 1.
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; x^2
;; if region is 0 or 2, do a cosf calc
and r8d, eax ; r8d <-- region & 1
jz Lcosf_sse2_do_cosf_calc
; region is 1 or 3: do a sinf calc.
Lcosf_sse2_do_sinf_calc:
movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4
mulsd xmm1, xmm2 ; s4x2
movsd xmm4, xmm2 ; move for x4
mulsd xmm4, xmm2 ; x4
movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2
mulsd xmm5, xmm2 ; s2x2
movsd xmm3, xmm0 ; move for x3
mulsd xmm3, xmm2 ; x3
addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
mulsd xmm1, xmm4 ; s3x4+s4x6
addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6
mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)
addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)
jmp Lcosf_sse2_adjust_region
ALIGN 16
Lcosf_sse2_do_cosf_calc:
; region 0 or 2 - do a cos calculation
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4
movsd xmm4, xmm2 ; move for x4
mulsd xmm1, xmm2 ; c4x2
movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2
mulsd xmm4, xmm2 ; x4
movsd xmm0, QWORD PTR __Lcosfarray ; c0
mulsd xmm3, xmm2 ; c2x2
mulsd xmm0, xmm2 ; c0x2 (=-0.5x2)
addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2
mulsd xmm1, xmm4 ; c3x4 + c4x6
addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2
addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6
mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10
addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2
addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
Lcosf_sse2_adjust_region:
; xmm0 holds the cos or sin value produced by whichever section ran above.
; switch (region)
; (eax + 1) & 2 is nonzero exactly when the low two bits of eax are 1 or 2
add eax, 1
and eax, 2
jz Lcosf_sse2_cleanup
;; if region 1 or 2 then we negate the result.
xorpd xmm2, xmm2
subsd xmm2, xmm0 ; xmm2 <-- 0.0 - result
movapd xmm0, xmm2
ALIGN 16
Lcosf_sse2_cleanup:
cvtsd2ss xmm0, xmm0 ; narrow the double result back to float
StackDeallocate stack_size
ret
Lcosf_sse2_naninf:
; xmm0 still holds the original single-precision x here
call fname_special
StackDeallocate stack_size
ret
ALIGN 16
Lcosf_fma3:
; NaN/Inf test on the raw float bits (all exponent bits set)
vmovd eax,xmm0
mov r8d,L_inf_mask_32
and eax,r8d
cmp eax, r8d
jz Lcosf_fma3_naninf
vcvtss2sd xmm5,xmm0,xmm0 ; xmm5 <-- (double)x
vmovq r9,xmm5
btr r9,63 ;clear sign
; r9 has its sign bit cleared, so the signed compares below behave like
; unsigned compares on |x|.  L_piby2 holds the pi/4 bit pattern.
cmp r9,L_piby2
jg Lcosf_fma3_range_reduce
cmp r9,L_two_to_neg_7
jge Lcosf_fma3_compute_cosf_piby_4
cmp r9,L_two_to_neg_13
jge Lcosf_fma3_compute_1_xx_5
; |x| < 2^-13: the result is 1.0
vmovq xmm0,QWORD PTR L_real_one
; Here we need to set inexact
vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact
jmp Lcosf_fma3_return
ALIGN 16
Lcosf_fma3_compute_1_xx_5:
vmulsd xmm0,xmm5,QWORD PTR L_one_half ; xmm0 <-- 0.5*x
vfnmadd213sd xmm0,xmm5,L_real_one ; xmm0 <-- 1.0 - (0.5*x)*x
jmp Lcosf_fma3_return
ALIGN 16
Lcosf_fma3_compute_cosf_piby_4:
; 2^-7 <= |x| <= pi/4: same polynomial as Lcosf_piby4_compute, applied to x
; itself.  (The first instruction is a legacy-SSE encoded move; everything
; else on this path is VEX encoded.)
movsd xmm0,xmm5
vmovapd xmm2,L_real_one
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- 1.0 - r
vmovsd xmm1,__Lcosfarray+018h
vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; [+18h] + x^2*[+20h]
vfmadd213sd xmm1,xmm3,__Lcosfarray+010h
vfmadd213sd xmm1,xmm3,__Lcosfarray+008h
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
vmovdqa xmm0,xmm2
vfmadd231sd xmm0,xmm1,xmm3 ; xmm0 <-- (1.0 - r) + poly*x^4
jmp Lcosf_fma3_return
ALIGN 16
Lcosf_fma3_range_reduce:
vmovq xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_large_x_fma3
jge Lcosf_reduce_precise
;cosff_range_e_5_s:
; Inline reduction: npi2 = (int)(|x|*2/pi + 0.5), region = npi2 & 3,
; r = (|x| - npi2*piby2_1) - npi2*piby2_1tail
vandpd xmm1,xmm0,L_sign_mask
vmovapd xmm2,L_2bypi
vfmadd213sd xmm2,xmm1,L_one_half ; xmm2 <-- |x|*2/pi + 0.5
vcvttpd2dq xmm2,xmm2 ; truncate to 32-bit integers
vpmovsxdq xmm1,xmm2 ; sign-extend the dwords to qwords
vandpd xmm4,xmm1,L__int_three ; region xmm4
vshufps xmm1 ,xmm1,xmm1,8 ; move the low dword of each qword into dwords 0 and 1
vcvtdq2pd xmm1,xmm1 ; xmm1 <-- (double)npi2
vmovdqa xmm2,xmm0
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0
vsubsd xmm2,xmm2,xmm0 ; rhead - r
vsubsd xmm1,xmm2,xmm3 ; rr = (rhead - r) - rtail
; NOTE: the polynomial code at Lcosf_exit_s overwrites xmm1 without reading
; it, so rr is not used for the single-precision result.
vmovq rax,xmm4 ; rax <-- region
jmp Lcosf_exit_s
ALIGN 16
Lcosf_reduce_precise:
; NOTE(review): the only branch to this label is the jge above, taken on the
; same comparison that is repeated below.  The jge below therefore appears to
; be always taken, and the __remainder_piby2_fma3_bdl call looks unreachable.
vmovq xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_large_x_fma3
jge Lcos_remainder_piby2
; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
; have the following conventions:
; on input
; x is in xmm0
; on output
; r is in xmm0
; rr is in xmm1
; region is in rax
; The _bdl routine is guaranteed not to touch r10
Lcos_remainder_piby2_small: ;; unused label
; Boldo-Daumas-Li reduction for reasonably small |x|
call __remainder_piby2_fma3_bdl
jmp Lcosf_exit_s
ALIGN 16
Lcos_remainder_piby2:
; argument reduction for general x
call __remainder_piby2_fma3
Lcosf_exit_s:
; Entry state: xmm0 = r, rax = region.
bt rax,0 ; CF <-- low bit of region
jnc Lcosf_piby4_compute ; even region: cosine polynomial
;sinf_piby4_compute:
; vmovapd xmm1,__Lsinfarray+010h
vmovsd xmm1,__Lsinfarray+010h
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
vfmadd231sd xmm1,xmm3,__Lsinfarray+018h ; [+10h] + x^2*[+18h]
vfmadd213sd xmm1,xmm3,__Lsinfarray+008h
vfmadd213sd xmm1,xmm3,__Lsinfarray
vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3
vfmadd231sd xmm0,xmm1,xmm3 ; xmm0 <-- x + poly*x^3
jmp Lcosf_fma3_adjust_sign
ALIGN 16
Lcosf_piby4_compute:
vmovapd xmm2,L_real_one
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- 1.0 - r
vmovsd xmm1,__Lcosfarray+018h
vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h ; [+18h] + x^2*[+20h]
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
vmovdqa xmm0, xmm2
vfmadd231sd xmm0 ,xmm1,xmm3 ; xmm0 <-- (1.0 - r) + poly*x^4
Lcosf_fma3_adjust_sign:
; assuming FMA3 ==> AVX ==> SSE4.1
; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one
; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two
; vorpd xmm3,xmm2,xmm1
; vandpd xmm3,xmm3,L_signbit
; The three shifts leave only bit 1 of (region + 1), moved to bit 63; the
; sign is therefore flipped exactly when (region + 1) & 2 is nonzero.
add rax,1 ; 1,2 --> 2,3
shr rax,1 ; 2,3 --> 1
shl rax,63 ; 1 --> sign bit
vmovq xmm3,rax
vxorpd xmm0,xmm0,xmm3
Lcosf_fma3_return:
vcvtsd2ss xmm0,xmm0,xmm0 ; narrow the double result back to float
StackDeallocate stack_size
ret
Lcosf_fma3_naninf:
; xmm0 still holds the original single-precision x here
call fname_special
StackDeallocate stack_size
ret
fname endp
END

View File

@@ -0,0 +1,344 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_SPLITEXP
#define USE_SCALEDOUBLE_1
#define USE_SCALEDOUBLE_2
#define USE_INFINITY_WITH_FLAGS
#define USE_VAL_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_SPLITEXP
#undef USE_SCALEDOUBLE_1
#undef USE_SCALEDOUBLE_2
#undef USE_INFINITY_WITH_FLAGS
#undef USE_VAL_WITH_FLAGS
#undef USE_HANDLE_ERROR
#pragma function(cosh)
double cosh(double x)
{
  /*
    Hyperbolic cosine of a double (structure derived from the sinh routine).

    cosh is an even function, so only y = |x| is used and the result is
    never negative.  After the special cases (tiny, NaN, infinity) the
    computation is split into regions:

      y >= max_cosh_arg:
        overflow; _handle_error is called with the +Inf bit pattern and
        the overflow/inexact flags.
      y >= small_threshold:
        exp(-y) is insignificant next to exp(y), so
        cosh(y) = exp(y)/2, computed with splitexp and the scaleDouble
        helpers (the division by 2 is done by decrementing the exponent m).
      y <  small_threshold:
        split y = y0 + dy with y0 = (int)y and use
        cosh(y) = cosh(y0)*cosh(dy) + sinh(y0)*sinh(dy)
        with cosh(y0), sinh(y0) taken from lead/tail tables and
        cosh(dy), sinh(dy) from polynomials in dy.
  */

  static const double
    max_cosh_arg = 7.10475860073943977113e+02, /* 0x408633ce8fb9f87e */
    thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
    log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
    log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
    //  small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
    small_threshold = 20.0;
  /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */

  /* Lead and tail tabulated values of sinh(i) and cosh(i)
     for i = 0,...,36. The lead part has 26 leading bits. */

  static const double sinh_lead[ 37] = {
    0.00000000000000000000e+00, /* 0x0000000000000000 */
    1.17520117759704589844e+00, /* 0x3ff2cd9fc0000000 */
    3.62686038017272949219e+00, /* 0x400d03cf60000000 */
    1.00178747177124023438e+01, /* 0x40240926e0000000 */
    2.72899169921875000000e+01, /* 0x403b4a3800000000 */
    7.42032089233398437500e+01, /* 0x40528d0160000000 */
    2.01713153839111328125e+02, /* 0x406936d228000000 */
    5.48316116333007812500e+02, /* 0x4081228768000000 */
    1.49047882080078125000e+03, /* 0x409749ea50000000 */
    4.05154187011718750000e+03, /* 0x40afa71570000000 */
    1.10132326660156250000e+04, /* 0x40c5829dc8000000 */
    2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
    8.13773945312500000000e+04, /* 0x40f3de1650000000 */
    2.21206695312500000000e+05, /* 0x410b00b590000000 */
    6.01302140625000000000e+05, /* 0x412259ac48000000 */
    1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
    4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
    1.20774762500000000000e+07, /* 0x4167093488000000 */
    3.28299845000000000000e+07, /* 0x417f4f2208000000 */
    8.92411500000000000000e+07, /* 0x419546d8f8000000 */
    2.42582596000000000000e+08, /* 0x41aceb0888000000 */
    6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
    1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
    4.87240166400000000000e+09, /* 0x41f226af30000000 */
    1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
    3.60024494080000000000e+10, /* 0x4220c3d390000000 */
    9.78648043520000000000e+10, /* 0x4236c93268000000 */
    2.66024116224000000000e+11, /* 0x424ef822f0000000 */
    7.23128516608000000000e+11, /* 0x42650bba30000000 */
    1.96566712320000000000e+12, /* 0x427c9aae40000000 */
    5.34323724288000000000e+12, /* 0x4293704708000000 */
    1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
    3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
    1.07321789251584000000e+14, /* 0x42d866f348000000 */
    2.91730863685632000000e+14, /* 0x42f0953e28000000 */
    7.93006722514944000000e+14, /* 0x430689e220000000 */
    2.15561576592179200000e+15}; /* 0x431ea215a0000000 */

  static const double sinh_tail[ 37] = {
    0.00000000000000000000e+00, /* 0x0000000000000000 */
    1.60467555584448807892e-08, /* 0x3e513ae6096a0092 */
    2.76742892754807136947e-08, /* 0x3e5db70cfb79a640 */
    2.09697499555224576530e-07, /* 0x3e8c2526b66dc067 */
    2.04940252448908240062e-07, /* 0x3e8b81b18647f380 */
    1.65444891522700935932e-06, /* 0x3ebbc1cdd1e1eb08 */
    3.53116789999998198721e-06, /* 0x3ecd9f201534fb09 */
    6.94023870987375490695e-06, /* 0x3edd1c064a4e9954 */
    4.98876893611587449271e-06, /* 0x3ed4eca65d06ea74 */
    3.19656024605152215752e-05, /* 0x3f00c259bcc0ecc5 */
    2.08687768377236501204e-04, /* 0x3f2b5a6647cf9016 */
    4.84668088325403796299e-05, /* 0x3f09691adefb0870 */
    1.17517985422733832468e-03, /* 0x3f53410fc29cde38 */
    6.90830086959560562415e-04, /* 0x3f46a31a50b6fb3c */
    1.45697262451506548420e-03, /* 0x3f57defc71805c40 */
    2.99859023684906737806e-02, /* 0x3f9eb49fd80e0bab */
    1.02538800507941396667e-02, /* 0x3f84fffc7bcd5920 */
    1.26787628407699110022e-01, /* 0x3fc03a93b6c63435 */
    6.86652479544033744752e-02, /* 0x3fb1940bb255fd1c */
    4.81593627621056619148e-01, /* 0x3fded26e14260b50 */
    1.70489513795397629181e+00, /* 0x3ffb47401fc9f2a2 */
    1.12416073482258713767e+01, /* 0x40267bb3f55634f1 */
    7.06579578070110514432e+00, /* 0x401c435ff8194ddc */
    5.91244512999659974639e+01, /* 0x404d8fee052ba63a */
    1.68921736147050694399e+02, /* 0x40651d7edccde3f6 */
    2.60692936262073658327e+02, /* 0x40704b1644557d1a */
    3.62419382134885609048e+02, /* 0x4076a6b5ca0a9dc4 */
    4.07689930834187271103e+03, /* 0x40afd9cc72249aba */
    1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
    2.53720210371943067003e+04, /* 0x40d8c70158ac6363 */
    4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
    1.81871712615542812273e+05, /* 0x4106337db36fc718 */
    5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
    6.41374032312148716301e+05, /* 0x412392bc108b37cc */
    7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
    3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
    7.63580561355670914054e+06}; /* 0x415d20d76744835c */

  static const double cosh_lead[ 37] = {
    1.00000000000000000000e+00, /* 0x3ff0000000000000 */
    1.54308062791824340820e+00, /* 0x3ff8b07550000000 */
    3.76219564676284790039e+00, /* 0x400e18fa08000000 */
    1.00676617622375488281e+01, /* 0x402422a490000000 */
    2.73082327842712402344e+01, /* 0x403b4ee858000000 */
    7.42099475860595703125e+01, /* 0x40528d6fc8000000 */
    2.01715633392333984375e+02, /* 0x406936e678000000 */
    5.48317031860351562500e+02, /* 0x4081228948000000 */
    1.49047915649414062500e+03, /* 0x409749eaa8000000 */
    4.05154199218750000000e+03, /* 0x40afa71580000000 */
    1.10132329101562500000e+04, /* 0x40c5829dd0000000 */
    2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
    8.13773945312500000000e+04, /* 0x40f3de1650000000 */
    2.21206695312500000000e+05, /* 0x410b00b590000000 */
    6.01302140625000000000e+05, /* 0x412259ac48000000 */
    1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
    4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
    1.20774762500000000000e+07, /* 0x4167093488000000 */
    3.28299845000000000000e+07, /* 0x417f4f2208000000 */
    8.92411500000000000000e+07, /* 0x419546d8f8000000 */
    2.42582596000000000000e+08, /* 0x41aceb0888000000 */
    6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
    1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
    4.87240166400000000000e+09, /* 0x41f226af30000000 */
    1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
    3.60024494080000000000e+10, /* 0x4220c3d390000000 */
    9.78648043520000000000e+10, /* 0x4236c93268000000 */
    2.66024116224000000000e+11, /* 0x424ef822f0000000 */
    7.23128516608000000000e+11, /* 0x42650bba30000000 */
    1.96566712320000000000e+12, /* 0x427c9aae40000000 */
    5.34323724288000000000e+12, /* 0x4293704708000000 */
    1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
    3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
    1.07321789251584000000e+14, /* 0x42d866f348000000 */
    2.91730863685632000000e+14, /* 0x42f0953e28000000 */
    7.93006722514944000000e+14, /* 0x430689e220000000 */
    2.15561576592179200000e+15}; /* 0x431ea215a0000000 */

  static const double cosh_tail[ 37] = {
    0.00000000000000000000e+00, /* 0x0000000000000000 */
    6.89700037027478056904e-09, /* 0x3e3d9f5504c2bd28 */
    4.43207835591715833630e-08, /* 0x3e67cb66f0a4c9fd */
    2.33540217013828929694e-07, /* 0x3e8f58617928e588 */
    5.17452463948269748331e-08, /* 0x3e6bc7d000c38d48 */
    9.38728274131605919153e-07, /* 0x3eaf7f9d4e329998 */
    2.73012191010840495544e-06, /* 0x3ec6e6e464885269 */
    3.29486051438996307950e-06, /* 0x3ecba3a8b946c154 */
    4.75803746362771416375e-06, /* 0x3ed3f4e76110d5a4 */
    3.33050940471947692369e-05, /* 0x3f017622515a3e2b */
    9.94707313972136215365e-06, /* 0x3ee4dc4b528af3d0 */
    6.51685096227860253398e-05, /* 0x3f11156278615e10 */
    1.18132406658066663359e-03, /* 0x3f535ad50ed821f5 */
    6.93090416366541877541e-04, /* 0x3f46b61055f2935c */
    1.45780415323416845386e-03, /* 0x3f57e2794a601240 */
    2.99862082708111758744e-02, /* 0x3f9eb4b45f6aadd3 */
    1.02539925859688602072e-02, /* 0x3f85000b967b3698 */
    1.26787669807076286421e-01, /* 0x3fc03a940fadc092 */
    6.86652631843830962843e-02, /* 0x3fb1940bf3bf874c */
    4.81593633223853068159e-01, /* 0x3fded26e1a2a2110 */
    1.70489514001513020602e+00, /* 0x3ffb4740205796d6 */
    1.12416073489841270572e+01, /* 0x40267bb3f55cb85d */
    7.06579578098005001152e+00, /* 0x401c435ff81e18ac */
    5.91244513000686140458e+01, /* 0x404d8fee052bdea4 */
    1.68921736147088438429e+02, /* 0x40651d7edccde926 */
    2.60692936262087528121e+02, /* 0x40704b1644557e0e */
    3.62419382134890611269e+02, /* 0x4076a6b5ca0a9e1c */
    4.07689930834187453002e+03, /* 0x40afd9cc72249abe */
    1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
    2.53720210371943103382e+04, /* 0x40d8c70158ac6364 */
    4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
    1.81871712615542812273e+05, /* 0x4106337db36fc718 */
    5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
    6.41374032312148716301e+05, /* 0x412392bc108b37cc */
    7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
    3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
    7.63580561355670914054e+06}; /* 0x415d20d76744835c */

  /* The bit patterns must be held in a 64-bit integer.  `unsigned long`
     is only 32 bits wide under the LLP64 model used by MSVC, which would
     truncate the pattern and break every comparison below. */
  unsigned long long ux, aux, xneg;
  double y, z, z1, z2;
  int m;

  /* Special cases */

  GET_BITS_DP64(x, ux);
  aux = ux & ~SIGNBIT_DP64;   /* bit pattern of |x| */
  if (aux < 0x3e30000000000000) /* |x| < 2^-28: small enough that cosh(x) = 1 */
    {
      if (aux == 0)
        /* with no inexact */
        return 1.0;
      else
        return val_with_flags(1.0, AMD_F_INEXACT);
    }
  else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */
    {
      if (aux > PINFBITPATT_DP64) /* x is NaN */
        /* bit 51 (the quiet-NaN bit) is OR-ed into the pattern handed on */
        return _handle_error("cosh", OP_COSH, ux|0x0008000000000000,_DOMAIN,
                             0,EDOM, x, 0.0, 1);
      else /* x is infinity (either sign) */
        return infinity_with_flags(0);
    }

  /* cosh is even: continue with y = |x| */
  xneg = (aux != ux);

  y = x;
  if (xneg) y = -x;

  if (y >= max_cosh_arg)
    {
      return _handle_error("cosh", OP_COSH, PINFBITPATT_DP64,_OVERFLOW,
                           AMD_F_INEXACT|AMD_F_OVERFLOW,ERANGE, x, 0.0, 1);
      //      z = infinity_with_flags(AMD_F_OVERFLOW);
    }
  else if (y >= small_threshold)
    {
      /* In this range y is large enough so that
         the negative exponential is negligible,
         so cosh(y) is approximated by exp(y)/2. The
         code below is an inlined version of that from
         exp() with two changes (it operates on
         y instead of x, and the division by 2 is
         done by reducing m by 1). */

      splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
               log2_by_32_tail, &m, &z1, &z2);
      m -= 1;

      if (m >= EMIN_DP64 && m <= EMAX_DP64)
        z = scaleDouble_1((z1+z2),m);
      else
        z = scaleDouble_2((z1+z2),m);
    }
  else
    {
      /* In this range we find the integer part y0 of y
         and the increment dy = y - y0. We then compute

         z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)

         where sinh(y0) and cosh(y0) are tabulated above.
         y < small_threshold (20.0) here, so ind stays inside the
         37-entry tables. */

      int ind;
      double dy, dy2, sdy, cdy;

      ind = (int)y;
      dy = y - ind;

      dy2 = dy*dy;
      /* sdy = sinh(dy) - dy (odd polynomial without its leading term) */
      sdy = dy*dy2*(0.166666666666666667013899e0 +
                    (0.833333333333329931873097e-2 +
                     (0.198412698413242405162014e-3 +
                      (0.275573191913636406057211e-5 +
                       (0.250521176994133472333666e-7 +
                        (0.160576793121939886190847e-9 +
                         0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

      /* cdy = cosh(dy) - 1 (even polynomial without its leading term) */
      cdy = dy2*(0.500000000000000005911074e0 +
                 (0.416666666666660876512776e-1 +
                  (0.138888888889814854814536e-2 +
                   (0.248015872460622433115785e-4 +
                    (0.275573350756016588011357e-6 +
                     (0.208744349831471353536305e-8 +
                      0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

      /* At this point sinh(dy) is approximated by dy + sdy, and cosh(dy) is approximated by 1 + cdy.
         Shift some significant bits from dy to cdy. */
#if 0
      /* Disabled variant that splits dy into high and low parts first. */
      double sdy1,sdy2;
      GET_BITS_DP64(dy, ux);
      ux &= 0xfffffffff8000000;
      PUT_BITS_DP64(ux, sdy1); // sdy1 is upper 53-27=26 significant bits of dy.
      sdy2 = sdy + (dy - sdy1); // sdy2 is sdy + lower bits of dy

      z = ((((((cosh_tail[ind]*cdy + sinh_tail[ind]*sdy2)
               + sinh_tail[ind]*sdy1) + cosh_tail[ind])
             + cosh_lead[ind]*cdy) + sinh_lead[ind]*sdy2)
           + sinh_lead[ind]*sdy1) + cosh_lead[ind];
#else
      /* Terms are accumulated from smallest to largest magnitude; the
         summation order is significant for accuracy and must be kept. */
      z = ((((((cosh_tail[ind]*cdy + sinh_tail[ind]*sdy)
               + sinh_tail[ind]*dy) + cosh_tail[ind])
             + cosh_lead[ind]*cdy) + sinh_lead[ind]*sdy)
           + sinh_lead[ind]*dy) + cosh_lead[ind];
#endif
    }

  return z;
}

View File

@@ -0,0 +1,247 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_SPLITEXP
#define USE_SCALEDOUBLE_1
#define USE_SCALEDOUBLE_2
#define USE_INFINITYF_WITH_FLAGS
#define USE_VALF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_SPLITEXP
#undef USE_SCALEDOUBLE_1
#undef USE_SCALEDOUBLE_2
#undef USE_INFINITYF_WITH_FLAGS
#undef USE_VALF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(coshf)
float coshf(float fx)
{
  /*
    Hyperbolic cosine of a float, evaluated in double precision and
    rounded to float on return.

    cosh is an even function, so only y = |x| is used and the result is
    never negative.  After the special cases (tiny, NaN, infinity) the
    computation is split into regions:

      y >= max_cosh_arg:
        overflow; _handle_errorf is called with the float +Inf bit
        pattern and the overflow/inexact flags.
      y >= small_threshold:
        exp(-y) is insignificant next to exp(y), so
        cosh(y) = exp(y)/2, computed with splitexp and scaleDouble_1
        (the division by 2 is done by decrementing the exponent m).
      y <  small_threshold:
        split y = y0 + dy with y0 = (int)y and use
        cosh(y) = cosh(y0)*cosh(dy) + sinh(y0)*sinh(dy)
        with cosh(y0), sinh(y0) tabulated and cosh(dy), sinh(dy)
        evaluated by polynomials in dy.
  */

  static const double
    /* The max argument of coshf, but stored as a double */
    max_cosh_arg = 8.94159862922329438106e+01, /* 0x40565a9f84f82e63 */
    thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
    log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
    log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
    small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
  //  small_threshold = 20.0;
  /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */

  /* Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. */

  static const double sinh_lead[ 37] = {
    0.00000000000000000000e+00, /* 0x0000000000000000 */
    1.17520119364380137839e+00, /* 0x3ff2cd9fc44eb982 */
    3.62686040784701857476e+00, /* 0x400d03cf63b6e19f */
    1.00178749274099008204e+01, /* 0x40240926e70949ad */
    2.72899171971277496596e+01, /* 0x403b4a3803703630 */
    7.42032105777887522891e+01, /* 0x40528d0166f07374 */
    2.01713157370279219549e+02, /* 0x406936d22f67c805 */
    5.48316123273246489589e+02, /* 0x408122876ba380c9 */
    1.49047882578955000099e+03, /* 0x409749ea514eca65 */
    4.05154190208278987484e+03, /* 0x40afa7157430966f */
    1.10132328747033916443e+04, /* 0x40c5829dced69991 */
    2.99370708492480553105e+04, /* 0x40dd3c4488cb48d6 */
    8.13773957064298447222e+04, /* 0x40f3de1654d043f0 */
    2.21206696003330085659e+05, /* 0x410b00b5916a31a5 */
    6.01302142081972560845e+05, /* 0x412259ac48bef7e3 */
    1.63450868623590236530e+06, /* 0x4138f0ccafad27f6 */
    4.44305526025387924165e+06, /* 0x4150f2ebd0a7ffe3 */
    1.20774763767876271158e+07, /* 0x416709348c0ea4ed */
    3.28299845686652474105e+07, /* 0x417f4f22091940bb */
    8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
    2.42582597704895108938e+08, /* 0x41aceb088b68e803 */
    6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
    1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
    4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
    1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
    3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
    9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
    2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
    7.23128532145737548828e+11, /* 0x42650bba3796379a */
    1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
    5.34323729076223046875e+12, /* 0x429370470aec28ec */
    1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
    3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
    1.07321789892958031250e+14, /* 0x42d866f34a725782 */
    2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
    7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
    2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */

  static const double cosh_lead[ 37] = {
    1.00000000000000000000e+00, /* 0x3ff0000000000000 */
    1.54308063481524371241e+00, /* 0x3ff8b07551d9f550 */
    3.76219569108363138810e+00, /* 0x400e18fa0df2d9bc */
    1.00676619957777653269e+01, /* 0x402422a497d6185e */
    2.73082328360164865444e+01, /* 0x403b4ee858de3e80 */
    7.42099485247878334349e+01, /* 0x40528d6fcbeff3a9 */
    2.01715636122455890700e+02, /* 0x406936e67db9b919 */
    5.48317035155212010977e+02, /* 0x4081228949ba3a8b */
    1.49047916125217807348e+03, /* 0x409749eaa93f4e76 */
    4.05154202549259389343e+03, /* 0x40afa715845d8894 */
    1.10132329201033226127e+04, /* 0x40c5829dd053712d */
    2.99370708659497577173e+04, /* 0x40dd3c4489115627 */
    8.13773957125740562333e+04, /* 0x40f3de1654d6b543 */
    2.21206696005590405548e+05, /* 0x410b00b5916b6105 */
    6.01302142082804115489e+05, /* 0x412259ac48bf13ca */
    1.63450868623620807193e+06, /* 0x4138f0ccafad2d17 */
    4.44305526025399193168e+06, /* 0x4150f2ebd0a8005c */
    1.20774763767876680940e+07, /* 0x416709348c0ea503 */
    3.28299845686652623117e+07, /* 0x417f4f22091940bf */
    8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
    2.42582597704895138741e+08, /* 0x41aceb088b68e804 */
    6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
    1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
    4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
    1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
    3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
    9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
    2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
    7.23128532145737548828e+11, /* 0x42650bba3796379a */
    1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
    5.34323729076223046875e+12, /* 0x429370470aec28ec */
    1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
    3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
    1.07321789892958031250e+14, /* 0x42d866f34a725782 */
    2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
    7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
    2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */

  /* The double bit patterns must be held in a 64-bit integer.
     `unsigned long` is only 32 bits wide under the LLP64 model used by
     MSVC, which would truncate the pattern and break the comparisons. */
  unsigned long long ux, aux, xneg;
  unsigned int uhx;
  double x = fx, y, z, z1, z2;
  int m;

  /* Special cases */

  GET_BITS_DP64(x, ux);
  aux = ux & ~SIGNBIT_DP64;   /* bit pattern of |x| */
  if (aux < 0x3f10000000000000) /* |x| < 2^-14: small enough that cosh(x) = 1 */
    {
      if (aux == 0) return (float)1.0; /* with no inexact */
      if (LAMBDA_DP64 + x > 1.0) return valf_with_flags((float)1.0, AMD_F_INEXACT); /* with inexact */
    }
  else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */
    {
      if (aux > PINFBITPATT_DP64) /* x is NaN */
        {
          /* Report with the original float pattern; bit 22 (the
             quiet-NaN bit of a float) is OR-ed in. */
          GET_BITS_SP32(fx, uhx);
          return _handle_errorf("coshf",OP_COSH,uhx|0x00400000,_DOMAIN, 0,
                                EDOM, fx, 0.0, 1);
        }
      else /* x is infinity (either sign) */
        return infinityf_with_flags(0);
    }

  /* cosh is even: continue with y = |x| */
  xneg = (aux != ux);

  y = x;
  if (xneg) y = -x;

  if (y >= max_cosh_arg)
    /* Return +infinity with overflow flag. */
    return _handle_errorf("coshf",OP_COSH,PINFBITPATT_SP32,_OVERFLOW,
                          AMD_F_INEXACT|AMD_F_OVERFLOW,ERANGE, fx, 0.0, 1);
  //    z = infinity_with_flags(AMD_F_OVERFLOW);
  else if (y >= small_threshold)
    {
      /* In this range y is large enough so that
         the negative exponential is negligible,
         so cosh(y) is approximated by exp(y)/2. The
         code below is an inlined version of that from
         exp() with two changes (it operates on
         y instead of x, and the division by 2 is
         done by reducing m by 1). */

      splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
               log2_by_32_tail, &m, &z1, &z2);
      m -= 1;
      /* scaleDouble_1 is always safe because the argument x was
         float, rather than double */
      z = scaleDouble_1((z1+z2),m);
    }
  else
    {
      /* In this range we find the integer part y0 of y
         and the increment dy = y - y0. We then compute

         z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)

         where sinh(y0) and cosh(y0) are tabulated above.

         NOTE(review): the tables hold 37 entries, so this relies on
         small_threshold not exceeding 37; that depends on the value of
         BASEDIGITS_DP64, which is defined outside this file - confirm. */

      int ind;
      double dy, dy2, sdy, cdy;

      ind = (int)y;
      dy = y - ind;

      dy2 = dy*dy;

      /* sdy approximates sinh(dy) */
      sdy = dy + dy*dy2*(0.166666666666666667013899e0 +
                         (0.833333333333329931873097e-2 +
                          (0.198412698413242405162014e-3 +
                           (0.275573191913636406057211e-5 +
                            (0.250521176994133472333666e-7 +
                             (0.160576793121939886190847e-9 +
                              0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

      /* cdy approximates cosh(dy) */
      cdy = 1 + dy2*(0.500000000000000005911074e0 +
                     (0.416666666666660876512776e-1 +
                      (0.138888888889814854814536e-2 +
                       (0.248015872460622433115785e-4 +
                        (0.275573350756016588011357e-6 +
                         (0.208744349831471353536305e-8 +
                          0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

      z = cosh_lead[ind]*cdy + sinh_lead[ind]*sdy;
    }

  //  if (xneg) z = - z;
  return (float)z;
}

View File

@@ -0,0 +1,439 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; exp.asm
;
; An implementation of the exp libm function.
;
; Prototype:
;
; double exp(double x);
;
;
; Algorithm:
;
; e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
;
; x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
; n = 64*m + j, 0 <= j < 64
;
; e^x = 2^((64*m + j + f)/64)
; = (2^m) * (2^(j/64)) * 2^(f/64)
; = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
;
; f = x*(64/ln(2)) - n
; r = f*(ln(2)/64) = x - n*(ln(2)/64)
;
; e^x = (2^m) * (2^(j/64)) * e^r
;
; (2^(j/64)) is precomputed
;
; e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^6)/6!
; e^r = 1 + q
;
; q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^6)/6!
;
; ---------------------------------------------------------------------------
; Read-only data, external symbols and assembly-time names used by exp.
; NOTE: the order and alignment of the items below matter - several of them
; are read with 16-byte aligned packed loads by the code.
; ---------------------------------------------------------------------------
.const
ALIGN 16
; these codes and the ones in the corresponding .c file have to match
; (each is loaded into r8d before calling fname_special)
__flag_x_nan DD 00000001
__flag_y_zero DD 00000002
__flag_y_inf DD 00000003
ALIGN 16
; Polynomial coefficients 1/k! for the e^r series.  Each value is stored
; twice (two QWORDs) so that the symbol names a full, 16-byte aligned
; 128-bit operand usable with movapd/vmovapd.
L__real_1_by_720 DQ 03f56c16c16c16c17h
DQ 03f56c16c16c16c17h ; 1/720
L__real_1_by_120 DQ 03f81111111111111h
DQ 03f81111111111111h ; 1/120
L__real_1_by_6 DQ 03fc5555555555555h
DQ 03fc5555555555555h ; 1/6
L__real_1_by_2 DQ 03fe0000000000000h
DQ 03fe0000000000000h ; 1/2
L__real_1_by_24 DQ 03fa5555555555555h
DQ 03fa5555555555555h ; 1/24
ALIGN 16
; Packed pair { -ln(2)/64 head (low QWORD), -ln(2)/64 tail (high QWORD) };
; same bit patterns as L__log2_by_64_mhead / L__log2_by_64_mtail below.
; Used as the 128-bit memory operand of vfmadd132pd in the FMA3 path.
L__log2_by_64_mtail_mhead DQ 0bf862e42fefa0000h, 0bd1cf79abc9e3b39h
; (not referenced by the exp procedure in this file)
L__ln_of_smallest_normal DQ 0C086232BDD7ABCD2h
; (not referenced by the exp procedure in this file)
L__zero DQ 00000000000000000h
; Range limits for the argument x
L__max_exp_arg DQ 040862e42fefa39efh ; 709.78271289338397
L__denormal_tiny_threshold DQ 0c0874046dfefd9d0h ; -744.03460681327306
L__min_exp_arg DQ 0c0874910d52d3051h ; -745.13321910194111
L__real_64_by_log2 DQ 040571547652b82feh ; 64/ln(2)
L__positive_infinity DQ 07ff0000000000000h
L__negative_infinity DQ 0fff0000000000000h
L__real_qnanbit DQ 0008000000000000h ; qnan set bit
; 2^-63; only referenced from commented-out code in the FMA3 path
L__real_x_near0_threshold DQ 3c00000000000000h
L__log2_by_64_mhead DQ 0bf862e42fefa0000h
L__log2_by_64_mtail DQ 0bd1cf79abc9e3b39h
L__real_smallest_denormal DQ 00000000000000001h
L__real_one DQ 03ff0000000000000h
L__2_to_neg_26 DQ 03E50000000000000h ; 2^-26
L__min_normal DQ 00010000000000000h ; smallest normal
; Tables of 2^(j/64), j = 0..63, defined in another module: the full value
; and (presumably) its head/tail split - confirm against the table source.
EXTRN __two_to_jby64_table:QWORD
EXTRN __two_to_jby64_head_table:QWORD
EXTRN __two_to_jby64_tail_table:QWORD
; Non-zero selects the FMA3 code path at run time
EXTRN __use_fma3_lib:DWORD
; make room for fname_special to save things
dummy_space EQU 020h
; 038h = 56 bytes: with the 8-byte return address this keeps rsp 16-byte
; aligned at the call to fname_special and covers the 32-byte shadow area
; required by the Microsoft x64 calling convention.
stack_size EQU 038h
; fm.inc presumably supplies the StackAllocate / StackDeallocate macros
; used by the procedure - confirm.
include fm.inc
fname TEXTEQU <exp>
fname_special TEXTEQU <_exp_special>
;Define name and any external functions being called
EXTERN fname_special : PROC
.code
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

; ---------------------------------------------------------------------------
; double exp(double x)
; ABI:   Microsoft x64
; In:    xmm0 = x
; Out:   xmm0 = e^x
; Uses:  rax, rcx, rdx, r8, r10, r11, xmm0-xmm4, flags (all volatile)
; Calls: fname_special(xmm0 = x, xmm1 = result to report, r8d = __flag_*)
;        for NaN input, overflow and underflow-to-zero.
;
; Two implementations of the same algorithm follow: a plain SSE2 path and,
; when __use_fma3_lib is non-zero, an AVX/FMA3 path (Lexp_fma3).
; In the main paths: ecx = n, then m = n >> 6; eax = j = n & 63.
; ---------------------------------------------------------------------------

    ; We need to avoid unwanted exceptions from a NaN argument.
    ; It could be argued that a signaling NaN should raise an exception,
    ; but the existing library doesn't.  At any rate, the comparison operations
    ; don't seem to like quiet NaN either, so...
    movd        rdx, xmm0
    btr         rdx, 63                         ; rdx <-- bit pattern of |x|
    cmp         rdx, L__positive_infinity
    jge         Lexp_x_is_nan_or_inf            ; sign bit is clear, so signed compare is safe

    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Lexp_fma3

    movapd      xmm2, xmm0
    movapd      xmm3, xmm0

    ; Some hardware has problems with too many branches in a single
    ; 16- or 32-byte window, so let's peel off the common case into
    ; a single branch.
    cmplesd     xmm2, L__max_exp_arg            ; xmm2 <-- 0xFFFFFFFF if x is not too big positive
    cmpnltsd    xmm3, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
    andps       xmm2, xmm3                      ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
    ucomisd     xmm2, xmm2                      ; note that FFF... is NaN, so this comparison should set PF for in-range x
    jp          Lexp_y_is_finite

    ucomisd     xmm0, L__max_exp_arg
    ja          Lexp_y_is_inf
    ; Since we peeled off the cases with normal result,
    ; there is only one possibility remaining:
    jmp         Lexp_y_is_denormal_or_zero

ALIGN 16
Lexp_y_is_finite:
    ; x * (64/ln(2))
    movapd      xmm1, xmm0
    btr         rdx, 63                         ; rdx <-- |x|
    cmp         rdx, L__2_to_neg_26
    jbe         Lexp_return_1_plus_x            ; |x| <= 2^-26: e^x rounds to 1 + x
    mulsd       xmm1, L__real_64_by_log2

    ; n = int( x * (64/ln(2)) )
    cvttpd2dq   xmm2, xmm1                      ; xmm2 = (int)n
    cvtdq2pd    xmm1, xmm2                      ; xmm1 = (double)n
    movd        ecx, xmm2
    movapd      xmm2, xmm1
    ; r1 = x - n * ln(2)/64 head
    mulsd       xmm1, L__log2_by_64_mhead

    ; j = n & 0x3f
    mov         rax, 03fh
    and         eax, ecx                        ; eax = j
    ; m = (n - j) / 64
    sar         ecx, 6                          ; ecx = m

    ; r2 = - n * ln(2)/64 tail
    mulsd       xmm2, L__log2_by_64_mtail
    addsd       xmm0, xmm1                      ; xmm0 = r1

    ; r1+r2
    addsd       xmm2, xmm0                      ; xmm2 = r

    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
    movapd      xmm3, L__real_1_by_720          ; xmm3 = 1/720
    mulsd       xmm3, xmm2                      ; xmm3 = r*1/720
    movapd      xmm0, L__real_1_by_6            ; xmm0 = 1/6
    movapd      xmm1, xmm2                      ; xmm1 = r
    mulsd       xmm0, xmm2                      ; xmm0 = r*1/6
    addsd       xmm3, L__real_1_by_120          ; xmm3 = 1/120 + (r*1/720)
    mulsd       xmm1, xmm2                      ; xmm1 = r*r
    addsd       xmm0, L__real_1_by_2            ; xmm0 = 1/2 + (r*1/6)
    movapd      xmm4, xmm1                      ; xmm4 = r*r
    mulsd       xmm4, xmm1                      ; xmm4 = (r*r) * (r*r)
    mulsd       xmm3, xmm2                      ; xmm3 = r * (1/120 + (r*1/720))
    mulsd       xmm0, xmm1                      ; xmm0 = (r*r)*(1/2 + (r*1/6))
    addsd       xmm3, L__real_1_by_24           ; xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
    addsd       xmm0, xmm2                      ; xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
    mulsd       xmm3, xmm4                      ; xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
    addsd       xmm0, xmm3                      ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))

    ;(f)*(q) + f2 + f1
    ; The flags set by this cmp are consumed by the jle below; none of the
    ; lea / mulsd / addsd instructions in between modify EFLAGS.
    cmp         ecx, 0fffffc02h                 ; -1022
    lea         rdx, __two_to_jby64_table
    lea         r11, __two_to_jby64_tail_table
    lea         r10, __two_to_jby64_head_table
    mulsd       xmm0, QWORD PTR [rdx+rax * 8 ]
    addsd       xmm0, QWORD PTR [r11+rax * 8 ]
    addsd       xmm0, QWORD PTR [r10+rax * 8 ]

    jle         Lexp_process_denormal
Lexp_process_normal:
    ; scale by 2^m: add m directly into the exponent field
    shl         rcx, 52
    movd        xmm2, rcx
    paddq       xmm0, xmm2
    StackDeallocate stack_size
    ret

ALIGN 16
Lexp_process_denormal:
    ; flags are still those of "cmp ecx, -1022"
    jl          Lexp_process_true_denormal
    ucomisd     xmm0, L__real_one
    jae         Lexp_process_normal
Lexp_process_true_denormal:
    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
    ; multiply by 2^m, built as the bit pattern 1 << (m + 1074)
    add         ecx, 1074
    mov         rax, 1
    shl         rax, cl
    movd        xmm2, rax
    mulsd       xmm0, xmm2
    jmp         Lexp_finish

; (not referenced within this procedure)
Lexp_y_is_one:
    movsd       xmm0, L__real_one
    jmp         Lexp_finish

ALIGN 16
Lexp_x_is_nan_or_inf:
    movd        rax, xmm0
    cmp         rax, L__positive_infinity
    je          Lexp_finish                     ; exp(+inf) = +inf, xmm0 already holds it
    cmp         rax, L__negative_infinity
    je          Lexp_return_zero_without_exception ; exp(-inf) = 0
    or          rax, L__real_qnanbit            ; quieten the NaN
    movd        xmm1, rax
    mov         r8d, __flag_x_nan
    call        fname_special
    jmp         Lexp_finish

ALIGN 16
Lexp_y_is_inf:
    mov         rax, 07ff0000000000000h
    movd        xmm1, rax
    mov         r8d, __flag_y_inf
    call        fname_special
    jmp         Lexp_finish

ALIGN 16
Lexp_y_is_denormal_or_zero:
    ; x < L__denormal_tiny_threshold here.  Above L__min_exp_arg the result
    ; is the smallest denormal; at or below it the result is zero.
    ucomisd     xmm0, L__min_exp_arg
    jbe         Lexp_y_is_zero
    ; Scalar load: the constant is a single QWORD, so a 16-byte aligned
    ; packed load would silently depend on the layout of the data section.
    movsd       xmm0, L__real_smallest_denormal
    jmp         Lexp_finish

ALIGN 16
Lexp_y_is_zero:
    pxor        xmm1, xmm1
    mov         r8d, __flag_y_zero
    call        fname_special
    jmp         Lexp_finish

ALIGN 16
Lexp_return_1_plus_x:
    cmp         rdx, L__min_normal
    jbe         Lexp_return_1_plus_eps
    addsd       xmm0, L__real_one
    StackDeallocate stack_size
    ret         0

; Some hardware really does not like subnormals.  Try to avoid them.
ALIGN 16
Lexp_return_1_plus_eps:
    movsd       xmm0, L__real_one
    addsd       xmm0, L__min_normal             ; make sure inexact is set
    StackDeallocate stack_size
    ret         0

ALIGN 16
Lexp_return_zero_without_exception:
    pxor        xmm0,xmm0
    StackDeallocate stack_size
    ret         0

ALIGN 16
Lexp_finish:
    StackDeallocate stack_size
    ret         0

; ---------------------------------------------------------------------------
; AVX / FMA3 implementation.  NaN and infinity were already handled at entry.
; ---------------------------------------------------------------------------
ALIGN 16
Lexp_fma3:
    ; Some hardware has problems with too many branches in a single
    ; 16- or 32-byte window, so let's peel off the common case into
    ; a single branch.
    vcmplesd    xmm2, xmm0, L__max_exp_arg      ; xmm2 <-- 0xFFFFFFFF if x is not too big positive
    vcmpnltsd   xmm3, xmm0, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
    vandps      xmm2, xmm3, xmm2                ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
    vucomisd    xmm2, xmm2                      ; note that FFF... is NaN, so this comparison should set PF for in-range x
    jp          Lexp_fma3_y_is_finite

    vucomisd    xmm0,L__max_exp_arg
    ja          Lexp_fma3_y_is_inf
    ; Since we peeled off the cases with normal result,
    ; there is only one possibility remaining.  As in the SSE2 path,
    ; distinguish "smallest denormal" from "zero" instead of treating
    ; every x below L__denormal_tiny_threshold as zero.
    jmp         Lexp_fma3_y_is_denormal_or_zero

    ; vpsllq        xmm1, xmm0, 1
    ; vpsrlq        xmm1, xmm1, 1
    ; vucomisd      xmm1, L__real_x_near0_threshold ; 2^-63
    ; jb            Lexp_fma3_y_is_one

ALIGN 16
Lexp_fma3_y_is_finite:
    vmovq       rdx, xmm0
    btr         rdx, 63                         ; rdx <-- |x|
    cmp         rdx, L__2_to_neg_26
    jbe         Lexp_fma3_return_1_plus_x       ; |x| <= 2^-26: e^x rounds to 1 + x

    ; x * (64/ln(2))
    vmulsd      xmm1,xmm0,L__real_64_by_log2

    ; n = int( x * (64/ln(2)) )
    vcvttpd2dq  xmm2,xmm1                       ;xmm2 = (int)n
    vcvtdq2pd   xmm1,xmm2                       ;xmm1 = (double)n ;can use round
    vmovd       ecx,xmm2

    ; r1 = x - n * ln(2)/64 head
    ; r2 = - n * ln(2)/64 tail
    ; r = r1+r2
    vmovlhps    xmm1,xmm1,xmm1                  ;xmm1 = (double (double)n,)n
    vmovq       xmm0,xmm0                       ;xmm0 = 0,x ;zero out the upper part
    vfmadd132pd xmm1,xmm0,L__log2_by_64_mtail_mhead ; low: n*mhead + x, high: n*mtail + 0
    vhaddpd     xmm2,xmm1,xmm1                  ;xmm2 = r,r

    ;j = n & 03fh
    mov         rax,03fh
    and         eax,ecx                         ;eax = j
    ; m = (n - j) / 64
    sar         ecx,6                           ;ecx = m

    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
    ; Horner evaluation: each vfmadd213sd computes xmm3 = xmm3*r + coefficient
    vmovapd     xmm3,L__real_1_by_720
    vfmadd213sd xmm3,xmm2,L__real_1_by_120
    vfmadd213sd xmm3,xmm2,L__real_1_by_24
    vfmadd213sd xmm3,xmm2,L__real_1_by_6
    vfmadd213sd xmm3,xmm2,L__real_1_by_2
    vmulsd      xmm0,xmm2,xmm2                  ; xmm0 = r*r
    vfmadd213sd xmm0,xmm3,xmm2                  ; xmm0 = (r*r)*poly + r = q

    ; (f)*(q) + f2 + f1
    ; The flags set by this cmp are consumed by the jle below; none of the
    ; instructions in between modify EFLAGS.
    cmp         ecx,0fffffc02h                  ; -1022
    lea         rdx,__two_to_jby64_table
    lea         r11,__two_to_jby64_tail_table
    lea         r10,__two_to_jby64_head_table
    vmulsd      xmm2,xmm0,QWORD PTR[rdx + rax * 8]
    vaddsd      xmm1,xmm2,QWORD PTR[r11 + rax * 8]
    vaddsd      xmm0,xmm1,QWORD PTR[r10 + rax * 8]

    jle         Lexp_fma3_process_denormal
Lexp_fma3_process_normal:
    ; scale by 2^m: add m directly into the exponent field
    shl         rcx,52
    vmovq       xmm2,rcx
    vpaddq      xmm0,xmm0,xmm2
    StackDeallocate stack_size
    ret

ALIGN 16
Lexp_fma3_process_denormal:
    ; flags are still those of "cmp ecx, -1022"
    jl          Lexp_fma3_process_true_denormal
    vucomisd    xmm0,L__real_one
    jae         Lexp_fma3_process_normal
Lexp_fma3_process_true_denormal:
    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
    ; multiply by 2^m, built as the bit pattern 1 << (m + 1074)
    add         ecx,1074
    mov         rax,1
    shl         rax,cl
    vmovq       xmm2,rax
    vmulsd      xmm0,xmm0,xmm2
    jmp         Lexp_fma3_finish

; (not referenced within this procedure)
Lexp_fma3_y_is_one:
    vmovsd      xmm0, L__real_one
    jmp         Lexp_fma3_finish

ALIGN 16
Lexp_fma3_y_is_inf:
    mov         rax,07ff0000000000000h
    vmovq       xmm1,rax
    mov         r8d,__flag_y_inf
    call        fname_special
    jmp         Lexp_fma3_finish

ALIGN 16
Lexp_fma3_y_is_denormal_or_zero:
    ; x < L__denormal_tiny_threshold here.  Above L__min_exp_arg the result
    ; is the smallest denormal; at or below it the result is zero.
    vucomisd    xmm0, L__min_exp_arg
    jbe         Lexp_fma3_y_is_zero
    vmovsd      xmm0, L__real_smallest_denormal
    jmp         Lexp_fma3_finish

ALIGN 16
Lexp_fma3_return_1_plus_x:
    cmp         rdx, L__min_normal
    jbe         Lexp_fma3_return_1_plus_eps
    vaddsd      xmm0, xmm0, L__real_one
    StackDeallocate stack_size
    ret         0

; Some hardware really does not like subnormals.  Try to avoid them.
ALIGN 16
Lexp_fma3_return_1_plus_eps:
    vmovsd      xmm0, L__real_one
    vaddsd      xmm0, xmm0, L__min_normal       ; make sure inexact is set
    StackDeallocate stack_size
    ret         0

ALIGN 16
Lexp_fma3_y_is_zero:
    vpxor       xmm1,xmm1,xmm1
    mov         r8d,__flag_y_zero
    call        fname_special
    jmp         Lexp_fma3_finish

; (not referenced within this procedure; -inf is handled at entry)
ALIGN 16
Lexp_fma3_return_zero_without_exception:
    vpxor       xmm0,xmm0,xmm0

ALIGN 16
Lexp_fma3_finish:
    StackDeallocate stack_size
    ret
fname endp
END

View File

@@ -0,0 +1,162 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_SPLITEXP
#define USE_SCALEDOUBLE_1
#define USE_SCALEDOUBLE_2
#define USE_ZERO_WITH_FLAGS
#define USE_INFINITY_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_ZERO_WITH_FLAGS
#undef USE_SPLITEXP
#undef USE_SCALEDOUBLE_1
#undef USE_SCALEDOUBLE_2
#undef USE_INFINITY_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
/* exp2 is only provided for use by powf under Windows, so give
it a leading underscore. */
/*
 * _exp2(x): returns 2**x in double precision.
 *
 * Special cases handled here:
 *   - NaN            -> reported through _handle_error (_DOMAIN, EDOM)
 *   - +infinity      -> returned unchanged
 *   - -infinity      -> 0.0, no flags
 *   - x >  1024.0    -> +infinity via _handle_error (_OVERFLOW, ERANGE)
 *   - x < -1074.0    -> +0.0 via _handle_error (_UNDERFLOW, ERANGE)
 *
 * The IEEE bit patterns of x are 64 bits wide, so every variable that holds
 * them is declared `unsigned long long`; plain `unsigned long` is only
 * 32 bits on LLP64 (Windows) targets and would truncate the pattern.
 */
double FN_PROTOTYPE(_exp2)(double x)
{
  static const double
    max_exp2_arg = 1024.0,  /* 0x4090000000000000 */
    min_exp2_arg = -1074.0, /* 0xc090c80000000000 */
    log2 = 6.931471805599453094178e-01, /* 0x3fe62e42fefa39ef */
    log2_lead = 6.93147167563438415527E-01, /* 0x3fe62e42f8000000 */
    log2_tail = 1.29965068938898869640E-08, /* 0x3e4be8e7bcd5e4f1 */
    one_by_32_lead = 0.03125;

  double y, z1, z2, z, hx, tx, y1, y2;
  int m;
  unsigned long long ux, ax;   /* raw bits of x and of |x| */

  /*
    Computation of exp2(x).
    We compute the values m, z1, and z2 such that
    exp2(x) = 2**m * (z1 + z2), where exp2(x) is 2**x.
    Computations needed in order to obtain m, z1, and z2
    involve three steps.
    First, we reduce the argument x to the form
    x = n/32 + remainder,
    where n has the value of an integer and |remainder| <= 1/64.
    The value of n = x * 32 rounded to the nearest integer and
    the remainder = x - n/32.
    Second, we approximate exp2(r1 + r2) - 1 where r1 is the leading
    part of the remainder and r2 is the trailing part of the remainder.
    Third, we reconstruct exp2(x) so that
    exp2(x) = 2**m * (z1 + z2).
  */

  GET_BITS_DP64(x, ux);
  ax = ux & (~SIGNBIT_DP64);

  if (ax >= 0x4090000000000000) /* abs(x) >= 1024.0 */
    {
      if(ax >= 0x7ff0000000000000)
        {
          /* x is either NaN or infinity */
          if (ux & MANTBITS_DP64)
            /* x is NaN; hand back the quieted pattern */
            return _handle_error("exp2", OP_EXP, ux|0x0008000000000000, _DOMAIN,
                                 0, EDOM, x, 0.0, 1);
          else if (ux & SIGNBIT_DP64)
            /* x is negative infinity; return 0.0 with no flags. */
            return 0.0;
          else
            /* x is positive infinity */
            return x;
        }
      if (x > max_exp2_arg)
        /* Return +infinity with overflow flag */
        return _handle_error("exp2", OP_EXP, PINFBITPATT_DP64, _OVERFLOW,
                             AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, 0.0, 1);
      else if (x < min_exp2_arg)
        /* x is negative. Return +zero with underflow and inexact flags */
        return _handle_error("exp2", OP_EXP, 0, _UNDERFLOW,
                             AMD_F_UNDERFLOW | AMD_F_INEXACT, ERANGE, x, 0.0, 1);
    }

  /* Handle small arguments separately */
  if (ax < 0x3fb7154764ee6c2f)   /* abs(x) < 1/(16*log2) */
    {
      if (ax < 0x3c00000000000000)   /* abs(x) < 2^(-63) */
        return 1.0 + x; /* Raises inexact if x is non-zero */
      else
        {
          /* Split x into hx (head) and tx (tail). The mask keeps the upper
             bits of the 64-bit pattern, so u must be 64 bits wide too. */
          unsigned long long u;
          hx = x;
          GET_BITS_DP64(hx, u);
          u &= 0xfffffffff8000000;
          PUT_BITS_DP64(u, hx);
          tx = x - hx;
          /* Carefully multiply x by log2. y1 is the most significant
             part of the result, and y2 the least significant part */
          y1 = x * log2_lead;
          y2 = (((hx * log2_lead - y1) + hx * log2_tail) +
                tx * log2_lead) + tx * log2_tail;

          /* Rational approximation evaluated at y = x*log(2) */
          y = y1 + y2;
          z = (9.99564649780173690e-1 +
               (1.61251249355268050e-5 +
                (2.37986978239838493e-2 +
                 2.68724774856111190e-7*y)*y)*y)/
            (9.99564649780173692e-1 +
             (-4.99766199765151309e-1 +
              (1.070876894098586184e-1 +
               (-1.189773642681502232e-2 +
                5.9480622371960190616e-4*y)*y)*y)*y);
          z = ((z * y1) + (z * y2)) + 1.0;
        }
    }
  else
    {
      /* Find m, z1 and z2 such that exp2(x) = 2**m * (z1 + z2) */
      splitexp(x, log2, 32.0, one_by_32_lead, 0.0, &m, &z1, &z2);

      /* Scale (z1 + z2) by 2.0**m */
      if (m > EMIN_DP64 && m < EMAX_DP64)
        z = scaleDouble_1((z1+z2),m);
      else
        z = scaleDouble_2((z1+z2),m);
    }
  return z;
}

View File

@@ -0,0 +1,101 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include <fpieee.h>
#include <excpt.h>
#include <float.h>
#include <math.h>
#include <errno.h>
#include "libm_new.h"
// y = expf(x)
// y = exp(x)
// these codes and the ones in the related .asm files have to match
#define EXP_X_NAN 1
#define EXP_Y_ZERO 2
#define EXP_Y_INF 3
/*
 * Special-case reporter used by the expf assembly routine.
 *
 * x    - the argument that was passed to expf
 * y    - the result the caller has already decided on
 * code - one of EXP_X_NAN / EXP_Y_ZERO / EXP_Y_INF
 *
 * The matching error is reported through _handle_errorf and y is returned
 * unchanged. An unrecognised code reports nothing.
 */
float _expf_special(float x, float y, U32 code)
{
    /* Bit image of y in the low 32 bits, upper 32 bits cleared. */
    UT64 result_bits;
    result_bits.u64 = 0;
    result_bits.f32[0] = y;

    if (code == EXP_X_NAN)
    {
        _handle_errorf("expf", _FpCodeExp, result_bits.u64, _DOMAIN, 0, EDOM, x, 0.0, 1);
    }
    else if (code == EXP_Y_ZERO)
    {
        _handle_errorf("expf", _FpCodeExp, result_bits.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, 0.0, 1);
    }
    else if (code == EXP_Y_INF)
    {
        _handle_errorf("expf", _FpCodeExp, result_bits.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
    }

    return y;
}
/*
 * Special-case reporter used by the exp assembly routine.
 *
 * x    - the argument that was passed to exp
 * y    - the result the caller has already decided on
 * code - one of EXP_X_NAN / EXP_Y_ZERO / EXP_Y_INF
 *
 * The matching error is reported through _handle_error and y is returned
 * unchanged. An unrecognised code reports nothing.
 */
double _exp_special(double x, double y, U32 code)
{
    /* 64-bit image of y, passed to the handler as the result value. */
    UT64 result_bits;
    result_bits.f64 = y;

    if (code == EXP_X_NAN)
    {
        _handle_error("exp", _FpCodeExp, result_bits.u64, _DOMAIN, 0, EDOM, x, 0.0, 1);
    }
    else if (code == EXP_Y_ZERO)
    {
        _handle_error("exp", _FpCodeExp, result_bits.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, 0.0, 1);
    }
    else if (code == EXP_Y_INF)
    {
        _handle_error("exp", _FpCodeExp, result_bits.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
    }

    return y;
}

View File

@@ -0,0 +1,303 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; expf.asm
;
; An implementation of the expf libm function.
;
; Prototype:
;
; float expf(float x);
;
;
; Algorithm:
; Similar to the one presented in exp.asm
;
; If FMA3 hardware is available, an FMA3 implementation of expf will be used.
.const
ALIGN 16
__real_inf DD 7f800000h
DD 0
DQ 0
__real_ninf DD 0ff800000h
DD 0
DQ 0
__real_qnanbit DD 00400000h
DD 0
DQ 0
__real_zero DD 00000000h
DD 0
DQ 0
__real_p8192 DQ 40c0000000000000h
DQ 0
__real_m9600 DQ 0c0c2c00000000000h
DQ 0
__real_64_by_log2 DQ 40571547652b82feh ; 64/ln(2)
DQ 0
__real_log2_by_64 DQ 3f862e42fefa39efh ; log2_by_64
DQ 0
__real_1_by_6 DQ 3fc5555555555555h ; 1/6
DQ 0
__real_1_by_2 DQ 3fe0000000000000h ; 1/2
DQ 0
; these codes and the ones in the corresponding .c file have to match
__flag_x_nan DD 00000001
__flag_y_zero DD 00000002
__flag_y_inf DD 00000003
EXTRN __two_to_jby64_table:QWORD
EXTRN __use_fma3_lib:DWORD
fname TEXTEQU <expf>
fname_special TEXTEQU <_expf_special>
; define local variable storage offsets
; make room for fname_special to save things
dummy_space EQU 020h
stack_size EQU 038h
include fm.inc
; external function
EXTERN fname_special:PROC
.code
ALIGN 16
PUBLIC fname
;-----------------------------------------------------------------------
; float expf(float x)
; ABI:   Microsoft x64 (x in xmm0, result in xmm0)
; Stack: stack_size bytes, allocated by StackAllocate, so that the calls
;        to fname_special have room to save things.
; Regs:  edx = bits of |x| (sign cleared) on the finite paths,
;        ecx/rcx = n, then m, then the biased exponent of 2^m,
;        eax/rax = j (table index, n & 3fh), r10 = &__two_to_jby64_table.
;
; Finite arguments are promoted to double and evaluated as
;   2^m * f * (1 + q),  f = __two_to_jby64_table[j],  q ~ e^r - 1
; using either SSE2 or, when __use_fma3_lib is non-zero, FMA3 code.
; Overflow, underflow and NaN are reported through fname_special
; (x in xmm0, y in xmm1, code in r8d).
;-----------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    ; Do this to avoid possible exceptions from a NaN argument.
    movd        edx, xmm0
    btr         edx,31                          ; edx = bits of |x|
    cmp         edx, DWORD PTR __real_inf
    jge         Lexpf_x_is_inf_or_nan
    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Lexpf_fma3

Lexpf_sse2:
    cvtss2sd    xmm0, xmm0
    ; x * (64/ln(2))
    movsd       xmm3, QWORD PTR __real_64_by_log2
    mulsd       xmm3, xmm0
    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    comisd      xmm3, QWORD PTR __real_p8192
    jae         Lexpf_y_is_inf
    comisd      xmm3, QWORD PTR __real_m9600
    jb          Lexpf_y_is_zero
    ; n = int( x * (64/ln(2)) )
    cvtpd2dq    xmm4, xmm3
    lea         r10, __two_to_jby64_table
    cvtdq2pd    xmm1, xmm4
    ; r = x - n * ln(2)/64
    movsd       xmm2, QWORD PTR __real_log2_by_64
    mulsd       xmm2, xmm1
    movd        ecx, xmm4                       ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                        ; eax = j = n & 3fh
    subsd       xmm0, xmm2                      ; xmm0 = r
    movsd       xmm1, xmm0                      ; xmm1 = copy of r
    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6
    ; q = r + r^2*(1/2 + r/6)
    movsd       xmm3, QWORD PTR __real_1_by_6
    mulsd       xmm3, xmm0
    mulsd       xmm0, xmm0
    addsd       xmm3, QWORD PTR __real_1_by_2
    mulsd       xmm0, xmm3
    addsd       xmm0, xmm1
    ; build the double 2^m: biased exponent shifted into bits 52..62
    add         rcx, 1023
    shl         rcx, 52
    ; (f)*(1+q)
    movsd       xmm2, QWORD PTR [r10+rax*8]
    mulsd       xmm0, xmm2
    addsd       xmm0, xmm2
    movd        xmm1, rcx
    mulsd       xmm0, xmm1
    cvtsd2ss    xmm0, xmm0
Lexpf_final_check:
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexpf_y_is_zero:
    movss       xmm1, DWORD PTR __real_zero
    ; edx holds |x|; this path is only reached for x < 0, so put the sign
    ; back to hand the caller's actual argument to the error reporter.
    bts         edx, 31
    movd        xmm0, edx
    mov         r8d, DWORD PTR __flag_y_zero
    call        fname_special
    jmp         Lexpf_finish

    ALIGN 16
Lexpf_y_is_inf:
    movss       xmm1, DWORD PTR __real_inf
    movd        xmm0, edx                       ; x > 0 here, so |x| == x
    mov         r8d, DWORD PTR __flag_y_inf
    call        fname_special
    jmp         Lexpf_finish

    ALIGN 16
Lexpf_x_is_inf_or_nan:
    ; edx had its sign bit cleared for the range test above. Reload the
    ; untouched argument so that -inf is told apart from +inf (expf(-inf)
    ; must be +0, not -inf) and a NaN keeps its sign. xmm0 still holds the
    ; original float because this branch is taken before any conversion.
    movd        edx, xmm0
    cmp         edx, DWORD PTR __real_inf
    je          Lexpf_finish                    ; expf(+inf) = +inf
    cmp         edx, DWORD PTR __real_ninf
    je          Lexpf_process_zero              ; expf(-inf) = +0
    or          edx, DWORD PTR __real_qnanbit   ; quiet the NaN
    movd        xmm1, edx
    mov         r8d, DWORD PTR __flag_x_nan
    call        fname_special
    jmp         Lexpf_finish

    ALIGN 16
Lexpf_process_zero:
    movss       xmm0, DWORD PTR __real_zero
    jmp         Lexpf_final_check

    ALIGN 16
Lexpf_finish:
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexpf_fma3:
    vcvtss2sd   xmm0, xmm0, xmm0
    ; x * (64/ln(2))
    vmulsd      xmm3, xmm0, QWORD PTR __real_64_by_log2
    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    vcomisd     xmm3, QWORD PTR __real_p8192
    jae         Lexpf_fma3_y_is_inf
    vucomisd    xmm3, QWORD PTR __real_m9600
    jb          Lexpf_fma3_y_is_zero
    ; n = int( x * (64/ln(2)) )
    vcvtpd2dq   xmm4, xmm3
    lea         r10, __two_to_jby64_table
    vcvtdq2pd   xmm1, xmm4
    ; r = x - n * ln(2)/64
    vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64
    vmovd       ecx, xmm4                       ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                        ; eax = j = n & 3fh
    vmovapd     xmm1, xmm0                      ; xmm1 <-- copy of r
    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6
    ; q
    vmovsd      xmm3, QWORD PTR __real_1_by_6
    vmulsd      xmm0, xmm0, xmm0                ; xmm0 <-- r^2
    vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_by_2 ; xmm3 <-- r/6 + 1/2
    vfmadd213sd xmm0, xmm3, xmm1                ; xmm0 <-- q = r^2*(r/6 + 1/2) + r
    ; build the double 2^m: biased exponent shifted into bits 52..62
    add         rcx, 1023
    shl         rcx, 52
    ; (f)*(1+q)
    vmovsd      xmm2, QWORD PTR [r10+rax*8]
    vfmadd213sd xmm0, xmm2, xmm2
    vmovq       xmm2,rcx
    vmulsd      xmm0, xmm0, xmm2
    vcvtsd2ss   xmm0, xmm0, xmm0
Lexpf_fma3_final_check:
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexpf_fma3_y_is_zero:
    vmovss      xmm1, DWORD PTR __real_zero
    ; edx holds |x|; this path is only reached for x < 0, so put the sign
    ; back to hand the caller's actual argument to the error reporter.
    bts         edx, 31
    vmovd       xmm0, edx
    mov         r8d, DWORD PTR __flag_y_zero
    call        fname_special
    jmp         Lexpf_fma3_finish

    ALIGN 16
Lexpf_fma3_y_is_inf:
    vmovss      xmm1, DWORD PTR __real_inf
    vmovd       xmm0, edx                       ; x > 0 here, so |x| == x
    mov         r8d, DWORD PTR __flag_y_inf
    call        fname_special
    jmp         Lexpf_fma3_finish

    ALIGN 16
Lexpf_fma3_process_zero:
    vmovss      xmm0, DWORD PTR __real_zero
    jmp         Lexpf_fma3_final_check

    ALIGN 16
Lexpf_fma3_finish:
    StackDeallocate stack_size
    ret
fname endp
END

View File

@@ -0,0 +1,85 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#include "libm_errno.h"
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_HANDLE_ERROR
#pragma function(floor)
/*
 * floor(x): largest integral value not greater than x, computed by masking
 * the fraction bits out of the IEEE double bit pattern.
 *
 *   - NaN is reported through _handle_error (_DOMAIN, EDOM).
 *   - Infinity and any |x| >= 2^53 are returned unchanged.
 *   - +/-0 is returned with its sign preserved.
 *
 * The bit pattern and the masks derived from it are 64 bits wide, so they
 * are kept in `long long` types; plain `long` is only 32 bits on LLP64
 * (Windows) targets and would truncate them.
 */
double FN_PROTOTYPE(floor)(double x)
{
  double r;
  long long rexp, xneg;
  unsigned long long ux, ax, ur, mask;

  GET_BITS_DP64(x, ux);
  ax = ux & (~SIGNBIT_DP64);
  xneg = (ux != ax);   /* non-zero when the sign bit of x is set */

  if (ax >= 0x4340000000000000)
    {
      /* abs(x) is either NaN, infinity, or >= 2^53 */
      if (ax > 0x7ff0000000000000)
        /* x is NaN */
        return _handle_error("floor", OP_FLOOR, ux|0x0008000000000000, _DOMAIN,
                             0, EDOM, x, 0.0, 1);
      else
        return x;
    }
  else if (ax < 0x3ff0000000000000) /* abs(x) < 1.0 */
    {
      if (ax == 0x0000000000000000)
        /* x is +zero or -zero; return the same zero */
        return x;
      else if (xneg) /* x < 0.0 */
        return -1.0;
      else
        return 0.0;
    }
  else
    {
      r = x;
      rexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
      /* Mask out the bits of r that we don't want */
      mask = 1;
      mask = (mask << (EXPSHIFTBITS_DP64 - rexp)) - 1;
      ur = (ux & ~mask);
      PUT_BITS_DP64(ur, r);
      if (xneg && (ur != ux))
        /* We threw some bits away and x was negative */
        return r - 1.0;
      else
        return r;
    }
}

View File

@@ -0,0 +1,83 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#include "libm_errno.h"
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_HANDLE_ERRORF
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(floorf)
float FN_PROTOTYPE(floorf)(float x)
{
float r;
int rexp, xneg;
unsigned int ux, ax, ur, mask;
GET_BITS_SP32(x, ux);
ax = ux & (~SIGNBIT_SP32);
xneg = (ux != ax);
if (ax >= 0x4b800000)
{
/* abs(x) is either NaN, infinity, or >= 2^24 */
if (ax > 0x7f800000)
/* x is NaN */
return _handle_errorf("floorf", OP_FLOOR, ux|0x00400000, _DOMAIN,
0, EDOM, x, 0.0F, 1);
else
return x;
}
else if (ax < 0x3f800000) /* abs(x) < 1.0 */
{
if (ax == 0x00000000)
/* x is +zero or -zero; return the same zero */
return x;
else if (xneg) /* x < 0.0 */
return -1.0F;
else
return 0.0F;
}
else
{
rexp = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
/* Mask out the bits of r that we don't want */
mask = (1 << (EXPSHIFTBITS_SP32 - rexp)) - 1;
ur = (ux & ~mask);
PUT_BITS_SP32(ur, r);
if (xneg && (ux != ur))
/* We threw some bits away and x was negative */
return r - 1.0F;
else
return r;
}
}

View File

@@ -0,0 +1,39 @@
; StackAllocate size
; Prologue helper: lowers rsp by `size` bytes and emits the matching
; .ALLOCSTACK unwind directive. Expands to nothing when size is 0.
StackAllocate MACRO size
if size ne 0
sub rsp, size
.ALLOCSTACK size
endif
ENDM
; StackDeallocate size
; Epilogue counterpart of StackAllocate: raises rsp by `size` bytes.
; Expands to nothing when size is 0.
StackDeallocate MACRO size
if size ne 0
add rsp, size
endif
ENDM
; SaveReg reg64, offset
; Prologue helper: stores the 64-bit register at [rsp+offset] and emits
; the matching .SAVEREG unwind directive.
SaveReg MACRO reg64, offset
mov QWORD PTR [rsp+offset], reg64
.SAVEREG reg64, offset
ENDM
; RestoreReg reg64, offset
; Reloads the 64-bit register from [rsp+offset] (counterpart of SaveReg).
RestoreReg MACRO reg64, offset
mov reg64, QWORD PTR [rsp+offset]
ENDM
; SaveXmm xmmreg, offset
; Prologue helper: stores the full XMM register at [rsp+offset] with
; movdqa and emits the matching .SAVEXMM128 unwind directive.
SaveXmm MACRO xmmreg, offset
movdqa XMMWORD PTR [offset+rsp], xmmreg
.SAVEXMM128 xmmreg, offset
ENDM
; RestoreXmm xmmreg, offset
; Reloads the XMM register from [rsp+offset] with movdqa (counterpart of
; SaveXmm).
RestoreXmm MACRO xmmreg, offset
movdqa xmmreg, XMMWORD PTR [offset+rsp]
ENDM
; AVXSaveXmm xmmreg, offset
; Same as SaveXmm but uses the VEX-encoded vmovdqa for the store; also
; emits the .SAVEXMM128 unwind directive.
AVXSaveXmm MACRO xmmreg, offset
vmovdqa XMMWORD PTR [offset+rsp], xmmreg
.SAVEXMM128 xmmreg, offset
ENDM
; AVXRestoreXmm xmmreg, offset
; Same as RestoreXmm but uses the VEX-encoded vmovdqa for the load.
AVXRestoreXmm MACRO xmmreg, offset
vmovdqa xmmreg, XMMWORD PTR [offset+rsp]
ENDM

View File

@@ -0,0 +1,66 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifdef TEST_STANDALONE
#include <stdio.h>
#pragma section (".CRT$XIC",long,read)
typedef void (__cdecl *_PIFV)(void);
#else
#include <sect_attribs.h>
#include <windows.h>
#include <cruntime.h>
#include <internal.h>
#endif
#include <intrin.h>
#define _CRTALLOC(x) __declspec(allocate(x))
int __fma3_is_available = 0;
int __use_fma3_lib = 0;
/*
 * Turns use of the FMA3 code paths on (non-zero flag) or off (zero).
 * The request is ignored when __fma3_is_available is zero.
 * Returns the value of __use_fma3_lib after the call.
 */
int __cdecl _set_FMA3_enable(int flag)
{
    if (!__fma3_is_available)
        return __use_fma3_lib;

    __use_fma3_lib = flag;
    return flag;
}
int __fma3_lib_init(void);
_CRTALLOC(".CRT$XIC") static _PIFV init_fma3 = __fma3_lib_init;
/*
 * CRT start-up initializer (registered through init_fma3 in .CRT$XIC).
 * Probes the processor and sets __fma3_is_available and __use_fma3_lib.
 * Always returns 0.
 *
 * FMA3 instructions are VEX-encoded, so the CPUID feature bit alone is not
 * enough: the operating system must also have enabled the extended
 * register state. FMA3 is therefore reported only when
 *   - CPUID.1:ECX bit 12 (FMA) is set,
 *   - CPUID.1:ECX bit 27 (OSXSAVE) is set, so XGETBV may be executed, and
 *   - XCR0 bits 1 and 2 (SSE and AVX state) are both enabled.
 */
int __fma3_lib_init(void)
{
    int regs[4]; /* EAX, EBX, ECX, EDX; regs[2] is ECX */
    const int fma_bit = 1 << 12;
    const int osxsave_bit = 1 << 27;

    __fma3_is_available = 0;
    __cpuid(regs, 1);
    if ((regs[2] & fma_bit) && (regs[2] & osxsave_bit)) {
        /* XGETBV is only legal once OSXSAVE has been confirmed. */
        if ((_xgetbv(0) & 0x6) == 0x6) {
            __fma3_is_available = 1;
        }
    }
    __use_fma3_lib = __fma3_is_available;
    return 0;
}

View File

@@ -0,0 +1,160 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; $Workfile: fmod.asm $
; $Revision: 4 $
; $Date: 9/15/04 16:43 $
;
;
; This is an optimized version of fmod.
;
; Define _CRTBLD_C9X to make it compliant with C90 and on.
;
; If building the OS CRTL (_NTSUBSET_ defined), abort.
; .ERRDEF _NTSUBSET_, "x87 code cannot be used in kernel mode"
DOMAIN EQU 1 ; _DOMAIN
EDOM EQU 33 ; EDOM
FPCODEFMOD EQU 22 ; _FpCodeFmod
INVALID EQU 8 ; AMD_F_INVALID
FPIND EQU 0fff8000000000000h ; indefinite
FPSNAN EQU 07ff7ffffffffffffh ; SNAN
FPQNAN EQU 07fffffffffffffffh ; QNAN
X87SW RECORD X87SW_B: 1,
X87SW_C3: 1,
X87SW_TOP: 3,
X87SW_C: 3,
X87SW_ES: 1,
X87SW_SF: 1,
X87SW_PE: 1,
X87SW_E: 5
X87XAM EQU MASK X87SW_C3 OR MASK X87SW_C AND NOT (1 SHL (X87SW_C + 1))
X87XAM_INF EQU 5 SHL X87SW_C
X87XAM_NAN EQU 1 SHL X87SW_C
X87XAM_BAD EQU MASK X87SW_E AND NOT 2
EXTRN _handle_error: PROC ; float _handle_error (char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs)
.const
@fmodz DB "fmod", 0
.CODE
; double fmod [double, double] ----------------------------------
; double fmod(double x, double y)
; In:  xmm0 = x, xmm1 = y.   Out: xmm0 = result.
; The remainder is computed on the x87 unit with an FPREM loop; x87
; instructions are emitted as raw bytes (DB), with the mnemonic given in
; the trailing comment.
; Frame (72 bytes):
;   [rsp+0]   scratch for the x87 result
;   [rsp+8]   x87 status word after FXAM on Y
;   [rsp+16]  x87 status word after FXAM on X
;   [rsp+32]  stack arguments for _handle_error: flags, error,
;             arg1 (X, stored on entry), arg2 (Y, stored on entry), nargs
; ecx / edx hold the FXAM classification (masked with X87XAM) of X / Y.
fmod PROC FRAME
sub rsp, 40 + 32
.ALLOCSTACK 40 + 32
.ENDPROLOG
; Spill both operands; these slots double as arg1/arg2 of _handle_error.
movsd QWORD PTR 24 [rsp + 32], xmm1 ; Y
movsd QWORD PTR 16 [rsp + 32], xmm0 ; X
; Push Y, then X: st(0) = X, st(1) = Y.
DB 0ddh, 44h, 24h, 38h ; fld QWORD PTR 24 [rsp + 32]
DB 0ddh, 44h, 24h, 30h ; fld QWORD PTR 16 [rsp + 32]
DB 0d9h, 0e5h ; fxam (X)
DB 09bh, 0ddh, 07ch, 024h, 010h ; fstsw 16 [rsp]
movzx ecx, WORD PTR 16 [rsp]
and ecx, X87XAM
fnclex ; clear exception flags
; in preparation for fprem
@@:
DB 0d9h, 0f8h ; fprem
DB 09bh, 0dfh, 0e0h ; fstsw ax
; Bit (4 SHL X87SW_C) of the status word stays set while the reduction is
; incomplete.
test ax, 4 SHL X87SW_C
jnz @b ; do it again in case of partial result
; Pop the remainder into xmm0; st(0) is now Y.
DB 0ddh, 01ch, 024h ; fstp QWORD PTR [rsp]
movlpd xmm0, QWORD PTR [rsp] ; result
DB 0d9h, 0e5h ; fxam (Y)
DB 09bh, 0ddh, 07ch, 024h, 008h ; fstsw 8 [rsp]
movzx edx, WORD PTR 8 [rsp]
and edx, X87XAM
DB 0ddh, 0d8h ; fstp st(0)
cmp edx, X87XAM_NAN ; fmod (x, NAN) = QNAN
je @error
cmp ecx, X87XAM_NAN ; fmod (NAN, y) = QNAN
je @error
; eax still holds the status word read after the last fprem; any exception
; bit selected by X87XAM_BAD sends us to the error path.
and eax, X87XAM_BAD
jnz @raise ; handle error
IFNDEF _CRTBLD_C9X ; Not C90
cmp edx, X87XAM_INF ; fmod (x, infinity) = ???
je @raise
ELSE ; C90
; fmod (x, infinity) = x (as x87 already does)
ENDIF
@exit:
add rsp, 40 + 32
ret
ALIGN 16
; @raise: report with the INVALID flag and the indefinite NaN as the value.
@raise:
mov eax, INVALID ; raise exception
mov r8, FPIND
jmp @f
; @error: report without flags; the value is the fprem result in xmm0.
@error:
xor eax, eax ; no exception
movd r8, xmm0
jmp @f
; Common tail: eax = flags, r8 = value. arg1/arg2 are already in place at
; 16 [rsp + 32] and 24 [rsp + 32] from the stores at function entry.
@@:
lea rcx, [@fmodz] ; fname
mov edx, FPCODEFMOD ; opcode
; mov r8, INDEF ; value
mov r9d, DOMAIN ; type
mov DWORD PTR 0 [rsp + 32], eax ; flags
mov DWORD PTR 8 [rsp + 32], EDOM ; error
mov DWORD PTR 32 [rsp + 32], 2 ; nargs
call _handle_error ; (char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs)
DB 09bh, 0dbh, 0e2h ; fclex
jmp @exit
fmod ENDP
; ---------------------------------------------------------------
END

View File

@@ -0,0 +1,160 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; $Workfile: fmodf.asm $
; $Revision: 4 $
; $Date: 9/15/04 16:43 $
;
;
; This is an optimized version of fmodf.
;
; Define _CRTBLD_C9X to make it compliant with C90 and on.
;
; If building the OS CRTL (_NTSUBSET_ defined), abort.
.ERRDEF _NTSUBSET_, "x87 code cannot be used in kernel mode"
DOMAIN EQU 1 ; _DOMAIN
EDOM EQU 33 ; EDOM
FPCODEFMOD EQU 22 ; _FpCodeFmod
INVALID EQU 8 ; AMD_F_INVALID
FPIND EQU 0ffc00000h ; indefinite
FPSNAN EQU 07fbfffffh ; SNAN
FPQNAN EQU 07fffffffh ; QNAN
X87SW RECORD X87SW_B: 1,
X87SW_C3: 1,
X87SW_TOP: 3,
X87SW_C: 3,
X87SW_ES: 1,
X87SW_SF: 1,
X87SW_PE: 1,
X87SW_E: 5
X87XAM EQU MASK X87SW_C3 OR MASK X87SW_C AND NOT (1 SHL (X87SW_C + 1))
X87XAM_INF EQU 5 SHL X87SW_C
X87XAM_NAN EQU 1 SHL X87SW_C
X87XAM_BAD EQU MASK X87SW_E AND NOT 2
EXTRN _handle_errorf: PROC ; float _handle_error (char *fname, int opcode, unsigned long value, int type, int flags, int error, float arg1, float arg2, int nargs)
.CONST
@fmodfz DB "fmodf", 0
.CODE
; float fmodf [float, float] ------------------------------------
; float fmodf(float x, float y)
; In:  xmm0 = x, xmm1 = y.   Out: xmm0 = result.
; The remainder is computed on the x87 unit with an FPREM loop; x87
; instructions are emitted as raw bytes (DB), with the mnemonic given in
; the trailing comment.
; Frame (72 bytes):
;   [rsp+0]   scratch for the x87 result
;   [rsp+8]   x87 status word after FXAM on Y
;   [rsp+16]  x87 status word after FXAM on X
;   [rsp+32]  stack arguments for _handle_errorf: flags, error,
;             arg1 (X, stored on entry), arg2 (Y, stored on entry), nargs
; ecx / edx hold the FXAM classification (masked with X87XAM) of X / Y.
fmodf PROC FRAME
sub rsp, 40 + 32
.ALLOCSTACK 40 + 32
.ENDPROLOG
; Spill both operands; these slots double as arg1/arg2 of _handle_errorf.
movss DWORD PTR 24 [rsp + 32], xmm1
movss DWORD PTR 16 [rsp + 32], xmm0
; Push Y, then X: st(0) = X, st(1) = Y.
DB 0d9h, 44h, 24h, 38h ; fld DWORD PTR 24 [rsp + 32]
DB 0d9h, 44h, 24h, 30h ; fld DWORD PTR 16 [rsp + 32]
DB 0d9h, 0e5h ; fxam (X)
DB 09bh, 0ddh, 07ch, 024h, 010h ; fstsw 16 [rsp]
movzx ecx, WORD PTR 16 [rsp]
and ecx, X87XAM
fnclex ; clear exception flags
; in preparation for fprem
@@:
DB 0d9h, 0f8h ; fprem
DB 9bh, 0dfh, 0e0h ; fstsw ax
; Bit 00400h of the status word stays set while the reduction is
; incomplete.
test ax, 00400h
jnz @b ; do it again in case of partial result
; Pop the remainder into xmm0; st(0) is now Y.
DB 0d9h, 1ch, 24h ; fstp DWORD PTR [rsp]
movss xmm0, DWORD PTR [rsp] ; result
DB 0d9h, 0e5h ; fxam (Y)
DB 09bh, 0ddh, 07ch, 024h, 008h ; fstsw 8 [rsp]
movzx edx, WORD PTR 8 [rsp]
and edx, X87XAM
DB 0ddh, 0d8h ; fstp st(0)
cmp edx, X87XAM_NAN ; fmod (x, NAN) = QNAN
je @error
cmp ecx, X87XAM_NAN ; fmod (NAN, y) = QNAN
je @error
; eax still holds the status word read after the last fprem; any exception
; bit selected by X87XAM_BAD sends us to the error path.
and eax, X87XAM_BAD
jnz @raise ; handle error
IFNDEF _CRTBLD_C9X ; Not C90
cmp edx, X87XAM_INF ; fmod (x, infinity) = ???
je @raise
ELSE ; C90
; fmod (x, infinity) = x (as x87 already does)
ENDIF
@exit:
add rsp, 40 + 32
ret
ALIGN 16
; @raise: report with the INVALID flag and the indefinite NaN as the value.
@raise:
mov eax, INVALID ; raise exception
mov r8d, FPIND
jmp @f
; @error: report without flags; the value is the fprem result in xmm0.
@error:
xor eax, eax ; no exception
movd r8d, xmm0
jmp @f
; Common tail: eax = flags, r8d = value. arg1/arg2 are already in place at
; 16 [rsp + 32] and 24 [rsp + 32] from the stores at function entry.
@@:
lea rcx, [@fmodfz] ; fname
mov edx, FPCODEFMOD ; opcode
; mov r8d, [rsp] ; value
mov r9d, DOMAIN ; type
mov DWORD PTR 0 [rsp + 32], eax ; flags
mov DWORD PTR 8 [rsp + 32], EDOM ; error
mov DWORD PTR 32 [rsp + 32], 2 ; nargs
call _handle_errorf ; (char *fname, int opcode, unsigned long value, int type, int flags, int error, float arg1, float arg2, int nargs)
DB 9Bh, 0DBh, 0E2h ; fclex
jmp @exit
fmodf ENDP
; ---------------------------------------------------------------
END

View File

@@ -0,0 +1,198 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define FAST_BUT_GREATER_THAN_ONE_ULP /* Helps speed by trading off a little
accuracy */
#define USE_SCALEDOUBLE_1
#define USE_INFINITY_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_SCALEDOUBLE_1
#undef USE_INFINITY_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
/*
 * _hypot(x, y): returns sqrt(x*x + y*y) without intermediate overflow or
 * underflow.
 *
 *   - If either argument is infinite the result is +infinity, even when
 *     the other argument is NaN.
 *   - Otherwise a NaN argument yields NaN.
 *   - A result that overflows is reported through _handle_error
 *     (_OVERFLOW, ERANGE).
 *
 * The arguments are pre-scaled by 2**(+/-600) when their exponents are
 * extreme, and the result is scaled back with scaleDouble_1.
 *
 * All variables that hold IEEE bit patterns are `unsigned long long`; the
 * patterns and the exponent-adjust constants are 64 bits wide, and plain
 * `unsigned long` is only 32 bits on LLP64 (Windows) targets.
 */
double FN_PROTOTYPE(_hypot)(double x, double y)
{
  /* Returns sqrt(x*x + y*y) with no overflow or underflow unless
     the result warrants it */

  const double large = 1.79769313486231570815e+308; /* 0x7fefffffffffffff */

#ifdef FAST_BUT_GREATER_THAN_ONE_ULP
  double r, retval;
  unsigned long long xexp, yexp, ux, uy;
#else
  double u, r, retval, hx, tx, x2, hy, ty, y2, hs, ts;
  unsigned long long xexp, yexp, ux, uy, ut;
#endif
  int dexp, expadjust;

  /* ux, uy: bit patterns of |x| and |y|; xexp, yexp: biased exponents */
  GET_BITS_DP64(x, ux);
  ux &= ~SIGNBIT_DP64;
  GET_BITS_DP64(y, uy);
  uy &= ~SIGNBIT_DP64;
  xexp = (ux >> EXPSHIFTBITS_DP64);
  yexp = (uy >> EXPSHIFTBITS_DP64);

  if (xexp == BIASEDEMAX_DP64 + 1 || yexp == BIASEDEMAX_DP64 + 1)
    {
      /* One or both of the arguments are NaN or infinity. The
         result will also be NaN or infinity. */
      retval = x*x + y*y;
      if (((xexp == BIASEDEMAX_DP64 + 1) && !(ux & MANTBITS_DP64)) ||
          ((yexp == BIASEDEMAX_DP64 + 1) && !(uy & MANTBITS_DP64)))
        /* x or y is infinity. ISO C99 defines that we must
           return +infinity, even if the other argument is NaN.
           Note that the computation of x*x + y*y above will already
           have raised invalid if either x or y is a signalling NaN. */
        return infinity_with_flags(0);
      else
        /* One or both of x or y is NaN, and neither is infinity.
           Raise invalid if it's a signalling NaN */
        return retval;
    }

  /* Set x = abs(x) and y = abs(y) */
  PUT_BITS_DP64(ux, x);
  PUT_BITS_DP64(uy, y);

  /* The difference in exponents between x and y */
  dexp = (int)(xexp - yexp);
  expadjust = 0;

  if (ux == 0)
    /* x is zero */
    return y;
  else if (uy == 0)
    /* y is zero */
    return x;
  else if (dexp > MANTLENGTH_DP64 + 1 || dexp < -MANTLENGTH_DP64 - 1)
    /* One of x and y is insignificant compared to the other */
    return x + y; /* Raise inexact */
  else if (xexp > EXPBIAS_DP64 + 500 || yexp > EXPBIAS_DP64 + 500)
    {
      /* Danger of overflow; scale down by 2**600. */
      expadjust = 600;
      ux -= 0x2580000000000000;
      PUT_BITS_DP64(ux, x);
      uy -= 0x2580000000000000;
      PUT_BITS_DP64(uy, y);
    }
  else if (xexp < EXPBIAS_DP64 - 500 || yexp < EXPBIAS_DP64 - 500)
    {
      /* Danger of underflow; scale up by 2**600. */
      expadjust = -600;
      if (xexp == 0)
        {
          /* x is denormal - handle by adding 601 to the exponent
           and then subtracting a correction for the implicit bit */
          PUT_BITS_DP64(ux + 0x2590000000000000, x);
          x -= 9.23297861778573578076e-128; /* 0x2590000000000000 */
          GET_BITS_DP64(x, ux);
        }
      else
        {
          /* x is normal - just increase the exponent by 600 */
          ux += 0x2580000000000000;
          PUT_BITS_DP64(ux, x);
        }
      if (yexp == 0)
        {
          /* y is denormal - same treatment as for x above */
          PUT_BITS_DP64(uy + 0x2590000000000000, y);
          y -= 9.23297861778573578076e-128; /* 0x2590000000000000 */
          GET_BITS_DP64(y, uy);
        }
      else
        {
          uy += 0x2580000000000000;
          PUT_BITS_DP64(uy, y);
        }
    }


#ifdef FAST_BUT_GREATER_THAN_ONE_ULP
  /* Not awful, but results in accuracy loss larger than 1 ulp */
  r = x*x + y*y;
#else
  /* Slower but more accurate */

  /* Sort so that x is greater than y */
  if (x < y)
    {
      u = y;
      y = x;
      x = u;
      ut = ux;
      ux = uy;
      uy = ut;
    }

  /* Split x into hx and tx, head and tail */
  PUT_BITS_DP64(ux & 0xfffffffff8000000, hx);
  tx = x - hx;

  PUT_BITS_DP64(uy & 0xfffffffff8000000, hy);
  ty = y - hy;

  /* Compute r = x*x + y*y with extra precision */
  x2 = x*x;
  y2 = y*y;
  hs = x2 + y2;

  if (dexp == 0)
    /* We take most care when x and y have equal exponents,
       i.e. are almost the same size */
    ts = (((x2 - hs) + y2) +
          ((hx * hx - x2) + 2 * hx * tx) + tx * tx) +
      ((hy * hy - y2) + 2 * hy * ty) + ty * ty;
  else
    ts = (((x2 - hs) + y2) +
          ((hx * hx - x2) + 2 * hx * tx) + tx * tx);

  r = hs + ts;
#endif

  /* The sqrt can introduce another half ulp error. */
  /* VC++ intrinsic call */
  _mm_store_sd(&retval, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));

  /* If necessary scale the result back. This may lead to
     overflow but if so that's the correct result. */
  retval = scaleDouble_1(retval, expadjust);

  if (retval > large)
    /* The result overflowed. Deal with errno. */
    return _handle_error("_hypot", OP_HYPOT, PINFBITPATT_DP64, _OVERFLOW,
                         AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, y, 2);

  return retval;
}

View File

@@ -0,0 +1,99 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#ifdef USE_SOFTWARE_SQRT
#define USE_SQRTF_AMD_INLINE
#endif
#define USE_INFINITYF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#ifdef USE_SOFTWARE_SQRT
#undef USE_SQRTF_AMD_INLINE
#endif
#undef USE_INFINITYF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
float FN_PROTOTYPE(_hypotf)(float x, float y)
{
    /* Returns sqrt(x*x + y*y) with no overflow or underflow unless
       the result warrants it.

       x, y    : the two operands.
       returns : +infinity (from infinityf_with_flags) when either operand
                 is infinite; the propagated NaN when an operand is NaN and
                 neither is infinite; the value produced by _handle_errorf
                 when the result exceeds the largest finite float;
                 otherwise the double-precision result narrowed to float. */

    /* Do intermediate computations in double precision
       and use sqrt instruction from chip if available. */
    double dx = x, dy = y, dr, retval;

    /* The largest finite float, stored as a double */
    const double large = 3.40282346638528859812e+38; /* 0x47efffffe0000000 */

    unsigned long ux, uy, avx, avy;

    /* GET_BITS_DP64 assigns its first argument to a double before reading
       the bits, so avx/avy receive the bit patterns of x and y widened to
       double.  Clearing the sign bit leaves |x| and |y|. */
    GET_BITS_DP64(x, avx);
    avx &= ~SIGNBIT_DP64;
    GET_BITS_DP64(y, avy);
    avy &= ~SIGNBIT_DP64;

    /* Biased double-precision exponent fields of |x| and |y| */
    ux = (avx >> EXPSHIFTBITS_DP64);
    uy = (avy >> EXPSHIFTBITS_DP64);

    if (ux == BIASEDEMAX_DP64 + 1 || uy == BIASEDEMAX_DP64 + 1)
    {
        /* One or both of the arguments are NaN or infinity. The
           result will also be NaN or infinity. */
        retval = x*x + y*y;
        if (((ux == BIASEDEMAX_DP64 + 1) && !(avx & MANTBITS_DP64)) ||
            ((uy == BIASEDEMAX_DP64 + 1) && !(avy & MANTBITS_DP64)))
            /* x or y is infinity. ISO C99 defines that we must
               return +infinity, even if the other argument is NaN.
               Note that the computation of x*x + y*y above will already
               have raised invalid if either x or y is a signalling NaN. */
            return infinityf_with_flags(0);
        else
            /* One or both of x or y is NaN, and neither is infinity.
               Raise invalid if it's a signalling NaN */
            return (float)retval;
    }

    /* Sum of squares, formed in double precision from the widened operands */
    dr = (dx*dx + dy*dy);

    /* Tested with #ifdef to match the feature test used around the
       libm_inlines.h include at the top of this file. */
#ifdef USE_SOFTWARE_SQRT
    /* The sum of squares is held in dr; the previous code referenced an
       undeclared identifier here and could not compile.
       NOTE(review): sqrtf_amd_inline is declared outside this file; if it
       takes a float, dr can exceed the float range for large operands --
       confirm that this software path is adequate before enabling it. */
    retval = sqrtf_amd_inline(dr);
#else
    /* VC++ intrinsic call */
    _mm_store_sd(&retval, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&dr)));
#endif

    if (retval > large)
        /* The result does not fit in a float: report overflow */
        return _handle_errorf("_hypotf", OP_HYPOT, PINFBITPATT_SP32, _OVERFLOW,
                              AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, y, 2);
    else
        return (float)retval;
}

View File

@@ -0,0 +1,49 @@
/***********************************************************************************/
/** MIT License **/
/** ----------- **/
/** **/
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
/** **/
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
/** of this Software and associated documentaon files (the "Software"), to deal **/
/** in the Software without restriction, including without limitation the rights **/
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
/** copies of the Software, and to permit persons to whom the Software is **/
/** furnished to do so, subject to the following conditions: **/
/** **/
/** The above copyright notice and this permission notice shall be included in **/
/** all copies or substantial portions of the Software. **/
/** **/
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
/** THE SOFTWARE. **/
/***********************************************************************************/
/* Top-level libm header: function-name decoration, the _complex type and
   the argument-reduction helper prototype. */
#ifndef LIBM_AMD_H_INCLUDED
#define LIBM_AMD_H_INCLUDED 1
/* FN_PROTOTYPE(fname) expands to fname unchanged; every public entry point
   is declared through it so a name decoration could be applied in one place. */
#define FN_PROTOTYPE(fname) fname
#include <math.h>
#include <fpieee.h>
/* IS_64BIT is defined here (with no value) unless the build already set it. */
#ifndef IS_64BIT
#define IS_64BIT
#endif
/* Provide struct _complex only if no earlier header has defined it;
   _COMPLEX_DEFINED is the marker that records that. */
#ifndef _COMPLEX_DEFINED
struct _complex
{
double x, y; /* real and imaginary parts */
};
#define _COMPLEX_DEFINED
#endif
/* Shorthand for the struct type above. */
#define COMPLEX struct _complex
/* Defined elsewhere.  NOTE(review): judging by the name and the out
   parameters this reduces x modulo pi/2, writing the reduced value to *r, a
   correction term to *rr and a region index to *region -- confirm against
   the definition. */
extern void __remainder_piby2(double x, double *r, double *rr, int *region);
#endif /* LIBM_AMD_H_INCLUDED */

View File

@@ -0,0 +1,35 @@
/***********************************************************************************/
/** MIT License **/
/** ----------- **/
/** **/
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
/** **/
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
/** of this Software and associated documentaon files (the "Software"), to deal **/
/** in the Software without restriction, including without limitation the rights **/
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
/** copies of the Software, and to permit persons to whom the Software is **/
/** furnished to do so, subject to the following conditions: **/
/** **/
/** The above copyright notice and this permission notice shall be included in **/
/** all copies or substantial portions of the Software. **/
/** **/
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
/** THE SOFTWARE. **/
/***********************************************************************************/
/* errno support for the libm error paths. */
#ifndef LIBM_ERRNO_AMD_H_INCLUDED
#define LIBM_ERRNO_AMD_H_INCLUDED 1

#include <stdio.h>
#include <errno.h>

/* __set_errno(x): store x in errno.  Only defined when the environment does
   not already provide it.  The expansion is fully parenthesized so that the
   macro behaves as a single expression wherever it is used (for example as
   an operand of ==, or inside a comma expression); its value is the value
   stored. */
#ifndef __set_errno
#define __set_errno(x) (errno = (x))
#endif

#endif /* LIBM_ERRNO_AMD_H_INCLUDED */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,122 @@
/***********************************************************************************/
/** MIT License **/
/** ----------- **/
/** **/
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
/** **/
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
/** of this Software and associated documentaon files (the "Software"), to deal **/
/** in the Software without restriction, including without limitation the rights **/
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
/** copies of the Software, and to permit persons to whom the Software is **/
/** furnished to do so, subject to the following conditions: **/
/** **/
/** The above copyright notice and this permission notice shall be included in **/
/** all copies or substantial portions of the Software. **/
/** **/
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
/** THE SOFTWARE. **/
/***********************************************************************************/
#ifndef __LIBM_NEW_H__
#define __LIBM_NEW_H__
// Defines, protos, etc for *new* math funcs updated by AMD 11/2008
// Old files will continue to include libm_util.h, libm.h, libm_inlines.h
// until such time as these have all been refreshed w/ new versions.
// Scalar aliases, named after the width each one is expected to have.
typedef unsigned int       U32;
typedef unsigned long long U64;
typedef float              F32;
typedef double             F64;

// Overlay of one single-precision value and its raw bit pattern.
typedef union UT32_
{
    F32 f32;
    U32 u32;
} UT32;

// Overlay of one double-precision value with its raw bit pattern, and with
// the same storage viewed as two floats or two 32-bit words.
typedef union UT64_
{
    F64 f64;
    U64 u64;
    F32 f32[2];
    U32 u32[2];
} UT64;
// Single-precision bit-pattern constants (for use with the u32 view of UT32).
// Field masks: sign is bit 31, exponent is bits 30..23, mantissa is bits 22..0;
// QNAN_MASK_32 is the top mantissa bit.
#define SIGN_MASK_32 0x80000000
#define MANTISSA_MASK_32 0x007fffff
#define EXPONENT_MASK_32 0x7f800000
#define QNAN_MASK_32 0x00400000
// Complete encodings: all-ones exponent with zero mantissa (infinities) or
// with the top mantissa bit set (quiet NaNs).
#define INF_POS_32 0x7f800000
#define INF_NEG_32 0xff800000
#define QNAN_POS_32 0x7fc00000
#define QNAN_NEG_32 0xffc00000
// Same encoding as QNAN_NEG_32.
#define IND_32 0xffc00000
// The next three repeat EXPONENT_MASK_32, SIGN_MASK_32 and QNAN_MASK_32
// under a second name.
#define EXPONENT_FULL_32 0x7f800000
#define SIGN_SET_32 0x80000000
#define QNAN_SET_32 0x00400000
// Double-precision counterparts: sign is bit 63, mantissa is bits 51..0,
// QNAN_MASK_64 is the top mantissa bit (bit 51).
#define INF_POS_64 0x7ff0000000000000
#define INF_NEG_64 0xfff0000000000000
#define MANTISSA_MASK_64 0x000fffffffffffff
#define SIGN_MASK_64 0x8000000000000000
#define IND_64 0xfff8000000000000
#define QNAN_MASK_64 0x0008000000000000
// constants for 'flags' argument of _handle_error and _handle_errorf
// (one bit each, so they can be OR-ed together)
#define AMD_F_INEXACT 0x00000010
#define AMD_F_OVERFLOW 0x00000001
#define AMD_F_UNDERFLOW 0x00000002
#define AMD_F_DIVBYZERO 0x00000004
#define AMD_F_INVALID 0x00000008
// define the Microsoft specific error handling routine
// Note to maintainers:
// These prototypes may appear, at first glance, to differ from the versions
// declared in libm_inlines.h and defined in libm_error.c. The third
// parameter appears to have changed type from unsigned long to unsigned long
// long. In fact they are the same because in both of the aforementioned
// files, long has been #defined to __int64 in a most cowardly fashion. This
// disgusts me. The buck stops here. - MAS
//
// Parameters, as far as the declaration and the callers' argument lists
// show (the definition is not in this header):
//   fname      - name of the reporting function, e.g. "_hypotf"
//   opcode     - operation code identifying the function (OP_* value)
//   value      - a 64-bit bit pattern; callers pass encodings such as
//                +infinity here (presumably the result to return -- confirm)
//   type       - exception kind, e.g. _OVERFLOW
//   flags      - OR of AMD_F_* bits
//   error      - errno value, e.g. ERANGE
//   arg1, arg2 - the operands of the failing call
//   nargs      - how many of arg1/arg2 are meaningful
double _handle_error(
char *fname,
int opcode,
unsigned long long value,
int type,
int flags,
int error,
double arg1,
double arg2,
int nargs
);
// Single-precision variant: identical parameter list except that the
// operands and the return value are float.
float _handle_errorf(
char *fname,
int opcode,
unsigned long long value,
int type,
int flags,
int error,
float arg1,
float arg2,
int nargs
);
#endif // __LIBM_NEW_H__

View File

@@ -0,0 +1,150 @@
/***********************************************************************************/
/** MIT License **/
/** ----------- **/
/** **/
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
/** **/
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
/** of this Software and associated documentaon files (the "Software"), to deal **/
/** in the Software without restriction, including without limitation the rights **/
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
/** copies of the Software, and to permit persons to whom the Software is **/
/** furnished to do so, subject to the following conditions: **/
/** **/
/** The above copyright notice and this permission notice shall be included in **/
/** all copies or substantial portions of the Software. **/
/** **/
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
/** THE SOFTWARE. **/
/***********************************************************************************/
/* Bit-level helpers and IEEE format constants shared by the C sources. */
#ifndef LIBM_UTIL_AMD_H_INCLUDED
#define LIBM_UTIL_AMD_H_INCLUDED 1
/* Map the C99 keyword onto the compiler-specific spelling. */
#define inline __inline
/* From here on every occurrence of the token `long` in code that includes
   this header is replaced by __int64, so `unsigned long` means a 64-bit
   unsigned integer.  This affects all code that follows the include, not
   only this header. */
#undef long
#define long __int64
#include "emmintrin.h"
#include "float.h"
/* Compile-time verification that type long is the same size
as type double (i.e. we are really on a 64-bit machine) */
/* The array bound is 1 when the sizes match and -1 otherwise; a negative
   bound is ill-formed, so a mismatch stops compilation.  The function is
   only declared here. */
void check_long_against_double_size(int machine_is_64_bit[(sizeof(long) == sizeof(double))?1:-1]);
/* Definitions for double functions on 64 bit machines */
/* Field masks of a 64-bit double: sign = bit 63, exponent = bits 62..52,
   mantissa = bits 51..0. */
#define SIGNBIT_DP64 0x8000000000000000
#define EXPBITS_DP64 0x7ff0000000000000
#define MANTBITS_DP64 0x000fffffffffffff
/* Bit patterns of 1.0, 2.0 and 0.5 (zero mantissa, exponent field only) */
#define ONEEXPBITS_DP64 0x3ff0000000000000
#define TWOEXPBITS_DP64 0x4000000000000000
#define HALFEXPBITS_DP64 0x3fe0000000000000
/* Bit 52: the position just above the stored mantissa */
#define IMPBIT_DP64 0x0010000000000000
/* Special encodings: quiet NaN, the negative quiet NaN ("indefinite"),
   and the two infinities */
#define QNANBITPATT_DP64 0x7ff8000000000000
#define INDEFBITPATT_DP64 0xfff8000000000000
#define PINFBITPATT_DP64 0x7ff0000000000000
#define NINFBITPATT_DP64 0xfff0000000000000
/* Exponent parameters.  A biased exponent field of BIASEDEMAX_DP64 + 1
   (2047, all ones) is what the hypot routines test for to detect
   infinity/NaN. */
#define EXPBIAS_DP64 1023
#define EXPSHIFTBITS_DP64 52
#define BIASEDEMIN_DP64 1
#define EMIN_DP64 -1022
#define BIASEDEMAX_DP64 2046
#define EMAX_DP64 1023
#define LAMBDA_DP64 1.0e300
/* Significand length in bits, counting the implicit leading bit */
#define MANTLENGTH_DP64 53
#define BASEDIGITS_DP64 15
/* These definitions, used by float functions,
are for both 32 and 64 bit machines */
/* Field masks of a 32-bit float: sign = bit 31, exponent = bits 30..23,
   mantissa = bits 22..0. */
#define SIGNBIT_SP32 0x80000000
#define EXPBITS_SP32 0x7f800000
#define MANTBITS_SP32 0x007fffff
/* Bit patterns of 1.0f, 2.0f and 0.5f */
#define ONEEXPBITS_SP32 0x3f800000
#define TWOEXPBITS_SP32 0x40000000
#define HALFEXPBITS_SP32 0x3f000000
/* Bit 23: the position just above the stored mantissa */
#define IMPBIT_SP32 0x00800000
/* Special encodings: quiet NaN, negative quiet NaN ("indefinite"),
   and the two infinities */
#define QNANBITPATT_SP32 0x7fc00000
#define INDEFBITPATT_SP32 0xffc00000
#define PINFBITPATT_SP32 0x7f800000
#define NINFBITPATT_SP32 0xff800000
/* Exponent parameters, mirroring the *_DP64 set above */
#define EXPBIAS_SP32 127
#define EXPSHIFTBITS_SP32 23
#define BIASEDEMIN_SP32 1
#define EMIN_SP32 -126
#define BIASEDEMAX_SP32 254
#define EMAX_SP32 127
#define LAMBDA_SP32 1.0e30
/* Significand length in bits, counting the implicit leading bit */
#define MANTLENGTH_SP32 24
#define BASEDIGITS_SP32 7
/* Floating-point class codes, numbered 1..10: the two NaN kinds first,
   then the remaining classes ordered from negative infinity up to positive
   infinity.  NOTE(review): the routine that produces these codes is not
   part of this header. */
#define CLASS_SIGNALLING_NAN 1
#define CLASS_QUIET_NAN 2
#define CLASS_NEGATIVE_INFINITY 3
#define CLASS_NEGATIVE_NORMAL_NONZERO 4
#define CLASS_NEGATIVE_DENORMAL 5
#define CLASS_NEGATIVE_ZERO 6
#define CLASS_POSITIVE_ZERO 7
#define CLASS_POSITIVE_DENORMAL 8
#define CLASS_POSITIVE_NORMAL_NONZERO 9
#define CLASS_POSITIVE_INFINITY 10
/* Older accessors: reinterpret the storage of x through a pointer cast.
   x must be an lvalue, because its address is taken. */
#define OLD_BITS_SP32(x) (*((unsigned int *)&x))
#define OLD_BITS_DP64(x) (*((unsigned long *)&x))
/* Alternatives to the above functions which don't have
problems when using high optimization levels on gcc */
/* All four macros below expand to a brace-enclosed block, not an
   expression: they can only be used as statements, and the target (second)
   argument must be assignable.  The value goes through a volatile union.
   `unsigned long` is 64 bits wide here because this header defines long
   as __int64. */
/* GET_BITS_SP32(x, ux): ux = bit pattern of x, after x has been converted
   to float by the assignment into the union. */
#define GET_BITS_SP32(x, ux) \
{ \
volatile union {float f; unsigned int i;} _bitsy; \
_bitsy.f = (x); \
ux = _bitsy.i; \
}
/* PUT_BITS_SP32(ux, x): x = the float whose bit pattern is ux. */
#define PUT_BITS_SP32(ux, x) \
{ \
volatile union {float f; unsigned int i;} _bitsy; \
_bitsy.i = (ux); \
x = _bitsy.f; \
}
/* GET_BITS_DP64(x, ux): ux = bit pattern of x, after x has been converted
   to double by the assignment into the union (so a float argument yields
   the bits of its double-precision value). */
#define GET_BITS_DP64(x, ux) \
{ \
volatile union {double d; unsigned long i;} _bitsy; \
_bitsy.d = (x); \
ux = _bitsy.i; \
}
/* PUT_BITS_DP64(ux, x): x = the double whose bit pattern is ux. */
#define PUT_BITS_DP64(ux, x) \
{ \
volatile union {double d; unsigned long i;} _bitsy; \
_bitsy.i = (ux); \
x = _bitsy.d; \
}
/* Processor-dependent floating-point status flags */
/* One bit per exception so they can be OR-ed together, e.g.
   AMD_F_OVERFLOW | AMD_F_INEXACT as passed to _handle_errorf. */
#define AMD_F_OVERFLOW 0x00000001
#define AMD_F_UNDERFLOW 0x00000002
#define AMD_F_DIVBYZERO 0x00000004
#define AMD_F_INVALID 0x00000008
#define AMD_F_INEXACT 0x00000010
/* Processor-dependent floating-point precision-control flags */
#define AMD_F_EXTENDED 0x00000300
#define AMD_F_DOUBLE 0x00000200
#define AMD_F_SINGLE 0x00000000
/* Processor-dependent floating-point rounding-control flags */
/* The four modes occupy the two-bit field 0x00006000. */
#define AMD_F_RC_NEAREST 0x00000000
#define AMD_F_RC_DOWN 0x00002000
#define AMD_F_RC_UP 0x00004000
#define AMD_F_RC_ZERO 0x00006000
#endif /* LIBM_UTIL_AMD_H_INCLUDED */

View File

@@ -0,0 +1,557 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log.asm
;
; An implementation of the log libm function.
;
; Prototype:
;
; double log(double x);
;
;
; Algorithm:
;
; Based on:
; Ping-Tak Peter Tang
; "Table-driven implementation of the logarithm function in IEEE
; floating-point arithmetic"
; ACM Transactions on Mathematical Software (TOMS)
; Volume 16, Issue 4 (December 1990)
;
;
; x very close to 1.0 is handled differently, for x everywhere else
; a brief explanation is given below
;
; x = (2^m)*A
; x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
; x = (2^m)*2*(G/2+g/2)
; x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
;
; Y = (2^(-1))*(2^(-m))*(2^m)*A
; Now, range of Y is: 0.5 <= Y < 1
;
; F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
; Now, range of F is: 256 <= F <= 512
; F = F / 512
; Now, range of F is: 0.5 <= F <= 1
;
; f = -(Y-F), with (f <= 2^(-10))
;
; log(x) = m*log(2) + log(2) + log(F-f)
; log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
; log(x) = m*log(2) + log(2*F) + log(1-r)
;
; r = (f/F), with (r <= 2^(-9))
; r = f*(1/F) with (1/F) precomputed to avoid division
;
; log(x) = m*log(2) + log(G) - poly
;
; log(G) is precomputed
; poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5 + (r^6)/6
;
; log(2) and log(G) need to be maintained in extra precision
; to avoid losing precision in the calculations
;
; Read-only constant pool for log.
; Every 8-byte constant is followed by a zero quadword, giving each entry a
; full 16 bytes so that it can also be used as an XMMWORD operand.
.const
ALIGN 16
; --- special values and bit masks ---
__real_ninf DQ 0fff0000000000000h ; -inf
DQ 0000000000000000h
__real_inf DQ 7ff0000000000000h ; +inf
DQ 0000000000000000h
__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN
DQ 0000000000000000h
; top mantissa bit; OR-ed into a NaN input before it is handed to the
; special-case handler
__real_qnanbit DQ 0008000000000000h
DQ 0000000000000000h
; not referenced by the code in this file
__real_min_norm DQ 0010000000000000h
DQ 0000000000000000h
__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits
DQ 0000000000000000h
; integer 1023 (exponent bias), subtracted from the raw exponent field
__mask_1023 DQ 00000000000003ffh
DQ 0000000000000000h
; not referenced by the code in this file
__mask_001 DQ 0000000000000001h
DQ 0000000000000000h
; top 8 mantissa bits (bits 51..44): the table index before rounding
__mask_mant_all8 DQ 000ff00000000000h
DQ 0000000000000000h
; 9th mantissa bit (bit 43): doubled and added to round the index
__mask_mant9 DQ 0000080000000000h
DQ 0000000000000000h
; --- floating-point constants ---
__real_two DQ 4000000000000000h ; 2
DQ 0000000000000000h
__real_one DQ 3ff0000000000000h ; 1
DQ 0000000000000000h
; the next two are not referenced by the code in this file
__real_near_one_lt DQ 3fee000000000000h ; .9375
DQ 0000000000000000h
__real_near_one_gt DQ 3ff1000000000000h ; 1.0625
DQ 0000000000000000h
; OR-ed into mantissa bits to build a value in [0.5, 1)
__real_half DQ 3fe0000000000000h ; 1/2
DQ 0000000000000000h
; the next two are not referenced by the code in this file
__mask_100 DQ 0000000000000100h
DQ 0000000000000000h
__real_1_over_512 DQ 3f60000000000000h
DQ 0000000000000000h
; reciprocals 1/2 .. 1/6 used as coefficients of poly
__real_1_over_2 DQ 3fe0000000000000h
DQ 0000000000000000h
__real_1_over_3 DQ 3fd5555555555555h
DQ 0000000000000000h
__real_1_over_4 DQ 3fd0000000000000h
DQ 0000000000000000h
__real_1_over_5 DQ 3fc999999999999ah
DQ 0000000000000000h
__real_1_over_6 DQ 3fc5555555555555h
DQ 0000000000000000h
; the double -1023.0: xexp equals this when the raw exponent field is 0
; (zero or denormal input)
__mask_1023_f DQ 0c08ff80000000000h
DQ 0000000000000000h
; integer 2045, subtracted from the exponent field obtained after the
; denormal input has been renormalized
__mask_2045 DQ 00000000000007fdh
DQ 0000000000000000h
; |x - 1| below this value selects the near-one code path
__real_threshold DQ 3fb0000000000000h ; .0625
DQ 0000000000000000h
__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit
DQ 0000000000000000h
; coefficients of the near-one polynomial in u
__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
DQ 0000000000000000h
__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
DQ 0000000000000000h
__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
DQ 0000000000000000h
__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
DQ 0000000000000000h
; log(2) split into a leading part and a small tail
__real_log2_lead DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01
DQ 00000000000000000h
__real_log2_tail DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08
DQ 00000000000000000h
; these codes and the ones in the corresponding .c file have to match
; (loaded into r8d before calling the special-case handler)
__flag_x_zero DD 00000001
__flag_x_neg DD 00000002
__flag_x_nan DD 00000003
; Tables defined in another module, each indexed by the rounded 9-bit
; mantissa index: leading and tail parts of log(G), and 1/F.
EXTRN __log_256_lead:QWORD
EXTRN __log_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
; Non-zero selects the FMA3 code path at run time.
EXTRN __use_fma3_lib:DWORD
; Text macros naming this routine and its special-case handler, so the
; procedure body below is written in terms of fname / fname_special.
fname TEXTEQU <log>
fname_special TEXTEQU <_log_special>
; define local variable storage offsets
; save_xmm6:   frame offset of the xmm6 save slot (the 20h bytes below it
;              are left free for the callee of fname_special)
; dummy_space: not referenced by the code in this file
; stack_size:  total frame size; 58h plus the 8-byte return address is 60h,
;              a multiple of 16
save_xmm6 EQU 20h
dummy_space EQU 40h
stack_size EQU 58h
; NOTE(review): StackAllocate, SaveXmm, RestoreXmm, AVXRestoreXmm and
; StackDeallocate are not defined in this file; presumably they come from
; fm.inc -- confirm.
include fm.inc
; external function
; called as fname_special(xmm0 = x, xmm1 = result value, r8d = __flag_x_* code)
EXTERN fname_special:PROC
;-----------------------------------------------------------------------
; double log(double x)
;
; In:    xmm0 = x
; Out:   xmm0 = log(x); for zero, negative and NaN inputs the value left
;        in xmm0 by fname_special; +inf is returned unchanged
; Uses:  rax, rcx, rdx, r8, r9, xmm1-xmm5 as scratch; xmm6 is saved in the
;        prologue and restored on every exit
;
; Two implementations of the same algorithm follow: Llog_sse2 and
; Llog_fma3, selected at run time by __use_fma3_lib.
;-----------------------------------------------------------------------
.code
ALIGN 16
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
SaveXmm xmm6, save_xmm6
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Llog_fma3
;---------------------------- SSE2 path --------------------------------
Llog_sse2:
; compute exponent part
movdqa xmm3, xmm0
movapd xmm4, xmm0
psrlq xmm3, 52
movd rax, xmm0 ; rax <-- raw bits of x
psubq xmm3, XMMWORD PTR __mask_1023 ; xmm3 <-- exponent field - 1023
; NaN or inf
mov rcx, rax
btr rcx, 63 ; rcx <-- bits of |x|
cmp rcx, QWORD PTR __real_inf
jae __x_is_inf_or_nan ; unsigned compare: |x| bits >= +inf bits
movdqa xmm2, xmm0
cvtdq2pd xmm6, xmm3 ; xexp
pand xmm2, XMMWORD PTR __real_mant ; xmm2 <-- mantissa bits of x
subsd xmm4, QWORD PTR __real_one ; xmm4 <-- x - 1
; xexp == -1023.0 means the exponent field was 0 (zero or denormal)
comisd xmm6, QWORD PTR __mask_1023_f
je __denormal_adjust
__continue_common:
andpd xmm4, XMMWORD PTR __real_notsign ; xmm4 <-- |x - 1|
; compute index into the log tables
; rax <-- (top 8 mantissa bits) + 2*(9th mantissa bit), i.e. the top 8
; bits rounded using the 9th; still positioned at bit 44
mov r9, rax
and rax, QWORD PTR __mask_mant_all8
and r9, QWORD PTR __mask_mant9
shl r9, 1
add rax, r9
movd xmm1, rax
; near one codepath
comisd xmm4, QWORD PTR __real_threshold
jb __near_one
; F, Y
shr rax, 44 ; rax <-- table index
por xmm2, XMMWORD PTR __real_half ; xmm2 <-- Y
por xmm1, XMMWORD PTR __real_half ; xmm1 <-- F
lea r9, __log_F_inv_qword
; check for negative numbers or zero
xorpd xmm5, xmm5
comisd xmm0, xmm5
jbe __x_is_zero_or_neg ; flags stay live for the jne at the target
; f = F - Y, r = f * inv
subsd xmm1, xmm2 ; xmm1 <-- f = F - Y
mulsd xmm1, QWORD PTR [r9+rax*8] ; xmm1 <-- r = f * inv
movapd xmm2, xmm1 ; xmm2 <-- copy of r
movapd xmm0, xmm1 ; xmm0 <-- copy of r
lea r9, QWORD PTR __log_256_lead
; poly
movsd xmm3, QWORD PTR __real_1_over_6
movsd xmm1, QWORD PTR __real_1_over_3
mulsd xmm3, xmm2 ; xmm3 <-- r/6
mulsd xmm1, xmm2 ; xmm1 <-- r/3
mulsd xmm0, xmm2 ; xmm0 <-- r*r
movapd xmm4, xmm0 ; xmm4 <-- copy of r*r
addsd xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5
addsd xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2
mulsd xmm4, xmm0 ; xmm4 <-- r^4
mulsd xmm3, xmm2 ; xmm3 <-- (r/6 + 1/5)*r
mulsd xmm1, xmm0 ; xmm1 <-- (r/3 + 1/2)*r^2
addsd xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4
addsd xmm1, xmm2 ; xmm1 <-- (r/3 + 1/2)*r^2 + r
mulsd xmm3, xmm4 ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
addsd xmm1, xmm3 ; xmm1 <-- poly
; m*log(2)_tail + log(G)_tail - poly
movsd xmm5, QWORD PTR __real_log2_tail
mulsd xmm5, xmm6 ; xmm5 <-- m*log2_tail
subsd xmm5, xmm1 ; xmm5 <-- m*log2_tail - poly
movsd xmm0, QWORD PTR [r9+rax*8] ; xmm0 <-- log(G)_lead
lea rdx, QWORD PTR __log_256_tail
movsd xmm2, QWORD PTR [rdx+rax*8] ; xmm2 <-- log(G)_tail
addsd xmm2, xmm5 ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail
movsd xmm4, QWORD PTR __real_log2_lead
mulsd xmm4, xmm6 ; xmm4 <-- m*log2_lead
addsd xmm0, xmm4 ; xmm0 <-- m*log2_lead + log(G)_lead
addsd xmm0, xmm2 ; xmm0 <-- result = lead sum + (m*log2_tail + log(G)_tail - poly)
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; |x - 1| < 0.0625.  With r = x - 1 and u = 2r/(2+r) the result is
;   r + (ca1 + ca2*u^2)*u^3 + (ca3 + ca4*u^2)*u^7 - correction,
; where correction = r*(u/2).  xmm6 is reused as scratch here (xexp is not
; needed) and restored before returning.
__near_one:
; r = x - 1.0
movsd xmm2, QWORD PTR __real_two
subsd xmm0, QWORD PTR __real_one ; r
addsd xmm2, xmm0 ; 2 + r
movsd xmm1, xmm0
divsd xmm1, xmm2 ; r/(2+r) = u/2
movsd xmm4, QWORD PTR __real_ca2
movsd xmm5, QWORD PTR __real_ca4
movsd xmm6, xmm0
mulsd xmm6, xmm1 ; correction
addsd xmm1, xmm1 ; u
movsd xmm2, xmm1
mulsd xmm2, xmm1 ; u^2
mulsd xmm4, xmm2 ; ca2*u^2
mulsd xmm5, xmm2 ; ca4*u^2
addsd xmm4, __real_ca1 ; ca1 + ca2*u^2
addsd xmm5, __real_ca3 ; ca3 + ca4*u^2
mulsd xmm2, xmm1 ; u^3
mulsd xmm4, xmm2 ; (ca1 + ca2*u^2)*u^3
mulsd xmm2, xmm2 ; u^6
mulsd xmm2, xmm1 ; u^7
mulsd xmm5, xmm2 ; (ca3 + ca4*u^2)*u^7
addsd xmm4, xmm5
subsd xmm4, xmm6 ; subtract the correction
addsd xmm0, xmm4 ; r + the rest
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; Exponent field was 0.  The mantissa bits are given the exponent of 1.0
; and 1.0 is subtracted, which yields the same mantissa as a normalized
; double.  Its mantissa bits replace rax/xmm2, and xexp is recomputed from
; its exponent field minus 2045.
__denormal_adjust:
por xmm2, XMMWORD PTR __real_one
subsd xmm2, QWORD PTR __real_one
movsd xmm5, xmm2
pand xmm2, XMMWORD PTR __real_mant
movd rax, xmm2
psrlq xmm5, 52
psubd xmm5, XMMWORD PTR __mask_2045
cvtdq2pd xmm6, xmm5
jmp __continue_common
ALIGN 16
; Entered with the flags of "comisd x, 0" still set: equal means x is zero,
; otherwise x is negative.
__x_is_zero_or_neg:
jne __x_is_neg
movsd xmm1, QWORD PTR __real_ninf
mov r8d, DWORD PTR __flag_x_zero
call fname_special
jmp __finish
ALIGN 16
__x_is_neg:
movsd xmm1, QWORD PTR __real_neg_qnan
mov r8d, DWORD PTR __flag_x_neg
call fname_special
jmp __finish
ALIGN 16
; rax still holds the raw bits of x: +inf is returned as is, -inf is
; treated like any negative input, and a NaN is passed on with its quiet
; bit set.
__x_is_inf_or_nan:
cmp rax, QWORD PTR __real_inf
je __finish
cmp rax, QWORD PTR __real_ninf
je __x_is_neg
or rax, QWORD PTR __real_qnanbit
movd xmm1, rax
mov r8d, DWORD PTR __flag_x_nan
call fname_special
jmp __finish
ALIGN 16
__finish:
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
;---------------------------- FMA3 path --------------------------------
; Same algorithm using VEX-encoded instructions and fused multiply-adds.
; Unlike the SSE2 path, the zero/negative test is done before the table
; index is computed.
Llog_fma3:
; compute exponent part
xor rax,rax
vpsrlq xmm3,xmm0,52
vmovq rax,xmm0 ; rax <-- raw bits of x
vpsubq xmm3,xmm3,QWORD PTR __mask_1023
vcvtdq2pd xmm6,xmm3 ; xexp
; NaN or inf
; exponent field all ones <=> (x AND inf bits) == inf bits
vpand xmm5,xmm0,QWORD PTR __real_inf
vcomisd xmm5,QWORD PTR __real_inf
je Llog_fma3_x_is_inf_or_nan
; check for negative numbers or zero
vpxor xmm5,xmm5,xmm5
vcomisd xmm0,xmm5
jbe Llog_fma3_x_is_zero_or_neg ; flags stay live for the jne at the target
vpand xmm2,xmm0,QWORD PTR __real_mant ; xmm2 <-- mantissa bits of x
vsubsd xmm4,xmm0,QWORD PTR __real_one ; xmm4 <-- x - 1
; xexp == -1023.0 means the exponent field was 0 (denormal here, since
; zero has already been dispatched)
vcomisd xmm6,QWORD PTR __mask_1023_f
je Llog_fma3_denormal_adjust
Llog_fma3_continue_common:
; compute index into the log tables
vpand xmm1,xmm0,QWORD PTR __mask_mant_all8
vpand xmm3,xmm0,QWORD PTR __mask_mant9
vpsllq xmm3,xmm3,1
vpaddq xmm1,xmm3,xmm1
vmovq rax,xmm1
; near one codepath
vpand xmm4,xmm4,QWORD PTR __real_notsign ; xmm4 <-- |x - 1|
vcomisd xmm4,QWORD PTR __real_threshold
jb Llog_fma3_near_one
; F,Y
shr rax,44 ; rax <-- table index
vpor xmm2,xmm2,QWORD PTR __real_half ; xmm2 <-- Y
vpor xmm1,xmm1,QWORD PTR __real_half ; xmm1 <-- F
lea r9,QWORD PTR __log_F_inv_qword
; f = F - Y,r = f * inv
vsubsd xmm1,xmm1,xmm2
vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8]
lea r9,QWORD PTR __log_256_lead
; poly
vmulsd xmm0,xmm1,xmm1 ; r*r
vmovsd xmm3,QWORD PTR __real_1_over_6
vmovsd xmm5,QWORD PTR __real_1_over_3
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3
vmovsd xmm4,xmm0,xmm0 ; overwritten by the vmulsd two lines below
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; (r/6 + 1/5)*r + 1/4
vmulsd xmm4,xmm0,xmm0 ; r*r*r*r
vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r
vfmadd231sd xmm1,xmm3,xmm4 ; xmm1 <-- poly (adds ((r/6+1/5)*r+1/4)*r^4)
; m*log(2) + log(G) - poly
vmovsd xmm5,QWORD PTR __real_log2_tail
vfmsub213sd xmm5,xmm6,xmm1 ; xmm5 <-- m*log2_tail - poly
vmovsd xmm0,QWORD PTR[r9 + rax * 8] ; xmm0 <-- log(G)_lead
lea rdx,QWORD PTR __log_256_tail
vmovsd xmm1,QWORD PTR[rdx + rax * 8] ; xmm1 <-- log(G)_tail
vaddsd xmm1,xmm1,xmm5 ; xmm1 <-- log(G)_tail + m*log2_tail - poly
vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead ; xmm0 <-- m*log2_lead + log(G)_lead
vaddsd xmm0,xmm0,xmm1 ; xmm0 <-- result
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; Same near-one formula as __near_one; the correction term is kept in xmm3
; here, so xmm6 is left untouched.
Llog_fma3_near_one:
; r = x - 1.0
vmovsd xmm3,QWORD PTR __real_two
vsubsd xmm0,xmm0,QWORD PTR __real_one ; r
vaddsd xmm3,xmm3,xmm0 ; 2 + r
vdivsd xmm1,xmm0,xmm3 ; r/(2+r) = u/2
vmovsd xmm4,QWORD PTR __real_ca2
vmovsd xmm5,QWORD PTR __real_ca4
vmulsd xmm3,xmm0,xmm1 ; correction
vaddsd xmm1,xmm1,xmm1 ; u
vmulsd xmm2,xmm1,xmm1 ; u^2
vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 ; ca2*u^2 + ca1
vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 ; ca4*u^2 + ca3
vmulsd xmm2,xmm2,xmm1 ; u^3
vmulsd xmm4,xmm4,xmm2 ; (ca1 + ca2*u^2)*u^3
vmulsd xmm2,xmm2,xmm2 ; u^6
vmulsd xmm2,xmm2,xmm1 ; u^7
vfmadd231sd xmm4,xmm5,xmm2 ; add (ca3 + ca4*u^2)*u^7
vsubsd xmm4,xmm4,xmm3 ; subtract the correction
vaddsd xmm0,xmm0,xmm4 ; r + the rest
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
; Denormal input: same renormalization as __denormal_adjust.  The mantissa
; bits of the renormalized value replace xmm0 (instead of rax), because
; this path derives the table index from xmm0.
Llog_fma3_denormal_adjust:
vpor xmm2,xmm2,QWORD PTR __real_one
vsubsd xmm2,xmm2,QWORD PTR __real_one
vpsrlq xmm5,xmm2,52
vpand xmm2,xmm2,QWORD PTR __real_mant
vmovapd xmm0,xmm2
vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045
vcvtdq2pd xmm6,xmm5
jmp Llog_fma3_continue_common
ALIGN 16
; Entered with the flags of "vcomisd x, 0" still set: equal means zero.
Llog_fma3_x_is_zero_or_neg:
jne Llog_fma3_x_is_neg
vmovsd xmm1,QWORD PTR __real_ninf
mov r8d,DWORD PTR __flag_x_zero
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog_fma3_x_is_neg:
vmovsd xmm1,QWORD PTR __real_neg_qnan
mov r8d,DWORD PTR __flag_x_neg
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; rax still holds the raw bits of x: +inf is returned as is, -inf is
; treated like any negative input, and a NaN is passed on with its quiet
; bit set.
Llog_fma3_x_is_inf_or_nan:
cmp rax,QWORD PTR __real_inf
je Llog_fma3_finish
cmp rax,QWORD PTR __real_ninf
je Llog_fma3_x_is_neg
or rax,QWORD PTR __real_qnanbit
vmovq xmm1,rax
mov r8d,DWORD PTR __flag_x_nan
call fname_special
; NOTE(review): there is no jump here -- after the call returns, control
; runs through whatever the ALIGN below emits and into Llog_fma3_finish.
; The SSE2 twin uses an explicit "jmp __finish" at this point.  This
; presumably relies on the assembler filling code alignment with no-ops --
; confirm.
ALIGN 16
Llog_fma3_finish:
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
fname endp
END

View File

@@ -0,0 +1,565 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log10.asm
;
; An implementation of the log10 libm function.
;
; Prototype:
;
; double log10(double x);
;
;
; Algorithm:
; Similar to the one presented in log.asm
;
; Read-only constants for log10.
; Every quadword constant is followed by a zero quadword, so each label
; starts a 16-byte slot. The same label can therefore be used as a scalar
; (QWORD) operand or as a full 128-bit operand of the packed instructions,
; and stays 16-byte aligned as the legacy-SSE memory operands require.
.const
ALIGN 16
__real_ninf DQ 0fff0000000000000h ; -inf
DQ 0000000000000000h
__real_inf DQ 7ff0000000000000h ; +inf
DQ 0000000000000000h
__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN
DQ 0000000000000000h
__real_qnanbit DQ 0008000000000000h ; top mantissa bit; OR-ed into a NaN input
DQ 0000000000000000h
__int_1023 DQ 00000000000003ffh ; exponent bias of a double
DQ 0000000000000000h
__mask_001 DQ 0000000000000001h ; not referenced in this file
DQ 0000000000000000h
__mask_mant DQ 000FFFFFFFFFFFFFh ; mask for mantissa bits
DQ 0000000000000000h
__mask_mant_top8 DQ 000ff00000000000h ; mask for top 8 mantissa bits
DQ 0000000000000000h
__mask_mant9 DQ 0000080000000000h ; mask for 9th mantissa bit
DQ 0000000000000000h
__real_log10_e DQ 3fdbcb7b1526e50eh ; log10(e) as a single double
DQ 0000000000000000h
__real_log10_e_lead DQ 3fdbcb7800000000h ; log10e_lead 4.34293746948242187500e-01
DQ 0000000000000000h
__real_log10_e_tail DQ 3ea8a93728719535h ; log10e_tail 7.3495500964015109100644e-7
DQ 0000000000000000h
__real_log10_2_lead DQ 3fd3441350000000h ; leading bits of log10(2) (low 28 mantissa bits are zero)
DQ 0000000000000000h
__real_log10_2_tail DQ 3e03ef3fde623e25h ; remainder: log10(2) - __real_log10_2_lead
DQ 0000000000000000h
__real_two DQ 4000000000000000h ; 2
DQ 0000000000000000h
__real_one DQ 3ff0000000000000h ; 1
DQ 0000000000000000h
__real_half DQ 3fe0000000000000h ; 1/2
DQ 0000000000000000h
__mask_100 DQ 0000000000000100h ; not referenced in this file
DQ 0000000000000000h
__real_1_over_512 DQ 3f60000000000000h ; not referenced in this file
DQ 0000000000000000h
__real_1_over_2 DQ 3fe0000000000000h ; same value as __real_half; used by the SSE2 polynomial
DQ 0000000000000000h
__real_1_over_3 DQ 3fd5555555555555h
DQ 0000000000000000h
__real_1_over_4 DQ 3fd0000000000000h
DQ 0000000000000000h
__real_1_over_5 DQ 3fc999999999999ah
DQ 0000000000000000h
__real_1_over_6 DQ 3fc5555555555555h
DQ 0000000000000000h
__real_neg_1023 DQ 0c08ff80000000000h ; -1023.0: converted exponent of a zero or subnormal input
DQ 0000000000000000h
__mask_2045 DQ 00000000000007fdh ; 2045 = 1023 + 1022, exponent rebias on the denormal path
DQ 0000000000000000h
__real_threshold DQ 3fb0000000000000h ; .0625
DQ 0000000000000000h
__real_near_one_lt DQ 3fee000000000000h ; .9375
DQ 0000000000000000h
__real_near_one_gt DQ 3ff1000000000000h ; 1.0625
DQ 0000000000000000h
__real_min_norm DQ 0010000000000000h ; not referenced in this file
DQ 0000000000000000h
__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit
DQ 0000000000000000h
; coefficients of the near-one polynomial (multiply u^3, u^5, u^7, u^9)
__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
DQ 0000000000000000h
__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
DQ 0000000000000000h
__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
DQ 0000000000000000h
__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
DQ 0000000000000000h
__mask_lower DQ 0ffffffff00000000h ; despite the name: keeps the upper 32 bits, clears the lower 32
DQ 0000000000000000h
; these codes and the ones in the corresponding .c file have to match
; (loaded into r8d before each call to fname_special)
__flag_x_zero DD 00000001
__flag_x_neg DD 00000002
__flag_x_nan DD 00000003
; tables defined in other files, indexed with [base + index*8]
EXTRN __log10_256_lead:QWORD
EXTRN __log10_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
; non-zero selects the FMA3 code path at function entry
EXTRN __use_fma3_lib:DWORD
; local variable storage offsets
; (passed to the StackAllocate / SaveXmm / RestoreXmm macros from fm.inc)
save_xmm6 EQU 20h
dummy_space EQU 30h ; not referenced in this file
stack_size EQU 058h
include fm.inc
fname TEXTEQU <log10>
fname_special TEXTEQU <_log10_special>
EXTERN fname_special:PROC
.code
;-----------------------------------------------------------------------
; double log10(double x)
; In:   xmm0 = x
; Out:  xmm0 = result
; Uses: rax, rdx, r8, r9, xmm1-xmm5; xmm6 is saved in the prologue and
;       restored on every exit path.
; Special inputs:
;   +inf          -> returned unchanged
;   NaN           -> fname_special(flag __flag_x_nan, xmm1 = x with qNaN bit set)
;   +/-0          -> fname_special(flag __flag_x_zero, xmm1 = -inf)
;   x < 0, -inf   -> fname_special(flag __flag_x_neg,  xmm1 = negative qNaN)
; StackAllocate / SaveXmm come from fm.inc (not visible here); presumably
; they reserve stack_size bytes, spill xmm6 at offset save_xmm6 and emit the
; matching unwind data for PROC FRAME.
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
SaveXmm xmm6, save_xmm6
.ENDPROLOG
; run-time dispatch: non-zero __use_fma3_lib selects the FMA3 implementation
cmp DWORD PTR __use_fma3_lib, 0
jne Llog10_fma3
; SSE2 implementation, main (table-driven) path.
; Register roles:
;   rax  = raw bits of x, later the table index j
;   xmm6 = unbiased exponent m as a double
;   xmm2 = mantissa bits of x, later Y
;   xmm4 = x - 1.0, later |x - 1.0| (selects the near-one path)
Llog10_sse2:
; compute exponent part
movapd xmm3, xmm0
movapd xmm4, xmm0
psrlq xmm3, 52 ; sign and exponent field (sign is bit 11 here)
movd rax, xmm0
psubq xmm3, XMMWORD PTR __int_1023 ; xmm3 <-- unbiased exponent
; NaN or inf
movapd xmm5, xmm0
andpd xmm5, XMMWORD PTR __real_inf ; isolate the exponent field
comisd xmm5, QWORD PTR __real_inf ; all ones?
je Llog10_sse2_x_is_inf_or_nan
movapd xmm2, xmm0
cvtdq2pd xmm6, xmm3 ; xmm6 <-- unbiased exp as double
pand xmm2, XMMWORD PTR __mask_mant
subsd xmm4, QWORD PTR __real_one ; x - 1.0
; exponent field zero (x is +0 or a positive subnormal)?
comisd xmm6, QWORD PTR __real_neg_1023
je Llog10_sse2_denormal_adjust
Llog10_sse2_continue_common:
andpd xmm4, XMMWORD PTR __real_notsign ; |x - 1.0|
; compute index into the log tables
; top 8 mantissa bits plus the 9th bit shifted up one place, i.e. the
; 8-bit field rounded to nearest; the carry may reach bit 52
mov r9, rax
and rax, QWORD PTR __mask_mant_top8
and r9, QWORD PTR __mask_mant9
shl r9, 1
add rax, r9
movd xmm1, rax
; near one codepath
comisd xmm4, QWORD PTR __real_threshold
jb Llog10_sse2_near_one
; F, Y
shr rax, 44 ; table index j, 0..256
por xmm2, XMMWORD PTR __real_half ; Y = mantissa with the exponent of 0.5
por xmm1, XMMWORD PTR __real_half ; F = rounded mantissa with the exponent of 0.5
lea r9, QWORD PTR __log_F_inv_qword
; check for negative numbers or zero
; (the flags set here are still live when the handler executes its jne)
xorpd xmm5, xmm5
comisd xmm0, xmm5
jbe Llog10_sse2_x_is_zero_or_neg
; f = F - Y, r = f * inv
subsd xmm1, xmm2
mulsd xmm1, QWORD PTR [r9+rax*8] ; inv = __log_F_inv_qword[j] (presumably 1/F; table not visible here)
movapd xmm2, xmm1 ; xmm2 = r
movapd xmm0, xmm1
lea r9, QWORD PTR __log10_256_lead
; poly
; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6
movsd xmm3, QWORD PTR __real_1_over_6
movsd xmm1, QWORD PTR __real_1_over_3
mulsd xmm3, xmm2 ; r/6
mulsd xmm1, xmm2 ; r/3
mulsd xmm0, xmm2 ; r^2
movapd xmm4, xmm0
addsd xmm3, QWORD PTR __real_1_over_5 ; 1/5 + r/6
addsd xmm1, QWORD PTR __real_1_over_2 ; 1/2 + r/3
mulsd xmm4, xmm0 ; r^4
mulsd xmm3, xmm2 ; r/5 + r^2/6
mulsd xmm1, xmm0 ; r^2/2 + r^3/3
addsd xmm3, QWORD PTR __real_1_over_4 ; 1/4 + r/5 + r^2/6
addsd xmm1, xmm2 ; r + r^2/2 + r^3/3
mulsd xmm3, xmm4 ; r^4/4 + r^5/5 + r^6/6
addsd xmm1, xmm3 ; poly
movsd xmm5, QWORD PTR __real_log10_2_tail
mulsd xmm1, QWORD PTR __real_log10_e ; poly * log10(e)
; m*log10(2) + table[j] - poly*log10(e), with log10(2) and the table entry
; each split into lead + tail; the small terms are summed first
mulsd xmm5, xmm6 ; m * log10_2_tail
subsd xmm5, xmm1 ; m * log10_2_tail - poly*log10(e)
movsd xmm0, QWORD PTR [r9+rax*8] ; __log10_256_lead[j]
lea rdx, QWORD PTR __log10_256_tail
movsd xmm2, QWORD PTR [rdx+rax*8] ; __log10_256_tail[j]
movsd xmm4, QWORD PTR __real_log10_2_lead
mulsd xmm4, xmm6 ; m * log10_2_lead
addsd xmm0, xmm4 ; sum of the large terms
addsd xmm2, xmm5 ; sum of the small terms
addsd xmm0, xmm2
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; SSE2 near-one path, taken when |x - 1| < __real_threshold.
; With r = x - 1 and u = 2r/(2+r):
;   q      = ca1*u^3 + ca2*u^5 + ca3*u^7 + ca4*u^9 - r*u/2
;   r_hi   = r with its low 32 bits cleared, r_lo = r - r_hi
;   t      = q + r_lo
;   result = r_hi*e_tail + t*e_tail + t*e_lead + r_hi*e_lead
; where e_lead + e_tail is the split constant log10(e).
; xmm6 is used as scratch here and reloaded from its save slot on exit.
Llog10_sse2_near_one:
; r = x - 1.0
movsd xmm2, QWORD PTR __real_two
subsd xmm0, QWORD PTR __real_one ; r
addsd xmm2, xmm0 ; 2 + r
movapd xmm1, xmm0
divsd xmm1, xmm2 ; r/(2+r) = u/2
movsd xmm4, QWORD PTR __real_ca2
movsd xmm5, QWORD PTR __real_ca4
movapd xmm6, xmm0
mulsd xmm6, xmm1 ; correction
addsd xmm1, xmm1 ; u
movapd xmm2, xmm1
mulsd xmm2, xmm1 ; u^2
mulsd xmm4, xmm2 ; ca2*u^2
mulsd xmm5, xmm2 ; ca4*u^2
addsd xmm4, QWORD PTR __real_ca1 ; ca1 + ca2*u^2
addsd xmm5, QWORD PTR __real_ca3 ; ca3 + ca4*u^2
mulsd xmm2, xmm1 ; u^3
mulsd xmm4, xmm2 ; ca1*u^3 + ca2*u^5
mulsd xmm2, xmm2 ; u^6
mulsd xmm2, xmm1 ; u^7
mulsd xmm5, xmm2 ; ca3*u^7 + ca4*u^9
movsd xmm2, QWORD PTR __real_log10_e_tail
addsd xmm4, xmm5 ; full polynomial
subsd xmm4, xmm6 ; q = polynomial - correction
movsd xmm6, QWORD PTR __real_log10_e_lead
movapd xmm3, xmm0
pand xmm3, XMMWORD PTR __mask_lower ; r_hi
subsd xmm0, xmm3 ; r_lo
addsd xmm4, xmm0 ; t = q + r_lo
movapd xmm0, xmm3
movapd xmm1, xmm4
mulsd xmm4, xmm2 ; t * e_tail
mulsd xmm0, xmm2 ; r_hi * e_tail
mulsd xmm1, xmm6 ; t * e_lead
mulsd xmm3, xmm6 ; r_hi * e_lead
addsd xmm0, xmm4 ; accumulate from the smallest products upwards
addsd xmm0, xmm1
addsd xmm0, xmm3
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
; Exponent field of x is zero (x is +0 or a positive subnormal).
; On entry xmm2 = mantissa bits of x. The code renormalises them and
; recomputes rax (mantissa bits) and xmm6 (exponent) before rejoining the
; common path. xmm0 still holds the original x afterwards.
Llog10_sse2_denormal_adjust:
por xmm2, XMMWORD PTR __real_one ; 1.m: mantissa with the exponent of 1.0
subsd xmm2, QWORD PTR __real_one ; 1.m - 1.0 = 0.m, normalised by the subtraction
movsd xmm5, xmm2
pand xmm2, XMMWORD PTR __mask_mant ; mantissa of the normalised value
movd rax, xmm2 ; index source for the common path
psrlq xmm5, 52 ; biased exponent e of the normalised value
psubd xmm5, XMMWORD PTR __mask_2045 ; e - 2045 = (e - 1023) - 1022
cvtdq2pd xmm6, xmm5 ; xmm6 <-- adjusted exponent as a double
jmp Llog10_sse2_continue_common
ALIGN 16
; x <= 0. The flags of "comisd xmm0, 0.0" on the main path are still live:
; not-equal means x < 0, equal means x is +/-0.
; Each handler loads xmm1 and the reason code in r8d, calls fname_special
; (xmm0 still holds x), and returns whatever the helper leaves in xmm0.
Llog10_sse2_x_is_zero_or_neg:
jne Llog10_sse2_x_is_neg
; x == 0: xmm1 = -inf
mov r8d, DWORD PTR __flag_x_zero
movsd xmm1, QWORD PTR __real_ninf
call fname_special
jmp Llog10_sse2_finish
ALIGN 16
; x < 0 (also reached for -inf): xmm1 = negative quiet NaN
Llog10_sse2_x_is_neg:
mov r8d, DWORD PTR __flag_x_neg
movsd xmm1, QWORD PTR __real_neg_qnan
call fname_special
jmp Llog10_sse2_finish
ALIGN 16
; Exponent field of x is all ones: x is +/-inf or a NaN.
; rax = raw bits of x, xmm0 = x.
Llog10_sse2_x_is_inf_or_nan:
cmp rax, QWORD PTR __real_inf
je Llog10_sse2_finish ; +inf: return x unchanged
cmp rax, QWORD PTR __real_ninf
je Llog10_sse2_x_is_neg ; -inf: handled like any negative input
; NaN: pass the input with its quiet bit forced on in xmm1
mov r8d, DWORD PTR __flag_x_nan
or rax, QWORD PTR __real_qnanbit
movd xmm1, rax
call fname_special
jmp Llog10_sse2_finish
ALIGN 16
; Shared SSE2 epilogue: restore xmm6, release the frame, return xmm0.
Llog10_sse2_finish:
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; FMA3/AVX implementation, main (table-driven) path.
; Only VEX-encoded vector instructions are used, so no legacy-SSE
; instruction is interleaved with the AVX code.
; Register roles:
;   rax  = raw bits of x, later the table index j
;   xmm6 = unbiased exponent m as a double
;   xmm2 = mantissa bits of x, later Y
;   xmm4 = x - 1.0, later |x - 1.0| (selects the near-one path)
Llog10_fma3:
; compute exponent part
vpsrlq xmm3,xmm0,52 ; sign and exponent field
vmovq rax,xmm0 ; raw bits of x (writes all 64 bits of rax)
vpsubq xmm3,xmm3,XMMWORD PTR __int_1023
vcvtdq2pd xmm6,xmm3 ; xmm6 <-- (double)xexp
; NaN or Inf?
vpand xmm5,xmm0,XMMWORD PTR __real_inf ; isolate the exponent field
vcomisd xmm5,QWORD PTR __real_inf ; all ones?
je Llog10_fma3_x_is_inf_or_nan
; negative number or zero?
; (the flags set here are still live when the handler executes its jne)
vpxor xmm5,xmm5,xmm5
vcomisd xmm0,xmm5
jbe Llog10_fma3_x_is_zero_or_neg
vpand xmm2,xmm0,XMMWORD PTR __mask_mant
vsubsd xmm4,xmm0,QWORD PTR __real_one ; x - 1.0
; Subnormal?
vcomisd xmm6,QWORD PTR __real_neg_1023
je Llog10_fma3_denormal_adjust
Llog10_fma3_continue_common:
; compute index into the log tables
; top 8 mantissa bits plus the 9th bit shifted up one place, i.e. the
; 8-bit field rounded to nearest; the carry may reach bit 52
vpand xmm1,xmm0,XMMWORD PTR __mask_mant_top8
vpand xmm3,xmm0,XMMWORD PTR __mask_mant9
vpsllq xmm3,xmm3,1
vpaddq xmm1,xmm3,xmm1
vmovq rax,xmm1
; near one codepath
vpand xmm4,xmm4,XMMWORD PTR __real_notsign ; |x - 1.0|
vcomisd xmm4,QWORD PTR __real_threshold
jb Llog10_fma3_near_one
; F,Y
shr rax,44 ; table index j, 0..256
vpor xmm2,xmm2,XMMWORD PTR __real_half ; Y = mantissa with the exponent of 0.5
vpor xmm1,xmm1,XMMWORD PTR __real_half ; F = rounded mantissa with the exponent of 0.5
lea r9,QWORD PTR __log_F_inv_qword
; f = F - Y,r = f * inv
vsubsd xmm1,xmm1,xmm2
vmulsd xmm1,xmm1,QWORD PTR [r9 + rax * 8] ; inv = __log_F_inv_qword[j] (presumably 1/F; table not visible here)
lea r9,QWORD PTR __log10_256_lead
; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6
vmulsd xmm0,xmm1,xmm1 ; r*r
vmovsd xmm3,QWORD PTR __real_1_over_6
vmovsd xmm5,QWORD PTR __real_1_over_3
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
vfmadd213sd xmm5,xmm1,QWORD PTR __real_half ; 1/2 + r*1/3
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4 + r*1/5 + r*r*1/6
vmulsd xmm4,xmm0,xmm0 ; r*r*r*r
vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2 + r*1/3) + r
vfmadd231sd xmm1,xmm3,xmm4 ; + r^4*(1/4 + r/5 + r^2/6) = poly
vmulsd xmm1,xmm1,QWORD PTR __real_log10_e ; poly * log10(e)
; m*log10(2) + table[j] - poly*log10(e), with log10(2) and the table entry
; each split into lead + tail; the small terms are summed first
vmovsd xmm5,QWORD PTR __real_log10_2_tail
vfmsub213sd xmm5,xmm6,xmm1 ; m*log10_2_tail - poly*log10(e)
vmovsd xmm0,QWORD PTR [r9 + rax * 8] ; __log10_256_lead[j]
lea rdx,QWORD PTR __log10_256_tail
vmovsd xmm2,QWORD PTR [rdx + rax * 8] ; __log10_256_tail[j]
vaddsd xmm2,xmm2,xmm5 ; sum of the small terms
vfmadd231sd xmm0,xmm6,QWORD PTR __real_log10_2_lead ; lead[j] + m*log10_2_lead
vaddsd xmm0,xmm0,xmm2
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; FMA3 near-one path, taken when |x - 1| < __real_threshold.
; With r = x - 1 and u = 2r/(2+r):
;   q      = ca1*u^3 + ca2*u^5 + ca3*u^7 + ca4*u^9 - r*u/2
;   r_hi   = r with its low 32 bits cleared, r_lo = r - r_hi
;   t      = q + r_lo
;   result = r_hi*e_tail + t*e_tail + t*e_lead + r_hi*e_lead
; where e_lead + e_tail is the split constant log10(e).
; xmm6 is used as scratch here and reloaded from its save slot on exit.
Llog10_fma3_near_one:
; r = x - 1.0
vmovsd xmm2,QWORD PTR __real_two
vsubsd xmm0,xmm0,QWORD PTR __real_one ; r
vaddsd xmm2,xmm2,xmm0 ; 2 + r
vdivsd xmm1,xmm0,xmm2 ; r/(2+r) = u/2
vmovsd xmm4,QWORD PTR __real_ca2
vmovsd xmm5,QWORD PTR __real_ca4
vmulsd xmm6,xmm0,xmm1 ; correction
vaddsd xmm1,xmm1,xmm1 ; u
vmulsd xmm2,xmm1,xmm1 ; u^2
vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 ; ca2*u^2 + ca1
vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 ; ca4*u^2 + ca3
vmulsd xmm2,xmm2,xmm1 ; u^3
vmulsd xmm4,xmm4,xmm2 ; ca1*u^3 + ca2*u^5
vmulsd xmm2,xmm2,xmm2 ; u^6
vmulsd xmm2,xmm2,xmm1 ; u^7
vmulsd xmm5,xmm5,xmm2 ; ca3*u^7 + ca4*u^9
vaddsd xmm4,xmm4,xmm5 ; full polynomial
vsubsd xmm4,xmm4,xmm6 ; q = polynomial - correction
vpand xmm3,xmm0,QWORD PTR __mask_lower ; r_hi
vsubsd xmm0,xmm0,xmm3 ; r_lo
vaddsd xmm4,xmm4,xmm0 ; t = q + r_lo
vmulsd xmm1,xmm4,QWORD PTR __real_log10_e_lead ; t * e_lead
vmulsd xmm4,xmm4,QWORD PTR __real_log10_e_tail ; t * e_tail
vmulsd xmm0,xmm3,QWORD PTR __real_log10_e_tail ; r_hi * e_tail
vmulsd xmm3,xmm3,QWORD PTR __real_log10_e_lead ; r_hi * e_lead
vaddsd xmm0,xmm0,xmm4 ; accumulate from the smallest products upwards
vaddsd xmm0,xmm0,xmm1
vaddsd xmm0,xmm0,xmm3
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
; Exponent field of x is zero; x is a positive subnormal here, because
; zero and negative inputs have already branched away.
; On entry xmm2 = mantissa bits of x. The code renormalises them and
; recomputes xmm0/xmm2 (mantissa bits) and xmm6 (exponent) before rejoining
; the common path. xmm0 no longer holds x afterwards.
; The 128-bit operands are spelled XMMWORD PTR, matching the SSE2 path.
Llog10_fma3_denormal_adjust:
vpor xmm2,xmm2,XMMWORD PTR __real_one ; 1.m: mantissa with the exponent of 1.0
vsubsd xmm2,xmm2,QWORD PTR __real_one ; 1.m - 1.0 = 0.m, normalised by the subtraction
vpsrlq xmm5,xmm2,52 ; biased exponent e of the normalised value
vpand xmm2,xmm2,XMMWORD PTR __mask_mant ; mantissa of the normalised value
vmovapd xmm0,xmm2 ; index source for the common path
vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045 ; e - 2045 = (e - 1023) - 1022
vcvtdq2pd xmm6,xmm5 ; xmm6 <-- adjusted exponent as a double
jmp Llog10_fma3_continue_common
ALIGN 16
; x <= 0. The flags of "vcomisd xmm0, 0.0" on the main path are still live:
; not-equal means x < 0, equal means x is +/-0.
; Each handler loads xmm1 and the reason code in r8d, calls fname_special
; (xmm0 still holds x), and returns whatever the helper leaves in xmm0.
Llog10_fma3_x_is_zero_or_neg:
jne Llog10_fma3_x_is_neg
; x == 0: xmm1 = -inf
mov r8d,DWORD PTR __flag_x_zero
vmovsd xmm1,QWORD PTR __real_ninf
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; x < 0 (also reached for -inf): xmm1 = negative quiet NaN
Llog10_fma3_x_is_neg:
mov r8d,DWORD PTR __flag_x_neg
vmovsd xmm1,QWORD PTR __real_neg_qnan
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
; Exponent field of x is all ones: x is +/-inf or a NaN.
; rax = raw bits of x, xmm0 = x.
;   +inf -> returned unchanged
;   -inf -> handled like any negative input
;   NaN  -> fname_special receives the NaN with the quiet bit forced on
Llog10_fma3_x_is_inf_or_nan:
cmp rax,QWORD PTR __real_inf
je Llog10_fma3_finish
cmp rax,QWORD PTR __real_ninf
je Llog10_fma3_x_is_neg
or rax,QWORD PTR __real_qnanbit
vmovq xmm1,rax ; VEX-encoded move, so the AVX path contains no legacy-SSE instruction
mov r8d,DWORD PTR __flag_x_nan
call fname_special
jmp Llog10_fma3_finish
ALIGN 16
; Shared FMA3 epilogue: restore xmm6, release the frame, return xmm0.
Llog10_fma3_finish:
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
fname endp
END

View File

@@ -0,0 +1,297 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;;
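;; Defines __log10_128_lead and __log10_128_tail tables
;; (base-10 counterparts of the __log_128_* tables, 129 DWORD entries each;
;; presumably used by the single-precision log10 code - confirm against callers)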
;;
.const
ALIGN 16
PUBLIC __log10_128_lead
__log10_128_lead:
DD 00000000h
DD 3b5d4000h
DD 3bdc8000h
DD 3c24c000h
DD 3c5ac000h
DD 3c884000h
DD 3ca2c000h
DD 3cbd4000h
DD 3cd78000h
DD 3cf1c000h
DD 3d05c000h
DD 3d128000h
DD 3d1f4000h
DD 3d2c0000h
DD 3d388000h
DD 3d450000h
DD 3d518000h
DD 3d5dc000h
DD 3d6a0000h
DD 3d760000h
DD 3d810000h
DD 3d870000h
DD 3d8d0000h
DD 3d92c000h
DD 3d98c000h
DD 3d9e8000h
DD 3da44000h
DD 3daa0000h
DD 3dafc000h
DD 3db58000h
DD 3dbb4000h
DD 3dc0c000h
DD 3dc64000h
DD 3dcc0000h
DD 3dd18000h
DD 3dd6c000h
DD 3ddc4000h
DD 3de1c000h
DD 3de70000h
DD 3dec8000h
DD 3df1c000h
DD 3df70000h
DD 3dfc4000h
DD 3e00c000h
DD 3e034000h
DD 3e05c000h
DD 3e088000h
DD 3e0b0000h
DD 3e0d8000h
DD 3e100000h
DD 3e128000h
DD 3e150000h
DD 3e178000h
DD 3e1a0000h
DD 3e1c8000h
DD 3e1ec000h
DD 3e214000h
DD 3e23c000h
DD 3e260000h
DD 3e288000h
DD 3e2ac000h
DD 3e2d4000h
DD 3e2f8000h
DD 3e31c000h
DD 3e344000h
DD 3e368000h
DD 3e38c000h
DD 3e3b0000h
DD 3e3d4000h
DD 3e3fc000h
DD 3e420000h
DD 3e440000h
DD 3e464000h
DD 3e488000h
DD 3e4ac000h
DD 3e4d0000h
DD 3e4f4000h
DD 3e514000h
DD 3e538000h
DD 3e55c000h
DD 3e57c000h
DD 3e5a0000h
DD 3e5c0000h
DD 3e5e4000h
DD 3e604000h
DD 3e624000h
DD 3e648000h
DD 3e668000h
DD 3e688000h
DD 3e6ac000h
DD 3e6cc000h
DD 3e6ec000h
DD 3e70c000h
DD 3e72c000h
DD 3e74c000h
DD 3e76c000h
DD 3e78c000h
DD 3e7ac000h
DD 3e7cc000h
DD 3e7ec000h
DD 3e804000h
DD 3e814000h
DD 3e824000h
DD 3e834000h
DD 3e840000h
DD 3e850000h
DD 3e860000h
DD 3e870000h
DD 3e880000h
DD 3e88c000h
DD 3e89c000h
DD 3e8ac000h
DD 3e8bc000h
DD 3e8c8000h
DD 3e8d8000h
DD 3e8e8000h
DD 3e8f4000h
DD 3e904000h
DD 3e914000h
DD 3e920000h
DD 3e930000h
DD 3e93c000h
DD 3e94c000h
DD 3e958000h
DD 3e968000h
DD 3e978000h
DD 3e984000h
DD 3e994000h
DD 3e9a0000h
ALIGN 16
PUBLIC __log10_128_tail
__log10_128_tail:
DD 00000000h
DD 367a8e44h
DD 368ed49fh
DD 36c21451h
DD 375211d6h
DD 3720ea11h
DD 37e9eb59h
DD 37b87be7h
DD 37bf2560h
DD 33d597a0h
DD 37806a05h
DD 3820581fh
DD 38223334h
DD 378e3bach
DD 3810684fh
DD 37feb7aeh
DD 36a9d609h
DD 37a68163h
DD 376a8b27h
DD 384c8fd6h
DD 3885183eh
DD 3874a760h
DD 380d1154h
DD 38ea42bdh
DD 384c1571h
DD 38ba66b8h
DD 38e7da3bh
DD 38eee632h
DD 38d00911h
DD 388bbedeh
DD 378a0512h
DD 3894c7a0h
DD 38e30710h
DD 36db2829h
DD 3729d609h
DD 38fa0e82h
DD 38bc9a75h
DD 383a9297h
DD 38dc83c8h
DD 37eac335h
DD 38706ac3h
DD 389574c2h
DD 3892d068h
DD 38615032h
DD 3917acf4h
DD 3967a126h
DD 38217840h
DD 38b420abh
DD 38f9c7b2h
DD 391103bdh
DD 39169a6bh
DD 390dd194h
DD 38eda471h
DD 38a38950h
DD 37f6844ah
DD 395e1cdbh
DD 390fcffch
DD 38503e9dh
DD 394b00fdh
DD 38a9910ah
DD 39518a31h
DD 3882d2c2h
DD 392488e4h
DD 397b0affh
DD 388a22d8h
DD 3902bd5eh
DD 39342f85h
DD 39598811h
DD 3972e6b1h
DD 34d53654h
DD 360ca25eh
DD 39785cc0h
DD 39630710h
DD 39424ed7h
DD 39165101h
DD 38be5421h
DD 37e7b0c0h
DD 394fd0c3h
DD 38efaaaah
DD 37a8f566h
DD 3927c744h
DD 383fa4d5h
DD 392d9e39h
DD 3803feaeh
DD 390a268ch
DD 39692b80h
DD 38789b4fh
DD 3909307dh
DD 394a601ch
DD 35e67edch
DD 383e386dh
DD 38a7743dh
DD 38dccec3h
DD 38ff57e0h
DD 39079d8bh
DD 390651a6h
DD 38f7bad9h
DD 38d0ab82h
DD 38979e7dh
DD 381978eeh
DD 397816c8h
DD 39410cb2h
DD 39015384h
DD 3863fa28h
DD 39f41065h
DD 39c7668ah
DD 39968afah
DD 39430db9h
DD 38a18cf3h
DD 39eb2907h
DD 39a9e10ch
DD 39492800h
DD 385a53d1h
DD 39ce0cf7h
DD 3979c7b2h
DD 389f5d99h
DD 39ceefcbh
DD 39646a39h
DD 380d7a9bh
DD 39ad6650h
DD 390ac3b8h
DD 39d9a9a8h
DD 39548a99h
DD 39f73c4bh
DD 3980960eh
DD 374b3d5ah
DD 39888f1eh
DD 37679a07h
DD 39826a13h
END

View File

@@ -0,0 +1,552 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;;
;; Defines __log10_256_lead and __log10_256_tail tables
;; Used by log10
;;
.const
ALIGN 16
PUBLIC __log10_256_lead
__log10_256_lead:
DQ 0000000000000000h
DQ 3f5bbd9e90000000h
DQ 3f6bafd470000000h
DQ 3f74b99560000000h
DQ 3f7b9476a0000000h
DQ 3f81344da0000000h
DQ 3f849b0850000000h
DQ 3f87fe71c0000000h
DQ 3f8b5e9080000000h
DQ 3f8ebb6af0000000h
DQ 3f910a83a0000000h
DQ 3f92b5b5e0000000h
DQ 3f945f4f50000000h
DQ 3f96075300000000h
DQ 3f97adc3d0000000h
DQ 3f9952a4f0000000h
DQ 3f9af5f920000000h
DQ 3f9c97c370000000h
DQ 3f9e3806a0000000h
DQ 3f9fd6c5b0000000h
DQ 3fa0ba01a0000000h
DQ 3fa187e120000000h
DQ 3fa25502c0000000h
DQ 3fa32167c0000000h
DQ 3fa3ed1190000000h
DQ 3fa4b80180000000h
DQ 3fa58238e0000000h
DQ 3fa64bb910000000h
DQ 3fa7148340000000h
DQ 3fa7dc98c0000000h
DQ 3fa8a3fad0000000h
DQ 3fa96aaac0000000h
DQ 3faa30a9d0000000h
DQ 3faaf5f920000000h
DQ 3fabba9a00000000h
DQ 3fac7e8d90000000h
DQ 3fad41d510000000h
DQ 3fae0471a0000000h
DQ 3faec66470000000h
DQ 3faf87aeb0000000h
DQ 3fb02428c0000000h
DQ 3fb08426f0000000h
DQ 3fb0e3d290000000h
DQ 3fb1432c30000000h
DQ 3fb1a23440000000h
DQ 3fb200eb60000000h
DQ 3fb25f5210000000h
DQ 3fb2bd68e0000000h
DQ 3fb31b3050000000h
DQ 3fb378a8e0000000h
DQ 3fb3d5d330000000h
DQ 3fb432afa0000000h
DQ 3fb48f3ed0000000h
DQ 3fb4eb8120000000h
DQ 3fb5477730000000h
DQ 3fb5a32160000000h
DQ 3fb5fe8040000000h
DQ 3fb6599440000000h
DQ 3fb6b45df0000000h
DQ 3fb70eddb0000000h
DQ 3fb7691400000000h
DQ 3fb7c30160000000h
DQ 3fb81ca630000000h
DQ 3fb8760300000000h
DQ 3fb8cf1830000000h
DQ 3fb927e640000000h
DQ 3fb9806d90000000h
DQ 3fb9d8aea0000000h
DQ 3fba30a9d0000000h
DQ 3fba885fa0000000h
DQ 3fbadfd070000000h
DQ 3fbb36fcb0000000h
DQ 3fbb8de4d0000000h
DQ 3fbbe48930000000h
DQ 3fbc3aea40000000h
DQ 3fbc910870000000h
DQ 3fbce6e410000000h
DQ 3fbd3c7da0000000h
DQ 3fbd91d580000000h
DQ 3fbde6ec00000000h
DQ 3fbe3bc1a0000000h
DQ 3fbe9056b0000000h
DQ 3fbee4aba0000000h
DQ 3fbf38c0c0000000h
DQ 3fbf8c9680000000h
DQ 3fbfe02d30000000h
DQ 3fc019c2a0000000h
DQ 3fc0434f70000000h
DQ 3fc06cbd60000000h
DQ 3fc0960c80000000h
DQ 3fc0bf3d00000000h
DQ 3fc0e84f10000000h
DQ 3fc11142f0000000h
DQ 3fc13a18a0000000h
DQ 3fc162d080000000h
DQ 3fc18b6a90000000h
DQ 3fc1b3e710000000h
DQ 3fc1dc4630000000h
DQ 3fc2048810000000h
DQ 3fc22cace0000000h
DQ 3fc254b4d0000000h
DQ 3fc27c9ff0000000h
DQ 3fc2a46e80000000h
DQ 3fc2cc20b0000000h
DQ 3fc2f3b690000000h
DQ 3fc31b3050000000h
DQ 3fc3428e20000000h
DQ 3fc369d020000000h
DQ 3fc390f680000000h
DQ 3fc3b80160000000h
DQ 3fc3def0e0000000h
DQ 3fc405c530000000h
DQ 3fc42c7e70000000h
DQ 3fc4531cd0000000h
DQ 3fc479a070000000h
DQ 3fc4a00970000000h
DQ 3fc4c65800000000h
DQ 3fc4ec8c30000000h
DQ 3fc512a640000000h
DQ 3fc538a630000000h
DQ 3fc55e8c50000000h
DQ 3fc5845890000000h
DQ 3fc5aa0b40000000h
DQ 3fc5cfa470000000h
DQ 3fc5f52440000000h
DQ 3fc61a8ad0000000h
DQ 3fc63fd850000000h
DQ 3fc6650cd0000000h
DQ 3fc68a2880000000h
DQ 3fc6af2b80000000h
DQ 3fc6d415e0000000h
DQ 3fc6f8e7d0000000h
DQ 3fc71da170000000h
DQ 3fc74242e0000000h
DQ 3fc766cc40000000h
DQ 3fc78b3da0000000h
DQ 3fc7af9730000000h
DQ 3fc7d3d910000000h
DQ 3fc7f80350000000h
DQ 3fc81c1620000000h
DQ 3fc8401190000000h
DQ 3fc863f5c0000000h
DQ 3fc887c2e0000000h
DQ 3fc8ab7900000000h
DQ 3fc8cf1830000000h
DQ 3fc8f2a0a0000000h
DQ 3fc9161270000000h
DQ 3fc9396db0000000h
DQ 3fc95cb280000000h
DQ 3fc97fe100000000h
DQ 3fc9a2f950000000h
DQ 3fc9c5fb70000000h
DQ 3fc9e8e7b0000000h
DQ 3fca0bbdf0000000h
DQ 3fca2e7e80000000h
DQ 3fca512960000000h
DQ 3fca73bea0000000h
DQ 3fca963e70000000h
DQ 3fcab8a8f0000000h
DQ 3fcadafe20000000h
DQ 3fcafd3e30000000h
DQ 3fcb1f6930000000h
DQ 3fcb417f40000000h
DQ 3fcb638070000000h
DQ 3fcb856cf0000000h
DQ 3fcba744b0000000h
DQ 3fcbc907f0000000h
DQ 3fcbeab6c0000000h
DQ 3fcc0c5130000000h
DQ 3fcc2dd750000000h
DQ 3fcc4f4950000000h
DQ 3fcc70a740000000h
DQ 3fcc91f130000000h
DQ 3fccb32740000000h
DQ 3fccd44980000000h
DQ 3fccf55810000000h
DQ 3fcd165300000000h
DQ 3fcd373a60000000h
DQ 3fcd580e60000000h
DQ 3fcd78cf00000000h
DQ 3fcd997c70000000h
DQ 3fcdba16a0000000h
DQ 3fcdda9dd0000000h
DQ 3fcdfb11f0000000h
DQ 3fce1b7330000000h
DQ 3fce3bc1a0000000h
DQ 3fce5bfd50000000h
DQ 3fce7c2660000000h
DQ 3fce9c3ce0000000h
DQ 3fcebc40e0000000h
DQ 3fcedc3280000000h
DQ 3fcefc11d0000000h
DQ 3fcf1bdee0000000h
DQ 3fcf3b99d0000000h
DQ 3fcf5b42a0000000h
DQ 3fcf7ad980000000h
DQ 3fcf9a5e70000000h
DQ 3fcfb9d190000000h
DQ 3fcfd932f0000000h
DQ 3fcff882a0000000h
DQ 3fd00be050000000h
DQ 3fd01b76a0000000h
DQ 3fd02b0430000000h
DQ 3fd03a8910000000h
DQ 3fd04a0540000000h
DQ 3fd05978e0000000h
DQ 3fd068e3f0000000h
DQ 3fd0784670000000h
DQ 3fd087a080000000h
DQ 3fd096f210000000h
DQ 3fd0a63b30000000h
DQ 3fd0b57bf0000000h
DQ 3fd0c4b450000000h
DQ 3fd0d3e460000000h
DQ 3fd0e30c30000000h
DQ 3fd0f22bc0000000h
DQ 3fd1014310000000h
DQ 3fd1105240000000h
DQ 3fd11f5940000000h
DQ 3fd12e5830000000h
DQ 3fd13d4f00000000h
DQ 3fd14c3dd0000000h
DQ 3fd15b24a0000000h
DQ 3fd16a0370000000h
DQ 3fd178da50000000h
DQ 3fd187a940000000h
DQ 3fd1967060000000h
DQ 3fd1a52fa0000000h
DQ 3fd1b3e710000000h
DQ 3fd1c296c0000000h
DQ 3fd1d13eb0000000h
DQ 3fd1dfdef0000000h
DQ 3fd1ee7770000000h
DQ 3fd1fd0860000000h
DQ 3fd20b91a0000000h
DQ 3fd21a1350000000h
DQ 3fd2288d70000000h
DQ 3fd2370010000000h
DQ 3fd2456b30000000h
DQ 3fd253ced0000000h
DQ 3fd2622b00000000h
DQ 3fd2707fd0000000h
DQ 3fd27ecd40000000h
DQ 3fd28d1360000000h
DQ 3fd29b5220000000h
DQ 3fd2a989a0000000h
DQ 3fd2b7b9e0000000h
DQ 3fd2c5e2e0000000h
DQ 3fd2d404b0000000h
DQ 3fd2e21f50000000h
DQ 3fd2f032c0000000h
DQ 3fd2fe3f20000000h
DQ 3fd30c4470000000h
DQ 3fd31a42b0000000h
DQ 3fd32839e0000000h
DQ 3fd3362a10000000h
DQ 3fd3441350000000h
ALIGN 16
PUBLIC __log10_256_tail
__log10_256_tail:
DQ 0000000000000000h
DQ 3db20abc22b2208fh
DQ 3db10f69332e0dd4h
DQ 3dce950de87ed257h
DQ 3dd3f3443b626d69h
DQ 3df45aeaa5363e57h
DQ 3dc443683ce1bf0bh
DQ 3df989cd60c6a511h
DQ 3dfd626f201f2e9fh
DQ 3de94f8bb8dabdcdh
DQ 3e0088d8ef423015h
DQ 3e080413a62b79adh
DQ 3e059717c0eed3c4h
DQ 3dad4a77add44902h
DQ 3e0e763ff037300eh
DQ 3de162d74706f6c3h
DQ 3e0601cc1f4dbc14h
DQ 3deaf3e051f6e5bfh
DQ 3e097a0b1e1af3ebh
DQ 3dc0a38970c002c7h
DQ 3e102e000057c751h
DQ 3e155b00eecd6e0eh
DQ 3ddf86297003b5afh
DQ 3e1057b9b336a36dh
DQ 3e134bc84a06ea4fh
DQ 3e1643da9ea1bcadh
DQ 3e1d66a7b4f7ea2ah
DQ 3df6b2e038f7fcefh
DQ 3df3e954c670f088h
DQ 3e047209093acab3h
DQ 3e1d708fe7275da7h
DQ 3e1fdf9e7771b9e7h
DQ 3e0827bfa70a0660h
DQ 3e1601cc1f4dbc14h
DQ 3e0637f6106a5e5bh
DQ 3e126a13f17c624bh
DQ 3e093eb2ce80623ah
DQ 3e1430d1e91594deh
DQ 3e1d6b10108fa031h
DQ 3e16879c0bbaf241h
DQ 3dff08015ea6bc2bh
DQ 3e29b63dcdc6676ch
DQ 3e2b022cbcc4ab2ch
DQ 3df917d07ddd6544h
DQ 3e1540605703379eh
DQ 3e0cd18b947a1b60h
DQ 3e17ad65277ca97eh
DQ 3e11884dc59f5fa9h
DQ 3e1711c46006d082h
DQ 3e2f092e3c3108f8h
DQ 3e1714c5e32be13ah
DQ 3e26bba7fd734f9ah
DQ 3dfdf48fb5e08483h
DQ 3e232f9bc74d0b95h
DQ 3df973e848790c13h
DQ 3e1eccbc08c6586eh
DQ 3e2115e9f9524a98h
DQ 3e2f1740593131b8h
DQ 3e1bcf8b25643835h
DQ 3e1f5fa81d8bed80h
DQ 3e244a4df929d9e4h
DQ 3e129820d8220c94h
DQ 3e2a0b489304e309h
DQ 3e1f4d56aba665feh
DQ 3e210c9019365163h
DQ 3df80f78fe592736h
DQ 3e10528825c81ccah
DQ 3de095537d6d746ah
DQ 3e1827bfa70a0660h
DQ 3e06b0a8ec45933ch
DQ 3e105af81bf5dba9h
DQ 3e17e2fa2655d515h
DQ 3e0d59ecbfaee4bfh
DQ 3e1d8b2fda683fa3h
DQ 3e24b8ddfd3a3737h
DQ 3e13827e61ae1204h
DQ 3e2c8c7b49e90f9fh
DQ 3e29eaf01597591dh
DQ 3e19aaa66e317b36h
DQ 3e2e725609720655h
DQ 3e261c33fc7aac54h
DQ 3e29662bcf61a252h
DQ 3e1843c811c42730h
DQ 3e2064bb0b5acb36h
DQ 3e0a340c842701a4h
DQ 3e1a8e55b58f79d6h
DQ 3de92d219c5e9d9ah
DQ 3e3f63e60d7ffd6ah
DQ 3e2e9b0ed9516314h
DQ 3e2923901962350ch
DQ 3e326f8838785e81h
DQ 3e3b5b6a4caba6afh
DQ 3df0226adc8e761ch
DQ 3e3c4ad7313a1aedh
DQ 3e1564e87c738d17h
DQ 3e338fecf18a6618h
DQ 3e3d929ef5777666h
DQ 3e39483bf08da0b8h
DQ 3e3bdd0eeeaa5826h
DQ 3e39c4dd590237bah
DQ 3e1af3e9e0ebcac7h
DQ 3e35ce5382270dach
DQ 3e394f74532ab9bah
DQ 3e07342795888654h
DQ 3e0c5a000be34bf0h
DQ 3e2711c46006d082h
DQ 3e250025b4ed8cf8h
DQ 3e2ed18bcef2d2a0h
DQ 3e21282e0c0a7554h
DQ 3e0d70f33359a7cah
DQ 3e2b7f7e13a84025h
DQ 3e33306ec321891eh
DQ 3e3fc7f8038b7550h
DQ 3e3eb0358cd71d64h
DQ 3e3a76c822859474h
DQ 3e3d0ec652de86e3h
DQ 3e2fa4cce08658afh
DQ 3e3b84a2d2c00a9eh
DQ 3e20a5b0f2c25bd1h
DQ 3e3dd660225bf699h
DQ 3e08b10f859bf037h
DQ 3e3e8823b590cbe1h
DQ 3e361311f31e96f6h
DQ 3e2e1f875ca20f9ah
DQ 3e2c95724939b9a5h
DQ 3e3805957a3e58e2h
DQ 3e2ff126ea9f0334h
DQ 3e3953f5598e5609h
DQ 3e36c16ff856c448h
DQ 3e24cb220ff261f4h
DQ 3e35e120d53d53a2h
DQ 3e3a527f6189f256h
DQ 3e3856fcffd49c0fh
DQ 3e300c2e8228d7dah
DQ 3df113d09444dfe0h
DQ 3e2510630eea59a6h
DQ 3e262e780f32d711h
DQ 3ded3ed91a10f8cfh
DQ 3e23654a7e4bcd85h
DQ 3e055b784980ad21h
DQ 3e212f2dd4b16e64h
DQ 3e37c4add939f50ch
DQ 3e281784627180fch
DQ 3dea5162c7e14961h
DQ 3e310c9019365163h
DQ 3e373c4d2ba17688h
DQ 3e2ae8a5e0e93d81h
DQ 3e2ab0c6f01621afh
DQ 3e301e8b74dd5b66h
DQ 3e2d206fecbb5494h
DQ 3df0b48b724fcc00h
DQ 3e3f831f0b61e229h
DQ 3df81a97c407bcafh
DQ 3e3e286c1ccbb7aah
DQ 3e28630b49220a93h
DQ 3dff0b15c1a22c5ch
DQ 3e355445e71c0946h
DQ 3e3be630f8066d85h
DQ 3e2599dff0d96c39h
DQ 3e36cc85b18fb081h
DQ 3e34476d001ea8c8h
DQ 3e373f889e16d31fh
DQ 3e3357100d792a87h
DQ 3e3bd179ae6101f6h
DQ 3e0ca31056c3f6e2h
DQ 3e3d2870629c08fbh
DQ 3e3aba3880d2673fh
DQ 3e2c3633cb297da6h
DQ 3e21843899efea02h
DQ 3e3bccc99d2008e6h
DQ 3e38000544bdd350h
DQ 3e2b91c226606ae1h
DQ 3e2a7adf26b62bdfh
DQ 3e18764fc8826ec9h
DQ 3e1f4f3de50f68f0h
DQ 3df760ca757995e3h
DQ 3dfc667ed3805147h
DQ 3e3733f6196adf6fh
DQ 3e2fb710f33e836bh
DQ 3e39886eba641013h
DQ 3dfb5368d0af8c1ah
DQ 3e358c691b8d2971h
DQ 3dfe9465226d08fbh
DQ 3e33587e063f0097h
DQ 3e3618e702129f18h
DQ 3e361c33fc7aac54h
DQ 3e3f07a68408604ah
DQ 3e3c34bfe4945421h
DQ 3e38b1f00e41300bh
DQ 3e3f434284d61b63h
DQ 3e3a63095e397436h
DQ 3e34428656b919deh
DQ 3e36ca9201b2d9a6h
DQ 3e2738823a2a931ch
DQ 3e3c11880e179230h
DQ 3e313ddc8d6d52feh
DQ 3e33eed58922e917h
DQ 3e295992846bdd50h
DQ 3e0ddb4d5f2e278bh
DQ 3df1a5f12a0635c4h
DQ 3e4642f0882c3c34h
DQ 3e2aee9ba7f6475eh
DQ 3e264b7f834a60e4h
DQ 3e290d42e243792eh
DQ 3e4c272008134f01h
DQ 3e4a782e16d6cf5bh
DQ 3e44505c79da6648h
DQ 3e4ca9d4ea4dcd21h
DQ 3e297d3d627cd5bch
DQ 3e20b15cf9bcaa13h
DQ 3e315b2063cf76ddh
DQ 3e2983e6f3aa2748h
DQ 3e3f4c64f4ffe994h
DQ 3e46beba7ce85a0fh
DQ 3e3b9c69fd4ea6b8h
DQ 3e2b6aa5835fa4abh
DQ 3e43ccc3790fedd1h
DQ 3e29c04cc4404fe0h
DQ 3e40734b7a75d89dh
DQ 3e1b4404c4e01612h
DQ 3e40c565c2ce4894h
DQ 3e33c71441d935cdh
DQ 3d72a492556b3b4eh
DQ 3e20fa090341dc43h
DQ 3e2e8f7009e3d9f4h
DQ 3e4b1bf68b048a45h
DQ 3e3eee52dffaa956h
DQ 3e456b0900e465bdh
DQ 3e4d929ef5777666h
DQ 3e486ea28637e260h
DQ 3e4665aff10ca2f0h
DQ 3e2f11fdaf48ec74h
DQ 3e4cbe1b86a4d1c7h
DQ 3e25b05bfea87665h
DQ 3e41cec20a1a4a1dh
DQ 3e41cd5f0a409b9fh
DQ 3e453656c8265070h
DQ 3e377ed835282260h
DQ 3e2417bc3040b9d2h
DQ 3e408eef7b79eff2h
DQ 3e4dc76f39dc57e9h
DQ 3e4c0493a70cf457h
DQ 3e4a83d6cea5a60ch
DQ 3e30d6700dc557bah
DQ 3e44c96c12e8bd0ah
DQ 3e3d2c1993e32315h
DQ 3e22c721135f8242h
DQ 3e279a3e4dda747dh
DQ 3dfcf89f6941a72bh
DQ 3e2149a702f10831h
DQ 3e4ead4b7c8175dbh
DQ 3e4e6930fe63e70ah
DQ 3e41e106bed9ee2fh
DQ 3e2d682b82f11c92h
DQ 3e3a07f188dba47ch
DQ 3e40f9342dc172f6h
DQ 3e03ef3fde623e25h
END

View File

@@ -0,0 +1,294 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __log_128_lead and __log_128_tail tables
;; Used by log and pow
;;
.const
ALIGN 16
PUBLIC __log_128_lead
__log_128_lead DD 000000000h
DD 03bff0000h
DD 03c7e0000h
DD 03cbdc000h
DD 03cfc1000h
DD 03d1cf000h
DD 03d3ba000h
DD 03d5a1000h
DD 03d785000h
DD 03d8b2000h
DD 03d9a0000h
DD 03da8d000h
DD 03db78000h
DD 03dc61000h
DD 03dd49000h
DD 03de2f000h
DD 03df13000h
DD 03dff6000h
DD 03e06b000h
DD 03e0db000h
DD 03e14a000h
DD 03e1b8000h
DD 03e226000h
DD 03e293000h
DD 03e2ff000h
DD 03e36b000h
DD 03e3d5000h
DD 03e43f000h
DD 03e4a9000h
DD 03e511000h
DD 03e579000h
DD 03e5e1000h
DD 03e647000h
DD 03e6ae000h
DD 03e713000h
DD 03e778000h
DD 03e7dc000h
DD 03e820000h
DD 03e851000h
DD 03e882000h
DD 03e8b3000h
DD 03e8e4000h
DD 03e914000h
DD 03e944000h
DD 03e974000h
DD 03e9a3000h
DD 03e9d3000h
DD 03ea02000h
DD 03ea30000h
DD 03ea5f000h
DD 03ea8d000h
DD 03eabb000h
DD 03eae8000h
DD 03eb16000h
DD 03eb43000h
DD 03eb70000h
DD 03eb9c000h
DD 03ebc9000h
DD 03ebf5000h
DD 03ec21000h
DD 03ec4d000h
DD 03ec78000h
DD 03eca3000h
DD 03ecce000h
DD 03ecf9000h
DD 03ed24000h
DD 03ed4e000h
DD 03ed78000h
DD 03eda2000h
DD 03edcc000h
DD 03edf5000h
DD 03ee1e000h
DD 03ee47000h
DD 03ee70000h
DD 03ee99000h
DD 03eec1000h
DD 03eeea000h
DD 03ef12000h
DD 03ef3a000h
DD 03ef61000h
DD 03ef89000h
DD 03efb0000h
DD 03efd7000h
DD 03effe000h
DD 03f012000h
DD 03f025000h
DD 03f039000h
DD 03f04c000h
DD 03f05f000h
DD 03f072000h
DD 03f084000h
DD 03f097000h
DD 03f0aa000h
DD 03f0bc000h
DD 03f0cf000h
DD 03f0e1000h
DD 03f0f4000h
DD 03f106000h
DD 03f118000h
DD 03f12a000h
DD 03f13c000h
DD 03f14e000h
DD 03f160000h
DD 03f172000h
DD 03f183000h
DD 03f195000h
DD 03f1a7000h
DD 03f1b8000h
DD 03f1c9000h
DD 03f1db000h
DD 03f1ec000h
DD 03f1fd000h
DD 03f20e000h
DD 03f21f000h
DD 03f230000h
DD 03f241000h
DD 03f252000h
DD 03f263000h
DD 03f273000h
DD 03f284000h
DD 03f295000h
DD 03f2a5000h
DD 03f2b5000h
DD 03f2c6000h
DD 03f2d6000h
DD 03f2e6000h
DD 03f2f7000h
DD 03f307000h
DD 03f317000h
ALIGN 16
PUBLIC __log_128_tail
__log_128_tail DD 000000000h
DD 03429ac41h
DD 035a8b0fch
DD 0368d83eah
DD 0361b0e78h
DD 03687b9feh
DD 03631ec65h
DD 036dd7119h
DD 035c30045h
DD 0379b7751h
DD 037ebcb0dh
DD 037839f83h
DD 037528ae5h
DD 037a2eb18h
DD 036da7495h
DD 036a91eb7h
DD 03783b715h
DD 0371131dbh
DD 0383f3e68h
DD 038156a97h
DD 038297c0fh
DD 0387e100fh
DD 03815b665h
DD 037e5e3a1h
DD 038183853h
DD 035fe719dh
DD 038448108h
DD 038503290h
DD 0373539e8h
DD 0385e0ff1h
DD 03864a740h
DD 03786742dh
DD 0387be3cdh
DD 03685ad3eh
DD 03803b715h
DD 037adcbdch
DD 0380c36afh
DD 0371652d3h
DD 038927139h
DD 038c5fcd7h
DD 038ae55d5h
DD 03818c169h
DD 038a0fde7h
DD 038ad09efh
DD 03862bae1h
DD 038eecd4ch
DD 03798aad2h
DD 037421a1ah
DD 038c5e10eh
DD 037bf2aeeh
DD 0382d872dh
DD 037ee2e8ah
DD 038dedfach
DD 03802f2b9h
DD 038481e9bh
DD 0380eaa2bh
DD 038ebfb5dh
DD 038255fddh
DD 038783b82h
DD 03851da1eh
DD 0374e1b05h
DD 0388f439bh
DD 038ca0e10h
DD 038cac08bh
DD 03891f65fh
DD 0378121cbh
DD 0386c9a9ah
DD 038949923h
DD 038777bcch
DD 037b12d26h
DD 038a6ced3h
DD 038ebd3e6h
DD 038fbe3cdh
DD 038d785c2h
DD 0387e7e00h
DD 038f392c5h
DD 037d40983h
DD 038081a7ch
DD 03784c3adh
DD 038cce923h
DD 0380f5fafh
DD 03891fd38h
DD 038ac47bch
DD 03897042bh
DD 0392952d2h
DD 0396fced4h
DD 037f97073h
DD 0385e9eaeh
DD 03865c84ah
DD 038130ba3h
DD 03979cf16h
DD 03938cac9h
DD 038c3d2f4h
DD 039755dech
DD 038e6b467h
DD 0395c0fb8h
DD 0383ebce0h
DD 038dcd192h
DD 039186bdfh
DD 0392de74ch
DD 0392f0944h
DD 0391bff61h
DD 038e9ed44h
DD 038686dc8h
DD 0396b99a7h
DD 039099c89h
DD 037a27673h
DD 0390bdaa3h
DD 0397069abh
DD 0388449ffh
DD 039013538h
DD 0392dc268h
DD 03947f423h
DD 0394ff17ch
DD 03945e10eh
DD 03929e8f5h
DD 038f85db0h
DD 038735f99h
DD 0396c08dbh
DD 03909e600h
DD 037b4996fh
DD 0391233cch
DD 0397cead9h
DD 038adb5cdh
DD 03920261ah
DD 03958ee36h
DD 035aa4905h
DD 037cbd11eh
DD 03805fdf4h
END

View File

@@ -0,0 +1,554 @@
;;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __log_256_lead and __log_256_tail tables
;; Used by log and pow
;;
.const
;;
;; __log_256_lead: 258 quadword (DQ) entries, 16-byte aligned.
;; Entries 0..256 are table values; the final entry is a zero pad.
;; Entry 0 is zero and entry 256 (3fe62e42e0000000h) is the leading part of
;; ln(2).
;; NOTE(review): entry j appears to be the leading bits of ln(1 + j/256) as a
;; double with the low 28 mantissa bits clear, so that adding the matching
;; __log_256_tail entry restores full precision -- confirm against log/pow.
;;
ALIGN 16
PUBLIC __log_256_lead
__log_256_lead DQ 0000000000000000h
DQ 3f6ff00aa0000000h
DQ 3f7fe02a60000000h
DQ 3f87dc4750000000h
DQ 3f8fc0a8b0000000h
DQ 3f93cea440000000h
DQ 3f97b91b00000000h
DQ 3f9b9fc020000000h
DQ 3f9f829b00000000h
DQ 3fa1b0d980000000h
DQ 3fa39e87b0000000h
DQ 3fa58a5ba0000000h
DQ 3fa77458f0000000h
DQ 3fa95c8300000000h
DQ 3fab42dd70000000h
DQ 3fad276b80000000h
DQ 3faf0a30c0000000h
DQ 3fb0759830000000h
DQ 3fb16536e0000000h
DQ 3fb253f620000000h
DQ 3fb341d790000000h
DQ 3fb42edcb0000000h
DQ 3fb51b0730000000h
DQ 3fb60658a0000000h
DQ 3fb6f0d280000000h
DQ 3fb7da7660000000h
DQ 3fb8c345d0000000h
DQ 3fb9ab4240000000h
DQ 3fba926d30000000h
DQ 3fbb78c820000000h
DQ 3fbc5e5480000000h
DQ 3fbd4313d0000000h
DQ 3fbe270760000000h
DQ 3fbf0a30c0000000h
DQ 3fbfec9130000000h
DQ 3fc0671510000000h
DQ 3fc0d77e70000000h
DQ 3fc1478580000000h
DQ 3fc1b72ad0000000h
DQ 3fc2266f10000000h
DQ 3fc29552f0000000h
DQ 3fc303d710000000h
DQ 3fc371fc20000000h
DQ 3fc3dfc2b0000000h
DQ 3fc44d2b60000000h
DQ 3fc4ba36f0000000h
DQ 3fc526e5e0000000h
DQ 3fc59338d0000000h
DQ 3fc5ff3070000000h
DQ 3fc66acd40000000h
DQ 3fc6d60fe0000000h
DQ 3fc740f8f0000000h
DQ 3fc7ab8900000000h
DQ 3fc815c0a0000000h
DQ 3fc87fa060000000h
DQ 3fc8e928d0000000h
DQ 3fc9525a90000000h
DQ 3fc9bb3620000000h
DQ 3fca23bc10000000h
DQ 3fca8becf0000000h
DQ 3fcaf3c940000000h
DQ 3fcb5b5190000000h
DQ 3fcbc28670000000h
DQ 3fcc296850000000h
DQ 3fcc8ff7c0000000h
DQ 3fccf63540000000h
DQ 3fcd5c2160000000h
DQ 3fcdc1bca0000000h
DQ 3fce270760000000h
DQ 3fce8c0250000000h
DQ 3fcef0adc0000000h
DQ 3fcf550a50000000h
DQ 3fcfb91860000000h
DQ 3fd00e6c40000000h
DQ 3fd0402590000000h
DQ 3fd071b850000000h
DQ 3fd0a324e0000000h
DQ 3fd0d46b50000000h
DQ 3fd1058bf0000000h
DQ 3fd1368700000000h
DQ 3fd1675ca0000000h
DQ 3fd1980d20000000h
DQ 3fd1c898c0000000h
DQ 3fd1f8ff90000000h
DQ 3fd22941f0000000h
DQ 3fd2596010000000h
DQ 3fd2895a10000000h
DQ 3fd2b93030000000h
DQ 3fd2e8e2b0000000h
DQ 3fd31871c0000000h
DQ 3fd347dd90000000h
DQ 3fd3772660000000h
DQ 3fd3a64c50000000h
DQ 3fd3d54fa0000000h
DQ 3fd4043080000000h
DQ 3fd432ef20000000h
DQ 3fd4618bc0000000h
DQ 3fd4900680000000h
DQ 3fd4be5f90000000h
DQ 3fd4ec9730000000h
DQ 3fd51aad80000000h
DQ 3fd548a2c0000000h
DQ 3fd5767710000000h
DQ 3fd5a42ab0000000h
DQ 3fd5d1bdb0000000h
DQ 3fd5ff3070000000h
DQ 3fd62c82f0000000h
DQ 3fd659b570000000h
DQ 3fd686c810000000h
DQ 3fd6b3bb20000000h
DQ 3fd6e08ea0000000h
DQ 3fd70d42e0000000h
DQ 3fd739d7f0000000h
DQ 3fd7664e10000000h
DQ 3fd792a550000000h
DQ 3fd7bede00000000h
DQ 3fd7eaf830000000h
DQ 3fd816f410000000h
DQ 3fd842d1d0000000h
DQ 3fd86e9190000000h
DQ 3fd89a3380000000h
DQ 3fd8c5b7c0000000h
DQ 3fd8f11e80000000h
DQ 3fd91c67e0000000h
DQ 3fd9479410000000h
DQ 3fd972a340000000h
DQ 3fd99d9580000000h
DQ 3fd9c86b00000000h
DQ 3fd9f323e0000000h
DQ 3fda1dc060000000h
DQ 3fda484090000000h
DQ 3fda72a490000000h
DQ 3fda9cec90000000h
DQ 3fdac718c0000000h
DQ 3fdaf12930000000h
DQ 3fdb1b1e00000000h
DQ 3fdb44f770000000h
DQ 3fdb6eb590000000h
DQ 3fdb985890000000h
DQ 3fdbc1e080000000h
DQ 3fdbeb4d90000000h
DQ 3fdc149ff0000000h
DQ 3fdc3dd7a0000000h
DQ 3fdc66f4e0000000h
DQ 3fdc8ff7c0000000h
DQ 3fdcb8e070000000h
DQ 3fdce1af00000000h
DQ 3fdd0a63a0000000h
DQ 3fdd32fe70000000h
DQ 3fdd5b7f90000000h
DQ 3fdd83e720000000h
DQ 3fddac3530000000h
DQ 3fddd46a00000000h
DQ 3fddfc8590000000h
DQ 3fde248810000000h
DQ 3fde4c71a0000000h
DQ 3fde744260000000h
DQ 3fde9bfa60000000h
DQ 3fdec399d0000000h
DQ 3fdeeb20c0000000h
DQ 3fdf128f50000000h
DQ 3fdf39e5b0000000h
DQ 3fdf6123f0000000h
DQ 3fdf884a30000000h
DQ 3fdfaf5880000000h
DQ 3fdfd64f20000000h
DQ 3fdffd2e00000000h
DQ 3fe011fab0000000h
DQ 3fe02552a0000000h
DQ 3fe0389ee0000000h
DQ 3fe04bdf90000000h
DQ 3fe05f14b0000000h
DQ 3fe0723e50000000h
DQ 3fe0855c80000000h
DQ 3fe0986f40000000h
DQ 3fe0ab76b0000000h
DQ 3fe0be72e0000000h
DQ 3fe0d163c0000000h
DQ 3fe0e44980000000h
DQ 3fe0f72410000000h
DQ 3fe109f390000000h
DQ 3fe11cb810000000h
DQ 3fe12f7190000000h
DQ 3fe1422020000000h
DQ 3fe154c3d0000000h
DQ 3fe1675ca0000000h
DQ 3fe179eab0000000h
DQ 3fe18c6e00000000h
DQ 3fe19ee6b0000000h
DQ 3fe1b154b0000000h
DQ 3fe1c3b810000000h
DQ 3fe1d610f0000000h
DQ 3fe1e85f50000000h
DQ 3fe1faa340000000h
DQ 3fe20cdcd0000000h
DQ 3fe21f0bf0000000h
DQ 3fe23130d0000000h
DQ 3fe2434b60000000h
DQ 3fe2555bc0000000h
DQ 3fe2676200000000h
DQ 3fe2795e10000000h
DQ 3fe28b5000000000h
DQ 3fe29d37f0000000h
DQ 3fe2af15f0000000h
DQ 3fe2c0e9e0000000h
DQ 3fe2d2b400000000h
DQ 3fe2e47430000000h
DQ 3fe2f62a90000000h
DQ 3fe307d730000000h
DQ 3fe3197a00000000h
DQ 3fe32b1330000000h
DQ 3fe33ca2b0000000h
DQ 3fe34e2890000000h
DQ 3fe35fa4e0000000h
DQ 3fe37117b0000000h
DQ 3fe38280f0000000h
DQ 3fe393e0d0000000h
DQ 3fe3a53730000000h
DQ 3fe3b68440000000h
DQ 3fe3c7c7f0000000h
DQ 3fe3d90260000000h
DQ 3fe3ea3390000000h
DQ 3fe3fb5b80000000h
DQ 3fe40c7a40000000h
DQ 3fe41d8fe0000000h
DQ 3fe42e9c60000000h
DQ 3fe43f9fe0000000h
DQ 3fe4509a50000000h
DQ 3fe4618bc0000000h
DQ 3fe4727430000000h
DQ 3fe48353d0000000h
DQ 3fe4942a80000000h
DQ 3fe4a4f850000000h
DQ 3fe4b5bd60000000h
DQ 3fe4c679a0000000h
DQ 3fe4d72d30000000h
DQ 3fe4e7d810000000h
DQ 3fe4f87a30000000h
DQ 3fe50913c0000000h
DQ 3fe519a4c0000000h
DQ 3fe52a2d20000000h
DQ 3fe53aad00000000h
DQ 3fe54b2460000000h
DQ 3fe55b9350000000h
DQ 3fe56bf9d0000000h
DQ 3fe57c57f0000000h
DQ 3fe58cadb0000000h
DQ 3fe59cfb20000000h
DQ 3fe5ad4040000000h
DQ 3fe5bd7d30000000h
DQ 3fe5cdb1d0000000h
DQ 3fe5ddde50000000h
DQ 3fe5ee02a0000000h
DQ 3fe5fe1ed0000000h
DQ 3fe60e32f0000000h
DQ 3fe61e3ef0000000h
DQ 3fe62e42e0000000h
DQ 0000000000000000h
;;
;; __log_256_tail: 258 quadword (DQ) entries, 16-byte aligned.
;; Entries 0..256 are table values; the final entry is a zero pad.
;; Entry 0 is zero.
;; NOTE(review): entry j appears to be the low-order remainder of ln(1 + j/256)
;; that is left after subtracting __log_256_lead[j] -- confirm against log/pow.
;;
ALIGN 16
PUBLIC __log_256_tail
__log_256_tail DQ 0000000000000000h
DQ 3db5885e0250435ah
DQ 3de620cf11f86ed2h
DQ 3dff0214edba4a25h
DQ 3dbf807c79f3db4eh
DQ 3dea352ba779a52bh
DQ 3dff56c46aa49fd5h
DQ 3dfebe465fef5196h
DQ 3e0cf0660099f1f8h
DQ 3e1247b2ff85945dh
DQ 3e13fd7abf5202b6h
DQ 3e1f91c9a918d51eh
DQ 3e08cb73f118d3cah
DQ 3e1d91c7d6fad074h
DQ 3de1971bec28d14ch
DQ 3e15b616a423c78ah
DQ 3da162a6617cc971h
DQ 3e166391c4c06d29h
DQ 3e2d46f5c1d0c4b8h
DQ 3e2e14282df1f6d3h
DQ 3e186f47424a660dh
DQ 3e2d4c8de077753eh
DQ 3e2e0c307ed24f1ch
DQ 3e226ea18763bdd3h
DQ 3e25cad69737c933h
DQ 3e2af62599088901h
DQ 3e18c66c83d6b2d0h
DQ 3e1880ceb36fb30fh
DQ 3e2495aac6ca17a4h
DQ 3e2761db4210878ch
DQ 3e2eb78e862bac2fh
DQ 3e19b2cd75790dd9h
DQ 3e2c55e5cbd3d50fh
DQ 3db162a6617cc971h
DQ 3dfdbeabaaa2e519h
DQ 3e1652cb7150c647h
DQ 3e39a11cb2cd2ee2h
DQ 3e219d0ab1a28813h
DQ 3e24bd9e80a41811h
DQ 3e3214b596faa3dfh
DQ 3e303fea46980bb8h
DQ 3e31c8ffa5fd28c7h
DQ 3dce8f743bcd96c5h
DQ 3dfd98c5395315c6h
DQ 3e3996fa3ccfa7b2h
DQ 3e1cd2af2ad13037h
DQ 3e1d0da1bd17200eh
DQ 3e3330410ba68b75h
DQ 3df4f27a790e7c41h
DQ 3e13956a86f6ff1bh
DQ 3e2c6748723551d9h
DQ 3e2500de9326cdfch
DQ 3e1086c848df1b59h
DQ 3e04357ead6836ffh
DQ 3e24832442408024h
DQ 3e3d10da8154b13dh
DQ 3e39e8ad68ec8260h
DQ 3e3cfbf706abaf18h
DQ 3e3fc56ac6326e23h
DQ 3e39105e3185cf21h
DQ 3e3d017fe5b19cc0h
DQ 3e3d1f6b48dd13feh
DQ 3e20b63358a7e73ah
DQ 3e263063028c211ch
DQ 3e2e6a6886b09760h
DQ 3e3c138bb891cd03h
DQ 3e369f7722b7221ah
DQ 3df57d8fac1a628ch
DQ 3e3c55e5cbd3d50fh
DQ 3e1552d2ff48fe2eh
DQ 3e37b8b26ca431bch
DQ 3e292decdc1c5f6dh
DQ 3e3abc7c551aaa8ch
DQ 3e36b540731a354bh
DQ 3e32d341036b89efh
DQ 3e4f9ab21a3a2e0fh
DQ 3e239c871afb9fbdh
DQ 3e3e6add2c81f640h
DQ 3e435c95aa313f41h
DQ 3e249d4582f6cc53h
DQ 3e47574c1c07398fh
DQ 3e4ba846dece9e8dh
DQ 3e16999fafbc68e7h
DQ 3e4c9145e51b0103h
DQ 3e479ef2cb44850ah
DQ 3e0beec73de11275h
DQ 3e2ef4351af5a498h
DQ 3e45713a493b4a50h
DQ 3e45c23a61385992h
DQ 3e42a88309f57299h
DQ 3e4530faa9ac8aceh
DQ 3e25fec2d792a758h
DQ 3e35a517a71cbcd7h
DQ 3e3707dc3e1cd9a3h
DQ 3e3a1a9f8ef43049h
DQ 3e4409d0276b3674h
DQ 3e20e2f613e85bd9h
DQ 3df0027433001e5fh
DQ 3e35dde2836d3265h
DQ 3e2300134d7aaf04h
DQ 3e3cb7e0b42724f5h
DQ 3e2d6e93167e6308h
DQ 3e3d1569b1526adbh
DQ 3e0e99fc338a1a41h
DQ 3e4eb01394a11b1ch
DQ 3e04f27a790e7c41h
DQ 3e25ce3ca97b7af9h
DQ 3e281f0f940ed857h
DQ 3e4d36295d88857ch
DQ 3e21aca1ec4af526h
DQ 3e445743c7182726h
DQ 3e23c491aead337eh
DQ 3e3aef401a738931h
DQ 3e21cede76092a29h
DQ 3e4fba8f44f82bb4h
DQ 3e446f5f7f3c3e1ah
DQ 3e47055f86c9674bh
DQ 3e4b41a92b6b6e1ah
DQ 3e443d162e927628h
DQ 3e4466174013f9b1h
DQ 3e3b05096ad69c62h
DQ 3e40b169150faa58h
DQ 3e3cd98b1df85da7h
DQ 3e468b507b0f8fa8h
DQ 3e48422df57499bah
DQ 3e11351586970274h
DQ 3e117e08acba92eeh
DQ 3e26e04314dd0229h
DQ 3e497f3097e56d1ah
DQ 3e3356e655901286h
DQ 3e0cb761457f94d6h
DQ 3e39af67a85a9dach
DQ 3e453410931a909fh
DQ 3e22c587206058f5h
DQ 3e223bc358899c22h
DQ 3e4d7bf8b6d223cbh
DQ 3e47991ec5197ddbh
DQ 3e4a79e6bb3a9219h
DQ 3e3a4c43ed663ec5h
DQ 3e461b5a1484f438h
DQ 3e4b4e36f7ef0c3ah
DQ 3e115f026acd0d1bh
DQ 3e3f36b535cecf05h
DQ 3e2ffb7fbf3eb5c6h
DQ 3e3e6a6886b09760h
DQ 3e3135eb27f5bbc3h
DQ 3e470be7d6f6fa57h
DQ 3e4ce43cc84ab338h
DQ 3e4c01d7aac3bd91h
DQ 3e45c58d07961060h
DQ 3e3628bcf941456eh
DQ 3e4c58b2a8461cd2h
DQ 3e33071282fb989ah
DQ 3e420dab6a80f09ch
DQ 3e44f8d84c397b1eh
DQ 3e40d0ee08599e48h
DQ 3e1d68787e37da36h
DQ 3e366187d591bafch
DQ 3e22346600bae772h
DQ 3e390377d0d61b8eh
DQ 3e4f5e0dd966b907h
DQ 3e49023cb79a00e2h
DQ 3e44e05158c28ad8h
DQ 3e3bfa7b08b18ae4h
DQ 3e4ef1e63db35f67h
DQ 3e0ec2ae39493d4fh
DQ 3e40afe930ab2fa0h
DQ 3e225ff8a1810dd4h
DQ 3e469743fb1a71a5h
DQ 3e5f9cc676785571h
DQ 3e5b524da4cbf982h
DQ 3e5a4c8b381535b8h
DQ 3e5839be809caf2ch
DQ 3e50968a1cb82c13h
DQ 3e5eae6a41723fb5h
DQ 3e5d9c29a380a4dbh
DQ 3e4094aa0ada625eh
DQ 3e5973ad6fc108cah
DQ 3e4747322fdbab97h
DQ 3e593692fa9d4221h
DQ 3e5c5a992dfbc7d9h
DQ 3e4e1f33e102387ah
DQ 3e464fbef14c048ch
DQ 3e4490f513ca5e3bh
DQ 3e37a6af4d4c799dh
DQ 3e57574c1c07398fh
DQ 3e57b133417f8c1ch
DQ 3e5feb9e0c176514h
DQ 3e419f25bb3172f7h
DQ 3e45f68a7bbfb852h
DQ 3e5ee278497929f1h
DQ 3e5ccee006109d58h
DQ 3e5ce081a07bd8b3h
DQ 3e570e12981817b8h
DQ 3e292ab6d93503d0h
DQ 3e58cb7dd7c3b61eh
DQ 3e4efafd0a0b78dah
DQ 3e5e907267c4288eh
DQ 3e5d31ef96780875h
DQ 3e23430dfcd2ad50h
DQ 3e344d88d75bc1f9h
DQ 3e5bec0f055e04fch
DQ 3e5d85611590b9adh
DQ 3df320568e583229h
DQ 3e5a891d1772f538h
DQ 3e22edc9dabba74dh
DQ 3e4b9009a1015086h
DQ 3e52a12a8c5b1a19h
DQ 3e3a7885f0fdac85h
DQ 3e5f4ffcd43ac691h
DQ 3e52243ae2640aadh
DQ 3e546513299035d3h
DQ 3e5b39c3a62dd725h
DQ 3e5ba6dd40049f51h
DQ 3e451d1ed7177409h
DQ 3e5cb0f2fd7f5216h
DQ 3e3ab150cd4e2213h
DQ 3e5cfd7bf3193844h
DQ 3e53fff8455f1dbdh
DQ 3e5fee640b905fc9h
DQ 3e54e2adf548084ch
DQ 3e3b597adc1ecdd2h
DQ 3e4345bd096d3a75h
DQ 3e5101b9d2453c8bh
DQ 3e508ce55cc8c979h
DQ 3e5bbf017e595f71h
DQ 3e37ce733bd393dch
DQ 3e233bb0a503f8a1h
DQ 3e30e2f613e85bd9h
DQ 3e5e67555a635b3ch
DQ 3e2ea88df73d5e8bh
DQ 3e3d17e03bda18a8h
DQ 3e5b607d76044f7eh
DQ 3e52adc4e71bc2fch
DQ 3e5f99dc7362d1d9h
DQ 3e5473fa008e6a6ah
DQ 3e2b75bb09cb0985h
DQ 3e5ea04dd10b9abah
DQ 3e5802d0d6979674h
DQ 3e174688ccd99094h
DQ 3e496f16abb9df22h
DQ 3e46e66df2aa374fh
DQ 3e4e66525ea4550ah
DQ 3e42d02f34f20cbdh
DQ 3e46cfce65047188h
DQ 3e39b78c842d58b8h
DQ 3e4735e624c24bc9h
DQ 3e47eba1f7dd1adfh
DQ 3e586b3e59f65355h
DQ 3e1ce38e637f1b4dh
DQ 3e58d82ec919edc7h
DQ 3e4c52648ddcfa37h
DQ 3e52482ceae1ac12h
DQ 3e55a312311aba4fh
DQ 3e411e236329f225h
DQ 3e5b48c8cd2f246ch
DQ 3e6efa39ef35793ch
DQ 0000000000000000h
END

View File

@@ -0,0 +1,164 @@
;;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __log_F_inv_dword
;; Used in log10f and logf
;;
.const
;;
;; __log_F_inv_dword: 129 dword (DD) entries, indices 0..128, 16-byte aligned.
;; The values are single-precision bit patterns running from 2.0 (40000000h,
;; entry 0) down to 1.0 (3f800000h, entry 128).
;; NOTE(review): entry j appears to be 256 / (128 + j), i.e. the reciprocal of
;; F = 0.5 + j/256 -- confirm against the indexing used in logf/log10f.
;;
ALIGN 16
PUBLIC __log_F_inv_dword
__log_F_inv_dword DD 40000000h
DD 3ffe03f8h
DD 3ffc0fc1h
DD 3ffa232dh
DD 3ff83e10h
DD 3ff6603eh
DD 3ff4898dh
DD 3ff2b9d6h
DD 3ff0f0f1h
DD 3fef2eb7h
DD 3fed7304h
DD 3febbdb3h
DD 3fea0ea1h
DD 3fe865ach
DD 3fe6c2b4h
DD 3fe52598h
DD 3fe38e39h
DD 3fe1fc78h
DD 3fe07038h
DD 3fdee95ch
DD 3fdd67c9h
DD 3fdbeb62h
DD 3fda740eh
DD 3fd901b2h
DD 3fd79436h
DD 3fd62b81h
DD 3fd4c77bh
DD 3fd3680dh
DD 3fd20d21h
DD 3fd0b6a0h
DD 3fcf6475h
DD 3fce168ah
DD 3fcccccdh
DD 3fcb8728h
DD 3fca4588h
DD 3fc907dah
DD 3fc7ce0ch
DD 3fc6980ch
DD 3fc565c8h
DD 3fc43730h
DD 3fc30c31h
DD 3fc1e4bch
DD 3fc0c0c1h
DD 3fbfa030h
DD 3fbe82fah
DD 3fbd6910h
DD 3fbc5264h
DD 3fbb3ee7h
DD 3fba2e8ch
DD 3fb92144h
DD 3fb81703h
DD 3fb70fbbh
DD 3fb60b61h
DD 3fb509e7h
DD 3fb40b41h
DD 3fb30f63h
DD 3fb21643h
DD 3fb11fd4h
DD 3fb02c0bh
DD 3faf3adeh
DD 3fae4c41h
DD 3fad602bh
DD 3fac7692h
DD 3fab8f6ah
DD 3faaaaabh
DD 3fa9c84ah
DD 3fa8e83fh
DD 3fa80a81h
DD 3fa72f05h
DD 3fa655c4h
DD 3fa57eb5h
DD 3fa4a9cfh
DD 3fa3d70ah
DD 3fa3065eh
DD 3fa237c3h
DD 3fa16b31h
DD 3fa0a0a1h
DD 3f9fd80ah
DD 3f9f1166h
DD 3f9e4cadh
DD 3f9d89d9h
DD 3f9cc8e1h
DD 3f9c09c1h
DD 3f9b4c70h
DD 3f9a90e8h
DD 3f99d723h
DD 3f991f1ah
DD 3f9868c8h
DD 3f97b426h
DD 3f97012eh
DD 3f964fdah
DD 3f95a025h
DD 3f94f209h
DD 3f944581h
DD 3f939a86h
DD 3f92f114h
DD 3f924925h
DD 3f91a2b4h
DD 3f90fdbch
DD 3f905a38h
DD 3f8fb824h
DD 3f8f177ah
DD 3f8e7835h
DD 3f8dda52h
DD 3f8d3dcbh
DD 3f8ca29ch
DD 3f8c08c1h
DD 3f8b7034h
DD 3f8ad8f3h
DD 3f8a42f8h
DD 3f89ae41h
DD 3f891ac7h
DD 3f888889h
DD 3f87f781h
DD 3f8767abh
DD 3f86d905h
DD 3f864b8ah
DD 3f85bf37h
DD 3f853408h
DD 3f84a9fah
DD 3f842108h
DD 3f839930h
DD 3f83126fh
DD 3f828cc0h
DD 3f820821h
DD 3f81848eh
DD 3f810204h
DD 3f808081h
DD 3f800000h
END

View File

@@ -0,0 +1,294 @@
;;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __log_F_inv_qword
;; Used in log10 and log
;;
.const
;;
;; __log_F_inv_qword: 258 quadword (DQ) entries, 16-byte aligned.
;; Entries 0..256 are double-precision bit patterns running from 2.0
;; (4000000000000000h, entry 0) down to 1.0 (3ff0000000000000h, entry 256);
;; the final entry is a zero pad.
;; NOTE(review): entry j appears to be 512 / (256 + j), i.e. the reciprocal of
;; F = 0.5 + j/512 -- confirm against the indexing used in log/log10.
;;
ALIGN 16
PUBLIC __log_F_inv_qword
__log_F_inv_qword DQ 4000000000000000h
DQ 3fffe01fe01fe020h
DQ 3fffc07f01fc07f0h
DQ 3fffa11caa01fa12h
DQ 3fff81f81f81f820h
DQ 3fff6310aca0dbb5h
DQ 3fff44659e4a4271h
DQ 3fff25f644230ab5h
DQ 3fff07c1f07c1f08h
DQ 3ffee9c7f8458e02h
DQ 3ffecc07b301ecc0h
DQ 3ffeae807aba01ebh
DQ 3ffe9131abf0b767h
DQ 3ffe741aa59750e4h
DQ 3ffe573ac901e574h
DQ 3ffe3a9179dc1a73h
DQ 3ffe1e1e1e1e1e1eh
DQ 3ffe01e01e01e01eh
DQ 3ffde5d6e3f8868ah
DQ 3ffdca01dca01dcah
DQ 3ffdae6076b981dbh
DQ 3ffd92f2231e7f8ah
DQ 3ffd77b654b82c34h
DQ 3ffd5cac807572b2h
DQ 3ffd41d41d41d41dh
DQ 3ffd272ca3fc5b1ah
DQ 3ffd0cb58f6ec074h
DQ 3ffcf26e5c44bfc6h
DQ 3ffcd85689039b0bh
DQ 3ffcbe6d9601cbe7h
DQ 3ffca4b3055ee191h
DQ 3ffc8b265afb8a42h
DQ 3ffc71c71c71c71ch
DQ 3ffc5894d10d4986h
DQ 3ffc3f8f01c3f8f0h
DQ 3ffc26b5392ea01ch
DQ 3ffc0e070381c0e0h
DQ 3ffbf583ee868d8bh
DQ 3ffbdd2b899406f7h
DQ 3ffbc4fd65883e7bh
DQ 3ffbacf914c1bad0h
DQ 3ffb951e2b18ff23h
DQ 3ffb7d6c3dda338bh
DQ 3ffb65e2e3beee05h
DQ 3ffb4e81b4e81b4fh
DQ 3ffb37484ad806ceh
DQ 3ffb2036406c80d9h
DQ 3ffb094b31d922a4h
DQ 3ffaf286bca1af28h
DQ 3ffadbe87f94905eh
DQ 3ffac5701ac5701bh
DQ 3ffaaf1d2f87ebfdh
DQ 3ffa98ef606a63beh
DQ 3ffa82e65130e159h
DQ 3ffa6d01a6d01a6dh
DQ 3ffa574107688a4ah
DQ 3ffa41a41a41a41ah
DQ 3ffa2c2a87c51ca0h
DQ 3ffa16d3f97a4b02h
DQ 3ffa01a01a01a01ah
DQ 3ff9ec8e951033d9h
DQ 3ff9d79f176b682dh
DQ 3ff9c2d14ee4a102h
DQ 3ff9ae24ea5510dah
DQ 3ff999999999999ah
DQ 3ff9852f0d8ec0ffh
DQ 3ff970e4f80cb872h
DQ 3ff95cbb0be377aeh
DQ 3ff948b0fcd6e9e0h
DQ 3ff934c67f9b2ce6h
DQ 3ff920fb49d0e229h
DQ 3ff90d4f120190d5h
DQ 3ff8f9c18f9c18fah
DQ 3ff8e6527af1373fh
DQ 3ff8d3018d3018d3h
DQ 3ff8bfce8062ff3ah
DQ 3ff8acb90f6bf3aah
DQ 3ff899c0f601899ch
DQ 3ff886e5f0abb04ah
DQ 3ff87427bcc092b9h
DQ 3ff8618618618618h
DQ 3ff84f00c2780614h
DQ 3ff83c977ab2beddh
DQ 3ff82a4a0182a4a0h
DQ 3ff8181818181818h
DQ 3ff8060180601806h
DQ 3ff7f405fd017f40h
DQ 3ff7e225515a4f1dh
DQ 3ff7d05f417d05f4h
DQ 3ff7beb3922e017ch
DQ 3ff7ad2208e0ecc3h
DQ 3ff79baa6bb6398bh
DQ 3ff78a4c8178a4c8h
DQ 3ff77908119ac60dh
DQ 3ff767dce434a9b1h
DQ 3ff756cac201756dh
DQ 3ff745d1745d1746h
DQ 3ff734f0c541fe8dh
DQ 3ff724287f46debch
DQ 3ff713786d9c7c09h
DQ 3ff702e05c0b8170h
DQ 3ff6f26016f26017h
DQ 3ff6e1f76b4337c7h
DQ 3ff6d1a62681c861h
DQ 3ff6c16c16c16c17h
DQ 3ff6b1490aa31a3dh
DQ 3ff6a13cd1537290h
DQ 3ff691473a88d0c0h
DQ 3ff6816816816817h
DQ 3ff6719f3601671ah
DQ 3ff661ec6a5122f9h
DQ 3ff6524f853b4aa3h
DQ 3ff642c8590b2164h
DQ 3ff63356b88ac0deh
DQ 3ff623fa77016240h
DQ 3ff614b36831ae94h
DQ 3ff6058160581606h
DQ 3ff5f66434292dfch
DQ 3ff5e75bb8d015e7h
DQ 3ff5d867c3ece2a5h
DQ 3ff5c9882b931057h
DQ 3ff5babcc647fa91h
DQ 3ff5ac056b015ac0h
DQ 3ff59d61f123ccaah
DQ 3ff58ed2308158edh
DQ 3ff5805601580560h
DQ 3ff571ed3c506b3ah
DQ 3ff56397ba7c52e2h
DQ 3ff5555555555555h
DQ 3ff54725e6bb82feh
DQ 3ff5390948f40febh
DQ 3ff52aff56a8054bh
DQ 3ff51d07eae2f815h
DQ 3ff50f22e111c4c5h
DQ 3ff5015015015015h
DQ 3ff4f38f62dd4c9bh
DQ 3ff4e5e0a72f0539h
DQ 3ff4d843bedc2c4ch
DQ 3ff4cab88725af6eh
DQ 3ff4bd3edda68fe1h
DQ 3ff4afd6a052bf5bh
DQ 3ff4a27fad76014ah
DQ 3ff49539e3b2d067h
DQ 3ff4880522014880h
DQ 3ff47ae147ae147bh
DQ 3ff46dce34596066h
DQ 3ff460cbc7f5cf9ah
DQ 3ff453d9e2c776cah
DQ 3ff446f86562d9fbh
DQ 3ff43a2730abee4dh
DQ 3ff42d6625d51f87h
DQ 3ff420b5265e5951h
DQ 3ff4141414141414h
DQ 3ff40782d10e6566h
DQ 3ff3fb013fb013fbh
DQ 3ff3ee8f42a5af07h
DQ 3ff3e22cbce4a902h
DQ 3ff3d5d991aa75c6h
DQ 3ff3c995a47babe7h
DQ 3ff3bd60d9232955h
DQ 3ff3b13b13b13b14h
DQ 3ff3a524387ac822h
DQ 3ff3991c2c187f63h
DQ 3ff38d22d366088eh
DQ 3ff3813813813814h
DQ 3ff3755bd1c945eeh
DQ 3ff3698df3de0748h
DQ 3ff35dce5f9f2af8h
DQ 3ff3521cfb2b78c1h
DQ 3ff34679ace01346h
DQ 3ff33ae45b57bcb2h
DQ 3ff32f5ced6a1dfah
DQ 3ff323e34a2b10bfh
DQ 3ff3187758e9ebb6h
DQ 3ff30d190130d190h
DQ 3ff301c82ac40260h
DQ 3ff2f684bda12f68h
DQ 3ff2eb4ea1fed14bh
DQ 3ff2e025c04b8097h
DQ 3ff2d50a012d50a0h
DQ 3ff2c9fb4d812ca0h
DQ 3ff2bef98e5a3711h
DQ 3ff2b404ad012b40h
DQ 3ff2a91c92f3c105h
DQ 3ff29e4129e4129eh
DQ 3ff293725bb804a5h
DQ 3ff288b01288b013h
DQ 3ff27dfa38a1ce4dh
DQ 3ff27350b8812735h
DQ 3ff268b37cd60127h
DQ 3ff25e22708092f1h
DQ 3ff2539d7e9177b2h
DQ 3ff2492492492492h
DQ 3ff23eb79717605bh
DQ 3ff23456789abcdfh
DQ 3ff22a0122a0122ah
DQ 3ff21fb78121fb78h
DQ 3ff21579804855e6h
DQ 3ff20b470c67c0d9h
DQ 3ff2012012012012h
DQ 3ff1f7047dc11f70h
DQ 3ff1ecf43c7fb84ch
DQ 3ff1e2ef3b3fb874h
DQ 3ff1d8f5672e4abdh
DQ 3ff1cf06ada2811dh
DQ 3ff1c522fc1ce059h
DQ 3ff1bb4a4046ed29h
DQ 3ff1b17c67f2bae3h
DQ 3ff1a7b9611a7b96h
DQ 3ff19e0119e0119eh
DQ 3ff19453808ca29ch
DQ 3ff18ab083902bdbh
DQ 3ff1811811811812h
DQ 3ff1778a191bd684h
DQ 3ff16e0689427379h
DQ 3ff1648d50fc3201h
DQ 3ff15b1e5f75270dh
DQ 3ff151b9a3fdd5c9h
DQ 3ff1485f0e0acd3bh
DQ 3ff13f0e8d344724h
DQ 3ff135c81135c811h
DQ 3ff12c8b89edc0ach
DQ 3ff12358e75d3033h
DQ 3ff11a3019a74826h
DQ 3ff1111111111111h
DQ 3ff107fbbe011080h
DQ 3ff0fef010fef011h
DQ 3ff0f5edfab325a2h
DQ 3ff0ecf56be69c90h
DQ 3ff0e40655826011h
DQ 3ff0db20a88f4696h
DQ 3ff0d24456359e3ah
DQ 3ff0c9714fbcda3bh
DQ 3ff0c0a7868b4171h
DQ 3ff0b7e6ec259dc8h
DQ 3ff0af2f722eecb5h
DQ 3ff0a6810a6810a7h
DQ 3ff09ddba6af8360h
DQ 3ff0953f39010954h
DQ 3ff08cabb37565e2h
DQ 3ff0842108421084h
DQ 3ff07b9f29b8eae2h
DQ 3ff073260a47f7c6h
DQ 3ff06ab59c7912fbh
DQ 3ff0624dd2f1a9fch
DQ 3ff059eea0727586h
DQ 3ff05197f7d73404h
DQ 3ff04949cc1664c5h
DQ 3ff0410410410410h
DQ 3ff038c6b78247fch
DQ 3ff03091b51f5e1ah
DQ 3ff02864fc7729e9h
DQ 3ff0204081020408h
DQ 3ff0182436517a37h
DQ 3ff0101010101010h
DQ 3ff0080402010080h
DQ 3ff0000000000000h
DQ 0000000000000000h
END

View File

@@ -0,0 +1,133 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include <fpieee.h>
#include <excpt.h>
#include <float.h>
#include <math.h>
#include <errno.h>
#include "libm_new.h"
// y = log10f(x)
// y = log10(x)
// y = logf(x)
// y = log(x)
// these codes and the ones in the related .asm files have to match
#define LOG_X_ZERO 1
#define LOG_X_NEG 2
#define LOG_X_NAN 3
/*
 * Shared special-case reporter for the single-precision log routines.
 *
 *   x    - the original argument
 *   y    - the result already chosen by the caller; always returned unchanged
 *   code - one of LOG_X_ZERO / LOG_X_NEG / LOG_X_NAN (must match the .asm side)
 *   op   - operation code forwarded to _handle_errorf
 *   name - function name forwarded to _handle_errorf
 *
 * Any other code value reports nothing and simply returns y.
 */
static float _logf_special_common(float x, float y, U32 code, unsigned int op, char *name)
{
    UT64 result_bits;

    /* _handle_errorf takes the result as a 64-bit pattern: the float goes in
       lane 0 and the remaining bits are cleared. */
    result_bits.u64 = 0;
    result_bits.f32[0] = y;

    if (code == LOG_X_ZERO)
    {
        /* Zero argument: singularity, divide-by-zero flag, ERANGE. */
        _handle_errorf(name, op, result_bits.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1);
    }
    else if (code == LOG_X_NEG)
    {
        /* Negative argument: domain error, invalid flag, EDOM. */
        _handle_errorf(name, op, result_bits.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
    }
    else if (code == LOG_X_NAN)
    {
        UT32 arg_bits;
        unsigned int is_quiet;

        arg_bits.f32 = x;
        is_quiet = ((arg_bits.u32 & QNAN_MASK_32) == QNAN_SET_32);

        /* Only a NaN that fails the quiet-NaN test raises the invalid flag. */
        _handle_errorf(name, op, result_bits.u64, _DOMAIN,
                       is_quiet ? 0 : AMD_F_INVALID, EDOM, x, 0.0, 1);
    }

    return y;
}
/* Special-case entry point for logf: reports under the name "logf" with
   operation code _FpCodeLog and returns the value from the common handler. */
float _logf_special(float x, float y, U32 code)
{
    float result;

    result = _logf_special_common(x, y, code, _FpCodeLog, "logf");
    return result;
}
/* Special-case entry point for log10f: reports under the name "log10f" with
   operation code _FpCodeLog10 and returns the value from the common handler. */
float _log10f_special(float x, float y, U32 code)
{
    float result;

    result = _logf_special_common(x, y, code, _FpCodeLog10, "log10f");
    return result;
}
/*
 * Shared special-case reporter for the double-precision log routines.
 *
 *   x    - the original argument
 *   y    - the result already chosen by the caller; always returned unchanged
 *   code - one of LOG_X_ZERO / LOG_X_NEG / LOG_X_NAN (must match the .asm side)
 *   op   - operation code forwarded to _handle_error
 *   name - function name forwarded to _handle_error
 *
 * Any other code value reports nothing and simply returns y.
 *
 * NOTE(review): unlike _logf_special_common, the NaN case here never passes
 * AMD_F_INVALID, i.e. signalling NaNs are not distinguished -- confirm that
 * this difference is intended.
 */
static double _log_special_common(double x, double y, U32 code, unsigned int op, char *name)
{
    UT64 result_bits;

    /* _handle_error takes the result as its raw 64-bit pattern. */
    result_bits.f64 = y;

    if (code == LOG_X_ZERO)
    {
        /* Zero argument: singularity, divide-by-zero flag, ERANGE. */
        _handle_error(name, op, result_bits.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1);
    }
    else if (code == LOG_X_NEG)
    {
        /* Negative argument: domain error, invalid flag, EDOM. */
        _handle_error(name, op, result_bits.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
    }
    else if (code == LOG_X_NAN)
    {
        /* NaN argument: domain error with no exception flags, EDOM. */
        _handle_error(name, op, result_bits.u64, _DOMAIN, 0, EDOM, x, 0.0, 1);
    }

    return y;
}
/* Special-case entry point for log: reports under the name "log" with
   operation code _FpCodeLog and returns the value from the common handler. */
double _log_special(double x, double y, U32 code)
{
    double result;

    result = _log_special_common(x, y, code, _FpCodeLog, "log");
    return result;
}
/* Special-case entry point for log10: reports under the name "log10" with
   operation code _FpCodeLog10 and returns the value from the common handler. */
double _log10_special(double x, double y, U32 code)
{
    double result;

    result = _log_special_common(x, y, code, _FpCodeLog10, "log10");
    return result;
}

View File

@@ -0,0 +1,84 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_INFINITY_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_INFINITY_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
/*
 * _logb - return the unbiased binary exponent of x as a double.
 *
 *   +/-0       : reports _SING / AMD_F_DIVBYZERO / ERANGE through
 *                _handle_error with a -infinity result pattern
 *   normal     : the unbiased exponent
 *   +/-inf     : x itself (infinity of the same sign)
 *   NaN        : reports _DOMAIN / EDOM through _handle_error with the
 *                quiet bit (0x0008000000000000) set in the result pattern
 *   denormal   : EMIN_DP64 when FOLLOW_IEEE754_LOGB is defined; otherwise the
 *                exponent the value would have after normalisation
 */
double _logb(double x)
{
  /* These must be 64 bits wide to hold the full bit pattern of a double.
     `long` is only 32 bits on Windows (LLP64); with `unsigned long` the
     pattern was truncated, so the exponent/mantissa tests looked at the wrong
     bits and the denormal loop below could never reach IMPBIT_DP64. */
  unsigned long long ux;
  long long u;

  GET_BITS_DP64(x, ux);
  u = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;

  if ((ux & ~SIGNBIT_DP64) == 0)
    /* x is +/-zero. Return -infinity with div-by-zero flag. */
    return _handle_error("_logb", OP_LOGB, NINFBITPATT_DP64, _SING,
                         AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1);
  else if (EMIN_DP64 <= u && u <= EMAX_DP64)
    /* x is a normal number */
    return (double)u;
  else if (u > EMAX_DP64)
    {
      /* x is infinity or NaN */
      if ((ux & MANTBITS_DP64) == 0)
        /* x is +/-infinity. For VC++, return infinity of same sign. */
        return x;
      else
        /* x is NaN, result is NaN */
        return _handle_error("_logb", OP_LOGB, ux|0x0008000000000000, _DOMAIN,
                             0, EDOM, x, 0.0, 1);
    }
  else
    {
      /* x is denormalized. */
#ifdef FOLLOW_IEEE754_LOGB
      /* Return the value of the minimum exponent to ensure that
         the relationship between logb and scalb, defined in
         IEEE 754, holds. */
      return EMIN_DP64;
#else
      /* Follow the rule set by IEEE 854 for logb */
      ux &= MANTBITS_DP64;
      u = EMIN_DP64;
      /* Shift the mantissa up until the implicit bit would be set, lowering
         the exponent by one per shift.  The mantissa is non-zero here because
         +/-0 was handled above, so the loop terminates. */
      while (ux < IMPBIT_DP64)
        {
          ux <<= 1;
          u--;
        }
      return (double)u;
#endif
    }
}

View File

@@ -0,0 +1,82 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_INFINITYF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_INFINITYF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
/*
 * _logbf - return the unbiased binary exponent of x as a float.
 *
 *   +/-0       : reports _SING / AMD_F_DIVBYZERO / ERANGE through
 *                _handle_errorf with a -infinity result pattern
 *   normal     : the unbiased exponent
 *   +/-inf     : x itself (infinity of the same sign)
 *   NaN        : reports _DOMAIN / EDOM through _handle_errorf with the
 *                quiet bit (0x00400000) set in the result pattern
 *   denormal   : EMIN_SP32 when FOLLOW_IEEE754_LOGB is defined; otherwise the
 *                exponent the value would have after normalisation
 */
float _logbf(float x)
{
    unsigned int bits;
    int exponent;

    GET_BITS_SP32(x, bits);
    exponent = ((bits & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;

    /* +/-zero: -infinity with the div-by-zero flag. */
    if ((bits & ~SIGNBIT_SP32) == 0)
    {
        return _handle_errorf("_logbf", OP_LOGB, NINFBITPATT_SP32, _SING,
                              AMD_F_DIVBYZERO, ERANGE, x, 0.0F, 1);
    }

    /* Normal number: the exponent field is the answer. */
    if (EMIN_SP32 <= exponent && exponent <= EMAX_SP32)
    {
        return (float)exponent;
    }

    /* Exponent field all ones: infinity or NaN. */
    if (exponent > EMAX_SP32)
    {
        if ((bits & MANTBITS_SP32) != 0)
        {
            /* NaN in, NaN out (with the quiet bit set). */
            return _handle_errorf("_logbf", OP_LOGB, bits|0x00400000, _DOMAIN,
                                  0, EDOM, x, 0.0F, 1);
        }

        /* +/-infinity: for VC++, return infinity of the same sign. */
        return x;
    }

    /* Only denormals are left. */
#ifdef FOLLOW_IEEE754_LOGB
    /* Return the minimum exponent so that the IEEE 754 relationship
       between logb and scalb holds. */
    return EMIN_SP32;
#else
    /* IEEE 854 rule: count the shifts needed to bring the mantissa up to the
       implicit-bit position, lowering the exponent once per shift. */
    bits &= MANTBITS_SP32;
    for (exponent = EMIN_SP32; bits < IMPBIT_SP32; bits <<= 1)
    {
        exponent--;
    }
    return (float)exponent;
#endif
}

View File

@@ -0,0 +1,451 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; logf.asm
;
; An implementation of the logf libm function.
;
; Prototype:
;
; float logf(float x);
;
;
; Algorithm:
; Similar to the one presented in log.asm
;
; Read-only constants for logf.
; The 16-byte (two DQ) entries can be used as full XMMWORD operands; unless
; noted otherwise only the low dword carries a value and the rest is zero.
.const
ALIGN 16
L_real_one DQ 0000000003f800000h ; 1.0
DQ 0000000000000000h
L_real_two DQ 00000000040000000h ; 2.0
DQ 00000000000000000h
L_real_ninf DQ 000000000ff800000h ; -inf
DQ 0000000000000000h
L_real_inf DQ 0000000007f800000h ; +inf
DQ 0000000000000000h
L_real_nan DQ 0000000007fc00000h ; NaN
DQ 0000000000000000h
L_real_neg_qnan DQ 000000000ffc00000h ; NaN with sign bit and quiet bit set; result for x < 0
DQ 0000000000000000h
L_real_notsign DQ 0000000007ffFFFFFh ; ^sign bit
DQ 0000000000000000h
L_real_mant DQ 0007FFFFF007FFFFFh ; mantissa bits
DQ 0007FFFFF007FFFFFh
L_mask_127 DQ 00000007f0000007fh ; 127 (the exponent bias) in every dword
DQ 00000007f0000007fh
L_mask_253 DQ 000000000000000fdh ; 253, exponent adjustment for subnormal inputs
DQ 00000000000000000h
L_mask_mant_all7 DQ 00000000007f0000h ; top 7 mantissa bits
DQ 00000000007f0000h
L_mask_mant8 DQ 0000000000008000h ; 8th mantissa bit (used to round the 7-bit index)
DQ 0000000000000000h
L_real_ca1 DQ 0000000003DAAAAABh ; 8.33333333333317923934e-02
DQ 00000000000000000h
L_real_ca2 DQ 0000000003C4CCCCDh ; 1.25000000037717509602e-02
DQ 00000000000000000h
L_real_log2_lead DQ 03F3170003F317000h ; 0.693115234375
DQ 00000000000000000h
L_real_log2_tail DQ 0000000003805FDF4h ; 0.000031946183
DQ 00000000000000000h
L_real_half DQ 0000000003f000000h ; 1/2
DQ 00000000000000000h
L_real_1_over_3 DQ 0000000003eaaaaabh ; 1/3
DQ 00000000000000000h
; The following entries are single dwords only.
L_real_1_over_2 DD 03f000000h ; 1/2
L_real_neg127 DD 0c2fe0000h ; -127.0
L_real_qnanbit DD 000400000h ; quiet nan bit
L_real_threshold DD 03d800000h ; 1/16, the "near one" threshold for |x - 1|
; these codes and the ones in the corresponding .c file have to match
L_flag_x_zero DD 00000001
L_flag_x_neg DD 00000002
L_flag_x_nan DD 00000003
; Lookup tables (indexed with a dword stride below) and the run-time switch
; that selects the FMA3 code path; all are defined outside this file.
EXTRN __log_128_lead:DWORD
EXTRN __log_128_tail:DWORD
EXTRN __log_F_inv_dword:DWORD
EXTRN __use_fma3_lib:DWORD
fname TEXTEQU <logf>
fname_special TEXTEQU <_logf_special>
; define local variable storage offsets
; (dummy_space is not referenced by the code in this file)
dummy_space EQU 020h
stack_size EQU 038h
; NOTE(review): StackAllocate / StackDeallocate are presumably macros supplied
; by fm.inc - confirm.
include fm.inc
; external function
EXTERN fname_special:PROC
;-----------------------------------------------------------------------
; float logf(float x)
;
; In:    xmm0 = x
; Out:   xmm0 = result
; Uses:  rax, rcx, r8, r9, r10, xmm0-xmm5, flags
; Stack: stack_size bytes are reserved in the prolog and released again
;        before every ret.
;
; The routine has two independent implementations of the same algorithm,
; selected at run time by __use_fma3_lib: an SSE2 one (Llogf_sse2*) and an
; FMA3 one (Llogf_fma3*).
;
; Special inputs:
;   x == 0        -> fname_special(x, -inf, L_flag_x_zero)
;   x < 0, -inf   -> fname_special(x, negative quiet NaN, L_flag_x_neg)
;   x is NaN      -> fname_special(x, x with the quiet bit set, L_flag_x_nan)
;   x == +inf     -> returned unchanged
; For the fname_special calls x is still in xmm0, the proposed result is in
; xmm1 and the reason code is in r8d.
; NOTE(review): the value left in xmm0 by fname_special is returned as-is;
; its exact contract lives in the corresponding .c file - confirm.
;-----------------------------------------------------------------------
.code
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Llogf_fma3
; Some of the placement of instructions below will be odd.
; We are attempting to have no more than one branch per 32-byte block.
Llogf_sse2:
; Zero the high bits of rax because it will be used as an index later.
xor rax, rax
movdqa xmm3, xmm0
movaps xmm4, xmm0
; This computation of the exponent of x will produce nonsense if x <= 0.,
; but those cases are eliminated below, so it does no harm.
psrld xmm3, 23 ; xmm3 <-- biased exp if x > 0.
; Is x Inf or NaN?
movd eax, xmm0 ; eax <-- x
mov ecx, eax
btr ecx, 31 ; ecx <-- |x|
cmp ecx, DWORD PTR L_real_inf
jae Llogf_sse2_x_is_inf_or_nan
; Finish computing exponent.
psubd xmm3, XMMWORD PTR L_mask_127 ; xmm3 <-- xexp (unbiased)
movdqa xmm2, xmm0
cvtdq2ps xmm5, xmm3 ; (float)xexp, unless x <= 0.
; Is x negative or zero?
xorps xmm1, xmm1
comiss xmm0, xmm1
jbe Llogf_sse2_x_is_zero_or_neg
pand xmm2, XMMWORD PTR L_real_mant ; xmm2 <-- x mantissa for later
subss xmm4, DWORD PTR L_real_one ; xmm4 <-- x - 1. for later
comiss xmm5, DWORD PTR L_real_neg127 ; x > 0 and xexp == -127 (biased exponent 0) ==> subnormal
je Llogf_sse2_subnormal_adjust
Llogf_sse2_continue_common:
; At this point we need |x| (possibly adjusted) in eax
; and m = xexp (possibly adjusted) in xmm5
; We also need the value of x - 1. computed above.
; compute the index into the log tables
mov r9d, eax
and eax, DWORD PTR L_mask_mant_all7 ; eax <-- 7 bits of x mantissa
and r9d, DWORD PTR L_mask_mant8 ; r9d <-- 8th bit
shl r9d, 1
add eax, r9d ; use 8th bit to round up
movd xmm1, eax
; Is x near 1.0 ?
; Note that if x is subnormal it is perforce not near one.
andps xmm4, XMMWORD PTR L_real_notsign ; xmm4 <-- |x-1|
comiss xmm4, DWORD PTR L_real_threshold ; is |x-1| < 1/16?
jb Llogf_sse2_near_one ; if so, handle elsewhere
; F, Y
; F is a number in [.5,1) scaled from the rounded mantissa bits computed
; above by oring in the exponent of .5.
; Y is all of the mantissa bits of X scaled to [.5,1.) similarly
shr eax, 16 ; shift eax to use as index
por xmm2, XMMWORD PTR L_real_half ; xmm2 <-- Y
por xmm1, XMMWORD PTR L_real_half ; xmm1 <-- F
lea r9, QWORD PTR __log_F_inv_dword
; f = F - Y, r = f * inv
subss xmm1, xmm2 ; xmm1 <-- f
mulss xmm1, DWORD PTR [r9+rax*4] ; xmm1 <-- r = f*inv (tabled)
movaps xmm2, xmm1
movaps xmm0, xmm1
; poly
mulss xmm2, DWORD PTR L_real_1_over_3 ; xmm2 <-- r/3
mulss xmm0, xmm1 ; xmm0 <-- r^2
addss xmm2, DWORD PTR L_real_1_over_2
movaps xmm3, XMMWORD PTR L_real_log2_tail
lea r9, QWORD PTR __log_128_tail
lea r10, QWORD PTR __log_128_lead
mulss xmm2, xmm0 ; xmm2 <-- r^2 * (r/3 + 1/2)
mulss xmm3, xmm5 ; xmm3 <-- (m=xexp)*log2_tail
addss xmm1, xmm2 ; xmm1 <-- poly
; m*log(2) + log(G) - poly, where G is just 2*F
; log(G) is precomputed to extra precision.
; small pieces and large pieces are separated until the final add,
; to preserve accuracy
movaps xmm0, XMMWORD PTR L_real_log2_lead
subss xmm3, xmm1 ; xmm3 <-- m*log2_tail - poly
mulss xmm0, xmm5 ; xmm0 <-- m*log2_lead
addss xmm3, DWORD PTR [r9+rax*4] ; xmm3 += log(G) tail
addss xmm0, DWORD PTR [r10+rax*4] ; xmm0 += log(G) lead
addss xmm0, xmm3 ; xmm0 <-- m*log(2)+log(G)-poly
StackDeallocate stack_size
ret
ALIGN 16
Llogf_sse2_near_one:
; Computation of the log for x near one requires special techniques.
; xmm0 still holds the original x here.
movaps xmm2, DWORD PTR L_real_two
subss xmm0, DWORD PTR L_real_one ; xmm0 <-- r = x - 1.0
addss xmm2, xmm0
movaps xmm1, xmm0
divss xmm1, xmm2 ; xmm1 <-- u = r/(2.0+r)
movaps xmm4, xmm0
mulss xmm4, xmm1 ; xmm4 <-- correction = r*u
addss xmm1, xmm1 ; xmm1 <-- u = 2.*u
movaps xmm2, xmm1
mulss xmm2, xmm2 ; xmm2 <-- u^2
; r2 = (u^3 * (ca_1 + u^2 * ca_2) - correction)
movaps xmm3, xmm1
mulss xmm3, xmm2 ; xmm3 <-- u^3
mulss xmm2, DWORD PTR L_real_ca2 ; xmm2 <-- ca2*u^2
addss xmm2, DWORD PTR L_real_ca1 ; xmm2 <-- ca2*u^2 + ca1
mulss xmm2, xmm3 ; xmm2 <-- u^3*(ca1+u^2*ca2)
subss xmm2, xmm4 ; xmm2 <-- r2
; return r + r2
addss xmm0, xmm2
StackDeallocate stack_size
ret
ALIGN 16
Llogf_sse2_subnormal_adjust:
; This code adjusts eax and xmm5.
; It must preserve xmm4.
; On entry xmm2 holds the mantissa bits of the subnormal x.
por xmm2, XMMWORD PTR L_real_one ; xmm2 <-- temp = mantissa | bits of 1.0
subss xmm2, DWORD PTR L_real_one ; xmm2 <-- temp - 1.0 (now a normalized float)
movdqa xmm5, xmm2
pand xmm2, XMMWORD PTR L_real_mant ; xmm2 <-- mantissa of the normalized value
movd eax, xmm2 ; eax <-- adjusted mantissa bits
psrld xmm5, 23 ; xmm5 <-- biased exponent of the normalized value
psubd xmm5, XMMWORD PTR L_mask_253 ; xmm5 <-- biased exponent - 253
cvtdq2ps xmm5, xmm5 ; xmm5 <-- (float) adjusted xexp
jmp Llogf_sse2_continue_common
; Until we get to the FMA3 code, the rest of this is special case handling.
Llogf_sse2_x_is_zero_or_neg:
; Flags are still those of "comiss xmm0, 0.0": not-equal here means x < 0.
jne Llogf_sse2_x_is_neg
movaps xmm1, XMMWORD PTR L_real_ninf ; xmm1 <-- -inf, result for x == 0
mov r8d, DWORD PTR L_flag_x_zero
call fname_special
jmp Llogf_sse2_finish
Llogf_sse2_x_is_neg:
movaps xmm1, XMMWORD PTR L_real_neg_qnan ; xmm1 <-- NaN, result for x < 0
mov r8d, DWORD PTR L_flag_x_neg
call fname_special
jmp Llogf_sse2_finish
Llogf_sse2_x_is_inf_or_nan:
; eax still holds the raw bits of x.
cmp eax, DWORD PTR L_real_inf
je Llogf_sse2_finish ; +inf: return x unchanged
cmp eax, DWORD PTR L_real_ninf
je Llogf_sse2_x_is_neg ; -inf: treated like any negative x
or eax, DWORD PTR L_real_qnanbit ; NaN: set the quiet bit
movd xmm1, eax
mov r8d, DWORD PTR L_flag_x_nan
call fname_special
jmp Llogf_sse2_finish
Llogf_sse2_finish:
StackDeallocate stack_size
ret
ALIGN 16
Llogf_fma3:
; compute exponent part
vmovaps xmm4,XMMWORD PTR L_real_inf ; preload for inf/nan test
xor rax,rax
vpsrld xmm3,xmm0,23 ; xmm3 <-- (ux>>23)
vmovd eax,xmm0 ;eax = x
vpsubd xmm3,xmm3,DWORD PTR L_mask_127 ; xmm3 <-- (ux>>23) - 127
vcvtdq2ps xmm5,xmm3 ; xmm5 <-- float((ux>>23)-127) = xexp
; NaN or inf
vpand xmm1,xmm0,xmm4 ; xmm1 <-- (ux & 07f800000h)
vcomiss xmm1,xmm4
je Llogf_fma3_x_is_inf_or_nan
; check for negative numbers or zero
vpxor xmm1,xmm1,xmm1
vcomiss xmm0,xmm1
jbe Llogf_fma3_x_is_zero_or_neg
vpand xmm2,xmm0,DWORD PTR L_real_mant ; xmm2 <-- ux & 0007FFFFFh
vsubss xmm4,xmm0,DWORD PTR L_real_one ; xmm4 <-- x - 1.0
vcomiss xmm5,DWORD PTR L_real_neg127 ; xexp == -127 (biased exponent 0) ==> subnormal
je Llogf_fma3_subnormal_adjust
Llogf_fma3_continue_common:
; At this point xmm0 holds x (or, for a subnormal x, its adjusted mantissa
; bits), xmm2 the mantissa bits, xmm4 = x - 1.0 and xmm5 = xexp.
; compute the index into the log tables
vpand xmm1,xmm0,DWORD PTR L_mask_mant_all7 ; xmm1 = ux & 0007f0000h
vpand xmm3,xmm0,DWORD PTR L_mask_mant8 ; xmm3 = ux & 000008000h
vpslld xmm3,xmm3,1 ; xmm3 = (ux & 000008000h) << 1
vpaddd xmm1,xmm3,xmm1
; eax = (ux & 0007f0000h) + ((ux & 000008000h) << 1)
; i.e. the top 7 mantissa bits, rounded to nearest using the 8th bit
; (still positioned at bit 16 and above; shifted down further below)
vmovd eax,xmm1
; near one codepath
vandps xmm4,xmm4,DWORD PTR L_real_notsign ; xmm4 <-- fabs (x - 1.0)
vcomiss xmm4,DWORD PTR L_real_threshold ; is |x - 1.0| < 1/16 ?
jb Llogf_fma3_near_one
; F,Y
shr eax,16 ; eax <-- table index
vpor xmm2,xmm2,DWORD PTR L_real_half ; xmm2 <-- Y
vpor xmm1,xmm1,DWORD PTR L_real_half ; xmm1 <-- F
lea r9,QWORD PTR __log_F_inv_dword
; f = F - Y
vsubss xmm1,xmm1,xmm2 ; f = F - Y
; r = f * log_F_inv_dword[index]
vmulss xmm1,xmm1,DWORD PTR [r9 + rax * 4]
; poly
vmovaps xmm2,XMMWORD PTR L_real_1_over_3
vfmadd213ss xmm2,xmm1,DWORD PTR L_real_1_over_2 ; 1/3*r + 1/2
vmulss xmm0,xmm1,xmm1 ; r*r
vmovaps xmm3,DWORD PTR L_real_log2_tail;
lea r9,DWORD PTR __log_128_tail
lea r10,DWORD PTR __log_128_lead
vfmadd231ss xmm1,xmm2,xmm0 ; poly = r + 1/2*r*r + 1/3*r*r*r
vfmsub213ss xmm3,xmm5,xmm1 ; (xexp * log2_tail) - poly
; m*log(2) + log(G) - poly
vmovaps xmm0,DWORD PTR L_real_log2_lead
vfmadd213ss xmm0,xmm5,[r10 + rax * 4] ; z1 = xexp * log2_lead + log_128_lead[index]
; z2 = (xexp * log2_tail) - poly + log_128_tail[index]
vaddss xmm3,xmm3,DWORD PTR [r9 + rax * 4]
vaddss xmm0,xmm0,xmm3 ; return z1 + z2
StackDeallocate stack_size
ret
ALIGN 16
Llogf_fma3_near_one:
; r = x - 1.0;
vmovaps xmm2,DWORD PTR L_real_two
vsubss xmm0,xmm0,DWORD PTR L_real_one ; xmm0 = r = x - 1.0
; u = r / (2.0 + r)
vaddss xmm2,xmm2,xmm0 ; (r+2.0)
vdivss xmm1,xmm0,xmm2 ; u = r / (2.0 + r)
; correction = r * u
vmulss xmm4,xmm0,xmm1 ; correction = u*r
; u = u + u;
vaddss xmm1,xmm1,xmm1 ; u = u+u
vmulss xmm2,xmm1,xmm1 ; v = u^2
; r2 = (u * v * (ca_1 + v * ca_2) - correction)
vmulss xmm3,xmm1,xmm2 ; u^3
vmovaps xmm5,DWORD PTR L_real_ca2
vfmadd213ss xmm2,xmm5,DWORD PTR L_real_ca1 ; ca1 + ca2 * v
vfmsub213ss xmm2,xmm3,xmm4 ; r2 = (ca1 + ca2 * v) * u^3 - correction
; r + r2
vaddss xmm0,xmm0,xmm2
StackDeallocate stack_size
ret
ALIGN 16
Llogf_fma3_subnormal_adjust:
; Renormalize a subnormal x: afterwards xmm0 and xmm2 hold the adjusted
; mantissa bits and xmm5 the adjusted exponent. xmm4 (x - 1.0) is untouched.
vmovaps xmm3,DWORD PTR L_real_one
vpor xmm2,xmm2,xmm3 ; xmm2 = temp = ((ux &0007FFFFFh) | 03f800000h)
vsubss xmm2,xmm2,xmm3 ; xmm2 = temp -1.0
vpsrld xmm5,xmm2,23 ; xmm5 = (utemp >> 23)
vpand xmm2,xmm2,DWORD PTR L_real_mant ; xmm2 = (utemp & 0007FFFFFh)
vmovaps xmm0,xmm2 ; the common code derives the table index from xmm0
vpsubd xmm5,xmm5,DWORD PTR L_mask_253 ; xmm5 = (utemp >> 23) - 253
vcvtdq2ps xmm5,xmm5 ; xmm5 = (float) ((utemp >> 23) - 253)
jmp Llogf_fma3_continue_common
Llogf_fma3_x_is_zero_or_neg:
; Flags are still those of "vcomiss xmm0, 0.0": not-equal here means x < 0.
jne Llogf_fma3_x_is_neg
vmovaps xmm1,DWORD PTR L_real_ninf ; xmm1 <-- -inf, result for x == 0
mov r8d,DWORD PTR L_flag_x_zero
call fname_special
StackDeallocate stack_size
ret
Llogf_fma3_x_is_neg:
vmovaps xmm1,DWORD PTR L_real_neg_qnan ; xmm1 <-- NaN, result for x < 0
mov r8d,DWORD PTR L_flag_x_neg
call fname_special
StackDeallocate stack_size
ret
Llogf_fma3_x_is_inf_or_nan:
; eax still holds the raw bits of x.
cmp eax,DWORD PTR L_real_inf
je Llogf_fma3_finish ; +inf: return x unchanged
cmp eax,DWORD PTR L_real_ninf
je Llogf_fma3_x_is_neg ; -inf: treated like any negative x
or eax,DWORD PTR L_real_qnanbit ; NaN: set the quiet bit
vmovd xmm1,eax
mov r8d,DWORD PTR L_flag_x_nan
call fname_special
StackDeallocate stack_size
ret
Llogf_fma3_finish:
StackDeallocate stack_size
ret
fname endp
END

View File

@@ -0,0 +1,76 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* modf: split x into an integer part (stored in *iptr) and a fraction
   part (returned), each carrying the sign of x.

     x NaN              -> *iptr = x, returns x + x
     x infinite/>=2^53  -> *iptr = x, returns zero with the sign of x
     |x| < 1.0          -> *iptr = zero with the sign of x, returns x
     otherwise          -> *iptr = x truncated toward zero, returns the rest

   The bit pattern of a double needs 64 bits, so the integer temporaries
   are declared (unsigned) long long: plain long is only 32 bits wide on
   LLP64 targets such as Windows. */
double modf(double x, double *iptr)
{
    long long xexp;
    unsigned long long ux, ax, mask;

    GET_BITS_DP64(x, ux);
    ax = ux & (~SIGNBIT_DP64);

    if (ax >= 0x4340000000000000)
    {
        /* abs(x) is either NaN, infinity, or >= 2^53 */
        if (ax > 0x7ff0000000000000)
        {
            /* x is NaN */
            *iptr = x;
            return x + x; /* Raise invalid if it is a signalling NaN */
        }
        else
        {
            /* x is infinity or large. Return zero with the sign of x */
            *iptr = x;
            PUT_BITS_DP64(ux & SIGNBIT_DP64, x);
            return x;
        }
    }
    else if (ax < 0x3ff0000000000000)
    {
        /* abs(x) < 1.0. Set iptr to zero with the sign of x
           and return x. */
        PUT_BITS_DP64(ux & SIGNBIT_DP64, *iptr);
        return x;
    }
    else
    {
        xexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;

        /* mask covers the mantissa bits that lie below the binary point */
        mask = 1;
        mask = (mask << (EXPSHIFTBITS_DP64 - xexp)) - 1;

        if ((ux & mask) == 0)
        {
            /* x is already integral. Computing x - *iptr would give +0
               even for negative x, so build the zero explicitly with
               the sign of x, as the large-argument branch does. */
            *iptr = x;
            PUT_BITS_DP64(ux & SIGNBIT_DP64, x);
            return x;
        }

        /* Mask out the bits of x that we don't want */
        PUT_BITS_DP64(ux & ~mask, *iptr);
        return x - *iptr;
    }
}

View File

@@ -0,0 +1,70 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* modff: split x into an integer part (stored in *iptr) and a fraction
   part (returned), each carrying the sign of x.

     |x| < 1.0          -> *iptr = zero with the sign of x, returns x
     1.0 <= |x| < 2^23  -> *iptr = x truncated toward zero, returns the rest
     x NaN              -> *iptr = x, returns x + x
     x infinite/large   -> *iptr = x, returns zero with the sign of x */
float modff(float x, float *iptr)
{
    unsigned int ux, mask;
    int xexp;

    GET_BITS_SP32(x, ux);
    xexp = ((ux & (~SIGNBIT_SP32)) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;

    if (xexp < 0)
    {
        /* abs(x) < 1.0. Set iptr to zero with the sign of x
           and return x. */
        PUT_BITS_SP32(ux & SIGNBIT_SP32, *iptr);
        return x;
    }
    else if (xexp < EXPSHIFTBITS_SP32)
    {
        /* abs(x) lies between 1.0 and 2**EXPSHIFTBITS_SP32, so some
           mantissa bits may lie below the binary point. mask covers
           exactly those bits. */
        mask = (1 << (EXPSHIFTBITS_SP32 - xexp)) - 1;

        if ((ux & mask) == 0)
        {
            /* x is already integral. Computing x - *iptr would give +0
               even for negative x, so build the zero explicitly with
               the sign of x, as the large-argument branch does. */
            *iptr = x;
            PUT_BITS_SP32(ux & SIGNBIT_SP32, x);
            return x;
        }

        /* Mask out the bits of x that we don't want */
        PUT_BITS_SP32(ux & ~mask, *iptr);
        return x - *iptr;
    }
    else if ((ux & (~SIGNBIT_SP32)) > 0x7f800000)
    {
        /* x is NaN */
        *iptr = x;
        return x + x; /* Raise invalid if it is a signalling NaN */
    }
    else
    {
        /* x is infinity or large. Set iptr to x and return zero
           with the sign of x. */
        *iptr = x;
        PUT_BITS_SP32(ux & SIGNBIT_SP32, x);
        return x;
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,130 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include <fpieee.h>
#include <excpt.h>
#include <float.h>
#include <math.h>
#include <errno.h>
#include "libm_new.h"
// these codes and the ones in the related .asm files have to match
#define POW_X_ONE_Y_SNAN 1
#define POW_X_ZERO_Z_INF 2
#define POW_X_NAN 3
#define POW_Y_NAN 4
#define POW_X_NAN_Y_NAN 5
#define POW_X_NEG_Y_NOTINT 6
#define POW_Z_ZERO 7
#define POW_Z_DENORMAL 8
#define POW_Z_INF 9
/* Report a special-case powf result through _handle_errorf.
   x, y are the powf arguments, z is the result chosen by the caller and
   code is one of the POW_* values above. The switch only selects the
   exception type, the floating-point flags and the errno value; any code
   without an entry is ignored. z is always the return value. */
float _powf_special(float x, float y, float z, U32 code)
{
    UT64 result;
    int type, flags, err;

    switch (code)
    {
    case POW_X_ONE_Y_SNAN:
        type  = 0;
        flags = AMD_F_INVALID;
        err   = 0;
        break;

    case POW_X_ZERO_Z_INF:
        type  = _SING;
        flags = AMD_F_DIVBYZERO;
        err   = ERANGE;
        break;

    case POW_X_NAN:
    case POW_Y_NAN:
    case POW_X_NAN_Y_NAN:
    case POW_X_NEG_Y_NOTINT:
        type  = _DOMAIN;
        flags = AMD_F_INVALID;
        err   = EDOM;
        break;

    case POW_Z_ZERO:
        type  = _UNDERFLOW;
        flags = AMD_F_INEXACT|AMD_F_UNDERFLOW;
        err   = ERANGE;
        break;

    case POW_Z_INF:
        type  = _OVERFLOW;
        flags = AMD_F_INEXACT|AMD_F_OVERFLOW;
        err   = ERANGE;
        break;

    default:
        /* Nothing to report for this code. */
        return z;
    }

    /* Pass the bits of z in the low 32 bits of a zeroed 64-bit value. */
    result.u64 = 0;
    result.f32[0] = z;
    _handle_errorf("powf", _FpCodePow, result.u64, type, flags, err, x, y, 2);

    return z;
}
/* Report a special-case pow result through _handle_error.
   x, y are the pow arguments, z is the result chosen by the caller and
   code is one of the POW_* values above. The switch only selects the
   exception type, the floating-point flags and the errno value; any code
   without an entry is ignored. z is always the return value. */
double _pow_special(double x, double y, double z, U32 code)
{
    UT64 result;
    int type, flags, err;

    switch (code)
    {
    case POW_X_ZERO_Z_INF:
        type  = _SING;
        flags = AMD_F_DIVBYZERO;
        err   = ERANGE;
        break;

    case POW_X_NAN:
    case POW_Y_NAN:
    case POW_X_NAN_Y_NAN:
    case POW_X_NEG_Y_NOTINT:
        type  = _DOMAIN;
        flags = AMD_F_INVALID;
        err   = EDOM;
        break;

    case POW_Z_ZERO:
    case POW_Z_DENORMAL:
        type  = _UNDERFLOW;
        flags = AMD_F_INEXACT|AMD_F_UNDERFLOW;
        err   = ERANGE;
        break;

    case POW_Z_INF:
        type  = _OVERFLOW;
        flags = AMD_F_INEXACT|AMD_F_OVERFLOW;
        err   = ERANGE;
        break;

    default:
        /* Nothing to report for this code. */
        return z;
    }

    /* Hand the bit pattern of z to the error handler. */
    result.f64 = z;
    _handle_error("pow", _FpCodePow, result.u64, type, flags, err, x, y, 2);

    return z;
}

View File

@@ -0,0 +1,319 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_NAN_WITH_FLAGS
#define USE_SCALEDOUBLE_3
#define USE_GET_FPSW_INLINE
#define USE_SET_FPSW_INLINE
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_NAN_WITH_FLAGS
#undef USE_SCALEDOUBLE_3
#undef USE_GET_FPSW_INLINE
#undef USE_SET_FPSW_INLINE
#undef USE_HANDLE_ERROR
#if !defined(_CRTBLD_C9X)
#define _CRTBLD_C9X
#endif
#include "libm_errno.h"
/* Computes the exact product of x and y, the result being the
nearly doublelength number (z,zz) */
/* Dekker product: *z receives the rounded product x * y and *zz a
   correction term, so that (z,zz) is the nearly doublelength product.
   x and y are each split into a head (low 27 mantissa bits cleared) and
   a tail (the remainder).
   The bit pattern of a double needs 64 bits, so the temporary is declared
   unsigned long long: plain long is only 32 bits wide on LLP64 targets
   such as Windows, where the mask below would not fit. */
static inline void dekker_mul12(double x, double y,
                                double *z, double *zz)
{
    double hx, tx, hy, ty;
    unsigned long long u;

    /* Split x into hx (head) and tx (tail). */
    GET_BITS_DP64(x, u);
    u &= 0xfffffffff8000000;
    PUT_BITS_DP64(u, hx);
    tx = x - hx;

    /* Do the same for y. */
    GET_BITS_DP64(y, u);
    u &= 0xfffffffff8000000;
    PUT_BITS_DP64(u, hy);
    ty = y - hy;

    *z = x * y;
    *zz = (((hx * hy - *z) + hx * ty) + tx * hy) + tx * ty;
}
#pragma function(fmod)
#undef _FUNCNAME
/* This body is compiled either as fmod (COMPILING_FMOD defined) or as
   remainder. Both reduce abs(x) by multiples of abs(y) in exact steps of
   up to 52 quotient bits; remainder additionally folds the result into
   the range around zero, choosing the even quotient on ties.

   Special cases:
     x NaN              -> quiet NaN built from x (EDOM)
     x infinite         -> indefinite NaN, invalid flag (EDOM)
     y NaN              -> quiet NaN built from y (EDOM)
     y infinite         -> x (_CRTBLD_C9X is always defined above)
     x zero, y nonzero  -> x
     y zero             -> indefinite NaN, invalid flag (EDOM)

   All integer temporaries that hold the bit pattern of a double, or a
   quotient of up to 52 bits, are (unsigned) long long: plain long is only
   32 bits wide on LLP64 targets such as Windows. */
#if defined(COMPILING_FMOD)
double fmod(double x, double y)
#define _FUNCNAME "fmod"
#define _OPERATION OP_FMOD
#else
double remainder(double x, double y)
#define _FUNCNAME "remainder"
#define _OPERATION OP_REM
#endif
{
    double dx, dy, scale, w, t, v, c, cc;
    int i, ntimes, xexp, yexp;
    unsigned long long u, ux, uy, ax, ay, todd;
    unsigned int sw;

    dx = x;
    dy = y;

    GET_BITS_DP64(dx, ux);
    GET_BITS_DP64(dy, uy);
    ax = ux & ~SIGNBIT_DP64;
    ay = uy & ~SIGNBIT_DP64;
    xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);

    if (xexp < 1 || xexp > BIASEDEMAX_DP64 ||
        yexp < 1 || yexp > BIASEDEMAX_DP64)
    {
        /* x or y is zero, denormalized, NaN or infinity */
        if (xexp > BIASEDEMAX_DP64)
        {
            /* x is NaN or infinity */
            if (ux & MANTBITS_DP64)
            {
                /* x is NaN */
                return _handle_error(_FUNCNAME, _OPERATION, ux|0x0008000000000000, _DOMAIN, 0,
                                     EDOM, x, y, 2);
            }
            else
            {
                /* x is infinity; result is NaN */
                return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
                                     AMD_F_INVALID, EDOM, x, y, 2);
            }
        }
        else if (yexp > BIASEDEMAX_DP64)
        {
            /* y is NaN or infinity */
            if (uy & MANTBITS_DP64)
            {
                /* y is NaN */
                return _handle_error(_FUNCNAME, _OPERATION, uy|0x0008000000000000, _DOMAIN, 0,
                                     EDOM, x, y, 2);
            }
            else
            {
#ifdef _CRTBLD_C9X
                /* C99 return for y = +-inf is x */
                return x;
#else
                /* y is infinity; result is indefinite */
                return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
                                     AMD_F_INVALID, EDOM, x, y, 2);
#endif
            }
        }
        else if (ax == 0x0000000000000000)
        {
            /* x is zero */
            if (ay == 0x0000000000000000)
            {
                /* y is zero */
                return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
                                     AMD_F_INVALID, EDOM, x, y, 2);
            }
            else
                /* C99 return for x = 0 must preserve sign */
                return x;
        }
        else if (ay == 0x0000000000000000)
        {
            /* y is zero */
            return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
                                 AMD_F_INVALID, EDOM, x, y, 2);
        }

        /* We've exhausted all other possibilities. One or both of x and
           y must be denormalized */
        if (xexp < 1)
        {
            /* x is denormalized. Figure out its exponent. */
            u = ax;
            while (u < IMPBIT_DP64)
            {
                xexp--;
                u <<= 1;
            }
        }
        if (yexp < 1)
        {
            /* y is denormalized. Figure out its exponent. */
            u = ay;
            while (u < IMPBIT_DP64)
            {
                yexp--;
                u <<= 1;
            }
        }
    }
    else if (ax == ay)
    {
        /* abs(x) == abs(y); return zero with the sign of x */
        PUT_BITS_DP64(ux & SIGNBIT_DP64, dx);
        return dx;
    }

    /* Set x = abs(x), y = abs(y) */
    PUT_BITS_DP64(ax, dx);
    PUT_BITS_DP64(ay, dy);

    if (ax < ay)
    {
        /* abs(x) < abs(y) */
#if !defined(COMPILING_FMOD)
        if (dx > 0.5*dy)
            dx -= dy;
#endif
        return x < 0.0? -dx : dx;
    }

    /* Save the current floating-point status word. We need
       to do this because the remainder function is always
       exact for finite arguments, but our algorithm causes
       the inexact flag to be raised. We therefore need to
       restore the entry status before exiting. */
    sw = get_fpsw_inline();

    /* Set ntimes to the number of times we need to do a
       partial remainder. If the exponent of x is an exact multiple
       of 52 larger than the exponent of y, and the mantissa of x is
       less than the mantissa of y, ntimes will be one too large
       but it doesn't matter - it just means that we'll go round
       the loop below one extra time. */
    if (xexp <= yexp)
        ntimes = 0;
    else
        ntimes = (xexp - yexp) / 52;

    if (ntimes == 0)
    {
        w = dy;
        scale = 1.0;
    }
    else
    {
        /* Set w = y * 2^(52*ntimes) */
        w = scaleDouble_3(dy, ntimes * 52);

        /* Set scale = 2^(-52). The biased exponent is shifted into place
           in a 64-bit integer. */
        PUT_BITS_DP64((unsigned long long)(-52 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64,
                      scale);
    }

    /* Each time round the loop we compute a partial remainder.
       This is done by subtracting a large multiple of w
       from x each time, where w is a scaled up version of y.
       The subtraction must be performed exactly in quad
       precision, though the result at each stage can
       fit exactly in a double precision number. */
    for (i = 0; i < ntimes; i++)
    {
        /* t is the integer multiple of w that we will subtract.
           We use a truncated value for t.

           N.B. w has been chosen so that the integer t will have
           at most 52 significant bits. This is the amount by
           which the exponent of the partial remainder dx gets reduced
           every time around the loop. In theory we could use
           53 bits in t, but the quad precision multiplication
           routine dekker_mul12 does not allow us to do that because
           it loses the last (106th) bit of its quad precision result. */

        /* Set dx = dx - w * t, where t is equal to trunc(dx/w).
           The truncation goes through a 64-bit integer because t can
           need up to 52 bits. */
        t = (double)(long long)(dx / w);
        /* At this point, t may be one too large due to
           rounding of dx/w */

        /* Compute w * t in quad precision */
        dekker_mul12(w, t, &c, &cc);

        /* Subtract w * t from dx */
        v = dx - c;
        dx = v + (((dx - v) - c) - cc);

        /* If t was one too large, dx will be negative. Add back
           one w */
        /* It might be possible to speed up this loop by finding
           a way to compute correctly truncated t directly from dx and w.
           We would then avoid the need for this check on negative dx. */
        if (dx < 0.0)
            dx += w;

        /* Scale w down by 2^(-52) for the next iteration */
        w *= scale;
    }

    /* One more time */
    /* Variable todd says whether the integer t is odd or not */
    t = (double)(long long)(dx / w);
    todd = ((long long)(dx / w)) & 1;
    dekker_mul12(w, t, &c, &cc);
    v = dx - c;
    dx = v + (((dx - v) - c) - cc);
    if (dx < 0.0)
    {
        todd = !todd;
        dx += w;
    }

    /* At this point, dx lies in the range [0,dy) */
#if !defined(COMPILING_FMOD)
    /* For the fmod function, we're done apart from setting
       the correct sign. */
    /* For the remainder function, we need to adjust dx
       so that it lies in the range (-y/2, y/2] by carefully
       subtracting w (== dy == y) if necessary. The rigmarole
       with todd is to get the correct sign of the result
       when x/y lies exactly half way between two integers,
       when we need to choose the even integer. */
    if (ay < 0x7fd0000000000000)
    {
        if (dx + dx > w || (todd && (dx + dx == w)))
            dx -= w;
    }
    else if (dx > 0.5 * w || (todd && (dx == 0.5 * w)))
        dx -= w;
#endif

    /* **** N.B. for some reason this breaks the 32 bit version
       of remainder when compiling with optimization. */
    /* Restore the entry status flags */
    set_fpsw_inline(sw);

    /* Set the result sign according to input argument x */
    return x < 0.0? -dx : dx;
}

View File

@@ -0,0 +1,251 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
extra precision, and return the result in r, rr.
Return value "region" tells how many lots of pi/2 were subtracted
from x to put it in the range [-pi/4,pi/4], mod 4. */
void __remainder_piby2(double x, double *r, double *rr, int *region)
{
/* This method simulates multi-precision floating-point
arithmetic and is accurate for all 1 <= x < infinity */
static const double
piby2_lead = 1.57079632679489655800e+00, /* 0x3ff921fb54442d18 */
piby2_part1 = 1.57079631090164184570e+00, /* 0x3ff921fb50000000 */
piby2_part2 = 1.58932547122958567343e-08, /* 0x3e5110b460000000 */
piby2_part3 = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */
const int bitsper = 10;
unsigned long res[500];
unsigned long ux, u, carry, mask, mant, highbitsrr;
int first, last, i, rexp, xexp, resexp, ltb, determ;
double xx, t;
static unsigned long pibits[] =
{
0, 0, 0, 0, 0, 0,
162, 998, 54, 915, 580, 84, 671, 777, 855, 839,
851, 311, 448, 877, 553, 358, 316, 270, 260, 127,
593, 398, 701, 942, 965, 390, 882, 283, 570, 265,
221, 184, 6, 292, 750, 642, 465, 584, 463, 903,
491, 114, 786, 617, 830, 930, 35, 381, 302, 749,
72, 314, 412, 448, 619, 279, 894, 260, 921, 117,
569, 525, 307, 637, 156, 529, 504, 751, 505, 160,
945, 1022, 151, 1023, 480, 358, 15, 956, 753, 98,
858, 41, 721, 987, 310, 507, 242, 498, 777, 733,
244, 399, 870, 633, 510, 651, 373, 158, 940, 506,
997, 965, 947, 833, 825, 990, 165, 164, 746, 431,
949, 1004, 287, 565, 464, 533, 515, 193, 111, 798
};
GET_BITS_DP64(x, ux);
xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
ux = (ux & MANTBITS_DP64) | IMPBIT_DP64;
/* Now ux is the mantissa bit pattern of x as a long integer */
carry = 0;
mask = 1;
mask = (mask << bitsper) - 1;
/* Set first and last to the positions of the first
and last chunks of 2/pi that we need */
first = xexp / bitsper;
resexp = xexp - first * bitsper;
/* 180 is the theoretical maximum number of bits (actually
175 for IEEE double precision) that we need to extract
from the middle of 2/pi to compute the reduced argument
accurately enough for our purposes */
last = first + 180 / bitsper;
/* Do a long multiplication of the bits of 2/pi by the
integer mantissa */
#if 0
for (i = last; i >= first; i--)
{
u = pibits[i] * ux + carry;
res[i - first] = u & mask;
carry = u >> bitsper;
}
res[last - first + 1] = 0;
#else
/* Unroll the loop. This is only correct because we know
that bitsper is fixed as 10. */
res[19] = 0;
u = pibits[last] * ux;
res[18] = u & mask;
carry = u >> bitsper;
u = pibits[last-1] * ux + carry;
res[17] = u & mask;
carry = u >> bitsper;
u = pibits[last-2] * ux + carry;
res[16] = u & mask;
carry = u >> bitsper;
u = pibits[last-3] * ux + carry;
res[15] = u & mask;
carry = u >> bitsper;
u = pibits[last-4] * ux + carry;
res[14] = u & mask;
carry = u >> bitsper;
u = pibits[last-5] * ux + carry;
res[13] = u & mask;
carry = u >> bitsper;
u = pibits[last-6] * ux + carry;
res[12] = u & mask;
carry = u >> bitsper;
u = pibits[last-7] * ux + carry;
res[11] = u & mask;
carry = u >> bitsper;
u = pibits[last-8] * ux + carry;
res[10] = u & mask;
carry = u >> bitsper;
u = pibits[last-9] * ux + carry;
res[9] = u & mask;
carry = u >> bitsper;
u = pibits[last-10] * ux + carry;
res[8] = u & mask;
carry = u >> bitsper;
u = pibits[last-11] * ux + carry;
res[7] = u & mask;
carry = u >> bitsper;
u = pibits[last-12] * ux + carry;
res[6] = u & mask;
carry = u >> bitsper;
u = pibits[last-13] * ux + carry;
res[5] = u & mask;
carry = u >> bitsper;
u = pibits[last-14] * ux + carry;
res[4] = u & mask;
carry = u >> bitsper;
u = pibits[last-15] * ux + carry;
res[3] = u & mask;
carry = u >> bitsper;
u = pibits[last-16] * ux + carry;
res[2] = u & mask;
carry = u >> bitsper;
u = pibits[last-17] * ux + carry;
res[1] = u & mask;
carry = u >> bitsper;
u = pibits[last-18] * ux + carry;
res[0] = u & mask;
#endif
/* Reconstruct the result */
ltb = (int)((((res[0] << bitsper) | res[1])
>> (bitsper - 1 - resexp)) & 7);
/* determ says whether the fractional part is >= 0.5 */
determ = ltb & 1;
i = 1;
if (determ)
{
/* The mantissa is >= 0.5. We want to subtract it
from 1.0 by negating all the bits */
*region = ((ltb >> 1) + 1) & 3;
mant = 1;
mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
while (mant < 0x0020000000000000)
{
i++;
mant = (mant << bitsper) | (~(res[i]) & mask);
}
highbitsrr = ~(res[i + 1]) << (64 - bitsper);
}
else
{
*region = (ltb >> 1);
mant = 1;
mant = res[1] & ((mant << (bitsper - resexp)) - 1);
while (mant < 0x0020000000000000)
{
i++;
mant = (mant << bitsper) | res[i];
}
highbitsrr = res[i + 1] << (64 - bitsper);
}
rexp = 52 + resexp - i * bitsper;
while (mant >= 0x0020000000000000)
{
rexp++;
highbitsrr = (highbitsrr >> 1) | ((mant & 1) << 63);
mant >>= 1;
}
/* Put the result exponent rexp onto the mantissa pattern */
u = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
ux = (mant & MANTBITS_DP64) | u;
if (determ)
/* If we negated the mantissa we negate x too */
ux |= SIGNBIT_DP64;
PUT_BITS_DP64(ux, x);
/* Create the bit pattern for rr */
highbitsrr >>= 12; /* Note this is shifted one place too far */
u = ((unsigned long)rexp + EXPBIAS_DP64 - 53) << EXPSHIFTBITS_DP64;
PUT_BITS_DP64(u, t);
u |= highbitsrr;
PUT_BITS_DP64(u, xx);
/* Subtract the implicit bit we accidentally added */
xx -= t;
/* Set the correct sign, and double to account for the
"one place too far" shift */
if (determ)
xx *= -2.0;
else
xx *= 2.0;
/* (x,xx) is an extra-precise version of the fractional part of
x * 2 / pi. Multiply (x,xx) by pi/2 in extra precision
to get the reduced argument (r,rr). */
{
double hx, tx, c, cc;
/* Split x into hx (head) and tx (tail) */
GET_BITS_DP64(x, ux);
ux &= 0xfffffffff8000000;
PUT_BITS_DP64(ux, hx);
tx = x - hx;
c = piby2_lead * x;
cc = ((((piby2_part1 * hx - c) + piby2_part1 * tx) +
piby2_part2 * hx) + piby2_part2 * tx) +
(piby2_lead * xx + piby2_part3 * x);
*r = c + cc;
*rr = (c - *r) + cc;
}
return;
}

View File

@@ -0,0 +1,415 @@
;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the remainder by pi/2 function
; This is a service routine for use by trig functions coded in asm
;
; On input,
; xmm0 = x;
; On output
; xmm0 = r
; xmm1 = rr
; rax = region
.const
ALIGN 16
; Constants for the extra-precise multiply by pi/2 at the end of fname.
; Low qword = piby2_lead, high qword = piby2_part3, so that one MULPD with
; (x,x) yields c = piby2_lead * x (low) and piby2_part3 * x (high).
L__piby2_part3_piby2_lead DQ 03ff921fb54442d18h, 03c91a62633145c06h
; piby2_part1 / piby2_part2 duplicated in both lanes; multiplied by (tx,hx).
L__piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h
L__piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h
;; constants for CW reduction
; Successive pieces of pi/2 and their tails; each is multiplied by npi2
; in fsname.
L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h
L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h
L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h
L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h
L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h
L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h
L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h
L_point_five DQ 03FE0000000000000h, 03FE0000000000000h
; Integer 3: mask that turns npi2 into the region number.
L_int_three DQ 00000000000000003h, 00000000000000003h
; Exponent field of an IEEE double.
L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h
L_signbit DQ 08000000000000000h, 08000000000000000h
L_int_1 DQ 00000000000000001h, 00000000000000001h
; Exponent-difference thresholds used by fsname (integer bit patterns that
; are compared with COMISD).
L_int_15 DQ 0000000000000000Fh
L_int_48 DQ 00000000000000030h
; Bit patterns of the doubles 3pi/4, 5pi/4, 7pi/4 and 9pi/4; fsname compares
; them against the bit pattern of |x| as integers.
L_3pio4 DQ 04002D97C7F3321D2h
L_5pio4 DQ 0400F6A7A2955385Eh
L_7pio4 DQ 04015FDBBE9BBA775h
L_9pio4 DQ 0401c463abeccb2bbh
ALIGN 16
; Bits of 2/pi as a byte string. The leading (most significant) bits of
; 2/pi are at the END of the table: read backwards, the non-zero bytes
; start 002h, 08Bh, 0E6h, ... = 0A2F9836E...h >> 2. fname indexes the table
; at byte offset last = 134 - (xexp >> 3).
L__2_by_pi_bits DB 224, 241, 27, 193, 12, 88, 33, 116
DB 53, 126, 196, 126, 237, 175, 169, 75
DB 74, 41, 222, 231, 28, 244, 236, 197
DB 151, 175, 31, 235, 158, 212, 181, 168
DB 127, 121, 154, 253, 24, 61, 221, 38
DB 44, 159, 60, 251, 217, 180, 125, 180
DB 41, 104, 45, 70, 188, 188, 63, 96
DB 22, 120, 255, 95, 226, 127, 236, 160
DB 228, 247, 46, 126, 17, 114, 210, 231
DB 76, 13, 230, 88, 71, 230, 4, 249
DB 125, 209, 154, 192, 113, 166, 19, 18
DB 237, 186, 212, 215, 8, 162, 251, 156
DB 166, 196, 114, 172, 119, 248, 115, 72
DB 70, 39, 168, 187, 36, 25, 128, 75
DB 55, 9, 233, 184, 145, 220, 134, 21
DB 239, 122, 175, 142, 69, 249, 7, 65
DB 14, 241, 100, 86, 138, 109, 3, 119
DB 211, 212, 71, 95, 157, 240, 167, 84
DB 16, 57, 185, 13, 230, 139, 2, 0
DB 0, 0, 0, 0, 0, 0
; local storage offsets
region EQU 000h ; fname saves the region at [rsp + region]
stack_size EQU 018h ; stack bytes allocated by fname
sstack_size EQU 000h ; no stack for fsname
; fm.inc is expected to supply the StackAllocate / StackDeallocate macros
; used below (not visible in this file).
include fm.inc
fname TEXTEQU <__remainder_piby2_forAsm>
fsname TEXTEQU <__remainder_piby2_cw_forAsm>
.code
;-----------------------------------------------------------------------
; __remainder_piby2_forAsm
; Multiplies the 53-bit mantissa of |x| by a 192-bit window of the bits
; of 2/pi, splits off the integer part to get the region, and multiplies
; the remaining fraction (x,xx) by pi/2 in extra precision.
; In:    xmm0 (low qword) = |x|
; Out:   xmm0 = r, xmm1 = rr (reduced argument is r + rr)
;        rax  = region (0..3)
; Uses:  rax, rcx, rdx, r8-r11, xmm0-xmm5, stack_size bytes of stack
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
; This function is not using rdx, r8, and r9 as pointers;
; all returns are in registers
; get the unbiased exponent and the mantissa part of x
lea r9,L__2_by_pi_bits ; r9 <-- base address of the 2/pi byte table
; xexp = (x >> 52) - 1023
movd r11,xmm0 ; r11 <-- bit pattern of x (sign bit assumed clear)
mov rcx,r11 ; rcx <-- copy, becomes the mantissa below
shr r11,52
sub r11,1023 ; r11 <-- xexp = exponent of input x
; calculate the last byte from which to start multiplication
; last = 134 - (xexp >> 3)
; NOTE(review): presumably callers guarantee xexp >= 0 so that the
; logical shift below is meaningful -- confirm against the callers.
mov r10,r11
shr r10,3
sub r10,134 ; r10 <-- -last
neg r10 ; r10 <-- last
; load 64 bits of 2_by_pi (least significant word of the window)
mov rax,[r9 + r10]
; mantissa of x = ((x << 12) >> 12) | implied bit
shl rcx,12
shr rcx,12 ; rcx <-- mantissa part of input x
bts rcx,52 ; add the implied bit as well
; load next 128 bits of 2_by_pi (unaligned load, hence movdqu)
add r10,8 ; increment to next 8 bytes of 2_by_pi
movdqu xmm0,[r9 + r10]
; do three 64-bit multiplications with mant of x
mul rcx ; rdx:rax <-- lowest window word * mantissa
mov r8,rax ; r8 <-- last 64 bits of mul = res1[2]
mov r10,rdx ; r10 <-- carry
movd rax,xmm0 ; rax <-- middle window word
mul rcx
; resexp = xexp & 7
and r11,7 ; r11 <-- resexp = xexp & 7 = last 3 bits
psrldq xmm0,8 ; bring the top window word into the low qword
add rax,r10 ; add the previous carry
adc rdx,0
mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]
mov r10,rdx ; r10 <-- carry
movd rax,xmm0 ; rax <-- top window word
mul rcx ; only the low half of this product is kept
add r10,rax ; r10 <-- most sig. 64 bits = res1[0]
; find the region
; last three bits ltb = most sig bits >> (54 - resexp));
; decimal point in last 18 bits ==> 8 lsb's in first 64 bits
; and 8 msb's in next 64 bits
; point_five = ltb & 01h;
; region = ((ltb >> 1) + point_five) & 3;
mov rcx,54
mov rax,r10
sub rcx,r11 ; rcx <-- 54 - resexp (never zero, so SHR defines CF)
xor rdx,rdx ; rdx <-- sign of x
shr rax,cl ; CF <-- the point_five bit (last bit shifted out)
jnc L__no_point_five
; if there is carry then negate the result of multiplication
; (NOT and MOV leave CF untouched, so the ADC below still sees it)
not r10
not r9
not r8
mov rdx,08000000000000000h ; the reduced value will be negative
ALIGN 16
L__no_point_five:
adc rax,0 ; round the quadrant count up when fraction >= 0.5
and rax,3 ; rax now has region
mov QWORD PTR [region+rsp],rax ; keep region across the code below
; calculate the number of integer bits and zero them out
mov rcx,r11
add rcx,10 ; rcx = no. of integer bits
shl r10,cl
shr r10,cl ; r10 contains only mant bits
sub rcx,64 ; form the exponent
mov r11,rcx ; r11 <-- running exponent of the result
; find the highest set bit
bsr rcx,r10 ; ZF = 1 when r10 has no fraction bits set
jnz L__form_mantissa
; top word is empty: shift the 192-bit value up by one whole word
mov r10,r9
mov r9,r8
mov r8,0
bsr rcx,r10 ; rcx = hsb
sub r11,64 ; account for the 64-bit shift in the exponent
ALIGN 16
L__form_mantissa:
add r11,rcx ; for exp of x
sub rcx,52 ; rcx = no. of bits to shift in r10
cmp rcx,0
jl L__hsb_below_52
je L__form_numbers
; hsb above 52: shift right, spilling the low bits of r10 into r9
mov r8,r10 ; previous contents of r8 not required
shr r10,cl ; r10 = mantissa of x with hsb at 52
shr r9,cl ; make space for bits from r10
sub rcx,64
neg rcx
; rcx <-- no of bits to shift r10 to move those bits to r9
shl r8,cl
or r9,r8 ; r9 = mantissa bits of xx
jmp L__form_numbers
ALIGN 16
L__hsb_below_52:
; hsb below 52: shift left, pulling bits up from r9 (and r8 into r9)
neg rcx ; rcx <-- left shift count
mov rax,r9
shl r10,cl
shl r9,cl
sub rcx,64
neg rcx ; rcx <-- 64 - left shift count
shr rax,cl ; top bits of the old r9 ...
or r10,rax ; ... become the low bits of the mantissa
shr r8,cl
or r9,r8 ; refill the low bits of r9 from r8
ALIGN 16
L__form_numbers:
; r10 = 53-bit mantissa (bit 52 set), r11 = unbiased exponent,
; rdx = sign bit, r9 = bits that follow the mantissa
add r11,1023 ; r11 <-- biased exponent
btr r10,52 ; remove the implicit bit
mov rcx,r11
or r10,rdx ; put the sign
shl rcx,52
or r10,rcx ; r10 <-- x
movd xmm0,r10 ; xmm0 <-- x
movdqa xmm1,xmm0 ; xmm1 <-- x
psrlq xmm1,27
psllq xmm1,27 ; xmm1 <-- hx (x with the low 27 bits cleared)
movdqa xmm2,xmm0 ; xmm2 <-- x
subsd xmm2,xmm1 ; xmm2 <-- tx
movlhps xmm0,xmm0 ; xmm0 <-- x,x
movlhps xmm2,xmm1 ; xmm2 <-- hx,tx
movdqa xmm1,XMMWORD PTR L__piby2_part3_piby2_lead
movdqa xmm3,XMMWORD PTR L__piby2_part1
movdqa xmm4,XMMWORD PTR L__piby2_part2
; form xx
; presumably rcx is cleared first so that it holds 0 if BSR does not
; write its destination (r9 == 0)
xor rcx,rcx
bsr rcx,r9 ; rcx <-- index of the highest set bit of r9
sub rcx,64 ; to shift the implicit bit as well
neg rcx
shl r9,cl ; shift the leading one out of the register
shr r9,12 ; position the remaining bits as a 52-bit mantissa
add rcx,52
sub r11,rcx ; r11 <-- biased exponent of xx
shl r11,52
or r9,rdx ; xx takes the same sign as x
or r9,r11
movd xmm5,r9 ; xmm5 <-- xx
mulpd xmm0,xmm1 ; xmm0 <-- piby2_part3 * x,piby2_lead * x = c
mulpd xmm5,xmm1 ; xmm5 <-- piby2_lead * xx
mulpd xmm3,xmm2 ; xmm3 <-- piby2_part1 * hx,piby2_part1 * tx
mulpd xmm4,xmm2 ; xmm4 <-- piby2_part2 * hx,piby2_part2 * tx
; cc = (piby2_part1 * hx - c) + (piby2_part1 * tx) +
; (piby2_part2 * hx) + (piby2_part2 * tx) +
; (piby2_lead * xx + piby2_part3 * x)
movhlps xmm1,xmm3 ; xmm1 = piby2_part1 * hx
movhlps xmm2,xmm4 ; xmm2 = piby2_part2 * hx
subsd xmm1,xmm0 ; xmm1 = (piby2_part1 * hx - c)
addsd xmm1,xmm3 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx)
movhlps xmm3,xmm0 ; xmm3 = piby2_part3 * x
addsd xmm1,xmm2
; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + (piby2_part2 * hx)
addsd xmm3,xmm5 ; xmm3 = (piby2_lead * xx + piby2_part3 * x)
addsd xmm1,xmm4
; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) +
; (piby2_part2 * hx) + (piby2_part2 * tx)
addsd xmm1,xmm3 ; xmm1 = cc
; xmm0 <-- c, xmm1 <-- cc
; r = c + cc
; rr = (c - r) + cc
movdqa xmm2,xmm0 ; xmm2 <-- copy of c
addsd xmm0,xmm1 ; xmm0 <-- r = c + cc
subsd xmm2,xmm0 ; xmm2 <-- c - r
addsd xmm1,xmm2 ; xmm1 <-- rr = cc + (c - r)
mov rax, QWORD PTR[region+rsp] ; rax <-- region
StackDeallocate stack_size
ret
fname endp
; NOTE: If this is not going to be used, should probably remove it. - WAT
;-----------------------------------------------------------------------
; __remainder_piby2_cw_forAsm
; Reduces |x| by npi2 multiples of pi/2, subtracting pi/2 in up to three
; successively finer pieces depending on how much cancellation occurs.
; In:    xmm0 (low qword) = |x|
;        r9   = bit pattern of |x|
;        ASSUMPTION: if we call this function, |x| > pi/4
; Out:   xmm0 = r, xmm1 = rr (reduced argument is r + rr)
;        rax and xmm2 (low qword) = region = npi2 & 3
; Uses:  rax, r8, xmm0-xmm5; no stack (sstack_size is 0)
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fsname
fsname PROC FRAME
StackAllocate sstack_size
.ENDPROLOG
; For |x| <= 9pi/4 pick npi2 by comparing bit patterns as integers
; (valid because |x| is non-negative).
xor r8d,r8d
cmp r9, QWORD PTR L_5pio4
ja Lax_gt_5pio4
cmp r9, QWORD PTR L_3pio4
seta r8b
inc r8d ; npi2 = 1 (|x| <= 3pi/4) or 2
jmp Lstage_npi2
Lax_gt_5pio4:
cmp r9, QWORD PTR L_9pio4
ja Lnpi2_full_computation
cmp r9, QWORD PTR L_7pio4
seta r8b
add r8d,3 ; npi2 = 3 (|x| <= 7pi/4) or 4
Lstage_npi2:
movd xmm2, r8d
cvtdq2pd xmm4, xmm2 ; xmm4 <-- (double)npi2, unmasked
; npi2 can be 4 on this path; reduce it mod 4 so that the region agrees
; with the full-computation path below, which also applies "& 3".
andpd xmm2,L_int_three ; xmm2 <-- region = npi2 & 3
jmp Lnpi2_known
Lnpi2_full_computation:
movapd xmm5,xmm0
mulsd xmm5, L_twobypi
addsd xmm5, L_point_five ; xmm5 <-- |x|*2/pi + .5
cvttpd2dq xmm5,xmm5 ; xmm5 <-- npi2 = int part
movapd xmm2,xmm5
andpd xmm2,L_int_three ; xmm2 <-- region = npi2 & 3
cvtdq2pd xmm4,xmm5 ; xmm4 <-- (double)npi2
Lnpi2_known:
movapd xmm5,xmm4
mulsd xmm5,QWORD PTR L_piby2_1 ; xmm5 <-- npi2*piby2_1
xorpd xmm5,L_signbit ; xmm5 <-- -npi2*piby2_1
addpd xmm5,xmm0 ; xmm5 <-- rhead = x - npi2*piby2_1
movapd xmm3,xmm4
mulsd xmm3,QWORD PTR L_piby2_1tail ; xmm3 <-- rtail = npi2*piby2_1tail
; If x is nearly a multiple of pi/2, rhead will be small compared to |x|
; we check this by checking exponent difference.
; Note that both the unbiased exponents are positive, and that of rhead
; must be <= that of |x|
movapd xmm1,xmm5 ; xmm1l <-- rhead
subpd xmm1,xmm3 ; xmm1l <-- r = rhead - rtail
andpd xmm1,L_inf_mask_64 ; keep only the exponent field of r
psubq xmm0,xmm1 ; integer subtract: |x| bits - exponent field of r
psrlq xmm0,52 ; xmm0 <-- expdiff = exponent(|x|) - exponent(r)
; expdiff and the thresholds are small integers; COMISD compares their
; bit patterns as (denormal) doubles, which orders them like integers.
; NOTE(review): this relies on denormal operands not being treated as
; zero (MXCSR.DAZ clear) -- confirm for the intended callers.
comisd xmm0,L_int_15
jbe Lcw_get_r_rr
; here expdiff > 15, so x is nearly a multiple of pi/2 and things are hard
; we use another piece of pi/2 in the reduction
movapd xmm1,xmm5 ; xmm1 <-- t = rhead
movapd xmm3,xmm4
mulsd xmm3,QWORD PTR L_piby2_2 ; xmm3 <--- rtail = npi2*piby2_2
subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail
; now rtail = npi2*piby2_2tail - ((t-rhead) - rtail)
subsd xmm1,xmm5
subsd xmm1,xmm3
movapd xmm3,xmm4
mulsd xmm3,QWORD PTR L_piby2_2tail
subsd xmm3,xmm1 ; xmm3 <-- rtail
comisd xmm0,L_int_48 ; xmm0 still holds expdiff
jbe Lcw_get_r_rr
; here expdiff > 48, so x is REALLY close to a multiple of pi/2
; and we use yet another piece of pi/2 in the reduction
movapd xmm0,xmm5 ; xmm0 <-- t = rhead
movapd xmm3,xmm4
mulsd xmm3,QWORD PTR L_piby2_3 ; xmm3 <-- rtail = npi2 * piby2_3
movapd xmm5,xmm0
subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail
; now rtail = npi2 * piby2_3tail - ((t - rhead) - rtail)
movapd xmm1,xmm0
subsd xmm1,xmm5
subsd xmm1,xmm3
movapd xmm3,xmm4
mulsd xmm3,QWORD PTR L_piby2_3tail
subsd xmm3,xmm1 ; xmm3 <-- rtail
Lcw_get_r_rr:
; We have a satisfactory rhead in xmm5 and rtail in xmm3
; We now produce r in xmm0 and rr in xmm1, where the actual reduced argument
; is the sum of r and rr, and rr is insignificant
; with respect to r under addition (i.e., r + rr == r).
movapd xmm0,xmm5 ; xmm0 <-- rhead
subsd xmm0,xmm3 ; xmm0 <-- r = rhead - rtail
movapd xmm1,xmm5 ; xmm1 <-- rhead
subsd xmm1,xmm0 ; xmm1 <-- (rhead - r)
subsd xmm1,xmm3 ; xmm1 <-- rr = (rhead - r) - rtail
movd rax,xmm2 ; rax <-- region
StackDeallocate sstack_size
ret
fsname endp
END

View File

@@ -0,0 +1,283 @@
;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the remainder by pi/2 function using fma3
; This is a service routine for use by trig functions coded in asm that use fma3
;
; On input,
; xmm0 = x;
; On output
; xmm0 = r
; xmm1 = rr
; rax = region
.const
ALIGN 16
; Constants for the extra-precise multiply of (x,xx) by pi/2 in fname.
; Every constant is duplicated in both 64-bit lanes.
L_piby2_lead DQ 03ff921fb54442d18h, 03ff921fb54442d18h
; Mask that clears the low 27 bits of a double (used to form hx).
L_fff800 DQ 0fffffffff8000000h, 0fffffffff8000000h
L_piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h
L_piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h
L_piby2_part3 DQ 03c91a62633145c06h, 03c91a62633145c06h
; The constants from L_piby2_1 to L_signbit are not referenced by either
; procedure visible in this file.
L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h
L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h
L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h
L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h
L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h
L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h
L_sign_mask DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh
L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h
L_point_five DQ 03FE0000000000000h, 03FE0000000000000h
L_int_three DQ 00000000000000003h, 00000000000000003h
L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h
L_signbit DQ 08000000000000000h, 08000000000000000h
;; constants for BDL reduction
L_r DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h ; 2/pi
L_xc1 DQ 03FF921FB54442D18H, 03FF921FB54442D18h ; pi/2 (L_piby2_lead)
L_xc2 DQ 03C91A62633145C00H, 03C91A62633145C00h ; pi/2 part 2
L_xc3 DQ 0397B839A252049C0H, 0397B839A252049C0h ; pi/2 part 3
; sigma is 3*2^(p-n-2) where n is 0 and p is 53.
L_sigma DQ 04338000000000000h, 04338000000000000h ; 6755399441055744.
; Byte table of the bits of 2/pi, defined in another module.
EXTRN __L_2_by_pi_bits:BYTE
region EQU 020h ; fname saves the region at [rsp + region]
stack_size EQU 038h ; stack bytes allocated by fname
; fm.inc is expected to supply the StackAllocate / StackDeallocate macros
; used below (not visible in this file).
include fm.inc
fname TEXTEQU <__remainder_piby2_fma3>
fbname TEXTEQU <__remainder_piby2_fma3_bdl>
.code
;-----------------------------------------------------------------------
; __remainder_piby2_fma3
; Multiplies the 53-bit mantissa of x by a 192-bit window of the bits of
; 2/pi, splits off the integer part to get the region, and multiplies the
; remaining fraction (x,xx) by pi/2 in extra precision using FMA3.
; In:    xmm0 (low qword) = x (bit pattern is used as-is; sign bit
;        assumed clear)
; Out:   xmm0 = r, xmm1 = rr (reduced argument is r + rr)
;        rax  = region (0..3)
; Uses:  rax, rcx, rdx, r8-r11, xmm0-xmm5, stack_size bytes of stack
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
; This function is not using rdx, r8, and r9 as pointers;
; all returns are in registers
; get the unbiased exponent and the mantissa part of x
lea r9,__L_2_by_pi_bits ; r9 <-- base address of the 2/pi byte table
; xexp = (x >> 52) - 1023
vmovq r11,xmm0
mov rcx,r11 ; rcx <-- copy, becomes the mantissa below
shr r11,52
sub r11,1023 ; r11 <-- xexp = exponent of input x
; calculate the last byte from which to start multiplication
; last = 134 - (xexp >> 3)
mov r10,r11
shr r10,3
sub r10,134 ; r10 <-- -last
neg r10 ; r10 <-- last
; load 64 bits of 2_by_pi (least significant word of the window)
mov rax,[r9 + r10]
; mantissa of x = ((x << 12) >> 12) | implied bit
shl rcx,12
shr rcx,12 ; rcx <-- mantissa part of input x
bts rcx,52 ; add the implied bit as well
; load next 128 bits of 2_by_pi
add r10,8 ; increment to next 8 bytes of 2_by_pi
vmovdqu xmm0,XMMWORD PTR[r9 + r10]
; do three 64-bit multiplications with mant of x
mul rcx
mov r8,rax ; r8 <-- last 64 bits of mul = res1[2]
mov r10,rdx ; r10 <-- carry
vmovq rax,xmm0 ; rax <-- middle window word
mul rcx
; resexp = xexp & 7
and r11,7 ; r11 <-- resexp = last 3 bits of xexp
vpsrldq xmm0,xmm0,8 ; bring the top window word into the low qword
add rax,r10 ; add the previous carry
adc rdx,0
mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]
mov r10,rdx ; r10 <-- carry
vmovq rax,xmm0 ; rax <-- top window word
mul rcx ; only the low half of this product is kept
add r10,rax ; r10 <-- most sig. 64 bits = res1[0]
; find the region
; last three bits ltb = most sig bits >> (54 - resexp));
; decimal point in last 18 bits ==> 8 lsb's in first 64 bits
; and 8 msb's in next 64 bits
; point_five = ltb & 01h;
; region = ((ltb >> 1) + point_five) & 3;
mov rcx,54
mov rax,r10
sub rcx,r11 ; rcx <-- 54 - resexp (never zero, so SHR defines CF)
xor rdx,rdx ; rdx <-- sign of x
shr rax,cl ; CF <-- the point_five bit (last bit shifted out)
jnc L__no_point_five
; if there is carry then negate the result of multiplication
; (NOT and MOV leave CF untouched, so the ADC below still sees it)
not r10
not r9
not r8
mov rdx,08000000000000000h ; the reduced value will be negative
ALIGN 16
L__no_point_five:
adc rax,0 ; round the quadrant count up when fraction >= 0.5
and rax,3 ; rax now has region
mov QWORD PTR [region+rsp], rax ; keep region across the code below
; calculate the number of integer bits and zero them out
mov rcx,r11
add rcx,10 ; rcx = no. of integer bits
shl r10,cl
shr r10,cl ; r10 contains only mant bits
sub rcx,64 ; form the exponent
mov r11,rcx ; r11 <-- running exponent of the result
; find the highest set bit
bsr rcx,r10 ; ZF = 1 when r10 has no fraction bits set
jnz L__form_mantissa
; top word is empty: shift the 192-bit value up by one whole word
mov r10,r9
mov r9,r8
mov r8,0
bsr rcx,r10 ; rcx = hsb
sub r11,64 ; account for the 64-bit shift in the exponent
ALIGN 16
L__form_mantissa:
add r11,rcx ; for exp of x
sub rcx,52 ; rcx = no. of bits to shift in r10
cmp rcx,0
jl L__hsb_below_52
je L__form_numbers
; hsb above 52: shift right, spilling the low bits of r10 into r9
mov r8,r10 ; previous r8 not required
shr r10,cl ; r10 = mantissa of x with hsb at 52
shr r9,cl ; make space for bits from r10
sub rcx,64
neg rcx
; rcx <-- no of bits to shift r10 to move those bits to r9
shl r8,cl
or r9,r8 ; r9 = mantissa bits of xx
jmp L__form_numbers
ALIGN 16
L__hsb_below_52:
; rcx has shift count (< 0)
neg rcx ; rcx <-- left shift count
mov rax,r9
shl r10,cl
shl r9,cl
sub rcx,64
neg rcx ; rcx <-- 64 - left shift count
shr rax,cl ; top bits of the old r9 ...
or r10,rax ; ... become the low bits of the mantissa
shr r8,cl
or r9,r8 ; refill the low bits of r9 from r8
ALIGN 16
; Here r11 has unbiased exponent
; r10 has mantissa, with implicit bit possibly set
; rdx has the sign bit
L__form_numbers:
add r11,1023 ; r11 <-- biased exponent
btr r10,52 ; remove the implicit bit
mov rcx,r11 ; rcx <-- copy of biased exponent
or r10,rdx ; put the sign
shl rcx,52 ; shift biased exponent into place
or r10,rcx ; r10 <-- x
vmovq xmm2,r10 ; xmm2 <-- x
; form xx
; BSR leaves its destination architecturally undefined when the source is
; zero, so select bit index 0 explicitly in that case instead of using
; whatever happens to be in rcx.
xor eax,eax ; rax is free here; region is reloaded from the stack below
bsr rcx,r9 ; scan for high bit of xx mantissa; ZF = 1 if r9 == 0
cmovz rcx,rax ; r9 == 0: use bit index 0
sub rcx,64 ; to shift the implied bit as well
neg rcx
shl r9,cl ; shift the leading one out of the register
shr r9,12 ; position the remaining bits as a 52-bit mantissa
add rcx,52
sub r11,rcx ; r11 <-- biased exponent of xx
shl r11,52
or r9,rdx ; xx takes the same sign as x
or r9,r11
vmovq xmm1,r9 ; xmm1 <-- xx
; c  = piby2_lead * x
; cc = ((((piby2_part1 * hx - c) + piby2_part1 * tx) +
;        piby2_part2 * hx) + piby2_part2 * tx) +
;      (piby2_lead * xx + piby2_part3 * x)
vandpd xmm4,xmm2,L_fff800 ; xmm4 <-- hx
vsubsd xmm0,xmm2,xmm4 ; xmm0 <-- tx
vmulsd xmm5,xmm2,L_piby2_lead ; xmm5 <-- c
vmulsd xmm3,xmm4,L_piby2_part1 ; xmm3 <-- piby2_part1 * hx
vsubsd xmm3,xmm3,xmm5 ; xmm3 <-- piby2_part1 * hx - c
vfmadd231sd xmm3,xmm0,L_piby2_part1 ; xmm3 += piby2_part1 * tx
vfmadd231sd xmm3,xmm4,L_piby2_part2 ; xmm3 += piby2_part2 * hx
vfmadd231sd xmm3,xmm0,L_piby2_part2 ; xmm3 += piby2_part2 * tx
vmulsd xmm4,xmm1,L_piby2_lead ; xmm4 <-- piby2_lead * xx
vfmadd231sd xmm4,xmm2,L_piby2_part3 ; xmm4 += piby2_part3 * x
vaddsd xmm3,xmm3,xmm4 ; xmm3 <-- cc
vaddsd xmm0,xmm5,xmm3 ; xmm0 <-- r = c + cc
vsubsd xmm1,xmm5,xmm0 ; xmm1 <-- c - r
vaddsd xmm1,xmm1,xmm3 ; xmm1 <-- rr = (c - r) + cc
mov rax, QWORD PTR [region+rsp] ; rax <-- region
StackDeallocate stack_size
ret
fname endp
;-----------------------------------------------------------------------
; __remainder_piby2_fma3_bdl
; FMA-based reduction of the argument by z multiples of pi/2, where z is
; arg*(2/pi) rounded to an integer by adding and subtracting L_sigma.
; In:    xmm0 (low qword) = arg
; Out:   xmm0 = arghead, xmm1 = argtail, rax = region = z & 3
; Uses:  rax, xmm0-xmm5; no stack
; NOTE(review): z is converted with a 32-bit conversion (vcvttpd2dq), so
; the region is presumably only meaningful while z fits in 32 bits --
; confirm the argument range guaranteed by callers.
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fbname
fbname PROC FRAME
.ENDPROLOG
; Boldo, Daumas, and Li, "Formally Verified Argument
; Reduction With a Fused Multiply-Add,"
; IEEE Trans. Comp., vol. 58, #8, Aug. 2009
; coefficients are from table 1, mutatis mutandis
; algorithm is their formula 3.1 (for getting z from sigma) and
; algorithm 5.1 (and extended version) for actual reduction
vmovapd xmm1,xmm0 ; xmm1 <-- arg
vmovapd xmm4,L_xc2 ; xmm4 <-- xc2
vmovapd xmm2,L_sigma ; xmm2 <-- sigma
vfmadd132sd xmm1,xmm2,L_r ; z = arg*r + sigma
vsubsd xmm1,xmm1,xmm2 ; xmm1 <-- z -= sigma
vcvttpd2dq xmm5,xmm1 ; xmm5 (low dword) <-- (int)z
vmovq rax, xmm5 ; rax <-- integer z; masked to the region at the end
vmovapd xmm2,xmm1 ; xmm2 <-- copy of z
vfnmadd132sd xmm2,xmm0,L_xc1 ; xmm2 <-- u = arg - z*xc1
vmulsd xmm3,xmm1,xmm4 ; xmm3 <-- p1 = z*xc2
vmovapd xmm0,xmm1 ; xmm0 <-- copy of z
vfmsub213sd xmm0,xmm4,xmm3 ; xmm0 <-- p2 = z*xc2 - p1
vsubsd xmm5,xmm2,xmm3 ; xmm5 <-- t1 = u - p1
; We really don't want to spill in this code, so we're commandeering xmm4
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- temp = u - t1
vsubsd xmm4,xmm4,xmm3 ; xmm4 <-- t2 = temp - p1
; used to use xmm4 here for L_xc2
vfnmadd231sd xmm2,xmm1,L_xc2 ; xmm2 <-- v1 = -xc2*z + u
vsubsd xmm5,xmm5,xmm2 ; xmm5 <-- v2 = t1 - v1
vaddsd xmm5,xmm5,xmm4 ; xmm5 <-- v2 += t2
vsubsd xmm5,xmm5,xmm0 ; xmm5 <-- v2 -= p2
vmovapd xmm0,xmm2 ; xmm0 <-- arghead = v1
vfnmadd132sd xmm1,xmm5,L_xc3 ; xmm1 <-- argtail = -xc3*z + v2
and rax, 3 ; rax <-- region
ret
fbname endp
END

View File

@@ -0,0 +1,173 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
/* Reduce a positive argument, supplied as the bit pattern ux of an IEEE
   double, to the range [-pi/4,pi/4] using extra precision.  The reduced
   value is stored in *r.  *region receives, mod 4, the number of lots
   of pi/2 that were subtracted to reach that range.

   NOTE(review): the arithmetic below needs 64-bit integers, but
   `unsigned long` is only 32 bits wide on LLP64 targets such as Win64.
   Confirm the intended integer type against the prototype in the
   project headers. */
void __remainder_piby2f(unsigned long ux, double *r, int *region)
{
    /* Simulated multi-precision floating-point arithmetic;
       accurate for all 1 <= x < infinity */
#define bitsper 36
    static const double
        piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
    /* Bits of 2/pi in chunks of bitsper bits, most significant first */
    static unsigned long pibits[] =
    {
        0LL,
        5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
        34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
        19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
    };
    unsigned long res[10];
    unsigned long prod, carry, chunkmask, fracmask, mant, nextbits;
    int first, idx, k, rexp, xexp, resexp, ltb, negate, shift;
    double dx;

    /* Unbiased exponent, then the top 24 bits of the mantissa
       (implicit bit included) as an integer */
    xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
    ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;

    chunkmask = 1;
    chunkmask = (chunkmask << bitsper) - 1;

    /* first is the index of the first chunk of 2/pi that is needed;
       120 / bitsper = 3 further chunks follow it (120 bits is the
       theoretical maximum, actually 115 for IEEE single precision) */
    first = xexp / bitsper;
    resexp = xexp - first * bitsper;

    /* Long multiplication of four chunks of 2/pi by the integer
       mantissa, least significant chunk first.  Four iterations are
       only correct because bitsper is fixed as 36. */
    res[4] = 0;
    carry = 0;
    for (k = 3; k >= 0; k--)
    {
        prod = pibits[first + k] * ux + carry;
        res[k] = prod & chunkmask;
        carry = prod >> bitsper;
    }

    /* Reconstruct the result: two quadrant bits plus the first
       fraction bit */
    ltb = (int)((((res[0] << bitsper) | res[1])
                 >> (bitsper - 1 - resexp)) & 7);
    /* negate is set when the fractional part is >= 0.5 */
    negate = ltb & 1;

    /* Mask selecting the fraction bits held in res[1] */
    fracmask = 1;
    fracmask = (fracmask << (bitsper - resexp)) - 1;

    idx = 1;
    if (negate)
    {
        /* Fraction >= 0.5: subtract it from 1.0 by complementing
           every bit, and move on to the next region */
        *region = ((ltb >> 1) + 1) & 3;
        mant = ~(res[1]) & fracmask;
        for (; mant < 0x0000000000010000; )
        {
            idx++;
            mant = (mant << bitsper) | (~(res[idx]) & chunkmask);
        }
        nextbits = (~(res[idx + 1]) & chunkmask);
    }
    else
    {
        *region = (ltb >> 1);
        mant = res[1] & fracmask;
        for (; mant < 0x0000000000010000; )
        {
            idx++;
            mant = (mant << bitsper) | res[idx];
        }
        nextbits = res[idx + 1];
    }

    /* Normalize the mantissa: coarse steps of 6 bits first (a value
       found by trial and error to be fast), then single bits */
    shift = 0;
    for (; mant < 0x0000400000000000; mant <<= 6)
        shift += 6;
    for (; mant < 0x0010000000000000; mant <<= 1)
        shift++;
    /* Fill the vacated low bits from the following chunk */
    mant |= nextbits >> (bitsper - shift);

    rexp = 52 + resexp - shift - idx * bitsper;

    /* Assemble exponent, mantissa and sign into a double */
    prod = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
    ux = (mant & MANTBITS_DP64) | prod;
    if (negate)
        /* The mantissa was complemented, so the value is negative */
        ux |= SIGNBIT_DP64;
    PUT_BITS_DP64(ux, dx);

    /* dx is a double precision version of the fractional part of
       x * 2 / pi.  Multiplying by pi/2 gives the reduced argument. */
    *r = dx * piby2;
}

View File

@@ -0,0 +1,180 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the remainder by pi/2 function
; This is a service routine for use by trig functions coded in asm
;
; On input,
; xmm0 = x; Note that we assume x >= pi/4
; On output
; xmm0 = r
; eax = region
.const
ALIGN 16
; pi/2 as a double; the final multiplier applied to the reduced fraction.
L__piby2 DQ 03ff921fb54442d18h
; Byte table of the bits of 2/pi, defined in another module.
EXTRN __L_2_by_pi_bits:BYTE
fname TEXTEQU <__remainder_piby2d2f_forAsm>
stack_size EQU 000h ; this routine keeps nothing on the stack
; fm.inc is expected to supply the StackAllocate / StackDeallocate macros
; used below (not visible in this file).
include fm.inc
.code
;-----------------------------------------------------------------------
; __remainder_piby2d2f_forAsm
; Multiplies the 53-bit mantissa of x by a 192-bit window of the bits of
; 2/pi, splits off the integer part to get the region, and multiplies the
; remaining fraction (rounded to one double) by pi/2.
; In:    xmm0 (low qword) = x; x >= pi/4 is assumed
; Out:   xmm0 = r, rax = region (0..3)
; Uses:  rax, rcx, rdx, r8-r11, xmm0, xmm1; no stack
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
lea r9,__L_2_by_pi_bits ; r9 <-- base address of the 2/pi byte table
;get the unbiased exponent and the mantissa part of x
;Since x >= pi/4, xexp = (x >> 52) - 1023
movd r11,xmm0 ; r11 <-- bit pattern of x
mov rcx,r11 ; rcx <-- copy, becomes the mantissa below
shr r11,52
sub r11,1023 ; r11 <-- xexp = exponent of input x
;calculate the last byte from which to start multiplication
;last = 134 - (xexp >> 3)
mov r10,r11
shr r10,3
sub r10,134 ;r10 = -last
neg r10 ;r10 = last
;load 64 bits of 2_by_pi (least significant word of the window)
mov rax,[r9 + r10]
;mantissa of x = ((x << 12) >> 12) | implied bit
shl rcx,12
shr rcx,12 ;rcx = mantissa part of input x
bts rcx,52 ;add the implied bit as well
;load next 128 bits of 2_by_pi
add r10,8 ;increment to next 8 bytes of 2_by_pi
movdqu xmm0,[r9 + r10]
;do three 64-bit multiplications with mant of x
mul rcx
mov r8,rax ;r8 = last 64 bits of mul = res1[2]
mov r10,rdx ;r10 <-- carry
movd rax,xmm0 ;rax <-- middle window word
mul rcx
;resexp = xexp & 7
and r11,7 ;r11 = resexp = xexp & 7 = last 3 bits
psrldq xmm0,8 ;bring the top window word into the low qword
add rax,r10 ; add the previous carry
adc rdx,0
mov r9,rax ;r9 = next 64 bits of mul = res1[1]
mov r10,rdx ;r10 <-- carry
movd rax,xmm0 ;rax <-- top window word
mul rcx ;only the low half of this product is kept
add r10,rax ;r10 = most sig 64 bits = res1[0]
;find the region
;last three bits ltb = most sig bits >> (54 - resexp))
; decimal point in last 18 bits ==> 8 lsb's in first 64 bits and
; 8 msb's in next 64 bits
;point_five = ltb & 01h;
;region = ((ltb >> 1) + point_five) & 3;
mov rcx,54
mov rax,r10
sub rcx,r11 ;rcx = 54 - resexp (never zero, so SHR defines CF)
xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi)
shr rax,cl ;CF = the point_five bit (last bit shifted out)
jnc L__no_point_five
;;if there is carry.. then negate the result of multiplication
;;(NOT and MOV leave CF untouched, so the ADC below still sees it)
not r10
not r9
not r8
mov rdx,08000000000000000h ;the reduced value will be negative
ALIGN 16
L__no_point_five:
adc rax,0 ;round the quadrant count up when fraction >= 0.5
and rax,3 ;rax = region
; Until / unless we find a better place to save it, we're putting
; the region in xmm1.
movd xmm1, rax
;calculate the number of integer bits and zero them out
mov rcx,r11
add rcx,10 ;rcx = no. of integer bits
shl r10,cl
shr r10,cl ;r10 contains only mant bits
sub rcx,64 ;form the exponent
mov r11,rcx ;r11 = running exponent of the result
;find the highest set bit
bsr rcx,r10 ;ZF = 1 when r10 has no fraction bits set
jnz L__form_mantissa
;top word is empty: move the next word up
mov r10,r9
mov r9,r8
bsr rcx,r10 ;rcx = hsb
sub r11,64 ;account for the 64-bit shift in the exponent
ALIGN 16
L__form_mantissa:
add r11,rcx ;for exp of x
sub rcx,52 ;rcx = no. of bits to shift in r10
cmp rcx,0
jl L__hsb_below_52
je L__form_numbers
;hsb above 52: only a single double is produced, so the bits shifted
;out of r10 are discarded (r8 is not read again in this routine)
mov r8,r10 ;previous contents of r8 not required
shr r10,cl ;r10 = mantissa of x with hsb at 52
jmp L__form_numbers
ALIGN 16
L__hsb_below_52:
;hsb below 52: shift left and pull the missing low bits up from r9
neg rcx ;rcx = left shift count
mov rax,r9
shl r10,cl
shl r9,cl
sub rcx,64
neg rcx ;rcx = 64 - left shift count
shr rax,cl ;top bits of the old r9 ...
or r10,rax ;... become the low bits of the mantissa
ALIGN 16
L__form_numbers:
;r10 = 53-bit mantissa (bit 52 set), r11 = unbiased exponent,
;rdx = sign bit
add r11,1023 ;r11 = biased exponent
btr r10,52 ;remove the implied bit
mov rcx,r11
or r10,rdx ;put the sign
shl rcx,52
or r10,rcx ;x is in r10
movd xmm0,r10 ; xmm0 = x
movd rax, xmm1 ; rax <-- region
; At this point xmm0 has a double precision version of the fractional part
; of x * 2/pi. To get the reduced argument r, we multiply that by pi/2.
mulsd xmm0,L__piby2
StackDeallocate stack_size
ret
fname endp
END

View File

@@ -0,0 +1,341 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the remainder by pi/2 function
; This is a service routine for use by trig functions coded in C
;
fname TEXTEQU <__remainder_piby2d2f_forC>
save_rdi EQU 20h
save_rsi EQU 30h
stack_size EQU 088h
include fm.inc
.code
PUBLIC fname
; ---------------------------------------------------------------------------
; __remainder_piby2d2f_forC
;
; Reduces a double-precision argument modulo pi/2 by multiplying its mantissa
; with a window of the stored bits of 2/pi (L__2_by_pi_bits), then scaling the
; fractional part of the product by pi/2.
;
; Inputs, as consumed by the code below:
;   rcx = raw 64-bit pattern of the double x. The exponent is extracted with
;         a plain shift (the sign bit is not masked), so x is presumably
;         expected to be non-negative and finite -- confirm against callers.
;   rdx = address that receives the reduced argument r (stored as a double)
;   r8  = address that receives the region (stored as a 32-bit integer, 0..3)
;
; rdi and rsi are used as scratch; they are saved in the prologue and
; restored before returning.
; ---------------------------------------------------------------------------
fname PROC FRAME
StackAllocate stack_size
SaveReg rdi,save_rdi
SaveReg rsi,save_rsi
.ENDPROLOG
; Move the arguments out of rcx/rdx: rcx becomes the multiplier (mantissa of x)
; and rdx is overwritten by every mul below.
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
;get the unbiased exponent and the mantissa part of x
movd xmm0,rdi
lea r9,L__2_by_pi_bits
;xexp = (x >> 52) - 1023
movd r11,xmm0
mov rcx,r11
shr r11,52
sub r11,1023 ;r11 = xexp = exponent of input x
;calculate the last byte from which to start multiplication
;last = 134 - (xexp >> 3)
mov r10,r11
shr r10,3
sub r10,134 ;r10 = -last
neg r10 ;r10 = last
;load 64 bits of 2_by_pi
mov rax,[r9 + r10]
mov rdi,rdx ; save address of region since mul modifies rdx
;mantissa of x = ((x << 12) >> 12) | implied bit
shl rcx,12
shr rcx,12 ;rcx = mantissa part of input x
bts rcx,52 ;add the implied bit as well
;load next 128 bits of 2_by_pi
add r10,8 ;increment to next 8 bytes of 2_by_pi
movdqu xmm0,[r9 + r10]
;do three 64-bit multiplications with mant of x
;the 192-bit partial product ends up in r10:r9:r8 (most to least significant)
mul rcx
mov r8,rax ;r8 = last 64 bits of multiplication = res1[2]
mov r10,rdx ;r10 = carry
movd rax,xmm0
mul rcx
;resexp = xexp & 7
and r11,7 ;r11 = resexp = xexp & 7 = last 3 bits
psrldq xmm0,8 ;bring the upper 64 bits of the 128-bit load into the low half
add rax,r10 ; add the previous carry
adc rdx,0
mov r9,rax ;r9 = next 64 bits of multiplication = res1[1]
mov r10,rdx ;r10 = carry
movd rax,xmm0
mul rcx
add r10,rax ;r10 = most significant 64 bits = res1[0]
;find the region
;last three bits ltb = most sig bits >> (54 - resexp))
; decimal point in last 18 bits == 8 lsb's in first 64 bits and
; 8 msb's in next 64 bits
;point_five = ltb & 01h;
;region = ((ltb >> 1) + point_five) & 3;
mov rcx,54
mov rax,r10
sub rcx,r11
xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi)
shr rax,cl ;CF = last bit shifted out = point_five
jnc L__no_point_five
;;if there is carry.. then negate the result of multiplication
;;(one's complement of all three words; the result sign becomes negative)
not r10
not r9
not r8
mov rdx,08000000000000000h
ALIGN 16
L__no_point_five:
; CF still holds point_five here: neither not nor mov modifies the flags,
; so adc rounds the region up exactly when the fraction was >= 0.5.
adc rax,0
and rax,3
mov DWORD PTR[rdi],eax ;store region to memory
;calculate the number of integer bits and zero them out
mov rcx,r11
add rcx,10 ;rcx = no. of integer bits
shl r10,cl
shr r10,cl ;r10 contains only mant bits
sub rcx,64 ;form the exponent
mov r11,rcx
;find the highest set bit
bsr rcx,r10
jnz L__form_mantissa
;r10 had no set bit (bsr left ZF set): promote the next two words and
;account for the 64 skipped bits in the exponent
mov r10,r9
mov r9,r8
bsr rcx,r10 ;rcx = hsb
sub r11,64
ALIGN 16
L__form_mantissa:
add r11,rcx ;for exp of x
sub rcx,52 ;rcx = no. of bits to shift in r10
cmp rcx,0
jl L__hsb_below_52
je L__form_numbers
;hsb above 52
mov r8,r10 ;previous contents of r8 not required
shr r10,cl ;r10 = mantissa of x with hsb at 52
jmp L__form_numbers
ALIGN 16
L__hsb_below_52:
;hsb below 52: shift r10 left by (52 - hsb) and fill the vacated low bits
;with the top (52 - hsb) bits of the next word (r9)
neg rcx
mov rax,r9
shl r10,cl
shl r9,cl
sub rcx,64
neg rcx ;rcx = 64 - left shift count
shr rax,cl
or r10,rax
ALIGN 16
L__form_numbers:
;assemble the IEEE double: sign | biased exponent | 52-bit fraction
add r11,1023
btr r10,52 ;remove the implied bit
mov rcx,r11
or r10,rdx ;put the sign
shl rcx,52
or r10,rcx ;x is in r10
movd xmm0,r10 ;xmm0 = x
; xmm0 now holds the fractional part of x * 2/pi as a double; multiplying
; by pi/2 gives the reduced argument r, which is stored through rsi.
mulsd xmm0,L__piby2
movsd QWORD PTR[rsi],xmm0
RestoreReg rsi,save_rsi
RestoreReg rdi,save_rdi
StackDeallocate stack_size
ret
fname endp
.const
ALIGN 16
L__piby2 DQ 03ff921fb54442d18h
ALIGN 16
L__2_by_pi_bits DB 224
DB 241
DB 27
DB 193
DB 12
DB 88
DB 33
DB 116
DB 53
DB 126
DB 196
DB 126
DB 237
DB 175
DB 169
DB 75
DB 74
DB 41
DB 222
DB 231
DB 28
DB 244
DB 236
DB 197
DB 151
DB 175
DB 31
DB 235
DB 158
DB 212
DB 181
DB 168
DB 127
DB 121
DB 154
DB 253
DB 24
DB 61
DB 221
DB 38
DB 44
DB 159
DB 60
DB 251
DB 217
DB 180
DB 125
DB 180
DB 41
DB 104
DB 45
DB 70
DB 188
DB 188
DB 63
DB 96
DB 22
DB 120
DB 255
DB 95
DB 226
DB 127
DB 236
DB 160
DB 228
DB 247
DB 46
DB 126
DB 17
DB 114
DB 210
DB 231
DB 76
DB 13
DB 230
DB 88
DB 71
DB 230
DB 4
DB 249
DB 125
DB 209
DB 154
DB 192
DB 113
DB 166
DB 19
DB 18
DB 237
DB 186
DB 212
DB 215
DB 8
DB 162
DB 251
DB 156
DB 166
DB 196
DB 114
DB 172
DB 119
DB 248
DB 115
DB 72
DB 70
DB 39
DB 168
DB 187
DB 36
DB 25
DB 128
DB 75
DB 55
DB 9
DB 233
DB 184
DB 145
DB 220
DB 134
DB 21
DB 239
DB 122
DB 175
DB 142
DB 69
DB 249
DB 7
DB 65
DB 14
DB 241
DB 100
DB 86
DB 138
DB 109
DB 3
DB 119
DB 211
DB 212
DB 71
DB 95
DB 157
DB 240
DB 167
DB 84
DB 16
DB 57
DB 185
DB 13
DB 230
DB 139
DB 2
DB 0
DB 0
DB 0
DB 0
DB 0
DB 0
DB 0
END

View File

@@ -0,0 +1,247 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_NANF_WITH_FLAGS
#define USE_SCALEDOUBLE_1
#define USE_GET_FPSW_INLINE
#define USE_SET_FPSW_INLINE
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_NANF_WITH_FLAGS
#undef USE_SCALEDOUBLE_1
#undef USE_GET_FPSW_INLINE
#undef USE_SET_FPSW_INLINE
#undef USE_HANDLE_ERRORF
#if !defined(_CRTBLD_C9X)
#define _CRTBLD_C9X
#endif
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(remainderf,fmodf)
#undef _FUNCNAME
/*
 * remainderf / fmodf (selected by COMPILING_FMOD).
 *
 * Both arguments are promoted to double and the result is computed by
 * repeated exact partial remainders in double precision.
 *
 *   fmodf:      result has the sign of x and magnitude < |y|.
 *   remainderf: result is additionally adjusted into the interval centred
 *               on zero, with ties resolved on the parity of the quotient.
 *
 * Special cases are routed through _handle_errorf: NaN inputs return a
 * quieted NaN; x infinite, or y zero, raises invalid and returns the
 * indefinite NaN; y infinite or x zero returns x unchanged.
 *
 * The bit patterns of the promoted doubles are held in 64-bit integers.
 * They are declared unsigned long long because long is only 32 bits wide
 * on Windows targets, which would truncate the pattern and make the
 * shifts by EXPSHIFTBITS_DP64 exceed the width of the type.
 */
#if defined(COMPILING_FMOD)
float fmodf(float x, float y)
#define _FUNCNAME "fmodf"
#define _OPERATION OP_FMOD
#else
float remainderf(float x, float y)
#define _FUNCNAME "remainderf"
#define _OPERATION OP_REM
#endif
{
double dx, dy, scale, w, t;
int i, ntimes, xexp, yexp;
unsigned long long ux, uy, ax, ay;
unsigned int sw;
dx = x;
dy = y;
GET_BITS_DP64(dx, ux);
GET_BITS_DP64(dy, uy);
ax = ux & ~SIGNBIT_DP64;
ay = uy & ~SIGNBIT_DP64;
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
if (xexp < 1 || xexp > BIASEDEMAX_DP64 ||
yexp < 1 || yexp > BIASEDEMAX_DP64)
{
/* x or y is zero, NaN or infinity (neither x nor y can be
denormalized because we promoted from float to double) */
if (xexp > BIASEDEMAX_DP64)
{
/* x is NaN or infinity */
if (ux & MANTBITS_DP64)
{
/* x is NaN: return it with the single-precision quiet bit set */
unsigned int ufx;
GET_BITS_SP32(x, ufx);
return _handle_errorf(_FUNCNAME, _OPERATION, ufx|0x00400000, _DOMAIN, 0,
EDOM, x, y, 2);
}
else
{
/* x is infinity; result is NaN */
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
AMD_F_INVALID, EDOM, x, y, 2);
}
}
else if (yexp > BIASEDEMAX_DP64)
{
/* y is NaN or infinity */
if (uy & MANTBITS_DP64)
{
/* y is NaN: return it with the single-precision quiet bit set */
unsigned int ufy;
GET_BITS_SP32(y, ufy);
return _handle_errorf(_FUNCNAME, _OPERATION, ufy|0x00400000, _DOMAIN, 0,
EDOM, x, y, 2);
}
else
{
#ifdef _CRTBLD_C9X
/* C99 return for y = +-inf is x */
return x;
#else
/* y is infinity; result is indefinite */
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
AMD_F_INVALID, EDOM, x, y, 2);
#endif
}
}
else if (xexp < 1)
{
/* x must be zero (cannot be denormalized) */
if (yexp < 1)
{
/* y must be zero (cannot be denormalized) */
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
AMD_F_INVALID, EDOM, x, y, 2);
}
else
/* C99 return for x = 0 must preserve sign */
return x;
}
else
{
/* y must be zero */
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
AMD_F_INVALID, EDOM, x, y, 2);
}
}
else if (ax == ay)
{
/* abs(x) == abs(y); return zero with the sign of x */
PUT_BITS_DP64(ux & SIGNBIT_DP64, dx);
return (float)dx;
}
/* Set dx = abs(x), dy = abs(y) */
PUT_BITS_DP64(ax, dx);
PUT_BITS_DP64(ay, dy);
if (ax < ay)
{
/* abs(x) < abs(y) */
#if !defined(COMPILING_FMOD)
if (dx > 0.5*dy)
dx -= dy;
#endif
return (float)(x < 0.0? -dx : dx);
}
/* Save the current floating-point status word. We need
to do this because the remainder function is always
exact for finite arguments, but our algorithm causes
the inexact flag to be raised. We therefore need to
restore the entry status before exiting. */
sw = get_fpsw_inline();
/* Set ntimes to the number of times we need to do a
partial remainder. If the exponent of x is an exact multiple
of 24 larger than the exponent of y, and the mantissa of x is
less than the mantissa of y, ntimes will be one too large
but it doesn't matter - it just means that we'll go round
the loop below one extra time. */
if (xexp <= yexp)
{
ntimes = 0;
w = dy;
scale = 1.0;
}
else
{
ntimes = (xexp - yexp) / 24;
/* Set w = y * 2^(24*ntimes). The biased exponent is widened to
64 bits before being shifted into the exponent field. */
PUT_BITS_DP64((unsigned long long)(ntimes * 24 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64,
scale);
w = scale * dy;
/* Set scale = 2^(-24) */
PUT_BITS_DP64((unsigned long long)(-24 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64,
scale);
}
/* Each time round the loop we compute a partial remainder.
This is done by subtracting a large multiple of w
from x each time, where w is a scaled up version of y.
The subtraction can be performed exactly when performed
in double precision, and the result at each stage can
fit exactly in a single precision number. */
for (i = 0; i < ntimes; i++)
{
/* t is the integer multiple of w that we will subtract.
We use a truncated value for t. */
t = (double)((int)(dx / w));
dx -= w * t;
/* Scale w down by 2^(-24) for the next iteration */
w *= scale;
}
/* One more time */
#if defined(COMPILING_FMOD)
t = (double)((int)(dx / w));
dx -= w * t;
#else
{
unsigned int todd;
/* Variable todd says whether the integer t is odd or not */
t = (double)((int)(dx / w));
todd = ((int)(dx / w)) & 1;
dx -= w * t;
/* At this point, dx lies in the range [0,dy) */
/* For the remainder function, we need to adjust dx
so that it lies in the range (-y/2, y/2] by carefully
subtracting w (== dy == y) if necessary. */
if (dx > 0.5 * w || ((dx == 0.5 * w) && todd))
dx -= w;
}
#endif
/* **** N.B. for some reason this breaks the 32 bit version
of remainder when compiling with optimization. */
/* Restore the entry status flags */
set_fpsw_inline(sw);
/* Set the result sign according to input argument x */
return (float)(x < 0.0? -dx : dx);
}

View File

@@ -0,0 +1,369 @@
/***********************************************************************************/
/** MIT License **/
/** ----------- **/
/** **/
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
/** **/
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
/** of this Software and associated documentaon files (the "Software"), to deal **/
/** in the Software without restriction, including without limitation the rights **/
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
/** copies of the Software, and to permit persons to whom the Software is **/
/** furnished to do so, subject to the following conditions: **/
/** **/
/** The above copyright notice and this permission notice shall be included in **/
/** all copies or substantial portions of the Software. **/
/** **/
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
/** THE SOFTWARE. **/
/***********************************************************************************/
/*
******************************************************************************
* Source File : simd.h
* Archive File : $Archive: $
* Date : 6/04/01
* Description : The include file for the AMD SIMD exception filter routine
* for Microsoft Structured Exception Handling
*
*
$Revision:$
$Name:$
$Date:$
$Author:$
$History: simd.h $
*
*/
#include <emmintrin.h>
// simd.h
// This file contains structure definitions to provide
// convenient access to SIMD and MMX data as unsigned
// integer data.
// change the following define to a 1 to print terse output
#define DO_PRINT 0
// can't use the 3DNOW SDK as written with 64 bit tools
#if !defined (_AMD64_)
#define USE_3DNOW_SDK 1
#define SUPPORTS_FTZ 1
#endif
/*****************************************************************/
// Basic type definitions
typedef UINT_PTR AWORD; // x86-64 safe
// Overlay of a float with an integer, giving access to the float's raw bits.
// NOTE(review): relies on unsigned long having the same size as float
// (true where long is 32 bits) -- confirm for every target this is built on.
typedef union
{
float f;
unsigned long l;
} LFLOAT;
//typedef struct
//{
// DWORD dw[2];
//}
typedef unsigned _int64 QWORD;
// Overlay of a double with two unsigned longs, giving access to the two
// halves of the double's raw bits (same size assumption as LFLOAT).
typedef union
{
double f;
unsigned long l[2];
} LDOUBLE;
// Four single-precision lanes, 16-byte aligned.
typedef __declspec(align(16)) struct
{
LFLOAT f0,f1,f2,f3;
} SSESINGLE;
// Two double-precision lanes, 16-byte aligned.
typedef __declspec(align(16)) struct
{
LDOUBLE d0,d1;
} SSEDOUBLE;
// this is the key data structure type used by the filter
// and the test program. It will be aligned, since
// the __m128 types are all aligned. It allows the
// use of one variable to carry all the needed data
// types.
typedef union
{
__m128 m;
__m128d md;
__m128i mi;
__m64 m64[2];
DWORD l[4];
int i[4];
LFLOAT f[4];
QWORD q[2];
LDOUBLE d[2];
} ML128;
// this defined to provide a MMX type for the FXSTOR structure.
typedef union
{
unsigned short mmx[4]; // mmx regs are 64 bits
unsigned short fp[5]; // floating point regs are 80 bits
} MMX80;
/*****************************************************************/
// define constants used by SIMD
// define MXCSR rounding control bits.
#define SDIMCW_RC 0x6000
#define SDIRC_NEAR 0x0000
#define SDIRC_DOWN 0x2000
#define SDIRC_UP 0x4000
#define SDIRC_CHOP 0x6000
// define other MXCSR control bits
#define SDDAZ 0x0040
#define SDFTZ 0x8000
#define opADD 0x58
#define opAND 0x54
#define opANDN 0x55
#define opCMP 0xC2
#define opCOMISS 0x2F
#define opCVTPI2PS 0x2A
#define opCVTTPS2PI 0x2C
#define opCVTPS2PI 0x2D
#define opCVTPS2PD 0x5A
#define opCVTDQ2PS 0x5B
#define opCVTTPD2DQ 0xE6
#define opDIV 0x5E
#define opMAX 0x5F
#define opMIN 0x5D
#define opMUL 0x59
#define opSQRT 0x51
#define opSUB 0x5C
#define opUCOMISS 0x2E
// define EFlags bits
#define ZF (1 << 6)
#define PF (1 << 2)
#define CF (1 << 0)
// define the REX prefix bits
#define REX_PREFIX 0x40
#define REX_W 0x8
#define REX_R 0x4
#define REX_X 0x2
#define REX_B 0x1
// define the exception information record
// constants for the status bits
#define IEM_INEXACT 0x20
#define IEM_UNDERFLOW 0x10
#define IEM_OVERFLOW 0x08
#define IEM_ZERODIVIDE 0x04
#define IEM_DENORMAL 0x02
#define IEM_INVALID 0x01
#define IEM_MASK 0x3F
#define IMM_INEXACT 0x1000
#define IMM_UNDERFLOW 0x0800
#define IMM_OVERFLOW 0x0400
#define IMM_ZERODIVIDE 0x0200
#define IMM_DENORMAL 0x0100
#define IMM_INVALID 0x0080
#define IMM_MASK 0x1F80
/*****************************************************************/
// Instruction forms
// Type enumerations
//
// Instruction forms. Each name appears to be "f" followed by the two operand
// type suffixes used in OpType below (destination first), with a trailing
// "Ib" where the instruction also takes an 8-bit immediate -- TODO confirm
// against the tables that index by this enum.
typedef enum
{
fGdWsd,
fGdWss,
fQqWpd,
fQqWps,
fVpdQq,
fVpdWpd,
fVpdWpdIb,
fVpdWpdi,
fVpdWps,
fVpdiWpd,
fVpdiWps,
fVpsQq,
fVpsWpd,
fVpsWpdi,
fVpsWps,
fVpsWpsIb,
fVsdEd,
fVsdWsd,
fVsdWsdIb,
fVsdWss,
fVssEd,
fVssWsd,
fVssWss,
fVssWssIb
} InstType;
// operand types
// The suffix after the leading letter matches the DataType names further
// down (pd, pdi, ps, sd, ss, d, q).
typedef enum
{
oEd, //General register dword mod R/M
oGd, //General register dword
oQq, // MMX quadword mod R/M
oVpd, // XMM register
oVpdi,
oVps,
oVsd,
oVss,
oWpd, // XMM mod R/M
oWpdi,
oWps,
oWsd,
oWss
} OpType;
// operand class
// Register file of the operand (XMM, MMX or general) and whether it is a
// plain register ("reg") or a mod R/M operand ("mrm").
typedef enum
{
oXMMreg,
oXMMmrm,
oMMXreg,
oMMXmrm,
oGENreg,
oGENmrm,
} OpClass;
// data types
typedef enum
{
dDW, // integer DWORD
dPD, // packed double precision
dPDI, // packed integer DWORD
dPS, // packed single precision
dQ, // integer quadword
dSD, // scalar double precision
dSS // scalar single precision
} DataType;
/*****************************************************************/
// Structure definitions
//
// define the format of the data used by
// the FXSAVE and FXRSTOR commands
// One MMX / x87 register slot of the save area: the 10-byte MMX80 union
// followed by 6 bytes of padding, 16 bytes in total.
typedef struct
{
MMX80 mmx; // the mmx/fp register
unsigned short reserved[3]; // pads the 80-bit register slot to 16 bytes
} FPMMX;
// Layout of the FXSAVE / FXRSTOR memory image. The two variants differ in
// the width of the error/data offsets (64-bit offsets versus 32-bit offset
// plus selector) and in the number of XMM registers (16 versus 8).
#if defined (_AMD64_)
// x86-64 version
typedef struct _FXMM_SAVE_AREA {
WORD ControlWord;
WORD StatusWord;
WORD TagWord;
WORD OpCode;
QWORD ErrorOffset;
QWORD DataOffset;
DWORD Mxcsr;
DWORD reserved3;
FPMMX FMMXreg[8];
ML128 XMMreg[16];
} FXMM_SAVE_AREA;
#else
// 32 bit x86 version
typedef struct _FXMM_SAVE_AREA {
WORD ControlWord;
WORD StatusWord;
WORD TagWord;
WORD OpCode;
DWORD ErrorOffset;
WORD ErrorSelector;
WORD reserved1;
DWORD DataOffset;
WORD DataSelector;
WORD reserved2;
DWORD Mxcsr;
DWORD reserved3;
FPMMX FMMXreg[8];
ML128 XMMreg[8];
} FXMM_SAVE_AREA;
#endif
typedef FXMM_SAVE_AREA *PFXMM_SAVE_AREA;
/* This structure is used to access the excepting opcode:
   the opcode byte, the mod R/M byte, and whatever follows it
   (either an offset or an 8-bit immediate). */
typedef struct {
unsigned char opcode;
unsigned char rmbyte;
union {
unsigned long offset; // this will need work for x86-64
unsigned char imm8;
} data;
} SIMD_OP, *PSIMD_OP;
// Define a SIMD exception flag type.
// This is just like the _FPIEEE_EXCEPTION_FLAGS
// except that it adds the denormal field.
// Each flag is a single-bit field.
typedef struct {
unsigned int Inexact : 1;
unsigned int Underflow : 1;
unsigned int Overflow : 1;
unsigned int ZeroDivide : 1;
unsigned int InvalidOperation : 1;
unsigned int Denormal : 1;
} _SIMD_EXCEPTION_FLAGS;
/* define the local simd record structures */
/* One record carries the decoded state of a single excepting SIMD
   instruction: rounding mode, exception flag sets, raw instruction bytes,
   decoded operand classes/types, and the operand values. */
typedef struct {
unsigned int RoundingMode;
_SIMD_EXCEPTION_FLAGS Cause;
_SIMD_EXCEPTION_FLAGS Enable;
_SIMD_EXCEPTION_FLAGS Status;
PSIMD_OP opaddress; // points to 0F xx opcode
// NOTE(review): an int cannot hold a full address on x86-64; confirm
// whether curAddr is an offset/index rather than a pointer value.
int curAddr; // used when parsing mod R/M byte
unsigned char prefix;
unsigned char opcode;
unsigned char rmbyte;
unsigned char immediate8;
// add a rex field for x86-64
unsigned char rex;
int eopcode; // encoded opcode (index for tables)
int op_form;
int op1_class; // XMM, MMX, or gen register
int op1_type; // data format
int op2_class;
int op2_type;
int is_commiss;
int commiss_val;
unsigned int mxcsr; // value of mxcsr from context record.
ML128 op1_value;
ML128 op2_value;
ML128 *op2_ptr;
} _SIMD_RECORD, *_PSIMD_RECORD;
/* define a record for the operand form table */
typedef struct {
int op1; // form of operand 1
int op2; // form of operand 2
} _OPERAND_RECORD;

View File

@@ -0,0 +1,511 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the sin function.
;
; Prototype:
;
; double sin(double x);
;
; Computes sin(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
; If FMA3 hardware is available, an FMA3 implementation of sin will be used.
.const
ALIGN 16
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
DQ 0
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
DQ 0
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
DQ 0
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
DQ 0
ALIGN 16
L_one DQ 03FF0000000000000h, 03FF0000000000000h
L_signbit DQ 08000000000000000h, 00000000000000000h
L_int_one DQ 00000000000000001h, 00000000000000000h
L_int_two DQ 00000000000000002h, 00000000000000000h
L_int_three DQ 00000000000000003h, 00000000000000000h
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
L_one_half DQ 03FE0000000000000h ; .5
L_one_sixth DQ 03FC5555555555555h ; .1666...
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
L_piby4 DQ 03FE921FB54442D18h ; pi/4
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
EXTRN __Lcosarray:QWORD
EXTRN __Lsinarray:QWORD
EXTRN __use_fma3_lib:DWORD
; define local variable storage offsets
p_temp EQU 030h
p_temp1 EQU 040h
save_r10 EQU 050h
dummy_space EQU 060h
stack_size EQU 078h
include fm.inc
fname TEXTEQU <sin>
fname_special TEXTEQU <_sin_special>
;Define name and any external functions being called
EXTERN __remainder_piby2_forAsm : PROC
EXTERN __remainder_piby2_fma3 : PROC
EXTERN __remainder_piby2_fma3_bdl : PROC
EXTERN fname_special : PROC
.code
PUBLIC fname
; ---------------------------------------------------------------------------
; double sin(double x)
;
; In:  xmm0 = x
; Out: xmm0 = sin(x)
;
; Dispatches on __use_fma3_lib: zero selects the SSE2 path (Lsin_sse2),
; non-zero selects the FMA3/AVX path (Lsin_fma3).
;
; Both paths follow the same plan:
;   1. |x| < pi/4            : evaluate the sine polynomial directly.
;   2. pi/4 <= |x| < cutoff  : reduce |x| by multiples of pi/2 in-line (SSE2)
;                              or via __remainder_piby2_fma3_bdl (FMA3).
;   3. |x| >= cutoff         : call the general reduction routine.
;   4. NaN / Inf             : call fname_special.
; After reduction, region bit 0 selects the sine or cosine polynomial and
; region bit 1 combined with the sign of x selects the sign of the result.
;
; Magnitude tests compare the raw bit pattern of |x| as an unsigned integer,
; which is why NaN and Inf fall through to the "large" branches.
;
; Stack locals: p_temp / p_temp1 hold spilled values; save_r10 preserves r10
; across the general-purpose reduction call.
; ---------------------------------------------------------------------------
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Lsin_fma3
; SSE2 path register roles: rdx = x bits, r10 = |x| bits,
; r11 = sign of x (0 or 1), eax = npi2 / region once reduction is done.
Lsin_sse2:
movd rdx, xmm0
xorpd xmm2, xmm2 ; zeroed out for later use
mov r10,rdx
mov r8d, 1 ; for determining region later on
btr r10,63 ; r10 <-- |x|
cmp r10,L_piby4
jb Lsin_sse2_absx_lt_piby4
Lsin_sse2_absx_nlt_piby4: ; common case
mov r11,rdx
shr r11,63 ; r11 <-- sign bit of x
movd xmm0,r10 ; xmm0 <-- |x|
cmp r10, QWORD PTR L_small_arg_cw
jae Lsin_reduce_precise ; Note NaN/Inf will branch
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
; Note that if |x| were too large, conversion of npi2 to integer would fail.
; We reduce the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
movapd xmm2, xmm0
mulsd xmm2, L_2_by_pi
movapd xmm4, xmm0
; xexp = ax >> EXPSHIFTBITS_DP64;
mov r9, r10
shr r9, 52 ; >>EXPSHIFTBITS_DP64
; How many pi/2 is |x| a multiple of?
; npi2 = (int)(x * twobypi + 0.5);
addsd xmm2, L_one_half ; npi2
movsd xmm3, L_real_piby2_1
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer
movsd xmm1, L_real_piby2_1tail
cvtdq2pd xmm2, xmm0 ; npi2 back to double
; Subtract the multiple from x to get an extra-precision remainder
; rhead = x - npi2 * piby2_1;
mulsd xmm3, xmm2
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_1tail;
mulsd xmm1, xmm2 ; rtail
movd eax, xmm0 ; eax <-- npi2
; GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
movapd xmm0, xmm4
subsd xmm0, xmm1
movsd xmm3, L_real_piby2_2
movd rcx, xmm0 ; rcx <-- rhead - rtail
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
; xmm4=rhead, xmm5= temp for calc
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
; expdiff measures how close rhead - rtail is to |x|
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
shl rcx, 1 ; strip any sign bit
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
sub r9, rcx ; expdiff
;; if (expdiff > 15)
cmp r9, 15
jle Lsin_sse2_cw_reduction_done
; Here the remainder is pretty small compared with x, which
; implies that x is a near multiple of pi/2
; (x matches the multiple to at least 15 bits)
; So we do another stage of argument reduction.
; t = rhead;
movapd xmm1, xmm4
; rtail = npi2 * piby2_2;
mulsd xmm3, xmm2
; rhead = t - rtail;
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subsd xmm1, xmm4 ; t - rhead
subsd xmm1, xmm3 ; -rtail
subsd xmm5, xmm1 ; rtail
; r = rhead - rtail;
movapd xmm0, xmm4
;HARSHA
;xmm1=rtail
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
subsd xmm0, xmm5
; xmm0=r, xmm4=rhead, xmm1=rtail
Lsin_sse2_cw_reduction_done:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; if the input was close to a pi/2 multiple
; The original NAG code missed this trick.
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
; then the sin is either ~ 1.0 or ~r, to within 53 bits.
; Note: Unfortunately this introduces two jcc instructions close to each
; other and to other branches. As r < 2^-13 should be rather uncommon, it
; almost certainly costs more than it saves. - WAT
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region = npi2 & 3;
subsd xmm4, xmm0 ; rhead-r
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
; Common entry for both SSE2 reduction schemes.
; On entry: xmm0 = r, xmm4 = rr, eax = region, r11 = sign of x.
Lsin_piby4:
; perform taylor series to calc sinx, cosx for |x| <= pi/4
; x2 = r * r;
;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path
;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
movapd xmm3, xmm0
movapd xmm2, xmm0
mulsd xmm2, xmm0 ;x2
bt eax,0 ; CF <-- region bit 0 (odd region ==> cosine)
jc Lsin_sse2_calc_cos
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 or 2 do a sin calculation
movsd xmm3, __Lsinarray+50h ; s6
mulsd xmm3, xmm2 ; x2s6
movsd xmm5, __Lsinarray+20h ; s3
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
movapd xmm1, xmm2 ; move for x4
mulsd xmm1, xmm2 ; x4
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
mulsd xmm5, xmm2 ; x2s3
movapd xmm4, xmm0 ; move for x3
addsd xmm3, __Lsinarray+40h ; s5+x2s6
mulsd xmm1, xmm2 ; x6
mulsd xmm3, xmm2 ; x2(s5+x2s6)
mulsd xmm4, xmm2 ; x3
addsd xmm5, __Lsinarray+10h ; s2+x2s3
mulsd xmm5, xmm2 ; x2(s2+x2s3)
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
mulsd xmm2, L_one_half ; 0.5 *x2
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
addsd xmm3, xmm5 ; zs
mulsd xmm4, xmm3 ; *x3
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
addsd xmm0, xmm4 ; +xx
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
jmp Lsin_sse2_adjust_region
ALIGN 16
Lsin_sse2_calc_cos:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 1 or 3 - do a cos calculation
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
mulsd xmm4, xmm0 ; x*xx
movsd xmm5, L_one_half
movsd xmm1, __Lcosarray+50h ; c6
movsd xmm0, __Lcosarray+20h ; c3
mulsd xmm5, xmm2 ; r = 0.5 *x2
movapd xmm3, xmm2 ; copy of x2
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
mulsd xmm1, xmm2 ; c6*x2
mulsd xmm0, xmm2 ; c3*x2
subsd xmm5, L_one ; -t=r-1.0, trash r
mulsd xmm3, xmm2 ; x4
addsd xmm1, __Lcosarray+40h ; c5+x2c6
addsd xmm0, __Lcosarray+10h ; c2+x2C3
addsd xmm5, L_one ; 1 + (-t), trash t
mulsd xmm3, xmm2 ; x6
mulsd xmm1, xmm2 ; x2(c5+x2c6)
mulsd xmm0, xmm2 ; x2(c2+x2C3)
movapd xmm4, xmm2 ; copy of x2
mulsd xmm4, L_one_half ; r recalculate
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
mulsd xmm2, xmm2 ; x4 recalculate
subsd xmm5, xmm4 ; (1 + (-t)) - r
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
addsd xmm0, xmm1 ; zc
subsd xmm4, L_one ; t recalculate
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
mulsd xmm0, xmm2 ; x4 * zc
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
subsd xmm0, xmm4 ; result - (-t)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsin_sse2_adjust_region:
; positive or negative
; switch (region)
; The sequence below leaves eax bit 0 = NOT((region >> 1) XOR sign of x).
; When that bit is 1 the computed value already has the right sign;
; otherwise the result is negated.
shr eax, 1
mov ecx, eax
and eax, r11d
not ecx
not r11d
and ecx, r11d
or eax, ecx
and eax, 1
jnz Lsin_sse2_cleanup
;; if the original region 0, 1 and arg is negative, then we negate the result.
;; if the original region 2, 3 and arg is positive, then we negate the result.
movapd xmm2, xmm0
xorpd xmm0, xmm0
subsd xmm0, xmm2 ; xmm0 <-- 0 - result
ALIGN 16
Lsin_sse2_cleanup:
StackDeallocate stack_size
ret
ALIGN 16
Lsin_sse2_absx_lt_piby4:
; sin = sin_piby4(x, 0.0);
; xmm0 still holds the signed x here, so no sign fix-up is needed.
; x2 = r * r;
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; x2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 - do a sin calculation
; zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6))));
movsd xmm3, __Lsinarray+50h ; s6
mulsd xmm3, xmm2 ; x2s6
movsd xmm5, __Lsinarray+20h ; s3
movapd xmm1, xmm2 ; move for x4
mulsd xmm1, xmm2 ; x4
mulsd xmm5, xmm2 ; x2s3
movapd xmm4, xmm0 ; move for x3
addsd xmm3, __Lsinarray+40h ; s5+x2s6
mulsd xmm1, xmm2 ; x6
mulsd xmm3, xmm2 ; x2(s5+x2s6)
mulsd xmm4, xmm2 ; x3
addsd xmm5, __Lsinarray+10h ; s2+x2s3
mulsd xmm5, xmm2 ; x2(s2+x2s3)
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
addsd xmm3, xmm5 ; zs
mulsd xmm4, xmm3 ; *x3
addsd xmm0, xmm4 ; +x
jmp Lsin_sse2_cleanup
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsin_reduce_precise:
; Reduce x into range [-pi/4, pi/4]
cmp r10,L__inf_mask_64
jae Lsin_x_naninf
; r11 (sign of x) is spilled to the stack around the call and reloaded after
mov QWORD PTR p_temp[rsp], r11
call __remainder_piby2_forAsm
mov r11, QWORD PTR p_temp[rsp]
; At this point xmm0 has r, xmm1 has rr, rax has region
movapd xmm4, xmm1 ; xmm4 <-- rr
jmp Lsin_piby4
; xmm0 = x, xmm4 = xx, eax= region
ALIGN 16
Lsin_x_naninf:
; NaN / Inf: the result of fname_special is returned unchanged.
; NOTE(review): when reached from the SSE2 path xmm0 already holds |x|,
; from the FMA3 path it still holds the original x.
call fname_special
StackDeallocate stack_size
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; From this point we assume that FMA3 and AVX hardware are present.
; FMA3 path register roles: r9 = |x| bits, r10 = original x bits (for sign).
ALIGN 16
Lsin_fma3:
vmovq r9,xmm0
mov r10,r9 ; save x to get sign later
btr r9,63 ; r9 <-- |x|
cmp r9,L_piby4
jae Lsin_fma3_absx_nlt_piby4 ; Note that NaN will branch
cmp r9,L_two_to_neg_13
jae Lsin_fma3_calc_sin_for_absx_lt_piby4
cmp r9,L_two_to_neg_27
jae Lsin_fma3_compute_x_xxx_0_1666
StackDeallocate stack_size
ret ; sin x ~= x for |x| < 2^-27
ALIGN 16
Lsin_fma3_compute_x_xxx_0_1666: ; |x| in [2^-27,2^-13]
vmulsd xmm1,xmm0,xmm0 ; xmm1l <-- x*x
vmulsd xmm1,xmm1,xmm0 ; xmm1l <-- x*x*x
vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6)
StackDeallocate stack_size
ret
ALIGN 16
Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4]
; Horner evaluation of the sine polynomial in x^2, accumulated in xmm5
vmovsd xmm5,__Lsinarray+050h
vmulsd xmm3,xmm0,xmm0 ; xmm3l <-- x^2
vfmadd213sd xmm5,xmm3,__Lsinarray+040h
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
vfmadd213sd xmm5,xmm3,__Lsinarray+010h
vmulsd xmm4,xmm0,xmm3 ; xmm4l <-- x^3
vfmadd213sd xmm5,xmm3,__Lsinarray
vfmadd231sd xmm0,xmm4,xmm5 ; xmm0l <-- x + x^3 p(x^2)
StackDeallocate stack_size
ret
ALIGN 16
Lsin_fma3_absx_nlt_piby4: ; !(|x| < pi/4)
; here r9 has |x|
cmp r9,L__inf_mask_64
jae Lsin_x_naninf
;Lrange_reduce: ;; unused label
vmovq xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_small_arg_bdl
jae Lsin_fma3_do_general_arg_reduction
; Note that __remainder_piby2_fma3 conventions are
; on input
; |x| is in xmm0
; on output
; r is in xmm0
; rr is in xmm1
; region of |x| is in rax
; Boldo-Daumas-Li reduction for reasonably small |x|
call __remainder_piby2_fma3_bdl
Lsin_fma3_exit_s:
; xmm0 = r, xmm1 = rr, rax = region, r10 = original x bits
bt rax,0 ; CF <-- region bit 0 (vmulsd below leaves flags alone)
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
jc Lsin_fma3_calc_cos
Lsin_fma3_calc_sin: ;; unused label
; region 0 or 2
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
vmovsd xmm5,__Lsinarray+050h
vfmadd213sd xmm5,xmm3,__Lsinarray+040h
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*xx (xmm1 holds rr here)
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*xx - x*x*x*r
vmulsd xmm2,xmm3,xmm2 ; xmm2 <-- x2*(.5*xx - x3*r)
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- x2*(.5*xx - x3*r) - xx
vfnmadd231sd xmm2, xmm4,__Lsinarray ; xmm2 <-- xmm2 - x3*s1
vsubsd xmm0,xmm0,xmm2 ; xmm0 <-- x - xmm2
jmp Lsin_fma3_exit_s_1
ALIGN 16
Lsin_fma3_calc_cos:
; region 1 or 3
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
vmovapd xmm2,L_one
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
vsubsd xmm2,xmm2,xmm4 ; 1-t
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
vmovsd xmm5,__Lcosarray+050h
vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2
vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1
vfmadd213sd xmm5,xmm3,__Lcosarray+040h
vfmadd213sd xmm5,xmm3,__Lcosarray+030h
vfmadd213sd xmm5,xmm3,__Lcosarray+020h
vfmadd213sd xmm5,xmm3,__Lcosarray+010h
vfmadd213sd xmm5,xmm3,__Lcosarray
vfmadd213sd xmm5,xmm1,xmm2 ; xmm5 <-- x4*zc + (((1-t) - r) - x*xx)
vaddsd xmm0,xmm5,xmm4 ; add t back
Lsin_fma3_exit_s_1:
; Build a sign mask: (region bit 1 ? signbit : 0) XOR sign of original x,
; then apply it to the result.
xor r8,r8 ; prepare r8 for cmov
and r10,L_signbit ; isolate original sign of x
bt eax,1
cmovc r8,L_signbit
xor r8,r10
vmovq xmm3,r8
vxorpd xmm0,xmm0,xmm3
StackDeallocate stack_size
ret
ALIGN 16
Lsin_fma3_do_general_arg_reduction:
; argument reduction for general x
; NOTE: the BDL argument reduction routine does not touch r10,
; but the general-purpose reduction does.
mov QWORD PTR [save_r10+rsp], r10
call __remainder_piby2_fma3
mov r10, QWORD PTR [save_r10+rsp]
jmp Lsin_fma3_exit_s
fname endp
END

View File

@@ -0,0 +1,130 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include <fpieee.h>
#include <excpt.h>
#include <float.h>
#include <math.h>
#include <errno.h>
#include "libm_new.h"
// Shared special-case handler for the double-precision sin/cos/tan entry
// points.
//
// x         - the argument as passed to the public function
// name      - function name forwarded to _handle_error
// operation - operation code forwarded to _handle_error
//
// Finite x is returned unchanged.  For +/-Inf the result is the IND_64 bit
// pattern and the error is reported with AMD_F_INVALID.  For NaN the result
// is x with the QNAN_MASK_64 bits set and the error is reported with no
// floating-point flags.  The value returned by _handle_error is not used.
double _sincos_special(double x, char *name, unsigned int operation)
{
    UT64 bits;

    bits.f64 = x;

    // Exponent field not all ones: finite value, nothing to report.
    if ((bits.u64 & INF_POS_64) != INF_POS_64)
    {
        return bits.f64;
    }

    if ((bits.u64 & MANTISSA_MASK_64) != 0x0)
    {
        // x is NaN: make sure the quiet bit is set before reporting it.
        // NOTE(review): unlike _sincosf_special, a signalling NaN does not
        // raise AMD_F_INVALID here - confirm whether that is intentional.
        if ((bits.u64 & QNAN_MASK_64) != QNAN_MASK_64)
        {
            bits.u64 |= QNAN_MASK_64;
        }
        _handle_error(name, operation, bits.u64, _DOMAIN, 0, EDOM, x, 0, 1);
    }
    else
    {
        // x is Inf: the result is the indefinite pattern.
        bits.u64 = IND_64;
        _handle_error(name, operation, bits.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0, 1);
    }

    return bits.f64;
}
// Shared special-case handler for the single-precision sinf/cosf/tanf entry
// points.
//
// x         - the argument as passed to the public function
// name      - function name forwarded to _handle_errorf
// operation - operation code forwarded to _handle_errorf
//
// Finite x is returned unchanged.  +/-Inf yields the IND_32 pattern and a
// signalling NaN is quieted with QNAN_SET_32; both are reported with
// AMD_F_INVALID.  A quiet NaN is reported with no floating-point flags.
// The value returned by _handle_errorf is not used.
float _sincosf_special(float x, char *name, unsigned int operation)
{
    UT64 bits;
    int fp_flags;

    // Clear the whole 64-bit image so the upper half passed on is zero.
    bits.u64 = 0;
    bits.f32[0] = x;

    // Exponent field not all ones: finite value, nothing to report.
    if ((bits.u32[0] & INF_POS_32) != INF_POS_32)
    {
        return bits.f32[0];
    }

    if ((bits.u32[0] & MANTISSA_MASK_32) == 0x0)
    {
        // x is Inf
        bits.u32[0] = IND_32;
        fp_flags = AMD_F_INVALID;
    }
    else if ((bits.u32[0] & QNAN_MASK_32) != QNAN_MASK_32)
    {
        // x is a signalling NaN: quiet it and flag the invalid operation
        bits.u32[0] |= QNAN_SET_32;
        fp_flags = AMD_F_INVALID;
    }
    else
    {
        // x is already a quiet NaN
        fp_flags = 0;
    }

    _handle_errorf(name, operation, bits.u64, _DOMAIN, fp_flags, EDOM, x, 0, 1);
    return bits.f32[0];
}
// Special-case (Inf/NaN) handler for sinf: forwards x to _sincosf_special
// with the name "sinf" and operation code _FpCodeSin.
float _sinf_special(float x)
{
return _sincosf_special(x, "sinf", _FpCodeSin);
}
// Special-case (Inf/NaN) handler for sin: forwards x to _sincos_special
// with the name "sin" and operation code _FpCodeSin.
double _sin_special(double x)
{
return _sincos_special(x, "sin", _FpCodeSin);
}
// Special-case (Inf/NaN) handler for cosf: forwards x to _sincosf_special
// with the name "cosf" and operation code _FpCodeCos.
float _cosf_special(float x)
{
return _sincosf_special(x, "cosf", _FpCodeCos);
}
// Special-case (Inf/NaN) handler for cos: forwards x to _sincos_special
// with the name "cos" and operation code _FpCodeCos.
double _cos_special(double x)
{
return _sincos_special(x, "cos", _FpCodeCos);
}
// Special-case (Inf/NaN) handler for tan: forwards x to _sincos_special
// with the name "tan" and operation code _FpCodeTan.
double _tan_special(double x)
{
return _sincos_special(x, "tan",_FpCodeTan);
}
// Special-case (Inf/NaN) handler for tanf: forwards x to _sincosf_special
// with the name "tanf" and operation code _FpCodeTan.
float _tanf_special(float x)
{
return _sincosf_special(x, "tanf",_FpCodeTan);
}

View File

@@ -0,0 +1,664 @@
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the sinf function.
;
; Prototype
;
; float sinf(float x);
;
; Computes sinf(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
.const
ALIGN 16
; Read-only constant pool for sinf.  Most entries are emitted twice so that
; a full 16-byte operand is available to the packed (xmm-wide) instructions
; that reference them.  All floating-point values are IEEE-754 doubles
; written as raw bit patterns.
L_signbit DQ 08000000000000000h ; sign bit of a double (bit 63)
DQ 08000000000000000h
L_sign_mask DQ 07FFFFFFFFFFFFFFFh ; every bit except the sign bit (AND to get |x|)
DQ 07FFFFFFFFFFFFFFFh
L_one DQ 03FF0000000000000h ; 1.0
DQ 03FF0000000000000h
L_int_three DQ 00000000000000003h ; integer 3: mask / compare value for the region number
DQ 00000000000000003h
L_one_half DQ 03FE0000000000000h ; 0.5
DQ 03FE0000000000000h
L_twobypi DQ 03FE45F306DC9C883h ; 2/pi
DQ 03FE45F306DC9C883h
L_piby2_1 DQ 03FF921FB54400000h ; leading part of pi/2 (trailing mantissa bits are zero)
DQ 03FF921FB54400000h
L_one_sixth DQ 03FC5555555555555h ; 1/6
DQ 03FC5555555555555h
L_piby2_1tail DQ 03DD0B4611A626331h ; part of pi/2 not held in L_piby2_1
DQ 03DD0B4611A626331h
L_piby2_2 DQ 03dd0b4611a600000h ; leading part of L_piby2_1tail (trailing mantissa bits are zero)
DQ 03dd0b4611a600000h
L_piby2_2tail DQ 03ba3198a2e037073h ; presumably what remains of pi/2 after L_piby2_1 + L_piby2_2 - verify
DQ 03ba3198a2e037073h
L_inf_mask_32 DD 07F800000h ; exponent field of a float; all ones means Inf or NaN
DD 07F800000h
DQ 07F8000007F800000h
L_int_two DQ 00000000000000002h ; integer 2: compare value for the region number
DQ 00000000000000002h
L_piby2_lead DQ 03ff921fb54442d18h ; pi/2 rounded to double
DQ 03ff921fb54442d18h
L_piby4 DQ 03fe921fb54442d18h ; pi/4 rounded to double
DQ 03fe921fb54442d18h
L_mask_3f2 DQ 03f20000000000000h ; 2^-13 (biased exponent 3f2h, zero mantissa)
DQ 03f20000000000000h
L_mask_3f8 DQ 03f80000000000000h ; 2^-7 (biased exponent 3f8h, zero mantissa)
DQ 03f80000000000000h
; Thresholds above which each code path switches to its large-argument
; reduction.  They are compared against the bit pattern of |x| as integers.
; Do these really need to be different?
L_large_x_fma3 DQ 04170008AC0000000h ; 16779436
L_large_x_sse2 DQ 0416E848000000000h ; 16000000
; Polynomial coefficient tables, the run-time FMA3 selector and the table
; of 2/pi bits are defined in other modules.
EXTRN __Lcosfarray:QWORD
EXTRN __Lsinfarray:QWORD
EXTRN __use_fma3_lib:DWORD
EXTRN __L_2_by_pi_bits:BYTE
; define local variable storage offsets
; All offsets are relative to rsp after StackAllocate stack_size.
; NOTE(review): under the Microsoft x64 convention the 32 bytes at
; [rsp .. rsp+1Fh] are the home area that a called function may overwrite,
; so the slots at 010h and 018h are not guaranteed to survive a call.
p_temp EQU 010h ; temporary for get/put bits operation
p_temp1 EQU 018h ; temporary for get/put bits operation
region EQU 020h ; pointer to region for remainder_piby2
r EQU 028h ; pointer to r for remainder_piby2
dummy_space EQU 040h ; spare slot above the callee home area
stack_size EQU 058h ; total frame size handed to StackAllocate / StackDeallocate
include fm.inc ; presumably supplies the StackAllocate / StackDeallocate macros - verify
fname TEXTEQU <sinf>
fname_special TEXTEQU <_sinf_special>
;Define name and any external functions being called
EXTRN __remainder_piby2d2f_forC : PROC ; NEAR
EXTERN fname_special : PROC
.code
ALIGN 16
PUBLIC fname
;------------------------------------------------------------------------------
; float sinf(float x)
;
; ABI:   Microsoft x64 (MASM PROC FRAME).  x arrives in xmm0 as a float and
;        the result is returned in xmm0 as a float.
; Uses:  rax, rcx, rdx, r8-r11, xmm0-xmm5 (all volatile), flags.
; Stack: stack_size bytes; region[rsp] and r[rsp] receive the outputs of
;        __remainder_piby2d2f_forC, dummy_space[rsp] holds r11 across that
;        call.
;
; The routine dispatches on __use_fma3_lib to an SSE2 or an FMA3 code path.
; Both paths work in double precision and follow the same plan:
;   * Inf / NaN       -> tail to fname_special
;   * |x| <  2^-13    -> sinf(x) = x
;   * |x| <  2^-7     -> x - x^3/6
;   * |x| <= pi/4     -> odd polynomial from __Lsinfarray
;   * otherwise       -> reduce |x| to r in [-pi/4, pi/4] plus a region
;                        number, evaluate sin(r) (even region) or cos(r)
;                        (odd region) and fix up the sign from region bit 1
;                        and the sign of x.
;------------------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    cmp          DWORD PTR __use_fma3_lib, 0
    jne          Lsinf_fma3

;==============================================================================
; SSE2 path
;   rdx = bits of x (as double), r10 = bits of |x|, r11d = 1 if x < 0,
;   eax = region (npi2), r8d = 1 (mask for region bit 0)
;==============================================================================
Lsinf_sse2:
    xorpd        xmm2, xmm2                ; xmm2 = 0.0, used below to form -x
    ;; if NaN or inf
    movd         edx, xmm0
    mov          eax, 07f800000h
    mov          r10d, eax
    and          r10d, edx
    cmp          r10d, eax
    jz           Lsinf_sse2_naninf
    ; GET_BITS_DP64(x, ux);
    ; get the input value to an integer register.
    cvtss2sd     xmm0, xmm0                ; convert input to double.
    movd         rdx, xmm0                 ; rdx is ux
    ; ax = (ux & ~SIGNBIT_DP64);
    mov          r10, rdx
    btr          r10, 63                   ; r10 is ax
    mov          r8d, 1                    ; for determining region later on
    ;; if (ax <= 0x3fe921fb54442d18) abs(x) <= pi/4
    mov          rax, 03fe921fb54442d18h
    cmp          r10, rax
    jg           Lsinf_absx_gt_piby4
    ;; if (ax < 0x3f80000000000000) abs(x) < 2.0^(-7)
    mov          rax, 3f80000000000000h
    cmp          r10, rax
    jge          Lsinf_sse2_small
    ;; if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13)
    mov          rax, 3f20000000000000h
    cmp          r10, rax
    jge          Lsinf_sse2_smaller
    ; sinf = x;
    jmp          Lsinf_sse2_cleanup

    ALIGN 16
Lsinf_sse2_smaller:
    ; 2^-13 <= |x| < 2^-7
    ; sinf = x - x^3 * 0.1666666666666666666;
    movsd        xmm2, xmm0
    movsd        xmm4, QWORD PTR L_one_sixth ; 0.1666666666666666666
    mulsd        xmm2, xmm2                ; x^2
    mulsd        xmm2, xmm0                ; x^3
    mulsd        xmm2, xmm4                ; x^3 * 0.1666666666666666666
    subsd        xmm0, xmm2                ; x - x^3 * 0.1666666666666666666
    jmp          Lsinf_sse2_cleanup

    ALIGN 16
Lsinf_sse2_small:
    ; 2^-7 <= |x| <= pi/4: no reduction needed, x still carries its sign
    movsd        xmm2, xmm0                ; x2 = r * r;
    mulsd        xmm2, xmm0                ; x2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; region 0 or 2 - do a sinf calculation
    ; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
    movsd        xmm1, QWORD PTR __Lsinfarray+18h ; s4
    mulsd        xmm1, xmm2                ; s4x2
    movsd        xmm4, xmm2                ; move for x4
    movsd        xmm5, QWORD PTR __Lsinfarray+8h ; s2
    mulsd        xmm4, xmm2                ; x4
    movsd        xmm3, xmm0                ; move for x3
    mulsd        xmm5, xmm2                ; s2x2
    mulsd        xmm3, xmm2                ; x3
    addsd        xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
    mulsd        xmm1, xmm4                ; s3x4+s4x6
    addsd        xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
    addsd        xmm1, xmm5                ; s1+s2x2+s3x4+s4x6
    mulsd        xmm1, xmm3                ; x3(s1+s2x2+s3x4+s4x6)
    addsd        xmm0, xmm1                ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp          Lsinf_sse2_cleanup
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ALIGN 16
Lsinf_absx_gt_piby4:
    ; xneg = (ax != ux);
    cmp          rdx, r10
    mov          r11d, 0                   ; mov (not xor) so the flags from cmp survive
    ;; if (xneg) x = -x;
    jz           Lsinf_sse2_reduce_moderate
    mov          r11d, 1
    subsd        xmm2, xmm0                ; xmm2 = 0.0 - x
    movsd        xmm0, xmm2
Lsinf_sse2_reduce_moderate:
    ;; if (|x| < L_large_x_sse2)   (1.6e7)
    cmp          r10, QWORD PTR L_large_x_sse2
    jae          Lsinf_sse2_reduce_large
    ; reduce the argument to be in a range from -pi/4 to +pi/4
    ; by subtracting multiples of pi/2
    movsd        xmm2, xmm0
    movsd        xmm3, QWORD PTR L_twobypi
    movsd        xmm4, xmm0
    movsd        xmm5, QWORD PTR L_one_half ; .5
    mulsd        xmm2, xmm3
    ;/* How many pi/2 is x a multiple of? */
    ; xexp = ax >> EXPSHIFTBITS_DP64;
    mov          r9, r10
    shr          r9, 52                    ; >>EXPSHIFTBITS_DP64
    ; npi2 = (int)(x * twobypi + 0.5);
    addsd        xmm2, xmm5                ; npi2
    movsd        xmm3, QWORD PTR L_piby2_1
    cvttpd2dq    xmm0, xmm2                ; convert to integer
    movsd        xmm1, QWORD PTR L_piby2_1tail
    cvtdq2pd     xmm2, xmm0                ; and back to double.
    ; /* Subtract the multiple from x to get an extra-precision remainder */
    ; rhead = x - npi2 * piby2_1;
    mulsd        xmm3, xmm2
    subsd        xmm4, xmm3                ; rhead
    ; rtail = npi2 * piby2_1tail;
    mulsd        xmm1, xmm2
    movd         eax, xmm0                 ; eax = npi2 (region)
    ; GET_BITS_DP64(rhead-rtail, uy);
    ; originally only rhead
    movsd        xmm0, xmm4
    subsd        xmm0, xmm1
    movsd        xmm3, QWORD PTR L_piby2_2
    movd         rcx, xmm0
    movsd        xmm5, QWORD PTR L_piby2_2tail
    ; xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
    ; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    shl          rcx, 1                    ; strip any sign bit
    shr          rcx, 53                   ; >> EXPSHIFTBITS_DP64 +1
    sub          r9, rcx                   ; expdiff
    ;; if (expdiff > 15)
    cmp          r9, 15
    jle          Lsinf_sse2_expdiff_le_15
    ; The remainder is pretty small compared with x, which
    ; implies that x is a near multiple of pi/2
    ; (x matches the multiple to at least 15 bits)
    ; t = rhead;
    movsd        xmm1, xmm4
    ; rtail = npi2 * piby2_2;
    mulsd        xmm3, xmm2
    ; rhead = t - rtail;
    mulsd        xmm5, xmm2                ; npi2 * piby2_2tail
    subsd        xmm4, xmm3                ; rhead
    ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
    subsd        xmm1, xmm4                ; t - rhead
    subsd        xmm1, xmm3                ; -rtail
    subsd        xmm5, xmm1                ; rtail
    ; r = rhead - rtail;
    movsd        xmm0, xmm4
    ; xmm1 = rtail
    movsd        xmm1, xmm5
    subsd        xmm0, xmm5
    ; xmm0=r, xmm4=rhead, xmm1=rtail
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsinf_sse2_expdiff_le_15:
    ; rcx still holds the biased exponent of the first-stage remainder
    cmp          rcx, 03f2h                ; is r < 2^-13 ?
    jge          Lsinf_sse2_calc_sincosf_piby4 ; use taylor series if not
    cmp          rcx, 03deh                ; if r really small.
    jle          Lsinf_sse2_r_very_small   ; then sinf(r) ~ r or 1
    movsd        xmm2, xmm0
    mulsd        xmm2, xmm0                ; xmm2 <-- r^2
    ;; if region is 1 or 3 do a cosf calc.
    and          r8d, eax
    jnz          Lsinf_sse2_small_calc_sin
    ; region 0 or 2 do a sinf calculation
    ; use simply polynomial
    ; x - x*x*x*0.166666666666666666;
    movsd        xmm3, QWORD PTR L_one_sixth
    mulsd        xmm3, xmm0                ; * x
    mulsd        xmm3, xmm2                ; * x^2
    subsd        xmm0, xmm3                ; xs
    jmp          Lsinf_sse2_adjust_region

    ALIGN 16
Lsinf_sse2_small_calc_sin:
    ; NOTE: despite the label name this is the odd-region (cosine) branch.
    ; region 1 or 3 do a cosf calculation
    ; use simply polynomial
    ; 1.0 - x*x*0.5;
    movsd        xmm0, QWORD PTR L_one     ; 1.0
    mulsd        xmm2, QWORD PTR L_one_half ; 0.5 *x^2
    subsd        xmm0, xmm2                ; xc
    jmp          Lsinf_sse2_adjust_region

    ALIGN 16
Lsinf_sse2_r_very_small:
    ;; if region is 0 or 2 do a sinf calc. (sinf ~ x)
    and          r8d, eax
    jz           Lsinf_sse2_adjust_region
    movsd        xmm0, QWORD PTR L_one     ; cosf(r) is a 1
    jmp          Lsinf_sse2_adjust_region
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ALIGN 16
Lsinf_sse2_reduce_large:
    ; Reduce x into range [-pi/4, pi/4]
    ; __remainder_piby2d2f_forC(x, &r, &region);
    ; r11 (xneg) is volatile, so it is parked on the stack across the call.
    ; It must live above the 32-byte home area at [rsp..rsp+1Fh], which the
    ; callee is allowed to overwrite; p_temp (rsp+10h) is inside that area.
    mov          QWORD PTR dummy_space[rsp], r11
    lea          rdx, QWORD PTR r[rsp]
    lea          r8,  QWORD PTR region[rsp]
    movd         rcx, xmm0                 ; rcx = bits of |x|
    call         __remainder_piby2d2f_forC
    mov          r11, QWORD PTR dummy_space[rsp]
    mov          r8d, 1                    ; for determining region later on
    ; xmm0 is not preserved by the call: reload the reduced argument into
    ; xmm0, which is the register the polynomial code below reads.
    movsd        xmm0, QWORD PTR r[rsp]    ; r
    mov          eax, DWORD PTR region[rsp] ; region
    ; xmm0 = r, r8d = 1, eax = region, r11d = xneg
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; perform taylor series to calc sinfx, cosfx
Lsinf_sse2_calc_sincosf_piby4:
    ; x2 = r * r;
    movsd        xmm2, xmm0
    mulsd        xmm2, xmm0                ; x2
    ;; if region is 1 or 3, do a cosf calc.
    and          r8d, eax
    jnz          Lsinf_sse2_do_cosf_calc
    ; region is 0 or 2: do a sinf calc.
    ; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
Lsinf_sse2_do_sinf_calc:
    movsd        xmm1, QWORD PTR __Lsinfarray+18h ; s4
    mulsd        xmm1, xmm2                ; s4x2
    movsd        xmm4, xmm2                ; move for x4
    mulsd        xmm4, xmm2                ; x4
    movsd        xmm5, QWORD PTR __Lsinfarray+8h ; s2
    mulsd        xmm5, xmm2                ; s2x2
    movsd        xmm3, xmm0                ; move for x3
    mulsd        xmm3, xmm2                ; x3
    addsd        xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
    mulsd        xmm1, xmm4                ; s3x4+s4x6
    addsd        xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
    addsd        xmm1, xmm5                ; s1+s2x2+s3x4+s4x6
    mulsd        xmm1, xmm3                ; x3(s1+s2x2+s3x4+s4x6)
    addsd        xmm0, xmm1                ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp          Lsinf_sse2_adjust_region

    ALIGN 16
Lsinf_sse2_do_cosf_calc:
    ; region 1 or 3 - do a cosf calculation
    ; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
    ; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
    movsd        xmm1, QWORD PTR __Lcosfarray+20h ; c4
    movsd        xmm4, xmm2                ; move for x4
    mulsd        xmm1, xmm2                ; c4x2
    movsd        xmm3, QWORD PTR __Lcosfarray+10h ; c2
    mulsd        xmm4, xmm2                ; x4
    movsd        xmm0, QWORD PTR __Lcosfarray ; c0
    mulsd        xmm3, xmm2                ; c2x2
    mulsd        xmm0, xmm2                ; c0x2 (=-0.5x2)
    addsd        xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2
    mulsd        xmm1, xmm4                ; c3x4 + c4x6
    addsd        xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2
    addsd        xmm1, xmm3                ; c1 + c2x2 + c3x4 + c4x6
    mulsd        xmm1, xmm4                ; c1x4 + c2x6 + c3x8 + c4x10
    addsd        xmm0, QWORD PTR L_one     ; 1 - 0.5x2
    addsd        xmm0, xmm1                ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsinf_sse2_adjust_region:
    ; positive or negative
    ; switch (region)
    ; The block below computes bit0 of ((region>>1) XNOR xneg); when that
    ; bit is set the result already has the right sign.
    shr          eax, 1
    mov          ecx, eax
    and          eax, r11d
    not          ecx
    not          r11d
    and          ecx, r11d
    or           eax, ecx
    and          eax, 1
    jnz          Lsinf_sse2_cleanup
    ;; if the original region 0, 1 and arg is negative, then we negate the result.
    ;; if the original region 2, 3 and arg is positive, then we negate the result.
    movsd        xmm2, xmm0
    xorpd        xmm0, xmm0
    subsd        xmm0, xmm2
Lsinf_sse2_cleanup:
    cvtsd2ss     xmm0, xmm0                ; back to single precision
    StackDeallocate stack_size
    ret

    ALIGN 16
Lsinf_sse2_naninf:
    ; xmm0 still holds the original float argument
    call         fname_special
    StackDeallocate stack_size
    ret

;==============================================================================
; FMA3 path
;   xmm5 = x as double (kept for the final sign fix-up), r9 = bits of |x|,
;   xmm4 = region number in its low qword
;==============================================================================
    ALIGN 16
Lsinf_fma3:
    vmovd        eax,xmm0
    mov          r8d,L_inf_mask_32
    and          eax,r8d
    cmp          eax, r8d
    jz           Lsinf_fma3_naninf
    vcvtss2sd    xmm5,xmm0,xmm0
    vmovq        r9,xmm5
    btr          r9,63                     ; r9 <-- |x|
    cmp          r9,L_piby4
    jg           Lsinf_fma3_range_reduce
    cmp          r9,L_mask_3f8
    jge          Lsinf_fma3_compute_sinf_piby_4
    cmp          r9,L_mask_3f2
    jge          Lsinf_fma3_compute_x_xxx_0_1666
    ; Here |x| < 2^-13; just return sin x ~ x
    ; (xmm0 still holds the untouched float argument)
    StackDeallocate stack_size
    ret

    ALIGN 16
Lsinf_fma3_compute_x_xxx_0_1666:
    ; Here |x| < 2^-7; return sin x ~ x - 1/6 x^3
    vmulsd       xmm1,xmm5,xmm5
    vmulsd       xmm0,xmm1,xmm5            ; xmm0 <-- x^3
    vfnmadd132sd xmm0,xmm5,L_one_sixth     ; x - x*x*x*0.166666666666666666
    jmp          Lsinf_fma3_return_sinf_s

    ALIGN 16
Lsinf_fma3_compute_sinf_piby_4:
    ; 2^-7 <= |x| <= pi/4: x + x^3 * (s1 + s2 x^2 + s3 x^4 + s4 x^6)
    vmovapd      xmm0,xmm5
    vmovsd       xmm1,__Lsinfarray+010h
    vmulsd       xmm3,xmm0,xmm0            ; xmm3 <-- x^2
    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray+08h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray
    vmulsd       xmm3,xmm0,xmm3            ; xmm3 <-- x^3
    vfmadd231sd  xmm0,xmm1,xmm3
    jmp          Lsinf_fma3_return_sinf_s

    ALIGN 16
Lsinf_fma3_range_reduce:
    vmovq        xmm0,r9                   ; xmm0 <-- |x|
    cmp          r9,L_large_x_fma3
    jge          Lsinf_fma3_reduce_large
Lsinf_fma3_sinf_reduce_moderate:
    vandpd       xmm1,xmm0,L_sign_mask     ; xmm1 <-- |x|  mov should suffice WAT
    vmovapd      xmm2,L_twobypi
    vfmadd213sd  xmm2,xmm1,L_one_half      ; xmm2 <-- |x| * 2/pi + 0.5
    vcvttpd2dq   xmm2,xmm2                 ; xmm2 <-- npi2 as 32-bit integer
    vpmovsxdq    xmm1,xmm2                 ; widen to 64-bit lanes
    vandpd       xmm4,xmm1,L_int_three     ; xmm4 <-- region
    vshufps      xmm1 ,xmm1,xmm1,8         ; repack low dwords for the conversion
    vcvtdq2pd    xmm1,xmm1                 ; xmm1 <-- npi2 as double
    vmovdqa      xmm2,xmm0
    vfnmadd231sd xmm2,xmm1,L_piby2_1       ; xmm2 <-- rhead
    vmulsd       xmm3,xmm1,L_piby2_1tail   ; xmm3 <-- rtail
    vsubsd       xmm0,xmm2,xmm3            ; xmm0 <-- r_1
    vsubsd       xmm2,xmm2,xmm0
    vsubsd       xmm1,xmm2,xmm3            ; xmm1 <-- rr_1
    jmp          Lsinf_fma3_exit_s

    ALIGN 16
Lsinf_fma3_reduce_large:
    ; Multiply the mantissa of |x| by a window of the bits of 2/pi taken
    ; from __L_2_by_pi_bits, then rebuild a double from the fraction.
    lea          r9,__L_2_by_pi_bits
    ;xexp = (x >> 52) - 1023
    vmovq        r11,xmm0
    mov          rcx,r11
    shr          r11,52
    sub          r11,1023                  ; r11 <-- xexp = exponent of input x
    ;calculate the last byte from which to start multiplication
    ;last = 134 - (xexp >> 3)
    mov          r10,r11
    shr          r10,3
    sub          r10,134                   ;r10 = -last
    neg          r10                       ;r10 = last
    ;load 64 bits of 2_by_pi
    mov          rax,[r9+r10]
    ;mantissa of x = ((x << 12) >> 12) | implied bit
    shl          rcx,12
    shr          rcx,12                    ;rcx = mantissa part of input x
    bts          rcx,52                    ;add the implied bit as well
    ;load next 128 bits of 2_by_pi
    add          r10,8                     ;increment to next 8 bytes of 2_by_pi
    vmovdqu      xmm0,XMMWORD PTR[r9+r10]
    ;do three 64bit multiplications with mant of x
    mul          rcx
    mov          r8,rax                    ; r8 <-- last 64 bits of mul = res1[2]
    mov          r10,rdx                   ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    ;resexp = xexp & 7
    and          r11,7                     ; r11 <-- resexp = last 3 bits
    vpsrldq      xmm0,xmm0,8               ; VEX form: keep this path free of legacy SSE
    add          rax,r10                   ; add the previous carry
    adc          rdx,0
    mov          r9,rax                    ; r9 <-- next 64 bits of mul = res1[1]
    mov          r10,rdx                   ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    add          r10,rax                   ; r10 = most sig 64 bits = res1[0]
    ;find the region
    ;last three bits ltb = most sig bits >> (54 - resexp))
    ; decimal point in last 18 bits == 8 lsb's in first 64 bits
    ; and 8 msb's in next 64 bits
    ;point_five = ltb & 01h;
    ;region = ((ltb >> 1) + point_five) & 3;
    mov          rcx,54
    mov          rax,r10
    sub          rcx,r11
    xor          rdx,rdx                   ;rdx = sign of x(i.e first part of x * 2bypi)
    shr          rax,cl                    ; CF <-- point_five
    jnc          Lsinf_fma3_no_point_five_f
    ;;if there is carry.. then negate the result of multiplication
    not          r10
    not          r9
    not          r8
    mov          rdx,08000000000000000h
Lsinf_fma3_no_point_five_f:
    adc          rax,0                     ; CF is still the bit shifted out above
    and          rax,3
    vmovd        xmm4,eax                  ;store region to xmm4
    ;calculate the number of integer bits and zero them out
    mov          rcx,r11
    add          rcx,10                    ; rcx <-- no. of integer bits
    shl          r10,cl
    shr          r10,cl                    ; r10 contains only mant bits
    sub          rcx,64                    ; form the exponent
    mov          r11,rcx
    ;find the highest set bit
    bsr          rcx,r10
    jnz          Lsinf_fma3_form_mantissa_f
    ; top word is zero: shift the three-word result up by one word
    mov          r10,r9
    mov          r9,r8
    mov          r8,0
    bsr          rcx,r10                   ; rcx <-- hsb
    sub          r11,64
Lsinf_fma3_form_mantissa_f:
    add          r11,rcx                   ;for exp of x
    sub          rcx,52                    ;rcx = no. of bits to shift in r10
    cmp          rcx,0
    jl           Lsinf_fma3_hsb_below_52_f
    je           Lsinf_fma3_form_numbers_f
    ;hsb above 52
    mov          r8,r10                    ; previous contents of r8 not required
    shr          r10,cl                    ; r10 = mantissa of x with hsb at 52
    shr          r9,cl                     ; make space for bits from r10
    sub          rcx,64
    neg          rcx
    shl          r8,cl
    or           r9,r8                     ; r9 = mantissa bits of xx
    jmp          Lsinf_fma3_form_numbers_f

    ALIGN 16
Lsinf_fma3_hsb_below_52_f:
    neg          rcx
    mov          rax,r9
    shl          r10,cl
    shl          r9,cl
    sub          rcx,64
    neg          rcx
    shr          rax,cl
    or           r10,rax
    shr          r8,cl
    or           r9,r8

    ALIGN 16
Lsinf_fma3_form_numbers_f:
    add          r11,1023                  ; re-bias the exponent
    btr          r10,52                    ; remove the implied bit
    mov          rcx,r11
    or           r10,rdx                   ; put the sign
    shl          rcx,52
    or           r10,rcx                   ; r10 <-- x
    vmovq        xmm0,r10                  ; xmm0 <-- x
    vmulsd       xmm0,xmm0,L_piby2_lead    ; scale the fraction by pi/2 to get r

Lsinf_fma3_exit_s:
    ; xmm0 = r, xmm4 = region; odd region -> cosine, even region -> sine
    vmovq        rax,xmm4
    and          rax,01h
    cmp          rax,01h
    jz           Lsinf_fma3_cos_piby4_compute
Lsinf_fma3_sin_piby4_compute:
    ;; vmovapd   xmm1,__Lsinfarray+010h
    vmovsd       xmm1,__Lsinfarray+010h
    vmulsd       xmm3,xmm0,xmm0            ; xmm3 <-- x^2
    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray+008h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray
    vmulsd       xmm3,xmm0,xmm3            ; xmm3 <-- x^3
    vfmadd231sd  xmm0,xmm1,xmm3
    jmp          Lsinf_fma3_exit_s_1

    ALIGN 16
Lsinf_fma3_cos_piby4_compute:
    vmovapd      xmm2,L_one
    vmulsd       xmm3,xmm0,xmm0            ; xmm3 <-- x^2
    vfmadd231sd  xmm2,xmm3,__Lcosfarray    ; xmm2 <-- 1 + c0 x^2
    ; would simple Horner's be slower?
    vmovsd       xmm1,__Lcosfarray+018h    ; xmm1 <-- c3
    vfmadd231sd  xmm1,xmm3,__Lcosfarray+020h ; xmm1 <-- c4 x^2+ c3
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+010h ; xmm1 <-- (c4 x^2+ c3)x^2 + c2
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+008h ; xmm1 <-- ((c4 x^2+ c3)x^2 + c2)x^2 + c1
    vmulsd       xmm3,xmm3,xmm3            ; xmm3 <-- x^4
    vmovdqa      xmm0,xmm2
    vfmadd231sd  xmm0,xmm1,xmm3
Lsinf_fma3_exit_s_1:
    ; assuming FMA3 ==> AVX ==> SSE4.1
    ; Two conditional sign flips: one when region is 0 or 1, one when the
    ; original x is non-negative.  Together they negate the result exactly
    ; when (region is 2 or 3) differs from (x is negative).
    vpcmpeqq     xmm2,xmm4,XMMWORD PTR L_int_two
    vpcmpeqq     xmm3,xmm4,XMMWORD PTR L_int_three
    vorpd        xmm3,xmm2,xmm3            ; all ones if region is 2 or 3
    vandnpd      xmm3,xmm3,L_signbit       ; sign bit if region is 0 or 1
    vxorpd       xmm0,xmm0,xmm3
    vandnpd      xmm1,xmm5,L_signbit       ; sign bit if x is non-negative
    vxorpd       xmm0,xmm1,xmm0
Lsinf_fma3_return_sinf_s:
    vcvtsd2ss    xmm0,xmm0,xmm0            ; back to single precision
    StackDeallocate stack_size
    ret
Lsinf_fma3_naninf:
    ; xmm0 still holds the original float argument
    call         fname_special
    StackDeallocate stack_size
    ret
fname endp
END

View File

@@ -0,0 +1,340 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_SPLITEXP
#define USE_SCALEDOUBLE_1
#define USE_SCALEDOUBLE_2
#define USE_INFINITY_WITH_FLAGS
#define USE_VAL_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_SPLITEXP
#undef USE_SCALEDOUBLE_1
#undef USE_SCALEDOUBLE_2
#undef USE_INFINITY_WITH_FLAGS
#undef USE_VAL_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
#pragma function(sinh)
/*
 * double sinh(double x)
 *
 * Returns the hyperbolic sine of x.
 *
 * Special cases handled before the main computation:
 *   |x| < 2^-28 : returns x (raising inexact unless x is zero)
 *   x is NaN    : reported through _handle_error with the quiet bit set
 *   x is +/-Inf : returns x
 */
double sinh(double x)
{
    /*
    After dealing with special cases the computation is split into
    regions as follows:

    abs(x) >= max_sinh_arg:
    sinh(x) = sign(x)*Inf (reported as overflow)

    abs(x) >= small_threshold:
    sinh(x) = sign(x)*exp(abs(x))/2 computed using the
    splitexp and scaleDouble functions as for exp_amd().

    abs(x) < small_threshold:
    with y = abs(x), y0 = (int)y and dy = y - y0,
    sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
    where sinh(y0) and cosh(y0) come from the lead/tail tables below and
    sinh(dy), cosh(dy) are evaluated by polynomials.
    sinh(x) is then sign(x)*sinh(y). */

    static const double
        max_sinh_arg = 7.10475860073943977113e+02, /* 0x408633ce8fb9f87e */
        thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
        log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
        log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
        small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
    /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */

    /* Lead and tail tabulated values of sinh(i) and cosh(i)
    for i = 0,...,36. The lead part has 26 leading bits. */
    static const double sinh_lead[37] = {
        0.00000000000000000000e+00, /* 0x0000000000000000 */
        1.17520117759704589844e+00, /* 0x3ff2cd9fc0000000 */
        3.62686038017272949219e+00, /* 0x400d03cf60000000 */
        1.00178747177124023438e+01, /* 0x40240926e0000000 */
        2.72899169921875000000e+01, /* 0x403b4a3800000000 */
        7.42032089233398437500e+01, /* 0x40528d0160000000 */
        2.01713153839111328125e+02, /* 0x406936d228000000 */
        5.48316116333007812500e+02, /* 0x4081228768000000 */
        1.49047882080078125000e+03, /* 0x409749ea50000000 */
        4.05154187011718750000e+03, /* 0x40afa71570000000 */
        1.10132326660156250000e+04, /* 0x40c5829dc8000000 */
        2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
        8.13773945312500000000e+04, /* 0x40f3de1650000000 */
        2.21206695312500000000e+05, /* 0x410b00b590000000 */
        6.01302140625000000000e+05, /* 0x412259ac48000000 */
        1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
        4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
        1.20774762500000000000e+07, /* 0x4167093488000000 */
        3.28299845000000000000e+07, /* 0x417f4f2208000000 */
        8.92411500000000000000e+07, /* 0x419546d8f8000000 */
        2.42582596000000000000e+08, /* 0x41aceb0888000000 */
        6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
        1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
        4.87240166400000000000e+09, /* 0x41f226af30000000 */
        1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
        3.60024494080000000000e+10, /* 0x4220c3d390000000 */
        9.78648043520000000000e+10, /* 0x4236c93268000000 */
        2.66024116224000000000e+11, /* 0x424ef822f0000000 */
        7.23128516608000000000e+11, /* 0x42650bba30000000 */
        1.96566712320000000000e+12, /* 0x427c9aae40000000 */
        5.34323724288000000000e+12, /* 0x4293704708000000 */
        1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
        3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
        1.07321789251584000000e+14, /* 0x42d866f348000000 */
        2.91730863685632000000e+14, /* 0x42f0953e28000000 */
        7.93006722514944000000e+14, /* 0x430689e220000000 */
        2.15561576592179200000e+15}; /* 0x431ea215a0000000 */

    static const double sinh_tail[37] = {
        0.00000000000000000000e+00, /* 0x0000000000000000 */
        1.60467555584448807892e-08, /* 0x3e513ae6096a0092 */
        2.76742892754807136947e-08, /* 0x3e5db70cfb79a640 */
        2.09697499555224576530e-07, /* 0x3e8c2526b66dc067 */
        2.04940252448908240062e-07, /* 0x3e8b81b18647f380 */
        1.65444891522700935932e-06, /* 0x3ebbc1cdd1e1eb08 */
        3.53116789999998198721e-06, /* 0x3ecd9f201534fb09 */
        6.94023870987375490695e-06, /* 0x3edd1c064a4e9954 */
        4.98876893611587449271e-06, /* 0x3ed4eca65d06ea74 */
        3.19656024605152215752e-05, /* 0x3f00c259bcc0ecc5 */
        2.08687768377236501204e-04, /* 0x3f2b5a6647cf9016 */
        4.84668088325403796299e-05, /* 0x3f09691adefb0870 */
        1.17517985422733832468e-03, /* 0x3f53410fc29cde38 */
        6.90830086959560562415e-04, /* 0x3f46a31a50b6fb3c */
        1.45697262451506548420e-03, /* 0x3f57defc71805c40 */
        2.99859023684906737806e-02, /* 0x3f9eb49fd80e0bab */
        1.02538800507941396667e-02, /* 0x3f84fffc7bcd5920 */
        1.26787628407699110022e-01, /* 0x3fc03a93b6c63435 */
        6.86652479544033744752e-02, /* 0x3fb1940bb255fd1c */
        4.81593627621056619148e-01, /* 0x3fded26e14260b50 */
        1.70489513795397629181e+00, /* 0x3ffb47401fc9f2a2 */
        1.12416073482258713767e+01, /* 0x40267bb3f55634f1 */
        7.06579578070110514432e+00, /* 0x401c435ff8194ddc */
        5.91244512999659974639e+01, /* 0x404d8fee052ba63a */
        1.68921736147050694399e+02, /* 0x40651d7edccde3f6 */
        2.60692936262073658327e+02, /* 0x40704b1644557d1a */
        3.62419382134885609048e+02, /* 0x4076a6b5ca0a9dc4 */
        4.07689930834187271103e+03, /* 0x40afd9cc72249aba */
        1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
        2.53720210371943067003e+04, /* 0x40d8c70158ac6363 */
        4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
        1.81871712615542812273e+05, /* 0x4106337db36fc718 */
        5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
        6.41374032312148716301e+05, /* 0x412392bc108b37cc */
        7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
        3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
        7.63580561355670914054e+06}; /* 0x415d20d76744835c */

    static const double cosh_lead[37] = {
        1.00000000000000000000e+00, /* 0x3ff0000000000000 */
        1.54308062791824340820e+00, /* 0x3ff8b07550000000 */
        3.76219564676284790039e+00, /* 0x400e18fa08000000 */
        1.00676617622375488281e+01, /* 0x402422a490000000 */
        2.73082327842712402344e+01, /* 0x403b4ee858000000 */
        7.42099475860595703125e+01, /* 0x40528d6fc8000000 */
        2.01715633392333984375e+02, /* 0x406936e678000000 */
        5.48317031860351562500e+02, /* 0x4081228948000000 */
        1.49047915649414062500e+03, /* 0x409749eaa8000000 */
        4.05154199218750000000e+03, /* 0x40afa71580000000 */
        1.10132329101562500000e+04, /* 0x40c5829dd0000000 */
        2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
        8.13773945312500000000e+04, /* 0x40f3de1650000000 */
        2.21206695312500000000e+05, /* 0x410b00b590000000 */
        6.01302140625000000000e+05, /* 0x412259ac48000000 */
        1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
        4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
        1.20774762500000000000e+07, /* 0x4167093488000000 */
        3.28299845000000000000e+07, /* 0x417f4f2208000000 */
        8.92411500000000000000e+07, /* 0x419546d8f8000000 */
        2.42582596000000000000e+08, /* 0x41aceb0888000000 */
        6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
        1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
        4.87240166400000000000e+09, /* 0x41f226af30000000 */
        1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
        3.60024494080000000000e+10, /* 0x4220c3d390000000 */
        9.78648043520000000000e+10, /* 0x4236c93268000000 */
        2.66024116224000000000e+11, /* 0x424ef822f0000000 */
        7.23128516608000000000e+11, /* 0x42650bba30000000 */
        1.96566712320000000000e+12, /* 0x427c9aae40000000 */
        5.34323724288000000000e+12, /* 0x4293704708000000 */
        1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
        3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
        1.07321789251584000000e+14, /* 0x42d866f348000000 */
        2.91730863685632000000e+14, /* 0x42f0953e28000000 */
        7.93006722514944000000e+14, /* 0x430689e220000000 */
        2.15561576592179200000e+15}; /* 0x431ea215a0000000 */

    static const double cosh_tail[37] = {
        0.00000000000000000000e+00, /* 0x0000000000000000 */
        6.89700037027478056904e-09, /* 0x3e3d9f5504c2bd28 */
        4.43207835591715833630e-08, /* 0x3e67cb66f0a4c9fd */
        2.33540217013828929694e-07, /* 0x3e8f58617928e588 */
        5.17452463948269748331e-08, /* 0x3e6bc7d000c38d48 */
        9.38728274131605919153e-07, /* 0x3eaf7f9d4e329998 */
        2.73012191010840495544e-06, /* 0x3ec6e6e464885269 */
        3.29486051438996307950e-06, /* 0x3ecba3a8b946c154 */
        4.75803746362771416375e-06, /* 0x3ed3f4e76110d5a4 */
        3.33050940471947692369e-05, /* 0x3f017622515a3e2b */
        9.94707313972136215365e-06, /* 0x3ee4dc4b528af3d0 */
        6.51685096227860253398e-05, /* 0x3f11156278615e10 */
        1.18132406658066663359e-03, /* 0x3f535ad50ed821f5 */
        6.93090416366541877541e-04, /* 0x3f46b61055f2935c */
        1.45780415323416845386e-03, /* 0x3f57e2794a601240 */
        2.99862082708111758744e-02, /* 0x3f9eb4b45f6aadd3 */
        1.02539925859688602072e-02, /* 0x3f85000b967b3698 */
        1.26787669807076286421e-01, /* 0x3fc03a940fadc092 */
        6.86652631843830962843e-02, /* 0x3fb1940bf3bf874c */
        4.81593633223853068159e-01, /* 0x3fded26e1a2a2110 */
        1.70489514001513020602e+00, /* 0x3ffb4740205796d6 */
        1.12416073489841270572e+01, /* 0x40267bb3f55cb85d */
        7.06579578098005001152e+00, /* 0x401c435ff81e18ac */
        5.91244513000686140458e+01, /* 0x404d8fee052bdea4 */
        1.68921736147088438429e+02, /* 0x40651d7edccde926 */
        2.60692936262087528121e+02, /* 0x40704b1644557e0e */
        3.62419382134890611269e+02, /* 0x4076a6b5ca0a9e1c */
        4.07689930834187453002e+03, /* 0x40afd9cc72249abe */
        1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
        2.53720210371943103382e+04, /* 0x40d8c70158ac6364 */
        4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
        1.81871712615542812273e+05, /* 0x4106337db36fc718 */
        5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
        6.41374032312148716301e+05, /* 0x412392bc108b37cc */
        7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
        3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
        7.63580561355670914054e+06}; /* 0x415d20d76744835c */

    /* These hold full 64-bit IEEE-754 bit patterns.  `unsigned long` is
       only 32 bits wide on Windows (LLP64), which would truncate the
       pattern and break every range test below, so use a type that is
       at least 64 bits on all targets. */
    unsigned long long ux, aux, xneg;
    double y, z, z1, z2;
    int m;

    /* Special cases */

    GET_BITS_DP64(x, ux);
    aux = ux & ~SIGNBIT_DP64;
    if (aux < 0x3e30000000000000) /* |x| small enough that sinh(x) = x */
    {
        if (aux == 0)
            /* with no inexact */
            return x;
        else
            return val_with_flags(x, AMD_F_INEXACT);
    }
    else if (aux >= 0x7ff0000000000000) /* |x| is NaN or Inf */
    {
        if (aux > 0x7ff0000000000000)
            /* x is NaN */
            return _handle_error("sinh", OP_SINH, ux|0x0008000000000000, _DOMAIN,
                                 0, EDOM, x, 0.0, 1);
        else
            return x + x;
    }

    /* Work on y = |x| and restore the sign at the end. */
    xneg = (aux != ux);

    y = x;
    if (xneg) y = -x;

    if (y >= max_sinh_arg)
    {
        /* Result overflows: return an infinity of the matching sign. */
        if (xneg)
            return _handle_error("sinh", OP_SINH, NINFBITPATT_DP64, _OVERFLOW,
                                 AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
        else
            return _handle_error("sinh", OP_SINH, PINFBITPATT_DP64, _OVERFLOW,
                                 AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
    }
    else if (y >= small_threshold)
    {
        /* In this range y is large enough so that
        the negative exponential is negligible,
        so sinh(y) is approximated by sign(x)*exp(y)/2. The
        code below is an inlined version of that from
        exp() with two changes (it operates on
        y instead of x, and the division by 2 is
        done by reducing m by 1). */

        splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
                 log2_by_32_tail, &m, &z1, &z2);
        m -= 1;

        if (m >= EMIN_DP64 && m <= EMAX_DP64)
            z = scaleDouble_1((z1+z2),m);
        else
            z = scaleDouble_2((z1+z2),m);
    }
    else
    {
        /* In this range we find the integer part y0 of y
        and the increment dy = y - y0. We then compute

        z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)

        where sinh(y0) and cosh(y0) are tabulated above. */

        int ind;
        double dy, dy2, sdy, cdy, sdy1, sdy2;

        ind = (int)y;
        dy = y - ind;

        dy2 = dy*dy;
        /* sdy approximates sinh(dy) - dy */
        sdy = dy*dy2*(0.166666666666666667013899e0 +
                      (0.833333333333329931873097e-2 +
                       (0.198412698413242405162014e-3 +
                        (0.275573191913636406057211e-5 +
                         (0.250521176994133472333666e-7 +
                          (0.160576793121939886190847e-9 +
                           0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

        /* cdy approximates cosh(dy) - 1 */
        cdy = dy2*(0.500000000000000005911074e0 +
                   (0.416666666666660876512776e-1 +
                    (0.138888888889814854814536e-2 +
                     (0.248015872460622433115785e-4 +
                      (0.275573350756016588011357e-6 +
                       (0.208744349831471353536305e-8 +
                        0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

        /* At this point sinh(dy) is approximated by dy + sdy.
        Shift some significant bits from dy to sdy. */

        GET_BITS_DP64(dy, ux);
        ux &= 0xfffffffff8000000;
        PUT_BITS_DP64(ux, sdy1);
        sdy2 = sdy + (dy - sdy1);

        /* Sum the products from smallest to largest magnitude. */
        z = ((((((cosh_tail[ind]*sdy2 + sinh_tail[ind]*cdy)
                 + cosh_tail[ind]*sdy1) + sinh_tail[ind])
               + cosh_lead[ind]*sdy2) + sinh_lead[ind]*cdy)
             + cosh_lead[ind]*sdy1) + sinh_lead[ind];
    }

    if (xneg) z = - z;
    return z;
}

View File

@@ -0,0 +1,256 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_SPLITEXP
#define USE_SCALEDOUBLE_1
#define USE_INFINITY_WITH_FLAGS
#define USE_VALF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_SPLITEXP
#undef USE_SCALEDOUBLE_1
#undef USE_INFINITY_WITH_FLAGS
#undef USE_VALF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(sinhf)
float sinhf(float fx)
{
    /*
      Single-precision hyperbolic sine, evaluated internally in double.

      After dealing with special cases the computation is split into
      regions as follows:

      abs(x) >= max_sinh_arg:
        sinh(x) = sign(x)*Inf, reported through _handle_errorf as an
        overflow.

      abs(x) >= small_threshold:
        sinh(x) = sign(x)*exp(abs(x))/2 computed using the
        splitexp and scaleDouble functions as for exp_amd().

      abs(x) < small_threshold:
        split y = abs(x) into its integer part y0 and the increment
        dy = y - y0, then compute
          z = sinh(y0)*cosh(dy) + cosh(y0)*sinh(dy)
        with sinh(y0), cosh(y0) taken from the tables below and
        sinh(dy), cosh(dy) from polynomial approximations.
        sinh(x) is then sign(x)*z. */

    static const double
        /* The max argument of sinhf, but stored as a double */
        max_sinh_arg = 8.94159862922329438106e+01, /* 0x40565a9f84f82e63 */
        thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
        log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
        log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
        small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
    /* (8*BASEDIGITS_DP64*log10of2) => exp(-x) insignificant c.f. exp(x) */
    /* NOTE(review): the 37-entry tables below are indexed by (int)y for
       y < small_threshold, so they rely on small_threshold < 37
       (BASEDIGITS_DP64 presumably 15) - confirm against libm_util.h. */

    /* Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. */

    static const double sinh_lead[37] = {
        0.00000000000000000000e+00, /* 0x0000000000000000 */
        1.17520119364380137839e+00, /* 0x3ff2cd9fc44eb982 */
        3.62686040784701857476e+00, /* 0x400d03cf63b6e19f */
        1.00178749274099008204e+01, /* 0x40240926e70949ad */
        2.72899171971277496596e+01, /* 0x403b4a3803703630 */
        7.42032105777887522891e+01, /* 0x40528d0166f07374 */
        2.01713157370279219549e+02, /* 0x406936d22f67c805 */
        5.48316123273246489589e+02, /* 0x408122876ba380c9 */
        1.49047882578955000099e+03, /* 0x409749ea514eca65 */
        4.05154190208278987484e+03, /* 0x40afa7157430966f */
        1.10132328747033916443e+04, /* 0x40c5829dced69991 */
        2.99370708492480553105e+04, /* 0x40dd3c4488cb48d6 */
        8.13773957064298447222e+04, /* 0x40f3de1654d043f0 */
        2.21206696003330085659e+05, /* 0x410b00b5916a31a5 */
        6.01302142081972560845e+05, /* 0x412259ac48bef7e3 */
        1.63450868623590236530e+06, /* 0x4138f0ccafad27f6 */
        4.44305526025387924165e+06, /* 0x4150f2ebd0a7ffe3 */
        1.20774763767876271158e+07, /* 0x416709348c0ea4ed */
        3.28299845686652474105e+07, /* 0x417f4f22091940bb */
        8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
        2.42582597704895108938e+08, /* 0x41aceb088b68e803 */
        6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
        1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
        4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
        1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
        3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
        9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
        2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
        7.23128532145737548828e+11, /* 0x42650bba3796379a */
        1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
        5.34323729076223046875e+12, /* 0x429370470aec28ec */
        1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
        3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
        1.07321789892958031250e+14, /* 0x42d866f34a725782 */
        2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
        7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
        2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */

    static const double cosh_lead[37] = {
        1.00000000000000000000e+00, /* 0x3ff0000000000000 */
        1.54308063481524371241e+00, /* 0x3ff8b07551d9f550 */
        3.76219569108363138810e+00, /* 0x400e18fa0df2d9bc */
        1.00676619957777653269e+01, /* 0x402422a497d6185e */
        2.73082328360164865444e+01, /* 0x403b4ee858de3e80 */
        7.42099485247878334349e+01, /* 0x40528d6fcbeff3a9 */
        2.01715636122455890700e+02, /* 0x406936e67db9b919 */
        5.48317035155212010977e+02, /* 0x4081228949ba3a8b */
        1.49047916125217807348e+03, /* 0x409749eaa93f4e76 */
        4.05154202549259389343e+03, /* 0x40afa715845d8894 */
        1.10132329201033226127e+04, /* 0x40c5829dd053712d */
        2.99370708659497577173e+04, /* 0x40dd3c4489115627 */
        8.13773957125740562333e+04, /* 0x40f3de1654d6b543 */
        2.21206696005590405548e+05, /* 0x410b00b5916b6105 */
        6.01302142082804115489e+05, /* 0x412259ac48bf13ca */
        1.63450868623620807193e+06, /* 0x4138f0ccafad2d17 */
        4.44305526025399193168e+06, /* 0x4150f2ebd0a8005c */
        1.20774763767876680940e+07, /* 0x416709348c0ea503 */
        3.28299845686652623117e+07, /* 0x417f4f22091940bf */
        8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
        2.42582597704895138741e+08, /* 0x41aceb088b68e804 */
        6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
        1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
        4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
        1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
        3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
        9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
        2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
        7.23128532145737548828e+11, /* 0x42650bba3796379a */
        1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
        5.34323729076223046875e+12, /* 0x429370470aec28ec */
        1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
        3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
        1.07321789892958031250e+14, /* 0x42d866f34a725782 */
        2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
        7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
        2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */

    /* ux/aux hold the full 64-bit pattern of a double and are compared
       against 64-bit constants, so they must be a 64-bit type.
       "unsigned long" is only 32 bits wide under MSVC and would truncate
       the pattern, making every comparison below meaningless. */
    unsigned long long ux, aux;
    int xneg;
    double x = fx, y, z, z1, z2;
    int m;

    /* Special cases */

    GET_BITS_DP64(x, ux);
    aux = ux & ~SIGNBIT_DP64;   /* bit pattern of |x| */
    if (aux < 0x3f10000000000000) /* |x| small enough that sinh(x) = x */
    {
        if (aux == 0)
            /* with no inexact */
            return fx;
        else
            return valf_with_flags(fx, AMD_F_INEXACT);
    }
    else if (aux >= 0x7ff0000000000000) /* |x| is NaN or Inf */
    {
        if (aux > 0x7ff0000000000000)
        {
            /* x is NaN; hand the handler the input pattern with the
               quiet bit set */
            unsigned int uhx;
            GET_BITS_SP32(fx, uhx);
            return _handle_errorf("sinhf", OP_SINH, uhx|0x00400000, _DOMAIN,
                                  0, EDOM, fx, 0.0F, 1);
        }
        else
            /* sinh(+-Inf) = +-Inf */
            return fx + fx;
    }

    /* The sign bit was set iff masking it off changed the pattern. */
    xneg = (aux != ux);

    y = x;
    if (xneg) y = -x;

    if (y >= max_sinh_arg)
    {
        /* Return infinity with overflow flag. */
        if (xneg)
            return _handle_errorf("sinhf", OP_SINH, NINFBITPATT_SP32, _OVERFLOW,
                                  AMD_F_OVERFLOW, ERANGE, fx, 0.0F, 1);
        else
            return _handle_errorf("sinhf", OP_SINH, PINFBITPATT_SP32, _OVERFLOW,
                                  AMD_F_OVERFLOW, ERANGE, fx, 0.0F, 1);
    }
    else if (y >= small_threshold)
    {
        /* In this range y is large enough so that
           the negative exponential is negligible,
           so sinh(y) is approximated by sign(x)*exp(y)/2. The
           code below is an inlined version of that from
           exp() with two changes (it operates on
           y instead of x, and the division by 2 is
           done by reducing m by 1). */

        splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
                 log2_by_32_tail, &m, &z1, &z2);
        m -= 1;
        /* scaleDouble_1 is always safe because the argument x was
           float, rather than double */
        z = scaleDouble_1((z1+z2),m);
    }
    else
    {
        /* In this range we find the integer part y0 of y
           and the increment dy = y - y0. We then compute

           z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)

           where sinh(y0) and cosh(y0) are tabulated above. */

        int ind;
        double dy, dy2, sdy, cdy;

        ind = (int)y;
        dy = y - ind;

        dy2 = dy*dy;

        /* sdy approximates sinh(dy): odd polynomial in dy */
        sdy = dy + dy*dy2*(0.166666666666666667013899e0 +
                          (0.833333333333329931873097e-2 +
                           (0.198412698413242405162014e-3 +
                            (0.275573191913636406057211e-5 +
                             (0.250521176994133472333666e-7 +
                              (0.160576793121939886190847e-9 +
                               0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

        /* cdy approximates cosh(dy): even polynomial in dy */
        cdy = 1 + dy2*(0.500000000000000005911074e0 +
                       (0.416666666666660876512776e-1 +
                        (0.138888888889814854814536e-2 +
                         (0.248015872460622433115785e-4 +
                          (0.275573350756016588011357e-6 +
                           (0.208744349831471353536305e-8 +
                            0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);

        z = sinh_lead[ind]*cdy + cosh_lead[ind]*sdy;
    }

    if (xneg) z = - z;
    return (float)z;
}

View File

@@ -0,0 +1,88 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#if USE_SOFTWARE_SQRT
#define USE_SQRT_AMD_INLINE
#endif
#define USE_NAN_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#if USE_SOFTWARE_SQRT
#undef USE_SQRT_AMD_INLINE
#endif
#undef USE_NAN_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
#pragma function(sqrt)
/* Double-precision square root with Microsoft-style error reporting.
 *
 * NaN inputs and negative (nonzero) inputs, including -infinity, are routed
 * through _handle_error; everything else is computed with the SSE2 scalar
 * square-root intrinsic. When USE_SOFTWARE_SQRT is set the whole job is
 * delegated to sqrt_amd_inline instead.
 */
double sqrt(double x)
{
#if USE_SOFTWARE_SQRT
    return sqrt_amd_inline(x);
#else
    double r;
    /* Holds the full 64-bit pattern of x and is masked with 64-bit
       constants, so it must be a 64-bit type. "unsigned long" is only
       32 bits wide under MSVC, which would drop the sign and exponent
       bits and disable every special-case check below. */
    unsigned long long ux;
    GET_BITS_DP64(x, ux);

    /* Check for special cases for Microsoft error handling */
    if ((ux & PINFBITPATT_DP64) == PINFBITPATT_DP64)
    {
        /* x is infinity, or NaN */
        if (ux & MANTBITS_DP64)
        {
            /* NaN of some sort */
            /* If it's a signaling NaN, convert to QNaN */
            return _handle_error("sqrt", OP_SQRT, ux|0x0008000000000000,
                                 _DOMAIN, 0,EDOM, x, 0.0, 1);
        }
        else
        {
            /* +/-infinity */
            if (ux & SIGNBIT_DP64)
            {
                /* - infinity */
                return _handle_error("sqrt", OP_SQRT, INDEFBITPATT_DP64,
                                     _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
            }
            /* positive infinite is not a problem */
        }
    }

    /* Sign bit set and some other bit set: x < 0 (-0.0 is excluded
       because nothing but its sign bit is set). */
    if ((ux & SIGNBIT_DP64)&&(ux & ~SIGNBIT_DP64)) /* if x < zero */
    {
        return _handle_error("sqrt", OP_SQRT, INDEFBITPATT_DP64,
                             _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
    }

    /* VC++ intrinsic call */
    _mm_store_sd(&r, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&x)));
    return r;
#endif
}

View File

@@ -0,0 +1,91 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#if USE_SOFTWARE_SQRT
#define USE_SQRTF_AMD_INLINE
#endif
#define USE_NANF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#if USE_SOFTWARE_SQRT
#undef USE_SQRTF_AMD_INLINE
#endif
#undef USE_NANF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(sqrtf)
/* Single-precision square root with Microsoft-style error reporting.
 *
 * NaN inputs and negative (nonzero) inputs, including -infinity, go through
 * _handle_errorf; all other values are computed with the SSE scalar
 * square-root intrinsic. With USE_SOFTWARE_SQRT set, sqrtf_amd_inline does
 * all the work.
 */
float sqrtf(float x)
{
#if USE_SOFTWARE_SQRT
    return sqrtf_amd_inline(x);
#else
    float root;
    unsigned int bits;

    GET_BITS_SP32(x, bits);

    /* NaN: exponent field all ones and a nonzero mantissa.
       Setting bit 22 turns a signaling NaN into a quiet one. */
    if (((bits & PINFBITPATT_SP32) == PINFBITPATT_SP32) &&
        ((bits & MANTBITS_SP32) != 0))
    {
        return _handle_errorf("sqrtf", OP_SQRT, bits|0x00400000, _DOMAIN, 0,
                              EDOM, x, 0.0F, 1);
    }

    /* Any remaining value with the sign bit plus at least one other bit
       set is negative: this covers -infinity as well as finite x < 0,
       while -0.0 (sign bit only) and +infinity fall through. */
    if (((bits & SIGNBIT_SP32) != 0) && ((bits & ~SIGNBIT_SP32) != 0))
    {
        return _handle_errorf("sqrtf", OP_SQRT, INDEFBITPATT_SP32,
                              _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0F, 1);
    }

    /* Ordinary case: hardware square root through the VC++ intrinsic. */
    _mm_store_ss(&root, _mm_sqrt_ss(_mm_load_ss(&x)));
    return root;
#endif
}

View File

@@ -0,0 +1,762 @@
;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the tan function.
;
; Prototype:
;
; double tan(double x);
;
; Computes tan(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
; If FMA3 hardware is present, it will be used for the calculation.
;
.const
ALIGN 16
; Read-only constants for tan. All values are IEEE-754 double bit patterns
; (or 64-bit masks) written as hexadecimal integers.
; Entries followed by a "duplicate for pd" line occupy 16 bytes so that the
; same label can also serve as a packed-double operand.

; Sign-bit mask and its complement (|x| mask)
L_signbit DQ 08000000000000000h
DQ 08000000000000000h ; duplicate for pd
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
DQ 07FFFFFFFFFFFFFFFh ; duplicate for pd
L_int_one DQ 00000000000000001h
DQ 00000000000000001h ; duplicate for pd
L_twobypi DQ 03FE45F306DC9C883h ; 2/pi
DQ 03FE45F306DC9C883h ; duplicate for pd
L_point_333 DQ 03FD5555555555555h; 1/3
DQ 03FD5555555555555h ; duplicate for pd
; Coefficients of the rational approximation p(r)/q(r) used for the core
; tangent computation: p = (p4*r + p2)*r + p0, q = ((q6*r + q4)*r + q2)*r + q0
L_tan_p0 DQ 03FD7D50F6638564Ah ; 0.372379159759792203640806338901e0
DQ 03FD7D50F6638564Ah ; duplicate for pd
L_tan_p2 DQ 0BF977C24C7569ABBh ; -0.229345080057565662883358588111e-1
DQ 0BF977C24C7569ABBh ; duplicate for pd
L_tan_p4 DQ 03F2D5DAF289C385Ah ; 0.224044448537022097264602535574e-3
DQ 03F2D5DAF289C385Ah ; duplicate for pd
L_tan_q0 DQ 03FF1DFCB8CAA40B8h ; 0.111713747927937668539901657944e1
DQ 03FF1DFCB8CAA40B8h ; duplicate for pd
L_tan_q2 DQ 0BFE08046499EB90Fh ; -0.515658515729031149329237816945e0
DQ 0BFE08046499EB90Fh ; duplicate for pd
L_tan_q4 DQ 03F9AB0F4F80A0ACFh ; 0.260656620398645407524064091208e-1
DQ 03F9AB0F4F80A0ACFh ; duplicate for pd
L_tan_q6 DQ 0BF2E7517EF6D98F8h ; -0.232371494088563558304549252913e-3
DQ 0BF2E7517EF6D98F8h ; duplicate for pd
; Mask that keeps only the upper 32 bits of a double's bit pattern
L_half_mask DQ 0ffffffff00000000h
DQ 0ffffffff00000000h ; duplicate for pd
L_piby4_lead DQ 03FE921FB54442D18h ; pi/4, high part
DQ 03FE921FB54442D18h ; duplicate for pd
L_piby4_tail DQ 03C81A62633145C06h ; pi/4, low part
DQ 03C81A62633145C06h ; duplicate for pd
; Different parts of argument reduction need different versions of pi/2
L_piby2_1 DQ 03FF921FB54400000h ; pi/2, high 33 bits
L_piby2_1tail DQ 03DD0B4611A626331h ; pi/2, second 53 bits, overlaps...
L_piby2_2 DQ 03DD0B4611A600000h ; pi/2, second 33 bits
L_piby2_2tail DQ 03BA3198A2E037073h ; pi/2, third 53 bits, overlaps...
L_piby2_3 DQ 03BA3198A2E000000h ; pi/2, third 33 bits
L_piby2_3tail DQ 0397B839A252049C1h ; pi/2, fourth 53 bits
; end of pi/2 versions
; Thresholds on |x| used by the SSE2 path to pick a small-argument shortcut
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
L_inf_mask_64 DQ 07FF0000000000000h ; exponent bits all ones (+infinity)
L_point_five DQ 03FE0000000000000h ; 0.5
L_point_68 DQ 03FE5C28F5C28F5C3h ; .68
L_n_point_68 DQ 0BFE5C28F5C28F5C3h ; -.68
L_zero DQ -0000000000000000h ; 0.0 (the negated literal is still integer 0, i.e. +0.0)
L_one DQ 03FF0000000000000h ; 1.0
L_n_one DQ 0BFF0000000000000h ; -1.0
L_two DQ 04000000000000000h ; 2.0
; Upper limits on |x| for the cheaper argument reductions:
; _cw is compared against in the SSE2 path, _bdl in the FMA3 path
L_moderate_arg_cw DQ 0411E848000000000h ; 5.e5
L_moderate_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
; Text macros for the exported routine and for the handler it calls
; when the argument is NaN or +-infinity
fname TEXTEQU <tan>
fname_special TEXTEQU <_tan_special>
; local storage offsets, relative to rsp after the prologue's StackAllocate
save_xmm6 EQU 020h ; slot used by SaveXmm/RestoreXmm for xmm6
save_xmm7 EQU 030h ; slot used by SaveXmm/RestoreXmm for xmm7
store_input EQU 040h ; FMA3 path: copy of the original x (for its sign)
save_r10 EQU 050h ; SSE2 path: r10 saved around __remainder_piby2_forAsm
dummy_space EQU 060h ; NOTE(review): no use visible in fname - confirm whether still needed
stack_size EQU 088h ; total frame size passed to StackAllocate
; fm.inc presumably supplies the StackAllocate / SaveXmm / RestoreXmm macros - verify
include fm.inc
; Nonzero selects the FMA3 code path at entry to fname
EXTERN __use_fma3_lib:DWORD
EXTERN fname_special : PROC
; Argument-reduction helpers: the two _fma3 variants are called from the
; FMA3 path, _forAsm from the SSE2 path for very large |x|
EXTERN __remainder_piby2_fma3 : PROC
EXTERN __remainder_piby2_fma3_bdl : PROC
EXTERN __remainder_piby2_forAsm : PROC
; Called with ecx = 20h (AMD_F_INEXACT) to raise the inexact flag
EXTERN _set_statfp : PROC
.code
ALIGN 16
PUBLIC fname
;-----------------------------------------------------------------------
; double tan(double x)
;
; In:   xmm0 = x
; Out:  xmm0 = tan(x)
;
; The routine has two independent implementations selected at entry by
; __use_fma3_lib: zero runs the SSE2 path (Ltan_sse2), nonzero runs the
; FMA3/AVX path (Ltan_fma3).
;
; SSE2 path register roles:
;   rdx  = bit pattern of x, later the biased exponent of |x|
;   rcx  = bit pattern of |x|
;   xmm6 = x, then |x|, then rhead / rr during argument reduction
;   r8d  = region, reduced to the recip flag (region & 1) for _tan_piby4
;   r10  = 1 if x was negative, else 0
;
; FMA3 path register roles:
;   r9   = bit pattern of |x|
;   rax  = region returned by the __remainder_piby2_fma3* helpers
;   xmm7 = transform (0, +1 or -1)
;   [rsp + store_input] = original x, used to restore the sign
;
; xmm6 and xmm7 are saved in the prologue and restored on every exit path.
;-----------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    SaveXmm xmm6, save_xmm6
    SaveXmm xmm7, save_xmm7
    .ENDPROLOG
    cmp     DWORD PTR __use_fma3_lib, 0
    jne     Ltan_fma3

Ltan_sse2:
    movd    rdx, xmm0                   ; really movq
    movaps  xmm6, xmm0
    mov     rcx, rdx
    btr     rcx, 63                     ; rcx <-- |x|
    cmp     rcx, L_piby4_lead
    ja      Ltan_abs_x_nle_pio4         ; branch if > pi/4 or NaN
    cmp     rcx, L_two_to_neg_13
    jae     Ltan_abs_x_ge_two_to_neg_13
    cmp     rcx, L_two_to_neg_27
    jae     Labs_x_ge_two_to_neg_27
    ; At this point tan(x) ~= x; if it's not exact, set the inexact flag
    test    rcx, rcx
    je      Ltan_return                 ; x is +-0: return it unchanged
    mov     ecx, 20h                    ; ecx <-- AMD_F_INEXACT
    call    _set_statfp
    movaps  xmm0, xmm6                  ; may be redundant, but xmm0 <-- x
    RestoreXmm xmm7, save_xmm7
    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret     0

Labs_x_ge_two_to_neg_27:
    ; 2^-27 <= |x| < 2^-13: tan(x) ~= x + x^3/3
    mulsd   xmm0, xmm0                  ; xmm0 <-- x*x
    mulsd   xmm0, xmm6                  ; xmm0 <-- x*x*x
    mulsd   xmm0, QWORD PTR L_point_333 ; xmm0 <-- x^3/3
    addsd   xmm0, xmm6                  ; xmm0 <-- x + x^3/3
    RestoreXmm xmm7, save_xmm7
    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret     0

Ltan_abs_x_ge_two_to_neg_13:
    ; 2^-13 <= |x| <= pi/4: no reduction needed
    xorps   xmm1, xmm1                  ; xmm1 <-- xx = 0
    xor     r8d, r8d                    ; r8 <-- recip flag = 0
    call    _tan_piby4

Ltan_return:
    RestoreXmm xmm7, save_xmm7
    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret     0

Ltan_abs_x_nle_pio4:
    cmp     rcx, L_inf_mask_64          ; |x| uint >= +inf as uint ?
    jnae    Ltan_x_is_finite
    call    fname_special               ; x is +-Inf or NaN
    RestoreXmm xmm7, save_xmm7
    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

    ALIGN 16
Ltan_x_is_finite:
    xor     r8d, r8d
    xor     r10, r10
    cmp     rcx, rdx
    setne   r10b                        ; r10 <-- x was negative flag
    andpd   xmm6, L_sign_mask           ; xmm6 <-- |x|
    movsd   xmm0, QWORD PTR L_moderate_arg_cw ; currently 5e5
    comisd  xmm0, xmm6
    jbe     Ltan_x_is_very_large

Ltan_x_is_moderate:                     ; unused label
    ; For these arguments we do a Cody-Waite reduction, subtracting the
    ; appropriate multiple of pi/2, using extra precision where x is close
    ; to an exact multiple of pi/2
    ; We special-case region setting for |x| <= 9pi/4
    ; It seems strange that this speeds things up, but it does
    mov     rdx, rcx
    mov     rax, 4616025215990052958    ; 400f6a7a2955385eH (5pi/4)
    shr     rdx, 52                     ; rdx <-- xexp
    cmp     rcx, rax
    ja      Labs_x_gt_5pio4
    mov     rax, 4612488097114038738    ; 4002d97c7f3321d2H (3pi/4)
    cmp     rcx, rax
    seta    r8b
    inc     r8d                         ; r8d <-- region (1 or 2)
    jmp     Lhave_region

Labs_x_gt_5pio4:
    mov     rax, 4619644535898419899    ; 401c463abeccb2bbH (9pi/4)
    cmp     rcx, rax
    ja      Lneed_region_computation
    mov     rax, 4617875976460412789    ; 4015fdbbe9bba775H (7pi/4)
    cmp     rcx, rax
    seta    r8b
    add     r8d, 3                      ; r8d <-- region (3 or 4)
    jmp     Lhave_region

    ALIGN 16
Lneed_region_computation:
    ; region = (int)(|x| * 2/pi + 0.5)
    movaps  xmm0, xmm6
    mulsd   xmm0, QWORD PTR L_twobypi
    addsd   xmm0, QWORD PTR L_point_five
    cvttsd2si r8d, xmm0                 ; r8d <-- region

Lhave_region:
    movd    xmm3, r8d
    cvtdq2pd xmm3, xmm3                 ; xmm3 <-- npi2 = (double)region
    movaps  xmm2, xmm3
    movaps  xmm0, xmm3
    mulsd   xmm0, QWORD PTR L_piby2_1
    mulsd   xmm2, QWORD PTR L_piby2_1tail ; xmm2 <-- rtail = npi2 * piby2_1tail
    subsd   xmm6, xmm0                  ; xmm6 <-- rhead = x - npi2*piby2_1
    ; If x is not too close to multiple of pi/2,
    ; we're essentially done with reduction
    ; If the exponent of rhead is not close to that of x,
    ; then most of x has been subtracted away in computing rhead;
    ; i.e., x is close to a multiple of pi/2.
    movd    rax, xmm6
    shr     rax, 52
    and     eax, 2047                   ; eax <-- biased exponent of rhead
    sub     rdx, rax                    ; rdx <-- exp diff of x vs rhead
    cmp     rdx, 15
    jbe     Ltan_have_rhead_rtail
    ; Oops, x is almost a multiple of pi/2. Compute more bits of reduced x
    ; t = rhead;
    ; rtail = npi2 * piby2_2;
    ; rhead = t - rtail;
    ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
    movaps  xmm1, xmm6
    movaps  xmm0, xmm3
    movaps  xmm2, xmm3
    mulsd   xmm0, QWORD PTR L_piby2_2
    mulsd   xmm2, QWORD PTR L_piby2_2tail
    subsd   xmm6, xmm0                  ; xmm6 <-- rhead = t - rtail
    subsd   xmm1, xmm6                  ; xmm1 <-- t - rhead
    subsd   xmm1, xmm0                  ; xmm1 <-- ((t - rhead) - rtail)
    subsd   xmm2, xmm1                  ; xmm2 <-- new rtail
    cmp     rdx, 48
    jbe     Ltan_have_rhead_rtail       ; We've done enough
    ; Wow, x is REALLY close to a multiple of pi/2. Compute more bits.
    ; t = rhead;
    ; rtail = npi2 * piby2_3;
    ; rhead = t - rtail;
    ; rtail = npi2 * piby2_3tail - ((t - rhead) - rtail);
    movaps  xmm1, xmm6
    movaps  xmm0, xmm3
    movaps  xmm2, xmm3
    mulsd   xmm0, QWORD PTR L_piby2_3
    mulsd   xmm2, QWORD PTR L_piby2_3tail
    subsd   xmm6, xmm0                  ; xmm6 <-- rhead = t - rtail
    subsd   xmm1, xmm6                  ; xmm1 <-- t - rhead
    subsd   xmm1, xmm0                  ; xmm1 <-- ((t - rhead) - rtail)
    subsd   xmm2, xmm1                  ; xmm2 <-- final rtail

Ltan_have_rhead_rtail:
    ; At this point xmm6 has a suitable rhead, xmm2 a suitable rtail
    movaps  xmm0, xmm6                  ; xmm0 <-- copy of rhead
    ; r = rhead - rtail
    ; rr = (rhead - r) - rtail;
    ; region = npi2 & 3;
    and     r8d, 3                      ; r8d <-- region
    subsd   xmm0, xmm2                  ; xmm0 <-- r = rhead - rtail
    subsd   xmm6, xmm0                  ; xmm6 <-- rhead - r
    subsd   xmm6, xmm2                  ; xmm6 <-- rr = (rhead - r) - rtail

Ltan_do_tan_computation:
    and     r8d, 1                      ; r8d <-- region & 1
    movaps  xmm1, xmm6                  ; xmm1 <-- rr
    call    _tan_piby4
    test    r10d, r10d                  ; was x negative?
    je      Ltan_pos_return
    xorpd   xmm0, QWORD PTR L_signbit   ; tan(-x) = -tan(x)

Ltan_pos_return:
    RestoreXmm xmm7, save_xmm7
    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret     0

    ALIGN 16
Ltan_x_is_very_large:
    ; Reduce x into range [-pi/4,pi/4] (general case)
    movaps  xmm0, xmm6
    mov     QWORD PTR [rsp+save_r10], r10
    call    __remainder_piby2_forAsm    ; this call clobbers r10
    mov     r10, QWORD PTR [rsp+save_r10]
    movapd  xmm6, xmm1                  ; xmm6 <-- rr
    mov     r8d, eax                    ; r8d <-- region
    jmp     Ltan_do_tan_computation

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; From here on, it is assumed that the hardware supports FMA3 (and AVX).
; Only VEX-encoded forms are used in this path so that legacy SSE and AVX
; encodings are not interleaved.
    ALIGN 16
Ltan_fma3:
    vmovq   r9, xmm0
    mov     rdx, r9                     ; rdx <-- x
    btr     r9, 63                      ; r9 <-- |x|
    cmp     r9, L_piby4_lead
    jae     Ltan_fma3_absx_gt_pio4      ; Note that NaN will branch

Ltan_fma3_absx_le_pio4:
    ; no argument reduction is needed, so recip is 0, xx is 0.
    ; Note that this routine is not special-casing very small |x|
    vmovsd  xmm5, L_piby4_lead
    vmovsd  xmm6, L_piby4_tail
    vxorpd  xmm1, xmm1, xmm1            ; xx <-- 0.
    vxorpd  xmm7, xmm7, xmm7            ; transform <-- 0
    vcomisd xmm0, L_point_68
    jbe     Ltan_fma3_small_x_le_point_68

Ltan_fma3_x_small_gt_point_68:
    vmovsd  xmm7, L_one                 ; xmm7 <-- transform = 1.0
    vsubsd  xmm0, xmm5, xmm0            ; x = piby4_lead - x
    vaddsd  xmm0, xmm0, xmm6            ; xmm0 <-- x = x + xl = x + piby4_tail
    jmp     Ltan_fma3_compute_Remez_for_small_x

    ALIGN 16
Ltan_fma3_small_x_le_point_68:
    vcomisd xmm0, L_n_point_68
    jae     Ltan_fma3_compute_Remez_for_small_x

Ltan_fma3_small_x_lt_neg_point_68:
    vmovsd  xmm7, L_n_one               ; xmm7 <-- transform = -1.0
    vaddsd  xmm0, xmm5, xmm0            ; x = piby4_lead + x
    vaddsd  xmm0, xmm0, xmm6            ; xmm0 <-- x = x + xl = x + piby4_tail

Ltan_fma3_compute_Remez_for_small_x:
    ; At this point xmm0 holds x, possibly transformed
    ; now do core Remez rational approximation for x in [0,0.68]
    vmovsd  xmm4, L_tan_q6
    vmovsd  xmm3, L_tan_p4
    vmulsd  xmm2, xmm0, xmm0            ; xx is 0, so xmm2 <-- r = x*x
    vfmadd213sd xmm4, xmm2, L_tan_q4    ; xmm4 <-- q6*r + q4
    vfmadd213sd xmm3, xmm2, L_tan_p2    ; xmm3 <-- p4*r + p2
    vfmadd213sd xmm4, xmm2, L_tan_q2    ; xmm4 <-- (q6*r + q4)*r + q2
    vfmadd213sd xmm3, xmm2, L_tan_p0    ; xmm3 <-- p2 (polynomial)
    vfmadd213sd xmm4, xmm2, L_tan_q0    ; xmm4 <-- q3 (polynomial)
    vdivsd  xmm3, xmm3, xmm4            ; xmm3 <-- r3 = p2/q3
    vmulsd  xmm3, xmm3, xmm2            ; xmm3 <-- r * r3
    vfmadd132sd xmm0, xmm0, xmm3        ; xx = 0, so xmm0 <-- t = x + x*(r*r3)
    vcomisd xmm7, L_zero                ; did we transform x?
    ; if x was transformed, we need to transform t to get answer;
    ; if not, the answer is just t.
    je      Ltan_fma3_ext_piby4_zero
    ; x was transformed, so answer is +- (1. - 2.*t/(1.+t))
    ; (remember recip is 0 here)
    vmovsd  xmm3, L_one
    vaddsd  xmm4, xmm0, L_one           ; xmm4 <-- 1. + t
    vdivsd  xmm6, xmm0, xmm4            ; xmm6 <-- t / (1.+t)
    vfnmadd231sd xmm3, xmm6, L_two      ; xmm3 <-- 1. - 2.*t/(1.+t)
    vmulsd  xmm0, xmm3, xmm7            ; multiply by +- 1.

Ltan_fma3_ext_piby4_zero:
    ; restore volatile registers
    AVXRestoreXmm xmm7, save_xmm7
    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret     0

    ALIGN 16
Ltan_fma3_absx_gt_pio4:                 ;;; come here if |x| >= pi/4 (or NaN)
    cmp     r9, L_inf_mask_64
    jae     Ltan_fma3_naninf
;Ltan_fma3_range_reduce:
    vmovapd [store_input + rsp], xmm0   ; save copy of x
    vmovq   xmm0, r9                    ; xmm0l <-- |x|
    ; Bit-pattern compare of |x| (bit 63 is clear), so the unsigned
    ; condition is used, matching the Inf/NaN test above.
    cmp     r9, L_moderate_arg_bdl
    jae     Ltan_fma3_remainder_piby2   ; go elsewhere if |x| >= 2e7
    ; Note that __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
    ; have calling conventions that differ from the C routine
    ; on input
    ;   |x| is in xmm0
    ; on output
    ;   z is in xmm0
    ;   zz is in xmm1
    ;   where z + zz = arg reduced |x| and zz is small compared to z
    ;   region of |x| is in rax

Ltan_fma3_remainder_piby2_small:
    ; Boldo-Daumas-Li reduction for reasonably small |x|
    call    __remainder_piby2_fma3_bdl

Ltan_fma3_full_computation:
    ; we have done argument reduction; recip and xx may be nonzero
    ; x is in xmm0, xx is in xmm1
    ; recip is region & 1, and region is in rax.
    vmovsd  xmm5, L_piby4_lead
    vmovsd  xmm6, L_piby4_tail
    vxorpd  xmm7, xmm7, xmm7            ; transform <-- 0
    vcomisd xmm0, L_point_68
    jbe     Ltan_fma3_full_x_le_point_68

Ltan_fma3_full_x_gt_point_68:
    vmovsd  xmm7, L_one                 ; xmm7 <-- transform = 1.0
    vsubsd  xmm0, xmm5, xmm0            ; xmm0 <-- x = piby4_lead - x
    vsubsd  xmm2, xmm6, xmm1            ; xmm2 <-- xl = piby4_tail - xx
    vaddsd  xmm0, xmm0, xmm2            ; xmm0 <-- x = x + xl
    vxorps  xmm1, xmm1, xmm1            ; xmm1 <-- xx = 0
    jmp     Ltan_fma3_compute_Remez

    ALIGN 16
Ltan_fma3_full_x_le_point_68:
    vcomisd xmm0, L_n_point_68
    jae     Ltan_fma3_compute_Remez

Ltan_fma3_full_x_lt_neg_point_68:
    vmovsd  xmm7, L_n_one               ; xmm7 <-- transform = -1.0
    vaddsd  xmm0, xmm5, xmm0            ; x = piby4_lead + x
    vaddsd  xmm2, xmm6, xmm1            ; xmm2 <-- xl = piby4_tail + xx
    vaddsd  xmm0, xmm0, xmm2            ; xmm0 <-- x = x + xl
    vxorps  xmm1, xmm1, xmm1            ; xmm1 <-- xx = 0

Ltan_fma3_compute_Remez:
    vmulsd  xmm2, xmm0, xmm0            ; xmm2 <-- x*x
    vmulsd  xmm5, xmm1, xmm0            ; xmm5 <-- x*xx
    vfmadd132sd xmm5, xmm2, L_two       ; xmm5 <-- r = x*x + 2.*x*xx
    vmovsd  xmm2, L_tan_p4
    vfmadd213sd xmm2, xmm5, L_tan_p2    ; xmm2 <-- p4*r+p2
    vfmadd213sd xmm2, xmm5, L_tan_p0    ; xmm2 <-- p = (p4*r+p2)*r+p0
    vmovsd  xmm4, L_tan_q6
    vfmadd213sd xmm4, xmm5, L_tan_q4    ; xmm4 <-- q6*r+q4
    vfmadd213sd xmm4, xmm5, L_tan_q2    ; xmm4 <-- (q6*r+q4)*r+q2
    vfmadd213sd xmm4, xmm5, L_tan_q0    ; xmm4 <-- q = ((q6*r+q4)*r+q2)*r+q0
    vdivsd  xmm2, xmm2, xmm4            ; xmm2 <-- p/q
    vmulsd  xmm2, xmm2, xmm5            ; xmm2 <-- r*p/q
    vfmadd213sd xmm2, xmm0, xmm1        ; xmm2 <-- t2 = xx + x*r*(p/q)
    vaddsd  xmm1, xmm0, xmm2            ; xmm1 <-- t = (t1=x) + t2
    ; If |x| > .68 we transformed, and t is an approximation of
    ;   tan(pi/4 +- (x+xx))
    ; otherwise, t is just tan(x+xx)
    vxorpd  xmm6, xmm6, xmm6
    vcomisd xmm7, xmm6                  ; did we transform? (|x| > .68) ?
    jz      Ltan_fma3_if_recip_set      ; if not, go check recip

Ltan_fma3_if_transfor_set:
    ; Because we transformed x+xx, we have to transform t before returning
    ; let transform be 1 for x > .68, -1 for x < -.68, then we return
    ;   transform * (recip ? (2.*t/(t-1.) - 1.) : (1. - 2.*t/(1.+t)))
    vaddsd  xmm6, xmm1, xmm1            ; xmm6 <-- 2.*t
    vmovsd  xmm4, L_one
    vaddsd  xmm2, xmm1, xmm4            ; xmm2 <-- t+1
    vsubsd  xmm5, xmm1, xmm4            ; xmm5 <-- t-1
    bt      rax, 0                      ; CF <-- recip = region & 1
    jc      Ltan_fma3_transform_and_recip_set
    ; here recip is not set
    vaddsd  xmm2, xmm1, xmm4            ; xmm2 <-- t+1
    vdivsd  xmm2, xmm1, xmm2            ; xmm2 <-- t/(t+1)
    vfnmadd132sd xmm2, xmm4, L_two      ; xmm2 <-- 1 - 2*t/(t+1)
    vmulsd  xmm1, xmm2, xmm7            ; xmm1 <-- transform*(1 - 2*t/(t+1))
    jmp     Ltan_fma3_exit_piby4

    ALIGN 16
Ltan_fma3_transform_and_recip_set:
    ; here recip is set
    vsubsd  xmm2, xmm1, xmm4            ; xmm2 <-- t-1
    vdivsd  xmm2, xmm1, xmm2            ; xmm2 <-- t/(t-1)
    vfmsub132sd xmm2, xmm4, L_two       ; xmm2 <-- 2*t/(t-1) - 1
    vmulsd  xmm1, xmm2, xmm7            ; xmm1 <-- transform*(2*t/(t-1) - 1)
    jmp     Ltan_fma3_exit_piby4

    ALIGN 16
Ltan_fma3_if_recip_set:
    ; Here we did not transform x and xx, but if we are in an odd quadrant
    ; we will need to return -1./(t1+t2), computed accurately
    ; (t=t1 is in xmm1, t2 is in xmm2)
    bt      rax, 0
    jnc     Ltan_fma3_exit_piby4
    vandpd  xmm7, xmm1, L_half_mask     ; xmm7 <-- z1 = high bits of t
    vsubsd  xmm4, xmm7, xmm0            ; xmm4 <-- z1 - t1
    vsubsd  xmm4, xmm2, xmm4            ; xmm4 <-- z2 = t2 - (z1-t1)
    vmovsd  xmm2, L_n_one
    vdivsd  xmm2, xmm2, xmm1            ; xmm2 <-- trec = -1./t
    vandpd  xmm5, xmm2, L_half_mask     ; xmm5 <-- trec_top=high bits of trec
    vfmadd213sd xmm7, xmm5, L_one       ; xmm7 <-- trec_top*z1 + 1.
    vfmadd231sd xmm7, xmm4, xmm5        ; xmm7 <-- z2*trec_top + (trec_top*z1 + 1.)
    vfmadd213sd xmm7, xmm2, xmm5        ; xmm7 <-- u = trec_top + trec*(z2*trec_top + (trec_top*z1+1.))
    vmovapd xmm1, xmm7                  ; xmm1 <-- u

Ltan_fma3_exit_piby4:
    vmovapd xmm0, xmm1                  ; xmm0 <-- t, u, or v, as needed
    vmovapd xmm1, [store_input + rsp]   ; xmm1 <-- original x
    vandpd  xmm1, xmm1, L_signbit       ; xmm1 <-- sign bit of x
    vxorpd  xmm0, xmm0, xmm1            ; tan(-x) = -tan(x)
    ; restore volatile registers
    AVXRestoreXmm xmm7, save_xmm7
    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

    ALIGN 16
Ltan_fma3_remainder_piby2:
    ; argument reduction for general x
    call    __remainder_piby2_fma3
    jmp     Ltan_fma3_full_computation

Ltan_fma3_naninf:                       ; here argument is +-Inf or NaN. Special case.
    call    fname_special
    AVXRestoreXmm xmm7, save_xmm7
    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret
fname endp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.const
; Stack frame layout for _tan_piby4.  The two equates are consumed by the
; StackAllocate / SaveXmm / RestoreXmm / StackDeallocate macros below
; (macro definitions are outside this chunk -- presumably fm.inc; confirm
; there how the offsets are applied).
tan_piby4_save_xmm6 EQU 030h
tan_piby4_stack_size EQU 048h
.code
ALIGN 16
;----------------------------------------------------------------------
; _tan_piby4 -- private helper, scalar SSE2 code.
;
; Computes tan(x+xx) for x+xx in [-pi/4,pi/4], or -1/tan(x+xx) when
; recip is non-zero.
;
; In:    xmm0 = x   (leading part of the argument)
;        xmm1 = xx  (tail of the argument)
;        r8d  = recip flag
; Out:   xmm0 = result
; Uses:  rax, rcx, xmm0-xmm5 as scratch; xmm6 is saved in the prologue
;        and restored before every ret.
;
; Register roles in the body:
;        eax  = transform (0, +1 or -1)
;        xmm6 = x (possibly transformed), later t1
;        xmm3 = xx (forced to 0 when a transform is applied)
;        xmm5 = t2, later t = t1 + t2
;----------------------------------------------------------------------
_tan_piby4 PROC PRIVATE FRAME
StackAllocate tan_piby4_stack_size
SaveXmm xmm6, tan_piby4_save_xmm6
.ENDPROLOG
; Compute tangent for x+xx in [-pi/4,pi/4].
; xmm0 has x
; xmm1 has xx
; r8d has recip. If recip is true, return -1/tan(x+xx) else tan(x+xx)
xor eax, eax ; eax <-- transform = 0
comisd xmm0, QWORD PTR L_point_68 ; flags <-- compare x with .68 (used by jbe below)
movaps xmm3, xmm1 ; xmm3 <-- xx (movaps leaves the flags intact)
movaps xmm6, xmm0 ; xmm6 <-- x
jbe Ltan_piby4_x_le_point_68
; Here x > .68, so we transform x using the identity
; tan(pi/4-x) = (1-tan(x))/(1+tan(x))
movsd xmm2, QWORD PTR L_piby4_lead
mov eax, 1 ; eax <-- transform = 1
subsd xmm2, xmm0 ; xmm2 <-- x = piby4_lead - x
movsd xmm0, QWORD PTR L_piby4_tail
subsd xmm0, xmm1 ; xmm0 <-- xl = piby4_tail - xx
movaps xmm6, xmm2
addsd xmm6, xmm0 ; xmm6 <-- x = x + xl
xorps xmm3,xmm3 ; xmm3 <-- xx = 0.
jmp Ltan_piby4_do_remez
Ltan_piby4_x_le_point_68:
; else if (x < -0.68)
movsd xmm0, QWORD PTR L_n_point_68
comisd xmm0, xmm6
jbe Ltan_piby4_do_remez ; jump if x >= -.68
; Here x < -.68, so we transform x using the identity
; tan(x-pi/4) = (tan(x)-1)/(tan(x)+1)
addsd xmm6, QWORD PTR L_piby4_lead ; xmm6 <-- x = piby4_lead + x
addsd xmm3, QWORD PTR L_piby4_tail ; xmm3 <-- xl = piby4_tail + xx
or eax, -1 ; eax <-- transform = -1
addsd xmm6, xmm3 ; xmm6 <-- x = x + xl
xorps xmm3, xmm3 ; xmm3 <-- xx = 0
Ltan_piby4_do_remez:
; Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
movaps xmm0, xmm6
movaps xmm2, xmm6; xmm0 = xmm2 = x
; First form r = x*x + 2*x*xx
; (xx is zero here whenever a transform was applied above).
mulsd xmm0, xmm6 ; xmm0 <-- x*x
addsd xmm2, xmm2 ; xmm2 <-- 2*x
mulsd xmm2, xmm3 ; xmm2 <-- 2*x*xx
addsd xmm2, xmm0 ; xmm2 <-- r = x*x + 2*x*xx
; Magic Remez approximation:
;   t2 = xx + x*r*(p0 + (p2 + p4*r)*r) / (q0 + (q2 + (q4 + q6*r)*r)*r)
; Numerator is accumulated in xmm5, denominator in xmm0.
movaps xmm0, xmm2
movaps xmm5, xmm2
movaps xmm1, xmm2
mulsd xmm5, QWORD PTR L_tan_p4 ; xmm5 <-- p4*r
mulsd xmm1, QWORD PTR L_tan_q6 ; xmm1 <-- q6*r
mulsd xmm0, xmm6 ; xmm0 <-- x*r
addsd xmm5, QWORD PTR L_tan_p2 ; xmm5 <-- p2 + p4*r
mulsd xmm5, xmm2
addsd xmm5, QWORD PTR L_tan_p0 ; xmm5 <-- p0 + (p2 + p4*r)*r
mulsd xmm5, xmm0 ; xmm5 <-- x*r*(numerator polynomial)
movsd xmm0, QWORD PTR L_tan_q4
addsd xmm0, xmm1 ; xmm0 <-- q4 + q6*r
mulsd xmm0, xmm2
addsd xmm0, QWORD PTR L_tan_q2 ; xmm0 <-- q2 + (q4 + q6*r)*r
mulsd xmm0, xmm2
addsd xmm0, QWORD PTR L_tan_q0 ; xmm0 <-- full denominator polynomial
divsd xmm5, xmm0
addsd xmm5, xmm3 ; xmm5 <-- t2
; Reconstruct the result; t1 is x (xmm6), t2 is in xmm5.
test eax, eax
je Ltan_piby4_transform_false
addsd xmm5, xmm6 ; xmm5 <-- t = t1 + t2 = x + t2
test r8d, r8d
je Ltan_piby4_transform_true_recip_false
; Here transform and recip are both true.
; return transform*(2*t/(t-1) - 1.0);
movaps xmm0, xmm5
subsd xmm5, QWORD PTR L_one ; xmm5 <-- t - 1
movd xmm1, eax ; xmm1 <-- transform as a 32-bit integer
addsd xmm0, xmm0 ; xmm0 <-- 2*t
divsd xmm0, xmm5 ; xmm0 <-- 2*t/(t-1)
cvtdq2pd xmm1, xmm1 ; xmm1 <-- (double)transform
subsd xmm0, QWORD PTR L_one
mulsd xmm0, xmm1
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
Ltan_piby4_transform_true_recip_false:
; Here return transform*(1.0 - 2*t/(1+t));
movsd xmm0, QWORD PTR L_one
movaps xmm1, xmm5
addsd xmm5, xmm0 ; xmm5 <-- 1 + t
addsd xmm1, xmm1 ; xmm1 <-- 2*t
divsd xmm1, xmm5 ; xmm1 <-- 2*t/(1+t)
subsd xmm0, xmm1 ; xmm0 <-- 1.0 - 2*t/(1+t)
movd xmm1, eax
cvtdq2pd xmm1, xmm1 ; xmm1 <-- (double)transform
mulsd xmm0, xmm1
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
Ltan_piby4_transform_false:
test r8d, r8d
je Ltan_piby4_atransform_false_recip_false
; Here transform is false but recip is true
; We return an accurate computation of -1.0/(t1 + t2).
; z1/trec_top are t/trec with the low 32 bits of the bit pattern cleared;
; the integer AND is interleaved with the divide.
movsd xmm4, QWORD PTR L_n_one
movaps xmm0, xmm5
mov rcx, -4294967296 ; ffffffff00000000H
addsd xmm0, xmm6 ; xmm0 <-- t = t1 + t2
movd rax, xmm0 ; really movq
divsd xmm4, xmm0 ; xmm4 <-- trec = -1.0/t
and rax, rcx
movd xmm3, rax ; really movq ; xmm3 <-- z1
movaps xmm1, xmm3
subsd xmm1, xmm6 ; xmm1 <-- z1 - t1
movd rax, xmm4 ; really movq
subsd xmm5, xmm1 ; xmm5 <-- z2 = t2 - (z1 - t1)
and rax, rcx
movd xmm2, rax ; really movq ; xmm2 <-- trec_top
; return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2);
movaps xmm0, xmm2
mulsd xmm5, xmm2 ; xmm5 <-- trec_top * z2
mulsd xmm0, xmm3 ; xmm0 <-- trec_top * z1
addsd xmm0, QWORD PTR L_one
addsd xmm0, xmm5
mulsd xmm0, xmm4
addsd xmm0, xmm2
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
Ltan_piby4_atransform_false_recip_false:
; Here both transform and recip are false; we just return t1 + t2
addsd xmm5, xmm6
movaps xmm0, xmm5
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
_tan_piby4 endp
END

View File

@@ -0,0 +1,242 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_NAN_WITH_FLAGS
#define USE_VAL_WITH_FLAGS
#define USE_HANDLE_ERROR
#include "libm_inlines.h"
#undef USE_NAN_WITH_FLAGS
#undef USE_VAL_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
/* tan(x + xx) approximation valid on the interval [-pi/4,pi/4].
   If recip is true return -1/tan(x + xx) instead.

   x     leading part of the reduced argument
   xx    tail (low-order correction) of the reduced argument
   recip non-zero to return -1/tan(x + xx), zero to return tan(x + xx)

   Bit patterns are held in unsigned long long: plain unsigned long is only
   32 bits wide on LLP64 targets such as MSVC, which would truncate both the
   double's bit pattern and the 0xffffffff00000000 mask used below. */
static inline double tan_piby4(double x, double xx, int recip)
{
    double r, t1, t2, xl;
    int transform = 0;
    static const double
        piby4_lead = 7.85398163397448278999e-01, /* 0x3fe921fb54442d18 */
        piby4_tail = 3.06161699786838240164e-17; /* 0x3c81a62633145c06 */

    /* In order to maintain relative precision transform using the identity:
       tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
       Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. */
    if (x > 0.68)
    {
        transform = 1;
        x = piby4_lead - x;
        xl = piby4_tail - xx;
        x += xl;
        xx = 0.0;
    }
    else if (x < -0.68)
    {
        transform = -1;
        x = piby4_lead + x;
        xl = piby4_tail + xx;
        x += xl;
        xx = 0.0;
    }

    /* Core Remez [2,3] approximation to tan(x+xx) on the
       interval [0,0.68]. */
    r = x*x + 2.0 * x * xx;
    t1 = x;
    t2 = xx + x*r*
        (0.372379159759792203640806338901e0 +
         (-0.229345080057565662883358588111e-1 +
          0.224044448537022097264602535574e-3*r)*r)/
        (0.111713747927937668539901657944e1 +
         (-0.515658515729031149329237816945e0 +
          (0.260656620398645407524064091208e-1 -
           0.232371494088563558304549252913e-3*r)*r)*r);

    /* Reconstruct tan(x) in the transformed case. */
    if (transform)
    {
        double t;
        t = t1 + t2;
        if (recip)
            return transform*(2*t/(t-1) - 1.0);
        else
            return transform*(1.0 - 2*t/(1+t));
    }

    if (recip)
    {
        /* Compute -1.0/(t1 + t2) accurately.  z1 and trec_top are t and
           trec with the low 32 bits of their bit patterns cleared; z2
           carries what z1 dropped. */
        double trec, trec_top, z1, z2, t;
        unsigned long long u;
        t = t1 + t2;
        GET_BITS_DP64(t, u);
        u &= 0xffffffff00000000;
        PUT_BITS_DP64(u, z1);
        z2 = t2 - (z1 - t1);
        trec = -1.0 / t;
        GET_BITS_DP64(trec, u);
        u &= 0xffffffff00000000;
        PUT_BITS_DP64(u, trec_top);
        return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2);
    }
    else
        return t1 + t2;
}
#pragma function(tan)
/* double tan(double x)

   Computes tan(x).
     |x| <= pi/4  : evaluated directly (x itself, x + x^3/3, or tan_piby4).
     NaN / Inf    : routed through _handle_error.
     |x| < 5.0e5  : reduced by subtracting a multiple of pi/2 in up to three
                    extra-precision steps.
     otherwise    : reduced by __remainder_piby2.
   Odd regions (region & 1) use the -1/tan form; the sign of x is restored
   at the end because tan is odd.

   Bit patterns and exponent fields are held in unsigned long long: plain
   unsigned long is only 32 bits wide on LLP64 targets such as MSVC, which
   would truncate the value and break every 64-bit comparison below. */
double tan(double x)
{
    double r, rr;
    int region, xneg;
    unsigned long long ux, ax;

    GET_BITS_DP64(x, ux);
    ax = (ux & ~SIGNBIT_DP64);

    if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
    {
        if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
        {
            if (ax < 0x3e40000000000000) /* abs(x) < 2.0^(-27) */
            {
                if (ax == 0x0000000000000000) return x;
                else return val_with_flags(x, AMD_F_INEXACT);
            }
            else
            {
                /* Using a temporary variable prevents 64-bit VC++ from
                   rearranging
                     x + x*x*x*0.333333333333333333;
                   into
                     x * (1 + x*x*0.333333333333333333);
                   The latter results in an incorrectly rounded answer. */
                double tmp;
                tmp = x*x*x*0.333333333333333333;
                return x + tmp;
            }
        }
        else
            return tan_piby4(x, 0.0, 0);
    }
    else if ((ux & EXPBITS_DP64) == EXPBITS_DP64)
    {
        /* x is either NaN or infinity */
        if (ux & MANTBITS_DP64)
            /* x is NaN */
            return _handle_error("tan", OP_TAN, ux|0x0008000000000000, _DOMAIN, 0,
                                 EDOM, x, 0.0, 1);
        else
            /* x is infinity. Return a NaN */
            return _handle_error("tan", OP_TAN, INDEFBITPATT_DP64, _DOMAIN, AMD_F_INVALID,
                                 EDOM, x, 0.0, 1);
    }

    /* Work on |x|; the sign is re-applied when returning. */
    xneg = (ax != ux);
    if (xneg)
        x = -x;

    if (x < 5.0e5)
    {
        /* For these size arguments we can just carefully subtract the
           appropriate multiple of pi/2, using extra precision where
           x is close to an exact multiple of pi/2 */
        static const double
            twobypi = 6.36619772367581382433e-01, /* 0x3fe45f306dc9c883 */
            piby2_1 = 1.57079632673412561417e+00, /* 0x3ff921fb54400000 */
            piby2_1tail = 6.07710050650619224932e-11, /* 0x3dd0b4611a626331 */
            piby2_2 = 6.07710050630396597660e-11, /* 0x3dd0b4611a600000 */
            piby2_2tail = 2.02226624879595063154e-21, /* 0x3ba3198a2e037073 */
            piby2_3 = 2.02226624871116645580e-21, /* 0x3ba3198a2e000000 */
            piby2_3tail = 8.47842766036889956997e-32; /* 0x397b839a252049c1 */
        double t, rhead, rtail;
        int npi2;
        unsigned long long uy, xexp, expdiff;

        xexp = ax >> EXPSHIFTBITS_DP64;

        /* How many pi/2 is x a multiple of? */
        if (ax <= 0x400f6a7a2955385e) /* 5pi/4 */
        {
            if (ax <= 0x4002d97c7f3321d2) /* 3pi/4 */
                npi2 = 1;
            else
                npi2 = 2;
        }
        else if (ax <= 0x401c463abeccb2bb) /* 9pi/4 */
        {
            if (ax <= 0x4015fdbbe9bba775) /* 7pi/4 */
                npi2 = 3;
            else
                npi2 = 4;
        }
        else
            npi2 = (int)(x * twobypi + 0.5);

        /* Subtract the multiple from x to get an extra-precision remainder */
        rhead = x - npi2 * piby2_1;
        rtail = npi2 * piby2_1tail;
        GET_BITS_DP64(rhead, uy);
        expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
        if (expdiff > 15)
        {
            /* The remainder is pretty small compared with x, which
               implies that x is a near multiple of pi/2
               (x matches the multiple to at least 15 bits) */
            t = rhead;
            rtail = npi2 * piby2_2;
            rhead = t - rtail;
            rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
            if (expdiff > 48)
            {
                /* x matches a pi/2 multiple to at least 48 bits */
                t = rhead;
                rtail = npi2 * piby2_3;
                rhead = t - rtail;
                rtail = npi2 * piby2_3tail - ((t - rhead) - rtail);
            }
        }
        /* r is the leading part of the remainder, rr its tail. */
        r = rhead - rtail;
        rr = (rhead - r) - rtail;
        region = npi2 & 3;
    }
    else
    {
        /* Reduce x into range [-pi/4,pi/4] */
        __remainder_piby2(x, &r, &rr, &region);
    }

    if (xneg)
        return -tan_piby4(r, rr, region & 1);
    else
        return tan_piby4(r, rr, region & 1);
}

View File

@@ -0,0 +1,551 @@
;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the tanf function using the fma3 instruction.
;
; Prototype:
;
; float tanf(float x);
;
; Computes tanf(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
.const
ALIGN 16
; ---- 16-byte constants (used as full XMM memory operands) ----
; Clears the sign bit of a double when ANDed (i.e. yields |x|).
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
DQ 07FFFFFFFFFFFFFFFh
; 2/pi as a double, replicated in both lanes.
L_twobypi DQ 03FE45F306DC9C883h
DQ 03FE45F306DC9C883h
; Integer 3: masks the quadrant count down to region = npi2 & 3.
L_int_three DQ 00000000000000003h
DQ 00000000000000003h
; Integer 1: masks region down to region & 1.
L_int_one DQ 00000000000000001h
DQ 00000000000000001h
; Sign bit of a double; used to copy the sign of the input onto the result.
L_signbit DQ 08000000000000000h
DQ 08000000000000000h
; ---- scalar constants ----
; Coefficients of the rational approximation evaluated in the code as
;   tan(x) ~= x + x^3 * (c0 + c1*x^2) / (d0 + d1*x^2 + d2*x^4)
; Accessed as L_tanf+000h .. L_tanf+020h.
L_tanf DQ 03FD8A8B0DA56CB17h ; c0
DQ 0BF919DBA6EFD6AADh ; c1
DQ 03FF27E84A3E73A2Eh ; d0
DQ 0BFE07266D7B3511Bh ; d1
DQ 03F92E29003C692D9h ; d2
; |x| at or above these thresholds takes the "reduce_large" path
; (one threshold per code path).
L_large_x_sse2 DQ 04160000000000000h ; 8388608.
L_large_x_fma3 DQ 041E921FB40000000h ; 3.373259264e9
L_point_333 DQ 03FD5555555555555h ; 1/3
L_mask_3e4 DQ 03e40000000000000h ; 2^-27, compared against the bits of |x|
L_mask_3f2 DQ 03f20000000000000h ; 2^-13, compared against the bits of |x|
L_point_five DQ 03FE0000000000000h ; 0.5
L_piby2_1 DQ 03FF921FB54400000h ; leading part of pi/2
L_piby2_1tail DQ 03DD0B4611A626331h ; tail that goes with L_piby2_1
L_piby2_lead DQ 03ff921fb54442d18h ; pi/2 rounded to double
L_n_one DQ 0BFF0000000000000h ; -1.0
L_piby4 DQ 03fe921fb54442d18h ; pi/4, compared against the bits of |x|
L_min_norm DQ 00010000000000000h ; smallest normal double; added to -1.0 to raise inexact
; Exponent field of a single-precision float (all ones => Inf or NaN).
L_inf_mask_32 DD 07F800000h
DD 07F800000h
; Non-zero selects the FMA3 code path at run time (tested at function entry).
EXTRN __use_fma3_lib:DWORD
; Byte table of the bits of 2/pi, used for large-argument reduction.
EXTRN __L_2_by_pi_bits:BYTE
fname TEXTEQU <tanf>
fname_special TEXTEQU <_tanf_special>
; define local variable storage offsets
; actually there aren't any, but we need to leave room for _tanf_special.
dummy_space EQU 20h
stack_size EQU 38h
; fm.inc is expected to supply the StackAllocate/StackDeallocate macros
; used by the procedure below (not visible here -- confirm).
include fm.inc
;Define name and any external functions being called
EXTERN fname_special : PROC
.code
PUBLIC fname
;----------------------------------------------------------------------
; float tanf(float x)
;
; In:    xmm0 = x (single precision)
; Out:   xmm0 = tanf(x) (single precision)
; Uses:  rax, rcx, rdx, r8-r11, xmm0-xmm5 as scratch.
;
; Two implementations live in this procedure; __use_fma3_lib selects one
; at run time:
;   Ltanf_sse2 -- must use legacy-SSE encodings ONLY (no VEX-encoded
;                 v* instructions), because it is the path taken when the
;                 FMA3/AVX code is not enabled.
;   Ltanf_fma3 -- VEX/FMA3 encodings.
;
; Both follow the same plan, working in double precision:
;   1. Inf/NaN            -> fname_special
;   2. |x| <= pi/4        -> x, x + x^3/3, or the rational approximation
;   3. pi/4 < |x| < large -> subtract a multiple of pi/2 (one step)
;   4. |x| >= large       -> multiply the mantissa by the bits of 2/pi
;   5. evaluate tan on the remainder, take -1/t in odd regions,
;      copy the sign of x onto the result, convert to float.
;----------------------------------------------------------------------
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Ltanf_fma3
;======================================================================
; SSE2 path -- legacy SSE encodings only.
;======================================================================
Ltanf_sse2:
movd eax,xmm0
mov r8d,L_inf_mask_32
and eax,r8d
cmp eax, r8d ; exponent all ones?
jz Ltanf_sse2_naninf
cvtss2sd xmm5,xmm0 ; xmm5 <-- x as double (kept for its sign)
movd r9,xmm5
btr r9,63 ; r9 <-- |x|
cmp r9,L_piby4
jg Ltanf_sse2_range_reduce
cmp r9,L_mask_3f2 ; compare to 2^-13 = 0.0001220703125
jge Ltanf_sse2_compute_tanf_piby_4
cmp r9,L_mask_3e4 ; compare to 2^-27 = 7.4505805969238281e-009
jge Ltanf_sse2_compute_x_xxx_0_333
; At this point tan(x) ~= x; if it's not exact, set the inexact flag.
test r9, r9
je Ltanf_sse2_exact_return
movsd xmm1, L_n_one
addsd xmm1, L_min_norm ; set inexact
Ltanf_sse2_exact_return:
; xmm0 still holds the original float x, which is the result.
StackDeallocate stack_size
ret
ALIGN 16
Ltanf_sse2_compute_x_xxx_0_333:
movapd xmm2,xmm5
mulsd xmm2,xmm2 ; xmm2 <-- x^2
movapd xmm0,xmm2
mulsd xmm0,xmm5 ; xmm0 <-- x^3
mulsd xmm0,L_point_333
addsd xmm0,xmm5 ; x + x*x*x*0.3333333333333333;
jmp Ltanf_sse2_return_s
ALIGN 16
Ltanf_sse2_compute_tanf_piby_4:
movapd xmm0,xmm5 ; xmm0 <-- x (as double)
movapd xmm1,xmm0
mulsd xmm1,xmm0 ; xmm1 <-- x*x
movsd xmm3,L_tanf+008h ; xmm3 <-- c1
mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2
addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0
movsd xmm2,L_tanf+020h ; xmm2 <-- d2
mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2
addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2
addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
divsd xmm3,xmm2 ; xmm3 <-- c/d
mulsd xmm1,xmm0 ; xmm1 <-- x^3
mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d
addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d
jmp Ltanf_sse2_return_s
Ltanf_sse2_range_reduce:
movd xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_large_x_sse2
jge Ltanf_sse2_tanf_reduce_large
Ltanf_sse2_tanf_reduce_moderate:
movapd xmm1,xmm0
andpd xmm1,L_sign_mask
movapd xmm2,L_twobypi
mulsd xmm2,xmm1
addsd xmm2,L_point_five ; xmm2 <-- |x|*2/pi + 0.5
cvttpd2dq xmm4,xmm2 ; xmm4 <-- npi2 (truncated to integer)
cvtdq2pd xmm1,xmm4 ; xmm1 <-- (double)npi2
andpd xmm4,L_int_three ; xmm4 <-- region
movapd xmm2,xmm0
movapd xmm3,xmm1
mulsd xmm1,L_piby2_1
subsd xmm2,xmm1 ; xmm2 <-- rhead = |x| - npi2*piby2_1
mulsd xmm3,L_piby2_1tail ; xmm3 rtail
movapd xmm0,xmm2
subsd xmm0,xmm3 ; xmm0 <-- r = rhead - rtail
subsd xmm2,xmm0
movapd xmm1,xmm2
subsd xmm1,xmm3 ; xmm1 <-- (rhead - r) - rtail (overwritten at exit_s)
jmp Ltanf_sse2_exit_s
Ltanf_sse2_tanf_reduce_large:
lea r9,__L_2_by_pi_bits
;xexp = (x >> 52) - 1023
movd r11,xmm0
mov rcx,r11
shr r11,52
sub r11,1023 ; r11 <-- xexp = exponent of input x
;calculate the last byte from which to start multiplication
;last = 134 - (xexp >> 3)
mov r10,r11
shr r10,3
sub r10,134 ; r10 <-- -last
neg r10 ; r10 <-- last
;load 64 bits of 2_by_pi
mov rax,[r9+r10]
;mantissa of x = ((x << 12) >> 12) | implied bit
shl rcx,12
shr rcx,12 ; rcx <-- mantissa part of input x
bts rcx,52 ; add the implied bit as well
;load next 128 bits of 2_by_pi
add r10,8 ; increment to next 8 bytes of 2_by_pi
movdqu xmm0,[r9+r10]
;do three 64bit multiplications with mant of x
mul rcx
mov r8,rax ; r8 = last 64 bits of mul = res1[2]
mov r10,rdx ; r10 = carry
; NOTE: this must be the legacy-SSE form (assembled as movq r64,xmm, the
; same as the "movd rax,xmm0" a few lines below).  A VEX-encoded vmovq
; here would raise #UD on CPUs/OSes without AVX, which is exactly where
; the SSE2 path runs.
movd rax,xmm0 ; really movq
mul rcx
;resexp = xexp & 7
and r11,7 ; r11 <-- resexp = last 3 bits of xexp
psrldq xmm0,8
add rax,r10 ; add the previous carry
adc rdx,0
mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]
mov r10,rdx ; r10 <-- carry
movd rax,xmm0 ; really movq
mul rcx
add r10,rax ;r10 = most sig 64 bits = res1[0]
;find the region
;last three bits ltb = most sig bits >> (54 - resexp))
; decimal point in last 18 bits == 8 lsb's in first 64 bits
; and 8 msb's in next 64 bits
;point_five = ltb & 01h;
;region = ((ltb >> 1) + point_five) & 3;
mov rcx,54
mov rax,r10
sub rcx,r11
xor rdx,rdx ;rdx = sign of the reduced argument (0 = positive)
shr rax,cl ; CF <-- last bit shifted out (the "point five" bit)
jnc Ltanf_sse2_no_point_five_f
;;if there is carry.. then negate the result of multiplication
; (not/mov leave CF untouched, so the adc below still sees it)
not r10
not r9
not r8
mov rdx,08000000000000000h
ALIGN 16
Ltanf_sse2_no_point_five_f:
adc rax,0 ; round the quadrant count up when CF is set
and rax,3
movd xmm4,eax ; xmm4 <-- region
;calculate the number of integer bits and zero them out
mov rcx,r11
add rcx,10 ; rcx = no. of integer bits
shl r10,cl
shr r10,cl ; r10 contains only mant bits
sub rcx,64 ; form the exponent
mov r11,rcx
;find the highest set bit
bsr rcx,r10
jnz Ltanf_sse2_form_mantissa_f
; top word was zero: shift the three result words up by 64 bits
mov r10,r9
mov r9,r8
mov r8,0
bsr rcx,r10 ;rcx = hsb
sub r11,64
ALIGN 16
Ltanf_sse2_form_mantissa_f:
add r11,rcx ; for exp of x
sub rcx,52 ; rcx = no. of bits to shift in r10
cmp rcx,0
jl Ltanf_sse2_hsb_below_52_f
je Ltanf_sse2_form_numbers_f
;hsb above 52
mov r8,r10
shr r10,cl ; r10 = mantissa of x with hsb at 52
shr r9,cl ; make space for bits from r10
sub rcx,64
neg rcx ; rcx = no of bits to shift r10
shl r8,cl
or r9,r8 ; r9 = mantissa bits of xx
jmp Ltanf_sse2_form_numbers_f
ALIGN 16
Ltanf_sse2_hsb_below_52_f:
neg rcx
mov rax,r9
shl r10,cl
shl r9,cl
sub rcx,64
neg rcx
shr rax,cl
or r10,rax
shr r8,cl
or r9,r8
ALIGN 16
Ltanf_sse2_form_numbers_f:
add r11,1023 ; bias the exponent
btr r10,52 ; remove the implied bit
mov rcx,r11
or r10,rdx ; put the sign
shl rcx,52
or r10,rcx ; x is in r10
movd xmm0,r10 ; xmm0 <-- x
mulsd xmm0,L_piby2_lead ; scale the fraction of a quadrant by pi/2
Ltanf_sse2_exit_s:
; xmm0 = reduced argument, xmm4 = region, xmm5 = original x (for its sign)
movd eax,xmm4
and eax,1 ; eax <-- region & 1
movapd xmm1,xmm0
mulsd xmm1,xmm0 ; xmm1 <-- x*x
movsd xmm3,L_tanf+008h ; xmm3 <-- c1
mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2
addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0
movsd xmm2,L_tanf+020h ; xmm2 <-- d2
mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2
addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2
addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
divsd xmm3,xmm2 ; xmm3 <-- c/d
mulsd xmm1,xmm0 ; xmm1 <-- x^3
mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d
addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d
cmp eax,01h
jne Ltanf_sse2_exit_tanpiby4
Ltanf_sse2_recip :
; odd region: result is -1/t
movd xmm3,L_n_one
divsd xmm3,xmm0
movsd xmm0,xmm3
Ltanf_sse2_exit_tanpiby4 :
andpd xmm5,L_signbit
xorpd xmm0,xmm5 ; tan(-x) = -tan(x)
Ltanf_sse2_return_s:
cvtsd2ss xmm0,xmm0
Ltanf_sse2_return_c:
StackDeallocate stack_size
ret
Ltanf_sse2_naninf:
call fname_special
StackDeallocate stack_size
ret
ALIGN 16
;======================================================================
; FMA3 path -- VEX-encoded instructions.
;======================================================================
Ltanf_fma3:
vmovd eax,xmm0
mov r8d,L_inf_mask_32
and eax,r8d
cmp eax, r8d ; exponent all ones?
jz Ltanf_fma3_naninf
vcvtss2sd xmm5,xmm0,xmm0 ; xmm5 <-- x as double (kept for its sign)
vmovq r9,xmm5
btr r9,63 ; r9 <-- |x|
cmp r9,L_piby4
jg Ltanf_fma3_range_reduce
cmp r9,L_mask_3f2 ; compare to 2^-13
jge Ltanf_fma3_compute_tanf_piby_4
cmp r9,L_mask_3e4 ; compare to 2^-27
jge Ltanf_fma3_compute_x_xxx_0_333
; tiny |x|: return x unchanged (xmm0 still holds the input float)
jmp Ltanf_fma3_return_c
Ltanf_fma3_compute_x_xxx_0_333:
vmulsd xmm2,xmm5,xmm5 ; xmm2 <-- x^2
vmulsd xmm0,xmm2,xmm5 ; xmm0 <-- x^3
vfmadd132sd xmm0,xmm5,L_point_333 ; x + x*x*x*0.3333333333333333;
jmp Ltanf_fma3_return_s
Ltanf_fma3_compute_tanf_piby_4:
vmovsd xmm0,xmm5,xmm5 ; xmm0 <-- x (as double)
vmulsd xmm1,xmm0,xmm0 ; xmm1 <-- x*x
vmovsd xmm3,L_tanf+008h ; xmm3 <-- c1
vfmadd213sd xmm3,xmm1,L_tanf ; xmm3 <-- c = c1*x^2 + c0
vmovsd xmm2,L_tanf+020h ; xmm2 <-- d2
vfmadd213sd xmm2,xmm1,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
vfmadd213sd xmm2,xmm1,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
vdivsd xmm3,xmm3,xmm2 ; xmm3 <-- c/d
vmulsd xmm1,xmm1,xmm0 ; xmm1 <-- x^3
vfmadd231sd xmm0,xmm1,xmm3 ; xmm0 <-- x + x^3 * c/d
jmp Ltanf_fma3_return_s
Ltanf_fma3_range_reduce:
vmovq xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_large_x_fma3
jge Ltanf_fma3_tanf_reduce_large
Ltanf_fma3_tanf_reduce_moderate:
vandpd xmm1,xmm0,L_sign_mask
vmovapd xmm2,L_twobypi
vfmadd213sd xmm2,xmm1,L_point_five ; xmm2 <-- |x|*2/pi + 0.5
vcvttpd2dq xmm2,xmm2 ; xmm2 <-- npi2 as a 32-bit integer
vpmovsxdq xmm1,xmm2 ; xmm1 <-- npi2 sign-extended to 64 bits
vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region
vshufps xmm1 ,xmm1,xmm1,8 ; low dword of xmm1 stays npi2
vcvtdq2pd xmm1,xmm1 ; xmm1 <-- (double)npi2
vmovdqa xmm2,xmm0
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
vsubsd xmm0,xmm2,xmm3 ; xmm0 <-- r = rhead - rtail
vsubsd xmm2,xmm2,xmm0
vsubsd xmm1,xmm2,xmm3 ; xmm1 <-- (rhead - r) - rtail (overwritten at exit_s)
jmp Ltanf_fma3_exit_s
Ltanf_fma3_tanf_reduce_large:
lea r9,__L_2_by_pi_bits
;xexp = (x >> 52) - 1023
vmovq r11,xmm0
mov rcx,r11
shr r11,52
sub r11,1023 ; r11 <-- xexp = exponent of input x
;calculate the last byte from which to start multiplication
;last = 134 - (xexp >> 3)
mov r10,r11
shr r10,3
sub r10,134 ; r10 <-- -last
neg r10 ; r10 <-- last
;load 64 bits of 2_by_pi
mov rax,[r9+r10]
;mantissa of x = ((x << 12) >> 12) | implied bit
shl rcx,12
shr rcx,12 ; rcx <-- mantissa part of input x
bts rcx,52 ; add the implied bit as well
;load next 128 bits of 2_by_pi
add r10,8 ; increment to next 8 bytes of 2_by_pi
vmovdqu xmm0,XMMWORD PTR[r9+r10]
;do three 64bit multiplications with mant of x
mul rcx
mov r8,rax ; r8 = last 64 bits of mul = res1[2]
mov r10,rdx ; r10 = carry
vmovq rax,xmm0
mul rcx
;resexp = xexp & 7
and r11,7 ; r11 <-- resexp = last 3 bits of xexp
vpsrldq xmm0,xmm0,8
add rax,r10 ; add the previous carry
adc rdx,0
mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]
mov r10,rdx ; r10 <-- carry
vmovq rax,xmm0
mul rcx
add r10,rax ;r10 = most sig 64 bits = res1[0]
;find the region
;last three bits ltb = most sig bits >> (54 - resexp))
; decimal point in last 18 bits == 8 lsb's in first 64 bits
; and 8 msb's in next 64 bits
;point_five = ltb & 01h;
;region = ((ltb >> 1) + point_five) & 3;
mov rcx,54
mov rax,r10
sub rcx,r11
xor rdx,rdx ;rdx = sign of the reduced argument (0 = positive)
shr rax,cl ; CF <-- last bit shifted out (the "point five" bit)
jnc Ltanf_fma3_no_point_five_f
;;if there is carry.. then negate the result of multiplication
; (not/mov leave CF untouched, so the adc below still sees it)
not r10
not r9
not r8
mov rdx,08000000000000000h
ALIGN 16
Ltanf_fma3_no_point_five_f:
adc rax,0 ; round the quadrant count up when CF is set
and rax,3
vmovd xmm4,eax ; xmm4 <-- region
;calculate the number of integer bits and zero them out
mov rcx,r11
add rcx,10 ; rcx = no. of integer bits
shl r10,cl
shr r10,cl ; r10 contains only mant bits
sub rcx,64 ; form the exponent
mov r11,rcx
;find the highest set bit
bsr rcx,r10
jnz Ltanf_fma3_form_mantissa_f
; top word was zero: shift the three result words up by 64 bits
mov r10,r9
mov r9,r8
mov r8,0
bsr rcx,r10 ;rcx = hsb
sub r11,64
ALIGN 16
Ltanf_fma3_form_mantissa_f:
add r11,rcx ; for exp of x
sub rcx,52 ; rcx = no. of bits to shift in r10
cmp rcx,0
jl Ltanf_fma3_hsb_below_52_f
je Ltanf_fma3_form_numbers_f
;hsb above 52
mov r8,r10
shr r10,cl ; r10 = mantissa of x with hsb at 52
shr r9,cl ; make space for bits from r10
sub rcx,64
neg rcx ; rcx = no of bits to shift r10
shl r8,cl
or r9,r8 ; r9 = mantissa bits of xx
jmp Ltanf_fma3_form_numbers_f
ALIGN 16
Ltanf_fma3_hsb_below_52_f:
neg rcx
mov rax,r9
shl r10,cl
shl r9,cl
sub rcx,64
neg rcx
shr rax,cl
or r10,rax
shr r8,cl
or r9,r8
ALIGN 16
Ltanf_fma3_form_numbers_f:
add r11,1023 ; bias the exponent
btr r10,52 ; remove the implied bit
mov rcx,r11
or r10,rdx ; put the sign
shl rcx,52
or r10,rcx ; x is in r10
vmovq xmm0,r10 ; xmm0 <-- x
vmulsd xmm0,xmm0,L_piby2_lead ; scale the fraction of a quadrant by pi/2
Ltanf_fma3_exit_s:
; xmm0 = reduced argument, xmm4 = region, xmm5 = original x (for its sign)
vandpd xmm2,xmm4,XMMWORD PTR L_int_one
vmovd eax,xmm2 ; eax <-- region & 1
vmulsd xmm1,xmm0,xmm0 ; xmm1 <-- x*x
vmovsd xmm3,L_tanf+008h ; xmm3 <-- c1
vfmadd213sd xmm3,xmm1,L_tanf ; xmm3 <-- c = c1*x^2 + c0
vmovsd xmm2,L_tanf+020h ; xmm2 <-- d2
vfmadd213sd xmm2,xmm1,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
vfmadd213sd xmm2,xmm1,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
vdivsd xmm3,xmm3,xmm2 ; xmm3 <-- c/d
vmulsd xmm1,xmm1,xmm0 ; xmm1 <-- x^3
vfmadd231sd xmm0,xmm1,xmm3 ; xmm0 <-- x + x^3 * c/d
cmp eax,01h
je Ltanf_fma3_recip
jmp Ltanf_fma3_exit_tanpiby4
Ltanf_fma3_recip :
; odd region: result is -1/t
vmovq xmm3,L_n_one
vdivsd xmm0,xmm3,xmm0
Ltanf_fma3_exit_tanpiby4 :
vandpd xmm5,xmm5,L_signbit
vxorpd xmm0,xmm0,xmm5 ; tan(-x) = -tan(x)
Ltanf_fma3_return_s:
vcvtsd2ss xmm0,xmm0,xmm0
Ltanf_fma3_return_c:
StackDeallocate stack_size
ret
Ltanf_fma3_naninf:
call fname_special
StackDeallocate stack_size
ret
fname endp
END

View File

@@ -0,0 +1,193 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_REMAINDER_PIBY2F_INLINE
#define USE_VALF_WITH_FLAGS
#define USE_NANF_WITH_FLAGS
#define USE_HANDLE_ERRORF
#include "libm_inlines.h"
#undef USE_VALF_WITH_FLAGS
#undef USE_NANF_WITH_FLAGS
#undef USE_REMAINDER_PIBY2F_INLINE
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(tanf)
/* Evaluate tan(x) for x in [-pi/4,pi/4] with a rational approximation.
   A non-zero recip selects -1/tan(x) instead of tan(x). */
static inline double tanf_piby4(double x, int recip)
{
    /* Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]:
         tan(x) ~= x + x^3 * num(x^2) / den(x^2)                          */
    const double x2 = x*x;
    const double num = 0.385296071263995406715129e0 -
                       0.172032480471481694693109e-1 * x2;
    const double den = 0.115588821434688393452299e+1 +
                       (-0.51396505478854532132342e0 +
                        0.1844239256901656082986661e-1 * x2) * x2;
    /* Same association as x + ((x*x2)*num)/den. */
    const double tanx = x + x*x2*num/den;

    return recip ? -1.0 / tanx : tanx;
}
/* float tanf(float x)

   Computes tanf(x), working internally in double precision.
     |x| <= pi/4  : evaluated directly (x itself, x + x^3/3, or tanf_piby4).
     NaN / Inf    : routed through _handle_errorf.
     |x| < 5.0e5  : reduced by subtracting a multiple of pi/2 in up to three
                    extra-precision steps.
     otherwise    : reduced by __remainder_piby2f_inline.
   Odd regions (region & 1) use the -1/tan form; the sign of x is restored
   at the end because tan is odd.

   Bit patterns and exponent fields are held in unsigned long long: plain
   unsigned long is only 32 bits wide on LLP64 targets such as MSVC, which
   would truncate the value and make "ux >> 63" and every 64-bit comparison
   below meaningless. */
float tanf(float x)
{
    double r, dx;
    int region, xneg;
    unsigned long long ux, ax;

    dx = x;
    GET_BITS_DP64(dx, ux);
    ax = (ux & ~SIGNBIT_DP64);

    if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
    {
        if (ax < 0x3f80000000000000) /* abs(x) < 2.0^(-7) */
        {
            if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
            {
                if (ax == 0x0000000000000000)
                    return x;
                else
                    return valf_with_flags(x, AMD_F_INEXACT);
            }
            else
                return (float)(dx + dx*dx*dx*0.333333333333333333);
        }
        else
            return (float)tanf_piby4(x, 0);
    }
    else if ((ux & EXPBITS_DP64) == EXPBITS_DP64)
    {
        /* x is either NaN or infinity */
        if (ux & MANTBITS_DP64)
        {
            /* x is NaN */
            unsigned int ufx;
            GET_BITS_SP32(x, ufx);
            return _handle_errorf("tanf", OP_TAN, ufx|0x00400000, _DOMAIN, 0,
                                  EDOM, x, 0.0F, 1);
        }
        else
        {
            /* x is infinity. Return a NaN */
            return _handle_errorf("tanf", OP_TAN, INDEFBITPATT_SP32, _DOMAIN, AMD_F_INVALID,
                                  EDOM, x, 0.0F, 1);
        }
    }

    /* Work on |x|; the sign is re-applied when returning. */
    xneg = (int)(ux >> 63);
    if (xneg)
        dx = -dx;

    if (dx < 5.0e5)
    {
        /* For these size arguments we can just carefully subtract the
           appropriate multiple of pi/2, using extra precision where
           dx is close to an exact multiple of pi/2 */
        static const double
            twobypi = 6.36619772367581382433e-01, /* 0x3fe45f306dc9c883 */
            piby2_1 = 1.57079632673412561417e+00, /* 0x3ff921fb54400000 */
            piby2_1tail = 6.07710050650619224932e-11, /* 0x3dd0b4611a626331 */
            piby2_2 = 6.07710050630396597660e-11, /* 0x3dd0b4611a600000 */
            piby2_2tail = 2.02226624879595063154e-21, /* 0x3ba3198a2e037073 */
            piby2_3 = 2.02226624871116645580e-21, /* 0x3ba3198a2e000000 */
            piby2_3tail = 8.47842766036889956997e-32; /* 0x397b839a252049c1 */
        double t, rhead, rtail;
        int npi2;
        unsigned long long uy, xexp, expdiff;

        xexp = ax >> EXPSHIFTBITS_DP64;

        /* How many pi/2 is dx a multiple of? */
        if (ax <= 0x400f6a7a2955385e) /* 5pi/4 */
        {
            if (ax <= 0x4002d97c7f3321d2) /* 3pi/4 */
                npi2 = 1;
            else
                npi2 = 2;
        }
        else if (ax <= 0x401c463abeccb2bb) /* 9pi/4 */
        {
            if (ax <= 0x4015fdbbe9bba775) /* 7pi/4 */
                npi2 = 3;
            else
                npi2 = 4;
        }
        else
            npi2 = (int)(dx * twobypi + 0.5);

        /* Subtract the multiple from dx to get an extra-precision remainder */
        rhead = dx - npi2 * piby2_1;
        rtail = npi2 * piby2_1tail;
        GET_BITS_DP64(rhead, uy);
        expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
        if (expdiff > 15)
        {
            /* The remainder is pretty small compared with dx, which
               implies that dx is a near multiple of pi/2
               (dx matches the multiple to at least 15 bits) */
            t = rhead;
            rtail = npi2 * piby2_2;
            rhead = t - rtail;
            rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
            if (expdiff > 48)
            {
                /* dx matches a pi/2 multiple to at least 48 bits */
                t = rhead;
                rtail = npi2 * piby2_3;
                rhead = t - rtail;
                rtail = npi2 * piby2_3tail - ((t - rhead) - rtail);
            }
        }
        r = rhead - rtail;
        region = npi2 & 3;
    }
    else
    {
        /* Reduce x into range [-pi/4,pi/4] */
        __remainder_piby2f_inline(ax, &r, &region);
    }

    if (xneg)
        return (float)-tanf_piby4(r, region & 1);
    else
        return (float)tanf_piby4(r, region & 1);
}

View File

@@ -0,0 +1,137 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentaon files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_HANDLE_ERROR
#define USE_SPLITEXP
#define USE_SCALEDOUBLE_2
#define USE_VAL_WITH_FLAGS
#include "libm_inlines.h"
#undef USE_SPLITEXP
#undef USE_SCALEDOUBLE_2
#undef USE_VAL_WITH_FLAGS
#undef USE_HANDLE_ERROR
#include "libm_errno.h"
#pragma function(tanh)
/* double tanh(double x)

   Computes the hyperbolic tangent of x.
     |x| < 2^-28 (bit pattern below 0x3e30...) : returns x.
     NaN                                       : routed through _handle_error.
     |x| > 20                                  : returns +-1.0.
     |x| <= 1                                  : [3,3] Remez approximations.
     otherwise                                 : 1 - 2/(exp(2|x|) + 1).
   The sign of x is re-applied at the end.

   Bit patterns are held in unsigned long long: plain unsigned long is only
   32 bits wide on LLP64 targets such as MSVC, which would truncate the
   value and break the 64-bit comparisons below. */
double tanh(double x)
{
    /*
      The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
      to the following three formulae:
      1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
      2. (1 - (2/(exp(2*x) + 1 )))
      3. (exp(2*x) - 1)/(exp(2*x) + 1)
      but computationally, some formulae are better on some ranges.
    */
    static const double
        thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
        log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
        log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
        large_threshold = 20.0; /* 0x4034000000000000 */

    unsigned long long ux, aux, xneg;
    double y, z, p, z1, z2;
    int m;

    /* Special cases */
    GET_BITS_DP64(x, ux);
    aux = ux & ~SIGNBIT_DP64;
    if (aux < 0x3e30000000000000) /* |x| small enough that tanh(x) = x */
    {
        if (aux == 0)
            return x; /* with no inexact */
        else
            return val_with_flags(x, AMD_F_INEXACT);
    }
    else if (aux > 0x7ff0000000000000) /* |x| is NaN */
        return _handle_error("tanh", OP_TANH, ux|0x0008000000000000, _DOMAIN,
                             0, EDOM, x, 0.0, 1);
    // return x + x;

    /* Work on y = |x|; infinities fall into the large_threshold branch. */
    xneg = (aux != ux);
    y = x;
    if (xneg) y = -x;

    if (y > large_threshold)
    {
        /* If x is large then exp(-x) is negligible and
           formula 1 reduces to plus or minus 1.0 */
        z = 1.0;
    }
    else if (y <= 1.0)
    {
        double y2;
        y2 = y*y;
        if (y < 0.9)
        {
            /* Use a [3,3] Remez approximation on [0,0.9]. */
            z = y + y*y2*
                (-0.274030424656179760118928e0 +
                 (-0.176016349003044679402273e-1 +
                  (-0.200047621071909498730453e-3 -
                   0.142077926378834722618091e-7*y2)*y2)*y2)/
                (0.822091273968539282568011e0 +
                 (0.381641414288328849317962e0 +
                  (0.201562166026937652780575e-1 +
                   0.2091140262529164482568557e-3*y2)*y2)*y2);
        }
        else
        {
            /* Use a [3,3] Remez approximation on [0.9,1]. */
            z = y + y*y2*
                (-0.227793870659088295252442e0 +
                 (-0.146173047288731678404066e-1 +
                  (-0.165597043903549960486816e-3 -
                   0.115475878996143396378318e-7*y2)*y2)*y2)/
                (0.683381611977295894959554e0 +
                 (0.317204558977294374244770e0 +
                  (0.167358775461896562588695e-1 +
                   0.173076050126225961768710e-3*y2)*y2)*y2);
        }
    }
    else
    {
        /* Compute p = exp(2*y) + 1. The code is basically inlined
           from exp_amd. */
        splitexp(2*y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
                 log2_by_32_tail, &m, &z1, &z2);
        p = scaleDouble_2(z1 + z2, m) + 1.0;
        /* Now reconstruct tanh from p. */
        z = (1.0 - 2.0/p);
    }

    if (xneg) z = - z;
    return z;
}

View File

@@ -0,0 +1,136 @@
/*******************************************************************************
MIT License
-----------
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this Software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "libm.h"
#include "libm_util.h"
#define USE_HANDLE_ERRORF
#define USE_SPLITEXPF
#define USE_SCALEFLOAT_2
#define USE_VALF_WITH_FLAGS
#include "libm_inlines.h"
#undef USE_SPLITEXPF
#undef USE_SCALEFLOAT_2
#undef USE_VALF_WITH_FLAGS
#undef USE_HANDLE_ERRORF
#include "libm_errno.h"
// Disable "C4163: not available as intrinsic function" warning that older
// compilers may issue here.
#pragma warning(disable:4163)
#pragma function(tanhf)
float tanhf(float x)
{
    /*
      Single-precision hyperbolic tangent.

      tanh(x) = sinh(x)/cosh(x), which can also be written as
         1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
         2. (1 - (2/(exp(2*x) + 1 )))
         3. (exp(2*x) - 1)/(exp(2*x) + 1)
      Different forms are numerically preferable on different ranges, so
      the argument range is split and each piece is handled separately.
      The computation is carried out on |x| and the sign is multiplied
      back in on return.
    */
    static const float
        k32_over_ln2     = 4.6166240692e+01F, /* 0x4238aa3b */
        ln2_over_32_hi   = 2.1659851074e-02F, /* 0x3cb17000 */
        ln2_over_32_lo   = 9.9831822808e-07F, /* 0x3585fdf4 */
        saturation_limit = 10.0F;             /* 0x41200000 */

    unsigned int bits, abs_bits;
    float sign, ax, ax2, result, denom, part_hi, part_lo;
    int expo;

    GET_BITS_SP32(x, bits);
    abs_bits = bits & ~SIGNBIT_SP32;

    /* NaN: report through the error handler, passing the input pattern
       with mantissa bit 22 set. */
    if (abs_bits > 0x7f800000)
    {
        return _handle_errorf("tanhf", OP_TANH, bits|0x00400000, _DOMAIN, 0,
                              EDOM, x, 0.0F, 1);
    }

    /* Tiny arguments (|x| < 2^-13): tanh(x) rounds to x. */
    if (abs_bits < 0x39000000)
    {
        if (abs_bits == 0)
            return x; /* exact, nothing to flag */
        return valf_with_flags(x, AMD_F_INEXACT);
    }

    /* sign is -1.0F when the sign bit of x is set, +1.0F otherwise. */
    sign = 1.0F - 2.0F * (abs_bits != bits);
    ax = sign * x;

    if (ax > saturation_limit)
    {
        /* exp(-x) is negligible here, so formula 1 collapses to +/-1. */
        result = 1.0F;
        return sign * result;
    }

    if (ax > 1.0F)
    {
        /* Formula 2: build denom = exp(2*|x|) + 1 with the inlined
           exponential helpers, then form 1 - 2/denom. */
        splitexpf(2*ax, 1.0F, k32_over_ln2, ln2_over_32_hi,
                  ln2_over_32_lo, &expo, &part_hi, &part_lo);
        denom = scaleFloat_2(part_hi + part_lo, expo) + 1.0F;
        result = (1.0F - 2.0F/denom);
        return sign * result;
    }

    /* |x| <= 1: rational approximation in ax^2, one coefficient set per
       sub-interval. */
    ax2 = ax*ax;
    if (ax < 0.9F)
    {
        /* [2,1] Remez approximation on [0,0.9]. */
        result = ax + ax*ax2*
            (-0.28192806108402678e0F +
             (-0.14628356048797849e-2F +
              0.4891631088530669873e-4F*ax2)*ax2)/
            (0.845784192581041099e0F +
             0.3427017942262751343e0F*ax2);
    }
    else
    {
        /* [2,1] Remez approximation on [0.9,1]. */
        result = ax + ax*ax2*
            (-0.24069858695196524e0F +
             (-0.12325644183611929e-2F +
              0.3827534993599483396e-4F*ax2)*ax2)/
            (0.72209738473684982e0F +
             0.292529068698052819e0F*ax2);
    }

    return sign * result;
}

View File

@@ -0,0 +1,165 @@
;;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __two_to_jby64_head_table and __two_to_jby64_tail_table tables
;; Used in exp and pow
;;
.const
ALIGN 16
; __two_to_jby64_head_table
;   Exported (PUBLIC) lookup table of 64 quadwords, one per index j = 0..63.
;   Each entry is the bit pattern of an IEEE-754 double. Entry 0 is 1.0
;   (3ff0000000000000h) and every entry has its low 28 bits clear.
;   NOTE(review): going by the symbol name, entry j looks like the leading
;   ("head") part of 2^(j/64), with the remaining low-order bits supplied by
;   the companion tail table -- confirm against the exp/pow code that
;   indexes it.
PUBLIC __two_to_jby64_head_table
__two_to_jby64_head_table DQ 3ff0000000000000h
DQ 3ff02c9a30000000h
DQ 3ff059b0d0000000h
DQ 3ff0874510000000h
DQ 3ff0b55860000000h
DQ 3ff0e3ec30000000h
DQ 3ff11301d0000000h
DQ 3ff1429aa0000000h
DQ 3ff172b830000000h
DQ 3ff1a35be0000000h
DQ 3ff1d48730000000h
DQ 3ff2063b80000000h
DQ 3ff2387a60000000h
DQ 3ff26b4560000000h
DQ 3ff29e9df0000000h
DQ 3ff2d285a0000000h
DQ 3ff306fe00000000h
DQ 3ff33c08b0000000h
DQ 3ff371a730000000h
DQ 3ff3a7db30000000h
DQ 3ff3dea640000000h
DQ 3ff4160a20000000h
DQ 3ff44e0860000000h
DQ 3ff486a2b0000000h
DQ 3ff4bfdad0000000h
DQ 3ff4f9b270000000h
DQ 3ff5342b50000000h
DQ 3ff56f4730000000h
DQ 3ff5ab07d0000000h
DQ 3ff5e76f10000000h
DQ 3ff6247eb0000000h
DQ 3ff6623880000000h
DQ 3ff6a09e60000000h
DQ 3ff6dfb230000000h
DQ 3ff71f75e0000000h
DQ 3ff75feb50000000h
DQ 3ff7a11470000000h
DQ 3ff7e2f330000000h
DQ 3ff8258990000000h
DQ 3ff868d990000000h
DQ 3ff8ace540000000h
DQ 3ff8f1ae90000000h
DQ 3ff93737b0000000h
DQ 3ff97d8290000000h
DQ 3ff9c49180000000h
DQ 3ffa0c6670000000h
DQ 3ffa5503b0000000h
DQ 3ffa9e6b50000000h
DQ 3ffae89f90000000h
DQ 3ffb33a2b0000000h
DQ 3ffb7f76f0000000h
DQ 3ffbcc1e90000000h
DQ 3ffc199bd0000000h
DQ 3ffc67f120000000h
DQ 3ffcb720d0000000h
DQ 3ffd072d40000000h
DQ 3ffd5818d0000000h
DQ 3ffda9e600000000h
DQ 3ffdfc9730000000h
DQ 3ffe502ee0000000h
DQ 3ffea4afa0000000h
DQ 3ffefa1be0000000h
DQ 3fff507650000000h
DQ 3fffa7c180000000h
ALIGN 16
; __two_to_jby64_tail_table
;   Exported (PUBLIC) lookup table of 64 quadwords, one per index j = 0..63.
;   Each entry is the bit pattern of an IEEE-754 double. Entry 0 is +0.0 and
;   the other entries are small positive values (biased exponent fields
;   3e0h..3e6h, i.e. roughly 2^-31 .. 2^-25).
;   NOTE(review): going by the symbol name, entry j looks like the low-order
;   correction 2^(j/64) - head[j] that pairs with __two_to_jby64_head_table
;   -- confirm against the exp/pow code that indexes it.
PUBLIC __two_to_jby64_tail_table
__two_to_jby64_tail_table DQ 0000000000000000h
DQ 3e6cef00c1dcdef9h
DQ 3e48ac2ba1d73e2ah
DQ 3e60eb37901186beh
DQ 3e69f3121ec53172h
DQ 3e469e8d10103a17h
DQ 3df25b50a4ebbf1ah
DQ 3e6d525bbf668203h
DQ 3e68faa2f5b9bef9h
DQ 3e66df96ea796d31h
DQ 3e368b9aa7805b80h
DQ 3e60c519ac771dd6h
DQ 3e6ceac470cd83f5h
DQ 3e5789f37495e99ch
DQ 3e547f7b84b09745h
DQ 3e5b900c2d002475h
DQ 3e64636e2a5bd1abh
DQ 3e4320b7fa64e430h
DQ 3e5ceaa72a9c5154h
DQ 3e53967fdba86f24h
DQ 3e682468446b6824h
DQ 3e3f72e29f84325bh
DQ 3e18624b40c4dbd0h
DQ 3e5704f3404f068eh
DQ 3e54d8a89c750e5eh
DQ 3e5a74b29ab4cf62h
DQ 3e5a753e077c2a0fh
DQ 3e5ad49f699bb2c0h
DQ 3e6a90a852b19260h
DQ 3e56b48521ba6f93h
DQ 3e0d2ac258f87d03h
DQ 3e42a91124893ecfh
DQ 3e59fcef32422cbeh
DQ 3e68ca345de441c5h
DQ 3e61d8bee7ba46e1h
DQ 3e59099f22fdba6ah
DQ 3e4f580c36bea881h
DQ 3e5b3d398841740ah
DQ 3e62999c25159f11h
DQ 3e668925d901c83bh
DQ 3e415506dadd3e2ah
DQ 3e622aee6c57304eh
DQ 3e29b8bc9e8a0387h
DQ 3e6fbc9c9f173d24h
DQ 3e451f8480e3e235h
DQ 3e66bbcac96535b5h
DQ 3e41f12ae45a1224h
DQ 3e55e7f6fd0fac90h
DQ 3e62b5a75abd0e69h
DQ 3e609e2bf5ed7fa1h
DQ 3e47daf237553d84h
DQ 3e12f074891ee83dh
DQ 3e6b0aa538444196h
DQ 3e6cafa29694426fh
DQ 3e69df20d22a0797h
DQ 3e640f12f71a1e45h
DQ 3e69f7490e4bb40bh
DQ 3e4ed9942b84600dh
DQ 3e4bdcdaf5cb4656h
DQ 3e5e2cffd89cf44ch
DQ 3e452486cc2c7b9dh
DQ 3e6cc2b44eee3fa4h
DQ 3e66dc8a80ce9f09h
DQ 3e39e90d82e90a7eh
END

View File

@@ -0,0 +1,99 @@
;;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;; Defines __two_to_jby64_table table
;; Used by exp and expf
;;
.const
ALIGN 16
; __two_to_jby64_table
;   Exported (PUBLIC) lookup table of 64 quadwords, one per index j = 0..63.
;   Each entry is the bit pattern of an IEEE-754 double in [1.0, 2.0):
;   entry 0 is 1.0 (3ff0000000000000h) and entry 32 is 3ff6a09e667f3bcdh,
;   the double nearest sqrt(2) = 2^(32/64).
;   NOTE(review): going by the symbol name and those two entries, entry j
;   looks like 2^(j/64) rounded to double precision -- confirm against the
;   exp/expf code that indexes it.
PUBLIC __two_to_jby64_table
__two_to_jby64_table DQ 3ff0000000000000h
DQ 3ff02c9a3e778061h
DQ 3ff059b0d3158574h
DQ 3ff0874518759bc8h
DQ 3ff0b5586cf9890fh
DQ 3ff0e3ec32d3d1a2h
DQ 3ff11301d0125b51h
DQ 3ff1429aaea92de0h
DQ 3ff172b83c7d517bh
DQ 3ff1a35beb6fcb75h
DQ 3ff1d4873168b9aah
DQ 3ff2063b88628cd6h
DQ 3ff2387a6e756238h
DQ 3ff26b4565e27cddh
DQ 3ff29e9df51fdee1h
DQ 3ff2d285a6e4030bh
DQ 3ff306fe0a31b715h
DQ 3ff33c08b26416ffh
DQ 3ff371a7373aa9cbh
DQ 3ff3a7db34e59ff7h
DQ 3ff3dea64c123422h
DQ 3ff4160a21f72e2ah
DQ 3ff44e086061892dh
DQ 3ff486a2b5c13cd0h
DQ 3ff4bfdad5362a27h
DQ 3ff4f9b2769d2ca7h
DQ 3ff5342b569d4f82h
DQ 3ff56f4736b527dah
DQ 3ff5ab07dd485429h
DQ 3ff5e76f15ad2148h
DQ 3ff6247eb03a5585h
DQ 3ff6623882552225h
DQ 3ff6a09e667f3bcdh
DQ 3ff6dfb23c651a2fh
DQ 3ff71f75e8ec5f74h
DQ 3ff75feb564267c9h
DQ 3ff7a11473eb0187h
DQ 3ff7e2f336cf4e62h
DQ 3ff82589994cce13h
DQ 3ff868d99b4492edh
DQ 3ff8ace5422aa0dbh
DQ 3ff8f1ae99157736h
DQ 3ff93737b0cdc5e5h
DQ 3ff97d829fde4e50h
DQ 3ff9c49182a3f090h
DQ 3ffa0c667b5de565h
DQ 3ffa5503b23e255dh
DQ 3ffa9e6b5579fdbfh
DQ 3ffae89f995ad3adh
DQ 3ffb33a2b84f15fbh
DQ 3ffb7f76f2fb5e47h
DQ 3ffbcc1e904bc1d2h
DQ 3ffc199bdd85529ch
DQ 3ffc67f12e57d14bh
DQ 3ffcb720dcef9069h
DQ 3ffd072d4a07897ch
DQ 3ffd5818dcfba487h
DQ 3ffda9e603db3285h
DQ 3ffdfc97337b9b5fh
DQ 3ffe502ee78b3ff6h
DQ 3ffea4afa2a490dah
DQ 3ffefa1bee615a27h
DQ 3fff50765b6e4540h
DQ 3fffa7c1819e90d8h
END