Code: Select all

```
//Approximates tan(x*pi*0.25), x on (1e-34, 2-1e-7). Useful for BLT prewrap. 4-way parallel.
__m128 tanpi4_blt(__m128 x) {
//Polynomial coeffs for tan(x*pi/4) for (-1,1) in even terms, so it goes like c0*x + c1*x^3 + ...
//Remez relative error fit ~6 ULPs
const __m128 c0 = _mm_set1_ps(7.853980064e-01);
const __m128 c1 = _mm_set1_ps(1.615037620e-01);
const __m128 c2 = _mm_set1_ps(3.970112652e-02);
const __m128 c3 = _mm_set1_ps(1.054308284e-02);
const __m128 c4 = _mm_set1_ps(1.422759611e-03);
const __m128 c5 = _mm_set1_ps(1.431057928e-03);
const __m128 one = _mm_set1_ps(1.0f);
const __m128 two = _mm_set1_ps(2.0f);
const __m128 signmask = _mm_set1_ps(-0.0f);
//We use this useful identity to compute on finite range of values:
// tan(x - pi/2) == 1 / -tan(x)
//Since we compute tan(x*pi/4), identity is tanpi4(x-2) == 1 / -tanpi4(x)
//Force the value into the finite range (-1,1)
//Compute the mask, whether we compute using the identity (x>1) or directly (x<1)
__m128 inverse_mask = _mm_cmpgt_ps(x, one);
//Subtract 2 for values greater than 1
x = _mm_sub_ps(x, _mm_and_ps(inverse_mask, two));
//Compute the polynomial for tan(x*pi/4)
//Hybrid scheme for polynomial evaluation, which is slightly faster than Horner's
__m128 f2 = _mm_mul_ps(x, x); //x^2
__m128 f4 = _mm_mul_ps(f2, f2); //x^4
__m128 p = _mm_add_ps(_mm_mul_ps(c5, f2), c4);
p = _mm_mul_ps(p, f4);// (c5*x^2 +c4)*x^4
p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c3, f2), c2));
p = _mm_mul_ps(p, f4);// ((c5*x^2 +c4)*x^4 + (c3*x^2 + c2))*x^4
p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c1, f2), c0));
p = _mm_mul_ps(p, x);// (((c5*x^2 +c4)*x^4 + (c3*x^2 + c2))*x^4)+(c1*x^2+c0*x))*x
//Compute the inverse for all values even if we don't need it, because it's a vector function
__m128 inverse = _mm_div_ps(one, p);
//Taking the abs of the inverted value with and(!signmask, inverse), because the result is positive
//Rearranging to reduce instruction dependence:
// (!signmask&inverse)&inverse_mask == (!signmask&inverse_mask)&inverse
//Select the right value based on the mask
p = _mm_or_ps(_mm_and_ps(_mm_andnot_ps(signmask, inverse_mask), inverse),
_mm_andnot_ps(inverse_mask, p));
return p;
}
//Approximates tan(x*pi*0.25), x on (1e-34, 2-1e-7). Useful for BLT prewrap. FMA instructions. 8-way parallel.
__m256 tanpi4_blt_fma(__m256 x) {
const __m256 c0 = _mm256_set1_ps(7.853980064e-01);
const __m256 c1 = _mm256_set1_ps(1.615037620e-01);
const __m256 c2 = _mm256_set1_ps(3.970112652e-02);
const __m256 c3 = _mm256_set1_ps(1.054308284e-02);
const __m256 c4 = _mm256_set1_ps(1.422759611e-03);
const __m256 c5 = _mm256_set1_ps(1.431057928e-03);
const __m256 one = _mm256_set1_ps(1.0f);
const __m256 two = _mm256_set1_ps(2.0f);
const __m256 signmask = _mm256_set1_ps(-0.0f);
__m256 inverse_mask = _mm256_cmp_ps(x, one, _CMP_GT_OS);
x = _mm256_sub_ps(x, _mm256_and_ps(inverse_mask, two));
__m256 f2 = _mm256_mul_ps(x, x);
__m256 p = c5;
p = _mm256_fmadd_ps(p, f2, c4);
p = _mm256_fmadd_ps(p, f2, c3);
p = _mm256_fmadd_ps(p, f2, c2);
p = _mm256_fmadd_ps(p, f2, c1);
p = _mm256_fmadd_ps(p, f2, c0);
p = _mm256_mul_ps(p, _mm256_andnot_ps(signmask, x));
__m256 inverse = _mm256_div_ps(one, p);
p = _mm256_blendv_ps(p, inverse, inverse_mask);
return p;
}
```

Code: Select all

```
[1e-34, 2-1e-7]
tanpi4_blt, Max error, ULPs : 6.00925
tanpi4_blt_fma, Max error, ULPs : 5.487955
```

Code: Select all

```
tanpi4_blt: 3.56934
tanpi4_blt_fma: 1.09766
msvc tan vec: 5.72974
msvc tan scalar: 29.979
```