KVR Audio

2DaT · Post by **2DaT** » Wed Feb 06, 2019 2:20 pm

There were a lots of topics on this subject, but usually proposed approximations are very rough.

It's somewhat inconvenient, because it approximates tanh(0.5*ln(2)*x), but almost every time you want to use a scaler anyway so it shouldn't matter.

It's based on a definition

Code: Select all

tanh(x)= (e^2x-1)/(e^2x+1)

But we can't use this formula right away, because of subtractive cancellation in the term e^2x-1, and we don't want to lose precision near zero.
This code computes 2^x-1 inside without subtractive precision loss.

SSE2:

Code: Select all

//Approximates tanh(0.5*ln(2)*x)
static __m128 tanh2x_mpv(__m128 x)
{
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 signmask = _mm_set1_ps(-0.0f);
	const __m128 max_val = _mm_set1_ps(30.0f);

	const __m128 c1 = _mm_set1_ps(6.931471825e-01);
	const __m128 c2 = _mm_set1_ps(2.402264923e-01);
	const __m128 c3 = _mm_set1_ps(5.550357327e-02);
	const __m128 c4 = _mm_set1_ps(9.618237615e-03);
	const __m128 c5 = _mm_set1_ps(1.339077600e-03);
	const __m128 c6 = _mm_set1_ps(1.540359954e-04);

	__m128 signs = _mm_and_ps(x, signmask);
	x = _mm_andnot_ps(signmask, x);
	x = _mm_min_ps(x, max_val);

	__m128 f = x;

	__m128i i = _mm_cvtps_epi32(f);
	f = _mm_sub_ps(f, _mm_cvtepi32_ps(i));

	__m128 f2 = _mm_mul_ps(f, f);
	__m128 p = _mm_add_ps(_mm_mul_ps(c6, f), c5);
	p = _mm_mul_ps(p, f2);
	p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c4, f), c3));
	p = _mm_mul_ps(p, f2);
	p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c2, f), c1));
	p = _mm_mul_ps(p, f);

	i = _mm_slli_epi32(i, 23);

	__m128 biased_expm = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(one), i));
	__m128 exp_cor = _mm_sub_ps(one, biased_expm);
	__m128 exp_cor_p = _mm_add_ps(one, biased_expm);

	__m128 exp2xm1 = _mm_xor_ps(signs,_mm_add_ps(p, exp_cor));
	__m128 exp2xp1 = _mm_add_ps(p, exp_cor_p);
	return _mm_div_ps(exp2xm1, exp2xp1);
}

AVX2 + FMA

Code: Select all

//Approximates tanh(0.5*ln(2)*x)
static __m256 tanh2x_mpv_fma(__m256 x)
{
	const __m256 one = _mm256_set1_ps(1.0f);
	const __m256 signmask = _mm256_set1_ps(-0.0f);
	const __m256 max_val = _mm256_set1_ps(30.0f);

	const __m256 c1 = _mm256_set1_ps(6.931471825e-01);
	const __m256 c2 = _mm256_set1_ps(2.402264923e-01);
	const __m256 c3 = _mm256_set1_ps(5.550357327e-02);
	const __m256 c4 = _mm256_set1_ps(9.618237615e-03);
	const __m256 c5 = _mm256_set1_ps(1.339077600e-03);
	const __m256 c6 = _mm256_set1_ps(1.540359954e-04);

	__m256 signs = _mm256_and_ps(x, signmask);
	x = _mm256_andnot_ps(signmask, x);
	x = _mm256_min_ps(x, max_val);

	__m256 f = x;

	__m256i i = _mm256_cvtps_epi32(f);
	f = _mm256_sub_ps(f, _mm256_cvtepi32_ps(i));

	__m256 p = c6;
	p = _mm256_fmadd_ps(p, f, c5);
	p = _mm256_fmadd_ps(p, f, c4);
	p = _mm256_fmadd_ps(p, f, c3);
	p = _mm256_fmadd_ps(p, f, c2);
	p = _mm256_fmadd_ps(p, f, c1);
	i = _mm256_slli_epi32(i, 23);

	__m256 biased_expm = _mm256_castsi256_ps(_mm256_sub_epi32(_mm256_castps_si256(one), i));
	__m256 exp_cor = _mm256_sub_ps(one, biased_expm);
	__m256 exp_cor_p = _mm256_add_ps(one, biased_expm);

	__m256 exp2xm1 = _mm256_xor_ps(signs, _mm256_fmadd_ps(p, f, exp_cor));
	__m256 exp2xp1 = _mm256_fmadd_ps(p, f, exp_cor_p);
	return _mm256_div_ps(exp2xm1, exp2xp1);
}

Benchmarks (average time to tanh a lot of values):

Code: Select all

MSVC vector tanh: 5.08284 c/v
tanh2x_mpv_fma: 1.40598 c/v
tanh2x_mpv: 3.79001 c/v

Latency may be important if you use it in a feedback structure (like a IIR filter), but I didn't test it yet.

Accuracy:

Code: Select all

3.0507440678775311 ulps on [3 * std::numeric_limits<float>::min(), 300]
~3e-7 relative

Z1202 · Post by **Z1202** » Thu Feb 07, 2019 10:38 am

Very nice!

Z1202 · Post by **Z1202** » Thu Feb 07, 2019 12:54 pm

Is the precision loss significant on the negative side, or why do you tear the sign apart? It doesn't look immediately obvious to me.

2DaT · Post by **2DaT** » Thu Feb 07, 2019 1:53 pm

Z1202 wrote: ↑Thu Feb 07, 2019 12:54 pm Is the precision loss significant on the negative side, or why do you tear the sign apart? It doesn't look immediately obvious to me.

In modern cores vector logicals are free, because that kind of code is limited by adds and multiplies. Min/max do steal cycles from adds.

Code: Select all

static __m128 tanh2x_mpv_sse_v(__m128 x)
{
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 max_val = _mm_set1_ps(30.0f);
	const __m128 min_val = _mm_set1_ps(-30.0f);

	const __m128 c1 = _mm_set1_ps(6.931471825e-01);
	const __m128 c2 = _mm_set1_ps(2.402264923e-01);
	const __m128 c3 = _mm_set1_ps(5.550357327e-02);
	const __m128 c4 = _mm_set1_ps(9.618237615e-03);
	const __m128 c5 = _mm_set1_ps(1.339077600e-03);
	const __m128 c6 = _mm_set1_ps(1.540359954e-04);

	x = _mm_min_ps(x, max_val);
	x = _mm_max_ps(x, min_val);

	__m128 f = x;

	__m128i i = _mm_cvtps_epi32(f);
	f = _mm_sub_ps(f, _mm_cvtepi32_ps(i));

	__m128 f2 = _mm_mul_ps(f, f);
	__m128 p = _mm_add_ps(_mm_mul_ps(c6, f), c5);
	p = _mm_mul_ps(p, f2);
	p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c4, f), c3));
	p = _mm_mul_ps(p, f2);
	p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c2, f), c1));
	p = _mm_mul_ps(p, f);

	i = _mm_slli_epi32(i, 23);

	__m128 biased_expm = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(one), i));
	__m128 exp_cor = _mm_sub_ps(one, biased_expm);
	__m128 exp_cor_p = _mm_add_ps(one, biased_expm);

	__m128 exp2xm1 = _mm_add_ps(p, exp_cor);
	__m128 exp2xp1 = _mm_add_ps(p, exp_cor_p);
	return _mm_div_ps(exp2xm1, exp2xp1);
}

Code: Select all

tanh2x_mpv_sse_v : 3.85271
tanh2x_mpv_sse : 3.79121

Z1202 · Post by **Z1202** » Thu Feb 07, 2019 2:20 pm

2DaT wrote: ↑Thu Feb 07, 2019 1:53 pmIn modern cores vector logicals are free, because that kind of code is limited by adds and multiplies. Min/max do steal cycles from adds.

Didn't know that! Doesn't that mean that in your code the sign could have been reapplied right after the clipping (potentially saving one register)?

juha_p · Post by **juha_p** » Thu Feb 07, 2019 7:13 pm

2DaT · Post by **2DaT** » Fri Feb 08, 2019 11:19 am

Z1202 wrote: ↑Thu Feb 07, 2019 2:20 pm
2DaT wrote: ↑Thu Feb 07, 2019 1:53 pmIn modern cores vector logicals are free, because that kind of code is limited by adds and multiplies. Min/max do steal cycles from adds.
Didn't know that! Doesn't that mean that in your code the sign could have been reapplied right after the clipping (potentially saving one register)?

Hmm. Tried this and it's also slightly faster.

Code: Select all

static __m128 tanh2x_mpv_sse_v(__m128 x)
{
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 signmask = _mm_set1_ps(-0.0f);
	const __m128 max_val = _mm_set1_ps(30.0f);

	const __m128 c1 = _mm_set1_ps(6.931471825e-01);
	const __m128 c2 = _mm_set1_ps(2.402264923e-01);
	const __m128 c3 = _mm_set1_ps(5.550357327e-02);
	const __m128 c4 = _mm_set1_ps(9.618237615e-03);
	const __m128 c5 = _mm_set1_ps(1.339077600e-03);
	const __m128 c6 = _mm_set1_ps(1.540359954e-04);

	__m128 signs = _mm_and_ps(x, signmask);
	x = _mm_andnot_ps(signmask, x);
	x = _mm_min_ps(x, max_val);
	x = _mm_xor_ps(x, signs);

	__m128 f = x;

	__m128i i = _mm_cvtps_epi32(f);
	f = _mm_sub_ps(f, _mm_cvtepi32_ps(i));

	__m128 f2 = _mm_mul_ps(f, f);
	__m128 p = _mm_add_ps(_mm_mul_ps(c6, f), c5);
	p = _mm_mul_ps(p, f2);
	p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c4, f), c3));
	p = _mm_mul_ps(p, f2);
	p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c2, f), c1));
	p = _mm_mul_ps(p, f);

	i = _mm_slli_epi32(i, 23);

	__m128 biased_expm = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(one), i));
	__m128 exp_cor = _mm_sub_ps(one, biased_expm);
	__m128 exp_cor_p = _mm_add_ps(one, biased_expm);

	__m128 exp2xm1 = _mm_add_ps(p, exp_cor);
	__m128 exp2xp1 = _mm_add_ps(p, exp_cor_p);
	return _mm_div_ps(exp2xm1, exp2xp1);
}

Code: Select all

tanh2x_mpv_sse_v: 3.70217
tanh2x_mpv_sse: 3.79021

Good find!

Error is good on a negative side, but not symmetric.

IMO +-5% is nothing to worry about

.

Z1202 · Post by **Z1202** » Fri Feb 08, 2019 1:41 pm

In the neighbor thread I posted some 2^x approximations which could be used to build variants of your code with lower-order polynomials (to save CPU).
viewtopic.php?f=33&t=519852

Not sure how your approximation is built (is it minimax?), these ones are built from the maximum smoothness requirement.
FWIW

Rockatansky · Post by **Rockatansky** » Sat Feb 09, 2019 8:44 pm

What a coincidence, I've started getting into SSE as well, and I just spent my day re-implementing a tanh approximation with SSE instructions. However, I built mine as a per-sample function taking and returning doubles, and since this whole SSE thing is really new territory for myself, I thought I'd leave mine here, just for fun. Either it helps someone, or you'll tear it apart and in return that could help me.

Mine is based on an approximation I found on MusicDSP. Here's the original code, slightly reformatted because OCD:

Code: Select all

float rational_tanh (float x)
{
  if ( x < -3.0f ) return -1.0f;
  else if ( x > 3.0f ) return 1.0f;
  else return x * ( 27.0f + x * x ) / ( 27.0f + 9.0f * x * x );
}

I adjusted those just too comfortable looking constant numbers a bit, ended up with something that's closer to std::tanh than the "rational_tanh" code outside of +/- 0.6. It's off worse approaching 0, I have yet to check if that has any reeeally bad effects. Here's a Desmos graph comparing std::tanh, the original rational_tanh code and my version, disregarding the [-1,1] confinement because those Desmos editors are just too fiddly.

Here are a bunch of test values I ran:

Code: Select all

value       0.0001
std::tanh   9.99999996666666682732655e-05
rational_   9.99999997037037056642553e-05
SSE::Tanh   9.83103497639169248345156e-05

value      -0.51
std::tanh  -0.469945198933037655564249
rational_  -0.47383178430109507139889
SSE::Tanh  -0.466084416986829508378065

value       0.9
std::tanh   0.716297870199024466764115
rational_   0.729921259842519654092996
SSE::Tanh   0.718758020520595120039786

value       2.5
std::tanh   0.986614298151430313410515
rational_   0.998498498498498476827478
SSE::Tanh   0.990513221198213500429119

Now, for the actual code... here's hoping I'm not making too much of a fool of myself with this.

Code: Select all

const double Tanh (const double& Value)
{
    // Computation steps
    //
    //   result1   = input * input
    //   result2   = 25.95 + result1
    //   result3.1 = 8.78 * result1
    //   result3.2 = input * result2
    //   result4   = 26.396 + result3.1
    //   result5   = result3.2 / result4
    //   result6   = limit [-1, result5, 1]
    //
    const __m128d value    = _mm_set_sd(Value);
    const __m128d result1  = _mm_mul_sd(value, value); // pow2
    const __m128d result2  = _mm_add_sd(result1, _mm_set_sd(25.95));
    const __m128d factors1 = _mm_setr_pd(8.78, Value);
    const __m128d factors2 = _mm_setr_pd(_mm_cvtsd_f64(result1), _mm_cvtsd_f64(result2));
    const __m128d result3  = _mm_mul_pd(factors1, factors2);
    const __m128d result4  = _mm_add_sd(result3, _mm_set_sd(26.396));
    const __m128d result5  = _mm_div_sd(_mm_unpackhi_pd(result3, result3), _mm_unpacklo_pd(result4, result4));
    const __m128d result6  = _mm_max_pd(_mm_set_sd(-1.0), _mm_min_pd(_mm_set_sd(1.0), result5));
    return _mm_cvtsd_f64(result6);
}

Since I don't have a lot of ASM/SSE routine yet, I'd be really interested to know if it's faster to actually use _mm_load* and _mm_store* instructions to exchange value between double[] arrays and __m128d all the time? Or is it faster to use the _mm_unpack* instructions like I did, throwing together new __m128d vectors on the fly? Is there a better way to target individual doubles inside a __m128d, e.g. when computing result5?
Maybe a bit OT, I know, but well... "while you're at it".

mystran · Post by **mystran** » Sun Feb 10, 2019 3:24 pm

Rockatansky wrote: ↑Sat Feb 09, 2019 8:44 pm Since I don't have a lot of ASM/SSE routine yet, I'd be really interested to know if it's faster to actually use _mm_load* and _mm_store* instructions to exchange value between double[] arrays and __m128d all the time? Or is it faster to use the _mm_unpack* instructions like I did, throwing together new __m128d vectors on the fly? Is there a better way to target individual doubles inside a __m128d, e.g. when computing result5?

The shuffles are generally 1 cycle each and usually the fastest approach if you can't find a way to avoid them completely.

Looking at your code, you do stuff like:

Code: Select all

    const __m128d factors2 = _mm_setr_pd(_mm_cvtsd_f64(result1), _mm_cvtsd_f64(result2));

Here the _mm_cvtsd_f64 expands into movsd m64,xmm so you have two memory stores and a compiler pseudo-op _mm_setr_pd that might ideally become a movaps (or movapd, but that's one byte longer) if the compiler manages to put the two doubles into stack in the right order. The same thing could be done with a single _mm_unpacklo_pd which expands into unpcklpd xmm, xmm and takes one cycle. In theory the compiler might peephole it for you, but I don't know if any of them do.

Anyway, I picked this line as an example, because when you write SIMD code using intrinsics, you really need to pay attention to what kind of assembly the intrinsics are likely to generate and this is even more so than for regular C++ code, because compilers are likely to be more conservative about rewriting your intrinsics code to something more efficient.

That said, the fundamental problem here is that in general trying to use SIMD to compute one value faster is usually quite futile, as the shuffling costs (even using direct shuffles) tends to just negate any gains. If you look at your code, most of your operators are _sd type, which is what the compiler already generates for regular scalar code. Usually in order to get something out of SIMD, you need to compute multiple things in parallel on algorithmic level rather than just instruction level.

Finally, because packed doubles in SSE2 only give you 2-way parallel computation it is really sensitive to any overheads and it is quite easy to end up with code that is just slower no matter what. This is less of an issue for wider SIMD-widths, because with larger theoretical speedups the overhead is usually not as significant. The more recent compilers might actually auto-vectorise the most profitable cases anyway, so the number of cases where writing packed double coded manually makes sense is really quite limited.

Rockatansky · Post by **Rockatansky** » Mon Feb 11, 2019 8:26 pm

Thank you for your insightful feedback, much appreciated!

mystran wrote: ↑Sun Feb 10, 2019 3:24 pm Here the _mm_cvtsd_f64 expands into movsd m64,xmm so you have two memory stores and a compiler pseudo-op _mm_setr_pd that might ideally become a movaps (or movapd, but that's one byte longer) if the compiler manages to put the two doubles into stack in the right order. The same thing could be done with a single _mm_unpacklo_pd which expands into unpcklpd xmm, xmm and takes one cycle.

How do you figure out what asm instructions the compiler transforms the code into? I've played around with the Compiler Explorer website before, but it's a bit tedious to always translate back and forth with my own classes and typedefs. Are there any easy to use "offline" tools that let me check that? The "assembly" view in XCode's assistant editor isn't selectable for some reason.

mystran wrote: ↑Sun Feb 10, 2019 3:24 pm If you look at your code, most of your operators are _sd type, which is what the compiler already generates for regular scalar code. Usually in order to get something out of SIMD, you need to compute multiple things in parallel on algorithmic level rather than just instruction level.

Yes, I'm aware of that issue. Since my workflow used to be per-channel, per-sample oriented, I tried writing everything as 1-sample-in-and-1-sample-out functions, only using packed _pd computations for unrelated values side by side, like in my previous code with result3.1 and result3.2. I've since given up that ideology and started handling my samples in steps of 2 using a double[2] wrapper class, that I can easily pass into and return out of SSE-ified functions. The functions then process everything step by step, but in parallel for two samples at a time, I figure that's the way it's intended to be.

Here's the above code adapted to that workflow, slightly altered to handle a double* input rather than my wrapper class. Is this closer to what it should look like? (I omitted the const unsigned int& Length argument, since _mm_load_pd always only loads two array values anyway.)

Code: Select all

inline void Tanh (double* Samples)
{
    const __m128d samples = _mm_load_pd(Samples);
    const __m128d result1 = _mm_mul_pd(samples, samples);
    const __m128d result2 = _mm_add_pd(result1, _mm_set1_pd(25.95));
    const __m128d result31 = _mm_mul_pd(result1, _mm_set1_pd(8.78));
    const __m128d result32 = _mm_mul_pd(samples, result2);
    const __m128d result4 = _mm_add_pd(result31, _mm_set1_pd(26.396));
    const __m128d result5 = _mm_div_pd(result32, result4);
    _mm_store_pd(Samples, _mm_max_pd(_mm_set1_pd(-1.0), _mm_min_pd(_mm_set1_pd(1.0), result5)));
}

I would probably use less const __m128d variables and just insert the _mm_* calls directly where possible, like the [-1;+1] clipping in the last line, but I thought it's easier to follow like this.

2DaT wrote: ↑Fri Feb 08, 2019 11:19 am Tried this and it's also slightly faster.

Out of curiosity, what do you use to measure these things? Handmade microtests? An external test library?

mystran · Post by **mystran** » Mon Feb 11, 2019 10:00 pm

Rockatansky wrote: ↑Mon Feb 11, 2019 8:26 pm Thank you for your insightful feedback, much appreciated!

mystran wrote: ↑Sun Feb 10, 2019 3:24 pm Here the _mm_cvtsd_f64 expands into movsd m64,xmm so you have two memory stores and a compiler pseudo-op _mm_setr_pd that might ideally become a movaps (or movapd, but that's one byte longer) if the compiler manages to put the two doubles into stack in the right order. The same thing could be done with a single _mm_unpacklo_pd which expands into unpcklpd xmm, xmm and takes one cycle.
How do you figure out what asm instructions the compiler transforms the code into? I've played around with the Compiler Explorer website before, but it's a bit tedious to always translate back and forth with my own classes and typedefs. Are there any easy to use "offline" tools that let me check that? The "assembly" view in XCode's assistant editor isn't selectable for some reason.

Most of the intrinsics represent individual instructions and https://software.intel.com/sites/landin ... sicsGuide/ will tell you what those instructions are. Note that if you already know what instruction you want, you can also type the mnemonic into the search box and it'll show you the matching intrinsics (but check that you got what you wanted, it matches strings, not full words). While a compiler might theoretically optimise some of the instructions to something else, usually you get pretty much what you asked for.

Those that don't have any instruction listed for them are typically something like _mm_set_pd() which don't exists in the ISA directly and basically give the compiler to freedom to decide how to shuffle the data around (eg. load from memory, shuffle in registers, whatever the compiler's optimiser thinks it the most convenient).

If you want to see the actual code, then in Visual Studio on Windows I'd usually just break into debugger and let it disassemble. You can do the same in LLDB as well, but I guess that's not nearly as convenient, so using otool(?) or just asking for clang to generate assembly (-S) might be easier? The last option will let you get assembly for a single source file at a time... although you might get slightly different results compared to running full LTO.

2DaT · Post by **2DaT** » Tue Feb 12, 2019 6:28 pm

Rockatansky wrote: ↑Mon Feb 11, 2019 8:26 pm
2DaT wrote: ↑Fri Feb 08, 2019 11:19 am Tried this and it's also slightly faster.
Out of curiosity, what do you use to measure these things? Handmade microtests? An external test library?

Posted it here:
viewtopic.php?p=7170633#p7170633
Default data size is a little bit large

Code: Select all

size_t dataSize_Kb = 2*1024;

I recommend lowering it to 16Kb, and bumping up the iteration count to reduce noise.

mystran · Post by **mystran** » Tue Feb 12, 2019 8:32 pm

2DaT wrote: ↑Tue Feb 12, 2019 6:28 pm I recommend lowering it to 16Kb, and bumping up the iteration count to reduce noise.

This is an interesting subject though, because usually the audio blocks are somewhere in the 64 to 512 range and a lot of code might run between the current audio block and the next one... so one might actually want to factor in the performance hit from running cold code caches and such.

Interestingly enough, in the Linux-kernel development world, they apparently like to compile mostly for size (although not using actual "optimise for size" because then GCC starts to produce silly code). Apparently this tends to improve performance, because most of the routines end up getting called cache cold so the less code you have, the faster it runs.

Whether it's that bad for audio plugins is one thing, but I think one should be careful about trusting averaged cache warm performance tests when the goal is real-time performance at short block sizes. I've actually had quite a few cases in the past, where a micro-benchmark claims one thing is slightly faster than another, but then you throw it into an actual project and the situation is suddenly the opposite. That doesn't necessarily mean micro benchmarks are useless, but one should still be careful.

2DaT · Post by **2DaT** » Wed Aug 28, 2019 4:18 pm

Another tanh approx based on rational minmax. It's a straightforward tanh, so it's more convenient to use.

Code: Select all

inline __m128 tanh_sse(__m128 x) {
  const __m128 n0 = _mm_set1_ps(4.351839500e+06);
  const __m128 n1 = _mm_set1_ps(5.605646250e+05);
  const __m128 n2 = _mm_set1_ps(1.263485352e+04);
  const __m128 n3 = _mm_set1_ps(4.751771164e+01);

  const __m128 d0 = n0;
  const __m128 d1 = _mm_set1_ps(2.011170000e+06);
  const __m128 d2 = _mm_set1_ps(1.027901563e+05);
  const __m128 d3 = _mm_set1_ps(1.009453430e+03);

  const __m128 max_val = _mm_set1_ps(7.7539052963256836);
  const __m128 signmask = _mm_set1_ps(-0.0f);

  __m128 signs = _mm_and_ps(x, signmask);
  x = _mm_andnot_ps(signmask, x);
  x = _mm_min_ps(x, max_val);

  __m128 f2 = _mm_mul_ps(x, x);
  // Numerator, Horner's scheme
  __m128 num = n3;
  num = _mm_add_ps(_mm_mul_ps(num, f2), n2);
  num = _mm_add_ps(_mm_mul_ps(num, f2), n1);
  num = _mm_add_ps(_mm_mul_ps(num, f2), n0);
  num = _mm_mul_ps(_mm_xor_ps(x, signs), num);
  __m128 denom = f2;
  denom = _mm_add_ps(denom, d3);
  denom = _mm_add_ps(_mm_mul_ps(denom, f2), d2);
  denom = _mm_add_ps(_mm_mul_ps(denom, f2), d1);
  denom = _mm_add_ps(_mm_mul_ps(denom, f2), d0);
  // Denominator, Horner's scheme
  return _mm_div_ps(num, denom);
}

inline __m256 tanh_fma(__m256 x) {
  const __m256 n0 = _mm256_set1_ps(4.351839500e+06);
  const __m256 n1 = _mm256_set1_ps(5.605646250e+05);
  const __m256 n2 = _mm256_set1_ps(1.263485352e+04);
  const __m256 n3 = _mm256_set1_ps(4.751771164e+01);

  const __m256 d0 = n0;
  const __m256 d1 = _mm256_set1_ps(2.011170000e+06);
  const __m256 d2 = _mm256_set1_ps(1.027901563e+05);
  const __m256 d3 = _mm256_set1_ps(1.009453430e+03);

  const __m256 max_val = _mm256_set1_ps(7.8231868743896484);

  const __m256 signmask = _mm256_set1_ps(-0.0f);

  __m256 signs = _mm256_and_ps(x, signmask);
  x = _mm256_andnot_ps(signmask, x);
  x = _mm256_min_ps(x, max_val);

  __m256 f2 = _mm256_mul_ps(x, x);
  // Numerator, Horner's scheme
  __m256 num = n3;
  num = _mm256_fmadd_ps(num, f2, n2);
  num = _mm256_fmadd_ps(num, f2, n1);
  num = _mm256_fmadd_ps(num, f2, n0);
  num = _mm256_mul_ps(_mm256_xor_ps(x, signs), num);
  // Denominator, Horner's scheme
  __m256 denom = f2;
  denom = _mm256_add_ps(denom, d3);
  denom = _mm256_fmadd_ps(denom, f2, d2);
  denom = _mm256_fmadd_ps(denom, f2, d1);
  denom = _mm256_fmadd_ps(denom, f2, d0);

  return _mm256_div_ps(num, denom);
}

Perf:

Code: Select all

(OLD) tanh2x_mpv_fma: 1.38745
(OLD) tanh2x_mpv_sse: 3.73877
tanh_fma: 0.907715
tanh_sse: 2.78662

Less precise (~10ULP), but should be enough for saturation uses. I couldn't get it lower than 6 ULP though; it seems to be related with error in polynomial evaluation.

Sadly, I didn't find any suitable formula to shortcut the evaluation further in any meaningful way.

Accurate SSE2/FMA tanh approximation.