Well, debug builds are awfully inefficient, so it makes little sense to do performance comparisons on those.juha_p wrote: MonoDevelop in Ubuntu g++ as compiler w/o optimizations ... also, remember that I don't know much of implementing SSE code so many things can be wrong there and the test process is what it is (just look the code linked in one of my posts earlier in this thread).
My b^x approximation

2DaT
 KVRist
 312 posts since 21 Jun, 2013
Re: My b^x approximation

juha_p
 KVRist
 480 posts since 21 Feb, 2006 from FI
Re: My b^x approximation
Yes, the difference is huge between debug and release build. Can you wink some better performance test tools for function testing (c++).2DaT wrote:Well, debug builds are awfully inefficient, so it makes little sense to do performance comparisons on those.juha_p wrote: MonoDevelop in Ubuntu g++ as compiler w/o optimizations ... also, remember that I don't know much of implementing SSE code so many things can be wrong there and the test process is what it is (just look the code linked in one of my posts earlier in this thread).

2DaT
 KVRist
 312 posts since 21 Jun, 2013
Re: My b^x approximation
Test code:juha_p wrote: Yes, the difference is huge between debug and release build. Can you wink some better performance test tools for function testing (c++).
Code: Select all
#include <iostream>
#include <limits>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <xmmintrin.h>
#include "TestProcedures.h"
// Micro-benchmark driver: times testProc1 (scalar exp) against testProc2
// (SSE exp) over a 2 MiB float buffer using rdtsc, reporting the minimum
// cycles-per-value over 50 runs (the minimum is the least noisy estimator).
int main()
{
    using namespace std;
    //_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    //_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    size_t dataSize_Kb = 2 * 1024;
    size_t dataSize_B = dataSize_Kb * 1024;
    size_t dataSize = dataSize_B / sizeof(float);
    // Page-aligned buffers so vector loads/stores are always aligned.
    float* input_data = (float*)_mm_malloc(dataSize_B, 4096);
    float* output_data = (float*)_mm_malloc(dataSize_B, 4096);
    // Fill the INPUT buffer (the original filled output_data, so the kernels
    // were reading uninitialized memory). rand()*0.0001f yields values in
    // roughly [0, 3.3], a reasonable domain for exp().
    for (size_t i = 0; i < dataSize; i++)
    {
        input_data[i] = rand() * 0.0001f;
    }
    uint64_t minimum1 = UINT64_MAX, minimum2 = UINT64_MAX;
    for (int i = 0; i < 50; i++)
    {
        auto ticks = __rdtsc();
        testProc1(input_data, output_data, dataSize);
        auto ticks1 = __rdtsc();
        testProc2(input_data, output_data, dataSize);
        auto ticks2 = __rdtsc();
        // Turn the absolute timestamps into per-kernel deltas. The forum
        // paste had dropped the '-' of these compound assignments, which
        // made the reported "minimums" absolute TSC readings.
        ticks2 -= ticks1;
        ticks1 -= ticks;
        minimum1 = std::min(minimum1, (uint64_t)ticks1);
        minimum2 = std::min(minimum2, (uint64_t)ticks2);
    }
    cout << "Approx. rdtsc/val...." << endl;
    cout << "First : " << (double)minimum1 / dataSize << "\nSecond : " << (double)minimum2 / dataSize << endl;
    _mm_free(input_data);
    _mm_free(output_data);
    return 0;
}
Code: Select all
#pragma once
// Benchmark kernels: each computes out[i] = exp(in[i]) for 'size' floats.
// testProc1 is the scalar std-library reference; testProc2 is the SSE
// version, which processes 4 floats per iteration -- presumably 'size' must
// be a multiple of 4 and both buffers 16-byte aligned (TODO confirm callers).
// __restrict promises the buffers do not alias, enabling vectorization.
void testProc1(const float * __restrict in, float * __restrict out, size_t size);
void testProc2(const float * __restrict in, float * __restrict out, size_t size);
Code: Select all
#include "TestProcedures.h"
#include <emmintrin.h>
#include <cmath>
// Scalar reference kernel: writes exp(in[i]) into out[i] for every element.
// Serves as the baseline the SSE implementation is benchmarked against.
void testProc1(const float * __restrict in, float * __restrict out, size_t size)
{
    const float* const src_end = in + size;
    float* dst = out;
    for (const float* src = in; src != src_end; ++src, ++dst)
    {
        *dst = exp(*src);
    }
}
// Fast SSE approximation of exp() for 4 packed floats (~2.4 ulp claimed).
// Decomposes x = (i + f)*ln2 with f in [-0.5, 0.5], evaluates a degree-5
// polynomial for 2^f, then folds the integer part i directly into the
// float's exponent bits via integer addition.
// NOTE: no range clamping -- inputs outside roughly [-87, 88] overflow or
// underflow the exponent field and produce garbage rather than 0/inf.
__m128 exp_mp(const __m128& x)
{
    //[-0.5,0.5] 2^x approx polynomial ~ 2.4 ulp
    // The forum paste lost the '-' in the exponents of c1..c5; restored from
    // the 2^f series: c1 ~ ln2 = 0.6931, c2 ~ (ln2)^2/2 = 0.2402, etc.
    const __m128 c0 = _mm_set1_ps(1.000000119e+00f);
    const __m128 c1 = _mm_set1_ps(6.931469440e-01f);
    const __m128 c2 = _mm_set1_ps(2.402212024e-01f);
    const __m128 c3 = _mm_set1_ps(5.550713092e-02f);
    const __m128 c4 = _mm_set1_ps(9.675540961e-03f);
    const __m128 c5 = _mm_set1_ps(1.327647245e-03f);
    const __m128 l2e = _mm_set1_ps(1.442695041f); // log2(e)
    //f = log2(e) * x
    __m128 f = _mm_mul_ps(x, l2e);
    //i = round(f)  (cvtps uses round-to-nearest)
    //f = f - i     (remainder now in [-0.5, 0.5])
    __m128i i = _mm_cvtps_epi32(f);
    f = _mm_sub_ps(f, _mm_cvtepi32_ps(i));
    //estrin/horner hybrid evaluation scheme
    __m128 f2 = _mm_mul_ps(f, f);
    __m128 p = _mm_add_ps(_mm_mul_ps(c5, f), c4);
    p = _mm_mul_ps(p, f2);
    p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c3, f), c2));
    p = _mm_mul_ps(p, f2);
    p = _mm_add_ps(p, _mm_add_ps(_mm_mul_ps(c1, f), c0));
    // i << 23: move the integer part into the IEEE-754 exponent field
    i = _mm_slli_epi32(i, 23);
    //r = (2^i)*(2^f) -- the scaling by 2^i is done directly on the
    //floating point exponent bits with an integer add
    return _mm_castsi128_ps(_mm_add_epi32(i, _mm_castps_si128(p)));
}
// SSE benchmark kernel: vectorized exp over 'size' floats, 4 per step.
// Requires 'size' to be a multiple of 4 and 16-byte aligned buffers
// (aligned load/store would fault otherwise).
void testProc2(const float * __restrict in, float * __restrict out, size_t size)
{
    size_t idx = 0;
    while (idx < size)
    {
        const __m128 v = _mm_load_ps(in + idx);
        _mm_store_ps(out + idx, exp_mp(v));
        idx += 4;
    }
}

juha_p
 KVRist
 480 posts since 21 Feb, 2006 from FI
Re: My b^x approximation
Thanks!
I have few versions of VS installed so no trouble getting it working.
Why I ran my tests in debug mode was just because in release mode there were no differences even when I made the test run many, many times longer than what is there in the code (tried the inner loop with j<10000000 -> solving std::exp() and std::exp() would have taken a long time to get the result for comparison use...).
EDIT: After transforming my easyEXP_sse methods to float data type, a quick test with your code brought these results:
Have to make the test to accept testing __m128d and float/double types.
EDIT:
Made a faster version of the float MacLaurinEXP_SSE() by putting the code to calculate 4 divs and adds at a time (isn't SSE best just in this?) and then added the horizontal sums ... result:
Quite fast already
I have few versions of VS installed so no trouble getting it working.
Why I ran my tests in debug mode was just because in release mode there were no differences even when I made the test run many, many times longer than what is there in the code (tried the inner loop with j<10000000 -> solving std::exp() and std::exp() would have taken a long time to get the result for comparison use...).
EDIT: After transforming my easyEXP_sse methods to float data type, a quick test with your code brought these results:
Code: Select all
Approx. rdtsc/val....
std::exp() : 26.6975
exp_mp() : 3.13861
easyEXP_sse() : 2.91754
easyEXP_sse_tweaked() : 3.15795
EDIT:
Made a faster version of the float MacLaurinEXP_SSE() by putting the code to calculate 4 divs and adds at a time (isn't SSE best just in this?) and then added the horizontal sums ... result:
Code: Select all
Approx. rdtsc/val....
std::exp() : 26.6983
MacLaurinEXP_SSE() : 0.579626
Last edited by juha_p on Sat Sep 15, 2018 10:16 pm, edited 3 times in total.

mystran
 KVRAF
 4983 posts since 12 Feb, 2006 from Helsinki, Finland
Re: My b^x approximation
There is like ZERO point in ever benchmarking debug builds, because those are specifically designed to be easy to for a debugger to follow, which involves intentionally producing really inefficient binary code to make sure all the variables are always accessible. Compilers might literally do things like store every temporary to memory, which takes orders of magnitude more time than the actual code you wrote and completely screws up even the relative performance of different algorithms.juha_p wrote:Yes, the difference is huge between debug and release build. Can you wink some better performance test tools for function testing (c++).2DaT wrote:Well, debug builds are awfully inefficient, so it makes little sense to do performance comparisons on those.juha_p wrote: MonoDevelop in Ubuntu g++ as compiler w/o optimizations ... also, remember that I don't know much of implementing SSE code so many things can be wrong there and the test process is what it is (just look the code linked in one of my posts earlier in this thread).
They might event throw in some extra code to do additional checking in order to give you helpful runtime diagnostics. Debug builds are literally only ever useful for debugging errors and even then only in case you have an error that's actually predictable enough to show up in a debug build.
ps. in general enabling debug symbols in release builds is fine, those are just data embedded for the debugger to try to figure out what's going on, although success is quite variable when optimisations are enabled... but you really REALLY need at least basic optimisations (often even things like register allocation are part of the optimiser) to produce anything that's even remotely useful for measuring performance
If you'd like Signaldust to return, please ask Katinka Tuisku to resign.

camsr
 KVRAF
 6859 posts since 17 Feb, 2005
Re: My b^x approximation
It's useful to use fgets() and the 'stdin' pipe so the compiler cannot "overoptimise" a basic test program. If the compiler can resolve that something is constant, it will not reflect the same build as when it's actually used on whatever data. fgets() will give you a basic input which should reflect the input variables in your tested functions, so they are absolutely unknown at compile time.
I also found that if the compiler cannot solve something, it will not optimize the constants into the program. For whatever reason, this incrementer I wrote does not resolve to a constant, possibly because either the loop it's used in is too large, or because it accesses the heap, or because of the punned types, or possibly the conditional statements...
I also found that if the compiler cannot solve something, it will not optimize the constants into the program. For whatever reason, this incrementer I wrote does not resolve to a constant, possibly because either the loop it's used in is too large, or because it accesses the heap, or because of the punned types, or possibly the conditional statements...
Code: Select all
// Benchmark loop-state step: decrements the iteration counter and walks the
// float input's bit pattern one representation towards 2147483648u
// (the bit pattern of -0.f, which the author treats as "0.f" and filters out).
// Works on a local copy of *ptr_ts so the compiler can keep it in registers.
// NOTE(review): union type punning is UB in strict ISO C++ (fine on gcc/MSVC);
// std::bit_cast/memcpy would be the portable form.
AT(inline) void loop_inc(test_state* ptr_ts)
{
    test_state ts = *ptr_ts;
    union { float f; uint32_t u; } pun;
    // dec loop counter (the '--' was stripped by the forum paste; without it
    // this statement -- and the two below -- had no effect at all)
    ts.c1--;
    // input increment logic
    pun.f = ts.low; ts.low2 = pun.u;
    if (ts.low2 >= 2147483648u) { ts.low2--; } // choose inc or dec to iterate towards 2147483648u (0.f)
    if (ts.low2 < 2147483648u) { ts.low2++; }
    if (ts.low2 == 2147483648u) { ts.low2 = 0u; } // filter 0.f
    pun.u = ts.low2; ts.low = pun.f;
    *ptr_ts = ts;
}

juha_p
 KVRist
 480 posts since 21 Feb, 2006 from FI
Re: My b^x approximation
EDIT: Simplified the question: How these two lines are translated to SSE intrinsics:
/* multiply by power of 2 */
z *= uint32_to_sp((n+0x7f)<<23);
and
// Build 2^n in double.
x *= uint642dp( (((uint64_t)n) + 1023) << 52);
Original source code.
/* multiply by power of 2 */
z *= uint32_to_sp((n+0x7f)<<23);
and
// Build 2^n in double.
x *= uint642dp( (((uint64_t)n) + 1023) << 52);
Original source code.

2DaT
 KVRist
 312 posts since 21 Jun, 2013
Re: My b^x approximation
Correct way: will handle every p right, but only when -127 < i < 128. Same as original.juha_p wrote:EDIT: Simplified the question: How these two lines are translated to SSE intrinsics:
/* multiply by power of 2 */
z *= uint32_to_sp((n+0x7f)<<23);
Code: Select all
i = _mm_add_epi32(i,_mm_set1_epi32(0x7f));
i = _mm_slli_epi32(i, 23);
return _mm_mul_ps(p,_mm_castsi128_ps(i));
Code: Select all
i = _mm_slli_epi32(i, 23);
return _mm_castsi128_ps(_mm_add_epi32(i, _mm_castps_si128(p)));
Same with double but with different constants/instructions.

juha_p
 KVRist
 480 posts since 21 Feb, 2006 from FI
Re: My b^x approximation
Thanks again!
Yes, looks like the implementation works in range [-87.6, 87.9] ... .
Got both working (though had something to bite with the double version ).
BTW, that double precision version of "Pade based" exp approximation is excellent ... as accurate as the "Remez13 based" listed @ here.
EDIT:
Yes, looks like the implementation works in range [-87.6, 87.9] ... .
Got both working (though had something to bite with the double version ).
BTW, that double precision version of "Pade based" exp approximation is excellent ... as accurate as the "Remez13 based" listed @ here.
EDIT:
Source: http://www.nersc.gov/users/computationa ... orization/For some data and operation types, we see the expected vector speedup with respect to the used vectorization type (4 times with SSE and 8 times with AVX in case of 4byte operands; 2 times with SSE and 4 times with AVX in case of 8byte operands). But others (such as SQRT() operations with AVX or integer operations with AVX) don't show the expected vectorization speedup. For some data and operation types such as all integer operations, no additional speedup with AVX over SSE is observed. Some operations (for example, EXP()) show worse performance with vectorization.