It may look like a lot of operations, but _mm_cast is no-op, so it doesn't count .
Code:
Code: Select all
__m128 log2_sse(__m128 x) {
// 12-13ulp
const __m128 c0 = _mm_set1_ps(1.011593342e+01);
const __m128 c1 = _mm_set1_ps(1.929443550e+01);
const __m128 d0 = _mm_set1_ps(2.095932245e+00);
const __m128 d1 = _mm_set1_ps(1.266638851e+01);
const __m128 d2 = _mm_set1_ps(6.316540241e+00);
const __m128 one = _mm_set1_ps(1.0);
const __m128 multi = _mm_set1_ps(1.41421356237);
const __m128i mantissa_mask = _mm_set1_epi32((1 << 23) - 1);
__m128i x_i = _mm_castps_si128(x);
__m128i spl_exp = _mm_castps_si128(_mm_mul_ps(x, multi));
spl_exp = _mm_sub_epi32(spl_exp, _mm_castps_si128(one));
spl_exp = _mm_andnot_si128(mantissa_mask, spl_exp);
__m128 spl_mantissa = _mm_castsi128_ps(_mm_sub_epi32(x_i, spl_exp));
spl_exp = _mm_srai_epi32(spl_exp, 23);
__m128 log2_exponent = _mm_cvtepi32_ps(spl_exp);
__m128 num = spl_mantissa;
num = _mm_add_ps(num, c1);
num = _mm_mul_ps(num, spl_mantissa);
num = _mm_add_ps(num, c0);
num = _mm_mul_ps(num, _mm_sub_ps(spl_mantissa, one));
__m128 denom = d2;
denom = _mm_mul_ps(denom, spl_mantissa);
denom = _mm_add_ps(denom, d1);
denom = _mm_mul_ps(denom, spl_mantissa);
denom = _mm_add_ps(denom, d0);
__m128 res = _mm_div_ps(num, denom);
res = _mm_add_ps(log2_exponent, res);
return res;
}
__m128 log_sse(__m128 x) {
const __m128 convert_e = _mm_set1_ps(0.69314718056);
return _mm_mul_ps(log2_sse(x), convert_e);
}
__m128 log10_sse(__m128 x) {
const __m128 convert_10 = _mm_set1_ps(0.301029995664);
return _mm_mul_ps(log2_sse(x), convert_10);
}
__m256 log2_fma(__m256 x) {
// 12-13ulp
const __m256 c0 = _mm256_set1_ps(1.011593342e+01);
const __m256 c1 = _mm256_set1_ps(1.929443550e+01);
const __m256 d0 = _mm256_set1_ps(2.095932245e+00);
const __m256 d1 = _mm256_set1_ps(1.266638851e+01);
const __m256 d2 = _mm256_set1_ps(6.316540241e+00);
const __m256 one = _mm256_set1_ps(1.0);
const __m256 multi = _mm256_set1_ps(1.41421356237);
const __m256i mantissa_mask = _mm256_set1_epi32((1 << 23) - 1);
__m256i x_i = _mm256_castps_si256(x);
__m256i spl_exp = _mm256_castps_si256(_mm256_mul_ps(x, multi));
spl_exp = _mm256_sub_epi32(spl_exp, _mm256_castps_si256(one));
spl_exp = _mm256_andnot_si256(mantissa_mask, spl_exp);
__m256 spl_mantissa = _mm256_castsi256_ps(_mm256_sub_epi32(x_i, spl_exp));
spl_exp = _mm256_srai_epi32(spl_exp, 23);
__m256 log2_exponent = _mm256_cvtepi32_ps(spl_exp);
__m256 num = spl_mantissa;
num = _mm256_add_ps(num, c1);
num = _mm256_fmadd_ps(num, spl_mantissa, c0);
num = _mm256_fmsub_ps(num, spl_mantissa, num);
__m256 denom = d2;
denom = _mm256_fmadd_ps(denom, spl_mantissa, d1);
denom = _mm256_fmadd_ps(denom, spl_mantissa, d0);
__m256 res = _mm256_div_ps(num, denom);
res = _mm256_add_ps(log2_exponent, res);
return res;
}
__m256 log_fma(__m256 x) {
const __m256 convert_e = _mm256_set1_ps(0.69314718056);
return _mm256_mul_ps(log2_fma(x), convert_e);
}
__m256 log10_fma(__m256 x) {
const __m256 convert_10 = _mm256_set1_ps(0.301029995664);
return _mm256_mul_ps(log2_fma(x), convert_10);
}
Code: Select all
log2_sse : 2.22534
log_sse: 2.40723
log10_sse : 2.40771
log2_fma : 0.837646
log_fma : 0.936523
log10_fma : 0.932373
fmath::log_ps 2.20972
msvc __vdecl_logf8 3.31006
msvc __vdecl_logf4 4.52124
msvc logf 17.8723
msvc log2f 106.934
msvc __vdecl_log10f8 2.92383
msvc __vdecl_log10f4 4.82275
msvc log10f 19.6677
I can get it below 6ULPs with 1 more mul+add, but error/perf ratio is better on this one.
To my surprise, fmath::log_ps is not that great: error is around 4000 ULPs and it does a lookup (32Kb table if I got this right) - it may perform worse in real code compared to this microbenchmark.
It is possible to construct a fast vectorized pow function from log2 and exp2.
Code: Select all
pow(x,y)=exp2(log2(x)*y)