SSE2/FMA Logarithms.

DSP, Plugin and Host development discussion.
Post Reply New Topic
RELATED
PRODUCTS

Post

These functions are useful in a DSP context, and rational approximations are fun so here they are :D.
It may look like a lot of operations, but _mm_cast is no-op, so it doesn't count :) .
Code:

Code: Select all

__m128 log2_sse(__m128 x) {
  // 12-13ulp
  const __m128 c0 = _mm_set1_ps(1.011593342e+01);
  const __m128 c1 = _mm_set1_ps(1.929443550e+01);
  const __m128 d0 = _mm_set1_ps(2.095932245e+00);
  const __m128 d1 = _mm_set1_ps(1.266638851e+01);
  const __m128 d2 = _mm_set1_ps(6.316540241e+00);
  const __m128 one = _mm_set1_ps(1.0);
  const __m128 multi = _mm_set1_ps(1.41421356237);
  const __m128i mantissa_mask = _mm_set1_epi32((1 << 23) - 1);
  __m128i x_i = _mm_castps_si128(x);
  __m128i spl_exp = _mm_castps_si128(_mm_mul_ps(x, multi));
  spl_exp = _mm_sub_epi32(spl_exp, _mm_castps_si128(one));
  spl_exp = _mm_andnot_si128(mantissa_mask, spl_exp);
  __m128 spl_mantissa = _mm_castsi128_ps(_mm_sub_epi32(x_i, spl_exp));
  spl_exp = _mm_srai_epi32(spl_exp, 23);
  __m128 log2_exponent = _mm_cvtepi32_ps(spl_exp);
  __m128 num = spl_mantissa;
  num = _mm_add_ps(num, c1);
  num = _mm_mul_ps(num, spl_mantissa);
  num = _mm_add_ps(num, c0);
  num = _mm_mul_ps(num, _mm_sub_ps(spl_mantissa, one));
  __m128 denom = d2;
  denom = _mm_mul_ps(denom, spl_mantissa);
  denom = _mm_add_ps(denom, d1);
  denom = _mm_mul_ps(denom, spl_mantissa);
  denom = _mm_add_ps(denom, d0);
  __m128 res = _mm_div_ps(num, denom);
  res = _mm_add_ps(log2_exponent, res);
  return res;
}
__m128 log_sse(__m128 x) {
  const __m128 convert_e = _mm_set1_ps(0.69314718056);
  return _mm_mul_ps(log2_sse(x), convert_e);
}
__m128 log10_sse(__m128 x) {
  const __m128 convert_10 = _mm_set1_ps(0.301029995664);
  return _mm_mul_ps(log2_sse(x), convert_10);
}
__m256 log2_fma(__m256 x) {
  // 12-13ulp
  const __m256 c0 = _mm256_set1_ps(1.011593342e+01);
  const __m256 c1 = _mm256_set1_ps(1.929443550e+01);
  const __m256 d0 = _mm256_set1_ps(2.095932245e+00);
  const __m256 d1 = _mm256_set1_ps(1.266638851e+01);
  const __m256 d2 = _mm256_set1_ps(6.316540241e+00);
  const __m256 one = _mm256_set1_ps(1.0);
  const __m256 multi = _mm256_set1_ps(1.41421356237);
  const __m256i mantissa_mask = _mm256_set1_epi32((1 << 23) - 1);
  __m256i x_i = _mm256_castps_si256(x);
  __m256i spl_exp = _mm256_castps_si256(_mm256_mul_ps(x, multi));
  spl_exp = _mm256_sub_epi32(spl_exp, _mm256_castps_si256(one));
  spl_exp = _mm256_andnot_si256(mantissa_mask, spl_exp);
  __m256 spl_mantissa = _mm256_castsi256_ps(_mm256_sub_epi32(x_i, spl_exp));
  spl_exp = _mm256_srai_epi32(spl_exp, 23);
  __m256 log2_exponent = _mm256_cvtepi32_ps(spl_exp);
  __m256 num = spl_mantissa;
  num = _mm256_add_ps(num, c1);
  num = _mm256_fmadd_ps(num, spl_mantissa, c0);
  num = _mm256_fmsub_ps(num, spl_mantissa, num);
  __m256 denom = d2;
  denom = _mm256_fmadd_ps(denom, spl_mantissa, d1);
  denom = _mm256_fmadd_ps(denom, spl_mantissa, d0);
  __m256 res = _mm256_div_ps(num, denom);
  res = _mm256_add_ps(log2_exponent, res);
  return res;
}
__m256 log_fma(__m256 x) {
  const __m256 convert_e = _mm256_set1_ps(0.69314718056);
  return _mm256_mul_ps(log2_fma(x), convert_e);
}
__m256 log10_fma(__m256 x) {
  const __m256 convert_10 = _mm256_set1_ps(0.301029995664);
  return _mm256_mul_ps(log2_fma(x), convert_10);
}
Perf:

Code: Select all

log2_sse : 2.22534
log_sse: 2.40723
log10_sse : 2.40771
log2_fma : 0.837646
log_fma : 0.936523
log10_fma : 0.932373

fmath::log_ps 2.20972
msvc __vdecl_logf8 3.31006
msvc __vdecl_logf4 4.52124
msvc logf 17.8723
msvc log2f 106.934
msvc __vdecl_log10f8 2.92383
msvc __vdecl_log10f4 4.82275
msvc log10f 19.6677
Error is around 15-17 ULPs for these functions on (std::numeric_limits<float>::min(),std::numeric_limits<float>::max()).
I can get it below 6ULPs with 1 more mul+add, but error/perf ratio is better on this one.

To my surprise, fmath::log_ps is not that great: error is around 4000 ULPs and it does a lookup (32Kb table if I got this right) - it may perform worse in real code compared to this microbenchmark.

It is possible to construct a fast vectorized pow function from log2 and exp2.

Code: Select all

pow(x,y)=exp2(log2(x)*y)

Post

Lovely. :)

Post Reply

Return to “DSP and Plugin Development”