Ported from MathLib (MIT licensed).
Original code here.
The port targets SSE2, but the compiler can also emit AVX and FMA instructions when optimizing; example flags follow the links below.
SSE2:
AVX:
AVX+FMA:
x86 benchmark:
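For reference, flags along these lines (assuming GCC or Clang; the file name is just a placeholder) produce the three variants linked above:

cc -O3 -msse2 sincos.c            (baseline SSE2)
cc -O3 -mavx sincos.c             (AVX encodings)
cc -O3 -mavx -mfma sincos.c       (AVX + fused multiply-add)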
SSE2 Code:
#include <math.h>
#include <xmmintrin.h>
#include <emmintrin.h>
typedef __m128 v4f;
typedef __m128i v4i;
#define v4f_madd(a, b, c) _mm_fmadd_ps(a, b, c)
#define v4f_abs(v) _mm_andnot_ps(c_v4f_Sign, v)
#define v4f_equal(a, b) _mm_cmpeq_ps(a, b)
#define _v4f_is_inf(x) v4f_equal(v4f_abs(x), _mm_set1_ps(INFINITY))
#ifdef __AVX__
#include <immintrin.h>
#else
#define _mm_blendv_ps(a, b, mask) _mm_xor_ps(a, _mm_and_ps(mask, _mm_xor_ps(b, a)))
#endif
#ifndef __FMA__
#define _mm_fmadd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
#endif
#define v4f_select(a, b, mask) _mm_blendv_ps(a, b, mask)
#define _v4f_vselect(mask, x, y) v4f_select(y, x, mask)
#define Cf_PI4_A 0.78515625f
#define Cf_PI4_B 0.00024187564849853515625f
#define Cf_PI4_C 3.7747668102383613586e-08f
#define Cf_PI4_D 1.2816720341285448015e-12f
v4f emu_mm_sincos_ps(v4f* pCos, const v4f d)
{
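/* q = nearest integer to d * (2/pi): the quadrant index for each lane */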
v4i q = _mm_cvtps_epi32(_mm_mul_ps(d, _mm_set1_ps(0.63661977236758134308f)));
v4f s = d;
v4f u = _mm_cvtepi32_ps(q);
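/* Cody-Waite argument reduction: subtract q * pi/2 in four pieces (pi/2 = 2*(PI4_A+PI4_B+PI4_C+PI4_D)) to preserve precision */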
s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_A * 2.0f), s);
s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_B * 2.0f), s);
s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_C * 2.0f), s);
s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_D * 2.0f), s);
v4f t = s;
s = _mm_mul_ps(s, s);
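/* minimax polynomial for sine: rx approximates sin(t), with t the reduced argument and s = t*t */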
u = _mm_set1_ps(-0.000195169282960705459117889f);
u = v4f_madd(u, s, _mm_set1_ps(0.00833215750753879547119141f));
u = v4f_madd(u, s, _mm_set1_ps(-0.166666537523269653320312f));
u = _mm_mul_ps(_mm_mul_ps(u, s), t);
v4f rx = _mm_add_ps(t, u);
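/* minimax polynomial for cosine: ry approximates cos(t) */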
u = _mm_set1_ps(-2.71811842367242206819355e-07f);
u = v4f_madd(u, s, _mm_set1_ps(2.47990446951007470488548e-05f));
u = v4f_madd(u, s, _mm_set1_ps(-0.00138888787478208541870117f));
u = v4f_madd(u, s, _mm_set1_ps(0.0416666641831398010253906f));
u = v4f_madd(u, s, _mm_set1_ps(-0.5f));
v4f ry = v4f_madd(s, u, _mm_set1_ps(1.0f));
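/* even quadrants keep (sin, cos) as computed; odd quadrants swap them */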
v4f m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(1)), _mm_set1_epi32(0)));
v4f rrx = _v4f_vselect(m, rx, ry);
v4f rry = _v4f_vselect(m, ry, rx);
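/* flip signs by quadrant: the sine result is negated for q mod 4 in {2,3}, the cosine result for q mod 4 in {1,2} */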
const v4f c_v4f_Sign = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(2)), _mm_set1_epi32(2)));
rrx = _mm_xor_ps(_mm_and_ps(m, c_v4f_Sign), rrx);
m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, _mm_set1_epi32(1)), _mm_set1_epi32(2)), _mm_set1_epi32(2)));
rry = _mm_xor_ps(_mm_and_ps(m, c_v4f_Sign), rry);
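/* infinite inputs: ORing with the all-ones compare mask turns both results into NaN */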
m = _v4f_is_inf(d);
*pCos = _mm_or_ps(m, rry);
return _mm_or_ps(m, rrx);
}
#undef v4f_madd
#undef v4f_abs
#undef v4f_equal
#undef _v4f_is_inf
#undef v4f_select
#undef _v4f_vselect
#undef Cf_PI4_A
#undef Cf_PI4_B
#undef Cf_PI4_C
#undef Cf_PI4_D
#ifndef __AVX__
#undef _mm_blendv_ps
#endif
#ifndef __FMA__
#undef _mm_fmadd_ps
#endif
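A quick sanity check (this harness is mine, not part of the original port) that can be appended to the same file and compared against scalar sinf/cosf:

#include <stdio.h>

int main(void)
{
    float in[4] = { -1.0f, 0.0f, 0.5f, 3.0f };
    v4f c;
    v4f s = emu_mm_sincos_ps(&c, _mm_loadu_ps(in));
    float sf[4], cf[4];
    _mm_storeu_ps(sf, s);
    _mm_storeu_ps(cf, c);
    for (int i = 0; i < 4; ++i)
        printf("x=% .4f  sin=% .7f (libm % .7f)  cos=% .7f (libm % .7f)\n",
               in[i], sf[i], sinf(in[i]), cf[i], cosf(in[i]));
    return 0;
}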
WASM Code:
#include <wasm_simd128.h>
typedef __f32x4 v4f;
typedef __i32x4 v4i;
#define v4f_madd(a, b, c) _mm_fmadd_ps(a, b, c)
#define v4f_abs(v) wasm_f32x4_abs(v)
#define v4f_equal(a, b) wasm_f32x4_eq(a, b)
#define _v4f_is_inf(x) v4f_equal(v4f_abs(x), wasm_i32x4_const_splat(0x7F800000))
#define _mm_blendv_ps(a, b, mask) wasm_v128_bitselect(b, a, mask)
#define _mm_fmadd_ps(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(a, b), c)
#define v4f_select(a, b, mask) _mm_blendv_ps(a, b, mask)
#define _v4f_vselect(mask, x, y) v4f_select(y, x, mask)
#define Cf_PI4_A 0.78515625f
#define Cf_PI4_B 0.00024187564849853515625f
#define Cf_PI4_C 3.7747668102383613586e-08f
#define Cf_PI4_D 1.2816720341285448015e-12f
static inline v4f emu_mm_sincos_ps(v4f* pCos, const v4f d)
{
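/* Same algorithm as the SSE2 version above; only the intrinsics differ. */
/* nearest + trunc_sat reproduces the round-to-nearest conversion of _mm_cvtps_epi32 */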
v4i q = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(wasm_f32x4_mul(d, wasm_f32x4_const_splat(0.63661977236758134308f))));
v4f s = d;
v4f u = wasm_f32x4_convert_i32x4(q);
s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_A * 2.0f), s);
s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_B * 2.0f), s);
s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_C * 2.0f), s);
s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_D * 2.0f), s);
v4f t = s;
s = wasm_f32x4_mul(s, s);
u = wasm_f32x4_const_splat(-0.000195169282960705459117889f);
u = v4f_madd(u, s, wasm_f32x4_const_splat(0.00833215750753879547119141f));
u = v4f_madd(u, s, wasm_f32x4_const_splat(-0.166666537523269653320312f));
u = wasm_f32x4_mul(wasm_f32x4_mul(u, s), t);
v4f rx = wasm_f32x4_add(t, u);
u = wasm_f32x4_const_splat(-2.71811842367242206819355e-07f);
u = v4f_madd(u, s, wasm_f32x4_const_splat(2.47990446951007470488548e-05f));
u = v4f_madd(u, s, wasm_f32x4_const_splat(-0.00138888787478208541870117f));
u = v4f_madd(u, s, wasm_f32x4_const_splat(0.0416666641831398010253906f));
u = v4f_madd(u, s, wasm_f32x4_const_splat(-0.5f));
v4f ry = v4f_madd(s, u, wasm_f32x4_const_splat(1.0f));
v4f m = (v4f) (wasm_i32x4_eq(wasm_v128_and(q, wasm_i32x4_splat(1)), wasm_i32x4_splat(0)));
v4f rrx = _v4f_vselect(m, rx, ry);
v4f rry = _v4f_vselect(m, ry, rx);
const v4f c_v4f_Sign = (v4f) (wasm_i32x4_splat(0x80000000));
m = (v4f) (wasm_i32x4_eq(wasm_v128_and(q, wasm_i32x4_splat(2)), wasm_i32x4_splat(2)));
rrx = wasm_v128_xor(wasm_v128_and(m, c_v4f_Sign), rrx);
m = (v4f) (wasm_i32x4_eq(wasm_v128_and(wasm_i32x4_add(q, wasm_i32x4_splat(1)), wasm_i32x4_splat(2)), wasm_i32x4_splat(2)));
rry = wasm_v128_xor(wasm_v128_and(m, c_v4f_Sign), rry);
m = _v4f_is_inf(d);
*pCos = wasm_v128_or(m, rry);
return wasm_v128_or(m, rrx);
}
#undef v4f_madd
#undef v4f_abs
#undef v4f_equal
#undef _v4f_is_inf
#undef v4f_select
#undef _v4f_vselect
#undef Cf_PI4_A
#undef Cf_PI4_B
#undef Cf_PI4_C
#undef Cf_PI4_D
#undef _mm_blendv_ps
#undef _mm_fmadd_ps
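Building the WASM version requires the simd128 target feature, e.g. with Emscripten (the file name is just a placeholder):

emcc -O3 -msimd128 sincos_wasm.c -o sincos.js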
A playground for benchmarking is available here:
It shows roughly an 8x speedup over plain JS in modern Chrome:
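For context, here is a rough sketch (my own code, not the playground's) of the kind of kernel such a benchmark measures: sine and cosine over a buffer of angles, four lanes per iteration, using the WASM version above:

void sincos_buffer(const float* angles, float* out_sin, float* out_cos, int n)
{
    for (int i = 0; i + 4 <= n; i += 4) {
        v4f c;
        v4f s = emu_mm_sincos_ps(&c, (v4f) wasm_v128_load(angles + i));
        wasm_v128_store(out_sin + i, (v128_t) s);
        wasm_v128_store(out_cos + i, (v128_t) c);
    }
    /* any remaining n % 4 elements can fall back to scalar sinf/cosf */
}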