Emu_mm_sincos_ps ported to sse2 and wasm

Ported from MathLib, licensed MIT
Original code here
Ported to sse2, but can compile to avx and fma inst if compiler optimized it.

SSE2:

AVX:

AVX+FMA:

x86 benchmark:

SSE2 Code:

#include <math.h>
#include <xmmintrin.h>
#include <emmintrin.h>

typedef __m128 v4f;
typedef __m128i v4i;

#define v4f_madd(a, b, c)                               _mm_fmadd_ps(a, b, c)
#define v4f_abs(v)                                      _mm_andnot_ps(c_v4f_Sign, v)
#define v4f_equal(a, b)                                 _mm_cmpeq_ps(a, b)
#define _v4f_is_inf(x)                                  v4f_equal(v4f_abs(x), _mm_set1_ps(INFINITY))
#ifdef __AVX__
#include <immintrin.h>
#else
#define _mm_blendv_ps(a, b, mask) _mm_xor_ps(a, _mm_and_ps(mask, _mm_xor_ps(b, a)))
#endif
#ifndef __FMA__
#define _mm_fmadd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
#endif
#define v4f_select(a, b, mask)                          _mm_blendv_ps(a, b, mask)

#define _v4f_vselect(mask, x, y)                        v4f_select(y, x, mask)

#define Cf_PI4_A 0.78515625f
#define Cf_PI4_B 0.00024187564849853515625f
#define Cf_PI4_C 3.7747668102383613586e-08f
#define Cf_PI4_D 1.2816720341285448015e-12

v4f emu_mm_sincos_ps(v4f* pCos, const v4f d)
{
    v4i q = _mm_cvtps_epi32(_mm_mul_ps(d, _mm_set1_ps(0.63661977236758134308f)));

    v4f s = d;

    v4f u = _mm_cvtepi32_ps(q);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_A * 2.0f), s);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_B * 2.0f), s);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_C * 2.0f), s);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_D * 2.0f), s);

    v4f t = s;

    s = _mm_mul_ps(s, s);

    u = _mm_set1_ps(-0.000195169282960705459117889f);
    u = v4f_madd(u, s, _mm_set1_ps(0.00833215750753879547119141f));
    u = v4f_madd(u, s, _mm_set1_ps(-0.166666537523269653320312f));
    u = _mm_mul_ps(_mm_mul_ps(u, s), t);

    v4f rx = _mm_add_ps(t, u);

    u = _mm_set1_ps(-2.71811842367242206819355e-07f);
    u = v4f_madd(u, s, _mm_set1_ps(2.47990446951007470488548e-05f));
    u = v4f_madd(u, s, _mm_set1_ps(-0.00138888787478208541870117f));
    u = v4f_madd(u, s, _mm_set1_ps(0.0416666641831398010253906f));
    u = v4f_madd(u, s, _mm_set1_ps(-0.5f));

    v4f ry = v4f_madd(s, u, _mm_set1_ps(1.0f));

    v4f m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(1)), _mm_set1_epi32(0)));
    v4f rrx = _v4f_vselect(m, rx, ry);
    v4f rry = _v4f_vselect(m, ry, rx);
    const v4f c_v4f_Sign = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(2)), _mm_set1_epi32(2)));
    rrx = _mm_xor_ps(_mm_and_ps(m, c_v4f_Sign), rrx);

    m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, _mm_set1_epi32(1)), _mm_set1_epi32(2)), _mm_set1_epi32(2)));
    rry = _mm_xor_ps(_mm_and_ps(m, c_v4f_Sign), rry);

    m = _v4f_is_inf(d);

    *pCos = _mm_or_ps(m, rry);

    return _mm_or_ps(m, rrx);
}

#undef v4f_madd
#undef v4f_abs
#undef v4f_equal
#undef _v4f_is_inf
#undef v4f_select
#undef _v4f_vselect

#undef Cf_PI4_A
#undef Cf_PI4_B
#undef Cf_PI4_C
#undef Cf_PI4_D

#ifndef __AVX__
#undef _mm_blendv_ps
#endif
#ifndef __FMA__
#undef _mm_fmadd_ps
#endif

The wasm code:

#include <wasm_simd128.h>

typedef __f32x4 v4f;
typedef __i32x4 v4i;

#define v4f_madd(a, b, c)                               _mm_fmadd_ps(a, b, c)
#define v4f_abs(v)                                      wasm_f32x4_abs(v)
#define v4f_equal(a, b)                                 wasm_f32x4_eq(a, b)
#define _v4f_is_inf(x)                                  v4f_equal(v4f_abs(x), wasm_i32x4_const_splat(0x7F800000))
#define _mm_blendv_ps(a, b, mask)  wasm_v128_bitselect(b, a, mask)
#define _mm_fmadd_ps(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(a, b), c)
#define v4f_select(a, b, mask)                          _mm_blendv_ps(a, b, mask)

#define _v4f_vselect(mask, x, y)                        v4f_select(y, x, mask)

#define Cf_PI4_A 0.78515625f
#define Cf_PI4_B 0.00024187564849853515625f
#define Cf_PI4_C 3.7747668102383613586e-08f
#define Cf_PI4_D 1.2816720341285448015e-12

static inline v4f emu_mm_sincos_ps(v4f* pCos, const v4f d)
{

    v4i q = wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(wasm_f32x4_mul(d, wasm_f32x4_const_splat(0.63661977236758134308f))));

    v4f s = d;

    v4f u = wasm_f32x4_convert_i32x4(q);
    s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_A * 2.0f), s);
    s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_B * 2.0f), s);
    s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_C * 2.0f), s);
    s = v4f_madd(u, wasm_f32x4_const_splat(-Cf_PI4_D * 2.0f), s);

    v4f t = s;

    s = wasm_f32x4_mul(s, s);

    u = wasm_f32x4_const_splat(-0.000195169282960705459117889f);
    u = v4f_madd(u, s, wasm_f32x4_const_splat(0.00833215750753879547119141f));
    u = v4f_madd(u, s, wasm_f32x4_const_splat(-0.166666537523269653320312f));
    u = wasm_f32x4_mul(wasm_f32x4_mul(u, s), t);

    v4f rx = wasm_f32x4_add(t, u);

    u = wasm_f32x4_const_splat(-2.71811842367242206819355e-07f);
    u = v4f_madd(u, s, wasm_f32x4_const_splat(2.47990446951007470488548e-05f));
    u = v4f_madd(u, s, wasm_f32x4_const_splat(-0.00138888787478208541870117f));
    u = v4f_madd(u, s, wasm_f32x4_const_splat(0.0416666641831398010253906f));
    u = v4f_madd(u, s, wasm_f32x4_const_splat(-0.5f));

    v4f ry = v4f_madd(s, u, wasm_f32x4_const_splat(1.0f));

    v4f m = (v4f) (wasm_i32x4_eq(wasm_v128_and(q, wasm_i32x4_splat(1)), wasm_i32x4_splat(0)));
    v4f rrx = _v4f_vselect(m, rx, ry);
    v4f rry = _v4f_vselect(m, ry, rx);
    const v4f c_v4f_Sign = (wasm_i32x4_splat(0x80000000));

    m = (v4f) (wasm_i32x4_eq(wasm_v128_and(q, wasm_i32x4_splat(2)), wasm_i32x4_splat(2)));
    rrx = wasm_v128_xor(wasm_v128_and(m, c_v4f_Sign), rrx);

    m = (v4f) (wasm_i32x4_eq(wasm_v128_and(wasm_i32x4_add(q, wasm_i32x4_splat(1)), wasm_i32x4_splat(2)), wasm_i32x4_splat(2)));
    rry = wasm_v128_xor(wasm_v128_and(m, c_v4f_Sign), rry);

    m = _v4f_is_inf(d);

    *pCos = wasm_v128_or(m, rry);

    return wasm_v128_or(m, rrx);
}

#undef v4f_madd
#undef v4f_abs
#undef v4f_equal
#undef _v4f_is_inf
#undef v4f_select
#undef _v4f_vselect

#undef Cf_PI4_A
#undef Cf_PI4_B
#undef Cf_PI4_C
#undef Cf_PI4_D

#undef _mm_blendv_ps
#undef _mm_fmadd_ps

A playground for benchmarking is available here:

It shows about 8x speed vs js in modern chrome:
image

2 Likes

Hello!
Quite impressive but I don’t have a clue what are you talking about LOL
I believe the majority of others (197 views) neither understand this but I’m the one who is not ashamed to ask :smiley:

It would be cool to shed some light on this.

Is it about extending wasm functionality by enabling it to call low level cpu functionality thus making it faster?

This code is a SIMD optimisation of the Sincos algorithm (simultaneous calculation of sine and cosine), which uses vector instructions for four floating point values to perform parallel calculations on a single CPU and thus improve performance. It uses the instruction set extension SSE4.1 (Streaming SIMD Extensions 4.1) for x86 processors. The code may be used for calculations in 3D computer graphics or simulations.

v4f emu_mm_sincos_ps(v4f* pCos, const v4f d)

This function accepts two vectors of type v4f. The first is a pointer to a vector in which the result for cosine is stored. The second vector contains the input values for the angles in radians. v4f is probably a structure or an alias for a 128-bit vector with four floating point numbers.

pi.ai

1 Like

Thanks! What does “ported to sse2 and wasm” mean? I know what sse2 is but why does it need to be ported? It means that now one can use sse2 support for sincos in wasm?

Well, it’s originally coded in AVX, I ported it to sse so older cpu can use it. Also, I ported it to wasm so it can use wasm simd128 extension when being compiled to wasm.