Emu_mm_sincos_ps ported to sse2

Ported from MathLib, licensed MIT
Original code here
Ported to sse2, but can compile to avx and fma inst if compiler optimized it.

SSE2:

AVX:

AVX+FMA:

Code:

#include <math.h>
#include <xmmintrin.h>
#include <emmintrin.h>

typedef __m128 v4f;
typedef __m128i v4i;

#define v4f_madd(a, b, c)                               _mm_fmadd_ps(a, b, c)
#define v4f_abs(v)                                      _mm_andnot_ps(c_v4f_Sign, v)
#define v4f_equal(a, b)                                 _mm_cmpeq_ps(a, b)
#define _v4f_is_inf(x)                                  v4f_equal(v4f_abs(x), _mm_set1_ps(INFINITY))
#ifdef __AVX__
#include <immintrin.h>
#else
#define _mm_blendv_ps(a, b, mask) _mm_xor_ps(a, _mm_and_ps(mask, _mm_xor_ps(b, a)))
#endif
#ifndef __FMA__
#define _mm_fmadd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
#endif
#define v4f_select(a, b, mask)                          _mm_blendv_ps(a, b, mask)

#define _v4f_vselect(mask, x, y)                        v4f_select(y, x, mask)

#define Cf_PI4_A 0.78515625f
#define Cf_PI4_B 0.00024187564849853515625f
#define Cf_PI4_C 3.7747668102383613586e-08f
#define Cf_PI4_D 1.2816720341285448015e-12

v4f emu_mm_sincos_ps(v4f* pCos, const v4f d)
{
    v4i q = _mm_cvtps_epi32(_mm_mul_ps(d, _mm_set1_ps(0.63661977236758134308f)));

    v4f s = d;

    v4f u = _mm_cvtepi32_ps(q);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_A * 2.0f), s);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_B * 2.0f), s);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_C * 2.0f), s);
    s = v4f_madd(u, _mm_set1_ps(-Cf_PI4_D * 2.0f), s);

    v4f t = s;

    s = _mm_mul_ps(s, s);

    u = _mm_set1_ps(-0.000195169282960705459117889f);
    u = v4f_madd(u, s, _mm_set1_ps(0.00833215750753879547119141f));
    u = v4f_madd(u, s, _mm_set1_ps(-0.166666537523269653320312f));
    u = _mm_mul_ps(_mm_mul_ps(u, s), t);

    v4f rx = _mm_add_ps(t, u);

    u = _mm_set1_ps(-2.71811842367242206819355e-07f);
    u = v4f_madd(u, s, _mm_set1_ps(2.47990446951007470488548e-05f));
    u = v4f_madd(u, s, _mm_set1_ps(-0.00138888787478208541870117f));
    u = v4f_madd(u, s, _mm_set1_ps(0.0416666641831398010253906f));
    u = v4f_madd(u, s, _mm_set1_ps(-0.5f));

    v4f ry = v4f_madd(s, u, _mm_set1_ps(1.0f));

    v4f m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(1)), _mm_set1_epi32(0)));
    v4f rrx = _v4f_vselect(m, rx, ry);
    v4f rry = _v4f_vselect(m, ry, rx);
    const v4f c_v4f_Sign = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(2)), _mm_set1_epi32(2)));
    rrx = _mm_xor_ps(_mm_and_ps(m, c_v4f_Sign), rrx);

    m = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, _mm_set1_epi32(1)), _mm_set1_epi32(2)), _mm_set1_epi32(2)));
    rry = _mm_xor_ps(_mm_and_ps(m, c_v4f_Sign), rry);

    m = _v4f_is_inf(d);

    *pCos = _mm_or_ps(m, rry);

    return _mm_or_ps(m, rrx);
}

#undef v4f_madd
#undef v4f_abs
#undef v4f_equal
#undef _v4f_is_inf
#undef v4f_select
#undef _v4f_vselect

#undef Cf_PI4_A
#undef Cf_PI4_B
#undef Cf_PI4_C
#undef Cf_PI4_D

#ifndef __AVX__
#undef _mm_blendv_ps
#endif
#ifndef __FMA__
#undef _mm_fmadd_ps
#endif