RotationYawPitchRollToRef in SIMD

kzhsw · March 19, 2023, 1:34pm

#include <math.h>
#include <wasm_simd128.h>
typedef float vec4[4];
typedef float quat[4];

// angles: yaw, pitch, roll, 0, using vec4 to force align
inline void RotationYawPitchRollToRef(vec4 angles, quat dest) {
    /*  // The original code
        const halfRoll = roll * 0.5;
        const halfPitch = pitch * 0.5;
        const halfYaw = yaw * 0.5;
        const sinRoll = Math.sin(halfRoll);
        const cosRoll = Math.cos(halfRoll);
        const sinPitch = Math.sin(halfPitch);
        const cosPitch = Math.cos(halfPitch);
        const sinYaw = Math.sin(halfYaw);
        const cosYaw = Math.cos(halfYaw);
        result._x = cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
        result._y = sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
        result._z = cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
        result._w = cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
    */
#if defined(USE_SIMD_SINCOS)
    a = wasm_f32x4_mul(wasm_v128_load(angles), wasm_f32x4_const_splat(0.5f));
    sin = (v128_t) emu_mm_sincos_ps((v4f *)&cos, a);
#else
    {
        // half_vec not needed outside this block
        vec4 half_vec;
        // halfPitch, halfYaw, halfRoll, 0
        wasm_v128_store(half_vec, wasm_f32x4_mul(wasm_v128_load(angles), wasm_f32x4_const_splat(0.5f)));
        // non-simd sinf, the performance bottleneck
        // sinYaw, sinPitch, sinRoll, 0
        sin = wasm_f32x4_make(sinf(half_vec[1]), sinf(half_vec[0]), sinf(half_vec[2]), 0);
    }
    // use simd to get cos from sin
    // https://stackoverflow.com/a/2683608
    // cos(x)^2 = 1 - sin(x)^2
    // cos(x) = sqrt(1 - sin(x)^2)
    // cosYaw, cosPitch, cosRoll, 0
    cos = wasm_f32x4_sqrt(wasm_f32x4_sub(wasm_f32x4_const_splat(1.0f), wasm_f32x4_mul(sin, sin)));
#endif
    // cosYaw, sinYaw, cosYaw, cosYaw
    b = wasm_i32x4_shuffle(sin, cos, 4, 0, 4, 4);
    // sinPitch, cosPitch, cosPitch, cosPitch
    c = wasm_i32x4_shuffle(sin, cos, 1, 5, 5, 5);
    // cosRoll, cosRoll, sinRoll, cosRoll
    d = wasm_i32x4_shuffle(sin, cos, 6, 6, 2, 6);
    // cosYaw * sinPitch * cosRoll
    // sinYaw * cosPitch * cosRoll
    // cosYaw * cosPitch * sinRoll
    // cosYaw * cosPitch * cosRoll
    a = wasm_f32x4_mul(wasm_f32x4_mul(b, c), d);
    // sinYaw, cosYaw, sinYaw, sinYaw
    b = wasm_i32x4_shuffle(sin, cos, 0, 4, 0, 0);
    // sinYaw, -cosYaw, -sinYaw, sinYaw
    b = wasm_v128_xor(b, wasm_f32x4_const(0.f, -0.f, -0.f, 0.f));
    // cosPitch, sinPitch, sinPitch, sinPitch
    c = wasm_i32x4_shuffle(sin, cos, 5, 1, 1, 1);
    // sinRoll, sinRoll, cosRoll, sinRoll
    d = wasm_f32x4_shuffle(sin, cos, 2, 2, 6, 2);
    // cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
    // sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
    // cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
    // cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
    a = wasm_f32x4_add(a, wasm_f32x4_mul(wasm_f32x4_mul(b, c), d));
    wasm_v128_store(dest, a);
}

// Converts yaw/pitch/roll angles (radians, as consumed by sinf/cosf) into a
// rotation quaternion using wasm SIMD.
//
// angles: { yaw, pitch, roll }. Exactly three floats are read, one at a time,
//         so a tightly packed vec3 is safe to pass.
// dest:   receives { x, y, z, w }.
typedef float vec3[3];
inline void RotationYawPitchRollToRefUnaligned(vec3 angles, quat dest) {
    /*
        const halfRoll = roll * 0.5;
        const halfPitch = pitch * 0.5;
        const halfYaw = yaw * 0.5;
        const sinRoll = Math.sin(halfRoll);
        const cosRoll = Math.cos(halfRoll);
        const sinPitch = Math.sin(halfPitch);
        const cosPitch = Math.cos(halfPitch);
        const sinYaw = Math.sin(halfYaw);
        const cosYaw = Math.cos(halfYaw);
        result._x = cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
        result._y = sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
        result._z = cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
        result._w = cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
    */
    // The trig calls are scalar anyway, so halve the angles in scalar code
    // instead of round-tripping through a temporary vec4.
    const float halfYaw = angles[0] * 0.5f;
    const float halfPitch = angles[1] * 0.5f;
    const float halfRoll = angles[2] * 0.5f;

    // sinYaw, sinPitch, sinRoll, (unused)
    const v128_t sn = wasm_f32x4_make(sinf(halfYaw), sinf(halfPitch), sinf(halfRoll), 0.0f);
    // cosYaw, cosPitch, cosRoll, (unused)
    // cos is computed directly. sqrt(1 - sin^2) would only give |cos| and
    // therefore a wrong rotation once a half angle leaves [-pi/2, pi/2].
    const v128_t cs = wasm_f32x4_make(cosf(halfYaw), cosf(halfPitch), cosf(halfRoll), 1.0f);

    // Shuffle indices 0-3 pick lanes from sn, 4-7 pick lanes from cs.

    // First product of every component:
    //   cosYaw * sinPitch * cosRoll
    //   sinYaw * cosPitch * cosRoll
    //   cosYaw * cosPitch * sinRoll
    //   cosYaw * cosPitch * cosRoll
    // cosYaw, sinYaw, cosYaw, cosYaw
    const v128_t yaw1 = wasm_i32x4_shuffle(sn, cs, 4, 0, 4, 4);
    // sinPitch, cosPitch, cosPitch, cosPitch
    const v128_t pitch1 = wasm_i32x4_shuffle(sn, cs, 1, 5, 5, 5);
    // cosRoll, cosRoll, sinRoll, cosRoll
    const v128_t roll1 = wasm_i32x4_shuffle(sn, cs, 6, 6, 2, 6);
    const v128_t first = wasm_f32x4_mul(wasm_f32x4_mul(yaw1, pitch1), roll1);

    // Second product of every component, with the sign folded into the yaw
    // factor:
    //   + sinYaw * cosPitch * sinRoll
    //   - cosYaw * sinPitch * sinRoll
    //   - sinYaw * sinPitch * cosRoll
    //   + sinYaw * sinPitch * sinRoll
    // Flip the IEEE-754 sign bit of lanes 1 and 2. An integer mask is used
    // instead of a -0.f float constant, which fast-math builds may fold to +0.
    const v128_t sign = wasm_i32x4_const(0, INT32_MIN, INT32_MIN, 0);
    // sinYaw, -cosYaw, -sinYaw, sinYaw
    const v128_t yaw2 = wasm_v128_xor(wasm_i32x4_shuffle(sn, cs, 0, 4, 0, 0), sign);
    // cosPitch, sinPitch, sinPitch, sinPitch
    const v128_t pitch2 = wasm_i32x4_shuffle(sn, cs, 5, 1, 1, 1);
    // sinRoll, sinRoll, cosRoll, sinRoll
    const v128_t roll2 = wasm_i32x4_shuffle(sn, cs, 2, 2, 6, 2);
    const v128_t second = wasm_f32x4_mul(wasm_f32x4_mul(yaw2, pitch2), roll2);

    // x, y, z, w
    wasm_v128_store(dest, wasm_f32x4_add(first, second));
}

With emu_mm_sincos_ps, it can be faster.

Edit: For those who want to use this, note that -0.f can be miscompiled by some compilers when fast-math is enabled.

jeremy-coleman · March 19, 2023, 10:41pm

Cool. Did you try without using intrinsics?

kzhsw · March 21, 2023, 1:04am

Auto-Vectorization or without SIMD?
This function is a port of Quaternion.RotationYawPitchRollToRef to wasm SIMD; the original implementation is quoted in the comment inside the function.
This is not benchmarked yet.

jeremy-coleman · March 21, 2023, 3:04pm

Autovectorization

kzhsw · March 22, 2023, 3:11am

While compilers do an excellent job of auto-vectorization on the x86_64 platform, they cannot yet do the same when targeting wasm simd128, as tested below.

code

/* Type your code here, or load an example. */
#include <math.h>

typedef float vec3[3];
typedef float versor[4];

void RotationYawPitchRollToRefScalar(vec3 angles, versor dest) {
        float halfRoll = angles[2] * 0.5f;
        float halfPitch = angles[1] * 0.5f;
        float halfYaw = angles[0] * 0.5f;
        float sinRoll = sinf(halfRoll);
        float cosRoll = cosf(halfRoll);
        float sinPitch = sinf(halfPitch);
        float cosPitch = cosf(halfPitch);
        float sinYaw = sinf(halfYaw);
        float cosYaw = cosf(halfYaw);
        dest[0] = cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
        dest[1] = sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
        dest[2] = cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
        dest[3] = cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
}

x86_64 compiler output

Compile options:

clang -O3 -nostartfiles -Wl,--export=RotationYawPitchRollToRefScalar -Wl,--no-entry -Wl,--allow-undefined test.c -o clang_O3.wasm
clang -O3 -nostartfiles -Wl,--export=RotationYawPitchRollToRefScalar -Wl,--no-entry -Wl,--allow-undefined -msimd128 test.c -o clang_O3_msimd128.wasm

compiled_wasm.zip (11.0 KB)

jeremy-coleman · March 22, 2023, 6:48am

I've had the same problems. If you print the bailout info, clang knows it can autovectorize; it just doesn't do it. There must be a way -_-

Topic		Replies	Views
ComposeToRef in SIMD Off topic math , matrix , simd	0	229	November 21, 2023
Emu_mm_sincos_ps ported to sse2 and wasm Off topic math	4	262	June 22, 2024
Some tweaks toEulerAnglesToRef()? Questions math	2	398	May 25, 2022
Normal-Rust, a test of Rust / WebAssembly Demos and projects	12	1257	September 5, 2021
Optimizing performance of FromArrayToRef Feature requests quaternion , performance , vector3 , math	11	605	January 27, 2023

RotationYawPitchRollToRef in SIMD

Related topics