RotationYawPitchRollToRef in SIMD

#include <math.h>
#include <wasm_simd128.h>
// Four floats, i.e. 16 bytes: the same size as one v128_t SIMD value.
// NOTE(review): a plain array typedef carries only float alignment; it does
// not by itself guarantee 16-byte alignment of the storage.
typedef float vec4[4];
// Quaternion stored as four floats in x, y, z, w order.
typedef float quat[4];

// Port of Quaternion.RotationYawPitchRollToRef to wasm SIMD.
//
// angles: yaw, pitch, roll, 0 in radians. Four floats are loaded with a single
//         128-bit load, so the argument must point to at least four floats;
//         the fourth value is ignored.
// dest:   receives the resulting quaternion as x, y, z, w (four floats are
//         written with a single 128-bit store).
inline void RotationYawPitchRollToRef(vec4 angles, quat dest) {
    /*  // The original code
        const halfRoll = roll * 0.5;
        const halfPitch = pitch * 0.5;
        const halfYaw = yaw * 0.5;
        const sinRoll = Math.sin(halfRoll);
        const cosRoll = Math.cos(halfRoll);
        const sinPitch = Math.sin(halfPitch);
        const cosPitch = Math.cos(halfPitch);
        const sinYaw = Math.sin(halfYaw);
        const cosYaw = Math.cos(halfYaw);
        result._x = cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
        result._y = sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
        result._z = cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
        result._w = cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
    */
    v128_t sin_v, cos_v;
    {
        // half_vec is not needed outside this block.
        vec4 half_vec;
        // halfYaw, halfPitch, halfRoll, 0
        wasm_v128_store(half_vec, wasm_f32x4_mul(wasm_v128_load(angles), wasm_f32x4_const_splat(0.5f)));
        // There is no SIMD sine/cosine, so both are evaluated per lane.
        // sinYaw, sinPitch, sinRoll, (unused)
        sin_v = wasm_f32x4_make(sinf(half_vec[0]), sinf(half_vec[1]), sinf(half_vec[2]), 0.0f);
        // cosYaw, cosPitch, cosRoll, (unused)
        // cos must not be derived as sqrt(1 - sin^2): the square root is never
        // negative, but cos(half angle) is negative whenever |angle| > pi, so
        // that shortcut silently produces a wrong quaternion for such inputs.
        cos_v = wasm_f32x4_make(cosf(half_vec[0]), cosf(half_vec[1]), cosf(half_vec[2]), 1.0f);
    }

    // Shuffle indices 0-3 select lanes of sin_v, 4-7 select lanes of cos_v:
    //   0 = sinYaw, 1 = sinPitch, 2 = sinRoll
    //   4 = cosYaw, 5 = cosPitch, 6 = cosRoll

    // First product of every component.
    // cosYaw, sinYaw, cosYaw, cosYaw
    const v128_t yaw_1 = wasm_i32x4_shuffle(sin_v, cos_v, 4, 0, 4, 4);
    // sinPitch, cosPitch, cosPitch, cosPitch
    const v128_t pitch_1 = wasm_i32x4_shuffle(sin_v, cos_v, 1, 5, 5, 5);
    // cosRoll, cosRoll, sinRoll, cosRoll
    const v128_t roll_1 = wasm_i32x4_shuffle(sin_v, cos_v, 6, 6, 2, 6);
    // cosYaw * sinPitch * cosRoll
    // sinYaw * cosPitch * cosRoll
    // cosYaw * cosPitch * sinRoll
    // cosYaw * cosPitch * cosRoll
    const v128_t first = wasm_f32x4_mul(wasm_f32x4_mul(yaw_1, pitch_1), roll_1);

    // Second product of every component; y and z subtract it, so the sign of
    // the yaw factor is flipped in those two lanes by toggling the sign bit.
    // sinYaw, cosYaw, sinYaw, sinYaw
    v128_t yaw_2 = wasm_i32x4_shuffle(sin_v, cos_v, 0, 4, 0, 0);
    // sinYaw, -cosYaw, -sinYaw, sinYaw
    yaw_2 = wasm_v128_xor(yaw_2, wasm_f32x4_const(0.f, -0.f, -0.f, 0.f));
    // cosPitch, sinPitch, sinPitch, sinPitch
    const v128_t pitch_2 = wasm_i32x4_shuffle(sin_v, cos_v, 5, 1, 1, 1);
    // sinRoll, sinRoll, cosRoll, sinRoll
    const v128_t roll_2 = wasm_i32x4_shuffle(sin_v, cos_v, 2, 2, 6, 2);
    const v128_t second = wasm_f32x4_mul(wasm_f32x4_mul(yaw_2, pitch_2), roll_2);

    // cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
    // sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
    // cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
    // cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
    wasm_v128_store(dest, wasm_f32x4_add(first, second));
}

// angles: yaw, pitch, roll
typedef float vec3[3];
inline void RotationYawPitchRollToRefUnaligned(vec3 angles, quat dest) {
    /*
        const halfRoll = roll * 0.5;
        const halfPitch = pitch * 0.5;
        const halfYaw = yaw * 0.5;
        const sinRoll = Math.sin(halfRoll);
        const cosRoll = Math.cos(halfRoll);
        const sinPitch = Math.sin(halfPitch);
        const cosPitch = Math.cos(halfPitch);
        const sinYaw = Math.sin(halfYaw);
        const cosYaw = Math.cos(halfYaw);
        result._x = cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
        result._y = sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
        result._z = cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
        result._w = cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
    */
    v128_t sin, cos, a, b, c, d;
    {
        // half_vec not needed outside this block
        vec4 half_vec;
        half_vec[0] = angles[0];
        half_vec[1] = angles[1];
        half_vec[2] = angles[2];
        half_vec[3] = 0; // make sure this is initialized
        // halfYaw, halfPitch, halfRoll, 0
        wasm_v128_store(half_vec, wasm_f32x4_mul(wasm_v128_load(half_vec), wasm_f32x4_const_splat(0.5)));
        // non-simd sinf and cosf
        // sinYaw, sinPitch, sinRoll, 0
        sin = wasm_f32x4_make(sinf(half_vec[0]), sinf(half_vec[1]), sinf(half_vec[2]), 0);
    }
    // https://stackoverflow.com/a/2683608
    // cos(x)^2 = 1 - sin(x)^2
    // cos(x) = sqrt(1 - sin(x)^2)
    // cosYaw, cosPitch, cosRoll, 0
    cos = wasm_f32x4_sqrt(wasm_f32x4_add(wasm_f32x4_const_splat(1.0f), wasm_f32x4_neg(wasm_f32x4_mul(sin, sin))));
    // cosYaw, sinYaw, cosYaw, cosYaw
    b = wasm_i32x4_shuffle(sin, cos, 4, 0, 4, 4);
    // sinPitch, cosPitch, cosPitch, cosPitch
    c = wasm_i32x4_shuffle(sin, cos, 1, 5, 5, 5);
    // cosRoll, cosRoll, sinRoll, cosRoll
    d = wasm_i32x4_shuffle(sin, cos, 6, 6, 2, 6);
    // cosYaw * sinPitch * cosRoll
    // sinYaw * cosPitch * cosRoll
    // cosYaw * cosPitch * sinRoll
    // cosYaw * cosPitch * cosRoll
    a = wasm_f32x4_mul(wasm_f32x4_mul(b, c), d);
    // sinYaw, cosYaw, sinYaw, sinYaw
    b = wasm_i32x4_shuffle(sin, cos, 0, 4, 0, 0);
    // sinYaw, -cosYaw, -sinYaw, sinYaw
    b = wasm_v128_xor(b, wasm_f32x4_const(0.f, -0.f, -0.f, 0.f));
    // cosPitch, sinPitch, sinPitch, sinPitch
    c = wasm_i32x4_shuffle(sin, cos, 5, 1, 1, 1);
    // sinRoll, sinRoll, cosRoll, sinRoll
    d = wasm_i32x4_shuffle(sin, cos, 4, 0, 4, 4);
    // cosYaw * sinPitch * cosRoll + sinYaw * cosPitch * sinRoll;
    // sinYaw * cosPitch * cosRoll - cosYaw * sinPitch * sinRoll;
    // cosYaw * cosPitch * sinRoll - sinYaw * sinPitch * cosRoll;
    // cosYaw * cosPitch * cosRoll + sinYaw * sinPitch * sinRoll;
    a = wasm_f32x4_add(a, wasm_f32x4_mul(wasm_f32x4_mul(b, c), d));
    wasm_v128_store(dest, a);
}

1 Like

Cool. Did you try without using intrinsics?

Auto-Vectorization or without SIMD?
This function is a port of Quaternion.RotationYawPitchRollToRef to wasm SIMD; the original implementation is quoted in the "The original code" comment at the top of the function.
This is not benchmarked yet.

Autovectorization

While compilers do an excellent job of auto-vectorization on the x86_64 platform, they cannot yet do the same when targeting wasm simd128, as the test below shows.

code
/* Type your code here, or load an example. */
#include <math.h>

// Three floats: yaw, pitch, roll.
typedef float vec3[3];
// Four floats: quaternion x, y, z, w.
typedef float versor[4];

// Scalar reference implementation: converts yaw/pitch/roll angles (radians)
// into a quaternion written to dest as x, y, z, w.
void RotationYawPitchRollToRefScalar(vec3 angles, versor dest) {
        // Every trigonometric term uses the half angle.
        const float hr = angles[2] * 0.5f;
        const float hp = angles[1] * 0.5f;
        const float hy = angles[0] * 0.5f;

        // s* = sine, c* = cosine of the half roll (r), pitch (p) and yaw (y).
        const float sr = sinf(hr);
        const float cr = cosf(hr);
        const float sp = sinf(hp);
        const float cp = cosf(hp);
        const float sy = sinf(hy);
        const float cy = cosf(hy);

        // All inputs have been read above, before anything is written to dest.
        float *const out = dest;
        out[0] = cy * sp * cr + sy * cp * sr; // x
        out[1] = sy * cp * cr - cy * sp * sr; // y
        out[2] = cy * cp * sr - sy * sp * cr; // z
        out[3] = cy * cp * cr + sy * sp * sr; // w
}
x86_64 compiler output



Compile options:

clang -O3 -nostartfiles -Wl,--export=RotationYawPitchRollToRefScalar -Wl,--no-entry -Wl,--allow-undefined test.c -o clang_O3.wasm
clang -O3 -nostartfiles -Wl,--export=RotationYawPitchRollToRefScalar -Wl,--no-entry -Wl,--allow-undefined -msimd128 test.c -o clang_O3_msimd128.wasm

compiled_wasm.zip (11.0 KB)

I've had the same problems. If you print the bailout info, clang knows it can auto-vectorize; it just doesn't do it. There must be a way -_-