ComposeToRef in SIMD

Also on Compiler Explorer

#include <wasm_simd128.h>
/* Vector type used by the glmm_* helpers: an alias for v128_t. */
#define glmm_128     v128_t

/* Shuffle the 32-bit lanes of xmm with itself. The four selectors are
   forwarded to wasm_i32x4_shuffle in REVERSE order: the last macro argument
   (w) becomes the first selector and the first (z) becomes the last.
   Note that xmm appears twice in the expansion, so do not pass an
   expression with side effects. */
#define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z)

/* Shuffle x using the same selector `lane` in all four positions. */
#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)

/* Shorthands for glmm_splat with lane 0 (x), 1 (y), 2 (z) and 3 (w). */
#define glmm_splat_x(x) glmm_splat(x, 0)
#define glmm_splat_y(x) glmm_splat(x, 1)
#define glmm_splat_z(x) glmm_splat(x, 2)
#define glmm_splat_w(x) glmm_splat(x, 3)

/* Four consecutive floats. */
typedef float vec4[4];
/* Quaternion stored with the same layout as vec4. */
typedef vec4                    versor;     /* |x, y, z, w| -> w is the last */
/* Four vec4 rows stored back to back (16 floats). */
typedef vec4  mat4[4];

// Compose a 4x4 transform from a scale, a rotation quaternion and a
// translation, using WebAssembly SIMD.
//
// scale:       x, y, z, padding  (all 4 floats are loaded; the 4th is unused)
// rotation:    x, y, z, w        (used as-is; it is not normalized here)
// translation: x, y, z, padding  (all 4 floats are loaded; the 4th is replaced by 1)
// result:      output, 16 floats m[0..15] with m[4 * i + j] == result[i][j]
//
// With x2 = x + x, xx = x * x2, xy = x * y2, wx = w * x2, ... the output is:
//
//   m[0]  = (1 - (yy + zz)) * sx   m[1]  = (xy + wz) * sx
//   m[2]  = (xz - wy) * sx         m[3]  = 0
//   m[4]  = (xy - wz) * sy         m[5]  = (1 - (xx + zz)) * sy
//   m[6]  = (yz + wx) * sy         m[7]  = 0
//   m[8]  = (xz + wy) * sz         m[9]  = (yz - wx) * sz
//   m[10] = (1 - (xx + yy)) * sz   m[11] = 0
//   m[12] = tx   m[13] = ty   m[14] = tz   m[15] = 1
//
// In the lane comments below "-" marks a lane whose value is never used.
void ComposeToRef(vec4 scale, versor rotation, vec4 translation, mat4 result) {
    // (x, y, z, w) and (x2, y2, z2, w2)
    glmm_128 xyzw = wasm_v128_load(rotation);
    glmm_128 xyzw2 = wasm_f32x4_add(xyzw, xyzw);

    // (xx, yy, zz, ww)
    glmm_128 squares = wasm_f32x4_mul(xyzw, xyzw2);

    // (x, x, y, x) * (y2, z2, z2, w2) = (xy, xz, yz, -)
    glmm_128 a = wasm_i32x4_shuffle(xyzw, xyzw, 0, 0, 1, 0);
    glmm_128 b = wasm_i32x4_shuffle(xyzw2, xyzw2, 1, 2, 2, 3);
    glmm_128 xy_xz_yz = wasm_f32x4_mul(a, b);

    // (w, w, w, w) * (x2, y2, z2, w2) = (wx, wy, wz, -)
    // The splat must broadcast w (lane 3), not x.
    glmm_128 wx_wy_wz = wasm_f32x4_mul(glmm_splat_w(xyzw), xyzw2);

    // (sx, sy, sz, padding)
    glmm_128 s = wasm_v128_load(scale);

    // diag = 1 - ((yy, xx, xx, -) + (zz, zz, yy, -))
    //      = (1 - (yy + zz), 1 - (xx + zz), 1 - (xx + yy), -)
    a = wasm_i32x4_shuffle(squares, squares, 1, 0, 0, 0);
    b = wasm_i32x4_shuffle(squares, squares, 2, 2, 1, 0);
    glmm_128 diag = wasm_f32x4_sub(wasm_f32x4_const_splat(1.f),
                                   wasm_f32x4_add(a, b));

    // sums = (xy, yz, xz, -) + (wz, wx, wy, -) = (xy + wz, yz + wx, xz + wy, -)
    a = wasm_i32x4_shuffle(xy_xz_yz, xy_xz_yz, 0, 2, 1, 0);
    b = wasm_i32x4_shuffle(wx_wy_wz, wx_wy_wz, 2, 0, 1, 0);
    glmm_128 sums = wasm_f32x4_add(a, b);

    // diffs = (xz, xy, yz, -) - (wy, wz, wx, -) = (xz - wy, xy - wz, yz - wx, -)
    a = wasm_i32x4_shuffle(xy_xz_yz, xy_xz_yz, 1, 0, 2, 0);
    b = wasm_i32x4_shuffle(wx_wy_wz, wx_wy_wz, 1, 2, 0, 0);
    glmm_128 diffs = wasm_f32x4_sub(a, b);

    // Interleave the three vectors so that every row is one shuffle away.
    // a = (1 - (yy + zz), xy + wz, 1 - (xx + zz), yz + wx)
    a = wasm_i32x4_shuffle(diag, sums, 0, 4, 1, 5);
    // b = (xz - wy, xy - wz, xz + wy, yz - wx)
    b = wasm_i32x4_shuffle(diffs, sums, 0, 1, 6, 2);

    // Row 0: (1 - (yy + zz), xy + wz, xz - wy, -) * sx, then lane 3 = 0.
    glmm_128 row = wasm_i32x4_shuffle(a, b, 0, 1, 4, 0);
    row = wasm_f32x4_mul(row, glmm_splat_x(s));
    row = wasm_f32x4_replace_lane(row, 3, 0.f);
    wasm_v128_store(result[0], row);

    // Row 1: (xy - wz, 1 - (xx + zz), yz + wx, -) * sy, then lane 3 = 0.
    // Built from the unscaled a/b, never from the already-scaled row 0.
    row = wasm_i32x4_shuffle(a, b, 5, 2, 3, 0);
    row = wasm_f32x4_mul(row, glmm_splat_y(s));
    row = wasm_f32x4_replace_lane(row, 3, 0.f);
    wasm_v128_store(result[1], row);

    // Row 2: (xz + wy, yz - wx, 1 - (xx + yy), -) * sz, then lane 3 = 0.
    // The diagonal term comes from diag lane 2; a only carries lanes 0 and 1.
    row = wasm_i32x4_shuffle(b, diag, 2, 3, 6, 7);
    row = wasm_f32x4_mul(row, glmm_splat_z(s));
    row = wasm_f32x4_replace_lane(row, 3, 0.f);
    wasm_v128_store(result[2], row);

    // Row 3: (tx, ty, tz, 1)
    row = wasm_v128_load(translation);
    row = wasm_f32x4_replace_lane(row, 3, 1.f);
    wasm_v128_store(result[3], row);
}
2 Likes