// Also on Compiler Explorer
#include <wasm_simd128.h>
/* Alias for the WebAssembly 128-bit vector type. */
#define glmm_128 v128_t
/*
 * Permutes the four 32-bit lanes of xmm. The selectors are listed in reverse
 * lane order: result lane 0 takes lane `w` of xmm, lane 1 takes `x`, lane 2
 * takes `y` and lane 3 takes `z`. Note that xmm appears twice in the
 * expansion, so it should be a side-effect-free expression.
 */
#define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z)
/* Broadcasts lane `lane` of x into all four lanes. */
#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)
/* Broadcast helpers for lane 0 (x), lane 1 (y), lane 2 (z) and lane 3 (w). */
#define glmm_splat_x(x) glmm_splat(x, 0)
#define glmm_splat_y(x) glmm_splat(x, 1)
#define glmm_splat_z(x) glmm_splat(x, 2)
#define glmm_splat_w(x) glmm_splat(x, 3)
/* Four consecutive floats. */
typedef float vec4[4];
/* Quaternion stored in a vec4. */
typedef vec4 versor; /* |x, y, z, w| -> w is the last */
/* Four vec4 values, i.e. 16 consecutive floats. */
typedef vec4 mat4[4];
/*
 * Composes a scale / rotation / translation matrix using WebAssembly SIMD.
 *
 * scale:       x, y, z, padding (all four floats are loaded; lane 3 is unused)
 * rotation:    x, y, z, w (quaternion, w last; it is not normalized here)
 * translation: x, y, z, padding (lane 3 is replaced by 1 in the output)
 * result:      receives m[0..15]; result[i] holds m[4*i] .. m[4*i + 3]
 *
 * Scalar reference, with x2 = x + x, xx = x * x2, xy = x * y2, wx = w * x2
 * and so on for the other components:
 *
 *   m[0]  = (1 - (yy + zz)) * sx;  m[1]  = (xy + wz) * sx;
 *   m[2]  = (xz - wy) * sx;        m[3]  = 0;
 *   m[4]  = (xy - wz) * sy;        m[5]  = (1 - (xx + zz)) * sy;
 *   m[6]  = (yz + wx) * sy;        m[7]  = 0;
 *   m[8]  = (xz + wy) * sz;        m[9]  = (yz - wx) * sz;
 *   m[10] = (1 - (xx + yy)) * sz;  m[11] = 0;
 *   m[12] = tx;  m[13] = ty;  m[14] = tz;  m[15] = 1;
 *
 * Lanes marked "pad" below hold don't-care values that are overwritten with 0
 * before the row is stored.
 */
void ComposeToRef(vec4 scale, versor rotation, vec4 translation, mat4 result) {
  // x, y, z, w
  glmm_128 xyzw = wasm_v128_load(rotation);
  // x2, y2, z2, w2
  glmm_128 xyzw2 = wasm_f32x4_add(xyzw, xyzw);
  // xx, yy, zz, ww
  glmm_128 xx_yy_zz_ww = wasm_f32x4_mul(xyzw, xyzw2);

  // x, x, y, x
  glmm_128 a = wasm_i32x4_shuffle(xyzw, xyzw, 0, 0, 1, 0);
  // y2, z2, z2, w2
  glmm_128 b = wasm_i32x4_shuffle(xyzw2, xyzw2, 1, 2, 2, 3);
  // xy, xz, yz, xw
  glmm_128 xy_xz_yz_xw = wasm_f32x4_mul(a, b);

  // wx, wy, wz, ww: the w component (lane 3) is the one to broadcast here
  glmm_128 wx_wy_wz_ww = wasm_f32x4_mul(glmm_splat_w(xyzw), xyzw2);

  // sx, sy, sz, padding
  glmm_128 s = wasm_v128_load(scale);

  // yy, xx, xx, xx
  a = wasm_i32x4_shuffle(xx_yy_zz_ww, xx_yy_zz_ww, 1, 0, 0, 0);
  // zz, zz, yy, xx
  b = wasm_i32x4_shuffle(xx_yy_zz_ww, xx_yy_zz_ww, 2, 2, 1, 0);
  // yy + zz, xx + zz, xx + yy, pad
  a = wasm_f32x4_add(a, b);
  // 1 - (yy + zz), 1 - (xx + zz), 1 - (xx + yy), pad
  glmm_128 diag = wasm_f32x4_sub(wasm_f32x4_const_splat(1.f), a);

  // xy, yz, xz, pad
  a = wasm_i32x4_shuffle(xy_xz_yz_xw, xy_xz_yz_xw, 0, 2, 1, 0);
  // wz, wx, wy, pad
  b = wasm_i32x4_shuffle(wx_wy_wz_ww, wx_wy_wz_ww, 2, 0, 1, 0);
  // xy + wz, yz + wx, xz + wy, pad
  glmm_128 sums = wasm_f32x4_add(a, b);

  // xz, xy, yz, pad
  a = wasm_i32x4_shuffle(xy_xz_yz_xw, xy_xz_yz_xw, 1, 0, 2, 0);
  // wy, wz, wx, pad
  b = wasm_i32x4_shuffle(wx_wy_wz_ww, wx_wy_wz_ww, 1, 2, 0, 0);
  // xz - wy, xy - wz, yz - wx, pad
  glmm_128 diffs = wasm_f32x4_sub(a, b);

  // Interleave the three term vectors so every row needs one more shuffle.
  // 1 - (yy + zz), xy + wz, 1 - (xx + zz), yz + wx
  a = wasm_i32x4_shuffle(diag, sums, 0, 4, 1, 5);
  // xz - wy, xy - wz, xz + wy, yz - wx
  b = wasm_i32x4_shuffle(diffs, sums, 0, 1, 6, 2);

  // m[0..3]: 1 - (yy + zz), xy + wz, xz - wy, pad
  glmm_128 row = wasm_i32x4_shuffle(a, b, 0, 1, 4, 0);
  row = wasm_f32x4_mul(row, glmm_splat_x(s));
  row = wasm_f32x4_replace_lane(row, 3, 0.f);
  wasm_v128_store(result[0], row);

  // m[4..7]: xy - wz, 1 - (xx + zz), yz + wx, pad
  // Built from the unscaled vectors a and b, not from the stored first row.
  row = wasm_i32x4_shuffle(a, b, 5, 2, 3, 0);
  row = wasm_f32x4_mul(row, glmm_splat_y(s));
  row = wasm_f32x4_replace_lane(row, 3, 0.f);
  wasm_v128_store(result[1], row);

  // m[8..11]: xz + wy, yz - wx, 1 - (xx + yy), pad
  // 1 - (xx + yy) only exists in lane 2 of diag, so take it from there.
  row = wasm_i32x4_shuffle(b, diag, 2, 3, 6, 7);
  row = wasm_f32x4_mul(row, glmm_splat_z(s));
  row = wasm_f32x4_replace_lane(row, 3, 0.f);
  wasm_v128_store(result[2], row);

  // m[12..15]: tx, ty, tz, 1
  row = wasm_v128_load(translation);
  row = wasm_f32x4_replace_lane(row, 3, 1.f);
  wasm_v128_store(result[3], row);
}