Comparing performance with and without SIMD:
Without SIMD (avg 3.694 ms in my local Chrome):
With SIMD (avg 3.277 ms in my local Chrome, ~11% difference):
Source:
#include <cglm/cglm.h>
extern unsigned char __heap_base;
// Returns the address of the externally defined `__heap_base` symbol,
// rounded up to the next multiple of 64 bytes (an address that is already
// a multiple of 64 is returned unchanged).
uintptr_t get_heap_base() {
    const uintptr_t align_mask = 63;
    const uintptr_t raw_addr = (uintptr_t) &__heap_base;

    // Adding the mask and then clearing the low six bits rounds upward.
    return (raw_addr + align_mask) & ~align_mask;
}
// Post-multiplies each of `count` 4x4 matrices, in place, by a bounding-box
// transform built from the matching min/max pair.
//
// minmax: `count` consecutive boxes, 8 floats each: min[4] followed by
//         max[4]. Only the x, y, z components end up in the matrix.
// mat:    `count` consecutive 4x4 matrices (4 vec4 columns each). Every
//         matrix M is overwritten with M * B, where B scales by (max - min)
//         and translates to the box centre min + (max - min) / 2.
// count:  number of boxes / matrices to process; 0 is a no-op.
//
// Returns `count` converted to unsigned.
//
// NOTE(review): depending on build flags, the cglm vector helpers may use
// aligned SIMD loads; this presumably expects `minmax` and `mat` to be
// suitably aligned by the caller -- confirm.
unsigned bbox_compose(float * minmax, vec4 * mat, size_t count) {
    CGLM_ALIGN_MAT mat4 tmp_mat;
    CGLM_ALIGN_MAT vec4 diff, median;

    // Start from identity; only the scale diagonal and the translation
    // column are rewritten per box, so the remaining entries stay valid.
    glm_mat4_identity(tmp_mat);

    for (size_t i = 0; i < count; i++) {
        float * min = minmax;
        float * max = minmax + 4;
        minmax += 8;

        glm_vec4_sub(max, min, diff);          // diff   = max - min
        glm_vec4_scale(diff, 0.5f, median);    // median = diff / 2
        glm_vec4_add(min, median, median);     // median = min + diff / 2

        // cglm matrices are column-major and indexed as m[column][row].
        // The scale sits on the diagonal and the translation occupies
        // column 3 (flat indices 12..14). Flat indices 3, 7 and 11 are the
        // w components of columns 0..2 and must stay 0 for an affine
        // transform.
        tmp_mat[0][0] = diff[0];    // Scale X
        tmp_mat[1][1] = diff[1];    // Scale Y
        tmp_mat[2][2] = diff[2];    // Scale Z
        tmp_mat[3][0] = median[0];  // Translate X
        tmp_mat[3][1] = median[1];  // Translate Y
        tmp_mat[3][2] = median[2];  // Translate Z

        // dest aliases the first operand: mat = mat * tmp_mat.
        glm_mat4_mul(mat, tmp_mat, mat);
        mat += 4;  // advance by four vec4 columns, i.e. one mat4
    }

    return (unsigned) count;
}