Batched or Instanced BoundingBoxRenderer

Comparing performance with or without SIMD:

Without SIMD (avg 3.694 ms in Chrome on my machine):

With SIMD (avg 3.277 ms in Chrome on my machine, ~11% faster):

source
#include <cglm/cglm.h>

extern unsigned char __heap_base;

// Returns the address of the externally defined __heap_base symbol, rounded
// up to the next 64-byte boundary (unchanged if it is already aligned).
uintptr_t get_heap_base(void) {
    // Keep the mask in uintptr_t so the complement covers the full pointer
    // width without relying on sign-extension of an int constant.
    const uintptr_t align_mask = 63;  // 64-byte alignment, minus one
    return ((uintptr_t) &__heap_base + align_mask) & ~align_mask;
}

// Composes a per-box scale/translation into each of `count` matrices.
//
// For every i in [0, count):
//   - reads 8 floats from `minmax`: the box minimum (4 floats) followed by
//     the box maximum (4 floats); only x/y/z end up in the matrix
//   - builds a box matrix whose scale is (max - min) and whose translation
//     is the box centre, min + (max - min) / 2
//   - overwrites the i-th matrix in `mat` (4 consecutive vec4 columns) with
//     mat_i * box_i
//
// Returns `count`, converted to unsigned.
//
// NOTE(review): SIMD builds of cglm may expect `minmax` and `mat` to be
// 16-byte aligned; confirm against the caller's buffer layout.
unsigned bbox_compose(float * minmax, vec4 * mat, size_t count) {
    CGLM_ALIGN_MAT mat4 box_mat;
    CGLM_ALIGN_MAT vec4 extent, center;

    // Identity is set once; the loop overwrites only the six entries that
    // vary per box, so the rest of the matrix stays identity.
    glm_mat4_identity(box_mat);

    for (size_t i = 0; i < count; i++) {
        float *min = minmax;
        float *max = minmax + 4;
        minmax += 8;

        glm_vec4_sub(max, min, extent);        // extent = max - min
        glm_vec4_scale(extent, 0.5f, center);  // center = extent / 2
        glm_vec4_add(min, center, center);     // center = min + extent / 2

        // cglm matrices are column-major and indexed as [column][row]:
        // scale sits on the diagonal and translation in column 3 (flat
        // indices 12, 13, 14). Flat indices 3, 7, 11 are the w-row of
        // columns 0..2 and must stay zero for an affine transform.
        box_mat[0][0] = extent[0];  // Scale X
        box_mat[1][1] = extent[1];  // Scale Y
        box_mat[2][2] = extent[2];  // Scale Z

        box_mat[3][0] = center[0];  // Translate X
        box_mat[3][1] = center[1];  // Translate Y
        box_mat[3][2] = center[2];  // Translate Z

        // In-place update: mat_i = mat_i * box_mat.
        glm_mat4_mul(mat, box_mat, mat);
        mat += 4;  // advance one mat4 (four vec4 columns)
    }
    return (unsigned) count;
}
1 Like