I have been focusing on getting this package into good shape for an initial release, and have been proving out other useful concepts with the WASM kernels along the way.
I was curious what custom culling on instances would look like when scaling up. Here is a quick snapshot of 70k instances in memory, using SIMD in WASM to do a frustum-culling pass each frame; the pass takes less than 1 ms of CPU time. This is effectively instance culling, since it controls the instance draw count. There is some index indirection in the code to point back to the true contiguous list of visible instances.
This uses the npm release of shader-object, which is close to finished. I will put the source up on GitHub soon and create a “how-to” video.
Here is the PG with the above sample
Here is the relevant AssemblyScript WASM SIMD pass for frustum culling. I am geeking out over how performant this is; pretty cool stuff.
// Reads the 16 bytes at `ptr` as one v128 value (a single packed plane).
@inline
function loadPlane(ptr: usize): v128 {
  const plane = v128.load(ptr);
  return plane;
}
// Returns a copy of plane `p` with lane 3 forced to 0.0. With that lane
// cleared, a lane-wise multiply against an instance vector contributes
// nothing from the instance's .w (scale) lane.
@inline
function planeNormal0(p: v128): v128 {
  const normalOnly = f32x4.replace_lane(p, 3, 0.0);
  return normalOnly;
}
// Extracts lane 3 of plane `p`, i.e. the plane's d term.
@inline
function planeD(p: v128): f32 {
  const d = f32x4.extract_lane(p, 3);
  return d;
}
/**
 * Sphere-vs-six-planes test.
 *
 * Returns 1 if the sphere is inside (or touching) all six planes, 0 as soon
 * as one plane rejects it. The sphere radius is `baseRadius * scale`, where
 * `scale` is read from lane 3 of `pos`.
 *
 * @param pos        instance vector; lanes 0..2 are used as xyz, lane 3 as scale
 * @param baseRadius unscaled bounding-sphere radius
 * @param pn0s       six plane vectors with lane 3 already zeroed (see planeNormal0)
 * @param ds         the six matching plane d terms (see planeD)
 */
@inline
function inside6(pos: v128, baseRadius: f32, pn0s: StaticArray<v128>, ds: StaticArray<f32>): i32 {
  // The radius depends only on the instance, so compute it once up front.
  const radius = baseRadius * f32x4.extract_lane(pos, 3);
  // For each plane the sphere is kept when: dot(n, xyz) + d >= -radius
  for (let k = 0; k < 6; k++) {
    const mul = f32x4.mul(pos, unchecked(pn0s[k]));
    // Horizontal sum of the xyz lanes only; lane 3 is never read back.
    const dot = f32x4.extract_lane(mul, 0)
      + f32x4.extract_lane(mul, 1)
      + f32x4.extract_lane(mul, 2);
    if (dot + unchecked(ds[k]) < -radius) return 0;
  }
  return 1;
}

/**
 * Frustum-culls every instance in the pool whose header lives at `base`.
 *
 * For each instance the visibleIndex field is first reset to -1. Every
 * instance that passes the six-plane sphere test then has its index `i`
 * written into the visibleIndex field of the next free *packed* slot, which
 * starts at instance 0 and advances by one instance stride per visible hit.
 * On return, the first `visibleCount` instances' visibleIndex fields hold the
 * indices of the visible instances in ascending order and the rest hold -1.
 *
 * @param base       address of the InstancePoolHeader
 * @param planesPtr  address of six consecutive 16-byte planes; lanes 0..2 are
 *                   used as the normal and lane 3 as d
 * @param baseRadius unscaled bounding-sphere radius shared by all instances
 */
export function frustumMarkAoS(
  base: usize,
  planesPtr: usize,
  baseRadius: f32
): void {
  const h = changetype<InstancePoolHeader>(base);
  const count = <i32>h.instancesCount;
  if (count <= 0) {
    h.visibleCount = 0;
    return;
  }

  // Preload planes, derive the (n, d) split once per call.
  // NOTE(review): the array literals plus StaticArray.fromArray presumably
  // allocate on every call; consider reusable scratch storage if that shows
  // up in profiles.
  const p0 = loadPlane(planesPtr + 0 * 16);
  const p1 = loadPlane(planesPtr + 1 * 16);
  const p2 = loadPlane(planesPtr + 2 * 16);
  const p3 = loadPlane(planesPtr + 3 * 16);
  const p4 = loadPlane(planesPtr + 4 * 16);
  const p5 = loadPlane(planesPtr + 5 * 16);
  const pn0s = StaticArray.fromArray<v128>([
    planeNormal0(p0), planeNormal0(p1), planeNormal0(p2),
    planeNormal0(p3), planeNormal0(p4), planeNormal0(p5)
  ]);
  const ds = StaticArray.fromArray<f32>([
    planeD(p0), planeD(p1), planeD(p2),
    planeD(p3), planeD(p4), planeD(p5)
  ]);

  // Read pointer walks all instances.
  let readPtr = h.instancesPtr;
  // Write head points at the packed area's next visibleIndex slot (array head).
  let writeHead = h.instancesPtr + <usize>OFFSET_ActorInstance_visibleIndex;
  let visCount = 0;

  for (let i = 0; i < count; i++) {
    // Clear this instance's slot BEFORE the packed write below: when every
    // instance so far was visible, writeHead aliases this very slot.
    store<i32>(readPtr + <usize>OFFSET_ActorInstance_visibleIndex, -1);

    const pos = v128.load(readPtr + <usize>OFFSET_ActorInstance_translation);
    // Shared plane test; also computes the scaled radius once per instance
    // instead of once per plane.
    if (inside6(pos, baseRadius, pn0s, ds)) {
      store<i32>(writeHead, i);
      writeHead += <usize>SIZEOF_ActorInstanceHeader;
      visCount++;
    }
    readPtr += <usize>SIZEOF_ActorInstanceHeader;
  }

  h.visibleCount = visCount;
}
UPDATE: Added a distance-check portion to the cull function as an easy early-out cull path. I also wanted to stress-test the memory limits of the ShaderObject; here are 150k instances running at 60 FPS.