WebGL Transform Feedback — GPU Particle Simulation

Transform Feedback captures vertex shader output back into a GPU buffer, bypassing the CPU entirely. The result: 1,000,000 particles updating at 60 FPS, compared to fewer than 5,000 in a naive CPU loop. This tutorial walks through the complete WebGL2 implementation — from buffer setup to the ping-pong pattern to NaN debugging.

1. How Transform Feedback Works

In a standard WebGL pipeline, vertex shader output feeds the rasterizer and then the fragment shader. Transform Feedback intercepts that output and writes it into buffers bound through a Transform Feedback Object (TFO) — a small state object that records which ordinary GPU buffers receive the captured varyings. Rasterization, and with it the fragment shader, is skipped during this pass by enabling RASTERIZER_DISCARD.

Frame N: BufferA (pos, vel) → VERTEX SHADER → TFO → BufferB (new pos, new vel). Frame N+1: BufferB → VERTEX SHADER → TFO → BufferA. The two buffers swap roles every frame — the "ping-pong".

The vertex shader here is not computing screen positions — it's computing physics. You pass particle data as attributes, run Euler integration in GLSL, and write gl_Position (overridden) or custom varyings to the TFO.

2. Buffer and VAO Setup

Each particle stores 4 floats for position (x, y, z, age) and 3 floats for velocity (vx, vy, vz). Create two interleaved buffers and two VAOs for the ping-pong:

// Layout constants shared by the buffer setup and the draw calls.
const N      = 1_000_000;                       // number of particles
const STRIDE = 4 + 3;                           // floats per particle: pos (x, y, z, age) + vel (vx, vy, vz)
const BYTES  = Float32Array.BYTES_PER_ELEMENT;  // size of one float32 in bytes (4)

// Seed data for every particle, interleaved as [x, y, z, age, vx, vy, vz]:
// positions in a 20-unit cube around the origin, ages spread over 0..5 so the
// particles do not all expire together, and velocities biased upwards.
const init = new Float32Array(N * STRIDE);
for (let base = 0; base < init.length; base += STRIDE) {
  let slot = base;
  init[slot++] = (Math.random() - 0.5) * 20;  // x
  init[slot++] = (Math.random() - 0.5) * 20;  // y
  init[slot++] = (Math.random() - 0.5) * 20;  // z
  init[slot++] = Math.random() * 5;           // age
  init[slot++] = Math.random() - 0.5;         // vx
  init[slot++] = Math.random() * 2 + 1;       // vy (upward bias)
  init[slot++] = Math.random() - 0.5;         // vz
}

/**
 * Create a GPU buffer and upload `data` into it through the ARRAY_BUFFER target.
 *
 * The new buffer is left bound to gl.ARRAY_BUFFER when this returns.
 *
 * @param {ArrayBufferView|ArrayBuffer} data - Initial contents to upload.
 * @param {number} [usage=gl.STREAM_COPY] - Usage hint passed to gl.bufferData.
 *   The default suits buffers that are written by the GPU (transform feedback)
 *   and read back by the GPU; pass e.g. gl.STATIC_DRAW for upload-once data.
 * @returns {WebGLBuffer} The newly created buffer.
 */
function makeBuffer(data, usage = gl.STREAM_COPY) {
  const buf = gl.createBuffer();
  gl.bindBuffer(gl.ARRAY_BUFFER, buf);
  gl.bufferData(gl.ARRAY_BUFFER, data, usage);
  return buf;
}

// The two ping-pong buffers. Both start from the same seed data; whichever one
// is the write target of an update pass has its contents replaced by that pass.
const [bufA, bufB] = [init, init].map((seed) => makeBuffer(seed));

/**
 * Create a VAO that feeds an interleaved particle buffer to `prog`.
 *
 * Each particle occupies STRIDE floats: 4 for `a_pos` (x, y, z, age) at byte
 * offset 0, followed by 3 for `a_vel` (vx, vy, vz).
 *
 * Attributes that `prog` does not expose are skipped: getAttribLocation
 * returns -1 for them (for example when a shader declares an input but never
 * reads it), and passing -1 on to enableVertexAttribArray / vertexAttribPointer
 * would only raise GL errors.
 *
 * The VAO and the ARRAY_BUFFER binding are both released before returning, so
 * later GL calls cannot modify this VAO by accident and `buf` is not left bound
 * to ARRAY_BUFFER, where it would clash with its later use as a
 * transform-feedback output.
 *
 * @param {WebGLBuffer} buf - Interleaved particle buffer to read from.
 * @param {WebGLProgram} prog - Linked program whose attribute locations to use.
 * @returns {WebGLVertexArrayObject} The configured VAO.
 */
function makeVAO(buf, prog) {
  const vao = gl.createVertexArray();
  gl.bindVertexArray(vao);
  gl.bindBuffer(gl.ARRAY_BUFFER, buf);

  const strideBytes = STRIDE * BYTES;
  const layout = [
    { name: 'a_pos', size: 4, offset: 0 },
    { name: 'a_vel', size: 3, offset: 4 * BYTES },
  ];

  for (const { name, size, offset } of layout) {
    const loc = gl.getAttribLocation(prog, name);
    if (loc < 0) {
      continue; // attribute not active in this program
    }
    gl.enableVertexAttribArray(loc);
    gl.vertexAttribPointer(loc, size, gl.FLOAT, false, strideBytes, offset);
  }

  gl.bindVertexArray(null);
  gl.bindBuffer(gl.ARRAY_BUFFER, null);
  return vao;
}

// One input VAO per ping-pong buffer, both laid out for updateProg's
// attributes: vaoA sources its vertices from bufA, vaoB from bufB.
// makeVAO queries attribute locations on updateProg, so the program has to be
// linked by the time this runs.
const [vaoA, vaoB] = [bufA, bufB].map((buf) => makeVAO(buf, updateProg));

3. The Particle Update Shader

This is a vertex shader that runs physics, not geometry. Declare output varyings and register them with transformFeedbackVaryings before linking the program:

// particles.vert — update pass.
// Vertex shader source for the transform-feedback physics step. For each
// particle it reads a_pos / a_vel, advances the age by u_dt, respawns the
// particle at the origin once its age exceeds u_maxAge, applies gravity and
// drag, integrates the position with an Euler step, and writes the result to
// the v_pos / v_vel outputs.
// The respawn velocity is a hash of gl_VertexID only, so a given particle gets
// the same respawn velocity every time it dies.
// NOTE(review): this comment used to say "no fragment shader needed". Nothing
// is rasterized in this pass, but WebGL2 is expected to refuse to link a
// program that has no fragment shader attached, so a trivial no-op fragment
// shader probably still has to be supplied — confirm against the
// program-creation code.
const updateVS = `#version 300 es
precision highp float;

in vec4 a_pos;   // xyz = position, w = age
in vec3 a_vel;   // velocity

out vec4 v_pos;  // NEW position → TFO
out vec3 v_vel;  // NEW velocity → TFO

uniform float u_dt;
uniform vec3  u_gravity;
uniform float u_drag;
uniform float u_maxAge;

void main() {
  float age = a_pos.w + u_dt;
  bool  dead = age > u_maxAge;

  // If particle dies, respawn at origin with random velocity
  vec3 pos = dead ? vec3(0.0) : a_pos.xyz;
  vec3 vel = dead ? vec3(
      (fract(sin(float(gl_VertexID) * 127.1) * 43758.5) - 0.5) * 2.0,
      fract(sin(float(gl_VertexID) * 311.7) * 43758.5) * 3.0 + 0.5,
      (fract(sin(float(gl_VertexID) * 74.9)  * 43758.5) - 0.5) * 2.0
    ) : a_vel;
  age = dead ? 0.0 : age;

  // Euler integration
  vel += u_gravity * u_dt;
  vel *= (1.0 - u_drag * u_dt);
  pos += vel * u_dt;

  v_pos = vec4(pos, age);
  v_vel = vel;

  // Required but ignored (RASTERIZER_DISCARD is active)
  gl_Position = vec4(0.0);
}`;

// After compileShader / createProgram, BEFORE linkProgram: the list of captured
// varyings only takes effect when the program is (re)linked.
// The order ['v_pos', 'v_vel'] with INTERLEAVED_ATTRIBS yields 4 + 3 floats per
// vertex in a single buffer, matching the interleaved input layout.
gl.transformFeedbackVaryings(
  updateProg,
  ['v_pos', 'v_vel'],
  gl.INTERLEAVED_ATTRIBS  // output interleaved into one buffer
);
gl.linkProgram(updateProg);

// Fail loudly on a bad link instead of letting every later draw call fail with
// hard-to-trace GL errors.
if (!gl.getProgramParameter(updateProg, gl.LINK_STATUS)) {
  throw new Error(`updateProg failed to link: ${gl.getProgramInfoLog(updateProg)}`);
}

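Order matters: transformFeedbackVaryings only takes effect at the next linkProgram, so it must be called before linking. If you call it afterwards, the program still links successfully but records no varyings; beginTransformFeedback then fails with INVALID_OPERATION and nothing is captured — a bug that is easy to miss unless you check gl.getError().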

4. Ping-Pong Update Loop

// Ping-pong state. The "read" side holds the current particle data; the
// "write" side receives the output of the next update pass.
let readBuf = bufA;
let writeBuf = bufB;
let readVAO = vaoA;
let writeVAO = vaoB;

// Transform feedback object used for the capture in every update pass.
const tfo = gl.createTransformFeedback();

// Uniform locations of updateProg, looked up on the first update() call and
// reused afterwards so the per-frame path performs no location queries.
// Reset this to null if updateProg is ever re-linked.
let updateUniforms = null;

/**
 * Advance the simulation by one step on the GPU.
 *
 * Runs updateProg over the N particles in the current read buffer with
 * rasterization disabled, captures the shader outputs into the write buffer
 * through transform feedback, then swaps the read/write roles.
 *
 * @param {number} dt - Time step uploaded as `u_dt` (the shader treats gravity
 *   as per-unit-time, so this is the elapsed time in the same unit).
 */
function update(dt) {
  gl.useProgram(updateProg);

  if (updateUniforms === null) {
    updateUniforms = {
      dt: gl.getUniformLocation(updateProg, 'u_dt'),
      gravity: gl.getUniformLocation(updateProg, 'u_gravity'),
      drag: gl.getUniformLocation(updateProg, 'u_drag'),
      maxAge: gl.getUniformLocation(updateProg, 'u_maxAge'),
    };
  }
  gl.uniform1f(updateUniforms.dt, dt);
  gl.uniform3f(updateUniforms.gravity, 0, -9.8, 0);
  gl.uniform1f(updateUniforms.drag, 0.02);
  gl.uniform1f(updateUniforms.maxAge, 5.0);

  // WebGL2 rejects the draw if the capture target is also bound to a
  // non-transform-feedback target. Setup and debug code bind the particle
  // buffers to ARRAY_BUFFER, so clear that binding first. The VAOs keep their
  // own attribute-to-buffer associations, so they are unaffected.
  gl.bindBuffer(gl.ARRAY_BUFFER, null);

  // Bind the output buffer to TFO slot 0
  gl.bindTransformFeedback(gl.TRANSFORM_FEEDBACK, tfo);
  gl.bindBufferBase(gl.TRANSFORM_FEEDBACK_BUFFER, 0, writeBuf);

  gl.enable(gl.RASTERIZER_DISCARD);  // skip rasterization / fragment shader
  gl.bindVertexArray(readVAO);
  gl.beginTransformFeedback(gl.POINTS);
  gl.drawArrays(gl.POINTS, 0, N);
  gl.endTransformFeedback();
  gl.disable(gl.RASTERIZER_DISCARD);

  // Release the capture binding so the freshly written buffer can be used as a
  // vertex source or read back without a binding conflict.
  gl.bindBufferBase(gl.TRANSFORM_FEEDBACK_BUFFER, 0, null);
  gl.bindTransformFeedback(gl.TRANSFORM_FEEDBACK, null);

  // Swap ping-pong: what was just written becomes the next input.
  [readBuf, writeBuf] = [writeBuf, readBuf];
  [readVAO, writeVAO] = [writeVAO, readVAO];
}

5. Rendering the Particles

For the render pass, use a separate shader program that reads from the current readBuf (which has the freshly updated data) and draws it as gl.POINTS with a view-projection matrix:

// render.vert — display pass.
// Vertex shader source for drawing the particles: projects the position with
// u_mvp and derives both the point size and the alpha from the particle's age
// (a_pos.w) relative to u_maxAge, so points shrink (never below 1.0) and fade
// as they age.
// u_maxAge is used as a divisor, so the host code must upload a non-zero value.
// a_vel is declared but never read in main(). NOTE(review): GLSL compilers
// commonly strip unused inputs, in which case
// getAttribLocation(renderProg, 'a_vel') returns -1 — verify that the VAO setup
// for renderProg tolerates that.
const renderVS = `#version 300 es
in vec4 a_pos;  // xyz=position, w=age
in vec3 a_vel;
uniform mat4 u_mvp;
uniform float u_maxAge;
out float v_alpha;
void main() {
  gl_Position  = u_mvp * vec4(a_pos.xyz, 1.0);
  gl_PointSize = max(1.0, 4.0 * (1.0 - a_pos.w / u_maxAge));
  v_alpha      = 1.0 - a_pos.w / u_maxAge;
}`;

// render.frag — display pass.
// Fragment shader source: discards fragments farther than 0.5 from the centre
// of the point sprite (which makes each point round) and outputs a fixed orange
// colour whose alpha is the v_alpha value passed from the vertex stage.
// NOTE(review): the alpha only has a visible effect if blending is enabled on
// the context; no blend setup is visible here — confirm.
const renderFS = `#version 300 es
precision mediump float;
in  float v_alpha;
out vec4  fragColor;
void main() {
  // Circular point shape
  vec2 c = gl_PointCoord - 0.5;
  if (dot(c, c) > 0.25) discard;
  fragColor = vec4(1.0, 0.6, 0.2, v_alpha);
}`;

// Uniform locations of renderProg, looked up on the first render() call and
// reused afterwards. Reset this to null if renderProg is ever re-linked.
let renderUniforms = null;

/**
 * Draw the current particle state as points.
 *
 * @param {Float32Array|number[]} mvp - Column-major 4x4 model-view-projection
 *   matrix uploaded as `u_mvp`.
 * @param {number} [maxAge=5.0] - Particle lifetime uploaded as `u_maxAge`.
 *   The render shader divides the particle age by this value to derive point
 *   size and alpha; without an upload the uniform stays at 0 and that division
 *   breaks. The default matches the lifetime used by the update pass.
 */
function render(mvp, maxAge = 5.0) {
  gl.useProgram(renderProg);

  if (renderUniforms === null) {
    renderUniforms = {
      mvp: gl.getUniformLocation(renderProg, 'u_mvp'),
      maxAge: gl.getUniformLocation(renderProg, 'u_maxAge'),
    };
  }
  gl.uniformMatrix4fv(renderUniforms.mvp, false, mvp);
  gl.uniform1f(renderUniforms.maxAge, maxAge);

  // Bind read-side VAO (same buffer, but renderVAO uses renderProg attribute locations)
  gl.bindVertexArray(renderReadVAO);
  gl.drawArrays(gl.POINTS, 0, N);
}

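Separate VAOs for separate programs: attribute locations can differ between updateProg and renderProg (and an attribute the render shader never reads, such as a_vel, may have no location at all). Create a dedicated render VAO for each of bufA and bufB, bound to renderProg's locations, and swap them in sync with the ping-pong so that renderReadVAO always points at the current readBuf.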

6. Benchmark Results

Approach	Particle Count	Frame Time	Notes
CPU (Float32Array loop)	5,000	~16 ms	Bottleneck: JS loop + bufferData each frame
CPU (SharedArrayBuffer + Worker)	25,000	~16 ms	Better, but still CPU-GPU transfer cost
GPU Transform Feedback	1,000,000	~4 ms	Zero CPU per particle; GPU fully utilized
WebGPU Compute Shaders	5,000,000+	~4 ms	Next-gen; Chrome 113+ required

Tested on an NVIDIA RTX 3070 with Chrome 120. An M1 MacBook Pro achieves similar numbers via WebGL2 on Metal.

7. Debugging: NaN Particles on the GPU

Silent NaN propagation is the most common GPGPU bug. Once one particle's position becomes NaN (e.g., from a 0/0 division), every later update of that particle stays NaN, and any computation that reads it is poisoned too — so it can "kill" large chunks of the buffer. How to find it:

// Debug aid: read back a small slice of the output buffer to the CPU and report
// every particle that contains a non-finite value (NaN or ±Infinity) in ANY of
// its components — position, age or velocity.
// getBufferSubData makes the CPU wait for the GPU, so keep this out of
// production frames.
const SAMPLE_COUNT = 100;       // inspect the first 100 particles
const FLOATS_PER_PARTICLE = 7;  // must match the interleaved layout (4 pos + 3 vel)

const check = new Float32Array(FLOATS_PER_PARTICLE * SAMPLE_COUNT);
gl.bindBuffer(gl.ARRAY_BUFFER, readBuf);
gl.getBufferSubData(gl.ARRAY_BUFFER, 0, check);
// Do not leave the particle buffer on ARRAY_BUFFER: it becomes a
// transform-feedback target again on a later frame, and WebGL2 rejects buffers
// that are bound to both kinds of target at once.
gl.bindBuffer(gl.ARRAY_BUFFER, null);

const badIdx = Array.from({ length: SAMPLE_COUNT }, (_, i) => i).filter((i) => {
  const particle = check.subarray(i * FLOATS_PER_PARTICLE, (i + 1) * FLOATS_PER_PARTICLE);
  return !particle.every((value) => Number.isFinite(value));
});

if (badIdx.length > 0) console.warn('NaN particles at:', badIdx);

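Common NaN sources: normalize(vec3(0)) (zero-length vector); 1.0 / 0.0 in GLSL (typically yields Infinity rather than NaN, but Infinity turns into NaN downstream, e.g. in Inf − Inf or 0 × Inf); and sqrt(x) where x < 0. Guard these by clamping a length or denominator with max(length, 0.0001) and a sqrt argument with max(x, 0.0).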