WebGPU Rendering: Part 6 Multiple Render Passes

Matthew MacFarquhar
5 min read · Dec 5, 2024


Introduction

I have been reading through this online book on WebGPU. In this series of articles, I will be going through this book and implementing the lessons in a more structured TypeScript class approach. Eventually, we will build three types of WebGPU renderers: Gaussian splatting, ray tracing, and rasterization.

In this article we will talk about an interesting rendering use case: multiple render passes. Multiple render passes are useful for things like post-processing. Instead of rendering your first pass to the screen, you render it to a texture, and each subsequent pass takes the previous pass's output texture as an input.

The following link is the commit in my GitHub repo that matches the code we will go over.

Multiple Render Passes

The meat and potatoes of this article is in the render code, so let's jump right in.

Our code creates all of our uniforms up front: the transformation matrix, projection matrix, image texture, sampler, image size, blur kernel, kernel size, and the output texture for render pass one.

public async render_gaussian_blur(shaderCodeOne: string, shaderCodeTwo: string, vertexCount: number, instanceCount: number, vertices: Float32Array, texCoords: Float32Array,
    transformationMatrix: Float32Array, projectionMatrix: Float32Array, imgUri: string) {
    const response = await fetch(imgUri);
    const blob = await response.blob();
    const imageBitmap = await createImageBitmap(blob);

    // CREATE UNIFORMS
    const transformationMatrixBuffer = this._createGPUBuffer(transformationMatrix, GPUBufferUsage.UNIFORM);
    const projectionMatrixBuffer = this._createGPUBuffer(projectionMatrix, GPUBufferUsage.UNIFORM);
    const texture = this._createTextureFromImage(imageBitmap);
    const sampler = this._createSampler();
    // Intermediate texture: pass one renders into it, pass two samples from it
    const passOneTexture = this._createTexture(texture.width, texture.height);

    const imgSizeBuffer = this._createGPUBuffer(new Float32Array([imageBitmap.width, imageBitmap.height]), GPUBufferUsage.UNIFORM);

    // Build the 1D Gaussian kernel (2 * kernelSize + 1 weights) on the CPU;
    // the shaders re-normalize the weights per pixel, so we don't normalize here
    const kernelSize = 8.0;
    const sigma = 8.0;
    const kValues: number[] = [];

    for (let y = -kernelSize; y <= kernelSize; y += 1.0) {
        const gaussianValue = 1.0 / Math.sqrt(2.0 * Math.PI * sigma * sigma) * Math.exp(-y * y / (2.0 * sigma * sigma));
        kValues.push(gaussianValue);
    }
    const kernelBuffer = this._createGPUBuffer(new Float32Array(kValues), GPUBufferUsage.STORAGE);
    const kernelSizeBuffer = this._createGPUBuffer(new Float32Array([kernelSize]), GPUBufferUsage.UNIFORM);

    const transformationMatrixBindGroupInput: IBindGroupInput = {
        type: "buffer",
        visibility: GPUShaderStage.VERTEX,
        buffer: transformationMatrixBuffer,
    };
    const projectionMatrixBindGroupInput: IBindGroupInput = {
        type: "buffer",
        visibility: GPUShaderStage.VERTEX,
        buffer: projectionMatrixBuffer,
    };
    const imageSizeBindGroupInput: IBindGroupInput = {
        type: "buffer",
        visibility: GPUShaderStage.FRAGMENT,
        buffer: imgSizeBuffer,
    };
    const textureBindGroupInput: IBindGroupInput = {
        type: "texture",
        visibility: GPUShaderStage.FRAGMENT,
        texture: texture,
    };
    const samplerBindGroupInput: IBindGroupInput = {
        type: "sampler",
        visibility: GPUShaderStage.FRAGMENT,
        sampler: sampler,
    };
    const kernelBindGroupInput: IBindGroupInput = {
        type: "buffer",
        visibility: GPUShaderStage.FRAGMENT,
        readonly: true,
        buffer: kernelBuffer,
    };
    const kernelSizeBindGroupInput: IBindGroupInput = {
        type: "buffer",
        visibility: GPUShaderStage.FRAGMENT,
        buffer: kernelSizeBuffer,
    };
    const passOneTextureBindGroupInput: IBindGroupInput = {
        type: "texture",
        visibility: GPUShaderStage.FRAGMENT,
        texture: passOneTexture,
    };

    const { bindGroupLayout: uniformBindGroupLayoutPassOne, bindGroup: uniformBindGroupPassOne } = this._createUniformBindGroup([imageSizeBindGroupInput, textureBindGroupInput, samplerBindGroupInput, kernelBindGroupInput, kernelSizeBindGroupInput]);
    const { bindGroupLayout: uniformBindGroupLayoutPassTwo, bindGroup: uniformBindGroupPassTwo } = this._createUniformBindGroup([transformationMatrixBindGroupInput, projectionMatrixBindGroupInput, imageSizeBindGroupInput, passOneTextureBindGroupInput, samplerBindGroupInput, kernelBindGroupInput, kernelSizeBindGroupInput]);

    // CREATE VERTEX BUFFERS
    const { buffer: positionBuffer, layout: positionBufferLayout } = this._createSingleAttributeVertexBuffer(vertices, { format: "float32x3", offset: 0, shaderLocation: 0 }, 3 * Float32Array.BYTES_PER_ELEMENT);
    const { buffer: texCoordBufferOne, layout: texCoordBufferLayoutOne } = this._createSingleAttributeVertexBuffer(texCoords, { format: "float32x2", offset: 0, shaderLocation: 0 }, 2 * Float32Array.BYTES_PER_ELEMENT);
    const { buffer: texCoordBufferTwo, layout: texCoordBufferLayoutTwo } = this._createSingleAttributeVertexBuffer(texCoords, { format: "float32x2", offset: 0, shaderLocation: 1 }, 2 * Float32Array.BYTES_PER_ELEMENT);

    // CREATE COMMAND ENCODER
    const commandEncoder = this._device.createCommandEncoder();

    // PASS ONE: vertical blur, rendered into the intermediate texture
    const passEncoder = commandEncoder.beginRenderPass(this._createRenderTarget(passOneTexture, { r: 0.0, g: 0.0, b: 0.0, a: 0.0 }));
    passEncoder.setViewport(0, 0, texture.width, texture.height, 0, 1);
    passEncoder.setPipeline(this._createPipeline(this._createShaderModule(shaderCodeOne), [texCoordBufferLayoutOne], [uniformBindGroupLayoutPassOne], "rgba8unorm"));
    passEncoder.setVertexBuffer(0, texCoordBufferOne);
    passEncoder.setBindGroup(0, uniformBindGroupPassOne);
    passEncoder.draw(vertexCount, instanceCount);
    passEncoder.end();

    // PASS TWO: horizontal blur, sampling pass one's output and rendering to the canvas
    const passEncoderTwo = commandEncoder.beginRenderPass(this._createRenderTarget(this._context.getCurrentTexture(), { r: 1.0, g: 0.0, b: 0.0, a: 1.0 }));
    passEncoderTwo.setViewport(0, 0, this._canvas.width, this._canvas.height, 0, 1);
    passEncoderTwo.setPipeline(this._createPipeline(this._createShaderModule(shaderCodeTwo), [positionBufferLayout, texCoordBufferLayoutTwo], [uniformBindGroupLayoutPassTwo], "bgra8unorm"));
    passEncoderTwo.setVertexBuffer(0, positionBuffer);
    passEncoderTwo.setVertexBuffer(1, texCoordBufferTwo);
    passEncoderTwo.setBindGroup(0, uniformBindGroupPassTwo);
    passEncoderTwo.draw(vertexCount, instanceCount);
    passEncoderTwo.end();

    this._device.queue.submit([commandEncoder.finish()]);
}

We also create the bind group layouts for our two shaders (one for each render pass), along with the vertex buffers for each pipeline; a sketch of the bind group helper is shown below.
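We have not walked through _createUniformBindGroup in this article, so here is a rough sketch of what such a helper could look like (the real implementation is in the repo and may differ). The idea is to map each IBindGroupInput to one layout entry and one bind group entry at the same binding index:

// A minimal sketch, assuming this method lives on our renderer class
private _createUniformBindGroup(inputs: IBindGroupInput[]) {
    // One layout entry per input, bound in array order
    const layoutEntries = inputs.map((input, i): GPUBindGroupLayoutEntry => {
        switch (input.type) {
            case "buffer":
                return {
                    binding: i,
                    visibility: input.visibility,
                    buffer: { type: input.readonly ? "read-only-storage" : "uniform" },
                };
            case "texture":
                return { binding: i, visibility: input.visibility, texture: {} };
            case "sampler":
                return { binding: i, visibility: input.visibility, sampler: {} };
        }
    });
    const bindGroupLayout = this._device.createBindGroupLayout({ entries: layoutEntries });

    // Matching bind group entries pointing at the actual GPU resources
    const bindGroupEntries = inputs.map((input, i): GPUBindGroupEntry => ({
        binding: i,
        resource: input.type === "buffer" ? { buffer: input.buffer! }
            : input.type === "texture" ? input.texture!.createView()
            : input.sampler!,
    }));
    const bindGroup = this._device.createBindGroup({ layout: bindGroupLayout, entries: bindGroupEntries });

    return { bindGroupLayout, bindGroup };
}

Because the layout and the bind group are built from the same input list, the WGSL @binding indices just follow the order in which we pass the inputs in.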

Finally, we encode two render passes before submitting the commands to the GPU. Notice how in the first pass's _createRenderTarget, the target is our intermediate passOneTexture, while in the second pass we target the canvas context's current texture, which is what actually appears on screen. Sketches of these helpers are below.
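We also have not shown _createTexture or _createRenderTarget here, so as a minimal sketch (again, the actual implementations live in the repo): the key detail is that the intermediate texture needs both RENDER_ATTACHMENT usage, so pass one can draw into it, and TEXTURE_BINDING usage, so pass two can sample from it.

// A sketch under those assumptions, not the repo's exact code
private _createTexture(width: number, height: number): GPUTexture {
    return this._device.createTexture({
        size: { width, height },
        format: "rgba8unorm", // matches the pipeline format used in pass one
        usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.TEXTURE_BINDING,
    });
}

private _createRenderTarget(texture: GPUTexture, clearColor: GPUColor): GPURenderPassDescriptor {
    return {
        colorAttachments: [{
            view: texture.createView(),
            clearValue: clearColor, // cleared before the pass draws
            loadOp: "clear",
            storeOp: "store", // keep the result so the next pass (or screen) can use it
        }],
    };
}

Since both the intermediate texture and the canvas's current texture are just GPUTextures, the same helper works for both passes.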

Shaders

Now that we have gone over the infrastructure for making two render passes, let’s talk about what each one is doing so that we have a complete picture of this example.

The first shader can be thought of as a pre-processing step of sorts: it only operates on the image, so it takes no position attribute and instead defines its own full-screen quad positions inside the vertex shader.


struct VertexOutput {
    @builtin(position) position: vec4<f32>,
    @location(0) tex_coords: vec2<f32>,
};

@vertex
fn vs_main(
    @builtin(vertex_index) inVertIndex: u32,
    @location(0) inTexCoords: vec2<f32>
) -> VertexOutput {
    // Full-screen quad positions defined directly in the shader
    const pos = array<vec3<f32>, 4>(
        vec3<f32>(1.0, 1.0, 0.0),
        vec3<f32>(1.0, -1.0, 0.0),
        vec3<f32>(-1.0, 1.0, 0.0),
        vec3<f32>(-1.0, -1.0, 0.0)
    );

    var out: VertexOutput;
    out.position = vec4<f32>(pos[inVertIndex], 1.0);
    out.tex_coords = inTexCoords;
    return out;
}

@group(0) @binding(0)
var<uniform> img_size: vec2<f32>;
@group(0) @binding(1)
var t_diffuse: texture_2d<f32>;
@group(0) @binding(2)
var s_diffuse: sampler;
@group(0) @binding(3)
var<storage> kernel: array<f32>;
@group(0) @binding(4)
var<uniform> kernel_size: f32;

@fragment
fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
    var color = vec4<f32>(0.0, 0.0, 0.0, 0.0);
    var intensity: f32 = 0.0;

    for (var y: f32 = -kernel_size; y <= kernel_size; y += 1.0) {
        let offsetY = y / img_size.y + in.tex_coords.y;
        // Skip taps that fall outside the texture
        if (offsetY >= 0.0 && offsetY <= 1.0) {
            let indexY = u32(y + kernel_size);
            let tex_coord = vec2(in.tex_coords.x, offsetY);
            let gaussian_val = kernel[indexY];
            let c = textureSampleLevel(t_diffuse, s_diffuse, tex_coord, 0.0);
            color += c * gaussian_val;
            intensity += gaussian_val;
        }
    }

    // Re-normalize by the weights actually used (handles edge pixels)
    color /= intensity;
    color.w = 1.0;

    return color;
}

This shader blurs up and down, so each pixel becomes a Gaussian average of its top and bottom neighbors. We created the kernel on the CPU and passed it in via a uniform group, since it never changes (and therefore shouldn't be re-computed on every shader run). Note that the fragment shader still accumulates the intensity of the weights it actually uses and divides by it at the end; this re-normalizes the kernel for pixels near the image edges, where some taps fall outside the texture.

Our next shader takes the vertically blurred texture as its input texture, and this time it actually uses our positions in the vertex shader.

@group(0) @binding(0)
var<uniform> transform: mat4x4<f32>;
@group(0) @binding(1)
var<uniform> projection: mat4x4<f32>;

struct VertexOutput {
    @builtin(position) position: vec4<f32>,
    @location(0) tex_coords: vec2<f32>,
};

@vertex
fn vs_main(
    @location(0) inPos: vec3<f32>,
    @location(1) inTexCoords: vec2<f32>
) -> VertexOutput {
    var out: VertexOutput;
    out.position = projection * transform * vec4<f32>(inPos, 1.0);
    out.tex_coords = inTexCoords;
    return out;
}

@group(0) @binding(2)
var<uniform> img_size: vec2<f32>;
@group(0) @binding(3)
var t_diffuse: texture_2d<f32>;
@group(0) @binding(4)
var s_diffuse: sampler;
@group(0) @binding(5)
var<storage> kernel: array<f32>;
@group(0) @binding(6)
var<uniform> kernel_size: f32;

@fragment
fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
    var color = vec4<f32>(0.0, 0.0, 0.0, 0.0);
    var intensity: f32 = 0.0;

    for (var x: f32 = -kernel_size; x <= kernel_size; x += 1.0) {
        // Horizontal offset, so divide by the image width this time
        let offsetX = x / img_size.x + in.tex_coords.x;
        if (offsetX >= 0.0 && offsetX <= 1.0) {
            let indexX = u32(x + kernel_size);
            let tex_coord = vec2(offsetX, in.tex_coords.y);
            let gaussian_val = kernel[indexX];
            let c = textureSampleLevel(t_diffuse, s_diffuse, tex_coord, 0.0);
            color += c * gaussian_val;
            intensity += gaussian_val;
        }
    }

    // Re-normalize by the weights actually used (handles edge pixels)
    color /= intensity;
    color.w = 1.0;

    return color;
}

Our fragment shader then blurs horizontally, giving us the final Gaussian blur effect.

Splitting the blur up this way speeds up our compute a lot. With a kernel that is N taps wide, a single-shader approach would need NxN texture samples per pixel, or NxNxP work for P pixels. Splitting it into two passes means the first shader does NxP samples and the second does another NxP, for 2xNxP total.
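To make that concrete with the numbers from our code: with kernelSize = 8, each pass reads 2x8 + 1 = 17 taps per pixel, so a single-pass 2D blur would need 17x17 = 289 texture samples per pixel while the two-pass version needs only 17 + 17 = 34, roughly an 8.5x reduction.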

Conclusion

In this article we discussed how to set up a render system to perform multiple render passes by piping the output texture of pass n-1 into the input texture of pass n. We also discussed how this approach can speed up rendering for some use cases compared to a single-shader approach.
