代码分析说明:
使用计算管线
import { makeSample, SampleInit } from '../../components/SampleLayout';
import blurWGSL from './blur.wgsl';
import fullscreenTexturedQuadWGSL from '../../shaders/fullscreenTexturedQuad.wgsl';
// Constants that must stay in sync with the blur.wgsl shader.
// tileDim: number of texel columns one workgroup loads into its shared tile
// (the shader declares the tile as 4 rows x 128 columns).
const tileDim = 128;
// batch: size of the texel block each shader invocation handles (the shader
// loops over 4 x 4 texels); batch[1] is used as the row count covered by one
// workgroup when computing dispatch sizes.
const batch = [4, 4];
/**
 * Entry point of the image-blur sample.
 *
 * Loads an image into a texture, blurs it on the GPU with a compute pipeline
 * (alternating horizontal and vertical box-filter passes that ping-pong
 * between two storage textures) and then draws the blurred result to the
 * canvas with a fullscreen-quad render pipeline. The work is re-recorded and
 * re-submitted every animation frame until `pageState.active` becomes false.
 *
 * @param canvas    Canvas that provides the 'webgpu' context and whose client
 *                  size determines the backing-store size.
 * @param pageState Its `active` flag is polled to stop once the sample is no
 *                  longer the active page.
 * @param gui       Control panel that receives the `filterSize` and
 *                  `iterations` controls.
 * @throws Error if no WebGPU adapter is available or the source image cannot
 *               be fetched.
 */
const init: SampleInit = async ({ canvas, pageState, gui }) => {
  // requestAdapter() resolves to null when no suitable adapter exists; fail
  // with an explicit message instead of a TypeError on the next line.
  const adapter = await navigator.gpu.requestAdapter();
  if (!adapter) {
    throw new Error(
      'WebGPU is not available: navigator.gpu.requestAdapter() resolved to null.'
    );
  }
  const device = await adapter.requestDevice();

  // The page may have been left while the device was being acquired.
  if (!pageState.active) return;

  const context = canvas.getContext('webgpu') as GPUCanvasContext;

  // Size the canvas backing store in physical pixels.
  const devicePixelRatio = window.devicePixelRatio;
  canvas.width = canvas.clientWidth * devicePixelRatio;
  canvas.height = canvas.clientHeight * devicePixelRatio;
  const presentationFormat = navigator.gpu.getPreferredCanvasFormat();

  context.configure({
    device,
    format: presentationFormat,
    alphaMode: 'premultiplied',
  });

  // Compute pipeline that runs one directional blur pass per dispatch.
  const blurPipeline = device.createComputePipeline({
    layout: 'auto',
    compute: {
      module: device.createShaderModule({ code: blurWGSL }),
      entryPoint: 'main',
    },
  });

  // Render pipeline that draws a texture over the whole canvas. The vertex
  // and fragment stages come from the same source, so one module is shared.
  const fullscreenQuadModule = device.createShaderModule({
    code: fullscreenTexturedQuadWGSL,
  });
  const fullscreenQuadPipeline = device.createRenderPipeline({
    layout: 'auto',
    vertex: {
      module: fullscreenQuadModule,
      entryPoint: 'vert_main',
    },
    fragment: {
      module: fullscreenQuadModule,
      entryPoint: 'frag_main',
      targets: [{ format: presentationFormat }],
    },
    primitive: {
      topology: 'triangle-list',
    },
  });

  const sampler = device.createSampler({
    magFilter: 'linear',
    minFilter: 'linear',
  });

  // Load the source image and upload it to a GPU texture.
  const imageUrl = '../assets/img/Di-3d.png';
  const response = await fetch(imageUrl);
  if (!response.ok) {
    throw new Error(
      `Failed to load ${imageUrl}: HTTP ${response.status} ${response.statusText}`
    );
  }
  const imageBitmap = await createImageBitmap(await response.blob());
  const [srcWidth, srcHeight] = [imageBitmap.width, imageBitmap.height];

  const cubeTexture = device.createTexture({
    size: [srcWidth, srcHeight, 1],
    format: 'rgba8unorm',
    usage:
      GPUTextureUsage.TEXTURE_BINDING |
      GPUTextureUsage.COPY_DST |
      GPUTextureUsage.RENDER_ATTACHMENT,
  });
  device.queue.copyExternalImageToTexture(
    { source: imageBitmap },
    { texture: cubeTexture },
    [srcWidth, srcHeight]
  );

  // Two image-sized textures that the blur passes ping-pong between: each is
  // written as a storage texture in one pass and sampled in the next.
  const textures = [0, 1].map(() =>
    device.createTexture({
      size: {
        width: srcWidth,
        height: srcHeight,
      },
      format: 'rgba8unorm',
      usage:
        GPUTextureUsage.COPY_DST |
        GPUTextureUsage.STORAGE_BINDING |
        GPUTextureUsage.TEXTURE_BINDING,
    })
  );

  // Creates a 4-byte uniform buffer holding the shader's `flip` value,
  // initialised through the creation-time mapping.
  const createFlipBuffer = (flip: number) => {
    const buffer = device.createBuffer({
      size: 4,
      mappedAtCreation: true,
      usage: GPUBufferUsage.UNIFORM,
    });
    new Uint32Array(buffer.getMappedRange())[0] = flip;
    buffer.unmap();
    return buffer;
  };
  // flip = 0: coordinates are used as-is (blur along x).
  const buffer0 = createFlipBuffer(0);
  // flip = 1: the shader swaps x and y (blur along y).
  const buffer1 = createFlipBuffer(1);

  // Uniform buffer for the shader's Params struct: two 32-bit values
  // (filterDim, blockDim), rewritten whenever the settings change.
  const blurParamsBuffer = device.createBuffer({
    size: 8,
    usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.UNIFORM,
  });

  // Bind group 0 of the compute pipeline: resources shared by every pass.
  const computeConstants = device.createBindGroup({
    layout: blurPipeline.getBindGroupLayout(0),
    entries: [
      {
        binding: 0,
        resource: sampler,
      },
      {
        binding: 1,
        resource: {
          buffer: blurParamsBuffer,
        },
      },
    ],
  });

  // Bind group 1 variants: (input texture, output texture, flip) per pass.
  // First pass: source image -> textures[0], flip = 0.
  const computeBindGroup0 = device.createBindGroup({
    layout: blurPipeline.getBindGroupLayout(1),
    entries: [
      {
        binding: 1,
        resource: cubeTexture.createView(),
      },
      {
        binding: 2,
        resource: textures[0].createView(),
      },
      {
        binding: 3,
        resource: {
          buffer: buffer0,
        },
      },
    ],
  });

  // Second pass of every iteration: textures[0] -> textures[1], flip = 1.
  const computeBindGroup1 = device.createBindGroup({
    layout: blurPipeline.getBindGroupLayout(1),
    entries: [
      {
        binding: 1,
        resource: textures[0].createView(),
      },
      {
        binding: 2,
        resource: textures[1].createView(),
      },
      {
        binding: 3,
        resource: {
          buffer: buffer1,
        },
      },
    ],
  });

  // First pass of every additional iteration: textures[1] -> textures[0],
  // flip = 0.
  const computeBindGroup2 = device.createBindGroup({
    layout: blurPipeline.getBindGroupLayout(1),
    entries: [
      {
        binding: 1,
        resource: textures[1].createView(),
      },
      {
        binding: 2,
        resource: textures[0].createView(),
      },
      {
        binding: 3,
        resource: {
          buffer: buffer0,
        },
      },
    ],
  });

  // Bind group for the render pipeline. textures[1] is the output of the
  // flip = 1 pass, which is always the last dispatch of a frame, so it holds
  // the final blurred image.
  const showResultBindGroup = device.createBindGroup({
    layout: fullscreenQuadPipeline.getBindGroupLayout(0),
    entries: [
      {
        binding: 0,
        resource: sampler,
      },
      {
        binding: 1,
        resource: textures[1].createView(),
      },
    ],
  });

  const settings = {
    filterSize: 15,
    iterations: 2,
  };

  // Number of texels along the blur axis that one workgroup can write: the
  // tile width minus the (filterSize - 1) border texels it only reads.
  let blockDim: number;
  const updateSettings = () => {
    blockDim = tileDim - (settings.filterSize - 1);
    device.queue.writeBuffer(
      blurParamsBuffer,
      0,
      new Uint32Array([settings.filterSize, blockDim])
    );
  };
  gui.add(settings, 'filterSize', 1, 33).step(2).onChange(updateSettings);
  // `iterations` is read directly every frame, so no change handler is needed.
  gui.add(settings, 'iterations', 1, 10).step(1);
  updateSettings();

  function frame() {
    // Sample is no longer the active page.
    if (!pageState.active) return;

    const commandEncoder = device.createCommandEncoder();

    // ---- Compute pass: blur the image ----
    const computePass = commandEncoder.beginComputePass();
    computePass.setPipeline(blurPipeline);
    computePass.setBindGroup(0, computeConstants);

    // dispatchWorkgroups() takes the number of WORKGROUPS per dimension, not
    // the number of shader invocations. Along the blur axis one workgroup
    // covers `blockDim` texels; across it, one workgroup covers batch[1] rows.
    // The counts are constant within a frame, so compute them once.
    const horizontalCountX = Math.ceil(srcWidth / blockDim);
    const horizontalCountY = Math.ceil(srcHeight / batch[1]);
    const verticalCountX = Math.ceil(srcHeight / blockDim);
    const verticalCountY = Math.ceil(srcWidth / batch[1]);

    // First iteration: source image -> textures[0] -> textures[1].
    computePass.setBindGroup(1, computeBindGroup0);
    computePass.dispatchWorkgroups(horizontalCountX, horizontalCountY);
    computePass.setBindGroup(1, computeBindGroup1);
    computePass.dispatchWorkgroups(verticalCountX, verticalCountY);

    // Remaining iterations: textures[1] -> textures[0] -> textures[1].
    for (let i = 0; i < settings.iterations - 1; ++i) {
      computePass.setBindGroup(1, computeBindGroup2);
      computePass.dispatchWorkgroups(horizontalCountX, horizontalCountY);
      computePass.setBindGroup(1, computeBindGroup1);
      computePass.dispatchWorkgroups(verticalCountX, verticalCountY);
    }
    computePass.end();

    // ---- Render pass: draw the blurred texture to the canvas ----
    const passEncoder = commandEncoder.beginRenderPass({
      colorAttachments: [
        {
          view: context.getCurrentTexture().createView(),
          clearValue: { r: 0.0, g: 0.0, b: 0.0, a: 1.0 },
          loadOp: 'clear',
          storeOp: 'store',
        },
      ],
    });
    passEncoder.setPipeline(fullscreenQuadPipeline);
    passEncoder.setBindGroup(0, showResultBindGroup);
    // Six vertices: the two triangles of the fullscreen quad.
    passEncoder.draw(6);
    passEncoder.end();

    device.queue.submit([commandEncoder.finish()]);
    requestAnimationFrame(frame);
  }
  requestAnimationFrame(frame);
};
blur.wgsl
// Blur parameters. The host uploads them as two consecutive 32-bit values.
struct Params {
// Width of the box filter in texels (number of texels averaged per output).
filterDim : i32,
// Distance in texels, along the blur axis, between consecutive workgroups.
blockDim : u32,
}
// Sampler used to read the input texture.
@group(0) @binding(0) var samp : sampler;
@group(0) @binding(1) var<uniform> params : Params;
// Texture being blurred in this pass.
@group(1) @binding(1) var inputTex : texture_2d<f32>;
// Write-only storage texture that receives the result of this pass.
@group(1) @binding(2) var outputTex : texture_storage_2d<rgba8unorm, write>;
// Selects the blur direction: when value is non-zero the x and y components
// of the load and store coordinates are swapped.
struct Flip {
value : u32,
}
@group(1) @binding(3) var<uniform> flip : Flip;
// This shader blurs the input texture in one direction, depending on whether
// |flip.value| is 0 or 1.
// It does so by running (128 / 4) threads per workgroup to load 128
// texels into 4 rows of shared memory. Each thread loads a
// 4 x 4 block of texels to take advantage of the texture sampling
// hardware.
// Then, each thread computes the blur result by averaging the adjacent texel values
// in shared memory.
// Because we're operating on a subset of the texture, we cannot compute all of the
// results since not all of the neighbors are available in shared memory.
// Specifically, with 128 x 128 tiles, we can only compute and write out
// square blocks of size 128 - (filterSize - 1). We compute the number of blocks
// needed in JavaScript and dispatch that amount.
// Workgroup-shared tile: 4 rows x 128 columns of RGB values.
var<workgroup> tile : array<array<vec3<f32>, 128>, 4>;
// Compute entry point: one directional box-blur pass.
// Each workgroup has 32 invocations. Every invocation first loads a 4 x 4
// block of texels into the shared `tile`, then, after a barrier, averages
// `params.filterDim` neighbouring tile entries for each of its texels and
// stores the result to `outputTex`.
@compute @workgroup_size(32, 1, 1)
fn main(
  @builtin(workgroup_id) WorkGroupID : vec3<u32>,
  @builtin(local_invocation_id) LocalInvocationID : vec3<u32>
) {
  // Number of neighbours on each side of the texel being filtered.
  let halfFilter = (params.filterDim - 1) / 2;
  let texSize = vec2<i32>(textureDimensions(inputTex, 0));
  // Non-zero flip means x and y are swapped for both loads and stores.
  let isFlipped = flip.value != 0u;
  // First tile column owned by this invocation.
  let firstCol = 4 * LocalInvocationID.x;

  // Coordinate of this invocation's first texel. It is shifted back by
  // halfFilter so the tile also contains the border texels the filter reads.
  let tileOrigin = vec2<i32>(WorkGroupID.xy * vec2(params.blockDim, 4) +
                             LocalInvocationID.xy * vec2(4, 1))
                   - vec2(halfFilter, 0);

  // Phase 1: fill the shared tile.
  for (var row = 0; row < 4; row++) {
    for (var col = 0; col < 4; col++) {
      let texel = tileOrigin + vec2(col, row);
      let srcCoord = select(texel, texel.yx, isFlipped);
      // Convert the texel index to a normalized sampling coordinate.
      let uv = (vec2<f32>(srcCoord) + vec2<f32>(0.25, 0.25)) / vec2<f32>(texSize);
      tile[row][firstCol + u32(col)] =
          textureSampleLevel(inputTex, samp, uv, 0.0).rgb;
    }
  }

  // All loads must be visible before any invocation reads its neighbours.
  workgroupBarrier();

  // Every tap of the box filter has the same weight.
  let weight = 1.0 / f32(params.filterDim);

  // Phase 2: filter and write out.
  for (var row = 0; row < 4; row++) {
    for (var col = 0; col < 4; col++) {
      let texel = tileOrigin + vec2(col, row);
      let dstCoord = select(texel, texel.yx, isFlipped);
      let center = i32(firstCol) + col;

      // Skip tile columns whose filter window would leave the tile, and
      // texels that fall outside the texture.
      if (center < halfFilter ||
          center >= 128 - halfFilter ||
          any(dstCoord >= texSize)) {
        continue;
      }

      var sum = vec3<f32>(0.0, 0.0, 0.0);
      for (var tap = 0; tap < params.filterDim; tap++) {
        sum = sum + weight * tile[row][center + tap - halfFilter];
      }
      textureStore(outputTex, dstCoord, vec4(sum, 1.0));
    }
  }
}
fullscreenTexturedQuad.wgsl
// Fullscreen textured quad: draws `myTexture` over the whole render target.
// The host draws 6 vertices with a 'triangle-list' topology and binds a
// sampler at @binding(0) and a texture view at @binding(1) of group 0.
@group(0) @binding(0) var mySampler : sampler;
@group(0) @binding(1) var myTexture : texture_2d<f32>;

// Data passed from the vertex stage to the fragment stage.
struct VertexOutput {
  @builtin(position) Position : vec4<f32>,
  @location(0) fragUV : vec2<f32>,
}

// Vertex entry point. No vertex buffer is used: the clip-space position and
// the texture coordinate are looked up from constant tables by vertex index.
@vertex
fn vert_main(@builtin(vertex_index) VertexIndex : u32) -> VertexOutput {
  // Two triangles covering clip space from (-1, -1) to (1, 1).
  var pos = array<vec2<f32>, 6>(
    vec2<f32>( 1.0,  1.0),
    vec2<f32>( 1.0, -1.0),
    vec2<f32>(-1.0, -1.0),
    vec2<f32>( 1.0,  1.0),
    vec2<f32>(-1.0, -1.0),
    vec2<f32>(-1.0,  1.0)
  );

  // Matching texture coordinates. v is inverted relative to clip-space y so
  // that the top row of the texture ends up at the top of the target.
  var uv = array<vec2<f32>, 6>(
    vec2<f32>(1.0, 0.0),
    vec2<f32>(1.0, 1.0),
    vec2<f32>(0.0, 1.0),
    vec2<f32>(1.0, 0.0),
    vec2<f32>(0.0, 1.0),
    vec2<f32>(0.0, 0.0)
  );

  var output : VertexOutput;
  output.Position = vec4<f32>(pos[VertexIndex], 0.0, 1.0);
  output.fragUV = uv[VertexIndex];
  return output;
}

// Fragment entry point: returns the texture color at the interpolated
// texture coordinate.
@fragment
fn frag_main(@location(0) fragUV : vec2<f32>) -> @location(0) vec4<f32> {
  return textureSample(myTexture, mySampler, fragUV);
}
总结步骤
简单总结就是:通过计算管线对图片进行多轮采样计算(交替进行水平和垂直方向的模糊),使图像变模糊,再将计算管线写回的纹理(textures[1])传递给渲染管线进行显示。