// triangle_tess_template_instantiate.comp.glsl
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
/*
Shader Description
==================
This compute shader computes the build arguments and vertices
for template instantiations of a tessellated triangle region.
The details of the tessellation are provided via `build.partTriangles`.
Depending on the value of TESS_INSTANTIATE_BATCHSIZE,
a single workgroup may operate on multiple tessellated
triangles at once. This helps improve utilization for
low-tessellated triangles.
*/
#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_nonuniform_qualifier : enable
#extension GL_EXT_control_flow_attributes : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_clustered : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#include "shaderio.h"
// Push constant block exposing a single instance ID.
// NOTE(review): main() in this file takes the instance ID from the loaded
// TessTriangleInfo and does not read `push.instanceID` directly — confirm
// whether included code relies on it.
layout(push_constant) uniform pushData
{
uint instanceID;
} push;
// Per-frame constants; main() reads `view.displacementScale` and
// `view.displacementOffset` when displacement textures are enabled.
layout(scalar, binding = BINDINGS_FRAME_UBO, set = 0) uniform frameConstantsBuffer
{
FrameConstants view;
};
// Readback/statistics buffer; main() atomically accumulates
// `readback.numTotalTriangles` for every successful instantiation.
layout(scalar, binding = BINDINGS_READBACK_SSBO, set = 0) buffer readbackBuffer
{
Readback readback;
};
// Array of render instances, indexed in main() by the instance ID stored in
// each TessTriangleInfo.
layout(scalar, binding = BINDINGS_RENDERINSTANCES_SSBO, set = 0) buffer renderInstancesBuffer
{
RenderInstance instances[];
};
// NOTE(review): not referenced by main() in this file; possibly used by
// included code or kept for binding-layout compatibility — confirm.
layout(binding = BINDINGS_HIZ_TEX) uniform sampler2D texHizFar;
// Uniform view of the scene-building state. main() uses it for values that are
// only read (e.g. `partTriangleCounter`) and to reach buffer references such as
// `partTriangles`, `genVertices`, `tempInstantiations`, `tempInstanceIDs`,
// `tempClusterAddresses` and `blasBuildInfos`.
layout(scalar, binding = BINDINGS_SCENEBUILDING_UBO, set = 0) uniform buildBuffer
{
SceneBuilding build;
};
// Storage-buffer view of the same `SceneBuilding` struct type. main() performs
// its atomic counter increments through this binding (`genClusterCounter`,
// `genClusterDataCounter`, `genVertexCounter`, `tempInstantiateCounter`).
// NOTE(review): presumably aliases the same memory as `build` — confirm on the host side.
layout(scalar, binding = BINDINGS_SCENEBUILDING_SSBO, set = 0) buffer buildBufferRW
{
SceneBuilding buildRW;
};
// Tessellation lookup table; main() reads `templateInstantiationSizes` and
// `templateAddresses`, both indexed by `tess_getConfigIndex(cfg)`.
layout(scalar, binding = BINDINGS_TESSTABLE_UBO, set = 0) uniform tessTableBuffer
{
TessellationTable tessTable;
};
#if HAS_DISPLACEMENT_TEXTURES
// Unsized array of displacement textures; main() samples the `.r` channel of
// the entry selected by `instance.displacementIndex` (non-uniform index).
layout(binding = BINDINGS_DISPLACED_TEXTURES, set = 0) uniform sampler2D displacementTextures[];
#endif
////////////////////////////////////////////
// One-dimensional workgroup of CLUSTER_TEMPLATE_INSTANTIATE_WORKGROUP invocations.
layout(local_size_x=CLUSTER_TEMPLATE_INSTANTIATE_WORKGROUP) in;
// Number of subgroups per workgroup, derived from the workgroup size and SUBGROUP_SIZE.
#define SUBGROUP_COUNT (CLUSTER_TEMPLATE_INSTANTIATE_WORKGROUP/SUBGROUP_SIZE)
////////////////////////////////////////////
// NOTE(review): presumably provides the `tess_*` helpers called by main() — confirm.
#include "tessellation.glsl"
#if TESS_USE_PN || DO_ANIMATION
// NOTE(review): presumably provides `deform_setupPN` / `deform_getPN` (used under
// TESS_USE_PN) and `rippleDeform` (used under DO_ANIMATION) — confirm.
#include "displacement.glsl"
#endif
////////////////////////////////////////////
// Base index into `build.genVertices` for each task handled by this workgroup.
// In the non-batched configuration only element 0 is used, and it holds ~0 when
// the instantiation was rejected for lack of space.
shared uint s_vertexOffset[TESS_INSTANTIATE_BATCHSIZE];
#if TESS_INSTANTIATE_BATCHSIZE > 1
// Batched configuration only: tasks with a non-zero vertex count are compacted
// to the front of these arrays.
// s_taskOffsets[i]: first flattened thread index that belongs to compacted task i
//                   (~0 for unused slots).
shared uint s_taskOffsets[TESS_INSTANTIATE_BATCHSIZE];
// s_tasks[i]: subgroup invocation index that holds the data of compacted task i,
//             used by main() as the source lane for subgroupShuffle.
shared uint s_tasks[TESS_INSTANTIATE_BATCHSIZE];
// Total number of vertices (one thread each) across all accepted tasks of the workgroup.
shared uint s_numThreads;
#endif
////////////////////////////////////////////
/*
  Entry point.

  Phase 1 (first TESS_INSTANTIATE_BATCHSIZE invocations of the workgroup):
    For each partial triangle region ("part") read from `build.partTriangles`,
    reserve a generated-cluster slot, CLAS data space and vertex space via
    atomic counters. If any of the limits would be exceeded the part is dropped
    (`vertexOffset` stays ~0, `numVertices` becomes 0); the counters are still
    incremented in that case. Otherwise a `TemplateInstantiateInfo`, the owning
    instance ID and the CLAS destination address are written, and the per-BLAS
    reference count and the triangle statistic are incremented.

  Phase 2 (whole workgroup):
    Every vertex of every accepted part is evaluated (barycentric interpolation,
    optional PN deformation, optional displacement texture, optional animation)
    and stored in `build.genVertices`.

  With TESS_INSTANTIATE_BATCHSIZE > 1 several parts share one workgroup: the
  per-part data lives in the lanes of each subgroup and is redistributed per
  vertex through shared memory plus subgroup shuffles. This path relies on
  TESS_INSTANTIATE_BATCHSIZE not exceeding the subgroup size, because parts are
  addressed through gl_SubgroupInvocationID.
*/
void main()
{
#if TESS_INSTANTIATE_BATCHSIZE > 1
  // Every subgroup loads the same batch of parts, one part per lane, so that
  // the data can later be fetched from any lane with subgroupShuffle.
  // `partLoad` is clamped to a valid part so the load below is always done on
  // an existing element; lanes where it differs from `partIndex` own no part.
  uint partIndex = gl_WorkGroupID.x * TESS_INSTANTIATE_BATCHSIZE + gl_SubgroupInvocationID;
  uint partLoad  = min(gl_WorkGroupID.x * TESS_INSTANTIATE_BATCHSIZE + min(gl_SubgroupInvocationID, TESS_INSTANTIATE_BATCHSIZE - 1),
                       build.partTriangleCounter - 1);
#else
  uint partIndex = gl_WorkGroupID.x;
  uint partLoad  = partIndex;
#endif

  TessTriangleInfo tessInfo = build.partTriangles.d[partLoad];

  uvec3 vtxEncoded        = tessInfo.subTriangle.vtxEncoded;
  uint  instanceID        = tessInfo.cluster.instanceID;
  uint  clusterID         = tessInfo.cluster.clusterID;
  // lower 16 bits: triangle index within the cluster, upper 16 bits: tessellation config
  uint  triangleID_config = tessInfo.subTriangle.triangleID_config;
  uint  triangleID        = triangleID_config & 0xFFFF;
  uint  cfg               = triangleID_config >> 16;

  // lanes that do not own a part contribute no vertices
  uint numVertices  = partLoad == partIndex ? tess_getConfigVertexCount(cfg) : 0;
  // ~0 marks "no vertex space reserved"
  uint vertexOffset = ~0;

  // first subgroup in workgroup handles template instantiation setup for all
  // partial triangle regions
  if (gl_LocalInvocationID.x < TESS_INSTANTIATE_BATCHSIZE && partLoad == partIndex)
  {
    uint     genOffset  = atomicAdd(buildRW.genClusterCounter, 1);
    uint64_t dataSize   = uint64_t(tessTable.templateInstantiationSizes.d[tess_getConfigIndex(cfg)]);
    uint64_t dataOffset = atomicAdd(buildRW.genClusterDataCounter, dataSize);
    vertexOffset        = atomicAdd(buildRW.genVertexCounter, numVertices);

    // test if we have enough space to perform the instantiation
    if ((vertexOffset + numVertices > MAX_GENERATED_VERTICES) ||
        (genOffset + 1 > MAX_GENERATED_CLUSTERS) ||
        (dataOffset + dataSize > (uint64_t(MAX_GENERATED_CLUSTER_MEGS) * 1024 * 1024)))
    {
      // out of space: this part produces neither a CLAS nor vertices
      vertexOffset = ~0;
      numVertices  = 0;
    }
    else
    {
      TemplateInstantiateInfo tempInfo;

      // given we have multiple kinds of clusters (full, partial etc.) we need to tag the
      // clusterID so that the hit shader will later know where to get information from.
      // the template's clusterID is zero
      tempInfo.clusterIdOffset        = partIndex | (RT_CLUSTER_MODE_SINGLE_TESSELLATED << 30);
      tempInfo.geometryIndexOffset    = 0;
      tempInfo.clusterTemplateAddress = tessTable.templateAddresses.d[tess_getConfigIndex(cfg)];

      // vertices are provided through a memory region that we fill every frame.
      // widen to 64 bit *before* multiplying so the byte offset cannot wrap at 32 bits
      tempInfo.vertexBufferAddress = uint64_t(build.genVertices) + uint64_t(vertexOffset) * 4 * 3;
      tempInfo.vertexBufferStride  = 4 * 3;

      uint tempOffset = atomicAdd(buildRW.tempInstantiateCounter, 1);  // actual template instantiations

      build.tempInstantiations.d[tempOffset] = tempInfo;
      // need to know which instance the instantiated CLAS belongs to, so we can insert it into its BLAS
      build.tempInstanceIDs.d[tempOffset] = instanceID;
      // we instantiate in explicit mode, so we provide the CLAS destination address
      build.tempClusterAddresses.d[tempOffset] = uint64_t(build.genClusterData) + dataOffset;

      uint numTriangles = tess_getConfigTriangleCount(cfg);

      // increment the instance's BLAS number of CLAS it will reference
      // we need this to later build per-BLAS arrays of references
      atomicAdd(build.blasBuildInfos.d[instanceID].clusterReferencesCount, 1);

      // for stats
      atomicAdd(readback.numTotalTriangles, numTriangles);
    }

#if TESS_INSTANTIATE_BATCHSIZE < 2
    // single task: publish the reserved offset (or ~0 on failure) to the workgroup
    s_vertexOffset[0] = vertexOffset;
#endif
  }

#if TESS_INSTANTIATE_BATCHSIZE > 1
  // when we batched multiple into one workgroup, then we need to
  // make the information handled by the first subgroup available to
  // all threads, via shared memory
  if (gl_LocalInvocationID.x < TESS_INSTANTIATE_BATCHSIZE)
  {
    // running vertex total; the last participating lane holds the sum over all tasks
    uint threadOffset = subgroupInclusiveAdd(numVertices);
    uint numThreads   = subgroupShuffle(threadOffset, TESS_INSTANTIATE_BATCHSIZE - 1);

    // compact the accepted tasks (non-zero vertex count) to the front
    uvec4 voteTasks  = subgroupBallot(numVertices != 0);
    uint  taskOffset = subgroupBallotExclusiveBitCount(voteTasks);
    uint  numTasks   = subgroupBallotBitCount(voteTasks);

    if (gl_LocalInvocationID.x == 0)
    {
      s_numThreads = numThreads;
    }

    // Each shared slot is written by exactly one invocation, so no ordering
    // between the lanes is required here:
    //  - slots [0, numTasks) receive the compacted task entries
    //  - slots [numTasks, BATCHSIZE) receive the ~0 sentinel from the lane of the same index
    if (gl_LocalInvocationID.x >= numTasks)
    {
      s_taskOffsets[gl_LocalInvocationID.x] = ~0;
    }

    if (numVertices != 0)
    {
      // first flattened thread index that belongs to this task
      s_taskOffsets[taskOffset]  = threadOffset - numVertices;
      // lane that holds this task's data, used as shuffle source later
      s_tasks[taskOffset]        = gl_SubgroupInvocationID;
      s_vertexOffset[taskOffset] = vertexOffset;
    }
  }
#endif

  // make the shared writes visible to the whole workgroup
  memoryBarrierShared();
  barrier();

#if TESS_INSTANTIATE_BATCHSIZE > 1
  uint numThreads = s_numThreads;

  // nothing to do if every task was empty or rejected (uniform across the workgroup)
  if (numThreads == 0)
    return;

  // keep the values this lane loaded; the loop below overwrites the working
  // copies with the data of whichever task a thread is assigned to
  uint  in_instanceID        = instanceID;
  uint  in_clusterID         = clusterID;
  uint  in_triangleID_config = triangleID_config;
  uvec3 in_vtxEncoded        = vtxEncoded;

  // distribute the filling of vertices of multiple tasks across
  // the workgroup's subgroups

  // ensure full subgroups are executed for shuffle
  uint numThreadsRun = ((numThreads + SUBGROUP_SIZE - 1) & ~(SUBGROUP_SIZE - 1));

  for (uint t = gl_LocalInvocationID.x; t < numThreadsRun; t += CLUSTER_TEMPLATE_INSTANTIATE_WORKGROUP)
  {
    // find the last task whose first thread index is <= t.
    // unused slots hold ~0 and therefore never match.
    uint start   = 0;
    uint taskID  = s_tasks[0];
    vertexOffset = s_vertexOffset[0];

    // binary search wasn't faster
    [[unroll]] for (uint i = 1; i < TESS_INSTANTIATE_BATCHSIZE; i++)
    {
      uint taskOffset = s_taskOffsets[i];
      if (t >= taskOffset)
      {
        taskID       = s_tasks[i];
        vertexOffset = s_vertexOffset[i];
        start        = taskOffset;
      }
    }

    // fetch the task's part data from the lane that loaded it
    triangleID_config = subgroupShuffle(in_triangleID_config, taskID);
    instanceID        = subgroupShuffle(in_instanceID, taskID);
    clusterID         = subgroupShuffle(in_clusterID, taskID);
    vtxEncoded        = subgroupShuffle(in_vtxEncoded, taskID);

    triangleID = triangleID_config & 0xFFFF;
    cfg        = triangleID_config >> 16;

    // padding threads only took part to keep the subgroup complete for the shuffles
    if (t >= numThreads)
      continue;

    // vertex index within the task
    uint vert = t - start;
#else
  // simpler setup we know we only have one task at a time
  vertexOffset = s_vertexOffset[0];

  // ran out of memory
  if (vertexOffset == ~0)
    return;

  {
#endif
    RenderInstance instance = instances[instanceID];
    Cluster        cluster  = instance.clusters.d[clusterID];

    // barycentric corners of the sub-triangle within the base triangle
    vec3 baseBarycentrics[3];
    [[unroll]] for (uint v = 0; v < 3; v++)
    {
      baseBarycentrics[v] = tess_decodeBarycentrics(vtxEncoded[v]);
    }

    vec3s_in  oPositions     = vec3s_in(instance.positions);
    vec3s_in  oNormals       = vec3s_in(instance.normals);
    vec2s_in  oTexcoords     = vec2s_in(instance.texcoords);
    uint8s_in localTriangles = uint8s_in(instance.clusterLocalTriangles);

    // cluster-local triangle indices, rebased to the cluster's first vertex
    uvec3 baseIndices = uvec3(localTriangles.d[cluster.firstLocalTriangle + triangleID * 3 + 0],
                              localTriangles.d[cluster.firstLocalTriangle + triangleID * 3 + 1],
                              localTriangles.d[cluster.firstLocalTriangle + triangleID * 3 + 2])
                        + uint(cluster.firstLocalVertex);

    // attributes of the base triangle's three vertices
    vec3 basePositions[3];
    vec3 baseNormals[3];
    vec2 baseTexcoords[3];
    [[unroll]] for (uint v = 0; v < 3; v++)
    {
      basePositions[v] = oPositions.d[baseIndices[v]];
      baseNormals[v]   = normalize(oNormals.d[baseIndices[v]]);
      baseTexcoords[v] = oTexcoords.d[baseIndices[v]];
    }

#if TESS_USE_PN
    DeformBasePN basePN;
    deform_setupPN(basePN, basePositions, baseNormals);
#endif

#if TESS_INSTANTIATE_BATCHSIZE > 1
    // the more sophisticated distribution of vertices of multiple tasks already makes us operate
    // on a per-vertex level at this point
#else
    // otherwise we switch to iteration over all vertices of the single tessellated triangle region we operate on
    for (uint vert = gl_LocalInvocationID.x; vert < numVertices; vert += CLUSTER_TEMPLATE_INSTANTIATE_WORKGROUP)
#endif
    {
      // barycentrics of the vertex within the tessellation pattern, then
      // mapped into the base triangle through the sub-triangle corners
      vec3 vertexBarycentrics = tess_getConfigVertexBarycentrics(cfg, vert);
      vertexBarycentrics      = tess_interpolate(baseBarycentrics, vertexBarycentrics);

#if TESS_USE_PN
      vec3 oPos = deform_getPN(basePN, vertexBarycentrics);
#else
      vec3 oPos = tess_interpolate(basePositions, vertexBarycentrics);
#endif

#if HAS_DISPLACEMENT_TEXTURES
      if (instance.displacementIndex >= 0)
      {
        // offset along the interpolated normal by the sampled height,
        // scaled and biased by both per-instance and per-frame settings
        vec3  oNormal = tess_interpolate(baseNormals, vertexBarycentrics);
        vec2  uv      = tess_interpolate(baseTexcoords, vertexBarycentrics);
        float height  = texture(displacementTextures[nonuniformEXT(instance.displacementIndex)], uv).r;
        height = (height * instance.displacementScale * view.displacementScale) + instance.displacementOffset + view.displacementOffset;
        oPos += normalize(oNormal) * height;
      }
#endif

#if DO_ANIMATION
      oPos = rippleDeform(oPos, instanceID, instance.geoHi.w);
#endif

      // these are the vertices the template instantiation will use
      build.genVertices.d[vert + vertexOffset] = oPos;
    }
  }
}