Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion examples/projects/Vehicle/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ int main() {
if (auto wgpu = dynamic_cast<WgpuRenderer*>(renderer.get())) {
wgpu->usePathTracer = true;
auto& pt = wgpu->pathTracer();
pt.setTlasEnabled(false);
pt.setMaxBounces(1);
pt.setDenoiserEnabled(false);
}
Expand Down
5 changes: 1 addition & 4 deletions examples/wgpu/wgpu_cornell_box.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ int main() {
bool restdirOn = pathTracer.restirEnabled();
bool restirGIOn = pathTracer.restirGiEnabled();
bool foveatOn = pathTracer.foveatedRendering();
bool tlasOn = pathTracer.tlasEnabled();
bool wobbleOn = false;
int maxBounces = pathTracer.maxBounces();
float exposure = pathTracer.exposure();
Expand Down Expand Up @@ -221,8 +220,6 @@ int main() {
pathTracer.setReSTIRGIEnabled(restirGIOn);
if (ImGui::Checkbox("Foveated", &foveatOn))
pathTracer.setFoveatedRendering(foveatOn);
if (ImGui::Checkbox("TLAS", &tlasOn))
pathTracer.setTlasEnabled(tlasOn);
ImGui::Checkbox("Wobble back wall", &wobbleOn);
if (ImGui::SliderInt("Max bounces", &maxBounces, 1, 8))
pathTracer.setMaxBounces(maxBounces);
Expand Down Expand Up @@ -264,7 +261,7 @@ int main() {
}

// Animate back wall vertices — exercises the path tracer's per-frame
// geometry fast path (CPU repack + partial upload + BVH/BLAS refit).
// geometry fast path (CPU repack + partial upload + BVH refit).
if (wobbleOn) {
wobbleT += dt;
const float wave = std::sin(math::TWO_PI * 0.5f * wobbleT);
Expand Down
1 change: 0 additions & 1 deletion examples/wgpu/wgpu_denoise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ int main() {
pathTracer.setMaxBounces(4);
pathTracer.setReSTIREnabled(true);
pathTracer.setReSTIRGIEnabled(true);
pathTracer.setTlasEnabled(true);
// pathTracer.setFireflyClamp(0.001);

// ---- Scene ----
Expand Down
4 changes: 0 additions & 4 deletions examples/wgpu/wgpu_gltf_samples.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ int main(int argc, char** argv) {
pathTracer.setReSTIREnabled(false);
pathTracer.setMaxBounces(4);
pathTracer.setFoveatedRendering(false);
pathTracer.setTlasEnabled(false);
pathTracer.setTextureResolution(1024);

RGBELoader imgLoader;
Expand Down Expand Up @@ -203,7 +202,6 @@ int main(int argc, char** argv) {
int fpsFrames = 0;
int aovMode = pathTracer.aovMode();
bool foveatOn = pathTracer.foveatedRendering();
bool tlasOn = pathTracer.tlasEnabled();

bool dofEnabled = false;
float lensFStop = 2.8f;
Expand Down Expand Up @@ -279,8 +277,6 @@ int main(int argc, char** argv) {
pathTracer.setReSTIRGIEnabled(restdirGIOn);
if (ImGui::Checkbox("Foveated Rendering", &foveatOn))
pathTracer.setFoveatedRendering(foveatOn);
if (ImGui::Checkbox("TLAS/BLAS", &tlasOn))
pathTracer.setTlasEnabled(tlasOn);

if (ImGui::Checkbox("Show DirLight", &dirLight)) {
light->visible = dirLight;
Expand Down
6 changes: 0 additions & 6 deletions include/threepp/renderers/wgpu/WgpuPathTracer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,6 @@ namespace threepp {
void setReSTIRGIEnabled(bool enabled);
[[nodiscard]] bool restirGiEnabled() const;

/// Enable/disable the two-level TLAS/BLAS acceleration structure. Default: false.
/// Experimental — plumbing only in the current build; single-level BVH remains
/// the active traversal path until the shader rewrite lands.
void setTlasEnabled(bool enabled);
[[nodiscard]] bool tlasEnabled() const;

/// Samples per pixel per frame. Default: 1. Higher values reduce noise
/// at the cost of proportionally more RT time per frame.
void setSamplesPerPixel(int spp);
Expand Down
395 changes: 12 additions & 383 deletions src/threepp/renderers/wgpu/pathtracer/WgpuPathTracer.cpp

Large diffs are not rendered by default.

211 changes: 5 additions & 206 deletions src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@
#include "threepp/renderers/wgpu/pathtracer/WgpuPathTracerAtlas.hpp"
#include "threepp/renderers/wgpu/pathtracer/WgpuPathTracerGeometry.hpp"

#include "threepp/math/Matrix4.hpp"
#include "threepp/math/Vector3.hpp"

#include <algorithm>
#include <array>
#include <climits>
#include <cmath>
#include <cstring>
#include <numeric>
Expand Down Expand Up @@ -441,8 +437,7 @@ void packRefitMetadata(const std::vector<Bvh4Node>& nodes, std::vector<int32_t>&
void buildBVH(std::vector<float>& triBuffer, int triCount,
std::vector<Bvh4Node>& wideNodes, std::vector<int>& indices,
std::vector<int>& leafIndices,
std::vector<float>& rawObjTriBuf,
bool preserveObjTriOrder) {
std::vector<float>& rawObjTriBuf) {
indices.resize(triCount);
std::iota(indices.begin(), indices.end(), 0);

Expand All @@ -462,11 +457,6 @@ void buildBVH(std::vector<float>& triBuffer, int triCount,
// Sort triangle data to match BVH index ordering.
// Uses in-place cycle-following permutation to avoid allocating two full copies
// (~1.2 GB for large scenes). Extra memory: O(n/8) for the visited bitmap + 2 tri temps.
//
// When `preserveObjTriOrder` is set, the TLAS/BLAS path has already built
// BLAS records that address `rawObjTriBuf` by its pre-sort (entry-contiguous)
// indices. Reordering it here would silently invalidate those indices, so
// the secondary buffer is skipped and only `triBuffer` gets permuted.
{
std::vector<bool> visited(triCount, false);
std::vector<float> tmpTri(TRI_TEX_HEIGHT * 4); // one triangle's worth of triBuffer rows (8*4=32)
Expand All @@ -481,9 +471,7 @@ void buildBVH(std::vector<float>& triBuffer, int triCount,
for (int row = 0; row < TRI_TEX_HEIGHT; row++)
for (int c = 0; c < 4; c++)
tmpTri[row * 4 + c] = triBuffer[pagedIdx(i, row) + c];
if (!preserveObjTriOrder) {
std::memcpy(tmpObj.data(), rawObjTriBuf.data() + i * 32, 32 * sizeof(float));
}
std::memcpy(tmpObj.data(), rawObjTriBuf.data() + i * 32, 32 * sizeof(float));

int j = i;
while (true) {
Expand All @@ -493,20 +481,16 @@ void buildBVH(std::vector<float>& triBuffer, int triCount,
for (int row = 0; row < TRI_TEX_HEIGHT; row++)
for (int c = 0; c < 4; c++)
triBuffer[pagedIdx(j, row) + c] = triBuffer[pagedIdx(k, row) + c];
if (!preserveObjTriOrder) {
std::memcpy(rawObjTriBuf.data() + j * 32,
rawObjTriBuf.data() + k * 32, 32 * sizeof(float));
}
std::memcpy(rawObjTriBuf.data() + j * 32,
rawObjTriBuf.data() + k * 32, 32 * sizeof(float));
visited[k] = true;
j = k;
}
// j is the last position in the cycle; receives the saved start element
for (int row = 0; row < TRI_TEX_HEIGHT; row++)
for (int c = 0; c < 4; c++)
triBuffer[pagedIdx(j, row) + c] = tmpTri[row * 4 + c];
if (!preserveObjTriOrder) {
std::memcpy(rawObjTriBuf.data() + j * 32, tmpObj.data(), 32 * sizeof(float));
}
std::memcpy(rawObjTriBuf.data() + j * 32, tmpObj.data(), 32 * sizeof(float));
}
}
}
Expand Down Expand Up @@ -618,189 +602,4 @@ void buildOverlayBVH(
}
}

/// Builds one bottom-level acceleration structure (BLAS) over a contiguous
/// slice of object-space triangles and appends its 4-wide nodes to a shared
/// node buffer.
///
/// @param objTriBuf      Linear triangle buffer, 32 floats per triangle. The
///                       first three 4-float rows of each triangle are read as
///                       vertex positions (xyz). The slice
///                       [triStartLocal, triStartLocal + triCount) is reordered
///                       IN PLACE by this function.
/// @param triStartLocal  Index (in triangles) of the first triangle of the slice.
/// @param triCount       Number of triangles in the slice. When <= 0 nothing is
///                       built and a record with zero nodes / zero AABB is returned.
/// @param blasNodes      Shared node buffer; this BLAS's nodes are appended at the end.
/// @param leafIndicesOut Forwarded to `collapseBvh4`, which fills it.
/// @return Record holding the slice range, the offset of the first appended
///         node, the number of appended nodes and the root bounding box.
///
/// NOTE(review): no bounds check is made that the slice fits inside
/// `objTriBuf` — callers must guarantee it.
BlasRecord buildBlas(
std::vector<float>& objTriBuf,
int triStartLocal,
int triCount,
std::vector<Bvh4Node>& blasNodes,
std::vector<int>& leafIndicesOut) {

// Fill in the record up front so the early return below yields a
// well-defined "empty" BLAS (zero nodes, zero-sized AABB at the origin).
BlasRecord rec{};
rec.triStart = static_cast<std::uint32_t>(triStartLocal);
rec.triCount = static_cast<std::uint32_t>(std::max(0, triCount));
// The root of this BLAS will be the next node appended to the shared buffer.
rec.rootNodeOffset = static_cast<std::uint32_t>(blasNodes.size());
rec.nodeCount = 0;
for (int c = 0; c < 3; c++) { rec.aabbMin[c] = 0.f; rec.aabbMax[c] = 0.f; }
rec.aabbMin[3] = rec.aabbMax[3] = 0.f;

if (triCount <= 0) return rec;

// Pack object-space vertex positions into a temporary paged buffer so the
// shared `buildBvhNode` code can read them via `triGet` without a second
// accessor. Only rows 0/1/2 (the three vertex positions) are needed by
// the builder — normals/UVs don't affect tree shape.
const int localPages = triTexPages(triCount);
const std::size_t localWords =
static_cast<std::size_t>(localPages) * TEX_PAGE_WIDTH * TRI_TEX_HEIGHT * 4;
std::vector<float> localPaged(localWords, 0.f);
for (int li = 0; li < triCount; li++) {
// li = index within the slice, gi = index within the whole objTriBuf.
const int gi = triStartLocal + li;
const float* src = objTriBuf.data() + static_cast<std::size_t>(gi) * 32;
for (int row = 0; row < 3; row++) {
// Paged addressing: page = li / TEX_PAGE_WIDTH, column = li % TEX_PAGE_WIDTH,
// each page holding TRI_TEX_HEIGHT rows of TEX_PAGE_WIDTH 4-float texels.
const int lp = ((li / TEX_PAGE_WIDTH * TRI_TEX_HEIGHT + row) * TEX_PAGE_WIDTH
+ li % TEX_PAGE_WIDTH) * 4;
// Only xyz is copied; the w component stays at its 0.f initial value.
localPaged[lp + 0] = src[row * 4 + 0];
localPaged[lp + 1] = src[row * 4 + 1];
localPaged[lp + 2] = src[row * 4 + 2];
}
}

// Phase 1: binary BVH over local indices.
// NOTE(review): `localIdx` is later used as a permutation, so `buildBvhNode`
// presumably reorders it in place into leaf order — confirm against its definition.
std::vector<int> localIdx(triCount);
std::iota(localIdx.begin(), localIdx.end(), 0);
std::vector<BvhNode> binNodes;
binNodes.reserve(static_cast<std::size_t>(triCount) * 2);
buildBvhNode(binNodes, localIdx, localPaged, 0, triCount, -1);

// Phase 2: collapse into the shared `blasNodes` buffer. Because
// `collapseBvh4` uses `wide.size()` for self-indexing, child indices
// emitted into `blasNodes` are already absolute (they account for prior
// BLASes already in the buffer). `leafIndicesOut` receives absolute
// indices by the same mechanism.
const std::size_t nodeBase = blasNodes.size();
if (!binNodes.empty()) {
collapseBvh4(binNodes, blasNodes, leafIndicesOut, 0, -1);
}
// Everything appended since `nodeBase` belongs to this BLAS.
rec.nodeCount = static_cast<std::uint32_t>(blasNodes.size() - nodeBase);

// Offset leaf `triStart` values in the newly-emitted nodes from local
// (0..triCount-1) to global (triStartLocal + localStart), matching the
// overlay builder's convention.
for (std::size_t ni = nodeBase; ni < blasNodes.size(); ni++) {
auto& node = blasNodes[ni];
for (int c = 0; c < 4; c++) {
const int ci = node.childIdx[c];
if (ci >= 0 || ci == INT_MIN) continue; // internal or empty
// A leaf is stored as -(start * MAX_LEAF_TRIS + count) with count in
// 1..MAX_LEAF_TRIS; decode, shift the start, and re-encode.
const int raw = -ci;
const int lStart = (raw - 1) / MAX_LEAF_TRIS;
const int cnt = ((raw - 1) % MAX_LEAF_TRIS) + 1;
node.childIdx[c] = -(((lStart + triStartLocal) * MAX_LEAF_TRIS) + cnt);
}
}

// Reorder the object-space triangle slice in `objTriBuf` into BVH leaf
// order. Same cycle-permutation pattern as `buildBVH` / `buildOverlayBVH`
// but operating on the 32-float linear layout only (no paged world buf).
// After this block, slot j of the slice holds what was previously in slot
// localIdx[j]. Extra memory is one visited flag per triangle plus one
// 32-float temporary.
{
std::vector<bool> visited(triCount, false);
std::array<float, 32> tmp{};
for (int i = 0; i < triCount; i++) {
if (visited[i]) continue;
visited[i] = true;
if (localIdx[i] == i) continue; // fixed point — already in place

// Save the cycle's first element; its slot is overwritten first.
std::memcpy(tmp.data(),
objTriBuf.data() + static_cast<std::size_t>(triStartLocal + i) * 32,
32 * sizeof(float));
int j = i;
while (true) {
const int k = localIdx[j];
if (k == i) break; // cycle closed
std::memcpy(objTriBuf.data() + static_cast<std::size_t>(triStartLocal + j) * 32,
objTriBuf.data() + static_cast<std::size_t>(triStartLocal + k) * 32,
32 * sizeof(float));
visited[k] = true;
j = k;
}
// j is the last slot of the cycle; it receives the saved first element.
std::memcpy(objTriBuf.data() + static_cast<std::size_t>(triStartLocal + j) * 32,
tmp.data(), 32 * sizeof(float));
}
}

// Root AABB — union of the root node's children. Used later by the TLAS
// builder to produce each instance's world-space bounding box.
// NOTE(review): if every root child is skipped as empty, the bounds keep
// the +/-1e30 sentinels (an inverted box) — confirm whether that can happen.
if (rec.nodeCount > 0) {
const auto& root = blasNodes[nodeBase];
float mnX = 1e30f, mnY = 1e30f, mnZ = 1e30f;
float mxX = -1e30f, mxY = -1e30f, mxZ = -1e30f;
for (int c = 0; c < root.childCount; c++) {
if (root.childIdx[c] == INT_MIN) continue;
mnX = std::min(mnX, root.childMinX[c]);
mnY = std::min(mnY, root.childMinY[c]);
mnZ = std::min(mnZ, root.childMinZ[c]);
mxX = std::max(mxX, root.childMaxX[c]);
mxY = std::max(mxY, root.childMaxY[c]);
mxZ = std::max(mxZ, root.childMaxZ[c]);
}
rec.aabbMin[0] = mnX; rec.aabbMin[1] = mnY; rec.aabbMin[2] = mnZ;
rec.aabbMax[0] = mxX; rec.aabbMax[1] = mxY; rec.aabbMax[2] = mxZ;
}

return rec;
}

void buildBlasesForEntries(
const std::vector<RtMeshEntry>& entries,
const std::vector<std::pair<int, int>>& entryTriRanges,
std::vector<float>& objTriBuf,
std::vector<Bvh4Node>& blasNodes,
std::vector<int>& blasLeafIndices,
std::vector<BlasRecord>& blasRecords,
std::vector<TlasInstance>& tlasInstances,
std::vector<std::uint32_t>& tlasToEntryIdx) {

auto writeMat3x4 = [](const Matrix4& m, float out[3][4]) {
const auto& e = m.elements;
for (int row = 0; row < 3; ++row) {
out[row][0] = e[row];
out[row][1] = e[4 + row];
out[row][2] = e[8 + row];
out[row][3] = e[12 + row];
}
};

const std::size_t n = std::min(entries.size(), entryTriRanges.size());
for (std::size_t i = 0; i < n; ++i) {
const auto [triStart, triCount] = entryTriRanges[i];
if (triCount <= 0) continue;

const std::uint32_t blasIndex =
static_cast<std::uint32_t>(blasRecords.size());
BlasRecord rec = buildBlas(objTriBuf, triStart, triCount,
blasNodes, blasLeafIndices);
blasRecords.push_back(rec);

// Pull matIdx / meshIdx out of the first tri in the slice. All tris
// in one entry share both, so any index in [triStart, triStart+triCount)
// is fine — and buildBlas only reorders within that slice.
const float* first = objTriBuf.data() + static_cast<std::size_t>(triStart) * 32;
const auto matIdx = static_cast<std::uint32_t>(first[3]); // field 0.w
const auto meshIdx = static_cast<std::uint32_t>(first[7]); // field 1.w

TlasInstance inst{};
writeMat3x4(entries[i].worldMatrix, inst.objToWorld);
Matrix4 inv(entries[i].worldMatrix);
inv.invert();
writeMat3x4(inv, inst.worldToObj);
inst.blasIndex = blasIndex;
inst.matIdx = matIdx;
inst.meshId = meshIdx;

// Non-uniform scale flag (bit 0): compare basis-vector lengths from the
// 3x3 upper-left of the world matrix (column-major elements).
const auto& e = entries[i].worldMatrix.elements;
const Vector3 bx(e[0], e[1], e[2]);
const Vector3 by(e[4], e[5], e[6]);
const Vector3 bz(e[8], e[9], e[10]);
const float lx = bx.length(), ly = by.length(), lz = bz.length();
const float eps = 1e-4f * std::max({lx, ly, lz, 1.f});
const bool nonUniform =
std::abs(lx - ly) > eps || std::abs(ly - lz) > eps;
inst.flags = nonUniform ? 1u : 0u;

tlasInstances.push_back(inst);
tlasToEntryIdx.push_back(static_cast<std::uint32_t>(i));
}
}

}// namespace threepp::wgpu_pt
Loading
Loading