Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combine draw commands to improve rendering performance #2421

Open
wants to merge 41 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
d8f6517
rename some things for clarity
douira Feb 19, 2024
e17aca9
fix waterlogged glass panes (once again, but more this time) by avoid…
douira Feb 19, 2024
0e9f45b
use Mth.clamp for clarity
douira Feb 20, 2024
856f96d
refactor buffer and sort result handling, buffers are now freed immed…
douira Mar 2, 2024
f969b7f
reduce number of unique triggers by around 5 percent without impactin…
douira Mar 4, 2024
e9c9062
importantly sort a little farther away, sort tasks are fast
douira Mar 4, 2024
bee8d00
use defer zero frames for important sort tasks by default
douira Mar 4, 2024
7d8587d
fix build
douira Mar 4, 2024
8ccba8c
clarify authorship of BitArray
douira Mar 13, 2024
be07541
fix bug with radix sort for SNR heuristic in BSP partition generating…
douira Mar 27, 2024
37f1f67
Merge branch 'dev' into ts-waterlogged-glass-panes
douira Apr 6, 2024
2dd7f5e
combine draw commands
douira Apr 14, 2024
d407a83
correctly reset accumulated element count
douira Apr 16, 2024
2520c25
remove draw call combining for indexed rendering as it's broken and h…
douira Apr 21, 2024
a84c18f
skip heuristic if there's no quads
douira Apr 21, 2024
298522f
refactor primary intersector detection to handle large cases better,
douira Apr 21, 2024
6b7bc8f
fix topo sorting in some situations where the dot product was wrongly…
douira Apr 27, 2024
4322aaf
reorder vertex ranges before uploaded to optimize for combined draw c…
douira Apr 29, 2024
53c8e79
Merge branch 'ts-waterlogged-glass-panes' into combine-draw-commands
douira Apr 30, 2024
8953480
tune primary intersector detection to handle situations where only a …
douira May 9, 2024
1907715
Merge branch 'dev' into ts-waterlogged-glass-panes
douira May 12, 2024
3da73cb
Merge branch 'dev' into ts-waterlogged-glass-panes
douira May 12, 2024
d4ac4c1
Merge branch 'ts-waterlogged-glass-panes' into combine-draw-commands
douira May 20, 2024
51fe61d
Merge branch 'dev' into combine-draw-commands
douira May 20, 2024
47f11ac
Merge branch 'dev' into combine-draw-commands
douira Jul 9, 2024
548f2a1
Merge branch 'dev' into combine-draw-commands
douira Sep 20, 2024
575ecb5
fix draw command combining, remove aggressive non-empty command skipp…
douira Sep 20, 2024
e936630
fix graphical corruption when there's a lot of geometry by appropriat…
douira Sep 26, 2024
2af56fc
cleanup unused and broken code
douira Sep 26, 2024
a395364
cleanup calculation of mask bit and element count
douira Sep 28, 2024
2163c9e
cleanup meshing, storage, and renderer
douira Sep 28, 2024
c01a6bd
fix translucent rendering by correctly decoding vertex segments
douira Sep 28, 2024
fde4e60
cleanup misc, remove unused code
douira Sep 29, 2024
59a9517
refactor translucent AnyOrderData to not generate its own trivial ind…
douira Sep 29, 2024
8334733
add Index Pool arena size
douira Sep 30, 2024
35e374f
add arena content caching
douira Oct 1, 2024
b449bbf
Merge branch 'dev' into combine-draw-commands
douira Oct 4, 2024
85bc54e
refactor storage to cope with larger amounts of geometry and use less…
douira Oct 4, 2024
9b24127
remove debug code
douira Oct 5, 2024
e962b37
Merge branch 'dev' into combine-draw-commands
douira Oct 26, 2024
0de9587
Merge branch 'dev' into combine-draw-commands
douira Nov 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package net.caffeinemc.mods.sodium.client.gl.arena;

import it.unimi.dsi.fastutil.longs.Long2ReferenceOpenHashMap;
import net.caffeinemc.mods.sodium.client.gl.arena.staging.StagingBuffer;
import net.caffeinemc.mods.sodium.client.gl.buffer.GlBuffer;
import net.caffeinemc.mods.sodium.client.gl.buffer.GlBufferUsage;
import net.caffeinemc.mods.sodium.client.gl.buffer.GlMutableBuffer;
import net.caffeinemc.mods.sodium.client.gl.device.CommandList;
import net.jpountz.xxhash.XXHash64;
import net.jpountz.xxhash.XXHashFactory;

import java.nio.ByteBuffer;
import java.util.ArrayList;
Expand All @@ -26,12 +29,17 @@ public class GlBufferArena {

private GlBufferSegment head;

private static final XXHash64 NATIVE_HASH = XXHashFactory.fastestInstance().hash64();
private static final XXHash64 JAVA_HASH = XXHashFactory.fastestJavaInstance().hash64();
private static final int NATIVE_HASH_BYTES_THRESHOLD = 512; // TODO: tune this?
private final Long2ReferenceOpenHashMap<GlBufferSegment> cache;

private long capacity;
private long used;

private final int stride;

public GlBufferArena(CommandList commands, int initialCapacity, int stride, StagingBuffer stagingBuffer) {
public GlBufferArena(CommandList commands, int initialCapacity, int stride, StagingBuffer stagingBuffer, boolean enableCache) {
this.capacity = initialCapacity;
this.resizeIncrement = initialCapacity / 16;

Expand All @@ -44,6 +52,12 @@ public GlBufferArena(CommandList commands, int initialCapacity, int stride, Stag
commands.allocateStorage(this.arenaBuffer, this.capacity * stride, BUFFER_USAGE);

this.stagingBuffer = stagingBuffer;

if (enableCache) {
this.cache = new Long2ReferenceOpenHashMap<>();
} else {
this.cache = null;
}
}

private void resize(CommandList commandList, long newCapacity) {
Expand Down Expand Up @@ -226,6 +240,10 @@ public void free(GlBufferSegment entry) {
throw new IllegalStateException("Already freed");
}

if (entry.isHashed()) {
this.cache.remove(entry.getHash());
}

entry.setFree(true);

this.used -= entry.getLength();
Expand Down Expand Up @@ -265,7 +283,7 @@ public boolean upload(CommandList commandList, Stream<PendingUpload> stream) {
// A linked list is used as we'll be randomly removing elements and want O(1) performance
List<PendingUpload> queue = stream.collect(Collectors.toCollection(LinkedList::new));

// Try to upload all of the data into free segments first
// Try to upload all the data into free segments first
this.tryUploads(commandList, queue);

// If we weren't able to upload some buffers, they will have been left behind in the queue
Expand Down Expand Up @@ -297,18 +315,46 @@ private void tryUploads(CommandList commandList, List<PendingUpload> queue) {
this.stagingBuffer.flush(commandList);
}

/**
 * Computes a 64-bit xxHash over the readable bytes of {@code data}, i.e. the range from the buffer's
 * current position up to its limit. The buffer's position and limit are not modified.
 *
 * @param data the buffer whose remaining content is hashed
 * @return the hash of the remaining content, seeded with this arena's identity hash code
 */
private long getBufferHash(ByteBuffer data) {
    // the seed is this arena's identity hash code, so the same content hashes differently in different arenas
    final long seed = System.identityHashCode(this);

    // XXHash64#hash(ByteBuffer, int, int, long) addresses the buffer absolutely and ignores its position,
    // so the start offset has to be passed explicitly; using 0 would hash the wrong range for a buffer
    // whose position is not zero.
    final int offset = data.position();
    final int length = data.remaining();

    // small inputs use the pure-Java implementation, larger ones the fastest available implementation
    // NOTE(review): presumably this avoids native call overhead for small buffers, confirm when tuning the threshold
    final XXHash64 hasher = length < NATIVE_HASH_BYTES_THRESHOLD ? JAVA_HASH : NATIVE_HASH;
    return hasher.hash(data, offset, length, seed);
}

private boolean tryUpload(CommandList commandList, PendingUpload upload) {
ByteBuffer data = upload.getDataBuffer()
.getDirectBuffer();
ByteBuffer data = upload.getDataBuffer().getDirectBuffer();

int elementCount = data.remaining() / this.stride;

// return a buffer segment with the same content if there is one based on the hash of the incoming content
GlBufferSegment matchingSegment = null;
long hash = 0;
if (this.cache != null) {
hash = this.getBufferHash(data);
matchingSegment = this.cache.get(hash);
}
if (matchingSegment != null) {
upload.setResult(matchingSegment);
matchingSegment.addRef();
return true;
}

GlBufferSegment dst = this.alloc(elementCount);

if (dst == null) {
return false;
}

// if a new segment was needed (cache miss), set the calculated hash on the segment
if (this.cache != null) {
dst.setHash(hash);
this.cache.put(hash, dst);
}

// Copy the data into our staging buffer, then copy it into the arena's buffer
this.stagingBuffer.enqueueCopy(commandList, data, this.arenaBuffer, dst.getOffset() * this.stride);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ public class GlBufferSegment {
private final GlBufferArena arena;

private boolean free = false;
private int refCount = 1;
private long hash;
private boolean isHashed = false;

private int offset; /* Uint32 */
private int length; /* Uint32 */
Expand Down Expand Up @@ -42,8 +45,33 @@ protected void setLength(long length /* Uint32 */) {
this.length = UInt32.downcast(length);
}

/**
 * Stores the given hash on this segment and marks the segment as hashed.
 *
 * @param hash the hash value to associate with this segment
 */
public void setHash(long hash) {
this.hash = hash;
// remember that a hash was explicitly set, since any long value (including 0) can be a valid hash
this.isHashed = true;
}

/**
 * Returns the hash stored on this segment. The value is only meaningful when {@link #isHashed()}
 * returns {@code true}.
 *
 * @return the stored hash value
 */
public long getHash() {
return this.hash;
}

/**
 * Returns whether this segment currently has a hash associated with it.
 *
 * @return {@code true} if the hashed flag is set on this segment
 */
public boolean isHashed() {
return this.isHashed;
}

/**
 * Registers one additional user of this segment by incrementing its reference count.
 *
 * @throws IllegalStateException if this segment is currently marked as free
 */
public void addRef() {
    final boolean alreadyFree = this.isFree();

    // a free segment has no content that could be shared, so taking a reference to it is an error
    if (alreadyFree) {
        throw new IllegalStateException("Cannot add ref to free segment");
    }

    this.refCount += 1;
}

/**
 * Updates the free flag of this segment and adjusts the reference count to match it.
 *
 * @param free {@code true} to mark the segment as free, {@code false} to mark it as in use
 */
protected void setFree(boolean free) {
    this.free = free;

    // a free segment has no users; a segment in use keeps its count but never drops below one
    this.refCount = free ? 0 : Math.max(this.refCount, 1);
}

protected boolean isFree() {
Expand All @@ -67,7 +95,11 @@ protected void setPrev(GlBufferSegment prev) {
}

public void delete() {
this.arena.free(this);
// only actually free if there's no more users
if (--this.refCount == 0) {
this.arena.free(this);
this.isHashed = false;
}
}

protected void mergeInto(GlBufferSegment entry) {
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ public boolean isEmpty() {
public int getIndexBufferSize() {
int elements = 0;

// since there's command combining, all facings might be rendered at the same time with a single command which requires a bigger index buffer
for (var index = 0; index < this.size; index++) {
elements = Math.max(elements, MemoryUtil.memGetInt(this.pElementCount + ((long) index * Integer.BYTES)));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package net.caffeinemc.mods.sodium.client.render.chunk;

import net.caffeinemc.mods.sodium.client.SodiumClientMod;
import net.caffeinemc.mods.sodium.client.gl.attribute.GlVertexAttributeBinding;
import net.caffeinemc.mods.sodium.client.gl.device.CommandList;
import net.caffeinemc.mods.sodium.client.gl.device.DrawCommandList;
import net.caffeinemc.mods.sodium.client.gl.device.MultiDrawBatch;
Expand All @@ -16,7 +15,6 @@
import net.caffeinemc.mods.sodium.client.render.chunk.lists.ChunkRenderList;
import net.caffeinemc.mods.sodium.client.render.chunk.lists.ChunkRenderListIterable;
import net.caffeinemc.mods.sodium.client.render.chunk.region.RenderRegion;
import net.caffeinemc.mods.sodium.client.render.chunk.shader.ChunkShaderBindingPoints;
import net.caffeinemc.mods.sodium.client.render.chunk.shader.ChunkShaderInterface;
import net.caffeinemc.mods.sodium.client.render.chunk.terrain.TerrainRenderPass;
import net.caffeinemc.mods.sodium.client.render.chunk.translucent_sorting.SortBehavior;
Expand Down Expand Up @@ -72,7 +70,7 @@ public void render(ChunkRenderMatrices matrices,
continue;
}

fillCommandBuffer(this.batch, region, storage, renderList, camera, renderPass, useBlockFaceCulling);
fillCommandBuffer(this.batch, region, storage, renderList, camera, renderPass, useBlockFaceCulling, useIndexedTessellation);

if (this.batch.isEmpty()) {
continue;
Expand Down Expand Up @@ -109,7 +107,8 @@ private static void fillCommandBuffer(MultiDrawBatch batch,
ChunkRenderList renderList,
CameraTransform camera,
TerrainRenderPass pass,
boolean useBlockFaceCulling) {
boolean useBlockFaceCulling,
boolean useIndexedTessellation) {
batch.clear();

var iterator = renderList.sectionsWithGeometryIterator(pass.isTranslucent());
Expand Down Expand Up @@ -149,30 +148,48 @@ private static void fillCommandBuffer(MultiDrawBatch batch,
continue;
}

if (pass.isTranslucent()) {
addIndexedDrawCommands(batch, pMeshData, slices);
// it's sometimes necessary to not use the locally-indexed command generator even for indexed tessellations since
// sometimes the index buffer is shared, but not globally shared. This means that translucent sections that
// are sharing an index buffer amongst them need to use the shared index command generator since it sets the
// same element offset for each draw command and doesn't increment it. Recall that in each draw command the indexing
// of the elements needs to start at 0 and thus starting somewhere further into the shared index buffer is invalid.
// there's also the optimization that draw commands can be combined when using a shared index buffer, be it
// globally shared or just shared within the region, which isn't possible with the locally-indexed command generator.
if (useIndexedTessellation && SectionRenderDataUnsafe.isLocalIndex(pMeshData)) {
addLocalIndexedDrawCommands(batch, pMeshData, slices);
} else {
addNonIndexedDrawCommands(batch, pMeshData, slices);
addSharedIndexedDrawCommands(batch, pMeshData, slices);
}
}
}

/**
* Generates the draw commands for a chunk's meshes using the shared index buffer.
* Generates the draw commands for a chunk's meshes, where each mesh has a separate index buffer. This is used
* when rendering translucent geometry, as each geometry set needs a sorted index buffer.
*/
@SuppressWarnings("IntegerMultiplicationImplicitCastToLong")
private static void addNonIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
private static void addLocalIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
final var pElementPointer = batch.pElementPointer;
final var pBaseVertex = batch.pBaseVertex;
final var pElementCount = batch.pElementCount;

int size = batch.size;

long elementOffset = SectionRenderDataUnsafe.getBaseElement(pMeshData);
long baseVertex = SectionRenderDataUnsafe.getBaseVertex(pMeshData);

for (int facing = 0; facing < ModelQuadFacing.COUNT; facing++) {
// Uint32 -> Int32 cast is always safe and should be optimized away
MemoryUtil.memPutInt(pBaseVertex + (size << 2), (int) SectionRenderDataUnsafe.getVertexOffset(pMeshData, facing));
MemoryUtil.memPutInt(pElementCount + (size << 2), (int) SectionRenderDataUnsafe.getElementCount(pMeshData, facing));
MemoryUtil.memPutAddress(pElementPointer + (size << 3), 0 /* using a shared index buffer */);
final long vertexCount = SectionRenderDataUnsafe.getVertexCount(pMeshData, facing);
final long elementCount = (vertexCount >> 2) * 6;

MemoryUtil.memPutInt(pElementCount + (size << 2), UInt32.uncheckedDowncast(elementCount));
MemoryUtil.memPutInt(pBaseVertex + (size << 2), UInt32.uncheckedDowncast(baseVertex));

// * 4 to convert to bytes (the index buffer contains integers)
MemoryUtil.memPutAddress(pElementPointer + (size << 3), elementOffset << 2);

baseVertex += vertexCount;
elementOffset += elementCount;

size += (mask >> facing) & 1;
}
Expand All @@ -181,34 +198,57 @@ private static void addNonIndexedDrawCommands(MultiDrawBatch batch, long pMeshDa
}

/**
* Generates the draw commands for a chunk's meshes, where each mesh has a separate index buffer. This is used
* when rendering translucent geometry, as each geometry set needs a sorted index buffer.
* Generates the draw commands for a chunk's meshes using the shared index buffer.
*/
@SuppressWarnings("IntegerMultiplicationImplicitCastToLong")
private static void addIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
private static void addSharedIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
final var pElementPointer = batch.pElementPointer;
final var pBaseVertex = batch.pBaseVertex;
final var pElementCount = batch.pElementCount;

int size = batch.size;

long elementOffset = SectionRenderDataUnsafe.getBaseElement(pMeshData);
// this is either zero (global shared index buffer) or the offset to the location of the shared element buffer (region shared index buffer)
final var elementOffsetBytes = SectionRenderDataUnsafe.getBaseElement(pMeshData) << 2;
final var facingList = SectionRenderDataUnsafe.getFacingList(pMeshData);

for (int facing = 0; facing < ModelQuadFacing.COUNT; facing++) {
final long vertexOffset = SectionRenderDataUnsafe.getVertexOffset(pMeshData, facing);
final long elementCount = SectionRenderDataUnsafe.getElementCount(pMeshData, facing);

// Uint32 -> Int32 cast is always safe and should be optimized away
MemoryUtil.memPutInt(pBaseVertex + (size << 2), UInt32.uncheckedDowncast(vertexOffset));
MemoryUtil.memPutInt(pElementCount + (size << 2), UInt32.uncheckedDowncast(elementCount));
int size = batch.size;
long groupVertexCount = 0;
long baseVertex = SectionRenderDataUnsafe.getBaseVertex(pMeshData);
int lastMaskBit = 0;

for (int i = 0; i <= ModelQuadFacing.COUNT; i++) {
var maskBit = 0;
long vertexCount = 0;
if (i < ModelQuadFacing.COUNT) {
vertexCount = SectionRenderDataUnsafe.getVertexCount(pMeshData, i);

// if there are no vertices, the mask bit is just 0
if (vertexCount != 0) {
var facing = (facingList >>> (i * 8)) & 0xFF;
maskBit = (mask >>> facing) & 1;
}
}

// * 4 to convert to bytes (the index buffer contains integers)
// the section render data storage for the indices stores the offset in indices (also called elements)
MemoryUtil.memPutAddress(pElementPointer + (size << 3), elementOffset << 2);
if (maskBit == 0) {
if (lastMaskBit == 1) {
// delay writing out draw command if there's a zero-size group
if (i < ModelQuadFacing.COUNT && vertexCount == 0) {
continue;
}

MemoryUtil.memPutInt(pElementCount + (size << 2), UInt32.uncheckedDowncast((groupVertexCount >> 2) * 6));
MemoryUtil.memPutInt(pBaseVertex + (size << 2), UInt32.uncheckedDowncast(baseVertex));
MemoryUtil.memPutAddress(pElementPointer + (size << 3), elementOffsetBytes);
size++;
baseVertex += groupVertexCount;
groupVertexCount = 0;
}

baseVertex += vertexCount;
} else {
groupVertexCount += vertexCount;
}

// adding the number of elements works because the index data has one index per element (which are the indices)
elementOffset += elementCount;
size += (mask >> facing) & 1;
lastMaskBit = maskBit;
}

batch.size = size;
Expand All @@ -223,7 +263,7 @@ private static void addIndexedDrawCommands(MultiDrawBatch batch, long pMeshData,
private static final int MODEL_NEG_Y = ModelQuadFacing.NEG_Y.ordinal();
private static final int MODEL_NEG_Z = ModelQuadFacing.NEG_Z.ordinal();

private static int getVisibleFaces(int originX, int originY, int originZ, int chunkX, int chunkY, int chunkZ) {
public static int getVisibleFaces(int originX, int originY, int originZ, int chunkX, int chunkY, int chunkZ) {
// This is carefully written so that we can keep everything branch-less.
//
// Normally, this would be a ridiculous way to handle the problem. But the Hotspot VM's
Expand Down
Loading