Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Depth raster: more improvements #19758

Merged
merged 5 commits into from
Dec 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions Common/Math/CrossSIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ struct Vec4F32 {
return Vec4F32{ _mm_and_ps(v, _mm_load_ps((float *)mask)) };
}

// Swaps the two lower elements. Useful for reversing triangles.
Vec4F32 SwapLowerElements() {
	// The shuffle selects source lanes (1, 0, 2, 3) for output lanes 0..3:
	// lanes 0 and 1 trade places, lanes 2 and 3 are passed through unchanged.
	Vec4F32 swapped{ _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1)) };
	return swapped;
}

inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
return Vec4F32{ _mm_add_ps(
_mm_add_ps(
Expand Down Expand Up @@ -210,8 +217,8 @@ struct Vec4S32 {
// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
Vec4S32 SwapLowerElements() {
	// The vector holds signed 32-bit lanes, so the 64-bit halves are int32x2_t
	// (the earlier float32x2_t declarations did not match the _s32 intrinsics).
	// vrev64 reverses the 32-bit lanes within a 64-bit half; applied to the low
	// half only, it exchanges lanes 0 and 1 while the high half is reused as-is.
	int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
	int32x2_t upper = vget_high_s32(v);
	return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
}

Expand Down Expand Up @@ -281,6 +288,13 @@ struct Vec4F32 {
return Vec4F32{ vsetq_lane_f32(0.0f, v, 3) };
}

// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
Vec4F32 SwapLowerElements() {
	// Split into 64-bit halves, reverse the two floats of the low half,
	// then glue the untouched high half back on.
	float32x2_t upper = vget_high_f32(v);
	float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
	return Vec4F32{ vcombine_f32(lowerSwapped, upper) };
}

// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
#if PPSSPP_ARCH(ARM64_NEON)
Expand Down
82 changes: 52 additions & 30 deletions GPU/Common/DepthRaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,19 @@ struct Edge {
}
};

// Outcome of rasterizing a single triangle.
// The numeric values are used directly as indices into a per-draw stats array,
// so they are spelled out and must stay contiguous starting at 0.
enum class TriangleResult {
	OK = 0,        // Triangle was rasterized.
	NoPixels = 1,  // Clamped bounding box was degenerate; nothing to draw.
	Backface = 2,  // Computed area was zero or negative.
	TooSmall = 3,  // Computed area was positive but below MIN_TRI_AREA.
};

// Cull threshold used by DepthRasterTriangle: a triangle whose computed triArea is
// positive but below this value is rejected with TriangleResult::TooSmall.
constexpr int MIN_TRI_AREA = 10;

// Adapted from Intel's depth rasterizer example.
// Started with the scalar version, will SIMD-ify later.
// x1/y1 etc are the scissor rect.
void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, ZCompareMode compareMode) {
TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz, ZCompareMode compareMode) {
int tileStartX = x1;
int tileEndX = x2;

Expand All @@ -95,16 +104,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
// are slow on SSE2.

// Convert to whole pixels for now. Later subpixel precision.
int v0x = tx[0];
int v0y = ty[0];
int v0z = tz[0];
int v1x = tx[1];
int v1y = ty[1];
int v1z = tz[1];
int v2x = tx[2];
int v2y = ty[2];
int v2z = tz[2];

// use fixed-point only for X and Y. Avoid work for Z and W.
// We use 4x1 tiles for simplicity.
Expand All @@ -114,13 +119,16 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY);
if (maxX == minX || maxY == minY) {
// No pixels, or outside screen.
return;
return TriangleResult::NoPixels;
}

// TODO: Cull really small triangles here.
// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
if (triArea <= 0) {
return;
return TriangleResult::Backface;
}
if (triArea < MIN_TRI_AREA) {
return TriangleResult::TooSmall;
}

float oneOverTriArea = 1.0f / (float)triArea;
Expand All @@ -132,20 +140,25 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);

// Prepare to interpolate Z
Vec4F32 zz0 = Vec4F32::Splat((float)v0z);
Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea);
Vec4F32 zz2 = Vec4F32::Splat((float)(v2z - v0z) * oneOverTriArea);
Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
Vec4F32 zz1 = Vec4F32::Splat((tz[1] - tz[0]) * oneOverTriArea);
Vec4F32 zz2 = Vec4F32::Splat((tz[2] - tz[0]) * oneOverTriArea);

Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(e20.oneStepX) + zz2 * Vec4F32FromS32(e01.oneStepX);
Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(e20.oneStepY) + zz2 * Vec4F32FromS32(e01.oneStepY);
Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;

// Rasterize
for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY) {
for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
// Barycentric coordinates at start of row
Vec4S32 w0 = w0_row;
Vec4S32 w1 = w1_row;
Vec4S32 w2 = w2_row;
Vec4F32 zs = zrow;

uint16_t *rowPtr = depthBuf + stride * y;

for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX) {
for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
// If p is on or inside all edges for any pixels,
// render those pixels.
Vec4S32 signCalc = w0 | w1 | w2;
Expand All @@ -157,9 +170,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.

// Compute the Z value for all four pixels.
// float depth = zz[0] + beta * zz[1] + gamma * zz[2];
Vec4U16 shortZ = Vec4U16::FromVec4F32(zz0 + Vec4F32FromS32(w1) * zz1 + Vec4F32FromS32(w2) * zz2);
Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);

// TODO: Lift this switch out of the inner loop, or even out of the function with templating.
switch (compareMode) {
Expand All @@ -180,6 +191,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
}
}
}
return TriangleResult::OK;
}

void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
Expand Down Expand Up @@ -249,7 +261,7 @@ void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVert
}
}

int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
int DepthRasterClipIndexedRectangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
// However, hard to find a common abstraction.
const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter());
Expand Down Expand Up @@ -289,20 +301,21 @@ int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *tra
y *= recipW;
z *= recipW;

Vec4S32 screen[3];
Vec4S32 screen[2];
Vec4F32 depth;
screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
depth = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f);

screen[0].Store(tx + outCount);
screen[1].Store(ty + outCount);
screen[2].Store(tz + outCount);
depth.Store(tz + outCount);
outCount += 2;
}
return outCount;
}

int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
bool cullEnabled = gstate.isCullEnabled();
GECullMode cullMode = gstate.getCullMode();

Expand Down Expand Up @@ -351,14 +364,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
y *= recipW;
z *= recipW;

Vec4S32 screen[3];
Vec4S32 screen[2];
screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
Vec4F32 depth = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f);

screen[0].Store(tx + outCount);
screen[1].Store(ty + outCount);
screen[2].Store(tz + outCount);
depth.Store(tz + outCount);
outCount += 3;

if (!cullEnabled) {
Expand All @@ -371,25 +384,25 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran

screen[0].SwapLowerElements().Store(tx + outCount);
screen[1].SwapLowerElements().Store(ty + outCount);
screen[2].SwapLowerElements().Store(tz + outCount);
depth.SwapLowerElements().Store(tz + outCount);
outCount += 3;
}
}
return outCount;
}

// Scatters already-transformed vertices into the separate x/y/z arrays consumed by the
// depth rasterizer, following the index buffer (an AoS -> SoA conversion).
//
// transformed:  4 floats per vertex; only the first three (x, y, z) are read.
// indexBuffer:  count vertex indices into transformed.
// tx, ty:       receive x and y truncated to whole pixels.
// tz:           receives z as float, unmodified.
// Each output array must have room for count entries.
void DepthRasterConvertTransformed(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
	// TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways.
	for (int i = 0; i < count; i++) {
		const float *pos = transformed + indexBuffer[i] * 4;
		tx[i] = static_cast<int>(pos[0]);
		ty[i] = static_cast<int>(pos[1]);
		// Z stays in floating point; the stale 16-bit truncating store was removed.
		// NOTE(review): the value is not clamped here - confirm whether it should be
		// limited to the 16-bit depth range before it is narrowed downstream.
		tz[i] = pos[2];
	}
}

// Rasterizes screen-space vertices.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count) {
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz, int count) {
// Prim should now be either TRIANGLES or RECTs.
_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);

Expand Down Expand Up @@ -438,17 +451,26 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
switch (prim) {
case GE_PRIM_RECTANGLES:
for (int i = 0; i < count; i += 2) {
uint16_t z = tz[i + 1]; // depth from second vertex
uint16_t z = (uint16_t)tz[i + 1]; // depth from second vertex
// TODO: Should clip coordinates to the scissor rectangle.
// We remove the subpixel information here.
DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, comp);
}
gpuStats.numDepthRasterPrims += count / 2;
break;
case GE_PRIM_TRIANGLES:
{
int stats[4]{};
for (int i = 0; i < count; i += 3) {
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
TriangleResult result = DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
stats[(int)result]++;
}
gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
break;
}
default:
_dbg_assert_(false);
}
Expand Down
8 changes: 4 additions & 4 deletions GPU/Common/DepthRaster.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ struct DepthScreenVertex {
class VertexDecoder;
struct TransformedVertex;

// Screen-space X and Y are integers; depth (tz) is carried as float.
// The superseded `int *tz` prototypes are dropped so that no overload is declared
// without a matching definition.
int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count);
int DepthRasterClipIndexedRectangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count);
void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, VertexDecoder *dec, int count);
void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, VertexDecoder *dec, int count);
void DepthRasterConvertTransformed(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count);

// void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);

void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz, int count);
17 changes: 13 additions & 4 deletions GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "Common/LogReporting.h"
#include "Common/Math/SIMDHeaders.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Common/TimeUtil.h"
#include "Core/System.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
Expand Down Expand Up @@ -914,6 +915,7 @@ inline void ComputeFinalProjMatrix(float *worldviewproj) {
}

void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {

switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
Expand All @@ -929,6 +931,8 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
return;
}

TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);

float worldviewproj[16];
ComputeFinalProjMatrix(worldviewproj);

Expand All @@ -953,7 +957,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder

int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);

// Clip and triangulate using the index buffer.
int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
Expand All @@ -962,7 +966,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
tx[i] = 0;
ty[i] = 0;
tz[i] = 0;
tz[i] = 0.0f;
}
}

Expand All @@ -972,6 +976,8 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
}

void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount) {
TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);

switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
Expand All @@ -987,7 +993,7 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i

int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);

int outVertCount = 0;

Expand All @@ -996,6 +1002,9 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
DepthRasterConvertTransformed(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
outVertCount = vertexCount;
} else {
if (dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) {
return;
}
float worldviewproj[16];
ComputeFinalProjMatrix(worldviewproj);
TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, decoded_, dec, numDecoded);
Expand All @@ -1018,7 +1027,7 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
tx[i] = 0;
ty[i] = 0;
tz[i] = 0;
tz[i] = 0.0f;
}
}
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
Expand Down
2 changes: 2 additions & 0 deletions GPU/Common/FramebufferManagerCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1866,6 +1866,8 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w,
char tag[128];
size_t len = FormatFramebufferName(vfb, tag, sizeof(tag));

gpuStats.numFBOsCreated++;

vfb->fbo = draw_->CreateFramebuffer({ vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), msaaLevel_, true, tag });
if (Memory::IsVRAMAddress(vfb->fb_address) && vfb->fb_stride != 0) {
NotifyMemInfo(MemBlockFlags::ALLOC, vfb->fb_address, vfb->BufferByteSize(RASTER_COLOR), tag, len);
Expand Down
Loading
Loading