Skip to content

Commit

Permalink
Merge pull request #19758 from hrydgard/depth-raster-more-improvements
Browse files Browse the repository at this point in the history
Depth raster: more improvements
  • Loading branch information
hrydgard authored Dec 22, 2024
2 parents ad2714a + 3ccb01b commit fed54f6
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 43 deletions.
18 changes: 16 additions & 2 deletions Common/Math/CrossSIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ struct Vec4F32 {
return Vec4F32{ _mm_and_ps(v, _mm_load_ps((float *)mask)) };
}

// Returns a copy with lanes 0 and 1 exchanged; lanes 2 and 3 keep their positions.
// Useful for reversing triangles.
Vec4F32 SwapLowerElements() {
	// _MM_SHUFFLE lists source lanes from highest to lowest: result = { v[1], v[0], v[2], v[3] }.
	const auto swapped = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1));
	return Vec4F32{ swapped };
}

inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
return Vec4F32{ _mm_add_ps(
_mm_add_ps(
Expand Down Expand Up @@ -210,8 +217,8 @@ struct Vec4S32 {
// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
Vec4S32 SwapLowerElements() {
	// The _s32 half-vector intrinsics produce and consume int32x2_t, so the halves
	// must be held in int32x2_t (not float32x2_t) for this to type-check.
	int32x2_t upper = vget_high_s32(v);
	// vrev64_s32 reverses the two 32-bit lanes of the low 64-bit half.
	int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
	return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
}

Expand Down Expand Up @@ -281,6 +288,13 @@ struct Vec4F32 {
return Vec4F32{ vsetq_lane_f32(0.0f, v, 3) };
}

// Exchanges lanes 0 and 1 while lanes 2 and 3 stay put. Useful for reversing triangles.
// A bit clumsy on ARM64 - there may be a neater way to express this.
Vec4F32 SwapLowerElements() {
	float32x2_t upper = vget_high_f32(v);
	// vrev64_f32 reverses the two 32-bit lanes of the low 64-bit half.
	float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
	return Vec4F32{ vcombine_f32(lowerSwapped, upper) };
}

// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
#if PPSSPP_ARCH(ARM64_NEON)
Expand Down
82 changes: 52 additions & 30 deletions GPU/Common/DepthRaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,19 @@ struct Edge {
}
};

// Outcome of rasterizing a single triangle in DepthRasterTriangle.
// The numeric values are used as indices into a 4-entry stats array by the caller,
// so they must stay contiguous and start at 0.
enum class TriangleResult {
	OK = 0,        // Triangle went through the rasterization loop.
	NoPixels = 1,  // Clamped bounding box is degenerate (no pixels, or outside screen).
	Backface = 2,  // Computed triangle area was <= 0.
	TooSmall = 3,  // Positive area, but below MIN_TRI_AREA.
};

// Triangles whose computed area is below this threshold are skipped as TooSmall.
// The area is taken from the integer vertex coordinates.
constexpr int MIN_TRI_AREA = 10;

// Adapted from Intel's depth rasterizer example.
// Started with the scalar version, will SIMD-ify later.
// x1/y1 etc are the scissor rect.
void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, ZCompareMode compareMode) {
TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz, ZCompareMode compareMode) {
int tileStartX = x1;
int tileEndX = x2;

Expand All @@ -95,16 +104,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
// are slow on SSE2.

// Convert to whole pixels for now. Later subpixel precision.
int v0x = tx[0];
int v0y = ty[0];
int v0z = tz[0];
int v1x = tx[1];
int v1y = ty[1];
int v1z = tz[1];
int v2x = tx[2];
int v2y = ty[2];
int v2z = tz[2];

// use fixed-point only for X and Y. Avoid work for Z and W.
// We use 4x1 tiles for simplicity.
Expand All @@ -114,13 +119,16 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY);
if (maxX == minX || maxY == minY) {
// No pixels, or outside screen.
return;
return TriangleResult::NoPixels;
}

// TODO: Cull really small triangles here.
// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
if (triArea <= 0) {
return;
return TriangleResult::Backface;
}
if (triArea < MIN_TRI_AREA) {
return TriangleResult::TooSmall;
}

float oneOverTriArea = 1.0f / (float)triArea;
Expand All @@ -132,20 +140,25 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);

// Prepare to interpolate Z
Vec4F32 zz0 = Vec4F32::Splat((float)v0z);
Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea);
Vec4F32 zz2 = Vec4F32::Splat((float)(v2z - v0z) * oneOverTriArea);
Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
Vec4F32 zz1 = Vec4F32::Splat((tz[1] - tz[0]) * oneOverTriArea);
Vec4F32 zz2 = Vec4F32::Splat((tz[2] - tz[0]) * oneOverTriArea);

Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(e20.oneStepX) + zz2 * Vec4F32FromS32(e01.oneStepX);
Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(e20.oneStepY) + zz2 * Vec4F32FromS32(e01.oneStepY);
Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;

// Rasterize
for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY) {
for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
// Barycentric coordinates at start of row
Vec4S32 w0 = w0_row;
Vec4S32 w1 = w1_row;
Vec4S32 w2 = w2_row;
Vec4F32 zs = zrow;

uint16_t *rowPtr = depthBuf + stride * y;

for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX) {
for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
// If p is on or inside all edges for any pixels,
// render those pixels.
Vec4S32 signCalc = w0 | w1 | w2;
Expand All @@ -157,9 +170,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.

// Compute the Z value for all four pixels.
// float depth = zz[0] + beta * zz[1] + gamma * zz[2];
Vec4U16 shortZ = Vec4U16::FromVec4F32(zz0 + Vec4F32FromS32(w1) * zz1 + Vec4F32FromS32(w2) * zz2);
Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);

// TODO: Lift this switch out of the inner loop, or even out of the function with templating.
switch (compareMode) {
Expand All @@ -180,6 +191,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
}
}
}
return TriangleResult::OK;
}

void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
Expand Down Expand Up @@ -249,7 +261,7 @@ void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVert
}
}

int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
int DepthRasterClipIndexedRectangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
// However, hard to find a common abstraction.
const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter());
Expand Down Expand Up @@ -289,20 +301,21 @@ int DepthRasterClipIndexedRectangles(int *tx, int *ty, int *tz, const float *tra
y *= recipW;
z *= recipW;

Vec4S32 screen[3];
Vec4S32 screen[2];
Vec4F32 depth;
screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
depth = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f);

screen[0].Store(tx + outCount);
screen[1].Store(ty + outCount);
screen[2].Store(tz + outCount);
depth.Store(tz + outCount);
outCount += 2;
}
return outCount;
}

int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
bool cullEnabled = gstate.isCullEnabled();
GECullMode cullMode = gstate.getCullMode();

Expand Down Expand Up @@ -351,14 +364,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
y *= recipW;
z *= recipW;

Vec4S32 screen[3];
Vec4S32 screen[2];
screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
Vec4F32 depth = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f);

screen[0].Store(tx + outCount);
screen[1].Store(ty + outCount);
screen[2].Store(tz + outCount);
depth.Store(tz + outCount);
outCount += 3;

if (!cullEnabled) {
Expand All @@ -371,25 +384,25 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran

screen[0].SwapLowerElements().Store(tx + outCount);
screen[1].SwapLowerElements().Store(ty + outCount);
screen[2].SwapLowerElements().Store(tz + outCount);
depth.SwapLowerElements().Store(tz + outCount);
outCount += 3;
}
}
return outCount;
}

// Splits already-transformed vertices into the separate X/Y/Z arrays used by the depth rasterizer.
//
// tx, ty, tz:   output arrays, each receiving `count` entries.
// transformed:  source vertices, 4 floats per vertex; only the first three (x, y, z) are read.
// indexBuffer:  for each output slot i, the index of the source vertex to copy.
// count:        number of indices to process.
//
// X and Y are truncated to whole integers; Z is stored as a float without conversion.
void DepthRasterConvertTransformed(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
	// TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways.
	for (int i = 0; i < count; i++) {
		const float *pos = transformed + indexBuffer[i] * 4;
		tx[i] = (int)pos[0];
		ty[i] = (int)pos[1];
		// Depth is passed through as-is; unlike the clip functions, no range clamp is applied here.
		tz[i] = pos[2]; // clamp?
	}
}

// Rasterizes screen-space vertices.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count) {
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz, int count) {
// Prim should now be either TRIANGLES or RECTs.
_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);

Expand Down Expand Up @@ -438,17 +451,26 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
switch (prim) {
case GE_PRIM_RECTANGLES:
for (int i = 0; i < count; i += 2) {
uint16_t z = tz[i + 1]; // depth from second vertex
uint16_t z = (uint16_t)tz[i + 1]; // depth from second vertex
// TODO: Should clip coordinates to the scissor rectangle.
// We remove the subpixel information here.
DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, comp);
}
gpuStats.numDepthRasterPrims += count / 2;
break;
case GE_PRIM_TRIANGLES:
{
int stats[4]{};
for (int i = 0; i < count; i += 3) {
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
TriangleResult result = DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
stats[(int)result]++;
}
gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
break;
}
default:
_dbg_assert_(false);
}
Expand Down
8 changes: 4 additions & 4 deletions GPU/Common/DepthRaster.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ struct DepthScreenVertex {
class VertexDecoder;
struct TransformedVertex;

// The clip functions write de-indexed screen-space X/Y as ints and depth as floats
// into tx/ty/tz, and return the number of vertices written.
int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count);
int DepthRasterClipIndexedRectangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count);
void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, VertexDecoder *dec, int count);
void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, VertexDecoder *dec, int count);
// Copies X/Y (truncated to int) and Z (kept as float) of each indexed vertex into tx/ty/tz.
void DepthRasterConvertTransformed(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, int count);

// void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);

// Rasterizes screen-space vertices (triangles or rectangles) into the 16-bit depth buffer.
// x1/y1/x2/y2 are the scissor rect; tz carries depth as floats.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz, int count);
17 changes: 13 additions & 4 deletions GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "Common/LogReporting.h"
#include "Common/Math/SIMDHeaders.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Common/TimeUtil.h"
#include "Core/System.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
Expand Down Expand Up @@ -914,6 +915,7 @@ inline void ComputeFinalProjMatrix(float *worldviewproj) {
}

void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {

switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
Expand All @@ -929,6 +931,8 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
return;
}

TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);

float worldviewproj[16];
ComputeFinalProjMatrix(worldviewproj);

Expand All @@ -953,7 +957,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder

int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);

// Clip and triangulate using the index buffer.
int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
Expand All @@ -962,7 +966,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
tx[i] = 0;
ty[i] = 0;
tz[i] = 0;
tz[i] = 0.0f;
}
}

Expand All @@ -972,6 +976,8 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
}

void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *inVerts, int numDecoded, VertexDecoder *dec, int vertexCount) {
TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);

switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
Expand All @@ -987,7 +993,7 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i

int *tx = depthScreenVerts_;
int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
float *tz = (float *)(depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2);

int outVertCount = 0;

Expand All @@ -996,6 +1002,9 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
DepthRasterConvertTransformed(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
outVertCount = vertexCount;
} else {
if (dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) {
return;
}
float worldviewproj[16];
ComputeFinalProjMatrix(worldviewproj);
TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, decoded_, dec, numDecoded);
Expand All @@ -1018,7 +1027,7 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
tx[i] = 0;
ty[i] = 0;
tz[i] = 0;
tz[i] = 0.0f;
}
}
DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
Expand Down
2 changes: 2 additions & 0 deletions GPU/Common/FramebufferManagerCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1866,6 +1866,8 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w,
char tag[128];
size_t len = FormatFramebufferName(vfb, tag, sizeof(tag));

gpuStats.numFBOsCreated++;

vfb->fbo = draw_->CreateFramebuffer({ vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), msaaLevel_, true, tag });
if (Memory::IsVRAMAddress(vfb->fb_address) && vfb->fb_stride != 0) {
NotifyMemInfo(MemBlockFlags::ALLOC, vfb->fb_address, vfb->BufferByteSize(RASTER_COLOR), tag, len);
Expand Down
Loading

0 comments on commit fed54f6

Please sign in to comment.