Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable depth raster for Armored Core by default, minor speedup #19761

Merged
merged 3 commits into from
Dec 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Common/Math/CrossSIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ struct Vec4S32 {
};
}

// Multiplies the low 16 bits (as signed values) of each 32-bit lane of this vector by the
// low 16 bits of the matching lane of `other`, producing a 32-bit result per lane.
// On SSE2, much faster than _mm_mullo_epi32_SSE2.
// Beware: on NEON, the full 32 bits of each lane are read instead.
// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
Vec4S32 MulAsS16(Vec4S32 other) const {
	// _mm_madd_epi16 multiplies both halfword pairs of every lane and adds the two products.
	// Zeroing the upper halfwords of only one input is enough: the upper product is then
	// zero no matter what the upper halfwords of the other register contain.
	const __m128i lowHalfwordMask = _mm_set1_epi32(0x0000FFFF);
	const __m128i otherLow = _mm_and_si128(other.v, lowHalfwordMask);
	return Vec4S32{ _mm_madd_epi16(v, otherLow) };
}

// Lane-wise 32-bit integer addition.
Vec4S32 operator +(Vec4S32 other) const {
	Vec4S32 sum{ _mm_add_epi32(v, other.v) };
	return sum;
}
// Lane-wise 32-bit integer subtraction (this minus other).
Vec4S32 operator -(Vec4S32 other) const {
	Vec4S32 difference{ _mm_sub_epi32(v, other.v) };
	return difference;
}
// Bitwise OR across the whole 128-bit register.
Vec4S32 operator |(Vec4S32 other) const {
	Vec4S32 combined{ _mm_or_si128(v, other.v) };
	return combined;
}
Expand Down Expand Up @@ -222,6 +233,9 @@ struct Vec4S32 {
return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
};

// Warning: unlike the x86 implementation, this multiplies the full 32 bits of each lane,
// so results only agree when both inputs fit in 16 bits.
Vec4S32 MulAsS16(Vec4S32 other) const {
	Vec4S32 product{ vmulq_s32(v, other.v) };
	return product;
}

// Lane-wise 32-bit integer addition.
Vec4S32 operator +(Vec4S32 other) const {
	Vec4S32 sum{ vaddq_s32(v, other.v) };
	return sum;
}
// Lane-wise 32-bit integer subtraction (this minus other).
Vec4S32 operator -(Vec4S32 other) const {
	Vec4S32 difference{ vsubq_s32(v, other.v) };
	return difference;
}
// Lane-wise full 32-bit integer multiplication.
Vec4S32 operator *(Vec4S32 other) const {
	Vec4S32 product{ vmulq_s32(v, other.v) };
	return product;
}
Expand Down Expand Up @@ -269,6 +283,7 @@ struct Vec4F32 {
// Scales every lane by the scalar f (f is broadcast to all four lanes first).
Vec4F32 operator *(float f) const {
	Vec4F32 scaled{ vmulq_f32(v, vdupq_n_f32(f)) };
	return scaled;
}

// Named equivalent of operator*(float): multiplies every lane by the scalar f.
Vec4F32 Mul(float f) const {
	Vec4F32 scaled{ vmulq_f32(v, vdupq_n_f32(f)) };
	return scaled;
}

Vec4F32 Recip() {
float32x4_t recip = vrecpeq_f32(v);
// Use a couple Newton-Raphson steps to refine the estimate.
Expand Down
33 changes: 27 additions & 6 deletions GPU/Common/DepthRaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,13 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
ptr += 8;
w -= 8;
}
// Non-simd trailer.
while (w > 0) {
*ptr++ = depthValue;
w--;
}
}
break;
// TODO: Trailer
default:
// TODO
break;
Expand Down Expand Up @@ -111,7 +115,8 @@ constexpr int MIN_TRI_AREA = 10;
// Adapted from Intel's depth rasterizer example.
// Started with the scalar version, will SIMD-ify later.
// x1/y1 etc are the scissor rect.
TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz, ZCompareMode compareMode) {
template<ZCompareMode compareMode>
TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const float *tz) {
int tileStartX = x1;
int tileEndX = x2;

Expand Down Expand Up @@ -190,7 +195,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y

Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);

// TODO: Lift this switch out of the inner loop, or even out of the function with templating.
// This switch is on a templated constant, so should collapse away.
switch (compareMode) {
case ZCompareMode::Greater:
// To implement the greater/greater-than comparison, we can combine mask and max.
Expand Down Expand Up @@ -479,9 +484,25 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
case GE_PRIM_TRIANGLES:
{
int stats[4]{};
for (int i = 0; i < count; i += 3) {
TriangleResult result = DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
stats[(int)result]++;
switch (comp) {
case ZCompareMode::Greater:
for (int i = 0; i < count; i += 3) {
TriangleResult result = DepthRasterTriangle<ZCompareMode::Greater>(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i]);
stats[(int)result]++;
}
break;
case ZCompareMode::Less:
for (int i = 0; i < count; i += 3) {
TriangleResult result = DepthRasterTriangle<ZCompareMode::Less>(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i]);
stats[(int)result]++;
}
break;
case ZCompareMode::Always:
for (int i = 0; i < count; i += 3) {
TriangleResult result = DepthRasterTriangle<ZCompareMode::Always>(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i]);
stats[(int)result]++;
}
break;
}
gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
Expand Down
15 changes: 11 additions & 4 deletions assets/compat.ini
Original file line number Diff line number Diff line change
Expand Up @@ -1636,21 +1636,21 @@ ULJM05216 = true
NPJH50181 = true

[SoftwareRasterDepth]
# Midnight Club: LA Remix
# Midnight Club: LA Remix (see #18625)
ULUS10383 = true
ULES01144 = true
ULJS00180 = true
ULJS00267 = true
ULJM05904 = true
NPJH50440 = true

# Syphon Filter - Dark Mirror
# Syphon Filter - Dark Mirror (light flares, see #10229)
UCES00310 = true
UCUS98641 = true
UCUS98656 = true
UCUS98656 = true # Demo

# Syphon Filter - Logan's Shadow
# Syphon Filter - Logan's Shadow (light flares, see #10229)
UCUS98606 = true
UCES00710 = true
NPUG80173 = true
Expand All @@ -1663,7 +1663,7 @@ SYPH04036 = true # Prototype?
NPUG80114 = true
NPEG00004 = true

# Wipeout Pure
# Wipeout Pure (see #13344, sun lens flare)
UCUS98612 = true
UCJS10007 = true
UCES00001 = true
Expand All @@ -1675,7 +1675,14 @@ UCES01184 = true
UCUS98668 = true
UCJP00174 = true

# Armored Core: Silent Line (see #17597)
ULJM05552 = true
UCAS40289 = true
NPUH10025 = true
NPEH00047 = true

[ReadbackDepth]
# Obsolete, SoftwareRasterDepth is a far better solution.

[BlockTransferDepth]
# Iron Man - see issue #16530
Expand Down
Loading