Skip to content

Commit

Permalink
Merge branch 'xenia-project:master' into stfs-writer
Browse files Browse the repository at this point in the history
  • Loading branch information
epozzobon authored Oct 21, 2022
2 parents 1b1e386 + 5fde7c6 commit f118eb5
Show file tree
Hide file tree
Showing 33 changed files with 6,328 additions and 1,538 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
url = https://github.com/microsoft/DirectXShaderCompiler.git
[submodule "third_party/premake-cmake"]
path = third_party/premake-cmake
url = https://github.com/Enhex/premake-cmake.git
url = https://github.com/JoelLinn/premake-cmake.git
[submodule "third_party/date"]
path = third_party/date
url = https://github.com/HowardHinnant/date.git
Expand Down
1 change: 1 addition & 0 deletions src/xenia/cpu/backend/x64/x64_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ DEFINE_int32(x64_extension_mask, -1,
" 512 = AVX512VL\n"
" 1024 = AVX512BW\n"
" 2048 = AVX512DQ\n"
" 4096 = AVX512VBMI\n"
" -1 = Detect and utilize all possible processor features\n",
"x64");

Expand Down
1 change: 1 addition & 0 deletions src/xenia/cpu/backend/x64/x64_emitter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512_VBMI);

#undef TEST_EMIT_FEATURE
}
Expand Down
2 changes: 2 additions & 0 deletions src/xenia/cpu/backend/x64/x64_emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ enum X64EmitterFeatureFlags {
kX64EmitAVX512BW = 1 << 10,
kX64EmitAVX512DQ = 1 << 11,

kX64EmitAVX512VBMI = 1 << 12,

kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ
};
Expand Down
147 changes: 146 additions & 1 deletion src/xenia/cpu/backend/x64/x64_seq_vector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I
I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
Opmask mask = e.k1;
// Mask positive values and unordered values
// _CMP_NLT_UQ
e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15);

// vcvttps2udq will saturate overflowing positive values and unordered
// values to UINT_MAX. Mask registers will write zero everywhere
// else (negative values)
e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1);
return;
}

// clamp to min 0
e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));

Expand Down Expand Up @@ -547,6 +560,15 @@ struct VECTOR_ADD
case INT32_TYPE:
if (saturate) {
if (is_unsigned) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vpaddd(dest, src1, src2);
Opmask saturate = e.k1;
// _mm_cmplt_epu32_mask
e.vpcmpud(saturate, dest, src1, 0x1);
e.vpternlogd(dest | saturate, dest, dest, 0xFF);
return;
}

// xmm0 is the only temp register that can be used by
// src1/src2.
e.vpaddd(e.xmm1, src1, src2);
Expand All @@ -562,6 +584,20 @@ struct VECTOR_ADD
} else {
e.vpaddd(e.xmm1, src1, src2);

if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
kX64EmitAVX512DQ)) {
e.vmovdqa32(e.xmm3, src1);
e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00100100);

const Opmask saturate = e.k1;
e.vpmovd2m(saturate, e.xmm3);

e.vpsrad(e.xmm2, e.xmm1, 31);
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
return;
}

// Overflow results if two inputs are the same sign and the
// result isn't the same sign. if ((s32b)(~(src1 ^ src2) &
// (src1 ^ res)) < 0) then overflowed
Expand Down Expand Up @@ -643,6 +679,19 @@ struct VECTOR_SUB
// src1/src2.
e.vpsubd(e.xmm1, src1, src2);

if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
// If the result is less or equal to the first operand then
// we did not underflow
Opmask not_underflow = e.k1;
// _mm_cmple_epu32_mask
e.vpcmpud(not_underflow, e.xmm1, src1, 0x2);

// Copy over values that did not underflow, write zero
// everywhere else
e.vmovdqa32(dest | not_underflow | e.T_z, e.xmm1);
return;
}

// If result is greater than either of the inputs, we've
// underflowed (only need to check one input)
// if (res > src1) then underflowed
Expand All @@ -654,6 +703,21 @@ struct VECTOR_SUB
} else {
e.vpsubd(e.xmm1, src1, src2);

if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
kX64EmitAVX512DQ)) {
e.vmovdqa32(e.xmm3, src1);
e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00011000);

const Opmask saturate = e.k1;
e.vpmovd2m(saturate, e.xmm3);

e.vpsrad(e.xmm2, e.xmm1, 31);
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));

e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
return;
}

// We can only overflow if the signs of the operands are
// opposite. If signs are opposite and result sign isn't the
// same as src1's sign, we've overflowed. if ((s32b)((src1 ^
Expand Down Expand Up @@ -1740,7 +1804,23 @@ struct PERMUTE_V128
} else {
e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
}

if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512VBMI)) {
Xmm table_lo = e.xmm1;
if (i.src2.is_constant) {
e.LoadConstantXmm(table_lo, i.src2.constant());
} else {
table_lo = i.src2;
}
Opmask zeroes = e.k1;
// _mm_cmple_epu8_mask
e.vpcmpub(zeroes, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15), 2);
e.vpermb(i.dest.reg() | zeroes | e.T_z, e.xmm0, table_lo);
return;
}

e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask));

if (i.src2.is_constant) {
e.LoadConstantXmm(i.dest, i.src2.constant());
e.vpshufb(i.dest, i.dest, e.xmm0);
Expand All @@ -1756,13 +1836,47 @@ struct PERMUTE_V128
// General permute.
// Control mask needs to be shuffled.
// TODO(benvanik): do constants here instead of in generated code.
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
kX64EmitAVX512VBMI)) {
Xmm table_idx = e.xmm0;
if (i.src1.is_constant) {
e.LoadConstantXmm(table_idx, i.src1.constant());
e.vxorps(table_idx, table_idx, e.GetXmmConstPtr(XMMSwapWordMask));
} else {
e.vxorps(table_idx, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
}

Xmm table_lo = e.xmm1;
if (i.src2.value->IsConstantZero()) {
e.vpxor(table_lo, table_lo);
} else if (i.src2.is_constant) {
e.LoadConstantXmm(table_lo, i.src2.constant());
} else {
table_lo = i.src2;
}

Xmm table_hi = e.xmm2;
if (i.src3.value->IsConstantZero()) {
e.vpxor(table_hi, table_hi);
} else if (i.src3.is_constant) {
e.LoadConstantXmm(table_hi, i.src3.constant());
} else {
table_hi = i.src3;
}

e.vpermi2b(table_idx, table_lo, table_hi);
e.vmovdqu8(i.dest, table_idx);
return;
}

if (i.src1.is_constant) {
e.LoadConstantXmm(e.xmm2, i.src1.constant());
e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask));
} else {
e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
}
e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));

Xmm src2_shuf = e.xmm0;
if (i.src2.value->IsConstantZero()) {
e.vpxor(src2_shuf, src2_shuf);
Expand All @@ -1789,8 +1903,39 @@ struct PERMUTE_V128

static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
// src1 is an array of indices corresponding to positions within src2 and
// src3.
// src3
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW)) {
e.LoadConstantXmm(e.xmm1, vec128s(0x1));

Xmm table_idx = e.xmm0;
if (i.src1.is_constant) {
e.LoadConstantXmm(table_idx, i.src1.constant());
e.vpxord(table_idx, table_idx, e.xmm1);
} else {
e.vpxord(table_idx, i.src1, e.xmm1);
}

Xmm table_lo = e.xmm1;
if (i.src2.is_constant) {
e.LoadConstantXmm(table_lo, i.src2.constant());
} else {
table_lo = i.src2;
}

Xmm table_hi = e.xmm2;
if (i.src3.is_constant) {
e.LoadConstantXmm(table_hi, i.src3.constant());
} else {
table_hi = i.src3;
}

e.vpermi2w(table_idx, table_lo, table_hi);
e.vmovdqu8(i.dest, table_idx);
return;
}

assert_true(i.src1.is_constant);

vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1);
vec128_t perm_ctrl = vec128b(0);
for (int i = 0; i < 8; i++) {
Expand Down
4 changes: 4 additions & 0 deletions src/xenia/cpu/backend/x64/x64_sequences.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2891,6 +2891,10 @@ struct NOT_I64 : Sequence<NOT_I64, I<OPCODE_NOT, I64Op, I64Op>> {
};
struct NOT_V128 : Sequence<NOT_V128, I<OPCODE_NOT, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
  // Emits dest = ~src1 for a 128-bit vector operand.
  if (!e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
    // Baseline path: flip every bit by XOR-ing with the all-ones constant,
    // dest = src ^ 0xFFFF...
    e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
    return;
  }
  // AVX-512 path: the ternary-logic truth table 0b01010101 has its bits set
  // only where the last source operand's bit is 0, so the result is the
  // bitwise NOT of src1 without needing a constant load from memory.
  e.vpternlogd(i.dest, i.src1, i.src1, 0b01010101);
}
Expand Down
4 changes: 2 additions & 2 deletions src/xenia/cpu/ppc/testing/ppc_testing_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -349,8 +349,8 @@ class TestRunner {
uint32_t expected = std::strtoul(ccs, nullptr, 16);
uint8_t actual = *p;

expecteds.AppendFormat(" %02X", expected);
actuals.AppendFormat(" %02X", actual);
expecteds.AppendFormat(" {:02X}", expected);
actuals.AppendFormat(" {:02X}", actual);

if (expected != actual) {
any_failed = true;
Expand Down
9 changes: 4 additions & 5 deletions src/xenia/gpu/d3d12/d3d12_command_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3189,15 +3189,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
// flow.
reg::RB_COLOR_INFO color_infos[4];
float rt_clamp[4][4];
// Two UINT32_MAX if no components actually existing in the RT are written.
uint32_t rt_keep_masks[4][2];
for (uint32_t i = 0; i < 4; ++i) {
auto color_info = regs.Get<reg::RB_COLOR_INFO>(
reg::RB_COLOR_INFO::rt_register_indices[i]);
color_infos[i] = color_info;
if (edram_rov_used) {
// Get the mask for keeping previous color's components unmodified,
// or two UINT32_MAX if no colors actually existing in the RT are written.
DxbcShaderTranslator::ROV_GetColorFormatSystemConstants(
RenderTargetCache::GetPSIColorFormatInfo(
color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111,
rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3],
rt_keep_masks[i][0], rt_keep_masks[i][1]);
Expand Down Expand Up @@ -3506,8 +3505,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
rt_base_dwords_scaled;
system_constants_.edram_rt_base_dwords_scaled[i] =
rt_base_dwords_scaled;
uint32_t format_flags = DxbcShaderTranslator::ROV_AddColorFormatFlags(
color_info.color_format);
uint32_t format_flags =
RenderTargetCache::AddPSIColorFormatFlags(color_info.color_format);
dirty |= system_constants_.edram_rt_format_flags[i] != format_flags;
system_constants_.edram_rt_format_flags[i] = format_flags;
// Can't do float comparisons here because NaNs would result in always
Expand Down
17 changes: 8 additions & 9 deletions src/xenia/gpu/dxbc_shader_translator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -691,15 +691,14 @@ void DxbcShaderTranslator::StartPixelShader() {
if (i == param_gen_interpolator) {
continue;
}
a_.OpMov(
uses_register_dynamic_addressing ? dxbc::Dest::X(0, i)
: dxbc::Dest::R(i),
(i < xenos::kMaxInterpolators &&
(interpolator_mask & (UINT32_C(1) << i)))
? dxbc::Src::V1D(
in_reg_ps_interpolators_ +
xe::bit_count((interpolator_mask & (UINT32_C(1) << i)) - 1))
: dxbc::Src::LF(0.0f));
a_.OpMov(uses_register_dynamic_addressing ? dxbc::Dest::X(0, i)
: dxbc::Dest::R(i),
(i < xenos::kMaxInterpolators &&
(interpolator_mask & (UINT32_C(1) << i)))
? dxbc::Src::V1D(in_reg_ps_interpolators_ +
xe::bit_count(interpolator_mask &
((UINT32_C(1) << i) - 1)))
: dxbc::Src::LF(0.0f));
}

// Write the pixel parameters to the specified interpolator register
Expand Down
51 changes: 3 additions & 48 deletions src/xenia/gpu/dxbc_shader_translator.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,19 +267,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
};
static_assert(kSysFlag_Count <= 32, "Too many flags in the system constants");

// Appended to the format in the format constant.
// The *_Shift enumerators are bit positions; the enumerators without the
// suffix are the corresponding single-bit masks (1u << shift) that
// ROV_AddColorFormatFlags ORs into the numeric format value.
enum : uint32_t {
// Starting from bit 4 because the format itself needs 4 bits.
kRTFormatFlag_64bpp_Shift = 4,
// Requires clamping of blending sources and factors.
// No explicit initializers below: the two shifts take the next consecutive
// values after kRTFormatFlag_64bpp_Shift (5 and 6).
kRTFormatFlag_FixedPointColor_Shift,
kRTFormatFlag_FixedPointAlpha_Shift,

// Single-bit masks derived from the shifts above.
kRTFormatFlag_64bpp = 1u << kRTFormatFlag_64bpp_Shift,
kRTFormatFlag_FixedPointColor = 1u << kRTFormatFlag_FixedPointColor_Shift,
kRTFormatFlag_FixedPointAlpha = 1u << kRTFormatFlag_FixedPointAlpha_Shift,
};

// IF SYSTEM CONSTANTS ARE CHANGED OR ADDED, THE FOLLOWING MUST BE UPDATED:
// - SystemConstants::Index enum.
// - system_constant_rdef_.
Expand Down Expand Up @@ -383,7 +370,8 @@ class DxbcShaderTranslator : public ShaderTranslator {

uint32_t edram_rt_base_dwords_scaled[4];

// RT format combined with kRTFormatFlags.
// RT format combined with RenderTargetCache::kPSIColorFormatFlag values
// (pass via RenderTargetCache::AddPSIColorFormatFlags).
uint32_t edram_rt_format_flags[4];

// Format info - values to clamp the color to before blending or storing.
Expand Down Expand Up @@ -524,40 +512,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
kEdram,
};

// Packs a color render target format together with the internal
// kRTFormatFlag_* bits, producing the value that is passed through the
// edram_rt_format_flags system constant.
//
// The numeric format occupies the low bits of the result; on top of it:
// - kRTFormatFlag_64bpp is set for k_16_16_16_16, k_16_16_16_16_FLOAT and
//   k_32_32_FLOAT;
// - both fixed-point flags are set for the 8_8_8_8 (plain and gamma),
//   2_10_10_10 (plain and AS_10_10_10_10), 16_16 and 16_16_16_16 formats;
// - only kRTFormatFlag_FixedPointAlpha is set for the 2_10_10_10_FLOAT
//   formats (plain and AS_16_16_16_16).
static constexpr uint32_t ROV_AddColorFormatFlags(
    xenos::ColorRenderTargetFormat format) {
  using RTFormat = xenos::ColorRenderTargetFormat;

  const bool is_64bpp = format == RTFormat::k_16_16_16_16 ||
                        format == RTFormat::k_16_16_16_16_FLOAT ||
                        format == RTFormat::k_32_32_FLOAT;
  const bool is_fully_fixed_point =
      format == RTFormat::k_8_8_8_8 ||
      format == RTFormat::k_8_8_8_8_GAMMA ||
      format == RTFormat::k_2_10_10_10 ||
      format == RTFormat::k_16_16 ||
      format == RTFormat::k_16_16_16_16 ||
      format == RTFormat::k_2_10_10_10_AS_10_10_10_10;
  const bool is_fixed_point_alpha_only =
      format == RTFormat::k_2_10_10_10_FLOAT ||
      format == RTFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16;

  uint32_t flags = uint32_t(format);
  if (is_64bpp) {
    flags |= kRTFormatFlag_64bpp;
  }
  if (is_fully_fixed_point) {
    flags |= kRTFormatFlag_FixedPointColor | kRTFormatFlag_FixedPointAlpha;
  } else if (is_fixed_point_alpha_only) {
    flags |= kRTFormatFlag_FixedPointAlpha;
  }
  return flags;
}
// Returns the bits that need to be added to the RT flags constant - needs to
// be done externally, not in SetColorFormatConstants, because the flags
// contain other state.
// NOTE(review): the sentence above does not match this signature (no flags
// are returned, and no SetColorFormatConstants is visible here) - it looks
// stale; confirm and reword.
//
// All results are written through the reference parameters:
// - format: color render target format to query.
// - write_mask: the visible caller passes a 4-bit per-render-target slice of
//   the normalized color write mask.
// - clamp_rgb_low / clamp_alpha_low / clamp_rgb_high / clamp_alpha_high:
//   presumably the range the color is clamped to before blending or storing -
//   TODO confirm against the definition.
// - keep_mask_low / keep_mask_high: per the caller's comment, the mask for
//   keeping the previous color's components unmodified, or two UINT32_MAX if
//   no components actually existing in the RT are written.
static void ROV_GetColorFormatSystemConstants(
xenos::ColorRenderTargetFormat format, uint32_t write_mask,
float& clamp_rgb_low, float& clamp_alpha_low, float& clamp_rgb_high,
float& clamp_alpha_high, uint32_t& keep_mask_low,
uint32_t& keep_mask_high);

uint64_t GetDefaultVertexShaderModification(
uint32_t dynamic_addressable_register_count,
Shader::HostVertexShaderType host_vertex_shader_type =
Expand Down Expand Up @@ -772,6 +726,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Reports whether it is both possible and worthwhile to skip running the
// translated shader for 2x2 quads. Asserts that edram_rov_used_ is set before
// evaluating the conditions.
bool ROV_IsDepthStencilEarly() const {
  assert_true(edram_rov_used_);
  // Any one of the following rules the early path out; they are checked in
  // order and evaluation stops at the first one that applies.
  if (is_depth_only_pixel_shader_) {
    return false;
  }
  if (current_shader().writes_depth()) {
    return false;
  }
  return !current_shader().is_valid_memexport_used();
}
Expand Down
Loading

0 comments on commit f118eb5

Please sign in to comment.