mirror of https://git.proxmox.com/git/llvm-toolchain (synced 2025-06-14 05:46:27 +00:00)

* debian/patches/r600-snapshot.diff: Update to mesa-9.1 git tag. (Closes: #703671, #697356)
24963 lines
847 KiB
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 7715531..1330c36 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -751,6 +751,11 @@ AC_ARG_ENABLE([experimental-targets],AS_HELP_STRING([--enable-experimental-targe

if test ${enableval} != "disable"
then
+ if test ${enableval} = "AMDGPU"
+ then
+ AC_MSG_ERROR([The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600])
+ enableval="R600"
+ fi
TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
fi

diff --git a/configure b/configure
index 4fa0705..02012b9 100755
--- a/configure
+++ b/configure
@@ -5473,6 +5473,13 @@ fi

if test ${enableval} != "disable"
then
+ if test ${enableval} = "AMDGPU"
+ then
+ { { echo "$as_me:$LINENO: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&5
+echo "$as_me: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&2;}
+ { (exit 1); exit 1; }; }
+ enableval="R600"
+ fi
TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
fi

@@ -10316,7 +10323,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<EOF
-#line 10317 "configure"
+#line 10326 "configure"
#include "confdefs.h"

#if HAVE_DLFCN_H
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 8995080..84c4111 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;

[common]
-subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore
+subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore

; This is a special group whose required libraries are extended (by llvm-build)
; with the best execution engine (the native JIT, if available, or the
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
|
|
new file mode 100644
|
|
index 0000000..ba87918
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPU.h
|
|
@@ -0,0 +1,51 @@
|
|
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPU_H
|
|
+#define AMDGPU_H
|
|
+
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+#include "llvm/Support/TargetRegistry.h"
|
|
+#include "llvm/Target/TargetMachine.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class FunctionPass;
|
|
+class AMDGPUTargetMachine;
|
|
+
|
|
+// R600 Passes
|
|
+FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
|
|
+FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
|
|
+FunctionPass *createR600LowerConstCopy(TargetMachine &tm);
|
|
+
|
|
+// SI Passes
|
|
+FunctionPass *createSIAnnotateControlFlowPass();
|
|
+FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
|
|
+FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
|
|
+FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
|
|
+FunctionPass *createSIInsertWaits(TargetMachine &tm);
|
|
+
|
|
+// Passes common to R600 and SI
|
|
+Pass *createAMDGPUStructurizeCFGPass();
|
|
+FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
|
|
+FunctionPass* createAMDGPUIndirectAddressingPass(TargetMachine &tm);
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+namespace ShaderType {
|
|
+ enum Type {
|
|
+ PIXEL = 0,
|
|
+ VERTEX = 1,
|
|
+ GEOMETRY = 2,
|
|
+ COMPUTE = 3
|
|
+ };
|
|
+}
|
|
+
|
|
+#endif // AMDGPU_H
|
|
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
|
|
new file mode 100644
|
|
index 0000000..40f4741
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPU.td
|
|
@@ -0,0 +1,40 @@
|
|
+//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+
|
|
+// Include AMDIL TD files
|
|
+include "AMDILBase.td"
|
|
+
|
|
+
|
|
+def AMDGPUInstrInfo : InstrInfo {
|
|
+ let guessInstructionProperties = 1;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Declare the target which we are implementing
|
|
+//===----------------------------------------------------------------------===//
|
|
+def AMDGPUAsmWriter : AsmWriter {
|
|
+ string AsmWriterClassName = "InstPrinter";
|
|
+ int Variant = 0;
|
|
+ bit isMCAsmWriter = 1;
|
|
+}
|
|
+
|
|
+def AMDGPU : Target {
|
|
+ // Pull in Instruction Info:
|
|
+ let InstructionSet = AMDGPUInstrInfo;
|
|
+ let AssemblyWriters = [AMDGPUAsmWriter];
|
|
+}
|
|
+
|
|
+// Include AMDGPU TD files
|
|
+include "R600Schedule.td"
|
|
+include "SISchedule.td"
|
|
+include "Processors.td"
|
|
+include "AMDGPUInstrInfo.td"
|
|
+include "AMDGPUIntrinsics.td"
|
|
+include "AMDGPURegisterInfo.td"
|
|
+include "AMDGPUInstructions.td"
|
|
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
|
|
new file mode 100644
|
|
index 0000000..254e62e
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
|
|
@@ -0,0 +1,145 @@
|
|
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+///
|
|
+/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
|
|
+/// code. When passed an MCAsmStreamer it prints assembly and when passed
|
|
+/// an MCObjectStreamer it outputs binary code.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+
|
|
+
|
|
+#include "AMDGPUAsmPrinter.h"
|
|
+#include "AMDGPU.h"
|
|
+#include "SIMachineFunctionInfo.h"
|
|
+#include "SIRegisterInfo.h"
|
|
+#include "llvm/MC/MCStreamer.h"
|
|
+#include "llvm/Target/TargetLoweringObjectFile.h"
|
|
+#include "llvm/Support/TargetRegistry.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+
|
|
+static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
|
|
+ MCStreamer &Streamer) {
|
|
+ return new AMDGPUAsmPrinter(tm, Streamer);
|
|
+}
|
|
+
|
|
+extern "C" void LLVMInitializeR600AsmPrinter() {
|
|
+ TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
|
|
+}
|
|
+
|
|
+/// We need to override this function so we can avoid
|
|
+/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle.
|
|
+bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
|
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
|
|
+ if (STM.dumpCode()) {
|
|
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
+ MF.dump();
|
|
+#endif
|
|
+ }
|
|
+ SetupMachineFunction(MF);
|
|
+ if (OutStreamer.hasRawTextSupport()) {
|
|
+ OutStreamer.EmitRawText("@" + MF.getName() + ":");
|
|
+ }
|
|
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
|
|
+ if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
|
|
+ EmitProgramInfo(MF);
|
|
+ }
|
|
+ EmitFunctionBody();
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
|
|
+ unsigned MaxSGPR = 0;
|
|
+ unsigned MaxVGPR = 0;
|
|
+ bool VCCUsed = false;
|
|
+ const SIRegisterInfo * RI =
|
|
+ static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
|
|
+
|
|
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
|
|
+ BB != BB_E; ++BB) {
|
|
+ MachineBasicBlock &MBB = *BB;
|
|
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
|
|
+ I != E; ++I) {
|
|
+ MachineInstr &MI = *I;
|
|
+
|
|
+ unsigned numOperands = MI.getNumOperands();
|
|
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
|
|
+ MachineOperand & MO = MI.getOperand(op_idx);
|
|
+ unsigned maxUsed;
|
|
+ unsigned width = 0;
|
|
+ bool isSGPR = false;
|
|
+ unsigned reg;
|
|
+ unsigned hwReg;
|
|
+ if (!MO.isReg()) {
|
|
+ continue;
|
|
+ }
|
|
+ reg = MO.getReg();
|
|
+ if (reg == AMDGPU::VCC) {
|
|
+ VCCUsed = true;
|
|
+ continue;
|
|
+ }
|
|
+ switch (reg) {
|
|
+ default: break;
|
|
+ case AMDGPU::EXEC:
|
|
+ case AMDGPU::M0:
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (AMDGPU::SReg_32RegClass.contains(reg)) {
|
|
+ isSGPR = true;
|
|
+ width = 1;
|
|
+ } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
|
|
+ isSGPR = false;
|
|
+ width = 1;
|
|
+ } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
|
|
+ isSGPR = true;
|
|
+ width = 2;
|
|
+ } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
|
|
+ isSGPR = false;
|
|
+ width = 2;
|
|
+ } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
|
|
+ isSGPR = true;
|
|
+ width = 4;
|
|
+ } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
|
|
+ isSGPR = false;
|
|
+ width = 4;
|
|
+ } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
|
|
+ isSGPR = true;
|
|
+ width = 8;
|
|
+ } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
|
|
+ isSGPR = false;
|
|
+ width = 8;
|
|
+ } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
|
|
+ isSGPR = false;
|
|
+ width = 16;
|
|
+ } else {
|
|
+ assert(!"Unknown register class");
|
|
+ }
|
|
+ hwReg = RI->getEncodingValue(reg) & 0xff;
|
|
+ maxUsed = hwReg + width - 1;
|
|
+ if (isSGPR) {
|
|
+ MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
|
|
+ } else {
|
|
+ MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ if (VCCUsed) {
|
|
+ MaxSGPR += 2;
|
|
+ }
|
|
+ SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
+ OutStreamer.EmitIntValue(MaxSGPR + 1, 4);
|
|
+ OutStreamer.EmitIntValue(MaxVGPR + 1, 4);
|
|
+ OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4);
|
|
+}
|
|
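EmitProgramInfo above derives the register counts the GPU driver needs: every register operand is mapped to a hardware encoding index plus a width in 32-bit sub-registers, the per-kind maxima are tracked, VCC adds two extra scalar registers, and the streamer emits the counts (maximum index plus one) followed by SPIPSInputAddr. A minimal standalone sketch of just that bookkeeping follows; the operand list, encodings and widths are invented for illustration, since in the real pass they come from the MachineFunction and SIRegisterInfo.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for one register operand after encoding: 'hwReg' is
    // the low 8 bits of the encoding, 'width' the number of 32-bit
    // sub-registers, 'isSGPR' the scalar/vector split.
    struct RegUse { unsigned hwReg; unsigned width; bool isSGPR; };

    int main() {
      // Invented operand list; the backend gathers this by walking every
      // MachineOperand of every MachineInstr in the function.
      std::vector<RegUse> Uses = {
        {0, 2, true},   // e.g. an SReg_64 starting at the first scalar register
        {4, 1, true},   // an SReg_32
        {0, 4, false},  // a VReg_128
        {7, 1, false},  // a VReg_32
      };
      bool VCCUsed = true;  // assume the condition-code register pair was seen

      unsigned MaxSGPR = 0, MaxVGPR = 0;
      for (const RegUse &U : Uses) {
        unsigned MaxUsed = U.hwReg + U.width - 1;  // last sub-register touched
        if (U.isSGPR)
          MaxSGPR = std::max(MaxUsed, MaxSGPR);
        else
          MaxVGPR = std::max(MaxUsed, MaxVGPR);
      }
      if (VCCUsed)
        MaxSGPR += 2;  // VCC occupies two additional scalar registers

      // The pass emits counts, not indices, hence the +1.
      std::printf("NumSGPRs=%u NumVGPRs=%u\n", MaxSGPR + 1, MaxVGPR + 1);
      return 0;
    }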
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
|
|
new file mode 100644
|
|
index 0000000..3812282
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
|
|
@@ -0,0 +1,44 @@
|
|
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief AMDGPU Assembly printer class.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPU_ASMPRINTER_H
|
|
+#define AMDGPU_ASMPRINTER_H
|
|
+
|
|
+#include "llvm/CodeGen/AsmPrinter.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class AMDGPUAsmPrinter : public AsmPrinter {
|
|
+
|
|
+public:
|
|
+ explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
|
|
+ : AsmPrinter(TM, Streamer) { }
|
|
+
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ virtual const char *getPassName() const {
|
|
+ return "AMDGPU Assembly Printer";
|
|
+ }
|
|
+
|
|
+ /// \brief Emit register usage information so that the GPU driver
|
|
+ /// can correctly setup the GPU state.
|
|
+ void EmitProgramInfo(MachineFunction &MF);
|
|
+
|
|
+ /// Implemented in AMDGPUMCInstLower.cpp
|
|
+ virtual void EmitInstruction(const MachineInstr *MI);
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif //AMDGPU_ASMPRINTER_H
|
|
diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp
|
|
new file mode 100644
|
|
index 0000000..50297d1
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUConvertToISA.cpp
|
|
@@ -0,0 +1,62 @@
|
|
+//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief This pass lowers AMDIL machine instructions to the appropriate
|
|
+/// hardware instructions.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+class AMDGPUConvertToISAPass : public MachineFunctionPass {
|
|
+
|
|
+private:
|
|
+ static char ID;
|
|
+ TargetMachine &TM;
|
|
+
|
|
+public:
|
|
+ AMDGPUConvertToISAPass(TargetMachine &tm) :
|
|
+ MachineFunctionPass(ID), TM(tm) { }
|
|
+
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ virtual const char *getPassName() const {return "AMDGPU Convert to ISA";}
|
|
+
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+char AMDGPUConvertToISAPass::ID = 0;
|
|
+
|
|
+FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
|
|
+ return new AMDGPUConvertToISAPass(tm);
|
|
+}
|
|
+
|
|
+bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) {
|
|
+ const AMDGPUInstrInfo * TII =
|
|
+ static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
|
|
+
|
|
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
|
|
+ BB != BB_E; ++BB) {
|
|
+ MachineBasicBlock &MBB = *BB;
|
|
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
|
|
+ I != E; ++I) {
|
|
+ MachineInstr &MI = *I;
|
|
+ TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
|
|
new file mode 100644
|
|
index 0000000..a3b6936
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
|
|
@@ -0,0 +1,122 @@
|
|
+//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+// Interface to describe a layout of a stack frame on a AMDIL target machine
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+#include "AMDGPUFrameLowering.h"
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+#include "R600MachineFunctionInfo.h"
|
|
+#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/Instructions.h"
|
|
+
|
|
+using namespace llvm;
|
|
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
|
|
+ int LAO, unsigned TransAl)
|
|
+ : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
|
|
+
|
|
+AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
|
|
+
|
|
+unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
|
|
+
|
|
+ // XXX: Hardcoding to 1 for now.
|
|
+ //
|
|
+ // I think the StackWidth should stored as metadata associated with the
|
|
+ // MachineFunction. This metadata can either be added by a frontend, or
|
|
+ // calculated by a R600 specific LLVM IR pass.
|
|
+ //
|
|
+ // The StackWidth determines how stack objects are laid out in memory.
|
|
+ // For a vector stack variable, like: int4 stack[2], the data will be stored
|
|
+ // in the following ways depending on the StackWidth.
|
|
+ //
|
|
+ // StackWidth = 1:
|
|
+ //
|
|
+ // T0.X = stack[0].x
|
|
+ // T1.X = stack[0].y
|
|
+ // T2.X = stack[0].z
|
|
+ // T3.X = stack[0].w
|
|
+ // T4.X = stack[1].x
|
|
+ // T5.X = stack[1].y
|
|
+ // T6.X = stack[1].z
|
|
+ // T7.X = stack[1].w
|
|
+ //
|
|
+ // StackWidth = 2:
|
|
+ //
|
|
+ // T0.X = stack[0].x
|
|
+ // T0.Y = stack[0].y
|
|
+ // T1.X = stack[0].z
|
|
+ // T1.Y = stack[0].w
|
|
+ // T2.X = stack[1].x
|
|
+ // T2.Y = stack[1].y
|
|
+ // T3.X = stack[1].z
|
|
+ // T3.Y = stack[1].w
|
|
+ //
|
|
+ // StackWidth = 4:
|
|
+ // T0.X = stack[0].x
|
|
+ // T0.Y = stack[0].y
|
|
+ // T0.Z = stack[0].z
|
|
+ // T0.W = stack[0].w
|
|
+ // T1.X = stack[1].x
|
|
+ // T1.Y = stack[1].y
|
|
+ // T1.Z = stack[1].z
|
|
+ // T1.W = stack[1].w
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+/// \returns The offset, in registers, of the stack object \p FI.
|
|
+int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
|
|
+ int FI) const {
|
|
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
|
|
+ unsigned Offset = 0;
|
|
+ int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
|
|
+
|
|
+ for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
|
|
+ const AllocaInst *Alloca = MFI->getObjectAllocation(i);
|
|
+ unsigned ArrayElements;
|
|
+ const Type *AllocaType = Alloca->getAllocatedType();
|
|
+ const Type *ElementType;
|
|
+
|
|
+ if (AllocaType->isArrayTy()) {
|
|
+ ArrayElements = AllocaType->getArrayNumElements();
|
|
+ ElementType = AllocaType->getArrayElementType();
|
|
+ } else {
|
|
+ ArrayElements = 1;
|
|
+ ElementType = AllocaType;
|
|
+ }
|
|
+
|
|
+ unsigned VectorElements;
|
|
+ if (ElementType->isVectorTy()) {
|
|
+ VectorElements = ElementType->getVectorNumElements();
|
|
+ } else {
|
|
+ VectorElements = 1;
|
|
+ }
|
|
+
|
|
+ Offset += (VectorElements / getStackWidth(MF)) * ArrayElements;
|
|
+ }
|
|
+ return Offset;
|
|
+}
|
|
+
|
|
+const TargetFrameLowering::SpillSlot *
|
|
+AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
|
|
+ NumEntries = 0;
|
|
+ return 0;
|
|
+}
|
|
+void
|
|
+AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
|
|
+}
|
|
+void
|
|
+AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
|
|
+ MachineBasicBlock &MBB) const {
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
|
|
+ return false;
|
|
+}
|
|
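getFrameIndexOffset above returns, for a given frame index, the number of registers consumed by all stack objects laid out before it: each alloca contributes (vector elements / stack width) * array elements, matching the StackWidth layouts described in getStackWidth. A small standalone sketch of the same arithmetic; the object descriptions are made up for illustration (the real pass reads them from MachineFrameInfo and the alloca types), and negative fixed frame indices are ignored here.

    #include <cstdio>
    #include <vector>

    // Hypothetical description of one stack object: how many vector lanes its
    // element type has and how many array elements it holds.
    struct StackObject { unsigned VectorElements; unsigned ArrayElements; };

    // Mirrors the loop in AMDGPUFrameLowering::getFrameIndexOffset: sum the
    // register footprint of every object that precedes frame index FI.
    unsigned frameIndexOffset(const std::vector<StackObject> &Objects,
                              unsigned FI, unsigned StackWidth) {
      unsigned Offset = 0;
      for (unsigned i = 0; i < FI && i < Objects.size(); ++i)
        Offset += (Objects[i].VectorElements / StackWidth) *
                  Objects[i].ArrayElements;
      return Offset;
    }

    int main() {
      // int4 stack[2] followed by a scalar int: {4 lanes, 2 elements}, {1, 1}.
      std::vector<StackObject> Objects = {{4, 2}, {1, 1}};
      // With StackWidth = 1 the int4 array occupies eight registers
      // (T0.X .. T7.X in the comment above), so the scalar after it is at 8.
      std::printf("offset of object 1 = %u\n", frameIndexOffset(Objects, 1, 1));
      return 0;
    }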
diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h
|
|
new file mode 100644
|
|
index 0000000..cf5742e
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUFrameLowering.h
|
|
@@ -0,0 +1,44 @@
|
|
+//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface to describe a layout of a stack frame on a AMDIL target
|
|
+/// machine.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+#ifndef AMDILFRAME_LOWERING_H
|
|
+#define AMDILFRAME_LOWERING_H
|
|
+
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/Target/TargetFrameLowering.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+/// \brief Information about the stack frame layout on the AMDGPU targets.
|
|
+///
|
|
+/// It holds the direction of the stack growth, the known stack alignment on
|
|
+/// entry to each function, and the offset to the locals area.
|
|
+/// See TargetFrameInfo for more comments.
|
|
+class AMDGPUFrameLowering : public TargetFrameLowering {
|
|
+public:
|
|
+ AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
|
|
+ unsigned TransAl = 1);
|
|
+ virtual ~AMDGPUFrameLowering();
|
|
+
|
|
+ /// \returns The number of 32-bit sub-registers that are used when storing
|
|
+ /// values to the stack.
|
|
+ virtual unsigned getStackWidth(const MachineFunction &MF) const;
|
|
+ virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
|
|
+ virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const;
|
|
+ virtual void emitPrologue(MachineFunction &MF) const;
|
|
+ virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
|
|
+ virtual bool hasFP(const MachineFunction &MF) const;
|
|
+};
|
|
+} // namespace llvm
|
|
+#endif // AMDILFRAME_LOWERING_H
|
|
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
|
|
new file mode 100644
|
|
index 0000000..d0d23d6
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
|
|
@@ -0,0 +1,418 @@
|
|
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief This is the parent TargetLowering class for hardware code gen
|
|
+/// targets.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUISelLowering.h"
|
|
+#include "AMDILIntrinsicInfo.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/CodeGen/SelectionDAG.h"
|
|
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
|
|
+ TargetLowering(TM, new TargetLoweringObjectFileELF()) {
|
|
+
|
|
+ // Initialize target lowering borrowed from AMDIL
|
|
+ InitAMDILLowering();
|
|
+
|
|
+ // We need to custom lower some of the intrinsics
|
|
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
|
|
+
|
|
+ // Library functions. These default to Expand, but we have instructions
|
|
+ // for them.
|
|
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
|
|
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
|
|
+ setOperationAction(ISD::FPOW, MVT::f32, Legal);
|
|
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
|
|
+ setOperationAction(ISD::FABS, MVT::f32, Legal);
|
|
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
|
|
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
|
|
+
|
|
+ // Lower floating point store/load to integer store/load to reduce the number
|
|
+ // of patterns in tablegen.
|
|
+ setOperationAction(ISD::STORE, MVT::f32, Promote);
|
|
+ AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
|
|
+
|
|
+ setOperationAction(ISD::STORE, MVT::v4f32, Promote);
|
|
+ AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
|
|
+
|
|
+ setOperationAction(ISD::LOAD, MVT::f32, Promote);
|
|
+ AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
|
|
+
|
|
+ setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
|
|
+ AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
|
|
+
|
|
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
|
|
+ setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
|
|
+}
|
|
+
|
|
+//===---------------------------------------------------------------------===//
|
|
+// TargetLowering Callbacks
|
|
+//===---------------------------------------------------------------------===//
|
|
+
|
|
+SDValue AMDGPUTargetLowering::LowerFormalArguments(
|
|
+ SDValue Chain,
|
|
+ CallingConv::ID CallConv,
|
|
+ bool isVarArg,
|
|
+ const SmallVectorImpl<ISD::InputArg> &Ins,
|
|
+ DebugLoc DL, SelectionDAG &DAG,
|
|
+ SmallVectorImpl<SDValue> &InVals) const {
|
|
+ for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
|
|
+ InVals.push_back(SDValue());
|
|
+ }
|
|
+ return Chain;
|
|
+}
|
|
+
|
|
+SDValue AMDGPUTargetLowering::LowerReturn(
|
|
+ SDValue Chain,
|
|
+ CallingConv::ID CallConv,
|
|
+ bool isVarArg,
|
|
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
|
|
+ const SmallVectorImpl<SDValue> &OutVals,
|
|
+ DebugLoc DL, SelectionDAG &DAG) const {
|
|
+ return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
|
|
+}
|
|
+
|
|
+//===---------------------------------------------------------------------===//
|
|
+// Target specific lowering
|
|
+//===---------------------------------------------------------------------===//
|
|
+
|
|
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
|
|
+ const {
|
|
+ switch (Op.getOpcode()) {
|
|
+ default:
|
|
+ Op.getNode()->dump();
|
|
+ assert(0 && "Custom lowering code for this"
|
|
+ "instruction is not implemented yet!");
|
|
+ break;
|
|
+ // AMDIL DAG lowering
|
|
+ case ISD::SDIV: return LowerSDIV(Op, DAG);
|
|
+ case ISD::SREM: return LowerSREM(Op, DAG);
|
|
+ case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
|
|
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
|
|
+ // AMDGPU DAG lowering
|
|
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
|
|
+ case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
|
|
+ }
|
|
+ return Op;
|
|
+}
|
|
+
|
|
+SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|
+ SelectionDAG &DAG) const {
|
|
+ unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
+
|
|
+ switch (IntrinsicID) {
|
|
+ default: return Op;
|
|
+ case AMDGPUIntrinsic::AMDIL_abs:
|
|
+ return LowerIntrinsicIABS(Op, DAG);
|
|
+ case AMDGPUIntrinsic::AMDIL_exp:
|
|
+ return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
|
|
+ case AMDGPUIntrinsic::AMDGPU_lrp:
|
|
+ return LowerIntrinsicLRP(Op, DAG);
|
|
+ case AMDGPUIntrinsic::AMDIL_fraction:
|
|
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
|
|
+ case AMDGPUIntrinsic::AMDIL_mad:
|
|
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2), Op.getOperand(3));
|
|
+ case AMDGPUIntrinsic::AMDIL_max:
|
|
+ return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2));
|
|
+ case AMDGPUIntrinsic::AMDGPU_imax:
|
|
+ return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2));
|
|
+ case AMDGPUIntrinsic::AMDGPU_umax:
|
|
+ return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2));
|
|
+ case AMDGPUIntrinsic::AMDIL_min:
|
|
+ return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2));
|
|
+ case AMDGPUIntrinsic::AMDGPU_imin:
|
|
+ return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2));
|
|
+ case AMDGPUIntrinsic::AMDGPU_umin:
|
|
+ return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2));
|
|
+ case AMDGPUIntrinsic::AMDIL_round_nearest:
|
|
+ return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
|
|
+ }
|
|
+}
|
|
+
|
|
+///IABS(a) = SMAX(sub(0, a), a)
|
|
+SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
|
|
+ SelectionDAG &DAG) const {
|
|
+
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
+ SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
|
|
+ Op.getOperand(1));
|
|
+
|
|
+ return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
|
|
+}
|
|
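The identity used by LowerIntrinsicIABS, IABS(a) = SMAX(0 - a, a), is easy to sanity-check with ordinary integers. Nothing below is LLVM API, just the arithmetic; as with any two's-complement absolute value, INT_MIN is left out of the check.

    #include <algorithm>
    #include <cassert>

    // IABS(a) = SMAX(sub(0, a), a), mirroring LowerIntrinsicIABS above.
    int iabs(int a) { return std::max(0 - a, a); }

    int main() {
      assert(iabs(-7) == 7 && iabs(7) == 7 && iabs(0) == 0);
      return 0;
    }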
+
|
|
+/// Linear Interpolation
|
|
+/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
|
|
+SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
|
|
+ SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
+ SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
|
|
+ DAG.getConstantFP(1.0f, MVT::f32),
|
|
+ Op.getOperand(1));
|
|
+ SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
|
|
+ Op.getOperand(3));
|
|
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1),
|
|
+ Op.getOperand(2),
|
|
+ OneSubAC);
|
|
+}
|
|
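The linear-interpolation lowering above rewrites lrp(a, b, c) as mad(a, b, (1 - a) * c). A standalone check of that algebra with plain floats, no DAG involved; the tolerance and sample values are arbitrary.

    #include <cassert>
    #include <cmath>
    #include <initializer_list>

    // lrp(a, b, c) blends b and c by a: a*b + (1 - a)*c.
    float lrp_reference(float a, float b, float c) {
      return a * b + (1.0f - a) * c;
    }

    // The form LowerIntrinsicLRP emits: mad(a, b, (1 - a)*c).
    float lrp_as_mad(float a, float b, float c) {
      float oneSubAC = (1.0f - a) * c;   // OneSubAC in the code above
      return a * b + oneSubAC;           // stands in for the MAD node
    }

    int main() {
      for (float a : {0.0f, 0.25f, 0.5f, 1.0f})
        assert(std::fabs(lrp_reference(a, 2.0f, -3.0f) -
                         lrp_as_mad(a, 2.0f, -3.0f)) < 1e-6f);
      return 0;
    }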
+
|
|
+/// \brief Generate Min/Max node
|
|
+SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
|
|
+ SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
+
|
|
+ SDValue LHS = Op.getOperand(0);
|
|
+ SDValue RHS = Op.getOperand(1);
|
|
+ SDValue True = Op.getOperand(2);
|
|
+ SDValue False = Op.getOperand(3);
|
|
+ SDValue CC = Op.getOperand(4);
|
|
+
|
|
+ if (VT != MVT::f32 ||
|
|
+ !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
|
|
+ return SDValue();
|
|
+ }
|
|
+
|
|
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
|
|
+ switch (CCOpcode) {
|
|
+ case ISD::SETOEQ:
|
|
+ case ISD::SETONE:
|
|
+ case ISD::SETUNE:
|
|
+ case ISD::SETNE:
|
|
+ case ISD::SETUEQ:
|
|
+ case ISD::SETEQ:
|
|
+ case ISD::SETFALSE:
|
|
+ case ISD::SETFALSE2:
|
|
+ case ISD::SETTRUE:
|
|
+ case ISD::SETTRUE2:
|
|
+ case ISD::SETUO:
|
|
+ case ISD::SETO:
|
|
+ assert(0 && "Operation should already be optimised !");
|
|
+ case ISD::SETULE:
|
|
+ case ISD::SETULT:
|
|
+ case ISD::SETOLE:
|
|
+ case ISD::SETOLT:
|
|
+ case ISD::SETLE:
|
|
+ case ISD::SETLT: {
|
|
+ if (LHS == True)
|
|
+ return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
|
|
+ else
|
|
+ return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
|
|
+ }
|
|
+ case ISD::SETGT:
|
|
+ case ISD::SETGE:
|
|
+ case ISD::SETUGE:
|
|
+ case ISD::SETOGE:
|
|
+ case ISD::SETUGT:
|
|
+ case ISD::SETOGT: {
|
|
+ if (LHS == True)
|
|
+ return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
|
|
+ else
|
|
+ return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
|
|
+ }
|
|
+ case ISD::SETCC_INVALID:
|
|
+ assert(0 && "Invalid setcc condcode !");
|
|
+ }
|
|
+ return Op;
|
|
+}
|
|
+
|
|
+
|
|
+
|
|
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
|
|
+ SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
+
|
|
+ SDValue Num = Op.getOperand(0);
|
|
+ SDValue Den = Op.getOperand(1);
|
|
+
|
|
+ SmallVector<SDValue, 8> Results;
|
|
+
|
|
+ // RCP = URECIP(Den) = 2^32 / Den + e
|
|
+ // e is rounding error.
|
|
+ SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
|
|
+
|
|
+ // RCP_LO = umulo(RCP, Den)
|
|
+ SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
|
|
+
|
|
+ // RCP_HI = mulhu(RCP, Den)
|
|
+ SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
|
|
+
|
|
+ // NEG_RCP_LO = -RCP_LO
|
|
+ SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
|
|
+ RCP_LO);
|
|
+
|
|
+ // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
|
|
+ SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
|
|
+ NEG_RCP_LO, RCP_LO,
|
|
+ ISD::SETEQ);
|
|
+ // Calculate the rounding error from the URECIP instruction
|
|
+ // E = mulhu(ABS_RCP_LO, RCP)
|
|
+ SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
|
|
+
|
|
+ // RCP_A_E = RCP + E
|
|
+ SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
|
|
+
|
|
+ // RCP_S_E = RCP - E
|
|
+ SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
|
|
+
|
|
+ // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
|
|
+ SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
|
|
+ RCP_A_E, RCP_S_E,
|
|
+ ISD::SETEQ);
|
|
+ // Quotient = mulhu(Tmp0, Num)
|
|
+ SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
|
|
+
|
|
+ // Num_S_Remainder = Quotient * Den
|
|
+ SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
|
|
+
|
|
+ // Remainder = Num - Num_S_Remainder
|
|
+ SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
|
|
+
|
|
+ // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
|
|
+ SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
|
|
+ DAG.getConstant(-1, VT),
|
|
+ DAG.getConstant(0, VT),
|
|
+ ISD::SETGE);
|
|
+ // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
|
|
+ SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
|
|
+ DAG.getConstant(0, VT),
|
|
+ DAG.getConstant(-1, VT),
|
|
+ DAG.getConstant(0, VT),
|
|
+ ISD::SETGE);
|
|
+ // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
|
|
+ SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
|
|
+ Remainder_GE_Zero);
|
|
+
|
|
+ // Calculate Division result:
|
|
+
|
|
+ // Quotient_A_One = Quotient + 1
|
|
+ SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
|
|
+ DAG.getConstant(1, VT));
|
|
+
|
|
+ // Quotient_S_One = Quotient - 1
|
|
+ SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
|
|
+ DAG.getConstant(1, VT));
|
|
+
|
|
+ // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
|
|
+ SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
|
|
+ Quotient, Quotient_A_One, ISD::SETEQ);
|
|
+
|
|
+ // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
|
|
+ Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
|
|
+ Quotient_S_One, Div, ISD::SETEQ);
|
|
+
|
|
+ // Calculate Rem result:
|
|
+
|
|
+ // Remainder_S_Den = Remainder - Den
|
|
+ SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
|
|
+
|
|
+ // Remainder_A_Den = Remainder + Den
|
|
+ SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
|
|
+
|
|
+ // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
|
|
+ SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
|
|
+ Remainder, Remainder_S_Den, ISD::SETEQ);
|
|
+
|
|
+ // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
|
|
+ Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
|
|
+ Remainder_A_Den, Rem, ISD::SETEQ);
|
|
+ SDValue Ops[2];
|
|
+ Ops[0] = Div;
|
|
+ Ops[1] = Rem;
|
|
+ return DAG.getMergeValues(Ops, 2, DL);
|
|
+}
|
|
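LowerUDIVREM above builds 32-bit unsigned division out of the hardware reciprocal: URECIP gives roughly 2^32/Den, MULHU of that reciprocal with the numerator gives a quotient estimate, and the remaining nodes correct the quotient and remainder by at most one step in either direction. The sketch below models only that core idea in plain C++ (reciprocal estimate, high multiply, small fix-up); it is not a line-by-line re-implementation of the DAG, and the exact rounding of the real URECIP instruction is assumed rather than known here.

    #include <cassert>
    #include <cstdint>

    // Divide Num by Den (Den != 0) in the spirit of the lowering above:
    // estimate the quotient with a 2^32-scaled reciprocal and MULHU, then
    // nudge it up until exact.  floor(2^32 / Den) stands in for URECIP.
    static void udivrem32(uint32_t Num, uint32_t Den,
                          uint32_t &Quot, uint32_t &Rem) {
      uint64_t RCP = (uint64_t(1) << 32) / Den;     // reciprocal estimate
      uint32_t Q = uint32_t((RCP * Num) >> 32);     // MULHU(RCP, Num)
      while ((uint64_t(Q) + 1) * Den <= Num)        // bounded correction, like
        ++Q;                                        // the Quotient+1 selection
      Quot = Q;
      Rem = Num - Q * Den;                          // Remainder = Num - Q*Den
    }

    int main() {
      const uint32_t Samples[][2] = {
        {100, 7}, {0xFFFFFFFFu, 3}, {1, 1}, {0, 5}, {0xDEADBEEFu, 0x1234u}};
      for (const auto &S : Samples) {
        uint32_t Q, R;
        udivrem32(S[0], S[1], Q, R);
        assert(Q == S[0] / S[1] && R == S[0] % S[1]);
      }
      return 0;
    }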
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Helper functions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
|
|
+ if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
|
|
+ return CFP->isExactlyValue(1.0);
|
|
+ }
|
|
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
|
|
+ return C->isAllOnesValue();
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
|
|
+ if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
|
|
+ return CFP->getValueAPF().isZero();
|
|
+ }
|
|
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
|
|
+ return C->isNullValue();
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
|
|
+ const TargetRegisterClass *RC,
|
|
+ unsigned Reg, EVT VT) const {
|
|
+ MachineFunction &MF = DAG.getMachineFunction();
|
|
+ MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
+ unsigned VirtualRegister;
|
|
+ if (!MRI.isLiveIn(Reg)) {
|
|
+ VirtualRegister = MRI.createVirtualRegister(RC);
|
|
+ MRI.addLiveIn(Reg, VirtualRegister);
|
|
+ } else {
|
|
+ VirtualRegister = MRI.getLiveInVirtReg(Reg);
|
|
+ }
|
|
+ return DAG.getRegister(VirtualRegister, VT);
|
|
+}
|
|
+
|
|
+#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
|
|
+
|
|
+const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|
+ switch (Opcode) {
|
|
+ default: return 0;
|
|
+ // AMDIL DAG nodes
|
|
+ NODE_NAME_CASE(MAD);
|
|
+ NODE_NAME_CASE(CALL);
|
|
+ NODE_NAME_CASE(UMUL);
|
|
+ NODE_NAME_CASE(DIV_INF);
|
|
+ NODE_NAME_CASE(RET_FLAG);
|
|
+ NODE_NAME_CASE(BRANCH_COND);
|
|
+
|
|
+ // AMDGPU DAG nodes
|
|
+ NODE_NAME_CASE(DWORDADDR)
|
|
+ NODE_NAME_CASE(FRACT)
|
|
+ NODE_NAME_CASE(FMAX)
|
|
+ NODE_NAME_CASE(SMAX)
|
|
+ NODE_NAME_CASE(UMAX)
|
|
+ NODE_NAME_CASE(FMIN)
|
|
+ NODE_NAME_CASE(SMIN)
|
|
+ NODE_NAME_CASE(UMIN)
|
|
+ NODE_NAME_CASE(URECIP)
|
|
+ NODE_NAME_CASE(EXPORT)
|
|
+ NODE_NAME_CASE(CONST_ADDRESS)
|
|
+ NODE_NAME_CASE(REGISTER_LOAD)
|
|
+ NODE_NAME_CASE(REGISTER_STORE)
|
|
+ }
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
|
|
new file mode 100644
|
|
index 0000000..99a11ff
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUISelLowering.h
|
|
@@ -0,0 +1,140 @@
|
|
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface definition of the TargetLowering class that is common
|
|
+/// to all AMD GPUs.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPUISELLOWERING_H
|
|
+#define AMDGPUISELLOWERING_H
|
|
+
|
|
+#include "llvm/Target/TargetLowering.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class MachineRegisterInfo;
|
|
+
|
|
+class AMDGPUTargetLowering : public TargetLowering {
|
|
+private:
|
|
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
|
|
+
|
|
+protected:
|
|
+
|
|
+ /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
|
|
+ /// MachineFunction.
|
|
+ ///
|
|
+ /// \returns a RegisterSDNode representing Reg.
|
|
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
|
|
+ unsigned Reg, EVT VT) const;
|
|
+
|
|
+ bool isHWTrueValue(SDValue Op) const;
|
|
+ bool isHWFalseValue(SDValue Op) const;
|
|
+
|
|
+public:
|
|
+ AMDGPUTargetLowering(TargetMachine &TM);
|
|
+
|
|
+ virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
|
|
+ bool isVarArg,
|
|
+ const SmallVectorImpl<ISD::InputArg> &Ins,
|
|
+ DebugLoc DL, SelectionDAG &DAG,
|
|
+ SmallVectorImpl<SDValue> &InVals) const;
|
|
+
|
|
+ virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
|
+ bool isVarArg,
|
|
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
|
|
+ const SmallVectorImpl<SDValue> &OutVals,
|
|
+ DebugLoc DL, SelectionDAG &DAG) const;
|
|
+ virtual SDValue LowerCall(CallLoweringInfo &CLI,
|
|
+ SmallVectorImpl<SDValue> &InVals) const {
|
|
+ CLI.Callee.dump();
|
|
+ llvm_unreachable("Undefined function");
|
|
+ }
|
|
+
|
|
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
|
|
+ virtual const char* getTargetNodeName(unsigned Opcode) const;
|
|
+
|
|
+// Functions defined in AMDILISelLowering.cpp
|
|
+public:
|
|
+
|
|
+ /// \brief Determine which of the bits specified in \p Mask are known to be
|
|
+ /// either zero or one and return them in the \p KnownZero and \p KnownOne
|
|
+ /// bitsets.
|
|
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
|
|
+ APInt &KnownZero,
|
|
+ APInt &KnownOne,
|
|
+ const SelectionDAG &DAG,
|
|
+ unsigned Depth = 0) const;
|
|
+
|
|
+ virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
|
|
+ const CallInst &I, unsigned Intrinsic) const;
|
|
+
|
|
+ /// We want to mark f32/f64 floating point values as legal.
|
|
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
|
|
+
|
|
+ /// We don't want to shrink f64/f32 constants.
|
|
+ bool ShouldShrinkFPConstant(EVT VT) const;
|
|
+
|
|
+private:
|
|
+ void InitAMDILLowering();
|
|
+ SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
|
|
+ EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
|
|
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
|
|
+};
|
|
+
|
|
+namespace AMDGPUISD {
|
|
+
|
|
+enum {
|
|
+ // AMDIL ISD Opcodes
|
|
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
|
|
+ MAD, // 32bit Fused Multiply Add instruction
|
|
+ CALL, // Function call based on a single integer
|
|
+ UMUL, // 32bit unsigned multiplication
|
|
+ DIV_INF, // Divide with infinity returned on zero divisor
|
|
+ RET_FLAG,
|
|
+ BRANCH_COND,
|
|
+ // End AMDIL ISD Opcodes
|
|
+ BITALIGN,
|
|
+ DWORDADDR,
|
|
+ FRACT,
|
|
+ FMAX,
|
|
+ SMAX,
|
|
+ UMAX,
|
|
+ FMIN,
|
|
+ SMIN,
|
|
+ UMIN,
|
|
+ URECIP,
|
|
+ EXPORT,
|
|
+ CONST_ADDRESS,
|
|
+ REGISTER_LOAD,
|
|
+ REGISTER_STORE,
|
|
+ LAST_AMDGPU_ISD_NUMBER
|
|
+};
|
|
+
|
|
+
|
|
+} // End namespace AMDGPUISD
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // AMDGPUISELLOWERING_H
|
|
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
|
|
new file mode 100644
|
|
index 0000000..15840b3
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
|
|
@@ -0,0 +1,344 @@
|
|
+//===-- AMDGPUIndirectAddressing.cpp - Indirect Addressing Support --------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+///
|
|
+/// Instructions can use indirect addressing to index the register file as if it
|
|
+/// were memory. This pass lowers RegisterLoad and RegisterStore instructions
|
|
+/// to either a COPY or a MOV that uses indirect addressing.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "R600MachineFunctionInfo.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/Support/Debug.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
|
|
+
|
|
+private:
|
|
+ static char ID;
|
|
+ const AMDGPUInstrInfo *TII;
|
|
+
|
|
+ bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
|
|
+
|
|
+public:
|
|
+ AMDGPUIndirectAddressingPass(TargetMachine &tm) :
|
|
+ MachineFunctionPass(ID),
|
|
+ TII(static_cast<const AMDGPUInstrInfo*>(tm.getInstrInfo()))
|
|
+ { }
|
|
+
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ const char *getPassName() const { return "R600 Handle indirect addressing"; }
|
|
+
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+char AMDGPUIndirectAddressingPass::ID = 0;
|
|
+
|
|
+FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
|
|
+ return new AMDGPUIndirectAddressingPass(tm);
|
|
+}
|
|
+
|
|
+bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
|
|
+ MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
+
|
|
+ int IndirectBegin = TII->getIndirectIndexBegin(MF);
|
|
+ int IndirectEnd = TII->getIndirectIndexEnd(MF);
|
|
+
|
|
+ if (IndirectBegin == -1) {
|
|
+ // No indirect addressing, we can skip this pass
|
|
+ assert(IndirectEnd == -1);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ // The map keeps track of the indirect address that is represented by
|
|
+ // each virtual register. The key is the register and the value is the
|
|
+ // indirect address it uses.
|
|
+ std::map<unsigned, unsigned> RegisterAddressMap;
|
|
+
|
|
+ // First pass - Lower all of the RegisterStore instructions and track which
|
|
+ // registers are live.
|
|
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
|
|
+ BB != BB_E; ++BB) {
|
|
+ // This map keeps track of the current live indirect registers.
|
|
+ // The key is the address and the value is the register
|
|
+ std::map<unsigned, unsigned> LiveAddressRegisterMap;
|
|
+ MachineBasicBlock &MBB = *BB;
|
|
+
|
|
+ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
|
|
+ I != MBB.end(); I = Next) {
|
|
+ Next = llvm::next(I);
|
|
+ MachineInstr &MI = *I;
|
|
+
|
|
+ if (!TII->isRegisterStore(MI)) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ // Lower RegisterStore
|
|
+
|
|
+ unsigned RegIndex = MI.getOperand(2).getImm();
|
|
+ unsigned Channel = MI.getOperand(3).getImm();
|
|
+ unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
|
|
+ const TargetRegisterClass *IndirectStoreRegClass =
|
|
+ TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg());
|
|
+
|
|
+ if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
|
|
+ // Direct register access.
|
|
+ unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
|
|
+
|
|
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
|
|
+ .addOperand(MI.getOperand(0));
|
|
+
|
|
+ RegisterAddressMap[DstReg] = Address;
|
|
+ LiveAddressRegisterMap[Address] = DstReg;
|
|
+ } else {
|
|
+ // Indirect register access.
|
|
+ MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I,
|
|
+ MI.getOperand(0).getReg(), // Value
|
|
+ Address,
|
|
+ MI.getOperand(1).getReg()); // Offset
|
|
+ for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
|
|
+ unsigned Addr = TII->calculateIndirectAddress(i, Channel);
|
|
+ unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
|
|
+ MOV.addReg(DstReg, RegState::Define | RegState::Implicit);
|
|
+ RegisterAddressMap[DstReg] = Addr;
|
|
+ LiveAddressRegisterMap[Addr] = DstReg;
|
|
+ }
|
|
+ }
|
|
+ MI.eraseFromParent();
|
|
+ }
|
|
+
|
|
+ // Update the live-ins of the successor blocks
|
|
+ for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
|
|
+ SuccEnd = MBB.succ_end();
|
|
+ SuccEnd != Succ; ++Succ) {
|
|
+ std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
|
|
+ for (Key = LiveAddressRegisterMap.begin(),
|
|
+ KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
|
|
+ (*Succ)->addLiveIn(Key->second);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // Second pass - Lower the RegisterLoad instructions
|
|
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
|
|
+ BB != BB_E; ++BB) {
|
|
+ // Key is the address and the value is the register
|
|
+ std::map<unsigned, unsigned> LiveAddressRegisterMap;
|
|
+ MachineBasicBlock &MBB = *BB;
|
|
+
|
|
+ MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
|
|
+ while (LI != MBB.livein_end()) {
|
|
+ std::vector<unsigned> PhiRegisters;
|
|
+
|
|
+ // Make sure this live in is used for indirect addressing
|
|
+ if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
|
|
+ ++LI;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ unsigned Address = RegisterAddressMap[*LI];
|
|
+ LiveAddressRegisterMap[Address] = *LI;
|
|
+ PhiRegisters.push_back(*LI);
|
|
+
|
|
+ // Check if there are other live in registers which map to the same
|
|
+ // indirect address.
|
|
+ for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
|
|
+ LE = MBB.livein_end();
|
|
+ LJ != LE; ++LJ) {
|
|
+ unsigned Reg = *LJ;
|
|
+ if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (RegisterAddressMap[Reg] == Address) {
|
|
+ PhiRegisters.push_back(Reg);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (PhiRegisters.size() == 1) {
|
|
+ // We don't need to insert a Phi instruction, so we can just add the
|
|
+ // registers to the live list for the block.
|
|
+ LiveAddressRegisterMap[Address] = *LI;
|
|
+ MBB.removeLiveIn(*LI);
|
|
+ } else {
|
|
+ // We need to insert a PHI, because we have the same address being
|
|
+ // written in multiple predecessor blocks.
|
|
+ const TargetRegisterClass *PhiDstClass =
|
|
+ TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
|
|
+ unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
|
|
+ MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
|
|
+ MBB.findDebugLoc(MBB.begin()),
|
|
+ TII->get(AMDGPU::PHI), PhiDstReg);
|
|
+
|
|
+ for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
|
|
+ RE = PhiRegisters.end();
|
|
+ RI != RE; ++RI) {
|
|
+ unsigned Reg = *RI;
|
|
+ MachineInstr *DefInst = MRI.getVRegDef(Reg);
|
|
+ assert(DefInst);
|
|
+ MachineBasicBlock *RegBlock = DefInst->getParent();
|
|
+ Phi.addReg(Reg);
|
|
+ Phi.addMBB(RegBlock);
|
|
+ MBB.removeLiveIn(Reg);
|
|
+ }
|
|
+ RegisterAddressMap[PhiDstReg] = Address;
|
|
+ LiveAddressRegisterMap[Address] = PhiDstReg;
|
|
+ }
|
|
+ LI = MBB.livein_begin();
|
|
+ }
|
|
+
|
|
+ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
|
|
+ I != MBB.end(); I = Next) {
|
|
+ Next = llvm::next(I);
|
|
+ MachineInstr &MI = *I;
|
|
+
|
|
+ if (!TII->isRegisterLoad(MI)) {
|
|
+ if (MI.getOpcode() == AMDGPU::PHI) {
|
|
+ continue;
|
|
+ }
|
|
+ // Check for indirect register defs
|
|
+ for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands();
|
|
+ OpIdx < NumOperands; ++OpIdx) {
|
|
+ MachineOperand &MO = MI.getOperand(OpIdx);
|
|
+ if (MO.isReg() && MO.isDef() &&
|
|
+ RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) {
|
|
+ unsigned Reg = MO.getReg();
|
|
+ unsigned LiveAddress = RegisterAddressMap[Reg];
|
|
+ // Chain the live-ins
|
|
+ if (LiveAddressRegisterMap.find(LiveAddress) !=
|
|
+ RegisterAddressMap.end()) {
|
|
+ MI.addOperand(MachineOperand::CreateReg(
|
|
+ LiveAddressRegisterMap[LiveAddress],
|
|
+ false, // isDef
|
|
+ true, // isImp
|
|
+ true)); // isKill
|
|
+ }
|
|
+ LiveAddressRegisterMap[LiveAddress] = Reg;
|
|
+ }
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ const TargetRegisterClass *SuperIndirectRegClass =
|
|
+ TII->getSuperIndirectRegClass();
|
|
+ const TargetRegisterClass *IndirectLoadRegClass =
|
|
+ TII->getIndirectAddrLoadRegClass();
|
|
+ unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass);
|
|
+
|
|
+ unsigned RegIndex = MI.getOperand(2).getImm();
|
|
+ unsigned Channel = MI.getOperand(3).getImm();
|
|
+ unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
|
|
+
|
|
+ if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
|
|
+ // Direct register access
|
|
+ unsigned Reg = LiveAddressRegisterMap[Address];
|
|
+ unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
|
|
+
|
|
+ if (regHasExplicitDef(MRI, Reg)) {
|
|
+ // If the register we are reading from has an explicit def, then that
|
|
+ // means it was written via a direct register access (i.e. COPY
|
|
+ // or other instruction that doesn't use indirect addressing). In
|
|
+ // this case we know where the value has been stored, so we can just
|
|
+ // issue a copy.
|
|
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
|
|
+ MI.getOperand(0).getReg())
|
|
+ .addReg(Reg);
|
|
+ } else {
|
|
+ // If the register we are reading has an implicit def, then that
|
|
+ // means it was written by an indirect register access (i.e. An
|
|
+ // instruction that uses indirect addressing).
|
|
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
|
|
+ MI.getOperand(0).getReg())
|
|
+ .addReg(AddrReg)
|
|
+ .addReg(Reg, RegState::Implicit);
|
|
+ }
|
|
+ } else {
|
|
+ // Indirect register access
|
|
+
|
|
+ // Note on REG_SEQUENCE instructions: You can't actually use the register
|
|
+ // it defines unless you have an instruction that takes the defined
|
|
+ // register class as an operand.
|
|
+
|
|
+ MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I),
|
|
+ TII->get(AMDGPU::REG_SEQUENCE),
|
|
+ IndirectReg);
|
|
+ for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
|
|
+ unsigned Addr = TII->calculateIndirectAddress(i, Channel);
|
|
+ if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) {
|
|
+ continue;
|
|
+ }
|
|
+ unsigned Reg = LiveAddressRegisterMap[Addr];
|
|
+
|
|
+ // We only need to use REG_SEQUENCE for explicit defs, since the
|
|
+ // register coalescer won't do anything with the implicit defs.
|
|
+ MachineInstr *DefInstr = MRI.getVRegDef(Reg);
|
|
+ if (!regHasExplicitDef(MRI, Reg)) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ // Insert a REG_SEQUENCE instruction to force the register allocator
|
|
+ // to allocate the virtual register to the correct physical register.
|
|
+ Sequence.addReg(LiveAddressRegisterMap[Addr]);
|
|
+ Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr));
|
|
+ }
|
|
+ MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I,
|
|
+ MI.getOperand(0).getReg(), // Value
|
|
+ Address,
|
|
+ MI.getOperand(1).getReg()); // Offset
|
|
+
|
|
+
|
|
+
|
|
+ Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
|
|
+ Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit);
|
|
+
|
|
+ }
|
|
+ MI.eraseFromParent();
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI,
|
|
+ unsigned Reg) const {
|
|
+ MachineInstr *DefInstr = MRI.getVRegDef(Reg);
|
|
+
|
|
+ if (!DefInstr) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (DefInstr->getOpcode() == AMDGPU::PHI) {
|
|
+ bool Explicit = false;
|
|
+ for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(),
|
|
+ E = DefInstr->operands_end();
|
|
+ I != E; ++I) {
|
|
+ const MachineOperand &MO = *I;
|
|
+ if (!MO.isReg() || MO.isDef()) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg());
|
|
+ }
|
|
+ return Explicit;
|
|
+ }
|
|
+
|
|
+ return DefInstr->getOperand(0).isReg() &&
|
|
+ DefInstr->getOperand(0).getReg() == Reg;
|
|
+}
|
|
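The pass above leans on two maps: RegisterAddressMap records which indirect address each virtual register carries, and LiveAddressRegisterMap records which register is currently live for each address, so a later RegisterLoad with a constant offset can be turned into a plain COPY from that register (or from a freshly inserted PHI when several predecessors supply different registers for the same address). A toy illustration of that bookkeeping with ordinary maps; the register numbers and the index-to-address value are invented, since calculateIndirectAddress lives in the target's InstrInfo and is not part of this file.

    #include <cstdio>
    #include <map>

    int main() {
      std::map<unsigned, unsigned> RegisterAddressMap;     // vreg -> address
      std::map<unsigned, unsigned> LiveAddressRegisterMap; // address -> live vreg

      // A RegisterStore of %vreg10 to some (index, channel) pair; assume the
      // pair maps to indirect address 3 (the real mapping is target-defined).
      unsigned Address = 3, StoredReg = 10;
      RegisterAddressMap[StoredReg] = Address;
      LiveAddressRegisterMap[Address] = StoredReg;

      // A later RegisterLoad from the same address with a constant offset just
      // becomes a COPY from whichever register last wrote that address.
      std::map<unsigned, unsigned>::iterator It = LiveAddressRegisterMap.find(3);
      if (It != LiveAddressRegisterMap.end())
        std::printf("lower the load as COPY from %%vreg%u\n", It->second);
      // With several predecessors writing address 3 through different
      // registers, the real pass inserts a PHI and records its result instead.
      return 0;
    }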
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..640707d
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
|
|
@@ -0,0 +1,266 @@
|
|
+//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Implementation of the TargetInstrInfo class that is common to all
|
|
+/// AMD GPUs.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+#include "AMDIL.h"
|
|
+#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+
|
|
+#define GET_INSTRINFO_CTOR
|
|
+#include "AMDGPUGenInstrInfo.inc"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
|
|
+ : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { }
|
|
+
|
|
+const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
|
|
+ return RI;
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
|
|
+ unsigned &SrcReg, unsigned &DstReg,
|
|
+ unsigned &SubIdx) const {
|
|
+// TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+
|
|
+unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
|
|
+ int &FrameIndex) const {
|
|
+// TODO: Implement this function
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
|
|
+ int &FrameIndex) const {
|
|
+// TODO: Implement this function
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
|
|
+ const MachineMemOperand *&MMO,
|
|
+ int &FrameIndex) const {
|
|
+// TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
|
|
+ int &FrameIndex) const {
|
|
+// TODO: Implement this function
|
|
+ return 0;
|
|
+}
|
|
+unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
|
|
+ int &FrameIndex) const {
|
|
+// TODO: Implement this function
|
|
+ return 0;
|
|
+}
|
|
+bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
|
|
+ const MachineMemOperand *&MMO,
|
|
+ int &FrameIndex) const {
|
|
+// TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+
|
|
+MachineInstr *
|
|
+AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
|
|
+ MachineBasicBlock::iterator &MBBI,
|
|
+ LiveVariables *LV) const {
|
|
+// TODO: Implement this function
|
|
+ return NULL;
|
|
+}
|
|
+bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
|
|
+ MachineBasicBlock &MBB) const {
|
|
+ while (iter != MBB.end()) {
|
|
+ switch (iter->getOpcode()) {
|
|
+ default:
|
|
+ break;
|
|
+ case AMDGPU::BRANCH_COND_i32:
|
|
+ case AMDGPU::BRANCH_COND_f32:
|
|
+ case AMDGPU::BRANCH:
|
|
+ return true;
|
|
+ };
|
|
+ ++iter;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
|
|
+ MachineBasicBlock::iterator tmp = MBB->end();
|
|
+ if (!MBB->size()) {
|
|
+ return MBB->end();
|
|
+ }
|
|
+ while (--tmp) {
|
|
+ if (tmp->getOpcode() == AMDGPU::ENDLOOP
|
|
+ || tmp->getOpcode() == AMDGPU::ENDIF
|
|
+ || tmp->getOpcode() == AMDGPU::ELSE) {
|
|
+ if (tmp == MBB->begin()) {
|
|
+ return tmp;
|
|
+ } else {
|
|
+ continue;
|
|
+ }
|
|
+ } else {
|
|
+ return ++tmp;
|
|
+ }
|
|
+ }
|
|
+ return MBB->end();
|
|
+}
|
|
+
|
|
+void
|
|
+AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI,
|
|
+ unsigned SrcReg, bool isKill,
|
|
+ int FrameIndex,
|
|
+ const TargetRegisterClass *RC,
|
|
+ const TargetRegisterInfo *TRI) const {
|
|
+ assert(!"Not Implemented");
|
|
+}
|
|
+
|
|
+void
|
|
+AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI,
|
|
+ unsigned DestReg, int FrameIndex,
|
|
+ const TargetRegisterClass *RC,
|
|
+ const TargetRegisterInfo *TRI) const {
|
|
+ assert(!"Not Implemented");
|
|
+}
|
|
+
|
|
+MachineInstr *
|
|
+AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
|
|
+ MachineInstr *MI,
|
|
+ const SmallVectorImpl<unsigned> &Ops,
|
|
+ int FrameIndex) const {
|
|
+// TODO: Implement this function
|
|
+ return 0;
|
|
+}
|
|
+MachineInstr*
|
|
+AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
|
|
+ MachineInstr *MI,
|
|
+ const SmallVectorImpl<unsigned> &Ops,
|
|
+ MachineInstr *LoadMI) const {
|
|
+ // TODO: Implement this function
|
|
+ return 0;
|
|
+}
|
|
+bool
|
|
+AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
|
|
+ const SmallVectorImpl<unsigned> &Ops) const {
|
|
+ // TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+bool
|
|
+AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
|
|
+ unsigned Reg, bool UnfoldLoad,
|
|
+ bool UnfoldStore,
|
|
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
|
|
+ // TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
|
|
+ SmallVectorImpl<SDNode*> &NewNodes) const {
|
|
+ // TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+
|
|
+unsigned
|
|
+AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
|
|
+ bool UnfoldLoad, bool UnfoldStore,
|
|
+ unsigned *LoadRegIndex) const {
|
|
+ // TODO: Implement this function
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
|
|
+ int64_t Offset1, int64_t Offset2,
|
|
+ unsigned NumLoads) const {
|
|
+ assert(Offset2 > Offset1
|
|
+ && "Second offset should be larger than first offset!");
|
|
+ // If we have fewer than 16 loads in a row and the offsets are within 16,
|
|
+ // then schedule together.
|
|
+ // TODO: Make the loads schedule near if it fits in a cacheline
|
|
+ return (NumLoads < 16 && (Offset2 - Offset1) < 16);
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
|
|
+ const {
|
|
+ // TODO: Implement this function
|
|
+ return true;
|
|
+}
|
|
+void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI) const {
|
|
+ // TODO: Implement this function
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
|
|
+ // TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+bool
|
|
+AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
|
|
+ const SmallVectorImpl<MachineOperand> &Pred2)
|
|
+ const {
|
|
+ // TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
|
|
+ std::vector<MachineOperand> &Pred) const {
|
|
+ // TODO: Implement this function
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
|
|
+ // TODO: Implement this function
|
|
+ return MI->getDesc().isPredicable();
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
|
|
+ // TODO: Implement this function
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const {
|
|
+ return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
|
|
+}
|
|
+
|
|
+bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
|
|
+ return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
|
|
+}
|
|
+
|
|
+
|
|
+void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
|
|
+ DebugLoc DL) const {
|
|
+ MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
+ const AMDGPURegisterInfo & RI = getRegisterInfo();
|
|
+
|
|
+ for (unsigned i = 0; i < MI.getNumOperands(); i++) {
|
|
+ MachineOperand &MO = MI.getOperand(i);
|
|
+ // Convert dst regclass to one that is supported by the ISA
|
|
+ if (MO.isReg() && MO.isDef()) {
|
|
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
|
|
+ const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
|
|
+ const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
|
|
+
|
|
+ assert(newRegClass);
|
|
+
|
|
+ MRI.setRegClass(MO.getReg(), newRegClass);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
|
|
new file mode 100644
|
|
index 0000000..5220aa0
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
|
|
@@ -0,0 +1,207 @@
|
|
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Contains the definition of a TargetInstrInfo class that is common
|
|
+/// to all AMD GPUs.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPUINSTRUCTIONINFO_H
|
|
+#define AMDGPUINSTRUCTIONINFO_H
|
|
+
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "llvm/Target/TargetInstrInfo.h"
|
|
+
|
|
+#include <map>
|
|
+
|
|
+#define GET_INSTRINFO_HEADER
|
|
+#define GET_INSTRINFO_ENUM
|
|
+#include "AMDGPUGenInstrInfo.inc"
|
|
+
|
|
+#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT
|
|
+#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT
|
|
+#define OPCODE_IS_ZERO AMDGPU::PRED_SETE
|
|
+#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class AMDGPUTargetMachine;
|
|
+class MachineFunction;
|
|
+class MachineInstr;
|
|
+class MachineInstrBuilder;
|
|
+
|
|
+class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
|
|
+private:
|
|
+ const AMDGPURegisterInfo RI;
|
|
+ bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
|
|
+ MachineBasicBlock &MBB) const;
|
|
+protected:
|
|
+ TargetMachine &TM;
|
|
+public:
|
|
+ explicit AMDGPUInstrInfo(TargetMachine &tm);
|
|
+
|
|
+ virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
|
|
+
|
|
+ bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
|
|
+ unsigned &DstReg, unsigned &SubIdx) const;
|
|
+
|
|
+ unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
|
|
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
|
|
+ int &FrameIndex) const;
|
|
+ bool hasLoadFromStackSlot(const MachineInstr *MI,
|
|
+ const MachineMemOperand *&MMO,
|
|
+ int &FrameIndex) const;
|
|
+ unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
|
|
+ unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
|
|
+ int &FrameIndex) const;
|
|
+ bool hasStoreFromStackSlot(const MachineInstr *MI,
|
|
+ const MachineMemOperand *&MMO,
|
|
+ int &FrameIndex) const;
|
|
+
|
|
+ MachineInstr *
|
|
+ convertToThreeAddress(MachineFunction::iterator &MFI,
|
|
+ MachineBasicBlock::iterator &MBBI,
|
|
+ LiveVariables *LV) const;
|
|
+
|
|
+
|
|
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI, DebugLoc DL,
|
|
+ unsigned DestReg, unsigned SrcReg,
|
|
+ bool KillSrc) const = 0;
|
|
+
|
|
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI,
|
|
+ unsigned SrcReg, bool isKill, int FrameIndex,
|
|
+ const TargetRegisterClass *RC,
|
|
+ const TargetRegisterInfo *TRI) const;
|
|
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI,
|
|
+ unsigned DestReg, int FrameIndex,
|
|
+ const TargetRegisterClass *RC,
|
|
+ const TargetRegisterInfo *TRI) const;
|
|
+
|
|
+protected:
|
|
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
|
|
+ MachineInstr *MI,
|
|
+ const SmallVectorImpl<unsigned> &Ops,
|
|
+ int FrameIndex) const;
|
|
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
|
|
+ MachineInstr *MI,
|
|
+ const SmallVectorImpl<unsigned> &Ops,
|
|
+ MachineInstr *LoadMI) const;
|
|
+public:
|
|
+ bool canFoldMemoryOperand(const MachineInstr *MI,
|
|
+ const SmallVectorImpl<unsigned> &Ops) const;
|
|
+ bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
|
|
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
|
|
+ SmallVectorImpl<MachineInstr *> &NewMIs) const;
|
|
+ bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
|
|
+ SmallVectorImpl<SDNode *> &NewNodes) const;
|
|
+ unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
|
|
+ bool UnfoldLoad, bool UnfoldStore,
|
|
+ unsigned *LoadRegIndex = 0) const;
|
|
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
|
|
+ int64_t Offset1, int64_t Offset2,
|
|
+ unsigned NumLoads) const;
|
|
+
|
|
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
|
|
+ void insertNoop(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI) const;
|
|
+ bool isPredicated(const MachineInstr *MI) const;
|
|
+ bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
|
|
+ const SmallVectorImpl<MachineOperand> &Pred2) const;
|
|
+ bool DefinesPredicate(MachineInstr *MI,
|
|
+ std::vector<MachineOperand> &Pred) const;
|
|
+ bool isPredicable(MachineInstr *MI) const;
|
|
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
|
|
+
|
|
+ // Helper functions that check the opcode for status information
|
|
+ bool isLoadInst(llvm::MachineInstr *MI) const;
|
|
+ bool isExtLoadInst(llvm::MachineInstr *MI) const;
|
|
+ bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
|
|
+ bool isSExtLoadInst(llvm::MachineInstr *MI) const;
|
|
+ bool isZExtLoadInst(llvm::MachineInstr *MI) const;
|
|
+ bool isAExtLoadInst(llvm::MachineInstr *MI) const;
|
|
+ bool isStoreInst(llvm::MachineInstr *MI) const;
|
|
+ bool isTruncStoreInst(llvm::MachineInstr *MI) const;
|
|
+ bool isRegisterStore(const MachineInstr &MI) const;
|
|
+ bool isRegisterLoad(const MachineInstr &MI) const;
|
|
+
|
|
+//===---------------------------------------------------------------------===//
|
|
+// Pure virtual functions to be implemented by sub-classes.
|
|
+//===---------------------------------------------------------------------===//
|
|
+
|
|
+ virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
|
|
+ int64_t Imm) const = 0;
|
|
+ virtual unsigned getIEQOpcode() const = 0;
|
|
+ virtual bool isMov(unsigned opcode) const = 0;
|
|
+
|
|
+ /// \returns the smallest register index that will be accessed by an indirect
|
|
+ /// read or write or -1 if indirect addressing is not used by this program.
|
|
+ virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0;
|
|
+
|
|
+ /// \returns the largest register index that will be accessed by an indirect
|
|
+ /// read or write or -1 if indirect addressing is not used by this program.
|
|
+ virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0;
|
|
+
|
|
+ /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
|
|
+ /// \p Channel
|
|
+ ///
|
|
+ /// We model indirect addressing using a virtual address space that can be
|
|
+ /// accessed with loads and stores. The "Indirect Address" is the memory
|
|
+ /// address in this virtual address space that maps to the given \p RegIndex
|
|
+ /// and \p Channel.
|
|
+ virtual unsigned calculateIndirectAddress(unsigned RegIndex,
|
|
+ unsigned Channel) const = 0;
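// A minimal standalone sketch of one possible RegIndex/Channel -> "Indirect
// Address" mapping. The real mapping is target-defined (pure virtual above);
// the four-channel register layout below is only an assumption for
// illustration.
#include <cassert>

static unsigned exampleIndirectAddress(unsigned RegIndex, unsigned Channel) {
  assert(Channel < 4 && "sketch assumes four dword channels per register");
  // One dword slot per channel, registers laid out consecutively.
  return RegIndex * 4 + Channel;
}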
|
|
+
|
|
+ /// \returns The register class to be used for storing values to an
|
|
+ /// "Indirect Address" .
|
|
+ virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
|
|
+ unsigned SourceReg) const = 0;
|
|
+
|
|
+ /// \returns The register class to be used for loading values from
|
|
+ /// an "Indirect Address" .
|
|
+ virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0;
|
|
+
|
|
+ /// \brief Build instruction(s) for an indirect register write.
|
|
+ ///
|
|
+ /// \returns The instruction that performs the indirect register write
|
|
+ virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg, unsigned Address,
|
|
+ unsigned OffsetReg) const = 0;
|
|
+
|
|
+ /// \brief Build instruction(s) for an indirect register read.
|
|
+ ///
|
|
+ /// \returns The instruction that performs the indirect register read
|
|
+ virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg, unsigned Address,
|
|
+ unsigned OffsetReg) const = 0;
|
|
+
|
|
+ /// \returns the register class whose sub registers are the set of all
|
|
+ /// possible registers that can be used for indirect addressing.
|
|
+ virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0;
|
|
+
|
|
+
|
|
+ /// \brief Convert the AMDIL MachineInstr to a supported ISA
|
|
+ /// MachineInstr
|
|
+ virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
|
|
+ DebugLoc DL) const;
|
|
+
|
|
+};
|
|
+
|
|
+} // End llvm namespace
|
|
+
|
|
+#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63)
|
|
+#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62)
|
|
+
|
|
+#endif // AMDGPUINSTRUCTIONINFO_H
|
|
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
|
|
new file mode 100644
|
|
index 0000000..b66ae87
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
|
|
@@ -0,0 +1,82 @@
|
|
+//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// This file contains DAG node definitions for the AMDGPU target.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// AMDGPU DAG Profiles
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
|
|
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
|
|
+]>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// AMDGPU DAG Nodes
|
|
+//
|
|
+
|
|
+// out = ((a << 32) | b) >> c
|
|
+//
|
|
+// Can be used to optimize rotl:
|
|
+// rotl(a, b) = bitalign(a, a, 32 - b)
|
|
+def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
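// A minimal standalone sketch of the BITALIGN semantics described above,
// using uint32_t/uint64_t in place of the 32-bit GPU operands, plus the
// rotate-left identity the comment mentions.
#include <cassert>
#include <cstdint>

static uint32_t bitalign(uint32_t a, uint32_t b, uint32_t c) {
  assert(c <= 32 && "shift amount assumed to be in [0, 32]");
  // out = ((a << 32) | b) >> c, keeping the low 32 bits.
  return (uint32_t)((((uint64_t)a << 32) | b) >> c);
}

// rotl(a, b) == bitalign(a, a, 32 - b) for b in [0, 32).
static uint32_t rotlViaBitalign(uint32_t a, uint32_t b) {
  return bitalign(a, a, 32 - b);
}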
|
|
+
|
|
+// The argument to this node is a dword address.
|
|
+def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
|
|
+
|
|
+// out = a - floor(a)
|
|
+def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
|
|
+
|
|
+// out = max(a, b) a and b are floats
|
|
+def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
|
|
+ [SDNPCommutative, SDNPAssociative]
|
|
+>;
|
|
+
|
|
+// out = max(a, b) a and b are signed ints
|
|
+def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
|
|
+ [SDNPCommutative, SDNPAssociative]
|
|
+>;
|
|
+
|
|
+// out = max(a, b) a and b are unsigned ints
|
|
+def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
|
|
+ [SDNPCommutative, SDNPAssociative]
|
|
+>;
|
|
+
|
|
+// out = min(a, b) a and b are floats
|
|
+def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
|
|
+ [SDNPCommutative, SDNPAssociative]
|
|
+>;
|
|
+
|
|
+// out = min(a, b) a and b are signed ints
|
|
+def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
|
|
+ [SDNPCommutative, SDNPAssociative]
|
|
+>;
|
|
+
|
|
+// out = min(a, b) a and b are unsigned ints
|
|
+def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
|
|
+ [SDNPCommutative, SDNPAssociative]
|
|
+>;
|
|
+
|
|
+// urecip - This operation is a helper for integer division; it returns the
|
|
+// result of 1 / a as a fractional unsigned integer.
|
|
+// out = (2^32 / a) + e
|
|
+// e is rounding error
|
|
+def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
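// A minimal standalone sketch of the URECIP value described above: the
// truncated 0.32 fixed-point reciprocal 2^32 / a. Hardware may differ from
// this by a small rounding error e, and a == 0 or a == 1 need special care.
#include <cassert>
#include <cstdint>

static uint32_t urecip(uint32_t a) {
  assert(a > 1 && "sketch only covers the ordinary case");
  return (uint32_t)((1ULL << 32) / a);
}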
|
|
+
|
|
+def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>;
|
|
+
|
|
+def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
|
|
+ SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
|
|
+ [SDNPHasChain, SDNPMayLoad]>;
|
|
+
|
|
+def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
|
|
+ SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
|
|
+ [SDNPHasChain, SDNPMayStore]>;
|
|
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
|
|
new file mode 100644
|
|
index 0000000..0559a5a
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUInstructions.td
|
|
@@ -0,0 +1,268 @@
|
|
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// This file contains instruction defs that are common to all hw codegen
|
|
+// targets.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
|
|
+ field bit isRegisterLoad = 0;
|
|
+ field bit isRegisterStore = 0;
|
|
+
|
|
+ let Namespace = "AMDGPU";
|
|
+ let OutOperandList = outs;
|
|
+ let InOperandList = ins;
|
|
+ let AsmString = asm;
|
|
+ let Pattern = pattern;
|
|
+ let Itinerary = NullALU;
|
|
+
|
|
+ let TSFlags{63} = isRegisterLoad;
|
|
+ let TSFlags{62} = isRegisterStore;
|
|
+}
|
|
+
|
|
+class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
|
|
+ : AMDGPUInst<outs, ins, asm, pattern> {
|
|
+
|
|
+ field bits<32> Inst = 0xffffffff;
|
|
+
|
|
+}
|
|
+
|
|
+def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
|
|
+
|
|
+def COND_EQ : PatLeaf <
|
|
+ (cond),
|
|
+ [{switch(N->get()){{default: return false;
|
|
+ case ISD::SETOEQ: case ISD::SETUEQ:
|
|
+ case ISD::SETEQ: return true;}}}]
|
|
+>;
|
|
+
|
|
+def COND_NE : PatLeaf <
|
|
+ (cond),
|
|
+ [{switch(N->get()){{default: return false;
|
|
+ case ISD::SETONE: case ISD::SETUNE:
|
|
+ case ISD::SETNE: return true;}}}]
|
|
+>;
|
|
+def COND_GT : PatLeaf <
|
|
+ (cond),
|
|
+ [{switch(N->get()){{default: return false;
|
|
+ case ISD::SETOGT: case ISD::SETUGT:
|
|
+ case ISD::SETGT: return true;}}}]
|
|
+>;
|
|
+
|
|
+def COND_GE : PatLeaf <
|
|
+ (cond),
|
|
+ [{switch(N->get()){{default: return false;
|
|
+ case ISD::SETOGE: case ISD::SETUGE:
|
|
+ case ISD::SETGE: return true;}}}]
|
|
+>;
|
|
+
|
|
+def COND_LT : PatLeaf <
|
|
+ (cond),
|
|
+ [{switch(N->get()){{default: return false;
|
|
+ case ISD::SETOLT: case ISD::SETULT:
|
|
+ case ISD::SETLT: return true;}}}]
|
|
+>;
|
|
+
|
|
+def COND_LE : PatLeaf <
|
|
+ (cond),
|
|
+ [{switch(N->get()){{default: return false;
|
|
+ case ISD::SETOLE: case ISD::SETULE:
|
|
+ case ISD::SETLE: return true;}}}]
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Load/Store Pattern Fragments
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
|
|
+ return isGlobalLoad(dyn_cast<LoadSDNode>(N));
|
|
+}]>;
|
|
+
|
|
+class Constants {
|
|
+int TWO_PI = 0x40c90fdb;
|
|
+int PI = 0x40490fdb;
|
|
+int TWO_PI_INV = 0x3e22f983;
|
|
+}
|
|
+def CONST : Constants;
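// The hex values above are IEEE-754 single-precision bit patterns of 2*pi,
// pi and 1/(2*pi); a small standalone check:
#include <cstdint>
#include <cstdio>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F)); // portable bit reinterpretation
  return F;
}

int main() {
  std::printf("%f\n", bitsToFloat(0x40c90fdbu)); // ~6.283185, 2*pi
  std::printf("%f\n", bitsToFloat(0x40490fdbu)); // ~3.141593, pi
  std::printf("%f\n", bitsToFloat(0x3e22f983u)); // ~0.159155, 1/(2*pi)
  return 0;
}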
|
|
+
|
|
+def FP_ZERO : PatLeaf <
|
|
+ (fpimm),
|
|
+ [{return N->getValueAPF().isZero();}]
|
|
+>;
|
|
+
|
|
+def FP_ONE : PatLeaf <
|
|
+ (fpimm),
|
|
+ [{return N->isExactlyValue(1.0);}]
|
|
+>;
|
|
+
|
|
+let isCodeGenOnly = 1, isPseudo = 1 in {
|
|
+
|
|
+let usesCustomInserter = 1 in {
|
|
+
|
|
+class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
|
|
+ (outs rc:$dst),
|
|
+ (ins rc:$src0),
|
|
+ "CLAMP $dst, $src0",
|
|
+ [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
|
|
+>;
|
|
+
|
|
+class FABS <RegisterClass rc> : AMDGPUShaderInst <
|
|
+ (outs rc:$dst),
|
|
+ (ins rc:$src0),
|
|
+ "FABS $dst, $src0",
|
|
+ [(set rc:$dst, (fabs rc:$src0))]
|
|
+>;
|
|
+
|
|
+class FNEG <RegisterClass rc> : AMDGPUShaderInst <
|
|
+ (outs rc:$dst),
|
|
+ (ins rc:$src0),
|
|
+ "FNEG $dst, $src0",
|
|
+ [(set rc:$dst, (fneg rc:$src0))]
|
|
+>;
|
|
+
|
|
+def SHADER_TYPE : AMDGPUShaderInst <
|
|
+ (outs),
|
|
+ (ins i32imm:$type),
|
|
+ "SHADER_TYPE $type",
|
|
+ [(int_AMDGPU_shader_type imm:$type)]
|
|
+>;
|
|
+
|
|
+} // usesCustomInserter = 1
|
|
+
|
|
+multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
|
|
+ ComplexPattern addrPat> {
|
|
+ def RegisterLoad : AMDGPUShaderInst <
|
|
+ (outs dstClass:$dst),
|
|
+ (ins addrClass:$addr, i32imm:$chan),
|
|
+ "RegisterLoad $dst, $addr",
|
|
+ [(set (i32 dstClass:$dst), (AMDGPUregister_load addrPat:$addr,
|
|
+ (i32 timm:$chan)))]
|
|
+ > {
|
|
+ let isRegisterLoad = 1;
|
|
+ }
|
|
+
|
|
+ def RegisterStore : AMDGPUShaderInst <
|
|
+ (outs),
|
|
+ (ins dstClass:$val, addrClass:$addr, i32imm:$chan),
|
|
+ "RegisterStore $val, $addr",
|
|
+ [(AMDGPUregister_store (i32 dstClass:$val), addrPat:$addr, (i32 timm:$chan))]
|
|
+ > {
|
|
+ let isRegisterStore = 1;
|
|
+ }
|
|
+}
|
|
+
|
|
+} // End isCodeGenOnly = 1, isPseudo = 1
|
|
+
|
|
+/* Generic helper patterns for intrinsics */
|
|
+/* -------------------------------------- */
|
|
+
|
|
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul,
|
|
+ RegisterClass rc> : Pat <
|
|
+ (fpow rc:$src0, rc:$src1),
|
|
+ (exp_ieee (mul rc:$src1, (log_ieee rc:$src0)))
|
|
+>;
|
|
+
|
|
+/* Other helper patterns */
|
|
+/* --------------------- */
|
|
+
|
|
+/* Extract element pattern */
|
|
+class Extract_Element <ValueType sub_type, ValueType vec_type,
|
|
+ RegisterClass vec_class, int sub_idx,
|
|
+ SubRegIndex sub_reg>: Pat<
|
|
+ (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
|
|
+ (EXTRACT_SUBREG vec_class:$src, sub_reg)
|
|
+>;
|
|
+
|
|
+/* Insert element pattern */
|
|
+class Insert_Element <ValueType elem_type, ValueType vec_type,
|
|
+ RegisterClass elem_class, RegisterClass vec_class,
|
|
+ int sub_idx, SubRegIndex sub_reg> : Pat <
|
|
+
|
|
+ (vec_type (vector_insert (vec_type vec_class:$vec),
|
|
+ (elem_type elem_class:$elem), sub_idx)),
|
|
+ (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
|
|
+>;
|
|
+
|
|
+// Vector Build pattern
|
|
+class Vector1_Build <ValueType vecType, RegisterClass vectorClass,
|
|
+ ValueType elemType, RegisterClass elemClass> : Pat <
|
|
+ (vecType (build_vector (elemType elemClass:$src))),
|
|
+ (vecType elemClass:$src)
|
|
+>;
|
|
+
|
|
+class Vector2_Build <ValueType vecType, RegisterClass vectorClass,
|
|
+ ValueType elemType, RegisterClass elemClass> : Pat <
|
|
+ (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1))),
|
|
+ (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1)
|
|
+>;
|
|
+
|
|
+class Vector_Build <ValueType vecType, RegisterClass vectorClass,
|
|
+ ValueType elemType, RegisterClass elemClass> : Pat <
|
|
+ (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
|
|
+ (elemType elemClass:$z), (elemType elemClass:$w))),
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (vecType (IMPLICIT_DEF)), elemClass:$x, sub0), elemClass:$y, sub1),
|
|
+ elemClass:$z, sub2), elemClass:$w, sub3)
|
|
+>;
|
|
+
|
|
+class Vector8_Build <ValueType vecType, RegisterClass vectorClass,
|
|
+ ValueType elemType, RegisterClass elemClass> : Pat <
|
|
+ (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1),
|
|
+ (elemType elemClass:$sub2), (elemType elemClass:$sub3),
|
|
+ (elemType elemClass:$sub4), (elemType elemClass:$sub5),
|
|
+ (elemType elemClass:$sub6), (elemType elemClass:$sub7))),
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1),
|
|
+ elemClass:$sub2, sub2), elemClass:$sub3, sub3),
|
|
+ elemClass:$sub4, sub4), elemClass:$sub5, sub5),
|
|
+ elemClass:$sub6, sub6), elemClass:$sub7, sub7)
|
|
+>;
|
|
+
|
|
+class Vector16_Build <ValueType vecType, RegisterClass vectorClass,
|
|
+ ValueType elemType, RegisterClass elemClass> : Pat <
|
|
+ (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1),
|
|
+ (elemType elemClass:$sub2), (elemType elemClass:$sub3),
|
|
+ (elemType elemClass:$sub4), (elemType elemClass:$sub5),
|
|
+ (elemType elemClass:$sub6), (elemType elemClass:$sub7),
|
|
+ (elemType elemClass:$sub8), (elemType elemClass:$sub9),
|
|
+ (elemType elemClass:$sub10), (elemType elemClass:$sub11),
|
|
+ (elemType elemClass:$sub12), (elemType elemClass:$sub13),
|
|
+ (elemType elemClass:$sub14), (elemType elemClass:$sub15))),
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
|
|
+ (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1),
|
|
+ elemClass:$sub2, sub2), elemClass:$sub3, sub3),
|
|
+ elemClass:$sub4, sub4), elemClass:$sub5, sub5),
|
|
+ elemClass:$sub6, sub6), elemClass:$sub7, sub7),
|
|
+ elemClass:$sub8, sub8), elemClass:$sub9, sub9),
|
|
+ elemClass:$sub10, sub10), elemClass:$sub11, sub11),
|
|
+ elemClass:$sub12, sub12), elemClass:$sub13, sub13),
|
|
+ elemClass:$sub14, sub14), elemClass:$sub15, sub15)
|
|
+>;
|
|
+
|
|
+// bitconvert pattern
|
|
+class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
|
|
+ (dt (bitconvert (st rc:$src0))),
|
|
+ (dt rc:$src0)
|
|
+>;
|
|
+
|
|
+class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
|
|
+ (vt (AMDGPUdwordaddr (vt rc:$addr))),
|
|
+ (vt rc:$addr)
|
|
+>;
|
|
+
|
|
+include "R600Instructions.td"
|
|
+
|
|
+include "SIInstrInfo.td"
|
|
+
|
|
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
|
|
new file mode 100644
|
|
index 0000000..2ba2d4b
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
|
|
@@ -0,0 +1,62 @@
|
|
+//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// This file defines intrinsics that are used by all hw codegen targets.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let TargetPrefix = "AMDGPU", isTarget = 1 in {
|
|
+
|
|
+ def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
|
|
+ def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+
|
|
+ def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
|
|
+ def int_AMDGPU_kilp : Intrinsic<[], [], []>;
|
|
+ def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
|
|
+
|
|
+ def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>;
|
|
+}
|
|
+
|
|
+let TargetPrefix = "TGSI", isTarget = 1 in {
|
|
+
|
|
+ def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
|
|
+}
|
|
+
|
|
+include "SIIntrinsics.td"
|
|
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
|
|
new file mode 100644
|
|
index 0000000..32275a2b
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
|
|
@@ -0,0 +1,83 @@
|
|
+//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+
|
|
+#include "AMDGPUMCInstLower.h"
|
|
+#include "AMDGPUAsmPrinter.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
+#include "llvm/CodeGen/MachineInstr.h"
|
|
+#include "llvm/Constants.h"
|
|
+#include "llvm/MC/MCInst.h"
|
|
+#include "llvm/MC/MCStreamer.h"
|
|
+#include "llvm/MC/MCExpr.h"
|
|
+#include "llvm/Support/ErrorHandling.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx):
|
|
+ Ctx(ctx)
|
|
+{ }
|
|
+
|
|
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
|
|
+ OutMI.setOpcode(MI->getOpcode());
|
|
+
|
|
+ for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) {
|
|
+ const MachineOperand &MO = MI->getOperand(i);
|
|
+
|
|
+ MCOperand MCOp;
|
|
+ switch (MO.getType()) {
|
|
+ default:
|
|
+ llvm_unreachable("unknown operand type");
|
|
+ case MachineOperand::MO_FPImmediate: {
|
|
+ const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
|
|
+ assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
|
|
+ "Only floating point immediates are supported at the moment.");
|
|
+ MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
|
|
+ break;
|
|
+ }
|
|
+ case MachineOperand::MO_Immediate:
|
|
+ MCOp = MCOperand::CreateImm(MO.getImm());
|
|
+ break;
|
|
+ case MachineOperand::MO_Register:
|
|
+ MCOp = MCOperand::CreateReg(MO.getReg());
|
|
+ break;
|
|
+ case MachineOperand::MO_MachineBasicBlock:
|
|
+ MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
|
|
+ MO.getMBB()->getSymbol(), Ctx));
|
|
+ }
|
|
+ OutMI.addOperand(MCOp);
|
|
+ }
|
|
+}
|
|
+
|
|
+void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
|
|
+ AMDGPUMCInstLower MCInstLowering(OutContext);
|
|
+
|
|
+ if (MI->isBundle()) {
|
|
+ const MachineBasicBlock *MBB = MI->getParent();
|
|
+ MachineBasicBlock::const_instr_iterator I = MI;
|
|
+ ++I;
|
|
+ while (I != MBB->end() && I->isInsideBundle()) {
|
|
+ MCInst MCBundleInst;
|
|
+ const MachineInstr *BundledInst = I;
|
|
+ MCInstLowering.lower(BundledInst, MCBundleInst);
|
|
+ OutStreamer.EmitInstruction(MCBundleInst);
|
|
+ ++I;
|
|
+ }
|
|
+ } else {
|
|
+ MCInst TmpInst;
|
|
+ MCInstLowering.lower(MI, TmpInst);
|
|
+ OutStreamer.EmitInstruction(TmpInst);
|
|
+ }
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
|
|
new file mode 100644
|
|
index 0000000..d7d538e
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
|
|
@@ -0,0 +1,34 @@
|
|
+//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPU_MCINSTLOWER_H
|
|
+#define AMDGPU_MCINSTLOWER_H
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class MCInst;
|
|
+class MCContext;
|
|
+class MachineInstr;
|
|
+
|
|
+class AMDGPUMCInstLower {
|
|
+
|
|
+ MCContext &Ctx;
|
|
+
|
|
+public:
|
|
+ AMDGPUMCInstLower(MCContext &ctx);
|
|
+
|
|
+ /// \brief Lower a MachineInstr to an MCInst
|
|
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
|
|
+
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif //AMDGPU_MCINSTLOWER_H
|
|
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..d62e57b
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
|
|
@@ -0,0 +1,74 @@
|
|
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm,
|
|
+ const TargetInstrInfo &tii)
|
|
+: AMDGPUGenRegisterInfo(0),
|
|
+ TM(tm),
|
|
+ TII(tii)
|
|
+ { }
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Function handling callbacks - Functions are a seldom-used feature of GPUs, so
|
|
+// they are not supported at this time.
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
|
|
+
|
|
+const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
|
|
+ const {
|
|
+ return &CalleeSavedReg;
|
|
+}
|
|
+
|
|
+void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
|
|
+ int SPAdj,
|
|
+ RegScavenger *RS) const {
|
|
+ assert(!"Subroutines not supported yet");
|
|
+}
|
|
+
|
|
+unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
|
|
+ assert(!"Subroutines not supported yet");
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
|
|
+
|
|
+ switch(IndirectIndex) {
|
|
+ case 0: return AMDGPU::sub0;
|
|
+ case 1: return AMDGPU::sub1;
|
|
+ case 2: return AMDGPU::sub2;
|
|
+ case 3: return AMDGPU::sub3;
|
|
+ case 4: return AMDGPU::sub4;
|
|
+ case 5: return AMDGPU::sub5;
|
|
+ case 6: return AMDGPU::sub6;
|
|
+ case 7: return AMDGPU::sub7;
|
|
+ case 8: return AMDGPU::sub8;
|
|
+ case 9: return AMDGPU::sub9;
|
|
+ case 10: return AMDGPU::sub10;
|
|
+ case 11: return AMDGPU::sub11;
|
|
+ case 12: return AMDGPU::sub12;
|
|
+ case 13: return AMDGPU::sub13;
|
|
+ case 14: return AMDGPU::sub14;
|
|
+ case 15: return AMDGPU::sub15;
|
|
+ default: llvm_unreachable("indirect index out of range");
|
|
+ }
|
|
+}
|
|
+
|
|
+#define GET_REGINFO_TARGET_DESC
|
|
+#include "AMDGPUGenRegisterInfo.inc"
|
|
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
|
|
new file mode 100644
|
|
index 0000000..5007ff5
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
|
|
@@ -0,0 +1,65 @@
|
|
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
|
|
+/// targets.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPUREGISTERINFO_H
|
|
+#define AMDGPUREGISTERINFO_H
|
|
+
|
|
+#include "llvm/ADT/BitVector.h"
|
|
+#include "llvm/Target/TargetRegisterInfo.h"
|
|
+
|
|
+#define GET_REGINFO_HEADER
|
|
+#define GET_REGINFO_ENUM
|
|
+#include "AMDGPUGenRegisterInfo.inc"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class AMDGPUTargetMachine;
|
|
+class TargetInstrInfo;
|
|
+
|
|
+struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
|
|
+ TargetMachine &TM;
|
|
+ const TargetInstrInfo &TII;
|
|
+ static const uint16_t CalleeSavedReg;
|
|
+
|
|
+ AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii);
|
|
+
|
|
+ virtual BitVector getReservedRegs(const MachineFunction &MF) const {
|
|
+ assert(!"Unimplemented"); return BitVector();
|
|
+ }
|
|
+
|
|
+ /// \param RC is an AMDIL reg class.
|
|
+ ///
|
|
+ /// \returns The ISA reg class that is equivalent to \p RC.
|
|
+ virtual const TargetRegisterClass * getISARegClass(
|
|
+ const TargetRegisterClass * RC) const {
|
|
+ assert(!"Unimplemented"); return NULL;
|
|
+ }
|
|
+
|
|
+ virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
|
|
+ assert(!"Unimplemented"); return NULL;
|
|
+ }
|
|
+
|
|
+ const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
|
|
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
|
|
+ RegScavenger *RS) const;
|
|
+ unsigned getFrameRegister(const MachineFunction &MF) const;
|
|
+
|
|
+ unsigned getIndirectSubReg(unsigned IndirectIndex) const;
|
|
+
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // AMDGPUREGISTERINFO_H
|
|
diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td
|
|
new file mode 100644
|
|
index 0000000..b5aca03
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPURegisterInfo.td
|
|
@@ -0,0 +1,25 @@
|
|
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// Tablegen register definitions common to all hw codegen targets.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let Namespace = "AMDGPU" in {
|
|
+
|
|
+foreach Index = 0-15 in {
|
|
+ def sub#Index : SubRegIndex;
|
|
+}
|
|
+
|
|
+def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
|
|
+
|
|
+}
|
|
+
|
|
+include "R600RegisterInfo.td"
|
|
+include "SIRegisterInfo.td"
|
|
diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
|
|
new file mode 100644
|
|
index 0000000..a8c9621
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
|
|
@@ -0,0 +1,893 @@
|
|
+//===-- AMDGPUStructurizeCFG.cpp - Structurize the CFG --------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// The pass implemented in this file transforms the program's control flow
|
|
+/// graph into a form that's suitable for code generation on hardware that
|
|
+/// implements control flow by execution masking. This currently includes all
|
|
+/// AMD GPUs, but may also be useful for other types of hardware.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
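// A minimal standalone sketch of the "execution masking" model this pass
// targets: every lane of a wavefront follows the same structured code path,
// and a per-lane mask decides which lanes actually commit results (the lane
// count of 4 is only for illustration).
#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> X = {1, -2, 3, -4};
  std::array<bool, 4> Mask = {};

  // "If" block: evaluate the condition in every lane.
  for (int L = 0; L < 4; ++L)
    Mask[L] = X[L] > 0;

  // "Then" region: all lanes execute it, only masked-in lanes keep results.
  for (int L = 0; L < 4; ++L)
    if (Mask[L])
      X[L] += 10;

  // "Else" region: executed with the inverted mask.
  for (int L = 0; L < 4; ++L)
    if (!Mask[L])
      X[L] = 0;

  for (int L = 0; L < 4; ++L)
    std::printf("%d ", X[L]); // prints: 11 0 13 0
  std::printf("\n");
  return 0;
}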
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "llvm/Module.h"
|
|
+#include "llvm/ADT/SCCIterator.h"
|
|
+#include "llvm/Analysis/RegionIterator.h"
|
|
+#include "llvm/Analysis/RegionInfo.h"
|
|
+#include "llvm/Analysis/RegionPass.h"
|
|
+#include "llvm/Transforms/Utils/SSAUpdater.h"
|
|
+#include "llvm/Support/PatternMatch.h"
|
|
+
|
|
+using namespace llvm;
|
|
+using namespace llvm::PatternMatch;
|
|
+
|
|
+namespace {
|
|
+
|
|
+// Definition of the complex types used in this pass.
|
|
+
|
|
+typedef std::pair<BasicBlock *, Value *> BBValuePair;
|
|
+
|
|
+typedef SmallVector<RegionNode*, 8> RNVector;
|
|
+typedef SmallVector<BasicBlock*, 8> BBVector;
|
|
+typedef SmallVector<BranchInst*, 8> BranchVector;
|
|
+typedef SmallVector<BBValuePair, 2> BBValueVector;
|
|
+
|
|
+typedef SmallPtrSet<BasicBlock *, 8> BBSet;
|
|
+
|
|
+typedef DenseMap<PHINode *, BBValueVector> PhiMap;
|
|
+typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap;
|
|
+typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
|
|
+typedef DenseMap<BasicBlock *, Value *> BBPredicates;
|
|
+typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
|
|
+typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
|
|
+typedef DenseMap<BasicBlock *, BBVector> BB2BBVecMap;
|
|
+
|
|
+// The name for newly created blocks.
|
|
+
|
|
+static const char *FlowBlockName = "Flow";
|
|
+
|
|
+/// @brief Find the nearest common dominator for multiple BasicBlocks
|
|
+///
|
|
+/// Helper class for AMDGPUStructurizeCFG
|
|
+/// TODO: Maybe move into common code
|
|
+class NearestCommonDominator {
|
|
+
|
|
+ DominatorTree *DT;
|
|
+
|
|
+ DTN2UnsignedMap IndexMap;
|
|
+
|
|
+ BasicBlock *Result;
|
|
+ unsigned ResultIndex;
|
|
+ bool ExplicitMentioned;
|
|
+
|
|
+public:
|
|
+ /// \brief Start a new query
|
|
+ NearestCommonDominator(DominatorTree *DomTree) {
|
|
+ DT = DomTree;
|
|
+ Result = 0;
|
|
+ }
|
|
+
|
|
+ /// \brief Add BB to the resulting dominator
|
|
+ void addBlock(BasicBlock *BB, bool Remember = true) {
|
|
+
|
|
+ DomTreeNode *Node = DT->getNode(BB);
|
|
+
|
|
+ if (Result == 0) {
|
|
+ unsigned Numbering = 0;
|
|
+ for (;Node;Node = Node->getIDom())
|
|
+ IndexMap[Node] = ++Numbering;
|
|
+ Result = BB;
|
|
+ ResultIndex = 1;
|
|
+ ExplicitMentioned = Remember;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ for (;Node;Node = Node->getIDom())
|
|
+ if (IndexMap.count(Node))
|
|
+ break;
|
|
+ else
|
|
+ IndexMap[Node] = 0;
|
|
+
|
|
+ assert(Node && "Dominator tree invalid!");
|
|
+
|
|
+ unsigned Numbering = IndexMap[Node];
|
|
+ if (Numbering > ResultIndex) {
|
|
+ Result = Node->getBlock();
|
|
+ ResultIndex = Numbering;
|
|
+ ExplicitMentioned = Remember && (Result == BB);
|
|
+ } else if (Numbering == ResultIndex) {
|
|
+ ExplicitMentioned |= Remember;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /// \brief Is "Result" one of the BBs added with "Remember" = True?
|
|
+ bool wasResultExplicitMentioned() {
|
|
+ return ExplicitMentioned;
|
|
+ }
|
|
+
|
|
+ /// \brief Get the query result
|
|
+ BasicBlock *getResult() {
|
|
+ return Result;
|
|
+ }
|
|
+};
|
|
+
|
|
+/// @brief Transforms the control flow graph on a single entry/exit region
|
|
+/// at a time.
|
|
+///
|
|
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
|
|
+/// this:
|
|
+///
|
|
+/// \verbatim
|
|
+/// 1
|
|
+/// ||
|
|
+/// | |
|
|
+/// 2 |
|
|
+/// | /
|
|
+/// |/
|
|
+/// 3
|
|
+/// || Where:
|
|
+/// | | 1 = "If" block, calculates the condition
|
|
+/// 4 | 2 = "Then" subregion, runs if the condition is true
|
|
+/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
|
|
+/// |/ 4 = "Else" optional subregion, runs if the condition is false
|
|
+/// 5 5 = "End" block, also rejoins the control flow
|
|
+/// \endverbatim
|
|
+///
|
|
+/// Control flow is expressed as a branch where the true exit goes into the
|
|
+/// "Then"/"Else" region, while the false exit skips the region
|
|
+/// The condition for the optional "Else" region is expressed as a PHI node.
|
|
+/// The incoming values of the PHI node are true for the "If" edge and false
|
|
+/// for the "Then" edge.
|
|
+///
|
|
+/// In addition, even complicated loops look like this:
|
|
+///
|
|
+/// \verbatim
|
|
+/// 1
|
|
+/// ||
|
|
+/// | |
|
|
+/// 2 ^ Where:
|
|
+/// | / 1 = "Entry" block
|
|
+/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
|
|
+/// 3 3 = "Flow" block, with back edge to entry block
|
|
+/// |
|
|
+/// \endverbatim
|
|
+///
|
|
+/// The back edge of the "Flow" block is always on the false side of the branch
|
|
+/// while the true side continues the general flow. So the loop condition
|
|
+/// consists of a network of PHI nodes where the true incoming values express
|
|
+/// breaks and the false values express continue states.
|
|
+class AMDGPUStructurizeCFG : public RegionPass {
|
|
+
|
|
+ static char ID;
|
|
+
|
|
+ Type *Boolean;
|
|
+ ConstantInt *BoolTrue;
|
|
+ ConstantInt *BoolFalse;
|
|
+ UndefValue *BoolUndef;
|
|
+
|
|
+ Function *Func;
|
|
+ Region *ParentRegion;
|
|
+
|
|
+ DominatorTree *DT;
|
|
+
|
|
+ RNVector Order;
|
|
+ BBSet Visited;
|
|
+
|
|
+ BBPhiMap DeletedPhis;
|
|
+ BB2BBVecMap AddedPhis;
|
|
+
|
|
+ PredMap Predicates;
|
|
+ BranchVector Conditions;
|
|
+
|
|
+ BB2BBMap Loops;
|
|
+ PredMap LoopPreds;
|
|
+ BranchVector LoopConds;
|
|
+
|
|
+ RegionNode *PrevNode;
|
|
+
|
|
+ void orderNodes();
|
|
+
|
|
+ void analyzeLoops(RegionNode *N);
|
|
+
|
|
+ Value *invert(Value *Condition);
|
|
+
|
|
+ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
|
|
+
|
|
+ void gatherPredicates(RegionNode *N);
|
|
+
|
|
+ void collectInfos();
|
|
+
|
|
+ void insertConditions(bool Loops);
|
|
+
|
|
+ void delPhiValues(BasicBlock *From, BasicBlock *To);
|
|
+
|
|
+ void addPhiValues(BasicBlock *From, BasicBlock *To);
|
|
+
|
|
+ void setPhiValues();
|
|
+
|
|
+ void killTerminator(BasicBlock *BB);
|
|
+
|
|
+ void changeExit(RegionNode *Node, BasicBlock *NewExit,
|
|
+ bool IncludeDominator);
|
|
+
|
|
+ BasicBlock *getNextFlow(BasicBlock *Dominator);
|
|
+
|
|
+ BasicBlock *needPrefix(bool NeedEmpty);
|
|
+
|
|
+ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
|
|
+
|
|
+ void setPrevNode(BasicBlock *BB);
|
|
+
|
|
+ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
|
|
+
|
|
+ bool isPredictableTrue(RegionNode *Node);
|
|
+
|
|
+ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
|
|
+
|
|
+ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
|
|
+
|
|
+ void createFlow();
|
|
+
|
|
+ void rebuildSSA();
|
|
+
|
|
+public:
|
|
+ AMDGPUStructurizeCFG():
|
|
+ RegionPass(ID) {
|
|
+
|
|
+ initializeRegionInfoPass(*PassRegistry::getPassRegistry());
|
|
+ }
|
|
+
|
|
+ virtual bool doInitialization(Region *R, RGPassManager &RGM);
|
|
+
|
|
+ virtual bool runOnRegion(Region *R, RGPassManager &RGM);
|
|
+
|
|
+ virtual const char *getPassName() const {
|
|
+ return "AMDGPU simplify control flow";
|
|
+ }
|
|
+
|
|
+ void getAnalysisUsage(AnalysisUsage &AU) const {
|
|
+
|
|
+ AU.addRequired<DominatorTree>();
|
|
+ AU.addPreserved<DominatorTree>();
|
|
+ RegionPass::getAnalysisUsage(AU);
|
|
+ }
|
|
+
|
|
+};
|
|
+
|
|
+} // end anonymous namespace
|
|
+
|
|
+char AMDGPUStructurizeCFG::ID = 0;
|
|
+
|
|
+/// \brief Initialize the types and constants used in the pass
|
|
+bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
|
|
+ LLVMContext &Context = R->getEntry()->getContext();
|
|
+
|
|
+ Boolean = Type::getInt1Ty(Context);
|
|
+ BoolTrue = ConstantInt::getTrue(Context);
|
|
+ BoolFalse = ConstantInt::getFalse(Context);
|
|
+ BoolUndef = UndefValue::get(Boolean);
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/// \brief Build up the general order of nodes
|
|
+void AMDGPUStructurizeCFG::orderNodes() {
|
|
+ scc_iterator<Region *> I = scc_begin(ParentRegion),
|
|
+ E = scc_end(ParentRegion);
|
|
+ for (Order.clear(); I != E; ++I) {
|
|
+ std::vector<RegionNode *> &Nodes = *I;
|
|
+ Order.append(Nodes.begin(), Nodes.end());
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Determine the end of the loops
|
|
+void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) {
|
|
+
|
|
+ if (N->isSubRegion()) {
|
|
+ // Test for exit as back edge
|
|
+ BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
|
|
+ if (Visited.count(Exit))
|
|
+ Loops[Exit] = N->getEntry();
|
|
+
|
|
+ } else {
|
|
+ // Test for successors as back edges
|
|
+ BasicBlock *BB = N->getNodeAs<BasicBlock>();
|
|
+ BranchInst *Term = cast<BranchInst>(BB->getTerminator());
|
|
+
|
|
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
|
|
+ BasicBlock *Succ = Term->getSuccessor(i);
|
|
+
|
|
+ if (Visited.count(Succ))
|
|
+ Loops[Succ] = BB;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Invert the given condition
|
|
+Value *AMDGPUStructurizeCFG::invert(Value *Condition) {
|
|
+
|
|
+ // First: Check if it's a constant
|
|
+ if (Condition == BoolTrue)
|
|
+ return BoolFalse;
|
|
+
|
|
+ if (Condition == BoolFalse)
|
|
+ return BoolTrue;
|
|
+
|
|
+ if (Condition == BoolUndef)
|
|
+ return BoolUndef;
|
|
+
|
|
+ // Second: If the condition is already inverted, return the original value
|
|
+ if (match(Condition, m_Not(m_Value(Condition))))
|
|
+ return Condition;
|
|
+
|
|
+ // Third: Check all the users for an invert
|
|
+ BasicBlock *Parent = cast<Instruction>(Condition)->getParent();
|
|
+ for (Value::use_iterator I = Condition->use_begin(),
|
|
+ E = Condition->use_end(); I != E; ++I) {
|
|
+
|
|
+ Instruction *User = dyn_cast<Instruction>(*I);
|
|
+ if (!User || User->getParent() != Parent)
|
|
+ continue;
|
|
+
|
|
+ if (match(*I, m_Not(m_Specific(Condition))))
|
|
+ return *I;
|
|
+ }
|
|
+
|
|
+ // Last option: Create a new instruction
|
|
+ return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
|
|
+}
|
|
+
|
|
+/// \brief Build the condition for one edge
|
|
+Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
|
|
+ bool Invert) {
|
|
+ Value *Cond = Invert ? BoolFalse : BoolTrue;
|
|
+ if (Term->isConditional()) {
|
|
+ Cond = Term->getCondition();
|
|
+
|
|
+ if (Idx != Invert)
|
|
+ Cond = invert(Cond);
|
|
+ }
|
|
+ return Cond;
|
|
+}
|
|
+
|
|
+/// \brief Analyze the predecessors of each block and build up predicates
|
|
+void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) {
|
|
+
|
|
+ RegionInfo *RI = ParentRegion->getRegionInfo();
|
|
+ BasicBlock *BB = N->getEntry();
|
|
+ BBPredicates &Pred = Predicates[BB];
|
|
+ BBPredicates &LPred = LoopPreds[BB];
|
|
+
|
|
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
|
|
+ PI != PE; ++PI) {
|
|
+
|
|
+ // Ignore it if it's a branch from outside into our region entry
|
|
+ if (!ParentRegion->contains(*PI))
|
|
+ continue;
|
|
+
|
|
+ Region *R = RI->getRegionFor(*PI);
|
|
+ if (R == ParentRegion) {
|
|
+
|
|
+ // It's a top level block in our region
|
|
+ BranchInst *Term = cast<BranchInst>((*PI)->getTerminator());
|
|
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
|
|
+ BasicBlock *Succ = Term->getSuccessor(i);
|
|
+ if (Succ != BB)
|
|
+ continue;
|
|
+
|
|
+ if (Visited.count(*PI)) {
|
|
+ // Normal forward edge
|
|
+ if (Term->isConditional()) {
|
|
+ // Try to treat it like an ELSE block
|
|
+ BasicBlock *Other = Term->getSuccessor(!i);
|
|
+ if (Visited.count(Other) && !Loops.count(Other) &&
|
|
+ !Pred.count(Other) && !Pred.count(*PI)) {
|
|
+
|
|
+ Pred[Other] = BoolFalse;
|
|
+ Pred[*PI] = BoolTrue;
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+ Pred[*PI] = buildCondition(Term, i, false);
|
|
+
|
|
+ } else {
|
|
+ // Back edge
|
|
+ LPred[*PI] = buildCondition(Term, i, true);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ } else {
|
|
+
|
|
+ // It's an exit from a subregion
|
|
+ while(R->getParent() != ParentRegion)
|
|
+ R = R->getParent();
|
|
+
|
|
+ // Edge from inside a subregion to its entry, ignore it
|
|
+ if (R == N)
|
|
+ continue;
|
|
+
|
|
+ BasicBlock *Entry = R->getEntry();
|
|
+ if (Visited.count(Entry))
|
|
+ Pred[Entry] = BoolTrue;
|
|
+ else
|
|
+ LPred[Entry] = BoolFalse;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Collect various loop and predicate infos
|
|
+void AMDGPUStructurizeCFG::collectInfos() {
|
|
+
|
|
+ // Reset predicate
|
|
+ Predicates.clear();
|
|
+
|
|
+ // and loop infos
|
|
+ Loops.clear();
|
|
+ LoopPreds.clear();
|
|
+
|
|
+ // Reset the visited nodes
|
|
+ Visited.clear();
|
|
+
|
|
+ for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
|
|
+ OI != OE; ++OI) {
|
|
+
|
|
+ // Analyze all the conditions leading to a node
|
|
+ gatherPredicates(*OI);
|
|
+
|
|
+ // Remember that we've seen this node
|
|
+ Visited.insert((*OI)->getEntry());
|
|
+
|
|
+ // Find the last back edges
|
|
+ analyzeLoops(*OI);
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Insert the missing branch conditions
|
|
+void AMDGPUStructurizeCFG::insertConditions(bool Loops) {
|
|
+ BranchVector &Conds = Loops ? LoopConds : Conditions;
|
|
+ Value *Default = Loops ? BoolTrue : BoolFalse;
|
|
+ SSAUpdater PhiInserter;
|
|
+
|
|
+ for (BranchVector::iterator I = Conds.begin(),
|
|
+ E = Conds.end(); I != E; ++I) {
|
|
+
|
|
+ BranchInst *Term = *I;
|
|
+ assert(Term->isConditional());
|
|
+
|
|
+ BasicBlock *Parent = Term->getParent();
|
|
+ BasicBlock *SuccTrue = Term->getSuccessor(0);
|
|
+ BasicBlock *SuccFalse = Term->getSuccessor(1);
|
|
+
|
|
+ PhiInserter.Initialize(Boolean, "");
|
|
+ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
|
|
+ PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
|
|
+
|
|
+ BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
|
|
+
|
|
+ NearestCommonDominator Dominator(DT);
|
|
+ Dominator.addBlock(Parent, false);
|
|
+
|
|
+ Value *ParentValue = 0;
|
|
+ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
|
|
+ PI != PE; ++PI) {
|
|
+
|
|
+ if (PI->first == Parent) {
|
|
+ ParentValue = PI->second;
|
|
+ break;
|
|
+ }
|
|
+ PhiInserter.AddAvailableValue(PI->first, PI->second);
|
|
+ Dominator.addBlock(PI->first);
|
|
+ }
|
|
+
|
|
+ if (ParentValue) {
|
|
+ Term->setCondition(ParentValue);
|
|
+ } else {
|
|
+ if (!Dominator.wasResultExplicitMentioned())
|
|
+ PhiInserter.AddAvailableValue(Dominator.getResult(), Default);
|
|
+
|
|
+ Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Remove all PHI values coming from "From" into "To" and remember
|
|
+/// them in DeletedPhis
|
|
+void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
|
|
+ PhiMap &Map = DeletedPhis[To];
|
|
+ for (BasicBlock::iterator I = To->begin(), E = To->end();
|
|
+ I != E && isa<PHINode>(*I);) {
|
|
+
|
|
+ PHINode &Phi = cast<PHINode>(*I++);
|
|
+ while (Phi.getBasicBlockIndex(From) != -1) {
|
|
+ Value *Deleted = Phi.removeIncomingValue(From, false);
|
|
+ Map[&Phi].push_back(std::make_pair(From, Deleted));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Add a dummy PHI value as soon as we know the new predecessor
|
|
+void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
|
|
+ for (BasicBlock::iterator I = To->begin(), E = To->end();
|
|
+ I != E && isa<PHINode>(*I);) {
|
|
+
|
|
+ PHINode &Phi = cast<PHINode>(*I++);
|
|
+ Value *Undef = UndefValue::get(Phi.getType());
|
|
+ Phi.addIncoming(Undef, From);
|
|
+ }
|
|
+ AddedPhis[To].push_back(From);
|
|
+}
|
|
+
|
|
+/// \brief Add the real PHI value as soon as everything is set up
|
|
+void AMDGPUStructurizeCFG::setPhiValues() {
|
|
+
|
|
+ SSAUpdater Updater;
|
|
+ for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end();
|
|
+ AI != AE; ++AI) {
|
|
+
|
|
+ BasicBlock *To = AI->first;
|
|
+ BBVector &From = AI->second;
|
|
+
|
|
+ if (!DeletedPhis.count(To))
|
|
+ continue;
|
|
+
|
|
+ PhiMap &Map = DeletedPhis[To];
|
|
+ for (PhiMap::iterator PI = Map.begin(), PE = Map.end();
|
|
+ PI != PE; ++PI) {
|
|
+
|
|
+ PHINode *Phi = PI->first;
|
|
+ Value *Undef = UndefValue::get(Phi->getType());
|
|
+ Updater.Initialize(Phi->getType(), "");
|
|
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
|
|
+ Updater.AddAvailableValue(To, Undef);
|
|
+
|
|
+ NearestCommonDominator Dominator(DT);
|
|
+ Dominator.addBlock(To, false);
|
|
+ for (BBValueVector::iterator VI = PI->second.begin(),
|
|
+ VE = PI->second.end(); VI != VE; ++VI) {
|
|
+
|
|
+ Updater.AddAvailableValue(VI->first, VI->second);
|
|
+ Dominator.addBlock(VI->first);
|
|
+ }
|
|
+
|
|
+ if (!Dominator.wasResultExplicitMentioned())
|
|
+ Updater.AddAvailableValue(Dominator.getResult(), Undef);
|
|
+
|
|
+ for (BBVector::iterator FI = From.begin(), FE = From.end();
|
|
+ FI != FE; ++FI) {
|
|
+
|
|
+ int Idx = Phi->getBasicBlockIndex(*FI);
|
|
+ assert(Idx != -1);
|
|
+ Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI));
|
|
+ }
|
|
+ }
|
|
+
|
|
+ DeletedPhis.erase(To);
|
|
+ }
|
|
+ assert(DeletedPhis.empty());
|
|
+}
|
|
+
|
|
+/// \brief Remove phi values from all successors and then remove the terminator.
|
|
+void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) {
|
|
+ TerminatorInst *Term = BB->getTerminator();
|
|
+ if (!Term)
|
|
+ return;
|
|
+
|
|
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
|
|
+ SI != SE; ++SI) {
|
|
+
|
|
+ delPhiValues(BB, *SI);
|
|
+ }
|
|
+
|
|
+ Term->eraseFromParent();
|
|
+}
|
|
+
|
|
+/// \brief Let node exit(s) point to NewExit
|
|
+void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
|
|
+ bool IncludeDominator) {
|
|
+
|
|
+ if (Node->isSubRegion()) {
|
|
+ Region *SubRegion = Node->getNodeAs<Region>();
|
|
+ BasicBlock *OldExit = SubRegion->getExit();
|
|
+ BasicBlock *Dominator = 0;
|
|
+
|
|
+ // Find all the edges from the sub region to the exit
|
|
+ for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit);
|
|
+ I != E;) {
|
|
+
|
|
+ BasicBlock *BB = *I++;
|
|
+ if (!SubRegion->contains(BB))
|
|
+ continue;
|
|
+
|
|
+ // Modify the edges to point to the new exit
|
|
+ delPhiValues(BB, OldExit);
|
|
+ BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
|
|
+ addPhiValues(BB, NewExit);
|
|
+
|
|
+ // Find the new dominator (if requested)
|
|
+ if (IncludeDominator) {
|
|
+ if (!Dominator)
|
|
+ Dominator = BB;
|
|
+ else
|
|
+ Dominator = DT->findNearestCommonDominator(Dominator, BB);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // Change the dominator (if requested)
|
|
+ if (Dominator)
|
|
+ DT->changeImmediateDominator(NewExit, Dominator);
|
|
+
|
|
+ // Update the region info
|
|
+ SubRegion->replaceExit(NewExit);
|
|
+
|
|
+ } else {
|
|
+ BasicBlock *BB = Node->getNodeAs<BasicBlock>();
|
|
+ killTerminator(BB);
|
|
+ BranchInst::Create(NewExit, BB);
|
|
+ addPhiValues(BB, NewExit);
|
|
+ if (IncludeDominator)
|
|
+ DT->changeImmediateDominator(NewExit, BB);
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Create a new flow node and update dominator tree and region info
|
|
+BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) {
|
|
+ LLVMContext &Context = Func->getContext();
|
|
+ BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
|
|
+ Order.back()->getEntry();
|
|
+ BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
|
|
+ Func, Insert);
|
|
+ DT->addNewBlock(Flow, Dominator);
|
|
+ ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
|
|
+ return Flow;
|
|
+}
|
|
+
|
|
+/// \brief Create a new or reuse the previous node as flow node
|
|
+BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) {
|
|
+
|
|
+ BasicBlock *Entry = PrevNode->getEntry();
|
|
+
|
|
+ if (!PrevNode->isSubRegion()) {
|
|
+ killTerminator(Entry);
|
|
+ if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
|
|
+ return Entry;
|
|
+
|
|
+ }
|
|
+
|
|
+ // create a new flow node
|
|
+ BasicBlock *Flow = getNextFlow(Entry);
|
|
+
|
|
+ // and wire it up
|
|
+ changeExit(PrevNode, Flow, true);
|
|
+ PrevNode = ParentRegion->getBBNode(Flow);
|
|
+ return Flow;
|
|
+}
|
|
+
|
|
+/// \brief Returns the region exit if possible, otherwise just a new flow node
|
|
+BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow,
|
|
+ bool ExitUseAllowed) {
|
|
+
|
|
+ if (Order.empty() && ExitUseAllowed) {
|
|
+ BasicBlock *Exit = ParentRegion->getExit();
|
|
+ DT->changeImmediateDominator(Exit, Flow);
|
|
+ addPhiValues(Flow, Exit);
|
|
+ return Exit;
|
|
+ }
|
|
+ return getNextFlow(Flow);
|
|
+}
|
|
+
|
|
+/// \brief Set the previous node
|
|
+void AMDGPUStructurizeCFG::setPrevNode(BasicBlock *BB) {
|
|
+ PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0;
|
|
+}
|
|
+
|
|
+/// \brief Does BB dominate all the predicates of Node ?
|
|
+bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
|
|
+ BBPredicates &Preds = Predicates[Node->getEntry()];
|
|
+ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
|
|
+ PI != PE; ++PI) {
|
|
+
|
|
+ if (!DT->dominates(BB, PI->first))
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/// \brief Can we predict that this node will always be called?
|
|
+bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) {
|
|
+
|
|
+ BBPredicates &Preds = Predicates[Node->getEntry()];
|
|
+ bool Dominated = false;
|
|
+
|
|
+ // Region entry is always true
|
|
+ if (PrevNode == 0)
|
|
+ return true;
|
|
+
|
|
+ for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
|
|
+ I != E; ++I) {
|
|
+
|
|
+ if (I->second != BoolTrue)
|
|
+ return false;
|
|
+
|
|
+ if (!Dominated && DT->dominates(I->first, PrevNode->getEntry()))
|
|
+ Dominated = true;
|
|
+ }
|
|
+
|
|
+ // TODO: The dominator check is too strict
|
|
+ return Dominated;
|
|
+}
|
|
+
|
|
+/// Take one node from the order vector and wire it up
|
|
+void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed,
|
|
+ BasicBlock *LoopEnd) {
|
|
+
|
|
+ RegionNode *Node = Order.pop_back_val();
|
|
+ Visited.insert(Node->getEntry());
|
|
+
|
|
+ if (isPredictableTrue(Node)) {
|
|
+ // Just a linear flow
|
|
+ if (PrevNode) {
|
|
+ changeExit(PrevNode, Node->getEntry(), true);
|
|
+ }
|
|
+ PrevNode = Node;
|
|
+
|
|
+ } else {
|
|
+ // Insert extra prefix node (or reuse last one)
|
|
+ BasicBlock *Flow = needPrefix(false);
|
|
+
|
|
+ // Insert extra postfix node (or use exit instead)
|
|
+ BasicBlock *Entry = Node->getEntry();
|
|
+ BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
|
|
+
|
|
+ // let it point to entry and next block
|
|
+ Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
|
|
+ addPhiValues(Flow, Entry);
|
|
+ DT->changeImmediateDominator(Entry, Flow);
|
|
+
|
|
+ PrevNode = Node;
|
|
+ while (!Order.empty() && !Visited.count(LoopEnd) &&
|
|
+ dominatesPredicates(Entry, Order.back())) {
|
|
+ handleLoops(false, LoopEnd);
|
|
+ }
|
|
+
|
|
+ changeExit(PrevNode, Next, false);
|
|
+ setPrevNode(Next);
|
|
+ }
|
|
+}
|
|
+
|
|
+void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed,
|
|
+ BasicBlock *LoopEnd) {
|
|
+ RegionNode *Node = Order.back();
|
|
+ BasicBlock *LoopStart = Node->getEntry();
|
|
+
|
|
+ if (!Loops.count(LoopStart)) {
|
|
+ wireFlow(ExitUseAllowed, LoopEnd);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (!isPredictableTrue(Node))
|
|
+ LoopStart = needPrefix(true);
|
|
+
|
|
+ LoopEnd = Loops[Node->getEntry()];
|
|
+ wireFlow(false, LoopEnd);
|
|
+ while (!Visited.count(LoopEnd)) {
|
|
+ handleLoops(false, LoopEnd);
|
|
+ }
|
|
+
|
|
+ // Create an extra loop end node
|
|
+ LoopEnd = needPrefix(false);
|
|
+ BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
|
|
+ LoopConds.push_back(BranchInst::Create(Next, LoopStart,
|
|
+ BoolUndef, LoopEnd));
|
|
+ addPhiValues(LoopEnd, LoopStart);
|
|
+ setPrevNode(Next);
|
|
+}
|
|
+
|
|
+/// After this function control flow looks like it should be, but
|
|
+/// branches and PHI nodes only have undefined conditions.
|
|
+void AMDGPUStructurizeCFG::createFlow() {
|
|
+
|
|
+ BasicBlock *Exit = ParentRegion->getExit();
|
|
+ bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
|
|
+
|
|
+ DeletedPhis.clear();
|
|
+ AddedPhis.clear();
|
|
+ Conditions.clear();
|
|
+ LoopConds.clear();
|
|
+
|
|
+ PrevNode = 0;
|
|
+ Visited.clear();
|
|
+
|
|
+ while (!Order.empty()) {
|
|
+ handleLoops(EntryDominatesExit, 0);
|
|
+ }
|
|
+
|
|
+ if (PrevNode)
|
|
+ changeExit(PrevNode, Exit, EntryDominatesExit);
|
|
+ else
|
|
+ assert(EntryDominatesExit);
|
|
+}
|
|
+
|
|
+/// Handle a rare case where the disintegrated nodes' instructions
|
|
+/// no longer dominate all their uses. Not sure if this is really necessary.
|
|
+void AMDGPUStructurizeCFG::rebuildSSA() {
|
|
+ SSAUpdater Updater;
|
|
+ for (Region::block_iterator I = ParentRegion->block_begin(),
|
|
+ E = ParentRegion->block_end();
|
|
+ I != E; ++I) {
|
|
+
|
|
+ BasicBlock *BB = *I;
|
|
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
|
|
+ II != IE; ++II) {
|
|
+
|
|
+ bool Initialized = false;
|
|
+ for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) {
|
|
+
|
|
+ Next = I->getNext();
|
|
+
|
|
+ Instruction *User = cast<Instruction>(I->getUser());
|
|
+ if (User->getParent() == BB) {
|
|
+ continue;
|
|
+
|
|
+ } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
|
|
+ if (UserPN->getIncomingBlock(*I) == BB)
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (DT->dominates(II, User))
|
|
+ continue;
|
|
+
|
|
+ if (!Initialized) {
|
|
+ Value *Undef = UndefValue::get(II->getType());
|
|
+ Updater.Initialize(II->getType(), "");
|
|
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
|
|
+ Updater.AddAvailableValue(BB, II);
|
|
+ Initialized = true;
|
|
+ }
|
|
+ Updater.RewriteUseAfterInsertions(*I);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Run the transformation for each region found
|
|
+bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
|
|
+ if (R->isTopLevelRegion())
|
|
+ return false;
|
|
+
|
|
+ Func = R->getEntry()->getParent();
|
|
+ ParentRegion = R;
|
|
+
|
|
+ DT = &getAnalysis<DominatorTree>();
|
|
+
|
|
+ orderNodes();
|
|
+ collectInfos();
|
|
+ createFlow();
|
|
+ insertConditions(false);
|
|
+ insertConditions(true);
|
|
+ setPhiValues();
|
|
+ rebuildSSA();
|
|
+
|
|
+ // Cleanup
|
|
+ Order.clear();
|
|
+ Visited.clear();
|
|
+ DeletedPhis.clear();
|
|
+ AddedPhis.clear();
|
|
+ Predicates.clear();
|
|
+ Conditions.clear();
|
|
+ Loops.clear();
|
|
+ LoopPreds.clear();
|
|
+ LoopConds.clear();
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/// \brief Create the pass
|
|
+Pass *llvm::createAMDGPUStructurizeCFGPass() {
|
|
+ return new AMDGPUStructurizeCFG();
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
|
|
new file mode 100644
|
|
index 0000000..0f356a1
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
|
|
@@ -0,0 +1,87 @@
|
|
+//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUSubtarget.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+#define GET_SUBTARGETINFO_ENUM
|
|
+#define GET_SUBTARGETINFO_TARGET_DESC
|
|
+#define GET_SUBTARGETINFO_CTOR
|
|
+#include "AMDGPUGenSubtargetInfo.inc"
|
|
+
|
|
+AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
|
|
+ AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
|
|
+ InstrItins = getInstrItineraryForCPU(CPU);
|
|
+
|
|
+ memset(CapsOverride, 0, sizeof(*CapsOverride)
|
|
+ * AMDGPUDeviceInfo::MaxNumberCapabilities);
|
|
+ // Default card
|
|
+ StringRef GPU = CPU;
|
|
+ Is64bit = false;
|
|
+ DefaultSize[0] = 64;
|
|
+ DefaultSize[1] = 1;
|
|
+ DefaultSize[2] = 1;
|
|
+ ParseSubtargetFeatures(GPU, FS);
|
|
+ DevName = GPU;
|
|
+ Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit);
|
|
+}
|
|
+
|
|
+AMDGPUSubtarget::~AMDGPUSubtarget() {
|
|
+ delete Device;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const {
|
|
+ assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities &&
|
|
+ "Caps index is out of bounds!");
|
|
+ return CapsOverride[caps];
|
|
+}
|
|
+bool
|
|
+AMDGPUSubtarget::is64bit() const {
|
|
+ return Is64bit;
|
|
+}
|
|
+bool
|
|
+AMDGPUSubtarget::isTargetELF() const {
|
|
+ return false;
|
|
+}
|
|
+size_t
|
|
+AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
|
|
+ if (dim > 3) {
|
|
+ return 1;
|
|
+ } else {
|
|
+ return DefaultSize[dim];
|
|
+ }
|
|
+}
|
|
+
|
|
+std::string
|
|
+AMDGPUSubtarget::getDataLayout() const {
|
|
+ if (!Device) {
|
|
+ return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
|
|
+ "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
|
|
+ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
|
|
+ "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
|
|
+ "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64");
|
|
+ }
|
|
+ return Device->getDataLayout();
|
|
+}
|
|
+
|
|
+std::string
|
|
+AMDGPUSubtarget::getDeviceName() const {
|
|
+ return DevName;
|
|
+}
|
|
+const AMDGPUDevice *
|
|
+AMDGPUSubtarget::device() const {
|
|
+ return Device;
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
|
|
new file mode 100644
|
|
index 0000000..cab7884
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUSubtarget.h
|
|
@@ -0,0 +1,65 @@
|
|
+//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief AMDGPU specific subclass of TargetSubtarget.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPUSUBTARGET_H
|
|
+#define AMDGPUSUBTARGET_H
|
|
+#include "AMDILDevice.h"
|
|
+#include "llvm/ADT/StringExtras.h"
|
|
+#include "llvm/ADT/StringRef.h"
|
|
+#include "llvm/Target/TargetSubtargetInfo.h"
|
|
+
|
|
+#define GET_SUBTARGETINFO_HEADER
|
|
+#include "AMDGPUGenSubtargetInfo.inc"
|
|
+
|
|
+#define MAX_CB_SIZE (1 << 16)
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
|
|
+private:
|
|
+ bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities];
|
|
+ const AMDGPUDevice *Device;
|
|
+ size_t DefaultSize[3];
|
|
+ std::string DevName;
|
|
+ bool Is64bit;
|
|
+ bool Is32on64bit;
|
|
+ bool DumpCode;
|
|
+ bool R600ALUInst;
|
|
+
|
|
+ InstrItineraryData InstrItins;
|
|
+
|
|
+public:
|
|
+ AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
|
|
+ virtual ~AMDGPUSubtarget();
|
|
+
|
|
+ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
|
|
+ virtual void ParseSubtargetFeatures(llvm::StringRef CPU, llvm::StringRef FS);
|
|
+
|
|
+ bool isOverride(AMDGPUDeviceInfo::Caps) const;
|
|
+ bool is64bit() const;
|
|
+
|
|
+ // Helper functions to simplify if statements
|
|
+ bool isTargetELF() const;
|
|
+ const AMDGPUDevice* device() const;
|
|
+ std::string getDataLayout() const;
|
|
+ std::string getDeviceName() const;
|
|
+ virtual size_t getDefaultSize(uint32_t dim) const;
|
|
+ bool dumpCode() const { return DumpCode; }
|
|
+ bool r600ALUEncoding() const { return R600ALUInst; }
|
|
+
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // AMDGPUSUBTARGET_H
|
|
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
|
|
new file mode 100644
|
|
index 0000000..e2f00be
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
|
|
@@ -0,0 +1,153 @@
|
|
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief The AMDGPU target machine contains all of the hardware specific
|
|
+/// information needed to emit code for R600 and SI GPUs.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+#include "AMDGPU.h"
|
|
+#include "R600ISelLowering.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "SIISelLowering.h"
|
|
+#include "SIInstrInfo.h"
|
|
+#include "llvm/Analysis/Passes.h"
|
|
+#include "llvm/Analysis/Verifier.h"
|
|
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
|
|
+#include "llvm/CodeGen/MachineModuleInfo.h"
|
|
+#include "llvm/CodeGen/Passes.h"
|
|
+#include "llvm/MC/MCAsmInfo.h"
|
|
+#include "llvm/PassManager.h"
|
|
+#include "llvm/Support/TargetRegistry.h"
|
|
+#include "llvm/Support/raw_os_ostream.h"
|
|
+#include "llvm/Transforms/IPO.h"
|
|
+#include "llvm/Transforms/Scalar.h"
|
|
+#include <llvm/CodeGen/Passes.h>
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+extern "C" void LLVMInitializeR600Target() {
|
|
+ // Register the target
|
|
+ RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
|
|
+}
|
|
+
|
|
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
|
|
+ StringRef CPU, StringRef FS,
|
|
+ TargetOptions Options,
|
|
+ Reloc::Model RM, CodeModel::Model CM,
|
|
+ CodeGenOpt::Level OptLevel
|
|
+)
|
|
+:
|
|
+ LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
|
|
+ Subtarget(TT, CPU, FS),
|
|
+ Layout(Subtarget.getDataLayout()),
|
|
+ FrameLowering(TargetFrameLowering::StackGrowsUp,
|
|
+ Subtarget.device()->getStackAlignment(), 0),
|
|
+ IntrinsicInfo(this),
|
|
+ InstrItins(&Subtarget.getInstrItineraryData()) {
|
|
+ // TLInfo uses InstrInfo so it must be initialized after.
|
|
+ if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
|
|
+ InstrInfo = new R600InstrInfo(*this);
|
|
+ TLInfo = new R600TargetLowering(*this);
|
|
+ } else {
|
|
+ InstrInfo = new SIInstrInfo(*this);
|
|
+ TLInfo = new SITargetLowering(*this);
|
|
+ }
|
|
+}
|
|
+
|
|
+AMDGPUTargetMachine::~AMDGPUTargetMachine() {
|
|
+}
|
|
+
|
|
+namespace {
|
|
+class AMDGPUPassConfig : public TargetPassConfig {
|
|
+public:
|
|
+ AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
|
|
+ : TargetPassConfig(TM, PM) {}
|
|
+
|
|
+ AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
|
|
+ return getTM<AMDGPUTargetMachine>();
|
|
+ }
|
|
+
|
|
+ virtual bool addPreISel();
|
|
+ virtual bool addInstSelector();
|
|
+ virtual bool addPreRegAlloc();
|
|
+ virtual bool addPostRegAlloc();
|
|
+ virtual bool addPreSched2();
|
|
+ virtual bool addPreEmitPass();
|
|
+};
|
|
+} // End of anonymous namespace
|
|
+
|
|
+TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
|
|
+ return new AMDGPUPassConfig(this, PM);
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPassConfig::addPreISel() {
|
|
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
|
+ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
|
|
+ addPass(createAMDGPUStructurizeCFGPass());
|
|
+ addPass(createSIAnnotateControlFlowPass());
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUPassConfig::addInstSelector() {
|
|
+ addPass(createAMDGPUPeepholeOpt(*TM));
|
|
+ addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
|
|
+
|
|
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
|
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
|
|
+ // The callbacks this pass uses are not implemented yet on SI.
|
|
+ addPass(createAMDGPUIndirectAddressingPass(*TM));
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUPassConfig::addPreRegAlloc() {
|
|
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
|
+
|
|
+ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
|
|
+ addPass(createSIAssignInterpRegsPass(*TM));
|
|
+ }
|
|
+ addPass(createAMDGPUConvertToISAPass(*TM));
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUPassConfig::addPostRegAlloc() {
|
|
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
|
+
|
|
+ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
|
|
+ addPass(createSIInsertWaits(*TM));
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUPassConfig::addPreSched2() {
|
|
+
|
|
+ addPass(&IfConverterID);
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUPassConfig::addPreEmitPass() {
|
|
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
|
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
|
|
+ addPass(createAMDGPUCFGPreparationPass(*TM));
|
|
+ addPass(createAMDGPUCFGStructurizerPass(*TM));
|
|
+ addPass(createR600ExpandSpecialInstrsPass(*TM));
|
|
+ addPass(&FinalizeMachineBundlesID);
|
|
+ addPass(createR600LowerConstCopy(*TM));
|
|
+ } else {
|
|
+ addPass(createSILowerControlFlowPass(*TM));
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
|
|
new file mode 100644
|
|
index 0000000..5a1dcf4
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
|
|
@@ -0,0 +1,70 @@
|
|
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief The AMDGPU TargetMachine interface definition for hw codegen targets.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPU_TARGET_MACHINE_H
|
|
+#define AMDGPU_TARGET_MACHINE_H
|
|
+
|
|
+#include "AMDGPUFrameLowering.h"
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+#include "AMDILIntrinsicInfo.h"
|
|
+#include "R600ISelLowering.h"
|
|
+#include "llvm/ADT/OwningPtr.h"
|
|
+#include "llvm/DataLayout.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
|
|
+
|
|
+class AMDGPUTargetMachine : public LLVMTargetMachine {
|
|
+
|
|
+ AMDGPUSubtarget Subtarget;
|
|
+ const DataLayout Layout;
|
|
+ AMDGPUFrameLowering FrameLowering;
|
|
+ AMDGPUIntrinsicInfo IntrinsicInfo;
|
|
+ const AMDGPUInstrInfo * InstrInfo;
|
|
+ AMDGPUTargetLowering * TLInfo;
|
|
+ const InstrItineraryData* InstrItins;
|
|
+
|
|
+public:
|
|
+ AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
|
|
+ StringRef CPU,
|
|
+ TargetOptions Options,
|
|
+ Reloc::Model RM, CodeModel::Model CM,
|
|
+ CodeGenOpt::Level OL);
|
|
+ ~AMDGPUTargetMachine();
|
|
+ virtual const AMDGPUFrameLowering* getFrameLowering() const {
|
|
+ return &FrameLowering;
|
|
+ }
|
|
+ virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const {
|
|
+ return &IntrinsicInfo;
|
|
+ }
|
|
+ virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
|
|
+ virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; }
|
|
+ virtual const AMDGPURegisterInfo *getRegisterInfo() const {
|
|
+ return &InstrInfo->getRegisterInfo();
|
|
+ }
|
|
+ virtual AMDGPUTargetLowering * getTargetLowering() const {
|
|
+ return TLInfo;
|
|
+ }
|
|
+ virtual const InstrItineraryData* getInstrItineraryData() const {
|
|
+ return InstrItins;
|
|
+ }
|
|
+ virtual const DataLayout* getDataLayout() const { return &Layout; }
|
|
+ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // AMDGPU_TARGET_MACHINE_H
|
|
diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h
|
|
new file mode 100644
|
|
index 0000000..b39fbdb
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDIL.h
|
|
@@ -0,0 +1,122 @@
|
|
+//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// This file contains the entry points for global functions defined in the LLVM
|
|
+/// AMDGPU back-end.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDIL_H
|
|
+#define AMDIL_H
|
|
+
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/Target/TargetMachine.h"
|
|
+
|
|
+#define ARENA_SEGMENT_RESERVED_UAVS 12
|
|
+#define DEFAULT_ARENA_UAV_ID 8
|
|
+#define DEFAULT_RAW_UAV_ID 7
|
|
+#define GLOBAL_RETURN_RAW_UAV_ID 11
|
|
+#define HW_MAX_NUM_CB 8
|
|
+#define MAX_NUM_UNIQUE_UAVS 8
|
|
+#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
|
|
+#define OPENCL_MAX_READ_IMAGES 128
|
|
+#define OPENCL_MAX_WRITE_IMAGES 8
|
|
+#define OPENCL_MAX_SAMPLERS 16
|
|
+
|
|
+// The next two values can never be zero, as zero is the ID that is
|
|
+// used to assert against.
|
|
+#define DEFAULT_LDS_ID 1
|
|
+#define DEFAULT_GDS_ID 1
|
|
+#define DEFAULT_SCRATCH_ID 1
|
|
+#define DEFAULT_VEC_SLOTS 8
|
|
+
|
|
+#define OCL_DEVICE_RV710 0x0001
|
|
+#define OCL_DEVICE_RV730 0x0002
|
|
+#define OCL_DEVICE_RV770 0x0004
|
|
+#define OCL_DEVICE_CEDAR 0x0008
|
|
+#define OCL_DEVICE_REDWOOD 0x0010
|
|
+#define OCL_DEVICE_JUNIPER 0x0020
|
|
+#define OCL_DEVICE_CYPRESS 0x0040
|
|
+#define OCL_DEVICE_CAICOS 0x0080
|
|
+#define OCL_DEVICE_TURKS 0x0100
|
|
+#define OCL_DEVICE_BARTS 0x0200
|
|
+#define OCL_DEVICE_CAYMAN 0x0400
|
|
+#define OCL_DEVICE_ALL 0x3FFF
|
|
+
|
|
+/// The number of function IDs that are reserved for
|
|
+/// internal compiler usage.
|
|
+const unsigned int RESERVED_FUNCS = 1024;
|
|
+
|
|
+namespace llvm {
|
|
+class AMDGPUInstrPrinter;
|
|
+class FunctionPass;
|
|
+class MCAsmInfo;
|
|
+class raw_ostream;
|
|
+class Target;
|
|
+class TargetMachine;
|
|
+
|
|
+// Instruction selection passes.
|
|
+FunctionPass*
|
|
+ createAMDGPUISelDag(TargetMachine &TM);
|
|
+FunctionPass*
|
|
+ createAMDGPUPeepholeOpt(TargetMachine &TM);
|
|
+
|
|
+// Pre emit passes.
|
|
+FunctionPass*
|
|
+ createAMDGPUCFGPreparationPass(TargetMachine &TM);
|
|
+FunctionPass*
|
|
+ createAMDGPUCFGStructurizerPass(TargetMachine &TM);
|
|
+
|
|
+extern Target TheAMDGPUTarget;
|
|
+} // end namespace llvm;
|
|
+
|
|
+// Include device information enumerations
|
|
+#include "AMDILDeviceInfo.h"
|
|
+
|
|
+namespace llvm {
|
|
+/// OpenCL uses address spaces to differentiate between
|
|
+/// various memory regions on the hardware. On the CPU
|
|
+/// all of the address spaces point to the same memory,
|
|
+/// however on the GPU, each address space points to
|
|
+/// a separate piece of memory that is unique from other
|
|
+/// memory locations.
|
|
+namespace AMDGPUAS {
|
|
+enum AddressSpaces {
|
|
+ PRIVATE_ADDRESS = 0, ///< Address space for private memory.
|
|
+ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
|
|
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory
|
|
+ LOCAL_ADDRESS = 3, ///< Address space for local memory.
|
|
+ REGION_ADDRESS = 4, ///< Address space for region memory.
|
|
+ ADDRESS_NONE = 5, ///< Address space for unknown memory.
|
|
+ PARAM_D_ADDRESS = 6, ///< Address space for direct addressable parameter memory (CONST0)
|
|
+ PARAM_I_ADDRESS = 7, ///< Address space for indirect addressable parameter memory (VTX1)
|
|
+ USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI
|
|
+ CONSTANT_BUFFER_0 = 9,
|
|
+ CONSTANT_BUFFER_1 = 10,
|
|
+ CONSTANT_BUFFER_2 = 11,
|
|
+ CONSTANT_BUFFER_3 = 12,
|
|
+ CONSTANT_BUFFER_4 = 13,
|
|
+ CONSTANT_BUFFER_5 = 14,
|
|
+ CONSTANT_BUFFER_6 = 15,
|
|
+ CONSTANT_BUFFER_7 = 16,
|
|
+ CONSTANT_BUFFER_8 = 17,
|
|
+ CONSTANT_BUFFER_9 = 18,
|
|
+ CONSTANT_BUFFER_10 = 19,
|
|
+ CONSTANT_BUFFER_11 = 20,
|
|
+ CONSTANT_BUFFER_12 = 21,
|
|
+ CONSTANT_BUFFER_13 = 22,
|
|
+ CONSTANT_BUFFER_14 = 23,
|
|
+ CONSTANT_BUFFER_15 = 24,
|
|
+ LAST_ADDRESS = 25
|
|
+};
|
|
+
|
|
+} // namespace AMDGPUAS
|
|
+
|
|
+} // end namespace llvm
|
|
+#endif // AMDIL_H
|
|
diff --git a/lib/Target/R600/AMDIL7XXDevice.cpp b/lib/Target/R600/AMDIL7XXDevice.cpp
|
|
new file mode 100644
|
|
index 0000000..ea6ac34
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDIL7XXDevice.cpp
|
|
@@ -0,0 +1,115 @@
|
|
+//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#include "AMDIL7XXDevice.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+#include "AMDILDevice.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) {
|
|
+ setCaps();
|
|
+ std::string name = mSTM->getDeviceName();
|
|
+ if (name == "rv710") {
|
|
+ DeviceFlag = OCL_DEVICE_RV710;
|
|
+ } else if (name == "rv730") {
|
|
+ DeviceFlag = OCL_DEVICE_RV730;
|
|
+ } else {
|
|
+ DeviceFlag = OCL_DEVICE_RV770;
|
|
+ }
|
|
+}
|
|
+
|
|
+AMDGPU7XXDevice::~AMDGPU7XXDevice() {
|
|
+}
|
|
+
|
|
+void AMDGPU7XXDevice::setCaps() {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::LocalMem);
|
|
+}
|
|
+
|
|
+size_t AMDGPU7XXDevice::getMaxLDSSize() const {
|
|
+ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
|
|
+ return MAX_LDS_SIZE_700;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+size_t AMDGPU7XXDevice::getWavefrontSize() const {
|
|
+ return AMDGPUDevice::HalfWavefrontSize;
|
|
+}
|
|
+
|
|
+uint32_t AMDGPU7XXDevice::getGeneration() const {
|
|
+ return AMDGPUDeviceInfo::HD4XXX;
|
|
+}
|
|
+
|
|
+uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const {
|
|
+ switch (DeviceID) {
|
|
+ default:
|
|
+ assert(0 && "ID type passed in is unknown!");
|
|
+ break;
|
|
+ case GLOBAL_ID:
|
|
+ case CONSTANT_ID:
|
|
+ case RAW_UAV_ID:
|
|
+ case ARENA_UAV_ID:
|
|
+ break;
|
|
+ case LDS_ID:
|
|
+ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
|
|
+ return DEFAULT_LDS_ID;
|
|
+ }
|
|
+ break;
|
|
+ case SCRATCH_ID:
|
|
+ if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
|
|
+ return DEFAULT_SCRATCH_ID;
|
|
+ }
|
|
+ break;
|
|
+ case GDS_ID:
|
|
+ assert(0 && "GDS UAV ID is not supported on this chip");
|
|
+ if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
|
|
+ return DEFAULT_GDS_ID;
|
|
+ }
|
|
+ break;
|
|
+ };
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const {
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) {
|
|
+ setCaps();
|
|
+}
|
|
+
|
|
+AMDGPU770Device::~AMDGPU770Device() {
|
|
+}
|
|
+
|
|
+void AMDGPU770Device::setCaps() {
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::FMA);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
|
|
+ }
|
|
+ mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
|
|
+ mHWBits.reset(AMDGPUDeviceInfo::LongOps);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::LongOps);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::LocalMem);
|
|
+}
|
|
+
|
|
+size_t AMDGPU770Device::getWavefrontSize() const {
|
|
+ return AMDGPUDevice::WavefrontSize;
|
|
+}
|
|
+
|
|
+AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) {
|
|
+}
|
|
+
|
|
+AMDGPU710Device::~AMDGPU710Device() {
|
|
+}
|
|
+
|
|
+size_t AMDGPU710Device::getWavefrontSize() const {
|
|
+ return AMDGPUDevice::QuarterWavefrontSize;
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDIL7XXDevice.h b/lib/Target/R600/AMDIL7XXDevice.h
|
|
new file mode 100644
|
|
index 0000000..1cf4ca4
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDIL7XXDevice.h
|
|
@@ -0,0 +1,72 @@
|
|
+//==-- AMDIL7XXDevice.h - Define 7XX Device for AMDIL ----------*- C++ -*--===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+/// \file
|
|
+/// \brief Interface for the subtarget data classes.
|
|
+///
|
|
+/// This file will define the interface that each generation needs to
|
|
+/// implement in order to correctly answer queries on the capabilities of the
|
|
+/// specific hardware.
|
|
+//===----------------------------------------------------------------------===//
|
|
+#ifndef AMDIL7XXDEVICEIMPL_H
|
|
+#define AMDIL7XXDEVICEIMPL_H
|
|
+#include "AMDILDevice.h"
|
|
+
|
|
+namespace llvm {
|
|
+class AMDGPUSubtarget;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// 7XX generation of devices and their respective sub classes
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+/// \brief The AMDGPU7XXDevice class represents the generic 7XX device.
|
|
+///
|
|
+/// All 7XX devices are derived from this class. The AMDGPU7XX device will only
|
|
+/// support the minimal features that are required to be considered OpenCL 1.0
|
|
+/// compliant and nothing more.
|
|
+class AMDGPU7XXDevice : public AMDGPUDevice {
|
|
+public:
|
|
+ AMDGPU7XXDevice(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPU7XXDevice();
|
|
+ virtual size_t getMaxLDSSize() const;
|
|
+ virtual size_t getWavefrontSize() const;
|
|
+ virtual uint32_t getGeneration() const;
|
|
+ virtual uint32_t getResourceID(uint32_t DeviceID) const;
|
|
+ virtual uint32_t getMaxNumUAVs() const;
|
|
+
|
|
+protected:
|
|
+ virtual void setCaps();
|
|
+};
|
|
+
|
|
+/// \brief The AMDGPU770Device class represents the RV770 chip and its
|
|
+/// derivative cards.
|
|
+///
|
|
+/// The difference between this device and the base class is that this device
|
|
+/// adds support for double precision and has a larger wavefront size.
|
|
+class AMDGPU770Device : public AMDGPU7XXDevice {
|
|
+public:
|
|
+ AMDGPU770Device(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPU770Device();
|
|
+ virtual size_t getWavefrontSize() const;
|
|
+private:
|
|
+ virtual void setCaps();
|
|
+};
|
|
+
|
|
+/// \brief The AMDGPU710Device class derives from the 7XX base class.
|
|
+///
|
|
+/// This class is a smaller derivative, so we need to overload some of the
|
|
+/// functions in order to correctly specify this information.
|
|
+class AMDGPU710Device : public AMDGPU7XXDevice {
|
|
+public:
|
|
+ AMDGPU710Device(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPU710Device();
|
|
+ virtual size_t getWavefrontSize() const;
|
|
+};
|
|
+
|
|
+} // namespace llvm
|
|
+#endif // AMDIL7XXDEVICEIMPL_H
|
|
diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td
|
|
new file mode 100644
|
|
index 0000000..c12cedc
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILBase.td
|
|
@@ -0,0 +1,85 @@
|
|
+//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Target-independent interfaces which we are implementing
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+include "llvm/Target/Target.td"
|
|
+
|
|
+// Dummy Instruction itineraries for pseudo instructions
|
|
+def ALU_NULL : FuncUnit;
|
|
+def NullALU : InstrItinClass;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// AMDIL Subtarget features.
|
|
+//===----------------------------------------------------------------------===//
|
|
+def FeatureFP64 : SubtargetFeature<"fp64",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::DoubleOps]",
|
|
+ "true",
|
|
+ "Enable 64bit double precision operations">;
|
|
+def FeatureByteAddress : SubtargetFeature<"byte_addressable_store",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::ByteStores]",
|
|
+ "true",
|
|
+ "Enable byte addressable stores">;
|
|
+def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]",
|
|
+ "true",
|
|
+ "Enable duplicate barrier detection(HD5XXX or later).">;
|
|
+def FeatureImages : SubtargetFeature<"images",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::Images]",
|
|
+ "true",
|
|
+ "Enable image functions">;
|
|
+def FeatureMultiUAV : SubtargetFeature<"multi_uav",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::MultiUAV]",
|
|
+ "true",
|
|
+ "Generate multiple UAV code(HD5XXX family or later)">;
|
|
+def FeatureMacroDB : SubtargetFeature<"macrodb",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::MacroDB]",
|
|
+ "true",
|
|
+ "Use internal macrodb, instead of macrodb in driver">;
|
|
+def FeatureNoAlias : SubtargetFeature<"noalias",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::NoAlias]",
|
|
+ "true",
|
|
+ "assert that all kernel argument pointers are not aliased">;
|
|
+def FeatureNoInline : SubtargetFeature<"no-inline",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::NoInline]",
|
|
+ "true",
|
|
+ "specify whether to not inline functions">;
|
|
+
|
|
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
|
|
+ "Is64bit",
|
|
+ "false",
|
|
+ "Specify if 64bit addressing should be used.">;
|
|
+
|
|
+def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
|
|
+ "Is32on64bit",
|
|
+ "false",
|
|
+ "Specify if 64bit sized pointers with 32bit addressing should be used.">;
|
|
+def FeatureDebug : SubtargetFeature<"debug",
|
|
+ "CapsOverride[AMDGPUDeviceInfo::Debug]",
|
|
+ "true",
|
|
+ "Debug mode is enabled, so disable hardware accelerated address spaces.">;
|
|
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
|
|
+ "DumpCode",
|
|
+ "true",
|
|
+ "Dump MachineInstrs in the CodeEmitter">;
|
|
+
|
|
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
|
|
+ "R600ALUInst",
|
|
+ "false",
|
|
+ "Older version of ALU instructions encoding.">;
|
|
+
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Register File, Calling Conv, Instruction Descriptions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+include "AMDILRegisterInfo.td"
|
|
+include "AMDILInstrInfo.td"
|
|
+
|
|
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
|
|
new file mode 100644
|
|
index 0000000..568d281
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
|
|
@@ -0,0 +1,3045 @@
|
|
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+
|
|
+#define DEBUGME 0
|
|
+#define DEBUG_TYPE "structcfg"
|
|
+
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "AMDIL.h"
|
|
+#include "llvm/ADT/SCCIterator.h"
|
|
+#include "llvm/ADT/SmallVector.h"
|
|
+#include "llvm/ADT/Statistic.h"
|
|
+#include "llvm/Analysis/DominatorInternals.h"
|
|
+#include "llvm/Analysis/Dominators.h"
|
|
+#include "llvm/CodeGen/MachinePostDominators.h"
|
|
+#include "llvm/CodeGen/MachineDominators.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
|
|
+#include "llvm/CodeGen/MachineLoopInfo.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/Target/TargetInstrInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+// TODO: move-begin.
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// Statistics for CFGStructurizer.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern "
|
|
+ "matched");
|
|
+STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern "
|
|
+ "matched");
|
|
+STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
|
|
+ "pattern matched");
|
|
+STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue "
|
|
+ "pattern matched");
|
|
+STATISTIC(numLoopPatternMatch, "CFGStructurizer number of loop pattern "
|
|
+ "matched");
|
|
+STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
|
|
+STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// Miscellaneous utility for CFGStructurizer.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+namespace llvmCFGStruct {
|
|
+#define SHOWNEWINSTR(i) \
|
|
+ if (DEBUGME) errs() << "New instr: " << *i << "\n"
|
|
+
|
|
+#define SHOWNEWBLK(b, msg) \
|
|
+if (DEBUGME) { \
|
|
+ errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
|
|
+ errs() << "\n"; \
|
|
+}
|
|
+
|
|
+#define SHOWBLK_DETAIL(b, msg) \
|
|
+if (DEBUGME) { \
|
|
+ if (b) { \
|
|
+ errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
|
|
+ b->print(errs()); \
|
|
+ errs() << "\n"; \
|
|
+ } \
|
|
+}
|
|
+
|
|
+#define INVALIDSCCNUM -1
|
|
+#define INVALIDREGNUM 0
|
|
+
|
|
+template<class LoopinfoT>
|
|
+void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
|
|
+ for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
|
|
+ iterEnd = LoopInfo.end();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ (*iter)->print(OS, 0);
|
|
+ }
|
|
+}
|
|
+
|
|
+template<class NodeT>
|
|
+void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
|
|
+ size_t sz = Src.size();
|
|
+ for (size_t i = 0; i < sz/2; ++i) {
|
|
+ NodeT *t = Src[i];
|
|
+ Src[i] = Src[sz - i - 1];
|
|
+ Src[sz - i - 1] = t;
|
|
+ }
|
|
+}
|
|
+
|
|
+} //end namespace llvmCFGStruct
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// supporting data structure for CFGStructurizer
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+namespace llvmCFGStruct {
|
|
+template<class PassT>
|
|
+struct CFGStructTraits {
|
|
+};
|
|
+
|
|
+template <class InstrT>
|
|
+class BlockInformation {
|
|
+public:
|
|
+ bool isRetired;
|
|
+ int sccNum;
|
|
+ //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
|
|
+ //Instructions defining the corresponding successor.
|
|
+ BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
|
|
+};
|
|
+
|
|
+template <class BlockT, class InstrT, class RegiT>
|
|
+class LandInformation {
|
|
+public:
|
|
+ BlockT *landBlk;
|
|
+ std::set<RegiT> breakInitRegs; //Registers that need to "reg = 0", before
|
|
+ //WHILELOOP(thisloop) init before entering
|
|
+ //thisloop.
|
|
+ std::set<RegiT> contInitRegs; //Registers that need to "reg = 0", after
|
|
+ //WHILELOOP(thisloop) init after entering
|
|
+ //thisloop.
|
|
+ std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
|
|
+ //land block, branch cond on this reg.
|
|
+ std::set<RegiT> breakOnRegs; //registers that need to "if (reg) break
|
|
+ //endif" after ENDLOOP(thisloop) break
|
|
+ //outerLoopOf(thisLoop).
|
|
+ std::set<RegiT> contOnRegs; //registers that need to "if (reg) continue
|
|
+ //endif" after ENDLOOP(thisloop) continue on
|
|
+ //outerLoopOf(thisLoop).
|
|
+ LandInformation() : landBlk(NULL) {}
|
|
+};
|
|
+
|
|
+} //end of namespace llvmCFGStruct
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// CFGStructurizer
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+namespace llvmCFGStruct {
|
|
+// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
|
|
+template<class PassT>
|
|
+class CFGStructurizer {
|
|
+public:
|
|
+ typedef enum {
|
|
+ Not_SinglePath = 0,
|
|
+ SinglePath_InPath = 1,
|
|
+ SinglePath_NotInPath = 2
|
|
+ } PathToKind;
|
|
+
|
|
+public:
|
|
+ typedef typename PassT::InstructionType InstrT;
|
|
+ typedef typename PassT::FunctionType FuncT;
|
|
+ typedef typename PassT::DominatortreeType DomTreeT;
|
|
+ typedef typename PassT::PostDominatortreeType PostDomTreeT;
|
|
+ typedef typename PassT::DomTreeNodeType DomTreeNodeT;
|
|
+ typedef typename PassT::LoopinfoType LoopInfoT;
|
|
+
|
|
+ typedef GraphTraits<FuncT *> FuncGTraits;
|
|
+ //typedef FuncGTraits::nodes_iterator BlockIterator;
|
|
+ typedef typename FuncT::iterator BlockIterator;
|
|
+
|
|
+ typedef typename FuncGTraits::NodeType BlockT;
|
|
+ typedef GraphTraits<BlockT *> BlockGTraits;
|
|
+ typedef GraphTraits<Inverse<BlockT *> > InvBlockGTraits;
|
|
+ //typedef BlockGTraits::succ_iterator InstructionIterator;
|
|
+ typedef typename BlockT::iterator InstrIterator;
|
|
+
|
|
+ typedef CFGStructTraits<PassT> CFGTraits;
|
|
+ typedef BlockInformation<InstrT> BlockInfo;
|
|
+ typedef std::map<BlockT *, BlockInfo *> BlockInfoMap;
|
|
+
|
|
+ typedef int RegiT;
|
|
+ typedef typename PassT::LoopType LoopT;
|
|
+ typedef LandInformation<BlockT, InstrT, RegiT> LoopLandInfo;
|
|
+ typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap;
|
|
+ //landing info for loop break
|
|
+ typedef SmallVector<BlockT *, 32> BlockTSmallerVector;
|
|
+
|
|
+public:
|
|
+ CFGStructurizer();
|
|
+ ~CFGStructurizer();
|
|
+
|
|
+ /// Perform the CFG structurization
|
|
+ bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
|
|
+
|
|
+ /// Perform the CFG preparation
|
|
+ bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
|
|
+
|
|
+private:
|
|
+ void reversePredicateSetter(typename BlockT::iterator);
|
|
+ void orderBlocks();
|
|
+ void printOrderedBlocks(llvm::raw_ostream &OS);
|
|
+ int patternMatch(BlockT *CurBlock);
|
|
+ int patternMatchGroup(BlockT *CurBlock);
|
|
+
|
|
+ int serialPatternMatch(BlockT *CurBlock);
|
|
+ int ifPatternMatch(BlockT *CurBlock);
|
|
+ int switchPatternMatch(BlockT *CurBlock);
|
|
+ int loopendPatternMatch(BlockT *CurBlock);
|
|
+ int loopPatternMatch(BlockT *CurBlock);
|
|
+
|
|
+ int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
|
|
+ int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
|
|
+ //int loopWithoutBreak(BlockT *);
|
|
+
|
|
+ void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
|
|
+ BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
|
|
+ void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
|
|
+ BlockT *ContBlock, LoopT *contLoop);
|
|
+ bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
|
|
+ int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
|
|
+ BlockT *FalseBlock);
|
|
+ int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
|
|
+ BlockT *FalseBlock);
|
|
+ int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
|
|
+ BlockT *FalseBlock, BlockT **LandBlockPtr);
|
|
+ void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
|
|
+ BlockT *FalseBlock, BlockT *LandBlock,
|
|
+ bool Detail = false);
|
|
+ PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
|
|
+ bool AllowSideEntry = true);
|
|
+ BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
|
|
+ bool AllowSideEntry = true);
|
|
+ int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
|
|
+ void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
|
|
+
|
|
+ void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
|
|
+ BlockT *TrueBlock, BlockT *FalseBlock,
|
|
+ BlockT *LandBlock);
|
|
+ void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
|
|
+ void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
|
|
+ BlockT *ExitLandBlock, RegiT SetReg);
|
|
+ void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
|
|
+ RegiT SetReg);
|
|
+ BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
|
|
+ std::set<BlockT*> &ExitBlockSet,
|
|
+ BlockT *ExitLandBlk);
|
|
+ BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
|
|
+ BlockTSmallerVector &ExitingBlocks,
|
|
+ BlockTSmallerVector &ExitBlocks);
|
|
+ BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
|
|
+ void removeUnconditionalBranch(BlockT *SrcBlock);
|
|
+ void removeRedundantConditionalBranch(BlockT *SrcBlock);
|
|
+ void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
|
|
+
|
|
+ void removeSuccessor(BlockT *SrcBlock);
|
|
+ BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
|
|
+ BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
|
|
+
|
|
+ void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
|
|
+ InstrIterator InsertPos);
|
|
+
|
|
+ void recordSccnum(BlockT *SrcBlock, int SCCNum);
|
|
+ int getSCCNum(BlockT *srcBlk);
|
|
+
|
|
+ void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
|
|
+ bool isRetiredBlock(BlockT *SrcBlock);
|
|
+ bool isActiveLoophead(BlockT *CurBlock);
|
|
+ bool needMigrateBlock(BlockT *Block);
|
|
+
|
|
+ BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
|
|
+ BlockTSmallerVector &exitBlocks,
|
|
+ std::set<BlockT*> &ExitBlockSet);
|
|
+ void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
|
|
+ BlockT *getLoopLandBlock(LoopT *LoopRep);
|
|
+ LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
|
|
+
|
|
+ void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
|
|
+ void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
|
|
+ void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
|
|
+ void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
|
|
+ void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
|
|
+
|
|
+ bool hasBackEdge(BlockT *curBlock);
|
|
+ unsigned getLoopDepth (LoopT *LoopRep);
|
|
+ int countActiveBlock(
|
|
+ typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
|
|
+ typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
|
|
+ BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
|
|
+ BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
|
|
+
|
|
+private:
|
|
+ DomTreeT *domTree;
|
|
+ PostDomTreeT *postDomTree;
|
|
+ LoopInfoT *loopInfo;
|
|
+ PassT *passRep;
|
|
+ FuncT *funcRep;
|
|
+
|
|
+ BlockInfoMap blockInfoMap;
|
|
+ LoopLandInfoMap loopLandInfoMap;
|
|
+ SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
|
|
+ const AMDGPURegisterInfo *TRI;
|
|
+
|
|
+}; //template class CFGStructurizer
|
|
+
|
|
+template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
|
|
+ : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
|
|
+}
|
|
+
|
|
+template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
|
|
+ for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
|
|
+ E = blockInfoMap.end(); I != E; ++I) {
|
|
+ delete I->second;
|
|
+ }
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
|
|
+ const AMDGPURegisterInfo * tri) {
|
|
+ passRep = &pass;
|
|
+ funcRep = &func;
|
|
+ TRI = tri;
|
|
+
|
|
+ bool changed = false;
|
|
+
|
|
+ //FIXME: if not reducible flow graph, make it so ???
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "AMDGPUCFGStructurizer::prepare\n";
|
|
+ }
|
|
+
|
|
+ loopInfo = CFGTraits::getLoopInfo(pass);
|
|
+ if (DEBUGME) {
|
|
+ errs() << "LoopInfo:\n";
|
|
+ PrintLoopinfo(*loopInfo, errs());
|
|
+ }
|
|
+
|
|
+ orderBlocks();
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Ordered blocks:\n";
|
|
+ printOrderedBlocks(errs());
|
|
+ }
|
|
+
|
|
+ SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
|
|
+
|
|
+ for (typename LoopInfoT::iterator iter = loopInfo->begin(),
|
|
+ iterEnd = loopInfo->end();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ LoopT* loopRep = (*iter);
|
|
+ BlockTSmallerVector exitingBlks;
|
|
+ loopRep->getExitingBlocks(exitingBlks);
|
|
+
|
|
+ if (exitingBlks.size() == 0) {
|
|
+ BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
|
|
+ if (dummyExitBlk != NULL)
|
|
+ retBlks.push_back(dummyExitBlk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // Remove unconditional branch instr.
|
|
+ // Add dummy exit block iff there are multiple returns.
|
|
+
|
|
+ for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
|
|
+ iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
|
|
+ iterBlk != iterEndBlk;
|
|
+ ++iterBlk) {
|
|
+ BlockT *curBlk = *iterBlk;
|
|
+ removeUnconditionalBranch(curBlk);
|
|
+ removeRedundantConditionalBranch(curBlk);
|
|
+ if (CFGTraits::isReturnBlock(curBlk)) {
|
|
+ retBlks.push_back(curBlk);
|
|
+ }
|
|
+ assert(curBlk->succ_size() <= 2);
|
|
+ } //for
|
|
+
|
|
+ if (retBlks.size() >= 2) {
|
|
+ addDummyExitBlock(retBlks);
|
|
+ changed = true;
|
|
+ }
|
|
+
|
|
+ return changed;
|
|
+} //CFGStructurizer::prepare
|
|
+
|
|
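+// Structurize the function: repeatedly run patternMatch() over the blocks in
+// SCC order, re-scanning an SCC while it keeps shrinking, until the CFG is
+// reduced to a single block. If no progress can be made the CFG is treated
+// as irreducible and the pass asserts.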
+template<class PassT>
|
|
+bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
|
|
+ const AMDGPURegisterInfo * tri) {
|
|
+ passRep = &pass;
|
|
+ funcRep = &func;
|
|
+ TRI = tri;
|
|
+
|
|
+ //Assume reducible CFG...
|
|
+ if (DEBUGME) {
|
|
+ errs() << "AMDGPUCFGStructurizer::run\n";
|
|
+ func.viewCFG();
|
|
+ }
|
|
+
|
|
+ domTree = CFGTraits::getDominatorTree(pass);
|
|
+ if (DEBUGME) {
|
|
+ domTree->print(errs(), (const llvm::Module*)0);
|
|
+ }
|
|
+
|
|
+ postDomTree = CFGTraits::getPostDominatorTree(pass);
|
|
+ if (DEBUGME) {
|
|
+ postDomTree->print(errs());
|
|
+ }
|
|
+
|
|
+ loopInfo = CFGTraits::getLoopInfo(pass);
|
|
+ if (DEBUGME) {
|
|
+ errs() << "LoopInfo:\n";
|
|
+ PrintLoopinfo(*loopInfo, errs());
|
|
+ }
|
|
+
|
|
+ orderBlocks();
|
|
+#ifdef STRESSTEST
|
|
+ //Use the worst block ordering to test the algorithm.
|
|
+ ReverseVector(orderedBlks);
|
|
+#endif
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Ordered blocks:\n";
|
|
+ printOrderedBlocks(errs());
|
|
+ }
|
|
+ int numIter = 0;
|
|
+ bool finish = false;
|
|
+ BlockT *curBlk;
|
|
+ bool makeProgress = false;
|
|
+ int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
|
|
+ orderedBlks.end());
|
|
+
|
|
+ do {
|
|
+ ++numIter;
|
|
+ if (DEBUGME) {
|
|
+ errs() << "numIter = " << numIter
|
|
+ << ", numRemaintedBlk = " << numRemainedBlk << "\n";
|
|
+ }
|
|
+
|
|
+ typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
|
|
+ iterBlk = orderedBlks.begin();
|
|
+ typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
|
|
+ iterBlkEnd = orderedBlks.end();
|
|
+
|
|
+ typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
|
|
+ sccBeginIter = iterBlk;
|
|
+ BlockT *sccBeginBlk = NULL;
|
|
+ int sccNumBlk = 0; // The number of active blocks, init to a
|
|
+ // maximum possible number.
|
|
+ int sccNumIter; // Number of iteration in this SCC.
|
|
+
|
|
+ while (iterBlk != iterBlkEnd) {
|
|
+ curBlk = *iterBlk;
|
|
+
|
|
+ if (sccBeginBlk == NULL) {
|
|
+ sccBeginIter = iterBlk;
|
|
+ sccBeginBlk = curBlk;
|
|
+ sccNumIter = 0;
|
|
+ sccNumBlk = numRemainedBlk; // Init to maximum possible number.
|
|
+ if (DEBUGME) {
|
|
+ errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
|
|
+ errs() << "\n";
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!isRetiredBlock(curBlk)) {
|
|
+ patternMatch(curBlk);
|
|
+ }
|
|
+
|
|
+ ++iterBlk;
|
|
+
|
|
+ bool contNextScc = true;
|
|
+ if (iterBlk == iterBlkEnd
|
|
+ || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
|
|
+ // Just finished one SCC.
|
|
+ ++sccNumIter;
|
|
+ int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
|
|
+ if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Can't reduce SCC " << getSCCNum(curBlk)
|
|
+ << ", sccNumIter = " << sccNumIter;
|
|
+ errs() << "doesn't make any progress\n";
|
|
+ }
|
|
+ contNextScc = true;
|
|
+ } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
|
|
+ sccNumBlk = sccRemainedNumBlk;
|
|
+ iterBlk = sccBeginIter;
|
|
+ contNextScc = false;
|
|
+ if (DEBUGME) {
|
|
+ errs() << "repeat processing SCC" << getSCCNum(curBlk)
|
|
+ << "sccNumIter = " << sccNumIter << "\n";
|
|
+ func.viewCFG();
|
|
+ }
|
|
+ } else {
|
|
+ // Finish the current scc.
|
|
+ contNextScc = true;
|
|
+ }
|
|
+ } else {
|
|
+ // Continue on next component in the current scc.
|
|
+ contNextScc = false;
|
|
+ }
|
|
+
|
|
+ if (contNextScc) {
|
|
+ sccBeginBlk = NULL;
|
|
+ }
|
|
+ } //while, "one iteration" over the function.
|
|
+
|
|
+ BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
|
|
+ if (entryBlk->succ_size() == 0) {
|
|
+ finish = true;
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Reduce to one block\n";
|
|
+ }
|
|
+ } else {
|
|
+ int newnumRemainedBlk
|
|
+ = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
|
|
+ // consider cloned blocks ??
|
|
+ if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
|
|
+ makeProgress = true;
|
|
+ numRemainedBlk = newnumRemainedBlk;
|
|
+ } else {
|
|
+ makeProgress = false;
|
|
+ if (DEBUGME) {
|
|
+ errs() << "No progress\n";
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ } while (!finish && makeProgress);
|
|
+
|
|
+ // Misc wrap up to maintain the consistency of the Function representation.
|
|
+ CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
|
|
+
|
|
+ // Detach retired Block, release memory.
|
|
+ for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
|
|
+ iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
|
|
+ if ((*iterMap).second && (*iterMap).second->isRetired) {
|
|
+ assert(((*iterMap).first)->getNumber() != -1);
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
|
|
+ }
|
|
+ (*iterMap).first->eraseFromParent(); //Remove from the parent Function.
|
|
+ }
|
|
+ delete (*iterMap).second;
|
|
+ }
|
|
+ blockInfoMap.clear();
|
|
+
|
|
+ // clear loopLandInfoMap
|
|
+ for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
|
|
+ iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
|
|
+ delete (*iterMap).second;
|
|
+ }
|
|
+ loopLandInfoMap.clear();
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ func.viewCFG();
|
|
+ }
|
|
+
|
|
+ if (!finish) {
|
|
+ assert(!"IRREDUCIBL_CF");
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+} //CFGStructurizer::run
|
|
+
|
|
+/// Print the ordered Blocks.
|
|
+///
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
|
|
+ size_t i = 0;
|
|
+ for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
|
|
+ iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
|
|
+ iterBlk != iterBlkEnd;
|
|
+ ++iterBlk, ++i) {
|
|
+ os << "BB" << (*iterBlk)->getNumber();
|
|
+ os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
|
|
+ if (i != 0 && i % 10 == 0) {
|
|
+ os << "\n";
|
|
+ } else {
|
|
+ os << " ";
|
|
+ }
|
|
+ }
|
|
+} //printOrderedBlocks
|
|
+
|
|
+/// Compute the reversed DFS post order of Blocks
|
|
+///
|
|
+template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
|
|
+ int sccNum = 0;
|
|
+ BlockT *bb;
|
|
+ for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
|
|
+ sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
|
|
+ std::vector<BlockT *> &sccNext = *sccIter;
|
|
+ for (typename std::vector<BlockT *>::const_iterator
|
|
+ blockIter = sccNext.begin(), blockEnd = sccNext.end();
|
|
+ blockIter != blockEnd; ++blockIter) {
|
|
+ bb = *blockIter;
|
|
+ orderedBlks.push_back(bb);
|
|
+ recordSccnum(bb, sccNum);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ //walk through all the blocks in the function to check for unreachable ones
|
|
+ for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
|
|
+ blockEnd1 = FuncGTraits::nodes_end(funcRep);
|
|
+ blockIter1 != blockEnd1; ++blockIter1) {
|
|
+ BlockT *bb = &(*blockIter1);
|
|
+ sccNum = getSCCNum(bb);
|
|
+ if (sccNum == INVALIDSCCNUM) {
|
|
+ errs() << "unreachable block BB" << bb->getNumber() << "\n";
|
|
+ }
|
|
+ }
|
|
+} //orderBlocks
|
|
+
|
|
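+// Repeatedly apply patternMatchGroup() to curBlk until no further pattern
+// matches; returns the total number of patterns merged.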
+template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
|
|
+ int numMatch = 0;
|
|
+ int curMatch;
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
|
|
+ }
|
|
+
|
|
+ while ((curMatch = patternMatchGroup(curBlk)) > 0) {
|
|
+ numMatch += curMatch;
|
|
+ }
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "End patternMatch BB" << curBlk->getNumber()
|
|
+ << ", numMatch = " << numMatch << "\n";
|
|
+ }
|
|
+
|
|
+ return numMatch;
|
|
+} //patternMatch
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
|
|
+ int numMatch = 0;
|
|
+ numMatch += serialPatternMatch(curBlk);
|
|
+ numMatch += ifPatternMatch(curBlk);
|
|
+ numMatch += loopendPatternMatch(curBlk);
|
|
+ numMatch += loopPatternMatch(curBlk);
|
|
+ return numMatch;
|
|
+}//patternMatchGroup
|
|
+
|
|
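+// Merge curBlk with its single successor when that successor has exactly one
+// predecessor and is not the header of a still-active loop.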
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
|
|
+ if (curBlk->succ_size() != 1) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ BlockT *childBlk = *curBlk->succ_begin();
|
|
+ if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ mergeSerialBlock(curBlk, childBlk);
|
|
+ ++numSerialPatternMatch;
|
|
+ return 1;
|
|
+} //serialPatternMatch
|
|
+
|
|
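+// Recognize an if / if-else region rooted at a block with two successors and
+// no back edge, and fold the true/false blocks into curBlk, cloning blocks
+// with extra predecessors where necessary; returns the number of blocks
+// merged or cloned.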
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
|
|
+ //two edges
|
|
+ if (curBlk->succ_size() != 2) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (hasBackEdge(curBlk)) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
|
|
+ if (branchInstr == NULL) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ assert(CFGTraits::isCondBranch(branchInstr));
|
|
+
|
|
+ BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
|
|
+ BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
|
|
+ BlockT *landBlk;
|
|
+ int cloned = 0;
|
|
+
|
|
+ // TODO: Simplify
|
|
+ if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
|
|
+ && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
|
|
+ landBlk = *trueBlk->succ_begin();
|
|
+ } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
|
|
+ landBlk = NULL;
|
|
+ } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
|
|
+ landBlk = falseBlk;
|
|
+ falseBlk = NULL;
|
|
+ } else if (falseBlk->succ_size() == 1
|
|
+ && *falseBlk->succ_begin() == trueBlk) {
|
|
+ landBlk = trueBlk;
|
|
+ trueBlk = NULL;
|
|
+ } else if (falseBlk->succ_size() == 1
|
|
+ && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
|
|
+ landBlk = *falseBlk->succ_begin();
|
|
+ } else if (trueBlk->succ_size() == 1
|
|
+ && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
|
|
+ landBlk = *trueBlk->succ_begin();
|
|
+ } else {
|
|
+ return handleJumpintoIf(curBlk, trueBlk, falseBlk);
|
|
+ }
|
|
+
|
|
+ // improveSimpleJumpintoIf can handle the case where landBlk == NULL but the
|
|
+ // new BB created for landBlk==NULL may introduce new challenges to the
|
|
+ // reduction process.
|
|
+ if (landBlk != NULL &&
|
|
+ ((trueBlk && trueBlk->pred_size() > 1)
|
|
+ || (falseBlk && falseBlk->pred_size() > 1))) {
|
|
+ cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
|
|
+ }
|
|
+
|
|
+ if (trueBlk && trueBlk->pred_size() > 1) {
|
|
+ trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
|
|
+ ++cloned;
|
|
+ }
|
|
+
|
|
+ if (falseBlk && falseBlk->pred_size() > 1) {
|
|
+ falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
|
|
+ ++cloned;
|
|
+ }
|
|
+
|
|
+ mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
|
|
+
|
|
+ ++numIfPatternMatch;
|
|
+
|
|
+ numClonedBlock += cloned;
|
|
+
|
|
+ return 1 + cloned;
|
|
+} //ifPatternMatch
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
|
|
+ return 0;
|
|
+} //switchPatternMatch
|
|
+
|
|
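+// Process the loops containing curBlk from outermost to innermost, matching
+// their break and continue patterns; loops that already have a landing block
+// recorded are skipped.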
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
|
|
+ LoopT *loopRep = loopInfo->getLoopFor(curBlk);
|
|
+ typename std::vector<LoopT *> nestedLoops;
|
|
+ while (loopRep) {
|
|
+ nestedLoops.push_back(loopRep);
|
|
+ loopRep = loopRep->getParentLoop();
|
|
+ }
|
|
+
|
|
+ if (nestedLoops.size() == 0) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ // Process nested loops outside->inside, so a "continue" to an outer loop won't
|
|
+ // be mistaken for a "break" of the current loop.
|
|
+ int num = 0;
|
|
+ for (typename std::vector<LoopT *>::reverse_iterator
|
|
+ iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ loopRep = *iter;
|
|
+
|
|
+ if (getLoopLandBlock(loopRep) != NULL) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ BlockT *loopHeader = loopRep->getHeader();
|
|
+
|
|
+ int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
|
|
+
|
|
+ if (numBreak == -1) {
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ int numCont = loopcontPatternMatch(loopRep, loopHeader);
|
|
+ num += numBreak + numCont;
|
|
+ }
|
|
+
|
|
+ return num;
|
|
+} //loopendPatternMatch
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
|
|
+ if (curBlk->succ_size() != 0) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ int numLoop = 0;
|
|
+ LoopT *loopRep = loopInfo->getLoopFor(curBlk);
|
|
+ while (loopRep && loopRep->getHeader() == curBlk) {
|
|
+ LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
|
|
+ if (loopLand) {
|
|
+ BlockT *landBlk = loopLand->landBlk;
|
|
+ assert(landBlk);
|
|
+ if (!isRetiredBlock(landBlk)) {
|
|
+ mergeLooplandBlock(curBlk, loopLand);
|
|
+ ++numLoop;
|
|
+ }
|
|
+ }
|
|
+ loopRep = loopRep->getParentLoop();
|
|
+ }
|
|
+
|
|
+ numLoopPatternMatch += numLoop;
|
|
+
|
|
+ return numLoop;
|
|
+} //loopPatternMatch
|
|
+
|
|
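+// Find (or synthesize) a single landing block for all exits of loopRep and
+// rewrite each exiting block as a structured break; returns the number of
+// blocks handled (breaks, serial merges and clones), or -1 if the exits
+// cannot be structured yet.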
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
|
|
+ BlockT *loopHeader) {
|
|
+ BlockTSmallerVector exitingBlks;
|
|
+ loopRep->getExitingBlocks(exitingBlks);
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
|
|
+ }
|
|
+
|
|
+ if (exitingBlks.size() == 0) {
|
|
+ setLoopLandBlock(loopRep);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ // Compute the corresponding exitBlks and exit block set.
|
|
+ BlockTSmallerVector exitBlks;
|
|
+ std::set<BlockT *> exitBlkSet;
|
|
+ for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
|
|
+ iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
|
|
+ BlockT *exitingBlk = *iter;
|
|
+ BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
|
|
+ exitBlks.push_back(exitBlk);
|
|
+ exitBlkSet.insert(exitBlk); //non-duplicate insert
|
|
+ }
|
|
+
|
|
+ assert(exitBlkSet.size() > 0);
|
|
+ assert(exitBlks.size() == exitingBlks.size());
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
|
|
+ }
|
|
+
|
|
+ // Find exitLandBlk.
|
|
+ BlockT *exitLandBlk = NULL;
|
|
+ int numCloned = 0;
|
|
+ int numSerial = 0;
|
|
+
|
|
+ if (exitBlkSet.size() == 1) {
|
|
+ exitLandBlk = *exitBlkSet.begin();
|
|
+ } else {
|
|
+ exitLandBlk = findNearestCommonPostDom(exitBlkSet);
|
|
+
|
|
+ if (exitLandBlk == NULL) {
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ bool allInPath = true;
|
|
+ bool allNotInPath = true;
|
|
+ for (typename std::set<BlockT*>::const_iterator
|
|
+ iter = exitBlkSet.begin(),
|
|
+ iterEnd = exitBlkSet.end();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ BlockT *exitBlk = *iter;
|
|
+
|
|
+ PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
|
|
+ if (DEBUGME) {
|
|
+ errs() << "BB" << exitBlk->getNumber()
|
|
+ << " to BB" << exitLandBlk->getNumber() << " PathToKind="
|
|
+ << pathKind << "\n";
|
|
+ }
|
|
+
|
|
+ allInPath = allInPath && (pathKind == SinglePath_InPath);
|
|
+ allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
|
|
+
|
|
+ if (!allInPath && !allNotInPath) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "singlePath check fail\n";
|
|
+ }
|
|
+ return -1;
|
|
+ }
|
|
+ } // check all exit blocks
|
|
+
|
|
+ if (allNotInPath) {
|
|
+
|
|
+ // TODO: Simplify, maybe separate function?
|
|
+ LoopT *parentLoopRep = loopRep->getParentLoop();
|
|
+ BlockT *parentLoopHeader = NULL;
|
|
+ if (parentLoopRep)
|
|
+ parentLoopHeader = parentLoopRep->getHeader();
|
|
+
|
|
+ if (exitLandBlk == parentLoopHeader &&
|
|
+ (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
|
|
+ loopRep,
|
|
+ exitBlkSet,
|
|
+ exitLandBlk)) != NULL) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "relocateLoopcontBlock success\n";
|
|
+ }
|
|
+ } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
|
|
+ exitingBlks,
|
|
+ exitBlks)) != NULL) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "insertEndbranchBlock success\n";
|
|
+ }
|
|
+ } else {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "loop exit fail\n";
|
|
+ }
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // Handle side entry to exit path.
|
|
+ exitBlks.clear();
|
|
+ exitBlkSet.clear();
|
|
+ for (typename BlockTSmallerVector::iterator iterExiting =
|
|
+ exitingBlks.begin(),
|
|
+ iterExitingEnd = exitingBlks.end();
|
|
+ iterExiting != iterExitingEnd; ++iterExiting) {
|
|
+ BlockT *exitingBlk = *iterExiting;
|
|
+ BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
|
|
+ BlockT *newExitBlk = exitBlk;
|
|
+
|
|
+ if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
|
|
+ newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
|
|
+ ++numCloned;
|
|
+ }
|
|
+
|
|
+ numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
|
|
+
|
|
+ exitBlks.push_back(newExitBlk);
|
|
+ exitBlkSet.insert(newExitBlk);
|
|
+ }
|
|
+
|
|
+ for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
|
|
+ iterExitEnd = exitBlks.end();
|
|
+ iterExit != iterExitEnd; ++iterExit) {
|
|
+ BlockT *exitBlk = *iterExit;
|
|
+ numSerial += serialPatternMatch(exitBlk);
|
|
+ }
|
|
+
|
|
+ for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
|
|
+ iterExitEnd = exitBlks.end();
|
|
+ iterExit != iterExitEnd; ++iterExit) {
|
|
+ BlockT *exitBlk = *iterExit;
|
|
+ if (exitBlk->pred_size() > 1) {
|
|
+ if (exitBlk != exitLandBlk) {
|
|
+ return -1;
|
|
+ }
|
|
+ } else {
|
|
+ if (exitBlk != exitLandBlk &&
|
|
+ (exitBlk->succ_size() != 1 ||
|
|
+ *exitBlk->succ_begin() != exitLandBlk)) {
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ } // else
|
|
+
|
|
+ exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
|
|
+
|
|
+ // Fold the break into the breaking block; this also handles breaks that cross loop levels.
|
|
+ assert(exitingBlks.size() == exitBlks.size());
|
|
+ for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
|
|
+ iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
|
|
+ iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
|
|
+ BlockT *exitBlk = *iterExit;
|
|
+ BlockT *exitingBlk = *iterExiting;
|
|
+ assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
|
|
+ LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
|
|
+ handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
|
|
+ }
|
|
+
|
|
+ int numBreak = static_cast<int>(exitingBlks.size());
|
|
+ numLoopbreakPatternMatch += numBreak;
|
|
+ numClonedBlock += numCloned;
|
|
+ return numBreak + numSerial + numCloned;
|
|
+} //loopbreakPatternMatch
|
|
+
|
|
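+// Turn each back edge into loopHeader (predecessors inside loopRep) into
+// structured continue code and then remove the edge to the header.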
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
|
|
+ BlockT *loopHeader) {
|
|
+ int numCont = 0;
|
|
+ SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
|
|
+ for (typename InvBlockGTraits::ChildIteratorType iter =
|
|
+ InvBlockGTraits::child_begin(loopHeader),
|
|
+ iterEnd = InvBlockGTraits::child_end(loopHeader);
|
|
+ iter != iterEnd; ++iter) {
|
|
+ BlockT *curBlk = *iter;
|
|
+ if (loopRep->contains(curBlk)) {
|
|
+ handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
|
|
+ loopHeader, loopRep);
|
|
+ contBlk.push_back(curBlk);
|
|
+ ++numCont;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
|
|
+ iter = contBlk.begin(), iterEnd = contBlk.end();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ (*iter)->removeSuccessor(loopHeader);
|
|
+ }
|
|
+
|
|
+ numLoopcontPatternMatch += numCont;
|
|
+
|
|
+ return numCont;
|
|
+} //loopcontPatternMatch
|
|
+
|
|
+
|
|
+template<class PassT>
|
|
+bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
|
|
+ BlockT *src2Blk) {
|
|
+ // Return true iff src1Blk->succ_size() == 0 and src1Blk and src2Blk are in
|
|
+ // the same loop that already has a LoopLandInfo record. This recovers the
|
|
+ // break/continue information without explicitly tracking loopContBlks and loopBreakBlks.
|
|
+ //
|
|
+ if (src1Blk->succ_size() == 0) {
|
|
+ LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
|
|
+ if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+ if (theEntry != NULL) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "isLoopContBreakBlock yes src1 = BB"
|
|
+ << src1Blk->getNumber()
|
|
+ << " src2 = BB" << src2Blk->getNumber() << "\n";
|
|
+ }
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+} //isSameloopDetachedContbreak
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
|
|
+ BlockT *trueBlk,
|
|
+ BlockT *falseBlk) {
|
|
+ int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
|
|
+ if (num == 0) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
|
|
+ }
|
|
+ num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
|
|
+ }
|
|
+ return num;
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
|
|
+ BlockT *trueBlk,
|
|
+ BlockT *falseBlk) {
|
|
+ int num = 0;
|
|
+ BlockT *downBlk;
|
|
+
|
|
+ //trueBlk could be the common post dominator
|
|
+ downBlk = trueBlk;
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
|
|
+ << " true = BB" << trueBlk->getNumber()
|
|
+ << ", numSucc=" << trueBlk->succ_size()
|
|
+ << " false = BB" << falseBlk->getNumber() << "\n";
|
|
+ }
|
|
+
|
|
+ while (downBlk) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "check down = BB" << downBlk->getNumber();
|
|
+ }
|
|
+
|
|
+ if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << " working\n";
|
|
+ }
|
|
+
|
|
+ num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
|
|
+ num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
|
|
+
|
|
+ numClonedBlock += num;
|
|
+ num += serialPatternMatch(*headBlk->succ_begin());
|
|
+ num += serialPatternMatch(*(++headBlk->succ_begin()));
|
|
+ num += ifPatternMatch(headBlk);
|
|
+ assert(num > 0);
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ if (DEBUGME) {
|
|
+ errs() << " not working\n";
|
|
+ }
|
|
+ downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
|
|
+ } // walk down the postDomTree
|
|
+
|
|
+ return num;
|
|
+} //handleJumpintoIfImp
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
|
|
+ BlockT *trueBlk,
|
|
+ BlockT *falseBlk,
|
|
+ BlockT *landBlk,
|
|
+ bool detail) {
|
|
+ errs() << "head = BB" << headBlk->getNumber()
|
|
+ << " size = " << headBlk->size();
|
|
+ if (detail) {
|
|
+ errs() << "\n";
|
|
+ headBlk->print(errs());
|
|
+ errs() << "\n";
|
|
+ }
|
|
+
|
|
+ if (trueBlk) {
|
|
+ errs() << ", true = BB" << trueBlk->getNumber() << " size = "
|
|
+ << trueBlk->size() << " numPred = " << trueBlk->pred_size();
|
|
+ if (detail) {
|
|
+ errs() << "\n";
|
|
+ trueBlk->print(errs());
|
|
+ errs() << "\n";
|
|
+ }
|
|
+ }
|
|
+ if (falseBlk) {
|
|
+ errs() << ", false = BB" << falseBlk->getNumber() << " size = "
|
|
+ << falseBlk->size() << " numPred = " << falseBlk->pred_size();
|
|
+ if (detail) {
|
|
+ errs() << "\n";
|
|
+ falseBlk->print(errs());
|
|
+ errs() << "\n";
|
|
+ }
|
|
+ }
|
|
+ if (landBlk) {
|
|
+ errs() << ", land = BB" << landBlk->getNumber() << " size = "
|
|
+ << landBlk->size() << " numPred = " << landBlk->pred_size();
|
|
+ if (detail) {
|
|
+ errs() << "\n";
|
|
+ landBlk->print(errs());
|
|
+ errs() << "\n";
|
|
+ }
|
|
+ }
|
|
+
|
|
+ errs() << "\n";
|
|
+} //showImproveSimpleJumpintoIf
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
|
|
+ BlockT *trueBlk,
|
|
+ BlockT *falseBlk,
|
|
+ BlockT **plandBlk) {
|
|
+ bool migrateTrue = false;
|
|
+ bool migrateFalse = false;
|
|
+
|
|
+ BlockT *landBlk = *plandBlk;
|
|
+
|
|
+ assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
|
|
+ && (falseBlk == NULL || falseBlk->succ_size() <= 1));
|
|
+
|
|
+ if (trueBlk == falseBlk) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ migrateTrue = needMigrateBlock(trueBlk);
|
|
+ migrateFalse = needMigrateBlock(falseBlk);
|
|
+
|
|
+ if (!migrateTrue && !migrateFalse) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ // If we need to migrate either trueBlk or falseBlk, also migrate the other
|
|
+ // one if it has more than one predecessor. Without doing this, a predecessor
|
|
+ // other than headBlk could leave an undefined value in initReg.
|
|
+ if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
|
|
+ migrateTrue = true;
|
|
+ }
|
|
+ if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
|
|
+ migrateFalse = true;
|
|
+ }
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "before improveSimpleJumpintoIf: ";
|
|
+ showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
|
|
+ }
|
|
+
|
|
+ // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
|
|
+ //
|
|
+ // new: headBlk => if () {initReg = 1; org trueBlk branch} else
|
|
+ // {initReg = 0; org falseBlk branch }
|
|
+ // => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
|
|
+ // => org landBlk
|
|
+ // if landBlk->pred_size() > 2, put the above if-else inside
|
|
+ // if (initReg !=2) {...}
|
|
+ //
|
|
+ // add initReg = initVal to headBlk
|
|
+
|
|
+ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
|
|
+ unsigned initReg =
|
|
+ funcRep->getRegInfo().createVirtualRegister(I32RC);
|
|
+ if (!migrateTrue || !migrateFalse) {
|
|
+ int initVal = migrateTrue ? 0 : 1;
|
|
+ CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
|
|
+ }
|
|
+
|
|
+ int numNewBlk = 0;
|
|
+
|
|
+ if (landBlk == NULL) {
|
|
+ landBlk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(landBlk); //insert to function
|
|
+
|
|
+ if (trueBlk) {
|
|
+ trueBlk->addSuccessor(landBlk);
|
|
+ } else {
|
|
+ headBlk->addSuccessor(landBlk);
|
|
+ }
|
|
+
|
|
+ if (falseBlk) {
|
|
+ falseBlk->addSuccessor(landBlk);
|
|
+ } else {
|
|
+ headBlk->addSuccessor(landBlk);
|
|
+ }
|
|
+
|
|
+ numNewBlk ++;
|
|
+ }
|
|
+
|
|
+ bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
|
|
+
|
|
+ //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
|
|
+ typename BlockT::iterator insertPos =
|
|
+ CFGTraits::getInstrPos
|
|
+ (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
|
|
+
|
|
+ if (landBlkHasOtherPred) {
|
|
+ unsigned immReg =
|
|
+ funcRep->getRegInfo().createVirtualRegister(I32RC);
|
|
+ CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
|
|
+ unsigned cmpResReg =
|
|
+ funcRep->getRegInfo().createVirtualRegister(I32RC);
|
|
+
|
|
+ CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
|
|
+ initReg, immReg);
|
|
+ CFGTraits::insertCondBranchBefore(landBlk, insertPos,
|
|
+ AMDGPU::IF_PREDICATE_SET, passRep,
|
|
+ cmpResReg, DebugLoc());
|
|
+ }
|
|
+
|
|
+ CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
|
|
+ passRep, initReg, DebugLoc());
|
|
+
|
|
+ if (migrateTrue) {
|
|
+ migrateInstruction(trueBlk, landBlk, insertPos);
|
|
+ // need to unconditionally insert the assignment to ensure a path from its
|
|
+ // predecessor rather than headBlk has valid value in initReg if
|
|
+ // (initVal != 1).
|
|
+ CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
|
|
+ }
|
|
+ CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
|
|
+
|
|
+ if (migrateFalse) {
|
|
+ migrateInstruction(falseBlk, landBlk, insertPos);
|
|
+ // need to unconditionally insert the assignment to ensure a path from its
|
|
+ // predecessor rather than headBlk has valid value in initReg if
|
|
+ // (initVal != 0)
|
|
+ CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
|
|
+ }
|
|
+
|
|
+ if (landBlkHasOtherPred) {
|
|
+ // add endif
|
|
+ CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
|
|
+
|
|
+ // put initReg = 2 to other predecessors of landBlk
|
|
+ for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
|
|
+ predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
|
|
+ ++predIter) {
|
|
+ BlockT *curBlk = *predIter;
|
|
+ if (curBlk != trueBlk && curBlk != falseBlk) {
|
|
+ CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
|
|
+ }
|
|
+ } //for
|
|
+ }
|
|
+ if (DEBUGME) {
|
|
+ errs() << "result from improveSimpleJumpintoIf: ";
|
|
+ showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
|
|
+ }
|
|
+
|
|
+ // update landBlk
|
|
+ *plandBlk = landBlk;
|
|
+
|
|
+ return numNewBlk;
|
|
+} //improveSimpleJumpintoIf
|
|
+
|
|
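+// Emit the code for one loop break: if the break leaves more than one loop
+// level, allocate a flag register, initialize it for the target loop and let
+// every intervening loop break on it, then rewrite the exiting block via
+// mergeLoopbreakBlock().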
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
|
|
+ LoopT *exitingLoop,
|
|
+ BlockT *exitBlk,
|
|
+ LoopT *exitLoop,
|
|
+ BlockT *landBlk) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
|
|
+ << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
|
|
+ }
|
|
+ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
|
|
+
|
|
+ RegiT initReg = INVALIDREGNUM;
|
|
+ if (exitingLoop != exitLoop) {
|
|
+ initReg = static_cast<int>
|
|
+ (funcRep->getRegInfo().createVirtualRegister(I32RC));
|
|
+ assert(initReg != INVALIDREGNUM);
|
|
+ addLoopBreakInitReg(exitLoop, initReg);
|
|
+ while (exitingLoop != exitLoop && exitingLoop) {
|
|
+ addLoopBreakOnReg(exitingLoop, initReg);
|
|
+ exitingLoop = exitingLoop->getParentLoop();
|
|
+ }
|
|
+ assert(exitingLoop == exitLoop);
|
|
+ }
|
|
+
|
|
+ mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
|
|
+
|
|
+} //handleLoopbreak
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
|
|
+ LoopT *contingLoop,
|
|
+ BlockT *contBlk,
|
|
+ LoopT *contLoop) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
|
|
+ << " header = BB" << contBlk->getNumber() << "\n";
|
|
+
|
|
+ errs() << "Trying to continue loop-depth = "
|
|
+ << getLoopDepth(contLoop)
|
|
+ << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
|
|
+ }
|
|
+
|
|
+ RegiT initReg = INVALIDREGNUM;
|
|
+ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
|
|
+ if (contingLoop != contLoop) {
|
|
+ initReg = static_cast<int>
|
|
+ (funcRep->getRegInfo().createVirtualRegister(I32RC));
|
|
+ assert(initReg != INVALIDREGNUM);
|
|
+ addLoopContInitReg(contLoop, initReg);
|
|
+ while (contingLoop && contingLoop->getParentLoop() != contLoop) {
|
|
+ addLoopBreakOnReg(contingLoop, initReg); //not addLoopContOnReg
|
|
+ contingLoop = contingLoop->getParentLoop();
|
|
+ }
|
|
+ assert(contingLoop && contingLoop->getParentLoop() == contLoop);
|
|
+ addLoopContOnReg(contingLoop, initReg);
|
|
+ }
|
|
+
|
|
+ settleLoopcontBlock(contingBlk, contBlk, initReg);
|
|
+} //handleLoopcontBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "serialPattern BB" << dstBlk->getNumber()
|
|
+ << " <= BB" << srcBlk->getNumber() << "\n";
|
|
+ }
|
|
+ dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end());
|
|
+
|
|
+ dstBlk->removeSuccessor(srcBlk);
|
|
+ CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
|
|
+
|
|
+ removeSuccessor(srcBlk);
|
|
+ retireBlock(dstBlk, srcBlk);
|
|
+} //mergeSerialBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
|
|
+ BlockT *curBlk,
|
|
+ BlockT *trueBlk,
|
|
+ BlockT *falseBlk,
|
|
+ BlockT *landBlk) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "ifPattern BB" << curBlk->getNumber();
|
|
+ errs() << "{ ";
|
|
+ if (trueBlk) {
|
|
+ errs() << "BB" << trueBlk->getNumber();
|
|
+ }
|
|
+ errs() << " } else ";
|
|
+ errs() << "{ ";
|
|
+ if (falseBlk) {
|
|
+ errs() << "BB" << falseBlk->getNumber();
|
|
+ }
|
|
+ errs() << " }\n ";
|
|
+ errs() << "landBlock: ";
|
|
+ if (landBlk == NULL) {
|
|
+ errs() << "NULL";
|
|
+ } else {
|
|
+ errs() << "BB" << landBlk->getNumber();
|
|
+ }
|
|
+ errs() << "\n";
|
|
+ }
|
|
+
|
|
+ int oldOpcode = branchInstr->getOpcode();
|
|
+ DebugLoc branchDL = branchInstr->getDebugLoc();
|
|
+
|
|
+// transform to
|
|
+// if cond
|
|
+// trueBlk
|
|
+// else
|
|
+// falseBlk
|
|
+// endif
|
|
+// landBlk
|
|
+
|
|
+ typename BlockT::iterator branchInstrPos =
|
|
+ CFGTraits::getInstrPos(curBlk, branchInstr);
|
|
+ CFGTraits::insertCondBranchBefore(branchInstrPos,
|
|
+ CFGTraits::getBranchNzeroOpcode(oldOpcode),
|
|
+ passRep,
|
|
+ branchDL);
|
|
+
|
|
+ if (trueBlk) {
|
|
+ curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end());
|
|
+ curBlk->removeSuccessor(trueBlk);
|
|
+ if (landBlk && trueBlk->succ_size()!=0) {
|
|
+ trueBlk->removeSuccessor(landBlk);
|
|
+ }
|
|
+ retireBlock(curBlk, trueBlk);
|
|
+ }
|
|
+ CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
|
|
+
|
|
+ if (falseBlk) {
|
|
+ curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(),
|
|
+ falseBlk->end());
|
|
+ curBlk->removeSuccessor(falseBlk);
|
|
+ if (landBlk && falseBlk->succ_size() != 0) {
|
|
+ falseBlk->removeSuccessor(landBlk);
|
|
+ }
|
|
+ retireBlock(curBlk, falseBlk);
|
|
+ }
|
|
+ CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
|
|
+
|
|
+ branchInstr->eraseFromParent();
|
|
+
|
|
+ if (landBlk && trueBlk && falseBlk) {
|
|
+ curBlk->addSuccessor(landBlk);
|
|
+ }
|
|
+
|
|
+} //mergeIfthenelseBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
|
|
+ LoopLandInfo *loopLand) {
|
|
+ BlockT *landBlk = loopLand->landBlk;
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "loopPattern header = BB" << dstBlk->getNumber()
|
|
+ << " land = BB" << landBlk->getNumber() << "\n";
|
|
+ }
|
|
+
|
|
+ // Loop contInitRegs are initialized at the beginning of the loop.
|
|
+ for (typename std::set<RegiT>::const_iterator iter =
|
|
+ loopLand->contInitRegs.begin(),
|
|
+ iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
|
|
+ CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
|
|
+ }
|
|
+
|
|
+ /* we last inserted the DebugLoc in the
|
|
+ * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
|
|
+ * search for the DebugLoc in that statement.
|
|
+ * if not found, we have to insert the empty/default DebugLoc */
|
|
+ InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
|
|
+ DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
|
|
+
|
|
+ CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
|
|
+ // Loop breakInitRegs are initialized before entering the loop.
|
|
+ for (typename std::set<RegiT>::const_iterator iter =
|
|
+ loopLand->breakInitRegs.begin(),
|
|
+ iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
|
|
+ CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
|
|
+ }
|
|
+ // Loop endbranchInitRegs are initialized before entering the loop.
|
|
+ for (typename std::set<RegiT>::const_iterator iter =
|
|
+ loopLand->endbranchInitRegs.begin(),
|
|
+ iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
|
|
+ CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
|
|
+ }
|
|
+
|
|
+ /* we last inserted the DebugLoc in the continue statement in the current dstBlk
|
|
+ * search for the DebugLoc in the continue statement.
|
|
+ * if not found, we have to insert the empty/default DebugLoc */
|
|
+ InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
|
|
+ DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
|
|
+
|
|
+ CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
|
|
+ // Loop breakOnRegs are checked after the ENDLOOP: break the loop outside this
|
|
+ // loop.
|
|
+ for (typename std::set<RegiT>::const_iterator iter =
|
|
+ loopLand->breakOnRegs.begin(),
|
|
+ iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
|
|
+ CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
|
|
+ *iter);
|
|
+ }
|
|
+
|
|
+ // Loop contOnRegs are checked after the ENDLOOP: continue the loop outside this
|
|
+ // loop.
|
|
+ for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
|
|
+ iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
|
|
+ CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
|
|
+ passRep, *iter);
|
|
+ }
|
|
+
|
|
+ dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
|
|
+
|
|
+ for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
|
|
+ iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
|
|
+ dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of.
|
|
+ }
|
|
+
|
|
+ removeSuccessor(landBlk);
|
|
+ retireBlock(dstBlk, landBlk);
|
|
+} //mergeLooplandBlock
|
|
+
|
|
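+// Walk backwards from I to the defining PRED_X instruction and flip its
+// compare condition (IS_ZERO <-> IS_NOT_ZERO), inverting the predicate used
+// by the following conditional branch.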
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
|
|
+ while (I--) {
|
|
+ if (I->getOpcode() == AMDGPU::PRED_X) {
|
|
+ switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
|
|
+ case OPCODE_IS_ZERO_INT:
|
|
+ static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
|
|
+ return;
|
|
+ case OPCODE_IS_NOT_ZERO_INT:
|
|
+ static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
|
|
+ return;
|
|
+ case OPCODE_IS_ZERO:
|
|
+ static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
|
|
+ return;
|
|
+ case OPCODE_IS_NOT_ZERO:
|
|
+ static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
|
|
+ return;
|
|
+ default:
|
|
+ assert(0 && "PRED_X Opcode invalid!");
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
|
|
+ BlockT *exitBlk,
|
|
+ BlockT *exitLandBlk,
|
|
+ RegiT setReg) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
|
|
+ << " exit = BB" << exitBlk->getNumber()
|
|
+ << " land = BB" << exitLandBlk->getNumber() << "\n";
|
|
+ }
|
|
+
|
|
+ InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
|
|
+ assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
|
|
+
|
|
+ DebugLoc DL = branchInstr->getDebugLoc();
|
|
+
|
|
+ BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
|
|
+
|
|
+ // transform exitingBlk to
|
|
+ // if ( ) {
|
|
+ // exitBlk (if exitBlk != exitLandBlk)
|
|
+ // setReg = 1
|
|
+ // break
|
|
+ // }endif
|
|
+ // successor = {orgSuccessor(exitingBlk) - exitBlk}
|
|
+
|
|
+ typename BlockT::iterator branchInstrPos =
|
|
+ CFGTraits::getInstrPos(exitingBlk, branchInstr);
|
|
+
|
|
+ if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
|
|
+ //break_logical
|
|
+
|
|
+ if (trueBranch != exitBlk) {
|
|
+ reversePredicateSetter(branchInstrPos);
|
|
+ }
|
|
+ CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
|
|
+ } else {
|
|
+ if (trueBranch != exitBlk) {
|
|
+ reversePredicateSetter(branchInstr);
|
|
+ }
|
|
+ CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
|
|
+ if (exitBlk != exitLandBlk) {
|
|
+ //splice is insert-before ...
|
|
+ exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
|
|
+ exitBlk->end());
|
|
+ }
|
|
+ if (setReg != INVALIDREGNUM) {
|
|
+ CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
|
|
+ }
|
|
+ CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
|
|
+ } //if_logical
|
|
+
|
|
+ //now branchInstr can be erased safely
|
|
+ branchInstr->eraseFromParent();
|
|
+
|
|
+ //now take care of successors, retire blocks
|
|
+ exitingBlk->removeSuccessor(exitBlk);
|
|
+ if (exitBlk != exitLandBlk) {
|
|
+ //splice is insert-before ...
|
|
+ exitBlk->removeSuccessor(exitLandBlk);
|
|
+ retireBlock(exitingBlk, exitBlk);
|
|
+ }
|
|
+
|
|
+} //mergeLoopbreakBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
|
|
+ BlockT *contBlk,
|
|
+ RegiT setReg) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "settleLoopcontBlock conting = BB"
|
|
+ << contingBlk->getNumber()
|
|
+ << ", cont = BB" << contBlk->getNumber() << "\n";
|
|
+ }
|
|
+
|
|
+ InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
|
|
+ if (branchInstr) {
|
|
+ assert(CFGTraits::isCondBranch(branchInstr));
|
|
+ typename BlockT::iterator branchInstrPos =
|
|
+ CFGTraits::getInstrPos(contingBlk, branchInstr);
|
|
+ BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
|
|
+ int oldOpcode = branchInstr->getOpcode();
|
|
+ DebugLoc DL = branchInstr->getDebugLoc();
|
|
+
|
|
+ // transform contingBlk to
|
|
+ // if () {
|
|
+ // move instr after branchInstr
|
|
+ // continue
|
|
+ // or
|
|
+ // setReg = 1
|
|
+ // break
|
|
+ // }endif
|
|
+ // successor = {orgSuccessor(contingBlk) - loopHeader}
|
|
+
|
|
+ bool useContinueLogical =
|
|
+ (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
|
|
+
|
|
+ if (useContinueLogical == false) {
|
|
+ int branchOpcode =
|
|
+ trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
|
|
+ : CFGTraits::getBranchZeroOpcode(oldOpcode);
|
|
+
|
|
+ CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
|
|
+
|
|
+ if (setReg != INVALIDREGNUM) {
|
|
+ CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
|
|
+ // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
|
|
+ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
|
|
+ } else {
|
|
+ // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
|
|
+ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
|
|
+ }
|
|
+
|
|
+ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
|
|
+ } else {
|
|
+ int branchOpcode =
|
|
+ trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
|
|
+ : CFGTraits::getContinueZeroOpcode(oldOpcode);
|
|
+
|
|
+ CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
|
|
+ }
|
|
+
|
|
+ branchInstr->eraseFromParent();
|
|
+ } else {
|
|
+ // If we've arrived here then we've already erased the branch instruction.
|
|
+ // Travel back up the basic block to find the last reference to our debug location;
|
|
+ // we've just inserted that reference here, so it should be representative.
|
|
+ if (setReg != INVALIDREGNUM) {
|
|
+ CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
|
|
+ // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
|
|
+ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
|
|
+ } else {
|
|
+ // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
|
|
+ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
|
|
+ }
|
|
+ } //else
|
|
+
|
|
+} //settleLoopcontBlock
|
|
+
|
|
+// The BBs in exitBlkSet have been classified as lying on the break path of
|
|
+// loopRep. Before we can treat them as part of loopRep's loop body, we must
|
|
+// check whether any of them was earlier classified as a continue-BB of
|
|
+// parentLoopRep.
|
|
+// If so, generate a new BB newBlk and:
|
|
+// (1) make newBlk the common successor of the BBs in exitBlkSet
|
|
+// (2) change the continue-instr in the BBs in exitBlkSet to a break-instr
|
|
+// (3) generate a continue-instr in newBlk
|
|
+//
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
|
|
+ LoopT *loopRep,
|
|
+ std::set<BlockT *> &exitBlkSet,
|
|
+ BlockT *exitLandBlk) {
|
|
+ std::set<BlockT *> endBlkSet;
|
|
+
|
|
+
|
|
+
|
|
+ for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
|
|
+ iterEnd = exitBlkSet.end();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ BlockT *exitBlk = *iter;
|
|
+ BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
|
|
+
|
|
+ if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
|
|
+ return NULL;
|
|
+
|
|
+ endBlkSet.insert(endBlk);
|
|
+ }
|
|
+
|
|
+ BlockT *newBlk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(newBlk); //insert to function
|
|
+ CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
|
|
+ SHOWNEWBLK(newBlk, "New continue block: ");
|
|
+
|
|
+ for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
|
|
+ iterEnd = endBlkSet.end();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ BlockT *endBlk = *iter;
|
|
+ InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
|
|
+ if (contInstr) {
|
|
+ contInstr->eraseFromParent();
|
|
+ }
|
|
+ endBlk->addSuccessor(newBlk);
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Add new continue Block to BB"
|
|
+ << endBlk->getNumber() << " successors\n";
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return newBlk;
|
|
+} //relocateLoopcontBlock
|
|
+
|
|
+
|
|
+// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
|
|
+// LoopLandBlock. This BB branches on the loop endBranchInit register to the
|
|
+// paths corresponding to the loop exiting branches.
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
|
|
+ BlockTSmallerVector &exitingBlks,
|
|
+ BlockTSmallerVector &exitBlks) {
|
|
+ const AMDGPUInstrInfo *tii =
|
|
+ static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
|
|
+ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
|
|
+
|
|
+ RegiT endBranchReg = static_cast<int>
|
|
+ (funcRep->getRegInfo().createVirtualRegister(I32RC));
|
|
+ assert(endBranchReg >= 0);
|
|
+
|
|
+ // reg = 0 before entering the loop
|
|
+ addLoopEndbranchInitReg(loopRep, endBranchReg);
|
|
+
|
|
+ uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
|
|
+ assert(numBlks >=2 && numBlks == exitBlks.size());
|
|
+
|
|
+ BlockT *preExitingBlk = exitingBlks[0];
|
|
+ BlockT *preExitBlk = exitBlks[0];
|
|
+ BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(preBranchBlk); //insert to function
|
|
+ SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
|
|
+
|
|
+ BlockT *newLandBlk = preBranchBlk;
|
|
+
|
|
+ CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
|
|
+ newLandBlk);
|
|
+ preExitingBlk->removeSuccessor(preExitBlk);
|
|
+ preExitingBlk->addSuccessor(newLandBlk);
|
|
+
|
|
+ //it is redundant to add reg = 0 to exitingBlks[0]
|
|
+
|
|
+ // For the 1..n-th exiting path (the last iteration handles two paths) create the
|
|
+ // branch to the previous path and the current path.
|
|
+ for (uint32_t i = 1; i < numBlks; ++i) {
|
|
+ BlockT *curExitingBlk = exitingBlks[i];
|
|
+ BlockT *curExitBlk = exitBlks[i];
|
|
+ BlockT *curBranchBlk;
|
|
+
|
|
+ if (i == numBlks - 1) {
|
|
+ curBranchBlk = curExitBlk;
|
|
+ } else {
|
|
+ curBranchBlk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(curBranchBlk); //insert to function
|
|
+ SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
|
|
+ }
|
|
+
|
|
+ // Add reg = i to exitingBlks[i].
|
|
+ CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
|
|
+ endBranchReg, i);
|
|
+
|
|
+ // Remove the edge (exitingBlks[i], exitBlks[i]) and add the new edge
|
|
+ // (exitingBlks[i], newLandBlk).
|
|
+ CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
|
|
+ newLandBlk);
|
|
+ curExitingBlk->removeSuccessor(curExitBlk);
|
|
+ curExitingBlk->addSuccessor(newLandBlk);
|
|
+
|
|
+ // add to preBranchBlk the branch instruction:
|
|
+ // if (endBranchReg == preVal)
|
|
+ // preExitBlk
|
|
+ // else
|
|
+ // curBranchBlk
|
|
+ //
|
|
+ // preValReg = i - 1
|
|
+
|
|
+ DebugLoc DL;
|
|
+ RegiT preValReg = static_cast<int>
|
|
+ (funcRep->getRegInfo().createVirtualRegister(I32RC));
|
|
+
|
|
+ preBranchBlk->insert(preBranchBlk->begin(),
|
|
+ tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
|
|
+ i - 1));
|
|
+
|
|
+ // condResReg = (endBranchReg == preValReg)
|
|
+ RegiT condResReg = static_cast<int>
|
|
+ (funcRep->getRegInfo().createVirtualRegister(I32RC));
|
|
+ BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
|
|
+ .addReg(endBranchReg).addReg(preValReg);
|
|
+
|
|
+ BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
|
|
+ .addMBB(preExitBlk).addReg(condResReg);
|
|
+
|
|
+ preBranchBlk->addSuccessor(preExitBlk);
|
|
+ preBranchBlk->addSuccessor(curBranchBlk);
|
|
+
|
|
+ // Update preExitingBlk, preExitBlk, preBranchBlk.
|
|
+ preExitingBlk = curExitingBlk;
|
|
+ preExitBlk = curExitBlk;
|
|
+ preBranchBlk = curBranchBlk;
|
|
+
|
|
+ } //end for 1 .. n blocks
|
|
+
|
|
+ return newLandBlk;
|
|
+} //addLoopEndbranchBlock
|
|
+
|
|
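+// Follow the chain of single successors starting at srcBlk: return
+// SinglePath_InPath if it reaches dstBlk, SinglePath_NotInPath if it ends
+// without reaching dstBlk, and Not_SinglePath otherwise (for example on a
+// side entry when allowSideEntry is false).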
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::PathToKind
|
|
+CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
|
|
+ bool allowSideEntry) {
|
|
+ assert(dstBlk);
|
|
+
|
|
+ if (srcBlk == dstBlk) {
|
|
+ return SinglePath_InPath;
|
|
+ }
|
|
+
|
|
+ while (srcBlk && srcBlk->succ_size() == 1) {
|
|
+ srcBlk = *srcBlk->succ_begin();
|
|
+ if (srcBlk == dstBlk) {
|
|
+ return SinglePath_InPath;
|
|
+ }
|
|
+
|
|
+ if (!allowSideEntry && srcBlk->pred_size() > 1) {
|
|
+ return Not_SinglePath;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (srcBlk && srcBlk->succ_size()==0) {
|
|
+ return SinglePath_NotInPath;
|
|
+ }
|
|
+
|
|
+ return Not_SinglePath;
|
|
+} //singlePathTo
|
|
+
|
|
+// If there is a single path from srcBlk to dstBlk, return the last block before
|
|
+// dstBlk. If there is a single path from srcBlk to an end block that avoids dstBlk, return the
|
|
+// last block in the path. Otherwise, return NULL.
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
|
|
+ bool allowSideEntry) {
|
|
+ assert(dstBlk);
|
|
+
|
|
+ if (srcBlk == dstBlk) {
|
|
+ return srcBlk;
|
|
+ }
|
|
+
|
|
+ if (srcBlk->succ_size() == 0) {
|
|
+ return srcBlk;
|
|
+ }
|
|
+
|
|
+ while (srcBlk && srcBlk->succ_size() == 1) {
|
|
+ BlockT *preBlk = srcBlk;
|
|
+
|
|
+ srcBlk = *srcBlk->succ_begin();
|
|
+ if (srcBlk == NULL) {
|
|
+ return preBlk;
|
|
+ }
|
|
+
|
|
+ if (!allowSideEntry && srcBlk->pred_size() > 1) {
|
|
+ return NULL;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (srcBlk && srcBlk->succ_size()==0) {
|
|
+ return srcBlk;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+
|
|
+} //singlePathEnd
|
|
+
|
|
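+// Walk the single-successor chain from srcBlk towards dstBlk and clone any
+// block on it that has additional predecessors, so the path from preBlk has
+// no side entries; returns the number of blocks cloned.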
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
|
|
+ BlockT *dstBlk) {
|
|
+ int cloned = 0;
|
|
+ assert(preBlk->isSuccessor(srcBlk));
|
|
+ while (srcBlk && srcBlk != dstBlk) {
|
|
+ assert(srcBlk->succ_size() == 1);
|
|
+ if (srcBlk->pred_size() > 1) {
|
|
+ srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
|
|
+ ++cloned;
|
|
+ }
|
|
+
|
|
+ preBlk = srcBlk;
|
|
+ srcBlk = *srcBlk->succ_begin();
|
|
+ }
|
|
+
|
|
+ return cloned;
|
|
+} //cloneOnSideEntryTo
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
|
|
+ BlockT *predBlk) {
|
|
+ assert(predBlk->isSuccessor(curBlk) &&
|
|
+ "succBlk is not a prececessor of curBlk");
|
|
+
|
|
+ BlockT *cloneBlk = CFGTraits::clone(curBlk); //clone instructions
|
|
+ CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
|
|
+ //srcBlk, oldBlk, newBlk
|
|
+
|
|
+ predBlk->removeSuccessor(curBlk);
|
|
+ predBlk->addSuccessor(cloneBlk);
|
|
+
|
|
+ // add all successor to cloneBlk
|
|
+ CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
|
|
+
|
|
+ numClonedInstr += curBlk->size();
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Cloned block: " << "BB"
|
|
+ << curBlk->getNumber() << "size " << curBlk->size() << "\n";
|
|
+ }
|
|
+
|
|
+ SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
|
|
+
|
|
+ return cloneBlk;
|
|
+} //cloneBlockForPredecessor
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
|
|
+ BlockT *exitingBlk) {
|
|
+ BlockT *exitBlk = NULL;
|
|
+
|
|
+ for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
|
|
+ iterSuccEnd = exitingBlk->succ_end();
|
|
+ iterSucc != iterSuccEnd; ++iterSucc) {
|
|
+ BlockT *curBlk = *iterSucc;
|
|
+ if (!loopRep->contains(curBlk)) {
|
|
+ assert(exitBlk == NULL);
|
|
+ exitBlk = curBlk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ assert(exitBlk != NULL);
|
|
+
|
|
+ return exitBlk;
|
|
+} //exitingBlock2ExitBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
|
|
+ BlockT *dstBlk,
|
|
+ InstrIterator insertPos) {
|
|
+ InstrIterator spliceEnd;
|
|
+ //look for the input branchinstr, not the AMDGPU branchinstr
|
|
+ InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
|
|
+ if (branchInstr == NULL) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "migrateInstruction don't see branch instr\n" ;
|
|
+ }
|
|
+ spliceEnd = srcBlk->end();
|
|
+ } else {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "migrateInstruction see branch instr\n" ;
|
|
+ branchInstr->dump();
|
|
+ }
|
|
+ spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
|
|
+ }
|
|
+ if (DEBUGME) {
|
|
+ errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
|
|
+ << "srcSize = " << srcBlk->size() << "\n";
|
|
+ }
|
|
+
|
|
+ //splice insert before insertPos
|
|
+ dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
|
|
+ << "srcSize = " << srcBlk->size() << "\n";
|
|
+ }
|
|
+} //migrateInstruction
|
|
+
|
|
+// normalizeInfiniteLoopExit changes
|
|
+// B1:
|
|
+// uncond_br LoopHeader
|
|
+//
|
|
+// to
|
|
+// B1:
|
|
+// cond_br 1 LoopHeader dummyExit
|
|
+// and return the newly added dummy exit block
|
|
+//
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
|
|
+ BlockT *loopHeader;
|
|
+ BlockT *loopLatch;
|
|
+ loopHeader = LoopRep->getHeader();
|
|
+ loopLatch = LoopRep->getLoopLatch();
|
|
+ BlockT *dummyExitBlk = NULL;
|
|
+ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
|
|
+ if (loopHeader!=NULL && loopLatch!=NULL) {
|
|
+ InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
|
|
+ if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
|
|
+ dummyExitBlk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(dummyExitBlk); //insert to function
|
|
+ SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
|
|
+
|
|
+ if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
|
|
+
|
|
+ typename BlockT::iterator insertPos =
|
|
+ CFGTraits::getInstrPos(loopLatch, branchInstr);
|
|
+ unsigned immReg =
|
|
+ funcRep->getRegInfo().createVirtualRegister(I32RC);
|
|
+ CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
|
|
+ InstrT *newInstr =
|
|
+ CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
|
|
+ MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false);
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+
|
|
+ branchInstr->eraseFromParent();
|
|
+ loopLatch->addSuccessor(dummyExitBlk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return dummyExitBlk;
|
|
+} //normalizeInfiniteLoopExit
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
|
|
+ InstrT *branchInstr;
|
|
+
|
|
+ // I saw two unconditional branches in one basic block in the example
|
|
+ // test_fc_do_while_or.c; the upstream needs fixing so this loop can be removed.
|
|
+ while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
|
|
+ && CFGTraits::isUncondBranch(branchInstr)) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Removing unconditional branch instruction" ;
|
|
+ branchInstr->dump();
|
|
+ }
|
|
+ branchInstr->eraseFromParent();
|
|
+ }
|
|
+} //removeUnconditionalBranch
|
|
+
|
|
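+// If both successors of srcBlk are the same block, the conditional branch is
+// pointless: erase it and drop the duplicate successor edge.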
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
|
|
+ if (srcBlk->succ_size() == 2) {
|
|
+ BlockT *blk1 = *srcBlk->succ_begin();
|
|
+ BlockT *blk2 = *(++srcBlk->succ_begin());
|
|
+
|
|
+ if (blk1 == blk2) {
|
|
+ InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
|
|
+ assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Removing unneeded conditional branch instruction" ;
|
|
+ branchInstr->dump();
|
|
+ }
|
|
+ branchInstr->eraseFromParent();
|
|
+ SHOWNEWBLK(blk1, "Removing redundant successor");
|
|
+ srcBlk->removeSuccessor(blk1);
|
|
+ }
|
|
+ }
|
|
+} //removeRedundantConditionalBranch
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
|
|
+ DEFAULT_VEC_SLOTS> &retBlks) {
|
|
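+  // Create a single RETURN-only exit block and make every block in retBlks a
+  // predecessor of it, so the structurized CFG has exactly one exit.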
+ BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(dummyExitBlk); //insert to function
|
|
+ CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
|
|
+
|
|
+ for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
|
|
+ retBlks.begin(),
|
|
+ iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
|
|
+ BlockT *curBlk = *iter;
|
|
+ InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
|
|
+ if (curInstr) {
|
|
+ curInstr->eraseFromParent();
|
|
+ }
|
|
+ curBlk->addSuccessor(dummyExitBlk);
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
|
|
+ << " successors\n";
|
|
+ }
|
|
+ } //for
|
|
+
|
|
+ SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
|
|
+} //addDummyExitBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
|
|
+ while (srcBlk->succ_size()) {
|
|
+ srcBlk->removeSuccessor(*srcBlk->succ_begin());
|
|
+ }
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
|
|
+ BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
|
|
+
|
|
+ if (srcBlkInfo == NULL) {
|
|
+ srcBlkInfo = new BlockInfo();
|
|
+ }
|
|
+
|
|
+ srcBlkInfo->sccNum = sccNum;
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
|
|
+ BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
|
|
+ return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
|
|
+ }
|
|
+
|
|
+ BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
|
|
+
|
|
+ if (srcBlkInfo == NULL) {
|
|
+ srcBlkInfo = new BlockInfo();
|
|
+ }
|
|
+
|
|
+ srcBlkInfo->isRetired = true;
|
|
+ assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
|
|
+ && "can't retire block yet");
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
|
|
+ BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
|
|
+ return (srcBlkInfo && srcBlkInfo->isRetired);
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
|
|
+ LoopT *loopRep = loopInfo->getLoopFor(curBlk);
|
|
+ while (loopRep && loopRep->getHeader() == curBlk) {
|
|
+ LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
|
|
+
|
|
+ if(loopLand == NULL)
|
|
+ return true;
|
|
+
|
|
+ BlockT *landBlk = loopLand->landBlk;
|
|
+ assert(landBlk);
|
|
+ if (!isRetiredBlock(landBlk)) {
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ loopRep = loopRep->getParentLoop();
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+} //isActiveLoophead
|
|
+
|
|
+template<class PassT>
|
|
+bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
|
|
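+  // Heuristic: migrating (cloning) this block for its extra predecessors is
+  // only worthwhile if the block has multiple predecessors, is larger than
+  // blockSizeThreshold, and the duplicated instruction count
+  // (blkSize * (pred_size() - 1)) would exceed cloneInstrThreshold.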
+ const unsigned blockSizeThreshold = 30;
|
|
+ const unsigned cloneInstrThreshold = 100;
|
|
+
|
|
+ bool multiplePreds = blk && (blk->pred_size() > 1);
|
|
+
|
|
+ if(!multiplePreds)
|
|
+ return false;
|
|
+
|
|
+ unsigned blkSize = blk->size();
|
|
+ return ((blkSize > blockSizeThreshold)
|
|
+ && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
|
|
+} //needMigrateBlock
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
|
|
+ BlockTSmallerVector &exitBlks,
|
|
+ std::set<BlockT *> &exitBlkSet) {
|
|
+ SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks; //in exit path blocks
|
|
+
|
|
+ for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
|
|
+ predIterEnd = landBlk->pred_end();
|
|
+ predIter != predIterEnd; ++predIter) {
|
|
+ BlockT *curBlk = *predIter;
|
|
+ if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
|
|
+ inpathBlks.push_back(curBlk);
|
|
+ }
|
|
+ } //for
|
|
+
|
|
+  // If landBlk has predecessors that are not in the given loop,
|
|
+  // create a new block.
|
|
+ BlockT *newLandBlk = landBlk;
|
|
+ if (inpathBlks.size() != landBlk->pred_size()) {
|
|
+ newLandBlk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(newLandBlk); //insert to function
|
|
+ newLandBlk->addSuccessor(landBlk);
|
|
+ for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
|
|
+ inpathBlks.begin(),
|
|
+ iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
|
|
+ BlockT *curBlk = *iter;
|
|
+ CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
|
|
+ //srcBlk, oldBlk, newBlk
|
|
+ curBlk->removeSuccessor(landBlk);
|
|
+ curBlk->addSuccessor(newLandBlk);
|
|
+ }
|
|
+ for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
|
|
+ if (exitBlks[i] == landBlk) {
|
|
+ exitBlks[i] = newLandBlk;
|
|
+ }
|
|
+ }
|
|
+ SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
|
|
+ }
|
|
+
|
|
+ setLoopLandBlock(loopRep, newLandBlk);
|
|
+
|
|
+ return newLandBlk;
|
|
+} // recordLoopLandBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ if (theEntry == NULL) {
|
|
+ theEntry = new LoopLandInfo();
|
|
+ }
|
|
+ assert(theEntry->landBlk == NULL);
|
|
+
|
|
+ if (blk == NULL) {
|
|
+ blk = funcRep->CreateMachineBasicBlock();
|
|
+ funcRep->push_back(blk); //insert to function
|
|
+ SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
|
|
+ }
|
|
+
|
|
+ theEntry->landBlk = blk;
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "setLoopLandBlock loop-header = BB"
|
|
+ << loopRep->getHeader()->getNumber()
|
|
+ << " landing-block = BB" << blk->getNumber() << "\n";
|
|
+ }
|
|
+} // setLoopLandBlock
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ if (theEntry == NULL) {
|
|
+ theEntry = new LoopLandInfo();
|
|
+ }
|
|
+
|
|
+ theEntry->breakOnRegs.insert(regNum);
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "addLoopBreakOnReg loop-header = BB"
|
|
+ << loopRep->getHeader()->getNumber()
|
|
+ << " regNum = " << regNum << "\n";
|
|
+ }
|
|
+} // addLoopBreakOnReg
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ if (theEntry == NULL) {
|
|
+ theEntry = new LoopLandInfo();
|
|
+ }
|
|
+ theEntry->contOnRegs.insert(regNum);
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "addLoopContOnReg loop-header = BB"
|
|
+ << loopRep->getHeader()->getNumber()
|
|
+ << " regNum = " << regNum << "\n";
|
|
+ }
|
|
+} // addLoopContOnReg
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ if (theEntry == NULL) {
|
|
+ theEntry = new LoopLandInfo();
|
|
+ }
|
|
+ theEntry->breakInitRegs.insert(regNum);
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "addLoopBreakInitReg loop-header = BB"
|
|
+ << loopRep->getHeader()->getNumber()
|
|
+ << " regNum = " << regNum << "\n";
|
|
+ }
|
|
+} // addLoopBreakInitReg
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ if (theEntry == NULL) {
|
|
+ theEntry = new LoopLandInfo();
|
|
+ }
|
|
+ theEntry->contInitRegs.insert(regNum);
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "addLoopContInitReg loop-header = BB"
|
|
+ << loopRep->getHeader()->getNumber()
|
|
+ << " regNum = " << regNum << "\n";
|
|
+ }
|
|
+} // addLoopContInitReg
|
|
+
|
|
+template<class PassT>
|
|
+void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
|
|
+ RegiT regNum) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ if (theEntry == NULL) {
|
|
+ theEntry = new LoopLandInfo();
|
|
+ }
|
|
+ theEntry->endbranchInitRegs.insert(regNum);
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "addLoopEndbranchInitReg loop-header = BB"
|
|
+ << loopRep->getHeader()->getNumber()
|
|
+ << " regNum = " << regNum << "\n";
|
|
+ }
|
|
+} // addLoopEndbranchInitReg
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::LoopLandInfo *
|
|
+CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ return theEntry;
|
|
+} // getLoopLandInfo
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
|
|
+ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
|
|
+
|
|
+ return theEntry ? theEntry->landBlk : NULL;
|
|
+} // getLoopLandBlock
|
|
+
|
|
+
|
|
+template<class PassT>
|
|
+bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
|
|
+ LoopT *loopRep = loopInfo->getLoopFor(curBlk);
|
|
+ if (loopRep == NULL)
|
|
+ return false;
|
|
+
|
|
+ BlockT *loopHeader = loopRep->getHeader();
|
|
+
|
|
+ return curBlk->isSuccessor(loopHeader);
|
|
+
|
|
+} //hasBackEdge
|
|
+
|
|
+template<class PassT>
|
|
+unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
|
|
+ return loopRep ? loopRep->getLoopDepth() : 0;
|
|
+} //getLoopDepth
|
|
+
|
|
+template<class PassT>
|
|
+int CFGStructurizer<PassT>::countActiveBlock
|
|
+(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
|
|
+ typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
|
|
+ int count = 0;
|
|
+ while (iterStart != iterEnd) {
|
|
+ if (!isRetiredBlock(*iterStart)) {
|
|
+ ++count;
|
|
+ }
|
|
+ ++iterStart;
|
|
+ }
|
|
+
|
|
+ return count;
|
|
+} //countActiveBlock
|
|
+
|
|
+// This is a workaround for findNearestCommonDominator not being available for
|
|
+// post-dominators; a proper fix should go into Dominators.h.
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT*
|
|
+CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
|
|
+
|
|
+ if (postDomTree->dominates(blk1, blk2)) {
|
|
+ return blk1;
|
|
+ }
|
|
+ if (postDomTree->dominates(blk2, blk1)) {
|
|
+ return blk2;
|
|
+ }
|
|
+
|
|
+ DomTreeNodeT *node1 = postDomTree->getNode(blk1);
|
|
+ DomTreeNodeT *node2 = postDomTree->getNode(blk2);
|
|
+
|
|
+ // Handle newly cloned node.
|
|
+ if (node1 == NULL && blk1->succ_size() == 1) {
|
|
+ return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
|
|
+ }
|
|
+ if (node2 == NULL && blk2->succ_size() == 1) {
|
|
+ return findNearestCommonPostDom(blk1, *blk2->succ_begin());
|
|
+ }
|
|
+
|
|
+ if (node1 == NULL || node2 == NULL) {
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ node1 = node1->getIDom();
|
|
+ while (node1) {
|
|
+ if (postDomTree->dominates(node1, node2)) {
|
|
+ return node1->getBlock();
|
|
+ }
|
|
+ node1 = node1->getIDom();
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+template<class PassT>
|
|
+typename CFGStructurizer<PassT>::BlockT *
|
|
+CFGStructurizer<PassT>::findNearestCommonPostDom
|
|
+(typename std::set<BlockT *> &blks) {
|
|
+ BlockT *commonDom;
|
|
+ typename std::set<BlockT *>::const_iterator iter = blks.begin();
|
|
+ typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
|
|
+ for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
|
|
+ BlockT *curBlk = *iter;
|
|
+ if (curBlk != commonDom) {
|
|
+ commonDom = findNearestCommonPostDom(curBlk, commonDom);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (DEBUGME) {
|
|
+ errs() << "Common post dominator for exit blocks is ";
|
|
+ if (commonDom) {
|
|
+ errs() << "BB" << commonDom->getNumber() << "\n";
|
|
+ } else {
|
|
+ errs() << "NULL\n";
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return commonDom;
|
|
+} //findNearestCommonPostDom
|
|
+
|
|
+} //end namespace llvm
|
|
+
|
|
+//todo: move-end
|
|
+
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// CFGStructurizer for AMDGPU
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+using namespace llvmCFGStruct;
|
|
+
|
|
+namespace llvm {
|
|
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
|
|
+public:
|
|
+ typedef MachineInstr InstructionType;
|
|
+ typedef MachineFunction FunctionType;
|
|
+ typedef MachineBasicBlock BlockType;
|
|
+ typedef MachineLoopInfo LoopinfoType;
|
|
+ typedef MachineDominatorTree DominatortreeType;
|
|
+ typedef MachinePostDominatorTree PostDominatortreeType;
|
|
+ typedef MachineDomTreeNode DomTreeNodeType;
|
|
+ typedef MachineLoop LoopType;
|
|
+
|
|
+protected:
|
|
+ TargetMachine &TM;
|
|
+ const TargetInstrInfo *TII;
|
|
+ const AMDGPURegisterInfo *TRI;
|
|
+
|
|
+public:
|
|
+ AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
|
|
+ const TargetInstrInfo *getTargetInstrInfo() const;
|
|
+
|
|
+private:
|
|
+
|
|
+};
|
|
+
|
|
+} //end of namespace llvm
|
|
+AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
|
|
+: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
|
|
+ TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) {
|
|
+}
|
|
+
|
|
+const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
|
|
+ return TII;
|
|
+}
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// CFGPrepare
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+using namespace llvmCFGStruct;
|
|
+
|
|
+namespace llvm {
|
|
+class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
|
|
+public:
|
|
+ static char ID;
|
|
+
|
|
+public:
|
|
+ AMDGPUCFGPrepare(TargetMachine &tm);
|
|
+
|
|
+ virtual const char *getPassName() const;
|
|
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
|
|
+
|
|
+ bool runOnMachineFunction(MachineFunction &F);
|
|
+
|
|
+private:
|
|
+
|
|
+};
|
|
+
|
|
+char AMDGPUCFGPrepare::ID = 0;
|
|
+} //end of namespace llvm
|
|
+
|
|
+AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
|
|
+ : AMDGPUCFGStructurizer(ID, tm ) {
|
|
+}
|
|
+const char *AMDGPUCFGPrepare::getPassName() const {
|
|
+ return "AMD IL Control Flow Graph Preparation Pass";
|
|
+}
|
|
+
|
|
+void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
|
|
+ AU.addPreserved<MachineFunctionAnalysis>();
|
|
+ AU.addRequired<MachineFunctionAnalysis>();
|
|
+ AU.addRequired<MachineDominatorTree>();
|
|
+ AU.addRequired<MachinePostDominatorTree>();
|
|
+ AU.addRequired<MachineLoopInfo>();
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// CFGPerform
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+using namespace llvmCFGStruct;
|
|
+
|
|
+namespace llvm {
|
|
+class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
|
|
+public:
|
|
+ static char ID;
|
|
+
|
|
+public:
|
|
+ AMDGPUCFGPerform(TargetMachine &tm);
|
|
+ virtual const char *getPassName() const;
|
|
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
|
|
+ bool runOnMachineFunction(MachineFunction &F);
|
|
+
|
|
+private:
|
|
+
|
|
+};
|
|
+
|
|
+char AMDGPUCFGPerform::ID = 0;
|
|
+} //end of namespace llvm
|
|
+
|
|
+AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
|
|
+: AMDGPUCFGStructurizer(ID, tm) {
|
|
+}
|
|
+
|
|
+const char *AMDGPUCFGPerform::getPassName() const {
|
|
+  return "AMD IL Control Flow Graph Structurizer Pass";
|
|
+}
|
|
+
|
|
+void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
|
|
+ AU.addPreserved<MachineFunctionAnalysis>();
|
|
+ AU.addRequired<MachineFunctionAnalysis>();
|
|
+ AU.addRequired<MachineDominatorTree>();
|
|
+ AU.addRequired<MachinePostDominatorTree>();
|
|
+ AU.addRequired<MachineLoopInfo>();
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// CFGStructTraits<AMDGPUCFGStructurizer>
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+namespace llvmCFGStruct {
|
|
+// This class is tailored to the AMDGPU backend.
|
|
+template<>
|
|
+struct CFGStructTraits<AMDGPUCFGStructurizer> {
|
|
+ typedef int RegiT;
|
|
+
|
|
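+  // The helpers below map the generic branch pseudo opcodes used before
+  // structurization (JUMP, BRANCH_COND_*) onto the structured control-flow
+  // opcodes (IF_*, CONTINUE_*) that replace them.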
+ static int getBranchNzeroOpcode(int oldOpcode) {
|
|
+ switch(oldOpcode) {
|
|
+ case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
|
|
+ case AMDGPU::BRANCH_COND_i32:
|
|
+ case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
|
|
+ default:
|
|
+ assert(0 && "internal error");
|
|
+ }
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ static int getBranchZeroOpcode(int oldOpcode) {
|
|
+ switch(oldOpcode) {
|
|
+ case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
|
|
+ case AMDGPU::BRANCH_COND_i32:
|
|
+ case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
|
|
+ default:
|
|
+ assert(0 && "internal error");
|
|
+ }
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ static int getContinueNzeroOpcode(int oldOpcode) {
|
|
+ switch(oldOpcode) {
|
|
+ case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
|
|
+ default:
|
|
+ assert(0 && "internal error");
|
|
+ };
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ static int getContinueZeroOpcode(int oldOpcode) {
|
|
+ switch(oldOpcode) {
|
|
+ case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
|
|
+ default:
|
|
+ assert(0 && "internal error");
|
|
+ }
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
|
|
+ return instr->getOperand(0).getMBB();
|
|
+ }
|
|
+
|
|
+ static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
|
|
+ instr->getOperand(0).setMBB(blk);
|
|
+ }
|
|
+
|
|
+ static MachineBasicBlock *
|
|
+ getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
|
|
+ assert(blk->succ_size() == 2);
|
|
+ MachineBasicBlock *trueBranch = getTrueBranch(instr);
|
|
+ MachineBasicBlock::succ_iterator iter = blk->succ_begin();
|
|
+ MachineBasicBlock::succ_iterator iterNext = iter;
|
|
+ ++iterNext;
|
|
+
|
|
+ return (*iter == trueBranch) ? *iterNext : *iter;
|
|
+ }
|
|
+
|
|
+ static bool isCondBranch(MachineInstr *instr) {
|
|
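+    // For AMDGPU::JUMP the predicate operand decides the form: a non-zero
+    // predicate register means a conditional branch, zero means an
+    // unconditional one (see isUncondBranch below).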
+ switch (instr->getOpcode()) {
|
|
+ case AMDGPU::JUMP:
|
|
+ return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0;
|
|
+ case AMDGPU::BRANCH_COND_i32:
|
|
+ case AMDGPU::BRANCH_COND_f32:
|
|
+ break;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ static bool isUncondBranch(MachineInstr *instr) {
|
|
+ switch (instr->getOpcode()) {
|
|
+ case AMDGPU::JUMP:
|
|
+ return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0;
|
|
+ case AMDGPU::BRANCH:
|
|
+ return true;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
|
|
+    // Get the DebugLoc of the last instruction in the block that has debug info.
|
|
+ DebugLoc DL;
|
|
+ for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
|
|
+ MachineInstr *instr = &(*iter);
|
|
+ if (instr->getDebugLoc().isUnknown() == false) {
|
|
+ DL = instr->getDebugLoc();
|
|
+ }
|
|
+ }
|
|
+ return DL;
|
|
+ }
|
|
+
|
|
+ static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
|
|
+ MachineBasicBlock::reverse_iterator iter = blk->rbegin();
|
|
+ MachineInstr *instr = &*iter;
|
|
+ if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
|
|
+ return instr;
|
|
+ }
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ // The correct naming for this is getPossibleLoopendBlockBranchInstr.
|
|
+ //
|
|
+  // A BB with a backward edge could have move instructions after the branch
|
|
+  // instruction. Such move instructions "belong to" the loop backward edge.
|
|
+ //
|
|
+ static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
|
|
+ const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>(
|
|
+ blk->getParent()->getTarget().getInstrInfo());
|
|
+
|
|
+ for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
|
|
+ iterEnd = blk->rend(); iter != iterEnd; ++iter) {
|
|
+ // FIXME: Simplify
|
|
+ MachineInstr *instr = &*iter;
|
|
+ if (instr) {
|
|
+ if (isCondBranch(instr) || isUncondBranch(instr)) {
|
|
+ return instr;
|
|
+ } else if (!TII->isMov(instr->getOpcode())) {
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
|
|
+ MachineBasicBlock::reverse_iterator iter = blk->rbegin();
|
|
+ if (iter != blk->rend()) {
|
|
+ MachineInstr *instr = &(*iter);
|
|
+ if (instr->getOpcode() == AMDGPU::RETURN) {
|
|
+ return instr;
|
|
+ }
|
|
+ }
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
|
|
+ MachineBasicBlock::reverse_iterator iter = blk->rbegin();
|
|
+ if (iter != blk->rend()) {
|
|
+ MachineInstr *instr = &(*iter);
|
|
+ if (instr->getOpcode() == AMDGPU::CONTINUE) {
|
|
+ return instr;
|
|
+ }
|
|
+ }
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
|
|
+ for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
|
|
+ MachineInstr *instr = &(*iter);
|
|
+ if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) {
|
|
+ return instr;
|
|
+ }
|
|
+ }
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ static bool isReturnBlock(MachineBasicBlock *blk) {
|
|
+ MachineInstr *instr = getReturnInstr(blk);
|
|
+ bool isReturn = (blk->succ_size() == 0);
|
|
+ if (instr) {
|
|
+ assert(isReturn);
|
|
+ } else if (isReturn) {
|
|
+ if (DEBUGME) {
|
|
+ errs() << "BB" << blk->getNumber()
|
|
+ <<" is return block without RETURN instr\n";
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return isReturn;
|
|
+ }
|
|
+
|
|
+ static MachineBasicBlock::iterator
|
|
+ getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
|
|
+ assert(instr->getParent() == blk && "instruction doesn't belong to block");
|
|
+ MachineBasicBlock::iterator iter = blk->begin();
|
|
+ MachineBasicBlock::iterator iterEnd = blk->end();
|
|
+ while (&(*iter) != instr && iter != iterEnd) {
|
|
+ ++iter;
|
|
+ }
|
|
+
|
|
+ assert(iter != iterEnd);
|
|
+ return iter;
|
|
+ }//getInstrPos
|
|
+
|
|
+ static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep) {
|
|
+ return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
|
|
+ } //insertInstrBefore
|
|
+
|
|
+ static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
|
|
+ const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
|
|
+ MachineInstr *newInstr =
|
|
+ blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
|
|
+
|
|
+ MachineBasicBlock::iterator res;
|
|
+ if (blk->begin() != blk->end()) {
|
|
+ blk->insert(blk->begin(), newInstr);
|
|
+ } else {
|
|
+ blk->push_back(newInstr);
|
|
+ }
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+
|
|
+ return newInstr;
|
|
+ } //insertInstrBefore
|
|
+
|
|
+ static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep) {
|
|
+ insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
|
|
+ } //insertInstrEnd
|
|
+
|
|
+ static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
|
|
+ const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
|
|
+ MachineInstr *newInstr = blk->getParent()
|
|
+ ->CreateMachineInstr(tii->get(newOpcode), DL);
|
|
+
|
|
+ blk->push_back(newInstr);
|
|
+ //assume the instruction doesn't take any reg operand ...
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+ } //insertInstrEnd
|
|
+
|
|
+ static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
|
|
+ int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep) {
|
|
+ MachineInstr *oldInstr = &(*instrPos);
|
|
+ const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
|
|
+ MachineBasicBlock *blk = oldInstr->getParent();
|
|
+ MachineInstr *newInstr =
|
|
+ blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
|
|
+ DebugLoc());
|
|
+
|
|
+ blk->insert(instrPos, newInstr);
|
|
+ //assume the instruction doesn't take any reg operand ...
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+ return newInstr;
|
|
+ } //insertInstrBefore
|
|
+
|
|
+ static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
|
|
+ int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep,
|
|
+ DebugLoc DL) {
|
|
+ MachineInstr *oldInstr = &(*instrPos);
|
|
+ const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
|
|
+ MachineBasicBlock *blk = oldInstr->getParent();
|
|
+ MachineInstr *newInstr =
|
|
+ blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
|
|
+ DL);
|
|
+
|
|
+ blk->insert(instrPos, newInstr);
|
|
+ MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(),
|
|
+ false);
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+ //erase later oldInstr->eraseFromParent();
|
|
+ } //insertCondBranchBefore
|
|
+
|
|
+ static void insertCondBranchBefore(MachineBasicBlock *blk,
|
|
+ MachineBasicBlock::iterator insertPos,
|
|
+ int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep,
|
|
+ RegiT regNum,
|
|
+ DebugLoc DL) {
|
|
+ const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
|
|
+
|
|
+ MachineInstr *newInstr =
|
|
+ blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
|
|
+
|
|
+ //insert before
|
|
+ blk->insert(insertPos, newInstr);
|
|
+ MachineInstrBuilder(newInstr).addReg(regNum, false);
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+ } //insertCondBranchBefore
|
|
+
|
|
+ static void insertCondBranchEnd(MachineBasicBlock *blk,
|
|
+ int newOpcode,
|
|
+ AMDGPUCFGStructurizer *passRep,
|
|
+ RegiT regNum) {
|
|
+ const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
|
|
+ MachineInstr *newInstr =
|
|
+ blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
|
|
+
|
|
+ blk->push_back(newInstr);
|
|
+ MachineInstrBuilder(newInstr).addReg(regNum, false);
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+ } //insertCondBranchEnd
|
|
+
|
|
+
|
|
+ static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
|
|
+ AMDGPUCFGStructurizer *passRep,
|
|
+ RegiT regNum, int regVal) {
|
|
+ MachineInstr *oldInstr = &(*instrPos);
|
|
+ const AMDGPUInstrInfo *tii =
|
|
+ static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
|
|
+ MachineBasicBlock *blk = oldInstr->getParent();
|
|
+ MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
|
|
+ regVal);
|
|
+ blk->insert(instrPos, newInstr);
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+ } //insertAssignInstrBefore
|
|
+
|
|
+ static void insertAssignInstrBefore(MachineBasicBlock *blk,
|
|
+ AMDGPUCFGStructurizer *passRep,
|
|
+ RegiT regNum, int regVal) {
|
|
+ const AMDGPUInstrInfo *tii =
|
|
+ static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
|
|
+
|
|
+ MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
|
|
+ regVal);
|
|
+ if (blk->begin() != blk->end()) {
|
|
+ blk->insert(blk->begin(), newInstr);
|
|
+ } else {
|
|
+ blk->push_back(newInstr);
|
|
+ }
|
|
+
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+
|
|
+  } //insertAssignInstrBefore
|
|
+
|
|
+ static void insertCompareInstrBefore(MachineBasicBlock *blk,
|
|
+ MachineBasicBlock::iterator instrPos,
|
|
+ AMDGPUCFGStructurizer *passRep,
|
|
+ RegiT dstReg, RegiT src1Reg,
|
|
+ RegiT src2Reg) {
|
|
+ const AMDGPUInstrInfo *tii =
|
|
+ static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
|
|
+ MachineInstr *newInstr =
|
|
+ blk->getParent()->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
|
|
+
|
|
+ MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target
|
|
+ MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value
|
|
+ MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value
|
|
+
|
|
+ blk->insert(instrPos, newInstr);
|
|
+ SHOWNEWINSTR(newInstr);
|
|
+
|
|
+ } //insertCompareInstrBefore
|
|
+
|
|
+ static void cloneSuccessorList(MachineBasicBlock *dstBlk,
|
|
+ MachineBasicBlock *srcBlk) {
|
|
+ for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
|
|
+ iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
|
|
+ dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of
|
|
+ }
|
|
+ } //cloneSuccessorList
|
|
+
|
|
+ static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
|
|
+ MachineFunction *func = srcBlk->getParent();
|
|
+ MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
|
|
+ func->push_back(newBlk); //insert to function
|
|
+ for (MachineBasicBlock::iterator iter = srcBlk->begin(),
|
|
+ iterEnd = srcBlk->end();
|
|
+ iter != iterEnd; ++iter) {
|
|
+ MachineInstr *instr = func->CloneMachineInstr(iter);
|
|
+ newBlk->push_back(instr);
|
|
+ }
|
|
+ return newBlk;
|
|
+ }
|
|
+
|
|
+  // MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose here
|
|
+  // because the AMDGPU branch instruction is not recognized as a terminator.
|
|
+  // Fix that and retire this routine.
|
|
+ static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
|
|
+ MachineBasicBlock *oldBlk,
|
|
+ MachineBasicBlock *newBlk) {
|
|
+ MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
|
|
+ if (branchInstr && isCondBranch(branchInstr) &&
|
|
+ getTrueBranch(branchInstr) == oldBlk) {
|
|
+ setTrueBranch(branchInstr, newBlk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ static void wrapup(MachineBasicBlock *entryBlk) {
|
|
+ assert((!entryBlk->getParent()->getJumpTableInfo()
|
|
+ || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
|
|
+ && "found a jump table");
|
|
+
|
|
+ //collect continue right before endloop
|
|
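+    // A CONTINUE that immediately precedes ENDLOOP is redundant (control
+    // falls through to the loop end anyway), so such CONTINUEs are collected
+    // here and erased below.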
+ SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
|
|
+ MachineBasicBlock::iterator pre = entryBlk->begin();
|
|
+ MachineBasicBlock::iterator iterEnd = entryBlk->end();
|
|
+ MachineBasicBlock::iterator iter = pre;
|
|
+ while (iter != iterEnd) {
|
|
+ if (pre->getOpcode() == AMDGPU::CONTINUE
|
|
+ && iter->getOpcode() == AMDGPU::ENDLOOP) {
|
|
+ contInstr.push_back(pre);
|
|
+ }
|
|
+ pre = iter;
|
|
+ ++iter;
|
|
+ } //end while
|
|
+
|
|
+ //delete continue right before endloop
|
|
+ for (unsigned i = 0; i < contInstr.size(); ++i) {
|
|
+ contInstr[i]->eraseFromParent();
|
|
+ }
|
|
+
|
|
+    // TODO: Fix up the jump table so a later phase won't be confused. If
|
|
+    // jumpTableInfo->isEmpty() is false, the jump table needs to be cleaned,
|
|
+    // but there isn't such an interface yet. Alternatively, replace all the
|
|
+    // other blocks in the jump table with the entryBlk.
|
|
+
|
|
+ } //wrapup
|
|
+
|
|
+ static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) {
|
|
+ return &pass.getAnalysis<MachineDominatorTree>();
|
|
+ }
|
|
+
|
|
+ static MachinePostDominatorTree*
|
|
+ getPostDominatorTree(AMDGPUCFGStructurizer &pass) {
|
|
+ return &pass.getAnalysis<MachinePostDominatorTree>();
|
|
+ }
|
|
+
|
|
+ static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) {
|
|
+ return &pass.getAnalysis<MachineLoopInfo>();
|
|
+ }
|
|
+}; // template class CFGStructTraits
|
|
+} //end of namespace llvm
|
|
+
|
|
+// createAMDGPUCFGPreparationPass- Returns a pass
|
|
+FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm
|
|
+ ) {
|
|
+  return new AMDGPUCFGPrepare(tm);
|
|
+}
|
|
+
|
|
+bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
|
|
+ return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func,
|
|
+ *this,
|
|
+ TRI);
|
|
+}
|
|
+
|
|
+// createAMDGPUCFGStructurizerPass- Returns a pass
|
|
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm
|
|
+ ) {
|
|
+  return new AMDGPUCFGPerform(tm);
|
|
+}
|
|
+
|
|
+bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
|
|
+ return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func,
|
|
+ *this,
|
|
+ TRI);
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp
|
|
new file mode 100644
|
|
index 0000000..eec5059
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILDevice.cpp
|
|
@@ -0,0 +1,124 @@
|
|
+//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#include "AMDILDevice.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+
|
|
+using namespace llvm;
|
|
+// Default implementation for all of the classes.
|
|
+AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) {
|
|
+ mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
|
|
+ mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
|
|
+ setCaps();
|
|
+ DeviceFlag = OCL_DEVICE_ALL;
|
|
+}
|
|
+
|
|
+AMDGPUDevice::~AMDGPUDevice() {
|
|
+ mHWBits.clear();
|
|
+ mSWBits.clear();
|
|
+}
|
|
+
|
|
+size_t AMDGPUDevice::getMaxGDSSize() const {
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+uint32_t
|
|
+AMDGPUDevice::getDeviceFlag() const {
|
|
+ return DeviceFlag;
|
|
+}
|
|
+
|
|
+size_t AMDGPUDevice::getMaxNumCBs() const {
|
|
+ if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
|
|
+ return HW_MAX_NUM_CB;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+size_t AMDGPUDevice::getMaxCBSize() const {
|
|
+ if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
|
|
+ return MAX_CB_SIZE;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+size_t AMDGPUDevice::getMaxScratchSize() const {
|
|
+ return 65536;
|
|
+}
|
|
+
|
|
+uint32_t AMDGPUDevice::getStackAlignment() const {
|
|
+ return 16;
|
|
+}
|
|
+
|
|
+void AMDGPUDevice::setCaps() {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::HalfOps);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::ByteOps);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::ShortOps);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::NoInline);
|
|
+ }
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::MacroDB);
|
|
+ }
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::ConstantMem);
|
|
+ } else {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::ConstantMem);
|
|
+ }
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::PrivateMem);
|
|
+ } else {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::PrivateMem);
|
|
+ }
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
|
|
+ }
|
|
+ mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::LongOps);
|
|
+}
|
|
+
|
|
+AMDGPUDeviceInfo::ExecutionMode
|
|
+AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const {
|
|
+ if (mHWBits[Caps]) {
|
|
+ assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
|
|
+ return AMDGPUDeviceInfo::Hardware;
|
|
+ }
|
|
+
|
|
+ if (mSWBits[Caps]) {
|
|
+ assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
|
|
+ return AMDGPUDeviceInfo::Software;
|
|
+ }
|
|
+
|
|
+ return AMDGPUDeviceInfo::Unsupported;
|
|
+
|
|
+}
|
|
+
|
|
+bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const {
|
|
+ return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported;
|
|
+}
|
|
+
|
|
+bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const {
|
|
+ return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware;
|
|
+}
|
|
+
|
|
+bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
|
|
+ return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software;
|
|
+}
|
|
+
|
|
+std::string
|
|
+AMDGPUDevice::getDataLayout() const {
|
|
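+  // Data layout string: "e" = little endian, "p:32:32:32" = 32-bit pointers,
+  // the "iN"/"fN"/"vN" entries give ABI/preferred alignments for integer,
+  // float and vector types, and "n8:16:32:64" lists the native integer widths.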
+ return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
|
|
+ "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
|
|
+ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
|
|
+ "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
|
|
+ "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
|
|
+ "-n8:16:32:64");
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILDevice.h b/lib/Target/R600/AMDILDevice.h
|
|
new file mode 100644
|
|
index 0000000..b9a1560
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILDevice.h
|
|
@@ -0,0 +1,117 @@
|
|
+//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface for the subtarget data classes.
|
|
+//
|
|
+/// This file will define the interface that each generation needs to
|
|
+/// implement in order to correctly answer queries on the capabilities of the
|
|
+/// specific hardware.
|
|
+//===----------------------------------------------------------------------===//
|
|
+#ifndef AMDILDEVICEIMPL_H
|
|
+#define AMDILDEVICEIMPL_H
|
|
+#include "AMDIL.h"
|
|
+#include "llvm/ADT/BitVector.h"
|
|
+
|
|
+namespace llvm {
|
|
+ class AMDGPUSubtarget;
|
|
+ class MCStreamer;
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Interface for data that is specific to a single device
|
|
+//===----------------------------------------------------------------------===//
|
|
+class AMDGPUDevice {
|
|
+public:
|
|
+ AMDGPUDevice(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPUDevice();
|
|
+
|
|
+ // Enum values for the various memory types.
|
|
+ enum {
|
|
+ RAW_UAV_ID = 0,
|
|
+ ARENA_UAV_ID = 1,
|
|
+ LDS_ID = 2,
|
|
+ GDS_ID = 3,
|
|
+ SCRATCH_ID = 4,
|
|
+ CONSTANT_ID = 5,
|
|
+ GLOBAL_ID = 6,
|
|
+ MAX_IDS = 7
|
|
+ } IO_TYPE_IDS;
|
|
+
|
|
+ /// \returns The max LDS size that the hardware supports. Size is in
|
|
+ /// bytes.
|
|
+ virtual size_t getMaxLDSSize() const = 0;
|
|
+
|
|
+ /// \returns The max GDS size that the hardware supports if the GDS is
|
|
+ /// supported by the hardware. Size is in bytes.
|
|
+ virtual size_t getMaxGDSSize() const;
|
|
+
|
|
+ /// \returns The max number of hardware constant address spaces that
|
|
+ /// are supported by this device.
|
|
+ virtual size_t getMaxNumCBs() const;
|
|
+
|
|
+ /// \returns The max number of bytes a single hardware constant buffer
|
|
+ /// can support. Size is in bytes.
|
|
+ virtual size_t getMaxCBSize() const;
|
|
+
|
|
+ /// \returns The max number of bytes allowed by the hardware scratch
|
|
+ /// buffer. Size is in bytes.
|
|
+ virtual size_t getMaxScratchSize() const;
|
|
+
|
|
+ /// \brief Get the flag that corresponds to the device.
|
|
+ virtual uint32_t getDeviceFlag() const;
|
|
+
|
|
+ /// \returns The number of work-items that exist in a single hardware
|
|
+ /// wavefront.
|
|
+ virtual size_t getWavefrontSize() const = 0;
|
|
+
|
|
+ /// \brief Get the generational name of this specific device.
|
|
+ virtual uint32_t getGeneration() const = 0;
|
|
+
|
|
+ /// \brief Get the stack alignment of this specific device.
|
|
+ virtual uint32_t getStackAlignment() const;
|
|
+
|
|
+ /// \brief Get the resource ID for this specific device.
|
|
+ virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
|
|
+
|
|
+ /// \brief Get the max number of UAV's for this device.
|
|
+ virtual uint32_t getMaxNumUAVs() const = 0;
|
|
+
|
|
+
|
|
+  // API utilizing more detailed capabilities of each family of
|
|
+  // cards. If a capability is supported, then either usesHardware or
|
|
+  // usesSoftware returns true. If usesHardware returns true, then
|
|
+  // usesSoftware must return false for the same capability. Hardware
|
|
+  // execution means that the feature is done natively by the hardware
|
|
+  // and is not emulated by the software. Software execution means
|
|
+  // that the feature could be done in the hardware, but there is
|
|
+  // software that emulates it, possibly using the hardware for
|
|
+  // support, since the hardware does not fully comply with OpenCL
|
|
+  // specs.
|
|
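+  // For example, a (hypothetical) caller choosing between a native and an
+  // emulated double-precision operation might query:
+  //   if (Device->usesHardware(AMDGPUDeviceInfo::DoubleOps))
+  //     ... emit the native instruction ...
+  //   else if (Device->usesSoftware(AMDGPUDeviceInfo::DoubleOps))
+  //     ... expand to the emulation sequence ...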
+
|
|
+ bool isSupported(AMDGPUDeviceInfo::Caps Mode) const;
|
|
+ bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const;
|
|
+ bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const;
|
|
+ virtual std::string getDataLayout() const;
|
|
+ static const unsigned int MAX_LDS_SIZE_700 = 16384;
|
|
+ static const unsigned int MAX_LDS_SIZE_800 = 32768;
|
|
+ static const unsigned int WavefrontSize = 64;
|
|
+ static const unsigned int HalfWavefrontSize = 32;
|
|
+ static const unsigned int QuarterWavefrontSize = 16;
|
|
+protected:
|
|
+ virtual void setCaps();
|
|
+ llvm::BitVector mHWBits;
|
|
+ llvm::BitVector mSWBits;
|
|
+ AMDGPUSubtarget *mSTM;
|
|
+ uint32_t DeviceFlag;
|
|
+private:
|
|
+ AMDGPUDeviceInfo::ExecutionMode
|
|
+ getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const;
|
|
+};
|
|
+
|
|
+} // namespace llvm
|
|
+#endif // AMDILDEVICEIMPL_H
|
|
diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..9605fbe
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILDeviceInfo.cpp
|
|
@@ -0,0 +1,94 @@
|
|
+//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Function that creates DeviceInfo from a device name and other information.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#include "AMDILDevices.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+
|
|
+using namespace llvm;
|
|
+namespace llvm {
|
|
+namespace AMDGPUDeviceInfo {
|
|
+
|
|
+AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
|
|
+ AMDGPUSubtarget *ptr,
|
|
+ bool is64bit, bool is64on32bit) {
|
|
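+  // HD4XXX family device names look like "rv710" or "rv770", so the third
+  // character selects the family ('7') and the fourth the specific variant.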
+ if (deviceName.c_str()[2] == '7') {
|
|
+ switch (deviceName.c_str()[3]) {
|
|
+ case '1':
|
|
+ return new AMDGPU710Device(ptr);
|
|
+ case '7':
|
|
+ return new AMDGPU770Device(ptr);
|
|
+ default:
|
|
+ return new AMDGPU7XXDevice(ptr);
|
|
+ }
|
|
+ } else if (deviceName == "cypress") {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPUCypressDevice(ptr);
|
|
+ } else if (deviceName == "juniper") {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPUEvergreenDevice(ptr);
|
|
+ } else if (deviceName == "redwood") {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPURedwoodDevice(ptr);
|
|
+ } else if (deviceName == "cedar") {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPUCedarDevice(ptr);
|
|
+ } else if (deviceName == "barts" || deviceName == "turks") {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPUNIDevice(ptr);
|
|
+ } else if (deviceName == "cayman") {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPUCaymanDevice(ptr);
|
|
+ } else if (deviceName == "caicos") {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPUNIDevice(ptr);
|
|
+ } else if (deviceName == "SI") {
|
|
+ return new AMDGPUSIDevice(ptr);
|
|
+ } else {
|
|
+#if DEBUG
|
|
+ assert(!is64bit && "This device does not support 64bit pointers!");
|
|
+ assert(!is64on32bit && "This device does not support 64bit"
|
|
+ " on 32bit pointers!");
|
|
+#endif
|
|
+ return new AMDGPU7XXDevice(ptr);
|
|
+ }
|
|
+}
|
|
+} // End namespace AMDGPUDeviceInfo
|
|
+} // End namespace llvm
|
|
diff --git a/lib/Target/R600/AMDILDeviceInfo.h b/lib/Target/R600/AMDILDeviceInfo.h
|
|
new file mode 100644
|
|
index 0000000..4b2c3a5
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILDeviceInfo.h
|
|
@@ -0,0 +1,88 @@
|
|
+//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#ifndef AMDILDEVICEINFO_H
|
|
+#define AMDILDEVICEINFO_H
|
|
+
|
|
+
|
|
+#include <string>
|
|
+
|
|
+namespace llvm {
|
|
+ class AMDGPUDevice;
|
|
+ class AMDGPUSubtarget;
|
|
+ namespace AMDGPUDeviceInfo {
|
|
+    /// Each capability can be executed using a hardware instruction,
|
|
+ /// emulated with a sequence of software instructions, or not
|
|
+ /// supported at all.
|
|
+ enum ExecutionMode {
|
|
+ Unsupported = 0, ///< Unsupported feature on the card(Default value)
|
|
+ /// This is the execution mode that is set if the feature is emulated in
|
|
+ /// software.
|
|
+ Software,
|
|
+ /// This execution mode is set if the feature exists natively in hardware
|
|
+ Hardware
|
|
+ };
|
|
+
|
|
+ enum Caps {
|
|
+ HalfOps = 0x1, ///< Half float is supported or not.
|
|
+ DoubleOps = 0x2, ///< Double is supported or not.
|
|
+      ByteOps = 0x3, ///< Byte(char) is supported or not.
|
|
+ ShortOps = 0x4, ///< Short is supported or not.
|
|
+ LongOps = 0x5, ///< Long is supported or not.
|
|
+ Images = 0x6, ///< Images are supported or not.
|
|
+ ByteStores = 0x7, ///< ByteStores available(!HD4XXX).
|
|
+ ConstantMem = 0x8, ///< Constant/CB memory.
|
|
+ LocalMem = 0x9, ///< Local/LDS memory.
|
|
+ PrivateMem = 0xA, ///< Scratch/Private/Stack memory.
|
|
+ RegionMem = 0xB, ///< OCL GDS Memory Extension.
|
|
+ FMA = 0xC, ///< Use HW FMA or SW FMA.
|
|
+ ArenaSegment = 0xD, ///< Use for Arena UAV per pointer 12-1023.
|
|
+ MultiUAV = 0xE, ///< Use for UAV per Pointer 0-7.
|
|
+ Reserved0 = 0xF, ///< ReservedFlag
|
|
+ NoAlias = 0x10, ///< Cached loads.
|
|
+ Signed24BitOps = 0x11, ///< Peephole Optimization.
|
|
+ /// Debug mode implies that no hardware features or optimizations
|
|
+      /// are performed and that all memory accesses go through a single
|
|
+ /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
|
|
+ Debug = 0x12,
|
|
+ CachedMem = 0x13, ///< Cached mem is available or not.
|
|
+ BarrierDetect = 0x14, ///< Detect duplicate barriers.
|
|
+ Reserved1 = 0x15, ///< Reserved flag
|
|
+ ByteLDSOps = 0x16, ///< Flag to specify if byte LDS ops are available.
|
|
+ ArenaVectors = 0x17, ///< Flag to specify if vector loads from arena work.
|
|
+ TmrReg = 0x18, ///< Flag to specify if Tmr register is supported.
|
|
+ NoInline = 0x19, ///< Flag to specify that no inlining should occur.
|
|
+ MacroDB = 0x1A, ///< Flag to specify that backend handles macrodb.
|
|
+ HW64BitDivMod = 0x1B, ///< Flag for backend to generate 64bit div/mod.
|
|
+ ArenaUAV = 0x1C, ///< Flag to specify that arena uav is supported.
|
|
+ PrivateUAV = 0x1D, ///< Flag to specify that private memory uses uav's.
|
|
+ /// If more capabilities are required, then
|
|
+ /// this number needs to be increased.
|
|
+ /// All capabilities must come before this
|
|
+ /// number.
|
|
+ MaxNumberCapabilities = 0x20
|
|
+ };
|
|
+ /// These have to be in order with the older generations
|
|
+ /// having the lower number enumerations.
|
|
+ enum Generation {
|
|
+ HD4XXX = 0, ///< 7XX based devices.
|
|
+ HD5XXX, ///< Evergreen based devices.
|
|
+ HD6XXX, ///< NI/Evergreen+ based devices.
|
|
+ HD7XXX, ///< Southern Islands based devices.
|
|
+ HDTEST, ///< Experimental feature testing device.
|
|
+ HDNUMGEN
|
|
+ };
|
|
+
|
|
+
|
|
+ AMDGPUDevice*
|
|
+ getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr,
|
|
+ bool is64bit = false, bool is64on32bit = false);
|
|
+  } // namespace AMDGPUDeviceInfo
|
|
+} // namespace llvm
|
|
+#endif // AMDILDEVICEINFO_H
|
|
diff --git a/lib/Target/R600/AMDILDevices.h b/lib/Target/R600/AMDILDevices.h
|
|
new file mode 100644
|
|
index 0000000..636fa6d
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILDevices.h
|
|
@@ -0,0 +1,19 @@
|
|
+//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#ifndef AMDIL_DEVICES_H
|
|
+#define AMDIL_DEVICES_H
|
|
+// Include all of the device specific header files
|
|
+#include "AMDIL7XXDevice.h"
|
|
+#include "AMDILDevice.h"
|
|
+#include "AMDILEvergreenDevice.h"
|
|
+#include "AMDILNIDevice.h"
|
|
+#include "AMDILSIDevice.h"
|
|
+
|
|
+#endif // AMDIL_DEVICES_H
|
|
diff --git a/lib/Target/R600/AMDILEvergreenDevice.cpp b/lib/Target/R600/AMDILEvergreenDevice.cpp
|
|
new file mode 100644
|
|
index 0000000..c5213a0
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILEvergreenDevice.cpp
|
|
@@ -0,0 +1,169 @@
|
|
+//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#include "AMDILEvergreenDevice.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST)
|
|
+: AMDGPUDevice(ST) {
|
|
+ setCaps();
|
|
+ std::string name = ST->getDeviceName();
|
|
+ if (name == "cedar") {
|
|
+ DeviceFlag = OCL_DEVICE_CEDAR;
|
|
+ } else if (name == "redwood") {
|
|
+ DeviceFlag = OCL_DEVICE_REDWOOD;
|
|
+ } else if (name == "cypress") {
|
|
+ DeviceFlag = OCL_DEVICE_CYPRESS;
|
|
+ } else {
|
|
+ DeviceFlag = OCL_DEVICE_JUNIPER;
|
|
+ }
|
|
+}
|
|
+
|
|
+AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() {
|
|
+}
|
|
+
|
|
+size_t AMDGPUEvergreenDevice::getMaxLDSSize() const {
|
|
+ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
|
|
+ return MAX_LDS_SIZE_800;
|
|
+ } else {
|
|
+ return 0;
|
|
+ }
|
|
+}
|
|
+size_t AMDGPUEvergreenDevice::getMaxGDSSize() const {
|
|
+ if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
|
|
+ return MAX_LDS_SIZE_800;
|
|
+ } else {
|
|
+ return 0;
|
|
+ }
|
|
+}
|
|
+uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const {
|
|
+ return 12;
|
|
+}
|
|
+
|
|
+uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const {
|
|
+ switch(id) {
|
|
+ default:
|
|
+ assert(0 && "ID type passed in is unknown!");
|
|
+ break;
|
|
+ case CONSTANT_ID:
|
|
+ case RAW_UAV_ID:
|
|
+ return GLOBAL_RETURN_RAW_UAV_ID;
|
|
+ case GLOBAL_ID:
|
|
+ case ARENA_UAV_ID:
|
|
+ return DEFAULT_ARENA_UAV_ID;
|
|
+ case LDS_ID:
|
|
+ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
|
|
+ return DEFAULT_LDS_ID;
|
|
+ } else {
|
|
+ return DEFAULT_ARENA_UAV_ID;
|
|
+ }
|
|
+ case GDS_ID:
|
|
+ if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
|
|
+ return DEFAULT_GDS_ID;
|
|
+ } else {
|
|
+ return DEFAULT_ARENA_UAV_ID;
|
|
+ }
|
|
+ case SCRATCH_ID:
|
|
+ if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
|
|
+ return DEFAULT_SCRATCH_ID;
|
|
+ } else {
|
|
+ return DEFAULT_ARENA_UAV_ID;
|
|
+ }
|
|
+ };
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+size_t AMDGPUEvergreenDevice::getWavefrontSize() const {
|
|
+ return AMDGPUDevice::WavefrontSize;
|
|
+}
|
|
+
|
|
+uint32_t AMDGPUEvergreenDevice::getGeneration() const {
|
|
+ return AMDGPUDeviceInfo::HD5XXX;
|
|
+}
|
|
+
|
|
+void AMDGPUEvergreenDevice::setCaps() {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::ArenaUAV);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
|
|
+ mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::ByteStores);
|
|
+ }
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::LocalMem);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::RegionMem);
|
|
+ } else {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::LocalMem);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::RegionMem);
|
|
+ }
|
|
+ mHWBits.set(AMDGPUDeviceInfo::Images);
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::NoAlias);
|
|
+ }
|
|
+ mHWBits.set(AMDGPUDeviceInfo::CachedMem);
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::MultiUAV);
|
|
+ }
|
|
+ mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
|
|
+ mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::ArenaVectors);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::LongOps);
|
|
+ mSWBits.reset(AMDGPUDeviceInfo::LongOps);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::TmrReg);
|
|
+}
|
|
+
|
|
+AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST)
|
|
+ : AMDGPUEvergreenDevice(ST) {
|
|
+ setCaps();
|
|
+}
|
|
+
|
|
+AMDGPUCypressDevice::~AMDGPUCypressDevice() {
|
|
+}
|
|
+
|
|
+void AMDGPUCypressDevice::setCaps() {
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::FMA);
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST)
|
|
+ : AMDGPUEvergreenDevice(ST) {
|
|
+ setCaps();
|
|
+}
|
|
+
|
|
+AMDGPUCedarDevice::~AMDGPUCedarDevice() {
|
|
+}
|
|
+
|
|
+void AMDGPUCedarDevice::setCaps() {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::FMA);
|
|
+}
|
|
+
|
|
+size_t AMDGPUCedarDevice::getWavefrontSize() const {
|
|
+ return AMDGPUDevice::QuarterWavefrontSize;
|
|
+}
|
|
+
|
|
+AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST)
|
|
+ : AMDGPUEvergreenDevice(ST) {
|
|
+ setCaps();
|
|
+}
|
|
+
|
|
+AMDGPURedwoodDevice::~AMDGPURedwoodDevice() {
|
|
+}
|
|
+
|
|
+void AMDGPURedwoodDevice::setCaps() {
|
|
+ mSWBits.set(AMDGPUDeviceInfo::FMA);
|
|
+}
|
|
+
|
|
+size_t AMDGPURedwoodDevice::getWavefrontSize() const {
|
|
+ return AMDGPUDevice::HalfWavefrontSize;
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILEvergreenDevice.h b/lib/Target/R600/AMDILEvergreenDevice.h
|
|
new file mode 100644
|
|
index 0000000..6dc2deb
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILEvergreenDevice.h
|
|
@@ -0,0 +1,93 @@
|
|
+//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface for the subtarget data classes.
|
|
+///
|
|
+/// This file will define the interface that each generation needs to
|
|
+/// implement in order to correctly answer queries on the capabilities of the
|
|
+/// specific hardware.
|
|
+//===----------------------------------------------------------------------===//
|
|
+#ifndef AMDILEVERGREENDEVICE_H
|
|
+#define AMDILEVERGREENDEVICE_H
|
|
+#include "AMDILDevice.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+
|
|
+namespace llvm {
|
|
+ class AMDGPUSubtarget;
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Evergreen generation of devices and their respective sub classes
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen
|
|
+/// series of cards.
|
|
+///
|
|
+/// This class contains information required to differentiate
|
|
+/// the Evergreen device from the generic AMDGPUDevice. This device represents
|
|
+/// the capabilities of the 'Juniper' cards, also known as the HD57XX.
|
|
+class AMDGPUEvergreenDevice : public AMDGPUDevice {
|
|
+public:
|
|
+ AMDGPUEvergreenDevice(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPUEvergreenDevice();
|
|
+ virtual size_t getMaxLDSSize() const;
|
|
+ virtual size_t getMaxGDSSize() const;
|
|
+ virtual size_t getWavefrontSize() const;
|
|
+ virtual uint32_t getGeneration() const;
|
|
+ virtual uint32_t getMaxNumUAVs() const;
|
|
+ virtual uint32_t getResourceID(uint32_t) const;
|
|
+protected:
|
|
+ virtual void setCaps();
|
|
+};
|
|
+
|
|
+/// The AMDGPUCypressDevice is similar to the AMDGPUEvergreenDevice, except it has
|
|
+/// support for double precision operations. This device is used to represent
|
|
+/// both the Cypress and Hemlock cards, which are commercially known as HD58XX
|
|
+/// and HD59XX cards.
|
|
+class AMDGPUCypressDevice : public AMDGPUEvergreenDevice {
|
|
+public:
|
|
+ AMDGPUCypressDevice(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPUCypressDevice();
|
|
+private:
|
|
+ virtual void setCaps();
|
|
+};
|
|
+
|
|
+
|
|
+/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based
|
|
+/// devices.
|
|
+///
|
|
+/// This class differs from the base AMDGPUEvergreenDevice in that the
|
|
+/// device is a ~quarter of the 'Juniper'. These are commercially known as the
|
|
+/// HD54XX and HD53XX series of cards.
|
|
+class AMDGPUCedarDevice : public AMDGPUEvergreenDevice {
|
|
+public:
|
|
+ AMDGPUCedarDevice(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPUCedarDevice();
|
|
+ virtual size_t getWavefrontSize() const;
|
|
+private:
|
|
+ virtual void setCaps();
|
|
+};
|
|
+
|
|
+/// \brief The AMDGPURedwoodDevice is the class that represents all of the 'Redwood' based
|
|
+/// devices.
|
|
+///
|
|
+/// This class differs from the base class, in that these devices are
|
|
+/// considered about half of a 'Juniper' device. These are commercially known as
|
|
+/// the HD55XX and HD56XX series of cards.
|
|
+class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice {
|
|
+public:
|
|
+ AMDGPURedwoodDevice(AMDGPUSubtarget *ST);
|
|
+ virtual ~AMDGPURedwoodDevice();
|
|
+ virtual size_t getWavefrontSize() const;
|
|
+private:
|
|
+ virtual void setCaps();
|
|
+};
|
|
+
|
|
+} // namespace llvm
|
|
+#endif // AMDILEVERGREENDEVICE_H
|
|
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
|
|
new file mode 100644
|
|
index 0000000..2e726e9
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
|
|
@@ -0,0 +1,577 @@
|
|
+//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Defines an instruction selector for the AMDGPU target.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+#include "AMDILDevices.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "llvm/ADT/ValueMap.h"
|
|
+#include "llvm/CodeGen/PseudoSourceValue.h"
|
|
+#include "llvm/CodeGen/SelectionDAGISel.h"
|
|
+#include "llvm/Support/Compiler.h"
|
|
+#include "llvm/CodeGen/SelectionDAG.h"
|
|
+#include <list>
|
|
+#include <queue>
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Instruction Selector Implementation
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+namespace {
|
|
+/// AMDGPU specific code to select AMDGPU machine instructions for
|
|
+/// SelectionDAG operations.
|
|
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
|
|
+ // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
|
|
+ // make the right decision when generating code for different targets.
|
|
+ const AMDGPUSubtarget &Subtarget;
|
|
+public:
|
|
+ AMDGPUDAGToDAGISel(TargetMachine &TM);
|
|
+ virtual ~AMDGPUDAGToDAGISel();
|
|
+
|
|
+ SDNode *Select(SDNode *N);
|
|
+ virtual const char *getPassName() const;
|
|
+
|
|
+private:
|
|
+ inline SDValue getSmallIPtrImm(unsigned Imm);
|
|
+ bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
|
|
+
|
|
+ // Complex pattern selectors
|
|
+ bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
|
|
+ bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
|
|
+ bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
|
|
+
|
|
+ static bool checkType(const Value *ptr, unsigned int addrspace);
|
|
+ static const Value *getBasePointerValue(const Value *V);
|
|
+
|
|
+ static bool isGlobalStore(const StoreSDNode *N);
|
|
+ static bool isPrivateStore(const StoreSDNode *N);
|
|
+ static bool isLocalStore(const StoreSDNode *N);
|
|
+ static bool isRegionStore(const StoreSDNode *N);
|
|
+
|
|
+ static bool isCPLoad(const LoadSDNode *N);
|
|
+ static bool isConstantLoad(const LoadSDNode *N, int cbID);
|
|
+ static bool isGlobalLoad(const LoadSDNode *N);
|
|
+ static bool isParamLoad(const LoadSDNode *N);
|
|
+ static bool isPrivateLoad(const LoadSDNode *N);
|
|
+ static bool isLocalLoad(const LoadSDNode *N);
|
|
+ static bool isRegionLoad(const LoadSDNode *N);
|
|
+
|
|
+ bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
|
|
+ bool SelectGlobalValueVariableOffset(SDValue Addr,
|
|
+ SDValue &BaseReg, SDValue& Offset);
|
|
+ bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
|
|
+ bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
|
|
+
|
|
+ // Include the pieces autogenerated from the target description.
|
|
+#include "AMDGPUGenDAGISel.inc"
|
|
+};
|
|
+} // end anonymous namespace
|
|
+
|
|
+/// \brief This pass converts a legalized DAG into an AMDGPU-specific
|
|
+// DAG, ready for instruction scheduling.
|
|
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
|
|
+ ) {
|
|
+ return new AMDGPUDAGToDAGISel(TM);
|
|
+}
|
|
+
|
|
+AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM
|
|
+ )
|
|
+ : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
|
|
+}
|
|
+
|
|
+AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
|
|
+}
|
|
+
|
|
+SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
|
|
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::SelectADDRParam(
|
|
+ SDValue Addr, SDValue& R1, SDValue& R2) {
|
|
+
|
|
+ if (Addr.getOpcode() == ISD::FrameIndex) {
|
|
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
|
|
+ R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
|
|
+ R2 = CurDAG->getTargetConstant(0, MVT::i32);
|
|
+ } else {
|
|
+ R1 = Addr;
|
|
+ R2 = CurDAG->getTargetConstant(0, MVT::i32);
|
|
+ }
|
|
+ } else if (Addr.getOpcode() == ISD::ADD) {
|
|
+ R1 = Addr.getOperand(0);
|
|
+ R2 = Addr.getOperand(1);
|
|
+ } else {
|
|
+ R1 = Addr;
|
|
+ R2 = CurDAG->getTargetConstant(0, MVT::i32);
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
|
|
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
|
|
+ Addr.getOpcode() == ISD::TargetGlobalAddress) {
|
|
+ return false;
|
|
+ }
|
|
+ return SelectADDRParam(Addr, R1, R2);
|
|
+}
|
|
+
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
|
|
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
|
|
+ Addr.getOpcode() == ISD::TargetGlobalAddress) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (Addr.getOpcode() == ISD::FrameIndex) {
|
|
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
|
|
+ R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
|
|
+ R2 = CurDAG->getTargetConstant(0, MVT::i64);
|
|
+ } else {
|
|
+ R1 = Addr;
|
|
+ R2 = CurDAG->getTargetConstant(0, MVT::i64);
|
|
+ }
|
|
+ } else if (Addr.getOpcode() == ISD::ADD) {
|
|
+ R1 = Addr.getOperand(0);
|
|
+ R2 = Addr.getOperand(1);
|
|
+ } else {
|
|
+ R1 = Addr;
|
|
+ R2 = CurDAG->getTargetConstant(0, MVT::i64);
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
|
+ unsigned int Opc = N->getOpcode();
|
|
+ if (N->isMachineOpcode()) {
|
|
+ return NULL; // Already selected.
|
|
+ }
|
|
+ switch (Opc) {
|
|
+ default: break;
|
|
+ case ISD::ConstantFP:
|
|
+ case ISD::Constant: {
|
|
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
|
|
+ // XXX: Custom immediate lowering not implemented yet. Instead we use
|
|
+ // pseudo instructions defined in SIInstructions.td
|
|
+ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
|
|
+ break;
|
|
+ }
|
|
+ const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
|
|
+
|
|
+ uint64_t ImmValue = 0;
|
|
+ unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
|
|
+
|
|
+ if (N->getOpcode() == ISD::ConstantFP) {
|
|
+ // XXX: 64-bit Immediates not supported yet
|
|
+ assert(N->getValueType(0) != MVT::f64);
|
|
+
|
|
+ ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
|
|
+ APFloat Value = C->getValueAPF();
|
|
+ float FloatValue = Value.convertToFloat();
|
|
+ if (FloatValue == 0.0) {
|
|
+ ImmReg = AMDGPU::ZERO;
|
|
+ } else if (FloatValue == 0.5) {
|
|
+ ImmReg = AMDGPU::HALF;
|
|
+ } else if (FloatValue == 1.0) {
|
|
+ ImmReg = AMDGPU::ONE;
|
|
+ } else {
|
|
+ ImmValue = Value.bitcastToAPInt().getZExtValue();
|
|
+ }
|
|
+ } else {
|
|
+ // XXX: 64-bit Immediates not supported yet
|
|
+ assert(N->getValueType(0) != MVT::i64);
|
|
+
|
|
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
|
|
+ if (C->getZExtValue() == 0) {
|
|
+ ImmReg = AMDGPU::ZERO;
|
|
+ } else if (C->getZExtValue() == 1) {
|
|
+ ImmReg = AMDGPU::ONE_INT;
|
|
+ } else {
|
|
+ ImmValue = C->getZExtValue();
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
|
|
+ Use != SDNode::use_end(); Use = Next) {
|
|
+ Next = llvm::next(Use);
|
|
+ std::vector<SDValue> Ops;
|
|
+ for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
|
|
+ Ops.push_back(Use->getOperand(i));
|
|
+ }
|
|
+
|
|
+ if (!Use->isMachineOpcode()) {
|
|
+ if (ImmReg == AMDGPU::ALU_LITERAL_X) {
|
|
+ // We can only use literal constants (e.g. AMDGPU::ZERO,
|
|
+ // AMDGPU::ONE, etc) in machine opcodes.
|
|
+ continue;
|
|
+ }
|
|
+ } else {
|
|
+ if (!TII->isALUInstr(Use->getMachineOpcode()) ||
|
|
+ (TII->get(Use->getMachineOpcode()).TSFlags &
|
|
+ R600_InstFlag::VECTOR)) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
|
|
+ assert(ImmIdx != -1);
|
|
+
|
|
+ // subtract one from ImmIdx, because the DST operand is usually index
|
|
+ // 0 for MachineInstrs, but we have no DST in the Ops vector.
|
|
+ ImmIdx--;
|
|
+
|
|
+ // Check that we aren't already using an immediate.
|
|
+ // XXX: It's possible for an instruction to have more than one
|
|
+ // immediate operand, but this is not supported yet.
|
|
+ if (ImmReg == AMDGPU::ALU_LITERAL_X) {
|
|
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
|
|
+ assert(C);
|
|
+
|
|
+ if (C->getZExtValue() != 0) {
|
|
+ // This instruction is already using an immediate.
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ // Set the immediate value
|
|
+ Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
|
|
+ }
|
|
+ }
|
|
+ // Set the immediate register
|
|
+ Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
|
|
+
|
|
+ CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ SDNode *Result = SelectCode(N);
|
|
+
|
|
+ // Fold operands of selected node
|
|
+
|
|
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
|
|
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
|
|
+ const R600InstrInfo *TII =
|
|
+ static_cast<const R600InstrInfo*>(TM.getInstrInfo());
|
|
+ if (Result && Result->isMachineOpcode() &&
|
|
+ !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
|
|
+ && TII->isALUInstr(Result->getMachineOpcode())) {
|
|
+ // Fold FNEG/FABS/CONST_ADDRESS
|
|
+ // TODO: ISel can generate multiple MachineInstrs; we need to recursively
|
|
+ // parse Result
|
|
+ bool IsModified = false;
|
|
+ do {
|
|
+ std::vector<SDValue> Ops;
|
|
+ for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
|
|
+ I != E; ++I)
|
|
+ Ops.push_back(*I);
|
|
+ IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
|
|
+ if (IsModified) {
|
|
+ Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
|
|
+ }
|
|
+ } while (IsModified);
|
|
+
|
|
+ // If node has a single use which is CLAMP_R600, fold it
|
|
+ if (Result->hasOneUse() && Result->isMachineOpcode()) {
|
|
+ SDNode *PotentialClamp = *Result->use_begin();
|
|
+ if (PotentialClamp->isMachineOpcode() &&
|
|
+ PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
|
|
+ unsigned ClampIdx =
|
|
+ TII->getOperandIdx(Result->getMachineOpcode(), R600Operands::CLAMP);
|
|
+ std::vector<SDValue> Ops;
|
|
+ unsigned NumOp = Result->getNumOperands();
|
|
+ for (unsigned i = 0; i < NumOp; ++i) {
|
|
+ Ops.push_back(Result->getOperand(i));
|
|
+ }
|
|
+ Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
|
|
+ Result = CurDAG->SelectNodeTo(PotentialClamp,
|
|
+ Result->getMachineOpcode(), PotentialClamp->getVTList(),
|
|
+ Ops.data(), NumOp);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return Result;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
|
|
+ const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
|
|
+ int OperandIdx[] = {
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC0),
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC1),
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC2)
|
|
+ };
|
|
+ int SelIdx[] = {
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL),
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL),
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL)
|
|
+ };
|
|
+ int NegIdx[] = {
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG),
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG),
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC2_NEG)
|
|
+ };
|
|
+ int AbsIdx[] = {
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS),
|
|
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS),
|
|
+ -1
|
|
+ };
|
|
+
|
|
+ for (unsigned i = 0; i < 3; i++) {
|
|
+ if (OperandIdx[i] < 0)
|
|
+ return false;
|
|
+ SDValue Operand = Ops[OperandIdx[i] - 1];
|
|
+ switch (Operand.getOpcode()) {
|
|
+ case AMDGPUISD::CONST_ADDRESS: {
|
|
+ if (i == 2)
|
|
+ break;
|
|
+ SDValue CstOffset;
|
|
+ if (!Operand.getValueType().isVector() &&
|
|
+ SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
|
|
+ Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
|
|
+ Ops[SelIdx[i] - 1] = CstOffset;
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ case ISD::FNEG:
|
|
+ if (NegIdx[i] < 0)
|
|
+ break;
|
|
+ Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
|
|
+ Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
|
|
+ return true;
|
|
+ case ISD::FABS:
|
|
+ if (AbsIdx[i] < 0)
|
|
+ break;
|
|
+ Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
|
|
+ Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
|
|
+ return true;
|
|
+ case ISD::BITCAST:
|
|
+ Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
|
|
+ return true;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
|
|
+ if (!ptr) {
|
|
+ return false;
|
|
+ }
|
|
+ Type *ptrType = ptr->getType();
|
|
+ return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
|
|
+}
|
|
+
|
|
+const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) {
|
|
+ if (!V) {
|
|
+ return NULL;
|
|
+ }
|
|
+ const Value *ret = NULL;
|
|
+ ValueMap<const Value *, bool> ValueBitMap;
|
|
+ std::queue<const Value *, std::list<const Value *> > ValueQueue;
|
|
+ ValueQueue.push(V);
|
|
+ while (!ValueQueue.empty()) {
|
|
+ V = ValueQueue.front();
|
|
+ if (ValueBitMap.find(V) == ValueBitMap.end()) {
|
|
+ ValueBitMap[V] = true;
|
|
+ if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
|
|
+ ret = V;
|
|
+ break;
|
|
+ } else if (dyn_cast<GlobalVariable>(V)) {
|
|
+ ret = V;
|
|
+ break;
|
|
+ } else if (dyn_cast<Constant>(V)) {
|
|
+ const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
|
|
+ if (CE) {
|
|
+ ValueQueue.push(CE->getOperand(0));
|
|
+ }
|
|
+ } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
|
|
+ ret = AI;
|
|
+ break;
|
|
+ } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
|
|
+ uint32_t numOps = I->getNumOperands();
|
|
+ for (uint32_t x = 0; x < numOps; ++x) {
|
|
+ ValueQueue.push(I->getOperand(x));
|
|
+ }
|
|
+ } else {
|
|
+ assert(!"Found a Value that we didn't know how to handle!");
|
|
+ }
|
|
+ }
|
|
+ ValueQueue.pop();
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
|
|
+ return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
|
|
+ && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
|
|
+ && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
|
|
+ if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) {
|
|
+ return true;
|
|
+ }
|
|
+ MachineMemOperand *MMO = N->getMemOperand();
|
|
+ const Value *V = MMO->getValue();
|
|
+ const Value *BV = getBasePointerValue(V);
|
|
+ if (MMO
|
|
+ && MMO->getValue()
|
|
+ && ((V && dyn_cast<GlobalValue>(V))
|
|
+ || (BV && dyn_cast<GlobalValue>(
|
|
+ getBasePointerValue(MMO->getValue()))))) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS);
|
|
+ } else {
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) {
|
|
+ return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
|
|
+ MachineMemOperand *MMO = N->getMemOperand();
|
|
+ if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
|
|
+ if (MMO) {
|
|
+ const Value *V = MMO->getValue();
|
|
+ const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
|
|
+ if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
|
|
+ if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
|
|
+ // Check to make sure we are not a constant pool load or a constant load
|
|
+ // that is marked as a private load
|
|
+ if (isCPLoad(N) || isConstantLoad(N, -1)) {
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
|
|
+ && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
|
|
+ && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
|
|
+ && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
|
|
+ && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
|
|
+ && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+const char *AMDGPUDAGToDAGISel::getPassName() const {
|
|
+ return "AMDGPU DAG->DAG Pattern Instruction Selection";
|
|
+}
|
|
+
|
|
+#ifdef DEBUGTMP
|
|
+#undef INT64_C
|
|
+#endif
|
|
+#undef DEBUGTMP
|
|
+
|
|
+///==== AMDGPU Functions ====///
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
|
|
+ SDValue& IntPtr) {
|
|
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
|
|
+ IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
|
|
+ SDValue& BaseReg, SDValue &Offset) {
|
|
+ if (!dyn_cast<ConstantSDNode>(Addr)) {
|
|
+ BaseReg = Addr;
|
|
+ Offset = CurDAG->getIntPtrConstant(0, true);
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
|
|
+ SDValue &Offset) {
|
|
+ ConstantSDNode * IMMOffset;
|
|
+
|
|
+ if (Addr.getOpcode() == ISD::ADD
|
|
+ && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
|
|
+ && isInt<16>(IMMOffset->getZExtValue())) {
|
|
+
|
|
+ Base = Addr.getOperand(0);
|
|
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
|
|
+ return true;
|
|
+ // If the pointer address is constant, we can move it to the offset field.
|
|
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
|
|
+ && isInt<16>(IMMOffset->getZExtValue())) {
|
|
+ Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
|
|
+ CurDAG->getEntryNode().getDebugLoc(),
|
|
+ AMDGPU::ZERO, MVT::i32);
|
|
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ // Default case, no offset
|
|
+ Base = Addr;
|
|
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
|
|
+ SDValue &Offset) {
|
|
+ ConstantSDNode *C;
|
|
+
|
|
+ if ((C = dyn_cast<ConstantSDNode>(Addr))) {
|
|
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
|
|
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
|
|
+ } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
|
|
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
|
|
+ Base = Addr.getOperand(0);
|
|
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
|
|
+ } else {
|
|
+ Base = Addr;
|
|
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
|
|
new file mode 100644
|
|
index 0000000..8bfd30c
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILISelLowering.cpp
|
|
@@ -0,0 +1,651 @@
|
|
+//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief TargetLowering functions borrowed from AMDIL.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUISelLowering.h"
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+#include "AMDILDevices.h"
|
|
+#include "AMDILIntrinsicInfo.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+#include "llvm/CallingConv.h"
|
|
+#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/CodeGen/PseudoSourceValue.h"
|
|
+#include "llvm/CodeGen/SelectionDAG.h"
|
|
+#include "llvm/CodeGen/SelectionDAGNodes.h"
|
|
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
|
+#include "llvm/DerivedTypes.h"
|
|
+#include "llvm/Instructions.h"
|
|
+#include "llvm/Intrinsics.h"
|
|
+#include "llvm/Support/raw_ostream.h"
|
|
+#include "llvm/Target/TargetInstrInfo.h"
|
|
+#include "llvm/Target/TargetOptions.h"
|
|
+
|
|
+using namespace llvm;
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Calling Convention Implementation
|
|
+//===----------------------------------------------------------------------===//
|
|
+#include "AMDGPUGenCallingConv.inc"
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// TargetLowering Implementation Help Functions End
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// TargetLowering Class Implementation Begins
|
|
+//===----------------------------------------------------------------------===//
|
|
+void AMDGPUTargetLowering::InitAMDILLowering() {
|
|
+ int types[] = {
|
|
+ (int)MVT::i8,
|
|
+ (int)MVT::i16,
|
|
+ (int)MVT::i32,
|
|
+ (int)MVT::f32,
|
|
+ (int)MVT::f64,
|
|
+ (int)MVT::i64,
|
|
+ (int)MVT::v2i8,
|
|
+ (int)MVT::v4i8,
|
|
+ (int)MVT::v2i16,
|
|
+ (int)MVT::v4i16,
|
|
+ (int)MVT::v4f32,
|
|
+ (int)MVT::v4i32,
|
|
+ (int)MVT::v2f32,
|
|
+ (int)MVT::v2i32,
|
|
+ (int)MVT::v2f64,
|
|
+ (int)MVT::v2i64
|
|
+ };
|
|
+
|
|
+ int IntTypes[] = {
|
|
+ (int)MVT::i8,
|
|
+ (int)MVT::i16,
|
|
+ (int)MVT::i32,
|
|
+ (int)MVT::i64
|
|
+ };
|
|
+
|
|
+ int FloatTypes[] = {
|
|
+ (int)MVT::f32,
|
|
+ (int)MVT::f64
|
|
+ };
|
|
+
|
|
+ int VectorTypes[] = {
|
|
+ (int)MVT::v2i8,
|
|
+ (int)MVT::v4i8,
|
|
+ (int)MVT::v2i16,
|
|
+ (int)MVT::v4i16,
|
|
+ (int)MVT::v4f32,
|
|
+ (int)MVT::v4i32,
|
|
+ (int)MVT::v2f32,
|
|
+ (int)MVT::v2i32,
|
|
+ (int)MVT::v2f64,
|
|
+ (int)MVT::v2i64
|
|
+ };
|
|
+ size_t NumTypes = sizeof(types) / sizeof(*types);
|
|
+ size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
|
|
+ size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
|
|
+ size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
|
|
+
|
|
+ const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
|
|
+ // These are the current register classes that are
|
|
+ // supported
|
|
+
|
|
+ for (unsigned int x = 0; x < NumTypes; ++x) {
|
|
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
|
|
+
|
|
+ //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
|
|
+ // We cannot sextinreg, expand to shifts
|
|
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
|
|
+ setOperationAction(ISD::SUBE, VT, Expand);
|
|
+ setOperationAction(ISD::SUBC, VT, Expand);
|
|
+ setOperationAction(ISD::ADDE, VT, Expand);
|
|
+ setOperationAction(ISD::ADDC, VT, Expand);
|
|
+ setOperationAction(ISD::BRCOND, VT, Custom);
|
|
+ setOperationAction(ISD::BR_JT, VT, Expand);
|
|
+ setOperationAction(ISD::BRIND, VT, Expand);
|
|
+ // TODO: Implement custom UREM/SREM routines
|
|
+ setOperationAction(ISD::SREM, VT, Expand);
|
|
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
|
|
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
|
|
+ if (VT != MVT::i64 && VT != MVT::v2i64) {
|
|
+ setOperationAction(ISD::SDIV, VT, Custom);
|
|
+ }
|
|
+ }
|
|
+ for (unsigned int x = 0; x < NumFloatTypes; ++x) {
|
|
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
|
|
+
|
|
+ // IL does not have these operations for floating point types
|
|
+ setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
|
|
+ setOperationAction(ISD::SETOLT, VT, Expand);
|
|
+ setOperationAction(ISD::SETOGE, VT, Expand);
|
|
+ setOperationAction(ISD::SETOGT, VT, Expand);
|
|
+ setOperationAction(ISD::SETOLE, VT, Expand);
|
|
+ setOperationAction(ISD::SETULT, VT, Expand);
|
|
+ setOperationAction(ISD::SETUGE, VT, Expand);
|
|
+ setOperationAction(ISD::SETUGT, VT, Expand);
|
|
+ setOperationAction(ISD::SETULE, VT, Expand);
|
|
+ }
|
|
+
|
|
+ for (unsigned int x = 0; x < NumIntTypes; ++x) {
|
|
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
|
|
+
|
|
+ // GPU also does not have divrem function for signed or unsigned
|
|
+ setOperationAction(ISD::SDIVREM, VT, Expand);
|
|
+
|
|
+ // GPU does not have [S|U]MUL_LOHI functions as a single instruction
|
|
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
|
|
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
|
|
+
|
|
+ // GPU doesn't have a rotl, rotr, or byteswap instruction
|
|
+ setOperationAction(ISD::ROTR, VT, Expand);
|
|
+ setOperationAction(ISD::BSWAP, VT, Expand);
|
|
+
|
|
+ // GPU doesn't have any counting operators
|
|
+ setOperationAction(ISD::CTPOP, VT, Expand);
|
|
+ setOperationAction(ISD::CTTZ, VT, Expand);
|
|
+ setOperationAction(ISD::CTLZ, VT, Expand);
|
|
+ }
|
|
+
|
|
+ for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
|
|
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
|
|
+
|
|
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
|
|
+ setOperationAction(ISD::SDIVREM, VT, Expand);
|
|
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
|
|
+ // setOperationAction(ISD::VSETCC, VT, Expand);
|
|
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
|
|
+
|
|
+ }
|
|
+ if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
|
|
+ setOperationAction(ISD::MULHU, MVT::i64, Expand);
|
|
+ setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
|
|
+ setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::ADD, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::SREM, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::Constant , MVT::i64 , Legal);
|
|
+ setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
|
|
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
|
|
+ }
|
|
+ if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
|
|
+ // we support loading/storing v2f64 but not operations on the type
|
|
+ setOperationAction(ISD::FADD, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::ConstantFP , MVT::f64 , Legal);
|
|
+ // We want to expand vector conversions into their scalar
|
|
+ // counterparts.
|
|
+ setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
|
|
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
|
|
+ setOperationAction(ISD::FABS, MVT::v2f64, Expand);
|
|
+ }
|
|
+ // TODO: Fix the UDIV24 algorithm so it works for these
|
|
+ // types correctly. This needs vector comparisons
|
|
+ // for this to work correctly.
|
|
+ setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
|
|
+ setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
|
|
+ setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
|
|
+ setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
|
|
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
|
|
+ setOperationAction(ISD::SUBC, MVT::Other, Expand);
|
|
+ setOperationAction(ISD::ADDE, MVT::Other, Expand);
|
|
+ setOperationAction(ISD::ADDC, MVT::Other, Expand);
|
|
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
|
|
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
|
|
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
|
|
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
|
|
+
|
|
+
|
|
+ // Use the default implementation.
|
|
+ setOperationAction(ISD::ConstantFP , MVT::f32 , Legal);
|
|
+ setOperationAction(ISD::Constant , MVT::i32 , Legal);
|
|
+
|
|
+ setSchedulingPreference(Sched::RegPressure);
|
|
+ setPow2DivIsCheap(false);
|
|
+ setSelectIsExpensive(true);
|
|
+ setJumpIsExpensive(true);
|
|
+
|
|
+ maxStoresPerMemcpy = 4096;
|
|
+ maxStoresPerMemmove = 4096;
|
|
+ maxStoresPerMemset = 4096;
|
|
+
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
|
+ const CallInst &I, unsigned Intrinsic) const {
|
|
+ return false;
|
|
+}
|
|
+
|
|
+// The backend supports 32 and 64 bit floating point immediates
|
|
+bool
|
|
+AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
|
|
+ if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
|
|
+ || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
|
|
+ return true;
|
|
+ } else {
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
|
|
+ if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
|
|
+ || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
|
|
+ return false;
|
|
+ } else {
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+// computeMaskedBitsForTargetNode - Determine which bits of 'Op' are known to
|
|
+// be zero. Op is expected to be a target specific node. Used by DAG
|
|
+// combiner.
|
|
+
|
|
+void
|
|
+AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
|
|
+ const SDValue Op,
|
|
+ APInt &KnownZero,
|
|
+ APInt &KnownOne,
|
|
+ const SelectionDAG &DAG,
|
|
+ unsigned Depth) const {
|
|
+ APInt KnownZero2;
|
|
+ APInt KnownOne2;
|
|
+ KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
|
|
+ switch (Op.getOpcode()) {
|
|
+ default: break;
|
|
+ case ISD::SELECT_CC:
|
|
+ DAG.ComputeMaskedBits(
|
|
+ Op.getOperand(1),
|
|
+ KnownZero,
|
|
+ KnownOne,
|
|
+ Depth + 1
|
|
+ );
|
|
+ DAG.ComputeMaskedBits(
|
|
+ Op.getOperand(0),
|
|
+ KnownZero2,
|
|
+ KnownOne2
|
|
+ );
|
|
+ assert((KnownZero & KnownOne) == 0
|
|
+ && "Bits known to be one AND zero?");
|
|
+ assert((KnownZero2 & KnownOne2) == 0
|
|
+ && "Bits known to be one AND zero?");
|
|
+ // Only known if known in both the LHS and RHS
|
|
+ KnownOne &= KnownOne2;
|
|
+ KnownZero &= KnownZero2;
|
|
+ break;
|
|
+ };
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Other Lowering Hooks
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
|
|
+ EVT OVT = Op.getValueType();
|
|
+ SDValue DST;
|
|
+ if (OVT.getScalarType() == MVT::i64) {
|
|
+ DST = LowerSDIV64(Op, DAG);
|
|
+ } else if (OVT.getScalarType() == MVT::i32) {
|
|
+ DST = LowerSDIV32(Op, DAG);
|
|
+ } else if (OVT.getScalarType() == MVT::i16
|
|
+ || OVT.getScalarType() == MVT::i8) {
|
|
+ DST = LowerSDIV24(Op, DAG);
|
|
+ } else {
|
|
+ DST = SDValue(Op.getNode(), 0);
|
|
+ }
|
|
+ return DST;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
|
|
+ EVT OVT = Op.getValueType();
|
|
+ SDValue DST;
|
|
+ if (OVT.getScalarType() == MVT::i64) {
|
|
+ DST = LowerSREM64(Op, DAG);
|
|
+ } else if (OVT.getScalarType() == MVT::i32) {
|
|
+ DST = LowerSREM32(Op, DAG);
|
|
+ } else if (OVT.getScalarType() == MVT::i16) {
|
|
+ DST = LowerSREM16(Op, DAG);
|
|
+ } else if (OVT.getScalarType() == MVT::i8) {
|
|
+ DST = LowerSREM8(Op, DAG);
|
|
+ } else {
|
|
+ DST = SDValue(Op.getNode(), 0);
|
|
+ }
|
|
+ return DST;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
|
|
+ SDValue Data = Op.getOperand(0);
|
|
+ VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT DVT = Data.getValueType();
|
|
+ EVT BVT = BaseType->getVT();
|
|
+ unsigned baseBits = BVT.getScalarType().getSizeInBits();
|
|
+ unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
|
|
+ unsigned shiftBits = srcBits - baseBits;
|
|
+ if (srcBits < 32) {
|
|
+ // If the op is less than 32 bits, then it needs to be extended to 32 bits
|
|
+ // so it can properly keep the upper bits valid.
|
|
+ EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
|
|
+ Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
|
|
+ shiftBits = 32 - baseBits;
|
|
+ DVT = IVT;
|
|
+ }
|
|
+ SDValue Shift = DAG.getConstant(shiftBits, DVT);
|
|
+ // Shift left by 'Shift' bits.
|
|
+ Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
|
|
+ // Signed shift Right by 'Shift' bits.
|
|
+ Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
|
|
+ if (srcBits < 32) {
|
|
+ // Once the sign extension is done, the op needs to be converted to
|
|
+ // its original type.
|
|
+ Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
|
|
+ }
|
|
+ return Data;
|
|
+}
|
|
+EVT
|
|
+AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
|
|
+ int iSize = (size * numEle);
|
|
+ int vEle = (iSize >> ((size == 64) ? 6 : 5));
|
|
+ if (!vEle) {
|
|
+ vEle = 1;
|
|
+ }
|
|
+ if (size == 64) {
|
|
+ if (vEle == 1) {
|
|
+ return EVT(MVT::i64);
|
|
+ } else {
|
|
+ return EVT(MVT::getVectorVT(MVT::i64, vEle));
|
|
+ }
|
|
+ } else {
|
|
+ if (vEle == 1) {
|
|
+ return EVT(MVT::i32);
|
|
+ } else {
|
|
+ return EVT(MVT::getVectorVT(MVT::i32, vEle));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
|
|
+ SDValue Chain = Op.getOperand(0);
|
|
+ SDValue Cond = Op.getOperand(1);
|
|
+ SDValue Jump = Op.getOperand(2);
|
|
+ SDValue Result;
|
|
+ Result = DAG.getNode(
|
|
+ AMDGPUISD::BRANCH_COND,
|
|
+ Op.getDebugLoc(),
|
|
+ Op.getValueType(),
|
|
+ Chain, Jump, Cond);
|
|
+ return Result;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT OVT = Op.getValueType();
|
|
+ SDValue LHS = Op.getOperand(0);
|
|
+ SDValue RHS = Op.getOperand(1);
|
|
+ MVT INTTY;
|
|
+ MVT FLTTY;
|
|
+ if (!OVT.isVector()) {
|
|
+ INTTY = MVT::i32;
|
|
+ FLTTY = MVT::f32;
|
|
+ } else if (OVT.getVectorNumElements() == 2) {
|
|
+ INTTY = MVT::v2i32;
|
|
+ FLTTY = MVT::v2f32;
|
|
+ } else if (OVT.getVectorNumElements() == 4) {
|
|
+ INTTY = MVT::v4i32;
|
|
+ FLTTY = MVT::v4f32;
|
|
+ }
|
|
+ unsigned bitsize = OVT.getScalarType().getSizeInBits();
|
|
+ // char|short jq = ia ^ ib;
|
|
+ SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
|
|
+
|
|
+ // jq = jq >> (bitsize - 2)
|
|
+ jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
|
|
+
|
|
+ // jq = jq | 0x1
|
|
+ jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
|
|
+
|
|
+ // jq = (int)jq
|
|
+ jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
|
|
+
|
|
+ // int ia = (int)LHS;
|
|
+ SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
|
|
+
|
|
+ // int ib = (int)RHS;
|
|
+ SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
|
|
+
|
|
+ // float fa = (float)ia;
|
|
+ SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
|
|
+
|
|
+ // float fb = (float)ib;
|
|
+ SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
|
|
+
|
|
+ // float fq = native_divide(fa, fb);
|
|
+ SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
|
|
+
|
|
+ // fq = trunc(fq);
|
|
+ fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
|
|
+
|
|
+ // float fqneg = -fq;
|
|
+ SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
|
|
+
|
|
+ // float fr = mad(fqneg, fb, fa);
|
|
+ SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa);
|
|
+
|
|
+ // int iq = (int)fq;
|
|
+ SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
|
|
+
|
|
+ // fr = fabs(fr);
|
|
+ fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
|
|
+
|
|
+ // fb = fabs(fb);
|
|
+ fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
|
|
+
|
|
+ // int cv = fr >= fb;
|
|
+ SDValue cv;
|
|
+ if (INTTY == MVT::i32) {
|
|
+ cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
|
|
+ } else {
|
|
+ cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
|
|
+ }
|
|
+ // jq = (cv ? jq : 0);
|
|
+ jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
|
|
+ DAG.getConstant(0, OVT));
|
|
+ // dst = iq + jq;
|
|
+ iq = DAG.getSExtOrTrunc(iq, DL, OVT);
|
|
+ iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
|
|
+ return iq;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT OVT = Op.getValueType();
|
|
+ SDValue LHS = Op.getOperand(0);
|
|
+ SDValue RHS = Op.getOperand(1);
|
|
+ // The LowerSDIV32 function generates code equivalent to the following IL.
|
|
+ // mov r0, LHS
|
|
+ // mov r1, RHS
|
|
+ // ilt r10, r0, 0
|
|
+ // ilt r11, r1, 0
|
|
+ // iadd r0, r0, r10
|
|
+ // iadd r1, r1, r11
|
|
+ // ixor r0, r0, r10
|
|
+ // ixor r1, r1, r11
|
|
+ // udiv r0, r0, r1
|
|
+ // ixor r10, r10, r11
|
|
+ // iadd r0, r0, r10
|
|
+ // ixor DST, r0, r10
|
|
+
|
|
+ // mov r0, LHS
|
|
+ SDValue r0 = LHS;
|
|
+
|
|
+ // mov r1, RHS
|
|
+ SDValue r1 = RHS;
|
|
+
|
|
+ // ilt r10, r0, 0
|
|
+ SDValue r10 = DAG.getSelectCC(DL,
|
|
+ r0, DAG.getConstant(0, OVT),
|
|
+ DAG.getConstant(-1, MVT::i32),
|
|
+ DAG.getConstant(0, MVT::i32),
|
|
+ ISD::SETLT);
|
|
+
|
|
+ // ilt r11, r1, 0
|
|
+ SDValue r11 = DAG.getSelectCC(DL,
|
|
+ r1, DAG.getConstant(0, OVT),
|
|
+ DAG.getConstant(-1, MVT::i32),
|
|
+ DAG.getConstant(0, MVT::i32),
|
|
+ ISD::SETLT);
|
|
+
|
|
+ // iadd r0, r0, r10
|
|
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
|
|
+
|
|
+ // iadd r1, r1, r11
|
|
+ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
|
|
+
|
|
+ // ixor r0, r0, r10
|
|
+ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
|
|
+
|
|
+ // ixor r1, r1, r11
|
|
+ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
|
|
+
|
|
+ // udiv r0, r0, r1
|
|
+ r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
|
|
+
|
|
+ // ixor r10, r10, r11
|
|
+ r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
|
|
+
|
|
+ // iadd r0, r0, r10
|
|
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
|
|
+
|
|
+ // ixor DST, r0, r10
|
|
+ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
|
|
+ return DST;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
|
|
+ return SDValue(Op.getNode(), 0);
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT OVT = Op.getValueType();
|
|
+ MVT INTTY = MVT::i32;
|
|
+ if (OVT == MVT::v2i8) {
|
|
+ INTTY = MVT::v2i32;
|
|
+ } else if (OVT == MVT::v4i8) {
|
|
+ INTTY = MVT::v4i32;
|
|
+ }
|
|
+ SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
|
|
+ SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
|
|
+ LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
|
|
+ LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
|
|
+ return LHS;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT OVT = Op.getValueType();
|
|
+ MVT INTTY = MVT::i32;
|
|
+ if (OVT == MVT::v2i16) {
|
|
+ INTTY = MVT::v2i32;
|
|
+ } else if (OVT == MVT::v4i16) {
|
|
+ INTTY = MVT::v4i32;
|
|
+ }
|
|
+ SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
|
|
+ SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
|
|
+ LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
|
|
+ LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
|
|
+ return LHS;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT OVT = Op.getValueType();
|
|
+ SDValue LHS = Op.getOperand(0);
|
|
+ SDValue RHS = Op.getOperand(1);
|
|
+ // The LowerSREM32 function generates code equivalent to the following IL.
|
|
+ // mov r0, LHS
|
|
+ // mov r1, RHS
|
|
+ // ilt r10, r0, 0
|
|
+ // ilt r11, r1, 0
|
|
+ // iadd r0, r0, r10
|
|
+ // iadd r1, r1, r11
|
|
+ // ixor r0, r0, r10
|
|
+ // ixor r1, r1, r11
|
|
+ // udiv r20, r0, r1
|
|
+ // umul r20, r20, r1
|
|
+ // sub r0, r0, r20
|
|
+ // iadd r0, r0, r10
|
|
+ // ixor DST, r0, r10
|
|
+
|
|
+ // mov r0, LHS
|
|
+ SDValue r0 = LHS;
|
|
+
|
|
+ // mov r1, RHS
|
|
+ SDValue r1 = RHS;
|
|
+
|
|
+ // ilt r10, r0, 0
|
|
+ SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
|
|
+
|
|
+ // ilt r11, r1, 0
|
|
+ SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
|
|
+
|
|
+ // iadd r0, r0, r10
|
|
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
|
|
+
|
|
+ // iadd r1, r1, r11
|
|
+ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
|
|
+
|
|
+ // ixor r0, r0, r10
|
|
+ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
|
|
+
|
|
+ // ixor r1, r1, r11
|
|
+ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
|
|
+
|
|
+ // udiv r20, r0, r1
|
|
+ SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
|
|
+
|
|
+ // umul r20, r20, r1
|
|
+ r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
|
|
+
|
|
+ // sub r0, r0, r20
|
|
+ r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
|
|
+
|
|
+ // iadd r0, r0, r10
|
|
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
|
|
+
|
|
+ // ixor DST, r0, r10
|
|
+ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
|
|
+ return DST;
|
|
+}
|
|
+
|
|
+SDValue
|
|
+AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
|
|
+ return SDValue(Op.getNode(), 0);
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
|
|
new file mode 100644
|
|
index 0000000..e969bbf
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILInstrInfo.td
|
|
@@ -0,0 +1,208 @@
|
|
+//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+// This file describes the AMDIL instructions in TableGen format.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+// AMDIL Instruction Predicate Definitions
|
|
+// Predicate that is set to true if the hardware supports double precision
|
|
+// divide
|
|
+def HasHWDDiv : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && "
|
|
+ "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
|
|
+
|
|
+// Predicate that is set to true if the hardware supports double, but not double
|
|
+// precision divide in hardware
|
|
+def HasSWDDiv : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
|
|
+ "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
|
|
+
|
|
+// Predicate that is set to true if the hardware supports 24bit signed
|
|
+// math ops. Otherwise a software expansion to 32bit math ops is used instead.
|
|
+def HasHWSign24Bit : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">;
|
|
+
|
|
+// Predicate that is set to true if 64bit operations are supported
|
|
+def HasHW64Bit : Predicate<"Subtarget.device()"
|
|
+ "->usesHardware(AMDGPUDeviceInfo::LongOps)">;
|
|
+def HasSW64Bit : Predicate<"Subtarget.device()"
|
|
+ "->usesSoftware(AMDGPUDeviceInfo::LongOps)">;
|
|
+
|
|
+// Predicate that is set to true if the timer register is supported
|
|
+def HasTmrRegister : Predicate<"Subtarget.device()"
|
|
+ "->isSupported(AMDGPUDeviceInfo::TmrReg)">;
|
|
+// Predicate that is true if we are at least evergreen series
|
|
+def HasDeviceIDInst : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">;
|
|
+
|
|
+// Predicate that is true if we have region address space.
|
|
+def hasRegionAS : Predicate<"Subtarget.device()"
|
|
+ "->usesHardware(AMDGPUDeviceInfo::RegionMem)">;
|
|
+
|
|
+// Predicate that is false if we don't have region address space.
|
|
+def noRegionAS : Predicate<"!Subtarget.device()"
|
|
+ "->isSupported(AMDGPUDeviceInfo::RegionMem)">;
|
|
+
|
|
+
|
|
+// Predicate that is set to true if 64bit Mul is supported in the IL
|
|
+def HasHW64Mul : Predicate<"Subtarget.calVersion()"
|
|
+ ">= CAL_VERSION_SC_139"
|
|
+ "&& Subtarget.device()"
|
|
+ "->getGeneration() >="
|
|
+ "AMDGPUDeviceInfo::HD5XXX">;
|
|
+def HasSW64Mul : Predicate<"Subtarget.calVersion()"
|
|
+ "< CAL_VERSION_SC_139">;
|
|
+// Predicate that is set to true if 64bit Div/Mod is supported in the IL
|
|
+def HasHW64DivMod : Predicate<"Subtarget.device()"
|
|
+ "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">;
|
|
+def HasSW64DivMod : Predicate<"Subtarget.device()"
|
|
+ "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">;
|
|
+
|
|
+// Predicate that is set to true if 64bit pointer are used.
|
|
+def Has64BitPtr : Predicate<"Subtarget.is64bit()">;
|
|
+def Has32BitPtr : Predicate<"!Subtarget.is64bit()">;
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Custom Operands
|
|
+//===--------------------------------------------------------------------===//
|
|
+def brtarget : Operand<OtherVT>;
|
|
+
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Custom Selection DAG Type Profiles
|
|
+//===--------------------------------------------------------------------===//
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Generic Profile Types
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
|
|
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
|
|
+ ]>;
|
|
+def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
|
|
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
|
|
+ ]>;
|
|
+def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
|
|
+ SDTCisEltOfVec<1, 0>
|
|
+ ]>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Flow Control Profile Types
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Branch instruction where second and third are basic blocks
|
|
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
|
|
+ SDTCisVT<0, OtherVT>
|
|
+ ]>;
|
|
+
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Custom Selection DAG Nodes
|
|
+//===--------------------------------------------------------------------===//
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Flow Control DAG Nodes
|
|
+//===----------------------------------------------------------------------===//
|
|
+def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Call/Return DAG Nodes
|
|
+//===----------------------------------------------------------------------===//
|
|
+def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
|
|
+ [SDNPHasChain, SDNPOptInGlue]>;
|
|
+
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Instructions
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Floating point math functions
|
|
+def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>;
|
|
+def IL_mad : SDNode<"AMDGPUISD::MAD", SDTIL_GenTernaryOp>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Integer functions
|
|
+//===----------------------------------------------------------------------===//
|
|
+def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp,
|
|
+ [SDNPCommutative, SDNPAssociative]>;
|
|
+
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Custom Pattern DAG Nodes
|
|
+//===--------------------------------------------------------------------===//
|
|
+def global_store : PatFrag<(ops node:$val, node:$ptr),
|
|
+ (store node:$val, node:$ptr), [{
|
|
+ return isGlobalStore(dyn_cast<StoreSDNode>(N));
|
|
+}]>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Load pattern fragments
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Global address space loads
|
|
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
|
|
+ return isGlobalLoad(dyn_cast<LoadSDNode>(N));
|
|
+}]>;
|
|
+// Constant address space loads
|
|
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
|
|
+ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
|
|
+}]>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Complex addressing mode patterns
|
|
+//===----------------------------------------------------------------------===//
|
|
+def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
|
|
+def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
|
|
+def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
|
|
+def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Instruction format classes
|
|
+//===----------------------------------------------------------------------===//
|
|
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
|
|
+: Instruction {
|
|
+
|
|
+ let Namespace = "AMDGPU";
|
|
+ dag OutOperandList = outs;
|
|
+ dag InOperandList = ins;
|
|
+ let Pattern = pattern;
|
|
+ let AsmString = !strconcat(asmstr, "\n");
|
|
+ let isPseudo = 1;
|
|
+ let Itinerary = NullALU;
|
|
+ bit hasIEEEFlag = 0;
|
|
+ bit hasZeroOpFlag = 0;
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Multiclass Instruction formats
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Multiclass that handles branch instructions
|
|
+multiclass BranchConditional<SDNode Op> {
|
|
+ def _i32 : ILFormat<(outs),
|
|
+ (ins brtarget:$target, GPRI32:$src0),
|
|
+ "; i32 Pseudo branch instruction",
|
|
+ [(Op bb:$target, GPRI32:$src0)]>;
|
|
+ def _f32 : ILFormat<(outs),
|
|
+ (ins brtarget:$target, GPRF32:$src0),
|
|
+ "; f32 Pseudo branch instruction",
|
|
+ [(Op bb:$target, GPRF32:$src0)]>;
|
|
+}
|
|
+
|
|
+// Only scalar types should generate flow control
|
|
+multiclass BranchInstr<string name> {
|
|
+ def _i32 : ILFormat<(outs), (ins GPRI32:$src),
|
|
+ !strconcat(name, " $src"), []>;
|
|
+ def _f32 : ILFormat<(outs), (ins GPRF32:$src),
|
|
+ !strconcat(name, " $src"), []>;
|
|
+}
|
|
+// Only scalar types should generate flow control
|
|
+multiclass BranchInstr2<string name> {
|
|
+ def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1),
|
|
+ !strconcat(name, " $src0, $src1"), []>;
|
|
+ def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1),
|
|
+ !strconcat(name, " $src0, $src1"), []>;
|
|
+}
|
|
+
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Intrinsics support
|
|
+//===--------------------------------------------------------------------===//
|
|
+include "AMDILIntrinsics.td"
|
|
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..02d06d6
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp
|
|
@@ -0,0 +1,79 @@
|
|
+//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief AMDGPU Implementation of the IntrinsicInfo class.
|
|
+//
|
|
+//===-----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDILIntrinsicInfo.h"
|
|
+#include "AMDIL.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+#include "llvm/DerivedTypes.h"
|
|
+#include "llvm/Intrinsics.h"
|
|
+#include "llvm/Module.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
|
|
+#include "AMDGPUGenIntrinsics.inc"
|
|
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
|
|
+
|
|
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
|
|
+ : TargetIntrinsicInfo() {
|
|
+}
|
|
+
|
|
+std::string
|
|
+AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
|
|
+ unsigned int numTys) const {
|
|
+ static const char* const names[] = {
|
|
+#define GET_INTRINSIC_NAME_TABLE
|
|
+#include "AMDGPUGenIntrinsics.inc"
|
|
+#undef GET_INTRINSIC_NAME_TABLE
|
|
+ };
|
|
+
|
|
+ if (IntrID < Intrinsic::num_intrinsics) {
|
|
+ return "";
|
|
+ }
|
|
+ assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
|
|
+ && "Invalid intrinsic ID");
|
|
+
|
|
+ std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
|
|
+ return Result;
|
|
+}
|
|
+
|
|
+unsigned int
|
|
+AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
|
|
+#define GET_FUNCTION_RECOGNIZER
|
|
+#include "AMDGPUGenIntrinsics.inc"
|
|
+#undef GET_FUNCTION_RECOGNIZER
|
|
+ AMDGPUIntrinsic::ID IntrinsicID
|
|
+ = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
|
|
+ IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
|
|
+
|
|
+ if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
|
|
+ return IntrinsicID;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
|
|
+ // Overload Table
|
|
+#define GET_INTRINSIC_OVERLOAD_TABLE
|
|
+#include "AMDGPUGenIntrinsics.inc"
|
|
+#undef GET_INTRINSIC_OVERLOAD_TABLE
|
|
+}
|
|
+
|
|
+Function*
|
|
+AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
|
|
+ Type **Tys,
|
|
+ unsigned numTys) const {
|
|
+ assert(!"Not implemented");
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h
|
|
new file mode 100644
|
|
index 0000000..83f4933
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILIntrinsicInfo.h
|
|
@@ -0,0 +1,49 @@
|
|
+//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
|
|
+//
|
|
+//===-----------------------------------------------------------------------===//
|
|
+#ifndef AMDIL_INTRINSICS_H
|
|
+#define AMDIL_INTRINSICS_H
|
|
+
|
|
+#include "llvm/Intrinsics.h"
|
|
+#include "llvm/Target/TargetIntrinsicInfo.h"
|
|
+
|
|
+namespace llvm {
|
|
+class TargetMachine;
|
|
+
|
|
+namespace AMDGPUIntrinsic {
|
|
+enum ID {
|
|
+ last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
|
|
+#define GET_INTRINSIC_ENUM_VALUES
|
|
+#include "AMDGPUGenIntrinsics.inc"
|
|
+#undef GET_INTRINSIC_ENUM_VALUES
|
|
+ , num_AMDGPU_intrinsics
|
|
+};
|
|
+
|
|
+} // end namespace AMDGPUIntrinsic
|
|
+
|
|
+class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
|
|
+public:
|
|
+ AMDGPUIntrinsicInfo(TargetMachine *tm);
|
|
+ std::string getName(unsigned int IntrId, Type **Tys = 0,
|
|
+ unsigned int numTys = 0) const;
|
|
+ unsigned int lookupName(const char *Name, unsigned int Len) const;
|
|
+ bool isOverloaded(unsigned int IID) const;
|
|
+ Function *getDeclaration(Module *M, unsigned int ID,
|
|
+ Type **Tys = 0,
|
|
+ unsigned int numTys = 0) const;
|
|
+};
|
|
+
|
|
+} // end namespace llvm
|
|
+
|
|
+#endif // AMDIL_INTRINSICS_H
|
|
+
|
|
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
|
|
new file mode 100644
|
|
index 0000000..3f9e20f
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILIntrinsics.td
|
|
@@ -0,0 +1,242 @@
|
|
+//===- AMDILIntrinsics.td - Defines AMDIL Intrinsics -*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+// This file defines all of the AMDIL-specific intrinsics.
|
|
+//
|
|
+//===---------------------------------------------------------------===//
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Intrinsic classes
|
|
+// Generic versions of the above classes but for Target specific intrinsics
|
|
+// instead of SDNode patterns.
|
|
+//===--------------------------------------------------------------------===//
|
|
+let TargetPrefix = "AMDIL", isTarget = 1 in {
|
|
+ class VoidIntLong :
|
|
+ Intrinsic<[llvm_i64_ty], [], []>;
|
|
+ class VoidIntInt :
|
|
+ Intrinsic<[llvm_i32_ty], [], []>;
|
|
+ class VoidIntBool :
|
|
+ Intrinsic<[llvm_i32_ty], [], []>;
|
|
+ class UnaryIntInt :
|
|
+ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
|
+ class UnaryIntFloat :
|
|
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
|
|
+ class ConvertIntFTOI :
|
|
+ Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
|
|
+ class ConvertIntITOF :
|
|
+ Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>;
|
|
+ class UnaryIntNoRetInt :
|
|
+ Intrinsic<[], [llvm_anyint_ty], []>;
|
|
+ class UnaryIntNoRetFloat :
|
|
+ Intrinsic<[], [llvm_anyfloat_ty], []>;
|
|
+ class BinaryIntInt :
|
|
+ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
|
|
+ class BinaryIntFloat :
|
|
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
|
|
+ class BinaryIntNoRetInt :
|
|
+ Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
|
|
+ class BinaryIntNoRetFloat :
|
|
+ Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
|
|
+ class TernaryIntInt :
|
|
+ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
|
|
+ LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
|
|
+ class TernaryIntFloat :
|
|
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
|
|
+ LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
|
|
+ class QuaternaryIntInt :
|
|
+ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
|
|
+ LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
|
|
+ class UnaryAtomicInt :
|
|
+ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
|
|
+ class BinaryAtomicInt :
|
|
+ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
|
|
+ class TernaryAtomicInt :
|
|
+ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
|
|
+ class UnaryAtomicIntNoRet :
|
|
+ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
|
|
+ class BinaryAtomicIntNoRet :
|
|
+ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
|
|
+ class TernaryAtomicIntNoRet :
|
|
+ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
|
|
+}
|
|
+
|
|
+let TargetPrefix = "AMDIL", isTarget = 1 in {
|
|
+ def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
|
|
+
|
|
+ def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
|
|
+ UnaryIntInt;
|
|
+ def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
|
|
+ UnaryIntInt;
|
|
+ def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">,
|
|
+ UnaryIntInt;
|
|
+ def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">,
|
|
+ UnaryIntInt;
|
|
+ def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">,
|
|
+ UnaryIntInt;
|
|
+ def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">,
|
|
+ QuaternaryIntInt;
|
|
+ def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_mad : GCCBuiltin<"__amdil_mad">,
|
|
+ TernaryIntFloat;
|
|
+ def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_min : GCCBuiltin<"__amdil_min">,
|
|
+ BinaryIntFloat;
|
|
+ def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">,
|
|
+ BinaryIntInt;
|
|
+ def int_AMDIL_max : GCCBuiltin<"__amdil_max">,
|
|
+ BinaryIntFloat;
|
|
+ def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">,
|
|
+ TernaryIntInt;
|
|
+ def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">,
|
|
+ TernaryIntFloat;
|
|
+ def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat;
|
|
+ def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat;
|
|
+ def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt;
|
|
+ def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">,
|
|
+ TernaryIntFloat;
|
|
+ def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_length : GCCBuiltin<"__amdil_length">,
|
|
+ UnaryIntFloat;
|
|
+ def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">,
|
|
+ TernaryIntFloat;
|
|
+ def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">,
|
|
+ Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty,
|
|
+ llvm_v4i32_ty, llvm_i32_ty], []>;
|
|
+
|
|
+ def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">,
|
|
+ Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>;
|
|
+ def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">,
|
|
+ Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>;
|
|
+ def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">,
|
|
+ Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
|
|
+ def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">,
|
|
+ ConvertIntITOF;
|
|
+ def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">,
|
|
+ ConvertIntFTOI;
|
|
+ def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">,
|
|
+ ConvertIntFTOI;
|
|
+ def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">,
|
|
+ ConvertIntFTOI;
|
|
+ def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">,
|
|
+ ConvertIntFTOI;
|
|
+ def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">,
|
|
+ ConvertIntFTOI;
|
|
+ def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">,
|
|
+ ConvertIntFTOI;
|
|
+ def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">,
|
|
+ Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>;
|
|
+ def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">,
|
|
+ ConvertIntITOF;
|
|
+ def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">,
|
|
+ ConvertIntITOF;
|
|
+ def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">,
|
|
+ ConvertIntITOF;
|
|
+ def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">,
|
|
+ ConvertIntITOF;
|
|
+ def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">,
|
|
+ Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
|
|
+ llvm_v2f32_ty, llvm_float_ty], []>;
|
|
+ def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">,
|
|
+ Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
|
|
+ llvm_v2f32_ty], []>;
|
|
+ def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">,
|
|
+ Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
|
|
+ llvm_v4f32_ty], []>;
|
|
+ def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">,
|
|
+ Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
|
|
+ llvm_v4f32_ty], []>;
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILNIDevice.cpp b/lib/Target/R600/AMDILNIDevice.cpp
|
|
new file mode 100644
|
|
index 0000000..b82da59
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILNIDevice.cpp
|
|
@@ -0,0 +1,65 @@
|
|
+//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#include "AMDILNIDevice.h"
|
|
+#include "AMDILEvergreenDevice.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST)
|
|
+ : AMDGPUEvergreenDevice(ST) {
|
|
+ std::string name = ST->getDeviceName();
|
|
+ if (name == "caicos") {
|
|
+ DeviceFlag = OCL_DEVICE_CAICOS;
|
|
+ } else if (name == "turks") {
|
|
+ DeviceFlag = OCL_DEVICE_TURKS;
|
|
+ } else if (name == "cayman") {
|
|
+ DeviceFlag = OCL_DEVICE_CAYMAN;
|
|
+ } else {
|
|
+ DeviceFlag = OCL_DEVICE_BARTS;
|
|
+ }
|
|
+}
|
|
+AMDGPUNIDevice::~AMDGPUNIDevice() {
|
|
+}
|
|
+
|
|
+size_t
|
|
+AMDGPUNIDevice::getMaxLDSSize() const {
|
|
+ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
|
|
+ return MAX_LDS_SIZE_900;
|
|
+ } else {
|
|
+ return 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+uint32_t
|
|
+AMDGPUNIDevice::getGeneration() const {
|
|
+ return AMDGPUDeviceInfo::HD6XXX;
|
|
+}
|
|
+
|
|
+
|
|
+AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST)
|
|
+ : AMDGPUNIDevice(ST) {
|
|
+ setCaps();
|
|
+}
|
|
+
|
|
+AMDGPUCaymanDevice::~AMDGPUCaymanDevice() {
|
|
+}
|
|
+
|
|
+void
|
|
+AMDGPUCaymanDevice::setCaps() {
|
|
+ if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
|
|
+ mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
|
|
+ mHWBits.set(AMDGPUDeviceInfo::FMA);
|
|
+ }
|
|
+ mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
|
|
+ mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps);
|
|
+ mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
|
|
+}
|
|
+
|
|
diff --git a/lib/Target/R600/AMDILNIDevice.h b/lib/Target/R600/AMDILNIDevice.h
|
|
new file mode 100644
|
|
index 0000000..bc7df37
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILNIDevice.h
|
|
@@ -0,0 +1,57 @@
|
|
+//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+/// \file
|
|
+/// \brief Interface for the subtarget data classes.
|
|
+///
|
|
+/// This file will define the interface that each generation needs to
|
|
+/// implement in order to correctly answer queries on the capabilities of the
|
|
+/// specific hardware.
|
|
+//===---------------------------------------------------------------------===//
|
|
+#ifndef AMDILNIDEVICE_H
|
|
+#define AMDILNIDEVICE_H
|
|
+#include "AMDILEvergreenDevice.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class AMDGPUSubtarget;
|
|
+//===---------------------------------------------------------------------===//
|
|
+// NI generation of devices and their respective sub classes
|
|
+//===---------------------------------------------------------------------===//
|
|
+
|
|
+/// \brief The AMDGPUNIDevice is the base class for all Northern Islands series
+/// of cards.
+///
+/// It is very similar to the AMDGPUEvergreenDevice, with the major
+/// exception being differences in wavefront size and hardware capabilities. The
+/// NI devices all have 64-wide wavefronts and also add support for signed 24-bit
+/// integer operations.
|
|
+class AMDGPUNIDevice : public AMDGPUEvergreenDevice {
|
|
+public:
|
|
+ AMDGPUNIDevice(AMDGPUSubtarget*);
|
|
+ virtual ~AMDGPUNIDevice();
|
|
+ virtual size_t getMaxLDSSize() const;
|
|
+ virtual uint32_t getGeneration() const;
|
|
+};
|
|
+
|
|
+/// Just as the AMDGPUCypressDevice is the double-capable version of the
+/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double-capable version
+/// of the AMDGPUNIDevice. The other major difference is that the Cayman device
+/// has 4-wide ALUs, whereas the rest of the NI family is 5 wide.
|
|
+class AMDGPUCaymanDevice: public AMDGPUNIDevice {
|
|
+public:
|
|
+ AMDGPUCaymanDevice(AMDGPUSubtarget*);
|
|
+ virtual ~AMDGPUCaymanDevice();
|
|
+private:
|
|
+ virtual void setCaps();
|
|
+};
|
|
+
|
|
+static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800;
|
|
+} // namespace llvm
|
|
+#endif // AMDILNIDEVICE_H
|
|
diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
|
|
new file mode 100644
|
|
index 0000000..57317ac
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
|
|
@@ -0,0 +1,1256 @@
|
|
+//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+
|
|
+#define DEBUG_TYPE "PeepholeOpt"
|
|
+#ifdef DEBUG
|
|
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
|
|
+#else
|
|
+#define DEBUGME 0
|
|
+#endif
|
|
+
|
|
+#include "AMDILDevices.h"
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "llvm/ADT/Statistic.h"
|
|
+#include "llvm/ADT/StringExtras.h"
|
|
+#include "llvm/ADT/StringRef.h"
|
|
+#include "llvm/ADT/Twine.h"
|
|
+#include "llvm/Constants.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
|
|
+#include "llvm/Function.h"
|
|
+#include "llvm/Instructions.h"
|
|
+#include "llvm/Module.h"
|
|
+#include "llvm/Support/Debug.h"
|
|
+#include "llvm/Support/MathExtras.h"
|
|
+
|
|
+#include <sstream>
|
|
+
|
|
+#if 0
|
|
+STATISTIC(PointerAssignments, "Number of dynamic pointer "
+ "assignments discovered");
|
|
+STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
|
|
+#endif
|
|
+
|
|
+using namespace llvm;
|
|
+// The Peephole optimization pass is used to do simple last-minute optimizations
+// that are required for correct code or to remove redundant functions.
|
|
+namespace {
|
|
+
|
|
+class OpaqueType;
|
|
+
|
|
+class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
|
|
+public:
|
|
+ TargetMachine &TM;
|
|
+ static char ID;
|
|
+ AMDGPUPeepholeOpt(TargetMachine &tm);
|
|
+ ~AMDGPUPeepholeOpt();
|
|
+ const char *getPassName() const;
|
|
+ bool runOnFunction(Function &F);
|
|
+ bool doInitialization(Module &M);
|
|
+ bool doFinalization(Module &M);
|
|
+ void getAnalysisUsage(AnalysisUsage &AU) const;
|
|
+protected:
|
|
+private:
|
|
+ // Function to initiate all of the instruction level optimizations.
|
|
+ bool instLevelOptimizations(BasicBlock::iterator *inst);
|
|
+ // Quick check to see if we need to dump all of the pointers into the
+ // arena. If this is the case, then we set all pointers to exist in the
+ // arena. This is a workaround for aliasing of pointers in a struct/union.
|
|
+ bool dumpAllIntoArena(Function &F);
|
|
+ // Because I don't want to invalidate any pointers while in the
+ // safeNestedForEach function, I push atomic conversions to a vector and
+ // handle them later. This function does the conversions if required.
|
|
+ void doAtomicConversionIfNeeded(Function &F);
|
|
+ // Because __amdil_is_constant cannot be properly evaluated if
+ // optimizations are disabled, the calls are placed in a vector
+ // and evaluated after the __amdil_image* functions are evaluated,
+ // which should allow the __amdil_is_constant function to be
+ // evaluated correctly.
|
|
+ void doIsConstCallConversionIfNeeded();
|
|
+ bool mChanged;
|
|
+ bool mDebug;
|
|
+ bool mConvertAtomics;
|
|
+ CodeGenOpt::Level optLevel;
|
|
+ // Run a series of tests to see if we can optimize a CALL instruction.
|
|
+ bool optimizeCallInst(BasicBlock::iterator *bbb);
|
|
+ // A peephole optimization to optimize bit extract sequences.
|
|
+ bool optimizeBitExtract(Instruction *inst);
|
|
+ // A peephole optimization to optimize bit insert sequences.
|
|
+ bool optimizeBitInsert(Instruction *inst);
|
|
+ bool setupBitInsert(Instruction *base,
|
|
+ Instruction *&src,
|
|
+ Constant *&mask,
|
|
+ Constant *&shift);
|
|
+ // Expand the bit field insert instruction on versions of OpenCL that
|
|
+ // don't support it.
|
|
+ bool expandBFI(CallInst *CI);
|
|
+ // Expand the bit field mask instruction on versions of OpenCL that
+ // don't support it.
|
|
+ bool expandBFM(CallInst *CI);
|
|
+ // On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in
+ // this case we need to expand them. These functions check for 24-bit
+ // functions and then expand them.
|
|
+ bool isSigned24BitOps(CallInst *CI);
|
|
+ void expandSigned24BitOps(CallInst *CI);
|
|
+ // One optimization that can occur is that if the required workgroup size is
|
|
+ // specified then the result of get_local_size is known at compile time and
|
|
+ // can be returned accordingly.
|
|
+ bool isRWGLocalOpt(CallInst *CI);
|
|
+ // On Northern Islands cards, the division is slightly less accurate than on
+ // previous generations, so we need to utilize a more accurate division. We
+ // can translate the accurate divide to a normal divide on all other cards.
|
|
+ bool convertAccurateDivide(CallInst *CI);
|
|
+ void expandAccurateDivide(CallInst *CI);
|
|
+ // If the alignment is set incorrectly, it can produce really inefficient
|
|
+ // code. This checks for this scenario and fixes it if possible.
|
|
+ bool correctMisalignedMemOp(Instruction *inst);
|
|
+
|
|
+ // If we are in no opt mode, then we need to make sure that
|
|
+ // local samplers are properly propagated as constant propagation
|
|
+ // doesn't occur and we need to know the value of kernel defined
|
|
+ // samplers at compile time.
|
|
+ bool propagateSamplerInst(CallInst *CI);
|
|
+
|
|
+ // Helper functions
|
|
+
|
|
+ // Group of functions that recursively calculate the size of a structure based
|
|
+ // on its sub-types.
|
|
+ size_t getTypeSize(Type * const T, bool dereferencePtr = false);
|
|
+ size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
|
|
+ size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
|
|
+ size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
|
|
+ size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
|
|
+ size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
|
|
+ size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
|
|
+ size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
|
|
+
|
|
+ LLVMContext *mCTX;
|
|
+ Function *mF;
|
|
+ const AMDGPUSubtarget *mSTM;
|
|
+ SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
|
|
+ SmallVector<CallInst *, 16> isConstVec;
|
|
+}; // class AMDGPUPeepholeOpt
|
|
+ char AMDGPUPeepholeOpt::ID = 0;
|
|
+
|
|
+// A template function that has two levels of looping before calling the
|
|
+// function with a pointer to the current iterator.
|
|
+template<class InputIterator, class SecondIterator, class Function>
|
|
+Function safeNestedForEach(InputIterator First, InputIterator Last,
|
|
+ SecondIterator S, Function F) {
|
|
+ for ( ; First != Last; ++First) {
|
|
+ SecondIterator sf, sl;
|
|
+ for (sf = First->begin(), sl = First->end();
|
|
+ sf != sl; ) {
|
|
+ if (!F(&sf)) {
|
|
+ ++sf;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return F;
|
|
+}
|
|
+
|
|
+} // anonymous namespace
|
|
+
|
|
+namespace llvm {
|
|
+ FunctionPass *
|
|
+ createAMDGPUPeepholeOpt(TargetMachine &tm) {
|
|
+ return new AMDGPUPeepholeOpt(tm);
|
|
+ }
|
|
+} // llvm namespace
|
|
+
|
|
+AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
|
|
+ : FunctionPass(ID), TM(tm) {
|
|
+ mDebug = DEBUGME;
|
|
+ optLevel = TM.getOptLevel();
|
|
+
|
|
+}
|
|
+
|
|
+AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
|
|
+}
|
|
+
|
|
+const char *
|
|
+AMDGPUPeepholeOpt::getPassName() const {
|
|
+ return "AMDGPU PeepHole Optimization Pass";
|
|
+}
|
|
+
|
|
+bool
|
|
+containsPointerType(Type *Ty) {
|
|
+ if (!Ty) {
|
|
+ return false;
|
|
+ }
|
|
+ switch(Ty->getTypeID()) {
|
|
+ default:
|
|
+ return false;
|
|
+ case Type::StructTyID: {
|
|
+ const StructType *ST = dyn_cast<StructType>(Ty);
|
|
+ for (StructType::element_iterator stb = ST->element_begin(),
|
|
+ ste = ST->element_end(); stb != ste; ++stb) {
|
|
+ if (!containsPointerType(*stb)) {
|
|
+ continue;
|
|
+ }
|
|
+ return true;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case Type::VectorTyID:
|
|
+ case Type::ArrayTyID:
|
|
+ return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
|
|
+ case Type::PointerTyID:
|
|
+ return true;
|
|
+ };
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
|
|
+ bool dumpAll = false;
|
|
+ for (Function::const_arg_iterator cab = F.arg_begin(),
|
|
+ cae = F.arg_end(); cab != cae; ++cab) {
|
|
+ const Argument *arg = cab;
|
|
+ const PointerType *PT = dyn_cast<PointerType>(arg->getType());
|
|
+ if (!PT) {
|
|
+ continue;
|
|
+ }
|
|
+ Type *DereferencedType = PT->getElementType();
|
|
+ if (!dyn_cast<StructType>(DereferencedType)
|
|
+ ) {
|
|
+ continue;
|
|
+ }
|
|
+ if (!containsPointerType(DereferencedType)) {
|
|
+ continue;
|
|
+ }
|
|
+ // FIXME: Because a pointer inside of a struct/union may be aliased to
|
|
+ // another pointer we need to take the conservative approach and place all
|
|
+ // pointers into the arena until more advanced detection is implemented.
|
|
+ dumpAll = true;
|
|
+ }
|
|
+ return dumpAll;
|
|
+}
|
|
+void
|
|
+AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
|
|
+ if (isConstVec.empty()) {
|
|
+ return;
|
|
+ }
|
|
+ for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
|
|
+ CallInst *CI = isConstVec[x];
|
|
+ Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
|
|
+ Type *aType = Type::getInt32Ty(*mCTX);
|
|
+ Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
|
|
+ : ConstantInt::get(aType, 0);
|
|
+ CI->replaceAllUsesWith(Val);
|
|
+ CI->eraseFromParent();
|
|
+ }
|
|
+ isConstVec.clear();
|
|
+}
|
|
+void
|
|
+AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
|
|
+ // Don't do anything if we don't have any atomic operations.
|
|
+ if (atomicFuncs.empty()) {
|
|
+ return;
|
|
+ }
|
|
+ // Change the function name for the atomic if it is required
|
|
+ uint32_t size = atomicFuncs.size();
|
|
+ for (uint32_t x = 0; x < size; ++x) {
|
|
+ atomicFuncs[x].first->setOperand(
|
|
+ atomicFuncs[x].first->getNumOperands()-1,
|
|
+ atomicFuncs[x].second);
|
|
+
|
|
+ }
|
|
+ mChanged = true;
|
|
+ if (mConvertAtomics) {
|
|
+ return;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
|
|
+ mChanged = false;
|
|
+ mF = &MF;
|
|
+ mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
|
|
+ if (mDebug) {
|
|
+ MF.dump();
|
|
+ }
|
|
+ mCTX = &MF.getType()->getContext();
|
|
+ mConvertAtomics = true;
|
|
+ safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
|
|
+ std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
|
|
+ this));
|
|
+
|
|
+ doAtomicConversionIfNeeded(MF);
|
|
+ doIsConstCallConversionIfNeeded();
|
|
+
|
|
+ if (mDebug) {
|
|
+ MF.dump();
|
|
+ }
|
|
+ return mChanged;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
|
|
+ Instruction *inst = (*bbb);
|
|
+ CallInst *CI = dyn_cast<CallInst>(inst);
|
|
+ if (!CI) {
|
|
+ return false;
|
|
+ }
|
|
+ if (isSigned24BitOps(CI)) {
|
|
+ expandSigned24BitOps(CI);
|
|
+ ++(*bbb);
|
|
+ CI->eraseFromParent();
|
|
+ return true;
|
|
+ }
|
|
+ if (propagateSamplerInst(CI)) {
|
|
+ return false;
|
|
+ }
|
|
+ if (expandBFI(CI) || expandBFM(CI)) {
|
|
+ ++(*bbb);
|
|
+ CI->eraseFromParent();
|
|
+ return true;
|
|
+ }
|
|
+ if (convertAccurateDivide(CI)) {
|
|
+ expandAccurateDivide(CI);
|
|
+ ++(*bbb);
|
|
+ CI->eraseFromParent();
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
|
|
+ if (calleeName.startswith("__amdil_is_constant")) {
|
|
+ // If we do not have optimizations, then this
|
|
+ // cannot be properly evaluated, so we add the
|
|
+ // call instructions to a vector and process
|
|
+ // them at the end of processing after the
|
|
+ // samplers have been correctly handled.
|
|
+ if (optLevel == CodeGenOpt::None) {
|
|
+ isConstVec.push_back(CI);
|
|
+ return false;
|
|
+ } else {
|
|
+ Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
|
|
+ Type *aType = Type::getInt32Ty(*mCTX);
|
|
+ Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
|
|
+ : ConstantInt::get(aType, 0);
|
|
+ CI->replaceAllUsesWith(Val);
|
|
+ ++(*bbb);
|
|
+ CI->eraseFromParent();
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (calleeName.equals("__amdil_is_asic_id_i32")) {
|
|
+ ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
|
|
+ Type *aType = Type::getInt32Ty(*mCTX);
|
|
+ Value *Val = CV;
|
|
+ if (Val) {
|
|
+ Val = ConstantInt::get(aType,
|
|
+ mSTM->device()->getDeviceFlag() & CV->getZExtValue());
|
|
+ } else {
|
|
+ Val = ConstantInt::get(aType, 0);
|
|
+ }
|
|
+ CI->replaceAllUsesWith(Val);
|
|
+ ++(*bbb);
|
|
+ CI->eraseFromParent();
|
|
+ return true;
|
|
+ }
|
|
+ Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
|
|
+ if (!F) {
|
|
+ return false;
|
|
+ }
|
|
+ if (F->getName().startswith("__atom") && !CI->getNumUses()
|
|
+ && F->getName().find("_xchg") == StringRef::npos) {
|
|
+ std::string buffer(F->getName().str() + "_noret");
|
|
+ F = dyn_cast<Function>(
|
|
+ F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
|
|
+ atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
|
|
+ }
|
|
+
|
|
+ if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
|
|
+ && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
|
|
+ return false;
|
|
+ }
|
|
+ if (!mConvertAtomics) {
|
|
+ return false;
|
|
+ }
|
|
+ StringRef name = F->getName();
|
|
+ if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
|
|
+ mConvertAtomics = false;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
|
|
+ Instruction *&src,
|
|
+ Constant *&mask,
|
|
+ Constant *&shift) {
|
|
+ if (!base) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Null pointer passed into function.\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ bool andOp = false;
|
|
+ if (base->getOpcode() == Instruction::Shl) {
|
|
+ shift = dyn_cast<Constant>(base->getOperand(1));
|
|
+ } else if (base->getOpcode() == Instruction::And) {
|
|
+ mask = dyn_cast<Constant>(base->getOperand(1));
|
|
+ andOp = true;
|
|
+ } else {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
|
|
+ }
|
|
+ // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
|
|
+ return false;
|
|
+ }
|
|
+ src = dyn_cast<Instruction>(base->getOperand(0));
|
|
+ if (!src) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Failed setup since the base operand is not an instruction!\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ // If we find an 'and' operation, then we don't need to
|
|
+ // find the next operation as we already know the
|
|
+ // bits that are valid at this point.
|
|
+ if (andOp) {
|
|
+ return true;
|
|
+ }
|
|
+ if (src->getOpcode() == Instruction::Shl && !shift) {
|
|
+ shift = dyn_cast<Constant>(src->getOperand(1));
|
|
+ src = dyn_cast<Instruction>(src->getOperand(0));
|
|
+ } else if (src->getOpcode() == Instruction::And && !mask) {
|
|
+ mask = dyn_cast<Constant>(src->getOperand(1));
|
|
+ }
|
|
+ if (!mask && !shift) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Failed setup since both mask and shift are NULL!\n";
|
|
+ }
|
|
+ // Did not find a constant mask or a shift.
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+bool
|
|
+AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
|
|
+ if (!inst) {
|
|
+ return false;
|
|
+ }
|
|
+ if (!inst->isBinaryOp()) {
|
|
+ return false;
|
|
+ }
|
|
+ if (inst->getOpcode() != Instruction::Or) {
|
|
+ return false;
|
|
+ }
|
|
+ if (optLevel == CodeGenOpt::None) {
|
|
+ return false;
|
|
+ }
|
|
+ // We want to do an optimization on a sequence of ops that in the end equals a
|
|
+ // single ISA instruction.
|
|
+ // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
|
|
+ // Some simplified versions of this pattern are as follows:
|
|
+ // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
|
|
+ // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
|
|
+ // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
|
|
+ // (A & B) | (D << F) when (1 << F) >= B
|
|
+ // (A << C) | (D & E) when (1 << C) >= E
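+ // For example (placeholder values x and y, assumed to be SSA values produced
+ // by earlier instructions): the expression
+ //   (x & 0xFFFF0000) | (y & 0x0000FFFF)
+ // matches the base pattern with B = 0xFFFF0000, E = 0x0000FFFF, C = F = 0,
+ // and the code below rewrites it into
+ //   __amdil_ubit_insert_u32(16 /*width*/, 16 /*offset*/, x >> 16, y & 0x0000FFFF)
+ // which combines the high half of x with the low half of y.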
+ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
|
|
+ // The HD4XXX hardware doesn't support the ubit_insert instruction.
|
|
+ return false;
|
|
+ }
|
|
+ Type *aType = inst->getType();
|
|
+ bool isVector = aType->isVectorTy();
|
|
+ int numEle = 1;
|
|
+ // This optimization only works on 32bit integers.
|
|
+ if (aType->getScalarType()
|
|
+ != Type::getInt32Ty(inst->getContext())) {
|
|
+ return false;
|
|
+ }
|
|
+ if (isVector) {
|
|
+ const VectorType *VT = dyn_cast<VectorType>(aType);
|
|
+ numEle = VT->getNumElements();
|
|
+ // We currently cannot support more than 4 elements in an intrinsic and we
|
|
+ // cannot support Vec3 types.
|
|
+ if (numEle > 4 || numEle == 3) {
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ // TODO: Handle vectors.
|
|
+ if (isVector) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "!!! Vectors are not supported yet!\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ Instruction *LHSSrc = NULL, *RHSSrc = NULL;
|
|
+ Constant *LHSMask = NULL, *RHSMask = NULL;
|
|
+ Constant *LHSShift = NULL, *RHSShift = NULL;
|
|
+ Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
|
|
+ Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
|
|
+ if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Found an OR Operation that failed setup!\n";
|
|
+ inst->dump();
|
|
+ if (LHS) { LHS->dump(); }
|
|
+ if (LHSSrc) { LHSSrc->dump(); }
|
|
+ if (LHSMask) { LHSMask->dump(); }
|
|
+ if (LHSShift) { LHSShift->dump(); }
|
|
+ }
|
|
+ // There was an issue with the setup for BitInsert.
|
|
+ return false;
|
|
+ }
|
|
+ if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Found an OR Operation that failed setup!\n";
|
|
+ inst->dump();
|
|
+ if (RHS) { RHS->dump(); }
|
|
+ if (RHSSrc) { RHSSrc->dump(); }
|
|
+ if (RHSMask) { RHSMask->dump(); }
|
|
+ if (RHSShift) { RHSShift->dump(); }
|
|
+ }
|
|
+ // There was an issue with the setup for BitInsert.
|
|
+ return false;
|
|
+ }
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
|
|
+ dbgs() << "Op: "; inst->dump();
|
|
+ dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
|
|
+ }
|
|
+ Constant *offset = NULL;
|
|
+ Constant *width = NULL;
|
|
+ uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
|
|
+ uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
|
|
+ uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
|
|
+ uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
|
|
+ lhsMaskVal = (LHSMask
|
|
+ ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
|
|
+ rhsMaskVal = (RHSMask
|
|
+ ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
|
|
+ lhsShiftVal = (LHSShift
|
|
+ ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
|
|
+ rhsShiftVal = (RHSShift
|
|
+ ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
|
|
+ lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
|
|
+ rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
|
|
+ lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
|
|
+ rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
|
|
+ // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
|
|
+ dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
|
|
+ dbgs() << (RHSMask ? " & E)" : ")");
|
|
+ dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
|
|
+ dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
|
|
+ dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
|
|
+ dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
|
|
+ dbgs() << "width(B) = " << lhsMaskWidth;
|
|
+ dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
|
|
+ dbgs() << "offset(B) = " << lhsMaskOffset;
|
|
+ dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
|
|
+ dbgs() << "Constraints: \n";
|
|
+ dbgs() << "\t(1) B ^ E == 0\n";
|
|
+ dbgs() << "\t(2-LHS) B is a mask\n";
|
|
+ dbgs() << "\t(2-LHS) E is a mask\n";
|
|
+ dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
|
|
+ dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
|
|
+ }
|
|
+ if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
|
|
+ dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
|
|
+ dbgs() << "Failed constraint 1!\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ if (mDebug) {
|
|
+ dbgs() << "LHS = " << lhsMaskOffset << "";
|
|
+ dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
|
|
+ dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
|
|
+ dbgs() << "\nRHS = " << rhsMaskOffset << "";
|
|
+ dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
|
|
+ dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
|
|
+ dbgs() << "\n";
|
|
+ }
|
|
+ if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
|
|
+ offset = ConstantInt::get(aType, lhsMaskOffset, false);
|
|
+ width = ConstantInt::get(aType, lhsMaskWidth, false);
|
|
+ RHSSrc = RHS;
|
|
+ if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
|
|
+ dbgs() << "Failed constraint 2!\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ if (!LHSShift) {
|
|
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
|
|
+ "MaskShr", LHS);
|
|
+ } else if (lhsShiftVal != lhsMaskOffset) {
|
|
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
|
|
+ "MaskShr", LHS);
|
|
+ }
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Optimizing LHS!\n";
|
|
+ }
|
|
+ } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
|
|
+ offset = ConstantInt::get(aType, rhsMaskOffset, false);
|
|
+ width = ConstantInt::get(aType, rhsMaskWidth, false);
|
|
+ LHSSrc = RHSSrc;
|
|
+ RHSSrc = LHS;
|
|
+ if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
|
|
+ dbgs() << "Failed constraint 2!\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ if (!RHSShift) {
|
|
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
|
|
+ "MaskShr", RHS);
|
|
+ } else if (rhsShiftVal != rhsMaskOffset) {
|
|
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
|
|
+ "MaskShr", RHS);
|
|
+ }
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Optimizing RHS!\n";
|
|
+ }
|
|
+ } else {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Failed constraint 3!\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
|
|
+ dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
|
|
+ dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
|
|
+ dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
|
|
+ }
|
|
+ if (!offset || !width) {
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Either width or offset are NULL, failed detection!\n";
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+ // Let's create the function signature.
|
|
+ std::vector<Type *> callTypes;
|
|
+ callTypes.push_back(aType);
|
|
+ callTypes.push_back(aType);
|
|
+ callTypes.push_back(aType);
|
|
+ callTypes.push_back(aType);
|
|
+ FunctionType *funcType = FunctionType::get(aType, callTypes, false);
|
|
+ std::string name = "__amdil_ubit_insert";
|
|
+ if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
|
|
+ Function *Func =
|
|
+ dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
|
|
+ getOrInsertFunction(llvm::StringRef(name), funcType));
|
|
+ Value *Operands[4] = {
|
|
+ width,
|
|
+ offset,
|
|
+ LHSSrc,
|
|
+ RHSSrc
|
|
+ };
|
|
+ CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
|
|
+ if (mDebug) {
|
|
+ dbgs() << "Old Inst: ";
|
|
+ inst->dump();
|
|
+ dbgs() << "New Inst: ";
|
|
+ CI->dump();
|
|
+ dbgs() << "\n\n";
|
|
+ }
|
|
+ CI->insertBefore(inst);
|
|
+ inst->replaceAllUsesWith(CI);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
|
|
+ if (!inst) {
|
|
+ return false;
|
|
+ }
|
|
+ if (!inst->isBinaryOp()) {
|
|
+ return false;
|
|
+ }
|
|
+ if (inst->getOpcode() != Instruction::And) {
|
|
+ return false;
|
|
+ }
|
|
+ if (optLevel == CodeGenOpt::None) {
|
|
+ return false;
|
|
+ }
|
|
+ // We want to do some simple optimizations on Shift right/And patterns. The
+ // basic optimization is to turn (A >> B) & C, where A is a 32-bit type, B is a
+ // value smaller than 32 and C is a mask. If C is a constant value, then the
+ // following transformation can occur. For signed integers, it turns into the
+ // function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
+ // integers, it turns into the function call
+ // dst = __amdil_ubit_extract(log2(C), B, A). The function
+ // __amdil_[u|i]bit_extract can be found in Section 7.9 of the ATI IL spec of
+ // the stream SDK for Evergreen hardware.
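+ // For example (placeholder value x, assumed scalar i32, unsigned shift): for
+ //   (x >> 3) & 0xFF
+ // the code below emits a call to the llvm.AMDGPU.bit.extract.u32 intrinsic
+ // with the operands (x, 3 /*offset*/, 8 /*width*/), i.e. extract the 8-bit
+ // field that starts at bit 3 of x.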
+ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
|
|
+ // This does not work on HD4XXX hardware.
|
|
+ return false;
|
|
+ }
|
|
+ Type *aType = inst->getType();
|
|
+ bool isVector = aType->isVectorTy();
|
|
+
|
|
+ // XXX Support vector types
|
|
+ if (isVector) {
|
|
+ return false;
|
|
+ }
|
|
+ int numEle = 1;
|
|
+ // This only works on 32bit integers
|
|
+ if (aType->getScalarType()
|
|
+ != Type::getInt32Ty(inst->getContext())) {
|
|
+ return false;
|
|
+ }
|
|
+ if (isVector) {
|
|
+ const VectorType *VT = dyn_cast<VectorType>(aType);
|
|
+ numEle = VT->getNumElements();
|
|
+ // We currently cannot support more than 4 elements in an intrinsic and we
|
|
+ // cannot support Vec3 types.
|
|
+ if (numEle > 4 || numEle == 3) {
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
|
|
+ // If the first operand is not a shift instruction, then we can return as it
|
|
+ // doesn't match this pattern.
|
|
+ if (!ShiftInst || !ShiftInst->isShift()) {
|
|
+ return false;
|
|
+ }
|
|
+ // If we are a shift left, then we don't match this pattern.
|
|
+ if (ShiftInst->getOpcode() == Instruction::Shl) {
|
|
+ return false;
|
|
+ }
|
|
+ bool isSigned = ShiftInst->isArithmeticShift();
|
|
+ Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
|
|
+ Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
|
|
+ // Let's make sure that the shift value and the AND mask are constant integers.
|
|
+ if (!AndMask || !ShrVal) {
|
|
+ return false;
|
|
+ }
|
|
+ Constant *newMaskConst;
|
|
+ Constant *shiftValConst;
|
|
+ if (isVector) {
|
|
+ // Handle the vector case
|
|
+ std::vector<Constant *> maskVals;
|
|
+ std::vector<Constant *> shiftVals;
|
|
+ ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
|
|
+ ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
|
|
+ Type *scalarType = AndMaskVec->getType()->getScalarType();
|
|
+ assert(AndMaskVec->getNumOperands() ==
|
|
+ ShrValVec->getNumOperands() && "cannot have a "
|
|
+ "combination where the number of elements to a "
|
|
+ "shift and an and are different!");
|
|
+ for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
|
|
+ ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
|
|
+ ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
|
|
+ if (!AndCI || !ShiftIC) {
|
|
+ return false;
|
|
+ }
|
|
+ uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
|
|
+ if (!isMask_32(maskVal)) {
|
|
+ return false;
|
|
+ }
|
|
+ maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
|
|
+ uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
|
|
+ // If the mask or shiftval is greater than the bitcount, then break out.
|
|
+ if (maskVal >= 32 || shiftVal >= 32) {
|
|
+ return false;
|
|
+ }
|
|
+ // If the mask val is greater than the number of original bits left
|
|
+ // then this optimization is invalid.
|
|
+ if (maskVal > (32 - shiftVal)) {
|
|
+ return false;
|
|
+ }
|
|
+ maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
|
|
+ shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
|
|
+ }
|
|
+ newMaskConst = ConstantVector::get(maskVals);
|
|
+ shiftValConst = ConstantVector::get(shiftVals);
|
|
+ } else {
|
|
+ // Handle the scalar case
|
|
+ uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
|
|
+ // This must be a mask value where all lower bits are set to 1 and then any
|
|
+ // bit higher is set to 0.
|
|
+ if (!isMask_32(maskVal)) {
|
|
+ return false;
|
|
+ }
|
|
+ maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
|
|
+ // Count the number of bits set in the mask; this is the width of the
|
|
+ // resulting bit set that is extracted from the source value.
|
|
+ uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
|
|
+ // If the mask or shift val is greater than the bitcount, then break out.
|
|
+ if (maskVal >= 32 || shiftVal >= 32) {
|
|
+ return false;
|
|
+ }
|
|
+ // If the mask val is greater than the number of original bits left then
|
|
+ // this optimization is invalid.
|
|
+ if (maskVal > (32 - shiftVal)) {
|
|
+ return false;
|
|
+ }
|
|
+ newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
|
|
+ shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
|
|
+ }
|
|
+ // Let's create the function signature.
|
|
+ std::vector<Type *> callTypes;
|
|
+ callTypes.push_back(aType);
|
|
+ callTypes.push_back(aType);
|
|
+ callTypes.push_back(aType);
|
|
+ FunctionType *funcType = FunctionType::get(aType, callTypes, false);
|
|
+ std::string name = "llvm.AMDGPU.bit.extract.u32";
|
|
+ if (isVector) {
|
|
+ name += ".v" + itostr(numEle) + "i32";
|
|
+ } else {
|
|
+ name += ".";
|
|
+ }
|
|
+ // Let's create the function.
|
|
+ Function *Func =
|
|
+ dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
|
|
+ getOrInsertFunction(llvm::StringRef(name), funcType));
|
|
+ Value *Operands[3] = {
|
|
+ ShiftInst->getOperand(0),
|
|
+ shiftValConst,
|
|
+ newMaskConst
|
|
+ };
|
|
+ // Let's create the call with the operands.
|
|
+ CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
|
|
+ CI->setDoesNotAccessMemory();
|
|
+ CI->insertBefore(inst);
|
|
+ inst->replaceAllUsesWith(CI);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
|
|
+ if (!CI) {
|
|
+ return false;
|
|
+ }
|
|
+ Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
|
|
+ if (!LHS->getName().startswith("__amdil_bfi")) {
|
|
+ return false;
|
|
+ }
|
|
+ Type* type = CI->getOperand(0)->getType();
|
|
+ Constant *negOneConst = NULL;
|
|
+ if (type->isVectorTy()) {
|
|
+ std::vector<Constant *> negOneVals;
|
|
+ negOneConst = ConstantInt::get(CI->getContext(),
|
|
+ APInt(32, StringRef("-1"), 10));
|
|
+ for (size_t x = 0,
|
|
+ y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
|
|
+ negOneVals.push_back(negOneConst);
|
|
+ }
|
|
+ negOneConst = ConstantVector::get(negOneVals);
|
|
+ } else {
|
|
+ negOneConst = ConstantInt::get(CI->getContext(),
|
|
+ APInt(32, StringRef("-1"), 10));
|
|
+ }
|
|
+ // __amdil_bfi => (A & B) | (~A & C)
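+ // For example (placeholder values a and b): __amdil_bfi(0x0000FFFF, a, b)
+ // expands below to (0x0000FFFF & a) | (~0x0000FFFF & b), i.e. the low
+ // 16 bits are taken from a and the high 16 bits from b.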
+ BinaryOperator *lhs =
|
|
+ BinaryOperator::Create(Instruction::And, CI->getOperand(0),
|
|
+ CI->getOperand(1), "bfi_and", CI);
|
|
+ BinaryOperator *rhs =
|
|
+ BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
|
|
+ "bfi_not", CI);
|
|
+ rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
|
|
+ "bfi_and", CI);
|
|
+ lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
|
|
+ CI->replaceAllUsesWith(lhs);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
|
|
+ if (!CI) {
|
|
+ return false;
|
|
+ }
|
|
+ Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
|
|
+ if (!LHS->getName().startswith("__amdil_bfm")) {
|
|
+ return false;
|
|
+ }
|
|
+ // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
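+ // For example (assumed constant operands): __amdil_bfm(8, 4) expands below
+ // to ((1 << (8 & 0x1F)) - 1) << (4 & 0x1F) = 0xFF << 4 = 0x00000FF0,
+ // i.e. a mask of 8 bits starting at bit 4.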
+ Constant *newMaskConst = NULL;
|
|
+ Constant *newShiftConst = NULL;
|
|
+ Type* type = CI->getOperand(0)->getType();
|
|
+ if (type->isVectorTy()) {
|
|
+ std::vector<Constant*> newMaskVals, newShiftVals;
|
|
+ newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
|
|
+ newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
|
|
+ for (size_t x = 0,
|
|
+ y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
|
|
+ newMaskVals.push_back(newMaskConst);
|
|
+ newShiftVals.push_back(newShiftConst);
|
|
+ }
|
|
+ newMaskConst = ConstantVector::get(newMaskVals);
|
|
+ newShiftConst = ConstantVector::get(newShiftVals);
|
|
+ } else {
|
|
+ newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
|
|
+ newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
|
|
+ }
|
|
+ BinaryOperator *lhs =
|
|
+ BinaryOperator::Create(Instruction::And, CI->getOperand(0),
|
|
+ newMaskConst, "bfm_mask", CI);
|
|
+ lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
|
|
+ lhs, "bfm_shl", CI);
|
|
+ lhs = BinaryOperator::Create(Instruction::Sub, lhs,
|
|
+ newShiftConst, "bfm_sub", CI);
|
|
+ BinaryOperator *rhs =
|
|
+ BinaryOperator::Create(Instruction::And, CI->getOperand(1),
|
|
+ newMaskConst, "bfm_mask", CI);
|
|
+ lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
|
|
+ CI->replaceAllUsesWith(lhs);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
|
|
+ Instruction *inst = (*bbb);
|
|
+ if (optimizeCallInst(bbb)) {
|
|
+ return true;
|
|
+ }
|
|
+ if (optimizeBitExtract(inst)) {
|
|
+ return false;
|
|
+ }
|
|
+ if (optimizeBitInsert(inst)) {
|
|
+ return false;
|
|
+ }
|
|
+ if (correctMisalignedMemOp(inst)) {
|
|
+ return false;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+bool
|
|
+AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
|
|
+ LoadInst *linst = dyn_cast<LoadInst>(inst);
|
|
+ StoreInst *sinst = dyn_cast<StoreInst>(inst);
|
|
+ unsigned alignment;
|
|
+ Type* Ty = inst->getType();
|
|
+ if (linst) {
|
|
+ alignment = linst->getAlignment();
|
|
+ Ty = inst->getType();
|
|
+ } else if (sinst) {
|
|
+ alignment = sinst->getAlignment();
|
|
+ Ty = sinst->getValueOperand()->getType();
|
|
+ } else {
|
|
+ return false;
|
|
+ }
|
|
+ unsigned size = getTypeSize(Ty);
|
|
+ if (size == alignment || size < alignment) {
|
|
+ return false;
|
|
+ }
|
|
+ if (!Ty->isStructTy()) {
|
|
+ return false;
|
|
+ }
|
|
+ if (alignment < 4) {
|
|
+ if (linst) {
|
|
+ linst->setAlignment(0);
|
|
+ return true;
|
|
+ } else if (sinst) {
|
|
+ sinst->setAlignment(0);
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+bool
|
|
+AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
|
|
+ if (!CI) {
|
|
+ return false;
|
|
+ }
|
|
+ Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
|
|
+ std::string namePrefix = LHS->getName().substr(0, 14);
|
|
+ if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
|
|
+ && namePrefix != "__amdil__imul24_high") {
|
|
+ return false;
|
|
+ }
|
|
+ if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+void
|
|
+AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
|
|
+ assert(isSigned24BitOps(CI) && "Must be a "
|
|
+ "signed 24 bit operation to call this function!");
|
|
+ Value *LHS = CI->getOperand(CI->getNumOperands()-1);
|
|
+ // On 7XX and 8XX we do not have signed 24bit, so we need to
|
|
+ // expand it to the following:
|
|
+ // imul24 turns into 32bit imul
|
|
+ // imad24 turns into 32bit imad
|
|
+ // imul24_high turns into 32bit imulhigh
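+ // For example (placeholder scalar i32 operands a, b, c): a call to
+ // __amdil_imad24(a, b, c) is replaced below by a call to
+ // __amdil_imad_i32(a, b, c), and __amdil_imul24(a, b) becomes a plain
+ // 32-bit 'mul' instruction.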
+ if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
|
|
+ Type *aType = CI->getOperand(0)->getType();
|
|
+ bool isVector = aType->isVectorTy();
|
|
+ int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
|
|
+ std::vector<Type*> callTypes;
|
|
+ callTypes.push_back(CI->getOperand(0)->getType());
|
|
+ callTypes.push_back(CI->getOperand(1)->getType());
|
|
+ callTypes.push_back(CI->getOperand(2)->getType());
|
|
+ FunctionType *funcType =
|
|
+ FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
|
|
+ std::string name = "__amdil_imad";
|
|
+ if (isVector) {
|
|
+ name += "_v" + itostr(numEle) + "i32";
|
|
+ } else {
|
|
+ name += "_i32";
|
|
+ }
|
|
+ Function *Func = dyn_cast<Function>(
|
|
+ CI->getParent()->getParent()->getParent()->
|
|
+ getOrInsertFunction(llvm::StringRef(name), funcType));
|
|
+ Value *Operands[3] = {
|
|
+ CI->getOperand(0),
|
|
+ CI->getOperand(1),
|
|
+ CI->getOperand(2)
|
|
+ };
|
|
+ CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
|
|
+ nCI->insertBefore(CI);
|
|
+ CI->replaceAllUsesWith(nCI);
|
|
+ } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
|
|
+ BinaryOperator *mulOp =
|
|
+ BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
|
|
+ CI->getOperand(1), "imul24", CI);
|
|
+ CI->replaceAllUsesWith(mulOp);
|
|
+ } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
|
|
+ Type *aType = CI->getOperand(0)->getType();
|
|
+
|
|
+ bool isVector = aType->isVectorTy();
|
|
+ int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
|
|
+ std::vector<Type*> callTypes;
|
|
+ callTypes.push_back(CI->getOperand(0)->getType());
|
|
+ callTypes.push_back(CI->getOperand(1)->getType());
|
|
+ FunctionType *funcType =
|
|
+ FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
|
|
+ std::string name = "__amdil_imul_high";
|
|
+ if (isVector) {
|
|
+ name += "_v" + itostr(numEle) + "i32";
|
|
+ } else {
|
|
+ name += "_i32";
|
|
+ }
|
|
+ Function *Func = dyn_cast<Function>(
|
|
+ CI->getParent()->getParent()->getParent()->
|
|
+ getOrInsertFunction(llvm::StringRef(name), funcType));
|
|
+ Value *Operands[2] = {
|
|
+ CI->getOperand(0),
|
|
+ CI->getOperand(1)
|
|
+ };
|
|
+ CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
|
|
+ nCI->insertBefore(CI);
|
|
+ CI->replaceAllUsesWith(nCI);
|
|
+ }
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
|
|
+ return (CI != NULL
|
|
+ && CI->getOperand(CI->getNumOperands() - 1)->getName()
|
|
+ == "__amdil_get_local_size_int");
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
|
|
+ if (!CI) {
|
|
+ return false;
|
|
+ }
|
|
+ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
|
|
+ && (mSTM->getDeviceName() == "cayman")) {
|
|
+ return false;
|
|
+ }
|
|
+ return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
|
|
+ == "__amdil_improved_div";
|
|
+}
|
|
+
|
|
+void
|
|
+AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
|
|
+ assert(convertAccurateDivide(CI)
|
|
+ && "expanding accurate divide can only happen if it is expandable!");
|
|
+ BinaryOperator *divOp =
|
|
+ BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
|
|
+ CI->getOperand(1), "fdiv32", CI);
|
|
+ CI->replaceAllUsesWith(divOp);
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
|
|
+ if (optLevel != CodeGenOpt::None) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (!CI) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ unsigned funcNameIdx = 0;
|
|
+ funcNameIdx = CI->getNumOperands() - 1;
|
|
+ StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
|
|
+ if (calleeName != "__amdil_image2d_read_norm"
|
|
+ && calleeName != "__amdil_image2d_read_unnorm"
|
|
+ && calleeName != "__amdil_image3d_read_norm"
|
|
+ && calleeName != "__amdil_image3d_read_unnorm") {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ unsigned samplerIdx = 2;
|
|
+ samplerIdx = 1;
|
|
+ Value *sampler = CI->getOperand(samplerIdx);
|
|
+ LoadInst *lInst = dyn_cast<LoadInst>(sampler);
|
|
+ if (!lInst) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
|
|
+ // If we are loading from what is not a global value, then we
|
|
+ // fail and return.
|
|
+ if (!gv) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ // If we don't have an initializer or we have an initializer and
|
|
+ // the initializer is not a 32bit integer, we fail.
|
|
+ if (!gv->hasInitializer()
|
|
+ || !gv->getInitializer()->getType()->isIntegerTy(32)) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ // Now that we have the global variable initializer, lets replace
|
|
+ // all uses of the load instruction with the samplerVal and
|
|
+ // reparse the __amdil_is_constant() function.
|
|
+ Constant *samplerVal = gv->getInitializer();
|
|
+ lInst->replaceAllUsesWith(samplerVal);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::doInitialization(Module &M) {
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool
|
|
+AMDGPUPeepholeOpt::doFinalization(Module &M) {
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void
|
|
+AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
|
|
+ AU.addRequired<MachineFunctionAnalysis>();
|
|
+ FunctionPass::getAnalysisUsage(AU);
|
|
+ AU.setPreservesAll();
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
|
|
+ size_t size = 0;
|
|
+ if (!T) {
|
|
+ return size;
|
|
+ }
|
|
+ switch (T->getTypeID()) {
|
|
+ case Type::X86_FP80TyID:
|
|
+ case Type::FP128TyID:
|
|
+ case Type::PPC_FP128TyID:
|
|
+ case Type::LabelTyID:
|
|
+ assert(0 && "These types are not supported by this backend");
|
|
+ default:
|
|
+ case Type::FloatTyID:
|
|
+ case Type::DoubleTyID:
|
|
+ size = T->getPrimitiveSizeInBits() >> 3;
|
|
+ break;
|
|
+ case Type::PointerTyID:
|
|
+ size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
|
|
+ break;
|
|
+ case Type::IntegerTyID:
|
|
+ size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
|
|
+ break;
|
|
+ case Type::StructTyID:
|
|
+ size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
|
|
+ break;
|
|
+ case Type::ArrayTyID:
|
|
+ size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
|
|
+ break;
|
|
+ case Type::FunctionTyID:
|
|
+ size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
|
|
+ break;
|
|
+ case Type::VectorTyID:
|
|
+ size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
|
|
+ break;
|
|
+ };
|
|
+ return size;
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
|
|
+ bool dereferencePtr) {
|
|
+ size_t size = 0;
|
|
+ if (!ST) {
|
|
+ return size;
|
|
+ }
|
|
+ Type *curType;
|
|
+ StructType::element_iterator eib;
|
|
+ StructType::element_iterator eie;
|
|
+ for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
|
|
+ curType = *eib;
|
|
+ size += getTypeSize(curType, dereferencePtr);
|
|
+ }
|
|
+ return size;
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
|
|
+ bool dereferencePtr) {
|
|
+ return IT ? (IT->getBitWidth() >> 3) : 0;
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
|
|
+ bool dereferencePtr) {
|
|
+ assert(0 && "Should not be able to calculate the size of an function type");
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
|
|
+ bool dereferencePtr) {
|
|
+ return (size_t)(AT ? (getTypeSize(AT->getElementType(),
|
|
+ dereferencePtr) * AT->getNumElements())
|
|
+ : 0);
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
|
|
+ bool dereferencePtr) {
|
|
+ return VT ? (VT->getBitWidth() >> 3) : 0;
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
|
|
+ bool dereferencePtr) {
|
|
+ if (!PT) {
|
|
+ return 0;
|
|
+ }
|
|
+ Type *CT = PT->getElementType();
|
|
+ if (CT->getTypeID() == Type::StructTyID &&
|
|
+ PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
|
|
+ return getTypeSize(dyn_cast<StructType>(CT));
|
|
+ } else if (dereferencePtr) {
|
|
+ size_t size = 0;
|
|
+ for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
|
|
+ size += getTypeSize(PT->getContainedType(x), dereferencePtr);
|
|
+ }
|
|
+ return size;
|
|
+ } else {
|
|
+ return 4;
|
|
+ }
|
|
+}
|
|
+
|
|
+size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
|
|
+ bool dereferencePtr) {
|
|
+ //assert(0 && "Should not be able to calculate the size of an opaque type");
|
|
+ return 4;
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td
|
|
new file mode 100644
|
|
index 0000000..b9d0334
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILRegisterInfo.td
|
|
@@ -0,0 +1,107 @@
|
|
+//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+// Declarations that describe the AMDIL register file
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+class AMDILReg<bits<16> num, string n> : Register<n> {
|
|
+ field bits<16> Value;
|
|
+ let Value = num;
|
|
+ let Namespace = "AMDGPU";
|
|
+}
|
|
+
|
|
+// We will start with 8 registers for each class before expanding to more
|
|
+// Since the swizzle is added based on the register class, we can leave it
|
|
+// off here and just specify different registers for different register classes
|
|
+def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
|
|
+def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
|
|
+def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
|
|
+def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
|
|
+def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
|
|
+def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
|
|
+def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
|
|
+def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
|
|
+def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
|
|
+def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
|
|
+def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
|
|
+def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
|
|
+def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
|
|
+def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
|
|
+def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
|
|
+def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
|
|
+def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
|
|
+def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
|
|
+def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
|
|
+def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
|
|
+
|
|
+// All registers between 1000 and 1024 are reserved and cannot be used
|
|
+// unless commented in this section
|
|
+// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's
|
|
+// r1020 is used to hold the frame index for local arrays
|
|
+// r1019 is used to hold the dynamic stack allocation pointer
|
|
+// r1018 is used as a temporary register for handwritten code
|
|
+// r1017 is used as a temporary register for handwritten code
|
|
+// r1016 is used as a temporary register for load/store code
|
|
+// r1015 is used as a temporary register for data segment offset
|
|
+// r1014 is used as a temporary register for store code
|
|
+// r1013 is used as the section data pointer register
|
|
+// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
|
|
+// r1009 is used as the frame pointer register
|
|
+// r999 is used as the mem register.
|
|
+// r998 is used as the return address register.
|
|
+//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>;
|
|
+//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>;
|
|
+//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>;
|
|
+//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>;
|
|
+//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>;
|
|
+//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>;
|
|
+def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>;
|
|
+def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>;
|
|
+def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>;
|
|
+def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>;
|
|
+def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>;
|
|
+def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>;
|
|
+def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>;
|
|
+def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>;
|
|
+def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>;
|
|
+def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>;
|
|
+def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>;
|
|
+def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>;
|
|
+def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>;
|
|
+def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>;
|
|
+def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>;
|
|
+def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>;
|
|
+def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>;
|
|
+def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>;
|
|
+def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>;
|
|
+def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>;
|
|
+def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>;
|
|
+def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>;
|
|
+def GPRI16 : RegisterClass<"AMDGPU", [i16], 16,
|
|
+ (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
|
|
+ let AltOrders = [(add (sequence "R%u", 1, 20))];
|
|
+ let AltOrderSelect = [{
|
|
+ return 1;
|
|
+ }];
|
|
+ }
|
|
+def GPRI32 : RegisterClass<"AMDGPU", [i32], 32,
|
|
+ (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
|
|
+ let AltOrders = [(add (sequence "R%u", 1, 20))];
|
|
+ let AltOrderSelect = [{
|
|
+ return 1;
|
|
+ }];
|
|
+ }
|
|
+def GPRF32 : RegisterClass<"AMDGPU", [f32], 32,
|
|
+ (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
|
|
+ let AltOrders = [(add (sequence "R%u", 1, 20))];
|
|
+ let AltOrderSelect = [{
|
|
+ return 1;
|
|
+ }];
|
|
+ }
|
|
diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp
|
|
new file mode 100644
|
|
index 0000000..7c2710f
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILSIDevice.cpp
|
|
@@ -0,0 +1,45 @@
|
|
+//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//==-----------------------------------------------------------------------===//
|
|
+#include "AMDILSIDevice.h"
|
|
+#include "AMDILEvergreenDevice.h"
|
|
+#include "AMDILNIDevice.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST)
|
|
+ : AMDGPUEvergreenDevice(ST) {
|
|
+}
|
|
+AMDGPUSIDevice::~AMDGPUSIDevice() {
|
|
+}
|
|
+
|
|
+size_t
|
|
+AMDGPUSIDevice::getMaxLDSSize() const {
|
|
+ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
|
|
+ return MAX_LDS_SIZE_900;
|
|
+ } else {
|
|
+ return 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+uint32_t
|
|
+AMDGPUSIDevice::getGeneration() const {
|
|
+ return AMDGPUDeviceInfo::HD7XXX;
|
|
+}
|
|
+
|
|
+std::string
|
|
+AMDGPUSIDevice::getDataLayout() const {
|
|
+ return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
|
|
+ "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
|
|
+ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
|
|
+ "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
|
|
+ "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
|
|
+ "-n8:16:32:64");
|
|
+}
|
|
diff --git a/lib/Target/R600/AMDILSIDevice.h b/lib/Target/R600/AMDILSIDevice.h
|
|
new file mode 100644
|
|
index 0000000..5b2cb25
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/AMDILSIDevice.h
|
|
@@ -0,0 +1,39 @@
|
|
+//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//==-----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface for the subtarget data classes.
|
|
+///
|
|
+/// This file will define the interface that each generation needs to
|
|
+/// implement in order to correctly answer queries on the capabilities of the
|
|
+/// specific hardware.
|
|
+//===---------------------------------------------------------------------===//
|
|
+#ifndef AMDILSIDEVICE_H
|
|
+#define AMDILSIDEVICE_H
|
|
+#include "AMDILEvergreenDevice.h"
|
|
+
|
|
+namespace llvm {
|
|
+class AMDGPUSubtarget;
|
|
+//===---------------------------------------------------------------------===//
|
|
+// SI generation of devices and their respective sub classes
|
|
+//===---------------------------------------------------------------------===//
|
|
+
|
|
+/// \brief The AMDGPUSIDevice is the base class for all Southern Island series
|
|
+/// of cards.
|
|
+class AMDGPUSIDevice : public AMDGPUEvergreenDevice {
|
|
+public:
|
|
+ AMDGPUSIDevice(AMDGPUSubtarget*);
|
|
+ virtual ~AMDGPUSIDevice();
|
|
+ virtual size_t getMaxLDSSize() const;
|
|
+ virtual uint32_t getGeneration() const;
|
|
+ virtual std::string getDataLayout() const;
|
|
+};
|
|
+
|
|
+} // namespace llvm
|
|
+#endif // AMDILSIDEVICE_H
|
|
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
|
|
new file mode 100644
|
|
index 0000000..8ef9f8c
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/CMakeLists.txt
|
|
@@ -0,0 +1,56 @@
|
|
+set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
|
|
+
|
|
+tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
|
|
+tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
|
|
+tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
|
|
+tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
|
|
+tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
|
|
+tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic)
|
|
+tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
|
|
+tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
|
|
+tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
|
|
+add_public_tablegen_target(AMDGPUCommonTableGen)
|
|
+
|
|
+add_llvm_target(AMDGPUCodeGen
|
|
+ AMDIL7XXDevice.cpp
|
|
+ AMDILCFGStructurizer.cpp
|
|
+ AMDILDevice.cpp
|
|
+ AMDILDeviceInfo.cpp
|
|
+ AMDILEvergreenDevice.cpp
|
|
+ AMDILFrameLowering.cpp
|
|
+ AMDILIntrinsicInfo.cpp
|
|
+ AMDILISelDAGToDAG.cpp
|
|
+ AMDILISelLowering.cpp
|
|
+ AMDILNIDevice.cpp
|
|
+ AMDILPeepholeOptimizer.cpp
|
|
+ AMDILSIDevice.cpp
|
|
+ AMDGPUAsmPrinter.cpp
|
|
+ AMDGPUIndirectAddressing.cpp
|
|
+ AMDGPUMCInstLower.cpp
|
|
+ AMDGPUSubtarget.cpp
|
|
+ AMDGPUTargetMachine.cpp
|
|
+ AMDGPUISelLowering.cpp
|
|
+ AMDGPUConvertToISA.cpp
|
|
+ AMDGPUInstrInfo.cpp
|
|
+ AMDGPURegisterInfo.cpp
|
|
+ R600ExpandSpecialInstrs.cpp
|
|
+ R600InstrInfo.cpp
|
|
+ R600ISelLowering.cpp
|
|
+ R600LowerConstCopy.cpp
|
|
+ R600MachineFunctionInfo.cpp
|
|
+ R600RegisterInfo.cpp
|
|
+ SIAssignInterpRegs.cpp
|
|
+ SIInstrInfo.cpp
|
|
+ SIISelLowering.cpp
|
|
+ SILowerLiteralConstants.cpp
|
|
+ SILowerControlFlow.cpp
|
|
+ SIMachineFunctionInfo.cpp
|
|
+ SIRegisterInfo.cpp
|
|
+ SIFixSGPRLiveness.cpp
|
|
+ )
|
|
+
|
|
+add_dependencies(LLVMR600CodeGen intrinsics_gen)
|
|
+
|
|
+add_subdirectory(InstPrinter)
|
|
+add_subdirectory(TargetInfo)
|
|
+add_subdirectory(MCTargetDesc)
|
|
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
|
|
new file mode 100644
|
|
index 0000000..d6450a0
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
|
|
@@ -0,0 +1,168 @@
|
|
+//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUInstPrinter.h"
|
|
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
+#include "llvm/MC/MCInst.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
|
|
+ StringRef Annot) {
|
|
+ printInstruction(MI, OS);
|
|
+
|
|
+ printAnnotation(OS, Annot);
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+
|
|
+ const MCOperand &Op = MI->getOperand(OpNo);
|
|
+ if (Op.isReg()) {
|
|
+ switch (Op.getReg()) {
|
|
+ // This is the default predicate state, so we don't need to print it.
|
|
+ case AMDGPU::PRED_SEL_OFF: break;
|
|
+ default: O << getRegisterName(Op.getReg()); break;
|
|
+ }
|
|
+ } else if (Op.isImm()) {
|
|
+ O << Op.getImm();
|
|
+ } else if (Op.isFPImm()) {
|
|
+ O << Op.getFPImm();
|
|
+ } else {
|
|
+ assert(!"unknown operand type in printOperand");
|
|
+ }
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
|
|
+ raw_ostream &O) {
|
|
+ unsigned Imm = MI->getOperand(OpNum).getImm();
|
|
+
|
|
+ if (Imm == 2) {
|
|
+ O << "P0";
|
|
+ } else if (Imm == 1) {
|
|
+ O << "P20";
|
|
+ } else if (Imm == 0) {
|
|
+ O << "P10";
|
|
+ } else {
|
|
+ assert(!"Invalid interpolation parameter slot");
|
|
+ }
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printOperand(MI, OpNo, O);
|
|
+ O << ", ";
|
|
+ printOperand(MI, OpNo + 1, O);
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O, StringRef Asm) {
|
|
+ const MCOperand &Op = MI->getOperand(OpNo);
|
|
+ assert(Op.isImm());
|
|
+ if (Op.getImm() == 1) {
|
|
+ O << Asm;
|
|
+ }
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printIfSet(MI, OpNo, O, "|");
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printIfSet(MI, OpNo, O, "_SAT");
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ union Literal {
|
|
+ float f;
|
|
+ int32_t i;
|
|
+ } L;
|
|
+
|
|
+ L.i = MI->getOperand(OpNo).getImm();
|
|
+ O << L.i << "(" << L.f << ")";
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printIfSet(MI, OpNo, O, " *");
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printIfSet(MI, OpNo, O, "-");
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ switch (MI->getOperand(OpNo).getImm()) {
|
|
+ default: break;
|
|
+ case 1:
|
|
+ O << " * 2.0";
|
|
+ break;
|
|
+ case 2:
|
|
+ O << " * 4.0";
|
|
+ break;
|
|
+ case 3:
|
|
+ O << " / 2.0";
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printIfSet(MI, OpNo, O, "+");
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printIfSet(MI, OpNo, O, "ExecMask,");
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ printIfSet(MI, OpNo, O, "Pred,");
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ const MCOperand &Op = MI->getOperand(OpNo);
|
|
+ if (Op.getImm() == 0) {
|
|
+ O << " (MASKED)";
|
|
+ }
|
|
+}
|
|
+
|
|
+void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
|
|
+ raw_ostream &O) {
|
|
+ const char * chans = "XYZW";
|
|
+ int sel = MI->getOperand(OpNo).getImm();
|
|
+
|
|
+ int chan = sel & 3;
|
|
+ sel >>= 2;
|
|
+
|
|
+ if (sel >= 512) {
|
|
+ sel -= 512;
|
|
+ int cb = sel >> 12;
|
|
+ sel &= 4095;
|
|
+ O << cb << "[" << sel << "]";
|
|
+ } else if (sel >= 448) {
|
|
+ sel -= 448;
|
|
+ O << sel;
|
|
+ } else if (sel >= 0){
|
|
+ O << sel;
|
|
+ }
|
|
+
|
|
+ if (sel >= 0)
|
|
+ O << "." << chans[chan];
|
|
+}
|
|
+
|
|
+#include "AMDGPUGenAsmWriter.inc"
|
|
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
|
|
new file mode 100644
|
|
index 0000000..767a708
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
|
|
@@ -0,0 +1,54 @@
|
|
+//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPUINSTPRINTER_H
|
|
+#define AMDGPUINSTPRINTER_H
|
|
+
|
|
+#include "llvm/ADT/StringRef.h"
|
|
+#include "llvm/MC/MCInstPrinter.h"
|
|
+#include "llvm/Support/raw_ostream.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class AMDGPUInstPrinter : public MCInstPrinter {
|
|
+public:
|
|
+ AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
|
|
+ const MCRegisterInfo &MRI)
|
|
+ : MCInstPrinter(MAI, MII, MRI) {}
|
|
+
|
|
+ //Autogenerated by tblgen
|
|
+ void printInstruction(const MCInst *MI, raw_ostream &O);
|
|
+ static const char *getRegisterName(unsigned RegNo);
|
|
+
|
|
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
|
|
+
|
|
+private:
|
|
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
|
|
+ void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm);
|
|
+ void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+ void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // AMDGPUINSTRPRINTER_H
|
|
diff --git a/lib/Target/R600/InstPrinter/CMakeLists.txt b/lib/Target/R600/InstPrinter/CMakeLists.txt
|
|
new file mode 100644
|
|
index 0000000..6776337
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/InstPrinter/CMakeLists.txt
|
|
@@ -0,0 +1,7 @@
|
|
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
|
|
+
|
|
+add_llvm_library(LLVMR600AsmPrinter
|
|
+ AMDGPUInstPrinter.cpp
|
|
+ )
|
|
+
|
|
+add_dependencies(LLVMR600AsmPrinter R600CommonTableGen)
|
|
diff --git a/lib/Target/R600/InstPrinter/LLVMBuild.txt b/lib/Target/R600/InstPrinter/LLVMBuild.txt
|
|
new file mode 100644
|
|
index 0000000..ec0be89
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/InstPrinter/LLVMBuild.txt
|
|
@@ -0,0 +1,24 @@
|
|
+;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
|
|
+;
|
|
+; The LLVM Compiler Infrastructure
|
|
+;
|
|
+; This file is distributed under the University of Illinois Open Source
|
|
+; License. See LICENSE.TXT for details.
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+;
|
|
+; This is an LLVMBuild description file for the components in this subdirectory.
|
|
+;
|
|
+; For more information on the LLVMBuild system, please see:
|
|
+;
|
|
+; http://llvm.org/docs/LLVMBuild.html
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+
|
|
+[component_0]
|
|
+type = Library
|
|
+name = R600AsmPrinter
|
|
+parent = R600
|
|
+required_libraries = MC Support
|
|
+add_to_library_groups = R600
|
|
+
|
|
diff --git a/lib/Target/R600/InstPrinter/Makefile b/lib/Target/R600/InstPrinter/Makefile
|
|
new file mode 100644
|
|
index 0000000..a794cc1
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/InstPrinter/Makefile
|
|
@@ -0,0 +1,15 @@
|
|
+#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===##
|
|
+#
|
|
+# The LLVM Compiler Infrastructure
|
|
+#
|
|
+# This file is distributed under the University of Illinois Open Source
|
|
+# License. See LICENSE.TXT for details.
|
|
+#
|
|
+##===----------------------------------------------------------------------===##
|
|
+LEVEL = ../../../..
|
|
+LIBRARYNAME = LLVMR600AsmPrinter
|
|
+
|
|
+# Hack: we need to include 'main' x86 target directory to grab private headers
|
|
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
|
|
+
|
|
+include $(LEVEL)/Makefile.common
|
|
diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt
|
|
new file mode 100644
|
|
index 0000000..f2a7554
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/LLVMBuild.txt
|
|
@@ -0,0 +1,32 @@
|
|
+;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===;
|
|
+;
|
|
+; The LLVM Compiler Infrastructure
|
|
+;
|
|
+; This file is distributed under the University of Illinois Open Source
|
|
+; License. See LICENSE.TXT for details.
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+;
|
|
+; This is an LLVMBuild description file for the components in this subdirectory.
|
|
+;
|
|
+; For more information on the LLVMBuild system, please see:
|
|
+;
|
|
+; http://llvm.org/docs/LLVMBuild.html
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+
|
|
+[common]
|
|
+subdirectories = InstPrinter MCTargetDesc TargetInfo
|
|
+
|
|
+[component_0]
|
|
+type = TargetGroup
|
|
+name = R600
|
|
+parent = Target
|
|
+has_asmprinter = 1
|
|
+
|
|
+[component_1]
|
|
+type = Library
|
|
+name = R600CodeGen
|
|
+parent = R600
|
|
+required_libraries = AsmPrinter CodeGen Core SelectionDAG Support Target MC R600AsmPrinter R600Desc R600Info
|
|
+add_to_library_groups = R600
|
|
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
|
|
new file mode 100644
|
|
index 0000000..8f41ebb
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
|
|
@@ -0,0 +1,90 @@
|
|
+//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
+#include "llvm/ADT/StringRef.h"
|
|
+#include "llvm/MC/MCAsmBackend.h"
|
|
+#include "llvm/MC/MCAssembler.h"
|
|
+#include "llvm/MC/MCObjectWriter.h"
|
|
+#include "llvm/MC/MCValue.h"
|
|
+#include "llvm/Support/TargetRegistry.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+class AMDGPUMCObjectWriter : public MCObjectWriter {
|
|
+public:
|
|
+ AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
|
|
+ virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
|
|
+ const MCAsmLayout &Layout) {
|
|
+ //XXX: Implement if necessary.
|
|
+ }
|
|
+ virtual void RecordRelocation(const MCAssembler &Asm,
|
|
+ const MCAsmLayout &Layout,
|
|
+ const MCFragment *Fragment,
|
|
+ const MCFixup &Fixup,
|
|
+ MCValue Target, uint64_t &FixedValue) {
|
|
+ assert(!"Not implemented");
|
|
+ }
|
|
+
|
|
+ virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
|
|
+
|
|
+};
|
|
+
|
|
+class AMDGPUAsmBackend : public MCAsmBackend {
|
|
+public:
|
|
+ AMDGPUAsmBackend(const Target &T)
|
|
+ : MCAsmBackend() {}
|
|
+
|
|
+ virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const;
|
|
+ virtual unsigned getNumFixupKinds() const { return 0; };
|
|
+ virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
|
|
+ uint64_t Value) const;
|
|
+ virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
|
|
+ const MCInstFragment *DF,
|
|
+ const MCAsmLayout &Layout) const {
|
|
+ return false;
|
|
+ }
|
|
+ virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
|
|
+ assert(!"Not implemented");
|
|
+ }
|
|
+ virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
|
|
+ virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
|
|
+ return true;
|
|
+ }
|
|
+};
|
|
+
|
|
+} //End anonymous namespace
|
|
+
|
|
+void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
|
|
+ const MCAsmLayout &Layout) {
|
|
+ for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) {
|
|
+ Asm.writeSectionData(I, Layout);
|
|
+ }
|
|
+}
|
|
+
|
|
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
|
|
+ StringRef CPU) {
|
|
+ return new AMDGPUAsmBackend(T);
|
|
+}
|
|
+
|
|
+AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter(
|
|
+ raw_ostream &OS) const {
|
|
+ return new AMDGPUMCObjectWriter(OS);
|
|
+}
|
|
+
|
|
+void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
|
|
+ unsigned DataSize, uint64_t Value) const {
|
|
+
|
|
+ uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
|
|
+ assert(Fixup.getKind() == FK_PCRel_4);
|
|
+ *Dst = (Value - 4) / 4;
|
|
+}
|
|
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..4d3d3e7
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
|
|
@@ -0,0 +1,85 @@
|
|
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUMCAsmInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
|
|
+ HasSingleParameterDotFile = false;
|
|
+ WeakDefDirective = 0;
|
|
+ //===------------------------------------------------------------------===//
|
|
+ HasSubsectionsViaSymbols = true;
|
|
+ HasMachoZeroFillDirective = false;
|
|
+ HasMachoTBSSDirective = false;
|
|
+ HasStaticCtorDtorReferenceInStaticMode = false;
|
|
+ LinkerRequiresNonEmptyDwarfLines = true;
|
|
+ MaxInstLength = 16;
|
|
+ PCSymbol = "$";
|
|
+ SeparatorString = "\n";
|
|
+ CommentColumn = 40;
|
|
+ CommentString = ";";
|
|
+ LabelSuffix = ":";
|
|
+ GlobalPrefix = "@";
|
|
+ PrivateGlobalPrefix = ";.";
|
|
+ LinkerPrivateGlobalPrefix = "!";
|
|
+ InlineAsmStart = ";#ASMSTART";
|
|
+ InlineAsmEnd = ";#ASMEND";
|
|
+ AssemblerDialect = 0;
|
|
+ AllowQuotesInName = false;
|
|
+ AllowNameToStartWithDigit = false;
|
|
+ AllowPeriodsInName = false;
|
|
+
|
|
+ //===--- Data Emission Directives -------------------------------------===//
|
|
+ ZeroDirective = ".zero";
|
|
+ AsciiDirective = ".ascii\t";
|
|
+ AscizDirective = ".asciz\t";
|
|
+ Data8bitsDirective = ".byte\t";
|
|
+ Data16bitsDirective = ".short\t";
|
|
+ Data32bitsDirective = ".long\t";
|
|
+ Data64bitsDirective = ".quad\t";
|
|
+ GPRel32Directive = 0;
|
|
+ SunStyleELFSectionSwitchSyntax = true;
|
|
+ UsesELFSectionDirectiveForBSS = true;
|
|
+ HasMicrosoftFastStdCallMangling = false;
|
|
+
|
|
+ //===--- Alignment Information ----------------------------------------===//
|
|
+ AlignDirective = ".align\t";
|
|
+ AlignmentIsInBytes = true;
|
|
+ TextAlignFillValue = 0;
|
|
+
|
|
+ //===--- Global Variable Emission Directives --------------------------===//
|
|
+ GlobalDirective = ".global";
|
|
+ ExternDirective = ".extern";
|
|
+ HasSetDirective = false;
|
|
+ HasAggressiveSymbolFolding = true;
|
|
+ COMMDirectiveAlignmentIsInBytes = false;
|
|
+ HasDotTypeDotSizeDirective = false;
|
|
+ HasNoDeadStrip = true;
|
|
+ HasSymbolResolver = false;
|
|
+ WeakRefDirective = ".weakref\t";
|
|
+ LinkOnceDirective = 0;
|
|
+ //===--- Dwarf Emission Directives -----------------------------------===//
|
|
+ HasLEB128 = true;
|
|
+ SupportsDebugInformation = true;
|
|
+ ExceptionsType = ExceptionHandling::None;
|
|
+ DwarfUsesInlineInfoSection = false;
|
|
+ DwarfSectionOffsetDirective = ".offset";
|
|
+
|
|
+}
|
|
+
|
|
+const char*
|
|
+AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const {
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+const MCSection*
|
|
+AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
|
|
+ return 0;
|
|
+}
|
|
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
|
|
new file mode 100644
|
|
index 0000000..3ad0fa6
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
|
|
@@ -0,0 +1,30 @@
|
|
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPUMCASMINFO_H
|
|
+#define AMDGPUMCASMINFO_H
|
|
+
|
|
+#include "llvm/MC/MCAsmInfo.h"
|
|
+namespace llvm {
|
|
+
|
|
+class Target;
|
|
+class StringRef;
|
|
+
|
|
+class AMDGPUMCAsmInfo : public MCAsmInfo {
|
|
+public:
|
|
+ explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT);
|
|
+ const char* getDataASDirective(unsigned int Size, unsigned int AS) const;
|
|
+ const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
|
|
+};
|
|
+} // namespace llvm
|
|
+#endif // AMDGPUMCASMINFO_H
|
|
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
|
|
new file mode 100644
|
|
index 0000000..8721f80
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
|
|
@@ -0,0 +1,49 @@
|
|
+//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief CodeEmitter interface for R600 and SI codegen.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef AMDGPUCODEEMITTER_H
|
|
+#define AMDGPUCODEEMITTER_H
|
|
+
|
|
+#include "llvm/MC/MCCodeEmitter.h"
|
|
+#include "llvm/Support/raw_ostream.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class MCInst;
|
|
+class MCOperand;
|
|
+
|
|
+class AMDGPUMCCodeEmitter : public MCCodeEmitter {
|
|
+public:
|
|
+
|
|
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const;
|
|
+
|
|
+ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const {
|
|
+ return 0;
|
|
+ }
|
|
+ virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const {
|
|
+ return 0;
|
|
+ }
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // AMDGPUCODEEMITTER_H
|
|
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
|
|
new file mode 100644
|
|
index 0000000..6a62856
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
|
|
@@ -0,0 +1,113 @@
|
|
+//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief This file provides AMDGPU specific target descriptions.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPUMCTargetDesc.h"
|
|
+#include "AMDGPUMCAsmInfo.h"
|
|
+#include "InstPrinter/AMDGPUInstPrinter.h"
|
|
+#include "llvm/MC/MachineLocation.h"
|
|
+#include "llvm/MC/MCCodeGenInfo.h"
|
|
+#include "llvm/MC/MCInstrInfo.h"
|
|
+#include "llvm/MC/MCRegisterInfo.h"
|
|
+#include "llvm/MC/MCStreamer.h"
|
|
+#include "llvm/MC/MCSubtargetInfo.h"
|
|
+#include "llvm/Support/ErrorHandling.h"
|
|
+#include "llvm/Support/TargetRegistry.h"
|
|
+
|
|
+#define GET_INSTRINFO_MC_DESC
|
|
+#include "AMDGPUGenInstrInfo.inc"
|
|
+
|
|
+#define GET_SUBTARGETINFO_MC_DESC
|
|
+#include "AMDGPUGenSubtargetInfo.inc"
|
|
+
|
|
+#define GET_REGINFO_MC_DESC
|
|
+#include "AMDGPUGenRegisterInfo.inc"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+static MCInstrInfo *createAMDGPUMCInstrInfo() {
|
|
+ MCInstrInfo *X = new MCInstrInfo();
|
|
+ InitAMDGPUMCInstrInfo(X);
|
|
+ return X;
|
|
+}
|
|
+
|
|
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
|
|
+ MCRegisterInfo *X = new MCRegisterInfo();
|
|
+ InitAMDGPUMCRegisterInfo(X, 0);
|
|
+ return X;
|
|
+}
|
|
+
|
|
+static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU,
|
|
+ StringRef FS) {
|
|
+ MCSubtargetInfo * X = new MCSubtargetInfo();
|
|
+ InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
|
|
+ return X;
|
|
+}
|
|
+
|
|
+static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
|
|
+ CodeModel::Model CM,
|
|
+ CodeGenOpt::Level OL) {
|
|
+ MCCodeGenInfo *X = new MCCodeGenInfo();
|
|
+ X->InitMCCodeGenInfo(RM, CM, OL);
|
|
+ return X;
|
|
+}
|
|
+
|
|
+static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T,
|
|
+ unsigned SyntaxVariant,
|
|
+ const MCAsmInfo &MAI,
|
|
+ const MCInstrInfo &MII,
|
|
+ const MCRegisterInfo &MRI,
|
|
+ const MCSubtargetInfo &STI) {
|
|
+ return new AMDGPUInstPrinter(MAI, MII, MRI);
|
|
+}
|
|
+
|
|
+static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
|
|
+ const MCRegisterInfo &MRI,
|
|
+ const MCSubtargetInfo &STI,
|
|
+ MCContext &Ctx) {
|
|
+ if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
|
|
+ return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
|
|
+ } else {
|
|
+ return createR600MCCodeEmitter(MCII, MRI, STI, Ctx);
|
|
+ }
|
|
+}
|
|
+
|
|
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
|
|
+ MCContext &Ctx, MCAsmBackend &MAB,
|
|
+ raw_ostream &_OS,
|
|
+ MCCodeEmitter *_Emitter,
|
|
+ bool RelaxAll,
|
|
+ bool NoExecStack) {
|
|
+ return createPureStreamer(Ctx, MAB, _OS, _Emitter);
|
|
+}
|
|
+
|
|
+extern "C" void LLVMInitializeR600TargetMC() {
|
|
+
|
|
+ RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
|
|
+
|
|
+ TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
|
|
+
|
|
+ TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
|
|
+
|
|
+ TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
|
|
+
|
|
+ TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
|
|
+
|
|
+ TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
|
|
+
|
|
+ TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
|
|
+
|
|
+ TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
|
|
+
|
|
+ TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
|
|
+}
|
|
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
|
|
new file mode 100644
|
|
index 0000000..363a4af
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
|
|
@@ -0,0 +1,55 @@
|
|
+//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Provides AMDGPU specific target descriptions.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+
|
|
+#ifndef AMDGPUMCTARGETDESC_H
|
|
+#define AMDGPUMCTARGETDESC_H
|
|
+
|
|
+#include "llvm/ADT/StringRef.h"
|
|
+
|
|
+namespace llvm {
|
|
+class MCAsmBackend;
|
|
+class MCCodeEmitter;
|
|
+class MCContext;
|
|
+class MCInstrInfo;
|
|
+class MCRegisterInfo;
|
|
+class MCSubtargetInfo;
|
|
+class Target;
|
|
+
|
|
+extern Target TheAMDGPUTarget;
|
|
+
|
|
+MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
|
|
+ const MCRegisterInfo &MRI,
|
|
+ const MCSubtargetInfo &STI,
|
|
+ MCContext &Ctx);
|
|
+
|
|
+MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
|
|
+ const MCRegisterInfo &MRI,
|
|
+ const MCSubtargetInfo &STI,
|
|
+ MCContext &Ctx);
|
|
+
|
|
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
|
|
+ StringRef CPU);
|
|
+} // End llvm namespace
|
|
+
|
|
+#define GET_REGINFO_ENUM
|
|
+#include "AMDGPUGenRegisterInfo.inc"
|
|
+
|
|
+#define GET_INSTRINFO_ENUM
|
|
+#include "AMDGPUGenInstrInfo.inc"
|
|
+
|
|
+#define GET_SUBTARGETINFO_ENUM
|
|
+#include "AMDGPUGenSubtargetInfo.inc"
|
|
+
|
|
+#endif // AMDGPUMCTARGETDESC_H
|
|
diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
|
|
new file mode 100644
|
|
index 0000000..37e714c
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
|
|
@@ -0,0 +1,10 @@
|
|
+
|
|
+add_llvm_library(LLVMR600Desc
|
|
+ AMDGPUAsmBackend.cpp
|
|
+ AMDGPUMCTargetDesc.cpp
|
|
+ AMDGPUMCAsmInfo.cpp
|
|
+ R600MCCodeEmitter.cpp
|
|
+ SIMCCodeEmitter.cpp
|
|
+ )
|
|
+
|
|
+add_dependencies(LLVMR600Desc AMDGPUCommonTableGen)
|
|
diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
|
|
new file mode 100644
|
|
index 0000000..b1beab0
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
|
|
@@ -0,0 +1,23 @@
|
|
+;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
|
|
+;
|
|
+; The LLVM Compiler Infrastructure
|
|
+;
|
|
+; This file is distributed under the University of Illinois Open Source
|
|
+; License. See LICENSE.TXT for details.
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+;
|
|
+; This is an LLVMBuild description file for the components in this subdirectory.
|
|
+;
|
|
+; For more information on the LLVMBuild system, please see:
|
|
+;
|
|
+; http://llvm.org/docs/LLVMBuild.html
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+
|
|
+[component_0]
|
|
+type = Library
|
|
+name = R600Desc
|
|
+parent = R600
|
|
+required_libraries = R600AsmPrinter R600Info MC
|
|
+add_to_library_groups = R600
|
|
diff --git a/lib/Target/R600/MCTargetDesc/Makefile b/lib/Target/R600/MCTargetDesc/Makefile
|
|
new file mode 100644
|
|
index 0000000..8894a76
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/Makefile
|
|
@@ -0,0 +1,16 @@
|
|
+##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===##
|
|
+#
|
|
+# The LLVM Compiler Infrastructure
|
|
+#
|
|
+# This file is distributed under the University of Illinois Open Source
|
|
+# License. See LICENSE.TXT for details.
|
|
+#
|
|
+##===----------------------------------------------------------------------===##
|
|
+
|
|
+LEVEL = ../../../..
|
|
+LIBRARYNAME = LLVMR600Desc
|
|
+
|
|
+# Hack: we need to include 'main' target directory to grab private headers
|
|
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
|
|
+
|
|
+include $(LEVEL)/Makefile.common
|
|
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
|
|
new file mode 100644
|
|
index 0000000..115fe8d
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
|
|
@@ -0,0 +1,582 @@
|
|
+//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+///
|
|
+/// This code emitter outputs bytecode that is understood by the r600g driver
|
|
+/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA,
|
|
+/// but it still needs to be run through a finalizer in order to be executed
|
|
+/// by the GPU.
|
|
+///
|
|
+/// [1] http://www.mesa3d.org/
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "R600Defines.h"
|
|
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
|
|
+#include "llvm/MC/MCCodeEmitter.h"
|
|
+#include "llvm/MC/MCContext.h"
|
|
+#include "llvm/MC/MCInst.h"
|
|
+#include "llvm/MC/MCInstrInfo.h"
|
|
+#include "llvm/MC/MCRegisterInfo.h"
|
|
+#include "llvm/MC/MCSubtargetInfo.h"
|
|
+#include "llvm/Support/raw_ostream.h"
|
|
+
|
|
+#include <stdio.h>
|
|
+
|
|
+#define SRC_BYTE_COUNT 11
|
|
+#define DST_BYTE_COUNT 5
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
|
|
+ R600MCCodeEmitter(const R600MCCodeEmitter &); // DO NOT IMPLEMENT
|
|
+ void operator=(const R600MCCodeEmitter &); // DO NOT IMPLEMENT
|
|
+ const MCInstrInfo &MCII;
|
|
+ const MCRegisterInfo &MRI;
|
|
+ const MCSubtargetInfo &STI;
|
|
+ MCContext &Ctx;
|
|
+
|
|
+public:
|
|
+
|
|
+ R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
|
|
+ const MCSubtargetInfo &sti, MCContext &ctx)
|
|
+ : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
|
|
+
|
|
+ /// \brief Encode the instruction and write it to the OS.
|
|
+ virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const;
|
|
+
|
|
+ /// \returns the encoding for an MCOperand.
|
|
+ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const;
|
|
+private:
|
|
+
|
|
+ void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
|
|
+ raw_ostream &OS) const;
|
|
+ void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const;
|
|
+ void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx,
|
|
+ raw_ostream &OS) const;
|
|
+ void EmitDst(const MCInst &MI, raw_ostream &OS) const;
|
|
+ void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
|
|
+ raw_ostream &OS) const;
|
|
+ void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const;
|
|
+
|
|
+ void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const;
|
|
+
|
|
+ void EmitByte(unsigned int byte, raw_ostream &OS) const;
|
|
+
|
|
+ void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const;
|
|
+
|
|
+ void Emit(uint32_t value, raw_ostream &OS) const;
|
|
+ void Emit(uint64_t value, raw_ostream &OS) const;
|
|
+
|
|
+ unsigned getHWRegChan(unsigned reg) const;
|
|
+ unsigned getHWReg(unsigned regNo) const;
|
|
+
|
|
+ bool isFCOp(unsigned opcode) const;
|
|
+ bool isTexOp(unsigned opcode) const;
|
|
+ bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const;
|
|
+
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+enum RegElement {
|
|
+ ELEMENT_X = 0,
|
|
+ ELEMENT_Y,
|
|
+ ELEMENT_Z,
|
|
+ ELEMENT_W
|
|
+};
|
|
+
|
|
+enum InstrTypes {
|
|
+ INSTR_ALU = 0,
|
|
+ INSTR_TEX,
|
|
+ INSTR_FC,
|
|
+ INSTR_NATIVE,
|
|
+ INSTR_VTX,
|
|
+ INSTR_EXPORT
|
|
+};
|
|
+
|
|
+enum FCInstr {
|
|
+ FC_IF_PREDICATE = 0,
|
|
+ FC_ELSE,
|
|
+ FC_ENDIF,
|
|
+ FC_BGNLOOP,
|
|
+ FC_ENDLOOP,
|
|
+ FC_BREAK_PREDICATE,
|
|
+ FC_CONTINUE
|
|
+};
|
|
+
|
|
+enum TextureTypes {
|
|
+ TEXTURE_1D = 1,
|
|
+ TEXTURE_2D,
|
|
+ TEXTURE_3D,
|
|
+ TEXTURE_CUBE,
|
|
+ TEXTURE_RECT,
|
|
+ TEXTURE_SHADOW1D,
|
|
+ TEXTURE_SHADOW2D,
|
|
+ TEXTURE_SHADOWRECT,
|
|
+ TEXTURE_1D_ARRAY,
|
|
+ TEXTURE_2D_ARRAY,
|
|
+ TEXTURE_SHADOW1D_ARRAY,
|
|
+ TEXTURE_SHADOW2D_ARRAY
|
|
+};
|
|
+
|
|
+MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
|
|
+ const MCRegisterInfo &MRI,
|
|
+ const MCSubtargetInfo &STI,
|
|
+ MCContext &Ctx) {
|
|
+ return new R600MCCodeEmitter(MCII, MRI, STI, Ctx);
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const {
|
|
+ if (isTexOp(MI.getOpcode())) {
|
|
+ EmitTexInstr(MI, Fixups, OS);
|
|
+ } else if (isFCOp(MI.getOpcode())){
|
|
+ EmitFCInstr(MI, OS);
|
|
+ } else if (MI.getOpcode() == AMDGPU::RETURN ||
|
|
+ MI.getOpcode() == AMDGPU::BUNDLE ||
|
|
+ MI.getOpcode() == AMDGPU::KILL) {
|
|
+ return;
|
|
+ } else {
|
|
+ switch(MI.getOpcode()) {
|
|
+ case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
|
|
+ case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
|
|
+ uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
|
|
+ EmitByte(INSTR_NATIVE, OS);
|
|
+ Emit(inst, OS);
|
|
+ break;
|
|
+ }
|
|
+ case AMDGPU::CONSTANT_LOAD_eg:
|
|
+ case AMDGPU::VTX_READ_PARAM_8_eg:
|
|
+ case AMDGPU::VTX_READ_PARAM_16_eg:
|
|
+ case AMDGPU::VTX_READ_PARAM_32_eg:
|
|
+ case AMDGPU::VTX_READ_PARAM_128_eg:
|
|
+ case AMDGPU::VTX_READ_GLOBAL_8_eg:
|
|
+ case AMDGPU::VTX_READ_GLOBAL_32_eg:
|
|
+ case AMDGPU::VTX_READ_GLOBAL_128_eg:
|
|
+ case AMDGPU::TEX_VTX_CONSTBUF:
|
|
+ case AMDGPU::TEX_VTX_TEXBUF : {
|
|
+ uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
|
|
+ uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
|
|
+
|
|
+ EmitByte(INSTR_VTX, OS);
|
|
+ Emit(InstWord01, OS);
|
|
+ Emit(InstWord2, OS);
|
|
+ break;
|
|
+ }
|
|
+ case AMDGPU::EG_ExportSwz:
|
|
+ case AMDGPU::R600_ExportSwz:
|
|
+ case AMDGPU::EG_ExportBuf:
|
|
+ case AMDGPU::R600_ExportBuf: {
|
|
+ uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
|
|
+ EmitByte(INSTR_EXPORT, OS);
|
|
+ Emit(Inst, OS);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ default:
|
|
+ EmitALUInstr(MI, Fixups, OS);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
|
|
+ SmallVectorImpl<MCFixup> &Fixups,
|
|
+ raw_ostream &OS) const {
|
|
+ const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
|
|
+
|
|
+ // Emit instruction type
|
|
+ EmitByte(INSTR_ALU, OS);
|
|
+
|
|
+ uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
|
|
+
|
|
+ //older alu have different encoding for instructions with one or two src
|
|
+ //parameters.
|
|
+ if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
|
|
+ !(MCDesc.TSFlags & R600_InstFlag::OP3)) {
|
|
+ uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39);
|
|
+ InstWord01 &= ~(0x3FFULL << 39);
|
|
+ InstWord01 |= ISAOpCode << 1;
|
|
+ }
|
|
+
|
|
+ unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 :
|
|
+ MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1;
|
|
+
|
|
+ EmitByte(SrcNum, OS);
|
|
+
|
|
+ const unsigned SrcOps[3][2] = {
|
|
+ {R600Operands::SRC0, R600Operands::SRC0_SEL},
|
|
+ {R600Operands::SRC1, R600Operands::SRC1_SEL},
|
|
+ {R600Operands::SRC2, R600Operands::SRC2_SEL}
|
|
+ };
|
|
+
|
|
+ for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) {
|
|
+ unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]];
|
|
+ unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]];
|
|
+ EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS);
|
|
+ }
|
|
+
|
|
+ Emit(InstWord01, OS);
|
|
+ return;
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx,
|
|
+ raw_ostream &OS) const {
|
|
+ const MCOperand &MO = MI.getOperand(OpIdx);
|
|
+ union {
|
|
+ float f;
|
|
+ uint32_t i;
|
|
+ } Value;
|
|
+ Value.i = 0;
|
|
+ // Emit the source select (2 bytes). For GPRs, this is the register index.
|
|
+ // For other potential instruction operands, (e.g. constant registers) the
|
|
+ // value of the source select is defined in the r600isa docs.
|
|
+ if (MO.isReg()) {
|
|
+ unsigned reg = MO.getReg();
|
|
+ EmitTwoBytes(getHWReg(reg), OS);
|
|
+ if (reg == AMDGPU::ALU_LITERAL_X) {
|
|
+ unsigned ImmOpIndex = MI.getNumOperands() - 1;
|
|
+ MCOperand ImmOp = MI.getOperand(ImmOpIndex);
|
|
+ if (ImmOp.isFPImm()) {
|
|
+ Value.f = ImmOp.getFPImm();
|
|
+ } else {
|
|
+ assert(ImmOp.isImm());
|
|
+ Value.i = ImmOp.getImm();
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ // XXX: Handle other operand types.
|
|
+ EmitTwoBytes(0, OS);
|
|
+ }
|
|
+
|
|
+ // Emit the source channel (1 byte)
|
|
+ if (MO.isReg()) {
|
|
+ EmitByte(getHWRegChan(MO.getReg()), OS);
|
|
+ } else {
|
|
+ EmitByte(0, OS);
|
|
+ }
|
|
+
|
|
+ // XXX: Emit isNegated (1 byte)
|
|
+ if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS)))
|
|
+ && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) ||
|
|
+ (MO.isReg() &&
|
|
+ (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){
|
|
+ EmitByte(1, OS);
|
|
+ } else {
|
|
+ EmitByte(0, OS);
|
|
+ }
|
|
+
|
|
+ // Emit isAbsolute (1 byte)
|
|
+ if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) {
|
|
+ EmitByte(1, OS);
|
|
+ } else {
|
|
+ EmitByte(0, OS);
|
|
+ }
|
|
+
|
|
+ // XXX: Emit relative addressing mode (1 byte)
|
|
+ EmitByte(0, OS);
|
|
+
|
|
+  // Emit kc_bank; this will be adjusted later by r600_asm.
|
|
+ EmitByte(0, OS);
|
|
+
|
|
+ // Emit the literal value, if applicable (4 bytes).
|
|
+ Emit(Value.i, OS);
|
|
+
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx,
|
|
+ unsigned SelOpIdx, raw_ostream &OS) const {
|
|
+ const MCOperand &RegMO = MI.getOperand(RegOpIdx);
|
|
+ const MCOperand &SelMO = MI.getOperand(SelOpIdx);
|
|
+
|
|
+ union {
|
|
+ float f;
|
|
+ uint32_t i;
|
|
+ } InlineConstant;
|
|
+ InlineConstant.i = 0;
|
|
+ // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0
|
|
+  // and select is 0 (GPR index is encoded in the instr encoding). For constants
|
|
+ // type is 1 and select is the original const select passed from the driver.
|
|
+ unsigned Reg = RegMO.getReg();
|
|
+ if (Reg == AMDGPU::ALU_CONST) {
|
|
+ EmitByte(1, OS);
|
|
+ uint32_t Sel = SelMO.getImm();
|
|
+ Emit(Sel, OS);
|
|
+ } else {
|
|
+ EmitByte(0, OS);
|
|
+ Emit((uint32_t)0, OS);
|
|
+ }
|
|
+
|
|
+ if (Reg == AMDGPU::ALU_LITERAL_X) {
|
|
+ unsigned ImmOpIndex = MI.getNumOperands() - 1;
|
|
+ MCOperand ImmOp = MI.getOperand(ImmOpIndex);
|
|
+ if (ImmOp.isFPImm()) {
|
|
+ InlineConstant.f = ImmOp.getFPImm();
|
|
+ } else {
|
|
+ assert(ImmOp.isImm());
|
|
+ InlineConstant.i = ImmOp.getImm();
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // Emit the literal value, if applicable (4 bytes).
|
|
+ Emit(InlineConstant.i, OS);
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitTexInstr(const MCInst &MI,
|
|
+ SmallVectorImpl<MCFixup> &Fixups,
|
|
+ raw_ostream &OS) const {
|
|
+
|
|
+ unsigned Opcode = MI.getOpcode();
|
|
+ bool hasOffsets = (Opcode == AMDGPU::TEX_LD);
|
|
+ unsigned OpOffset = hasOffsets ? 3 : 0;
|
|
+ int64_t Resource = MI.getOperand(OpOffset + 2).getImm();
|
|
+ int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
|
|
+ int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
|
|
+ unsigned srcSelect[4] = {0, 1, 2, 3};
|
|
+
|
|
+ // Emit instruction type
|
|
+ EmitByte(1, OS);
|
|
+
|
|
+ // Emit instruction
|
|
+ EmitByte(getBinaryCodeForInstr(MI, Fixups), OS);
|
|
+
|
|
+ // Emit resource id
|
|
+ EmitByte(Resource, OS);
|
|
+
|
|
+ // Emit source register
|
|
+ EmitByte(getHWReg(MI.getOperand(1).getReg()), OS);
|
|
+
|
|
+ // XXX: Emit src isRelativeAddress
|
|
+ EmitByte(0, OS);
|
|
+
|
|
+ // Emit destination register
|
|
+ EmitByte(getHWReg(MI.getOperand(0).getReg()), OS);
|
|
+
|
|
+  // XXX: Emit dst isRelativeAddress
|
|
+ EmitByte(0, OS);
|
|
+
|
|
+ // XXX: Emit dst select
|
|
+ EmitByte(0, OS); // X
|
|
+ EmitByte(1, OS); // Y
|
|
+ EmitByte(2, OS); // Z
|
|
+ EmitByte(3, OS); // W
|
|
+
|
|
+ // XXX: Emit lod bias
|
|
+ EmitByte(0, OS);
|
|
+
|
|
+ // XXX: Emit coord types
|
|
+ unsigned coordType[4] = {1, 1, 1, 1};
|
|
+
|
|
+ if (TextureType == TEXTURE_RECT
|
|
+ || TextureType == TEXTURE_SHADOWRECT) {
|
|
+ coordType[ELEMENT_X] = 0;
|
|
+ coordType[ELEMENT_Y] = 0;
|
|
+ }
|
|
+
|
|
+ if (TextureType == TEXTURE_1D_ARRAY
|
|
+ || TextureType == TEXTURE_SHADOW1D_ARRAY) {
|
|
+ if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
|
|
+ coordType[ELEMENT_Y] = 0;
|
|
+ } else {
|
|
+ coordType[ELEMENT_Z] = 0;
|
|
+ srcSelect[ELEMENT_Z] = ELEMENT_Y;
|
|
+ }
|
|
+ } else if (TextureType == TEXTURE_2D_ARRAY
|
|
+ || TextureType == TEXTURE_SHADOW2D_ARRAY) {
|
|
+ coordType[ELEMENT_Z] = 0;
|
|
+ }
|
|
+
|
|
+ for (unsigned i = 0; i < 4; i++) {
|
|
+ EmitByte(coordType[i], OS);
|
|
+ }
|
|
+
|
|
+ // XXX: Emit offsets
|
|
+ if (hasOffsets)
|
|
+ for (unsigned i = 2; i < 5; i++)
|
|
+ EmitByte(MI.getOperand(i).getImm()<<1, OS);
|
|
+ else
|
|
+ EmitNullBytes(3, OS);
|
|
+
|
|
+ // Emit sampler id
|
|
+ EmitByte(Sampler, OS);
|
|
+
|
|
+  // XXX: Emit source select
|
|
+ if ((TextureType == TEXTURE_SHADOW1D
|
|
+ || TextureType == TEXTURE_SHADOW2D
|
|
+ || TextureType == TEXTURE_SHADOWRECT
|
|
+ || TextureType == TEXTURE_SHADOW1D_ARRAY)
|
|
+ && Opcode != AMDGPU::TEX_SAMPLE_C_L
|
|
+ && Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
|
|
+ srcSelect[ELEMENT_W] = ELEMENT_Z;
|
|
+ }
|
|
+
|
|
+ for (unsigned i = 0; i < 4; i++) {
|
|
+ EmitByte(srcSelect[i], OS);
|
|
+ }
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const {
|
|
+
|
|
+ // Emit instruction type
|
|
+ EmitByte(INSTR_FC, OS);
|
|
+
|
|
+ // Emit SRC
|
|
+ unsigned NumOperands = MI.getNumOperands();
|
|
+ if (NumOperands > 0) {
|
|
+ assert(NumOperands == 1);
|
|
+ EmitSrc(MI, 0, OS);
|
|
+ } else {
|
|
+ EmitNullBytes(SRC_BYTE_COUNT, OS);
|
|
+ }
|
|
+
|
|
+ // Emit FC Instruction
|
|
+ enum FCInstr instr;
|
|
+ switch (MI.getOpcode()) {
|
|
+ case AMDGPU::PREDICATED_BREAK:
|
|
+ instr = FC_BREAK_PREDICATE;
|
|
+ break;
|
|
+ case AMDGPU::CONTINUE:
|
|
+ instr = FC_CONTINUE;
|
|
+ break;
|
|
+ case AMDGPU::IF_PREDICATE_SET:
|
|
+ instr = FC_IF_PREDICATE;
|
|
+ break;
|
|
+ case AMDGPU::ELSE:
|
|
+ instr = FC_ELSE;
|
|
+ break;
|
|
+ case AMDGPU::ENDIF:
|
|
+ instr = FC_ENDIF;
|
|
+ break;
|
|
+ case AMDGPU::ENDLOOP:
|
|
+ instr = FC_ENDLOOP;
|
|
+ break;
|
|
+ case AMDGPU::WHILELOOP:
|
|
+ instr = FC_BGNLOOP;
|
|
+ break;
|
|
+ default:
|
|
+ abort();
|
|
+ break;
|
|
+ }
|
|
+ EmitByte(instr, OS);
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount,
|
|
+ raw_ostream &OS) const {
|
|
+
|
|
+ for (unsigned int i = 0; i < ByteCount; i++) {
|
|
+ EmitByte(0, OS);
|
|
+ }
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
|
|
+ OS.write((uint8_t) Byte & 0xff);
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes,
|
|
+ raw_ostream &OS) const {
|
|
+ OS.write((uint8_t) (Bytes & 0xff));
|
|
+ OS.write((uint8_t) ((Bytes >> 8) & 0xff));
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
|
|
+ for (unsigned i = 0; i < 4; i++) {
|
|
+ OS.write((uint8_t) ((Value >> (8 * i)) & 0xff));
|
|
+ }
|
|
+}
|
|
+
|
|
+void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
|
|
+ for (unsigned i = 0; i < 8; i++) {
|
|
+ EmitByte((Value >> (8 * i)) & 0xff, OS);
|
|
+ }
|
|
+}
|
|
+
|
|
+unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
|
|
+ return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
|
|
+}
|
|
+
|
|
+unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
|
|
+ return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
|
|
+}
|
|
+
|
|
+uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
|
|
+ const MCOperand &MO,
|
|
+ SmallVectorImpl<MCFixup> &Fixup) const {
|
|
+ if (MO.isReg()) {
|
|
+ if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) {
|
|
+ return MRI.getEncodingValue(MO.getReg());
|
|
+ } else {
|
|
+ return getHWReg(MO.getReg());
|
|
+ }
|
|
+ } else if (MO.isImm()) {
|
|
+ return MO.getImm();
|
|
+ } else {
|
|
+ assert(0);
|
|
+ return 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Encoding helper functions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+bool R600MCCodeEmitter::isFCOp(unsigned opcode) const {
|
|
+ switch(opcode) {
|
|
+ default: return false;
|
|
+ case AMDGPU::PREDICATED_BREAK:
|
|
+ case AMDGPU::CONTINUE:
|
|
+ case AMDGPU::IF_PREDICATE_SET:
|
|
+ case AMDGPU::ELSE:
|
|
+ case AMDGPU::ENDIF:
|
|
+ case AMDGPU::ENDLOOP:
|
|
+ case AMDGPU::WHILELOOP:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool R600MCCodeEmitter::isTexOp(unsigned opcode) const {
|
|
+ switch(opcode) {
|
|
+ default: return false;
|
|
+ case AMDGPU::TEX_LD:
|
|
+ case AMDGPU::TEX_GET_TEXTURE_RESINFO:
|
|
+ case AMDGPU::TEX_SAMPLE:
|
|
+ case AMDGPU::TEX_SAMPLE_C:
|
|
+ case AMDGPU::TEX_SAMPLE_L:
|
|
+ case AMDGPU::TEX_SAMPLE_C_L:
|
|
+ case AMDGPU::TEX_SAMPLE_LB:
|
|
+ case AMDGPU::TEX_SAMPLE_C_LB:
|
|
+ case AMDGPU::TEX_SAMPLE_G:
|
|
+ case AMDGPU::TEX_SAMPLE_C_G:
|
|
+ case AMDGPU::TEX_GET_GRADIENTS_H:
|
|
+ case AMDGPU::TEX_GET_GRADIENTS_V:
|
|
+ case AMDGPU::TEX_SET_GRADIENTS_H:
|
|
+ case AMDGPU::TEX_SET_GRADIENTS_V:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand,
|
|
+ unsigned Flag) const {
|
|
+ const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
|
|
+ unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags);
|
|
+ if (FlagIndex == 0) {
|
|
+ return false;
|
|
+ }
|
|
+ assert(MI.getOperand(FlagIndex).isImm());
|
|
+ return !!((MI.getOperand(FlagIndex).getImm() >>
|
|
+ (NUM_MO_FLAGS * Operand)) & Flag);
|
|
+}
|
|
+
|
|
+#include "AMDGPUGenMCCodeEmitter.inc"
|
|
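The ALU path in R600MCCodeEmitter above does not emit raw ISA bytes; it writes a small record per instruction: a one-byte INSTR_ALU tag, a one-byte source count, then for each source a one-byte type flag (1 for ALU_CONST, 0 otherwise), a four-byte source select and a four-byte inline literal, and finally the 64-bit instruction word from getBinaryCodeForInstr. The sketch below shows how a consumer might walk one such record; it is only an illustration of that layout, the struct and function names are hypothetical, and it assumes a little-endian host like the emitter's Emit helpers.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Hypothetical view of one ALU record as written by EmitALUInstr/EmitSrcISA.
    struct ALUSource {
      uint8_t  IsConst;  // 1 when the operand register was ALU_CONST
      uint32_t Select;   // const select passed from the driver, 0 for GPRs
      uint32_t Literal;  // inline literal when the source was ALU_LITERAL_X
    };

    struct ALURecord {
      std::vector<ALUSource> Sources;
      uint64_t InstWord01;  // 64-bit encoding from getBinaryCodeForInstr()
    };

    // Pos must point at the byte after the INSTR_ALU tag (the source count).
    static ALURecord readALURecord(const uint8_t *Buf, std::size_t &Pos) {
      ALURecord R;
      uint8_t SrcNum = Buf[Pos++];
      for (unsigned i = 0; i < SrcNum; ++i) {
        ALUSource S;
        S.IsConst = Buf[Pos++];
        std::memcpy(&S.Select, Buf + Pos, 4);  Pos += 4;
        std::memcpy(&S.Literal, Buf + Pos, 4); Pos += 4;
        R.Sources.push_back(S);
      }
      std::memcpy(&R.InstWord01, Buf + Pos, 8);
      Pos += 8;
      return R;
    }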
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
|
|
new file mode 100644
|
|
index 0000000..6dfbbe8
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
|
|
@@ -0,0 +1,235 @@
|
|
+//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief The SI code emitter produces machine code that can be executed
|
|
+/// directly on the GPU device.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
|
|
+#include "llvm/MC/MCCodeEmitter.h"
|
|
+#include "llvm/MC/MCContext.h"
|
|
+#include "llvm/MC/MCInst.h"
|
|
+#include "llvm/MC/MCInstrInfo.h"
|
|
+#include "llvm/MC/MCRegisterInfo.h"
|
|
+#include "llvm/MC/MCSubtargetInfo.h"
|
|
+#include "llvm/MC/MCFixup.h"
|
|
+#include "llvm/Support/raw_ostream.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+/// \brief Helper type used in encoding
|
|
+typedef union {
|
|
+ int32_t I;
|
|
+ float F;
|
|
+} IntFloatUnion;
|
|
+
|
|
+class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
|
|
+ SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
|
|
+ void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
|
|
+ const MCInstrInfo &MCII;
|
|
+ const MCRegisterInfo &MRI;
|
|
+ const MCSubtargetInfo &STI;
|
|
+ MCContext &Ctx;
|
|
+
|
|
+ /// \brief Encode a sequence of registers with the correct alignment.
|
|
+ unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const;
|
|
+
|
|
+ /// \brief Can this operand also contain immediate values?
|
|
+ bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
|
|
+
|
|
+ /// \brief Encode an fp or int literal
|
|
+ uint32_t getLitEncoding(const MCOperand &MO) const;
|
|
+
|
|
+public:
|
|
+ SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
|
|
+ const MCSubtargetInfo &sti, MCContext &ctx)
|
|
+ : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
|
|
+
|
|
+ ~SIMCCodeEmitter() { }
|
|
+
|
|
+  /// \brief Encode the instruction and write it to the OS.
|
|
+ virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const;
|
|
+
|
|
+ /// \returns the encoding for an MCOperand.
|
|
+ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const;
|
|
+
|
|
+ /// \brief Encoding for when 2 consecutive registers are used
|
|
+ virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
|
|
+ SmallVectorImpl<MCFixup> &Fixup) const;
|
|
+
|
|
+  /// \brief Encoding for when 4 consecutive registers are used
|
|
+ virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
|
|
+ SmallVectorImpl<MCFixup> &Fixup) const;
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
|
|
+ const MCRegisterInfo &MRI,
|
|
+ const MCSubtargetInfo &STI,
|
|
+ MCContext &Ctx) {
|
|
+ return new SIMCCodeEmitter(MCII, MRI, STI, Ctx);
|
|
+}
|
|
+
|
|
+bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
|
|
+ unsigned OpNo) const {
|
|
+
|
|
+ unsigned RegClass = Desc.OpInfo[OpNo].RegClass;
|
|
+ return (AMDGPU::SSrc_32RegClassID == RegClass) ||
|
|
+ (AMDGPU::SSrc_64RegClassID == RegClass) ||
|
|
+ (AMDGPU::VSrc_32RegClassID == RegClass) ||
|
|
+ (AMDGPU::VSrc_64RegClassID == RegClass);
|
|
+}
|
|
+
|
|
+uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
|
|
+
|
|
+ IntFloatUnion Imm;
|
|
+ if (MO.isImm())
|
|
+ Imm.I = MO.getImm();
|
|
+ else if (MO.isFPImm())
|
|
+ Imm.F = MO.getFPImm();
|
|
+ else
|
|
+ return ~0;
|
|
+
|
|
+ if (Imm.I >= 0 && Imm.I <= 64)
|
|
+ return 128 + Imm.I;
|
|
+
|
|
+ if (Imm.I >= -16 && Imm.I <= -1)
|
|
+ return 192 + abs(Imm.I);
|
|
+
|
|
+ if (Imm.F == 0.5f)
|
|
+ return 240;
|
|
+
|
|
+ if (Imm.F == -0.5f)
|
|
+ return 241;
|
|
+
|
|
+ if (Imm.F == 1.0f)
|
|
+ return 242;
|
|
+
|
|
+ if (Imm.F == -1.0f)
|
|
+ return 243;
|
|
+
|
|
+ if (Imm.F == 2.0f)
|
|
+ return 244;
|
|
+
|
|
+ if (Imm.F == -2.0f)
|
|
+ return 245;
|
|
+
|
|
+ if (Imm.F == 4.0f)
|
|
+ return 246;
|
|
+
|
|
+  if (Imm.F == -4.0f)
|
|
+ return 247;
|
|
+
|
|
+ return 255;
|
|
+}
|
|
+
|
|
+void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const {
|
|
+
|
|
+ uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups);
|
|
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
|
|
+ unsigned bytes = Desc.getSize();
|
|
+
|
|
+ for (unsigned i = 0; i < bytes; i++) {
|
|
+ OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
|
|
+ }
|
|
+
|
|
+ if (bytes > 4)
|
|
+ return;
|
|
+
|
|
+ // Check for additional literals in SRC0/1/2 (Op 1/2/3)
|
|
+ for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
|
|
+
|
|
+ // Check if this operand should be encoded as [SV]Src
|
|
+ if (!isSrcOperand(Desc, i))
|
|
+ continue;
|
|
+
|
|
+ // Is this operand a literal immediate?
|
|
+ const MCOperand &Op = MI.getOperand(i);
|
|
+ if (getLitEncoding(Op) != 255)
|
|
+ continue;
|
|
+
|
|
+ // Yes! Encode it
|
|
+ IntFloatUnion Imm;
|
|
+ if (Op.isImm())
|
|
+ Imm.I = Op.getImm();
|
|
+ else
|
|
+ Imm.F = Op.getFPImm();
|
|
+
|
|
+ for (unsigned j = 0; j < 4; j++) {
|
|
+ OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff));
|
|
+ }
|
|
+
|
|
+ // Only one literal value allowed
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
|
|
+ const MCOperand &MO,
|
|
+ SmallVectorImpl<MCFixup> &Fixups) const {
|
|
+ if (MO.isReg())
|
|
+ return MRI.getEncodingValue(MO.getReg());
|
|
+
|
|
+ if (MO.isExpr()) {
|
|
+ const MCExpr *Expr = MO.getExpr();
|
|
+ MCFixupKind Kind = MCFixupKind(FK_PCRel_4);
|
|
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ // Figure out the operand number, needed for isSrcOperand check
|
|
+ unsigned OpNo = 0;
|
|
+ for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) {
|
|
+ if (&MO == &MI.getOperand(OpNo))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
|
|
+ if (isSrcOperand(Desc, OpNo)) {
|
|
+ uint32_t Enc = getLitEncoding(MO);
|
|
+ if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
|
|
+ return Enc;
|
|
+
|
|
+ } else if (MO.isImm())
|
|
+ return MO.getImm();
|
|
+
|
|
+ llvm_unreachable("Encoding of this operand type is not supported yet.");
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Custom Operand Encodings
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo,
|
|
+ unsigned shift) const {
|
|
+ unsigned regCode = MRI.getEncodingValue(MI.getOperand(OpNo).getReg());
|
|
+ return (regCode & 0xff) >> shift;
|
|
+}
|
|
+
|
|
+unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI,
|
|
+ unsigned OpNo ,
|
|
+ SmallVectorImpl<MCFixup> &Fixup) const {
|
|
+ return GPRAlign(MI, OpNo, 1);
|
|
+}
|
|
+
|
|
+unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI,
|
|
+ unsigned OpNo,
|
|
+ SmallVectorImpl<MCFixup> &Fixup) const {
|
|
+ return GPRAlign(MI, OpNo, 2);
|
|
+}
|
|
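getLitEncoding above maps SI operands onto the inline-constant space: integers 0..64 encode as 128 + value, integers -16..-1 as 192 + |value|, the float constants ±0.5, ±1.0, ±2.0 and ±4.0 take codes 240..247, and everything else returns 255, which makes EncodeInstruction append a 32-bit literal after the instruction word. Below is a standalone restatement of that table, split by operand kind for readability; it is a sketch for illustration only (the emitter itself works on an int/float union and checks the integer ranges first).

    #include <cstdint>
    #include <cstdlib>

    // 255 means "no inline encoding; a trailing 32-bit literal is emitted".
    static uint32_t encodeInlineInt(int32_t I) {
      if (I >= 0 && I <= 64)
        return 128 + I;
      if (I >= -16 && I <= -1)
        return 192 + std::abs(I);
      return 255;
    }

    static uint32_t encodeInlineFloat(float F) {
      if (F == 0.5f)  return 240;
      if (F == -0.5f) return 241;
      if (F == 1.0f)  return 242;
      if (F == -1.0f) return 243;
      if (F == 2.0f)  return 244;
      if (F == -2.0f) return 245;
      if (F == 4.0f)  return 246;
      if (F == -4.0f) return 247;
      return 255;
    }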
diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile
|
|
new file mode 100644
|
|
index 0000000..1b3ebbe
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/Makefile
|
|
@@ -0,0 +1,23 @@
|
|
+##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===##
|
|
+#
|
|
+# The LLVM Compiler Infrastructure
|
|
+#
|
|
+# This file is distributed under the University of Illinois Open Source
|
|
+# License. See LICENSE.TXT for details.
|
|
+#
|
|
+##===----------------------------------------------------------------------===##
|
|
+
|
|
+LEVEL = ../../..
|
|
+LIBRARYNAME = LLVMR600CodeGen
|
|
+TARGET = AMDGPU
|
|
+
|
|
+# Make sure that tblgen is run, first thing.
|
|
+BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \
|
|
+ AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \
|
|
+ AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \
|
|
+ AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \
|
|
+ AMDGPUGenAsmWriter.inc
|
|
+
|
|
+DIRS = InstPrinter TargetInfo MCTargetDesc
|
|
+
|
|
+include $(LEVEL)/Makefile.common
|
|
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
|
|
new file mode 100644
|
|
index 0000000..868810c
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/Processors.td
|
|
@@ -0,0 +1,30 @@
|
|
+//===-- Processors.td - R600 and SI processor definitions -------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// AMDIL processors supported.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
|
|
+: Processor<Name, itin, Features>;
|
|
+def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>;
|
|
+def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>;
|
|
+def : Proc<"rv710", R600_EG_Itin, []>;
|
|
+def : Proc<"rv730", R600_EG_Itin, []>;
|
|
+def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>;
|
|
+def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
|
|
+def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
|
|
+def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
|
|
+def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
|
|
+def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
|
|
+def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
|
|
+def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
|
|
+def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
|
|
+def : Proc<"SI", SI_Itin, [Feature64BitPtr]>;
|
|
+
|
|
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
|
|
new file mode 100644
|
|
index 0000000..16cfcf5
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600Defines.h
|
|
@@ -0,0 +1,97 @@
|
|
+//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef R600DEFINES_H_
|
|
+#define R600DEFINES_H_
|
|
+
|
|
+#include "llvm/MC/MCRegisterInfo.h"
|
|
+
|
|
+// Operand Flags
|
|
+#define MO_FLAG_CLAMP (1 << 0)
|
|
+#define MO_FLAG_NEG (1 << 1)
|
|
+#define MO_FLAG_ABS (1 << 2)
|
|
+#define MO_FLAG_MASK (1 << 3)
|
|
+#define MO_FLAG_PUSH (1 << 4)
|
|
+#define MO_FLAG_NOT_LAST (1 << 5)
|
|
+#define MO_FLAG_LAST (1 << 6)
|
|
+#define NUM_MO_FLAGS 7
|
|
+
|
|
+/// \brief Helper for getting the operand index for the instruction flags
|
|
+/// operand.
|
|
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
|
|
+
|
|
+namespace R600_InstFlag {
|
|
+ enum TIF {
|
|
+ TRANS_ONLY = (1 << 0),
|
|
+ TEX = (1 << 1),
|
|
+ REDUCTION = (1 << 2),
|
|
+ FC = (1 << 3),
|
|
+ TRIG = (1 << 4),
|
|
+ OP3 = (1 << 5),
|
|
+ VECTOR = (1 << 6),
|
|
+ //FlagOperand bits 7, 8
|
|
+ NATIVE_OPERANDS = (1 << 9),
|
|
+ OP1 = (1 << 10),
|
|
+ OP2 = (1 << 11)
|
|
+ };
|
|
+}
|
|
+
|
|
+#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
|
|
+
|
|
+/// \brief Defines for extracting register information from register encoding
|
|
+#define HW_REG_MASK 0x1ff
|
|
+#define HW_CHAN_SHIFT 9
|
|
+
|
|
+#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT)
|
|
+#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)
|
|
+
|
|
+namespace R600Operands {
|
|
+ enum Ops {
|
|
+ DST,
|
|
+ UPDATE_EXEC_MASK,
|
|
+ UPDATE_PREDICATE,
|
|
+ WRITE,
|
|
+ OMOD,
|
|
+ DST_REL,
|
|
+ CLAMP,
|
|
+ SRC0,
|
|
+ SRC0_NEG,
|
|
+ SRC0_REL,
|
|
+ SRC0_ABS,
|
|
+ SRC0_SEL,
|
|
+ SRC1,
|
|
+ SRC1_NEG,
|
|
+ SRC1_REL,
|
|
+ SRC1_ABS,
|
|
+ SRC1_SEL,
|
|
+ SRC2,
|
|
+ SRC2_NEG,
|
|
+ SRC2_REL,
|
|
+ SRC2_SEL,
|
|
+ LAST,
|
|
+ PRED_SEL,
|
|
+ IMM,
|
|
+ COUNT
|
|
+ };
|
|
+
|
|
+ const static int ALUOpTable[3][R600Operands::COUNT] = {
|
|
+// W C S S S S S S S S S S S
|
|
+// R O D L S R R R R S R R R R S R R R L P
|
|
+// D U I M R A R C C C C R C C C C R C C C A R I
|
|
+// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M
|
|
+// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M
|
|
+ {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12},
|
|
+ {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19},
|
|
+ {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17}
|
|
+ };
|
|
+
|
|
+}
|
|
+
|
|
+#endif // R600DEFINES_H_
|
|
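R600Defines.h reserves a 7-bit field (NUM_MO_FLAGS) per operand inside a single flags immediate, and GET_FLAG_OPERAND_IDX reads bits 7-8 of an instruction's TSFlags to find which operand carries that immediate. The helpers below sketch how such a packed value is set and tested for a given operand index, mirroring R600MCCodeEmitter::isFlagSet earlier in this patch; they are illustrative only and assume the same bit layout as the macros above.

    #include <cstdint>

    // Same layout as R600Defines.h: 7 flag bits per operand, packed into the
    // instruction's single flag immediate.
    static const unsigned NumMOFlags = 7;
    static const uint64_t FlagNeg = 1 << 1;  // MO_FLAG_NEG
    static const uint64_t FlagAbs = 1 << 2;  // MO_FLAG_ABS

    static uint64_t setOperandFlag(uint64_t Packed, unsigned OpIdx, uint64_t Flag) {
      return Packed | (Flag << (NumMOFlags * OpIdx));
    }

    static bool hasOperandFlag(uint64_t Packed, unsigned OpIdx, uint64_t Flag) {
      // Shift the whole word down to the operand's field and test the bit.
      return (Packed >> (NumMOFlags * OpIdx)) & Flag;
    }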
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
|
|
new file mode 100644
|
|
index 0000000..c00c349
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
|
|
@@ -0,0 +1,290 @@
|
|
+//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// Vector, Reduction, and Cube instructions need to fill the entire instruction
|
|
+/// group to work correctly. This pass expands these individual instructions
|
|
+/// into several instructions that will completely fill the instruction group.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "R600Defines.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "R600RegisterInfo.h"
|
|
+#include "R600MachineFunctionInfo.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
|
|
+
|
|
+private:
|
|
+ static char ID;
|
|
+ const R600InstrInfo *TII;
|
|
+
|
|
+ bool ExpandInputPerspective(MachineInstr& MI);
|
|
+ bool ExpandInputConstant(MachineInstr& MI);
|
|
+
|
|
+public:
|
|
+ R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
|
|
+ TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
|
|
+
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ const char *getPassName() const {
|
|
+ return "R600 Expand special instructions pass";
|
|
+ }
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+char R600ExpandSpecialInstrsPass::ID = 0;
|
|
+
|
|
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
|
|
+ return new R600ExpandSpecialInstrsPass(TM);
|
|
+}
|
|
+
|
|
+bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
|
|
+
|
|
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
|
|
+
|
|
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
|
|
+ BB != BB_E; ++BB) {
|
|
+ MachineBasicBlock &MBB = *BB;
|
|
+ MachineBasicBlock::iterator I = MBB.begin();
|
|
+ while (I != MBB.end()) {
|
|
+ MachineInstr &MI = *I;
|
|
+ I = llvm::next(I);
|
|
+
|
|
+ switch (MI.getOpcode()) {
|
|
+ default: break;
|
|
+ // Expand PRED_X to one of the PRED_SET instructions.
|
|
+ case AMDGPU::PRED_X: {
|
|
+ uint64_t Flags = MI.getOperand(3).getImm();
|
|
+ // The native opcode used by PRED_X is stored as an immediate in the
|
|
+ // third operand.
|
|
+ MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
|
|
+ MI.getOperand(2).getImm(), // opcode
|
|
+ MI.getOperand(0).getReg(), // dst
|
|
+ MI.getOperand(1).getReg(), // src0
|
|
+ AMDGPU::ZERO); // src1
|
|
+ TII->addFlag(PredSet, 0, MO_FLAG_MASK);
|
|
+ if (Flags & MO_FLAG_PUSH) {
|
|
+ TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
|
|
+ } else {
|
|
+ TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1);
|
|
+ }
|
|
+ MI.eraseFromParent();
|
|
+ continue;
|
|
+ }
|
|
+ case AMDGPU::BREAK: {
|
|
+ MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
|
|
+ AMDGPU::PRED_SETE_INT,
|
|
+ AMDGPU::PREDICATE_BIT,
|
|
+ AMDGPU::ZERO,
|
|
+ AMDGPU::ZERO);
|
|
+ TII->addFlag(PredSet, 0, MO_FLAG_MASK);
|
|
+ TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
|
|
+
|
|
+ BuildMI(MBB, I, MBB.findDebugLoc(I),
|
|
+ TII->get(AMDGPU::PREDICATED_BREAK))
|
|
+ .addReg(AMDGPU::PREDICATE_BIT);
|
|
+ MI.eraseFromParent();
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::INTERP_PAIR_XY: {
|
|
+ MachineInstr *BMI;
|
|
+ unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
|
|
+ MI.getOperand(2).getImm());
|
|
+
|
|
+ for (unsigned Chan = 0; Chan < 4; ++Chan) {
|
|
+ unsigned DstReg;
|
|
+
|
|
+ if (Chan < 2)
|
|
+ DstReg = MI.getOperand(Chan).getReg();
|
|
+ else
|
|
+ DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;
|
|
+
|
|
+ BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
|
|
+ DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
|
|
+
|
|
+ BMI->setIsInsideBundle(Chan > 0);
|
|
+ if (Chan >= 2)
|
|
+ TII->addFlag(BMI, 0, MO_FLAG_MASK);
|
|
+ if (Chan != 3)
|
|
+ TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
|
|
+ }
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::INTERP_PAIR_ZW: {
|
|
+ MachineInstr *BMI;
|
|
+ unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
|
|
+ MI.getOperand(2).getImm());
|
|
+
|
|
+ for (unsigned Chan = 0; Chan < 4; ++Chan) {
|
|
+ unsigned DstReg;
|
|
+
|
|
+ if (Chan < 2)
|
|
+ DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
|
|
+ else
|
|
+ DstReg = MI.getOperand(Chan-2).getReg();
|
|
+
|
|
+ BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
|
|
+ DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
|
|
+
|
|
+ BMI->setIsInsideBundle(Chan > 0);
|
|
+ if (Chan < 2)
|
|
+ TII->addFlag(BMI, 0, MO_FLAG_MASK);
|
|
+ if (Chan != 3)
|
|
+ TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
|
|
+ }
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::INTERP_VEC_LOAD: {
|
|
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
|
|
+ MachineInstr *BMI;
|
|
+ unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
|
|
+ MI.getOperand(1).getImm());
|
|
+ unsigned DstReg = MI.getOperand(0).getReg();
|
|
+
|
|
+ for (unsigned Chan = 0; Chan < 4; ++Chan) {
|
|
+ BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
|
|
+ TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
|
|
+ BMI->setIsInsideBundle(Chan > 0);
|
|
+ if (Chan != 3)
|
|
+ TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
|
|
+ }
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bool IsReduction = TII->isReductionOp(MI.getOpcode());
|
|
+ bool IsVector = TII->isVector(MI);
|
|
+ bool IsCube = TII->isCubeOp(MI.getOpcode());
|
|
+ if (!IsReduction && !IsVector && !IsCube) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ // Expand the instruction
|
|
+ //
|
|
+ // Reduction instructions:
|
|
+ // T0_X = DP4 T1_XYZW, T2_XYZW
|
|
+ // becomes:
|
|
+ // TO_X = DP4 T1_X, T2_X
|
|
+ // TO_Y (write masked) = DP4 T1_Y, T2_Y
|
|
+ // TO_Z (write masked) = DP4 T1_Z, T2_Z
|
|
+ // TO_W (write masked) = DP4 T1_W, T2_W
|
|
+ //
|
|
+ // Vector instructions:
|
|
+ // T0_X = MULLO_INT T1_X, T2_X
|
|
+ // becomes:
|
|
+ // T0_X = MULLO_INT T1_X, T2_X
|
|
+ // T0_Y (write masked) = MULLO_INT T1_X, T2_X
|
|
+ // T0_Z (write masked) = MULLO_INT T1_X, T2_X
|
|
+ // T0_W (write masked) = MULLO_INT T1_X, T2_X
|
|
+ //
|
|
+ // Cube instructions:
|
|
+ // T0_XYZW = CUBE T1_XYZW
|
|
+ // becomes:
|
|
+      // T0_X = CUBE T1_Z, T1_Y
|
|
+ // T0_Y = CUBE T1_Z, T1_X
|
|
+ // T0_Z = CUBE T1_X, T1_Z
|
|
+ // T0_W = CUBE T1_Y, T1_Z
|
|
+ for (unsigned Chan = 0; Chan < 4; Chan++) {
|
|
+ unsigned DstReg = MI.getOperand(
|
|
+ TII->getOperandIdx(MI, R600Operands::DST)).getReg();
|
|
+ unsigned Src0 = MI.getOperand(
|
|
+ TII->getOperandIdx(MI, R600Operands::SRC0)).getReg();
|
|
+ unsigned Src1 = 0;
|
|
+
|
|
+ // Determine the correct source registers
|
|
+ if (!IsCube) {
|
|
+ int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1);
|
|
+ if (Src1Idx != -1) {
|
|
+ Src1 = MI.getOperand(Src1Idx).getReg();
|
|
+ }
|
|
+ }
|
|
+ if (IsReduction) {
|
|
+ unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
|
|
+ Src0 = TRI.getSubReg(Src0, SubRegIndex);
|
|
+ Src1 = TRI.getSubReg(Src1, SubRegIndex);
|
|
+ } else if (IsCube) {
|
|
+ static const int CubeSrcSwz[] = {2, 2, 0, 1};
|
|
+ unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
|
|
+ unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
|
|
+ Src1 = TRI.getSubReg(Src0, SubRegIndex1);
|
|
+ Src0 = TRI.getSubReg(Src0, SubRegIndex0);
|
|
+ }
|
|
+
|
|
+      // Determine the correct destination registers.
|
|
+ bool Mask = false;
|
|
+ bool NotLast = true;
|
|
+ if (IsCube) {
|
|
+ unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
|
|
+ DstReg = TRI.getSubReg(DstReg, SubRegIndex);
|
|
+ } else {
|
|
+ // Mask the write if the original instruction does not write to
|
|
+ // the current Channel.
|
|
+ Mask = (Chan != TRI.getHWRegChan(DstReg));
|
|
+ unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
|
|
+ DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
|
|
+ }
|
|
+
|
|
+ // Set the IsLast bit
|
|
+ NotLast = (Chan != 3 );
|
|
+
|
|
+ // Add the new instruction
|
|
+ unsigned Opcode = MI.getOpcode();
|
|
+ switch (Opcode) {
|
|
+ case AMDGPU::CUBE_r600_pseudo:
|
|
+ Opcode = AMDGPU::CUBE_r600_real;
|
|
+ break;
|
|
+ case AMDGPU::CUBE_eg_pseudo:
|
|
+ Opcode = AMDGPU::CUBE_eg_real;
|
|
+ break;
|
|
+ case AMDGPU::DOT4_r600_pseudo:
|
|
+ Opcode = AMDGPU::DOT4_r600_real;
|
|
+ break;
|
|
+ case AMDGPU::DOT4_eg_pseudo:
|
|
+ Opcode = AMDGPU::DOT4_eg_real;
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ MachineInstr *NewMI =
|
|
+ TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
|
|
+
|
|
+ NewMI->setIsInsideBundle(Chan != 0);
|
|
+ if (Mask) {
|
|
+ TII->addFlag(NewMI, 0, MO_FLAG_MASK);
|
|
+ }
|
|
+ if (NotLast) {
|
|
+ TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
|
|
+ }
|
|
+ }
|
|
+ MI.eraseFromParent();
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
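In the cube expansion above, the per-channel sources come from the CubeSrcSwz table: channel c reads Src0 from sub-register CubeSrcSwz[c] and Src1 from CubeSrcSwz[3 - c], which reproduces the (Z,Y), (Z,X), (X,Z), (Y,Z) pairings listed in the comment. The loop below is a small standalone check of that swizzle and is not part of the pass; the channel letters are only for printing.

    #include <cstdio>

    int main() {
      static const int CubeSrcSwz[] = {2, 2, 0, 1};   // same table as the pass
      static const char Chan[] = {'X', 'Y', 'Z', 'W'};
      for (int c = 0; c < 4; ++c) {
        // Prints: T0_X = CUBE T1_Z, T1_Y ... T0_W = CUBE T1_Y, T1_Z
        std::printf("T0_%c = CUBE T1_%c, T1_%c\n",
                    Chan[c], Chan[CubeSrcSwz[c]], Chan[CubeSrcSwz[3 - c]]);
      }
      return 0;
    }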
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
|
|
new file mode 100644
|
|
index 0000000..9c38522
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600ISelLowering.cpp
|
|
@@ -0,0 +1,1195 @@
|
|
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Custom DAG lowering for R600
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "R600ISelLowering.h"
|
|
+#include "R600Defines.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "R600MachineFunctionInfo.h"
|
|
+#include "llvm/Argument.h"
|
|
+#include "llvm/Function.h"
|
|
+#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/CodeGen/SelectionDAG.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
|
|
+ AMDGPUTargetLowering(TM),
|
|
+ TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
|
|
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
|
|
+ addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
|
|
+ addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
|
|
+ addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
|
|
+ addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
|
|
+ computeRegisterProperties();
|
|
+
|
|
+ setOperationAction(ISD::FADD, MVT::v4f32, Expand);
|
|
+ setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
|
|
+ setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
|
|
+ setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
|
|
+
|
|
+ setOperationAction(ISD::ADD, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::AND, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::UREM, MVT::v4i32, Expand);
|
|
+ setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
|
|
+
|
|
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
|
|
+
|
|
+ setOperationAction(ISD::FSUB, MVT::f32, Expand);
|
|
+
|
|
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
|
|
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
|
|
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
|
|
+ setOperationAction(ISD::FPOW, MVT::f32, Custom);
|
|
+
|
|
+ setOperationAction(ISD::ROTL, MVT::i32, Custom);
|
|
+
|
|
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
|
|
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
|
|
+
|
|
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::SETCC, MVT::f32, Custom);
|
|
+ setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
|
|
+
|
|
+ setOperationAction(ISD::SELECT, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
|
|
+
|
|
+ // Legalize loads and stores to the private address space.
|
|
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
|
|
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
|
|
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
|
|
+ setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
|
|
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
|
|
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
|
|
+ setOperationAction(ISD::STORE, MVT::i8, Custom);
|
|
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
|
|
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
|
|
+
|
|
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
|
|
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
|
|
+
|
|
+ setTargetDAGCombine(ISD::FP_ROUND);
|
|
+ setTargetDAGCombine(ISD::FP_TO_SINT);
|
|
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
|
|
+ setTargetDAGCombine(ISD::SELECT_CC);
|
|
+
|
|
+ setSchedulingPreference(Sched::VLIW);
|
|
+}
|
|
+
|
|
+MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
|
|
+ MachineInstr * MI, MachineBasicBlock * BB) const {
|
|
+ MachineFunction * MF = BB->getParent();
|
|
+ MachineRegisterInfo &MRI = MF->getRegInfo();
|
|
+ MachineBasicBlock::iterator I = *MI;
|
|
+
|
|
+ switch (MI->getOpcode()) {
|
|
+ default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
|
|
+ case AMDGPU::SHADER_TYPE: break;
|
|
+ case AMDGPU::CLAMP_R600: {
|
|
+ MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
|
|
+ AMDGPU::MOV,
|
|
+ MI->getOperand(0).getReg(),
|
|
+ MI->getOperand(1).getReg());
|
|
+ TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::FABS_R600: {
|
|
+ MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
|
|
+ AMDGPU::MOV,
|
|
+ MI->getOperand(0).getReg(),
|
|
+ MI->getOperand(1).getReg());
|
|
+ TII->addFlag(NewMI, 0, MO_FLAG_ABS);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::FNEG_R600: {
|
|
+ MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
|
|
+ AMDGPU::MOV,
|
|
+ MI->getOperand(0).getReg(),
|
|
+ MI->getOperand(1).getReg());
|
|
+ TII->addFlag(NewMI, 0, MO_FLAG_NEG);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::MASK_WRITE: {
|
|
+ unsigned maskedRegister = MI->getOperand(0).getReg();
|
|
+ assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
|
|
+ MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
|
|
+ TII->addFlag(defInstr, 0, MO_FLAG_MASK);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::MOV_IMM_F32:
|
|
+ TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
|
|
+ MI->getOperand(1).getFPImm()->getValueAPF()
|
|
+ .bitcastToAPInt().getZExtValue());
|
|
+ break;
|
|
+ case AMDGPU::MOV_IMM_I32:
|
|
+ TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
|
|
+ MI->getOperand(1).getImm());
|
|
+ break;
|
|
+
|
|
+
|
|
+ case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
|
|
+ case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
|
|
+ unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
|
|
+
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addImm(EOP); // Set End of program bit
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::TXD: {
|
|
+ unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
|
|
+ unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
|
|
+
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
|
|
+ .addOperand(MI->getOperand(3))
|
|
+ .addOperand(MI->getOperand(4))
|
|
+ .addOperand(MI->getOperand(5))
|
|
+ .addOperand(MI->getOperand(6));
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
|
|
+ .addOperand(MI->getOperand(2))
|
|
+ .addOperand(MI->getOperand(4))
|
|
+ .addOperand(MI->getOperand(5))
|
|
+ .addOperand(MI->getOperand(6));
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addOperand(MI->getOperand(4))
|
|
+ .addOperand(MI->getOperand(5))
|
|
+ .addOperand(MI->getOperand(6))
|
|
+ .addReg(T0, RegState::Implicit)
|
|
+ .addReg(T1, RegState::Implicit);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::TXD_SHADOW: {
|
|
+ unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
|
|
+ unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
|
|
+
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
|
|
+ .addOperand(MI->getOperand(3))
|
|
+ .addOperand(MI->getOperand(4))
|
|
+ .addOperand(MI->getOperand(5))
|
|
+ .addOperand(MI->getOperand(6));
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
|
|
+ .addOperand(MI->getOperand(2))
|
|
+ .addOperand(MI->getOperand(4))
|
|
+ .addOperand(MI->getOperand(5))
|
|
+ .addOperand(MI->getOperand(6));
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addOperand(MI->getOperand(4))
|
|
+ .addOperand(MI->getOperand(5))
|
|
+ .addOperand(MI->getOperand(6))
|
|
+ .addReg(T0, RegState::Implicit)
|
|
+ .addReg(T1, RegState::Implicit);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::BRANCH:
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addReg(0);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::BRANCH_COND_f32: {
|
|
+ MachineInstr *NewMI =
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
|
|
+ AMDGPU::PREDICATE_BIT)
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addImm(OPCODE_IS_NOT_ZERO)
|
|
+ .addImm(0); // Flags
|
|
+ TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::BRANCH_COND_i32: {
|
|
+ MachineInstr *NewMI =
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
|
|
+ AMDGPU::PREDICATE_BIT)
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addImm(OPCODE_IS_NOT_ZERO_INT)
|
|
+ .addImm(0); // Flags
|
|
+ TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ case AMDGPU::EG_ExportSwz:
|
|
+ case AMDGPU::R600_ExportSwz: {
|
|
+    // The instruction is left unmodified if it is not the last one of its type.
|
|
+ bool isLastInstructionOfItsType = true;
|
|
+ unsigned InstExportType = MI->getOperand(1).getImm();
|
|
+ for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
|
|
+ EndBlock = BB->end(); NextExportInst != EndBlock;
|
|
+ NextExportInst = llvm::next(NextExportInst)) {
|
|
+ if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
|
|
+ NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
|
|
+ unsigned CurrentInstExportType = NextExportInst->getOperand(1)
|
|
+ .getImm();
|
|
+ if (CurrentInstExportType == InstExportType) {
|
|
+ isLastInstructionOfItsType = false;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
|
|
+ if (!EOP && !isLastInstructionOfItsType)
|
|
+ return BB;
|
|
+ unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addOperand(MI->getOperand(2))
|
|
+ .addOperand(MI->getOperand(3))
|
|
+ .addOperand(MI->getOperand(4))
|
|
+ .addOperand(MI->getOperand(5))
|
|
+ .addOperand(MI->getOperand(6))
|
|
+ .addImm(CfInst)
|
|
+ .addImm(EOP);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ MI->eraseFromParent();
|
|
+ return BB;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Custom DAG Lowering Operations
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+using namespace llvm::Intrinsic;
|
|
+using namespace llvm::AMDGPUIntrinsic;
|
|
+
|
|
+static SDValue
|
|
+InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
|
|
+ unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
|
|
+ SDValue Scalar, SDValue Chain) {
|
|
+ if (!ExportMap[Slot]) {
|
|
+ SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
|
|
+ DL, MVT::v4f32,
|
|
+ DAG.getUNDEF(MVT::v4f32),
|
|
+ Scalar,
|
|
+ DAG.getConstant(Channel, MVT::i32));
|
|
+
|
|
+ unsigned Mask = 1 << Channel;
|
|
+
|
|
+ const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
|
|
+ DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
|
|
+ DAG.getConstant(Mask, MVT::i32)};
|
|
+
|
|
+ SDValue Res = DAG.getNode(
|
|
+ AMDGPUISD::EXPORT,
|
|
+ DL,
|
|
+ MVT::Other,
|
|
+ Ops, 6);
|
|
+ ExportMap[Slot] = Res.getNode();
|
|
+ return Res;
|
|
+ }
|
|
+
|
|
+ SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
|
|
+ SDValue PreviousVector = ExportInstruction->getOperand(1);
|
|
+ SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
|
|
+ DL, MVT::v4f32,
|
|
+ PreviousVector,
|
|
+ Scalar,
|
|
+ DAG.getConstant(Channel, MVT::i32));
|
|
+
|
|
+ unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
|
|
+ ->getZExtValue();
|
|
+ Mask |= (1 << Channel);
|
|
+
|
|
+ const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
|
|
+ DAG.getConstant(Inst, MVT::i32),
|
|
+ DAG.getConstant(Type, MVT::i32),
|
|
+ DAG.getConstant(Slot, MVT::i32),
|
|
+ DAG.getConstant(Mask, MVT::i32)};
|
|
+
|
|
+ DAG.UpdateNodeOperands(ExportInstruction,
|
|
+ Ops, 6);
|
|
+
|
|
+ return Chain;
|
|
+
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|
+ switch (Op.getOpcode()) {
|
|
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
|
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG);
|
|
+ case ISD::ROTL: return LowerROTL(Op, DAG);
|
|
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
|
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
|
|
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
|
|
+ case ISD::STORE: return LowerSTORE(Op, DAG);
|
|
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
|
|
+ case ISD::FPOW: return LowerFPOW(Op, DAG);
|
|
+ case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
|
|
+ case ISD::INTRINSIC_VOID: {
|
|
+ SDValue Chain = Op.getOperand(0);
|
|
+ unsigned IntrinsicID =
|
|
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
|
|
+ switch (IntrinsicID) {
|
|
+ case AMDGPUIntrinsic::AMDGPU_store_output: {
|
|
+ MachineFunction &MF = DAG.getMachineFunction();
|
|
+ MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
+ int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
|
|
+ unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
|
|
+ if (!MRI.isLiveOut(Reg)) {
|
|
+ MRI.addLiveOut(Reg);
|
|
+ }
|
|
+ return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
|
|
+ }
|
|
+ case AMDGPUIntrinsic::R600_store_pixel_color: {
|
|
+ MachineFunction &MF = DAG.getMachineFunction();
|
|
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
|
|
+ int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
|
|
+
|
|
+ SDNode **OutputsMap = MFI->Outputs;
|
|
+ return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
|
|
+ RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
|
|
+ Chain);
|
|
+
|
|
+ }
|
|
+
|
|
+ // default for switch(IntrinsicID)
|
|
+ default: break;
|
|
+ }
|
|
+ // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
|
|
+ break;
|
|
+ }
|
|
+ case ISD::INTRINSIC_WO_CHAIN: {
|
|
+ unsigned IntrinsicID =
|
|
+ cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
|
+ EVT VT = Op.getValueType();
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ switch(IntrinsicID) {
|
|
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
|
+ case AMDGPUIntrinsic::R600_load_input: {
|
|
+ int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
|
|
+ unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
|
|
+ }
|
|
+
|
|
+ case AMDGPUIntrinsic::R600_interp_input: {
|
|
+ int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
|
|
+ int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
|
|
+ MachineSDNode *interp;
|
|
+ if (ijb < 0) {
|
|
+ interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
|
|
+ MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
|
|
+ return DAG.getTargetExtractSubreg(
|
|
+ TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
|
|
+ DL, MVT::f32, SDValue(interp, 0));
|
|
+ }
|
|
+
|
|
+ if (slot % 4 < 2)
|
|
+ interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
|
|
+ MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
|
|
+ CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
|
|
+ CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
|
|
+ else
|
|
+ interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
|
|
+ MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
|
|
+ CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
|
|
+ CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
|
|
+
|
|
+ return SDValue(interp, slot % 2);
|
|
+ }
|
|
+
|
|
+ case r600_read_ngroups_x:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 0);
|
|
+ case r600_read_ngroups_y:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 1);
|
|
+ case r600_read_ngroups_z:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 2);
|
|
+ case r600_read_global_size_x:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 3);
|
|
+ case r600_read_global_size_y:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 4);
|
|
+ case r600_read_global_size_z:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 5);
|
|
+ case r600_read_local_size_x:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 6);
|
|
+ case r600_read_local_size_y:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 7);
|
|
+ case r600_read_local_size_z:
|
|
+ return LowerImplicitParameter(DAG, VT, DL, 8);
|
|
+
|
|
+ case r600_read_tgid_x:
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::T1_X, VT);
|
|
+ case r600_read_tgid_y:
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::T1_Y, VT);
|
|
+ case r600_read_tgid_z:
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::T1_Z, VT);
|
|
+ case r600_read_tidig_x:
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::T0_X, VT);
|
|
+ case r600_read_tidig_y:
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::T0_Y, VT);
|
|
+ case r600_read_tidig_z:
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
|
+ AMDGPU::T0_Z, VT);
|
|
+ }
|
|
+ // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
|
|
+ break;
|
|
+ }
|
|
+ } // end switch(Op.getOpcode())
|
|
+ return SDValue();
|
|
+}
|
|
+
|
|
+void R600TargetLowering::ReplaceNodeResults(SDNode *N,
|
|
+ SmallVectorImpl<SDValue> &Results,
|
|
+ SelectionDAG &DAG) const {
|
|
+ switch (N->getOpcode()) {
|
|
+ default: return;
|
|
+ case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
|
|
+ return;
|
|
+ case ISD::LOAD: {
|
|
+ SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
|
|
+ Results.push_back(SDValue(Node, 0));
|
|
+ Results.push_back(SDValue(Node, 1));
|
|
+ // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
|
|
+ // function
|
|
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
|
|
+ return;
|
|
+ }
|
|
+ case ISD::STORE:
|
|
+ SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
|
|
+ Results.push_back(SDValue(Node, 0));
|
|
+ return;
|
|
+ }
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
|
|
+ return DAG.getNode(
|
|
+ ISD::SETCC,
|
|
+ Op.getDebugLoc(),
|
|
+ MVT::i1,
|
|
+ Op, DAG.getConstantFP(0.0f, MVT::f32),
|
|
+ DAG.getCondCode(ISD::SETNE)
|
|
+ );
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
|
|
+ SDValue Chain = Op.getOperand(0);
|
|
+ SDValue CC = Op.getOperand(1);
|
|
+ SDValue LHS = Op.getOperand(2);
|
|
+ SDValue RHS = Op.getOperand(3);
|
|
+ SDValue JumpT = Op.getOperand(4);
|
|
+ SDValue CmpValue;
|
|
+ SDValue Result;
|
|
+
|
|
+ if (LHS.getValueType() == MVT::i32) {
|
|
+ CmpValue = DAG.getNode(
|
|
+ ISD::SELECT_CC,
|
|
+ Op.getDebugLoc(),
|
|
+ MVT::i32,
|
|
+ LHS, RHS,
|
|
+ DAG.getConstant(-1, MVT::i32),
|
|
+ DAG.getConstant(0, MVT::i32),
|
|
+ CC);
|
|
+ } else if (LHS.getValueType() == MVT::f32) {
|
|
+ CmpValue = DAG.getNode(
|
|
+ ISD::SELECT_CC,
|
|
+ Op.getDebugLoc(),
|
|
+ MVT::f32,
|
|
+ LHS, RHS,
|
|
+ DAG.getConstantFP(1.0f, MVT::f32),
|
|
+ DAG.getConstantFP(0.0f, MVT::f32),
|
|
+ CC);
|
|
+ } else {
|
|
+ assert(0 && "Not valid type for br_cc");
|
|
+ }
|
|
+ Result = DAG.getNode(
|
|
+ AMDGPUISD::BRANCH_COND,
|
|
+ CmpValue.getDebugLoc(),
|
|
+ MVT::Other, Chain,
|
|
+ JumpT, CmpValue);
|
|
+ return Result;
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
|
|
+ DebugLoc DL,
|
|
+ unsigned DwordOffset) const {
|
|
+ unsigned ByteOffset = DwordOffset * 4;
|
|
+ PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
|
|
+ AMDGPUAS::PARAM_I_ADDRESS);
|
|
+
|
|
+ // We shouldn't be using an offset wider than 16-bits for implicit parameters.
|
|
+ assert(isInt<16>(ByteOffset));
|
|
+
|
|
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(),
|
|
+ DAG.getConstant(ByteOffset, MVT::i32), // PTR
|
|
+ MachinePointerInfo(ConstantPointerNull::get(PtrType)),
|
|
+ false, false, false, 0);
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
|
|
+
|
|
+ MachineFunction &MF = DAG.getMachineFunction();
|
|
+ const AMDGPUFrameLowering *TFL =
|
|
+ static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
|
|
+
|
|
+ FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
|
|
+ assert(FIN);
|
|
+
|
|
+ unsigned FrameIndex = FIN->getIndex();
|
|
+ unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
|
|
+ return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
+
|
|
+ return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
|
|
+ Op.getOperand(0),
|
|
+ Op.getOperand(0),
|
|
+ DAG.getNode(ISD::SUB, DL, VT,
|
|
+ DAG.getConstant(32, MVT::i32),
|
|
+ Op.getOperand(1)));
|
|
+}
|
|
+
|
|
+bool R600TargetLowering::isZero(SDValue Op) const {
|
|
+ if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
|
|
+ return Cst->isNullValue();
|
|
+ } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
|
|
+ return CstFP->isZero();
|
|
+ } else {
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
+
|
|
+ SDValue LHS = Op.getOperand(0);
|
|
+ SDValue RHS = Op.getOperand(1);
|
|
+ SDValue True = Op.getOperand(2);
|
|
+ SDValue False = Op.getOperand(3);
|
|
+ SDValue CC = Op.getOperand(4);
|
|
+ SDValue Temp;
|
|
+
|
|
+ // LHS and RHS are guaranteed to be the same value type
|
|
+ EVT CompareVT = LHS.getValueType();
|
|
+
|
|
+ // Check if we can lower this to a native operation.
|
|
+
|
|
+ // Try to lower to a CND* instruction:
|
|
+  // CND* instructions require RHS to be zero. Some SELECT_CC nodes that
|
|
+ // can be lowered to CND* instructions can also be lowered to SET*
|
|
+ // instructions. CND* instructions are cheaper because they don't
|
|
+ // require additional instructions to convert their result to the correct
|
|
+ // value type, so this check should be first.
|
|
+ if (isZero(LHS) || isZero(RHS)) {
|
|
+ SDValue Cond = (isZero(LHS) ? RHS : LHS);
|
|
+ SDValue Zero = (isZero(LHS) ? LHS : RHS);
|
|
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
|
|
+ if (CompareVT != VT) {
|
|
+ // Bitcast True / False to the correct types. This will end up being
|
|
+ // a nop, but it allows us to define only a single pattern in the
|
|
+ // .TD files for each CND* instruction rather than having to have
|
|
+ // one pattern for integer True/False and one for fp True/False
|
|
+ True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
|
|
+ False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
|
|
+ }
|
|
+ if (isZero(LHS)) {
|
|
+ CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
|
|
+ }
|
|
+
|
|
+ switch (CCOpcode) {
|
|
+ case ISD::SETONE:
|
|
+ case ISD::SETUNE:
|
|
+ case ISD::SETNE:
|
|
+ case ISD::SETULE:
|
|
+ case ISD::SETULT:
|
|
+ case ISD::SETOLE:
|
|
+ case ISD::SETOLT:
|
|
+ case ISD::SETLE:
|
|
+ case ISD::SETLT:
|
|
+ CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
|
|
+ Temp = True;
|
|
+ True = False;
|
|
+ False = Temp;
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+ SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
|
|
+ Cond, Zero,
|
|
+ True, False,
|
|
+ DAG.getCondCode(CCOpcode));
|
|
+ return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
|
|
+ }
|
|
+
|
|
+ // Try to lower to a SET* instruction:
|
|
+ //
|
|
+ // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware,
|
|
+ // but for the other case where CompareVT != VT, all operands of
|
|
+ // SELECT_CC need to have the same value type, so we need to change True and
|
|
+ // False to be the same type as LHS and RHS, and then convert the result of
|
|
+ // the select_cc back to the correct type.
|
|
+
|
|
+ // Move hardware True/False values to the correct operand.
|
|
+ if (isHWTrueValue(False) && isHWFalseValue(True)) {
|
|
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
|
|
+ std::swap(False, True);
|
|
+ CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
|
|
+ }
|
|
+
|
|
+ if (isHWTrueValue(True) && isHWFalseValue(False)) {
|
|
+ if (CompareVT != VT && VT == MVT::f32 && CompareVT == MVT::i32) {
|
|
+ SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
|
|
+ LHS, RHS,
|
|
+ DAG.getConstant(-1, MVT::i32),
|
|
+ DAG.getConstant(0, MVT::i32),
|
|
+ CC);
|
|
+ // Convert integer values of true (-1) and false (0) to fp values of
|
|
+ // true (1.0f) and false (0.0f).
|
|
+ SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
|
|
+ DAG.getConstant(1, MVT::i32));
|
|
+ return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
|
|
+ } else {
|
|
+ // This SELECT_CC is already legal.
|
|
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // Possible Min/Max pattern
|
|
+ SDValue MinMax = LowerMinMax(Op, DAG);
|
|
+ if (MinMax.getNode()) {
|
|
+ return MinMax;
|
|
+ }
|
|
+
|
|
+ // If we make it this far, it means we have no native instructions to handle
|
|
+ // this SELECT_CC, so we must lower it.
|
|
+ SDValue HWTrue, HWFalse;
|
|
+
|
|
+ if (CompareVT == MVT::f32) {
|
|
+ HWTrue = DAG.getConstantFP(1.0f, CompareVT);
|
|
+ HWFalse = DAG.getConstantFP(0.0f, CompareVT);
|
|
+ } else if (CompareVT == MVT::i32) {
|
|
+ HWTrue = DAG.getConstant(-1, CompareVT);
|
|
+ HWFalse = DAG.getConstant(0, CompareVT);
|
|
+ }
|
|
+ else {
|
|
+ assert(!"Unhandled value type in LowerSELECT_CC");
|
|
+ }
|
|
+
|
|
+ // Lower this unsupported SELECT_CC into a combination of two supported
|
|
+ // SELECT_CC operations.
|
|
+ SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
|
|
+
|
|
+ return DAG.getNode(ISD::SELECT_CC, DL, VT,
|
|
+ Cond, HWFalse,
|
|
+ True, False,
|
|
+ DAG.getCondCode(ISD::SETNE));
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
|
|
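+ // select(cond, t, f) is lowered as select_cc(cond, 0, t, f, setne).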
+ return DAG.getNode(ISD::SELECT_CC,
|
|
+ Op.getDebugLoc(),
|
|
+ Op.getValueType(),
|
|
+ Op.getOperand(0),
|
|
+ DAG.getConstant(0, MVT::i32),
|
|
+ Op.getOperand(1),
|
|
+ Op.getOperand(2),
|
|
+ DAG.getCondCode(ISD::SETNE));
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
|
|
+ SDValue Cond;
|
|
+ SDValue LHS = Op.getOperand(0);
|
|
+ SDValue RHS = Op.getOperand(1);
|
|
+ SDValue CC = Op.getOperand(2);
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ assert(Op.getValueType() == MVT::i32);
|
|
+ if (LHS.getValueType() == MVT::i32) {
|
|
+ Cond = DAG.getNode(
|
|
+ ISD::SELECT_CC,
|
|
+ Op.getDebugLoc(),
|
|
+ MVT::i32,
|
|
+ LHS, RHS,
|
|
+ DAG.getConstant(-1, MVT::i32),
|
|
+ DAG.getConstant(0, MVT::i32),
|
|
+ CC);
|
|
+ } else if (LHS.getValueType() == MVT::f32) {
|
|
+ Cond = DAG.getNode(
|
|
+ ISD::SELECT_CC,
|
|
+ Op.getDebugLoc(),
|
|
+ MVT::f32,
|
|
+ LHS, RHS,
|
|
+ DAG.getConstantFP(1.0f, MVT::f32),
|
|
+ DAG.getConstantFP(0.0f, MVT::f32),
|
|
+ CC);
|
|
+ Cond = DAG.getNode(
|
|
+ ISD::FP_TO_SINT,
|
|
+ DL,
|
|
+ MVT::i32,
|
|
+ Cond);
|
|
+ } else {
|
|
+ assert(0 && "Not valid type for set_cc");
|
|
+ }
|
|
+ Cond = DAG.getNode(
|
|
+ ISD::AND,
|
|
+ DL,
|
|
+ MVT::i32,
|
|
+ DAG.getConstant(1, MVT::i32),
|
|
+ Cond);
|
|
+ return Cond;
|
|
+}
|
|
+
|
|
+/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
|
|
+/// convert these pointers to a register index. Each register holds
|
|
+/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
|
|
+/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
|
|
+/// for indirect addressing.
|
|
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
|
|
+ unsigned StackWidth,
|
|
+ SelectionDAG &DAG) const {
|
|
+ unsigned SRLPad;
|
|
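+ // Each stack row uses StackWidth 32-bit channels (StackWidth * 4 bytes),
+ // so the byte address is converted by shifting right by log2(StackWidth * 4).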
+ switch(StackWidth) {
|
|
+ case 1:
|
|
+ SRLPad = 2;
|
|
+ break;
|
|
+ case 2:
|
|
+ SRLPad = 3;
|
|
+ break;
|
|
+ case 4:
|
|
+ SRLPad = 4;
|
|
+ break;
|
|
+ default: llvm_unreachable("Invalid stack width");
|
|
+ }
|
|
+
|
|
+ return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
|
|
+ DAG.getConstant(SRLPad, MVT::i32));
|
|
+}
|
|
+
|
|
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
|
|
+ unsigned ElemIdx,
|
|
+ unsigned &Channel,
|
|
+ unsigned &PtrIncr) const {
|
|
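+ // Map the vector element index to a sub-register channel and a register
+ // index increment, based on how many channels each stack row uses.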
+ switch (StackWidth) {
|
|
+ default:
|
|
+ case 1:
|
|
+ Channel = 0;
|
|
+ if (ElemIdx > 0) {
|
|
+ PtrIncr = 1;
|
|
+ } else {
|
|
+ PtrIncr = 0;
|
|
+ }
|
|
+ break;
|
|
+ case 2:
|
|
+ Channel = ElemIdx % 2;
|
|
+ if (ElemIdx == 2) {
|
|
+ PtrIncr = 1;
|
|
+ } else {
|
|
+ PtrIncr = 0;
|
|
+ }
|
|
+ break;
|
|
+ case 4:
|
|
+ Channel = ElemIdx;
|
|
+ PtrIncr = 0;
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
|
|
+ SDValue Chain = Op.getOperand(0);
|
|
+ SDValue Value = Op.getOperand(1);
|
|
+ SDValue Ptr = Op.getOperand(2);
|
|
+
|
|
+ if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
|
|
+ Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
|
|
+ // Convert pointer from byte address to dword address.
|
|
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
|
|
+ DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
|
|
+ Ptr, DAG.getConstant(2, MVT::i32)));
|
|
+
|
|
+ if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
|
|
+ assert(!"Truncated and indexed stores not supported yet");
|
|
+ } else {
|
|
+ Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
|
|
+ }
|
|
+ return Chain;
|
|
+ }
|
|
+
|
|
+ EVT ValueVT = Value.getValueType();
|
|
+
|
|
+ if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
|
|
+ return SDValue();
|
|
+ }
|
|
+
|
|
+ // Lowering for indirect addressing
|
|
+
|
|
+ const MachineFunction &MF = DAG.getMachineFunction();
|
|
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
|
|
+ getTargetMachine().getFrameLowering());
|
|
+ unsigned StackWidth = TFL->getStackWidth(MF);
|
|
+
|
|
+ Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
|
|
+
|
|
+ if (ValueVT.isVector()) {
|
|
+ unsigned NumElemVT = ValueVT.getVectorNumElements();
|
|
+ EVT ElemVT = ValueVT.getVectorElementType();
|
|
+ SDValue Stores[4];
|
|
+
|
|
+ assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
|
|
+ "vector width in store");
|
|
+
|
|
+ for (unsigned i = 0; i < NumElemVT; ++i) {
|
|
+ unsigned Channel, PtrIncr;
|
|
+ getStackAddress(StackWidth, i, Channel, PtrIncr);
|
|
+ Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
|
|
+ DAG.getConstant(PtrIncr, MVT::i32));
|
|
+ SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
|
|
+ Value, DAG.getConstant(i, MVT::i32));
|
|
+
|
|
+ Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
|
|
+ Chain, Elem, Ptr,
|
|
+ DAG.getTargetConstant(Channel, MVT::i32));
|
|
+ }
|
|
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
|
|
+ } else {
|
|
+ if (ValueVT == MVT::i8) {
|
|
+ Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
|
|
+ }
|
|
+ Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
|
|
+ DAG.getTargetConstant(0, MVT::i32)); // Channel
|
|
+ }
|
|
+
|
|
+ return Chain;
|
|
+}
|
|
+
|
|
+// Returns 512 + (kc_bank << 12) for the given constant-buffer address space.
|
|
+static int
|
|
+ConstantAddressBlock(unsigned AddressSpace) {
|
|
+ switch (AddressSpace) {
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_0:
|
|
+ return 512;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_1:
|
|
+ return 512 + 4096;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_2:
|
|
+ return 512 + 4096 * 2;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_3:
|
|
+ return 512 + 4096 * 3;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_4:
|
|
+ return 512 + 4096 * 4;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_5:
|
|
+ return 512 + 4096 * 5;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_6:
|
|
+ return 512 + 4096 * 6;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_7:
|
|
+ return 512 + 4096 * 7;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_8:
|
|
+ return 512 + 4096 * 8;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_9:
|
|
+ return 512 + 4096 * 9;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_10:
|
|
+ return 512 + 4096 * 10;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_11:
|
|
+ return 512 + 4096 * 11;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_12:
|
|
+ return 512 + 4096 * 12;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_13:
|
|
+ return 512 + 4096 * 13;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_14:
|
|
+ return 512 + 4096 * 14;
|
|
+ case AMDGPUAS::CONSTANT_BUFFER_15:
|
|
+ return 512 + 4096 * 15;
|
|
+ default:
|
|
+ return -1;
|
|
+ }
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
|
|
+{
|
|
+ EVT VT = Op.getValueType();
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
|
|
+ SDValue Chain = Op.getOperand(0);
|
|
+ SDValue Ptr = Op.getOperand(1);
|
|
+ SDValue LoweredLoad;
|
|
+
|
|
+ int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
|
|
+ if (ConstantBlock > -1) {
|
|
+ SDValue Result;
|
|
+ if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
|
|
+ dyn_cast<Constant>(LoadNode->getSrcValue())) {
|
|
+ SDValue Slots[4];
|
|
+ for (unsigned i = 0; i < 4; i++) {
|
|
+ // We want the Const position encoded with the following formula:
|
|
+ // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
|
|
+ // const_index is Ptr computed by llvm using an alignment of 16.
|
|
+ // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
|
|
+ // then div by 4 at the ISel step
|
|
+ SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
|
|
+ DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
|
|
+ Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
|
|
+ }
|
|
+ Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
|
|
+ } else {
|
|
+ // A non-constant Ptr can't be folded; keep it as a v4f32 load
|
|
+ Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
|
|
+ DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
|
|
+ );
|
|
+ }
|
|
+
|
|
+ if (!VT.isVector()) {
|
|
+ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
|
|
+ DAG.getConstant(0, MVT::i32));
|
|
+ }
|
|
+
|
|
+ SDValue MergedValues[2] = {
|
|
+ Result,
|
|
+ Chain
|
|
+ };
|
|
+ return DAG.getMergeValues(MergedValues, 2, DL);
|
|
+ }
|
|
+
|
|
+ if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
|
|
+ return SDValue();
|
|
+ }
|
|
+
|
|
+ // Lowering for indirect addressing
|
|
+ const MachineFunction &MF = DAG.getMachineFunction();
|
|
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
|
|
+ getTargetMachine().getFrameLowering());
|
|
+ unsigned StackWidth = TFL->getStackWidth(MF);
|
|
+
|
|
+ Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
|
|
+
|
|
+ if (VT.isVector()) {
|
|
+ unsigned NumElemVT = VT.getVectorNumElements();
|
|
+ EVT ElemVT = VT.getVectorElementType();
|
|
+ SDValue Loads[4];
|
|
+
|
|
+ assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
|
|
+ "vector width in load");
|
|
+
|
|
+ for (unsigned i = 0; i < NumElemVT; ++i) {
|
|
+ unsigned Channel, PtrIncr;
|
|
+ getStackAddress(StackWidth, i, Channel, PtrIncr);
|
|
+ Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
|
|
+ DAG.getConstant(PtrIncr, MVT::i32));
|
|
+ Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
|
|
+ Chain, Ptr,
|
|
+ DAG.getTargetConstant(Channel, MVT::i32),
|
|
+ Op.getOperand(2));
|
|
+ }
|
|
+ for (unsigned i = NumElemVT; i < 4; ++i) {
|
|
+ Loads[i] = DAG.getUNDEF(ElemVT);
|
|
+ }
|
|
+ EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
|
|
+ LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
|
|
+ } else {
|
|
+ LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
|
|
+ Chain, Ptr,
|
|
+ DAG.getTargetConstant(0, MVT::i32), // Channel
|
|
+ Op.getOperand(2));
|
|
+ }
|
|
+
|
|
+ SDValue Ops[2];
|
|
+ Ops[0] = LoweredLoad;
|
|
+ Ops[1] = Chain;
|
|
+
|
|
+ return DAG.getMergeValues(Ops, 2, DL);
|
|
+}
|
|
+
|
|
+SDValue R600TargetLowering::LowerFPOW(SDValue Op,
|
|
+ SelectionDAG &DAG) const {
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+ EVT VT = Op.getValueType();
|
|
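+ // Expand pow(x, y) as exp2(y * log2(x)).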
+ SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
|
|
+ SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
|
|
+ return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
|
|
+}
|
|
+
|
|
+/// XXX Only kernel functions are supported, so we can assume for now that
|
|
+/// every function is a kernel function, but in the future we should use
|
|
+/// separate calling conventions for kernel and non-kernel functions.
|
|
+SDValue R600TargetLowering::LowerFormalArguments(
|
|
+ SDValue Chain,
|
|
+ CallingConv::ID CallConv,
|
|
+ bool isVarArg,
|
|
+ const SmallVectorImpl<ISD::InputArg> &Ins,
|
|
+ DebugLoc DL, SelectionDAG &DAG,
|
|
+ SmallVectorImpl<SDValue> &InVals) const {
|
|
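+ // The nine implicit kernel parameters (see R600ISelLowering.h) occupy the
+ // first nine dwords, so explicit arguments start at byte offset 36.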
+ unsigned ParamOffsetBytes = 36;
|
|
+ Function::const_arg_iterator FuncArg =
|
|
+ DAG.getMachineFunction().getFunction()->arg_begin();
|
|
+ for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
|
|
+ EVT VT = Ins[i].VT;
|
|
+ Type *ArgType = FuncArg->getType();
|
|
+ unsigned ArgSizeInBits = ArgType->isPointerTy() ?
|
|
+ 32 : ArgType->getPrimitiveSizeInBits();
|
|
+ unsigned ArgBytes = ArgSizeInBits >> 3;
|
|
+ EVT ArgVT;
|
|
+ if (ArgSizeInBits < VT.getSizeInBits()) {
|
|
+ assert(!ArgType->isFloatTy() &&
|
|
+ "Extending floating point arguments not supported yet");
|
|
+ ArgVT = MVT::getIntegerVT(ArgSizeInBits);
|
|
+ } else {
|
|
+ ArgVT = VT;
|
|
+ }
|
|
+ PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
|
|
+ AMDGPUAS::PARAM_I_ADDRESS);
|
|
+ SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
|
|
+ DAG.getConstant(ParamOffsetBytes, MVT::i32),
|
|
+ MachinePointerInfo(UndefValue::get(PtrTy)),
|
|
+ ArgVT, false, false, ArgBytes);
|
|
+ InVals.push_back(Arg);
|
|
+ ParamOffsetBytes += ArgBytes;
|
|
+ }
|
|
+ return Chain;
|
|
+}
|
|
+
|
|
+EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
|
|
+ if (!VT.isVector()) return MVT::i32;
|
|
+ return VT.changeVectorElementTypeToInteger();
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Custom DAG Optimizations
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
|
|
+ DAGCombinerInfo &DCI) const {
|
|
+ SelectionDAG &DAG = DCI.DAG;
|
|
+
|
|
+ switch (N->getOpcode()) {
|
|
+ // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
|
|
+ case ISD::FP_ROUND: {
|
|
+ SDValue Arg = N->getOperand(0);
|
|
+ if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
|
|
+ return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
|
|
+ Arg.getOperand(0));
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
|
|
+ // (i32 select_cc f32, f32, -1, 0 cc)
|
|
+ //
|
|
+ // Mesa's GLSL frontend generates the above pattern a lot and we can lower
|
|
+ // this to one of the SET*_DX10 instructions.
|
|
+ case ISD::FP_TO_SINT: {
|
|
+ SDValue FNeg = N->getOperand(0);
|
|
+ if (FNeg.getOpcode() != ISD::FNEG) {
|
|
+ return SDValue();
|
|
+ }
|
|
+ SDValue SelectCC = FNeg.getOperand(0);
|
|
+ if (SelectCC.getOpcode() != ISD::SELECT_CC ||
|
|
+ SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
|
|
+ SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
|
|
+ !isHWTrueValue(SelectCC.getOperand(2)) ||
|
|
+ !isHWFalseValue(SelectCC.getOperand(3))) {
|
|
+ return SDValue();
|
|
+ }
|
|
+
|
|
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
|
|
+ SelectCC.getOperand(0), // LHS
|
|
+ SelectCC.getOperand(1), // RHS
|
|
+ DAG.getConstant(-1, MVT::i32), // True
|
|
+ DAG.getConstant(0, MVT::i32), // False
|
|
+ SelectCC.getOperand(4)); // CC
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ // Extract_vec (Build_vector) generated by custom lowering
|
|
+ // also needs to be custom combined
|
|
+ case ISD::EXTRACT_VECTOR_ELT: {
|
|
+ SDValue Arg = N->getOperand(0);
|
|
+ if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
|
|
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
|
|
+ unsigned Element = Const->getZExtValue();
|
|
+ return Arg->getOperand(Element);
|
|
+ }
|
|
+ }
|
|
+ if (Arg.getOpcode() == ISD::BITCAST &&
|
|
+ Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
|
|
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
|
|
+ unsigned Element = Const->getZExtValue();
|
|
+ return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
|
|
+ Arg->getOperand(0).getOperand(Element));
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ case ISD::SELECT_CC: {
|
|
+ // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
|
|
+ // selectcc x, y, a, b, inv(cc)
|
|
+ SDValue LHS = N->getOperand(0);
|
|
+ if (LHS.getOpcode() != ISD::SELECT_CC) {
|
|
+ return SDValue();
|
|
+ }
|
|
+
|
|
+ SDValue RHS = N->getOperand(1);
|
|
+ SDValue True = N->getOperand(2);
|
|
+ SDValue False = N->getOperand(3);
|
|
+
|
|
+ if (LHS.getOperand(2).getNode() != True.getNode() ||
|
|
+ LHS.getOperand(3).getNode() != False.getNode() ||
|
|
+ RHS.getNode() != False.getNode() ||
|
|
+ cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) {
|
|
+ return SDValue();
|
|
+ }
|
|
+
|
|
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get();
|
|
+ CCOpcode = ISD::getSetCCInverse(
|
|
+ CCOpcode, LHS.getOperand(0).getValueType().isInteger());
|
|
+ return DAG.getSelectCC(N->getDebugLoc(),
|
|
+ LHS.getOperand(0),
|
|
+ LHS.getOperand(1),
|
|
+ LHS.getOperand(2),
|
|
+ LHS.getOperand(3),
|
|
+ CCOpcode);
|
|
+
|
|
+ }
|
|
+ }
|
|
+ return SDValue();
|
|
+}
|
|
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
|
|
new file mode 100644
|
|
index 0000000..afa3897
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600ISelLowering.h
|
|
@@ -0,0 +1,78 @@
|
|
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief R600 DAG Lowering interface definition
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef R600ISELLOWERING_H
|
|
+#define R600ISELLOWERING_H
|
|
+
|
|
+#include "AMDGPUISelLowering.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class R600InstrInfo;
|
|
+
|
|
+class R600TargetLowering : public AMDGPUTargetLowering {
|
|
+public:
|
|
+ R600TargetLowering(TargetMachine &TM);
|
|
+ virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
|
|
+ MachineBasicBlock * BB) const;
|
|
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
|
|
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
|
+ void ReplaceNodeResults(SDNode * N,
|
|
+ SmallVectorImpl<SDValue> &Results,
|
|
+ SelectionDAG &DAG) const;
|
|
+ virtual SDValue LowerFormalArguments(
|
|
+ SDValue Chain,
|
|
+ CallingConv::ID CallConv,
|
|
+ bool isVarArg,
|
|
+ const SmallVectorImpl<ISD::InputArg> &Ins,
|
|
+ DebugLoc DL, SelectionDAG &DAG,
|
|
+ SmallVectorImpl<SDValue> &InVals) const;
|
|
+ virtual EVT getSetCCResultType(EVT VT) const;
|
|
+private:
|
|
+ const R600InstrInfo * TII;
|
|
+
|
|
+ /// Each OpenCL kernel has nine implicit parameters that are stored in the
|
|
+ /// first nine dwords of a Vertex Buffer. These implicit parameters are
|
|
+ /// lowered to load instructions which retrieve the values from the Vertex
|
|
+ /// Buffer.
|
|
+ SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
|
|
+ DebugLoc DL, unsigned DwordOffset) const;
|
|
+
|
|
+ void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
|
|
+ MachineRegisterInfo & MRI, unsigned dword_offset) const;
|
|
+
|
|
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
|
|
+
|
|
+ /// \brief Lower ROTL opcode to BITALIGN
|
|
+ SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
|
|
+
|
|
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
|
|
+
|
|
+ SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
|
|
+ SelectionDAG &DAG) const;
|
|
+ void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
|
|
+ unsigned &Channel, unsigned &PtrIncr) const;
|
|
+ bool isZero(SDValue Op) const;
|
|
+};
|
|
+
|
|
+} // End namespace llvm;
|
|
+
|
|
+#endif // R600ISELLOWERING_H
|
|
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..31671ea
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600InstrInfo.cpp
|
|
@@ -0,0 +1,776 @@
|
|
+//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief R600 Implementation of TargetInstrInfo.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "R600InstrInfo.h"
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+#include "AMDGPUSubtarget.h"
|
|
+#include "R600Defines.h"
|
|
+#include "R600MachineFunctionInfo.h"
|
|
+#include "R600RegisterInfo.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/Instructions.h"
|
|
+
|
|
+#define GET_INSTRINFO_CTOR
|
|
+#include "AMDGPUGenDFAPacketizer.inc"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
|
|
+ : AMDGPUInstrInfo(tm),
|
|
+ RI(tm, *this)
|
|
+ { }
|
|
+
|
|
+const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
|
|
+ return RI;
|
|
+}
|
|
+
|
|
+bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
|
|
+ return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
|
|
+}
|
|
+
|
|
+bool R600InstrInfo::isVector(const MachineInstr &MI) const {
|
|
+ return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
|
|
+}
|
|
+
|
|
+void
|
|
+R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI, DebugLoc DL,
|
|
+ unsigned DestReg, unsigned SrcReg,
|
|
+ bool KillSrc) const {
|
|
+ if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
|
|
+ && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
|
|
+ for (unsigned I = 0; I < 4; I++) {
|
|
+ unsigned SubRegIndex = RI.getSubRegFromChannel(I);
|
|
+ buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
|
|
+ RI.getSubReg(DestReg, SubRegIndex),
|
|
+ RI.getSubReg(SrcReg, SubRegIndex))
|
|
+ .addReg(DestReg,
|
|
+ RegState::Define | RegState::Implicit);
|
|
+ }
|
|
+ } else {
|
|
+
|
|
+ // We can't copy vec4 registers
|
|
+ assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
|
|
+ && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
|
|
+
|
|
+ MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
|
|
+ DestReg, SrcReg);
|
|
+ NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
|
|
+ .setIsKill(KillSrc);
|
|
+ }
|
|
+}
|
|
+
|
|
+MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF,
|
|
+ unsigned DstReg, int64_t Imm) const {
|
|
+ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc());
|
|
+ MachineInstrBuilder(MI).addReg(DstReg, RegState::Define);
|
|
+ MachineInstrBuilder(MI).addReg(AMDGPU::ALU_LITERAL_X);
|
|
+ MachineInstrBuilder(MI).addImm(Imm);
|
|
+ MachineInstrBuilder(MI).addReg(0); // PREDICATE_BIT
|
|
+
|
|
+ return MI;
|
|
+}
|
|
+
|
|
+unsigned R600InstrInfo::getIEQOpcode() const {
|
|
+ return AMDGPU::SETE_INT;
|
|
+}
|
|
+
|
|
+bool R600InstrInfo::isMov(unsigned Opcode) const {
|
|
+
|
|
+
|
|
+ switch(Opcode) {
|
|
+ default: return false;
|
|
+ case AMDGPU::MOV:
|
|
+ case AMDGPU::MOV_IMM_F32:
|
|
+ case AMDGPU::MOV_IMM_I32:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+// Some instructions act as placeholders to emulate operations that the GPU
|
|
+// hardware does automatically. This function can be used to check if
|
|
+// an opcode falls into this category.
|
|
+bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
|
|
+ switch (Opcode) {
|
|
+ default: return false;
|
|
+ case AMDGPU::RETURN:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
|
|
+ switch(Opcode) {
|
|
+ default: return false;
|
|
+ case AMDGPU::DOT4_r600_pseudo:
|
|
+ case AMDGPU::DOT4_eg_pseudo:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
|
|
+ switch(Opcode) {
|
|
+ default: return false;
|
|
+ case AMDGPU::CUBE_r600_pseudo:
|
|
+ case AMDGPU::CUBE_r600_real:
|
|
+ case AMDGPU::CUBE_eg_pseudo:
|
|
+ case AMDGPU::CUBE_eg_real:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
|
|
+ unsigned TargetFlags = get(Opcode).TSFlags;
|
|
+
|
|
+ return ((TargetFlags & R600_InstFlag::OP1) |
|
|
+ (TargetFlags & R600_InstFlag::OP2) |
|
|
+ (TargetFlags & R600_InstFlag::OP3));
|
|
+}
|
|
+
|
|
+DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
|
|
+ const ScheduleDAG *DAG) const {
|
|
+ const InstrItineraryData *II = TM->getInstrItineraryData();
|
|
+ return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II);
|
|
+}
|
|
+
|
|
+static bool
|
|
+isPredicateSetter(unsigned Opcode) {
|
|
+ switch (Opcode) {
|
|
+ case AMDGPU::PRED_X:
|
|
+ return true;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+static MachineInstr *
|
|
+findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator I) {
|
|
+ while (I != MBB.begin()) {
|
|
+ --I;
|
|
+ MachineInstr *MI = I;
|
|
+ if (isPredicateSetter(MI->getOpcode()))
|
|
+ return MI;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+bool
|
|
+R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock *&TBB,
|
|
+ MachineBasicBlock *&FBB,
|
|
+ SmallVectorImpl<MachineOperand> &Cond,
|
|
+ bool AllowModify) const {
|
|
+ // Most of the following comes from the ARM implementation of AnalyzeBranch
|
|
+
|
|
+ // If the block has no terminators, it just falls into the block after it.
|
|
+ MachineBasicBlock::iterator I = MBB.end();
|
|
+ if (I == MBB.begin())
|
|
+ return false;
|
|
+ --I;
|
|
+ while (I->isDebugValue()) {
|
|
+ if (I == MBB.begin())
|
|
+ return false;
|
|
+ --I;
|
|
+ }
|
|
+ if (static_cast<MachineInstr *>(I)->getOpcode() != AMDGPU::JUMP) {
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ // Get the last instruction in the block.
|
|
+ MachineInstr *LastInst = I;
|
|
+
|
|
+ // If there is only one terminator instruction, process it.
|
|
+ unsigned LastOpc = LastInst->getOpcode();
|
|
+ if (I == MBB.begin() ||
|
|
+ static_cast<MachineInstr *>(--I)->getOpcode() != AMDGPU::JUMP) {
|
|
+ if (LastOpc == AMDGPU::JUMP) {
|
|
+ if(!isPredicated(LastInst)) {
|
|
+ TBB = LastInst->getOperand(0).getMBB();
|
|
+ return false;
|
|
+ } else {
|
|
+ MachineInstr *predSet = I;
|
|
+ while (!isPredicateSetter(predSet->getOpcode())) {
|
|
+ predSet = --I;
|
|
+ }
|
|
+ TBB = LastInst->getOperand(0).getMBB();
|
|
+ Cond.push_back(predSet->getOperand(1));
|
|
+ Cond.push_back(predSet->getOperand(2));
|
|
+ Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ return true; // Can't handle indirect branch.
|
|
+ }
|
|
+
|
|
+ // Get the instruction before it if it is a terminator.
|
|
+ MachineInstr *SecondLastInst = I;
|
|
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
|
|
+
|
|
+ // If the block ends with a conditional jump followed by an unconditional
+ // jump, handle it.
|
|
+ if (SecondLastOpc == AMDGPU::JUMP &&
|
|
+ isPredicated(SecondLastInst) &&
|
|
+ LastOpc == AMDGPU::JUMP &&
|
|
+ !isPredicated(LastInst)) {
|
|
+ MachineInstr *predSet = --I;
|
|
+ while (!isPredicateSetter(predSet->getOpcode())) {
|
|
+ predSet = --I;
|
|
+ }
|
|
+ TBB = SecondLastInst->getOperand(0).getMBB();
|
|
+ FBB = LastInst->getOperand(0).getMBB();
|
|
+ Cond.push_back(predSet->getOperand(1));
|
|
+ Cond.push_back(predSet->getOperand(2));
|
|
+ Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ // Otherwise, can't handle this.
|
|
+ return true;
|
|
+}
|
|
+
|
|
+int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
|
|
+ const MachineInstr *MI = op.getParent();
|
|
+
|
|
+ switch (MI->getDesc().OpInfo->RegClass) {
|
|
+ default: // FIXME: fallthrough??
|
|
+ case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32;
|
|
+ case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32;
|
|
+ };
|
|
+}
|
|
+
|
|
+unsigned
|
|
+R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock *TBB,
|
|
+ MachineBasicBlock *FBB,
|
|
+ const SmallVectorImpl<MachineOperand> &Cond,
|
|
+ DebugLoc DL) const {
|
|
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
|
|
+
|
|
+ if (FBB == 0) {
|
|
+ if (Cond.empty()) {
|
|
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0);
|
|
+ return 1;
|
|
+ } else {
|
|
+ MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
|
|
+ assert(PredSet && "No previous predicate !");
|
|
+ addFlag(PredSet, 0, MO_FLAG_PUSH);
|
|
+ PredSet->getOperand(2).setImm(Cond[1].getImm());
|
|
+
|
|
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP))
|
|
+ .addMBB(TBB)
|
|
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
|
|
+ return 1;
|
|
+ }
|
|
+ } else {
|
|
+ MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
|
|
+ assert(PredSet && "No previous predicate !");
|
|
+ addFlag(PredSet, 0, MO_FLAG_PUSH);
|
|
+ PredSet->getOperand(2).setImm(Cond[1].getImm());
|
|
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP))
|
|
+ .addMBB(TBB)
|
|
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
|
|
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB).addReg(0);
|
|
+ return 2;
|
|
+ }
|
|
+}
|
|
+
|
|
+unsigned
|
|
+R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
|
|
+
|
|
+ // Note : we leave PRED* instructions there.
|
|
+ // They may be needed when predicating instructions.
|
|
+
|
|
+ MachineBasicBlock::iterator I = MBB.end();
|
|
+
|
|
+ if (I == MBB.begin()) {
|
|
+ return 0;
|
|
+ }
|
|
+ --I;
|
|
+ switch (I->getOpcode()) {
|
|
+ default:
|
|
+ return 0;
|
|
+ case AMDGPU::JUMP:
|
|
+ if (isPredicated(I)) {
|
|
+ MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
|
|
+ clearFlag(predSet, 0, MO_FLAG_PUSH);
|
|
+ }
|
|
+ I->eraseFromParent();
|
|
+ break;
|
|
+ }
|
|
+ I = MBB.end();
|
|
+
|
|
+ if (I == MBB.begin()) {
|
|
+ return 1;
|
|
+ }
|
|
+ --I;
|
|
+ switch (I->getOpcode()) {
|
|
+ // FIXME: only one case??
|
|
+ default:
|
|
+ return 1;
|
|
+ case AMDGPU::JUMP:
|
|
+ if (isPredicated(I)) {
|
|
+ MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
|
|
+ clearFlag(predSet, 0, MO_FLAG_PUSH);
|
|
+ }
|
|
+ I->eraseFromParent();
|
|
+ break;
|
|
+ }
|
|
+ return 2;
|
|
+}
|
|
+
|
|
+bool
|
|
+R600InstrInfo::isPredicated(const MachineInstr *MI) const {
|
|
+ int idx = MI->findFirstPredOperandIdx();
|
|
+ if (idx < 0)
|
|
+ return false;
|
|
+
|
|
+ unsigned Reg = MI->getOperand(idx).getReg();
|
|
+ switch (Reg) {
|
|
+ default: return false;
|
|
+ case AMDGPU::PRED_SEL_ONE:
|
|
+ case AMDGPU::PRED_SEL_ZERO:
|
|
+ case AMDGPU::PREDICATE_BIT:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool
|
|
+R600InstrInfo::isPredicable(MachineInstr *MI) const {
|
|
+ // XXX: KILL* instructions can be predicated, but they must be the last
|
|
+ // instruction in a clause, so this means any instructions after them cannot
|
|
+ // be predicated. Until we have proper support for instruction clauses in the
|
|
+ // backend, we will mark KILL* instructions as unpredicable.
|
|
+
|
|
+ if (MI->getOpcode() == AMDGPU::KILLGT) {
|
|
+ return false;
|
|
+ } else {
|
|
+ return AMDGPUInstrInfo::isPredicable(MI);
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+bool
|
|
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
|
|
+ unsigned NumCyles,
|
|
+ unsigned ExtraPredCycles,
|
|
+ const BranchProbability &Probability) const{
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
|
|
+ unsigned NumTCycles,
|
|
+ unsigned ExtraTCycles,
|
|
+ MachineBasicBlock &FMBB,
|
|
+ unsigned NumFCycles,
|
|
+ unsigned ExtraFCycles,
|
|
+ const BranchProbability &Probability) const {
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
|
|
+ unsigned NumCyles,
|
|
+ const BranchProbability &Probability)
|
|
+ const {
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool
|
|
+R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
|
|
+ MachineBasicBlock &FMBB) const {
|
|
+ return false;
|
|
+}
|
|
+
|
|
+
|
|
+bool
|
|
+R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
|
|
+ MachineOperand &MO = Cond[1];
|
|
+ switch (MO.getImm()) {
|
|
+ case OPCODE_IS_ZERO_INT:
|
|
+ MO.setImm(OPCODE_IS_NOT_ZERO_INT);
|
|
+ break;
|
|
+ case OPCODE_IS_NOT_ZERO_INT:
|
|
+ MO.setImm(OPCODE_IS_ZERO_INT);
|
|
+ break;
|
|
+ case OPCODE_IS_ZERO:
|
|
+ MO.setImm(OPCODE_IS_NOT_ZERO);
|
|
+ break;
|
|
+ case OPCODE_IS_NOT_ZERO:
|
|
+ MO.setImm(OPCODE_IS_ZERO);
|
|
+ break;
|
|
+ default:
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ MachineOperand &MO2 = Cond[2];
|
|
+ switch (MO2.getReg()) {
|
|
+ case AMDGPU::PRED_SEL_ZERO:
|
|
+ MO2.setReg(AMDGPU::PRED_SEL_ONE);
|
|
+ break;
|
|
+ case AMDGPU::PRED_SEL_ONE:
|
|
+ MO2.setReg(AMDGPU::PRED_SEL_ZERO);
|
|
+ break;
|
|
+ default:
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool
|
|
+R600InstrInfo::DefinesPredicate(MachineInstr *MI,
|
|
+ std::vector<MachineOperand> &Pred) const {
|
|
+ return isPredicateSetter(MI->getOpcode());
|
|
+}
|
|
+
|
|
+
|
|
+bool
|
|
+R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
|
|
+ const SmallVectorImpl<MachineOperand> &Pred2) const {
|
|
+ return false;
|
|
+}
|
|
+
|
|
+
|
|
+bool
|
|
+R600InstrInfo::PredicateInstruction(MachineInstr *MI,
|
|
+ const SmallVectorImpl<MachineOperand> &Pred) const {
|
|
+ int PIdx = MI->findFirstPredOperandIdx();
|
|
+
|
|
+ if (PIdx != -1) {
|
|
+ MachineOperand &PMO = MI->getOperand(PIdx);
|
|
+ PMO.setReg(Pred[2].getReg());
|
|
+ MachineInstrBuilder(MI).addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
|
|
+ const MachineInstr *MI,
|
|
+ unsigned *PredCost) const {
|
|
+ if (PredCost)
|
|
+ *PredCost = 2;
|
|
+ return 2;
|
|
+}
|
|
+
|
|
+int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
|
|
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
|
|
+ int Offset = 0;
|
|
+
|
|
+ if (MFI->getNumObjects() == 0) {
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ if (MRI.livein_empty()) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
|
|
+ LE = MRI.livein_end();
|
|
+ LI != LE; ++LI) {
|
|
+ Offset = std::max(Offset,
|
|
+ GET_REG_INDEX(RI.getEncodingValue(LI->first)));
|
|
+ }
|
|
+
|
|
+ return Offset + 1;
|
|
+}
|
|
+
|
|
+int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
|
|
+ int Offset = 0;
|
|
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
|
|
+
|
|
+ // Variable sized objects are not supported
|
|
+ assert(!MFI->hasVarSizedObjects());
|
|
+
|
|
+ if (MFI->getNumObjects() == 0) {
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
|
|
+
|
|
+ return getIndirectIndexBegin(MF) + Offset;
|
|
+}
|
|
+
|
|
+std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs(
|
|
+ const MachineFunction &MF) const {
|
|
+ const AMDGPUFrameLowering *TFL =
|
|
+ static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
|
|
+ std::vector<unsigned> Regs;
|
|
+
|
|
+ unsigned StackWidth = TFL->getStackWidth(MF);
|
|
+ int End = getIndirectIndexEnd(MF);
|
|
+
|
|
+ if (End == -1) {
|
|
+ return Regs;
|
|
+ }
|
|
+
|
|
+ for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
|
|
+ unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
|
|
+ Regs.push_back(SuperReg);
|
|
+ for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
|
|
+ unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
|
|
+ Regs.push_back(Reg);
|
|
+ }
|
|
+ }
|
|
+ return Regs;
|
|
+}
|
|
+
|
|
+unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
|
|
+ unsigned Channel) const {
|
|
+ // XXX: Remove when we support a stack width > 2
|
|
+ assert(Channel == 0);
|
|
+ return RegIndex;
|
|
+}
|
|
+
|
|
+const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass(
|
|
+ unsigned SourceReg) const {
|
|
+ return &AMDGPU::R600_TReg32RegClass;
|
|
+}
|
|
+
|
|
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const {
|
|
+ return &AMDGPU::TRegMemRegClass;
|
|
+}
|
|
+
|
|
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg, unsigned Address,
|
|
+ unsigned OffsetReg) const {
|
|
+ unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
|
|
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
|
|
+ AMDGPU::AR_X, OffsetReg);
|
|
+ setImmOperand(MOVA, R600Operands::WRITE, 0);
|
|
+
|
|
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
|
|
+ AddrReg, ValueReg)
|
|
+ .addReg(AMDGPU::AR_X, RegState::Implicit);
|
|
+ setImmOperand(Mov, R600Operands::DST_REL, 1);
|
|
+ return Mov;
|
|
+}
|
|
+
|
|
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg, unsigned Address,
|
|
+ unsigned OffsetReg) const {
|
|
+ unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
|
|
+ MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
|
|
+ AMDGPU::AR_X,
|
|
+ OffsetReg);
|
|
+ setImmOperand(MOVA, R600Operands::WRITE, 0);
|
|
+ MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
|
|
+ ValueReg,
|
|
+ AddrReg)
|
|
+ .addReg(AMDGPU::AR_X, RegState::Implicit);
|
|
+ setImmOperand(Mov, R600Operands::SRC0_REL, 1);
|
|
+
|
|
+ return Mov;
|
|
+}
|
|
+
|
|
+const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const {
|
|
+ return &AMDGPU::IndirectRegRegClass;
|
|
+}
|
|
+
|
|
+
|
|
+MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned Opcode,
|
|
+ unsigned DstReg,
|
|
+ unsigned Src0Reg,
|
|
+ unsigned Src1Reg) const {
|
|
+ MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode),
|
|
+ DstReg); // $dst
|
|
+
|
|
+ if (Src1Reg) {
|
|
+ MIB.addImm(0) // $update_exec_mask
|
|
+ .addImm(0); // $update_predicate
|
|
+ }
|
|
+ MIB.addImm(1) // $write
|
|
+ .addImm(0) // $omod
|
|
+ .addImm(0) // $dst_rel
|
|
+ .addImm(0) // $dst_clamp
|
|
+ .addReg(Src0Reg) // $src0
|
|
+ .addImm(0) // $src0_neg
|
|
+ .addImm(0) // $src0_rel
|
|
+ .addImm(0) // $src0_abs
|
|
+ .addImm(-1); // $src0_sel
|
|
+
|
|
+ if (Src1Reg) {
|
|
+ MIB.addReg(Src1Reg) // $src1
|
|
+ .addImm(0) // $src1_neg
|
|
+ .addImm(0) // $src1_rel
|
|
+ .addImm(0) // $src1_abs
|
|
+ .addImm(-1); // $src1_sel
|
|
+ }
|
|
+
|
|
+ // XXX: The r600g finalizer expects this to be 1; once we've moved the
|
|
+ // scheduling to the backend, we can change the default to 0.
|
|
+ MIB.addImm(1) // $last
|
|
+ .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
|
|
+ .addImm(0); // $literal
|
|
+
|
|
+ return MIB;
|
|
+}
|
|
+
|
|
+MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned DstReg,
|
|
+ uint64_t Imm) const {
|
|
+ MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
|
|
+ AMDGPU::ALU_LITERAL_X);
|
|
+ setImmOperand(MovImm, R600Operands::IMM, Imm);
|
|
+ return MovImm;
|
|
+}
|
|
+
|
|
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
|
|
+ R600Operands::Ops Op) const {
|
|
+ return getOperandIdx(MI.getOpcode(), Op);
|
|
+}
|
|
+
|
|
+int R600InstrInfo::getOperandIdx(unsigned Opcode,
|
|
+ R600Operands::Ops Op) const {
|
|
+ unsigned TargetFlags = get(Opcode).TSFlags;
|
|
+ unsigned OpTableIdx;
|
|
+
|
|
+ if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
|
|
+ switch (Op) {
|
|
+ case R600Operands::DST: return 0;
|
|
+ case R600Operands::SRC0: return 1;
|
|
+ case R600Operands::SRC1: return 2;
|
|
+ case R600Operands::SRC2: return 3;
|
|
+ default:
|
|
+ assert(!"Unknown operand type for instruction");
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (TargetFlags & R600_InstFlag::OP1) {
|
|
+ OpTableIdx = 0;
|
|
+ } else if (TargetFlags & R600_InstFlag::OP2) {
|
|
+ OpTableIdx = 1;
|
|
+ } else {
|
|
+ assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined "
|
|
+ "for this instruction");
|
|
+ OpTableIdx = 2;
|
|
+ }
|
|
+
|
|
+ return R600Operands::ALUOpTable[OpTableIdx][Op];
|
|
+}
|
|
+
|
|
+void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
|
|
+ int64_t Imm) const {
|
|
+ int Idx = getOperandIdx(*MI, Op);
|
|
+ assert(Idx != -1 && "Operand not supported for this instruction.");
|
|
+ assert(MI->getOperand(Idx).isImm());
|
|
+ MI->getOperand(Idx).setImm(Imm);
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Instruction flag getters/setters
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
|
|
+ return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
|
|
+}
|
|
+
|
|
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
|
|
+ unsigned Flag) const {
|
|
+ unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
|
|
+ int FlagIndex = 0;
|
|
+ if (Flag != 0) {
|
|
+ // If we pass something other than the default value of Flag to this
|
|
+ // function, it means we want to set a flag on an instruction
|
|
+ // that uses native encoding.
|
|
+ assert(HAS_NATIVE_OPERANDS(TargetFlags));
|
|
+ bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
|
|
+ switch (Flag) {
|
|
+ case MO_FLAG_CLAMP:
|
|
+ FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP);
|
|
+ break;
|
|
+ case MO_FLAG_MASK:
|
|
+ FlagIndex = getOperandIdx(*MI, R600Operands::WRITE);
|
|
+ break;
|
|
+ case MO_FLAG_NOT_LAST:
|
|
+ case MO_FLAG_LAST:
|
|
+ FlagIndex = getOperandIdx(*MI, R600Operands::LAST);
|
|
+ break;
|
|
+ case MO_FLAG_NEG:
|
|
+ switch (SrcIdx) {
|
|
+ case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break;
|
|
+ case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break;
|
|
+ case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break;
|
|
+ }
|
|
+ break;
|
|
+
|
|
+ case MO_FLAG_ABS:
|
|
+ assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
|
|
+ "instructions.");
|
|
+ switch (SrcIdx) {
|
|
+ case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break;
|
|
+ case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break;
|
|
+ }
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ FlagIndex = -1;
|
|
+ break;
|
|
+ }
|
|
+ assert(FlagIndex != -1 && "Flag not supported for this instruction");
|
|
+ } else {
|
|
+ FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
|
|
+ assert(FlagIndex != 0 &&
|
|
+ "Instruction flags not supported for this instruction");
|
|
+ }
|
|
+
|
|
+ MachineOperand &FlagOp = MI->getOperand(FlagIndex);
|
|
+ assert(FlagOp.isImm());
|
|
+ return FlagOp;
|
|
+}
|
|
+
|
|
+void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
|
|
+ unsigned Flag) const {
|
|
+ unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
|
|
+ if (Flag == 0) {
|
|
+ return;
|
|
+ }
|
|
+ if (HAS_NATIVE_OPERANDS(TargetFlags)) {
|
|
+ MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
|
|
+ if (Flag == MO_FLAG_NOT_LAST) {
|
|
+ clearFlag(MI, Operand, MO_FLAG_LAST);
|
|
+ } else if (Flag == MO_FLAG_MASK) {
|
|
+ clearFlag(MI, Operand, Flag);
|
|
+ } else {
|
|
+ FlagOp.setImm(1);
|
|
+ }
|
|
+ } else {
|
|
+ MachineOperand &FlagOp = getFlagOp(MI, Operand);
|
|
+ FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand)));
|
|
+ }
|
|
+}
|
|
+
|
|
+void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
|
|
+ unsigned Flag) const {
|
|
+ unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
|
|
+ if (HAS_NATIVE_OPERANDS(TargetFlags)) {
|
|
+ MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
|
|
+ FlagOp.setImm(0);
|
|
+ } else {
|
|
+ MachineOperand &FlagOp = getFlagOp(MI);
|
|
+ unsigned InstFlags = FlagOp.getImm();
|
|
+ InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand));
|
|
+ FlagOp.setImm(InstFlags);
|
|
+ }
|
|
+}
|
|
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
|
|
new file mode 100644
|
|
index 0000000..278fad1
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600InstrInfo.h
|
|
@@ -0,0 +1,201 @@
|
|
+//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface definition for R600InstrInfo
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef R600INSTRUCTIONINFO_H_
|
|
+#define R600INSTRUCTIONINFO_H_
|
|
+
|
|
+#include "AMDIL.h"
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "R600Defines.h"
|
|
+#include "R600RegisterInfo.h"
|
|
+
|
|
+#include <map>
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+ class AMDGPUTargetMachine;
|
|
+ class DFAPacketizer;
|
|
+ class ScheduleDAG;
|
|
+ class MachineFunction;
|
|
+ class MachineInstr;
|
|
+ class MachineInstrBuilder;
|
|
+
|
|
+ class R600InstrInfo : public AMDGPUInstrInfo {
|
|
+ private:
|
|
+ const R600RegisterInfo RI;
|
|
+
|
|
+ int getBranchInstr(const MachineOperand &op) const;
|
|
+
|
|
+ public:
|
|
+ explicit R600InstrInfo(AMDGPUTargetMachine &tm);
|
|
+
|
|
+ const R600RegisterInfo &getRegisterInfo() const;
|
|
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI, DebugLoc DL,
|
|
+ unsigned DestReg, unsigned SrcReg,
|
|
+ bool KillSrc) const;
|
|
+
|
|
+ bool isTrig(const MachineInstr &MI) const;
|
|
+ bool isPlaceHolderOpcode(unsigned opcode) const;
|
|
+ bool isReductionOp(unsigned opcode) const;
|
|
+ bool isCubeOp(unsigned opcode) const;
|
|
+
|
|
+ /// \returns true if this \p Opcode represents an ALU instruction.
|
|
+ bool isALUInstr(unsigned Opcode) const;
|
|
+
|
|
+ /// \brief Vector instructions are instructions that must fill all
|
|
+ /// instruction slots within an instruction group.
|
|
+ bool isVector(const MachineInstr &MI) const;
|
|
+
|
|
+ virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
|
|
+ int64_t Imm) const;
|
|
+
|
|
+ virtual unsigned getIEQOpcode() const;
|
|
+ virtual bool isMov(unsigned Opcode) const;
|
|
+
|
|
+ DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
|
|
+ const ScheduleDAG *DAG) const;
|
|
+
|
|
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
|
|
+
|
|
+ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
|
|
+ SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
|
|
+
|
|
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
|
|
+
|
|
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const;
|
|
+
|
|
+ bool isPredicated(const MachineInstr *MI) const;
|
|
+
|
|
+ bool isPredicable(MachineInstr *MI) const;
|
|
+
|
|
+ bool
|
|
+ isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
|
|
+ const BranchProbability &Probability) const;
|
|
+
|
|
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
|
|
+ unsigned ExtraPredCycles,
|
|
+ const BranchProbability &Probability) const ;
|
|
+
|
|
+ bool
|
|
+ isProfitableToIfCvt(MachineBasicBlock &TMBB,
|
|
+ unsigned NumTCycles, unsigned ExtraTCycles,
|
|
+ MachineBasicBlock &FMBB,
|
|
+ unsigned NumFCycles, unsigned ExtraFCycles,
|
|
+ const BranchProbability &Probability) const;
|
|
+
|
|
+ bool DefinesPredicate(MachineInstr *MI,
|
|
+ std::vector<MachineOperand> &Pred) const;
|
|
+
|
|
+ bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
|
|
+ const SmallVectorImpl<MachineOperand> &Pred2) const;
|
|
+
|
|
+ bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
|
|
+ MachineBasicBlock &FMBB) const;
|
|
+
|
|
+ bool PredicateInstruction(MachineInstr *MI,
|
|
+ const SmallVectorImpl<MachineOperand> &Pred) const;
|
|
+
|
|
+ unsigned int getInstrLatency(const InstrItineraryData *ItinData,
|
|
+ const MachineInstr *MI,
|
|
+ unsigned *PredCost = 0) const;
|
|
+
|
|
+ virtual int getInstrLatency(const InstrItineraryData *ItinData,
|
|
+ SDNode *Node) const { return 1;}
|
|
+
|
|
+ /// \returns a list of all the registers that may be accessed using indirect
|
|
+ /// addressing.
|
|
+ std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const;
|
|
+
|
|
+ virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
|
|
+
|
|
+ virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
|
|
+
|
|
+
|
|
+ virtual unsigned calculateIndirectAddress(unsigned RegIndex,
|
|
+ unsigned Channel) const;
|
|
+
|
|
+ virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
|
|
+ unsigned SourceReg) const;
|
|
+
|
|
+ virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
|
|
+
|
|
+ virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg, unsigned Address,
|
|
+ unsigned OffsetReg) const;
|
|
+
|
|
+ virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg, unsigned Address,
|
|
+ unsigned OffsetReg) const;
|
|
+
|
|
+ virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
|
|
+
|
|
+
|
|
+ /// buildDefaultInstruction - This function returns a MachineInstr with
|
|
+ /// all the instruction modifiers initialized to their default values.
|
|
+ /// You can use this function to avoid manually specifying each instruction
|
|
+ /// modifier operand when building a new instruction.
|
|
+ ///
|
|
+ /// \returns a MachineInstr with all the instruction modifiers initialized
|
|
+ /// to their default values.
|
|
+ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned Opcode,
|
|
+ unsigned DstReg,
|
|
+ unsigned Src0Reg,
|
|
+ unsigned Src1Reg = 0) const;
|
|
+
|
|
+ MachineInstr *buildMovImm(MachineBasicBlock &BB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned DstReg,
|
|
+ uint64_t Imm) const;
|
|
+
|
|
+ /// \brief Get the index of Op in the MachineInstr.
|
|
+ ///
|
|
+ /// \returns -1 if the Instruction does not contain the specified \p Op.
|
|
+ int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
|
|
+
|
|
+ /// \brief Get the index of \p Op for the given Opcode.
|
|
+ ///
|
|
+ /// \returns -1 if the Instruction does not contain the specified \p Op.
|
|
+ int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
|
|
+
|
|
+ /// \brief Helper function for setting instruction flag values.
|
|
+ void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
|
|
+
|
|
+ /// \returns true if this instruction has an operand for storing target flags.
|
|
+ bool hasFlagOperand(const MachineInstr &MI) const;
|
|
+
|
|
+ ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
|
|
+ void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
|
|
+
|
|
+ ///\brief Determine if the specified \p Flag is set on this \p Operand.
|
|
+ bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
|
|
+
|
|
+ /// \param SrcIdx The register source to set the flag on (e.g. src0, src1, src2)
|
|
+ /// \param Flag The flag being set.
|
|
+ ///
|
|
+ /// \returns the operand containing the flags for this instruction.
|
|
+ MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
|
|
+ unsigned Flag = 0) const;
|
|
+
|
|
+ /// \brief Clear the specified flag on the instruction.
|
|
+ void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
|
|
+};
|
|
+
|
|
+} // End llvm namespace
|
|
+
|
|
+#endif // R600INSTRUCTIONINFO_H_
|
|
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
|
|
new file mode 100644
|
|
index 0000000..409da07
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600Instructions.td
|
|
@@ -0,0 +1,1976 @@
|
|
+//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// R600 Tablegen instruction definitions
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+include "R600Intrinsics.td"
|
|
+
|
|
+class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
|
|
+ InstrItinClass itin>
|
|
+ : AMDGPUInst <outs, ins, asm, pattern> {
|
|
+
|
|
+ field bits<64> Inst;
|
|
+ bit Trig = 0;
|
|
+ bit Op3 = 0;
|
|
+ bit isVector = 0;
|
|
+ bits<2> FlagOperandIdx = 0;
|
|
+ bit Op1 = 0;
|
|
+ bit Op2 = 0;
|
|
+ bit HasNativeOperands = 0;
|
|
+
|
|
+ bits<11> op_code = inst;
|
|
+ //let Inst = inst;
|
|
+ let Namespace = "AMDGPU";
|
|
+ let OutOperandList = outs;
|
|
+ let InOperandList = ins;
|
|
+ let AsmString = asm;
|
|
+ let Pattern = pattern;
|
|
+ let Itinerary = itin;
|
|
+
|
|
+ let TSFlags{4} = Trig;
|
|
+ let TSFlags{5} = Op3;
|
|
+
|
|
+ // Vector instructions are instructions that must fill all slots in an
|
|
+ // instruction group
|
|
+ let TSFlags{6} = isVector;
|
|
+ let TSFlags{8-7} = FlagOperandIdx;
|
|
+ let TSFlags{9} = HasNativeOperands;
|
|
+ let TSFlags{10} = Op1;
|
|
+ let TSFlags{11} = Op2;
|
|
+}
|
|
+
|
|
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ AMDGPUInst <outs, ins, asm, pattern> {
|
|
+ field bits<64> Inst;
|
|
+
|
|
+ let Namespace = "AMDGPU";
|
|
+}
|
|
+
|
|
+def MEMxi : Operand<iPTR> {
|
|
+ let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index);
|
|
+ let PrintMethod = "printMemOperand";
|
|
+}
|
|
+
|
|
+def MEMrr : Operand<iPTR> {
|
|
+ let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
|
|
+}
|
|
+
|
|
+// Operands for non-registers
|
|
+
|
|
+class InstFlag<string PM = "printOperand", int Default = 0>
|
|
+ : OperandWithDefaultOps <i32, (ops (i32 Default))> {
|
|
+ let PrintMethod = PM;
|
|
+}
|
|
+
|
|
+// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
|
|
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
|
|
+ let PrintMethod = "printSel";
|
|
+}
|
|
+
|
|
+def LITERAL : InstFlag<"printLiteral">;
|
|
+
|
|
+def WRITE : InstFlag <"printWrite", 1>;
|
|
+def OMOD : InstFlag <"printOMOD">;
|
|
+def REL : InstFlag <"printRel">;
|
|
+def CLAMP : InstFlag <"printClamp">;
|
|
+def NEG : InstFlag <"printNeg">;
|
|
+def ABS : InstFlag <"printAbs">;
|
|
+def UEM : InstFlag <"printUpdateExecMask">;
|
|
+def UP : InstFlag <"printUpdatePred">;
|
|
+
|
|
+// XXX: The r600g finalizer in Mesa expects last to be one in most cases.
|
|
+// Once we start using the packetizer in this backend we should have this
|
|
+// default to 0.
|
|
+def LAST : InstFlag<"printLast", 1>;
|
|
+
|
|
+def FRAMEri : Operand<iPTR> {
|
|
+ let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
|
|
+}
|
|
+
|
|
+def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
|
|
+def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
|
|
+def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
|
|
+def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
|
|
+def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
|
|
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
|
|
+
|
|
+class R600ALU_Word0 {
|
|
+ field bits<32> Word0;
|
|
+
|
|
+ bits<11> src0;
|
|
+ bits<1> src0_neg;
|
|
+ bits<1> src0_rel;
|
|
+ bits<11> src1;
|
|
+ bits<1> src1_rel;
|
|
+ bits<1> src1_neg;
|
|
+ bits<3> index_mode = 0;
|
|
+ bits<2> pred_sel;
|
|
+ bits<1> last;
|
|
+
|
|
+ bits<9> src0_sel = src0{8-0};
|
|
+ bits<2> src0_chan = src0{10-9};
|
|
+ bits<9> src1_sel = src1{8-0};
|
|
+ bits<2> src1_chan = src1{10-9};
|
|
+
|
|
+ let Word0{8-0} = src0_sel;
|
|
+ let Word0{9} = src0_rel;
|
|
+ let Word0{11-10} = src0_chan;
|
|
+ let Word0{12} = src0_neg;
|
|
+ let Word0{21-13} = src1_sel;
|
|
+ let Word0{22} = src1_rel;
|
|
+ let Word0{24-23} = src1_chan;
|
|
+ let Word0{25} = src1_neg;
|
|
+ let Word0{28-26} = index_mode;
|
|
+ let Word0{30-29} = pred_sel;
|
|
+ let Word0{31} = last;
|
|
+}
|
|
+
|
|
+class R600ALU_Word1 {
|
|
+ field bits<32> Word1;
|
|
+
|
|
+ bits<11> dst;
|
|
+ bits<3> bank_swizzle = 0;
|
|
+ bits<1> dst_rel;
|
|
+ bits<1> clamp;
|
|
+
|
|
+ bits<7> dst_sel = dst{6-0};
|
|
+ bits<2> dst_chan = dst{10-9};
|
|
+
|
|
+ let Word1{20-18} = bank_swizzle;
|
|
+ let Word1{27-21} = dst_sel;
|
|
+ let Word1{28} = dst_rel;
|
|
+ let Word1{30-29} = dst_chan;
|
|
+ let Word1{31} = clamp;
|
|
+}
|
|
+
|
|
+class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
|
|
+
|
|
+ bits<1> src0_abs;
|
|
+ bits<1> src1_abs;
|
|
+ bits<1> update_exec_mask;
|
|
+ bits<1> update_pred;
|
|
+ bits<1> write;
|
|
+ bits<2> omod;
|
|
+
|
|
+ let Word1{0} = src0_abs;
|
|
+ let Word1{1} = src1_abs;
|
|
+ let Word1{2} = update_exec_mask;
|
|
+ let Word1{3} = update_pred;
|
|
+ let Word1{4} = write;
|
|
+ let Word1{6-5} = omod;
|
|
+ let Word1{17-7} = alu_inst;
|
|
+}
|
|
+
|
|
+class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
|
|
+
|
|
+ bits<11> src2;
|
|
+ bits<1> src2_rel;
|
|
+ bits<1> src2_neg;
|
|
+
|
|
+ bits<9> src2_sel = src2{8-0};
|
|
+ bits<2> src2_chan = src2{10-9};
|
|
+
|
|
+ let Word1{8-0} = src2_sel;
|
|
+ let Word1{9} = src2_rel;
|
|
+ let Word1{11-10} = src2_chan;
|
|
+ let Word1{12} = src2_neg;
|
|
+ let Word1{17-13} = alu_inst;
|
|
+}
|
|
+
|
|
+class VTX_WORD0 {
|
|
+ field bits<32> Word0;
|
|
+ bits<7> SRC_GPR;
|
|
+ bits<5> VC_INST;
|
|
+ bits<2> FETCH_TYPE;
|
|
+ bits<1> FETCH_WHOLE_QUAD;
|
|
+ bits<8> BUFFER_ID;
|
|
+ bits<1> SRC_REL;
|
|
+ bits<2> SRC_SEL_X;
|
|
+ bits<6> MEGA_FETCH_COUNT;
|
|
+
|
|
+ let Word0{4-0} = VC_INST;
|
|
+ let Word0{6-5} = FETCH_TYPE;
|
|
+ let Word0{7} = FETCH_WHOLE_QUAD;
|
|
+ let Word0{15-8} = BUFFER_ID;
|
|
+ let Word0{22-16} = SRC_GPR;
|
|
+ let Word0{23} = SRC_REL;
|
|
+ let Word0{25-24} = SRC_SEL_X;
|
|
+ let Word0{31-26} = MEGA_FETCH_COUNT;
|
|
+}
|
|
+
|
|
+class VTX_WORD1_GPR {
|
|
+ field bits<32> Word1;
|
|
+ bits<7> DST_GPR;
|
|
+ bits<1> DST_REL;
|
|
+ bits<3> DST_SEL_X;
|
|
+ bits<3> DST_SEL_Y;
|
|
+ bits<3> DST_SEL_Z;
|
|
+ bits<3> DST_SEL_W;
|
|
+ bits<1> USE_CONST_FIELDS;
|
|
+ bits<6> DATA_FORMAT;
|
|
+ bits<2> NUM_FORMAT_ALL;
|
|
+ bits<1> FORMAT_COMP_ALL;
|
|
+ bits<1> SRF_MODE_ALL;
|
|
+
|
|
+ let Word1{6-0} = DST_GPR;
|
|
+ let Word1{7} = DST_REL;
|
|
+ let Word1{8} = 0; // Reserved
|
|
+ let Word1{11-9} = DST_SEL_X;
|
|
+ let Word1{14-12} = DST_SEL_Y;
|
|
+ let Word1{17-15} = DST_SEL_Z;
|
|
+ let Word1{20-18} = DST_SEL_W;
|
|
+ let Word1{21} = USE_CONST_FIELDS;
|
|
+ let Word1{27-22} = DATA_FORMAT;
|
|
+ let Word1{29-28} = NUM_FORMAT_ALL;
|
|
+ let Word1{30} = FORMAT_COMP_ALL;
|
|
+ let Word1{31} = SRF_MODE_ALL;
|
|
+}
|
|
+
|
|
+/*
|
|
+XXX: R600 subtarget uses a slightly different encoding than the other
|
|
+subtargets. We currently handle this in R600MCCodeEmitter, but we may
|
|
+want to use these instruction classes in the future.
|
|
+
|
|
+class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
|
|
+
|
|
+ bits<1> fog_merge;
|
|
+ bits<10> alu_inst;
|
|
+
|
|
+ let Inst{37} = fog_merge;
|
|
+ let Inst{39-38} = omod;
|
|
+ let Inst{49-40} = alu_inst;
|
|
+}
|
|
+
|
|
+class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
|
|
+
|
|
+ bits<11> alu_inst;
|
|
+
|
|
+ let Inst{38-37} = omod;
|
|
+ let Inst{49-39} = alu_inst;
|
|
+}
|
|
+*/
|
|
+
|
|
+def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
|
|
+ (ops PRED_SEL_OFF)>;
|
|
+
|
|
+
|
|
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
|
|
+
|
|
+// Class for instructions with only one source register.
|
|
+// If you add new ins to this instruction, make sure they are listed before
|
|
+// $literal, because the backend currently assumes that the last operand is
|
|
+// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in
|
|
+// R600Defines.h, R600InstrInfo::buildDefaultInstruction(),
|
|
+// and R600InstrInfo::getOperandIdx().
|
|
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
|
|
+ InstrItinClass itin = AnyALU> :
|
|
+ InstR600 <0,
|
|
+ (outs R600_Reg32:$dst),
|
|
+ (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
|
|
+ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
|
|
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
|
|
+ !strconcat(opName,
|
|
+ "$clamp $dst$write$dst_rel$omod, "
|
|
+ "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
|
|
+ "$literal $pred_sel$last"),
|
|
+ pattern,
|
|
+ itin>,
|
|
+ R600ALU_Word0,
|
|
+ R600ALU_Word1_OP2 <inst> {
|
|
+
|
|
+ let src1 = 0;
|
|
+ let src1_rel = 0;
|
|
+ let src1_neg = 0;
|
|
+ let src1_abs = 0;
|
|
+ let update_exec_mask = 0;
|
|
+ let update_pred = 0;
|
|
+ let HasNativeOperands = 1;
|
|
+ let Op1 = 1;
|
|
+ let DisableEncoding = "$literal";
|
|
+
|
|
+ let Inst{31-0} = Word0;
|
|
+ let Inst{63-32} = Word1;
|
|
+}
|
|
+
|
|
+class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
|
|
+ InstrItinClass itin = AnyALU> :
|
|
+ R600_1OP <inst, opName,
|
|
+ [(set R600_Reg32:$dst, (node R600_Reg32:$src0))]
|
|
+>;
|
|
+
|
|
+// If you add or change the operands for R600_2OP instructions, you must
|
|
+// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
|
|
+// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
|
|
+class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
|
|
+ InstrItinClass itin = AnyALU> :
|
|
+ InstR600 <inst,
|
|
+ (outs R600_Reg32:$dst),
|
|
+ (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
|
|
+ OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
|
|
+ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
|
|
+ R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
|
|
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
|
|
+ !strconcat(opName,
|
|
+ "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
|
|
+ "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
|
|
+ "$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, "
|
|
+ "$literal $pred_sel$last"),
|
|
+ pattern,
|
|
+ itin>,
|
|
+ R600ALU_Word0,
|
|
+ R600ALU_Word1_OP2 <inst> {
|
|
+
|
|
+ let HasNativeOperands = 1;
|
|
+ let Op2 = 1;
|
|
+ let DisableEncoding = "$literal";
|
|
+
|
|
+ let Inst{31-0} = Word0;
|
|
+ let Inst{63-32} = Word1;
|
|
+}
|
|
+
|
|
+class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
|
|
+ InstrItinClass itin = AnyALU> :
|
|
+ R600_2OP <inst, opName,
|
|
+ [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
|
|
+ R600_Reg32:$src1))]
|
|
+>;
|
|
+
|
|
+// If you add or change the operands for R600_3OP instructions, you must
|
|
+// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
|
|
+// R600InstrInfo::buildDefaultInstruction(), and
|
|
+// R600InstrInfo::getOperandIdx().
|
|
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
|
|
+ InstrItinClass itin = AnyALU> :
|
|
+ InstR600 <0,
|
|
+ (outs R600_Reg32:$dst),
|
|
+ (ins REL:$dst_rel, CLAMP:$clamp,
|
|
+ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
|
|
+ R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
|
|
+ R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
|
|
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
|
|
+ !strconcat(opName, "$clamp $dst$dst_rel, "
|
|
+ "$src0_neg$src0$src0_sel$src0_rel, "
|
|
+ "$src1_neg$src1$src1_sel$src1_rel, "
|
|
+ "$src2_neg$src2$src2_sel$src2_rel, "
|
|
+ "$literal $pred_sel$last"),
|
|
+ pattern,
|
|
+ itin>,
|
|
+ R600ALU_Word0,
|
|
+ R600ALU_Word1_OP3<inst>{
|
|
+
|
|
+ let HasNativeOperands = 1;
|
|
+ let DisableEncoding = "$literal";
|
|
+ let Op3 = 1;
|
|
+
|
|
+ let Inst{31-0} = Word0;
|
|
+ let Inst{63-32} = Word1;
|
|
+}
|
|
+
|
|
+class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
|
|
+ InstrItinClass itin = VecALU> :
|
|
+ InstR600 <inst,
|
|
+ (outs R600_Reg32:$dst),
|
|
+ ins,
|
|
+ asm,
|
|
+ pattern,
|
|
+ itin>;
|
|
+
|
|
+class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
|
|
+ InstrItinClass itin = AnyALU> :
|
|
+ InstR600 <inst,
|
|
+ (outs R600_Reg128:$dst),
|
|
+ (ins R600_Reg128:$src0, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
|
|
+ !strconcat(opName, "$dst, $src0, $resourceId, $samplerId, $textureTarget"),
|
|
+ pattern,
|
|
+ itin>{
|
|
+ let Inst {10-0} = inst;
|
|
+ }
|
|
+
|
|
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
|
|
+
|
|
+def TEX_SHADOW : PatLeaf<
|
|
+ (imm),
|
|
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
|
|
+ return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13);
|
|
+ }]
|
|
+>;
|
|
+
|
|
+def TEX_RECT : PatLeaf<
|
|
+ (imm),
|
|
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
|
|
+ return TType == 5;
|
|
+ }]
|
|
+>;
|
|
+
|
|
+def TEX_ARRAY : PatLeaf<
|
|
+ (imm),
|
|
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
|
|
+ return TType == 9 || TType == 10 || TType == 15 || TType == 16;
|
|
+ }]
|
|
+>;
|
|
+
|
|
+def TEX_SHADOW_ARRAY : PatLeaf<
|
|
+ (imm),
|
|
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
|
|
+ return TType == 11 || TType == 12 || TType == 17;
|
|
+ }]
|
|
+>;
|
|
+
|
|
+class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs,
|
|
+ dag ins, string asm, list<dag> pattern> :
|
|
+ InstR600ISA <outs, ins, asm, pattern> {
|
|
+ bits<7> RW_GPR;
|
|
+ bits<7> INDEX_GPR;
|
|
+
|
|
+ bits<2> RIM;
|
|
+ bits<2> TYPE;
|
|
+ bits<1> RW_REL;
|
|
+ bits<2> ELEM_SIZE;
|
|
+
|
|
+ bits<12> ARRAY_SIZE;
|
|
+ bits<4> COMP_MASK;
|
|
+ bits<4> BURST_COUNT;
|
|
+ bits<1> VPM;
|
|
+ bits<1> eop;
|
|
+ bits<1> MARK;
|
|
+ bits<1> BARRIER;
|
|
+
|
|
+ // CF_ALLOC_EXPORT_WORD0_RAT
|
|
+ let Inst{3-0} = rat_id;
|
|
+ let Inst{9-4} = rat_inst;
|
|
+ let Inst{10} = 0; // Reserved
|
|
+ let Inst{12-11} = RIM;
|
|
+ let Inst{14-13} = TYPE;
|
|
+ let Inst{21-15} = RW_GPR;
|
|
+ let Inst{22} = RW_REL;
|
|
+ let Inst{29-23} = INDEX_GPR;
|
|
+ let Inst{31-30} = ELEM_SIZE;
|
|
+
|
|
+ // CF_ALLOC_EXPORT_WORD1_BUF
|
|
+ let Inst{43-32} = ARRAY_SIZE;
|
|
+ let Inst{47-44} = COMP_MASK;
|
|
+ let Inst{51-48} = BURST_COUNT;
|
|
+ let Inst{52} = VPM;
|
|
+ let Inst{53} = eop;
|
|
+ let Inst{61-54} = cf_inst;
|
|
+ let Inst{62} = MARK;
|
|
+ let Inst{63} = BARRIER;
|
|
+}
|
|
+
|
|
+class LoadParamFrag <PatFrag load_type> : PatFrag <
|
|
+ (ops node:$ptr), (load_type node:$ptr),
|
|
+ [{ return isParamLoad(dyn_cast<LoadSDNode>(N)); }]
|
|
+>;
|
|
+
|
|
+def load_param : LoadParamFrag<load>;
|
|
+def load_param_zexti8 : LoadParamFrag<zextloadi8>;
|
|
+def load_param_zexti16 : LoadParamFrag<zextloadi16>;
|
|
+
|
|
+def isR600 : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">;
|
|
+def isR700 : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
|
|
+ "Subtarget.device()->getDeviceFlag()"
|
|
+ ">= OCL_DEVICE_RV710">;
|
|
+def isEG : Predicate<
|
|
+ "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && "
|
|
+ "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && "
|
|
+ "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
|
|
+
|
|
+def isCayman : Predicate<"Subtarget.device()"
|
|
+ "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
|
|
+def isEGorCayman : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() == AMDGPUDeviceInfo::HD5XXX"
|
|
+ "|| Subtarget.device()->getGeneration() =="
|
|
+ "AMDGPUDeviceInfo::HD6XXX">;
|
|
+
|
|
+def isR600toCayman : Predicate<
|
|
+ "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// R600 SDNodes
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def INTERP_PAIR_XY : AMDGPUShaderInst <
|
|
+ (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
|
|
+ (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
|
|
+ "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1",
|
|
+ []>;
|
|
+
|
|
+def INTERP_PAIR_ZW : AMDGPUShaderInst <
|
|
+ (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1),
|
|
+ (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
|
|
+ "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
|
|
+ []>;
|
|
+
|
|
+def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
|
|
+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
|
|
+ [SDNPMayLoad]
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Interpolation Instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def INTERP_VEC_LOAD : AMDGPUShaderInst <
|
|
+ (outs R600_Reg128:$dst),
|
|
+ (ins i32imm:$src0),
|
|
+ "INTERP_LOAD $src0 : $dst",
|
|
+ []>;
|
|
+
|
|
+def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
|
|
+ let bank_swizzle = 5;
|
|
+}
|
|
+
|
|
+def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
|
|
+ let bank_swizzle = 5;
|
|
+}
|
|
+
|
|
+def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Export Instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def ExportType : SDTypeProfile<0, 5, [SDTCisFP<0>, SDTCisInt<1>]>;
|
|
+
|
|
+def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
|
|
+ [SDNPHasChain, SDNPSideEffect]>;
|
|
+
|
|
+class ExportWord0 {
|
|
+ field bits<32> Word0;
|
|
+
|
|
+ bits<13> arraybase;
|
|
+ bits<2> type;
|
|
+ bits<7> gpr;
|
|
+ bits<2> elem_size;
|
|
+
|
|
+ let Word0{12-0} = arraybase;
|
|
+ let Word0{14-13} = type;
|
|
+ let Word0{21-15} = gpr;
|
|
+ let Word0{22} = 0; // RW_REL
|
|
+ let Word0{29-23} = 0; // INDEX_GPR
|
|
+ let Word0{31-30} = elem_size;
|
|
+}
|
|
+
|
|
+class ExportSwzWord1 {
|
|
+ field bits<32> Word1;
|
|
+
|
|
+ bits<3> sw_x;
|
|
+ bits<3> sw_y;
|
|
+ bits<3> sw_z;
|
|
+ bits<3> sw_w;
|
|
+ bits<1> eop;
|
|
+ bits<8> inst;
|
|
+
|
|
+ let Word1{2-0} = sw_x;
|
|
+ let Word1{5-3} = sw_y;
|
|
+ let Word1{8-6} = sw_z;
|
|
+ let Word1{11-9} = sw_w;
|
|
+}
|
|
+
|
|
+class ExportBufWord1 {
|
|
+ field bits<32> Word1;
|
|
+
|
|
+ bits<12> arraySize;
|
|
+ bits<4> compMask;
|
|
+ bits<1> eop;
|
|
+ bits<8> inst;
|
|
+
|
|
+ let Word1{11-0} = arraySize;
|
|
+ let Word1{15-12} = compMask;
|
|
+}
|
|
+
|
|
+multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
|
|
+ def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
|
|
+ (ExportInst
|
|
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
|
|
+ 0, 61, 0, 7, 7, 7, cf_inst, 0)
|
|
+ >;
|
|
+
|
|
+ def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
|
|
+ (ExportInst
|
|
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
|
|
+ 0, 61, 7, 0, 7, 7, cf_inst, 0)
|
|
+ >;
|
|
+
|
|
+ def : Pat<(int_R600_store_dummy (i32 imm:$type)),
|
|
+ (ExportInst
|
|
+ (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0)
|
|
+ >;
|
|
+
|
|
+ def : Pat<(int_R600_store_dummy 1),
|
|
+ (ExportInst
|
|
+ (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0)
|
|
+ >;
|
|
+
|
|
+ def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 0),
|
|
+ (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
|
|
+ (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
|
|
+ 0, 1, 2, 3, cf_inst, 0)
|
|
+ >;
|
|
+ def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 1),
|
|
+ (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
|
|
+ (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
|
|
+ 0, 1, 2, 3, cf_inst, 0)
|
|
+ >;
|
|
+
|
|
+ def : Pat<(int_R600_store_swizzle (v4f32 R600_Reg128:$src), imm:$arraybase,
|
|
+ imm:$type),
|
|
+ (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
|
|
+ 0, 1, 2, 3, cf_inst, 0)
|
|
+ >;
|
|
+}
|
|
+
|
|
+multiclass SteamOutputExportPattern<Instruction ExportInst,
|
|
+ bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
|
|
+// Stream0
|
|
+ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
|
|
+ (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
|
|
+ (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
|
|
+ 4095, imm:$mask, buf0inst, 0)>;
|
|
+// Stream1
|
|
+ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
|
|
+ (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
|
|
+ (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
|
|
+ 4095, imm:$mask, buf1inst, 0)>;
|
|
+// Stream2
|
|
+ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
|
|
+ (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
|
|
+ (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
|
|
+ 4095, imm:$mask, buf2inst, 0)>;
|
|
+// Stream3
|
|
+ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
|
|
+ (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
|
|
+ (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
|
|
+ 4095, imm:$mask, buf3inst, 0)>;
|
|
+}
|
|
+
|
|
+let isTerminator = 1, usesCustomInserter = 1 in {
|
|
+
|
|
+class ExportSwzInst : InstR600ISA<(
|
|
+ outs),
|
|
+ (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
|
|
+ i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst,
|
|
+ i32imm:$eop),
|
|
+ !strconcat("EXPORT", " $gpr"),
|
|
+ []>, ExportWord0, ExportSwzWord1 {
|
|
+ let elem_size = 3;
|
|
+ let Inst{31-0} = Word0;
|
|
+ let Inst{63-32} = Word1;
|
|
+}
|
|
+
|
|
+} // End isTerminator = 1, usesCustomInserter = 1
|
|
+
|
|
+class ExportBufInst : InstR600ISA<(
|
|
+ outs),
|
|
+ (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
|
|
+ i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
|
|
+ !strconcat("EXPORT", " $gpr"),
|
|
+ []>, ExportWord0, ExportBufWord1 {
|
|
+ let elem_size = 0;
|
|
+ let Inst{31-0} = Word0;
|
|
+ let Inst{63-32} = Word1;
|
|
+}
|
|
+
|
|
+let Predicates = [isR600toCayman] in {
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Common Instructions R600, R700, Evergreen, Cayman
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
|
|
+// Non-IEEE MUL: 0 * anything = 0
|
|
+def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
|
|
+def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
|
|
+def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>;
|
|
+def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
|
|
+
|
|
+// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
|
|
+// so some of the instruction names don't match the asm string.
|
|
+// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
|
|
+def SETE : R600_2OP <
|
|
+ 0x08, "SETE",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
|
|
+ COND_EQ))]
|
|
+>;
|
|
+
|
|
+def SGT : R600_2OP <
|
|
+ 0x09, "SETGT",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
|
|
+ COND_GT))]
|
|
+>;
|
|
+
|
|
+def SGE : R600_2OP <
|
|
+ 0xA, "SETGE",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
|
|
+ COND_GE))]
|
|
+>;
|
|
+
|
|
+def SNE : R600_2OP <
|
|
+ 0xB, "SETNE",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
|
|
+ COND_NE))]
|
|
+>;
|
|
+
|
|
+def SETE_DX10 : R600_2OP <
|
|
+ 0xC, "SETE_DX10",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
|
|
+ COND_EQ))]
|
|
+>;
|
|
+
|
|
+def SETGT_DX10 : R600_2OP <
|
|
+ 0xD, "SETGT_DX10",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
|
|
+ COND_GT))]
|
|
+>;
|
|
+
|
|
+def SETGE_DX10 : R600_2OP <
|
|
+ 0xE, "SETGE_DX10",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
|
|
+ COND_GE))]
|
|
+>;
|
|
+
|
|
+def SETNE_DX10 : R600_2OP <
|
|
+ 0xF, "SETNE_DX10",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
|
|
+ COND_NE))]
|
|
+>;
|
|
+
|
|
+def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
|
|
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>;
|
|
+def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
|
|
+def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
|
|
+def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
|
|
+
|
|
+def MOV : R600_1OP <0x19, "MOV", []>;
|
|
+
|
|
+let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
|
|
+
|
|
+class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
|
|
+ (outs R600_Reg32:$dst),
|
|
+ (ins immType:$imm),
|
|
+ "",
|
|
+ []
|
|
+>;
|
|
+
|
|
+} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
|
|
+
|
|
+def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
|
|
+def : Pat <
|
|
+ (imm:$val),
|
|
+ (MOV_IMM_I32 imm:$val)
|
|
+>;
|
|
+
|
|
+def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
|
|
+def : Pat <
|
|
+ (fpimm:$val),
|
|
+ (MOV_IMM_F32 fpimm:$val)
|
|
+>;
|
|
+
|
|
+def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
|
|
+def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
|
|
+def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
|
|
+def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
|
|
+
|
|
+let hasSideEffects = 1 in {
|
|
+
|
|
+def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
|
|
+
|
|
+} // end hasSideEffects
|
|
+
|
|
+def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
|
|
+def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
|
|
+def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
|
|
+def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
|
|
+def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
|
|
+def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
|
|
+def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>;
|
|
+def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>;
|
|
+def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>;
|
|
+def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
|
|
+
|
|
+def SETE_INT : R600_2OP <
|
|
+ 0x3A, "SETE_INT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))]
|
|
+>;
|
|
+
|
|
+def SETGT_INT : R600_2OP <
|
|
+ 0x3B, "SETGT_INT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))]
|
|
+>;
|
|
+
|
|
+def SETGE_INT : R600_2OP <
|
|
+ 0x3C, "SETGE_INT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))]
|
|
+>;
|
|
+
|
|
+def SETNE_INT : R600_2OP <
|
|
+ 0x3D, "SETNE_INT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))]
|
|
+>;
|
|
+
|
|
+def SETGT_UINT : R600_2OP <
|
|
+ 0x3E, "SETGT_UINT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))]
|
|
+>;
|
|
+
|
|
+def SETGE_UINT : R600_2OP <
|
|
+ 0x3F, "SETGE_UINT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))]
|
|
+>;
|
|
+
|
|
+def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
|
|
+def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGT_INT", []>;
|
|
+def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
|
|
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
|
|
+
|
|
+def CNDE_INT : R600_3OP <
|
|
+ 0x1C, "CNDE_INT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), 0,
|
|
+ (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
|
|
+ COND_EQ))]
|
|
+>;
|
|
+
|
|
+def CNDGE_INT : R600_3OP <
|
|
+ 0x1E, "CNDGE_INT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), 0,
|
|
+ (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
|
|
+ COND_GE))]
|
|
+>;
|
|
+
|
|
+def CNDGT_INT : R600_3OP <
|
|
+ 0x1D, "CNDGT_INT",
|
|
+ [(set (i32 R600_Reg32:$dst),
|
|
+ (selectcc (i32 R600_Reg32:$src0), 0,
|
|
+ (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
|
|
+ COND_GT))]
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Texture instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def TEX_LD : R600_TEX <
|
|
+ 0x03, "TEX_LD",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2, imm:$src3, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+> {
|
|
+let AsmString = "TEX_LD $dst, $src0, $src1, $src2, $src3, $resourceId, $samplerId, $textureTarget";
|
|
+let InOperandList = (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2, i32imm:$src3, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget);
|
|
+}
|
|
+
|
|
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <
|
|
+ 0x04, "TEX_GET_TEXTURE_RESINFO",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_GET_GRADIENTS_H : R600_TEX <
|
|
+ 0x07, "TEX_GET_GRADIENTS_H",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_GET_GRADIENTS_V : R600_TEX <
|
|
+ 0x08, "TEX_GET_GRADIENTS_V",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_SET_GRADIENTS_H : R600_TEX <
|
|
+ 0x0B, "TEX_SET_GRADIENTS_H",
|
|
+ []
|
|
+>;
|
|
+
|
|
+def TEX_SET_GRADIENTS_V : R600_TEX <
|
|
+ 0x0C, "TEX_SET_GRADIENTS_V",
|
|
+ []
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE : R600_TEX <
|
|
+ 0x10, "TEX_SAMPLE",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE_C : R600_TEX <
|
|
+ 0x18, "TEX_SAMPLE_C",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE_L : R600_TEX <
|
|
+ 0x11, "TEX_SAMPLE_L",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE_C_L : R600_TEX <
|
|
+ 0x19, "TEX_SAMPLE_C_L",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE_LB : R600_TEX <
|
|
+ 0x12, "TEX_SAMPLE_LB",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0,imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE_C_LB : R600_TEX <
|
|
+ 0x1A, "TEX_SAMPLE_C_LB",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE_G : R600_TEX <
|
|
+ 0x14, "TEX_SAMPLE_G",
|
|
+ []
|
|
+>;
|
|
+
|
|
+def TEX_SAMPLE_C_G : R600_TEX <
|
|
+ 0x1C, "TEX_SAMPLE_C_G",
|
|
+ []
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Helper classes for common instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+class MUL_LIT_Common <bits<5> inst> : R600_3OP <
|
|
+ inst, "MUL_LIT",
|
|
+ []
|
|
+>;
|
|
+
|
|
+class MULADD_Common <bits<5> inst> : R600_3OP <
|
|
+ inst, "MULADD",
|
|
+ [(set (f32 R600_Reg32:$dst),
|
|
+ (IL_mad R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))]
|
|
+>;
|
|
+
|
|
+class CNDE_Common <bits<5> inst> : R600_3OP <
|
|
+ inst, "CNDE",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
|
|
+ (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
|
|
+ COND_EQ))]
|
|
+>;
|
|
+
|
|
+class CNDGT_Common <bits<5> inst> : R600_3OP <
|
|
+ inst, "CNDGT",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
|
|
+ (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
|
|
+ COND_GT))]
|
|
+>;
|
|
+
|
|
+class CNDGE_Common <bits<5> inst> : R600_3OP <
|
|
+ inst, "CNDGE",
|
|
+ [(set R600_Reg32:$dst,
|
|
+ (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
|
|
+ (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
|
|
+ COND_GE))]
|
|
+>;
|
|
+
|
|
+multiclass DOT4_Common <bits<11> inst> {
|
|
+
|
|
+ def _pseudo : R600_REDUCTION <inst,
|
|
+ (ins R600_Reg128:$src0, R600_Reg128:$src1),
|
|
+ "DOT4 $dst $src0, $src1",
|
|
+ [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
|
|
+ >;
|
|
+
|
|
+ def _real : R600_2OP <inst, "DOT4", []>;
|
|
+}
|
|
+
|
|
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
|
|
+multiclass CUBE_Common <bits<11> inst> {
|
|
+
|
|
+ def _pseudo : InstR600 <
|
|
+ inst,
|
|
+ (outs R600_Reg128:$dst),
|
|
+ (ins R600_Reg128:$src),
|
|
+ "CUBE $dst $src",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))],
|
|
+ VecALU
|
|
+ > {
|
|
+ let isPseudo = 1;
|
|
+ }
|
|
+
|
|
+ def _real : R600_2OP <inst, "CUBE", []>;
|
|
+}
|
|
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
|
|
+
|
|
+class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "EXP_IEEE", fexp2
|
|
+>;
|
|
+
|
|
+class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "FLT_TO_INT", fp_to_sint
|
|
+>;
|
|
+
|
|
+class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "INT_TO_FLT", sint_to_fp
|
|
+>;
|
|
+
|
|
+class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "FLT_TO_UINT", fp_to_uint
|
|
+>;
|
|
+
|
|
+class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "UINT_TO_FLT", uint_to_fp
|
|
+>;
|
|
+
|
|
+class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
|
|
+ inst, "LOG_CLAMPED", []
|
|
+>;
|
|
+
|
|
+class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "LOG_IEEE", flog2
|
|
+>;
|
|
+
|
|
+class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
|
|
+class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
|
|
+class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
|
|
+class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
|
|
+ inst, "MULHI_INT", mulhs
|
|
+>;
|
|
+class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
|
|
+ inst, "MULHI", mulhu
|
|
+>;
|
|
+class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
|
|
+ inst, "MULLO_INT", mul
|
|
+>;
|
|
+class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>;
|
|
+
|
|
+class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
|
|
+ inst, "RECIP_CLAMPED", []
|
|
+>;
|
|
+
|
|
+class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
|
|
+ inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))]
|
|
+>;
|
|
+
|
|
+class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "RECIP_UINT", AMDGPUurecip
|
|
+>;
|
|
+
|
|
+class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
|
|
+ inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
|
|
+>;
|
|
+
|
|
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
|
|
+ inst, "RECIPSQRT_IEEE", []
|
|
+>;
|
|
+
|
|
+class SIN_Common <bits<11> inst> : R600_1OP <
|
|
+ inst, "SIN", []>{
|
|
+ let Trig = 1;
|
|
+}
|
|
+
|
|
+class COS_Common <bits<11> inst> : R600_1OP <
|
|
+ inst, "COS", []> {
|
|
+ let Trig = 1;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Helper patterns for complex intrinsics
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+multiclass DIV_Common <InstR600 recip_ieee> {
|
|
+def : Pat<
|
|
+ (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
|
|
+ (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
|
|
+>;
|
|
+
|
|
+def : Pat<
|
|
+ (fdiv R600_Reg32:$src0, R600_Reg32:$src1),
|
|
+ (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
|
|
+>;
|
|
+}
|
|
+
|
|
+class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
|
|
+ (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
|
|
+ (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// R600 / R700 Instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let Predicates = [isR600] in {
|
|
+
|
|
+ def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
|
|
+ def MULADD_r600 : MULADD_Common<0x10>;
|
|
+ def CNDE_r600 : CNDE_Common<0x18>;
|
|
+ def CNDGT_r600 : CNDGT_Common<0x19>;
|
|
+ def CNDGE_r600 : CNDGE_Common<0x1A>;
|
|
+ defm DOT4_r600 : DOT4_Common<0x50>;
|
|
+ defm CUBE_r600 : CUBE_Common<0x52>;
|
|
+ def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
|
|
+ def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
|
|
+ def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
|
|
+ def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
|
|
+ def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
|
|
+ def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
|
|
+ def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
|
|
+ def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
|
|
+ def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
|
|
+ def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
|
|
+ def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
|
|
+ def SIN_r600 : SIN_Common<0x6E>;
|
|
+ def COS_r600 : COS_Common<0x6F>;
|
|
+ def ASHR_r600 : ASHR_Common<0x70>;
|
|
+ def LSHR_r600 : LSHR_Common<0x71>;
|
|
+ def LSHL_r600 : LSHL_Common<0x72>;
|
|
+ def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
|
|
+ def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
|
|
+ def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
|
|
+ def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
|
|
+ def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
|
|
+
|
|
+ defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
|
|
+ def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
|
|
+
|
|
+ def : Pat<(fsqrt R600_Reg32:$src),
|
|
+ (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>;
|
|
+
|
|
+ def R600_ExportSwz : ExportSwzInst {
|
|
+ let Word1{20-17} = 1; // BURST_COUNT
|
|
+ let Word1{21} = eop;
|
|
+ let Word1{22} = 1; // VALID_PIXEL_MODE
|
|
+ let Word1{30-23} = inst;
|
|
+ let Word1{31} = 1; // BARRIER
|
|
+ }
|
|
+ defm : ExportPattern<R600_ExportSwz, 39>;
|
|
+
|
|
+ def R600_ExportBuf : ExportBufInst {
|
|
+ let Word1{20-17} = 1; // BURST_COUNT
|
|
+ let Word1{21} = eop;
|
|
+ let Word1{22} = 1; // VALID_PIXEL_MODE
|
|
+ let Word1{30-23} = inst;
|
|
+ let Word1{31} = 1; // BARRIER
|
|
+ }
|
|
+ defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
|
|
+}
|
|
+
|
|
+// Helper pattern for normalizing inputs to trigonometric instructions for R700+
|
|
+// cards.
|
|
+class COS_PAT <InstR600 trig> : Pat<
|
|
+ (fcos R600_Reg32:$src),
|
|
+ (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
|
|
+>;
|
|
+
|
|
+class SIN_PAT <InstR600 trig> : Pat<
|
|
+ (fsin R600_Reg32:$src),
|
|
+ (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// R700 Only instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let Predicates = [isR700] in {
|
|
+ def SIN_r700 : SIN_Common<0x6E>;
|
|
+ def COS_r700 : COS_Common<0x6F>;
|
|
+
|
|
+ // R700 normalizes inputs to SIN/COS the same as EG
|
|
+ def : SIN_PAT <SIN_r700>;
|
|
+ def : COS_PAT <COS_r700>;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Evergreen Only instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let Predicates = [isEG] in {
|
|
+
|
|
+def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
|
|
+defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
|
|
+
|
|
+def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
|
|
+def MULHI_INT_eg : MULHI_INT_Common<0x90>;
|
|
+def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
|
|
+def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
|
|
+def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
|
|
+def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
|
|
+def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
|
|
+def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
|
|
+def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
|
|
+def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
|
|
+def SIN_eg : SIN_Common<0x8D>;
|
|
+def COS_eg : COS_Common<0x8E>;
|
|
+
|
|
+def : SIN_PAT <SIN_eg>;
|
|
+def : COS_PAT <COS_eg>;
|
|
+def : Pat<(fsqrt R600_Reg32:$src),
|
|
+ (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>;
|
|
+} // End Predicates = [isEG]
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Evergreen / Cayman Instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let Predicates = [isEGorCayman] in {
|
|
+
|
|
+ // BFE_UINT - bit_extract, an optimization for mask and shift
|
|
+ // Src0 = Input
|
|
+ // Src1 = Offset
|
|
+ // Src2 = Width
|
|
+ //
|
|
+ // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
|
|
+ //
|
|
+ // Example Usage:
|
|
+ // (Offset, Width)
|
|
+ //
|
|
+ // (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0
|
|
+ // (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8
|
|
+ // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16
|
|
+ // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24
|
|
+ def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
|
|
+ [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0,
|
|
+ R600_Reg32:$src1,
|
|
+ R600_Reg32:$src2))],
|
|
+ VecALU
|
|
+ >;
|
|
+
|
|
+ def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
|
|
+ [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1,
|
|
+ R600_Reg32:$src2))],
|
|
+ VecALU
|
|
+ >;
|
|
+
|
|
+ def MULADD_eg : MULADD_Common<0x14>;
|
|
+ def ASHR_eg : ASHR_Common<0x15>;
|
|
+ def LSHR_eg : LSHR_Common<0x16>;
|
|
+ def LSHL_eg : LSHL_Common<0x17>;
|
|
+ def CNDE_eg : CNDE_Common<0x19>;
|
|
+ def CNDGT_eg : CNDGT_Common<0x1A>;
|
|
+ def CNDGE_eg : CNDGE_Common<0x1B>;
|
|
+ def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
|
|
+ def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
|
|
+ defm DOT4_eg : DOT4_Common<0xBE>;
|
|
+ defm CUBE_eg : CUBE_Common<0xC0>;
|
|
+
|
|
+let hasSideEffects = 1 in {
|
|
+ def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>;
|
|
+}
|
|
+
|
|
+ def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
|
|
+
|
|
+ def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
|
|
+ let Pattern = [];
|
|
+ }
|
|
+
|
|
+ def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
|
|
+
|
|
+ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
|
|
+ let Pattern = [];
|
|
+ }
|
|
+
|
|
+ def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
|
|
+
|
|
+ // TRUNC is used for the FLT_TO_INT instructions to work around a
|
|
+ // perceived problem where the rounding modes are applied differently
|
|
+ // depending on the instruction and the slot they are in.
|
|
+ // See:
|
|
+ // https://bugs.freedesktop.org/show_bug.cgi?id=50232
|
|
+ // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
|
|
+ //
|
|
+ // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
|
|
+ // which do not need to be truncated since the fp values are 0.0f or 1.0f.
|
|
+ // We should look into handling these cases separately.
|
|
+ def : Pat<(fp_to_sint R600_Reg32:$src0),
|
|
+ (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>;
|
|
+
|
|
+ def : Pat<(fp_to_uint R600_Reg32:$src0),
|
|
+ (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>;
|
|
+
|
|
+ def EG_ExportSwz : ExportSwzInst {
|
|
+ let Word1{19-16} = 1; // BURST_COUNT
|
|
+ let Word1{20} = 1; // VALID_PIXEL_MODE
|
|
+ let Word1{21} = eop;
|
|
+ let Word1{29-22} = inst;
|
|
+ let Word1{30} = 0; // MARK
|
|
+ let Word1{31} = 1; // BARRIER
|
|
+ }
|
|
+ defm : ExportPattern<EG_ExportSwz, 83>;
|
|
+
|
|
+ def EG_ExportBuf : ExportBufInst {
|
|
+ let Word1{19-16} = 1; // BURST_COUNT
|
|
+ let Word1{20} = 1; // VALID_PIXEL_MODE
|
|
+ let Word1{21} = eop;
|
|
+ let Word1{29-22} = inst;
|
|
+ let Word1{30} = 0; // MARK
|
|
+ let Word1{31} = 1; // BARRIER
|
|
+ }
|
|
+ defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Memory read/write instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+let usesCustomInserter = 1 in {
|
|
+
|
|
+class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
|
|
+ list<dag> pattern>
|
|
+ : EG_CF_RAT <0x57, 0x2, 0, (outs), ins,
|
|
+ !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> {
|
|
+ let RIM = 0;
|
|
+ // XXX: Have a separate instruction for non-indexed writes.
|
|
+ let TYPE = 1;
|
|
+ let RW_REL = 0;
|
|
+ let ELEM_SIZE = 0;
|
|
+
|
|
+ let ARRAY_SIZE = 0;
|
|
+ let COMP_MASK = comp_mask;
|
|
+ let BURST_COUNT = 0;
|
|
+ let VPM = 0;
|
|
+ let MARK = 0;
|
|
+ let BARRIER = 1;
|
|
+}
|
|
+
|
|
+} // End usesCustomInserter = 1
|
|
+
|
|
+// 32-bit store
|
|
+def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
|
|
+ (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
|
|
+ 0x1, "RAT_WRITE_CACHELESS_32_eg",
|
|
+ [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
|
|
+>;
|
|
+
|
|
+// 128-bit store
|
|
+def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
|
|
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
|
|
+ 0xf, "RAT_WRITE_CACHELESS_128",
|
|
+ [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)]
|
|
+>;
|
|
+
|
|
+class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
|
|
+ : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>,
|
|
+ VTX_WORD1_GPR, VTX_WORD0 {
|
|
+
|
|
+ // Static fields
|
|
+ let VC_INST = 0;
|
|
+ let FETCH_TYPE = 2;
|
|
+ let FETCH_WHOLE_QUAD = 0;
|
|
+ let BUFFER_ID = buffer_id;
|
|
+ let SRC_REL = 0;
|
|
+ // XXX: We can infer this field based on the SRC_GPR. This would allow us
|
|
+ // to store vertex addresses in any channel, not just X.
|
|
+ let SRC_SEL_X = 0;
|
|
+ let DST_REL = 0;
|
|
+ // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
|
|
+ // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
|
|
+ // however, based on my testing, if USE_CONST_FIELDS is set, then all
|
|
+ // these fields need to be set to 0.
|
|
+ let USE_CONST_FIELDS = 0;
|
|
+ let NUM_FORMAT_ALL = 1;
|
|
+ let FORMAT_COMP_ALL = 0;
|
|
+ let SRF_MODE_ALL = 0;
|
|
+
|
|
+ let Inst{31-0} = Word0;
|
|
+ let Inst{63-32} = Word1;
|
|
+ // LLVM can only encode 64-bit instructions, so these fields are manually
|
|
+ // encoded in R600CodeEmitter
|
|
+ //
|
|
+ // bits<16> OFFSET;
|
|
+ // bits<2> ENDIAN_SWAP = 0;
|
|
+ // bits<1> CONST_BUF_NO_STRIDE = 0;
|
|
+ // bits<1> MEGA_FETCH = 0;
|
|
+ // bits<1> ALT_CONST = 0;
|
|
+ // bits<2> BUFFER_INDEX_MODE = 0;
|
|
+
|
|
+
|
|
+
|
|
+ // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
|
|
+ // is done in R600CodeEmitter
|
|
+ //
|
|
+ // Inst{79-64} = OFFSET;
|
|
+ // Inst{81-80} = ENDIAN_SWAP;
|
|
+ // Inst{82} = CONST_BUF_NO_STRIDE;
|
|
+ // Inst{83} = MEGA_FETCH;
|
|
+ // Inst{84} = ALT_CONST;
|
|
+ // Inst{86-85} = BUFFER_INDEX_MODE;
|
|
+ // Inst{95-86} = 0; Reserved
|
|
+
|
|
+ // VTX_WORD3 (Padding)
|
|
+ //
|
|
+ // Inst{127-96} = 0;
|
|
+}
|
|
+
|
|
+class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
|
|
+ : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst),
|
|
+ pattern> {
|
|
+
|
|
+ let MEGA_FETCH_COUNT = 1;
|
|
+ let DST_SEL_X = 0;
|
|
+ let DST_SEL_Y = 7; // Masked
|
|
+ let DST_SEL_Z = 7; // Masked
|
|
+ let DST_SEL_W = 7; // Masked
|
|
+ let DATA_FORMAT = 1; // FMT_8
|
|
+}
|
|
+
|
|
+class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
|
|
+ : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst),
|
|
+ pattern> {
|
|
+ let MEGA_FETCH_COUNT = 2;
|
|
+ let DST_SEL_X = 0;
|
|
+ let DST_SEL_Y = 7; // Masked
|
|
+ let DST_SEL_Z = 7; // Masked
|
|
+ let DST_SEL_W = 7; // Masked
|
|
+ let DATA_FORMAT = 5; // FMT_16
|
|
+
|
|
+}
|
|
+
|
|
+class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
|
|
+ : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst),
|
|
+ pattern> {
|
|
+
|
|
+ let MEGA_FETCH_COUNT = 4;
|
|
+ let DST_SEL_X = 0;
|
|
+ let DST_SEL_Y = 7; // Masked
|
|
+ let DST_SEL_Z = 7; // Masked
|
|
+ let DST_SEL_W = 7; // Masked
|
|
+ let DATA_FORMAT = 0xD; // COLOR_32
|
|
+
|
|
+ // This is not really necessary, but there were some GPU hangs that appeared
|
|
+ // to be caused by ALU instructions in the next instruction group that wrote
|
|
+ // to the $ptr registers of the VTX_READ.
|
|
+ // e.g.
|
|
+ // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
|
|
+ // %T2_X<def> = MOV %ZERO
|
|
+ // Adding this constraint prevents this from happening.
|
|
+ let Constraints = "$ptr.ptr = $dst";
|
|
+}
|
|
+
|
|
+class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
|
|
+ : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
|
|
+ pattern> {
|
|
+
|
|
+ let MEGA_FETCH_COUNT = 16;
|
|
+ let DST_SEL_X = 0;
|
|
+ let DST_SEL_Y = 1;
|
|
+ let DST_SEL_Z = 2;
|
|
+ let DST_SEL_W = 3;
|
|
+ let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
|
|
+
|
|
+ // XXX: Need to force VTX_READ_128 instructions to write to the same register
|
|
+ // that holds its buffer address to avoid potential hangs. We can't use
|
|
+ // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst
|
|
+ // registers are different sizes.
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// VTX Read from parameter memory space
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
|
|
+ [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
|
|
+ [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
|
|
+ [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
|
|
+ [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// VTX Read from global memory space
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+// 8-bit reads
|
|
+def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
|
|
+ [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+// 32-bit reads
|
|
+def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
|
|
+ [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+// 128-bit reads
|
|
+def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
|
|
+ [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Constant Loads
|
|
+// XXX: We are currently storing all constants in the global address space.
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
|
|
+ [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))]
|
|
+>;
|
|
+
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Register loads and stores - for indirect addressing
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
|
|
+
|
|
+let Predicates = [isCayman] in {
|
|
+
|
|
+let isVector = 1 in {
|
|
+
|
|
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
|
|
+
|
|
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
|
|
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
|
|
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
|
|
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
|
|
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
|
|
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
|
|
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
|
|
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
|
|
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
|
|
+def SIN_cm : SIN_Common<0x8D>;
|
|
+def COS_cm : COS_Common<0x8E>;
|
|
+} // End isVector = 1
|
|
+
|
|
+def : SIN_PAT <SIN_cm>;
|
|
+def : COS_PAT <COS_cm>;
|
|
+
|
|
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
|
|
+
|
|
+// RECIP_UINT emulation for Cayman
|
|
+def : Pat <
|
|
+ (AMDGPUurecip R600_Reg32:$src0),
|
|
+ (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)),
|
|
+ (MOV_IMM_I32 0x4f800000)))
|
|
+>;
|
|
+
|
|
+
|
|
+def : Pat<(fsqrt R600_Reg32:$src),
|
|
+ (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>;
|
|
+
|
|
+} // End isCayman
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Branch Instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src),
|
|
+ "IF_PREDICATE_SET $src", []>;
|
|
+
|
|
+def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
|
|
+ "PREDICATED_BREAK $src", []>;
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Pseudo instructions
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let isPseudo = 1 in {
|
|
+
|
|
+def PRED_X : InstR600 <
|
|
+ 0, (outs R600_Predicate_Bit:$dst),
|
|
+ (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
|
|
+ "", [], NullALU> {
|
|
+ let FlagOperandIdx = 3;
|
|
+ let isTerminator = 1;
|
|
+}
|
|
+
|
|
+let isTerminator = 1, isBranch = 1, isBarrier = 1 in {
|
|
+
|
|
+def JUMP : InstR600 <0x10,
|
|
+ (outs),
|
|
+ (ins brtarget:$target, R600_Pred:$p),
|
|
+ "JUMP $target ($p)",
|
|
+ [], AnyALU
|
|
+ >;
|
|
+
|
|
+} // End isTerminator = 1, isBranch = 1, isBarrier = 1
|
|
+
|
|
+let usesCustomInserter = 1 in {
|
|
+
|
|
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
|
|
+
|
|
+def MASK_WRITE : AMDGPUShaderInst <
|
|
+ (outs),
|
|
+ (ins R600_Reg32:$src),
|
|
+ "MASK_WRITE $src",
|
|
+ []
|
|
+>;
|
|
+
|
|
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
|
|
+
|
|
+
|
|
+def TXD: AMDGPUShaderInst <
|
|
+ (outs R600_Reg128:$dst),
|
|
+ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
|
|
+ "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
|
|
+>;
|
|
+
|
|
+def TXD_SHADOW: AMDGPUShaderInst <
|
|
+ (outs R600_Reg128:$dst),
|
|
+ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
|
|
+ "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
|
|
+ [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
|
|
+>;
|
|
+
|
|
+} // End isPseudo = 1
|
|
+} // End usesCustomInserter = 1
|
|
+
|
|
+def CLAMP_R600 : CLAMP <R600_Reg32>;
|
|
+def FABS_R600 : FABS<R600_Reg32>;
|
|
+def FNEG_R600 : FNEG<R600_Reg32>;
|
|
+
|
|
+//===---------------------------------------------------------------------===//
|
|
+// Return instruction
|
|
+//===---------------------------------------------------------------------===//
|
|
+let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
|
|
+ def RETURN : ILFormat<(outs), (ins variable_ops),
|
|
+ "RETURN", [(IL_retflag)]>;
|
|
+}
|
|
+
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Constant Buffer Addressing Support
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
|
|
+def CONST_COPY : Instruction {
|
|
+ let OutOperandList = (outs R600_Reg32:$dst);
|
|
+ let InOperandList = (ins i32imm:$src);
|
|
+ let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
|
|
+ let AsmString = "CONST_COPY";
|
|
+ let neverHasSideEffects = 1;
|
|
+ let isAsCheapAsAMove = 1;
|
|
+ let Itinerary = NullALU;
|
|
+}
|
|
+} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
|
|
+
|
|
+def TEX_VTX_CONSTBUF :
|
|
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr",
|
|
+ [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>,
|
|
+ VTX_WORD1_GPR, VTX_WORD0 {
|
|
+
|
|
+ let VC_INST = 0;
|
|
+ let FETCH_TYPE = 2;
|
|
+ let FETCH_WHOLE_QUAD = 0;
|
|
+ let BUFFER_ID = 0;
|
|
+ let SRC_REL = 0;
|
|
+ let SRC_SEL_X = 0;
|
|
+ let DST_REL = 0;
|
|
+ let USE_CONST_FIELDS = 0;
|
|
+ let NUM_FORMAT_ALL = 2;
|
|
+ let FORMAT_COMP_ALL = 1;
|
|
+ let SRF_MODE_ALL = 1;
|
|
+ let MEGA_FETCH_COUNT = 16;
|
|
+ let DST_SEL_X = 0;
|
|
+ let DST_SEL_Y = 1;
|
|
+ let DST_SEL_Z = 2;
|
|
+ let DST_SEL_W = 3;
|
|
+ let DATA_FORMAT = 35;
|
|
+
|
|
+ let Inst{31-0} = Word0;
|
|
+ let Inst{63-32} = Word1;
|
|
+
|
|
+// LLVM can only encode 64-bit instructions, so these fields are manually
|
|
+// encoded in R600CodeEmitter
|
|
+//
|
|
+// bits<16> OFFSET;
|
|
+// bits<2> ENDIAN_SWAP = 0;
|
|
+// bits<1> CONST_BUF_NO_STRIDE = 0;
|
|
+// bits<1> MEGA_FETCH = 0;
|
|
+// bits<1> ALT_CONST = 0;
|
|
+// bits<2> BUFFER_INDEX_MODE = 0;
|
|
+
|
|
+
|
|
+
|
|
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
|
|
+// is done in R600CodeEmitter)
|
|
+//
|
|
+// Inst{79-64} = OFFSET;
|
|
+// Inst{81-80} = ENDIAN_SWAP;
|
|
+// Inst{82} = CONST_BUF_NO_STRIDE;
|
|
+// Inst{83} = MEGA_FETCH;
|
|
+// Inst{84} = ALT_CONST;
|
|
+// Inst{86-85} = BUFFER_INDEX_MODE;
|
|
+// Inst{95-86} = 0; Reserved
|
|
+
|
|
+// VTX_WORD3 (Padding)
|
|
+//
|
|
+// Inst{127-96} = 0;
|
|
+}
|
|
+
|
|
+def TEX_VTX_TEXBUF:
|
|
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr",
|
|
+ [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
|
|
+VTX_WORD1_GPR, VTX_WORD0 {
|
|
+
|
|
+let VC_INST = 0;
|
|
+let FETCH_TYPE = 2;
|
|
+let FETCH_WHOLE_QUAD = 0;
|
|
+let SRC_REL = 0;
|
|
+let SRC_SEL_X = 0;
|
|
+let DST_REL = 0;
|
|
+let USE_CONST_FIELDS = 1;
|
|
+let NUM_FORMAT_ALL = 0;
|
|
+let FORMAT_COMP_ALL = 0;
|
|
+let SRF_MODE_ALL = 1;
|
|
+let MEGA_FETCH_COUNT = 16;
|
|
+let DST_SEL_X = 0;
|
|
+let DST_SEL_Y = 1;
|
|
+let DST_SEL_Z = 2;
|
|
+let DST_SEL_W = 3;
|
|
+let DATA_FORMAT = 0;
|
|
+
|
|
+let Inst{31-0} = Word0;
|
|
+let Inst{63-32} = Word1;
|
|
+
|
|
+// LLVM can only encode 64-bit instructions, so these fields are manually
|
|
+// encoded in R600CodeEmitter
|
|
+//
|
|
+// bits<16> OFFSET;
|
|
+// bits<2> ENDIAN_SWAP = 0;
|
|
+// bits<1> CONST_BUF_NO_STRIDE = 0;
|
|
+// bits<1> MEGA_FETCH = 0;
|
|
+// bits<1> ALT_CONST = 0;
|
|
+// bits<2> BUFFER_INDEX_MODE = 0;
|
|
+
|
|
+
|
|
+
|
|
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
|
|
+// is done in R600CodeEmitter)
|
|
+//
|
|
+// Inst{79-64} = OFFSET;
|
|
+// Inst{81-80} = ENDIAN_SWAP;
|
|
+// Inst{82} = CONST_BUF_NO_STRIDE;
|
|
+// Inst{83} = MEGA_FETCH;
|
|
+// Inst{84} = ALT_CONST;
|
|
+// Inst{86-85} = BUFFER_INDEX_MODE;
|
|
+// Inst{95-86} = 0; Reserved
|
|
+
|
|
+// VTX_WORD3 (Padding)
|
|
+//
|
|
+// Inst{127-96} = 0;
|
|
+}
|
|
+
|
|
+
|
|
+
|
|
+//===--------------------------------------------------------------------===//
|
|
+// Instructions support
|
|
+//===--------------------------------------------------------------------===//
|
|
+//===---------------------------------------------------------------------===//
|
|
+// Custom Inserter for Branches and returns; this will eventually be a
|
|
+// separate pass
|
|
+//===---------------------------------------------------------------------===//
|
|
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
|
|
+ def BRANCH : ILFormat<(outs), (ins brtarget:$target),
|
|
+ "; Pseudo unconditional branch instruction",
|
|
+ [(br bb:$target)]>;
|
|
+ defm BRANCH_COND : BranchConditional<IL_brcond>;
|
|
+}
|
|
+
|
|
+//===---------------------------------------------------------------------===//
|
|
+// Flow and Program control Instructions
|
|
+//===---------------------------------------------------------------------===//
|
|
+let isTerminator=1 in {
|
|
+ def SWITCH : ILFormat< (outs), (ins GPRI32:$src),
|
|
+ !strconcat("SWITCH", " $src"), []>;
|
|
+ def CASE : ILFormat< (outs), (ins GPRI32:$src),
|
|
+ !strconcat("CASE", " $src"), []>;
|
|
+ def BREAK : ILFormat< (outs), (ins),
|
|
+ "BREAK", []>;
|
|
+ def CONTINUE : ILFormat< (outs), (ins),
|
|
+ "CONTINUE", []>;
|
|
+ def DEFAULT : ILFormat< (outs), (ins),
|
|
+ "DEFAULT", []>;
|
|
+ def ELSE : ILFormat< (outs), (ins),
|
|
+ "ELSE", []>;
|
|
+ def ENDSWITCH : ILFormat< (outs), (ins),
|
|
+ "ENDSWITCH", []>;
|
|
+ def ENDMAIN : ILFormat< (outs), (ins),
|
|
+ "ENDMAIN", []>;
|
|
+ def END : ILFormat< (outs), (ins),
|
|
+ "END", []>;
|
|
+ def ENDFUNC : ILFormat< (outs), (ins),
|
|
+ "ENDFUNC", []>;
|
|
+ def ENDIF : ILFormat< (outs), (ins),
|
|
+ "ENDIF", []>;
|
|
+ def WHILELOOP : ILFormat< (outs), (ins),
|
|
+ "WHILE", []>;
|
|
+ def ENDLOOP : ILFormat< (outs), (ins),
|
|
+ "ENDLOOP", []>;
|
|
+ def FUNC : ILFormat< (outs), (ins),
|
|
+ "FUNC", []>;
|
|
+ def RETDYN : ILFormat< (outs), (ins),
|
|
+ "RET_DYN", []>;
|
|
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
|
|
+ defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">;
|
|
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
|
|
+ defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">;
|
|
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
|
|
+ defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
|
|
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
|
|
+ defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
|
|
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
|
|
+ defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
|
|
+ // This opcode has custom swizzle pattern encoded in Swizzle Encoder
|
|
+ defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
|
|
+ defm IFC : BranchInstr2<"IFC">;
|
|
+ defm BREAKC : BranchInstr2<"BREAKC">;
|
|
+ defm CONTINUEC : BranchInstr2<"CONTINUEC">;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// ISel Patterns
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+//CNDGE_INT extra pattern
|
|
+def : Pat <
|
|
+ (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
|
|
+ (i32 R600_Reg32:$src2), COND_GT),
|
|
+ (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
|
|
+>;
|
|
+
|
|
+// KIL Patterns
|
|
+def KILP : Pat <
|
|
+ (int_AMDGPU_kilp),
|
|
+ (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
|
|
+>;
|
|
+
|
|
+def KIL : Pat <
|
|
+ (int_AMDGPU_kill R600_Reg32:$src0),
|
|
+ (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
|
|
+>;
|
|
+
|
|
+// SGT Reverse args
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT),
|
|
+ (SGT R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// SGE Reverse args
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE),
|
|
+ (SGE R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// SETGT_DX10 reverse args
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT),
|
|
+ (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// SETGE_DX10 reverse args
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE),
|
|
+ (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// SETGT_INT reverse args
|
|
+def : Pat <
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT),
|
|
+ (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// SETGE_INT reverse args
|
|
+def : Pat <
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE),
|
|
+ (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// SETGT_UINT reverse args
|
|
+def : Pat <
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT),
|
|
+ (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// SETGE_UINT reverse args
|
|
+def : Pat <
|
|
+ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE),
|
|
+ (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0)
|
|
+>;
|
|
+
|
|
+// The following patterns are special cases for handling 'true if ordered' and
|
|
+// 'true if unordered' conditionals. The assumption here is that the behavior of
|
|
+// SETE and SNE conforms to the Direct3D 10 rules for floating point values
|
|
+// described here:
|
|
+// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
|
|
+// We assume that SETE returns false when one of the operands is NAN and
|
|
+// SNE returns true when one of the operands is NAN
|
|
+
|
|
+//SETE - 'true if ordered'
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO),
|
|
+ (SETE R600_Reg32:$src0, R600_Reg32:$src1)
|
|
+>;
|
|
+
|
|
+//SETE_DX10 - 'true if ordered'
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO),
|
|
+ (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
|
|
+>;
|
|
+
|
|
+//SNE - 'true if unordered'
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO),
|
|
+ (SNE R600_Reg32:$src0, R600_Reg32:$src1)
|
|
+>;
|
|
+
|
|
+//SETNE_DX10 - 'true if unordered'
|
|
+def : Pat <
|
|
+ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO),
|
|
+ (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
|
|
+>;
|
|
+
|
|
+def : Extract_Element <f32, v4f32, R600_Reg128, 0, sub0>;
|
|
+def : Extract_Element <f32, v4f32, R600_Reg128, 1, sub1>;
|
|
+def : Extract_Element <f32, v4f32, R600_Reg128, 2, sub2>;
|
|
+def : Extract_Element <f32, v4f32, R600_Reg128, 3, sub3>;
|
|
+
|
|
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sub0>;
|
|
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sub1>;
|
|
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sub2>;
|
|
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sub3>;
|
|
+
|
|
+def : Extract_Element <i32, v4i32, R600_Reg128, 0, sub0>;
|
|
+def : Extract_Element <i32, v4i32, R600_Reg128, 1, sub1>;
|
|
+def : Extract_Element <i32, v4i32, R600_Reg128, 2, sub2>;
|
|
+def : Extract_Element <i32, v4i32, R600_Reg128, 3, sub3>;
|
|
+
|
|
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sub0>;
|
|
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>;
|
|
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>;
|
|
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
|
|
+
|
|
+def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
|
|
+def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
|
|
+
|
|
+// bitconvert patterns
|
|
+
|
|
+def : BitConvert <i32, f32, R600_Reg32>;
|
|
+def : BitConvert <f32, i32, R600_Reg32>;
|
|
+def : BitConvert <v4f32, v4i32, R600_Reg128>;
|
|
+def : BitConvert <v4i32, v4f32, R600_Reg128>;
|
|
+
|
|
+// DWORDADDR pattern
|
|
+def : DwordAddrPat <i32, R600_Reg32>;
|
|
+
|
|
+} // End isR600toCayman Predicate
|
|
diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td
|
|
new file mode 100644
|
|
index 0000000..6046f0d
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600Intrinsics.td
|
|
@@ -0,0 +1,57 @@
|
|
+//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// R600 Intrinsic Definitions
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+let TargetPrefix = "R600", isTarget = 1 in {
|
|
+ def int_R600_load_input :
|
|
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_R600_interp_input :
|
|
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_R600_load_texbuf :
|
|
+ Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
|
+ def int_R600_store_swizzle :
|
|
+ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
|
|
+
|
|
+ def int_R600_store_stream_output :
|
|
+ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
|
|
+ def int_R600_store_pixel_color :
|
|
+ Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
|
|
+ def int_R600_store_pixel_depth :
|
|
+ Intrinsic<[], [llvm_float_ty], []>;
|
|
+ def int_R600_store_pixel_stencil :
|
|
+ Intrinsic<[], [llvm_float_ty], []>;
|
|
+ def int_R600_store_dummy :
|
|
+ Intrinsic<[], [llvm_i32_ty], []>;
|
|
+}
|
|
+let TargetPrefix = "r600", isTarget = 1 in {
|
|
+
|
|
+class R600ReadPreloadRegisterIntrinsic<string name>
|
|
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
|
+ GCCBuiltin<name>;
|
|
+
|
|
+multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> {
|
|
+ def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>;
|
|
+ def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>;
|
|
+ def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>;
|
|
+}
|
|
+
|
|
+defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz <
|
|
+ "__builtin_r600_read_global_size">;
|
|
+defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz <
|
|
+ "__builtin_r600_read_local_size">;
|
|
+defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz <
|
|
+ "__builtin_r600_read_ngroups">;
|
|
+defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
|
|
+ "__builtin_r600_read_tgid">;
|
|
+defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
|
|
+ "__builtin_r600_read_tidig">;
|
|
+}
|
|
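+
+// A minimal usage sketch (assuming the usual OpenCL builtin lowering, which is
+// not part of this patch): a kernel's get_group_id(0) reaches the backend as
+//
+//   %gid.x = call i32 @llvm.r600.read.tgid.x()
+//
+// i.e. __builtin_r600_read_tgid_x maps to int_r600_read_tgid_x, which the
+// backend can lower to a read of the corresponding preloaded register.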
diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp
|
|
new file mode 100644
|
|
index 0000000..c8c27a8
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600LowerConstCopy.cpp
|
|
@@ -0,0 +1,222 @@
|
|
+//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr.
|
|
+/// ISel will fold each Const Buffer read into scalar ALU instructions. However,
|
|
+/// it cannot fold them into vector instructions like DOT4 or Cube; ISel emits
|
|
+/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try
|
|
+/// to fold them where possible, or replace them with a MOV otherwise.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
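+//
+// Rough sketch of the rewrite (register numbers and the constant index are
+// purely illustrative): a pseudo
+//
+//   CONST_COPY T0.X, 196
+//
+// is first turned into
+//
+//   MOV T0.X, ALU_CONST   ; with SRC0_SEL = 196
+//
+// and, when a later bundled ALU instruction reads T0.X and the per-bundle
+// constant read port limits allow it, the constant is folded directly into
+// that use; MOVs whose value is no longer needed are then erased.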
+
|
|
+#include "AMDGPU.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "llvm/GlobalValue.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class R600LowerConstCopy : public MachineFunctionPass {
|
|
+private:
|
|
+ static char ID;
|
|
+ const R600InstrInfo *TII;
|
|
+
|
|
+ struct ConstPairs {
|
|
+ unsigned XYPair;
|
|
+ unsigned ZWPair;
|
|
+ };
|
|
+
|
|
+ bool canFoldInBundle(ConstPairs &UsedConst, unsigned ReadConst) const;
|
|
+public:
|
|
+ R600LowerConstCopy(TargetMachine &tm);
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; }
|
|
+};
|
|
+
|
|
+char R600LowerConstCopy::ID = 0;
|
|
+
|
|
+R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) :
|
|
+ MachineFunctionPass(ID),
|
|
+ TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo()))
|
|
+{
|
|
+}
|
|
+
|
|
+bool R600LowerConstCopy::canFoldInBundle(ConstPairs &UsedConst,
|
|
+ unsigned ReadConst) const {
|
|
+ unsigned ReadConstChan = ReadConst & 3;
|
|
+ unsigned ReadConstIndex = ReadConst & (~3);
|
|
+ if (ReadConstChan < 2) {
|
|
+ if (!UsedConst.XYPair) {
|
|
+ UsedConst.XYPair = ReadConstIndex;
|
|
+ }
|
|
+ return UsedConst.XYPair == ReadConstIndex;
|
|
+ } else {
|
|
+ if (!UsedConst.ZWPair) {
|
|
+ UsedConst.ZWPair = ReadConstIndex;
|
|
+ }
|
|
+ return UsedConst.ZWPair == ReadConstIndex;
|
|
+ }
|
|
+}
|
|
+
|
|
+static bool isControlFlow(const MachineInstr &MI) {
|
|
+ return (MI.getOpcode() == AMDGPU::IF_PREDICATE_SET) ||
|
|
+ (MI.getOpcode() == AMDGPU::ENDIF) ||
|
|
+ (MI.getOpcode() == AMDGPU::ELSE) ||
|
|
+ (MI.getOpcode() == AMDGPU::WHILELOOP) ||
|
|
+ (MI.getOpcode() == AMDGPU::BREAK);
|
|
+}
|
|
+
|
|
+bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) {
|
|
+
|
|
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
|
|
+ BB != BB_E; ++BB) {
|
|
+ MachineBasicBlock &MBB = *BB;
|
|
+ DenseMap<unsigned, MachineInstr *> RegToConstIndex;
|
|
+ for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(),
|
|
+ E = MBB.instr_end(); I != E;) {
|
|
+
|
|
+ if (I->getOpcode() == AMDGPU::CONST_COPY) {
|
|
+ MachineInstr &MI = *I;
|
|
+ I = llvm::next(I);
|
|
+ unsigned DstReg = MI.getOperand(0).getReg();
|
|
+ DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
|
|
+ RegToConstIndex.find(DstReg);
|
|
+ if (SrcMI != RegToConstIndex.end()) {
|
|
+ SrcMI->second->eraseFromParent();
|
|
+ RegToConstIndex.erase(SrcMI);
|
|
+ }
|
|
+ MachineInstr *NewMI =
|
|
+ TII->buildDefaultInstruction(MBB, &MI, AMDGPU::MOV,
|
|
+ MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
|
|
+ TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
|
|
+ MI.getOperand(1).getImm());
|
|
+ RegToConstIndex[DstReg] = NewMI;
|
|
+ MI.eraseFromParent();
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ std::vector<unsigned> Defs;
|
|
+ // We consider all Instructions as bundled because the algorithm that handles
|
|
+ // const read port limitations inside an IG is still valid with single
|
|
+ // instructions.
|
|
+ std::vector<MachineInstr *> Bundle;
|
|
+
|
|
+ if (I->isBundle()) {
|
|
+ unsigned BundleSize = I->getBundleSize();
|
|
+ for (unsigned i = 0; i < BundleSize; i++) {
|
|
+ I = llvm::next(I);
|
|
+ Bundle.push_back(I);
|
|
+ }
|
|
+ } else if (TII->isALUInstr(I->getOpcode())){
|
|
+ Bundle.push_back(I);
|
|
+ } else if (isControlFlow(*I)) {
|
|
+ RegToConstIndex.clear();
|
|
+ I = llvm::next(I);
|
|
+ continue;
|
|
+ } else {
|
|
+ MachineInstr &MI = *I;
|
|
+ for (MachineInstr::mop_iterator MOp = MI.operands_begin(),
|
|
+ MOpE = MI.operands_end(); MOp != MOpE; ++MOp) {
|
|
+ MachineOperand &MO = *MOp;
|
|
+ if (!MO.isReg())
|
|
+ continue;
|
|
+ if (MO.isDef()) {
|
|
+ Defs.push_back(MO.getReg());
|
|
+ } else {
|
|
+ // Either a TEX or an Export inst; prevent erasing the def of a used
|
|
+ // operand
|
|
+ RegToConstIndex.erase(MO.getReg());
|
|
+ for (MCSubRegIterator SR(MO.getReg(), &TII->getRegisterInfo());
|
|
+ SR.isValid(); ++SR) {
|
|
+ RegToConstIndex.erase(*SR);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+
|
|
+ R600Operands::Ops OpTable[3][2] = {
|
|
+ {R600Operands::SRC0, R600Operands::SRC0_SEL},
|
|
+ {R600Operands::SRC1, R600Operands::SRC1_SEL},
|
|
+ {R600Operands::SRC2, R600Operands::SRC2_SEL},
|
|
+ };
|
|
+
|
|
+ for(std::vector<MachineInstr *>::iterator It = Bundle.begin(),
|
|
+ ItE = Bundle.end(); It != ItE; ++It) {
|
|
+ MachineInstr *MI = *It;
|
|
+ if (TII->isPredicated(MI)) {
|
|
+ // We don't want to erase previous assignment
|
|
+ RegToConstIndex.erase(MI->getOperand(0).getReg());
|
|
+ } else {
|
|
+ int WriteIDX = TII->getOperandIdx(MI->getOpcode(), R600Operands::WRITE);
|
|
+ if (WriteIDX < 0 || MI->getOperand(WriteIDX).getImm())
|
|
+ Defs.push_back(MI->getOperand(0).getReg());
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ConstPairs CP = {0,0};
|
|
+ for (unsigned SrcOp = 0; SrcOp < 3; SrcOp++) {
|
|
+ for(std::vector<MachineInstr *>::iterator It = Bundle.begin(),
|
|
+ ItE = Bundle.end(); It != ItE; ++It) {
|
|
+ MachineInstr *MI = *It;
|
|
+ int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[SrcOp][0]);
|
|
+ if (SrcIdx < 0)
|
|
+ continue;
|
|
+ MachineOperand &MO = MI->getOperand(SrcIdx);
|
|
+ DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
|
|
+ RegToConstIndex.find(MO.getReg());
|
|
+ if (SrcMI != RegToConstIndex.end()) {
|
|
+ MachineInstr *CstMov = SrcMI->second;
|
|
+ int ConstMovSel =
|
|
+ TII->getOperandIdx(CstMov->getOpcode(), R600Operands::SRC0_SEL);
|
|
+ unsigned ConstIndex = CstMov->getOperand(ConstMovSel).getImm();
|
|
+ if (MI->isInsideBundle() && canFoldInBundle(CP, ConstIndex)) {
|
|
+ TII->setImmOperand(MI, OpTable[SrcOp][1], ConstIndex);
|
|
+ MI->getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST);
|
|
+ } else {
|
|
+ RegToConstIndex.erase(SrcMI);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (std::vector<unsigned>::iterator It = Defs.begin(), ItE = Defs.end();
|
|
+ It != ItE; ++It) {
|
|
+ DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
|
|
+ RegToConstIndex.find(*It);
|
|
+ if (SrcMI != RegToConstIndex.end()) {
|
|
+ SrcMI->second->eraseFromParent();
|
|
+ RegToConstIndex.erase(SrcMI);
|
|
+ }
|
|
+ }
|
|
+ I = llvm::next(I);
|
|
+ }
|
|
+
|
|
+ if (MBB.succ_empty()) {
|
|
+ for (DenseMap<unsigned, MachineInstr *>::iterator
|
|
+ DI = RegToConstIndex.begin(), DE = RegToConstIndex.end();
|
|
+ DI != DE; ++DI) {
|
|
+ DI->second->eraseFromParent();
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+FunctionPass *createR600LowerConstCopy(TargetMachine &tm) {
|
|
+ return new R600LowerConstCopy(tm);
|
|
+}
|
|
+
|
|
+}
|
|
+
|
|
+
|
|
diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..40aec83
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600MachineFunctionInfo.cpp
|
|
@@ -0,0 +1,18 @@
|
|
+//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "R600MachineFunctionInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
|
|
+ : MachineFunctionInfo() {
|
|
+ memset(Outputs, 0, sizeof(Outputs));
|
|
+ }
|
|
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
|
|
new file mode 100644
|
|
index 0000000..41e4894
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
|
|
@@ -0,0 +1,33 @@
|
|
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef R600MACHINEFUNCTIONINFO_H
|
|
+#define R600MACHINEFUNCTIONINFO_H
|
|
+
|
|
+#include "llvm/ADT/BitVector.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/SelectionDAG.h"
|
|
+#include <vector>
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class R600MachineFunctionInfo : public MachineFunctionInfo {
|
|
+
|
|
+public:
|
|
+ R600MachineFunctionInfo(const MachineFunction &MF);
|
|
+ std::vector<unsigned> IndirectRegs;
|
|
+ SDNode *Outputs[16];
|
|
+};
|
|
+
|
|
+} // End llvm namespace
|
|
+
|
|
+#endif //R600MACHINEFUNCTIONINFO_H
|
|
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..bbd7995
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600RegisterInfo.cpp
|
|
@@ -0,0 +1,99 @@
|
|
+//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief R600 implementation of the TargetRegisterInfo class.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "R600RegisterInfo.h"
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+#include "R600Defines.h"
|
|
+#include "R600InstrInfo.h"
|
|
+#include "R600MachineFunctionInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
|
|
+ const TargetInstrInfo &tii)
|
|
+: AMDGPURegisterInfo(tm, tii),
|
|
+ TM(tm),
|
|
+ TII(tii)
|
|
+ { }
|
|
+
|
|
+BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
|
+ BitVector Reserved(getNumRegs());
|
|
+
|
|
+ Reserved.set(AMDGPU::ZERO);
|
|
+ Reserved.set(AMDGPU::HALF);
|
|
+ Reserved.set(AMDGPU::ONE);
|
|
+ Reserved.set(AMDGPU::ONE_INT);
|
|
+ Reserved.set(AMDGPU::NEG_HALF);
|
|
+ Reserved.set(AMDGPU::NEG_ONE);
|
|
+ Reserved.set(AMDGPU::PV_X);
|
|
+ Reserved.set(AMDGPU::ALU_LITERAL_X);
|
|
+ Reserved.set(AMDGPU::ALU_CONST);
|
|
+ Reserved.set(AMDGPU::PREDICATE_BIT);
|
|
+ Reserved.set(AMDGPU::PRED_SEL_OFF);
|
|
+ Reserved.set(AMDGPU::PRED_SEL_ZERO);
|
|
+ Reserved.set(AMDGPU::PRED_SEL_ONE);
|
|
+
|
|
+ for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
|
|
+ E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
|
|
+ Reserved.set(*I);
|
|
+ }
|
|
+
|
|
+ for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(),
|
|
+ E = AMDGPU::TRegMemRegClass.end();
|
|
+ I != E; ++I) {
|
|
+ Reserved.set(*I);
|
|
+ }
|
|
+
|
|
+ const R600InstrInfo *RII = static_cast<const R600InstrInfo*>(&TII);
|
|
+ std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF);
|
|
+ for (std::vector<unsigned>::iterator I = IndirectRegs.begin(),
|
|
+ E = IndirectRegs.end();
|
|
+ I != E; ++I) {
|
|
+ Reserved.set(*I);
|
|
+ }
|
|
+ return Reserved;
|
|
+}
|
|
+
|
|
+const TargetRegisterClass *
|
|
+R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
|
|
+ switch (rc->getID()) {
|
|
+ case AMDGPU::GPRF32RegClassID:
|
|
+ case AMDGPU::GPRI32RegClassID:
|
|
+ return &AMDGPU::R600_Reg32RegClass;
|
|
+ default: return rc;
|
|
+ }
|
|
+}
|
|
+
|
|
+unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
|
|
+ return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
|
|
+}
|
|
+
|
|
+const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
|
|
+ MVT VT) const {
|
|
+ switch(VT.SimpleTy) {
|
|
+ default:
|
|
+ case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
|
|
+ }
|
|
+}
|
|
+
|
|
+unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
|
|
+ switch (Channel) {
|
|
+ default: assert(!"Invalid channel index"); return 0;
|
|
+ case 0: return AMDGPU::sub0;
|
|
+ case 1: return AMDGPU::sub1;
|
|
+ case 2: return AMDGPU::sub2;
|
|
+ case 3: return AMDGPU::sub3;
|
|
+ }
|
|
+}
|
|
+
|
|
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
|
|
new file mode 100644
|
|
index 0000000..c170ccb
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600RegisterInfo.h
|
|
@@ -0,0 +1,55 @@
|
|
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface definition for R600RegisterInfo
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef R600REGISTERINFO_H_
|
|
+#define R600REGISTERINFO_H_
|
|
+
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class R600TargetMachine;
|
|
+class TargetInstrInfo;
|
|
+
|
|
+struct R600RegisterInfo : public AMDGPURegisterInfo {
|
|
+ AMDGPUTargetMachine &TM;
|
|
+ const TargetInstrInfo &TII;
|
|
+
|
|
+ R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
|
|
+
|
|
+ virtual BitVector getReservedRegs(const MachineFunction &MF) const;
|
|
+
|
|
+ /// \param RC is an AMDIL reg class.
|
|
+ ///
|
|
+ /// \returns the R600 reg class that is equivalent to \p RC.
|
|
+ virtual const TargetRegisterClass *getISARegClass(
|
|
+ const TargetRegisterClass *RC) const;
|
|
+
|
|
+ /// \brief get the HW encoding for a register's channel.
|
|
+ unsigned getHWRegChan(unsigned reg) const;
|
|
+
|
|
+ /// \brief get the register class of the specified type to use in the
|
|
+ /// CFGStructurizer
|
|
+ virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
|
|
+
|
|
+ /// \returns the sub reg enum value for the given \p Channel
|
|
+ /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
|
|
+ unsigned getSubRegFromChannel(unsigned Channel) const;
|
|
+
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // AMDIDSAREGISTERINFO_H_
|
|
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
|
|
new file mode 100644
|
|
index 0000000..a7d847a
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600RegisterInfo.td
|
|
@@ -0,0 +1,146 @@
|
|
+
|
|
+class R600Reg <string name, bits<16> encoding> : Register<name> {
|
|
+ let Namespace = "AMDGPU";
|
|
+ let HWEncoding = encoding;
|
|
+}
|
|
+
|
|
+class R600RegWithChan <string name, bits<9> sel, string chan> :
|
|
+ Register <name> {
|
|
+
|
|
+ field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
|
|
+ !if(!eq(chan, "Y"), 1,
|
|
+ !if(!eq(chan, "Z"), 2,
|
|
+ !if(!eq(chan, "W"), 3, 0))));
|
|
+ let HWEncoding{8-0} = sel;
|
|
+ let HWEncoding{10-9} = chan_encoding;
|
|
+ let Namespace = "AMDGPU";
|
|
+}
|
|
+
|
|
+class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
|
|
+ RegisterWithSubRegs<n, subregs> {
|
|
+ let Namespace = "AMDGPU";
|
|
+ let SubRegIndices = [sub0, sub1, sub2, sub3];
|
|
+ let HWEncoding = encoding;
|
|
+}
|
|
+
|
|
+foreach Index = 0-127 in {
|
|
+ foreach Chan = [ "X", "Y", "Z", "W" ] in {
|
|
+ // 32-bit Temporary Registers
|
|
+ def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
|
|
+
|
|
+ // Indirect addressing offset registers
|
|
+ def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
|
|
+ Index, Chan>;
|
|
+ def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index,
|
|
+ Chan>;
|
|
+ }
|
|
+ // 128-bit Temporary Registers
|
|
+ def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW",
|
|
+ [!cast<Register>("T"#Index#"_X"),
|
|
+ !cast<Register>("T"#Index#"_Y"),
|
|
+ !cast<Register>("T"#Index#"_Z"),
|
|
+ !cast<Register>("T"#Index#"_W")],
|
|
+ Index>;
|
|
+}
|
|
+
|
|
+// Array Base Register holding input in FS
|
|
+foreach Index = 448-464 in {
|
|
+ def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>;
|
|
+}
|
|
+
|
|
+
|
|
+// Special Registers
|
|
+
|
|
+def ZERO : R600Reg<"0.0", 248>;
|
|
+def ONE : R600Reg<"1.0", 249>;
|
|
+def NEG_ONE : R600Reg<"-1.0", 249>;
|
|
+def ONE_INT : R600Reg<"1", 250>;
|
|
+def HALF : R600Reg<"0.5", 252>;
|
|
+def NEG_HALF : R600Reg<"-0.5", 252>;
|
|
+def ALU_LITERAL_X : R600Reg<"literal.x", 253>;
|
|
+def PV_X : R600Reg<"pv.x", 254>;
|
|
+def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
|
|
+def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
|
|
+def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
|
|
+def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
|
|
+def AR_X : R600Reg<"AR.x", 0>;
|
|
+
|
|
+def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "ArrayBase%u", 448, 464))>;
|
|
+// special registers for ALU src operands
|
|
+// const buffer reference, SRCx_SEL contains index
|
|
+def ALU_CONST : R600Reg<"CBuf", 0>;
|
|
+// interpolation param reference, SRCx_SEL contains index
|
|
+def ALU_PARAM : R600Reg<"Param", 0>;
|
|
+
|
|
+let isAllocatable = 0 in {
|
|
+
|
|
+// XXX: Only use the X channel, until we support wider stack widths
|
|
+def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
|
|
+
|
|
+} // End isAllocatable = 0
|
|
+
|
|
+def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "T%u_X", 0, 127), AR_X)>;
|
|
+
|
|
+def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "T%u_Y", 0, 127))>;
|
|
+
|
|
+def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "T%u_Z", 0, 127))>;
|
|
+
|
|
+def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "T%u_W", 0, 127))>;
|
|
+
|
|
+def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
|
|
+ (interleave R600_TReg32_X, R600_TReg32_Y,
|
|
+ R600_TReg32_Z, R600_TReg32_W)>;
|
|
+
|
|
+def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
|
|
+ R600_TReg32,
|
|
+ R600_ArrayBase,
|
|
+ R600_Addr,
|
|
+ ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
|
|
+ ALU_CONST, ALU_PARAM
|
|
+ )>;
|
|
+
|
|
+def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
|
|
+ PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
|
|
+
|
|
+def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
|
|
+ PREDICATE_BIT)>;
|
|
+
|
|
+def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
|
|
+ (add (sequence "T%u_XYZW", 0, 127))> {
|
|
+ let CopyCost = -1;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Register classes for indirect addressing
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+// Super register for all the Indirect Registers. This register class is used
|
|
+// by the REG_SEQUENCE instruction to specify the registers to use for direct
|
|
+// reads / writes which may be written / read by an indirect address.
|
|
+class IndirectSuper<string n, list<Register> subregs> :
|
|
+ RegisterWithSubRegs<n, subregs> {
|
|
+ let Namespace = "AMDGPU";
|
|
+ let SubRegIndices =
|
|
+ [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
|
|
+ sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15];
|
|
+}
|
|
+
|
|
+def IndirectSuperReg : IndirectSuper<"Indirect",
|
|
+ [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X,
|
|
+ TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X,
|
|
+ TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X]
|
|
+>;
|
|
+
|
|
+def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>;
|
|
+
|
|
+// This register class defines the registers that are the storage units for
|
|
+// the "Indirect Addressing" pseudo memory space.
|
|
+// XXX: Only use the X channel, until we support wider stack widths
|
|
+def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "TRegMem%u_X", 0, 16))
|
|
+>;
|
|
diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td
|
|
new file mode 100644
|
|
index 0000000..7ede181
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/R600Schedule.td
|
|
@@ -0,0 +1,36 @@
|
|
+//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// R600 has a VLIW architecture. On pre-Cayman cards there are 5 instruction
|
|
+// slots: ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For Cayman cards, the TRANS
|
|
+// slot has been removed.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+def ALU_X : FuncUnit;
|
|
+def ALU_Y : FuncUnit;
|
|
+def ALU_Z : FuncUnit;
|
|
+def ALU_W : FuncUnit;
|
|
+def TRANS : FuncUnit;
|
|
+
|
|
+def AnyALU : InstrItinClass;
|
|
+def VecALU : InstrItinClass;
|
|
+def TransALU : InstrItinClass;
|
|
+
|
|
+def R600_EG_Itin : ProcessorItineraries <
|
|
+ [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
|
|
+ [],
|
|
+ [
|
|
+ InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
|
|
+ InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>,
|
|
+ InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
|
|
+ InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
|
|
+ ]
|
|
+>;
|
|
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
|
|
new file mode 100644
|
|
index 0000000..92385b6
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
|
|
@@ -0,0 +1,330 @@
|
|
+//===-- SIAnnotateControlFlow.cpp - Annotate control flow ----------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// Annotates the control flow with hardware specific intrinsics.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+
|
|
+#include "llvm/Pass.h"
|
|
+#include "llvm/Module.h"
|
|
+#include "llvm/Analysis/Dominators.h"
|
|
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
+#include "llvm/ADT/DepthFirstIterator.h"
|
|
+#include "llvm/Transforms/Utils/SSAUpdater.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+// Complex types used in this pass
|
|
+typedef std::pair<BasicBlock *, Value *> StackEntry;
|
|
+typedef SmallVector<StackEntry, 16> StackVector;
|
|
+
|
|
+// Intrinsic names the control flow is annotated with
|
|
+static const char *IfIntrinsic = "llvm.SI.if";
|
|
+static const char *ElseIntrinsic = "llvm.SI.else";
|
|
+static const char *BreakIntrinsic = "llvm.SI.break";
|
|
+static const char *IfBreakIntrinsic = "llvm.SI.if.break";
|
|
+static const char *ElseBreakIntrinsic = "llvm.SI.else.break";
|
|
+static const char *LoopIntrinsic = "llvm.SI.loop";
|
|
+static const char *EndCfIntrinsic = "llvm.SI.end.cf";
|
|
+
|
|
+class SIAnnotateControlFlow : public FunctionPass {
|
|
+
|
|
+ static char ID;
|
|
+
|
|
+ Type *Boolean;
|
|
+ Type *Void;
|
|
+ Type *Int64;
|
|
+ Type *ReturnStruct;
|
|
+
|
|
+ ConstantInt *BoolTrue;
|
|
+ ConstantInt *BoolFalse;
|
|
+ UndefValue *BoolUndef;
|
|
+ Constant *Int64Zero;
|
|
+
|
|
+ Constant *If;
|
|
+ Constant *Else;
|
|
+ Constant *Break;
|
|
+ Constant *IfBreak;
|
|
+ Constant *ElseBreak;
|
|
+ Constant *Loop;
|
|
+ Constant *EndCf;
|
|
+
|
|
+ DominatorTree *DT;
|
|
+ StackVector Stack;
|
|
+ SSAUpdater PhiInserter;
|
|
+
|
|
+ bool isTopOfStack(BasicBlock *BB);
|
|
+
|
|
+ Value *popSaved();
|
|
+
|
|
+ void push(BasicBlock *BB, Value *Saved);
|
|
+
|
|
+ bool isElse(PHINode *Phi);
|
|
+
|
|
+ void eraseIfUnused(PHINode *Phi);
|
|
+
|
|
+ void openIf(BranchInst *Term);
|
|
+
|
|
+ void insertElse(BranchInst *Term);
|
|
+
|
|
+ void handleLoopCondition(Value *Cond);
|
|
+
|
|
+ void handleLoop(BranchInst *Term);
|
|
+
|
|
+ void closeControlFlow(BasicBlock *BB);
|
|
+
|
|
+public:
|
|
+ SIAnnotateControlFlow():
|
|
+ FunctionPass(ID) { }
|
|
+
|
|
+ virtual bool doInitialization(Module &M);
|
|
+
|
|
+ virtual bool runOnFunction(Function &F);
|
|
+
|
|
+ virtual const char *getPassName() const {
|
|
+ return "SI annotate control flow";
|
|
+ }
|
|
+
|
|
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
|
+ AU.addRequired<DominatorTree>();
|
|
+ AU.addPreserved<DominatorTree>();
|
|
+ FunctionPass::getAnalysisUsage(AU);
|
|
+ }
|
|
+
|
|
+};
|
|
+
|
|
+} // end anonymous namespace
|
|
+
|
|
+char SIAnnotateControlFlow::ID = 0;
|
|
+
|
|
+/// \brief Initialize all the types and constants used in the pass
|
|
+bool SIAnnotateControlFlow::doInitialization(Module &M) {
|
|
+ LLVMContext &Context = M.getContext();
|
|
+
|
|
+ Void = Type::getVoidTy(Context);
|
|
+ Boolean = Type::getInt1Ty(Context);
|
|
+ Int64 = Type::getInt64Ty(Context);
|
|
+ ReturnStruct = StructType::get(Boolean, Int64, (Type *)0);
|
|
+
|
|
+ BoolTrue = ConstantInt::getTrue(Context);
|
|
+ BoolFalse = ConstantInt::getFalse(Context);
|
|
+ BoolUndef = UndefValue::get(Boolean);
|
|
+ Int64Zero = ConstantInt::get(Int64, 0);
|
|
+
|
|
+ If = M.getOrInsertFunction(
|
|
+ IfIntrinsic, ReturnStruct, Boolean, (Type *)0);
|
|
+
|
|
+ Else = M.getOrInsertFunction(
|
|
+ ElseIntrinsic, ReturnStruct, Int64, (Type *)0);
|
|
+
|
|
+ Break = M.getOrInsertFunction(
|
|
+ BreakIntrinsic, Int64, Int64, (Type *)0);
|
|
+
|
|
+ IfBreak = M.getOrInsertFunction(
|
|
+ IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0);
|
|
+
|
|
+ ElseBreak = M.getOrInsertFunction(
|
|
+ ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0);
|
|
+
|
|
+ Loop = M.getOrInsertFunction(
|
|
+ LoopIntrinsic, Boolean, Int64, (Type *)0);
|
|
+
|
|
+ EndCf = M.getOrInsertFunction(
|
|
+ EndCfIntrinsic, Void, Int64, (Type *)0);
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/// \brief Is BB the last block saved on the stack?
|
|
+bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
|
|
+ return Stack.back().first == BB;
|
|
+}
|
|
+
|
|
+/// \brief Pop the last saved value from the control flow stack
|
|
+Value *SIAnnotateControlFlow::popSaved() {
|
|
+ return Stack.pop_back_val().second;
|
|
+}
|
|
+
|
|
+/// \brief Push a BB and saved value to the control flow stack
|
|
+void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
|
|
+ Stack.push_back(std::make_pair(BB, Saved));
|
|
+}
|
|
+
|
|
+/// \brief Can the condition represented by this PHI node be treated like
|
|
+/// an "Else" block?
|
|
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
|
|
+ BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
|
|
+ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
|
|
+ if (Phi->getIncomingBlock(i) == IDom) {
|
|
+
|
|
+ if (Phi->getIncomingValue(i) != BoolTrue)
|
|
+ return false;
|
|
+
|
|
+ } else {
|
|
+ if (Phi->getIncomingValue(i) != BoolFalse)
|
|
+ return false;
|
|
+
|
|
+ }
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+// \brief Erase "Phi" if it is not used any more
|
|
+void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
|
|
+ if (!Phi->hasNUsesOrMore(1))
|
|
+ Phi->eraseFromParent();
|
|
+}
|
|
+
|
|
+/// \brief Open a new "If" block
|
|
+void SIAnnotateControlFlow::openIf(BranchInst *Term) {
|
|
+ Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
|
|
+ Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
|
|
+ push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
|
|
+}
|
|
+
|
|
+/// \brief Close the last "If" block and open a new "Else" block
|
|
+void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
|
|
+ Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
|
|
+ Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
|
|
+ push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
|
|
+}
|
|
+
|
|
+/// \brief Recursively handle the condition leading to a loop
|
|
+void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
|
|
+ if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
|
|
+
|
|
+ // Handle all non constant incoming values first
|
|
+ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
|
|
+ Value *Incoming = Phi->getIncomingValue(i);
|
|
+ if (isa<ConstantInt>(Incoming))
|
|
+ continue;
|
|
+
|
|
+ Phi->setIncomingValue(i, BoolFalse);
|
|
+ handleLoopCondition(Incoming);
|
|
+ }
|
|
+
|
|
+ BasicBlock *Parent = Phi->getParent();
|
|
+ BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
|
|
+
|
|
+ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
|
|
+
|
|
+ Value *Incoming = Phi->getIncomingValue(i);
|
|
+ if (Incoming != BoolTrue)
|
|
+ continue;
|
|
+
|
|
+ BasicBlock *From = Phi->getIncomingBlock(i);
|
|
+ if (From == IDom) {
|
|
+ CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
|
|
+ if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
|
|
+ Value *Args[] = {
|
|
+ OldEnd->getArgOperand(0),
|
|
+ PhiInserter.GetValueAtEndOfBlock(Parent)
|
|
+ };
|
|
+ Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
|
|
+ PhiInserter.AddAvailableValue(Parent, Ret);
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ TerminatorInst *Insert = From->getTerminator();
|
|
+ Value *Arg = PhiInserter.GetValueAtEndOfBlock(From);
|
|
+ Value *Ret = CallInst::Create(Break, Arg, "", Insert);
|
|
+ PhiInserter.AddAvailableValue(From, Ret);
|
|
+ }
|
|
+ eraseIfUnused(Phi);
|
|
+
|
|
+ } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
|
|
+ BasicBlock *Parent = Inst->getParent();
|
|
+ TerminatorInst *Insert = Parent->getTerminator();
|
|
+ Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) };
|
|
+ Value *Ret = CallInst::Create(IfBreak, Args, "", Insert);
|
|
+ PhiInserter.AddAvailableValue(Parent, Ret);
|
|
+
|
|
+ } else {
|
|
+ assert(0 && "Unhandled loop condition!");
|
|
+ }
|
|
+}
|
|
+
|
|
+/// \brief Handle a back edge (loop)
|
|
+void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
|
|
+ BasicBlock *Target = Term->getSuccessor(1);
|
|
+ PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
|
|
+
|
|
+ PhiInserter.Initialize(Int64, "");
|
|
+ PhiInserter.AddAvailableValue(Target, Broken);
|
|
+
|
|
+ Value *Cond = Term->getCondition();
|
|
+ Term->setCondition(BoolTrue);
|
|
+ handleLoopCondition(Cond);
|
|
+
|
|
+ BasicBlock *BB = Term->getParent();
|
|
+ Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB);
|
|
+ for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
|
|
+ PI != PE; ++PI) {
|
|
+
|
|
+ Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
|
|
+ }
|
|
+
|
|
+ Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
|
|
+ push(Term->getSuccessor(0), Arg);
|
|
+}
|
|
+
|
|
+/// \brief Close the last opened control flow
|
|
+void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
|
|
+ CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
|
|
+}
|
|
+
|
|
+/// \brief Annotate the control flow with intrinsics so the backend can
|
|
+/// recognize if/then/else and loops.
|
|
+bool SIAnnotateControlFlow::runOnFunction(Function &F) {
|
|
+ DT = &getAnalysis<DominatorTree>();
|
|
+
|
|
+ for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
|
|
+ E = df_end(&F.getEntryBlock()); I != E; ++I) {
|
|
+
|
|
+ BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
|
|
+
|
|
+ if (!Term || Term->isUnconditional()) {
|
|
+ if (isTopOfStack(*I))
|
|
+ closeControlFlow(*I);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (I.nodeVisited(Term->getSuccessor(1))) {
|
|
+ if (isTopOfStack(*I))
|
|
+ closeControlFlow(*I);
|
|
+ handleLoop(Term);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (isTopOfStack(*I)) {
|
|
+ PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
|
|
+ if (Phi && Phi->getParent() == *I && isElse(Phi)) {
|
|
+ insertElse(Term);
|
|
+ eraseIfUnused(Phi);
|
|
+ continue;
|
|
+ }
|
|
+ closeControlFlow(*I);
|
|
+ }
|
|
+ openIf(Term);
|
|
+ }
|
|
+
|
|
+ assert(Stack.empty());
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/// \brief Create the annotation pass
|
|
+FunctionPass *llvm::createSIAnnotateControlFlowPass() {
|
|
+ return new SIAnnotateControlFlow();
|
|
+}
|
|
diff --git a/lib/Target/R600/SIAssignInterpRegs.cpp b/lib/Target/R600/SIAssignInterpRegs.cpp
|
|
new file mode 100644
|
|
index 0000000..832e44d
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIAssignInterpRegs.cpp
|
|
@@ -0,0 +1,152 @@
|
|
+//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief This pass maps the pseudo interpolation registers to the correct physical
|
|
+/// registers.
|
|
+//
|
|
+/// Prior to executing a fragment shader, the GPU loads interpolation
|
|
+/// parameters into physical registers. The specific physical register that each
|
|
+/// interpolation parameter ends up in depends on the type of the interpolation
|
|
+/// parameter as well as how many interpolation parameters are used by the
|
|
+/// shader.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "AMDIL.h"
|
|
+#include "SIMachineFunctionInfo.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+class SIAssignInterpRegsPass : public MachineFunctionPass {
|
|
+
|
|
+private:
|
|
+ static char ID;
|
|
+ TargetMachine &TM;
|
|
+
|
|
+ void addLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
|
|
+ unsigned physReg, unsigned virtReg);
|
|
+
|
|
+public:
|
|
+ SIAssignInterpRegsPass(TargetMachine &tm) :
|
|
+ MachineFunctionPass(ID), TM(tm) { }
|
|
+
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ const char *getPassName() const { return "SI Assign intrpolation registers"; }
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+char SIAssignInterpRegsPass::ID = 0;
|
|
+
|
|
+#define INTERP_VALUES 16
|
|
+#define REQUIRED_VALUE_MAX_INDEX 7
|
|
+
|
|
+struct InterpInfo {
|
|
+ bool Enabled;
|
|
+ unsigned Regs[3];
|
|
+ unsigned RegCount;
|
|
+};
|
|
+
|
|
+
|
|
+FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
|
|
+ return new SIAssignInterpRegsPass(tm);
|
|
+}
|
|
+
|
|
+bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) {
|
|
+
|
|
+ struct InterpInfo InterpUse[INTERP_VALUES] = {
|
|
+ {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2},
|
|
+ {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2},
|
|
+ {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2},
|
|
+ {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3},
|
|
+ {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2},
|
|
+ {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2},
|
|
+ {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2},
|
|
+ {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1},
|
|
+ {false, {AMDGPU::POS_X_FLOAT}, 1},
|
|
+ {false, {AMDGPU::POS_Y_FLOAT}, 1},
|
|
+ {false, {AMDGPU::POS_Z_FLOAT}, 1},
|
|
+ {false, {AMDGPU::POS_W_FLOAT}, 1},
|
|
+ {false, {AMDGPU::FRONT_FACE}, 1},
|
|
+ {false, {AMDGPU::ANCILLARY}, 1},
|
|
+ {false, {AMDGPU::SAMPLE_COVERAGE}, 1},
|
|
+ {false, {AMDGPU::POS_FIXED_PT}, 1}
|
|
+ };
|
|
+
|
|
+ SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
+ // This pass is only needed for pixel shaders.
|
|
+ if (MFI->ShaderType != ShaderType::PIXEL) {
|
|
+ return false;
|
|
+ }
|
|
+ MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
+ bool ForceEnable = true;
|
|
+
|
|
+ // First pass, mark the interpolation values that are used.
|
|
+ for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
|
|
+ for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
|
|
+ RegIdx++) {
|
|
+ InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled ||
|
|
+ !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]);
|
|
+ if (InterpUse[InterpIdx].Enabled &&
|
|
+ InterpIdx <= REQUIRED_VALUE_MAX_INDEX) {
|
|
+ ForceEnable = false;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // At least one interpolation mode must be enabled or else the GPU will hang.
|
|
+ if (ForceEnable) {
|
|
+ InterpUse[0].Enabled = true;
|
|
+ }
|
|
+
|
|
+ unsigned UsedVgprs = 0;
|
|
+
|
|
+ // Second pass, replace with VGPRs.
|
|
+ for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
|
|
+ if (!InterpUse[InterpIdx].Enabled) {
|
|
+ continue;
|
|
+ }
|
|
+ MFI->SPIPSInputAddr |= (1 << InterpIdx);
|
|
+
|
|
+ for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
|
|
+ RegIdx++, UsedVgprs++) {
|
|
+ unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs);
|
|
+ unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
|
|
+ MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg);
|
|
+ addLiveIn(&MF, MRI, NewReg, VirtReg);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF,
|
|
+ MachineRegisterInfo & MRI,
|
|
+ unsigned physReg, unsigned virtReg) {
|
|
+ const TargetInstrInfo * TII = TM.getInstrInfo();
|
|
+ if (!MRI.isLiveIn(physReg)) {
|
|
+ MRI.addLiveIn(physReg, virtReg);
|
|
+ MF->front().addLiveIn(physReg);
|
|
+ BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
|
|
+ TII->get(TargetOpcode::COPY), virtReg)
|
|
+ .addReg(physReg);
|
|
+ } else {
|
|
+ MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
|
|
+ }
|
|
+}
|
|
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
|
|
new file mode 100644
|
|
index 0000000..694c045
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIISelLowering.cpp
|
|
@@ -0,0 +1,399 @@
|
|
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Custom DAG lowering for SI
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "SIISelLowering.h"
|
|
+#include "AMDIL.h"
|
|
+#include "AMDILIntrinsicInfo.h"
|
|
+#include "SIInstrInfo.h"
|
|
+#include "SIMachineFunctionInfo.h"
|
|
+#include "SIRegisterInfo.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/CodeGen/SelectionDAG.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+SITargetLowering::SITargetLowering(TargetMachine &TM) :
|
|
+ AMDGPUTargetLowering(TM),
|
|
+ TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) {
|
|
+ addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
|
|
+ addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
|
|
+ addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
|
|
+ addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
|
|
+ addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
|
|
+
|
|
+ addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);
|
|
+ addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
|
|
+ addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
|
|
+ addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
|
|
+ addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
|
|
+
|
|
+ computeRegisterProperties();
|
|
+
|
|
+ setOperationAction(ISD::ADD, MVT::i64, Legal);
|
|
+ setOperationAction(ISD::ADD, MVT::i32, Legal);
|
|
+
|
|
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
|
|
+
|
|
+ // We need to custom lower loads from the USER_SGPR address space, so we can
|
|
+ // add the SGPRs as livein registers.
|
|
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
|
|
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
|
|
+
|
|
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
|
|
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
|
|
+
|
|
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
|
|
+ setTargetDAGCombine(ISD::SELECT_CC);
|
|
+
|
|
+ setTargetDAGCombine(ISD::SETCC);
|
|
+}
|
|
+
|
|
+MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
|
|
+ MachineInstr * MI, MachineBasicBlock * BB) const {
|
|
+ const TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
|
|
+ MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
|
|
+ MachineBasicBlock::iterator I = MI;
|
|
+
|
|
+ switch (MI->getOpcode()) {
|
|
+ default:
|
|
+ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
|
|
+ case AMDGPU::BRANCH: return BB;
|
|
+ case AMDGPU::CLAMP_SI:
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addImm(0x80) // SRC1
|
|
+ .addImm(0x80) // SRC2
|
|
+ .addImm(0) // ABS
|
|
+ .addImm(1) // CLAMP
|
|
+ .addImm(0) // OMOD
|
|
+ .addImm(0); // NEG
|
|
+ MI->eraseFromParent();
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::FABS_SI:
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addImm(0x80) // SRC1
|
|
+ .addImm(0x80) // SRC2
|
|
+ .addImm(1) // ABS
|
|
+ .addImm(0) // CLAMP
|
|
+ .addImm(0) // OMOD
|
|
+ .addImm(0); // NEG
|
|
+ MI->eraseFromParent();
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::FNEG_SI:
|
|
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64))
|
|
+ .addOperand(MI->getOperand(0))
|
|
+ .addOperand(MI->getOperand(1))
|
|
+ .addImm(0x80) // SRC1
|
|
+ .addImm(0x80) // SRC2
|
|
+ .addImm(0) // ABS
|
|
+ .addImm(0) // CLAMP
|
|
+ .addImm(0) // OMOD
|
|
+ .addImm(1); // NEG
|
|
+ MI->eraseFromParent();
|
|
+ break;
|
|
+ case AMDGPU::SHADER_TYPE:
|
|
+ BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType =
|
|
+ MI->getOperand(0).getImm();
|
|
+ MI->eraseFromParent();
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_INTERP:
|
|
+ LowerSI_INTERP(MI, *BB, I, MRI);
|
|
+ break;
|
|
+ case AMDGPU::SI_WQM:
|
|
+ LowerSI_WQM(MI, *BB, I, MRI);
|
|
+ break;
|
|
+ }
|
|
+ return BB;
|
|
+}
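The CLAMP_SI, FABS_SI and FNEG_SI cases above all expand to V_ADD_F32_e64 with a zero second source (the 0x80 operand is assumed here to be the inline-constant encoding of zero) and a single modifier bit set. A minimal stand-alone C++ sketch of the float identities that expansion relies on; it is illustrative only and not part of the patched sources:

// Sketch only: the scalar identities behind the V_ADD_F32_e64 expansions
// above (x + 0.0 with an ABS, NEG or CLAMP modifier applied).
#include <algorithm>
#include <cassert>
#include <cmath>

static float fabsViaAdd(float x)  { return std::fabs(x) + 0.0f; }              // ABS on src0
static float fnegViaAdd(float x)  { return (-x) + 0.0f; }                       // NEG on src0
static float clampViaAdd(float x) { return std::clamp(x + 0.0f, 0.0f, 1.0f); }  // CLAMP on dst

int main() {
  assert(fabsViaAdd(-2.5f) == 2.5f);
  assert(fnegViaAdd(2.5f) == -2.5f);
  assert(clampViaAdd(3.0f) == 1.0f && clampViaAdd(-1.0f) == 0.0f);
  return 0;
}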
|
|
+
|
|
+void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
|
|
+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
|
|
+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
|
|
+ .addReg(AMDGPU::EXEC);
|
|
+
|
|
+ MI->eraseFromParent();
|
|
+}
|
|
+
|
|
+void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
|
|
+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
|
|
+ unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
|
|
+ unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
|
|
+ MachineOperand dst = MI->getOperand(0);
|
|
+ MachineOperand iReg = MI->getOperand(1);
|
|
+ MachineOperand jReg = MI->getOperand(2);
|
|
+ MachineOperand attr_chan = MI->getOperand(3);
|
|
+ MachineOperand attr = MI->getOperand(4);
|
|
+ MachineOperand params = MI->getOperand(5);
|
|
+
|
|
+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
|
|
+ .addOperand(params);
|
|
+
|
|
+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
|
|
+ .addOperand(iReg)
|
|
+ .addOperand(attr_chan)
|
|
+ .addOperand(attr)
|
|
+ .addReg(M0);
|
|
+
|
|
+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
|
|
+ .addOperand(dst)
|
|
+ .addReg(tmp)
|
|
+ .addOperand(jReg)
|
|
+ .addOperand(attr_chan)
|
|
+ .addOperand(attr)
|
|
+ .addReg(M0);
|
|
+
|
|
+ MI->eraseFromParent();
|
|
+}
|
|
+
|
|
+EVT SITargetLowering::getSetCCResultType(EVT VT) const {
|
|
+ return MVT::i1;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Custom DAG Lowering Operations
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|
+ switch (Op.getOpcode()) {
|
|
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
|
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
|
|
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
|
|
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
|
+ case ISD::INTRINSIC_WO_CHAIN: {
|
|
+ unsigned IntrinsicID =
|
|
+ cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
|
+ EVT VT = Op.getValueType();
|
|
+ switch (IntrinsicID) {
|
|
+ case AMDGPUIntrinsic::SI_vs_load_buffer_index:
|
|
+ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
|
|
+ AMDGPU::VGPR0, VT);
|
|
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ return SDValue();
|
|
+}
|
|
+
|
|
+/// \brief Helper function for LowerBRCOND
|
|
+static SDNode *findUser(SDValue Value, unsigned Opcode) {
|
|
+
|
|
+ SDNode *Parent = Value.getNode();
|
|
+ for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
|
|
+ I != E; ++I) {
|
|
+
|
|
+ if (I.getUse().get() != Value)
|
|
+ continue;
|
|
+
|
|
+ if (I->getOpcode() == Opcode)
|
|
+ return *I;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/// This transforms the control flow intrinsics to get the branch destination as the
|
|
+/// last parameter, and also switches the branch target with BR if the need arises.
|
|
+SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
|
+ SelectionDAG &DAG) const {
|
|
+
|
|
+ DebugLoc DL = BRCOND.getDebugLoc();
|
|
+
|
|
+ SDNode *Intr = BRCOND.getOperand(1).getNode();
|
|
+ SDValue Target = BRCOND.getOperand(2);
|
|
+ SDNode *BR = 0;
|
|
+
|
|
+ if (Intr->getOpcode() == ISD::SETCC) {
|
|
+ // As long as we negate the condition everything is fine
|
|
+ SDNode *SetCC = Intr;
|
|
+ assert(SetCC->getConstantOperandVal(1) == 1);
|
|
+
|
|
+ CondCodeSDNode *CC = cast<CondCodeSDNode>(SetCC->getOperand(2).getNode());
|
|
+ assert(CC->get() == ISD::SETNE);
|
|
+ Intr = SetCC->getOperand(0).getNode();
|
|
+
|
|
+ } else {
|
|
+ // Get the target from BR if we don't negate the condition
|
|
+ BR = findUser(BRCOND, ISD::BR);
|
|
+ Target = BR->getOperand(1);
|
|
+ }
|
|
+
|
|
+ assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
|
|
+
|
|
+  // Build the list of result value types
|
|
+ SmallVector<EVT, 4> Res;
|
|
+ for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
|
|
+ Res.push_back(Intr->getValueType(i));
|
|
+
|
|
+ // operands of the new intrinsic call
|
|
+ SmallVector<SDValue, 4> Ops;
|
|
+ Ops.push_back(BRCOND.getOperand(0));
|
|
+ for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
|
|
+ Ops.push_back(Intr->getOperand(i));
|
|
+ Ops.push_back(Target);
|
|
+
|
|
+ // build the new intrinsic call
|
|
+ SDNode *Result = DAG.getNode(
|
|
+ Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
|
|
+ DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
|
|
+
|
|
+ if (BR) {
|
|
+ // Give the branch instruction our target
|
|
+ SDValue Ops[] = {
|
|
+ BR->getOperand(0),
|
|
+ BRCOND.getOperand(2)
|
|
+ };
|
|
+ DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
|
|
+ }
|
|
+
|
|
+ SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
|
|
+
|
|
+ // Copy the intrinsic results to registers
|
|
+ for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
|
|
+ SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
|
|
+ if (!CopyToReg)
|
|
+ continue;
|
|
+
|
|
+ Chain = DAG.getCopyToReg(
|
|
+ Chain, DL,
|
|
+ CopyToReg->getOperand(1),
|
|
+ SDValue(Result, i - 1),
|
|
+ SDValue());
|
|
+
|
|
+ DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
|
|
+ }
|
|
+
|
|
+ // Remove the old intrinsic from the chain
|
|
+ DAG.ReplaceAllUsesOfValueWith(
|
|
+ SDValue(Intr, Intr->getNumValues() - 1),
|
|
+ Intr->getOperand(0));
|
|
+
|
|
+ return Chain;
|
|
+}
|
|
+
|
|
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|
+ EVT VT = Op.getValueType();
|
|
+ LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op);
|
|
+
|
|
+ assert(Ptr);
|
|
+
|
|
+ unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();
|
|
+
|
|
+ // We only need to lower USER_SGPR address space loads
|
|
+ if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
|
|
+ return SDValue();
|
|
+ }
|
|
+
|
|
+ // Loads from the USER_SGPR address space can only have constant value
|
|
+ // pointers.
|
|
+ ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr());
|
|
+ assert(BasePtr);
|
|
+
|
|
+ unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
|
|
+ const TargetRegisterClass * dstClass;
|
|
+ switch (TypeDwordWidth) {
|
|
+ default:
|
|
+ assert(!"USER_SGPR value size not implemented");
|
|
+ return SDValue();
|
|
+ case 1:
|
|
+ dstClass = &AMDGPU::SReg_32RegClass;
|
|
+ break;
|
|
+ case 2:
|
|
+ dstClass = &AMDGPU::SReg_64RegClass;
|
|
+ break;
|
|
+ }
|
|
+ uint64_t Index = BasePtr->getZExtValue();
|
|
+ assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
|
|
+ unsigned SGPRIndex = Index / TypeDwordWidth;
|
|
+ unsigned Reg = dstClass->getRegister(SGPRIndex);
|
|
+
|
|
+ DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
|
|
+ VT));
|
|
+ return SDValue();
|
|
+}
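A minimal sketch of the address-to-register mapping LowerLOAD performs, assuming the constant USER_SGPR pointer counts dwords; userSgprRegIndex is a hypothetical stand-alone helper, not an LLVM API:

// Sketch only: maps a constant USER_SGPR "address" to an index into the
// 32-bit or 64-bit scalar register class, as LowerLOAD() does above. The
// dword-count interpretation of the address is an assumption.
#include <cassert>

static unsigned userSgprRegIndex(unsigned DwordAddress, unsigned ValueBits) {
  unsigned TypeDwordWidth = ValueBits / 32;          // 1 for i32, 2 for i64
  assert(DwordAddress % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
  return DwordAddress / TypeDwordWidth;              // index into SReg_32 / SReg_64
}

int main() {
  assert(userSgprRegIndex(3, 32) == 3);   // i32 at dword 3 -> 4th 32-bit SGPR
  assert(userSgprRegIndex(4, 64) == 2);   // i64 at dword 4 -> 3rd 64-bit SGPR pair
  return 0;
}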
|
|
+
|
|
+SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
|
|
+ SDValue LHS = Op.getOperand(0);
|
|
+ SDValue RHS = Op.getOperand(1);
|
|
+ SDValue True = Op.getOperand(2);
|
|
+ SDValue False = Op.getOperand(3);
|
|
+ SDValue CC = Op.getOperand(4);
|
|
+ EVT VT = Op.getValueType();
|
|
+ DebugLoc DL = Op.getDebugLoc();
|
|
+
|
|
+ // Possible Min/Max pattern
|
|
+ SDValue MinMax = LowerMinMax(Op, DAG);
|
|
+ if (MinMax.getNode()) {
|
|
+ return MinMax;
|
|
+ }
|
|
+
|
|
+ SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
|
|
+ return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Custom DAG optimizations
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
|
|
+ DAGCombinerInfo &DCI) const {
|
|
+ SelectionDAG &DAG = DCI.DAG;
|
|
+ DebugLoc DL = N->getDebugLoc();
|
|
+ EVT VT = N->getValueType(0);
|
|
+
|
|
+ switch (N->getOpcode()) {
|
|
+ default: break;
|
|
+ case ISD::SELECT_CC: {
|
|
+ N->dump();
|
|
+ ConstantSDNode *True, *False;
|
|
+ // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
|
|
+ if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
|
|
+ && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
|
|
+ && True->isAllOnesValue()
|
|
+ && False->isNullValue()
|
|
+ && VT == MVT::i1) {
|
|
+ return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
|
|
+ N->getOperand(1), N->getOperand(4));
|
|
+
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case ISD::SETCC: {
|
|
+ SDValue Arg0 = N->getOperand(0);
|
|
+ SDValue Arg1 = N->getOperand(1);
|
|
+ SDValue CC = N->getOperand(2);
|
|
+ ConstantSDNode * C = NULL;
|
|
+ ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
|
|
+
|
|
+ // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
|
|
+ if (VT == MVT::i1
|
|
+ && Arg0.getOpcode() == ISD::SIGN_EXTEND
|
|
+ && Arg0.getOperand(0).getValueType() == MVT::i1
|
|
+ && (C = dyn_cast<ConstantSDNode>(Arg1))
|
|
+ && C->isNullValue()
|
|
+ && CCOp == ISD::SETNE) {
|
|
+ return SimplifySetCC(VT, Arg0.getOperand(0),
|
|
+ DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ return SDValue();
|
|
+}
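A small stand-alone sketch of the two boolean identities the combines above rely on (the function names are hypothetical): selecting -1/0 on an i1 result is just the compare itself, and a sign-extended 1-bit value compares unequal to zero exactly when the bit is set.

// Sketch only: integer models of the SELECT_CC and SETCC folds above.
#include <cassert>
#include <cstdint>

// i1 selectcc(l, r, -1, 0, lt)  ==  i1 setcc(l, r, lt)
static bool selectCcAsSetCc(int32_t l, int32_t r) {
  int32_t selected = (l < r) ? -1 : 0;   // the all-ones / zero select
  return selected != 0;                  // equals the compare result itself
}

// i1 setcc(sext(b), 0, ne)  ==  b
static bool sextNeZero(bool b) {
  int32_t sext = b ? -1 : 0;             // sign-extend a 1-bit value
  return sext != 0;
}

int main() {
  assert(selectCcAsSetCc(1, 2) == (1 < 2));
  assert(selectCcAsSetCc(5, 2) == (5 < 2));
  assert(sextNeZero(true) && !sextNeZero(false));
  return 0;
}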
|
|
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
|
|
new file mode 100644
|
|
index 0000000..5d048f8
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIISelLowering.h
|
|
@@ -0,0 +1,48 @@
|
|
+//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief SI DAG Lowering interface definition
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#ifndef SIISELLOWERING_H
|
|
+#define SIISELLOWERING_H
|
|
+
|
|
+#include "AMDGPUISelLowering.h"
|
|
+#include "SIInstrInfo.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class SITargetLowering : public AMDGPUTargetLowering {
|
|
+ const SIInstrInfo * TII;
|
|
+
|
|
+ void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
|
|
+    MachineBasicBlock::iterator I, unsigned Opcode) const;
|
|
+ void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
|
|
+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
|
|
+ void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
|
|
+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
|
|
+
|
|
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
|
|
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
|
|
+
|
|
+public:
|
|
+ SITargetLowering(TargetMachine &tm);
|
|
+ virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
|
|
+ MachineBasicBlock * BB) const;
|
|
+ virtual EVT getSetCCResultType(EVT VT) const;
|
|
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
|
|
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif //SIISELLOWERING_H
|
|
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
|
|
new file mode 100644
|
|
index 0000000..24fc929
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIInsertWaits.cpp
|
|
@@ -0,0 +1,353 @@
|
|
+//===-- SIInsertWaits.cpp - Insert wait instructions for memory ops -------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Insert wait instructions for memory reads and writes.
|
|
+///
|
|
+/// Memory reads and writes are issued asynchronously, so we need to insert
|
|
+/// S_WAITCNT instructions when we want to access any of their results or
|
|
+/// overwrite any register that's used asynchronously.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "SIInstrInfo.h"
|
|
+#include "SIMachineFunctionInfo.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+/// \brief One variable for each of the hardware counters
|
|
+typedef union {
|
|
+ struct {
|
|
+ unsigned VM;
|
|
+ unsigned EXP;
|
|
+ unsigned LGKM;
|
|
+ } Named;
|
|
+ unsigned Array[3];
|
|
+
|
|
+} Counters;
|
|
+
|
|
+typedef Counters RegCounters[512];
|
|
+typedef std::pair<unsigned, unsigned> RegInterval;
|
|
+
|
|
+class SIInsertWaits : public MachineFunctionPass {
|
|
+
|
|
+private:
|
|
+ static char ID;
|
|
+ const SIInstrInfo *TII;
|
|
+ const SIRegisterInfo &TRI;
|
|
+ const MachineRegisterInfo *MRI;
|
|
+
|
|
+ /// \brief Constant hardware limits
|
|
+ static const Counters WaitCounts;
|
|
+
|
|
+ /// \brief Constant zero value
|
|
+ static const Counters ZeroCounts;
|
|
+
|
|
+ /// \brief Counter values we have already waited on.
|
|
+ Counters WaitedOn;
|
|
+
|
|
+ /// \brief Counter values for last instruction issued.
|
|
+ Counters LastIssued;
|
|
+
|
|
+ /// \brief Registers used by async instructions.
|
|
+ RegCounters UsedRegs;
|
|
+
|
|
+ /// \brief Registers defined by async instructions.
|
|
+ RegCounters DefinedRegs;
|
|
+
|
|
+ /// \brief Different export instruction types seen since last wait.
|
|
+ unsigned ExpInstrTypesSeen;
|
|
+
|
|
+ /// \brief Get increment/decrement amount for this instruction.
|
|
+ Counters getHwCounts(MachineInstr &MI);
|
|
+
|
|
+ /// \brief Is operand relevant for async execution?
|
|
+ bool isOpRelevant(MachineOperand &Op);
|
|
+
|
|
+ /// \brief Get register interval an operand affects.
|
|
+ RegInterval getRegInterval(MachineOperand &Op);
|
|
+
|
|
+  /// \brief Handle an instruction's async components
|
|
+ void pushInstruction(MachineInstr &MI);
|
|
+
|
|
+ /// \brief Insert the actual wait instruction
|
|
+ bool insertWait(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ const Counters &Counts);
|
|
+
|
|
+ /// \brief Resolve all operand dependencies to counter requirements
|
|
+ Counters handleOperands(MachineInstr &MI);
|
|
+
|
|
+public:
|
|
+ SIInsertWaits(TargetMachine &tm) :
|
|
+ MachineFunctionPass(ID),
|
|
+ TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
|
|
+ TRI(TII->getRegisterInfo()) { }
|
|
+
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ const char *getPassName() const {
|
|
+ return "SI insert wait instructions";
|
|
+ }
|
|
+
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+char SIInsertWaits::ID = 0;
|
|
+
|
|
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
|
|
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
|
|
+
|
|
+FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
|
|
+ return new SIInsertWaits(tm);
|
|
+}
|
|
+
|
|
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
|
|
+
|
|
+ uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
|
|
+ Counters Result;
|
|
+
|
|
+ Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
|
|
+
|
|
+ // Only consider stores or EXP for EXP_CNT
|
|
+ Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
|
|
+ (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore()));
|
|
+
|
|
+  // LGKM may use larger values
|
|
+ if (TSFlags & SIInstrFlags::LGKM_CNT) {
|
|
+
|
|
+ MachineOperand &Op = MI.getOperand(0);
|
|
+ assert(Op.isReg() && "First LGKM operand must be a register!");
|
|
+
|
|
+ unsigned Reg = Op.getReg();
|
|
+ unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
|
|
+ Result.Named.LGKM = Size > 4 ? 2 : 1;
|
|
+
|
|
+ } else {
|
|
+ Result.Named.LGKM = 0;
|
|
+ }
|
|
+
|
|
+ return Result;
|
|
+}
|
|
+
|
|
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
|
|
+
|
|
+ // Constants are always irrelevant
|
|
+ if (!Op.isReg())
|
|
+ return false;
|
|
+
|
|
+ // Defines are always relevant
|
|
+ if (Op.isDef())
|
|
+ return true;
|
|
+
|
|
+ // For exports all registers are relevant
|
|
+ MachineInstr &MI = *Op.getParent();
|
|
+ if (MI.getOpcode() == AMDGPU::EXP)
|
|
+ return true;
|
|
+
|
|
+ // For stores the stored value is also relevant
|
|
+ if (!MI.getDesc().mayStore())
|
|
+ return false;
|
|
+
|
|
+ for (MachineInstr::mop_iterator I = MI.operands_begin(),
|
|
+ E = MI.operands_end(); I != E; ++I) {
|
|
+
|
|
+ if (I->isReg() && I->isUse())
|
|
+ return Op.isIdenticalTo(*I);
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
|
|
+
|
|
+ if (!Op.isReg())
|
|
+ return std::make_pair(0, 0);
|
|
+
|
|
+ unsigned Reg = Op.getReg();
|
|
+ unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
|
|
+
|
|
+ assert(Size >= 4);
|
|
+
|
|
+ RegInterval Result;
|
|
+ Result.first = TRI.getEncodingValue(Reg);
|
|
+ Result.second = Result.first + Size / 4;
|
|
+
|
|
+ return Result;
|
|
+}
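An illustrative sketch of the interval computed above, assuming (as the code does) that a register's encoding value names its first 32-bit slot and its register-class size in bytes gives the number of consecutive slots; regInterval is a hypothetical helper:

// Sketch only: the [first, last) slot range tracked per register,
// mirroring getRegInterval().
#include <cassert>
#include <utility>

static std::pair<unsigned, unsigned> regInterval(unsigned EncodingValue,
                                                 unsigned SizeInBytes) {
  return {EncodingValue, EncodingValue + SizeInBytes / 4};
}

int main() {
  // A 128-bit register starting at encoding 8 occupies slots 8..11.
  assert(regInterval(8, 16) == std::make_pair(8u, 12u));
  // A plain 32-bit register covers exactly one slot.
  assert(regInterval(3, 4) == std::make_pair(3u, 4u));
  return 0;
}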
|
|
+
|
|
+void SIInsertWaits::pushInstruction(MachineInstr &MI) {
|
|
+
|
|
+ // Get the hardware counter increments and sum them up
|
|
+ Counters Increment = getHwCounts(MI);
|
|
+ unsigned Sum = 0;
|
|
+
|
|
+ for (unsigned i = 0; i < 3; ++i) {
|
|
+ LastIssued.Array[i] += Increment.Array[i];
|
|
+ Sum += Increment.Array[i];
|
|
+ }
|
|
+
|
|
+ // If we don't increase anything then that's it
|
|
+ if (Sum == 0)
|
|
+ return;
|
|
+
|
|
+ // Remember which export instructions we have seen
|
|
+ if (Increment.Named.EXP) {
|
|
+ ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
|
|
+ }
|
|
+
|
|
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
|
|
+
|
|
+ MachineOperand &Op = MI.getOperand(i);
|
|
+ if (!isOpRelevant(Op))
|
|
+ continue;
|
|
+
|
|
+ RegInterval Interval = getRegInterval(Op);
|
|
+ for (unsigned j = Interval.first; j < Interval.second; ++j) {
|
|
+
|
|
+ // Remember which registers we define
|
|
+ if (Op.isDef())
|
|
+ DefinedRegs[j] = LastIssued;
|
|
+
|
|
+      // and which ones we are using
|
|
+ if (Op.isUse())
|
|
+ UsedRegs[j] = LastIssued;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ const Counters &Required) {
|
|
+
|
|
+ // End of program? No need to wait on anything
|
|
+ if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
|
|
+ return false;
|
|
+
|
|
+ // Figure out if the async instructions execute in order
|
|
+ bool Ordered[3];
|
|
+
|
|
+ // VM_CNT is always ordered
|
|
+ Ordered[0] = true;
|
|
+
|
|
+ // EXP_CNT is unordered if we have both EXP & VM-writes
|
|
+ Ordered[1] = ExpInstrTypesSeen == 3;
|
|
+
|
|
+ // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
|
|
+ Ordered[2] = false;
|
|
+
|
|
+ // The values we are going to put into the S_WAITCNT instruction
|
|
+ Counters Counts = WaitCounts;
|
|
+
|
|
+ // Do we really need to wait?
|
|
+ bool NeedWait = false;
|
|
+
|
|
+ for (unsigned i = 0; i < 3; ++i) {
|
|
+
|
|
+ if (Required.Array[i] <= WaitedOn.Array[i])
|
|
+ continue;
|
|
+
|
|
+ NeedWait = true;
|
|
+
|
|
+ if (Ordered[i]) {
|
|
+ unsigned Value = LastIssued.Array[i] - Required.Array[i];
|
|
+
|
|
+      // adjust the value to the real hardware possibilities
|
|
+ Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
|
|
+
|
|
+ } else
|
|
+ Counts.Array[i] = 0;
|
|
+
|
|
+    // Remember what we have waited on
|
|
+ WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
|
|
+ }
|
|
+
|
|
+ if (!NeedWait)
|
|
+ return false;
|
|
+
|
|
+ // Reset EXP_CNT instruction types
|
|
+ if (Counts.Named.EXP == 0)
|
|
+ ExpInstrTypesSeen = 0;
|
|
+
|
|
+ // Build the wait instruction
|
|
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
|
|
+ .addImm((Counts.Named.VM & 0xF) |
|
|
+ ((Counts.Named.EXP & 0x7) << 4) |
|
|
+ ((Counts.Named.LGKM & 0x7) << 8));
|
|
+
|
|
+ return true;
|
|
+}
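A minimal sketch of the S_WAITCNT immediate packing used by the BuildMI call above. The field layout (VM in bits 3:0, EXP in bits 6:4, LGKM in bits 10:8) is read off the shifts in insertWait and the {15, 7, 7} limits in WaitCounts, not from a hardware manual; packWaitcnt is a hypothetical helper:

// Sketch only: packs the three counter values into an S_WAITCNT immediate
// exactly as the .addImm() expression above does.
#include <cassert>
#include <cstdint>

static uint32_t packWaitcnt(uint32_t VM, uint32_t EXP, uint32_t LGKM) {
  return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0x7) << 8);
}

int main() {
  // Waiting for "everything still outstanding" uses the hardware maxima {15, 7, 7}.
  assert(packWaitcnt(15, 7, 7) == 0x77F);
  // Waiting until no VM (vector memory) operations are outstanding.
  assert(packWaitcnt(0, 7, 7) == 0x770);
  return 0;
}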
|
|
+
|
|
+/// \brief helper function for handleOperands
|
|
+static void increaseCounters(Counters &Dst, const Counters &Src) {
|
|
+
|
|
+ for (unsigned i = 0; i < 3; ++i)
|
|
+ Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
|
|
+}
|
|
+
|
|
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
|
|
+
|
|
+ Counters Result = ZeroCounts;
|
|
+
|
|
+  // For each register affected by this instruction, take the maximum of
|
|
+  // the counter values recorded for its outstanding async uses and defs
|
|
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
|
|
+
|
|
+ MachineOperand &Op = MI.getOperand(i);
|
|
+ RegInterval Interval = getRegInterval(Op);
|
|
+ for (unsigned j = Interval.first; j < Interval.second; ++j) {
|
|
+
|
|
+ if (Op.isDef())
|
|
+ increaseCounters(Result, UsedRegs[j]);
|
|
+
|
|
+ if (Op.isUse())
|
|
+ increaseCounters(Result, DefinedRegs[j]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return Result;
|
|
+}
|
|
+
|
|
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
|
+
|
|
+ bool Changes = false;
|
|
+
|
|
+ MRI = &MF.getRegInfo();
|
|
+
|
|
+ WaitedOn = ZeroCounts;
|
|
+ LastIssued = ZeroCounts;
|
|
+
|
|
+ memset(&UsedRegs, 0, sizeof(UsedRegs));
|
|
+ memset(&DefinedRegs, 0, sizeof(DefinedRegs));
|
|
+
|
|
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
|
|
+ BI != BE; ++BI) {
|
|
+
|
|
+ MachineBasicBlock &MBB = *BI;
|
|
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
|
|
+ I != E; ++I) {
|
|
+
|
|
+ Changes |= insertWait(MBB, I, handleOperands(*I));
|
|
+ pushInstruction(*I);
|
|
+ }
|
|
+
|
|
+ // Wait for everything at the end of the MBB
|
|
+ Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
|
|
+ }
|
|
+
|
|
+ return Changes;
|
|
+}
|
|
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
|
|
new file mode 100644
|
|
index 0000000..40e37aa
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIInstrFormats.td
|
|
@@ -0,0 +1,188 @@
|
|
+//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// SI Instruction format definitions.
|
|
+//
|
|
+// Instructions with _32 take 32-bit operands.
|
|
+// Instructions with _64 take 64-bit operands.
|
|
+//
|
|
+// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit
|
|
+// encoding is the standard encoding, but instructions that make use of
|
|
+// any of the instruction modifiers must use the 64-bit encoding.
|
|
+//
|
|
+// Instructions with _e32 use the 32-bit encoding.
|
|
+// Instructions with _e64 use the 64-bit encoding.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
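A stand-alone sketch of how the multiclasses below derive the 9-bit VOP3 (_e64) opcode from the 32-bit (_e32) opcode, mirroring the TableGen bit-list concatenations in VOP1_32, VOP2_32 and VOPC_Helper; the helper names are hypothetical and the code is illustrative only:

// Sketch only: mirrors the TableGen bit concatenations used below, e.g.
//   VOP1 _e64 opcode = {1, 1, op{6-0}}    -> 0b1_1xxxxxxx = 0x180 | op
//   VOP2 _e64 opcode = {1, 0, 0, op{5-0}} -> 0b1_00xxxxxx = 0x100 | op
//   VOPC _e64 opcode = {0, op{7-0}}       -> op, zero-extended to 9 bits
#include <cassert>
#include <cstdint>

static uint16_t vop1ToVop3(uint8_t op) { return 0x180 | (op & 0x7F); }
static uint16_t vop2ToVop3(uint8_t op) { return 0x100 | (op & 0x3F); }
static uint16_t vopcToVop3(uint8_t op) { return op; }

int main() {
  assert(vop1ToVop3(0x01) == 0x181);
  assert(vop2ToVop3(0x03) == 0x103);
  assert(vopcToVop3(0x02) == 0x002);
  return 0;
}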
|
|
+
|
|
+class VOP3_32 <bits<9> op, string opName, list<dag> pattern>
|
|
+ : VOP3 <op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1, VReg_32:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
|
|
+
|
|
+class VOP3_64 <bits<9> op, string opName, list<dag> pattern>
|
|
+ : VOP3 <op, (outs VReg_64:$dst), (ins VSrc_64:$src0, VReg_64:$src1, VReg_64:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
|
|
+
|
|
+class SOP1_32 <bits<8> op, string opName, list<dag> pattern>
|
|
+ : SOP1 <op, (outs SReg_32:$dst), (ins SSrc_32:$src0), opName, pattern>;
|
|
+
|
|
+class SOP1_64 <bits<8> op, string opName, list<dag> pattern>
|
|
+ : SOP1 <op, (outs SReg_64:$dst), (ins SSrc_64:$src0), opName, pattern>;
|
|
+
|
|
+class SOP2_32 <bits<7> op, string opName, list<dag> pattern>
|
|
+ : SOP2 <op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), opName, pattern>;
|
|
+
|
|
+class SOP2_64 <bits<7> op, string opName, list<dag> pattern>
|
|
+ : SOP2 <op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), opName, pattern>;
|
|
+
|
|
+class VOP1_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
|
|
+ string opName, list<dag> pattern> :
|
|
+ VOP1 <
|
|
+ op, (outs vrc:$dst), (ins arc:$src0), opName, pattern
|
|
+ >;
|
|
+
|
|
+multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> {
|
|
+ def _e32: VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>;
|
|
+ def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
|
|
+ opName, []
|
|
+ >;
|
|
+}
|
|
+
|
|
+multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> {
|
|
+
|
|
+ def _e32 : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
|
|
+
|
|
+ def _e64 : VOP3_64 <
|
|
+ {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
|
|
+ opName, []
|
|
+ >;
|
|
+}
|
|
+
|
|
+class VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
|
|
+ string opName, list<dag> pattern> :
|
|
+ VOP2 <
|
|
+ op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern
|
|
+ >;
|
|
+
|
|
+multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> {
|
|
+
|
|
+ def _e32 : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern>;
|
|
+
|
|
+ def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
|
|
+ opName, []
|
|
+ >;
|
|
+}
|
|
+
|
|
+multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> {
|
|
+ def _e32: VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern>;
|
|
+
|
|
+ def _e64 : VOP3_64 <
|
|
+ {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
|
|
+ opName, []
|
|
+ >;
|
|
+}
|
|
+
|
|
+class SOPK_32 <bits<5> op, string opName, list<dag> pattern>
|
|
+ : SOPK <op, (outs SReg_32:$dst), (ins i16imm:$src0), opName, pattern>;
|
|
+
|
|
+class SOPK_64 <bits<5> op, string opName, list<dag> pattern>
|
|
+ : SOPK <op, (outs SReg_64:$dst), (ins i16imm:$src0), opName, pattern>;
|
|
+
|
|
+multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
|
|
+ string opName, list<dag> pattern> {
|
|
+
|
|
+ def _e32 : VOPC <op, (ins arc:$src0, vrc:$src1), opName, pattern>;
|
|
+ def _e64 : VOP3 <
|
|
+ {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
|
|
+ (outs SReg_64:$dst),
|
|
+ (ins arc:$src0, vrc:$src1,
|
|
+ InstFlag:$abs, InstFlag:$clamp,
|
|
+ InstFlag:$omod, InstFlag:$neg),
|
|
+ opName, pattern
|
|
+ > {
|
|
+ let SRC2 = 0x80;
|
|
+ }
|
|
+}
|
|
+
|
|
+multiclass VOPC_32 <bits<8> op, string opName, list<dag> pattern>
|
|
+ : VOPC_Helper <op, VReg_32, VSrc_32, opName, pattern>;
|
|
+
|
|
+multiclass VOPC_64 <bits<8> op, string opName, list<dag> pattern>
|
|
+ : VOPC_Helper <op, VReg_64, VSrc_64, opName, pattern>;
|
|
+
|
|
+class SOPC_32 <bits<7> op, string opName, list<dag> pattern>
|
|
+ : SOPC <op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), opName, pattern>;
|
|
+
|
|
+class SOPC_64 <bits<7> op, string opName, list<dag> pattern>
|
|
+ : SOPC <op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), opName, pattern>;
|
|
+
|
|
+class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
|
|
+ op,
|
|
+ (outs VReg_128:$vdata),
|
|
+ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
|
|
+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr,
|
|
+ GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp),
|
|
+ asm,
|
|
+ []> {
|
|
+ let mayLoad = 1;
|
|
+ let mayStore = 0;
|
|
+}
|
|
+
|
|
+class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
|
|
+ op,
|
|
+ (outs),
|
|
+ (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
|
|
+ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
|
|
+ GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
|
|
+ asm,
|
|
+ []> {
|
|
+ let mayStore = 1;
|
|
+ let mayLoad = 0;
|
|
+}
|
|
+
|
|
+class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
|
|
+ op,
|
|
+ (outs regClass:$dst),
|
|
+ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
|
|
+ i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc,
|
|
+ i1imm:$tfe, SSrc_32:$soffset),
|
|
+ asm,
|
|
+ []> {
|
|
+ let mayLoad = 1;
|
|
+ let mayStore = 0;
|
|
+}
|
|
+
|
|
+class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
|
|
+ op,
|
|
+ (outs regClass:$dst),
|
|
+ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
|
|
+ i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc,
|
|
+ i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
|
|
+ asm,
|
|
+ []> {
|
|
+ let mayLoad = 1;
|
|
+ let mayStore = 0;
|
|
+}
|
|
+
|
|
+multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> {
|
|
+ def _IMM : SMRD <
|
|
+ op, 1,
|
|
+ (outs dstClass:$dst),
|
|
+ (ins GPR2Align<SReg_64>:$sbase, i32imm:$offset),
|
|
+ asm,
|
|
+ []
|
|
+ >;
|
|
+
|
|
+ def _SGPR : SMRD <
|
|
+ op, 0,
|
|
+ (outs dstClass:$dst),
|
|
+ (ins GPR2Align<SReg_64>:$sbase, SReg_32:$soff),
|
|
+ asm,
|
|
+ []
|
|
+ >;
|
|
+}
|
|
+
|
|
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..1c4b3cf
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIInstrInfo.cpp
|
|
@@ -0,0 +1,143 @@
|
|
+//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief SI Implementation of TargetInstrInfo.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+#include "SIInstrInfo.h"
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+#include "llvm/MC/MCInstrDesc.h"
|
|
+
|
|
+#include <stdio.h>
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
|
|
+ : AMDGPUInstrInfo(tm),
|
|
+ RI(tm, *this)
|
|
+ { }
|
|
+
|
|
+const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
|
|
+ return RI;
|
|
+}
|
|
+
|
|
+void
|
|
+SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI, DebugLoc DL,
|
|
+ unsigned DestReg, unsigned SrcReg,
|
|
+ bool KillSrc) const {
|
|
+ // If we are trying to copy to or from SCC, there is a bug somewhere else in
|
|
+ // the backend. While it may be theoretically possible to do this, it should
|
|
+ // never be necessary.
|
|
+ assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
|
|
+
|
|
+ if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
|
|
+ assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
|
|
+ AMDGPU::SReg_64RegClass.contains(SrcReg));
|
|
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub0))
|
|
+ .addReg(RI.getSubReg(SrcReg, AMDGPU::sub0), getKillRegState(KillSrc))
|
|
+ .addReg(DestReg, RegState::Define | RegState::Implicit);
|
|
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub1))
|
|
+ .addReg(RI.getSubReg(SrcReg, AMDGPU::sub1), getKillRegState(KillSrc));
|
|
+ } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
|
|
+ assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
|
|
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
|
|
+ .addReg(SrcReg, getKillRegState(KillSrc));
|
|
+ } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
|
|
+ assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
|
|
+ AMDGPU::SReg_32RegClass.contains(SrcReg));
|
|
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
|
|
+ .addReg(SrcReg, getKillRegState(KillSrc));
|
|
+ } else {
|
|
+ assert(AMDGPU::SReg_32RegClass.contains(DestReg));
|
|
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
|
|
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
|
|
+ .addReg(SrcReg, getKillRegState(KillSrc));
|
|
+ }
|
|
+}
|
|
+
|
|
+MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
|
|
+ int64_t Imm) const {
|
|
+ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc());
|
|
+ MachineInstrBuilder MIB(MI);
|
|
+ MIB.addReg(DstReg, RegState::Define);
|
|
+ MIB.addImm(Imm);
|
|
+
|
|
+ return MI;
|
|
+
|
|
+}
|
|
+
|
|
+bool SIInstrInfo::isMov(unsigned Opcode) const {
|
|
+ switch(Opcode) {
|
|
+ default: return false;
|
|
+ case AMDGPU::S_MOV_B32:
|
|
+ case AMDGPU::S_MOV_B64:
|
|
+ case AMDGPU::V_MOV_B32_e32:
|
|
+ case AMDGPU::V_MOV_B32_e64:
|
|
+ return true;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool
|
|
+SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
|
|
+ return RC != &AMDGPU::EXECRegRegClass;
|
|
+}
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// Indirect addressing callbacks
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
|
|
+ unsigned Channel) const {
|
|
+ assert(Channel == 0);
|
|
+ return RegIndex;
|
|
+}
|
|
+
|
|
+
|
|
+int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
|
|
+ llvm_unreachable("Unimplemented");
|
|
+}
|
|
+
|
|
+int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
|
|
+ llvm_unreachable("Unimplemented");
|
|
+}
|
|
+
|
|
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass(
|
|
+ unsigned SourceReg) const {
|
|
+ llvm_unreachable("Unimplemented");
|
|
+}
|
|
+
|
|
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const {
|
|
+ llvm_unreachable("Unimplemented");
|
|
+}
|
|
+
|
|
+MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
|
|
+ MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg,
|
|
+ unsigned Address, unsigned OffsetReg) const {
|
|
+ llvm_unreachable("Unimplemented");
|
|
+}
|
|
+
|
|
+MachineInstrBuilder SIInstrInfo::buildIndirectRead(
|
|
+ MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg,
|
|
+ unsigned Address, unsigned OffsetReg) const {
|
|
+ llvm_unreachable("Unimplemented");
|
|
+}
|
|
+
|
|
+const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const {
|
|
+ llvm_unreachable("Unimplemented");
|
|
+}
|
|
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
|
|
new file mode 100644
|
|
index 0000000..a65f7b6
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIInstrInfo.h
|
|
@@ -0,0 +1,84 @@
|
|
+//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface definition for SIInstrInfo.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+#ifndef SIINSTRINFO_H
|
|
+#define SIINSTRINFO_H
|
|
+
|
|
+#include "AMDGPUInstrInfo.h"
|
|
+#include "SIRegisterInfo.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class SIInstrInfo : public AMDGPUInstrInfo {
|
|
+private:
|
|
+ const SIRegisterInfo RI;
|
|
+
|
|
+public:
|
|
+ explicit SIInstrInfo(AMDGPUTargetMachine &tm);
|
|
+
|
|
+ const SIRegisterInfo &getRegisterInfo() const;
|
|
+
|
|
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
|
|
+ MachineBasicBlock::iterator MI, DebugLoc DL,
|
|
+ unsigned DestReg, unsigned SrcReg,
|
|
+ bool KillSrc) const;
|
|
+
|
|
+ virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
|
|
+ int64_t Imm) const;
|
|
+
|
|
+ virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
|
|
+ virtual bool isMov(unsigned Opcode) const;
|
|
+
|
|
+ virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
|
|
+
|
|
+ virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
|
|
+
|
|
+ virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
|
|
+
|
|
+ virtual unsigned calculateIndirectAddress(unsigned RegIndex,
|
|
+ unsigned Channel) const;
|
|
+
|
|
+ virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
|
|
+ unsigned SourceReg) const;
|
|
+
|
|
+ virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
|
|
+
|
|
+ virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg,
|
|
+ unsigned Address,
|
|
+ unsigned OffsetReg) const;
|
|
+
|
|
+ virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
|
|
+ MachineBasicBlock::iterator I,
|
|
+ unsigned ValueReg,
|
|
+ unsigned Address,
|
|
+ unsigned OffsetReg) const;
|
|
+
|
|
+ virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
|
|
+ };
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+namespace SIInstrFlags {
|
|
+ enum Flags {
|
|
+ // First 4 bits are the instruction encoding
|
|
+ VM_CNT = 1 << 0,
|
|
+ EXP_CNT = 1 << 1,
|
|
+ LGKM_CNT = 1 << 2
|
|
+ };
|
|
+}
|
|
+
|
|
+#endif //SIINSTRINFO_H
|
|
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
|
|
new file mode 100644
|
|
index 0000000..8c4e5af
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIInstrInfo.td
|
|
@@ -0,0 +1,465 @@
|
|
+//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+//===----------------------------------------------------------------------===//
|
|
+// SI DAG Nodes
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+// SMRD takes a 64-bit memory address and can only add a 32-bit offset
|
|
+def SIadd64bit32bit : SDNode<"ISD::ADD",
|
|
+ SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
|
|
+>;
|
|
+
|
|
+// Transformation function, extract the lower 32 bits of a 64-bit immediate
|
|
+def LO32 : SDNodeXForm<imm, [{
|
|
+ return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
|
|
+}]>;
|
|
+
|
|
+// Transformation function, extract the upper 32 bits of a 64-bit immediate
|
|
+def HI32 : SDNodeXForm<imm, [{
|
|
+ return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
|
|
+}]>;
|
|
+
|
|
+def IMM8bitDWORD : ImmLeaf <
|
|
+ i32, [{
|
|
+ return (Imm & ~0x3FC) == 0;
|
|
+ }], SDNodeXForm<imm, [{
|
|
+ return CurDAG->getTargetConstant(
|
|
+ N->getZExtValue() >> 2, MVT::i32);
|
|
+ }]>
|
|
+>;
|
|
+
|
|
+def IMM12bit : ImmLeaf <
|
|
+ i16,
|
|
+ [{return isUInt<12>(Imm);}]
|
|
+>;
|
|
+
|
|
+class InlineImm <ValueType vt> : ImmLeaf <vt, [{
|
|
+ return -16 <= Imm && Imm <= 64;
|
|
+}]>;
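A stand-alone C++ sketch of the immediate handling defined above (LO32/HI32, IMM8bitDWORD and InlineImm); the helper names are hypothetical, and reading IMM8bitDWORD as a dword-aligned byte offset is an assumption taken from its mask and shift:

// Sketch only: plain C++ equivalents of the SDNodeXForm / ImmLeaf
// definitions in this file.
#include <cassert>
#include <cstdint>

static uint32_t lo32(uint64_t Imm) { return Imm & 0xffffffff; }          // LO32
static uint32_t hi32(uint64_t Imm) { return Imm >> 32; }                 // HI32

// IMM8bitDWORD: only bits 9:2 may be set, and the value is encoded in dwords.
static bool fitsImm8bitDword(uint32_t Imm) { return (Imm & ~0x3FC) == 0; }
static uint32_t encodeImm8bitDword(uint32_t Imm) { return Imm >> 2; }

// InlineImm: the range the hardware can encode as an inline constant.
static bool isInlineImm(int64_t Imm) { return -16 <= Imm && Imm <= 64; }

int main() {
  assert(lo32(0x1234567890abcdefULL) == 0x90abcdef);
  assert(hi32(0x1234567890abcdefULL) == 0x12345678);
  assert(fitsImm8bitDword(0x3FC) && encodeImm8bitDword(0x3FC) == 0xFF);
  assert(!fitsImm8bitDword(0x400) && !fitsImm8bitDword(0x3));
  assert(isInlineImm(-16) && isInlineImm(64) && !isInlineImm(65));
  return 0;
}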
|
|
+
|
|
+class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ AMDGPUInst<outs, ins, asm, pattern> {
|
|
+
|
|
+ field bits<1> VM_CNT = 0;
|
|
+ field bits<1> EXP_CNT = 0;
|
|
+ field bits<1> LGKM_CNT = 0;
|
|
+
|
|
+ let TSFlags{0} = VM_CNT;
|
|
+ let TSFlags{1} = EXP_CNT;
|
|
+ let TSFlags{2} = LGKM_CNT;
|
|
+}
|
|
+
|
|
+class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ InstSI <outs, ins, asm, pattern> {
|
|
+
|
|
+ field bits<32> Inst;
|
|
+ let Size = 4;
|
|
+}
|
|
+
|
|
+class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ InstSI <outs, ins, asm, pattern> {
|
|
+
|
|
+ field bits<64> Inst;
|
|
+ let Size = 8;
|
|
+}
|
|
+
|
|
+class SIOperand <ValueType vt, dag opInfo>: Operand <vt> {
|
|
+ let EncoderMethod = "encodeOperand";
|
|
+ let MIOperandInfo = opInfo;
|
|
+}
|
|
+
|
|
+class GPR4Align <RegisterClass rc> : Operand <vAny> {
|
|
+ let EncoderMethod = "GPR4AlignEncode";
|
|
+ let MIOperandInfo = (ops rc:$reg);
|
|
+}
|
|
+
|
|
+class GPR2Align <RegisterClass rc> : Operand <iPTR> {
|
|
+ let EncoderMethod = "GPR2AlignEncode";
|
|
+ let MIOperandInfo = (ops rc:$reg);
|
|
+}
|
|
+
|
|
+let Uses = [EXEC] in {
|
|
+
|
|
+def EXP : Enc64<
|
|
+ (outs),
|
|
+ (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
|
|
+ VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
|
|
+ "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
|
|
+ [] > {
|
|
+
|
|
+ bits<4> EN;
|
|
+ bits<6> TGT;
|
|
+ bits<1> COMPR;
|
|
+ bits<1> DONE;
|
|
+ bits<1> VM;
|
|
+ bits<8> VSRC0;
|
|
+ bits<8> VSRC1;
|
|
+ bits<8> VSRC2;
|
|
+ bits<8> VSRC3;
|
|
+
|
|
+ let Inst{3-0} = EN;
|
|
+ let Inst{9-4} = TGT;
|
|
+ let Inst{10} = COMPR;
|
|
+ let Inst{11} = DONE;
|
|
+ let Inst{12} = VM;
|
|
+ let Inst{31-26} = 0x3e;
|
|
+ let Inst{39-32} = VSRC0;
|
|
+ let Inst{47-40} = VSRC1;
|
|
+ let Inst{55-48} = VSRC2;
|
|
+ let Inst{63-56} = VSRC3;
|
|
+
|
|
+ let EXP_CNT = 1;
|
|
+}
|
|
+
|
|
+class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc64 <outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDATA;
|
|
+ bits<4> DMASK;
|
|
+ bits<1> UNORM;
|
|
+ bits<1> GLC;
|
|
+ bits<1> DA;
|
|
+ bits<1> R128;
|
|
+ bits<1> TFE;
|
|
+ bits<1> LWE;
|
|
+ bits<1> SLC;
|
|
+ bits<8> VADDR;
|
|
+ bits<5> SRSRC;
|
|
+ bits<5> SSAMP;
|
|
+
|
|
+ let Inst{11-8} = DMASK;
|
|
+ let Inst{12} = UNORM;
|
|
+ let Inst{13} = GLC;
|
|
+ let Inst{14} = DA;
|
|
+ let Inst{15} = R128;
|
|
+ let Inst{16} = TFE;
|
|
+ let Inst{17} = LWE;
|
|
+ let Inst{24-18} = op;
|
|
+ let Inst{25} = SLC;
|
|
+ let Inst{31-26} = 0x3c;
|
|
+ let Inst{39-32} = VADDR;
|
|
+ let Inst{47-40} = VDATA;
|
|
+ let Inst{52-48} = SRSRC;
|
|
+ let Inst{57-53} = SSAMP;
|
|
+
|
|
+ let VM_CNT = 1;
|
|
+ let EXP_CNT = 1;
|
|
+}
|
|
+
|
|
+class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc64<outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDATA;
|
|
+ bits<12> OFFSET;
|
|
+ bits<1> OFFEN;
|
|
+ bits<1> IDXEN;
|
|
+ bits<1> GLC;
|
|
+ bits<1> ADDR64;
|
|
+ bits<4> DFMT;
|
|
+ bits<3> NFMT;
|
|
+ bits<8> VADDR;
|
|
+ bits<5> SRSRC;
|
|
+ bits<1> SLC;
|
|
+ bits<1> TFE;
|
|
+ bits<8> SOFFSET;
|
|
+
|
|
+ let Inst{11-0} = OFFSET;
|
|
+ let Inst{12} = OFFEN;
|
|
+ let Inst{13} = IDXEN;
|
|
+ let Inst{14} = GLC;
|
|
+ let Inst{15} = ADDR64;
|
|
+ let Inst{18-16} = op;
|
|
+ let Inst{22-19} = DFMT;
|
|
+ let Inst{25-23} = NFMT;
|
|
+ let Inst{31-26} = 0x3a; //encoding
|
|
+ let Inst{39-32} = VADDR;
|
|
+ let Inst{47-40} = VDATA;
|
|
+ let Inst{52-48} = SRSRC;
|
|
+ let Inst{54} = SLC;
|
|
+ let Inst{55} = TFE;
|
|
+ let Inst{63-56} = SOFFSET;
|
|
+
|
|
+ let VM_CNT = 1;
|
|
+ let EXP_CNT = 1;
|
|
+
|
|
+ let neverHasSideEffects = 1;
|
|
+}
|
|
+
|
|
+class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc64<outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDATA;
|
|
+ bits<12> OFFSET;
|
|
+ bits<1> OFFEN;
|
|
+ bits<1> IDXEN;
|
|
+ bits<1> GLC;
|
|
+ bits<1> ADDR64;
|
|
+ bits<1> LDS;
|
|
+ bits<8> VADDR;
|
|
+ bits<5> SRSRC;
|
|
+ bits<1> SLC;
|
|
+ bits<1> TFE;
|
|
+ bits<8> SOFFSET;
|
|
+
|
|
+ let Inst{11-0} = OFFSET;
|
|
+ let Inst{12} = OFFEN;
|
|
+ let Inst{13} = IDXEN;
|
|
+ let Inst{14} = GLC;
|
|
+ let Inst{15} = ADDR64;
|
|
+ let Inst{16} = LDS;
|
|
+ let Inst{24-18} = op;
|
|
+ let Inst{31-26} = 0x38; //encoding
|
|
+ let Inst{39-32} = VADDR;
|
|
+ let Inst{47-40} = VDATA;
|
|
+ let Inst{52-48} = SRSRC;
|
|
+ let Inst{54} = SLC;
|
|
+ let Inst{55} = TFE;
|
|
+ let Inst{63-56} = SOFFSET;
|
|
+
|
|
+ let VM_CNT = 1;
|
|
+ let EXP_CNT = 1;
|
|
+
|
|
+ let neverHasSideEffects = 1;
|
|
+}
|
|
+
|
|
+} // End Uses = [EXEC]
|
|
+
|
|
+class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
|
|
+ list<dag> pattern> : Enc32<outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<7> SDST;
|
|
+ bits<6> SBASE;
|
|
+ bits<8> OFFSET;
|
|
+
|
|
+ let Inst{7-0} = OFFSET;
|
|
+ let Inst{8} = imm;
|
|
+ let Inst{14-9} = SBASE;
|
|
+ let Inst{21-15} = SDST;
|
|
+ let Inst{26-22} = op;
|
|
+ let Inst{31-27} = 0x18; //encoding
|
|
+
|
|
+ let LGKM_CNT = 1;
|
|
+}
|
|
+
|
|
+class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32<outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<7> SDST;
|
|
+ bits<8> SSRC0;
|
|
+
|
|
+ let Inst{7-0} = SSRC0;
|
|
+ let Inst{15-8} = op;
|
|
+ let Inst{22-16} = SDST;
|
|
+ let Inst{31-23} = 0x17d; //encoding;
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32 <outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<7> SDST;
|
|
+ bits<8> SSRC0;
|
|
+ bits<8> SSRC1;
|
|
+
|
|
+ let Inst{7-0} = SSRC0;
|
|
+ let Inst{15-8} = SSRC1;
|
|
+ let Inst{22-16} = SDST;
|
|
+ let Inst{29-23} = op;
|
|
+ let Inst{31-30} = 0x2; // encoding
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32<outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> SSRC0;
|
|
+ bits<8> SSRC1;
|
|
+
|
|
+ let Inst{7-0} = SSRC0;
|
|
+ let Inst{15-8} = SSRC1;
|
|
+ let Inst{22-16} = op;
|
|
+ let Inst{31-23} = 0x17e;
|
|
+
|
|
+ let DisableEncoding = "$dst";
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32 <outs, ins , asm, pattern> {
|
|
+
|
|
+ bits <7> SDST;
|
|
+ bits <16> SIMM16;
|
|
+
|
|
+ let Inst{15-0} = SIMM16;
|
|
+ let Inst{22-16} = SDST;
|
|
+ let Inst{27-23} = op;
|
|
+ let Inst{31-28} = 0xb; //encoding
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
|
|
+ (outs),
|
|
+ ins,
|
|
+ asm,
|
|
+ pattern > {
|
|
+
|
|
+ bits <16> SIMM16;
|
|
+
|
|
+ let Inst{15-0} = SIMM16;
|
|
+ let Inst{22-16} = op;
|
|
+ let Inst{31-23} = 0x17f; // encoding
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+let Uses = [EXEC] in {
|
|
+
|
|
+class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32 <outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDST;
|
|
+ bits<8> VSRC;
|
|
+ bits<2> ATTRCHAN;
|
|
+ bits<6> ATTR;
|
|
+
|
|
+ let Inst{7-0} = VSRC;
|
|
+ let Inst{9-8} = ATTRCHAN;
|
|
+ let Inst{15-10} = ATTR;
|
|
+ let Inst{17-16} = op;
|
|
+ let Inst{25-18} = VDST;
|
|
+ let Inst{31-26} = 0x32; // encoding
|
|
+
|
|
+ let neverHasSideEffects = 1;
|
|
+ let mayLoad = 1;
|
|
+ let mayStore = 0;
|
|
+}
|
|
+
|
|
+class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32 <outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDST;
|
|
+ bits<9> SRC0;
|
|
+
|
|
+ let Inst{8-0} = SRC0;
|
|
+ let Inst{16-9} = op;
|
|
+ let Inst{24-17} = VDST;
|
|
+ let Inst{31-25} = 0x3f; //encoding
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32 <outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDST;
|
|
+ bits<9> SRC0;
|
|
+ bits<8> VSRC1;
|
|
+
|
|
+ let Inst{8-0} = SRC0;
|
|
+ let Inst{16-9} = VSRC1;
|
|
+ let Inst{24-17} = VDST;
|
|
+ let Inst{30-25} = op;
|
|
+ let Inst{31} = 0x0; //encoding
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc64 <outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDST;
|
|
+ bits<9> SRC0;
|
|
+ bits<9> SRC1;
|
|
+ bits<9> SRC2;
|
|
+ bits<3> ABS;
|
|
+ bits<1> CLAMP;
|
|
+ bits<2> OMOD;
|
|
+ bits<3> NEG;
|
|
+
|
|
+ let Inst{7-0} = VDST;
|
|
+ let Inst{10-8} = ABS;
|
|
+ let Inst{11} = CLAMP;
|
|
+ let Inst{25-17} = op;
|
|
+ let Inst{31-26} = 0x34; //encoding
|
|
+ let Inst{40-32} = SRC0;
|
|
+ let Inst{49-41} = SRC1;
|
|
+ let Inst{58-50} = SRC2;
|
|
+ let Inst{60-59} = OMOD;
|
|
+ let Inst{63-61} = NEG;
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc64 <outs, ins, asm, pattern> {
|
|
+
|
|
+ bits<8> VDST;
|
|
+ bits<9> SRC0;
|
|
+ bits<9> SRC1;
|
|
+ bits<9> SRC2;
|
|
+ bits<7> SDST;
|
|
+ bits<2> OMOD;
|
|
+ bits<3> NEG;
|
|
+
|
|
+ let Inst{7-0} = VDST;
|
|
+ let Inst{14-8} = SDST;
|
|
+ let Inst{25-17} = op;
|
|
+ let Inst{31-26} = 0x34; //encoding
|
|
+ let Inst{40-32} = SRC0;
|
|
+ let Inst{49-41} = SRC1;
|
|
+ let Inst{58-50} = SRC2;
|
|
+ let Inst{60-59} = OMOD;
|
|
+ let Inst{63-61} = NEG;
|
|
+
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
|
|
+ Enc32 <(outs VCCReg:$dst), ins, asm, pattern> {
|
|
+
|
|
+ bits<9> SRC0;
|
|
+ bits<8> VSRC1;
|
|
+
|
|
+ let Inst{8-0} = SRC0;
|
|
+ let Inst{16-9} = VSRC1;
|
|
+ let Inst{24-17} = op;
|
|
+ let Inst{31-25} = 0x3e;
|
|
+
|
|
+ let DisableEncoding = "$dst";
|
|
+ let mayLoad = 0;
|
|
+ let mayStore = 0;
|
|
+ let hasSideEffects = 0;
|
|
+}
|
|
+
|
|
+} // End Uses = [EXEC]
|
|
+
|
|
+include "SIInstrFormats.td"
|
|
+include "SIInstructions.td"
|
|
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
|
|
new file mode 100644
|
|
index 0000000..3a9822a
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIInstructions.td
|
|
@@ -0,0 +1,1462 @@
|
|
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+// This file was originally auto-generated from a GPU register header file and
|
|
+// all the instruction definitions were originally commented out. Instructions
|
|
+// that are not yet supported remain commented out.
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+class InterpSlots {
|
|
+  int P0 = 2;
|
|
+  int P10 = 0;
|
|
+  int P20 = 1;
|
|
+}
|
|
+def INTERP : InterpSlots;
|
|
+
|
|
+def InterpSlot : Operand<i32> {
|
|
+ let PrintMethod = "printInterpSlot";
|
|
+}
|
|
+
|
|
+def isSI : Predicate<"Subtarget.device()"
|
|
+ "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">;
|
|
+
|
|
+let Predicates = [isSI] in {
|
|
+
|
|
+let neverHasSideEffects = 1 in {
|
|
+def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
|
|
+def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
|
|
+def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
|
|
+def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
|
|
+def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
|
|
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
|
|
+def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
|
|
+def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
|
|
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
|
|
+def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
|
|
+} // End neverHasSideEffects = 1
|
|
+////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
|
|
+////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
|
|
+////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
|
|
+////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
|
|
+////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
|
|
+////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
|
|
+////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
|
|
+////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
|
|
+//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
|
|
+//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
|
|
+def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
|
|
+//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
|
|
+//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
|
|
+//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
|
|
+////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
|
|
+////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
|
|
+////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
|
|
+////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
|
|
+def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
|
|
+def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
|
|
+def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
|
|
+def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
|
|
+
|
|
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
|
|
+
|
|
+def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
|
|
+def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
|
|
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
|
|
+def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
|
|
+def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
|
|
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
|
|
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
|
|
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
|
|
+
|
|
+} // End hasSideEffects = 1
|
|
+
|
|
+def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
|
|
+def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
|
|
+def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
|
|
+def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
|
|
+def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
|
|
+def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
|
|
+//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
|
|
+def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
|
|
+def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
|
|
+def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
|
|
+def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
|
|
+def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
|
|
+
|
|
+/*
|
|
+This instruction is disabled for now until we can figure out how to teach
|
|
+the instruction selector to correctly use the S_CMP* vs V_CMP*
|
|
+instructions.
|
|
+
|
|
+When this instruction is enabled the code generator sometimes produces this
|
|
+invalid sequence:
|
|
+
|
|
+SCC = S_CMPK_EQ_I32 SGPR0, imm
|
|
+VCC = COPY SCC
|
|
+VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
|
|
+
|
|
+def S_CMPK_EQ_I32 : SOPK <
|
|
+ 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
|
|
+ "S_CMPK_EQ_I32",
|
|
+ [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))]
|
|
+>;
|
|
+*/
|
|
+
|
|
+def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
|
|
+def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
|
|
+def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
|
|
+def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
|
|
+def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
|
|
+def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
|
|
+def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
|
|
+def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
|
|
+def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
|
|
+def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
|
|
+def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
|
|
+def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
|
|
+def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
|
|
+//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
|
|
+def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
|
|
+def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
|
|
+def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
|
|
+//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
|
|
+//def EXP : EXP_ <0x00000000, "EXP", []>;
|
|
+
|
|
+defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>;
|
|
+defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_LT)),
|
|
+ (V_CMP_LT_F32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_EQ)),
|
|
+ (V_CMP_EQ_F32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_LE)),
|
|
+ (V_CMP_LE_F32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_GT)),
|
|
+ (V_CMP_GT_F32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_NE)),
|
|
+ (V_CMP_LG_F32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_GE)),
|
|
+ (V_CMP_GE_F32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>;
|
|
+defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>;
|
|
+defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>;
|
|
+defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>;
|
|
+defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>;
|
|
+defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>;
|
|
+defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_NE)),
|
|
+ (V_CMP_NEQ_F32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>;
|
|
+defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>;
|
|
+
|
|
+// Side effect is writing to EXEC
|
|
+let hasSideEffects = 1 in {
|
|
+
|
|
+defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>;
|
|
+defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>;
|
|
+defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>;
|
|
+defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>;
|
|
+defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>;
|
|
+defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>;
|
|
+defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>;
|
|
+defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>;
|
|
+defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>;
|
|
+defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>;
|
|
+defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>;
|
|
+defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>;
|
|
+defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>;
|
|
+defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>;
|
|
+defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>;
|
|
+defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>;
|
|
+
|
|
+} // End hasSideEffects = 1
|
|
+
|
|
+defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>;
|
|
+defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>;
|
|
+defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>;
|
|
+defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>;
|
|
+defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>;
|
|
+defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>;
|
|
+defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>;
|
|
+defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", []>;
|
|
+defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>;
|
|
+defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>;
|
|
+defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>;
|
|
+defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>;
|
|
+defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>;
|
|
+defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>;
|
|
+defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>;
|
|
+defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>;
|
|
+
|
|
+// Side effect is writing to EXEC
|
|
+let hasSideEffects = 1 in {
|
|
+
|
|
+defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>;
|
|
+defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>;
|
|
+defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>;
|
|
+defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>;
|
|
+defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>;
|
|
+defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>;
|
|
+defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>;
|
|
+defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>;
|
|
+defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>;
|
|
+defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>;
|
|
+defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>;
|
|
+defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>;
|
|
+defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>;
|
|
+defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>;
|
|
+defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>;
|
|
+defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>;
|
|
+
|
|
+} // End hasSideEffects = 1
|
|
+
|
|
+defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>;
|
|
+defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>;
|
|
+defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>;
|
|
+defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>;
|
|
+defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>;
|
|
+defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>;
|
|
+defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>;
|
|
+defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>;
|
|
+defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>;
|
|
+defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32", []>;
|
|
+defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>;
|
|
+defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32", []>;
|
|
+defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>;
|
|
+defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>;
|
|
+defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>;
|
|
+defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>;
|
|
+defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>;
|
|
+defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>;
|
|
+defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>;
|
|
+defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>;
|
|
+defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>;
|
|
+defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>;
|
|
+defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>;
|
|
+defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>;
|
|
+defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>;
|
|
+defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>;
|
|
+defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>;
|
|
+defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>;
|
|
+defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>;
|
|
+defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>;
|
|
+defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>;
|
|
+defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>;
|
|
+defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>;
|
|
+defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>;
|
|
+defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>;
|
|
+defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>;
|
|
+defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>;
|
|
+defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>;
|
|
+defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>;
|
|
+defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>;
|
|
+defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>;
|
|
+defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64", []>;
|
|
+defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>;
|
|
+defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>;
|
|
+defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>;
|
|
+defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>;
|
|
+defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>;
|
|
+defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>;
|
|
+defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>;
|
|
+defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>;
|
|
+defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>;
|
|
+defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>;
|
|
+defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>;
|
|
+defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>;
|
|
+defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>;
|
|
+defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>;
|
|
+defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>;
|
|
+defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64", []>;
|
|
+defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>;
|
|
+defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>;
|
|
+defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>;
|
|
+defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>;
|
|
+defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>;
|
|
+defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>;
|
|
+defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>;
|
|
+defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_LT)),
|
|
+ (V_CMP_LT_I32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_EQ)),
|
|
+ (V_CMP_EQ_I32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_LE)),
|
|
+ (V_CMP_LE_I32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_GT)),
|
|
+ (V_CMP_GT_I32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_NE)),
|
|
+ (V_CMP_NE_I32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>;
|
|
+def : Pat <
|
|
+ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_GE)),
|
|
+ (V_CMP_GE_I32_e64 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>;
|
|
+
|
|
+let hasSideEffects = 1 in {
|
|
+
|
|
+defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>;
|
|
+defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>;
|
|
+defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>;
|
|
+defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>;
|
|
+defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>;
|
|
+defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>;
|
|
+defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>;
|
|
+defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>;
|
|
+
|
|
+} // End hasSideEffects
|
|
+
|
|
+defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>;
|
|
+defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>;
|
|
+defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>;
|
|
+defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>;
|
|
+defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>;
|
|
+defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", []>;
|
|
+defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>;
|
|
+defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>;
|
|
+
|
|
+let hasSideEffects = 1 in {
|
|
+
|
|
+defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>;
|
|
+defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>;
|
|
+defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>;
|
|
+defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>;
|
|
+defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>;
|
|
+defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>;
|
|
+defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>;
|
|
+defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>;
|
|
+
|
|
+} // End hasSideEffects
|
|
+
|
|
+defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>;
|
|
+defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>;
|
|
+defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", []>;
|
|
+defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>;
|
|
+defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>;
|
|
+defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>;
|
|
+defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>;
|
|
+defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32", []>;
|
|
+
|
|
+let hasSideEffects = 1 in {
|
|
+
|
|
+defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>;
|
|
+defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>;
|
|
+defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>;
|
|
+defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>;
|
|
+defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>;
|
|
+defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>;
|
|
+defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>;
|
|
+defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>;
|
|
+
|
|
+} // End hasSideEffects
|
|
+
|
|
+defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>;
|
|
+defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>;
|
|
+defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", []>;
|
|
+defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>;
|
|
+defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>;
|
|
+defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>;
|
|
+defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>;
|
|
+defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>;
|
|
+defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>;
|
|
+defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>;
|
|
+defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>;
|
|
+defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>;
|
|
+defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>;
|
|
+defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>;
|
|
+defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>;
|
|
+defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>;
|
|
+defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>;
|
|
+defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>;
|
|
+defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>;
|
|
+defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>;
|
|
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
|
|
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
|
|
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
|
|
+def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
|
|
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
|
|
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
|
|
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
|
|
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
|
|
+//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
|
|
+//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
|
|
+//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
|
|
+//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
|
|
+//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>;
|
|
+//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>;
|
|
+//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
|
|
+//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
|
|
+//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
|
|
+//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
|
|
+//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>;
|
|
+//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
|
|
+//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
|
|
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
|
|
+//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
|
|
+//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
|
|
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
|
|
+//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
|
|
+//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
|
|
+//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
|
|
+//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
|
|
+//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
|
|
+//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
|
|
+//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
|
|
+//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
|
|
+//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
|
|
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
|
|
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
|
|
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
|
|
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
|
|
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
|
|
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
|
|
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
|
|
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
|
|
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
|
|
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
|
|
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
|
|
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
|
|
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
|
|
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
|
|
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
|
|
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
|
|
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
|
|
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
|
|
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
|
|
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
|
|
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
|
|
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
|
|
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
|
|
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
|
|
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
|
|
+def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
|
|
+//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
|
|
+//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
|
|
+//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
|
|
+//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
|
|
+
|
|
+let mayLoad = 1 in {
|
|
+
|
|
+defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>;
|
|
+
|
|
+//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>;
|
|
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>;
|
|
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>;
|
|
+//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>;
|
|
+//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>;
|
|
+//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>;
|
|
+//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>;
|
|
+//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>;
|
|
+//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>;
|
|
+
|
|
+} // mayLoad = 1
|
|
+
|
|
+//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
|
|
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
|
|
+//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
|
|
+//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>;
|
|
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
|
|
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
|
|
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
|
|
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
|
|
+//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
|
|
+//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
|
|
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
|
|
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
|
|
+//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>;
|
|
+//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
|
|
+//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
|
|
+//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
|
|
+//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
|
|
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
|
|
+//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
|
|
+//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
|
|
+//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
|
|
+//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
|
|
+//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
|
|
+//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
|
|
+//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
|
|
+//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
|
|
+//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
|
|
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
|
|
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
|
|
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
|
|
+def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">;
|
|
+//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
|
|
+def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">;
|
|
+//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
|
|
+def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">;
|
|
+def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">;
|
|
+//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
|
|
+//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
|
|
+def IMAGE_SAMPLE_C : MIMG_Load_Helper <0x00000028, "IMAGE_SAMPLE_C">;
|
|
+//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
|
|
+//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
|
|
+//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
|
|
+def IMAGE_SAMPLE_C_L : MIMG_Load_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
|
|
+def IMAGE_SAMPLE_C_B : MIMG_Load_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
|
|
+//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
|
|
+//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
|
|
+//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
|
|
+//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>;
|
|
+//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>;
|
|
+//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>;
|
|
+//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>;
|
|
+//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>;
|
|
+//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>;
|
|
+//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>;
|
|
+//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>;
|
|
+//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>;
|
|
+//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>;
|
|
+//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>;
|
|
+//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>;
|
|
+//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
|
|
+//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
|
|
+//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
|
|
+//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
|
|
+//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
|
|
+//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
|
|
+//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
|
|
+//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
|
|
+//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
|
|
+//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
|
|
+//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
|
|
+//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
|
|
+//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
|
|
+//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
|
|
+//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
|
|
+//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
|
|
+//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
|
|
+//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
|
|
+//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
|
|
+//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
|
|
+//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
|
|
+//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
|
|
+//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
|
|
+//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
|
|
+//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
|
|
+//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
|
|
+//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
|
|
+//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
|
|
+//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
|
|
+//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
|
|
+//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
|
|
+//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>;
|
|
+//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>;
|
|
+//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>;
|
|
+//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>;
|
|
+//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
|
|
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
|
|
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
|
|
+//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
|
|
+
|
|
+let neverHasSideEffects = 1 in {
|
|
+defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
|
|
+} // End neverHasSideEffects
|
|
+defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
|
|
+//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
|
|
+//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
|
|
+defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
|
|
+ [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))]
|
|
+>;
|
|
+//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>;
|
|
+//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
|
|
+defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
|
|
+ [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
|
|
+////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
|
|
+//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
|
|
+//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
|
|
+//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
|
|
+//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
|
|
+//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
|
|
+//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
|
|
+//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
|
|
+//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
|
|
+//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
|
|
+//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
|
|
+//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
|
|
+//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
|
|
+defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
|
|
+ [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>;
|
|
+defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
|
|
+ [(set VReg_32:$dst, (fceil VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
|
|
+ [(set VReg_32:$dst, (frint VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
|
|
+ [(set VReg_32:$dst, (ffloor VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
|
|
+ [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
|
|
+defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
|
|
+ [(set VReg_32:$dst, (flog2 VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
|
|
+defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
|
|
+defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
|
|
+ [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
|
|
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
|
|
+defm V_RSQ_LEGACY_F32 : VOP1_32 <
|
|
+ 0x0000002d, "V_RSQ_LEGACY_F32",
|
|
+ [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))]
|
|
+>;
|
|
+defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
|
|
+defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>;
|
|
+defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
|
|
+defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
|
|
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
|
|
+defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>;
|
|
+defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>;
|
|
+defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>;
|
|
+defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>;
|
|
+defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
|
|
+defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
|
|
+defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
|
|
+defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
|
|
+defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
|
|
+//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
|
|
+defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
|
|
+defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
|
|
+//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
|
|
+defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
|
|
+//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
|
|
+defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
|
|
+defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
|
|
+defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
|
|
+
|
|
+def V_INTERP_P1_F32 : VINTRP <
|
|
+ 0x00000000,
|
|
+ (outs VReg_32:$dst),
|
|
+ (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
|
|
+ "V_INTERP_P1_F32",
|
|
+ []> {
|
|
+ let DisableEncoding = "$m0";
|
|
+}
|
|
+
|
|
+def V_INTERP_P2_F32 : VINTRP <
|
|
+ 0x00000001,
|
|
+ (outs VReg_32:$dst),
|
|
+ (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
|
|
+ "V_INTERP_P2_F32",
|
|
+ []> {
|
|
+
|
|
+ let Constraints = "$src0 = $dst";
|
|
+ let DisableEncoding = "$src0,$m0";
|
|
+
|
|
+}
|
|
+
|
|
+def V_INTERP_MOV_F32 : VINTRP <
|
|
+ 0x00000002,
|
|
+ (outs VReg_32:$dst),
|
|
+ (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
|
|
+ "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr",
|
|
+ []> {
|
|
+ let DisableEncoding = "$m0";
|
|
+}
|
|
+
|
|
+//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
|
|
+
|
|
+let isTerminator = 1 in {
|
|
+
|
|
+def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
|
|
+ [(IL_retflag)]> {
|
|
+ let SIMM16 = 0;
|
|
+ let isBarrier = 1;
|
|
+ let hasCtrlDep = 1;
|
|
+}
|
|
+
|
|
+let isBranch = 1 in {
|
|
+def S_BRANCH : SOPP <
|
|
+ 0x00000002, (ins brtarget:$target), "S_BRANCH",
|
|
+ [(br bb:$target)]> {
|
|
+ let isBarrier = 1;
|
|
+}
|
|
+
|
|
+let DisableEncoding = "$scc" in {
|
|
+def S_CBRANCH_SCC0 : SOPP <
|
|
+ 0x00000004, (ins brtarget:$target, SCCReg:$scc),
|
|
+ "S_CBRANCH_SCC0", []
|
|
+>;
|
|
+def S_CBRANCH_SCC1 : SOPP <
|
|
+ 0x00000005, (ins brtarget:$target, SCCReg:$scc),
|
|
+ "S_CBRANCH_SCC1",
|
|
+ []
|
|
+>;
|
|
+} // End DisableEncoding = "$scc"
|
|
+
|
|
+def S_CBRANCH_VCCZ : SOPP <
|
|
+ 0x00000006, (ins brtarget:$target, VCCReg:$vcc),
|
|
+ "S_CBRANCH_VCCZ",
|
|
+ []
|
|
+>;
|
|
+def S_CBRANCH_VCCNZ : SOPP <
|
|
+ 0x00000007, (ins brtarget:$target, VCCReg:$vcc),
|
|
+ "S_CBRANCH_VCCNZ",
|
|
+ []
|
|
+>;
|
|
+
|
|
+let DisableEncoding = "$exec" in {
|
|
+def S_CBRANCH_EXECZ : SOPP <
|
|
+ 0x00000008, (ins brtarget:$target, EXECReg:$exec),
|
|
+ "S_CBRANCH_EXECZ",
|
|
+ []
|
|
+>;
|
|
+def S_CBRANCH_EXECNZ : SOPP <
|
|
+ 0x00000009, (ins brtarget:$target, EXECReg:$exec),
|
|
+ "S_CBRANCH_EXECNZ",
|
|
+ []
|
|
+>;
|
|
+} // End DisableEncoding = "$exec"
|
|
+
|
|
+
|
|
+} // End isBranch = 1
|
|
+} // End isTerminator = 1
|
|
+
|
|
+//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>;
|
|
+let hasSideEffects = 1 in {
|
|
+def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16",
|
|
+ []
|
|
+>;
|
|
+} // End hasSideEffects
|
|
+//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
|
|
+//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
|
|
+//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
|
|
+//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>;
|
|
+//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
|
|
+//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
|
|
+//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
|
|
+//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
|
|
+//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
|
|
+//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
|
|
+
|
|
+def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
|
|
+ (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32",
|
|
+ []
|
|
+> {
|
|
+ let DisableEncoding = "$vcc";
|
|
+}
|
|
+
|
|
+def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
|
|
+ (ins VReg_32:$src0, VReg_32:$src1, SReg_64:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
|
|
+ "V_CNDMASK_B32_e64",
|
|
+ [(set (i32 VReg_32:$dst), (select (i1 SReg_64:$src2), VReg_32:$src1, VReg_32:$src0))]
|
|
+>;
|
|
+
|
|
+// f32 pattern for V_CNDMASK_B32_e64
|
|
+def : Pat <
|
|
+ (f32 (select (i1 SReg_64:$src2), VReg_32:$src1, VReg_32:$src0)),
|
|
+ (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_64:$src2)
|
|
+>;
|
|
+
|
|
+defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
|
|
+defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
|
|
+
|
|
+defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>;
|
|
+def : Pat <
|
|
+ (f32 (fadd VSrc_32:$src0, VReg_32:$src1)),
|
|
+ (V_ADD_F32_e32 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+
|
|
+defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>;
|
|
+def : Pat <
|
|
+ (f32 (fsub VSrc_32:$src0, VReg_32:$src1)),
|
|
+ (V_SUB_F32_e32 VSrc_32:$src0, VReg_32:$src1)
|
|
+>;
|
|
+defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>;
|
|
+defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
|
|
+defm V_MUL_LEGACY_F32 : VOP2_32 <
|
|
+ 0x00000007, "V_MUL_LEGACY_F32",
|
|
+ [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+
|
|
+defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
|
|
+ [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>;
|
|
+//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
|
|
+//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>;
|
|
+//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
|
|
+defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
|
|
+ [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+
|
|
+defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
|
|
+ [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
|
|
+defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
|
|
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
|
|
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
|
|
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
|
|
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
|
|
+defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
|
|
+defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
|
|
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
|
|
+defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
|
|
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
|
|
+defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
|
|
+defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
|
|
+ [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
|
|
+ [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
|
|
+ [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>;
|
|
+defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
|
|
+defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
|
|
+defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
|
|
+//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
|
|
+//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
|
|
+//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
|
|
+let Defs = [VCC] in { // Carry-out goes to VCC
|
|
+defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32",
|
|
+ [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
|
|
+>;
|
|
+defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32",
|
|
+ [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
|
|
+>;
|
|
+} // End Defs = [VCC]
|
|
+defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>;
|
|
+defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>;
|
|
+defm V_SUBB_U32 : VOP2_32 <0x00000029, "V_SUBB_U32", []>;
|
|
+defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>;
|
|
+defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
|
|
+////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
|
|
+////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
|
|
+////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
|
|
+defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
|
|
+ [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))]
|
|
+>;
|
|
+////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
|
|
+////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
|
|
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>;
|
|
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>;
|
|
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>;
|
|
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>;
|
|
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>;
|
|
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>;
|
|
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>;
|
|
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>;
|
|
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>;
|
|
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>;
|
|
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>;
|
|
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>;
|
|
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
|
|
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
|
|
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
|
|
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
|
|
+//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
|
|
+
|
|
+let neverHasSideEffects = 1 in {
|
|
+
|
|
+def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
|
|
+def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
|
|
+//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>;
|
|
+//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>;
|
|
+
|
|
+} // End neverHasSideEffects
|
|
+def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
|
|
+def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
|
|
+def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
|
|
+def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
|
|
+def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
|
|
+def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
|
|
+def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
|
|
+def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
|
|
+def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
|
|
+//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
|
|
+def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
|
|
+def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
|
|
+def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
|
|
+////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
|
|
+////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
|
|
+////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
|
|
+////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
|
|
+////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
|
|
+////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
|
|
+////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
|
|
+////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
|
|
+////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
|
|
+//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
|
|
+//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
|
|
+//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
|
|
+def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
|
|
+////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
|
|
+def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
|
|
+def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
|
|
+def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
|
|
+def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
|
|
+def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
|
|
+def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
|
|
+def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
|
|
+def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
|
|
+def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
|
|
+def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
|
|
+def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
|
|
+def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
|
|
+def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
|
|
+def : Pat <
|
|
+ (mul VSrc_32:$src0, VReg_32:$src1),
|
|
+ (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (IMPLICIT_DEF), 0, 0, 0, 0)
|
|
+>;
|
|
+def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
|
|
+def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
|
|
+def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
|
|
+def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
|
|
+def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
|
|
+//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
|
|
+//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
|
|
+//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
|
|
+def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
|
|
+def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
|
|
+def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
|
|
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
|
|
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
|
|
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
|
|
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
|
|
+def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
|
|
+def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
|
|
+def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
|
|
+def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>;
|
|
+
|
|
+def S_CSELECT_B32 : SOP2 <
|
|
+ 0x0000000a, (outs SReg_32:$dst),
|
|
+ (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
|
|
+ [(set (i32 SReg_32:$dst), (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1))]
|
|
+>;
|
|
+
|
|
+def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
|
|
+
|
|
+// f32 pattern for S_CSELECT_B32
|
|
+def : Pat <
|
|
+ (f32 (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1)),
|
|
+ (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc)
|
|
+>;
|
|
+
|
|
+def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>;
|
|
+
|
|
+def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
|
|
+ [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))]
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (i1 (and SSrc_64:$src0, SSrc_64:$src1)),
|
|
+ (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1)
|
|
+>;
|
|
+
|
|
+def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>;
|
|
+def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>;
|
|
+def : Pat <
|
|
+ (i1 (or SSrc_64:$src0, SSrc_64:$src1)),
|
|
+ (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1)
|
|
+>;
|
|
+def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
|
|
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
|
|
+def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
|
|
+def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
|
|
+def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
|
|
+def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
|
|
+def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
|
|
+def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
|
|
+def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
|
|
+def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
|
|
+def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
|
|
+def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
|
|
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
|
|
+def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
|
|
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
|
|
+def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
|
|
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
|
|
+def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
|
|
+def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
|
|
+def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
|
|
+def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
|
|
+def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
|
|
+def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
|
|
+def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
|
|
+def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
|
|
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
|
|
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
|
|
+
|
|
+let isCodeGenOnly = 1, isPseudo = 1 in {
|
|
+
|
|
+def SET_M0 : InstSI <
|
|
+ (outs SReg_32:$dst),
|
|
+ (ins i32imm:$src0),
|
|
+ "SET_M0",
|
|
+ [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))]
|
|
+>;
|
|
+
|
|
+def LOAD_CONST : AMDGPUShaderInst <
|
|
+ (outs GPRF32:$dst),
|
|
+ (ins i32imm:$src),
|
|
+ "LOAD_CONST $dst, $src",
|
|
+ [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
|
|
+>;
|
|
+
|
|
+let usesCustomInserter = 1 in {
|
|
+
|
|
+def SI_INTERP : InstSI <
|
|
+ (outs VReg_32:$dst),
|
|
+ (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
|
|
+ "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params",
|
|
+ []
|
|
+>;
|
|
+
|
|
+def SI_WQM : InstSI <
|
|
+ (outs),
|
|
+ (ins),
|
|
+ "SI_WQM",
|
|
+ [(int_SI_wqm)]
|
|
+>;
|
|
+
|
|
+} // end usesCustomInserter
|
|
+
|
|
+// SI Pseudo instructions. These are used by the CFG structurizer pass
|
|
+// and should be lowered to ISA instructions prior to codegen.
|
|
+
|
|
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
|
|
+ Uses = [EXEC], Defs = [EXEC] in {
|
|
+
|
|
+let isBranch = 1, isTerminator = 1 in {
|
|
+
|
|
+def SI_IF : InstSI <
|
|
+ (outs SReg_64:$dst),
|
|
+ (ins SReg_64:$vcc, brtarget:$target),
|
|
+ "SI_IF",
|
|
+ [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))]
|
|
+>;
|
|
+
|
|
+def SI_ELSE : InstSI <
|
|
+ (outs SReg_64:$dst),
|
|
+ (ins SReg_64:$src, brtarget:$target),
|
|
+ "SI_ELSE",
|
|
+ [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> {
|
|
+
|
|
+ let Constraints = "$src = $dst";
|
|
+}
|
|
+
|
|
+def SI_LOOP : InstSI <
|
|
+ (outs),
|
|
+ (ins SReg_64:$saved, brtarget:$target),
|
|
+ "SI_LOOP",
|
|
+ [(int_SI_loop SReg_64:$saved, bb:$target)]
|
|
+>;
|
|
+
|
|
+} // end isBranch = 1, isTerminator = 1
|
|
+
|
|
+def SI_BREAK : InstSI <
|
|
+ (outs SReg_64:$dst),
|
|
+ (ins SReg_64:$src),
|
|
+ "SI_ELSE",
|
|
+ [(set SReg_64:$dst, (int_SI_break SReg_64:$src))]
|
|
+>;
|
|
+
|
|
+def SI_IF_BREAK : InstSI <
|
|
+ (outs SReg_64:$dst),
|
|
+ (ins SReg_64:$vcc, SReg_64:$src),
|
|
+ "SI_IF_BREAK",
|
|
+ [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))]
|
|
+>;
|
|
+
|
|
+def SI_ELSE_BREAK : InstSI <
|
|
+ (outs SReg_64:$dst),
|
|
+ (ins SReg_64:$src0, SReg_64:$src1),
|
|
+ "SI_ELSE_BREAK",
|
|
+ [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))]
|
|
+>;
|
|
+
|
|
+def SI_END_CF : InstSI <
|
|
+ (outs),
|
|
+ (ins SReg_64:$saved),
|
|
+ "SI_END_CF",
|
|
+ [(int_SI_end_cf SReg_64:$saved)]
|
|
+>;
|
|
+
|
|
+def SI_KILL : InstSI <
|
|
+ (outs),
|
|
+ (ins VReg_32:$src),
|
|
+ "SI_KIL $src",
|
|
+ [(int_AMDGPU_kill VReg_32:$src)]
|
|
+>;
|
|
+
|
|
+} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
|
|
+ // Uses = [EXEC], Defs = [EXEC]
|
|
+
|
|
+} // end isCodeGenOnly, isPseudo
|
|
+
|
|
+def : Pat<
|
|
+ (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
|
|
+ (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0))
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_AMDGPU_kilp),
|
|
+ (SI_KILL (V_MOV_B32_e32 0xbf800000))
|
|
+>;
|
|
+
|
|
+/* int_SI_vs_load_input */
|
|
+def : Pat<
|
|
+ (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset,
|
|
+ VReg_32:$buf_idx_vgpr),
|
|
+ (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
|
|
+ VReg_32:$buf_idx_vgpr, SReg_128:$tlst,
|
|
+ 0, 0, 0)
|
|
+>;
|
|
+
|
|
+/* int_SI_export */
|
|
+def : Pat <
|
|
+ (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
|
|
+ VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
|
|
+ (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
|
|
+ VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
|
|
+>;
|
|
+
|
|
+
|
|
+/* int_SI_sample for simple 1D texture lookup */
|
|
+def : Pat <
|
|
+ (int_SI_sample imm:$writemask, (v1i32 VReg_32:$addr),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler, imm),
|
|
+ (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0,
|
|
+ (i32 (COPY_TO_REGCLASS VReg_32:$addr, VReg_32)),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler)
|
|
+>;
|
|
+
|
|
+class SamplePattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
|
|
+ ValueType addr_type> : Pat <
|
|
+ (name imm:$writemask, (addr_type addr_class:$addr),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler, imm),
|
|
+ (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0,
|
|
+ (EXTRACT_SUBREG addr_class:$addr, sub0),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler)
|
|
+>;
|
|
+
|
|
+class SampleRectPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
|
|
+ ValueType addr_type> : Pat <
|
|
+ (name imm:$writemask, (addr_type addr_class:$addr),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT),
|
|
+ (opcode imm:$writemask, 1, 0, 0, 0, 0, 0, 0,
|
|
+ (EXTRACT_SUBREG addr_class:$addr, sub0),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler)
|
|
+>;
|
|
+
|
|
+class SampleArrayPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
|
|
+ ValueType addr_type> : Pat <
|
|
+ (name imm:$writemask, (addr_type addr_class:$addr),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler, TEX_ARRAY),
|
|
+ (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0,
|
|
+ (EXTRACT_SUBREG addr_class:$addr, sub0),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler)
|
|
+>;
|
|
+
|
|
+class SampleShadowPattern<Intrinsic name, MIMG opcode,
|
|
+ RegisterClass addr_class, ValueType addr_type> : Pat <
|
|
+ (name imm:$writemask, (addr_type addr_class:$addr),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW),
|
|
+ (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0,
|
|
+ (EXTRACT_SUBREG addr_class:$addr, sub0),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler)
|
|
+>;
|
|
+
|
|
+class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
|
|
+ RegisterClass addr_class, ValueType addr_type> : Pat <
|
|
+ (name imm:$writemask, (addr_type addr_class:$addr),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW_ARRAY),
|
|
+ (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0,
|
|
+ (EXTRACT_SUBREG addr_class:$addr, sub0),
|
|
+ SReg_256:$rsrc, SReg_128:$sampler)
|
|
+>;
|
|
+
|
|
+/* int_SI_sample* for texture lookups consuming more address parameters */
|
|
+multiclass SamplePatterns<RegisterClass addr_class, ValueType addr_type> {
|
|
+ def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
|
|
+ def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
|
|
+ def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
|
|
+ def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
|
|
+ def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
|
|
+
|
|
+ def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
|
|
+ def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
|
|
+ def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
|
|
+ def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
|
|
+
|
|
+ def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
|
|
+ def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
|
|
+ def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
|
|
+ def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
|
|
+}
|
|
+
|
|
+defm : SamplePatterns<VReg_64, v2i32>;
|
|
+defm : SamplePatterns<VReg_128, v4i32>;
|
|
+defm : SamplePatterns<VReg_256, v8i32>;
|
|
+defm : SamplePatterns<VReg_512, v16i32>;
|
|
+
|
|
+def CLAMP_SI : CLAMP<VReg_32>;
|
|
+def FABS_SI : FABS<VReg_32>;
|
|
+def FNEG_SI : FNEG<VReg_32>;
|
|
+
|
|
+def : Extract_Element <f32, v4f32, VReg_128, 0, sub0>;
|
|
+def : Extract_Element <f32, v4f32, VReg_128, 1, sub1>;
|
|
+def : Extract_Element <f32, v4f32, VReg_128, 2, sub2>;
|
|
+def : Extract_Element <f32, v4f32, VReg_128, 3, sub3>;
|
|
+
|
|
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sub0>;
|
|
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sub1>;
|
|
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sub2>;
|
|
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sub3>;
|
|
+
|
|
+def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>;
|
|
+def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>;
|
|
+def : Vector_Build <v4f32, VReg_128, f32, VReg_32>;
|
|
+def : Vector_Build <v4i32, VReg_128, i32, VReg_32>;
|
|
+def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>;
|
|
+def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>;
|
|
+
|
|
+def : BitConvert <i32, f32, SReg_32>;
|
|
+def : BitConvert <i32, f32, VReg_32>;
|
|
+
|
|
+def : BitConvert <f32, i32, SReg_32>;
|
|
+def : BitConvert <f32, i32, VReg_32>;
|
|
+
|
|
+/********** ================== **********/
|
|
+/********** Immediate Patterns **********/
|
|
+/********** ================== **********/
|
|
+
|
|
+def : Pat <
|
|
+ (i1 imm:$imm),
|
|
+ (S_MOV_B64 imm:$imm)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (i32 imm:$imm),
|
|
+ (V_MOV_B32_e32 imm:$imm)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (f32 fpimm:$imm),
|
|
+ (V_MOV_B32_e32 fpimm:$imm)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (i32 imm:$imm),
|
|
+ (S_MOV_B32 imm:$imm)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (f32 fpimm:$imm),
|
|
+ (S_MOV_B32 fpimm:$imm)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (i64 InlineImm<i64>:$imm),
|
|
+ (S_MOV_B64 InlineImm<i64>:$imm)
|
|
+>;
|
|
+
|
|
+// i64 immediates aren't supported in hardware, so split them into two 32-bit values
|
|
+def : Pat <
|
|
+ (i64 imm:$imm),
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
|
|
+ (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0),
|
|
+ (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1)
|
|
+>;
|
|
+
|
|
+/********** ===================== **********/
|
|
+/********** Interpolation Patterns **********/
|
|
+/********** ===================== **********/
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_interp_constant imm:$attr_chan, imm:$attr, SReg_32:$params),
|
|
+ (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr,
|
|
+ (S_MOV_B32 SReg_32:$params))
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params),
|
|
+ (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan,
|
|
+ imm:$attr, SReg_32:$params)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
|
|
+ (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan,
|
|
+ imm:$attr, SReg_32:$params)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params),
|
|
+ (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan,
|
|
+ imm:$attr, SReg_32:$params)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
|
|
+ (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan,
|
|
+ imm:$attr, SReg_32:$params)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_read_face),
|
|
+ (f32 FRONT_FACE)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_read_pos 0),
|
|
+ (f32 POS_X_FLOAT)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_read_pos 1),
|
|
+ (f32 POS_Y_FLOAT)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_read_pos 2),
|
|
+ (f32 POS_Z_FLOAT)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_SI_fs_read_pos 3),
|
|
+ (f32 POS_W_FLOAT)
|
|
+>;
|
|
+
|
|
+/********** ================== **********/
|
|
+/********** Intrinsic Patterns **********/
|
|
+/********** ================== **********/
|
|
+
|
|
+/* llvm.AMDGPU.pow */
|
|
+/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? */
|
|
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1),
|
|
+ (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1))
|
|
+>;
|
|
+
|
|
+def : Pat<
|
|
+ (fdiv VSrc_32:$src0, VSrc_32:$src1),
|
|
+ (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1))
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (fcos VSrc_32:$src0),
|
|
+ (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (fsin VSrc_32:$src0),
|
|
+ (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (int_AMDGPU_cube VReg_128:$src),
|
|
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
|
|
+ (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub1),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub2),
|
|
+ 0, 0, 0, 0), sub0),
|
|
+ (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub1),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub2),
|
|
+ 0, 0, 0, 0), sub1),
|
|
+ (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub1),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub2),
|
|
+ 0, 0, 0, 0), sub2),
|
|
+ (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub1),
|
|
+ (EXTRACT_SUBREG VReg_128:$src, sub2),
|
|
+ 0, 0, 0, 0), sub3)
|
|
+>;
|
|
+
|
|
+def : Pat <
|
|
+ (i32 (sext (i1 SReg_64:$src0))),
|
|
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
|
|
+>;
|
|
+
|
|
+/********** ================== **********/
|
|
+/********** VOP3 Patterns **********/
|
|
+/********** ================== **********/
|
|
+
|
|
+def : Pat <(f32 (IL_mad VSrc_32:$src0, VReg_32:$src1, VReg_32:$src2)),
|
|
+ (V_MAD_LEGACY_F32 VSrc_32:$src0, VReg_32:$src1, VReg_32:$src2,
|
|
+ 0, 0, 0, 0)>;
|
|
+
|
|
+/********** ================== **********/
|
|
+/********** SMRD Patterns **********/
|
|
+/********** ================== **********/
|
|
+
|
|
+multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
|
|
+  // 1. Offset as an 8-bit DWORD immediate
|
|
+ def : Pat <
|
|
+ (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)),
|
|
+ (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset))
|
|
+ >;
|
|
+
|
|
+  // 2. Offset loaded in a 32-bit SGPR
|
|
+ def : Pat <
|
|
+ (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)),
|
|
+ (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset)))
|
|
+ >;
|
|
+
|
|
+ // 3. No offset at all
|
|
+ def : Pat <
|
|
+ (constant_load SReg_64:$sbase),
|
|
+ (vt (Instr_IMM SReg_64:$sbase, 0))
|
|
+ >;
|
|
+}
|
|
+
|
|
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
|
|
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
|
|
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
|
|
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
|
|
+
|
|
+} // End isSI predicate
|
|
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
|
|
new file mode 100644
|
|
index 0000000..611b9c4
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIIntrinsics.td
|
|
@@ -0,0 +1,54 @@
|
|
+//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// SI Intrinsic Definitions
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+let TargetPrefix = "SI", isTarget = 1 in {
|
|
+
|
|
+ def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
|
+ def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
|
|
+  /* XXX: We may need a separate intrinsic here for loading integer values */
|
|
+ def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
|
|
+ def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
|
|
+ def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
|
|
+ def int_SI_wqm : Intrinsic <[], [], []>;
|
|
+
|
|
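+  // Operand order shared by all sample intrinsics: writemask, address
+  // vector (any width), texture resource descriptor (v8i32), sampler
+  // descriptor (v4i32), and an i32 texture-type selector that the
+  // instruction patterns match against (e.g. TEX_RECT, TEX_ARRAY).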
+ class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>;
|
|
+
|
|
+ def int_SI_sample : Sample;
|
|
+ def int_SI_sampleb : Sample;
|
|
+ def int_SI_samplel : Sample;
|
|
+
|
|
+ /* Interpolation Intrinsics */
|
|
+
|
|
+ def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>;
|
|
+ class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
|
|
+
|
|
+ def int_SI_fs_interp_linear_center : Interp;
|
|
+ def int_SI_fs_interp_linear_centroid : Interp;
|
|
+ def int_SI_fs_interp_persp_center : Interp;
|
|
+ def int_SI_fs_interp_persp_centroid : Interp;
|
|
+ def int_SI_fs_interp_constant : Interp;
|
|
+
|
|
+ def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>;
|
|
+ def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
|
+
|
|
+ /* Control flow Intrinsics */
|
|
+
|
|
+ def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
|
|
+ def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
|
|
+ def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
|
|
+ def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
|
|
+ def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
|
|
+ def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
|
|
+ def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
|
|
+}
|
|
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
|
|
new file mode 100644
|
|
index 0000000..2007d30
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SILowerControlFlow.cpp
|
|
@@ -0,0 +1,372 @@
|
|
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief This pass lowers the pseudo control flow instructions to real
|
|
+/// machine instructions.
|
|
+///
|
|
+/// All control flow is handled using predicated instructions and
|
|
+/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
|
|
+/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
|
|
+/// by writing to the 64-bit EXEC register (each bit corresponds to a
|
|
+/// single vector ALU). Typically, for predicates, a vector ALU will write
|
|
+/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
|
|
+/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
|
|
+/// EXEC to update the predicates.
|
|
+///
|
|
+/// For example:
|
|
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
|
|
+/// %SGPR0 = SI_IF %VCC
|
|
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
|
|
+/// %SGPR0 = SI_ELSE %SGPR0
|
|
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
|
|
+/// SI_END_CF %SGPR0
|
|
+///
|
|
+/// becomes:
|
|
+///
|
|
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
|
|
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
|
|
+/// S_CBRANCH_EXECZ label0 // This instruction is an optional
|
|
+/// // optimization which allows us to
|
|
+/// // branch if all the bits of
|
|
+/// // EXEC are zero.
|
|
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
|
|
+///
|
|
+/// label0:
|
|
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
|
|
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
|
|
+/// S_CBRANCH_EXECZ label1            // Use our branch optimization
|
|
+/// // instruction again.
|
|
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
|
|
+/// label1:
|
|
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "SIInstrInfo.h"
|
|
+#include "SIMachineFunctionInfo.h"
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
+#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
+#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+namespace {
|
|
+
|
|
+class SILowerControlFlowPass : public MachineFunctionPass {
|
|
+
|
|
+private:
|
|
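+  // Branch around a region only when it contains at least this many
+  // instructions; shorter regions are cheaper to run with EXEC masked off
+  // than to branch over.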
+ static const unsigned SkipThreshold = 12;
|
|
+
|
|
+ static char ID;
|
|
+ const TargetInstrInfo *TII;
|
|
+
|
|
+ bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
|
|
+
|
|
+ void Skip(MachineInstr &From, MachineOperand &To);
|
|
+ void SkipIfDead(MachineInstr &MI);
|
|
+
|
|
+ void If(MachineInstr &MI);
|
|
+ void Else(MachineInstr &MI);
|
|
+ void Break(MachineInstr &MI);
|
|
+ void IfBreak(MachineInstr &MI);
|
|
+ void ElseBreak(MachineInstr &MI);
|
|
+ void Loop(MachineInstr &MI);
|
|
+ void EndCf(MachineInstr &MI);
|
|
+
|
|
+ void Kill(MachineInstr &MI);
|
|
+ void Branch(MachineInstr &MI);
|
|
+
|
|
+public:
|
|
+ SILowerControlFlowPass(TargetMachine &tm) :
|
|
+ MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
|
|
+
|
|
+ virtual bool runOnMachineFunction(MachineFunction &MF);
|
|
+
|
|
+ const char *getPassName() const {
|
|
+ return "SI Lower control flow instructions";
|
|
+ }
|
|
+
|
|
+};
|
|
+
|
|
+} // End anonymous namespace
|
|
+
|
|
+char SILowerControlFlowPass::ID = 0;
|
|
+
|
|
+FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
|
|
+ return new SILowerControlFlowPass(tm);
|
|
+}
|
|
+
|
|
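+/// \brief Follow the chain of first successors from \p From towards \p To,
+/// counting instructions (a bundle counts once); returns true as soon as
+/// SkipThreshold is reached, i.e. when a skip branch is worth emitting.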
+bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
|
|
+ MachineBasicBlock *To) {
|
|
+
|
|
+ unsigned NumInstr = 0;
|
|
+
|
|
+ for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
|
|
+ MBB = *MBB->succ_begin()) {
|
|
+
|
|
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
|
|
+ NumInstr < SkipThreshold && I != E; ++I) {
|
|
+
|
|
+ if (I->isBundle() || !I->isBundled())
|
|
+ if (++NumInstr >= SkipThreshold)
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
|
|
+
|
|
+ if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
|
|
+ return;
|
|
+
|
|
+ DebugLoc DL = From.getDebugLoc();
|
|
+ BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
|
|
+ .addOperand(To)
|
|
+ .addReg(AMDGPU::EXEC);
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
|
|
+
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+
|
|
+ if (!shouldSkip(&MBB, &MBB.getParent()->back()))
|
|
+ return;
|
|
+
|
|
+ MachineBasicBlock::iterator Insert = &MI;
|
|
+ ++Insert;
|
|
+
|
|
+ // If the exec mask is non-zero, skip the next two instructions
|
|
+ BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
|
|
+ .addImm(3)
|
|
+ .addReg(AMDGPU::EXEC);
|
|
+
|
|
+ // Exec mask is zero: Export to NULL target...
|
|
+ BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
|
|
+ .addImm(0)
|
|
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
|
|
+ .addImm(0)
|
|
+ .addImm(1)
|
|
+ .addImm(1)
|
|
+ .addReg(AMDGPU::VGPR0)
|
|
+ .addReg(AMDGPU::VGPR0)
|
|
+ .addReg(AMDGPU::VGPR0)
|
|
+ .addReg(AMDGPU::VGPR0);
|
|
+
|
|
+ // ... and terminate wavefront
|
|
+ BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::If(MachineInstr &MI) {
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+ unsigned Reg = MI.getOperand(0).getReg();
|
|
+ unsigned Vcc = MI.getOperand(1).getReg();
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
|
|
+ .addReg(Vcc);
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
|
|
+ .addReg(AMDGPU::EXEC)
|
|
+ .addReg(Reg);
|
|
+
|
|
+ Skip(MI, MI.getOperand(2));
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::Else(MachineInstr &MI) {
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+ unsigned Dst = MI.getOperand(0).getReg();
|
|
+ unsigned Src = MI.getOperand(1).getReg();
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
|
|
+ .addReg(Src); // Saved EXEC
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
|
|
+ .addReg(AMDGPU::EXEC)
|
|
+ .addReg(Dst);
|
|
+
|
|
+ Skip(MI, MI.getOperand(2));
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::Break(MachineInstr &MI) {
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+
|
|
+ unsigned Dst = MI.getOperand(0).getReg();
|
|
+ unsigned Src = MI.getOperand(1).getReg();
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
|
|
+ .addReg(AMDGPU::EXEC)
|
|
+ .addReg(Src);
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+
|
|
+ unsigned Dst = MI.getOperand(0).getReg();
|
|
+ unsigned Vcc = MI.getOperand(1).getReg();
|
|
+ unsigned Src = MI.getOperand(2).getReg();
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
|
|
+ .addReg(Vcc)
|
|
+ .addReg(Src);
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+
|
|
+ unsigned Dst = MI.getOperand(0).getReg();
|
|
+ unsigned Saved = MI.getOperand(1).getReg();
|
|
+ unsigned Src = MI.getOperand(2).getReg();
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
|
|
+ .addReg(Saved)
|
|
+ .addReg(Src);
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::Loop(MachineInstr &MI) {
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+ unsigned Src = MI.getOperand(0).getReg();
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
|
|
+ .addReg(AMDGPU::EXEC)
|
|
+ .addReg(Src);
|
|
+
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
|
|
+ .addOperand(MI.getOperand(1))
|
|
+ .addReg(AMDGPU::EXEC);
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+ unsigned Reg = MI.getOperand(0).getReg();
|
|
+
|
|
+ BuildMI(MBB, MBB.getFirstNonPHI(), DL,
|
|
+ TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
|
|
+ .addReg(AMDGPU::EXEC)
|
|
+ .addReg(Reg);
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::Branch(MachineInstr &MI) {
|
|
+ MachineBasicBlock *Next = MI.getParent()->getNextNode();
|
|
+ MachineBasicBlock *Target = MI.getOperand(0).getMBB();
|
|
+ if (Target == Next)
|
|
+ MI.eraseFromParent();
|
|
+ else
|
|
+ assert(0);
|
|
+}
|
|
+
|
|
+void SILowerControlFlowPass::Kill(MachineInstr &MI) {
|
|
+
|
|
+ MachineBasicBlock &MBB = *MI.getParent();
|
|
+ DebugLoc DL = MI.getDebugLoc();
|
|
+
|
|
+ // Kill is only allowed in pixel shaders
|
|
+ MachineFunction &MF = *MBB.getParent();
|
|
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
|
+ assert(Info->ShaderType == ShaderType::PIXEL);
|
|
+
|
|
+ // Clear this pixel from the exec mask if the operand is negative
|
|
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
|
|
+ .addImm(0)
|
|
+ .addOperand(MI.getOperand(0));
|
|
+
|
|
+ MI.eraseFromParent();
|
|
+}
|
|
+
|
|
+bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
|
|
+
|
|
+ bool HaveKill = false;
|
|
+ unsigned Depth = 0;
|
|
+
|
|
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
|
|
+ BI != BE; ++BI) {
|
|
+
|
|
+ MachineBasicBlock &MBB = *BI;
|
|
+ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
|
|
+ I != MBB.end(); I = Next) {
|
|
+
|
|
+ Next = llvm::next(I);
|
|
+ MachineInstr &MI = *I;
|
|
+ switch (MI.getOpcode()) {
|
|
+ default: break;
|
|
+ case AMDGPU::SI_IF:
|
|
+ ++Depth;
|
|
+ If(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_ELSE:
|
|
+ Else(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_BREAK:
|
|
+ Break(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_IF_BREAK:
|
|
+ IfBreak(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_ELSE_BREAK:
|
|
+ ElseBreak(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_LOOP:
|
|
+ ++Depth;
|
|
+ Loop(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_END_CF:
|
|
+ if (--Depth == 0 && HaveKill) {
|
|
+ SkipIfDead(MI);
|
|
+ HaveKill = false;
|
|
+ }
|
|
+ EndCf(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::SI_KILL:
|
|
+ if (Depth == 0)
|
|
+ SkipIfDead(MI);
|
|
+ else
|
|
+ HaveKill = true;
|
|
+ Kill(MI);
|
|
+ break;
|
|
+
|
|
+ case AMDGPU::S_BRANCH:
|
|
+ Branch(MI);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..7e59b42
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
|
|
@@ -0,0 +1,20 @@
|
|
+//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+/// \file
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+#include "SIMachineFunctionInfo.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|
+ : MachineFunctionInfo(),
|
|
+ SPIPSInputAddr(0),
|
|
+ ShaderType(0)
|
|
+ { }
|
|
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
|
|
new file mode 100644
|
|
index 0000000..47271f5
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
|
|
@@ -0,0 +1,34 @@
|
|
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+#ifndef SIMACHINEFUNCTIONINFO_H_
|
|
+#define SIMACHINEFUNCTIONINFO_H_
|
|
+
|
|
+#include "llvm/CodeGen/MachineFunction.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
|
|
+/// tells the hardware which interpolation parameters to load.
|
|
+class SIMachineFunctionInfo : public MachineFunctionInfo {
|
|
+public:
|
|
+ SIMachineFunctionInfo(const MachineFunction &MF);
|
|
+ unsigned SPIPSInputAddr;
|
|
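+  // Shader stage this function was compiled for (e.g. ShaderType::PIXEL);
+  // checked by passes such as SILowerControlFlow when handling KILL.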
+ unsigned ShaderType;
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+
|
|
+#endif // SIMACHINEFUNCTIONINFO_H_
|
|
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..88275c5
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIRegisterInfo.cpp
|
|
@@ -0,0 +1,48 @@
|
|
+//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief SI implementation of the TargetRegisterInfo class.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+#include "SIRegisterInfo.h"
|
|
+#include "AMDGPUTargetMachine.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm,
|
|
+ const TargetInstrInfo &tii)
|
|
+: AMDGPURegisterInfo(tm, tii),
|
|
+ TM(tm),
|
|
+ TII(tii)
|
|
+ { }
|
|
+
|
|
+BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
|
+ BitVector Reserved(getNumRegs());
|
|
+ return Reserved;
|
|
+}
|
|
+
|
|
+const TargetRegisterClass *
|
|
+SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
|
|
+ switch (rc->getID()) {
|
|
+ case AMDGPU::GPRF32RegClassID:
|
|
+ return &AMDGPU::VReg_32RegClass;
|
|
+ default: return rc;
|
|
+ }
|
|
+}
|
|
+
|
|
+const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
|
|
+ MVT VT) const {
|
|
+ switch(VT.SimpleTy) {
|
|
+ default:
|
|
+ case MVT::i32: return &AMDGPU::VReg_32RegClass;
|
|
+ }
|
|
+}
|
|
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
|
|
new file mode 100644
|
|
index 0000000..40171e4
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIRegisterInfo.h
|
|
@@ -0,0 +1,47 @@
|
|
+//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+/// \brief Interface definition for SIRegisterInfo
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+#ifndef SIREGISTERINFO_H_
|
|
+#define SIREGISTERINFO_H_
|
|
+
|
|
+#include "AMDGPURegisterInfo.h"
|
|
+
|
|
+namespace llvm {
|
|
+
|
|
+class AMDGPUTargetMachine;
|
|
+class TargetInstrInfo;
|
|
+
|
|
+struct SIRegisterInfo : public AMDGPURegisterInfo {
|
|
+ AMDGPUTargetMachine &TM;
|
|
+ const TargetInstrInfo &TII;
|
|
+
|
|
+ SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
|
|
+
|
|
+ virtual BitVector getReservedRegs(const MachineFunction &MF) const;
|
|
+
|
|
+ /// \param RC is an AMDIL reg class.
|
|
+ ///
|
|
+ /// \returns the SI register class that is equivalent to \p RC.
|
|
+ virtual const TargetRegisterClass *
|
|
+ getISARegClass(const TargetRegisterClass *RC) const;
|
|
+
|
|
+ /// \brief get the register class of the specified type to use in the
|
|
+ /// CFGStructurizer
|
|
+ virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
|
|
+};
|
|
+
|
|
+} // End namespace llvm
|
|
+
|
|
+#endif // SIREGISTERINFO_H_
|
|
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
|
|
new file mode 100644
|
|
index 0000000..ab36b87
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SIRegisterInfo.td
|
|
@@ -0,0 +1,190 @@
|
|
+
|
|
+class SIReg <string n, bits<16> encoding = 0> : Register<n> {
|
|
+ let Namespace = "AMDGPU";
|
|
+ let HWEncoding = encoding;
|
|
+}
|
|
+
|
|
+class SI_64 <string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> {
|
|
+ let Namespace = "AMDGPU";
|
|
+ let SubRegIndices = [sub0, sub1];
|
|
+ let HWEncoding = encoding;
|
|
+}
|
|
+
|
|
+class SGPR_32 <bits<16> num, string name> : SIReg<name, num>;
|
|
+
|
|
+class VGPR_32 <bits<16> num, string name> : SIReg<name, num> {
|
|
+ let HWEncoding{8} = 1;
|
|
+}
|
|
+
|
|
+// Special Registers
|
|
+def VCC : SIReg<"VCC", 106>;
|
|
+def EXEC_LO : SIReg <"EXEC LO", 126>;
|
|
+def EXEC_HI : SIReg <"EXEC HI", 127>;
|
|
+def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>;
|
|
+def SCC : SIReg<"SCC", 253>;
|
|
+def M0 : SIReg <"M0", 124>;
|
|
+
|
|
+// Interpolation registers
|
|
+def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
|
|
+def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
|
|
+def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
|
|
+def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
|
|
+def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
|
|
+def PERSP_CENTROID_J : SIReg <"PERSP_CENTROID_J">;
|
|
+def PERSP_I_W : SIReg <"PERSP_I_W">;
|
|
+def PERSP_J_W : SIReg <"PERSP_J_W">;
|
|
+def PERSP_1_W : SIReg <"PERSP_1_W">;
|
|
+def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
|
|
+def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
|
|
+def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
|
|
+def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
|
|
+def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
|
|
+def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
|
|
+def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
|
|
+def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
|
|
+def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
|
|
+def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
|
|
+def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
|
|
+def FRONT_FACE : SIReg <"FRONT_FACE">;
|
|
+def ANCILLARY : SIReg <"ANCILLARY">;
|
|
+def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
|
|
+def POS_FIXED_PT : SIReg <"POS_FIXED_PT">;
|
|
+
|
|
+// SGPR 32-bit registers
|
|
+foreach Index = 0-101 in {
|
|
+ def SGPR#Index : SGPR_32 <Index, "SGPR"#Index>;
|
|
+}
|
|
+
|
|
+def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "SGPR%u", 0, 101))>;
|
|
+
|
|
+// SGPR 64-bit registers
|
|
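+// decimate keeps every second SGPR and rotl shifts the sequence by one, so
+// the tuples formed are the even-aligned pairs (SGPR0,SGPR1), (SGPR2,SGPR3), ...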
+def SGPR_64 : RegisterTuples<[sub0, sub1],
|
|
+ [(add (decimate SGPR_32, 2)),
|
|
+                              (add (decimate (rotl SGPR_32, 1), 2))]>;
|
|
+
|
|
+// SGPR 128-bit registers
|
|
+def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
|
|
+ [(add (decimate SGPR_32, 4)),
|
|
+ (add (decimate (rotl SGPR_32, 1), 4)),
|
|
+ (add (decimate (rotl SGPR_32, 2), 4)),
|
|
+ (add (decimate (rotl SGPR_32, 3), 4))]>;
|
|
+
|
|
+// SGPR 256-bit registers
|
|
+def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
|
|
+ [(add (decimate SGPR_32, 8)),
|
|
+ (add (decimate (rotl SGPR_32, 1), 8)),
|
|
+ (add (decimate (rotl SGPR_32, 2), 8)),
|
|
+ (add (decimate (rotl SGPR_32, 3), 8)),
|
|
+ (add (decimate (rotl SGPR_32, 4), 8)),
|
|
+ (add (decimate (rotl SGPR_32, 5), 8)),
|
|
+ (add (decimate (rotl SGPR_32, 6), 8)),
|
|
+ (add (decimate (rotl SGPR_32, 7), 8))]>;
|
|
+
|
|
+// VGPR 32-bit registers
|
|
+foreach Index = 0-255 in {
|
|
+ def VGPR#Index : VGPR_32 <Index, "VGPR"#Index>;
|
|
+}
|
|
+
|
|
+def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
|
|
+ (add (sequence "VGPR%u", 0, 255))>;
|
|
+
|
|
+// VGPR 64-bit registers
|
|
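+// No decimation here: a 64-bit tuple starts at every VGPR, so consecutive
+// tuples overlap (VGPR0,VGPR1), (VGPR1,VGPR2), ... rather than being aligned
+// like the SGPR pairs above.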
+def VGPR_64 : RegisterTuples<[sub0, sub1],
|
|
+ [(add VGPR_32),
|
|
+ (add (rotl VGPR_32, 1))]>;
|
|
+
|
|
+// VGPR 128-bit registers
|
|
+def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
|
|
+ [(add VGPR_32),
|
|
+ (add (rotl VGPR_32, 1)),
|
|
+ (add (rotl VGPR_32, 2)),
|
|
+ (add (rotl VGPR_32, 3))]>;
|
|
+
|
|
+// VGPR 256-bit registers
|
|
+def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
|
|
+ [(add VGPR_32),
|
|
+ (add (rotl VGPR_32, 1)),
|
|
+ (add (rotl VGPR_32, 2)),
|
|
+ (add (rotl VGPR_32, 3)),
|
|
+ (add (rotl VGPR_32, 4)),
|
|
+ (add (rotl VGPR_32, 5)),
|
|
+ (add (rotl VGPR_32, 6)),
|
|
+ (add (rotl VGPR_32, 7))]>;
|
|
+
|
|
+// VGPR 512-bit registers
|
|
+def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
|
|
+ sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
|
|
+ [(add VGPR_32),
|
|
+ (add (rotl VGPR_32, 1)),
|
|
+ (add (rotl VGPR_32, 2)),
|
|
+ (add (rotl VGPR_32, 3)),
|
|
+ (add (rotl VGPR_32, 4)),
|
|
+ (add (rotl VGPR_32, 5)),
|
|
+ (add (rotl VGPR_32, 6)),
|
|
+ (add (rotl VGPR_32, 7)),
|
|
+ (add (rotl VGPR_32, 8)),
|
|
+ (add (rotl VGPR_32, 9)),
|
|
+ (add (rotl VGPR_32, 10)),
|
|
+ (add (rotl VGPR_32, 11)),
|
|
+ (add (rotl VGPR_32, 12)),
|
|
+ (add (rotl VGPR_32, 13)),
|
|
+ (add (rotl VGPR_32, 14)),
|
|
+ (add (rotl VGPR_32, 15))]>;
|
|
+
|
|
+// Register class for all scalar registers (SGPRs + Special Registers)
|
|
+def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
|
|
+ (add SGPR_32, M0, EXEC_LO, EXEC_HI)
|
|
+>;
|
|
+
|
|
+def SReg_64 : RegisterClass<"AMDGPU", [i1, i64], 64, (add SGPR_64, VCC, EXEC)>;
|
|
+
|
|
+def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>;
|
|
+
|
|
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>;
|
|
+
|
|
+// Register class for all vector registers (VGPRs + Interpolation Registers)
|
|
+def VReg_32 : RegisterClass<"AMDGPU", [f32, i32, v1i32], 32,
|
|
+ (add VGPR_32,
|
|
+ PERSP_SAMPLE_I, PERSP_SAMPLE_J,
|
|
+ PERSP_CENTER_I, PERSP_CENTER_J,
|
|
+ PERSP_CENTROID_I, PERSP_CENTROID_J,
|
|
+ PERSP_I_W, PERSP_J_W, PERSP_1_W,
|
|
+ LINEAR_SAMPLE_I, LINEAR_SAMPLE_J,
|
|
+ LINEAR_CENTER_I, LINEAR_CENTER_J,
|
|
+ LINEAR_CENTROID_I, LINEAR_CENTROID_J,
|
|
+ LINE_STIPPLE_TEX_COORD,
|
|
+ POS_X_FLOAT,
|
|
+ POS_Y_FLOAT,
|
|
+ POS_Z_FLOAT,
|
|
+ POS_W_FLOAT,
|
|
+ FRONT_FACE,
|
|
+ ANCILLARY,
|
|
+ SAMPLE_COVERAGE,
|
|
+ POS_FIXED_PT
|
|
+ )
|
|
+>;
|
|
+
|
|
+def VReg_64 : RegisterClass<"AMDGPU", [i64, v2i32], 64, (add VGPR_64)>;
|
|
+
|
|
+def VReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add VGPR_128)>;
|
|
+
|
|
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add VGPR_256)>;
|
|
+
|
|
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
|
|
+
|
|
+// [SV]Src_* operands can have either an immediate or a register
|
|
+def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
|
|
+
|
|
+def SSrc_64 : RegisterClass<"AMDGPU", [i1, i64], 64, (add SReg_64)>;
|
|
+
|
|
+def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
|
|
+
|
|
+def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add SReg_64, VReg_64)>;
|
|
+
|
|
+// Special register classes for predicates and the M0 register
|
|
+def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>;
|
|
+def VCCReg : RegisterClass<"AMDGPU", [i1], 1, (add VCC)>;
|
|
+def EXECReg : RegisterClass<"AMDGPU", [i1], 1, (add EXEC)>;
|
|
+def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
|
|
+
|
|
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
|
|
new file mode 100644
|
|
index 0000000..28b65b8
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/SISchedule.td
|
|
@@ -0,0 +1,15 @@
|
|
+//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+// TODO: This is just a placeholder for now.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+
|
|
+def SI_Itin : ProcessorItineraries <[], [], []>;
|
|
diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
|
|
new file mode 100644
|
|
index 0000000..46b1f18
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
|
|
@@ -0,0 +1,26 @@
|
|
+//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
|
|
+//
|
|
+// The LLVM Compiler Infrastructure
|
|
+//
|
|
+// This file is distributed under the University of Illinois Open Source
|
|
+// License. See LICENSE.TXT for details.
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+//
|
|
+/// \file
|
|
+//
|
|
+//===----------------------------------------------------------------------===//
|
|
+
|
|
+#include "AMDGPU.h"
|
|
+#include "llvm/Support/TargetRegistry.h"
|
|
+
|
|
+using namespace llvm;
|
|
+
|
|
+/// \brief The target for the AMDGPU backend
|
|
+Target llvm::TheAMDGPUTarget;
|
|
+
|
|
+/// \brief Extern function to initialize the targets for the AMDGPU backend
|
|
+extern "C" void LLVMInitializeR600TargetInfo() {
|
|
+ RegisterTarget<Triple::r600, false>
|
|
+ R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
|
|
+}
|
|
diff --git a/lib/Target/R600/TargetInfo/CMakeLists.txt b/lib/Target/R600/TargetInfo/CMakeLists.txt
|
|
new file mode 100644
|
|
index 0000000..3d1584e
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/TargetInfo/CMakeLists.txt
|
|
@@ -0,0 +1,7 @@
|
|
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
|
|
+
|
|
+add_llvm_library(LLVMR600Info
|
|
+ AMDGPUTargetInfo.cpp
|
|
+ )
|
|
+
|
|
+add_dependencies(LLVMR600Info AMDGPUCommonTableGen intrinsics_gen)
|
|
diff --git a/lib/Target/R600/TargetInfo/LLVMBuild.txt b/lib/Target/R600/TargetInfo/LLVMBuild.txt
|
|
new file mode 100644
|
|
index 0000000..4c6fea4
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/TargetInfo/LLVMBuild.txt
|
|
@@ -0,0 +1,23 @@
|
|
+;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
|
|
+;
|
|
+; The LLVM Compiler Infrastructure
|
|
+;
|
|
+; This file is distributed under the University of Illinois Open Source
|
|
+; License. See LICENSE.TXT for details.
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+;
|
|
+; This is an LLVMBuild description file for the components in this subdirectory.
|
|
+;
|
|
+; For more information on the LLVMBuild system, please see:
|
|
+;
|
|
+; http://llvm.org/docs/LLVMBuild.html
|
|
+;
|
|
+;===------------------------------------------------------------------------===;
|
|
+
|
|
+[component_0]
|
|
+type = Library
|
|
+name = R600Info
|
|
+parent = R600
|
|
+required_libraries = MC Support
|
|
+add_to_library_groups = R600
|
|
diff --git a/lib/Target/R600/TargetInfo/Makefile b/lib/Target/R600/TargetInfo/Makefile
|
|
new file mode 100644
|
|
index 0000000..b8ac4e7
|
|
--- /dev/null
|
|
+++ b/lib/Target/R600/TargetInfo/Makefile
|
|
@@ -0,0 +1,15 @@
|
|
+##===- lib/Target/R600/TargetInfo/Makefile ------------------*- Makefile -*-===##
|
|
+#
|
|
+# The LLVM Compiler Infrastructure
|
|
+#
|
|
+# This file is distributed under the University of Illinois Open Source
|
|
+# License. See LICENSE.TXT for details.
|
|
+#
|
|
+##===----------------------------------------------------------------------===##
|
|
+LEVEL = ../../../..
|
|
+LIBRARYNAME = LLVMR600Info
|
|
+
|
|
+# Hack: we need to include 'main' target directory to grab private headers
|
|
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
|
|
+
|
|
+include $(LEVEL)/Makefile.common
|
|
diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
|
|
new file mode 100644
|
|
index 0000000..114f9e7
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/128bit-kernel-args.ll
|
|
@@ -0,0 +1,18 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; CHECK: @v4i32_kernel_arg
|
|
+; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40
|
|
+
|
|
+define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
|
|
+entry:
|
|
+ store <4 x i32> %in, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @v4f32_kernel_arg
|
|
+; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40
|
|
+define void @v4f32_kernel_args(<4 x float> addrspace(1)* %out, <4 x float> %in) {
|
|
+entry:
|
|
+ store <4 x float> %in, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/add.v4i32.ll b/test/CodeGen/R600/add.v4i32.ll
|
|
new file mode 100644
|
|
index 0000000..ac4a874
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/add.v4i32.ll
|
|
@@ -0,0 +1,15 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x i32> addrspace(1) * %in
|
|
+ %b = load <4 x i32> addrspace(1) * %b_ptr
|
|
+ %result = add <4 x i32> %a, %b
|
|
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/and.v4i32.ll b/test/CodeGen/R600/and.v4i32.ll
|
|
new file mode 100644
|
|
index 0000000..662085e
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/and.v4i32.ll
|
|
@@ -0,0 +1,15 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x i32> addrspace(1) * %in
|
|
+ %b = load <4 x i32> addrspace(1) * %b_ptr
|
|
+ %result = and <4 x i32> %a, %b
|
|
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
|
|
new file mode 100644
|
|
index 0000000..fd958b3
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
|
|
@@ -0,0 +1,36 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; This test is for a bug in
|
|
+; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where
|
|
+; the wrong type was being passed to
|
|
+; TargetLowering::getOperationAction() when checking the legality of
|
|
+; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes.
|
|
+
|
|
+
|
|
+; CHECK: @sint
|
|
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
|
|
+entry:
|
|
+ %ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
|
+ %sint = load i32 addrspace(1) * %in
|
|
+ %conv = sitofp i32 %sint to float
|
|
+ %0 = insertelement <4 x float> undef, float %conv, i32 0
|
|
+ %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
|
|
+ store <4 x float> %splat, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+;CHECK: @uint
|
|
+;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
|
|
+entry:
|
|
+ %ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
|
+ %uint = load i32 addrspace(1) * %in
|
|
+ %conv = uitofp i32 %uint to float
|
|
+ %0 = insertelement <4 x float> undef, float %conv, i32 0
|
|
+ %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
|
|
+ store <4 x float> %splat, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll
|
|
new file mode 100644
|
|
index 0000000..a586742
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/disconnected-predset-break-bug.ll
|
|
@@ -0,0 +1,28 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; PRED_SET* instructions must be tied to any instruction that uses their
|
|
+; result. This tests that there are no instructions between the PRED_SET*
|
|
+; and the PREDICATE_BREAK in this loop.
|
|
+
|
|
+; CHECK: @loop_ge
|
|
+; CHECK: WHILE
|
|
+; CHECK: PRED_SET
|
|
+; CHECK-NEXT: PREDICATED_BREAK
|
|
+define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
|
|
+entry:
|
|
+ %cmp5 = icmp sgt i32 %iterations, 0
|
|
+ br i1 %cmp5, label %for.body, label %for.end
|
|
+
|
|
+for.body: ; preds = %for.body, %entry
|
|
+ %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
|
|
+ %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
+ %i.07 = add nsw i32 %i.07.in, -1
|
|
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %ai.06
|
|
+ store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
|
|
+ %add = add nsw i32 %ai.06, 1
|
|
+ %exitcond = icmp eq i32 %add, %iterations
|
|
+ br i1 %exitcond, label %for.end, label %for.body
|
|
+
|
|
+for.end: ; preds = %for.body, %entry
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
|
|
new file mode 100644
|
|
index 0000000..0407533
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fabs.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: MOV T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @fabs( float %r0)
|
|
+ call void @llvm.AMDGPU.store.output(float %r1, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
+declare float @fabs(float ) readnone
|
|
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
|
|
new file mode 100644
|
|
index 0000000..d7d1b65
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fadd.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.R600.load.input(i32 1)
|
|
+ %r2 = fadd float %r0, %r1
|
|
+ call void @llvm.AMDGPU.store.output(float %r2, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
diff --git a/test/CodeGen/R600/fadd.v4f32.ll b/test/CodeGen/R600/fadd.v4f32.ll
|
|
new file mode 100644
|
|
index 0000000..85dbfd5
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fadd.v4f32.ll
|
|
@@ -0,0 +1,15 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x float> addrspace(1) * %in
|
|
+ %b = load <4 x float> addrspace(1) * %b_ptr
|
|
+ %result = fadd <4 x float> %a, %b
|
|
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
|
|
new file mode 100644
|
|
index 0000000..a94cfb5
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fcmp-cnd.ll
|
|
@@ -0,0 +1,14 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;Not checking arguments 2 and 3 to CNDE, because they may change between
|
|
+;registers and literal.x depending on what the optimizer does.
|
|
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
|
|
+entry:
|
|
+ %0 = load float addrspace(1)* %in
|
|
+ %cmp = fcmp oeq float %0, 0.000000e+00
|
|
+ %value = select i1 %cmp, i32 2, i32 3
|
|
+ store i32 %value, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
|
|
new file mode 100644
|
|
index 0000000..5c981ef
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the
|
|
+; chance to optimize the fcmp + select instructions to CNDE was missed
|
|
+; due to the fact that the operands to fcmp and select had different types
|
|
+
|
|
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
|
|
+
|
|
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
|
|
+entry:
|
|
+ %0 = load float addrspace(1)* %in
|
|
+ %cmp = fcmp oeq float %0, 0.000000e+00
|
|
+ %value = select i1 %cmp, i32 -1, i32 0
|
|
+ store i32 %value, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
|
|
new file mode 100644
|
|
index 0000000..89f5e9e
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fcmp.ll
|
|
@@ -0,0 +1,14 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
|
|
+entry:
|
|
+ %0 = load float addrspace(1)* %in
|
|
+ %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1
|
|
+ %1 = load float addrspace(1)* %arrayidx1
|
|
+ %cmp = fcmp oeq float %0, %1
|
|
+ %sext = sext i1 %cmp to i32
|
|
+ store i32 %sext, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/fdiv.v4f32.ll b/test/CodeGen/R600/fdiv.v4f32.ll
|
|
new file mode 100644
|
|
index 0000000..b013fd6
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fdiv.v4f32.ll
|
|
@@ -0,0 +1,19 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x float> addrspace(1) * %in
|
|
+ %b = load <4 x float> addrspace(1) * %b_ptr
|
|
+ %result = fdiv <4 x float> %a, %b
|
|
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
|
|
new file mode 100644
|
|
index 0000000..845330f
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/floor.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: FLOOR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @floor(float %r0)
|
|
+ call void @llvm.AMDGPU.store.output(float %r1, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
+declare float @floor(float) readonly
|
|
diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll
|
|
new file mode 100644
|
|
index 0000000..3708f0b
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fmax.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: MAX T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.R600.load.input(i32 1)
|
|
+ %r2 = fcmp uge float %r0, %r1
|
|
+ %r3 = select i1 %r2, float %r0, float %r1
|
|
+ call void @llvm.AMDGPU.store.output(float %r3, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll
|
|
new file mode 100644
|
|
index 0000000..19d59ab
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fmin.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: MIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.R600.load.input(i32 1)
|
|
+ %r2 = fcmp uge float %r0, %r1
|
|
+ %r3 = select i1 %r2, float %r1, float %r0
|
|
+ call void @llvm.AMDGPU.store.output(float %r3, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
|
|
new file mode 100644
|
|
index 0000000..eb1d523
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fmul.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.R600.load.input(i32 1)
|
|
+ %r2 = fmul float %r0, %r1
|
|
+ call void @llvm.AMDGPU.store.output(float %r2, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
diff --git a/test/CodeGen/R600/fmul.v4f32.ll b/test/CodeGen/R600/fmul.v4f32.ll
|
|
new file mode 100644
|
|
index 0000000..6d44a0c
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fmul.v4f32.ll
|
|
@@ -0,0 +1,15 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x float> addrspace(1) * %in
|
|
+ %b = load <4 x float> addrspace(1) * %b_ptr
|
|
+ %result = fmul <4 x float> %a, %b
|
|
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
|
|
new file mode 100644
|
|
index 0000000..591aa52
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fsub.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.R600.load.input(i32 1)
|
|
+ %r2 = fsub float %r0, %r1
|
|
+ call void @llvm.AMDGPU.store.output(float %r2, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
diff --git a/test/CodeGen/R600/fsub.v4f32.ll b/test/CodeGen/R600/fsub.v4f32.ll
|
|
new file mode 100644
|
|
index 0000000..612a57e
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/fsub.v4f32.ll
|
|
@@ -0,0 +1,15 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x float> addrspace(1) * %in
|
|
+ %b = load <4 x float> addrspace(1) * %b_ptr
|
|
+ %result = fsub <4 x float> %a, %b
|
|
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/i8_to_double_to_float.ll b/test/CodeGen/R600/i8_to_double_to_float.ll
|
|
new file mode 100644
|
|
index 0000000..39f3322
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/i8_to_double_to_float.ll
|
|
@@ -0,0 +1,11 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
|
|
+ %1 = load i8 addrspace(1)* %in
|
|
+ %2 = uitofp i8 %1 to double
|
|
+ %3 = fptrunc double %2 to float
|
|
+ store float %3, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
|
|
new file mode 100644
|
|
index 0000000..aad44d9
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
|
|
@@ -0,0 +1,18 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;Test that a select with reversed True/False values is correctly lowered
|
|
+;to a SETNE_INT. There should only be one SETNE_INT instruction.
|
|
+
|
|
+;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK-NOT: SETNE_INT
|
|
+
|
|
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
|
+entry:
|
|
+ %0 = load i32 addrspace(1)* %in
|
|
+ %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
|
|
+ %1 = load i32 addrspace(1)* %arrayidx1
|
|
+ %cmp = icmp eq i32 %0, %1
|
|
+ %value = select i1 %cmp, i32 0, i32 -1
|
|
+ store i32 %value, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll
|
|
new file mode 100644
|
|
index 0000000..382f78c
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/kcache-fold.ll
|
|
@@ -0,0 +1,52 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; CHECK: MOV T{{[0-9]+\.[XYZW], CBuf0\[[0-9]+\]\.[XYZW]}}
|
|
+
|
|
+define void @main() {
|
|
+main_body:
|
|
+ %0 = load <4 x float> addrspace(9)* null
|
|
+ %1 = extractelement <4 x float> %0, i32 0
|
|
+ %2 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
|
+ %3 = extractelement <4 x float> %2, i32 0
|
|
+ %4 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
|
+ %5 = extractelement <4 x float> %4, i32 0
|
|
+ %6 = fcmp ult float %1, 0.000000e+00
|
|
+ %7 = select i1 %6, float %3, float %5
|
|
+ %8 = load <4 x float> addrspace(9)* null
|
|
+ %9 = extractelement <4 x float> %8, i32 1
|
|
+ %10 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
|
+ %11 = extractelement <4 x float> %10, i32 1
|
|
+ %12 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
|
+ %13 = extractelement <4 x float> %12, i32 1
|
|
+ %14 = fcmp ult float %9, 0.000000e+00
|
|
+ %15 = select i1 %14, float %11, float %13
|
|
+ %16 = load <4 x float> addrspace(9)* null
|
|
+ %17 = extractelement <4 x float> %16, i32 2
|
|
+ %18 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
|
+ %19 = extractelement <4 x float> %18, i32 2
|
|
+ %20 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
|
+ %21 = extractelement <4 x float> %20, i32 2
|
|
+ %22 = fcmp ult float %17, 0.000000e+00
|
|
+ %23 = select i1 %22, float %19, float %21
|
|
+ %24 = load <4 x float> addrspace(9)* null
|
|
+ %25 = extractelement <4 x float> %24, i32 3
|
|
+ %26 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
|
+ %27 = extractelement <4 x float> %26, i32 3
|
|
+ %28 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
|
+ %29 = extractelement <4 x float> %28, i32 3
|
|
+ %30 = fcmp ult float %25, 0.000000e+00
|
|
+ %31 = select i1 %30, float %27, float %29
|
|
+ %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
|
|
+ %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
|
|
+ %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
|
|
+ %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
|
|
+ %36 = insertelement <4 x float> undef, float %32, i32 0
|
|
+ %37 = insertelement <4 x float> %36, float %33, i32 1
|
|
+ %38 = insertelement <4 x float> %37, float %34, i32 2
|
|
+ %39 = insertelement <4 x float> %38, float %35, i32 3
|
|
+ call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.AMDIL.clamp.(float, float, float) readnone
|
|
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
|
diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg
|
|
new file mode 100644
|
|
index 0000000..36ee493
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/lit.local.cfg
|
|
@@ -0,0 +1,13 @@
|
|
+config.suffixes = ['.ll', '.c', '.cpp']
|
|
+
|
|
+def getRoot(config):
|
|
+ if not config.parent:
|
|
+ return config
|
|
+ return getRoot(config.parent)
|
|
+
|
|
+root = getRoot(config)
|
|
+
|
|
+targets = set(root.targets_to_build.split())
|
|
+if not 'R600' in targets:
|
|
+ config.unsupported = True
|
|
+
|
|
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
|
|
new file mode 100644
|
|
index 0000000..be62342
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/literals.ll
|
|
@@ -0,0 +1,32 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; Test using an integer literal constant.
|
|
+; Generated ASM should be:
|
|
+; ADD_INT REG literal.x, 5
|
|
+; or
|
|
+; ADD_INT literal.x REG, 5
|
|
+
|
|
+; CHECK: @i32_literal
|
|
+; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
|
|
+define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = add i32 5, %in
|
|
+ store i32 %0, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; Test using a float literal constant.
|
|
+; Generated ASM should be:
|
|
+; ADD REG literal.x, 5.0
|
|
+; or
|
|
+; ADD literal.x REG, 5.0
|
|
+
|
|
+; CHECK: @float_literal
|
|
+; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0
|
|
+define void @float_literal(float addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fadd float 5.0, %in
|
|
+ store float %0, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
|
|
new file mode 100644
|
|
index 0000000..693eb27
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
|
|
@@ -0,0 +1,17 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.R600.load.input(i32 1)
|
|
+ %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
|
|
+ call void @llvm.AMDGPU.store.output(float %r2, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
+declare float @llvm.AMDGPU.mul(float ,float ) readnone
|
|
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
|
|
new file mode 100644
|
|
index 0000000..fac957f
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: TRUNC T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.AMDGPU.trunc( float %r0)
|
|
+ call void @llvm.AMDGPU.store.output(float %r1, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
+declare float @llvm.AMDGPU.trunc(float ) readnone
|
|
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
|
|
new file mode 100644
|
|
index 0000000..0c19f14
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
|
|
@@ -0,0 +1,23 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
|
|
+
|
|
+;CHECK: S_MOV_B32
|
|
+;CHECK-NEXT: V_INTERP_MOV_F32
|
|
+
|
|
+define void @main() {
|
|
+main_body:
|
|
+ call void @llvm.AMDGPU.shader.type(i32 0)
|
|
+ %0 = load i32 addrspace(8)* inttoptr (i32 6 to i32 addrspace(8)*)
|
|
+ %1 = call float @llvm.SI.fs.interp.constant(i32 0, i32 0, i32 %0)
|
|
+ %2 = call i32 @llvm.SI.packf16(float %1, float %1)
|
|
+ %3 = bitcast i32 %2 to float
|
|
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare void @llvm.AMDGPU.shader.type(i32)
|
|
+
|
|
+declare float @llvm.SI.fs.interp.constant(i32, i32, i32) readonly
|
|
+
|
|
+declare i32 @llvm.SI.packf16(float, float) readnone
|
|
+
|
|
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
|
|
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
|
|
new file mode 100644
|
|
index 0000000..dc120bf
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/llvm.cos.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: COS T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.cos.f32(float %r0)
|
|
+ call void @llvm.AMDGPU.store.output(float %r1, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.cos.f32(float) readnone
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
|
|
new file mode 100644
|
|
index 0000000..0ae9172
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/llvm.pow.ll
|
|
@@ -0,0 +1,19 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: LOG_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK-NEXT: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+;CHECK-NEXT: EXP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.R600.load.input(i32 1)
|
|
+ %r2 = call float @llvm.pow.f32( float %r0, float %r1)
|
|
+ call void @llvm.AMDGPU.store.output(float %r2, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
+declare float @llvm.pow.f32(float ,float ) readonly
|
|
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
|
|
new file mode 100644
|
|
index 0000000..5cd6998
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/llvm.sin.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: SIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = call float @llvm.sin.f32( float %r0)
|
|
+ call void @llvm.AMDGPU.store.output(float %r1, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.sin.f32(float) readnone
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
diff --git a/test/CodeGen/R600/load.constant_addrspace.f32.ll b/test/CodeGen/R600/load.constant_addrspace.f32.ll
|
|
new file mode 100644
|
|
index 0000000..9362728
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/load.constant_addrspace.f32.ll
|
|
@@ -0,0 +1,9 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: VTX_READ_32 T{{[0-9]+\.X, T[0-9]+\.X}}
|
|
+
|
|
+define void @test(float addrspace(1)* %out, float addrspace(2)* %in) {
|
|
+ %1 = load float addrspace(2)* %in
|
|
+ store float %1, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/load.i8.ll b/test/CodeGen/R600/load.i8.ll
|
|
new file mode 100644
|
|
index 0000000..b070dcd
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/load.i8.ll
|
|
@@ -0,0 +1,10 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
|
|
+
|
|
+define void @test(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
|
+ %1 = load i8 addrspace(1)* %in
|
|
+ %2 = zext i8 %1 to i32
|
|
+ store i32 %2, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll
|
|
new file mode 100644
|
|
index 0000000..18895a4
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/predicates.ll
|
|
@@ -0,0 +1,100 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; These tests make sure the compiler is optimizing branches using predicates
|
|
+; when it is legal to do so.
|
|
+
|
|
+; CHECK: @simple_if
|
|
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred,
|
|
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel
|
|
+define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp sgt i32 %in, 0
|
|
+ br i1 %0, label %IF, label %ENDIF
|
|
+
|
|
+IF:
|
|
+ %1 = shl i32 %in, 1
|
|
+ br label %ENDIF
|
|
+
|
|
+ENDIF:
|
|
+ %2 = phi i32 [ %in, %entry ], [ %1, %IF ]
|
|
+ store i32 %2, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @simple_if_else
|
|
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred,
|
|
+; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel
|
|
+; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel
|
|
+define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp sgt i32 %in, 0
|
|
+ br i1 %0, label %IF, label %ELSE
|
|
+
|
|
+IF:
|
|
+ %1 = shl i32 %in, 1
|
|
+ br label %ENDIF
|
|
+
|
|
+ELSE:
|
|
+ %2 = lshr i32 %in, 1
|
|
+ br label %ENDIF
|
|
+
|
|
+ENDIF:
|
|
+ %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ]
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @nested_if
|
|
+; CHECK: IF_PREDICATE_SET
|
|
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred,
|
|
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel
|
|
+; CHECK: ENDIF
|
|
+define void @nested_if(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp sgt i32 %in, 0
|
|
+ br i1 %0, label %IF0, label %ENDIF
|
|
+
|
|
+IF0:
|
|
+ %1 = add i32 %in, 10
|
|
+ %2 = icmp sgt i32 %1, 0
|
|
+ br i1 %2, label %IF1, label %ENDIF
|
|
+
|
|
+IF1:
|
|
+ %3 = shl i32 %1, 1
|
|
+ br label %ENDIF
|
|
+
|
|
+ENDIF:
|
|
+ %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1]
|
|
+ store i32 %4, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @nested_if_else
|
|
+; CHECK: IF_PREDICATE_SET
|
|
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred,
|
|
+; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel
|
|
+; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel
|
|
+; CHECK: ENDIF
|
|
+define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp sgt i32 %in, 0
|
|
+ br i1 %0, label %IF0, label %ENDIF
|
|
+
|
|
+IF0:
|
|
+ %1 = add i32 %in, 10
|
|
+ %2 = icmp sgt i32 %1, 0
|
|
+ br i1 %2, label %IF1, label %ELSE1
|
|
+
|
|
+IF1:
|
|
+ %3 = shl i32 %1, 1
|
|
+ br label %ENDIF
|
|
+
|
|
+ELSE1:
|
|
+ %4 = lshr i32 %in, 1
|
|
+ br label %ENDIF
|
|
+
|
|
+ENDIF:
|
|
+ %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1]
|
|
+ store i32 %5, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/reciprocal.ll b/test/CodeGen/R600/reciprocal.ll
|
|
new file mode 100644
|
|
index 0000000..6838c1a
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/reciprocal.ll
|
|
@@ -0,0 +1,16 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test() {
|
|
+ %r0 = call float @llvm.R600.load.input(i32 0)
|
|
+ %r1 = fdiv float 1.0, %r0
|
|
+ call void @llvm.AMDGPU.store.output(float %r1, i32 0)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare float @llvm.R600.load.input(i32) readnone
|
|
+
|
|
+declare void @llvm.AMDGPU.store.output(float, i32)
|
|
+
|
|
+declare float @llvm.AMDGPU.rcp(float ) readnone
|
|
diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
|
|
new file mode 100644
|
|
index 0000000..3556fac
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/sdiv.ll
|
|
@@ -0,0 +1,21 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; The code generated by sdiv is long and complex and may frequently change.
|
|
+; The goal of this test is to make sure the ISel doesn't fail.
|
|
+;
|
|
+; This program was previously failing to compile when one of the selectcc
|
|
+; opcodes generated by the sdiv lowering was being legalized and optimized to:
|
|
+; selectcc Remainder -1, 0, -1, SETGT
|
|
+; This was fixed by adding an additional pattern in R600Instructions.td to
|
|
+; match this pattern with a CNDGE_INT.
|
|
+
|
|
+; CHECK: RETURN
|
|
+
|
|
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
|
+ %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
|
+ %num = load i32 addrspace(1) * %in
|
|
+ %den = load i32 addrspace(1) * %den_ptr
|
|
+ %result = sdiv i32 %num, %den
|
|
+ store i32 %result, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll
|
|
new file mode 100644
|
|
index 0000000..359ca1e
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
|
|
@@ -0,0 +1,15 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; Note additional optimizations may cause this SGT to be replaced with a
|
|
+; CND* instruction.
|
|
+; CHECK: SETGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
|
|
+; Test a selectcc with i32 LHS/RHS and float True/False
|
|
+
|
|
+define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
|
|
+entry:
|
|
+ %0 = load i32 addrspace(1)* %in
|
|
+ %1 = icmp sge i32 %0, 0
|
|
+ %2 = select i1 %1, float 1.0, float 0.0
|
|
+ store float %2, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/selectcc_cnde.ll b/test/CodeGen/R600/selectcc_cnde.ll
|
|
new file mode 100644
|
|
index 0000000..f0a0f51
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/selectcc_cnde.ll
|
|
@@ -0,0 +1,11 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK-NOT: SETE
|
|
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
|
|
+define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
|
|
+ %1 = load float addrspace(1)* %in
|
|
+ %2 = fcmp oeq float %1, 0.0
|
|
+ %3 = select i1 %2, float 1.0, float 2.0
|
|
+ store float %3, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll b/test/CodeGen/R600/selectcc_cnde_int.ll
|
|
new file mode 100644
|
|
index 0000000..b38078e
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/selectcc_cnde_int.ll
|
|
@@ -0,0 +1,11 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK-NOT: SETE_INT
|
|
+;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
|
|
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
|
+ %1 = load i32 addrspace(1)* %in
|
|
+ %2 = icmp eq i32 %1, 0
|
|
+ %3 = select i1 %2, i32 1, i32 2
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll
|
|
new file mode 100644
|
|
index 0000000..54febcf
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/set-dx10.ll
|
|
@@ -0,0 +1,137 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; These tests check that floating point comparisons which are used by select
|
|
+; to store integer true (-1) and false (0) values are lowered to one of the
|
|
+; SET*DX10 instructions.
|
|
+
|
|
+; CHECK: @fcmp_une_select_fptosi
|
|
+; CHECK: SETNE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp une float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
|
|
+ %2 = fsub float -0.000000e+00, %1
|
|
+ %3 = fptosi float %2 to i32
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_une_select_i32
|
|
+; CHECK: SETNE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp une float %in, 5.0
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ueq_select_fptosi
|
|
+; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ueq_select_fptosi(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ueq float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
|
|
+ %2 = fsub float -0.000000e+00, %1
|
|
+ %3 = fptosi float %2 to i32
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ueq_select_i32
|
|
+; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ueq_select_i32(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ueq float %in, 5.0
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ugt_select_fptosi
|
|
+; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ugt_select_fptosi(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ugt float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
|
|
+ %2 = fsub float -0.000000e+00, %1
|
|
+ %3 = fptosi float %2 to i32
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ugt_select_i32
|
|
+; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ugt_select_i32(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ugt float %in, 5.0
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_uge_select_fptosi
|
|
+; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_uge_select_fptosi(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp uge float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
|
|
+ %2 = fsub float -0.000000e+00, %1
|
|
+ %3 = fptosi float %2 to i32
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_uge_select_i32
|
|
+; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00)
|
|
+define void @fcmp_uge_select_i32(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp uge float %in, 5.0
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ule_select_fptosi
|
|
+; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ule_select_fptosi(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ule float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
|
|
+ %2 = fsub float -0.000000e+00, %1
|
|
+ %3 = fptosi float %2 to i32
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ule_select_i32
|
|
+; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ule_select_i32(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ule float %in, 5.0
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ult_select_fptosi
|
|
+; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ult_select_fptosi(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ult float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
|
|
+ %2 = fsub float -0.000000e+00, %1
|
|
+ %3 = fptosi float %2 to i32
|
|
+ store i32 %3, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fcmp_ult_select_i32
|
|
+; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @fcmp_ult_select_i32(i32 addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ult float %in, 5.0
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/setcc.v4i32.ll b/test/CodeGen/R600/setcc.v4i32.ll
|
|
new file mode 100644
|
|
index 0000000..0752f2e
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/setcc.v4i32.ll
|
|
@@ -0,0 +1,12 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x i32> addrspace(1) * %in
|
|
+ %b = load <4 x i32> addrspace(1) * %b_ptr
|
|
+ %result = icmp eq <4 x i32> %a, %b
|
|
+ %sext = sext <4 x i1> %result to <4 x i32>
|
|
+ store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/short-args.ll b/test/CodeGen/R600/short-args.ll
|
|
new file mode 100644
|
|
index 0000000..b69e327
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/short-args.ll
|
|
@@ -0,0 +1,41 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; CHECK: @i8_arg
|
|
+; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
|
|
+
|
|
+define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
|
|
+entry:
|
|
+ %0 = zext i8 %in to i32
|
|
+ store i32 %0, i32 addrspace(1)* %out, align 4
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @i8_zext_arg
|
|
+; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
|
|
+
|
|
+define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
|
|
+entry:
|
|
+ %0 = zext i8 %in to i32
|
|
+ store i32 %0, i32 addrspace(1)* %out, align 4
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @i16_arg
|
|
+; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
|
|
+
|
|
+define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
|
|
+entry:
|
|
+ %0 = zext i16 %in to i32
|
|
+ store i32 %0, i32 addrspace(1)* %out, align 4
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @i16_zext_arg
|
|
+; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
|
|
+
|
|
+define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
|
|
+entry:
|
|
+ %0 = zext i16 %in to i32
|
|
+ store i32 %0, i32 addrspace(1)* %out, align 4
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/store.v4f32.ll b/test/CodeGen/R600/store.v4f32.ll
|
|
new file mode 100644
|
|
index 0000000..8b0d244
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/store.v4f32.ll
|
|
@@ -0,0 +1,9 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
|
|
+
|
|
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
|
+ %1 = load <4 x float> addrspace(1) * %in
|
|
+ store <4 x float> %1, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/store.v4i32.ll b/test/CodeGen/R600/store.v4i32.ll
|
|
new file mode 100644
|
|
index 0000000..a659815
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/store.v4i32.ll
|
|
@@ -0,0 +1,9 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
|
|
+
|
|
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %1 = load <4 x i32> addrspace(1) * %in
|
|
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/udiv.v4i32.ll b/test/CodeGen/R600/udiv.v4i32.ll
|
|
new file mode 100644
|
|
index 0000000..47657a6
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/udiv.v4i32.ll
|
|
@@ -0,0 +1,15 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;The code generated by udiv is long and complex and may frequently change.
|
|
+;The goal of this test is to make sure the ISel doesn't fail when it gets
|
|
+;a v4i32 udiv
|
|
+;CHECK: RETURN
|
|
+
|
|
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x i32> addrspace(1) * %in
|
|
+ %b = load <4 x i32> addrspace(1) * %b_ptr
|
|
+ %result = udiv <4 x i32> %a, %b
|
|
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll
|
|
new file mode 100644
|
|
index 0000000..b48c591
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/unsupported-cc.ll
|
|
@@ -0,0 +1,83 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; These tests are for condition codes that are not supported by the hardware
|
|
+
|
|
+; CHECK: @slt
|
|
+; CHECK: SETGT_INT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 5(7.006492e-45)
|
|
+define void @slt(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp slt i32 %in, 5
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @ult_i32
|
|
+; CHECK: SETGT_UINT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 5(7.006492e-45)
|
|
+define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp ult i32 %in, 5
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @ult_float
|
|
+; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @ult_float(float addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ult float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.0, float 0.0
|
|
+ store float %1, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @olt
|
|
+; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @olt(float addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp olt float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.0, float 0.0
|
|
+ store float %1, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @sle
|
|
+; CHECK: SETGT_INT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 6(8.407791e-45)
|
|
+define void @sle(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp sle i32 %in, 5
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @ule_i32
|
|
+; CHECK: SETGT_UINT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 6(8.407791e-45)
|
|
+define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
|
|
+entry:
|
|
+ %0 = icmp ule i32 %in, 5
|
|
+ %1 = select i1 %0, i32 -1, i32 0
|
|
+ store i32 %1, i32 addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @ule_float
|
|
+; CHECK: SETGE T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @ule_float(float addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ule float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.0, float 0.0
|
|
+ store float %1, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @ole
|
|
+; CHECK: SETGE T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00)
|
|
+define void @ole(float addrspace(1)* %out, float %in) {
|
|
+entry:
|
|
+ %0 = fcmp ole float %in, 5.0
|
|
+ %1 = select i1 %0, float 1.0, float 0.0
|
|
+ store float %1, float addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/urem.v4i32.ll b/test/CodeGen/R600/urem.v4i32.ll
|
|
new file mode 100644
|
|
index 0000000..2e7388c
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/urem.v4i32.ll
|
|
@@ -0,0 +1,15 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+;The code generated by urem is long and complex and may frequently change.
|
|
+;The goal of this test is to make sure the ISel doesn't fail when it gets
|
|
+;a v4i32 urem
|
|
+;CHECK: RETURN
|
|
+
|
|
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
|
|
+ %a = load <4 x i32> addrspace(1) * %in
|
|
+ %b = load <4 x i32> addrspace(1) * %b_ptr
|
|
+ %result = urem <4 x i32> %a, %b
|
|
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/R600/vec4-expand.ll b/test/CodeGen/R600/vec4-expand.ll
|
|
new file mode 100644
|
|
index 0000000..8f62bc6
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/R600/vec4-expand.ll
|
|
@@ -0,0 +1,53 @@
|
|
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
|
+
|
|
+; CHECK: @fp_to_sint
|
|
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @fp_to_sint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
|
+ %value = load <4 x float> addrspace(1) * %in
|
|
+ %result = fptosi <4 x float> %value to <4 x i32>
|
|
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @fp_to_uint
|
|
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @fp_to_uint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
|
+ %value = load <4 x float> addrspace(1) * %in
|
|
+ %result = fptoui <4 x float> %value to <4 x i32>
|
|
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @sint_to_fp
|
|
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @sint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %value = load <4 x i32> addrspace(1) * %in
|
|
+ %result = sitofp <4 x i32> %value to <4 x float>
|
|
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
+
|
|
+; CHECK: @uint_to_fp
|
|
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
|
+
|
|
+define void @uint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
|
+ %value = load <4 x i32> addrspace(1) * %in
|
|
+ %result = uitofp <4 x i32> %value to <4 x float>
|
|
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
|
|
+ ret void
|
|
+}
|
|
diff --git a/test/CodeGen/SI/sanity.ll b/test/CodeGen/SI/sanity.ll
|
|
new file mode 100644
|
|
index 0000000..62cdcf5
|
|
--- /dev/null
|
|
+++ b/test/CodeGen/SI/sanity.ll
|
|
@@ -0,0 +1,37 @@
|
|
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
|
|
+
|
|
+; CHECK: S_ENDPGM
|
|
+
|
|
+define void @main() {
|
|
+main_body:
|
|
+ call void @llvm.AMDGPU.shader.type(i32 1)
|
|
+ %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
|
|
+ %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0
|
|
+ %2 = load <4 x i32> addrspace(2)* %1
|
|
+ %3 = call i32 @llvm.SI.vs.load.buffer.index()
|
|
+ %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3)
|
|
+ %5 = extractelement <4 x float> %4, i32 0
|
|
+ %6 = extractelement <4 x float> %4, i32 1
|
|
+ %7 = extractelement <4 x float> %4, i32 2
|
|
+ %8 = extractelement <4 x float> %4, i32 3
|
|
+ %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
|
|
+ %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1
|
|
+ %11 = load <4 x i32> addrspace(2)* %10
|
|
+ %12 = call i32 @llvm.SI.vs.load.buffer.index()
|
|
+ %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12)
|
|
+ %14 = extractelement <4 x float> %13, i32 0
|
|
+ %15 = extractelement <4 x float> %13, i32 1
|
|
+ %16 = extractelement <4 x float> %13, i32 2
|
|
+ %17 = extractelement <4 x float> %13, i32 3
|
|
+ call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17)
|
|
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8)
|
|
+ ret void
|
|
+}
|
|
+
|
|
+declare void @llvm.AMDGPU.shader.type(i32)
|
|
+
|
|
+declare i32 @llvm.SI.vs.load.buffer.index() readnone
|
|
+
|
|
+declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32)
|
|
+
|
|
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
|