diff --git a/debian/changelog b/debian/changelog
index a4101940..1bff38c2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -7,6 +7,9 @@ llvm-toolchain-3.2 (1:3.2repack-1~exp4) UNRELEASED; urgency=low
     See: 31-powerpcspe.diff
   * Fix the path detection of scan-build (Closes: #698352)
     See: 32-scan-build-path.diff
+  * debian/patches/r600-snapshot.diff: Move backports into individual patches.
+  * debian/patches/r600-snapshot.diff: Update to mesa-9.1 git tag.
+    (Closes: #703671, #697356)

 -- Sylvestre Ledru  Thu, 14 Mar 2013 17:47:12 +0100

diff --git a/debian/patches/0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch b/debian/patches/0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch
new file mode 100644
index 00000000..d6dd4ef0
--- /dev/null
+++ b/debian/patches/0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch
@@ -0,0 +1,32 @@
+From 7fd12aa7dac937573ea8a48fbe646b334ece4b74 Mon Sep 17 00:00:00 2001
+From: Tom Stellard
+Date: Mon, 10 Dec 2012 21:41:54 +0000
+Subject: [PATCH 1/4] LegalizeDAG: Allow type promotion for scalar stores
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169772 91177308-0d34-0410-b5e6-96231b3b80d8
+(cherry picked from commit 8b7f16e9719a64973e3b4d35e122222c26839c44)
+---
+ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+index abf40b7..9946694 100644
+--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
++++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+@@ -731,9 +731,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
+       return;
+     }
+     case TargetLowering::Promote: {
+-      assert(VT.isVector() && "Unknown legal promote case!");
+-      Value = DAG.getNode(ISD::BITCAST, dl,
+-                          TLI.getTypeToPromoteTo(ISD::STORE, VT), Value);
++      EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
++      assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
++             "Can only promote stores to same size type");
++      Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
+       SDValue Result =
+         DAG.getStore(Chain, dl, Value, Ptr,
+                      ST->getPointerInfo(), isVolatile,
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch b/debian/patches/0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch
new file mode 100644
index 00000000..3dca7854
--- /dev/null
+++ b/debian/patches/0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch
@@ -0,0 +1,31 @@
+From 0530926051350bd36e1ea974066c5a14ae0fe202 Mon Sep 17 00:00:00 2001
+From: Tom Stellard
+Date: Mon, 10 Dec 2012 21:41:58 +0000
+Subject: [PATCH 2/4] LegalizeDAG: Allow type promotion of scalar loads
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169773 91177308-0d34-0410-b5e6-96231b3b80d8
+(cherry picked from commit f45d11b56bffeaec94291f330dc9f7f7aae5a741)
+---
+ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+index 9946694..2596f00 100644
+--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
++++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+@@ -890,10 +890,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
+       break;
+     }
+     case TargetLowering::Promote: {
+-      // Only promote a load of vector type to another.
+-      assert(VT.isVector() && "Cannot promote this load!");
+-      // Change base type to a different vector type.
+      EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
++      assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
++             "Can only promote loads to same size type");
+
+      SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
+                                LD->isVolatile(), LD->isNonTemporal(),
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch b/debian/patches/0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch
new file mode 100644
index 00000000..49422983
--- /dev/null
+++ b/debian/patches/0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch
@@ -0,0 +1,70 @@
+From 1d10f5a4c953104cf44c7c3e5927aec536b734f4 Mon Sep 17 00:00:00 2001
+From: Tom Stellard
+Date: Wed, 2 Jan 2013 22:13:01 +0000
+Subject: [PATCH 3/4] DAGCombiner: Avoid generating illegal vector INT_TO_FP
+ nodes
+
+DAGCombiner::reduceBuildVecConvertToConvertBuildVec() was making two
+mistakes:
+
+1. It was checking the legality of scalar INT_TO_FP nodes and then generating
+vector nodes.
+
+2. It was passing the result value type to
+TargetLoweringInfo::getOperationAction() when it should have been
+passing the value type of the first operand.
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171420 91177308-0d34-0410-b5e6-96231b3b80d8
+(cherry picked from commit d40758b24ebab5777131533d9369e707fc852594)
+
+Conflicts:
+	test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+	test/CodeGen/R600/vec4-expand.ll
+---
+ lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 +++++----
+ test/CodeGen/X86/cvtv2f32.ll             | 4 ++++
+ 2 files changed, 9 insertions(+), 4 deletions(-)
+
+diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+index 37d7731..d0ca5c0 100644
+--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
++++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+@@ -8514,11 +8514,8 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
+     if (Opcode == ISD::DELETED_NODE &&
+         (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
+       Opcode = Opc;
+-      // If not supported by target, bail out.
+-      if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal &&
+-          TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
+-        return SDValue();
+     }
++
+     if (Opc != Opcode)
+       return SDValue();
+
+@@ -8543,6 +8540,10 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
+   assert(SrcVT != MVT::Other && "Cannot determine source type!");
+
+   EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
++
++  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
++    return SDValue();
++
+   SmallVector Opnds;
+   for (unsigned i = 0; i != NumInScalars; ++i) {
+     SDValue In = N->getOperand(i);
+diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
+index 466b096..d11bb9e 100644
+--- a/test/CodeGen/X86/cvtv2f32.ll
++++ b/test/CodeGen/X86/cvtv2f32.ll
+@@ -1,3 +1,7 @@
++; A bug fix in the DAGCombiner made this test fail, so marking as xfail
++; until this can be investigated further.
++; XFAIL: * ++ + ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s + + define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) { +-- +1.7.10.4 + diff --git a/debian/patches/r600-snapshot.diff b/debian/patches/r600-snapshot.diff index 47c36306..64bab75d 100644 --- a/debian/patches/r600-snapshot.diff +++ b/debian/patches/r600-snapshot.diff @@ -41,116 +41,6 @@ index 4fa0705..02012b9 100755 #include "confdefs.h" #if HAVE_DLFCN_H -diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td -index 2e1597f..059bd80 100644 ---- a/include/llvm/Intrinsics.td -+++ b/include/llvm/Intrinsics.td -@@ -469,3 +469,4 @@ include "llvm/IntrinsicsXCore.td" - include "llvm/IntrinsicsHexagon.td" - include "llvm/IntrinsicsNVVM.td" - include "llvm/IntrinsicsMips.td" -+include "llvm/IntrinsicsR600.td" -diff --git a/include/llvm/IntrinsicsR600.td b/include/llvm/IntrinsicsR600.td -new file mode 100644 -index 0000000..ecb5668 ---- /dev/null -+++ b/include/llvm/IntrinsicsR600.td -@@ -0,0 +1,36 @@ -+//===- IntrinsicsR600.td - Defines R600 intrinsics ---------*- tablegen -*-===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+// This file defines all of the R600-specific intrinsics. -+// -+//===----------------------------------------------------------------------===// -+ -+let TargetPrefix = "r600" in { -+ -+class R600ReadPreloadRegisterIntrinsic -+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, -+ GCCBuiltin; -+ -+multiclass R600ReadPreloadRegisterIntrinsic_xyz { -+ def _x : R600ReadPreloadRegisterIntrinsic; -+ def _y : R600ReadPreloadRegisterIntrinsic; -+ def _z : R600ReadPreloadRegisterIntrinsic; -+} -+ -+defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_global_size">; -+defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_local_size">; -+defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_ngroups">; -+defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_tgid">; -+defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_tidig">; -+} // End TargetPrefix = "r600" -diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp -index 37d7731..d0ca5c0 100644 ---- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp -+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp -@@ -8514,11 +8514,8 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { - if (Opcode == ISD::DELETED_NODE && - (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { - Opcode = Opc; -- // If not supported by target, bail out. 
-- if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal && -- TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom) -- return SDValue(); - } -+ - if (Opc != Opcode) - return SDValue(); - -@@ -8543,6 +8540,10 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { - assert(SrcVT != MVT::Other && "Cannot determine source type!"); - - EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); -+ -+ if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) -+ return SDValue(); -+ - SmallVector Opnds; - for (unsigned i = 0; i != NumInScalars; ++i) { - SDValue In = N->getOperand(i); -diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp -index abf40b7..2596f00 100644 ---- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp -+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp -@@ -731,9 +731,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { - return; - } - case TargetLowering::Promote: { -- assert(VT.isVector() && "Unknown legal promote case!"); -- Value = DAG.getNode(ISD::BITCAST, dl, -- TLI.getTypeToPromoteTo(ISD::STORE, VT), Value); -+ EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT); -+ assert(NVT.getSizeInBits() == VT.getSizeInBits() && -+ "Can only promote stores to same size type"); -+ Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value); - SDValue Result = - DAG.getStore(Chain, dl, Value, Ptr, - ST->getPointerInfo(), isVolatile, -@@ -889,10 +890,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { - break; - } - case TargetLowering::Promote: { -- // Only promote a load of vector type to another. -- assert(VT.isVector() && "Cannot promote this load!"); -- // Change base type to a different vector type. - EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); -+ assert(NVT.getSizeInBits() == VT.getSizeInBits() && -+ "Can only promote loads to same size type"); - - SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8995080..84c4111 100644 --- a/lib/Target/LLVMBuild.txt @@ -166,10 +56,10 @@ index 8995080..84c4111 100644 ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h new file mode 100644 -index 0000000..bac01a3 +index 0000000..ba87918 --- /dev/null +++ b/lib/Target/R600/AMDGPU.h -@@ -0,0 +1,52 @@ +@@ -0,0 +1,51 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure @@ -202,7 +92,6 @@ index 0000000..bac01a3 +FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); -+FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); +FunctionPass *createSIInsertWaits(TargetMachine &tm); + +// Passes common to R600 and SI @@ -270,10 +159,10 @@ index 0000000..40f4741 +include "AMDGPUInstructions.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp new file mode 100644 -index 0000000..a8c9bb4 +index 0000000..254e62e --- /dev/null +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp -@@ -0,0 +1,147 @@ +@@ -0,0 +1,145 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +// +// The LLVM Compiler Infrastructure @@ -367,8 +256,6 @@ index 0000000..a8c9bb4 + switch (reg) { + default: break; + case AMDGPU::EXEC: -+ case 
AMDGPU::SI_LITERAL_CONSTANT: -+ case AMDGPU::SREG_LIT_0: + case AMDGPU::M0: + continue; + } @@ -403,7 +290,7 @@ index 0000000..a8c9bb4 + } else { + assert(!"Unknown register class"); + } -+ hwReg = RI->getEncodingValue(reg); ++ hwReg = RI->getEncodingValue(reg) & 0xff; + maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; @@ -471,61 +358,6 @@ index 0000000..3812282 +} // End anonymous llvm + +#endif //AMDGPU_ASMPRINTER_H -diff --git a/lib/Target/R600/AMDGPUCodeEmitter.h b/lib/Target/R600/AMDGPUCodeEmitter.h -new file mode 100644 -index 0000000..84f3588 ---- /dev/null -+++ b/lib/Target/R600/AMDGPUCodeEmitter.h -@@ -0,0 +1,49 @@ -+//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+/// \file -+/// \brief CodeEmitter interface for R600 and SI codegen. -+// -+//===----------------------------------------------------------------------===// -+ -+#ifndef AMDGPUCODEEMITTER_H -+#define AMDGPUCODEEMITTER_H -+ -+namespace llvm { -+ -+class AMDGPUCodeEmitter { -+public: -+ uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; -+ virtual uint64_t getMachineOpValue(const MachineInstr &MI, -+ const MachineOperand &MO) const { return 0; } -+ virtual unsigned GPR4AlignEncode(const MachineInstr &MI, -+ unsigned OpNo) const { -+ return 0; -+ } -+ virtual unsigned GPR2AlignEncode(const MachineInstr &MI, -+ unsigned OpNo) const { -+ return 0; -+ } -+ virtual uint64_t VOPPostEncode(const MachineInstr &MI, -+ uint64_t Value) const { -+ return Value; -+ } -+ virtual uint64_t i32LiteralEncode(const MachineInstr &MI, -+ unsigned OpNo) const { -+ return 0; -+ } -+ virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo) -+ const { -+ return 0; -+ } -+}; -+ -+} // End namespace llvm -+ -+#endif // AMDGPUCODEEMITTER_H diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp new file mode 100644 index 0000000..50297d1 @@ -1198,10 +1030,10 @@ index 0000000..d0d23d6 +} diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h new file mode 100644 -index 0000000..927ed09 +index 0000000..99a11ff --- /dev/null +++ b/lib/Target/R600/AMDGPUISelLowering.h -@@ -0,0 +1,145 @@ +@@ -0,0 +1,140 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure @@ -1257,6 +1089,11 @@ index 0000000..927ed09 + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + DebugLoc DL, SelectionDAG &DAG) const; ++ virtual SDValue LowerCall(CallLoweringInfo &CLI, ++ SmallVectorImpl &InVals) const { ++ CLI.Callee.dump(); ++ llvm_unreachable("Undefined function"); ++ } + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; @@ -1334,25 +1171,15 @@ index 0000000..927ed09 + +} // End namespace AMDGPUISD + -+namespace SIISD { -+ -+enum { -+ SI_FIRST = AMDGPUISD::LAST_AMDGPU_ISD_NUMBER, -+ VCC_AND, -+ VCC_BITCAST -+}; -+ -+} // End namespace SIISD -+ +} // End namespace llvm + +#endif // AMDGPUISELLOWERING_H diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp new file mode 100644 -index 0000000..56aaf23 +index 0000000..15840b3 --- 
/dev/null +++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp -@@ -0,0 +1,326 @@ +@@ -0,0 +1,344 @@ +//===-- AMDGPUIndirectAddressing.cpp - Indirect Adressing Support ---------===// +// +// The LLVM Compiler Infrastructure @@ -1524,9 +1351,6 @@ index 0000000..56aaf23 + } + + if (RegisterAddressMap[Reg] == Address) { -+ if (!regHasExplicitDef(MRI, Reg)) { -+ continue; -+ } + PhiRegisters.push_back(Reg); + } + } @@ -1625,7 +1449,8 @@ index 0000000..56aaf23 + // instruction that uses indirect addressing. + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), + MI.getOperand(0).getReg()) -+ .addReg(AddrReg); ++ .addReg(AddrReg) ++ .addReg(Reg, RegState::Implicit); + } + } else { + // Indirect register access @@ -1647,8 +1472,7 @@ index 0000000..56aaf23 + // We only need to use REG_SEQUENCE for explicit defs, since the + // register coalescer won't do anything with the implicit defs. + MachineInstr *DefInstr = MRI.getVRegDef(Reg); -+ if (!DefInstr->getOperand(0).isReg() || -+ DefInstr->getOperand(0).getReg() != Reg) { ++ if (!regHasExplicitDef(MRI, Reg)) { + continue; + } + @@ -1665,6 +1489,7 @@ index 0000000..56aaf23 + + + Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill); ++ Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit); + + } + MI.eraseFromParent(); @@ -1676,7 +1501,27 @@ index 0000000..56aaf23 +bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI, + unsigned Reg) const { + MachineInstr *DefInstr = MRI.getVRegDef(Reg); -+ return DefInstr && DefInstr->getOperand(0).isReg() && ++ ++ if (!DefInstr) { ++ return false; ++ } ++ ++ if (DefInstr->getOpcode() == AMDGPU::PHI) { ++ bool Explicit = false; ++ for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(), ++ E = DefInstr->operands_end(); ++ I != E; ++I) { ++ const MachineOperand &MO = *I; ++ if (!MO.isReg() || MO.isDef()) { ++ continue; ++ } ++ ++ Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg()); ++ } ++ return Explicit; ++ } ++ ++ return DefInstr->getOperand(0).isReg() && + DefInstr->getOperand(0).getReg() == Reg; +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -1953,7 +1798,7 @@ index 0000000..640707d +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h new file mode 100644 -index 0000000..bca3aa2 +index 0000000..5220aa0 --- /dev/null +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -0,0 +1,207 @@ @@ -2160,8 +2005,8 @@ index 0000000..bca3aa2 + +} // End llvm namespace + -+#define AMDGPU_FLAG_REGISTER_LOAD (1UL << 63) -+#define AMDGPU_FLAG_REGISTER_STORE (1UL << 62) ++#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) ++#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + +#endif // AMDGPUINSTRINFO_H diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td @@ -2907,10 +2752,10 @@ index 0000000..b5aca03 +include "SIRegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp new file mode 100644 -index 0000000..22338b5 +index 0000000..a8c9621 --- /dev/null +++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp -@@ -0,0 +1,714 @@ +@@ -0,0 +1,893 @@ +//===-- AMDGPUStructurizeCFG.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure @@ -2935,30 +2780,101 @@ index 0000000..22338b5 +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" ++#include "llvm/Support/PatternMatch.h" + +using namespace llvm; ++using namespace 
llvm::PatternMatch; + +namespace { + +// Definition of the complex types used in this pass. + +typedef std::pair BBValuePair; -+typedef ArrayRef BBVecRef; + +typedef SmallVector RNVector; +typedef SmallVector BBVector; ++typedef SmallVector BranchVector; +typedef SmallVector BBValueVector; + ++typedef SmallPtrSet BBSet; ++ +typedef DenseMap PhiMap; ++typedef DenseMap DTN2UnsignedMap; +typedef DenseMap BBPhiMap; +typedef DenseMap BBPredicates; +typedef DenseMap PredMap; -+typedef DenseMap VisitedMap; ++typedef DenseMap BB2BBMap; ++typedef DenseMap BB2BBVecMap; + +// The name for newly created blocks. + +static const char *FlowBlockName = "Flow"; + ++/// @brief Find the nearest common dominator for multiple BasicBlocks ++/// ++/// Helper class for AMDGPUStructurizeCFG ++/// TODO: Maybe move into common code ++class NearestCommonDominator { ++ ++ DominatorTree *DT; ++ ++ DTN2UnsignedMap IndexMap; ++ ++ BasicBlock *Result; ++ unsigned ResultIndex; ++ bool ExplicitMentioned; ++ ++public: ++ /// \brief Start a new query ++ NearestCommonDominator(DominatorTree *DomTree) { ++ DT = DomTree; ++ Result = 0; ++ } ++ ++ /// \brief Add BB to the resulting dominator ++ void addBlock(BasicBlock *BB, bool Remember = true) { ++ ++ DomTreeNode *Node = DT->getNode(BB); ++ ++ if (Result == 0) { ++ unsigned Numbering = 0; ++ for (;Node;Node = Node->getIDom()) ++ IndexMap[Node] = ++Numbering; ++ Result = BB; ++ ResultIndex = 1; ++ ExplicitMentioned = Remember; ++ return; ++ } ++ ++ for (;Node;Node = Node->getIDom()) ++ if (IndexMap.count(Node)) ++ break; ++ else ++ IndexMap[Node] = 0; ++ ++ assert(Node && "Dominator tree invalid!"); ++ ++ unsigned Numbering = IndexMap[Node]; ++ if (Numbering > ResultIndex) { ++ Result = Node->getBlock(); ++ ResultIndex = Numbering; ++ ExplicitMentioned = Remember && (Result == BB); ++ } else if (Numbering == ResultIndex) { ++ ExplicitMentioned |= Remember; ++ } ++ } ++ ++ /// \brief Is "Result" one of the BBs added with "Remember" = True? ++ bool wasResultExplicitMentioned() { ++ return ExplicitMentioned; ++ } ++ ++ /// \brief Get the query result ++ BasicBlock *getResult() { ++ return Result; ++ } ++}; ++ +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. 
+/// @@ -3019,46 +2935,63 @@ index 0000000..22338b5 + DominatorTree *DT; + + RNVector Order; -+ VisitedMap Visited; -+ PredMap Predicates; -+ BBPhiMap DeletedPhis; -+ BBVector FlowsInserted; ++ BBSet Visited; + -+ BasicBlock *LoopStart; -+ BasicBlock *LoopEnd; -+ BBPredicates LoopPred; ++ BBPhiMap DeletedPhis; ++ BB2BBVecMap AddedPhis; ++ ++ PredMap Predicates; ++ BranchVector Conditions; ++ ++ BB2BBMap Loops; ++ PredMap LoopPreds; ++ BranchVector LoopConds; ++ ++ RegionNode *PrevNode; + + void orderNodes(); + -+ void buildPredicate(BranchInst *Term, unsigned Idx, -+ BBPredicates &Pred, bool Invert); ++ void analyzeLoops(RegionNode *N); + -+ void analyzeBlock(BasicBlock *BB); ++ Value *invert(Value *Condition); + -+ void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx); ++ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); ++ ++ void gatherPredicates(RegionNode *N); + + void collectInfos(); + -+ bool dominatesPredicates(BasicBlock *A, BasicBlock *B); -+ -+ void killTerminator(BasicBlock *BB); -+ -+ RegionNode *skipChained(RegionNode *Node); ++ void insertConditions(bool Loops); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + -+ BasicBlock *getNextFlow(BasicBlock *Prev); ++ void setPhiValues(); + -+ bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node); ++ void killTerminator(BasicBlock *BB); + -+ BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node); ++ void changeExit(RegionNode *Node, BasicBlock *NewExit, ++ bool IncludeDominator); ++ ++ BasicBlock *getNextFlow(BasicBlock *Dominator); ++ ++ BasicBlock *needPrefix(bool NeedEmpty); ++ ++ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); ++ ++ void setPrevNode(BasicBlock *BB); ++ ++ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node); ++ ++ bool isPredictableTrue(RegionNode *Node); ++ ++ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd); ++ ++ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void createFlow(); + -+ void insertConditions(); -+ + void rebuildSSA(); + +public: @@ -3111,212 +3044,214 @@ index 0000000..22338b5 + } +} + -+/// \brief Build blocks and loop predicates -+void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx, -+ BBPredicates &Pred, bool Invert) { -+ Value *True = Invert ? BoolFalse : BoolTrue; -+ Value *False = Invert ? BoolTrue : BoolFalse; ++/// \brief Determine the end of the loops ++void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) { + -+ RegionInfo *RI = ParentRegion->getRegionInfo(); -+ BasicBlock *BB = Term->getParent(); -+ -+ // Handle the case where multiple regions start at the same block -+ Region *R = BB != ParentRegion->getEntry() ? 
-+ RI->getRegionFor(BB) : ParentRegion; -+ -+ if (R == ParentRegion) { -+ // It's a top level block in our region -+ Value *Cond = True; -+ if (Term->isConditional()) { -+ BasicBlock *Other = Term->getSuccessor(!Idx); -+ -+ if (Visited.count(Other)) { -+ if (!Pred.count(Other)) -+ Pred[Other] = False; -+ -+ if (!Pred.count(BB)) -+ Pred[BB] = True; -+ return; -+ } -+ Cond = Term->getCondition(); -+ -+ if (Idx != Invert) -+ Cond = BinaryOperator::CreateNot(Cond, "", Term); -+ } -+ -+ Pred[BB] = Cond; -+ -+ } else if (ParentRegion->contains(R)) { -+ // It's a block in a sub region -+ while(R->getParent() != ParentRegion) -+ R = R->getParent(); -+ -+ Pred[R->getEntry()] = True; ++ if (N->isSubRegion()) { ++ // Test for exit as back edge ++ BasicBlock *Exit = N->getNodeAs()->getExit(); ++ if (Visited.count(Exit)) ++ Loops[Exit] = N->getEntry(); + + } else { -+ // It's a branch from outside into our parent region -+ Pred[BB] = True; -+ } -+} -+ -+/// \brief Analyze the successors of each block and build up predicates -+void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) { -+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB); -+ BBPredicates &Pred = Predicates[BB]; -+ -+ for (; PI != PE; ++PI) { -+ BranchInst *Term = cast((*PI)->getTerminator()); ++ // Test for sucessors as back edge ++ BasicBlock *BB = N->getNodeAs(); ++ BranchInst *Term = cast(BB->getTerminator()); + + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); -+ if (Succ != BB) -+ continue; -+ buildPredicate(Term, i, Pred, false); ++ ++ if (Visited.count(Succ)) ++ Loops[Succ] = BB; + } + } +} + -+/// \brief Analyze the conditions leading to loop to a previous block -+void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) { -+ BranchInst *Term = cast(BB->getTerminator()); ++/// \brief Invert the given condition ++Value *AMDGPUStructurizeCFG::invert(Value *Condition) { + -+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { -+ BasicBlock *Succ = Term->getSuccessor(i); ++ // First: Check if it's a constant ++ if (Condition == BoolTrue) ++ return BoolFalse; + -+ // Ignore it if it's not a back edge -+ if (!Visited.count(Succ)) ++ if (Condition == BoolFalse) ++ return BoolTrue; ++ ++ if (Condition == BoolUndef) ++ return BoolUndef; ++ ++ // Second: If the condition is already inverted, return the original value ++ if (match(Condition, m_Not(m_Value(Condition)))) ++ return Condition; ++ ++ // Third: Check all the users for an invert ++ BasicBlock *Parent = cast(Condition)->getParent(); ++ for (Value::use_iterator I = Condition->use_begin(), ++ E = Condition->use_end(); I != E; ++I) { ++ ++ Instruction *User = dyn_cast(*I); ++ if (!User || User->getParent() != Parent) + continue; + -+ buildPredicate(Term, i, LoopPred, true); ++ if (match(*I, m_Not(m_Specific(Condition)))) ++ return *I; ++ } + -+ LoopEnd = BB; -+ if (Visited[Succ] < LoopIdx) { -+ LoopIdx = Visited[Succ]; -+ LoopStart = Succ; ++ // Last option: Create a new instruction ++ return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); ++} ++ ++/// \brief Build the condition for one edge ++Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, ++ bool Invert) { ++ Value *Cond = Invert ? 
BoolFalse : BoolTrue; ++ if (Term->isConditional()) { ++ Cond = Term->getCondition(); ++ ++ if (Idx != Invert) ++ Cond = invert(Cond); ++ } ++ return Cond; ++} ++ ++/// \brief Analyze the predecessors of each block and build up predicates ++void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) { ++ ++ RegionInfo *RI = ParentRegion->getRegionInfo(); ++ BasicBlock *BB = N->getEntry(); ++ BBPredicates &Pred = Predicates[BB]; ++ BBPredicates &LPred = LoopPreds[BB]; ++ ++ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); ++ PI != PE; ++PI) { ++ ++ // Ignore it if it's a branch from outside into our region entry ++ if (!ParentRegion->contains(*PI)) ++ continue; ++ ++ Region *R = RI->getRegionFor(*PI); ++ if (R == ParentRegion) { ++ ++ // It's a top level block in our region ++ BranchInst *Term = cast((*PI)->getTerminator()); ++ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { ++ BasicBlock *Succ = Term->getSuccessor(i); ++ if (Succ != BB) ++ continue; ++ ++ if (Visited.count(*PI)) { ++ // Normal forward edge ++ if (Term->isConditional()) { ++ // Try to treat it like an ELSE block ++ BasicBlock *Other = Term->getSuccessor(!i); ++ if (Visited.count(Other) && !Loops.count(Other) && ++ !Pred.count(Other) && !Pred.count(*PI)) { ++ ++ Pred[Other] = BoolFalse; ++ Pred[*PI] = BoolTrue; ++ continue; ++ } ++ } ++ Pred[*PI] = buildCondition(Term, i, false); ++ ++ } else { ++ // Back edge ++ LPred[*PI] = buildCondition(Term, i, true); ++ } ++ } ++ ++ } else { ++ ++ // It's an exit from a sub region ++ while(R->getParent() != ParentRegion) ++ R = R->getParent(); ++ ++ // Edge from inside a subregion to its entry, ignore it ++ if (R == N) ++ continue; ++ ++ BasicBlock *Entry = R->getEntry(); ++ if (Visited.count(Entry)) ++ Pred[Entry] = BoolTrue; ++ else ++ LPred[Entry] = BoolFalse; + } + } +} + +/// \brief Collect various loop and predicate infos +void AMDGPUStructurizeCFG::collectInfos() { -+ unsigned Number = 0, LoopIdx = ~0; + + // Reset predicate + Predicates.clear(); + + // and loop infos -+ LoopStart = LoopEnd = 0; -+ LoopPred.clear(); ++ Loops.clear(); ++ LoopPreds.clear(); + -+ RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); -+ for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) { ++ // Reset the visited nodes ++ Visited.clear(); ++ ++ for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); ++ OI != OE; ++OI) { + + // Analyze all the conditions leading to a node -+ analyzeBlock((*OI)->getEntry()); ++ gatherPredicates(*OI); + -+ if ((*OI)->isSubRegion()) -+ continue; ++ // Remember that we've seen this node ++ Visited.insert((*OI)->getEntry()); + -+ // Find the first/last loop nodes and loop predicates -+ analyzeLoop((*OI)->getNodeAs(), LoopIdx); ++ // Find the last back edges ++ analyzeLoops(*OI); + } +} + -+/// \brief Does A dominate all the predicates of B ? -+bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) { -+ BBPredicates &Preds = Predicates[B]; -+ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); -+ PI != PE; ++PI) { ++/// \brief Insert the missing branch conditions ++void AMDGPUStructurizeCFG::insertConditions(bool Loops) { ++ BranchVector &Conds = Loops ? LoopConds : Conditions; ++ Value *Default = Loops ? 
BoolTrue : BoolFalse; ++ SSAUpdater PhiInserter; + -+ if (!DT->dominates(A, PI->first)) -+ return false; -+ } -+ return true; -+} ++ for (BranchVector::iterator I = Conds.begin(), ++ E = Conds.end(); I != E; ++I) { + -+/// \brief Remove phi values from all successors and the remove the terminator. -+void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { -+ TerminatorInst *Term = BB->getTerminator(); -+ if (!Term) -+ return; ++ BranchInst *Term = *I; ++ assert(Term->isConditional()); + -+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); -+ SI != SE; ++SI) { ++ BasicBlock *Parent = Term->getParent(); ++ BasicBlock *SuccTrue = Term->getSuccessor(0); ++ BasicBlock *SuccFalse = Term->getSuccessor(1); + -+ delPhiValues(BB, *SI); -+ } ++ PhiInserter.Initialize(Boolean, ""); ++ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); ++ PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + -+ Term->eraseFromParent(); -+} ++ BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; + -+/// First: Skip forward to the first region node that either isn't a subregion or not -+/// dominating it's exit, remove all the skipped nodes from the node order. -+/// -+/// Second: Handle the first successor directly if the resulting nodes successor -+/// predicates are still dominated by the original entry -+RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) { -+ BasicBlock *Entry = Node->getEntry(); ++ NearestCommonDominator Dominator(DT); ++ Dominator.addBlock(Parent, false); + -+ // Skip forward as long as it is just a linear flow -+ while (true) { -+ BasicBlock *Entry = Node->getEntry(); -+ BasicBlock *Exit; ++ Value *ParentValue = 0; ++ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); ++ PI != PE; ++PI) { + -+ if (Node->isSubRegion()) { -+ Exit = Node->getNodeAs()->getExit(); -+ } else { -+ TerminatorInst *Term = Entry->getTerminator(); -+ if (Term->getNumSuccessors() != 1) ++ if (PI->first == Parent) { ++ ParentValue = PI->second; + break; -+ Exit = Term->getSuccessor(0); ++ } ++ PhiInserter.AddAvailableValue(PI->first, PI->second); ++ Dominator.addBlock(PI->first); + } + -+ // It's a back edge, break here so we can insert a loop node -+ if (!Visited.count(Exit)) -+ return Node; ++ if (ParentValue) { ++ Term->setCondition(ParentValue); ++ } else { ++ if (!Dominator.wasResultExplicitMentioned()) ++ PhiInserter.AddAvailableValue(Dominator.getResult(), Default); + -+ // More than node edges are pointing to exit -+ if (!DT->dominates(Entry, Exit)) -+ return Node; -+ -+ RegionNode *Next = ParentRegion->getNode(Exit); -+ RNVector::iterator I = std::find(Order.begin(), Order.end(), Next); -+ assert(I != Order.end()); -+ -+ Visited.erase(Next->getEntry()); -+ Order.erase(I); -+ Node = Next; ++ Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); ++ } + } -+ -+ BasicBlock *BB = Node->getEntry(); -+ TerminatorInst *Term = BB->getTerminator(); -+ if (Term->getNumSuccessors() != 2) -+ return Node; -+ -+ // Our node has exactly two succesors, check if we can handle -+ // any of them directly -+ BasicBlock *Succ = Term->getSuccessor(0); -+ if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) { -+ Succ = Term->getSuccessor(1); -+ if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) -+ return Node; -+ } else { -+ BasicBlock *Succ2 = Term->getSuccessor(1); -+ if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] && -+ dominatesPredicates(Entry, Succ2)) -+ Succ = Succ2; -+ } -+ -+ RegionNode *Next = 
ParentRegion->getNode(Succ); -+ RNVector::iterator E = Order.end(); -+ RNVector::iterator I = std::find(Order.begin(), E, Next); -+ assert(I != E); -+ -+ killTerminator(BB); -+ FlowsInserted.push_back(BB); -+ Visited.erase(Succ); -+ Order.erase(I); -+ return ParentRegion->getNode(wireFlowBlock(BB, Next)); +} + +/// \brief Remove all PHI values coming from "From" into "To" and remember @@ -3334,224 +3269,306 @@ index 0000000..22338b5 + } +} + -+/// \brief Add the PHI values back once we knew the new predecessor ++/// \brief Add a dummy PHI value as soon as we knew the new predecessor +void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { -+ if (!DeletedPhis.count(To)) ++ for (BasicBlock::iterator I = To->begin(), E = To->end(); ++ I != E && isa(*I);) { ++ ++ PHINode &Phi = cast(*I++); ++ Value *Undef = UndefValue::get(Phi.getType()); ++ Phi.addIncoming(Undef, From); ++ } ++ AddedPhis[To].push_back(From); ++} ++ ++/// \brief Add the real PHI value as soon as everything is set up ++void AMDGPUStructurizeCFG::setPhiValues() { ++ ++ SSAUpdater Updater; ++ for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end(); ++ AI != AE; ++AI) { ++ ++ BasicBlock *To = AI->first; ++ BBVector &From = AI->second; ++ ++ if (!DeletedPhis.count(To)) ++ continue; ++ ++ PhiMap &Map = DeletedPhis[To]; ++ for (PhiMap::iterator PI = Map.begin(), PE = Map.end(); ++ PI != PE; ++PI) { ++ ++ PHINode *Phi = PI->first; ++ Value *Undef = UndefValue::get(Phi->getType()); ++ Updater.Initialize(Phi->getType(), ""); ++ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); ++ Updater.AddAvailableValue(To, Undef); ++ ++ NearestCommonDominator Dominator(DT); ++ Dominator.addBlock(To, false); ++ for (BBValueVector::iterator VI = PI->second.begin(), ++ VE = PI->second.end(); VI != VE; ++VI) { ++ ++ Updater.AddAvailableValue(VI->first, VI->second); ++ Dominator.addBlock(VI->first); ++ } ++ ++ if (!Dominator.wasResultExplicitMentioned()) ++ Updater.AddAvailableValue(Dominator.getResult(), Undef); ++ ++ for (BBVector::iterator FI = From.begin(), FE = From.end(); ++ FI != FE; ++FI) { ++ ++ int Idx = Phi->getBasicBlockIndex(*FI); ++ assert(Idx != -1); ++ Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI)); ++ } ++ } ++ ++ DeletedPhis.erase(To); ++ } ++ assert(DeletedPhis.empty()); ++} ++ ++/// \brief Remove phi values from all successors and then remove the terminator. 
++void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { ++ TerminatorInst *Term = BB->getTerminator(); ++ if (!Term) + return; + -+ PhiMap &Map = DeletedPhis[To]; -+ SSAUpdater Updater; ++ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); ++ SI != SE; ++SI) { + -+ for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) { -+ -+ PHINode *Phi = I->first; -+ Updater.Initialize(Phi->getType(), ""); -+ BasicBlock *Fallback = To; -+ bool HaveFallback = false; -+ -+ for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end(); -+ VI != VE; ++VI) { -+ -+ Updater.AddAvailableValue(VI->first, VI->second); -+ BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first); -+ if (Dom == VI->first) -+ HaveFallback = true; -+ else if (Dom != Fallback) -+ HaveFallback = false; -+ Fallback = Dom; -+ } -+ if (!HaveFallback) { -+ Value *Undef = UndefValue::get(Phi->getType()); -+ Updater.AddAvailableValue(Fallback, Undef); -+ } -+ -+ Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From); ++ delPhiValues(BB, *SI); ++ } ++ ++ Term->eraseFromParent(); ++} ++ ++/// \brief Let node exit(s) point to NewExit ++void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, ++ bool IncludeDominator) { ++ ++ if (Node->isSubRegion()) { ++ Region *SubRegion = Node->getNodeAs(); ++ BasicBlock *OldExit = SubRegion->getExit(); ++ BasicBlock *Dominator = 0; ++ ++ // Find all the edges from the sub region to the exit ++ for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); ++ I != E;) { ++ ++ BasicBlock *BB = *I++; ++ if (!SubRegion->contains(BB)) ++ continue; ++ ++ // Modify the edges to point to the new exit ++ delPhiValues(BB, OldExit); ++ BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit); ++ addPhiValues(BB, NewExit); ++ ++ // Find the new dominator (if requested) ++ if (IncludeDominator) { ++ if (!Dominator) ++ Dominator = BB; ++ else ++ Dominator = DT->findNearestCommonDominator(Dominator, BB); ++ } ++ } ++ ++ // Change the dominator (if requested) ++ if (Dominator) ++ DT->changeImmediateDominator(NewExit, Dominator); ++ ++ // Update the region info ++ SubRegion->replaceExit(NewExit); ++ ++ } else { ++ BasicBlock *BB = Node->getNodeAs(); ++ killTerminator(BB); ++ BranchInst::Create(NewExit, BB); ++ addPhiValues(BB, NewExit); ++ if (IncludeDominator) ++ DT->changeImmediateDominator(NewExit, BB); + } -+ DeletedPhis.erase(To); +} + +/// \brief Create a new flow node and update dominator tree and region info -+BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) { ++BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) { + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); -+ DT->addNewBlock(Flow, Prev); ++ DT->addNewBlock(Flow, Dominator); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); -+ FlowsInserted.push_back(Flow); + return Flow; +} + ++/// \brief Create a new or reuse the previous node as flow node ++BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) { ++ ++ BasicBlock *Entry = PrevNode->getEntry(); ++ ++ if (!PrevNode->isSubRegion()) { ++ killTerminator(Entry); ++ if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) ++ return Entry; ++ ++ } ++ ++ // create a new flow node ++ BasicBlock *Flow = getNextFlow(Entry); ++ ++ // and wire it up ++ changeExit(PrevNode, Flow, true); ++ PrevNode = ParentRegion->getBBNode(Flow); ++ return Flow; ++} ++ ++/// \brief Returns the region exit if possible, otherwise just a new flow node ++BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow, ++ bool ExitUseAllowed) { ++ ++ if (Order.empty() && ExitUseAllowed) { ++ BasicBlock *Exit = ParentRegion->getExit(); ++ DT->changeImmediateDominator(Exit, Flow); ++ addPhiValues(Flow, Exit); ++ return Exit; ++ } ++ return getNextFlow(Flow); ++} ++ ++/// \brief Set the previous node ++void AMDGPUStructurizeCFG::setPrevNode(BasicBlock *BB) { ++ PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; ++} ++ ++/// \brief Does BB dominate all the predicates of Node ? ++bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) { ++ BBPredicates &Preds = Predicates[Node->getEntry()]; ++ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); ++ PI != PE; ++PI) { ++ ++ if (!DT->dominates(BB, PI->first)) ++ return false; ++ } ++ return true; ++} ++ +/// \brief Can we predict that this node will always be called? 
-+bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev, -+ BasicBlock *Node) { -+ BBPredicates &Preds = Predicates[Node]; ++bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) { ++ ++ BBPredicates &Preds = Predicates[Node->getEntry()]; + bool Dominated = false; + ++ // Regionentry is always true ++ if (PrevNode == 0) ++ return true; ++ + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + -+ if (!Dominated && DT->dominates(I->first, Prev)) ++ if (!Dominated && DT->dominates(I->first, PrevNode->getEntry())) + Dominated = true; + } ++ ++ // TODO: The dominator check is too strict + return Dominated; +} + -+/// \brief Wire up the new control flow by inserting or updating the branch -+/// instructions at node exits -+BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev, -+ RegionNode *Node) { -+ BasicBlock *Entry = Node->getEntry(); ++/// Take one node from the order vector and wire it up ++void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed, ++ BasicBlock *LoopEnd) { + -+ if (LoopStart == Entry) { -+ LoopStart = Prev; -+ LoopPred[Prev] = BoolTrue; -+ } ++ RegionNode *Node = Order.pop_back_val(); ++ Visited.insert(Node->getEntry()); + -+ // Wire it up temporary, skipChained may recurse into us -+ BranchInst::Create(Entry, Prev); -+ DT->changeImmediateDominator(Entry, Prev); -+ addPhiValues(Prev, Entry); -+ -+ Node = skipChained(Node); -+ -+ BasicBlock *Next = getNextFlow(Prev); -+ if (!isPredictableTrue(Prev, Entry)) { -+ // Let Prev point to entry and next block -+ Prev->getTerminator()->eraseFromParent(); -+ BranchInst::Create(Entry, Next, BoolUndef, Prev); -+ } else { -+ DT->changeImmediateDominator(Next, Entry); -+ } -+ -+ // Let node exit(s) point to next block -+ if (Node->isSubRegion()) { -+ Region *SubRegion = Node->getNodeAs(); -+ BasicBlock *Exit = SubRegion->getExit(); -+ -+ // Find all the edges from the sub region to the exit -+ BBVector ToDo; -+ for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { -+ if (SubRegion->contains(*I)) -+ ToDo.push_back(*I); ++ if (isPredictableTrue(Node)) { ++ // Just a linear flow ++ if (PrevNode) { ++ changeExit(PrevNode, Node->getEntry(), true); + } -+ -+ // Modify the edges to point to the new flow block -+ for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) { -+ delPhiValues(*I, Exit); -+ TerminatorInst *Term = (*I)->getTerminator(); -+ Term->replaceUsesOfWith(Exit, Next); -+ } -+ -+ // Update the region info -+ SubRegion->replaceExit(Next); ++ PrevNode = Node; + + } else { -+ BasicBlock *BB = Node->getNodeAs(); -+ killTerminator(BB); -+ BranchInst::Create(Next, BB); ++ // Insert extra prefix node (or reuse last one) ++ BasicBlock *Flow = needPrefix(false); + -+ if (BB == LoopEnd) -+ LoopEnd = 0; ++ // Insert extra postfix node (or use exit instead) ++ BasicBlock *Entry = Node->getEntry(); ++ BasicBlock *Next = needPostfix(Flow, ExitUseAllowed); ++ ++ // let it point to entry and next block ++ Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow)); ++ addPhiValues(Flow, Entry); ++ DT->changeImmediateDominator(Entry, Flow); ++ ++ PrevNode = Node; ++ while (!Order.empty() && !Visited.count(LoopEnd) && ++ dominatesPredicates(Entry, Order.back())) { ++ handleLoops(false, LoopEnd); ++ } ++ ++ changeExit(PrevNode, Next, false); ++ setPrevNode(Next); + } -+ -+ return Next; +} + -+/// Destroy node order and visited map, build up flow order instead. 
++void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed, ++ BasicBlock *LoopEnd) { ++ RegionNode *Node = Order.back(); ++ BasicBlock *LoopStart = Node->getEntry(); ++ ++ if (!Loops.count(LoopStart)) { ++ wireFlow(ExitUseAllowed, LoopEnd); ++ return; ++ } ++ ++ if (!isPredictableTrue(Node)) ++ LoopStart = needPrefix(true); ++ ++ LoopEnd = Loops[Node->getEntry()]; ++ wireFlow(false, LoopEnd); ++ while (!Visited.count(LoopEnd)) { ++ handleLoops(false, LoopEnd); ++ } ++ ++ // Create an extra loop end node ++ LoopEnd = needPrefix(false); ++ BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); ++ LoopConds.push_back(BranchInst::Create(Next, LoopStart, ++ BoolUndef, LoopEnd)); ++ addPhiValues(LoopEnd, LoopStart); ++ setPrevNode(Next); ++} ++ +/// After this function control flow looks like it should be, but -+/// branches only have undefined conditions. ++/// branches and PHI nodes only have undefined conditions. +void AMDGPUStructurizeCFG::createFlow() { -+ DeletedPhis.clear(); -+ -+ BasicBlock *Prev = Order.pop_back_val()->getEntry(); -+ assert(Prev == ParentRegion->getEntry() && "Incorrect node order!"); -+ Visited.erase(Prev); -+ -+ if (LoopStart == Prev) { -+ // Loop starts at entry, split entry so that we can predicate it -+ BasicBlock::iterator Insert = Prev->getFirstInsertionPt(); -+ BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName); -+ DT->addNewBlock(Split, Prev); -+ ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); -+ Predicates[Split] = Predicates[Prev]; -+ Order.push_back(ParentRegion->getBBNode(Split)); -+ LoopPred[Prev] = BoolTrue; -+ -+ } else if (LoopStart == Order.back()->getEntry()) { -+ // Loop starts behind entry, split entry so that we can jump to it -+ Instruction *Term = Prev->getTerminator(); -+ BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName); -+ DT->addNewBlock(Split, Prev); -+ ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); -+ Prev = Split; -+ } -+ -+ killTerminator(Prev); -+ FlowsInserted.clear(); -+ FlowsInserted.push_back(Prev); -+ -+ while (!Order.empty()) { -+ RegionNode *Node = Order.pop_back_val(); -+ Visited.erase(Node->getEntry()); -+ Prev = wireFlowBlock(Prev, Node); -+ if (LoopStart && !LoopEnd) { -+ // Create an extra loop end node -+ LoopEnd = Prev; -+ Prev = getNextFlow(LoopEnd); -+ BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd); -+ addPhiValues(LoopEnd, LoopStart); -+ } -+ } + + BasicBlock *Exit = ParentRegion->getExit(); -+ BranchInst::Create(Exit, Prev); -+ addPhiValues(Prev, Exit); -+ if (DT->dominates(ParentRegion->getEntry(), Exit)) -+ DT->changeImmediateDominator(Exit, Prev); ++ bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); + -+ if (LoopStart && LoopEnd) { -+ BBVector::iterator FI = std::find(FlowsInserted.begin(), -+ FlowsInserted.end(), -+ LoopStart); -+ for (; *FI != LoopEnd; ++FI) { -+ addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0)); -+ } ++ DeletedPhis.clear(); ++ AddedPhis.clear(); ++ Conditions.clear(); ++ LoopConds.clear(); ++ ++ PrevNode = 0; ++ Visited.clear(); ++ ++ while (!Order.empty()) { ++ handleLoops(EntryDominatesExit, 0); + } + -+ assert(Order.empty()); -+ assert(Visited.empty()); -+ assert(DeletedPhis.empty()); -+} -+ -+/// \brief Insert the missing branch conditions -+void AMDGPUStructurizeCFG::insertConditions() { -+ SSAUpdater PhiInserter; -+ -+ for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end(); -+ FI != FE; ++FI) { -+ -+ BranchInst *Term = cast((*FI)->getTerminator()); -+ if 
(Term->isUnconditional()) -+ continue; -+ -+ PhiInserter.Initialize(Boolean, ""); -+ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse); -+ -+ BasicBlock *Succ = Term->getSuccessor(0); -+ BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ]; -+ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); -+ PI != PE; ++PI) { -+ -+ PhiInserter.AddAvailableValue(PI->first, PI->second); -+ } -+ -+ Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI)); -+ } ++ if (PrevNode) ++ changeExit(PrevNode, Exit, EntryDominatesExit); ++ else ++ assert(EntryDominatesExit); +} + +/// Handle a rare case where the disintegrated nodes instructions @@ -3609,14 +3626,21 @@ index 0000000..22338b5 + orderNodes(); + collectInfos(); + createFlow(); -+ insertConditions(); ++ insertConditions(false); ++ insertConditions(true); ++ setPhiValues(); + rebuildSSA(); + ++ // Cleanup + Order.clear(); + Visited.clear(); -+ Predicates.clear(); + DeletedPhis.clear(); -+ FlowsInserted.clear(); ++ AddedPhis.clear(); ++ Predicates.clear(); ++ Conditions.clear(); ++ Loops.clear(); ++ LoopPreds.clear(); ++ LoopConds.clear(); + + return true; +} @@ -3791,10 +3815,10 @@ index 0000000..cab7884 +#endif // AMDGPUSUBTARGET_H diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp new file mode 100644 -index 0000000..821e864 +index 0000000..e2f00be --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp -@@ -0,0 +1,154 @@ +@@ -0,0 +1,153 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure @@ -3942,7 +3966,6 @@ index 0000000..821e864 + addPass(&FinalizeMachineBundlesID); + addPass(createR600LowerConstCopy(*TM)); + } else { -+ addPass(createSILowerLiteralConstantsPass(*TM)); + addPass(createSILowerControlFlowPass(*TM)); + } + @@ -8242,10 +8265,10 @@ index 0000000..6dc2deb +#endif // AMDILEVERGREENDEVICE_H diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp new file mode 100644 -index 0000000..2699409 +index 0000000..2e726e9 --- /dev/null +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp -@@ -0,0 +1,625 @@ +@@ -0,0 +1,577 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure @@ -8320,8 +8343,6 @@ index 0000000..2699409 + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, + SDValue &BaseReg, SDValue& Offset); -+ bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); -+ bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + @@ -8468,7 +8489,9 @@ index 0000000..2699409 + continue; + } + } else { -+ if (!TII->isALUInstr(Use->getMachineOpcode())) { ++ if (!TII->isALUInstr(Use->getMachineOpcode()) || ++ (TII->get(Use->getMachineOpcode()).TSFlags & ++ R600_InstFlag::VECTOR)) { + continue; + } + @@ -8511,7 +8534,8 @@ index 0000000..2699409 + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + const R600InstrInfo *TII = + static_cast(TM.getInstrInfo()); -+ if (Result && Result->isMachineOpcode() ++ if (Result && Result->isMachineOpcode() && ++ !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR) + && TII->isALUInstr(Result->getMachineOpcode())) { + // Fold FNEG/FABS/CONST_ADDRESS + // TODO: Isel can generate multiple 
MachineInst, we need to recursively @@ -8581,6 +8605,8 @@ index 0000000..2699409 + SDValue Operand = Ops[OperandIdx[i] - 1]; + switch (Operand.getOpcode()) { + case AMDGPUISD::CONST_ADDRESS: { ++ if (i == 2) ++ break; + SDValue CstOffset; + if (!Operand.getValueType().isVector() && + SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { @@ -8775,43 +8801,6 @@ index 0000000..2699409 + return false; +} + -+bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base, -+ SDValue& Offset) { -+ if (Addr.getOpcode() == ISD::TargetExternalSymbol || -+ Addr.getOpcode() == ISD::TargetGlobalAddress) { -+ return false; -+ } -+ -+ -+ if (Addr.getOpcode() == ISD::ADD) { -+ bool Match = false; -+ -+ // Find the base ptr and the offset -+ for (unsigned i = 0; i < Addr.getNumOperands(); i++) { -+ SDValue Arg = Addr.getOperand(i); -+ ConstantSDNode * OffsetNode = dyn_cast(Arg); -+ // This arg isn't a constant so it must be the base PTR. -+ if (!OffsetNode) { -+ Base = Addr.getOperand(i); -+ continue; -+ } -+ // Check if the constant argument fits in 8-bits. The offset is in bytes -+ // so we need to convert it to dwords. -+ if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) { -+ Match = true; -+ Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2, -+ MVT::i32); -+ } -+ } -+ return Match; -+ } -+ -+ // Default case, no offset -+ Base = Addr; -+ Offset = CurDAG->getTargetConstant(0, MVT::i32); -+ return true; -+} -+ +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode * IMMOffset; @@ -8839,20 +8828,6 @@ index 0000000..2699409 + return true; +} + -+bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, -+ SDValue& Offset) { -+ if (Addr.getOpcode() == ISD::TargetExternalSymbol || -+ Addr.getOpcode() == ISD::TargetGlobalAddress || -+ Addr.getOpcode() != ISD::ADD) { -+ return false; -+ } -+ -+ Base = Addr.getOperand(0); -+ Offset = Addr.getOperand(1); -+ -+ return true; -+} -+ +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; @@ -11799,10 +11774,10 @@ index 0000000..8ef9f8c +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 -index 0000000..fb17ab7 +index 0000000..d6450a0 --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp -@@ -0,0 +1,153 @@ +@@ -0,0 +1,168 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -11845,6 +11820,21 @@ index 0000000..fb17ab7 + } +} + ++void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, ++ raw_ostream &O) { ++ unsigned Imm = MI->getOperand(OpNum).getImm(); ++ ++ if (Imm == 2) { ++ O << "P0"; ++ } else if (Imm == 1) { ++ O << "P20"; ++ } else if (Imm == 0) { ++ O << "P10"; ++ } else { ++ assert(!"Invalid interpolation parameter slot"); ++ } ++} ++ +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); @@ -11958,10 +11948,10 @@ index 0000000..fb17ab7 +#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 -index 0000000..e775c4c +index 0000000..767a708 --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h -@@ -0,0 +1,53 @@ +@@ -0,0 +1,54 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM 
interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure @@ -11997,6 +11987,7 @@ index 0000000..e775c4c + +private: + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm); + void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -12342,10 +12333,10 @@ index 0000000..3ad0fa6 +#endif // AMDGPUMCASMINFO_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 -index 0000000..9d0d6cf +index 0000000..8721f80 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h -@@ -0,0 +1,60 @@ +@@ -0,0 +1,49 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// +// +// The LLVM Compiler Infrastructure @@ -12390,17 +12381,6 @@ index 0000000..9d0d6cf + SmallVectorImpl &Fixups) const { + return 0; + } -+ virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const { -+ return Value; -+ } -+ virtual uint64_t i32LiteralEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixups) const { -+ return 0; -+ } -+ virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixups) const { -+ return 0; -+ } +}; + +} // End namespace llvm @@ -12655,10 +12635,10 @@ index 0000000..8894a76 +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 -index 0000000..e061b18 +index 0000000..115fe8d --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp -@@ -0,0 +1,580 @@ +@@ -0,0 +1,582 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure @@ -12823,10 +12803,12 @@ index 0000000..e061b18 + case AMDGPU::VTX_READ_PARAM_8_eg: + case AMDGPU::VTX_READ_PARAM_16_eg: + case AMDGPU::VTX_READ_PARAM_32_eg: ++ case AMDGPU::VTX_READ_PARAM_128_eg: + case AMDGPU::VTX_READ_GLOBAL_8_eg: + case AMDGPU::VTX_READ_GLOBAL_32_eg: + case AMDGPU::VTX_READ_GLOBAL_128_eg: -+ case AMDGPU::TEX_VTX_CONSTBUF: { ++ case AMDGPU::TEX_VTX_CONSTBUF: ++ case AMDGPU::TEX_VTX_TEXBUF : { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + @@ -13241,10 +13223,10 @@ index 0000000..e061b18 +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 -index 0000000..c47dc99 +index 0000000..6dfbbe8 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp -@@ -0,0 +1,298 @@ +@@ -0,0 +1,235 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure @@ -13271,38 +13253,16 @@ index 0000000..c47dc99 +#include "llvm/MC/MCFixup.h" +#include "llvm/Support/raw_ostream.h" + -+#define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1)) -+#define SI_INSTR_FLAGS_ENCODING_MASK 0xf -+ -+// These must be kept in sync with SIInstructions.td and also the -+// InstrEncodingInfo array in SIInstrInfo.cpp. 
-+// -+// NOTE: This enum is only used to identify the encoding type within LLVM, -+// the actual encoding type that is part of the instruction format is different -+namespace SIInstrEncodingType { -+ enum Encoding { -+ EXP = 0, -+ LDS = 1, -+ MIMG = 2, -+ MTBUF = 3, -+ MUBUF = 4, -+ SMRD = 5, -+ SOP1 = 6, -+ SOP2 = 7, -+ SOPC = 8, -+ SOPK = 9, -+ SOPP = 10, -+ VINTRP = 11, -+ VOP1 = 12, -+ VOP2 = 13, -+ VOP3 = 14, -+ VOPC = 15 -+ }; -+} -+ +using namespace llvm; + +namespace { ++ ++/// \brief Helper type used in encoding ++typedef union { ++ int32_t I; ++ float F; ++} IntFloatUnion; ++ +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { + SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT @@ -13311,6 +13271,15 @@ index 0000000..c47dc99 + const MCSubtargetInfo &STI; + MCContext &Ctx; + ++ /// \brief Encode a sequence of registers with the correct alignment. ++ unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; ++ ++ /// \brief Can this operand also contain immediate values? ++ bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; ++ ++ /// \brief Encode an fp or int literal ++ uint32_t getLitEncoding(const MCOperand &MO) const; ++ +public: + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + const MCSubtargetInfo &sti, MCContext &ctx) @@ -13326,11 +13295,6 @@ index 0000000..c47dc99 + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups) const; + -+public: -+ -+ /// \brief Encode a sequence of registers with the correct alignment. -+ unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; -+ + /// \brief Encoding for when 2 consecutive registers are used + virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixup) const; @@ -13338,29 +13302,6 @@ index 0000000..c47dc99 + /// \brief Encoding for when 4 consectuive registers are used + virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixup) const; -+ -+ /// \brief Encoding for SMRD indexed loads -+ virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixup) const; -+ -+ /// \brief Post-Encoder method for VOP instructions -+ virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const; -+ -+private: -+ -+ /// \returns this SIInstrEncodingType for this instruction. -+ unsigned getEncodingType(const MCInst &MI) const; -+ -+ /// \brief Get then size in bytes of this instructions encoding. 
-+ unsigned getEncodingBytes(const MCInst &MI) const; -+ -+ /// \returns the hardware encoding for a register -+ unsigned getRegBinaryCode(unsigned reg) const; -+ -+ /// \brief Generated function that returns the hardware encoding for -+ /// a register -+ unsigned getHWRegNum(unsigned reg) const; -+ +}; + +} // End anonymous namespace @@ -13372,39 +13313,131 @@ index 0000000..c47dc99 + return new SIMCCodeEmitter(MCII, MRI, STI, Ctx); +} + ++bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, ++ unsigned OpNo) const { ++ ++ unsigned RegClass = Desc.OpInfo[OpNo].RegClass; ++ return (AMDGPU::SSrc_32RegClassID == RegClass) || ++ (AMDGPU::SSrc_64RegClassID == RegClass) || ++ (AMDGPU::VSrc_32RegClassID == RegClass) || ++ (AMDGPU::VSrc_64RegClassID == RegClass); ++} ++ ++uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { ++ ++ IntFloatUnion Imm; ++ if (MO.isImm()) ++ Imm.I = MO.getImm(); ++ else if (MO.isFPImm()) ++ Imm.F = MO.getFPImm(); ++ else ++ return ~0; ++ ++ if (Imm.I >= 0 && Imm.I <= 64) ++ return 128 + Imm.I; ++ ++ if (Imm.I >= -16 && Imm.I <= -1) ++ return 192 + abs(Imm.I); ++ ++ if (Imm.F == 0.5f) ++ return 240; ++ ++ if (Imm.F == -0.5f) ++ return 241; ++ ++ if (Imm.F == 1.0f) ++ return 242; ++ ++ if (Imm.F == -1.0f) ++ return 243; ++ ++ if (Imm.F == 2.0f) ++ return 244; ++ ++ if (Imm.F == -2.0f) ++ return 245; ++ ++ if (Imm.F == 4.0f) ++ return 246; ++ ++ if (Imm.F == 4.0f) ++ return 247; ++ ++ return 255; ++} ++ +void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups) const { ++ + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups); -+ unsigned bytes = getEncodingBytes(MI); ++ const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); ++ unsigned bytes = Desc.getSize(); ++ + for (unsigned i = 0; i < bytes; i++) { + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + } ++ ++ if (bytes > 4) ++ return; ++ ++ // Check for additional literals in SRC0/1/2 (Op 1/2/3) ++ for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { ++ ++ // Check if this operand should be encoded as [SV]Src ++ if (!isSrcOperand(Desc, i)) ++ continue; ++ ++ // Is this operand a literal immediate? ++ const MCOperand &Op = MI.getOperand(i); ++ if (getLitEncoding(Op) != 255) ++ continue; ++ ++ // Yes! 
Encode it ++ IntFloatUnion Imm; ++ if (Op.isImm()) ++ Imm.I = Op.getImm(); ++ else ++ Imm.F = Op.getFPImm(); ++ ++ for (unsigned j = 0; j < 4; j++) { ++ OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); ++ } ++ ++ // Only one literal value allowed ++ break; ++ } +} + +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl &Fixups) const { -+ if (MO.isReg()) { -+ return getRegBinaryCode(MO.getReg()); -+ } else if (MO.isImm()) { -+ return MO.getImm(); -+ } else if (MO.isFPImm()) { -+ // XXX: Not all instructions can use inline literals -+ // XXX: We should make sure this is a 32-bit constant -+ union { -+ float F; -+ uint32_t I; -+ } Imm; -+ Imm.F = MO.getFPImm(); -+ return Imm.I; -+ } else if (MO.isExpr()) { ++ if (MO.isReg()) ++ return MRI.getEncodingValue(MO.getReg()); ++ ++ if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(FK_PCRel_4); + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); + return 0; -+ } else{ -+ llvm_unreachable("Encoding of this operand type is not supported yet."); + } ++ ++ // Figure out the operand number, needed for isSrcOperand check ++ unsigned OpNo = 0; ++ for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { ++ if (&MO == &MI.getOperand(OpNo)) ++ break; ++ } ++ ++ const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); ++ if (isSrcOperand(Desc, OpNo)) { ++ uint32_t Enc = getLitEncoding(MO); ++ if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) ++ return Enc; ++ ++ } else if (MO.isImm()) ++ return MO.getImm(); ++ ++ llvm_unreachable("Encoding of this operand type is not supported yet."); + return 0; +} + @@ -13414,10 +13447,10 @@ index 0000000..c47dc99 + +unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo, + unsigned shift) const { -+ unsigned regCode = getRegBinaryCode(MI.getOperand(OpNo).getReg()); -+ return regCode >> shift; -+ return 0; ++ unsigned regCode = MRI.getEncodingValue(MI.getOperand(OpNo).getReg()); ++ return (regCode & 0xff) >> shift; +} ++ +unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI, + unsigned OpNo , + SmallVectorImpl &Fixup) const { @@ -13429,120 +13462,6 @@ index 0000000..c47dc99 + SmallVectorImpl &Fixup) const { + return GPRAlign(MI, OpNo, 2); +} -+ -+#define SMRD_OFFSET_MASK 0xff -+#define SMRD_IMM_SHIFT 8 -+#define SMRD_SBASE_MASK 0x3f -+#define SMRD_SBASE_SHIFT 9 -+/// This function is responsibe for encoding the offset -+/// and the base ptr for SMRD instructions it should return a bit string in -+/// this format: -+/// -+/// OFFSET = bits{7-0} -+/// IMM = bits{8} -+/// SBASE = bits{14-9} -+/// -+uint32_t SIMCCodeEmitter::SMRDmemriEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixup) const { -+ uint32_t Encoding; -+ -+ const MCOperand &OffsetOp = MI.getOperand(OpNo + 1); -+ -+ //XXX: Use this function for SMRD loads with register offsets -+ assert(OffsetOp.isImm()); -+ -+ Encoding = -+ (getMachineOpValue(MI, OffsetOp, Fixup) & SMRD_OFFSET_MASK) -+ | (1 << SMRD_IMM_SHIFT) //XXX If the Offset is a register we shouldn't set this bit -+ | ((GPR2AlignEncode(MI, OpNo, Fixup) & SMRD_SBASE_MASK) << SMRD_SBASE_SHIFT) -+ ; -+ -+ return Encoding; -+} -+ -+//===----------------------------------------------------------------------===// -+// Post Encoder Callbacks -+//===----------------------------------------------------------------------===// -+ -+uint64_t SIMCCodeEmitter::VOPPostEncode(const MCInst &MI, uint64_t Value) const{ -+ unsigned encodingType = getEncodingType(MI); -+ unsigned numSrcOps; -+ 
unsigned vgprBitOffset; -+ -+ if (encodingType == SIInstrEncodingType::VOP3) { -+ numSrcOps = 3; -+ vgprBitOffset = 32; -+ } else { -+ numSrcOps = 1; -+ vgprBitOffset = 0; -+ } -+ -+ // Add one to skip over the destination reg operand. -+ for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) { -+ const MCOperand &MO = MI.getOperand(opIdx); -+ if (MO.isReg()) { -+ unsigned reg = MI.getOperand(opIdx).getReg(); -+ if (AMDGPUMCRegisterClasses[AMDGPU::VReg_32RegClassID].contains(reg) || -+ AMDGPUMCRegisterClasses[AMDGPU::VReg_64RegClassID].contains(reg)) { -+ Value |= (VGPR_BIT(opIdx)) << vgprBitOffset; -+ } -+ } else if (MO.isFPImm()) { -+ union { -+ float f; -+ uint32_t i; -+ } Imm; -+ // XXX: Not all instructions can use inline literals -+ // XXX: We should make sure this is a 32-bit constant -+ Imm.f = MO.getFPImm(); -+ Value |= ((uint64_t)Imm.i) << 32; -+ } -+ } -+ return Value; -+} -+ -+//===----------------------------------------------------------------------===// -+// Encoding helper functions -+//===----------------------------------------------------------------------===// -+ -+unsigned SIMCCodeEmitter::getEncodingType(const MCInst &MI) const { -+ return MCII.get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK; -+} -+ -+unsigned SIMCCodeEmitter::getEncodingBytes(const MCInst &MI) const { -+ -+ // These instructions aren't real instructions with an encoding type, so -+ // we need to manually specify their size. -+ switch (MI.getOpcode()) { -+ default: break; -+ case AMDGPU::SI_LOAD_LITERAL_I32: -+ case AMDGPU::SI_LOAD_LITERAL_F32: -+ return 4; -+ } -+ -+ unsigned encoding_type = getEncodingType(MI); -+ switch (encoding_type) { -+ case SIInstrEncodingType::EXP: -+ case SIInstrEncodingType::LDS: -+ case SIInstrEncodingType::MUBUF: -+ case SIInstrEncodingType::MTBUF: -+ case SIInstrEncodingType::MIMG: -+ case SIInstrEncodingType::VOP3: -+ return 8; -+ default: -+ return 4; -+ } -+} -+ -+ -+unsigned SIMCCodeEmitter::getRegBinaryCode(unsigned reg) const { -+ switch (reg) { -+ case AMDGPU::M0: return 124; -+ case AMDGPU::SREG_LIT_0: return 128; -+ case AMDGPU::SI_LITERAL_CONSTANT: return 255; -+ default: return MRI.getEncodingValue(reg); -+ } -+} -+ diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile new file mode 100644 index 0000000..1b3ebbe @@ -13574,10 +13493,10 @@ index 0000000..1b3ebbe +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td new file mode 100644 -index 0000000..3dc1ecd +index 0000000..868810c --- /dev/null +++ b/lib/Target/R600/Processors.td -@@ -0,0 +1,29 @@ +@@ -0,0 +1,30 @@ +//===-- Processors.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure @@ -13593,6 +13512,7 @@ index 0000000..3dc1ecd + +class Proc Features> +: Processor; ++def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>; +def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>; +def : Proc<"rv710", R600_EG_Itin, []>; +def : Proc<"rv730", R600_EG_Itin, []>; @@ -14008,7 +13928,7 @@ index 0000000..c00c349 +} diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp new file mode 100644 -index 0000000..a479cee +index 0000000..9c38522 --- /dev/null +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -0,0 +1,1195 @@ @@ -15094,7 +15014,7 @@ index 0000000..a479cee + AMDGPUAS::PARAM_I_ADDRESS); + SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), + DAG.getConstant(ParamOffsetBytes, MVT::i32), -+ MachinePointerInfo(new Argument(PtrTy)), ++ 
MachinePointerInfo(UndefValue::get(PtrTy)), + ArgVT, false, false, ArgBytes); + InVals.push_back(Arg); + ParamOffsetBytes += ArgBytes; @@ -16282,10 +16202,10 @@ index 0000000..278fad1 +#endif // R600INSTRINFO_H_ diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td new file mode 100644 -index 0000000..d307ed2 +index 0000000..409da07 --- /dev/null +++ b/lib/Target/R600/R600Instructions.td -@@ -0,0 +1,1917 @@ +@@ -0,0 +1,1976 @@ +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure @@ -16687,7 +16607,7 @@ index 0000000..d307ed2 +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); -+ return (TType >= 6 && TType <= 8) || TType == 13; ++ return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13); + }] +>; + @@ -17779,6 +17699,10 @@ index 0000000..d307ed2 + [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))] +>; + ++def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, ++ [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))] ++>; ++ +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// @@ -17874,6 +17798,7 @@ index 0000000..d307ed2 + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), + "", [], NullALU> { + let FlagOperandIdx = 3; ++ let isTerminator = 1; +} + +let isTerminator = 1, isBranch = 1, isBarrier = 1 in { @@ -18001,6 +17926,60 @@ index 0000000..d307ed2 +// Inst{127-96} = 0; +} + ++def TEX_VTX_TEXBUF: ++ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", ++ [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, ++VTX_WORD1_GPR, VTX_WORD0 { ++ ++let VC_INST = 0; ++let FETCH_TYPE = 2; ++let FETCH_WHOLE_QUAD = 0; ++let SRC_REL = 0; ++let SRC_SEL_X = 0; ++let DST_REL = 0; ++let USE_CONST_FIELDS = 1; ++let NUM_FORMAT_ALL = 0; ++let FORMAT_COMP_ALL = 0; ++let SRF_MODE_ALL = 1; ++let MEGA_FETCH_COUNT = 16; ++let DST_SEL_X = 0; ++let DST_SEL_Y = 1; ++let DST_SEL_Z = 2; ++let DST_SEL_W = 3; ++let DATA_FORMAT = 0; ++ ++let Inst{31-0} = Word0; ++let Inst{63-32} = Word1; ++ ++// LLVM can only encode 64-bit instructions, so these fields are manually ++// encoded in R600CodeEmitter ++// ++// bits<16> OFFSET; ++// bits<2> ENDIAN_SWAP = 0; ++// bits<1> CONST_BUF_NO_STRIDE = 0; ++// bits<1> MEGA_FETCH = 0; ++// bits<1> ALT_CONST = 0; ++// bits<2> BUFFER_INDEX_MODE = 0; ++ ++ ++ ++// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding ++// is done in R600CodeEmitter ++// ++// Inst{79-64} = OFFSET; ++// Inst{81-80} = ENDIAN_SWAP; ++// Inst{82} = CONST_BUF_NO_STRIDE; ++// Inst{83} = MEGA_FETCH; ++// Inst{84} = ALT_CONST; ++// Inst{86-85} = BUFFER_INDEX_MODE; ++// Inst{95-86} = 0; Reserved ++ ++// VTX_WORD3 (Padding) ++// ++// Inst{127-96} = 0; ++} ++ ++ + +//===--------------------------------------------------------------------===// +// Instructions support @@ -18205,10 +18184,10 @@ index 0000000..d307ed2 +} // End isR600toCayman Predicate diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td new file mode 100644 -index 0000000..284d4d8 +index 0000000..6046f0d --- /dev/null +++ b/lib/Target/R600/R600Intrinsics.td -@@ -0,0 +1,32 @@ +@@ -0,0 +1,57 @@ +//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// +// +// The LLVM Compiler 
Infrastructure @@ -18227,6 +18206,8 @@ index 0000000..284d4d8 + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_R600_load_texbuf : ++ Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + @@ -18241,9 +18222,32 @@ index 0000000..284d4d8 + def int_R600_store_dummy : + Intrinsic<[], [llvm_i32_ty], []>; +} ++let TargetPrefix = "r600", isTarget = 1 in { ++ ++class R600ReadPreloadRegisterIntrinsic ++ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, ++ GCCBuiltin; ++ ++multiclass R600ReadPreloadRegisterIntrinsic_xyz { ++ def _x : R600ReadPreloadRegisterIntrinsic; ++ def _y : R600ReadPreloadRegisterIntrinsic; ++ def _z : R600ReadPreloadRegisterIntrinsic; ++} ++ ++defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_global_size">; ++defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_local_size">; ++defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_ngroups">; ++defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_tgid">; ++defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_tidig">; ++} diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp new file mode 100644 -index 0000000..2557e8f +index 0000000..c8c27a8 --- /dev/null +++ b/lib/Target/R600/R600LowerConstCopy.cpp @@ -0,0 +1,222 @@ @@ -18429,7 +18433,7 @@ index 0000000..2557e8f + int ConstMovSel = + TII->getOperandIdx(CstMov->getOpcode(), R600Operands::SRC0_SEL); + unsigned ConstIndex = CstMov->getOperand(ConstMovSel).getImm(); -+ if (canFoldInBundle(CP, ConstIndex)) { ++ if (MI->isInsideBundle() && canFoldInBundle(CP, ConstIndex)) { + TII->setImmOperand(MI, OpTable[SrcOp][1], ConstIndex); + MI->getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST); + } else { @@ -18534,10 +18538,10 @@ index 0000000..41e4894 +#endif //R600MACHINEFUNCTIONINFO_H diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp new file mode 100644 -index 0000000..33e858d +index 0000000..bbd7995 --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.cpp -@@ -0,0 +1,93 @@ +@@ -0,0 +1,99 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure @@ -18589,6 +18593,12 @@ index 0000000..33e858d + Reserved.set(*I); + } + ++ for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(), ++ E = AMDGPU::TRegMemRegClass.end(); ++ I != E; ++I) { ++ Reserved.set(*I); ++ } ++ + const R600InstrInfo *RII = static_cast(&TII); + std::vector IndirectRegs = RII->getIndirectReservedRegs(MF); + for (std::vector::iterator I = IndirectRegs.begin(), @@ -18694,7 +18704,7 @@ index 0000000..c170ccb +#endif // AMDIDSAREGISTERINFO_H_ diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td new file mode 100644 -index 0000000..3812eb7 +index 0000000..a7d847a --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.td @@ -0,0 +1,146 @@ @@ -18781,7 +18791,7 @@ index 0000000..3812eb7 +} // End isAllocatable = 0 + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, -+ (add (sequence "T%u_X", 0, 127))>; ++ (add (sequence "T%u_X", 0, 127), AR_X)>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add 
(sequence "T%u_Y", 0, 127))>; @@ -19382,10 +19392,10 @@ index 0000000..832e44d +} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp new file mode 100644 -index 0000000..202584b +index 0000000..694c045 --- /dev/null +++ b/lib/Target/R600/SIISelLowering.cpp -@@ -0,0 +1,489 @@ +@@ -0,0 +1,399 @@ +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// +// +// The LLVM Compiler Infrastructure @@ -19419,8 +19429,7 @@ index 0000000..202584b + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); -+ addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass); -+ addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass); ++ addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); @@ -19430,8 +19439,6 @@ index 0000000..202584b + + computeRegisterProperties(); + -+ setOperationAction(ISD::AND, MVT::i1, Custom); -+ + setOperationAction(ISD::ADD, MVT::i64, Legal); + setOperationAction(ISD::ADD, MVT::i32, Legal); + @@ -19462,13 +19469,11 @@ index 0000000..202584b + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::BRANCH: return BB; + case AMDGPU::CLAMP_SI: -+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) -+ // VSRC1-2 are unused, but we still need to fill all the -+ // operand slots, so we just reuse the VSRC0 operand -+ .addOperand(MI->getOperand(1)) -+ .addOperand(MI->getOperand(1)) ++ .addImm(0x80) // SRC1 ++ .addImm(0x80) // SRC2 + .addImm(0) // ABS + .addImm(1) // CLAMP + .addImm(0) // OMOD @@ -19477,13 +19482,11 @@ index 0000000..202584b + break; + + case AMDGPU::FABS_SI: -+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) -+ // VSRC1-2 are unused, but we still need to fill all the -+ // operand slots, so we just reuse the VSRC0 operand -+ .addOperand(MI->getOperand(1)) -+ .addOperand(MI->getOperand(1)) ++ .addImm(0x80) // SRC1 ++ .addImm(0x80) // SRC2 + .addImm(1) // ABS + .addImm(0) // CLAMP + .addImm(0) // OMOD @@ -19492,13 +19495,11 @@ index 0000000..202584b + break; + + case AMDGPU::FNEG_SI: -+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) -+ // VSRC1-2 are unused, but we still need to fill all the -+ // operand slots, so we just reuse the VSRC0 operand -+ .addOperand(MI->getOperand(1)) -+ .addOperand(MI->getOperand(1)) ++ .addImm(0x80) // SRC1 ++ .addImm(0x80) // SRC2 + .addImm(0) // ABS + .addImm(0) // CLAMP + .addImm(0) // OMOD @@ -19514,15 +19515,9 @@ index 0000000..202584b + case AMDGPU::SI_INTERP: + LowerSI_INTERP(MI, *BB, I, MRI); + break; -+ case AMDGPU::SI_INTERP_CONST: -+ LowerSI_INTERP_CONST(MI, *BB, I, MRI); -+ break; + case AMDGPU::SI_WQM: + LowerSI_WQM(MI, *BB, I, MRI); + break; -+ case AMDGPU::SI_V_CNDLT: -+ LowerSI_V_CNDLT(MI, *BB, I, MRI); -+ break; + } + return BB; +} @@ -19566,46 +19561,6 @@ index 0000000..202584b + MI->eraseFromParent(); +} + -+void 
SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI, -+ MachineBasicBlock &BB, MachineBasicBlock::iterator I, -+ MachineRegisterInfo &MRI) const { -+ MachineOperand dst = MI->getOperand(0); -+ MachineOperand attr_chan = MI->getOperand(1); -+ MachineOperand attr = MI->getOperand(2); -+ MachineOperand params = MI->getOperand(3); -+ unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) -+ .addOperand(params); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32)) -+ .addOperand(dst) -+ .addOperand(attr_chan) -+ .addOperand(attr) -+ .addReg(M0); -+ -+ MI->eraseFromParent(); -+} -+ -+void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, -+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { -+ unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), -+ TII->get(AMDGPU::V_CMP_GT_F32_e32), -+ VCC) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addOperand(MI->getOperand(1)); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32)) -+ .addOperand(MI->getOperand(0)) -+ .addOperand(MI->getOperand(3)) -+ .addOperand(MI->getOperand(2)) -+ .addReg(VCC); -+ -+ MI->eraseFromParent(); -+} -+ +EVT SITargetLowering::getSetCCResultType(EVT VT) const { + return MVT::i1; +} @@ -19620,7 +19575,6 @@ index 0000000..202584b + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); -+ case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND); + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast(Op.getOperand(0))->getZExtValue(); @@ -19637,30 +19591,6 @@ index 0000000..202584b + return SDValue(); +} + -+/// \brief The function is for lowering i1 operations on the -+/// VCC register. -+/// -+/// In the VALU context, VCC is a one bit register, but in the -+/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only -+/// the SALU can perform operations on the VCC register, we need to promote -+/// the operand types from i1 to i64 in order for tablegen to be able to match -+/// this operation to the correct SALU instruction. We do this promotion by -+/// wrapping the operands in a CopyToReg node. 
-+/// -+SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op, -+ SelectionDAG &DAG, -+ unsigned VCCNode) const { -+ DebugLoc DL = Op.getDebugLoc(); -+ -+ SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64, -+ DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, -+ Op.getOperand(0)), -+ DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, -+ Op.getOperand(1))); -+ -+ return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode); -+} -+ +/// \brief Helper function for LowerBRCOND +static SDNode *findUser(SDValue Value, unsigned Opcode) { + @@ -19865,22 +19795,12 @@ index 0000000..202584b + } + return SDValue(); +} -+ -+#define NODE_NAME_CASE(node) case SIISD::node: return #node; -+ -+const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const { -+ switch (Opcode) { -+ default: return AMDGPUTargetLowering::getTargetNodeName(Opcode); -+ NODE_NAME_CASE(VCC_AND) -+ NODE_NAME_CASE(VCC_BITCAST) -+ } -+} diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h new file mode 100644 -index 0000000..8528c24 +index 0000000..5d048f8 --- /dev/null +++ b/lib/Target/R600/SIISelLowering.h -@@ -0,0 +1,55 @@ +@@ -0,0 +1,48 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure @@ -19910,15 +19830,9 @@ index 0000000..8528c24 + MachineBasicBlock::iterator I, unsigned Opocde) const; + void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; -+ void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB, -+ MachineBasicBlock::iterator I, MachineRegisterInfo &MRI) const; + void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; -+ void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, -+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; + -+ SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG, -+ unsigned VCCNode) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -19930,7 +19844,6 @@ index 0000000..8528c24 + virtual EVT getSetCCResultType(EVT VT) const; + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; -+ virtual const char* getTargetNodeName(unsigned Opcode) const; +}; + +} // End namespace llvm @@ -20297,10 +20210,10 @@ index 0000000..24fc929 +} diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td new file mode 100644 -index 0000000..aea3b5a +index 0000000..40e37aa --- /dev/null +++ b/lib/Target/R600/SIInstrFormats.td -@@ -0,0 +1,146 @@ +@@ -0,0 +1,188 @@ +//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===// +// +// The LLVM Compiler Infrastructure @@ -20324,40 +20237,23 @@ index 0000000..aea3b5a +// +//===----------------------------------------------------------------------===// + -+class VOP3b_2IN op, string opName, RegisterClass dstClass, -+ RegisterClass src0Class, RegisterClass src1Class, -+ list pattern> -+ : VOP3b ; -+ -+ -+class VOP3_1_32 op, string opName, list pattern> -+ : VOP3b_2IN ; -+ +class VOP3_32 op, string opName, list pattern> -+ : VOP3 ; ++ : VOP3 ; + +class VOP3_64 op, string opName, list pattern> -+ : VOP3 ; -+ ++ : VOP3 ; + +class SOP1_32 op, string opName, list pattern> -+ : SOP1 ; ++ : SOP1 ; + +class SOP1_64 op, string opName, 
list pattern> -+ : SOP1 ; ++ : SOP1 ; + +class SOP2_32 op, string opName, list pattern> -+ : SOP2 ; ++ : SOP2 ; + +class SOP2_64 op, string opName, list pattern> -+ : SOP2 ; -+ -+class SOP2_VCC op, string opName, list pattern> -+ : SOP2 ; ++ : SOP2 ; + +class VOP1_Helper op, RegisterClass vrc, RegisterClass arc, + string opName, list pattern> : @@ -20366,7 +20262,7 @@ index 0000000..aea3b5a + >; + +multiclass VOP1_32 op, string opName, list pattern> { -+ def _e32: VOP1_Helper ; ++ def _e32: VOP1_Helper ; + def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; @@ -20374,7 +20270,7 @@ index 0000000..aea3b5a + +multiclass VOP1_64 op, string opName, list pattern> { + -+ def _e32 : VOP1_Helper ; ++ def _e32 : VOP1_Helper ; + + def _e64 : VOP3_64 < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -20390,7 +20286,7 @@ index 0000000..aea3b5a + +multiclass VOP2_32 op, string opName, list pattern> { + -+ def _e32 : VOP2_Helper ; ++ def _e32 : VOP2_Helper ; + + def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] @@ -20398,7 +20294,7 @@ index 0000000..aea3b5a +} + +multiclass VOP2_64 op, string opName, list pattern> { -+ def _e32: VOP2_Helper ; ++ def _e32: VOP2_Helper ; + + def _e64 : VOP3_64 < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -20412,47 +20308,106 @@ index 0000000..aea3b5a +class SOPK_64 op, string opName, list pattern> + : SOPK ; + -+class VOPC_Helper op, RegisterClass vrc, RegisterClass arc, -+ string opName, list pattern> : -+ VOPC < -+ op, (ins arc:$src0, vrc:$src1), opName, pattern -+ >; ++multiclass VOPC_Helper op, RegisterClass vrc, RegisterClass arc, ++ string opName, list pattern> { + -+multiclass VOPC_32 op, string opName, list pattern> { -+ -+ def _e32 : VOPC_Helper < -+ {op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, -+ VReg_32, AllReg_32, opName, pattern -+ >; -+ -+ def _e64 : VOP3_1_32 < -+ op, -+ opName, pattern -+ >; -+} -+ -+multiclass VOPC_64 op, string opName, list pattern> { -+ -+ def _e32 : VOPC_Helper ; -+ -+ def _e64 : VOP3_64 < ++ def _e32 : VOPC ; ++ def _e64 : VOP3 < + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, -+ opName, [] -+ >; ++ (outs SReg_64:$dst), ++ (ins arc:$src0, vrc:$src1, ++ InstFlag:$abs, InstFlag:$clamp, ++ InstFlag:$omod, InstFlag:$neg), ++ opName, pattern ++ > { ++ let SRC2 = 0x80; ++ } +} + ++multiclass VOPC_32 op, string opName, list pattern> ++ : VOPC_Helper ; ++ ++multiclass VOPC_64 op, string opName, list pattern> ++ : VOPC_Helper ; ++ +class SOPC_32 op, string opName, list pattern> -+ : SOPC ; ++ : SOPC ; + +class SOPC_64 op, string opName, list pattern> -+ : SOPC ; ++ : SOPC ; ++ ++class MIMG_Load_Helper op, string asm> : MIMG < ++ op, ++ (outs VReg_128:$vdata), ++ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, ++ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, ++ GPR4Align:$srsrc, GPR4Align:$ssamp), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBUF < ++ op, ++ (outs), ++ (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, ++ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, ++ GPR4Align:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), ++ asm, ++ []> { ++ let mayStore = 1; ++ let mayLoad = 0; ++} ++ ++class MUBUF_Load_Helper op, string asm, RegisterClass regClass> : MUBUF < ++ op, ++ (outs regClass:$dst), ++ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, 
i1imm:$glc, i1imm:$addr64, ++ i1imm:$lds, VReg_32:$vaddr, GPR4Align:$srsrc, i1imm:$slc, ++ i1imm:$tfe, SSrc_32:$soffset), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < ++ op, ++ (outs regClass:$dst), ++ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, ++ i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align:$srsrc, ++ i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++multiclass SMRD_Helper op, string asm, RegisterClass dstClass> { ++ def _IMM : SMRD < ++ op, 1, ++ (outs dstClass:$dst), ++ (ins GPR2Align:$sbase, i32imm:$offset), ++ asm, ++ [] ++ >; ++ ++ def _SGPR : SMRD < ++ op, 0, ++ (outs dstClass:$dst), ++ (ins GPR2Align:$sbase, SReg_32:$soff), ++ asm, ++ [] ++ >; ++} + diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp new file mode 100644 -index 0000000..011ec50 +index 0000000..1c4b3cf --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.cpp -@@ -0,0 +1,137 @@ +@@ -0,0 +1,143 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -20497,7 +20452,15 @@ index 0000000..011ec50 + // never be necessary. + assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); + -+ if (AMDGPU::SReg_64RegClass.contains(DestReg)) { ++ if (AMDGPU::VReg_64RegClass.contains(DestReg)) { ++ assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || ++ AMDGPU::SReg_64RegClass.contains(SrcReg)); ++ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub0)) ++ .addReg(RI.getSubReg(SrcReg, AMDGPU::sub0), getKillRegState(KillSrc)) ++ .addReg(DestReg, RegState::Define | RegState::Implicit); ++ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub1)) ++ .addReg(RI.getSubReg(SrcReg, AMDGPU::sub1), getKillRegState(KillSrc)); ++ } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); @@ -20516,9 +20479,10 @@ index 0000000..011ec50 + +MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const { -+ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_IMM_I32), DebugLoc()); -+ MachineInstrBuilder(MI).addReg(DstReg, RegState::Define); -+ MachineInstrBuilder(MI).addImm(Imm); ++ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc()); ++ MachineInstrBuilder MIB(MI); ++ MIB.addReg(DstReg, RegState::Define); ++ MIB.addImm(Imm); + + return MI; + @@ -20531,9 +20495,6 @@ index 0000000..011ec50 + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: -+ case AMDGPU::V_MOV_IMM_F32: -+ case AMDGPU::V_MOV_IMM_I32: -+ case AMDGPU::S_MOV_IMM_I32: + return true; + } +} @@ -20592,10 +20553,10 @@ index 0000000..011ec50 +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h new file mode 100644 -index 0000000..e4de4b8 +index 0000000..a65f7b6 --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.h -@@ -0,0 +1,90 @@ +@@ -0,0 +1,84 @@ +//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -20633,12 +20594,6 @@ index 0000000..e4de4b8 + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + -+ /// \returns the encoding type of this instruction. 
-+ unsigned getEncodingType(const MachineInstr &MI) const; -+ -+ /// \returns the size of this instructions encoding in number of bytes. -+ unsigned getEncodingBytes(const MachineInstr &MI) const; -+ + virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const; + @@ -20679,19 +20634,19 @@ index 0000000..e4de4b8 +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding -+ VM_CNT = 1 << 4, -+ EXP_CNT = 1 << 5, -+ LGKM_CNT = 1 << 6 ++ VM_CNT = 1 << 0, ++ EXP_CNT = 1 << 1, ++ LGKM_CNT = 1 << 2 + }; +} + +#endif //SIINSTRINFO_H diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td new file mode 100644 -index 0000000..9d9f5f6 +index 0000000..8c4e5af --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.td -@@ -0,0 +1,591 @@ +@@ -0,0 +1,465 @@ +//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure @@ -20702,60 +20657,66 @@ index 0000000..9d9f5f6 +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// -+// SI DAG Profiles -+//===----------------------------------------------------------------------===// -+def SDTVCCBinaryOp : SDTypeProfile<1, 2, [ -+ SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2> -+]>; -+ -+//===----------------------------------------------------------------------===// +// SI DAG Nodes +//===----------------------------------------------------------------------===// + -+// and operation on 64-bit wide vcc -+def SIsreg1_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, -+ [SDNPCommutative, SDNPAssociative] ++// SMRD takes a 64bit memory address and can only add an 32bit offset ++def SIadd64bit32bit : SDNode<"ISD::ADD", ++ SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]> +>; + -+// Special bitcast node for sharing VCC register between VALU and SALU -+def SIsreg1_bitcast : SDNode<"SIISD::VCC_BITCAST", -+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> ++// Transformation function, extract the lower 32bit of a 64bit immediate ++def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32); ++}]>; ++ ++// Transformation function, extract the upper 32bit of a 64bit immediate ++def HI32 : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 32, MVT::i32); ++}]>; ++ ++def IMM8bitDWORD : ImmLeaf < ++ i32, [{ ++ return (Imm & ~0x3FC) == 0; ++ }], SDNodeXFormgetTargetConstant( ++ N->getZExtValue() >> 2, MVT::i32); ++ }]> +>; + -+// and operation on 64-bit wide vcc -+def SIvcc_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, -+ [SDNPCommutative, SDNPAssociative] ++def IMM12bit : ImmLeaf < ++ i16, ++ [{return isUInt<12>(Imm);}] +>; + -+// Special bitcast node for sharing VCC register between VALU and SALU -+def SIvcc_bitcast : SDNode<"SIISD::VCC_BITCAST", -+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> -+>; ++class InlineImm : ImmLeaf ; + +class InstSI pattern> : + AMDGPUInst { + -+ field bits<4> EncodingType = 0; + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; + -+ let TSFlags{3-0} = EncodingType; -+ let TSFlags{4} = VM_CNT; -+ let TSFlags{5} = EXP_CNT; -+ let TSFlags{6} = LGKM_CNT; ++ let TSFlags{0} = VM_CNT; ++ let TSFlags{1} = EXP_CNT; ++ let TSFlags{2} = LGKM_CNT; +} + +class Enc32 pattern> : + InstSI { + + field bits<32> Inst; ++ let Size = 4; +} + +class Enc64 pattern> : + InstSI { + + field bits<64> Inst; ++ let Size = 8; +} + +class 
SIOperand : Operand { @@ -20763,49 +20724,16 @@ index 0000000..9d9f5f6 + let MIOperandInfo = opInfo; +} + -+def IMM16bit : ImmLeaf < -+ i16, -+ [{return isInt<16>(Imm);}] -+>; -+ -+def IMM8bit : ImmLeaf < -+ i32, -+ [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}] -+>; -+ -+def IMM12bit : ImmLeaf < -+ i16, -+ [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}] -+>; -+ -+def IMM32bitIn64bit : ImmLeaf < -+ i64, -+ [{return isInt<32>(Imm);}] -+>; -+ +class GPR4Align : Operand { + let EncoderMethod = "GPR4AlignEncode"; + let MIOperandInfo = (ops rc:$reg); +} + -+class GPR2Align : Operand { ++class GPR2Align : Operand { + let EncoderMethod = "GPR2AlignEncode"; + let MIOperandInfo = (ops rc:$reg); +} + -+def SMRDmemrr : Operand { -+ let MIOperandInfo = (ops SReg_64, SReg_32); -+ let EncoderMethod = "GPR2AlignEncode"; -+} -+ -+def SMRDmemri : Operand { -+ let MIOperandInfo = (ops SReg_64, i32imm); -+ let EncoderMethod = "SMRDmemriEncode"; -+} -+ -+def ADDR_Reg : ComplexPattern; -+def ADDR_Offset8 : ComplexPattern; -+ +let Uses = [EXEC] in { + +def EXP : Enc64< @@ -20835,7 +20763,6 @@ index 0000000..9d9f5f6 + let Inst{47-40} = VSRC1; + let Inst{55-48} = VSRC2; + let Inst{63-56} = VSRC3; -+ let EncodingType = 0; //SIInstrEncodingType::EXP + + let EXP_CNT = 1; +} @@ -20870,7 +20797,6 @@ index 0000000..9d9f5f6 + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{57-53} = SSAMP; -+ let EncodingType = 2; //SIInstrEncodingType::MIMG + + let VM_CNT = 1; + let EXP_CNT = 1; @@ -20908,7 +20834,6 @@ index 0000000..9d9f5f6 + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; -+ let EncodingType = 3; //SIInstrEncodingType::MTBUF + + let VM_CNT = 1; + let EXP_CNT = 1; @@ -20946,7 +20871,6 @@ index 0000000..9d9f5f6 + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; -+ let EncodingType = 4; //SIInstrEncodingType::MUBUF + + let VM_CNT = 1; + let EXP_CNT = 1; @@ -20956,22 +20880,19 @@ index 0000000..9d9f5f6 + +} // End Uses = [EXEC] + -+class SMRD op, dag outs, dag ins, string asm, list pattern> : -+ Enc32 { ++class SMRD op, bits<1> imm, dag outs, dag ins, string asm, ++ list pattern> : Enc32 { + + bits<7> SDST; -+ bits<15> PTR; -+ bits<8> OFFSET = PTR{7-0}; -+ bits<1> IMM = PTR{8}; -+ bits<6> SBASE = PTR{14-9}; ++ bits<6> SBASE; ++ bits<8> OFFSET; + + let Inst{7-0} = OFFSET; -+ let Inst{8} = IMM; ++ let Inst{8} = imm; + let Inst{14-9} = SBASE; + let Inst{21-15} = SDST; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding -+ let EncodingType = 5; //SIInstrEncodingType::SMRD + + let LGKM_CNT = 1; +} @@ -20986,7 +20907,6 @@ index 0000000..9d9f5f6 + let Inst{15-8} = op; + let Inst{22-16} = SDST; + let Inst{31-23} = 0x17d; //encoding; -+ let EncodingType = 6; //SIInstrEncodingType::SOP1 + + let mayLoad = 0; + let mayStore = 0; @@ -21005,7 +20925,6 @@ index 0000000..9d9f5f6 + let Inst{22-16} = SDST; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding -+ let EncodingType = 7; // SIInstrEncodingType::SOP2 + + let mayLoad = 0; + let mayStore = 0; @@ -21022,7 +20941,6 @@ index 0000000..9d9f5f6 + let Inst{15-8} = SSRC1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; -+ let EncodingType = 8; // SIInstrEncodingType::SOPC + + let DisableEncoding = "$dst"; + let mayLoad = 0; @@ -21040,7 +20958,6 @@ index 0000000..9d9f5f6 + let Inst{22-16} = SDST; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding -+ let EncodingType = 9; // SIInstrEncodingType::SOPK + + let mayLoad = 0; + let mayStore = 0; @@ -21058,7 +20975,6 @@ index 0000000..9d9f5f6 + 
let Inst{15-0} = SIMM16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding -+ let EncodingType = 10; // SIInstrEncodingType::SOPP + + let mayLoad = 0; + let mayStore = 0; @@ -21081,7 +20997,6 @@ index 0000000..9d9f5f6 + let Inst{17-16} = op; + let Inst{25-18} = VDST; + let Inst{31-26} = 0x32; // encoding -+ let EncodingType = 11; // SIInstrEncodingType::VINTRP + + let neverHasSideEffects = 1; + let mayLoad = 1; @@ -21099,9 +21014,6 @@ index 0000000..9d9f5f6 + let Inst{24-17} = VDST; + let Inst{31-25} = 0x3f; //encoding + -+ let EncodingType = 12; // SIInstrEncodingType::VOP1 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21120,9 +21032,6 @@ index 0000000..9d9f5f6 + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding + -+ let EncodingType = 13; // SIInstrEncodingType::VOP2 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21151,9 +21060,6 @@ index 0000000..9d9f5f6 + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + -+ let EncodingType = 14; // SIInstrEncodingType::VOP3 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21180,9 +21086,6 @@ index 0000000..9d9f5f6 + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + -+ let EncodingType = 14; // SIInstrEncodingType::VOP3 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21199,8 +21102,6 @@ index 0000000..9d9f5f6 + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; + -+ let EncodingType = 15; //SIInstrEncodingType::VOPC -+ let PostEncoderMethod = "VOPPostEncode"; + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; @@ -21209,86 +21110,14 @@ index 0000000..9d9f5f6 + +} // End Uses = [EXEC] + -+class MIMG_Load_Helper op, string asm> : MIMG < -+ op, -+ (outs VReg_128:$vdata), -+ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, -+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, -+ GPR4Align:$srsrc, GPR4Align:$ssamp), -+ asm, -+ []> { -+ let mayLoad = 1; -+ let mayStore = 0; -+} -+ -+class MUBUF_Load_Helper op, string asm, RegisterClass regClass> : MUBUF < -+ op, -+ (outs regClass:$dst), -+ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, -+ i1imm:$lds, VReg_32:$vaddr, GPR4Align:$srsrc, i1imm:$slc, -+ i1imm:$tfe, SReg_32:$soffset), -+ asm, -+ []> { -+ let mayLoad = 1; -+ let mayStore = 0; -+} -+ -+class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < -+ op, -+ (outs regClass:$dst), -+ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, -+ i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align:$srsrc, -+ i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), -+ asm, -+ []> { -+ let mayLoad = 1; -+ let mayStore = 0; -+} -+ -+class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBUF < -+ op, -+ (outs), -+ (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, -+ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, -+ GPR4Align:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), -+ asm, -+ []> { -+ let mayStore = 1; -+ let mayLoad = 0; -+} -+ -+multiclass SMRD_Helper op, string asm, RegisterClass dstClass, -+ ValueType vt> { -+ def _IMM : SMRD < -+ op, -+ (outs dstClass:$dst), -+ (ins SMRDmemri:$src0), -+ asm, -+ [(set (vt dstClass:$dst), (constant_load ADDR_Offset8:$src0))] -+ >; -+ -+ def _SGPR : SMRD < -+ op, -+ (outs dstClass:$dst), -+ 
(ins SMRDmemrr:$src0), -+ asm, -+ [(set (vt dstClass:$dst), (constant_load ADDR_Reg:$src0))] -+ >; -+} -+ -+multiclass SMRD_32 op, string asm, RegisterClass dstClass> { -+ defm _F32 : SMRD_Helper ; -+ defm _I32 : SMRD_Helper ; -+} -+ +include "SIInstrFormats.td" +include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td new file mode 100644 -index 0000000..5d15761 +index 0000000..3a9822a --- /dev/null +++ b/lib/Target/R600/SIInstructions.td -@@ -0,0 +1,1423 @@ +@@ -0,0 +1,1462 @@ +//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -21302,6 +21131,17 @@ index 0000000..5d15761 +// that are not yet supported remain commented out. +//===----------------------------------------------------------------------===// + ++class InterpSlots { ++int P0 = 2; ++int P10 = 0; ++int P20 = 1; ++} ++def INTERP : InterpSlots; ++ ++def InterpSlot : Operand { ++ let PrintMethod = "printInterpSlot"; ++} ++ +def isSI : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">; + @@ -21410,33 +21250,33 @@ index 0000000..5d15761 +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>; +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), -+ (V_CMP_LT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_LT)), ++ (V_CMP_LT_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), -+ (V_CMP_EQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_EQ)), ++ (V_CMP_EQ_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), -+ (V_CMP_LE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_LE)), ++ (V_CMP_LE_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), -+ (V_CMP_GT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_GT)), ++ (V_CMP_GT_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), -+ (V_CMP_LG_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_NE)), ++ (V_CMP_LG_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), -+ (V_CMP_GE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_GE)), ++ (V_CMP_GE_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>; +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>; @@ -21446,8 +21286,8 @@ index 0000000..5d15761 +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>; +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), -+ (V_CMP_NEQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), 
VReg_32:$src1, COND_NE)), ++ (V_CMP_NEQ_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>; +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>; @@ -21580,33 +21420,33 @@ index 0000000..5d15761 +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>; +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), -+ (V_CMP_LT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_LT)), ++ (V_CMP_LT_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), -+ (V_CMP_EQ_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_EQ)), ++ (V_CMP_EQ_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), -+ (V_CMP_LE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_LE)), ++ (V_CMP_LE_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), -+ (V_CMP_GT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_GT)), ++ (V_CMP_GT_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), -+ (V_CMP_NE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_NE)), ++ (V_CMP_NE_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), -+ (V_CMP_GE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_GE)), ++ (V_CMP_GE_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>; + @@ -21752,11 +21592,13 @@ index 0000000..5d15761 +//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>; +//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>; + -+defm S_LOAD_DWORD : SMRD_32 <0x00000000, "S_LOAD_DWORD", SReg_32>; ++let mayLoad = 1 in { ++ ++defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>; + +//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; -+defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128, v4i32>; -+defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32>; ++defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>; ++defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>; +//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; +//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; +//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; @@ -21764,6 +21606,8 @@ index 0000000..5d15761 +//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>; +//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; + ++} // mayLoad = 1 ++ +//def S_MEMTIME : SMRD_ <0x0000001e, 
"S_MEMTIME", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; +//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; @@ -21870,12 +21714,12 @@ index 0000000..5d15761 +//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; +//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; +defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", -+ [(set VReg_32:$dst, (sint_to_fp AllReg_32:$src0))] ++ [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))] +>; +//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; +//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; +defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", -+ [(set (i32 VReg_32:$dst), (fp_to_sint AllReg_32:$src0))] ++ [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))] +>; +defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; +////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; @@ -21892,31 +21736,35 @@ index 0000000..5d15761 +//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; +//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", -+ [(set VReg_32:$dst, (AMDGPUfract AllReg_32:$src0))] ++ [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))] +>; +defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; -+defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>; ++defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", ++ [(set VReg_32:$dst, (fceil VSrc_32:$src0))] ++>; +defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", -+ [(set VReg_32:$dst, (frint AllReg_32:$src0))] ++ [(set VReg_32:$dst, (frint VSrc_32:$src0))] +>; +defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", -+ [(set VReg_32:$dst, (ffloor AllReg_32:$src0))] ++ [(set VReg_32:$dst, (ffloor VSrc_32:$src0))] +>; +defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", -+ [(set VReg_32:$dst, (fexp2 AllReg_32:$src0))] ++ [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))] +>; +defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; -+defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>; ++defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", ++ [(set VReg_32:$dst, (flog2 VSrc_32:$src0))] ++>; +defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; +defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; +defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", -+ [(set VReg_32:$dst, (fdiv FP_ONE, AllReg_32:$src0))] ++ [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))] +>; +defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; +defm V_RSQ_LEGACY_F32 : VOP1_32 < + 0x0000002d, "V_RSQ_LEGACY_F32", -+ [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))] ++ [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))] +>; +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; +defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; @@ -21966,10 +21814,9 @@ index 0000000..5d15761 +def V_INTERP_MOV_F32 : VINTRP < + 0x00000002, + (outs VReg_32:$dst), -+ (ins i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), -+ "V_INTERP_MOV_F32", ++ (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), ++ "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr", + []> { -+ let VSRC = 0; + let DisableEncoding = "$m0"; +} + @@ -22049,22 +21896,22 @@ index 0000000..5d15761 +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; + +def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), -+ (ins AllReg_32:$src0, 
VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32", ++ (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32", + [] +>{ + let DisableEncoding = "$vcc"; +} + +def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), -+ (ins VReg_32:$src0, VReg_32:$src1, SReg_1:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), ++ (ins VReg_32:$src0, VReg_32:$src1, SReg_64:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), + "V_CNDMASK_B32_e64", -+ [(set (i32 VReg_32:$dst), (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0))] ++ [(set (i32 VReg_32:$dst), (select (i1 SReg_64:$src2), VReg_32:$src1, VReg_32:$src0))] +>; + +//f32 pattern for V_CNDMASK_B32_e64 +def : Pat < -+ (f32 (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0)), -+ (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_1:$src2) ++ (f32 (select (i1 SReg_64:$src2), VReg_32:$src1, VReg_32:$src0)), ++ (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_64:$src2) +>; + +defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; @@ -22072,35 +21919,35 @@ index 0000000..5d15761 + +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>; +def : Pat < -+ (f32 (fadd AllReg_32:$src0, VReg_32:$src1)), -+ (V_ADD_F32_e32 AllReg_32:$src0, VReg_32:$src1) ++ (f32 (fadd VSrc_32:$src0, VReg_32:$src1)), ++ (V_ADD_F32_e32 VSrc_32:$src0, VReg_32:$src1) +>; + +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>; +def : Pat < -+ (f32 (fsub AllReg_32:$src0, VReg_32:$src1)), -+ (V_SUB_F32_e32 AllReg_32:$src0, VReg_32:$src1) ++ (f32 (fsub VSrc_32:$src0, VReg_32:$src1)), ++ (V_SUB_F32_e32 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; +defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; +defm V_MUL_LEGACY_F32 : VOP2_32 < + 0x00000007, "V_MUL_LEGACY_F32", -+ [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", -+ [(set VReg_32:$dst, (fmul AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))] +>; +//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>; +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; +//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>; +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; +defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", -+ [(set VReg_32:$dst, (AMDGPUfmin AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", -+ [(set VReg_32:$dst, (AMDGPUfmax AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; +defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; @@ -22115,13 +21962,13 @@ index 0000000..5d15761 +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>; +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", -+ [(set VReg_32:$dst, (and AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", -+ [(set VReg_32:$dst, (or AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, 
"V_XOR_B32", -+ [(set VReg_32:$dst, (xor AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>; +defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; @@ -22132,10 +21979,10 @@ index 0000000..5d15761 +//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; +let Defs = [VCC] in { // Carry-out goes to VCC +defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32", -+ [(set VReg_32:$dst, (add (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] ++ [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] +>; +defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32", -+ [(set VReg_32:$dst, (sub (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] ++ [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] +>; +} // End Defs = [VCC] +defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>; @@ -22147,7 +21994,7 @@ index 0000000..5d15761 +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", -+ [(set VReg_32:$dst, (int_SI_packf16 AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))] +>; +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; @@ -22217,6 +22064,10 @@ index 0000000..5d15761 +def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; +def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; ++def : Pat < ++ (mul VSrc_32:$src0, VReg_32:$src1), ++ (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (IMPLICIT_DEF), 0, 0, 0, 0) ++>; +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; @@ -22254,13 +22105,20 @@ index 0000000..5d15761 +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; + +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", -+ [(set SReg_64:$dst, (and SReg_64:$src0, SReg_64:$src1))] ++ [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))] +>; -+def S_AND_VCC : SOP2_VCC <0x0000000f, "S_AND_B64", -+ [(set SReg_1:$vcc, (SIvcc_and SReg_64:$src0, SReg_64:$src1))] ++ ++def : Pat < ++ (i1 (and SSrc_64:$src0, SSrc_64:$src1)), ++ (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1) +>; ++ +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; ++def : Pat < ++ (i1 (or SSrc_64:$src0, SSrc_64:$src1)), ++ (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1) ++>; +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; @@ -22289,48 +22147,6 @@ index 0000000..5d15761 +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; + -+class V_MOV_IMM : InstSI < -+ (outs VReg_32:$dst), -+ (ins immType:$src0), -+ "V_MOV_IMM", -+ [(set VReg_32:$dst, (type immNode:$src0))] -+>; -+ -+let isCodeGenOnly = 1, isPseudo = 1 in { -+ -+def V_MOV_IMM_I32 : V_MOV_IMM; -+def V_MOV_IMM_F32 : V_MOV_IMM; -+ -+def S_MOV_IMM_I32 : InstSI < -+ (outs SReg_32:$dst), -+ (ins i32imm:$src0), -+ "S_MOV_IMM_I32", -+ [(set SReg_32:$dst, (imm:$src0))] -+>; 
-+ -+// i64 immediates aren't really supported in hardware, but LLVM will use the i64 -+// type for indices on load and store instructions. The pattern for -+// S_MOV_IMM_I64 will only match i64 immediates that can fit into 32-bits, -+// which the hardware can handle. -+def S_MOV_IMM_I64 : InstSI < -+ (outs SReg_64:$dst), -+ (ins i64imm:$src0), -+ "S_MOV_IMM_I64 $dst, $src0", -+ [(set SReg_64:$dst, (IMM32bitIn64bit:$src0))] -+>; -+ -+} // End isCodeGenOnly, isPseudo = 1 -+ -+class SI_LOAD_LITERAL : -+ Enc32 <(outs), (ins ImmType:$imm), "LOAD_LITERAL $imm", []> { -+ -+ bits<32> imm; -+ let Inst{31-0} = imm; -+} -+ -+def SI_LOAD_LITERAL_I32 : SI_LOAD_LITERAL; -+def SI_LOAD_LITERAL_F32 : SI_LOAD_LITERAL; -+ +let isCodeGenOnly = 1, isPseudo = 1 in { + +def SET_M0 : InstSI < @@ -22349,13 +22165,6 @@ index 0000000..5d15761 + +let usesCustomInserter = 1 in { + -+def SI_V_CNDLT : InstSI < -+ (outs VReg_32:$dst), -+ (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), -+ "SI_V_CNDLT $dst, $src0, $src1, $src2", -+ [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))] -+>; -+ +def SI_INTERP : InstSI < + (outs VReg_32:$dst), + (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), @@ -22363,14 +22172,6 @@ index 0000000..5d15761 + [] +>; + -+def SI_INTERP_CONST : InstSI < -+ (outs VReg_32:$dst), -+ (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), -+ "SI_INTERP_CONST $dst, $attr_chan, $attr, $params", -+ [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan, -+ imm:$attr, SReg_32:$params))] -+>; -+ +def SI_WQM : InstSI < + (outs), + (ins), @@ -22390,9 +22191,9 @@ index 0000000..5d15761 + +def SI_IF : InstSI < + (outs SReg_64:$dst), -+ (ins SReg_1:$vcc, brtarget:$target), ++ (ins SReg_64:$vcc, brtarget:$target), + "SI_IF", -+ [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))] ++ [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < @@ -22422,9 +22223,9 @@ index 0000000..5d15761 + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), -+ (ins SReg_1:$vcc, SReg_64:$src), ++ (ins SReg_64:$vcc, SReg_64:$src), + "SI_IF_BREAK", -+ [(set SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))] ++ [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < @@ -22453,9 +22254,14 @@ index 0000000..5d15761 + +} // end IsCodeGenOnly, isPseudo + ++def : Pat< ++ (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), ++ (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0)) ++>; ++ +def : Pat < + (int_AMDGPU_kilp), -+ (SI_KILL (V_MOV_IMM_I32 0xbf800000)) ++ (SI_KILL (V_MOV_B32_e32 0xbf800000)) +>; + +/* int_SI_vs_load_input */ @@ -22464,7 +22270,7 @@ index 0000000..5d15761 + VReg_32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0, + VReg_32:$buf_idx_vgpr, SReg_128:$tlst, -+ 0, 0, (i32 SREG_LIT_0)) ++ 0, 0, 0) +>; + +/* int_SI_export */ @@ -22581,24 +22387,46 @@ index 0000000..5d15761 +def : BitConvert ; +def : BitConvert ; + ++/********** ================== **********/ ++/********** Immediate Patterns **********/ ++/********** ================== **********/ ++ +def : Pat < -+ (i64 (SIsreg1_bitcast SReg_1:$vcc)), -+ (S_MOV_B64 (COPY_TO_REGCLASS SReg_1:$vcc, SReg_64)) ++ (i1 imm:$imm), ++ (S_MOV_B64 imm:$imm) +>; + +def : Pat < -+ (i1 (SIsreg1_bitcast SReg_64:$vcc)), -+ (COPY_TO_REGCLASS SReg_64:$vcc, SReg_1) ++ (i32 imm:$imm), ++ (V_MOV_B32_e32 imm:$imm) +>; + +def : Pat < -+ (i64 (SIvcc_bitcast 
VCCReg:$vcc)), -+ (S_MOV_B64 (COPY_TO_REGCLASS VCCReg:$vcc, SReg_64)) ++ (f32 fpimm:$imm), ++ (V_MOV_B32_e32 fpimm:$imm) +>; + +def : Pat < -+ (i1 (SIvcc_bitcast SReg_64:$vcc)), -+ (COPY_TO_REGCLASS SReg_64:$vcc, VCCReg) ++ (i32 imm:$imm), ++ (S_MOV_B32 imm:$imm) ++>; ++ ++def : Pat < ++ (f32 fpimm:$imm), ++ (S_MOV_B32 fpimm:$imm) ++>; ++ ++def : Pat < ++ (i64 InlineImm:$imm), ++ (S_MOV_B64 InlineImm:$imm) ++>; ++ ++// i64 immediates aren't supported in hardware, split it into two 32bit values ++def : Pat < ++ (i64 imm:$imm), ++ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), ++ (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0), ++ (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1) +>; + +/********** ===================== **********/ @@ -22606,6 +22434,12 @@ index 0000000..5d15761 +/********** ===================== **********/ + +def : Pat < ++ (int_SI_fs_interp_constant imm:$attr_chan, imm:$attr, SReg_32:$params), ++ (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, ++ (S_MOV_B32 SReg_32:$params)) ++>; ++ ++def : Pat < + (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params), + (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan, + imm:$attr, SReg_32:$params) @@ -22663,23 +22497,23 @@ index 0000000..5d15761 +def : POW_Common ; + +def : Pat < -+ (int_AMDGPU_div AllReg_32:$src0, AllReg_32:$src1), -+ (V_MUL_LEGACY_F32_e32 AllReg_32:$src0, (V_RCP_LEGACY_F32_e32 AllReg_32:$src1)) ++ (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1), ++ (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1)) +>; + +def : Pat< -+ (fdiv AllReg_32:$src0, AllReg_32:$src1), -+ (V_MUL_F32_e32 AllReg_32:$src0, (V_RCP_F32_e32 AllReg_32:$src1)) ++ (fdiv VSrc_32:$src0, VSrc_32:$src1), ++ (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1)) +>; + +def : Pat < -+ (fcos AllReg_32:$src0), -+ (V_COS_F32_e32 (V_MUL_F32_e32 AllReg_32:$src0, (V_MOV_IMM_I32 CONST.TWO_PI_INV))) ++ (fcos VSrc_32:$src0), ++ (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) +>; + +def : Pat < -+ (fsin AllReg_32:$src0), -+ (V_SIN_F32_e32 (V_MUL_F32_e32 AllReg_32:$src0, (V_MOV_IMM_I32 CONST.TWO_PI_INV))) ++ (fsin VSrc_32:$src0), ++ (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) +>; + +def : Pat < @@ -22703,14 +22537,48 @@ index 0000000..5d15761 + 0, 0, 0, 0), sub3) +>; + ++def : Pat < ++ (i32 (sext (i1 SReg_64:$src0))), ++ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0) ++>; ++ +/********** ================== **********/ +/********** VOP3 Patterns **********/ +/********** ================== **********/ + -+def : Pat <(f32 (IL_mad AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2)), -+ (V_MAD_LEGACY_F32 AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, ++def : Pat <(f32 (IL_mad VSrc_32:$src0, VReg_32:$src1, VReg_32:$src2)), ++ (V_MAD_LEGACY_F32 VSrc_32:$src0, VReg_32:$src1, VReg_32:$src2, + 0, 0, 0, 0)>; + ++/********** ================== **********/ ++/********** SMRD Patterns **********/ ++/********** ================== **********/ ++ ++multiclass SMRD_Pattern { ++ // 1. Offset as 8bit DWORD immediate ++ def : Pat < ++ (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)), ++ (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset)) ++ >; ++ ++ // 2. Offset loaded in an 32bit SGPR ++ def : Pat < ++ (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)), ++ (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset))) ++ >; ++ ++ // 3. 
No offset at all ++ def : Pat < ++ (constant_load SReg_64:$sbase), ++ (vt (Instr_IMM SReg_64:$sbase, 0)) ++ >; ++} ++ ++defm : SMRD_Pattern ; ++defm : SMRD_Pattern ; ++defm : SMRD_Pattern ; ++defm : SMRD_Pattern ; ++ +} // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td new file mode 100644 @@ -22774,7 +22642,7 @@ index 0000000..611b9c4 +} diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp new file mode 100644 -index 0000000..3780e40 +index 0000000..2007d30 --- /dev/null +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -0,0 +1,372 @@ @@ -22938,10 +22806,10 @@ index 0000000..3780e40 + .addImm(0) + .addImm(1) + .addImm(1) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addReg(AMDGPU::SREG_LIT_0); ++ .addReg(AMDGPU::VGPR0) ++ .addReg(AMDGPU::VGPR0) ++ .addReg(AMDGPU::VGPR0) ++ .addReg(AMDGPU::VGPR0); + + // ... and terminate wavefront + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); @@ -23077,7 +22945,7 @@ index 0000000..3780e40 + + // Clear this pixel from the exec mask if the operand is negative + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) -+ .addReg(AMDGPU::SREG_LIT_0) ++ .addImm(0) + .addOperand(MI.getOperand(0)); + + MI.eraseFromParent(); @@ -23150,120 +23018,6 @@ index 0000000..3780e40 + + return true; +} -diff --git a/lib/Target/R600/SILowerLiteralConstants.cpp b/lib/Target/R600/SILowerLiteralConstants.cpp -new file mode 100644 -index 0000000..2d5ab0b ---- /dev/null -+++ b/lib/Target/R600/SILowerLiteralConstants.cpp -@@ -0,0 +1,108 @@ -+//===-- SILowerLiteralConstants.cpp - Lower intrs using literal constants--===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+/// \file -+/// \brief This pass performs the following transformation on instructions with -+/// literal constants: -+/// -+/// %VGPR0 = V_MOV_IMM_I32 1 -+/// -+/// becomes: -+/// -+/// BUNDLE -+/// * %VGPR = V_MOV_B32_32 SI_LITERAL_CONSTANT -+/// * SI_LOAD_LITERAL 1 -+/// -+/// The resulting sequence matches exactly how the hardware handles immediate -+/// operands, so this transformation greatly simplifies the code generator. -+/// -+/// Only the *_MOV_IMM_* support immediate operands at the moment, but when -+/// support for immediate operands is added to other instructions, they -+/// will be lowered here as well. 
-+//===----------------------------------------------------------------------===// -+ -+#include "AMDGPU.h" -+#include "llvm/CodeGen/MachineFunction.h" -+#include "llvm/CodeGen/MachineFunctionPass.h" -+#include "llvm/CodeGen/MachineInstrBuilder.h" -+#include "llvm/CodeGen/MachineInstrBundle.h" -+ -+using namespace llvm; -+ -+namespace { -+ -+class SILowerLiteralConstantsPass : public MachineFunctionPass { -+ -+private: -+ static char ID; -+ const TargetInstrInfo *TII; -+ -+public: -+ SILowerLiteralConstantsPass(TargetMachine &tm) : -+ MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } -+ -+ virtual bool runOnMachineFunction(MachineFunction &MF); -+ -+ const char *getPassName() const { -+ return "SI Lower literal constants pass"; -+ } -+}; -+ -+} // End anonymous namespace -+ -+char SILowerLiteralConstantsPass::ID = 0; -+ -+FunctionPass *llvm::createSILowerLiteralConstantsPass(TargetMachine &tm) { -+ return new SILowerLiteralConstantsPass(tm); -+} -+ -+bool SILowerLiteralConstantsPass::runOnMachineFunction(MachineFunction &MF) { -+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); -+ BB != BB_E; ++BB) { -+ MachineBasicBlock &MBB = *BB; -+ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); -+ I != MBB.end(); I = Next) { -+ Next = llvm::next(I); -+ MachineInstr &MI = *I; -+ switch (MI.getOpcode()) { -+ default: break; -+ case AMDGPU::S_MOV_IMM_I32: -+ case AMDGPU::S_MOV_IMM_I64: -+ case AMDGPU::V_MOV_IMM_F32: -+ case AMDGPU::V_MOV_IMM_I32: { -+ unsigned MovOpcode; -+ unsigned LoadLiteralOpcode; -+ MachineOperand LiteralOp = MI.getOperand(1); -+ if (AMDGPU::VReg_32RegClass.contains(MI.getOperand(0).getReg())) { -+ MovOpcode = AMDGPU::V_MOV_B32_e32; -+ } else { -+ MovOpcode = AMDGPU::S_MOV_B32; -+ } -+ if (LiteralOp.isImm()) { -+ LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_I32; -+ } else { -+ LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_F32; -+ } -+ MachineInstr *First = -+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(MovOpcode), -+ MI.getOperand(0).getReg()) -+ .addReg(AMDGPU::SI_LITERAL_CONSTANT); -+ MachineInstr *Last = -+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(LoadLiteralOpcode)) -+ .addOperand(MI.getOperand(1)); -+ Last->setIsInsideBundle(); -+ llvm::finalizeBundle(MBB, First, Last); -+ MI.eraseFromParent(); -+ break; -+ } -+ } -+ } -+ } -+ return false; -+} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp new file mode 100644 index 0000000..7e59b42 @@ -23439,10 +23193,10 @@ index 0000000..40171e4 +#endif // SIREGISTERINFO_H_ diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td new file mode 100644 -index 0000000..9b483eb +index 0000000..ab36b87 --- /dev/null +++ b/lib/Target/R600/SIRegisterInfo.td -@@ -0,0 +1,188 @@ +@@ -0,0 +1,190 @@ + +class SIReg encoding = 0> : Register { + let Namespace = "AMDGPU"; @@ -23457,7 +23211,9 @@ index 0000000..9b483eb + +class SGPR_32 num, string name> : SIReg; + -+class VGPR_32 num, string name> : SIReg; ++class VGPR_32 num, string name> : SIReg { ++ let HWEncoding{8} = 1; ++} + +// Special Registers +def VCC : SIReg<"VCC", 106>; @@ -23465,8 +23221,6 @@ index 0000000..9b483eb +def EXEC_HI : SIReg <"EXEC HI", 127>; +def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>; +def SCC : SIReg<"SCC", 253>; -+def SREG_LIT_0 : SIReg <"S LIT 0", 128>; -+def SI_LITERAL_CONSTANT : SIReg<"LITERAL CONSTANT", 255>; +def M0 : SIReg <"M0", 124>; + +//Interpolation registers @@ -23579,12 +23333,10 @@ index 0000000..9b483eb + +// Register class for all 
scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, -+ (add SGPR_32, SREG_LIT_0, M0, EXEC_LO, EXEC_HI) ++ (add SGPR_32, M0, EXEC_LO, EXEC_HI) +>; + -+def SReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add SGPR_64, VCC, EXEC)>; -+ -+def SReg_1 : RegisterClass<"AMDGPU", [i1], 1, (add VCC, SGPR_64, EXEC)>; ++def SReg_64 : RegisterClass<"AMDGPU", [i1, i64], 64, (add SGPR_64, VCC, EXEC)>; + +def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>; + @@ -23620,10 +23372,14 @@ index 0000000..9b483eb + +def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>; + -+// AllReg_* - A set of all scalar and vector registers of a given width. -+def AllReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add VReg_32, SReg_32)>; ++// [SV]Src_* operands can have either an immediate or an register ++def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; + -+def AllReg_64 : RegisterClass<"AMDGPU", [f64, i64], 64, (add SReg_64, VReg_64)>; ++def SSrc_64 : RegisterClass<"AMDGPU", [i1, i64], 64, (add SReg_64)>; ++ ++def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; ++ ++def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add SReg_64, VReg_64)>; + +// Special register classes for predicates and the M0 register +def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>; @@ -23747,6 +23503,30 @@ index 0000000..b8ac4e7 +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common +diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll +new file mode 100644 +index 0000000..114f9e7 +--- /dev/null ++++ b/test/CodeGen/R600/128bit-kernel-args.ll +@@ -0,0 +1,18 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: @v4i32_kernel_arg ++; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40 ++ ++define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { ++entry: ++ store <4 x i32> %in, <4 x i32> addrspace(1)* %out ++ ret void ++} ++ ++; CHECK: @v4f32_kernel_arg ++; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40 ++define void @v4f32_kernel_args(<4 x float> addrspace(1)* %out, <4 x float> %in) { ++entry: ++ store <4 x float> %in, <4 x float> addrspace(1)* %out ++ ret void ++} diff --git a/test/CodeGen/R600/add.v4i32.ll b/test/CodeGen/R600/add.v4i32.ll new file mode 100644 index 0000000..ac4a874 @@ -23831,6 +23611,40 @@ index 0000000..fd958b3 + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} +diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll +new file mode 100644 +index 0000000..a586742 +--- /dev/null ++++ b/test/CodeGen/R600/disconnected-predset-break-bug.ll +@@ -0,0 +1,28 @@ ++; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; PRED_SET* instructions must be tied to any instruction that uses their ++; result. This tests that there are no instructions between the PRED_SET* ++; and the PREDICATE_BREAK in this loop. 
++ ++; CHECK: @loop_ge ++; CHECK: WHILE ++; CHECK: PRED_SET ++; CHECK-NEXT: PREDICATED_BREAK ++define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { ++entry: ++ %cmp5 = icmp sgt i32 %iterations, 0 ++ br i1 %cmp5, label %for.body, label %for.end ++ ++for.body: ; preds = %for.body, %entry ++ %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] ++ %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] ++ %i.07 = add nsw i32 %i.07.in, -1 ++ %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %ai.06 ++ store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 ++ %add = add nsw i32 %ai.06, 1 ++ %exitcond = icmp eq i32 %add, %iterations ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body, %entry ++ ret void ++} diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll new file mode 100644 index 0000000..0407533 @@ -24176,6 +23990,64 @@ index 0000000..aad44d9 + store i32 %value, i32 addrspace(1)* %out + ret void +} +diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll +new file mode 100644 +index 0000000..382f78c +--- /dev/null ++++ b/test/CodeGen/R600/kcache-fold.ll +@@ -0,0 +1,52 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: MOV T{{[0-9]+\.[XYZW], CBuf0\[[0-9]+\]\.[XYZW]}} ++ ++define void @main() { ++main_body: ++ %0 = load <4 x float> addrspace(9)* null ++ %1 = extractelement <4 x float> %0, i32 0 ++ %2 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %3 = extractelement <4 x float> %2, i32 0 ++ %4 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %5 = extractelement <4 x float> %4, i32 0 ++ %6 = fcmp ult float %1, 0.000000e+00 ++ %7 = select i1 %6, float %3, float %5 ++ %8 = load <4 x float> addrspace(9)* null ++ %9 = extractelement <4 x float> %8, i32 1 ++ %10 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %11 = extractelement <4 x float> %10, i32 1 ++ %12 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %13 = extractelement <4 x float> %12, i32 1 ++ %14 = fcmp ult float %9, 0.000000e+00 ++ %15 = select i1 %14, float %11, float %13 ++ %16 = load <4 x float> addrspace(9)* null ++ %17 = extractelement <4 x float> %16, i32 2 ++ %18 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %19 = extractelement <4 x float> %18, i32 2 ++ %20 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %21 = extractelement <4 x float> %20, i32 2 ++ %22 = fcmp ult float %17, 0.000000e+00 ++ %23 = select i1 %22, float %19, float %21 ++ %24 = load <4 x float> addrspace(9)* null ++ %25 = extractelement <4 x float> %24, i32 3 ++ %26 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %27 = extractelement <4 x float> %26, i32 3 ++ %28 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %29 = extractelement <4 x float> %28, i32 3 ++ %30 = fcmp ult float %25, 0.000000e+00 ++ %31 = select i1 %30, float %27, float %29 ++ %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) ++ %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) ++ %34 = call float @llvm.AMDIL.clamp.(float %23, 
float 0.000000e+00, float 1.000000e+00) ++ %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) ++ %36 = insertelement <4 x float> undef, float %32, i32 0 ++ %37 = insertelement <4 x float> %36, float %33, i32 1 ++ %38 = insertelement <4 x float> %37, float %34, i32 2 ++ %39 = insertelement <4 x float> %38, float %35, i32 3 ++ call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) ++ ret void ++} ++ ++declare float @llvm.AMDIL.clamp.(float, float, float) readnone ++declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg new file mode 100644 index 0000000..36ee493 @@ -24278,6 +24150,35 @@ index 0000000..fac957f +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @llvm.AMDGPU.trunc(float ) readnone +diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll +new file mode 100644 +index 0000000..0c19f14 +--- /dev/null ++++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll +@@ -0,0 +1,23 @@ ++;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s ++ ++;CHECK: S_MOV_B32 ++;CHECK-NEXT: V_INTERP_MOV_F32 ++ ++define void @main() { ++main_body: ++ call void @llvm.AMDGPU.shader.type(i32 0) ++ %0 = load i32 addrspace(8)* inttoptr (i32 6 to i32 addrspace(8)*) ++ %1 = call float @llvm.SI.fs.interp.constant(i32 0, i32 0, i32 %0) ++ %2 = call i32 @llvm.SI.packf16(float %1, float %1) ++ %3 = bitcast i32 %2 to float ++ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) ++ ret void ++} ++ ++declare void @llvm.AMDGPU.shader.type(i32) ++ ++declare float @llvm.SI.fs.interp.constant(i32, i32, i32) readonly ++ ++declare i32 @llvm.SI.packf16(float, float) readnone ++ ++declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll new file mode 100644 index 0000000..dc120bf @@ -25059,15 +24960,3 @@ index 0000000..62cdcf5 +declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32) + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll -index 466b096..d11bb9e 100644 ---- a/test/CodeGen/X86/cvtv2f32.ll -+++ b/test/CodeGen/X86/cvtv2f32.ll -@@ -1,3 +1,7 @@ -+; A bug fix in the DAGCombiner made this test fail, so marking as xfail -+; until this can be investigated further. -+; XFAIL: * -+ - ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s - - define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) { diff --git a/debian/patches/series b/debian/patches/series index b3a0596e..154b4c97 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -30,3 +30,7 @@ polly-c++0x.diff declare_clear_cache.diff #r600-snapshot.diff 31-powerpcspe.diff +0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch +0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch +0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch +
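
Note on the new immediate patterns: the (i64 imm) pattern added to SIInstructions.td above replaces the old S_MOV_IMM_I64 pseudo (and the SILowerLiteralConstants pass) by materializing a 64-bit immediate as two S_MOV_B32 instructions, one per 32-bit half, stitched together with INSERT_SUBREG. A minimal standalone sketch of the LO32/HI32 split that pattern relies on — the constant here is hypothetical, chosen only so both halves are visible, and the printed mnemonics merely mirror the sub0/sub1 placement in the pattern:

#include <cstdint>
#include <cstdio>

int main() {
  // A 64-bit immediate too wide for the 32-bit inline-immediate path.
  uint64_t imm = 0x123456789abcdef0ULL;
  // LO32: low half, moved into sub0 by the first S_MOV_B32.
  uint32_t lo = static_cast<uint32_t>(imm);
  // HI32: high half, moved into sub1 by the second S_MOV_B32.
  uint32_t hi = static_cast<uint32_t>(imm >> 32);
  std::printf("S_MOV_B32 sub0, 0x%08lx\n", static_cast<unsigned long>(lo));
  std::printf("S_MOV_B32 sub1, 0x%08lx\n", static_cast<unsigned long>(hi));
  return 0;
}

Immediates that do fit an inline encoding still take the single-instruction route via the (i64 InlineImm:$imm) -> S_MOV_B64 pattern defined just before the split.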