diff --git a/debian/changelog b/debian/changelog
index a4101940..1bff38c2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -7,6 +7,9 @@ llvm-toolchain-3.2 (1:3.2repack-1~exp4) UNRELEASED; urgency=low
     See: 31-powerpcspe.diff
   * Fix the path detection of scan-build (Closes: #698352)
     See: 32-scan-build-path.diff
+  * debian/patches/r600-snapshot.diff: Move backports into individual patches.
+  * debian/patches/r600-snapshot.diff: Update to mesa-9.1 git tag.
+    (Closes: #703671, #697356)

 -- Sylvestre Ledru  Thu, 14 Mar 2013 17:47:12 +0100

diff --git a/debian/patches/0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch b/debian/patches/0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch
new file mode 100644
index 00000000..d6dd4ef0
--- /dev/null
+++ b/debian/patches/0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch
@@ -0,0 +1,32 @@
+From 7fd12aa7dac937573ea8a48fbe646b334ece4b74 Mon Sep 17 00:00:00 2001
+From: Tom Stellard
+Date: Mon, 10 Dec 2012 21:41:54 +0000
+Subject: [PATCH 1/4] LegalizeDAG: Allow type promotion for scalar stores
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169772 91177308-0d34-0410-b5e6-96231b3b80d8
+(cherry picked from commit 8b7f16e9719a64973e3b4d35e122222c26839c44)
+---
+ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+index abf40b7..9946694 100644
+--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
++++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+@@ -731,9 +731,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
+       return;
+     }
+     case TargetLowering::Promote: {
+-      assert(VT.isVector() && "Unknown legal promote case!");
+-      Value = DAG.getNode(ISD::BITCAST, dl,
+-                          TLI.getTypeToPromoteTo(ISD::STORE, VT), Value);
++      EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
++      assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
++             "Can only promote stores to same size type");
++      Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
+       SDValue Result =
+         DAG.getStore(Chain, dl, Value, Ptr,
+                      ST->getPointerInfo(), isVolatile,
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch b/debian/patches/0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch
new file mode 100644
index 00000000..3dca7854
--- /dev/null
+++ b/debian/patches/0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch
@@ -0,0 +1,31 @@
+From 0530926051350bd36e1ea974066c5a14ae0fe202 Mon Sep 17 00:00:00 2001
+From: Tom Stellard
+Date: Mon, 10 Dec 2012 21:41:58 +0000
+Subject: [PATCH 2/4] LegalizeDAG: Allow type promotion of scalar loads
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169773 91177308-0d34-0410-b5e6-96231b3b80d8
+(cherry picked from commit f45d11b56bffeaec94291f330dc9f7f7aae5a741)
+---
+ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+index 9946694..2596f00 100644
+--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
++++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+@@ -890,10 +890,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
+       break;
+     }
+     case TargetLowering::Promote: {
+-      // Only promote a load of vector type to another.
+-      assert(VT.isVector() && "Cannot promote this load!");
+-      // Change base type to a different vector type.
+      EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
++      assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
++             "Can only promote loads to same size type");
+
+      SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
+                                LD->isVolatile(), LD->isNonTemporal(),
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch b/debian/patches/0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch
new file mode 100644
index 00000000..49422983
--- /dev/null
+++ b/debian/patches/0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch
@@ -0,0 +1,70 @@
+From 1d10f5a4c953104cf44c7c3e5927aec536b734f4 Mon Sep 17 00:00:00 2001
+From: Tom Stellard
+Date: Wed, 2 Jan 2013 22:13:01 +0000
+Subject: [PATCH 3/4] DAGCombiner: Avoid generating illegal vector INT_TO_FP
+ nodes
+
+DAGCombiner::reduceBuildVecConvertToConvertBuildVec() was making two
+mistakes:
+
+1. It was checking the legality of scalar INT_TO_FP nodes and then generating
+vector nodes.
+
+2. It was passing the result value type to
+TargetLoweringInfo::getOperationAction() when it should have been
+passing the value type of the first operand.
+
+git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171420 91177308-0d34-0410-b5e6-96231b3b80d8
+(cherry picked from commit d40758b24ebab5777131533d9369e707fc852594)
+
+Conflicts:
+	test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+	test/CodeGen/R600/vec4-expand.ll
+---
+ lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 +++++----
+ test/CodeGen/X86/cvtv2f32.ll             | 4 ++++
+ 2 files changed, 9 insertions(+), 4 deletions(-)
+
+diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+index 37d7731..d0ca5c0 100644
+--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
++++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+@@ -8514,11 +8514,8 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
+     if (Opcode == ISD::DELETED_NODE &&
+         (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
+       Opcode = Opc;
+-      // If not supported by target, bail out.
+-      if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal &&
+-          TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
+-        return SDValue();
+     }
++
+     if (Opc != Opcode)
+       return SDValue();
+
+@@ -8543,6 +8540,10 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
+   assert(SrcVT != MVT::Other && "Cannot determine source type!");
+
+   EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
++
++  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
++    return SDValue();
++
+   SmallVector Opnds;
+   for (unsigned i = 0; i != NumInScalars; ++i) {
+     SDValue In = N->getOperand(i);
+diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
+index 466b096..d11bb9e 100644
+--- a/test/CodeGen/X86/cvtv2f32.ll
++++ b/test/CodeGen/X86/cvtv2f32.ll
+@@ -1,3 +1,7 @@
++; A bug fix in the DAGCombiner made this test fail, so marking as xfail
++; until this can be investigated further.
++; XFAIL: * ++ + ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s + + define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) { +-- +1.7.10.4 + diff --git a/debian/patches/r600-snapshot.diff b/debian/patches/r600-snapshot.diff index 47c36306..64bab75d 100644 --- a/debian/patches/r600-snapshot.diff +++ b/debian/patches/r600-snapshot.diff @@ -41,116 +41,6 @@ index 4fa0705..02012b9 100755 #include "confdefs.h" #if HAVE_DLFCN_H -diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td -index 2e1597f..059bd80 100644 ---- a/include/llvm/Intrinsics.td -+++ b/include/llvm/Intrinsics.td -@@ -469,3 +469,4 @@ include "llvm/IntrinsicsXCore.td" - include "llvm/IntrinsicsHexagon.td" - include "llvm/IntrinsicsNVVM.td" - include "llvm/IntrinsicsMips.td" -+include "llvm/IntrinsicsR600.td" -diff --git a/include/llvm/IntrinsicsR600.td b/include/llvm/IntrinsicsR600.td -new file mode 100644 -index 0000000..ecb5668 ---- /dev/null -+++ b/include/llvm/IntrinsicsR600.td -@@ -0,0 +1,36 @@ -+//===- IntrinsicsR600.td - Defines R600 intrinsics ---------*- tablegen -*-===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+// This file defines all of the R600-specific intrinsics. -+// -+//===----------------------------------------------------------------------===// -+ -+let TargetPrefix = "r600" in { -+ -+class R600ReadPreloadRegisterIntrinsic -+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, -+ GCCBuiltin; -+ -+multiclass R600ReadPreloadRegisterIntrinsic_xyz { -+ def _x : R600ReadPreloadRegisterIntrinsic; -+ def _y : R600ReadPreloadRegisterIntrinsic; -+ def _z : R600ReadPreloadRegisterIntrinsic; -+} -+ -+defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_global_size">; -+defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_local_size">; -+defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_ngroups">; -+defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_tgid">; -+defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < -+ "__builtin_r600_read_tidig">; -+} // End TargetPrefix = "r600" -diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp -index 37d7731..d0ca5c0 100644 ---- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp -+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp -@@ -8514,11 +8514,8 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { - if (Opcode == ISD::DELETED_NODE && - (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { - Opcode = Opc; -- // If not supported by target, bail out. 
-- if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal && -- TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom) -- return SDValue(); - } -+ - if (Opc != Opcode) - return SDValue(); - -@@ -8543,6 +8540,10 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { - assert(SrcVT != MVT::Other && "Cannot determine source type!"); - - EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); -+ -+ if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) -+ return SDValue(); -+ - SmallVector Opnds; - for (unsigned i = 0; i != NumInScalars; ++i) { - SDValue In = N->getOperand(i); -diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp -index abf40b7..2596f00 100644 ---- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp -+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp -@@ -731,9 +731,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { - return; - } - case TargetLowering::Promote: { -- assert(VT.isVector() && "Unknown legal promote case!"); -- Value = DAG.getNode(ISD::BITCAST, dl, -- TLI.getTypeToPromoteTo(ISD::STORE, VT), Value); -+ EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT); -+ assert(NVT.getSizeInBits() == VT.getSizeInBits() && -+ "Can only promote stores to same size type"); -+ Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value); - SDValue Result = - DAG.getStore(Chain, dl, Value, Ptr, - ST->getPointerInfo(), isVolatile, -@@ -889,10 +890,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { - break; - } - case TargetLowering::Promote: { -- // Only promote a load of vector type to another. -- assert(VT.isVector() && "Cannot promote this load!"); -- // Change base type to a different vector type. - EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); -+ assert(NVT.getSizeInBits() == VT.getSizeInBits() && -+ "Can only promote loads to same size type"); - - SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8995080..84c4111 100644 --- a/lib/Target/LLVMBuild.txt @@ -166,10 +56,10 @@ index 8995080..84c4111 100644 ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h new file mode 100644 -index 0000000..bac01a3 +index 0000000..ba87918 --- /dev/null +++ b/lib/Target/R600/AMDGPU.h -@@ -0,0 +1,52 @@ +@@ -0,0 +1,51 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure @@ -202,7 +92,6 @@ index 0000000..bac01a3 +FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); -+FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); +FunctionPass *createSIInsertWaits(TargetMachine &tm); + +// Passes common to R600 and SI @@ -270,10 +159,10 @@ index 0000000..40f4741 +include "AMDGPUInstructions.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp new file mode 100644 -index 0000000..a8c9bb4 +index 0000000..254e62e --- /dev/null +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp -@@ -0,0 +1,147 @@ +@@ -0,0 +1,145 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +// +// The LLVM Compiler Infrastructure @@ -367,8 +256,6 @@ index 0000000..a8c9bb4 + switch (reg) { + default: break; + case AMDGPU::EXEC: -+ case 
AMDGPU::SI_LITERAL_CONSTANT: -+ case AMDGPU::SREG_LIT_0: + case AMDGPU::M0: + continue; + } @@ -403,7 +290,7 @@ index 0000000..a8c9bb4 + } else { + assert(!"Unknown register class"); + } -+ hwReg = RI->getEncodingValue(reg); ++ hwReg = RI->getEncodingValue(reg) & 0xff; + maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; @@ -471,61 +358,6 @@ index 0000000..3812282 +} // End anonymous llvm + +#endif //AMDGPU_ASMPRINTER_H -diff --git a/lib/Target/R600/AMDGPUCodeEmitter.h b/lib/Target/R600/AMDGPUCodeEmitter.h -new file mode 100644 -index 0000000..84f3588 ---- /dev/null -+++ b/lib/Target/R600/AMDGPUCodeEmitter.h -@@ -0,0 +1,49 @@ -+//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+/// \file -+/// \brief CodeEmitter interface for R600 and SI codegen. -+// -+//===----------------------------------------------------------------------===// -+ -+#ifndef AMDGPUCODEEMITTER_H -+#define AMDGPUCODEEMITTER_H -+ -+namespace llvm { -+ -+class AMDGPUCodeEmitter { -+public: -+ uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; -+ virtual uint64_t getMachineOpValue(const MachineInstr &MI, -+ const MachineOperand &MO) const { return 0; } -+ virtual unsigned GPR4AlignEncode(const MachineInstr &MI, -+ unsigned OpNo) const { -+ return 0; -+ } -+ virtual unsigned GPR2AlignEncode(const MachineInstr &MI, -+ unsigned OpNo) const { -+ return 0; -+ } -+ virtual uint64_t VOPPostEncode(const MachineInstr &MI, -+ uint64_t Value) const { -+ return Value; -+ } -+ virtual uint64_t i32LiteralEncode(const MachineInstr &MI, -+ unsigned OpNo) const { -+ return 0; -+ } -+ virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo) -+ const { -+ return 0; -+ } -+}; -+ -+} // End namespace llvm -+ -+#endif // AMDGPUCODEEMITTER_H diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp new file mode 100644 index 0000000..50297d1 @@ -1198,10 +1030,10 @@ index 0000000..d0d23d6 +} diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h new file mode 100644 -index 0000000..927ed09 +index 0000000..99a11ff --- /dev/null +++ b/lib/Target/R600/AMDGPUISelLowering.h -@@ -0,0 +1,145 @@ +@@ -0,0 +1,140 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure @@ -1257,6 +1089,11 @@ index 0000000..927ed09 + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + DebugLoc DL, SelectionDAG &DAG) const; ++ virtual SDValue LowerCall(CallLoweringInfo &CLI, ++ SmallVectorImpl &InVals) const { ++ CLI.Callee.dump(); ++ llvm_unreachable("Undefined function"); ++ } + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; @@ -1334,25 +1171,15 @@ index 0000000..927ed09 + +} // End namespace AMDGPUISD + -+namespace SIISD { -+ -+enum { -+ SI_FIRST = AMDGPUISD::LAST_AMDGPU_ISD_NUMBER, -+ VCC_AND, -+ VCC_BITCAST -+}; -+ -+} // End namespace SIISD -+ +} // End namespace llvm + +#endif // AMDGPUISELLOWERING_H diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp new file mode 100644 -index 0000000..56aaf23 +index 0000000..15840b3 --- 
/dev/null +++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp -@@ -0,0 +1,326 @@ +@@ -0,0 +1,344 @@ +//===-- AMDGPUIndirectAddressing.cpp - Indirect Adressing Support ---------===// +// +// The LLVM Compiler Infrastructure @@ -1524,9 +1351,6 @@ index 0000000..56aaf23 + } + + if (RegisterAddressMap[Reg] == Address) { -+ if (!regHasExplicitDef(MRI, Reg)) { -+ continue; -+ } + PhiRegisters.push_back(Reg); + } + } @@ -1625,7 +1449,8 @@ index 0000000..56aaf23 + // instruction that uses indirect addressing. + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), + MI.getOperand(0).getReg()) -+ .addReg(AddrReg); ++ .addReg(AddrReg) ++ .addReg(Reg, RegState::Implicit); + } + } else { + // Indirect register access @@ -1647,8 +1472,7 @@ index 0000000..56aaf23 + // We only need to use REG_SEQUENCE for explicit defs, since the + // register coalescer won't do anything with the implicit defs. + MachineInstr *DefInstr = MRI.getVRegDef(Reg); -+ if (!DefInstr->getOperand(0).isReg() || -+ DefInstr->getOperand(0).getReg() != Reg) { ++ if (!regHasExplicitDef(MRI, Reg)) { + continue; + } + @@ -1665,6 +1489,7 @@ index 0000000..56aaf23 + + + Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill); ++ Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit); + + } + MI.eraseFromParent(); @@ -1676,7 +1501,27 @@ index 0000000..56aaf23 +bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI, + unsigned Reg) const { + MachineInstr *DefInstr = MRI.getVRegDef(Reg); -+ return DefInstr && DefInstr->getOperand(0).isReg() && ++ ++ if (!DefInstr) { ++ return false; ++ } ++ ++ if (DefInstr->getOpcode() == AMDGPU::PHI) { ++ bool Explicit = false; ++ for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(), ++ E = DefInstr->operands_end(); ++ I != E; ++I) { ++ const MachineOperand &MO = *I; ++ if (!MO.isReg() || MO.isDef()) { ++ continue; ++ } ++ ++ Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg()); ++ } ++ return Explicit; ++ } ++ ++ return DefInstr->getOperand(0).isReg() && + DefInstr->getOperand(0).getReg() == Reg; +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -1953,7 +1798,7 @@ index 0000000..640707d +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h new file mode 100644 -index 0000000..bca3aa2 +index 0000000..5220aa0 --- /dev/null +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -0,0 +1,207 @@ @@ -2160,8 +2005,8 @@ index 0000000..bca3aa2 + +} // End llvm namespace + -+#define AMDGPU_FLAG_REGISTER_LOAD (1UL << 63) -+#define AMDGPU_FLAG_REGISTER_STORE (1UL << 62) ++#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) ++#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + +#endif // AMDGPUINSTRINFO_H diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td @@ -2907,10 +2752,10 @@ index 0000000..b5aca03 +include "SIRegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp new file mode 100644 -index 0000000..22338b5 +index 0000000..a8c9621 --- /dev/null +++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp -@@ -0,0 +1,714 @@ +@@ -0,0 +1,893 @@ +//===-- AMDGPUStructurizeCFG.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure @@ -2935,30 +2780,101 @@ index 0000000..22338b5 +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" ++#include "llvm/Support/PatternMatch.h" + +using namespace llvm; ++using namespace 
llvm::PatternMatch; + +namespace { + +// Definition of the complex types used in this pass. + +typedef std::pair BBValuePair; -+typedef ArrayRef BBVecRef; + +typedef SmallVector RNVector; +typedef SmallVector BBVector; ++typedef SmallVector BranchVector; +typedef SmallVector BBValueVector; + ++typedef SmallPtrSet BBSet; ++ +typedef DenseMap PhiMap; ++typedef DenseMap DTN2UnsignedMap; +typedef DenseMap BBPhiMap; +typedef DenseMap BBPredicates; +typedef DenseMap PredMap; -+typedef DenseMap VisitedMap; ++typedef DenseMap BB2BBMap; ++typedef DenseMap BB2BBVecMap; + +// The name for newly created blocks. + +static const char *FlowBlockName = "Flow"; + ++/// @brief Find the nearest common dominator for multiple BasicBlocks ++/// ++/// Helper class for AMDGPUStructurizeCFG ++/// TODO: Maybe move into common code ++class NearestCommonDominator { ++ ++ DominatorTree *DT; ++ ++ DTN2UnsignedMap IndexMap; ++ ++ BasicBlock *Result; ++ unsigned ResultIndex; ++ bool ExplicitMentioned; ++ ++public: ++ /// \brief Start a new query ++ NearestCommonDominator(DominatorTree *DomTree) { ++ DT = DomTree; ++ Result = 0; ++ } ++ ++ /// \brief Add BB to the resulting dominator ++ void addBlock(BasicBlock *BB, bool Remember = true) { ++ ++ DomTreeNode *Node = DT->getNode(BB); ++ ++ if (Result == 0) { ++ unsigned Numbering = 0; ++ for (;Node;Node = Node->getIDom()) ++ IndexMap[Node] = ++Numbering; ++ Result = BB; ++ ResultIndex = 1; ++ ExplicitMentioned = Remember; ++ return; ++ } ++ ++ for (;Node;Node = Node->getIDom()) ++ if (IndexMap.count(Node)) ++ break; ++ else ++ IndexMap[Node] = 0; ++ ++ assert(Node && "Dominator tree invalid!"); ++ ++ unsigned Numbering = IndexMap[Node]; ++ if (Numbering > ResultIndex) { ++ Result = Node->getBlock(); ++ ResultIndex = Numbering; ++ ExplicitMentioned = Remember && (Result == BB); ++ } else if (Numbering == ResultIndex) { ++ ExplicitMentioned |= Remember; ++ } ++ } ++ ++ /// \brief Is "Result" one of the BBs added with "Remember" = True? ++ bool wasResultExplicitMentioned() { ++ return ExplicitMentioned; ++ } ++ ++ /// \brief Get the query result ++ BasicBlock *getResult() { ++ return Result; ++ } ++}; ++ +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. 
+/// @@ -3019,46 +2935,63 @@ index 0000000..22338b5 + DominatorTree *DT; + + RNVector Order; -+ VisitedMap Visited; -+ PredMap Predicates; -+ BBPhiMap DeletedPhis; -+ BBVector FlowsInserted; ++ BBSet Visited; + -+ BasicBlock *LoopStart; -+ BasicBlock *LoopEnd; -+ BBPredicates LoopPred; ++ BBPhiMap DeletedPhis; ++ BB2BBVecMap AddedPhis; ++ ++ PredMap Predicates; ++ BranchVector Conditions; ++ ++ BB2BBMap Loops; ++ PredMap LoopPreds; ++ BranchVector LoopConds; ++ ++ RegionNode *PrevNode; + + void orderNodes(); + -+ void buildPredicate(BranchInst *Term, unsigned Idx, -+ BBPredicates &Pred, bool Invert); ++ void analyzeLoops(RegionNode *N); + -+ void analyzeBlock(BasicBlock *BB); ++ Value *invert(Value *Condition); + -+ void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx); ++ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); ++ ++ void gatherPredicates(RegionNode *N); + + void collectInfos(); + -+ bool dominatesPredicates(BasicBlock *A, BasicBlock *B); -+ -+ void killTerminator(BasicBlock *BB); -+ -+ RegionNode *skipChained(RegionNode *Node); ++ void insertConditions(bool Loops); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + -+ BasicBlock *getNextFlow(BasicBlock *Prev); ++ void setPhiValues(); + -+ bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node); ++ void killTerminator(BasicBlock *BB); + -+ BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node); ++ void changeExit(RegionNode *Node, BasicBlock *NewExit, ++ bool IncludeDominator); ++ ++ BasicBlock *getNextFlow(BasicBlock *Dominator); ++ ++ BasicBlock *needPrefix(bool NeedEmpty); ++ ++ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); ++ ++ void setPrevNode(BasicBlock *BB); ++ ++ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node); ++ ++ bool isPredictableTrue(RegionNode *Node); ++ ++ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd); ++ ++ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void createFlow(); + -+ void insertConditions(); -+ + void rebuildSSA(); + +public: @@ -3111,212 +3044,214 @@ index 0000000..22338b5 + } +} + -+/// \brief Build blocks and loop predicates -+void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx, -+ BBPredicates &Pred, bool Invert) { -+ Value *True = Invert ? BoolFalse : BoolTrue; -+ Value *False = Invert ? BoolTrue : BoolFalse; ++/// \brief Determine the end of the loops ++void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) { + -+ RegionInfo *RI = ParentRegion->getRegionInfo(); -+ BasicBlock *BB = Term->getParent(); -+ -+ // Handle the case where multiple regions start at the same block -+ Region *R = BB != ParentRegion->getEntry() ? 
-+ RI->getRegionFor(BB) : ParentRegion; -+ -+ if (R == ParentRegion) { -+ // It's a top level block in our region -+ Value *Cond = True; -+ if (Term->isConditional()) { -+ BasicBlock *Other = Term->getSuccessor(!Idx); -+ -+ if (Visited.count(Other)) { -+ if (!Pred.count(Other)) -+ Pred[Other] = False; -+ -+ if (!Pred.count(BB)) -+ Pred[BB] = True; -+ return; -+ } -+ Cond = Term->getCondition(); -+ -+ if (Idx != Invert) -+ Cond = BinaryOperator::CreateNot(Cond, "", Term); -+ } -+ -+ Pred[BB] = Cond; -+ -+ } else if (ParentRegion->contains(R)) { -+ // It's a block in a sub region -+ while(R->getParent() != ParentRegion) -+ R = R->getParent(); -+ -+ Pred[R->getEntry()] = True; ++ if (N->isSubRegion()) { ++ // Test for exit as back edge ++ BasicBlock *Exit = N->getNodeAs()->getExit(); ++ if (Visited.count(Exit)) ++ Loops[Exit] = N->getEntry(); + + } else { -+ // It's a branch from outside into our parent region -+ Pred[BB] = True; -+ } -+} -+ -+/// \brief Analyze the successors of each block and build up predicates -+void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) { -+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB); -+ BBPredicates &Pred = Predicates[BB]; -+ -+ for (; PI != PE; ++PI) { -+ BranchInst *Term = cast((*PI)->getTerminator()); ++ // Test for sucessors as back edge ++ BasicBlock *BB = N->getNodeAs(); ++ BranchInst *Term = cast(BB->getTerminator()); + + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); -+ if (Succ != BB) -+ continue; -+ buildPredicate(Term, i, Pred, false); ++ ++ if (Visited.count(Succ)) ++ Loops[Succ] = BB; + } + } +} + -+/// \brief Analyze the conditions leading to loop to a previous block -+void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) { -+ BranchInst *Term = cast(BB->getTerminator()); ++/// \brief Invert the given condition ++Value *AMDGPUStructurizeCFG::invert(Value *Condition) { + -+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { -+ BasicBlock *Succ = Term->getSuccessor(i); ++ // First: Check if it's a constant ++ if (Condition == BoolTrue) ++ return BoolFalse; + -+ // Ignore it if it's not a back edge -+ if (!Visited.count(Succ)) ++ if (Condition == BoolFalse) ++ return BoolTrue; ++ ++ if (Condition == BoolUndef) ++ return BoolUndef; ++ ++ // Second: If the condition is already inverted, return the original value ++ if (match(Condition, m_Not(m_Value(Condition)))) ++ return Condition; ++ ++ // Third: Check all the users for an invert ++ BasicBlock *Parent = cast(Condition)->getParent(); ++ for (Value::use_iterator I = Condition->use_begin(), ++ E = Condition->use_end(); I != E; ++I) { ++ ++ Instruction *User = dyn_cast(*I); ++ if (!User || User->getParent() != Parent) + continue; + -+ buildPredicate(Term, i, LoopPred, true); ++ if (match(*I, m_Not(m_Specific(Condition)))) ++ return *I; ++ } + -+ LoopEnd = BB; -+ if (Visited[Succ] < LoopIdx) { -+ LoopIdx = Visited[Succ]; -+ LoopStart = Succ; ++ // Last option: Create a new instruction ++ return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); ++} ++ ++/// \brief Build the condition for one edge ++Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, ++ bool Invert) { ++ Value *Cond = Invert ? 
BoolFalse : BoolTrue; ++ if (Term->isConditional()) { ++ Cond = Term->getCondition(); ++ ++ if (Idx != Invert) ++ Cond = invert(Cond); ++ } ++ return Cond; ++} ++ ++/// \brief Analyze the predecessors of each block and build up predicates ++void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) { ++ ++ RegionInfo *RI = ParentRegion->getRegionInfo(); ++ BasicBlock *BB = N->getEntry(); ++ BBPredicates &Pred = Predicates[BB]; ++ BBPredicates &LPred = LoopPreds[BB]; ++ ++ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); ++ PI != PE; ++PI) { ++ ++ // Ignore it if it's a branch from outside into our region entry ++ if (!ParentRegion->contains(*PI)) ++ continue; ++ ++ Region *R = RI->getRegionFor(*PI); ++ if (R == ParentRegion) { ++ ++ // It's a top level block in our region ++ BranchInst *Term = cast((*PI)->getTerminator()); ++ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { ++ BasicBlock *Succ = Term->getSuccessor(i); ++ if (Succ != BB) ++ continue; ++ ++ if (Visited.count(*PI)) { ++ // Normal forward edge ++ if (Term->isConditional()) { ++ // Try to treat it like an ELSE block ++ BasicBlock *Other = Term->getSuccessor(!i); ++ if (Visited.count(Other) && !Loops.count(Other) && ++ !Pred.count(Other) && !Pred.count(*PI)) { ++ ++ Pred[Other] = BoolFalse; ++ Pred[*PI] = BoolTrue; ++ continue; ++ } ++ } ++ Pred[*PI] = buildCondition(Term, i, false); ++ ++ } else { ++ // Back edge ++ LPred[*PI] = buildCondition(Term, i, true); ++ } ++ } ++ ++ } else { ++ ++ // It's an exit from a sub region ++ while(R->getParent() != ParentRegion) ++ R = R->getParent(); ++ ++ // Edge from inside a subregion to its entry, ignore it ++ if (R == N) ++ continue; ++ ++ BasicBlock *Entry = R->getEntry(); ++ if (Visited.count(Entry)) ++ Pred[Entry] = BoolTrue; ++ else ++ LPred[Entry] = BoolFalse; + } + } +} + +/// \brief Collect various loop and predicate infos +void AMDGPUStructurizeCFG::collectInfos() { -+ unsigned Number = 0, LoopIdx = ~0; + + // Reset predicate + Predicates.clear(); + + // and loop infos -+ LoopStart = LoopEnd = 0; -+ LoopPred.clear(); ++ Loops.clear(); ++ LoopPreds.clear(); + -+ RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); -+ for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) { ++ // Reset the visited nodes ++ Visited.clear(); ++ ++ for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); ++ OI != OE; ++OI) { + + // Analyze all the conditions leading to a node -+ analyzeBlock((*OI)->getEntry()); ++ gatherPredicates(*OI); + -+ if ((*OI)->isSubRegion()) -+ continue; ++ // Remember that we've seen this node ++ Visited.insert((*OI)->getEntry()); + -+ // Find the first/last loop nodes and loop predicates -+ analyzeLoop((*OI)->getNodeAs(), LoopIdx); ++ // Find the last back edges ++ analyzeLoops(*OI); + } +} + -+/// \brief Does A dominate all the predicates of B ? -+bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) { -+ BBPredicates &Preds = Predicates[B]; -+ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); -+ PI != PE; ++PI) { ++/// \brief Insert the missing branch conditions ++void AMDGPUStructurizeCFG::insertConditions(bool Loops) { ++ BranchVector &Conds = Loops ? LoopConds : Conditions; ++ Value *Default = Loops ? 
BoolTrue : BoolFalse; ++ SSAUpdater PhiInserter; + -+ if (!DT->dominates(A, PI->first)) -+ return false; -+ } -+ return true; -+} ++ for (BranchVector::iterator I = Conds.begin(), ++ E = Conds.end(); I != E; ++I) { + -+/// \brief Remove phi values from all successors and the remove the terminator. -+void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { -+ TerminatorInst *Term = BB->getTerminator(); -+ if (!Term) -+ return; ++ BranchInst *Term = *I; ++ assert(Term->isConditional()); + -+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); -+ SI != SE; ++SI) { ++ BasicBlock *Parent = Term->getParent(); ++ BasicBlock *SuccTrue = Term->getSuccessor(0); ++ BasicBlock *SuccFalse = Term->getSuccessor(1); + -+ delPhiValues(BB, *SI); -+ } ++ PhiInserter.Initialize(Boolean, ""); ++ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); ++ PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + -+ Term->eraseFromParent(); -+} ++ BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; + -+/// First: Skip forward to the first region node that either isn't a subregion or not -+/// dominating it's exit, remove all the skipped nodes from the node order. -+/// -+/// Second: Handle the first successor directly if the resulting nodes successor -+/// predicates are still dominated by the original entry -+RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) { -+ BasicBlock *Entry = Node->getEntry(); ++ NearestCommonDominator Dominator(DT); ++ Dominator.addBlock(Parent, false); + -+ // Skip forward as long as it is just a linear flow -+ while (true) { -+ BasicBlock *Entry = Node->getEntry(); -+ BasicBlock *Exit; ++ Value *ParentValue = 0; ++ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); ++ PI != PE; ++PI) { + -+ if (Node->isSubRegion()) { -+ Exit = Node->getNodeAs()->getExit(); -+ } else { -+ TerminatorInst *Term = Entry->getTerminator(); -+ if (Term->getNumSuccessors() != 1) ++ if (PI->first == Parent) { ++ ParentValue = PI->second; + break; -+ Exit = Term->getSuccessor(0); ++ } ++ PhiInserter.AddAvailableValue(PI->first, PI->second); ++ Dominator.addBlock(PI->first); + } + -+ // It's a back edge, break here so we can insert a loop node -+ if (!Visited.count(Exit)) -+ return Node; ++ if (ParentValue) { ++ Term->setCondition(ParentValue); ++ } else { ++ if (!Dominator.wasResultExplicitMentioned()) ++ PhiInserter.AddAvailableValue(Dominator.getResult(), Default); + -+ // More than node edges are pointing to exit -+ if (!DT->dominates(Entry, Exit)) -+ return Node; -+ -+ RegionNode *Next = ParentRegion->getNode(Exit); -+ RNVector::iterator I = std::find(Order.begin(), Order.end(), Next); -+ assert(I != Order.end()); -+ -+ Visited.erase(Next->getEntry()); -+ Order.erase(I); -+ Node = Next; ++ Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); ++ } + } -+ -+ BasicBlock *BB = Node->getEntry(); -+ TerminatorInst *Term = BB->getTerminator(); -+ if (Term->getNumSuccessors() != 2) -+ return Node; -+ -+ // Our node has exactly two succesors, check if we can handle -+ // any of them directly -+ BasicBlock *Succ = Term->getSuccessor(0); -+ if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) { -+ Succ = Term->getSuccessor(1); -+ if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) -+ return Node; -+ } else { -+ BasicBlock *Succ2 = Term->getSuccessor(1); -+ if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] && -+ dominatesPredicates(Entry, Succ2)) -+ Succ = Succ2; -+ } -+ -+ RegionNode *Next = 
ParentRegion->getNode(Succ); -+ RNVector::iterator E = Order.end(); -+ RNVector::iterator I = std::find(Order.begin(), E, Next); -+ assert(I != E); -+ -+ killTerminator(BB); -+ FlowsInserted.push_back(BB); -+ Visited.erase(Succ); -+ Order.erase(I); -+ return ParentRegion->getNode(wireFlowBlock(BB, Next)); +} + +/// \brief Remove all PHI values coming from "From" into "To" and remember @@ -3334,224 +3269,306 @@ index 0000000..22338b5 + } +} + -+/// \brief Add the PHI values back once we knew the new predecessor ++/// \brief Add a dummy PHI value as soon as we knew the new predecessor +void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { -+ if (!DeletedPhis.count(To)) ++ for (BasicBlock::iterator I = To->begin(), E = To->end(); ++ I != E && isa(*I);) { ++ ++ PHINode &Phi = cast(*I++); ++ Value *Undef = UndefValue::get(Phi.getType()); ++ Phi.addIncoming(Undef, From); ++ } ++ AddedPhis[To].push_back(From); ++} ++ ++/// \brief Add the real PHI value as soon as everything is set up ++void AMDGPUStructurizeCFG::setPhiValues() { ++ ++ SSAUpdater Updater; ++ for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end(); ++ AI != AE; ++AI) { ++ ++ BasicBlock *To = AI->first; ++ BBVector &From = AI->second; ++ ++ if (!DeletedPhis.count(To)) ++ continue; ++ ++ PhiMap &Map = DeletedPhis[To]; ++ for (PhiMap::iterator PI = Map.begin(), PE = Map.end(); ++ PI != PE; ++PI) { ++ ++ PHINode *Phi = PI->first; ++ Value *Undef = UndefValue::get(Phi->getType()); ++ Updater.Initialize(Phi->getType(), ""); ++ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); ++ Updater.AddAvailableValue(To, Undef); ++ ++ NearestCommonDominator Dominator(DT); ++ Dominator.addBlock(To, false); ++ for (BBValueVector::iterator VI = PI->second.begin(), ++ VE = PI->second.end(); VI != VE; ++VI) { ++ ++ Updater.AddAvailableValue(VI->first, VI->second); ++ Dominator.addBlock(VI->first); ++ } ++ ++ if (!Dominator.wasResultExplicitMentioned()) ++ Updater.AddAvailableValue(Dominator.getResult(), Undef); ++ ++ for (BBVector::iterator FI = From.begin(), FE = From.end(); ++ FI != FE; ++FI) { ++ ++ int Idx = Phi->getBasicBlockIndex(*FI); ++ assert(Idx != -1); ++ Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI)); ++ } ++ } ++ ++ DeletedPhis.erase(To); ++ } ++ assert(DeletedPhis.empty()); ++} ++ ++/// \brief Remove phi values from all successors and then remove the terminator. 
++void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { ++ TerminatorInst *Term = BB->getTerminator(); ++ if (!Term) + return; + -+ PhiMap &Map = DeletedPhis[To]; -+ SSAUpdater Updater; ++ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); ++ SI != SE; ++SI) { + -+ for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) { -+ -+ PHINode *Phi = I->first; -+ Updater.Initialize(Phi->getType(), ""); -+ BasicBlock *Fallback = To; -+ bool HaveFallback = false; -+ -+ for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end(); -+ VI != VE; ++VI) { -+ -+ Updater.AddAvailableValue(VI->first, VI->second); -+ BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first); -+ if (Dom == VI->first) -+ HaveFallback = true; -+ else if (Dom != Fallback) -+ HaveFallback = false; -+ Fallback = Dom; -+ } -+ if (!HaveFallback) { -+ Value *Undef = UndefValue::get(Phi->getType()); -+ Updater.AddAvailableValue(Fallback, Undef); -+ } -+ -+ Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From); ++ delPhiValues(BB, *SI); ++ } ++ ++ Term->eraseFromParent(); ++} ++ ++/// \brief Let node exit(s) point to NewExit ++void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, ++ bool IncludeDominator) { ++ ++ if (Node->isSubRegion()) { ++ Region *SubRegion = Node->getNodeAs(); ++ BasicBlock *OldExit = SubRegion->getExit(); ++ BasicBlock *Dominator = 0; ++ ++ // Find all the edges from the sub region to the exit ++ for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); ++ I != E;) { ++ ++ BasicBlock *BB = *I++; ++ if (!SubRegion->contains(BB)) ++ continue; ++ ++ // Modify the edges to point to the new exit ++ delPhiValues(BB, OldExit); ++ BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit); ++ addPhiValues(BB, NewExit); ++ ++ // Find the new dominator (if requested) ++ if (IncludeDominator) { ++ if (!Dominator) ++ Dominator = BB; ++ else ++ Dominator = DT->findNearestCommonDominator(Dominator, BB); ++ } ++ } ++ ++ // Change the dominator (if requested) ++ if (Dominator) ++ DT->changeImmediateDominator(NewExit, Dominator); ++ ++ // Update the region info ++ SubRegion->replaceExit(NewExit); ++ ++ } else { ++ BasicBlock *BB = Node->getNodeAs(); ++ killTerminator(BB); ++ BranchInst::Create(NewExit, BB); ++ addPhiValues(BB, NewExit); ++ if (IncludeDominator) ++ DT->changeImmediateDominator(NewExit, BB); + } -+ DeletedPhis.erase(To); +} + +/// \brief Create a new flow node and update dominator tree and region info -+BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) { ++BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) { + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); -+ DT->addNewBlock(Flow, Prev); ++ DT->addNewBlock(Flow, Dominator); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); -+ FlowsInserted.push_back(Flow); + return Flow; +} + ++/// \brief Create a new or reuse the previous node as flow node ++BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) { ++ ++ BasicBlock *Entry = PrevNode->getEntry(); ++ ++ if (!PrevNode->isSubRegion()) { ++ killTerminator(Entry); ++ if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) ++ return Entry; ++ ++ } ++ ++ // create a new flow node ++ BasicBlock *Flow = getNextFlow(Entry); ++ ++ // and wire it up ++ changeExit(PrevNode, Flow, true); ++ PrevNode = ParentRegion->getBBNode(Flow); ++ return Flow; ++} ++ ++/// \brief Returns the region exit if possible, otherwise just a new flow node ++BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow, ++ bool ExitUseAllowed) { ++ ++ if (Order.empty() && ExitUseAllowed) { ++ BasicBlock *Exit = ParentRegion->getExit(); ++ DT->changeImmediateDominator(Exit, Flow); ++ addPhiValues(Flow, Exit); ++ return Exit; ++ } ++ return getNextFlow(Flow); ++} ++ ++/// \brief Set the previous node ++void AMDGPUStructurizeCFG::setPrevNode(BasicBlock *BB) { ++ PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; ++} ++ ++/// \brief Does BB dominate all the predicates of Node ? ++bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) { ++ BBPredicates &Preds = Predicates[Node->getEntry()]; ++ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); ++ PI != PE; ++PI) { ++ ++ if (!DT->dominates(BB, PI->first)) ++ return false; ++ } ++ return true; ++} ++ +/// \brief Can we predict that this node will always be called? 
-+bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev, -+ BasicBlock *Node) { -+ BBPredicates &Preds = Predicates[Node]; ++bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) { ++ ++ BBPredicates &Preds = Predicates[Node->getEntry()]; + bool Dominated = false; + ++ // Regionentry is always true ++ if (PrevNode == 0) ++ return true; ++ + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + -+ if (!Dominated && DT->dominates(I->first, Prev)) ++ if (!Dominated && DT->dominates(I->first, PrevNode->getEntry())) + Dominated = true; + } ++ ++ // TODO: The dominator check is too strict + return Dominated; +} + -+/// \brief Wire up the new control flow by inserting or updating the branch -+/// instructions at node exits -+BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev, -+ RegionNode *Node) { -+ BasicBlock *Entry = Node->getEntry(); ++/// Take one node from the order vector and wire it up ++void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed, ++ BasicBlock *LoopEnd) { + -+ if (LoopStart == Entry) { -+ LoopStart = Prev; -+ LoopPred[Prev] = BoolTrue; -+ } ++ RegionNode *Node = Order.pop_back_val(); ++ Visited.insert(Node->getEntry()); + -+ // Wire it up temporary, skipChained may recurse into us -+ BranchInst::Create(Entry, Prev); -+ DT->changeImmediateDominator(Entry, Prev); -+ addPhiValues(Prev, Entry); -+ -+ Node = skipChained(Node); -+ -+ BasicBlock *Next = getNextFlow(Prev); -+ if (!isPredictableTrue(Prev, Entry)) { -+ // Let Prev point to entry and next block -+ Prev->getTerminator()->eraseFromParent(); -+ BranchInst::Create(Entry, Next, BoolUndef, Prev); -+ } else { -+ DT->changeImmediateDominator(Next, Entry); -+ } -+ -+ // Let node exit(s) point to next block -+ if (Node->isSubRegion()) { -+ Region *SubRegion = Node->getNodeAs(); -+ BasicBlock *Exit = SubRegion->getExit(); -+ -+ // Find all the edges from the sub region to the exit -+ BBVector ToDo; -+ for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { -+ if (SubRegion->contains(*I)) -+ ToDo.push_back(*I); ++ if (isPredictableTrue(Node)) { ++ // Just a linear flow ++ if (PrevNode) { ++ changeExit(PrevNode, Node->getEntry(), true); + } -+ -+ // Modify the edges to point to the new flow block -+ for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) { -+ delPhiValues(*I, Exit); -+ TerminatorInst *Term = (*I)->getTerminator(); -+ Term->replaceUsesOfWith(Exit, Next); -+ } -+ -+ // Update the region info -+ SubRegion->replaceExit(Next); ++ PrevNode = Node; + + } else { -+ BasicBlock *BB = Node->getNodeAs(); -+ killTerminator(BB); -+ BranchInst::Create(Next, BB); ++ // Insert extra prefix node (or reuse last one) ++ BasicBlock *Flow = needPrefix(false); + -+ if (BB == LoopEnd) -+ LoopEnd = 0; ++ // Insert extra postfix node (or use exit instead) ++ BasicBlock *Entry = Node->getEntry(); ++ BasicBlock *Next = needPostfix(Flow, ExitUseAllowed); ++ ++ // let it point to entry and next block ++ Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow)); ++ addPhiValues(Flow, Entry); ++ DT->changeImmediateDominator(Entry, Flow); ++ ++ PrevNode = Node; ++ while (!Order.empty() && !Visited.count(LoopEnd) && ++ dominatesPredicates(Entry, Order.back())) { ++ handleLoops(false, LoopEnd); ++ } ++ ++ changeExit(PrevNode, Next, false); ++ setPrevNode(Next); + } -+ -+ return Next; +} + -+/// Destroy node order and visited map, build up flow order instead. 
++void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed, ++ BasicBlock *LoopEnd) { ++ RegionNode *Node = Order.back(); ++ BasicBlock *LoopStart = Node->getEntry(); ++ ++ if (!Loops.count(LoopStart)) { ++ wireFlow(ExitUseAllowed, LoopEnd); ++ return; ++ } ++ ++ if (!isPredictableTrue(Node)) ++ LoopStart = needPrefix(true); ++ ++ LoopEnd = Loops[Node->getEntry()]; ++ wireFlow(false, LoopEnd); ++ while (!Visited.count(LoopEnd)) { ++ handleLoops(false, LoopEnd); ++ } ++ ++ // Create an extra loop end node ++ LoopEnd = needPrefix(false); ++ BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); ++ LoopConds.push_back(BranchInst::Create(Next, LoopStart, ++ BoolUndef, LoopEnd)); ++ addPhiValues(LoopEnd, LoopStart); ++ setPrevNode(Next); ++} ++ +/// After this function control flow looks like it should be, but -+/// branches only have undefined conditions. ++/// branches and PHI nodes only have undefined conditions. +void AMDGPUStructurizeCFG::createFlow() { -+ DeletedPhis.clear(); -+ -+ BasicBlock *Prev = Order.pop_back_val()->getEntry(); -+ assert(Prev == ParentRegion->getEntry() && "Incorrect node order!"); -+ Visited.erase(Prev); -+ -+ if (LoopStart == Prev) { -+ // Loop starts at entry, split entry so that we can predicate it -+ BasicBlock::iterator Insert = Prev->getFirstInsertionPt(); -+ BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName); -+ DT->addNewBlock(Split, Prev); -+ ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); -+ Predicates[Split] = Predicates[Prev]; -+ Order.push_back(ParentRegion->getBBNode(Split)); -+ LoopPred[Prev] = BoolTrue; -+ -+ } else if (LoopStart == Order.back()->getEntry()) { -+ // Loop starts behind entry, split entry so that we can jump to it -+ Instruction *Term = Prev->getTerminator(); -+ BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName); -+ DT->addNewBlock(Split, Prev); -+ ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); -+ Prev = Split; -+ } -+ -+ killTerminator(Prev); -+ FlowsInserted.clear(); -+ FlowsInserted.push_back(Prev); -+ -+ while (!Order.empty()) { -+ RegionNode *Node = Order.pop_back_val(); -+ Visited.erase(Node->getEntry()); -+ Prev = wireFlowBlock(Prev, Node); -+ if (LoopStart && !LoopEnd) { -+ // Create an extra loop end node -+ LoopEnd = Prev; -+ Prev = getNextFlow(LoopEnd); -+ BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd); -+ addPhiValues(LoopEnd, LoopStart); -+ } -+ } + + BasicBlock *Exit = ParentRegion->getExit(); -+ BranchInst::Create(Exit, Prev); -+ addPhiValues(Prev, Exit); -+ if (DT->dominates(ParentRegion->getEntry(), Exit)) -+ DT->changeImmediateDominator(Exit, Prev); ++ bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); + -+ if (LoopStart && LoopEnd) { -+ BBVector::iterator FI = std::find(FlowsInserted.begin(), -+ FlowsInserted.end(), -+ LoopStart); -+ for (; *FI != LoopEnd; ++FI) { -+ addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0)); -+ } ++ DeletedPhis.clear(); ++ AddedPhis.clear(); ++ Conditions.clear(); ++ LoopConds.clear(); ++ ++ PrevNode = 0; ++ Visited.clear(); ++ ++ while (!Order.empty()) { ++ handleLoops(EntryDominatesExit, 0); + } + -+ assert(Order.empty()); -+ assert(Visited.empty()); -+ assert(DeletedPhis.empty()); -+} -+ -+/// \brief Insert the missing branch conditions -+void AMDGPUStructurizeCFG::insertConditions() { -+ SSAUpdater PhiInserter; -+ -+ for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end(); -+ FI != FE; ++FI) { -+ -+ BranchInst *Term = cast((*FI)->getTerminator()); -+ if 
(Term->isUnconditional()) -+ continue; -+ -+ PhiInserter.Initialize(Boolean, ""); -+ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse); -+ -+ BasicBlock *Succ = Term->getSuccessor(0); -+ BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ]; -+ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); -+ PI != PE; ++PI) { -+ -+ PhiInserter.AddAvailableValue(PI->first, PI->second); -+ } -+ -+ Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI)); -+ } ++ if (PrevNode) ++ changeExit(PrevNode, Exit, EntryDominatesExit); ++ else ++ assert(EntryDominatesExit); +} + +/// Handle a rare case where the disintegrated nodes instructions @@ -3609,14 +3626,21 @@ index 0000000..22338b5 + orderNodes(); + collectInfos(); + createFlow(); -+ insertConditions(); ++ insertConditions(false); ++ insertConditions(true); ++ setPhiValues(); + rebuildSSA(); + ++ // Cleanup + Order.clear(); + Visited.clear(); -+ Predicates.clear(); + DeletedPhis.clear(); -+ FlowsInserted.clear(); ++ AddedPhis.clear(); ++ Predicates.clear(); ++ Conditions.clear(); ++ Loops.clear(); ++ LoopPreds.clear(); ++ LoopConds.clear(); + + return true; +} @@ -3791,10 +3815,10 @@ index 0000000..cab7884 +#endif // AMDGPUSUBTARGET_H diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp new file mode 100644 -index 0000000..821e864 +index 0000000..e2f00be --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp -@@ -0,0 +1,154 @@ +@@ -0,0 +1,153 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure @@ -3942,7 +3966,6 @@ index 0000000..821e864 + addPass(&FinalizeMachineBundlesID); + addPass(createR600LowerConstCopy(*TM)); + } else { -+ addPass(createSILowerLiteralConstantsPass(*TM)); + addPass(createSILowerControlFlowPass(*TM)); + } + @@ -8242,10 +8265,10 @@ index 0000000..6dc2deb +#endif // AMDILEVERGREENDEVICE_H diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp new file mode 100644 -index 0000000..2699409 +index 0000000..2e726e9 --- /dev/null +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp -@@ -0,0 +1,625 @@ +@@ -0,0 +1,577 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure @@ -8320,8 +8343,6 @@ index 0000000..2699409 + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, + SDValue &BaseReg, SDValue& Offset); -+ bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); -+ bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + @@ -8468,7 +8489,9 @@ index 0000000..2699409 + continue; + } + } else { -+ if (!TII->isALUInstr(Use->getMachineOpcode())) { ++ if (!TII->isALUInstr(Use->getMachineOpcode()) || ++ (TII->get(Use->getMachineOpcode()).TSFlags & ++ R600_InstFlag::VECTOR)) { + continue; + } + @@ -8511,7 +8534,8 @@ index 0000000..2699409 + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + const R600InstrInfo *TII = + static_cast(TM.getInstrInfo()); -+ if (Result && Result->isMachineOpcode() ++ if (Result && Result->isMachineOpcode() && ++ !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR) + && TII->isALUInstr(Result->getMachineOpcode())) { + // Fold FNEG/FABS/CONST_ADDRESS + // TODO: Isel can generate multiple 
MachineInst, we need to recursively @@ -8581,6 +8605,8 @@ index 0000000..2699409 + SDValue Operand = Ops[OperandIdx[i] - 1]; + switch (Operand.getOpcode()) { + case AMDGPUISD::CONST_ADDRESS: { ++ if (i == 2) ++ break; + SDValue CstOffset; + if (!Operand.getValueType().isVector() && + SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { @@ -8775,43 +8801,6 @@ index 0000000..2699409 + return false; +} + -+bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base, -+ SDValue& Offset) { -+ if (Addr.getOpcode() == ISD::TargetExternalSymbol || -+ Addr.getOpcode() == ISD::TargetGlobalAddress) { -+ return false; -+ } -+ -+ -+ if (Addr.getOpcode() == ISD::ADD) { -+ bool Match = false; -+ -+ // Find the base ptr and the offset -+ for (unsigned i = 0; i < Addr.getNumOperands(); i++) { -+ SDValue Arg = Addr.getOperand(i); -+ ConstantSDNode * OffsetNode = dyn_cast(Arg); -+ // This arg isn't a constant so it must be the base PTR. -+ if (!OffsetNode) { -+ Base = Addr.getOperand(i); -+ continue; -+ } -+ // Check if the constant argument fits in 8-bits. The offset is in bytes -+ // so we need to convert it to dwords. -+ if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) { -+ Match = true; -+ Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2, -+ MVT::i32); -+ } -+ } -+ return Match; -+ } -+ -+ // Default case, no offset -+ Base = Addr; -+ Offset = CurDAG->getTargetConstant(0, MVT::i32); -+ return true; -+} -+ +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode * IMMOffset; @@ -8839,20 +8828,6 @@ index 0000000..2699409 + return true; +} + -+bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, -+ SDValue& Offset) { -+ if (Addr.getOpcode() == ISD::TargetExternalSymbol || -+ Addr.getOpcode() == ISD::TargetGlobalAddress || -+ Addr.getOpcode() != ISD::ADD) { -+ return false; -+ } -+ -+ Base = Addr.getOperand(0); -+ Offset = Addr.getOperand(1); -+ -+ return true; -+} -+ +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; @@ -11799,10 +11774,10 @@ index 0000000..8ef9f8c +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 -index 0000000..fb17ab7 +index 0000000..d6450a0 --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp -@@ -0,0 +1,153 @@ +@@ -0,0 +1,168 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -11845,6 +11820,21 @@ index 0000000..fb17ab7 + } +} + ++void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, ++ raw_ostream &O) { ++ unsigned Imm = MI->getOperand(OpNum).getImm(); ++ ++ if (Imm == 2) { ++ O << "P0"; ++ } else if (Imm == 1) { ++ O << "P20"; ++ } else if (Imm == 0) { ++ O << "P10"; ++ } else { ++ assert(!"Invalid interpolation parameter slot"); ++ } ++} ++ +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); @@ -11958,10 +11948,10 @@ index 0000000..fb17ab7 +#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 -index 0000000..e775c4c +index 0000000..767a708 --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h -@@ -0,0 +1,53 @@ +@@ -0,0 +1,54 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM 
interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure @@ -11997,6 +11987,7 @@ index 0000000..e775c4c + +private: + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm); + void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -12342,10 +12333,10 @@ index 0000000..3ad0fa6 +#endif // AMDGPUMCASMINFO_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 -index 0000000..9d0d6cf +index 0000000..8721f80 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h -@@ -0,0 +1,60 @@ +@@ -0,0 +1,49 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// +// +// The LLVM Compiler Infrastructure @@ -12390,17 +12381,6 @@ index 0000000..9d0d6cf + SmallVectorImpl &Fixups) const { + return 0; + } -+ virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const { -+ return Value; -+ } -+ virtual uint64_t i32LiteralEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixups) const { -+ return 0; -+ } -+ virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixups) const { -+ return 0; -+ } +}; + +} // End namespace llvm @@ -12655,10 +12635,10 @@ index 0000000..8894a76 +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 -index 0000000..e061b18 +index 0000000..115fe8d --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp -@@ -0,0 +1,580 @@ +@@ -0,0 +1,582 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure @@ -12823,10 +12803,12 @@ index 0000000..e061b18 + case AMDGPU::VTX_READ_PARAM_8_eg: + case AMDGPU::VTX_READ_PARAM_16_eg: + case AMDGPU::VTX_READ_PARAM_32_eg: ++ case AMDGPU::VTX_READ_PARAM_128_eg: + case AMDGPU::VTX_READ_GLOBAL_8_eg: + case AMDGPU::VTX_READ_GLOBAL_32_eg: + case AMDGPU::VTX_READ_GLOBAL_128_eg: -+ case AMDGPU::TEX_VTX_CONSTBUF: { ++ case AMDGPU::TEX_VTX_CONSTBUF: ++ case AMDGPU::TEX_VTX_TEXBUF : { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + @@ -13241,10 +13223,10 @@ index 0000000..e061b18 +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 -index 0000000..c47dc99 +index 0000000..6dfbbe8 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp -@@ -0,0 +1,298 @@ +@@ -0,0 +1,235 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure @@ -13271,38 +13253,16 @@ index 0000000..c47dc99 +#include "llvm/MC/MCFixup.h" +#include "llvm/Support/raw_ostream.h" + -+#define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1)) -+#define SI_INSTR_FLAGS_ENCODING_MASK 0xf -+ -+// These must be kept in sync with SIInstructions.td and also the -+// InstrEncodingInfo array in SIInstrInfo.cpp. 
-+// -+// NOTE: This enum is only used to identify the encoding type within LLVM, -+// the actual encoding type that is part of the instruction format is different -+namespace SIInstrEncodingType { -+ enum Encoding { -+ EXP = 0, -+ LDS = 1, -+ MIMG = 2, -+ MTBUF = 3, -+ MUBUF = 4, -+ SMRD = 5, -+ SOP1 = 6, -+ SOP2 = 7, -+ SOPC = 8, -+ SOPK = 9, -+ SOPP = 10, -+ VINTRP = 11, -+ VOP1 = 12, -+ VOP2 = 13, -+ VOP3 = 14, -+ VOPC = 15 -+ }; -+} -+ +using namespace llvm; + +namespace { ++ ++/// \brief Helper type used in encoding ++typedef union { ++ int32_t I; ++ float F; ++} IntFloatUnion; ++ +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { + SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT @@ -13311,6 +13271,15 @@ index 0000000..c47dc99 + const MCSubtargetInfo &STI; + MCContext &Ctx; + ++ /// \brief Encode a sequence of registers with the correct alignment. ++ unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; ++ ++ /// \brief Can this operand also contain immediate values? ++ bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; ++ ++ /// \brief Encode an fp or int literal ++ uint32_t getLitEncoding(const MCOperand &MO) const; ++ +public: + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + const MCSubtargetInfo &sti, MCContext &ctx) @@ -13326,11 +13295,6 @@ index 0000000..c47dc99 + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups) const; + -+public: -+ -+ /// \brief Encode a sequence of registers with the correct alignment. -+ unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; -+ + /// \brief Encoding for when 2 consecutive registers are used + virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixup) const; @@ -13338,29 +13302,6 @@ index 0000000..c47dc99 + /// \brief Encoding for when 4 consectuive registers are used + virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixup) const; -+ -+ /// \brief Encoding for SMRD indexed loads -+ virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixup) const; -+ -+ /// \brief Post-Encoder method for VOP instructions -+ virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const; -+ -+private: -+ -+ /// \returns this SIInstrEncodingType for this instruction. -+ unsigned getEncodingType(const MCInst &MI) const; -+ -+ /// \brief Get then size in bytes of this instructions encoding. 
-+ unsigned getEncodingBytes(const MCInst &MI) const; -+ -+ /// \returns the hardware encoding for a register -+ unsigned getRegBinaryCode(unsigned reg) const; -+ -+ /// \brief Generated function that returns the hardware encoding for -+ /// a register -+ unsigned getHWRegNum(unsigned reg) const; -+ +}; + +} // End anonymous namespace @@ -13372,39 +13313,131 @@ index 0000000..c47dc99 + return new SIMCCodeEmitter(MCII, MRI, STI, Ctx); +} + ++bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, ++ unsigned OpNo) const { ++ ++ unsigned RegClass = Desc.OpInfo[OpNo].RegClass; ++ return (AMDGPU::SSrc_32RegClassID == RegClass) || ++ (AMDGPU::SSrc_64RegClassID == RegClass) || ++ (AMDGPU::VSrc_32RegClassID == RegClass) || ++ (AMDGPU::VSrc_64RegClassID == RegClass); ++} ++ ++uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { ++ ++ IntFloatUnion Imm; ++ if (MO.isImm()) ++ Imm.I = MO.getImm(); ++ else if (MO.isFPImm()) ++ Imm.F = MO.getFPImm(); ++ else ++ return ~0; ++ ++ if (Imm.I >= 0 && Imm.I <= 64) ++ return 128 + Imm.I; ++ ++ if (Imm.I >= -16 && Imm.I <= -1) ++ return 192 + abs(Imm.I); ++ ++ if (Imm.F == 0.5f) ++ return 240; ++ ++ if (Imm.F == -0.5f) ++ return 241; ++ ++ if (Imm.F == 1.0f) ++ return 242; ++ ++ if (Imm.F == -1.0f) ++ return 243; ++ ++ if (Imm.F == 2.0f) ++ return 244; ++ ++ if (Imm.F == -2.0f) ++ return 245; ++ ++ if (Imm.F == 4.0f) ++ return 246; ++ ++ if (Imm.F == 4.0f) ++ return 247; ++ ++ return 255; ++} ++ +void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups) const { ++ + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups); -+ unsigned bytes = getEncodingBytes(MI); ++ const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); ++ unsigned bytes = Desc.getSize(); ++ + for (unsigned i = 0; i < bytes; i++) { + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + } ++ ++ if (bytes > 4) ++ return; ++ ++ // Check for additional literals in SRC0/1/2 (Op 1/2/3) ++ for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { ++ ++ // Check if this operand should be encoded as [SV]Src ++ if (!isSrcOperand(Desc, i)) ++ continue; ++ ++ // Is this operand a literal immediate? ++ const MCOperand &Op = MI.getOperand(i); ++ if (getLitEncoding(Op) != 255) ++ continue; ++ ++ // Yes! 
Encode it ++ IntFloatUnion Imm; ++ if (Op.isImm()) ++ Imm.I = Op.getImm(); ++ else ++ Imm.F = Op.getFPImm(); ++ ++ for (unsigned j = 0; j < 4; j++) { ++ OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); ++ } ++ ++ // Only one literal value allowed ++ break; ++ } +} + +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl &Fixups) const { -+ if (MO.isReg()) { -+ return getRegBinaryCode(MO.getReg()); -+ } else if (MO.isImm()) { -+ return MO.getImm(); -+ } else if (MO.isFPImm()) { -+ // XXX: Not all instructions can use inline literals -+ // XXX: We should make sure this is a 32-bit constant -+ union { -+ float F; -+ uint32_t I; -+ } Imm; -+ Imm.F = MO.getFPImm(); -+ return Imm.I; -+ } else if (MO.isExpr()) { ++ if (MO.isReg()) ++ return MRI.getEncodingValue(MO.getReg()); ++ ++ if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(FK_PCRel_4); + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); + return 0; -+ } else{ -+ llvm_unreachable("Encoding of this operand type is not supported yet."); + } ++ ++ // Figure out the operand number, needed for isSrcOperand check ++ unsigned OpNo = 0; ++ for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { ++ if (&MO == &MI.getOperand(OpNo)) ++ break; ++ } ++ ++ const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); ++ if (isSrcOperand(Desc, OpNo)) { ++ uint32_t Enc = getLitEncoding(MO); ++ if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) ++ return Enc; ++ ++ } else if (MO.isImm()) ++ return MO.getImm(); ++ ++ llvm_unreachable("Encoding of this operand type is not supported yet."); + return 0; +} + @@ -13414,10 +13447,10 @@ index 0000000..c47dc99 + +unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo, + unsigned shift) const { -+ unsigned regCode = getRegBinaryCode(MI.getOperand(OpNo).getReg()); -+ return regCode >> shift; -+ return 0; ++ unsigned regCode = MRI.getEncodingValue(MI.getOperand(OpNo).getReg()); ++ return (regCode & 0xff) >> shift; +} ++ +unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI, + unsigned OpNo , + SmallVectorImpl &Fixup) const { @@ -13429,120 +13462,6 @@ index 0000000..c47dc99 + SmallVectorImpl &Fixup) const { + return GPRAlign(MI, OpNo, 2); +} -+ -+#define SMRD_OFFSET_MASK 0xff -+#define SMRD_IMM_SHIFT 8 -+#define SMRD_SBASE_MASK 0x3f -+#define SMRD_SBASE_SHIFT 9 -+/// This function is responsibe for encoding the offset -+/// and the base ptr for SMRD instructions it should return a bit string in -+/// this format: -+/// -+/// OFFSET = bits{7-0} -+/// IMM = bits{8} -+/// SBASE = bits{14-9} -+/// -+uint32_t SIMCCodeEmitter::SMRDmemriEncode(const MCInst &MI, unsigned OpNo, -+ SmallVectorImpl &Fixup) const { -+ uint32_t Encoding; -+ -+ const MCOperand &OffsetOp = MI.getOperand(OpNo + 1); -+ -+ //XXX: Use this function for SMRD loads with register offsets -+ assert(OffsetOp.isImm()); -+ -+ Encoding = -+ (getMachineOpValue(MI, OffsetOp, Fixup) & SMRD_OFFSET_MASK) -+ | (1 << SMRD_IMM_SHIFT) //XXX If the Offset is a register we shouldn't set this bit -+ | ((GPR2AlignEncode(MI, OpNo, Fixup) & SMRD_SBASE_MASK) << SMRD_SBASE_SHIFT) -+ ; -+ -+ return Encoding; -+} -+ -+//===----------------------------------------------------------------------===// -+// Post Encoder Callbacks -+//===----------------------------------------------------------------------===// -+ -+uint64_t SIMCCodeEmitter::VOPPostEncode(const MCInst &MI, uint64_t Value) const{ -+ unsigned encodingType = getEncodingType(MI); -+ unsigned numSrcOps; -+ 
unsigned vgprBitOffset; -+ -+ if (encodingType == SIInstrEncodingType::VOP3) { -+ numSrcOps = 3; -+ vgprBitOffset = 32; -+ } else { -+ numSrcOps = 1; -+ vgprBitOffset = 0; -+ } -+ -+ // Add one to skip over the destination reg operand. -+ for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) { -+ const MCOperand &MO = MI.getOperand(opIdx); -+ if (MO.isReg()) { -+ unsigned reg = MI.getOperand(opIdx).getReg(); -+ if (AMDGPUMCRegisterClasses[AMDGPU::VReg_32RegClassID].contains(reg) || -+ AMDGPUMCRegisterClasses[AMDGPU::VReg_64RegClassID].contains(reg)) { -+ Value |= (VGPR_BIT(opIdx)) << vgprBitOffset; -+ } -+ } else if (MO.isFPImm()) { -+ union { -+ float f; -+ uint32_t i; -+ } Imm; -+ // XXX: Not all instructions can use inline literals -+ // XXX: We should make sure this is a 32-bit constant -+ Imm.f = MO.getFPImm(); -+ Value |= ((uint64_t)Imm.i) << 32; -+ } -+ } -+ return Value; -+} -+ -+//===----------------------------------------------------------------------===// -+// Encoding helper functions -+//===----------------------------------------------------------------------===// -+ -+unsigned SIMCCodeEmitter::getEncodingType(const MCInst &MI) const { -+ return MCII.get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK; -+} -+ -+unsigned SIMCCodeEmitter::getEncodingBytes(const MCInst &MI) const { -+ -+ // These instructions aren't real instructions with an encoding type, so -+ // we need to manually specify their size. -+ switch (MI.getOpcode()) { -+ default: break; -+ case AMDGPU::SI_LOAD_LITERAL_I32: -+ case AMDGPU::SI_LOAD_LITERAL_F32: -+ return 4; -+ } -+ -+ unsigned encoding_type = getEncodingType(MI); -+ switch (encoding_type) { -+ case SIInstrEncodingType::EXP: -+ case SIInstrEncodingType::LDS: -+ case SIInstrEncodingType::MUBUF: -+ case SIInstrEncodingType::MTBUF: -+ case SIInstrEncodingType::MIMG: -+ case SIInstrEncodingType::VOP3: -+ return 8; -+ default: -+ return 4; -+ } -+} -+ -+ -+unsigned SIMCCodeEmitter::getRegBinaryCode(unsigned reg) const { -+ switch (reg) { -+ case AMDGPU::M0: return 124; -+ case AMDGPU::SREG_LIT_0: return 128; -+ case AMDGPU::SI_LITERAL_CONSTANT: return 255; -+ default: return MRI.getEncodingValue(reg); -+ } -+} -+ diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile new file mode 100644 index 0000000..1b3ebbe @@ -13574,10 +13493,10 @@ index 0000000..1b3ebbe +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td new file mode 100644 -index 0000000..3dc1ecd +index 0000000..868810c --- /dev/null +++ b/lib/Target/R600/Processors.td -@@ -0,0 +1,29 @@ +@@ -0,0 +1,30 @@ +//===-- Processors.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure @@ -13593,6 +13512,7 @@ index 0000000..3dc1ecd + +class Proc Features> +: Processor; ++def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>; +def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>; +def : Proc<"rv710", R600_EG_Itin, []>; +def : Proc<"rv730", R600_EG_Itin, []>; @@ -14008,7 +13928,7 @@ index 0000000..c00c349 +} diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp new file mode 100644 -index 0000000..a479cee +index 0000000..9c38522 --- /dev/null +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -0,0 +1,1195 @@ @@ -15094,7 +15014,7 @@ index 0000000..a479cee + AMDGPUAS::PARAM_I_ADDRESS); + SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), + DAG.getConstant(ParamOffsetBytes, MVT::i32), -+ MachinePointerInfo(new Argument(PtrTy)), ++ 
MachinePointerInfo(UndefValue::get(PtrTy)), + ArgVT, false, false, ArgBytes); + InVals.push_back(Arg); + ParamOffsetBytes += ArgBytes; @@ -16282,10 +16202,10 @@ index 0000000..278fad1 +#endif // R600INSTRINFO_H_ diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td new file mode 100644 -index 0000000..d307ed2 +index 0000000..409da07 --- /dev/null +++ b/lib/Target/R600/R600Instructions.td -@@ -0,0 +1,1917 @@ +@@ -0,0 +1,1976 @@ +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure @@ -16687,7 +16607,7 @@ index 0000000..d307ed2 +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); -+ return (TType >= 6 && TType <= 8) || TType == 13; ++ return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13); + }] +>; + @@ -17779,6 +17699,10 @@ index 0000000..d307ed2 + [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))] +>; + ++def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, ++ [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))] ++>; ++ +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// @@ -17874,6 +17798,7 @@ index 0000000..d307ed2 + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), + "", [], NullALU> { + let FlagOperandIdx = 3; ++ let isTerminator = 1; +} + +let isTerminator = 1, isBranch = 1, isBarrier = 1 in { @@ -18001,6 +17926,60 @@ index 0000000..d307ed2 +// Inst{127-96} = 0; +} + ++def TEX_VTX_TEXBUF: ++ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", ++ [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, ++VTX_WORD1_GPR, VTX_WORD0 { ++ ++let VC_INST = 0; ++let FETCH_TYPE = 2; ++let FETCH_WHOLE_QUAD = 0; ++let SRC_REL = 0; ++let SRC_SEL_X = 0; ++let DST_REL = 0; ++let USE_CONST_FIELDS = 1; ++let NUM_FORMAT_ALL = 0; ++let FORMAT_COMP_ALL = 0; ++let SRF_MODE_ALL = 1; ++let MEGA_FETCH_COUNT = 16; ++let DST_SEL_X = 0; ++let DST_SEL_Y = 1; ++let DST_SEL_Z = 2; ++let DST_SEL_W = 3; ++let DATA_FORMAT = 0; ++ ++let Inst{31-0} = Word0; ++let Inst{63-32} = Word1; ++ ++// LLVM can only encode 64-bit instructions, so these fields are manually ++// encoded in R600CodeEmitter ++// ++// bits<16> OFFSET; ++// bits<2> ENDIAN_SWAP = 0; ++// bits<1> CONST_BUF_NO_STRIDE = 0; ++// bits<1> MEGA_FETCH = 0; ++// bits<1> ALT_CONST = 0; ++// bits<2> BUFFER_INDEX_MODE = 0; ++ ++ ++ ++// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding ++// is done in R600CodeEmitter ++// ++// Inst{79-64} = OFFSET; ++// Inst{81-80} = ENDIAN_SWAP; ++// Inst{82} = CONST_BUF_NO_STRIDE; ++// Inst{83} = MEGA_FETCH; ++// Inst{84} = ALT_CONST; ++// Inst{86-85} = BUFFER_INDEX_MODE; ++// Inst{95-86} = 0; Reserved ++ ++// VTX_WORD3 (Padding) ++// ++// Inst{127-96} = 0; ++} ++ ++ + +//===--------------------------------------------------------------------===// +// Instructions support @@ -18205,10 +18184,10 @@ index 0000000..d307ed2 +} // End isR600toCayman Predicate diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td new file mode 100644 -index 0000000..284d4d8 +index 0000000..6046f0d --- /dev/null +++ b/lib/Target/R600/R600Intrinsics.td -@@ -0,0 +1,32 @@ +@@ -0,0 +1,57 @@ +//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// +// +// The LLVM Compiler 
Infrastructure @@ -18227,6 +18206,8 @@ index 0000000..284d4d8 + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_R600_load_texbuf : ++ Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + @@ -18241,9 +18222,32 @@ index 0000000..284d4d8 + def int_R600_store_dummy : + Intrinsic<[], [llvm_i32_ty], []>; +} ++let TargetPrefix = "r600", isTarget = 1 in { ++ ++class R600ReadPreloadRegisterIntrinsic ++ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, ++ GCCBuiltin; ++ ++multiclass R600ReadPreloadRegisterIntrinsic_xyz { ++ def _x : R600ReadPreloadRegisterIntrinsic; ++ def _y : R600ReadPreloadRegisterIntrinsic; ++ def _z : R600ReadPreloadRegisterIntrinsic; ++} ++ ++defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_global_size">; ++defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_local_size">; ++defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_ngroups">; ++defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_tgid">; ++defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_tidig">; ++} diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp new file mode 100644 -index 0000000..2557e8f +index 0000000..c8c27a8 --- /dev/null +++ b/lib/Target/R600/R600LowerConstCopy.cpp @@ -0,0 +1,222 @@ @@ -18429,7 +18433,7 @@ index 0000000..2557e8f + int ConstMovSel = + TII->getOperandIdx(CstMov->getOpcode(), R600Operands::SRC0_SEL); + unsigned ConstIndex = CstMov->getOperand(ConstMovSel).getImm(); -+ if (canFoldInBundle(CP, ConstIndex)) { ++ if (MI->isInsideBundle() && canFoldInBundle(CP, ConstIndex)) { + TII->setImmOperand(MI, OpTable[SrcOp][1], ConstIndex); + MI->getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST); + } else { @@ -18534,10 +18538,10 @@ index 0000000..41e4894 +#endif //R600MACHINEFUNCTIONINFO_H diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp new file mode 100644 -index 0000000..33e858d +index 0000000..bbd7995 --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.cpp -@@ -0,0 +1,93 @@ +@@ -0,0 +1,99 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure @@ -18589,6 +18593,12 @@ index 0000000..33e858d + Reserved.set(*I); + } + ++ for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(), ++ E = AMDGPU::TRegMemRegClass.end(); ++ I != E; ++I) { ++ Reserved.set(*I); ++ } ++ + const R600InstrInfo *RII = static_cast(&TII); + std::vector IndirectRegs = RII->getIndirectReservedRegs(MF); + for (std::vector::iterator I = IndirectRegs.begin(), @@ -18694,7 +18704,7 @@ index 0000000..c170ccb +#endif // AMDIDSAREGISTERINFO_H_ diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td new file mode 100644 -index 0000000..3812eb7 +index 0000000..a7d847a --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.td @@ -0,0 +1,146 @@ @@ -18781,7 +18791,7 @@ index 0000000..3812eb7 +} // End isAllocatable = 0 + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, -+ (add (sequence "T%u_X", 0, 127))>; ++ (add (sequence "T%u_X", 0, 127), AR_X)>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add 
(sequence "T%u_Y", 0, 127))>; @@ -19382,10 +19392,10 @@ index 0000000..832e44d +} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp new file mode 100644 -index 0000000..202584b +index 0000000..694c045 --- /dev/null +++ b/lib/Target/R600/SIISelLowering.cpp -@@ -0,0 +1,489 @@ +@@ -0,0 +1,399 @@ +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// +// +// The LLVM Compiler Infrastructure @@ -19419,8 +19429,7 @@ index 0000000..202584b + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); -+ addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass); -+ addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass); ++ addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); @@ -19430,8 +19439,6 @@ index 0000000..202584b + + computeRegisterProperties(); + -+ setOperationAction(ISD::AND, MVT::i1, Custom); -+ + setOperationAction(ISD::ADD, MVT::i64, Legal); + setOperationAction(ISD::ADD, MVT::i32, Legal); + @@ -19462,13 +19469,11 @@ index 0000000..202584b + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::BRANCH: return BB; + case AMDGPU::CLAMP_SI: -+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) -+ // VSRC1-2 are unused, but we still need to fill all the -+ // operand slots, so we just reuse the VSRC0 operand -+ .addOperand(MI->getOperand(1)) -+ .addOperand(MI->getOperand(1)) ++ .addImm(0x80) // SRC1 ++ .addImm(0x80) // SRC2 + .addImm(0) // ABS + .addImm(1) // CLAMP + .addImm(0) // OMOD @@ -19477,13 +19482,11 @@ index 0000000..202584b + break; + + case AMDGPU::FABS_SI: -+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) -+ // VSRC1-2 are unused, but we still need to fill all the -+ // operand slots, so we just reuse the VSRC0 operand -+ .addOperand(MI->getOperand(1)) -+ .addOperand(MI->getOperand(1)) ++ .addImm(0x80) // SRC1 ++ .addImm(0x80) // SRC2 + .addImm(1) // ABS + .addImm(0) // CLAMP + .addImm(0) // OMOD @@ -19492,13 +19495,11 @@ index 0000000..202584b + break; + + case AMDGPU::FNEG_SI: -+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_ADD_F32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) -+ // VSRC1-2 are unused, but we still need to fill all the -+ // operand slots, so we just reuse the VSRC0 operand -+ .addOperand(MI->getOperand(1)) -+ .addOperand(MI->getOperand(1)) ++ .addImm(0x80) // SRC1 ++ .addImm(0x80) // SRC2 + .addImm(0) // ABS + .addImm(0) // CLAMP + .addImm(0) // OMOD @@ -19514,15 +19515,9 @@ index 0000000..202584b + case AMDGPU::SI_INTERP: + LowerSI_INTERP(MI, *BB, I, MRI); + break; -+ case AMDGPU::SI_INTERP_CONST: -+ LowerSI_INTERP_CONST(MI, *BB, I, MRI); -+ break; + case AMDGPU::SI_WQM: + LowerSI_WQM(MI, *BB, I, MRI); + break; -+ case AMDGPU::SI_V_CNDLT: -+ LowerSI_V_CNDLT(MI, *BB, I, MRI); -+ break; + } + return BB; +} @@ -19566,46 +19561,6 @@ index 0000000..202584b + MI->eraseFromParent(); +} + -+void 
SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI, -+ MachineBasicBlock &BB, MachineBasicBlock::iterator I, -+ MachineRegisterInfo &MRI) const { -+ MachineOperand dst = MI->getOperand(0); -+ MachineOperand attr_chan = MI->getOperand(1); -+ MachineOperand attr = MI->getOperand(2); -+ MachineOperand params = MI->getOperand(3); -+ unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) -+ .addOperand(params); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32)) -+ .addOperand(dst) -+ .addOperand(attr_chan) -+ .addOperand(attr) -+ .addReg(M0); -+ -+ MI->eraseFromParent(); -+} -+ -+void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, -+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { -+ unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), -+ TII->get(AMDGPU::V_CMP_GT_F32_e32), -+ VCC) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addOperand(MI->getOperand(1)); -+ -+ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32)) -+ .addOperand(MI->getOperand(0)) -+ .addOperand(MI->getOperand(3)) -+ .addOperand(MI->getOperand(2)) -+ .addReg(VCC); -+ -+ MI->eraseFromParent(); -+} -+ +EVT SITargetLowering::getSetCCResultType(EVT VT) const { + return MVT::i1; +} @@ -19620,7 +19575,6 @@ index 0000000..202584b + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); -+ case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND); + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast(Op.getOperand(0))->getZExtValue(); @@ -19637,30 +19591,6 @@ index 0000000..202584b + return SDValue(); +} + -+/// \brief The function is for lowering i1 operations on the -+/// VCC register. -+/// -+/// In the VALU context, VCC is a one bit register, but in the -+/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only -+/// the SALU can perform operations on the VCC register, we need to promote -+/// the operand types from i1 to i64 in order for tablegen to be able to match -+/// this operation to the correct SALU instruction. We do this promotion by -+/// wrapping the operands in a CopyToReg node. 
-+/// -+SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op, -+ SelectionDAG &DAG, -+ unsigned VCCNode) const { -+ DebugLoc DL = Op.getDebugLoc(); -+ -+ SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64, -+ DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, -+ Op.getOperand(0)), -+ DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, -+ Op.getOperand(1))); -+ -+ return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode); -+} -+ +/// \brief Helper function for LowerBRCOND +static SDNode *findUser(SDValue Value, unsigned Opcode) { + @@ -19865,22 +19795,12 @@ index 0000000..202584b + } + return SDValue(); +} -+ -+#define NODE_NAME_CASE(node) case SIISD::node: return #node; -+ -+const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const { -+ switch (Opcode) { -+ default: return AMDGPUTargetLowering::getTargetNodeName(Opcode); -+ NODE_NAME_CASE(VCC_AND) -+ NODE_NAME_CASE(VCC_BITCAST) -+ } -+} diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h new file mode 100644 -index 0000000..8528c24 +index 0000000..5d048f8 --- /dev/null +++ b/lib/Target/R600/SIISelLowering.h -@@ -0,0 +1,55 @@ +@@ -0,0 +1,48 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure @@ -19910,15 +19830,9 @@ index 0000000..8528c24 + MachineBasicBlock::iterator I, unsigned Opocde) const; + void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; -+ void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB, -+ MachineBasicBlock::iterator I, MachineRegisterInfo &MRI) const; + void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; -+ void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, -+ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; + -+ SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG, -+ unsigned VCCNode) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -19930,7 +19844,6 @@ index 0000000..8528c24 + virtual EVT getSetCCResultType(EVT VT) const; + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; -+ virtual const char* getTargetNodeName(unsigned Opcode) const; +}; + +} // End namespace llvm @@ -20297,10 +20210,10 @@ index 0000000..24fc929 +} diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td new file mode 100644 -index 0000000..aea3b5a +index 0000000..40e37aa --- /dev/null +++ b/lib/Target/R600/SIInstrFormats.td -@@ -0,0 +1,146 @@ +@@ -0,0 +1,188 @@ +//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===// +// +// The LLVM Compiler Infrastructure @@ -20324,40 +20237,23 @@ index 0000000..aea3b5a +// +//===----------------------------------------------------------------------===// + -+class VOP3b_2IN op, string opName, RegisterClass dstClass, -+ RegisterClass src0Class, RegisterClass src1Class, -+ list pattern> -+ : VOP3b ; -+ -+ -+class VOP3_1_32 op, string opName, list pattern> -+ : VOP3b_2IN ; -+ +class VOP3_32 op, string opName, list pattern> -+ : VOP3 ; ++ : VOP3 ; + +class VOP3_64 op, string opName, list pattern> -+ : VOP3 ; -+ ++ : VOP3 ; + +class SOP1_32 op, string opName, list pattern> -+ : SOP1 ; ++ : SOP1 ; + +class SOP1_64 op, string opName, 
list pattern> -+ : SOP1 ; ++ : SOP1 ; + +class SOP2_32 op, string opName, list pattern> -+ : SOP2 ; ++ : SOP2 ; + +class SOP2_64 op, string opName, list pattern> -+ : SOP2 ; -+ -+class SOP2_VCC op, string opName, list pattern> -+ : SOP2 ; ++ : SOP2 ; + +class VOP1_Helper op, RegisterClass vrc, RegisterClass arc, + string opName, list pattern> : @@ -20366,7 +20262,7 @@ index 0000000..aea3b5a + >; + +multiclass VOP1_32 op, string opName, list pattern> { -+ def _e32: VOP1_Helper ; ++ def _e32: VOP1_Helper ; + def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; @@ -20374,7 +20270,7 @@ index 0000000..aea3b5a + +multiclass VOP1_64 op, string opName, list pattern> { + -+ def _e32 : VOP1_Helper ; ++ def _e32 : VOP1_Helper ; + + def _e64 : VOP3_64 < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -20390,7 +20286,7 @@ index 0000000..aea3b5a + +multiclass VOP2_32 op, string opName, list pattern> { + -+ def _e32 : VOP2_Helper ; ++ def _e32 : VOP2_Helper ; + + def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] @@ -20398,7 +20294,7 @@ index 0000000..aea3b5a +} + +multiclass VOP2_64 op, string opName, list pattern> { -+ def _e32: VOP2_Helper ; ++ def _e32: VOP2_Helper ; + + def _e64 : VOP3_64 < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -20412,47 +20308,106 @@ index 0000000..aea3b5a +class SOPK_64 op, string opName, list pattern> + : SOPK ; + -+class VOPC_Helper op, RegisterClass vrc, RegisterClass arc, -+ string opName, list pattern> : -+ VOPC < -+ op, (ins arc:$src0, vrc:$src1), opName, pattern -+ >; ++multiclass VOPC_Helper op, RegisterClass vrc, RegisterClass arc, ++ string opName, list pattern> { + -+multiclass VOPC_32 op, string opName, list pattern> { -+ -+ def _e32 : VOPC_Helper < -+ {op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, -+ VReg_32, AllReg_32, opName, pattern -+ >; -+ -+ def _e64 : VOP3_1_32 < -+ op, -+ opName, pattern -+ >; -+} -+ -+multiclass VOPC_64 op, string opName, list pattern> { -+ -+ def _e32 : VOPC_Helper ; -+ -+ def _e64 : VOP3_64 < ++ def _e32 : VOPC ; ++ def _e64 : VOP3 < + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, -+ opName, [] -+ >; ++ (outs SReg_64:$dst), ++ (ins arc:$src0, vrc:$src1, ++ InstFlag:$abs, InstFlag:$clamp, ++ InstFlag:$omod, InstFlag:$neg), ++ opName, pattern ++ > { ++ let SRC2 = 0x80; ++ } +} + ++multiclass VOPC_32 op, string opName, list pattern> ++ : VOPC_Helper ; ++ ++multiclass VOPC_64 op, string opName, list pattern> ++ : VOPC_Helper ; ++ +class SOPC_32 op, string opName, list pattern> -+ : SOPC ; ++ : SOPC ; + +class SOPC_64 op, string opName, list pattern> -+ : SOPC ; ++ : SOPC ; ++ ++class MIMG_Load_Helper op, string asm> : MIMG < ++ op, ++ (outs VReg_128:$vdata), ++ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, ++ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, ++ GPR4Align:$srsrc, GPR4Align:$ssamp), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBUF < ++ op, ++ (outs), ++ (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, ++ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, ++ GPR4Align:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), ++ asm, ++ []> { ++ let mayStore = 1; ++ let mayLoad = 0; ++} ++ ++class MUBUF_Load_Helper op, string asm, RegisterClass regClass> : MUBUF < ++ op, ++ (outs regClass:$dst), ++ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, 
i1imm:$glc, i1imm:$addr64, ++ i1imm:$lds, VReg_32:$vaddr, GPR4Align:$srsrc, i1imm:$slc, ++ i1imm:$tfe, SSrc_32:$soffset), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < ++ op, ++ (outs regClass:$dst), ++ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, ++ i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align:$srsrc, ++ i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++multiclass SMRD_Helper op, string asm, RegisterClass dstClass> { ++ def _IMM : SMRD < ++ op, 1, ++ (outs dstClass:$dst), ++ (ins GPR2Align:$sbase, i32imm:$offset), ++ asm, ++ [] ++ >; ++ ++ def _SGPR : SMRD < ++ op, 0, ++ (outs dstClass:$dst), ++ (ins GPR2Align:$sbase, SReg_32:$soff), ++ asm, ++ [] ++ >; ++} + diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp new file mode 100644 -index 0000000..011ec50 +index 0000000..1c4b3cf --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.cpp -@@ -0,0 +1,137 @@ +@@ -0,0 +1,143 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -20497,7 +20452,15 @@ index 0000000..011ec50 + // never be necessary. + assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); + -+ if (AMDGPU::SReg_64RegClass.contains(DestReg)) { ++ if (AMDGPU::VReg_64RegClass.contains(DestReg)) { ++ assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || ++ AMDGPU::SReg_64RegClass.contains(SrcReg)); ++ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub0)) ++ .addReg(RI.getSubReg(SrcReg, AMDGPU::sub0), getKillRegState(KillSrc)) ++ .addReg(DestReg, RegState::Define | RegState::Implicit); ++ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub1)) ++ .addReg(RI.getSubReg(SrcReg, AMDGPU::sub1), getKillRegState(KillSrc)); ++ } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); @@ -20516,9 +20479,10 @@ index 0000000..011ec50 + +MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const { -+ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_IMM_I32), DebugLoc()); -+ MachineInstrBuilder(MI).addReg(DstReg, RegState::Define); -+ MachineInstrBuilder(MI).addImm(Imm); ++ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc()); ++ MachineInstrBuilder MIB(MI); ++ MIB.addReg(DstReg, RegState::Define); ++ MIB.addImm(Imm); + + return MI; + @@ -20531,9 +20495,6 @@ index 0000000..011ec50 + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: -+ case AMDGPU::V_MOV_IMM_F32: -+ case AMDGPU::V_MOV_IMM_I32: -+ case AMDGPU::S_MOV_IMM_I32: + return true; + } +} @@ -20592,10 +20553,10 @@ index 0000000..011ec50 +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h new file mode 100644 -index 0000000..e4de4b8 +index 0000000..a65f7b6 --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.h -@@ -0,0 +1,90 @@ +@@ -0,0 +1,84 @@ +//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -20633,12 +20594,6 @@ index 0000000..e4de4b8 + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + -+ /// \returns the encoding type of this instruction. 
-+ unsigned getEncodingType(const MachineInstr &MI) const; -+ -+ /// \returns the size of this instructions encoding in number of bytes. -+ unsigned getEncodingBytes(const MachineInstr &MI) const; -+ + virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const; + @@ -20679,19 +20634,19 @@ index 0000000..e4de4b8 +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding -+ VM_CNT = 1 << 4, -+ EXP_CNT = 1 << 5, -+ LGKM_CNT = 1 << 6 ++ VM_CNT = 1 << 0, ++ EXP_CNT = 1 << 1, ++ LGKM_CNT = 1 << 2 + }; +} + +#endif //SIINSTRINFO_H diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td new file mode 100644 -index 0000000..9d9f5f6 +index 0000000..8c4e5af --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.td -@@ -0,0 +1,591 @@ +@@ -0,0 +1,465 @@ +//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure @@ -20702,60 +20657,66 @@ index 0000000..9d9f5f6 +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// -+// SI DAG Profiles -+//===----------------------------------------------------------------------===// -+def SDTVCCBinaryOp : SDTypeProfile<1, 2, [ -+ SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2> -+]>; -+ -+//===----------------------------------------------------------------------===// +// SI DAG Nodes +//===----------------------------------------------------------------------===// + -+// and operation on 64-bit wide vcc -+def SIsreg1_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, -+ [SDNPCommutative, SDNPAssociative] ++// SMRD takes a 64bit memory address and can only add an 32bit offset ++def SIadd64bit32bit : SDNode<"ISD::ADD", ++ SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]> +>; + -+// Special bitcast node for sharing VCC register between VALU and SALU -+def SIsreg1_bitcast : SDNode<"SIISD::VCC_BITCAST", -+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> ++// Transformation function, extract the lower 32bit of a 64bit immediate ++def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32); ++}]>; ++ ++// Transformation function, extract the upper 32bit of a 64bit immediate ++def HI32 : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 32, MVT::i32); ++}]>; ++ ++def IMM8bitDWORD : ImmLeaf < ++ i32, [{ ++ return (Imm & ~0x3FC) == 0; ++ }], SDNodeXFormgetTargetConstant( ++ N->getZExtValue() >> 2, MVT::i32); ++ }]> +>; + -+// and operation on 64-bit wide vcc -+def SIvcc_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, -+ [SDNPCommutative, SDNPAssociative] ++def IMM12bit : ImmLeaf < ++ i16, ++ [{return isUInt<12>(Imm);}] +>; + -+// Special bitcast node for sharing VCC register between VALU and SALU -+def SIvcc_bitcast : SDNode<"SIISD::VCC_BITCAST", -+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> -+>; ++class InlineImm : ImmLeaf ; + +class InstSI pattern> : + AMDGPUInst { + -+ field bits<4> EncodingType = 0; + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; + -+ let TSFlags{3-0} = EncodingType; -+ let TSFlags{4} = VM_CNT; -+ let TSFlags{5} = EXP_CNT; -+ let TSFlags{6} = LGKM_CNT; ++ let TSFlags{0} = VM_CNT; ++ let TSFlags{1} = EXP_CNT; ++ let TSFlags{2} = LGKM_CNT; +} + +class Enc32 pattern> : + InstSI { + + field bits<32> Inst; ++ let Size = 4; +} + +class Enc64 pattern> : + InstSI { + + field bits<64> Inst; ++ let Size = 8; +} + +class 
SIOperand : Operand { @@ -20763,49 +20724,16 @@ index 0000000..9d9f5f6 + let MIOperandInfo = opInfo; +} + -+def IMM16bit : ImmLeaf < -+ i16, -+ [{return isInt<16>(Imm);}] -+>; -+ -+def IMM8bit : ImmLeaf < -+ i32, -+ [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}] -+>; -+ -+def IMM12bit : ImmLeaf < -+ i16, -+ [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}] -+>; -+ -+def IMM32bitIn64bit : ImmLeaf < -+ i64, -+ [{return isInt<32>(Imm);}] -+>; -+ +class GPR4Align : Operand { + let EncoderMethod = "GPR4AlignEncode"; + let MIOperandInfo = (ops rc:$reg); +} + -+class GPR2Align : Operand { ++class GPR2Align : Operand { + let EncoderMethod = "GPR2AlignEncode"; + let MIOperandInfo = (ops rc:$reg); +} + -+def SMRDmemrr : Operand { -+ let MIOperandInfo = (ops SReg_64, SReg_32); -+ let EncoderMethod = "GPR2AlignEncode"; -+} -+ -+def SMRDmemri : Operand { -+ let MIOperandInfo = (ops SReg_64, i32imm); -+ let EncoderMethod = "SMRDmemriEncode"; -+} -+ -+def ADDR_Reg : ComplexPattern; -+def ADDR_Offset8 : ComplexPattern; -+ +let Uses = [EXEC] in { + +def EXP : Enc64< @@ -20835,7 +20763,6 @@ index 0000000..9d9f5f6 + let Inst{47-40} = VSRC1; + let Inst{55-48} = VSRC2; + let Inst{63-56} = VSRC3; -+ let EncodingType = 0; //SIInstrEncodingType::EXP + + let EXP_CNT = 1; +} @@ -20870,7 +20797,6 @@ index 0000000..9d9f5f6 + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{57-53} = SSAMP; -+ let EncodingType = 2; //SIInstrEncodingType::MIMG + + let VM_CNT = 1; + let EXP_CNT = 1; @@ -20908,7 +20834,6 @@ index 0000000..9d9f5f6 + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; -+ let EncodingType = 3; //SIInstrEncodingType::MTBUF + + let VM_CNT = 1; + let EXP_CNT = 1; @@ -20946,7 +20871,6 @@ index 0000000..9d9f5f6 + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; -+ let EncodingType = 4; //SIInstrEncodingType::MUBUF + + let VM_CNT = 1; + let EXP_CNT = 1; @@ -20956,22 +20880,19 @@ index 0000000..9d9f5f6 + +} // End Uses = [EXEC] + -+class SMRD op, dag outs, dag ins, string asm, list pattern> : -+ Enc32 { ++class SMRD op, bits<1> imm, dag outs, dag ins, string asm, ++ list pattern> : Enc32 { + + bits<7> SDST; -+ bits<15> PTR; -+ bits<8> OFFSET = PTR{7-0}; -+ bits<1> IMM = PTR{8}; -+ bits<6> SBASE = PTR{14-9}; ++ bits<6> SBASE; ++ bits<8> OFFSET; + + let Inst{7-0} = OFFSET; -+ let Inst{8} = IMM; ++ let Inst{8} = imm; + let Inst{14-9} = SBASE; + let Inst{21-15} = SDST; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding -+ let EncodingType = 5; //SIInstrEncodingType::SMRD + + let LGKM_CNT = 1; +} @@ -20986,7 +20907,6 @@ index 0000000..9d9f5f6 + let Inst{15-8} = op; + let Inst{22-16} = SDST; + let Inst{31-23} = 0x17d; //encoding; -+ let EncodingType = 6; //SIInstrEncodingType::SOP1 + + let mayLoad = 0; + let mayStore = 0; @@ -21005,7 +20925,6 @@ index 0000000..9d9f5f6 + let Inst{22-16} = SDST; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding -+ let EncodingType = 7; // SIInstrEncodingType::SOP2 + + let mayLoad = 0; + let mayStore = 0; @@ -21022,7 +20941,6 @@ index 0000000..9d9f5f6 + let Inst{15-8} = SSRC1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; -+ let EncodingType = 8; // SIInstrEncodingType::SOPC + + let DisableEncoding = "$dst"; + let mayLoad = 0; @@ -21040,7 +20958,6 @@ index 0000000..9d9f5f6 + let Inst{22-16} = SDST; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding -+ let EncodingType = 9; // SIInstrEncodingType::SOPK + + let mayLoad = 0; + let mayStore = 0; @@ -21058,7 +20975,6 @@ index 0000000..9d9f5f6 + 
let Inst{15-0} = SIMM16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding -+ let EncodingType = 10; // SIInstrEncodingType::SOPP + + let mayLoad = 0; + let mayStore = 0; @@ -21081,7 +20997,6 @@ index 0000000..9d9f5f6 + let Inst{17-16} = op; + let Inst{25-18} = VDST; + let Inst{31-26} = 0x32; // encoding -+ let EncodingType = 11; // SIInstrEncodingType::VINTRP + + let neverHasSideEffects = 1; + let mayLoad = 1; @@ -21099,9 +21014,6 @@ index 0000000..9d9f5f6 + let Inst{24-17} = VDST; + let Inst{31-25} = 0x3f; //encoding + -+ let EncodingType = 12; // SIInstrEncodingType::VOP1 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21120,9 +21032,6 @@ index 0000000..9d9f5f6 + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding + -+ let EncodingType = 13; // SIInstrEncodingType::VOP2 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21151,9 +21060,6 @@ index 0000000..9d9f5f6 + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + -+ let EncodingType = 14; // SIInstrEncodingType::VOP3 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21180,9 +21086,6 @@ index 0000000..9d9f5f6 + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + -+ let EncodingType = 14; // SIInstrEncodingType::VOP3 -+ let PostEncoderMethod = "VOPPostEncode"; -+ + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; @@ -21199,8 +21102,6 @@ index 0000000..9d9f5f6 + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; + -+ let EncodingType = 15; //SIInstrEncodingType::VOPC -+ let PostEncoderMethod = "VOPPostEncode"; + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; @@ -21209,86 +21110,14 @@ index 0000000..9d9f5f6 + +} // End Uses = [EXEC] + -+class MIMG_Load_Helper op, string asm> : MIMG < -+ op, -+ (outs VReg_128:$vdata), -+ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, -+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, -+ GPR4Align:$srsrc, GPR4Align:$ssamp), -+ asm, -+ []> { -+ let mayLoad = 1; -+ let mayStore = 0; -+} -+ -+class MUBUF_Load_Helper op, string asm, RegisterClass regClass> : MUBUF < -+ op, -+ (outs regClass:$dst), -+ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, -+ i1imm:$lds, VReg_32:$vaddr, GPR4Align:$srsrc, i1imm:$slc, -+ i1imm:$tfe, SReg_32:$soffset), -+ asm, -+ []> { -+ let mayLoad = 1; -+ let mayStore = 0; -+} -+ -+class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < -+ op, -+ (outs regClass:$dst), -+ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, -+ i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align:$srsrc, -+ i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), -+ asm, -+ []> { -+ let mayLoad = 1; -+ let mayStore = 0; -+} -+ -+class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBUF < -+ op, -+ (outs), -+ (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, -+ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, -+ GPR4Align:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), -+ asm, -+ []> { -+ let mayStore = 1; -+ let mayLoad = 0; -+} -+ -+multiclass SMRD_Helper op, string asm, RegisterClass dstClass, -+ ValueType vt> { -+ def _IMM : SMRD < -+ op, -+ (outs dstClass:$dst), -+ (ins SMRDmemri:$src0), -+ asm, -+ [(set (vt dstClass:$dst), (constant_load ADDR_Offset8:$src0))] -+ >; -+ -+ def _SGPR : SMRD < -+ op, -+ (outs dstClass:$dst), -+ 
(ins SMRDmemrr:$src0), -+ asm, -+ [(set (vt dstClass:$dst), (constant_load ADDR_Reg:$src0))] -+ >; -+} -+ -+multiclass SMRD_32 op, string asm, RegisterClass dstClass> { -+ defm _F32 : SMRD_Helper ; -+ defm _I32 : SMRD_Helper ; -+} -+ +include "SIInstrFormats.td" +include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td new file mode 100644 -index 0000000..5d15761 +index 0000000..3a9822a --- /dev/null +++ b/lib/Target/R600/SIInstructions.td -@@ -0,0 +1,1423 @@ +@@ -0,0 +1,1462 @@ +//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure @@ -21302,6 +21131,17 @@ index 0000000..5d15761 +// that are not yet supported remain commented out. +//===----------------------------------------------------------------------===// + ++class InterpSlots { ++int P0 = 2; ++int P10 = 0; ++int P20 = 1; ++} ++def INTERP : InterpSlots; ++ ++def InterpSlot : Operand { ++ let PrintMethod = "printInterpSlot"; ++} ++ +def isSI : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">; + @@ -21410,33 +21250,33 @@ index 0000000..5d15761 +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>; +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), -+ (V_CMP_LT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_LT)), ++ (V_CMP_LT_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), -+ (V_CMP_EQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_EQ)), ++ (V_CMP_EQ_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), -+ (V_CMP_LE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_LE)), ++ (V_CMP_LE_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), -+ (V_CMP_GT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_GT)), ++ (V_CMP_GT_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), -+ (V_CMP_LG_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_NE)), ++ (V_CMP_LG_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), -+ (V_CMP_GE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), VReg_32:$src1, COND_GE)), ++ (V_CMP_GE_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>; +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>; @@ -21446,8 +21286,8 @@ index 0000000..5d15761 +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>; +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>; +def : Pat < -+ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), -+ (V_CMP_NEQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (f32 VSrc_32:$src0), 
VReg_32:$src1, COND_NE)), ++ (V_CMP_NEQ_F32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>; +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>; @@ -21580,33 +21420,33 @@ index 0000000..5d15761 +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>; +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), -+ (V_CMP_LT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_LT)), ++ (V_CMP_LT_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), -+ (V_CMP_EQ_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_EQ)), ++ (V_CMP_EQ_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), -+ (V_CMP_LE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_LE)), ++ (V_CMP_LE_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), -+ (V_CMP_GT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_GT)), ++ (V_CMP_GT_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), -+ (V_CMP_NE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_NE)), ++ (V_CMP_NE_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>; +def : Pat < -+ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), -+ (V_CMP_GE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++ (i1 (setcc (i32 VSrc_32:$src0), VReg_32:$src1, COND_GE)), ++ (V_CMP_GE_I32_e64 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>; + @@ -21752,11 +21592,13 @@ index 0000000..5d15761 +//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>; +//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>; + -+defm S_LOAD_DWORD : SMRD_32 <0x00000000, "S_LOAD_DWORD", SReg_32>; ++let mayLoad = 1 in { ++ ++defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>; + +//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; -+defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128, v4i32>; -+defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32>; ++defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>; ++defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>; +//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; +//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; +//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; @@ -21764,6 +21606,8 @@ index 0000000..5d15761 +//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>; +//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; + ++} // mayLoad = 1 ++ +//def S_MEMTIME : SMRD_ <0x0000001e, 
"S_MEMTIME", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; +//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; @@ -21870,12 +21714,12 @@ index 0000000..5d15761 +//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; +//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; +defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", -+ [(set VReg_32:$dst, (sint_to_fp AllReg_32:$src0))] ++ [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))] +>; +//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; +//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; +defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", -+ [(set (i32 VReg_32:$dst), (fp_to_sint AllReg_32:$src0))] ++ [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))] +>; +defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; +////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; @@ -21892,31 +21736,35 @@ index 0000000..5d15761 +//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; +//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", -+ [(set VReg_32:$dst, (AMDGPUfract AllReg_32:$src0))] ++ [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))] +>; +defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; -+defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>; ++defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", ++ [(set VReg_32:$dst, (fceil VSrc_32:$src0))] ++>; +defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", -+ [(set VReg_32:$dst, (frint AllReg_32:$src0))] ++ [(set VReg_32:$dst, (frint VSrc_32:$src0))] +>; +defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", -+ [(set VReg_32:$dst, (ffloor AllReg_32:$src0))] ++ [(set VReg_32:$dst, (ffloor VSrc_32:$src0))] +>; +defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", -+ [(set VReg_32:$dst, (fexp2 AllReg_32:$src0))] ++ [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))] +>; +defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; -+defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>; ++defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", ++ [(set VReg_32:$dst, (flog2 VSrc_32:$src0))] ++>; +defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; +defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; +defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", -+ [(set VReg_32:$dst, (fdiv FP_ONE, AllReg_32:$src0))] ++ [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))] +>; +defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; +defm V_RSQ_LEGACY_F32 : VOP1_32 < + 0x0000002d, "V_RSQ_LEGACY_F32", -+ [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))] ++ [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))] +>; +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; +defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; @@ -21966,10 +21814,9 @@ index 0000000..5d15761 +def V_INTERP_MOV_F32 : VINTRP < + 0x00000002, + (outs VReg_32:$dst), -+ (ins i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), -+ "V_INTERP_MOV_F32", ++ (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), ++ "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr", + []> { -+ let VSRC = 0; + let DisableEncoding = "$m0"; +} + @@ -22049,22 +21896,22 @@ index 0000000..5d15761 +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; + +def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), -+ (ins AllReg_32:$src0, 
VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32", ++ (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32", + [] +>{ + let DisableEncoding = "$vcc"; +} + +def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), -+ (ins VReg_32:$src0, VReg_32:$src1, SReg_1:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), ++ (ins VReg_32:$src0, VReg_32:$src1, SReg_64:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), + "V_CNDMASK_B32_e64", -+ [(set (i32 VReg_32:$dst), (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0))] ++ [(set (i32 VReg_32:$dst), (select (i1 SReg_64:$src2), VReg_32:$src1, VReg_32:$src0))] +>; + +//f32 pattern for V_CNDMASK_B32_e64 +def : Pat < -+ (f32 (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0)), -+ (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_1:$src2) ++ (f32 (select (i1 SReg_64:$src2), VReg_32:$src1, VReg_32:$src0)), ++ (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_64:$src2) +>; + +defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; @@ -22072,35 +21919,35 @@ index 0000000..5d15761 + +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>; +def : Pat < -+ (f32 (fadd AllReg_32:$src0, VReg_32:$src1)), -+ (V_ADD_F32_e32 AllReg_32:$src0, VReg_32:$src1) ++ (f32 (fadd VSrc_32:$src0, VReg_32:$src1)), ++ (V_ADD_F32_e32 VSrc_32:$src0, VReg_32:$src1) +>; + +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>; +def : Pat < -+ (f32 (fsub AllReg_32:$src0, VReg_32:$src1)), -+ (V_SUB_F32_e32 AllReg_32:$src0, VReg_32:$src1) ++ (f32 (fsub VSrc_32:$src0, VReg_32:$src1)), ++ (V_SUB_F32_e32 VSrc_32:$src0, VReg_32:$src1) +>; +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; +defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; +defm V_MUL_LEGACY_F32 : VOP2_32 < + 0x00000007, "V_MUL_LEGACY_F32", -+ [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", -+ [(set VReg_32:$dst, (fmul AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))] +>; +//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>; +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; +//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>; +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; +defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", -+ [(set VReg_32:$dst, (AMDGPUfmin AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", -+ [(set VReg_32:$dst, (AMDGPUfmax AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; +defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; @@ -22115,13 +21962,13 @@ index 0000000..5d15761 +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>; +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", -+ [(set VReg_32:$dst, (and AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", -+ [(set VReg_32:$dst, (or AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, 
"V_XOR_B32", -+ [(set VReg_32:$dst, (xor AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>; +defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; @@ -22132,10 +21979,10 @@ index 0000000..5d15761 +//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; +let Defs = [VCC] in { // Carry-out goes to VCC +defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32", -+ [(set VReg_32:$dst, (add (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] ++ [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] +>; +defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32", -+ [(set VReg_32:$dst, (sub (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] ++ [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] +>; +} // End Defs = [VCC] +defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>; @@ -22147,7 +21994,7 @@ index 0000000..5d15761 +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", -+ [(set VReg_32:$dst, (int_SI_packf16 AllReg_32:$src0, VReg_32:$src1))] ++ [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))] +>; +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; @@ -22217,6 +22064,10 @@ index 0000000..5d15761 +def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; +def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; ++def : Pat < ++ (mul VSrc_32:$src0, VReg_32:$src1), ++ (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (IMPLICIT_DEF), 0, 0, 0, 0) ++>; +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; @@ -22254,13 +22105,20 @@ index 0000000..5d15761 +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; + +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", -+ [(set SReg_64:$dst, (and SReg_64:$src0, SReg_64:$src1))] ++ [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))] +>; -+def S_AND_VCC : SOP2_VCC <0x0000000f, "S_AND_B64", -+ [(set SReg_1:$vcc, (SIvcc_and SReg_64:$src0, SReg_64:$src1))] ++ ++def : Pat < ++ (i1 (and SSrc_64:$src0, SSrc_64:$src1)), ++ (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1) +>; ++ +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; ++def : Pat < ++ (i1 (or SSrc_64:$src0, SSrc_64:$src1)), ++ (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1) ++>; +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; @@ -22289,48 +22147,6 @@ index 0000000..5d15761 +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; + -+class V_MOV_IMM : InstSI < -+ (outs VReg_32:$dst), -+ (ins immType:$src0), -+ "V_MOV_IMM", -+ [(set VReg_32:$dst, (type immNode:$src0))] -+>; -+ -+let isCodeGenOnly = 1, isPseudo = 1 in { -+ -+def V_MOV_IMM_I32 : V_MOV_IMM; -+def V_MOV_IMM_F32 : V_MOV_IMM; -+ -+def S_MOV_IMM_I32 : InstSI < -+ (outs SReg_32:$dst), -+ (ins i32imm:$src0), -+ "S_MOV_IMM_I32", -+ [(set SReg_32:$dst, (imm:$src0))] -+>; 
-+ -+// i64 immediates aren't really supported in hardware, but LLVM will use the i64 -+// type for indices on load and store instructions. The pattern for -+// S_MOV_IMM_I64 will only match i64 immediates that can fit into 32-bits, -+// which the hardware can handle. -+def S_MOV_IMM_I64 : InstSI < -+ (outs SReg_64:$dst), -+ (ins i64imm:$src0), -+ "S_MOV_IMM_I64 $dst, $src0", -+ [(set SReg_64:$dst, (IMM32bitIn64bit:$src0))] -+>; -+ -+} // End isCodeGenOnly, isPseudo = 1 -+ -+class SI_LOAD_LITERAL : -+ Enc32 <(outs), (ins ImmType:$imm), "LOAD_LITERAL $imm", []> { -+ -+ bits<32> imm; -+ let Inst{31-0} = imm; -+} -+ -+def SI_LOAD_LITERAL_I32 : SI_LOAD_LITERAL; -+def SI_LOAD_LITERAL_F32 : SI_LOAD_LITERAL; -+ +let isCodeGenOnly = 1, isPseudo = 1 in { + +def SET_M0 : InstSI < @@ -22349,13 +22165,6 @@ index 0000000..5d15761 + +let usesCustomInserter = 1 in { + -+def SI_V_CNDLT : InstSI < -+ (outs VReg_32:$dst), -+ (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), -+ "SI_V_CNDLT $dst, $src0, $src1, $src2", -+ [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))] -+>; -+ +def SI_INTERP : InstSI < + (outs VReg_32:$dst), + (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), @@ -22363,14 +22172,6 @@ index 0000000..5d15761 + [] +>; + -+def SI_INTERP_CONST : InstSI < -+ (outs VReg_32:$dst), -+ (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), -+ "SI_INTERP_CONST $dst, $attr_chan, $attr, $params", -+ [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan, -+ imm:$attr, SReg_32:$params))] -+>; -+ +def SI_WQM : InstSI < + (outs), + (ins), @@ -22390,9 +22191,9 @@ index 0000000..5d15761 + +def SI_IF : InstSI < + (outs SReg_64:$dst), -+ (ins SReg_1:$vcc, brtarget:$target), ++ (ins SReg_64:$vcc, brtarget:$target), + "SI_IF", -+ [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))] ++ [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < @@ -22422,9 +22223,9 @@ index 0000000..5d15761 + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), -+ (ins SReg_1:$vcc, SReg_64:$src), ++ (ins SReg_64:$vcc, SReg_64:$src), + "SI_IF_BREAK", -+ [(set SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))] ++ [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < @@ -22453,9 +22254,14 @@ index 0000000..5d15761 + +} // end IsCodeGenOnly, isPseudo + ++def : Pat< ++ (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), ++ (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0)) ++>; ++ +def : Pat < + (int_AMDGPU_kilp), -+ (SI_KILL (V_MOV_IMM_I32 0xbf800000)) ++ (SI_KILL (V_MOV_B32_e32 0xbf800000)) +>; + +/* int_SI_vs_load_input */ @@ -22464,7 +22270,7 @@ index 0000000..5d15761 + VReg_32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0, + VReg_32:$buf_idx_vgpr, SReg_128:$tlst, -+ 0, 0, (i32 SREG_LIT_0)) ++ 0, 0, 0) +>; + +/* int_SI_export */ @@ -22581,24 +22387,46 @@ index 0000000..5d15761 +def : BitConvert ; +def : BitConvert ; + ++/********** ================== **********/ ++/********** Immediate Patterns **********/ ++/********** ================== **********/ ++ +def : Pat < -+ (i64 (SIsreg1_bitcast SReg_1:$vcc)), -+ (S_MOV_B64 (COPY_TO_REGCLASS SReg_1:$vcc, SReg_64)) ++ (i1 imm:$imm), ++ (S_MOV_B64 imm:$imm) +>; + +def : Pat < -+ (i1 (SIsreg1_bitcast SReg_64:$vcc)), -+ (COPY_TO_REGCLASS SReg_64:$vcc, SReg_1) ++ (i32 imm:$imm), ++ (V_MOV_B32_e32 imm:$imm) +>; + +def : Pat < -+ (i64 (SIvcc_bitcast 
VCCReg:$vcc)), -+ (S_MOV_B64 (COPY_TO_REGCLASS VCCReg:$vcc, SReg_64)) ++ (f32 fpimm:$imm), ++ (V_MOV_B32_e32 fpimm:$imm) +>; + +def : Pat < -+ (i1 (SIvcc_bitcast SReg_64:$vcc)), -+ (COPY_TO_REGCLASS SReg_64:$vcc, VCCReg) ++ (i32 imm:$imm), ++ (S_MOV_B32 imm:$imm) ++>; ++ ++def : Pat < ++ (f32 fpimm:$imm), ++ (S_MOV_B32 fpimm:$imm) ++>; ++ ++def : Pat < ++ (i64 InlineImm:$imm), ++ (S_MOV_B64 InlineImm:$imm) ++>; ++ ++// i64 immediates aren't supported in hardware, split it into two 32bit values ++def : Pat < ++ (i64 imm:$imm), ++ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), ++ (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0), ++ (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1) +>; + +/********** ===================== **********/ @@ -22606,6 +22434,12 @@ index 0000000..5d15761 +/********** ===================== **********/ + +def : Pat < ++ (int_SI_fs_interp_constant imm:$attr_chan, imm:$attr, SReg_32:$params), ++ (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, ++ (S_MOV_B32 SReg_32:$params)) ++>; ++ ++def : Pat < + (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params), + (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan, + imm:$attr, SReg_32:$params) @@ -22663,23 +22497,23 @@ index 0000000..5d15761 +def : POW_Common ; + +def : Pat < -+ (int_AMDGPU_div AllReg_32:$src0, AllReg_32:$src1), -+ (V_MUL_LEGACY_F32_e32 AllReg_32:$src0, (V_RCP_LEGACY_F32_e32 AllReg_32:$src1)) ++ (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1), ++ (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1)) +>; + +def : Pat< -+ (fdiv AllReg_32:$src0, AllReg_32:$src1), -+ (V_MUL_F32_e32 AllReg_32:$src0, (V_RCP_F32_e32 AllReg_32:$src1)) ++ (fdiv VSrc_32:$src0, VSrc_32:$src1), ++ (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1)) +>; + +def : Pat < -+ (fcos AllReg_32:$src0), -+ (V_COS_F32_e32 (V_MUL_F32_e32 AllReg_32:$src0, (V_MOV_IMM_I32 CONST.TWO_PI_INV))) ++ (fcos VSrc_32:$src0), ++ (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) +>; + +def : Pat < -+ (fsin AllReg_32:$src0), -+ (V_SIN_F32_e32 (V_MUL_F32_e32 AllReg_32:$src0, (V_MOV_IMM_I32 CONST.TWO_PI_INV))) ++ (fsin VSrc_32:$src0), ++ (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) +>; + +def : Pat < @@ -22703,14 +22537,48 @@ index 0000000..5d15761 + 0, 0, 0, 0), sub3) +>; + ++def : Pat < ++ (i32 (sext (i1 SReg_64:$src0))), ++ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0) ++>; ++ +/********** ================== **********/ +/********** VOP3 Patterns **********/ +/********** ================== **********/ + -+def : Pat <(f32 (IL_mad AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2)), -+ (V_MAD_LEGACY_F32 AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, ++def : Pat <(f32 (IL_mad VSrc_32:$src0, VReg_32:$src1, VReg_32:$src2)), ++ (V_MAD_LEGACY_F32 VSrc_32:$src0, VReg_32:$src1, VReg_32:$src2, + 0, 0, 0, 0)>; + ++/********** ================== **********/ ++/********** SMRD Patterns **********/ ++/********** ================== **********/ ++ ++multiclass SMRD_Pattern { ++ // 1. Offset as 8bit DWORD immediate ++ def : Pat < ++ (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)), ++ (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset)) ++ >; ++ ++ // 2. Offset loaded in an 32bit SGPR ++ def : Pat < ++ (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)), ++ (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset))) ++ >; ++ ++ // 3. 
No offset at all ++ def : Pat < ++ (constant_load SReg_64:$sbase), ++ (vt (Instr_IMM SReg_64:$sbase, 0)) ++ >; ++} ++ ++defm : SMRD_Pattern ; ++defm : SMRD_Pattern ; ++defm : SMRD_Pattern ; ++defm : SMRD_Pattern ; ++ +} // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td new file mode 100644 @@ -22774,7 +22642,7 @@ index 0000000..611b9c4 +} diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp new file mode 100644 -index 0000000..3780e40 +index 0000000..2007d30 --- /dev/null +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -0,0 +1,372 @@ @@ -22938,10 +22806,10 @@ index 0000000..3780e40 + .addImm(0) + .addImm(1) + .addImm(1) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addReg(AMDGPU::SREG_LIT_0) -+ .addReg(AMDGPU::SREG_LIT_0); ++ .addReg(AMDGPU::VGPR0) ++ .addReg(AMDGPU::VGPR0) ++ .addReg(AMDGPU::VGPR0) ++ .addReg(AMDGPU::VGPR0); + + // ... and terminate wavefront + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); @@ -23077,7 +22945,7 @@ index 0000000..3780e40 + + // Clear this pixel from the exec mask if the operand is negative + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) -+ .addReg(AMDGPU::SREG_LIT_0) ++ .addImm(0) + .addOperand(MI.getOperand(0)); + + MI.eraseFromParent(); @@ -23150,120 +23018,6 @@ index 0000000..3780e40 + + return true; +} -diff --git a/lib/Target/R600/SILowerLiteralConstants.cpp b/lib/Target/R600/SILowerLiteralConstants.cpp -new file mode 100644 -index 0000000..2d5ab0b ---- /dev/null -+++ b/lib/Target/R600/SILowerLiteralConstants.cpp -@@ -0,0 +1,108 @@ -+//===-- SILowerLiteralConstants.cpp - Lower intrs using literal constants--===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+/// \file -+/// \brief This pass performs the following transformation on instructions with -+/// literal constants: -+/// -+/// %VGPR0 = V_MOV_IMM_I32 1 -+/// -+/// becomes: -+/// -+/// BUNDLE -+/// * %VGPR = V_MOV_B32_32 SI_LITERAL_CONSTANT -+/// * SI_LOAD_LITERAL 1 -+/// -+/// The resulting sequence matches exactly how the hardware handles immediate -+/// operands, so this transformation greatly simplifies the code generator. -+/// -+/// Only the *_MOV_IMM_* support immediate operands at the moment, but when -+/// support for immediate operands is added to other instructions, they -+/// will be lowered here as well. 
-+//===----------------------------------------------------------------------===// -+ -+#include "AMDGPU.h" -+#include "llvm/CodeGen/MachineFunction.h" -+#include "llvm/CodeGen/MachineFunctionPass.h" -+#include "llvm/CodeGen/MachineInstrBuilder.h" -+#include "llvm/CodeGen/MachineInstrBundle.h" -+ -+using namespace llvm; -+ -+namespace { -+ -+class SILowerLiteralConstantsPass : public MachineFunctionPass { -+ -+private: -+ static char ID; -+ const TargetInstrInfo *TII; -+ -+public: -+ SILowerLiteralConstantsPass(TargetMachine &tm) : -+ MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } -+ -+ virtual bool runOnMachineFunction(MachineFunction &MF); -+ -+ const char *getPassName() const { -+ return "SI Lower literal constants pass"; -+ } -+}; -+ -+} // End anonymous namespace -+ -+char SILowerLiteralConstantsPass::ID = 0; -+ -+FunctionPass *llvm::createSILowerLiteralConstantsPass(TargetMachine &tm) { -+ return new SILowerLiteralConstantsPass(tm); -+} -+ -+bool SILowerLiteralConstantsPass::runOnMachineFunction(MachineFunction &MF) { -+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); -+ BB != BB_E; ++BB) { -+ MachineBasicBlock &MBB = *BB; -+ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); -+ I != MBB.end(); I = Next) { -+ Next = llvm::next(I); -+ MachineInstr &MI = *I; -+ switch (MI.getOpcode()) { -+ default: break; -+ case AMDGPU::S_MOV_IMM_I32: -+ case AMDGPU::S_MOV_IMM_I64: -+ case AMDGPU::V_MOV_IMM_F32: -+ case AMDGPU::V_MOV_IMM_I32: { -+ unsigned MovOpcode; -+ unsigned LoadLiteralOpcode; -+ MachineOperand LiteralOp = MI.getOperand(1); -+ if (AMDGPU::VReg_32RegClass.contains(MI.getOperand(0).getReg())) { -+ MovOpcode = AMDGPU::V_MOV_B32_e32; -+ } else { -+ MovOpcode = AMDGPU::S_MOV_B32; -+ } -+ if (LiteralOp.isImm()) { -+ LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_I32; -+ } else { -+ LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_F32; -+ } -+ MachineInstr *First = -+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(MovOpcode), -+ MI.getOperand(0).getReg()) -+ .addReg(AMDGPU::SI_LITERAL_CONSTANT); -+ MachineInstr *Last = -+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(LoadLiteralOpcode)) -+ .addOperand(MI.getOperand(1)); -+ Last->setIsInsideBundle(); -+ llvm::finalizeBundle(MBB, First, Last); -+ MI.eraseFromParent(); -+ break; -+ } -+ } -+ } -+ } -+ return false; -+} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp new file mode 100644 index 0000000..7e59b42 @@ -23439,10 +23193,10 @@ index 0000000..40171e4 +#endif // SIREGISTERINFO_H_ diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td new file mode 100644 -index 0000000..9b483eb +index 0000000..ab36b87 --- /dev/null +++ b/lib/Target/R600/SIRegisterInfo.td -@@ -0,0 +1,188 @@ +@@ -0,0 +1,190 @@ + +class SIReg encoding = 0> : Register { + let Namespace = "AMDGPU"; @@ -23457,7 +23211,9 @@ index 0000000..9b483eb + +class SGPR_32 num, string name> : SIReg; + -+class VGPR_32 num, string name> : SIReg; ++class VGPR_32 num, string name> : SIReg { ++ let HWEncoding{8} = 1; ++} + +// Special Registers +def VCC : SIReg<"VCC", 106>; @@ -23465,8 +23221,6 @@ index 0000000..9b483eb +def EXEC_HI : SIReg <"EXEC HI", 127>; +def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>; +def SCC : SIReg<"SCC", 253>; -+def SREG_LIT_0 : SIReg <"S LIT 0", 128>; -+def SI_LITERAL_CONSTANT : SIReg<"LITERAL CONSTANT", 255>; +def M0 : SIReg <"M0", 124>; + +//Interpolation registers @@ -23579,12 +23333,10 @@ index 0000000..9b483eb + +// Register class for all 
scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, -+ (add SGPR_32, SREG_LIT_0, M0, EXEC_LO, EXEC_HI) ++ (add SGPR_32, M0, EXEC_LO, EXEC_HI) +>; + -+def SReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add SGPR_64, VCC, EXEC)>; -+ -+def SReg_1 : RegisterClass<"AMDGPU", [i1], 1, (add VCC, SGPR_64, EXEC)>; ++def SReg_64 : RegisterClass<"AMDGPU", [i1, i64], 64, (add SGPR_64, VCC, EXEC)>; + +def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>; + @@ -23620,10 +23372,14 @@ index 0000000..9b483eb + +def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>; + -+// AllReg_* - A set of all scalar and vector registers of a given width. -+def AllReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add VReg_32, SReg_32)>; ++// [SV]Src_* operands can have either an immediate or an register ++def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; + -+def AllReg_64 : RegisterClass<"AMDGPU", [f64, i64], 64, (add SReg_64, VReg_64)>; ++def SSrc_64 : RegisterClass<"AMDGPU", [i1, i64], 64, (add SReg_64)>; ++ ++def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; ++ ++def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add SReg_64, VReg_64)>; + +// Special register classes for predicates and the M0 register +def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>; @@ -23747,6 +23503,30 @@ index 0000000..b8ac4e7 +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common +diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll +new file mode 100644 +index 0000000..114f9e7 +--- /dev/null ++++ b/test/CodeGen/R600/128bit-kernel-args.ll +@@ -0,0 +1,18 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: @v4i32_kernel_arg ++; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40 ++ ++define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { ++entry: ++ store <4 x i32> %in, <4 x i32> addrspace(1)* %out ++ ret void ++} ++ ++; CHECK: @v4f32_kernel_arg ++; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40 ++define void @v4f32_kernel_args(<4 x float> addrspace(1)* %out, <4 x float> %in) { ++entry: ++ store <4 x float> %in, <4 x float> addrspace(1)* %out ++ ret void ++} diff --git a/test/CodeGen/R600/add.v4i32.ll b/test/CodeGen/R600/add.v4i32.ll new file mode 100644 index 0000000..ac4a874 @@ -23831,6 +23611,40 @@ index 0000000..fd958b3 + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} +diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll +new file mode 100644 +index 0000000..a586742 +--- /dev/null ++++ b/test/CodeGen/R600/disconnected-predset-break-bug.ll +@@ -0,0 +1,28 @@ ++; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; PRED_SET* instructions must be tied to any instruction that uses their ++; result. This tests that there are no instructions between the PRED_SET* ++; and the PREDICATE_BREAK in this loop. 
++ ++; CHECK: @loop_ge ++; CHECK: WHILE ++; CHECK: PRED_SET ++; CHECK-NEXT: PREDICATED_BREAK ++define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { ++entry: ++ %cmp5 = icmp sgt i32 %iterations, 0 ++ br i1 %cmp5, label %for.body, label %for.end ++ ++for.body: ; preds = %for.body, %entry ++ %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] ++ %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] ++ %i.07 = add nsw i32 %i.07.in, -1 ++ %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %ai.06 ++ store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 ++ %add = add nsw i32 %ai.06, 1 ++ %exitcond = icmp eq i32 %add, %iterations ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body, %entry ++ ret void ++} diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll new file mode 100644 index 0000000..0407533 @@ -24176,6 +23990,64 @@ index 0000000..aad44d9 + store i32 %value, i32 addrspace(1)* %out + ret void +} +diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll +new file mode 100644 +index 0000000..382f78c +--- /dev/null ++++ b/test/CodeGen/R600/kcache-fold.ll +@@ -0,0 +1,52 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: MOV T{{[0-9]+\.[XYZW], CBuf0\[[0-9]+\]\.[XYZW]}} ++ ++define void @main() { ++main_body: ++ %0 = load <4 x float> addrspace(9)* null ++ %1 = extractelement <4 x float> %0, i32 0 ++ %2 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %3 = extractelement <4 x float> %2, i32 0 ++ %4 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %5 = extractelement <4 x float> %4, i32 0 ++ %6 = fcmp ult float %1, 0.000000e+00 ++ %7 = select i1 %6, float %3, float %5 ++ %8 = load <4 x float> addrspace(9)* null ++ %9 = extractelement <4 x float> %8, i32 1 ++ %10 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %11 = extractelement <4 x float> %10, i32 1 ++ %12 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %13 = extractelement <4 x float> %12, i32 1 ++ %14 = fcmp ult float %9, 0.000000e+00 ++ %15 = select i1 %14, float %11, float %13 ++ %16 = load <4 x float> addrspace(9)* null ++ %17 = extractelement <4 x float> %16, i32 2 ++ %18 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %19 = extractelement <4 x float> %18, i32 2 ++ %20 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %21 = extractelement <4 x float> %20, i32 2 ++ %22 = fcmp ult float %17, 0.000000e+00 ++ %23 = select i1 %22, float %19, float %21 ++ %24 = load <4 x float> addrspace(9)* null ++ %25 = extractelement <4 x float> %24, i32 3 ++ %26 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) ++ %27 = extractelement <4 x float> %26, i32 3 ++ %28 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) ++ %29 = extractelement <4 x float> %28, i32 3 ++ %30 = fcmp ult float %25, 0.000000e+00 ++ %31 = select i1 %30, float %27, float %29 ++ %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) ++ %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) ++ %34 = call float @llvm.AMDIL.clamp.(float %23, 
float 0.000000e+00, float 1.000000e+00) ++ %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) ++ %36 = insertelement <4 x float> undef, float %32, i32 0 ++ %37 = insertelement <4 x float> %36, float %33, i32 1 ++ %38 = insertelement <4 x float> %37, float %34, i32 2 ++ %39 = insertelement <4 x float> %38, float %35, i32 3 ++ call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) ++ ret void ++} ++ ++declare float @llvm.AMDIL.clamp.(float, float, float) readnone ++declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg new file mode 100644 index 0000000..36ee493 @@ -24278,6 +24150,35 @@ index 0000000..fac957f +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @llvm.AMDGPU.trunc(float ) readnone +diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll +new file mode 100644 +index 0000000..0c19f14 +--- /dev/null ++++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll +@@ -0,0 +1,23 @@ ++;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s ++ ++;CHECK: S_MOV_B32 ++;CHECK-NEXT: V_INTERP_MOV_F32 ++ ++define void @main() { ++main_body: ++ call void @llvm.AMDGPU.shader.type(i32 0) ++ %0 = load i32 addrspace(8)* inttoptr (i32 6 to i32 addrspace(8)*) ++ %1 = call float @llvm.SI.fs.interp.constant(i32 0, i32 0, i32 %0) ++ %2 = call i32 @llvm.SI.packf16(float %1, float %1) ++ %3 = bitcast i32 %2 to float ++ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) ++ ret void ++} ++ ++declare void @llvm.AMDGPU.shader.type(i32) ++ ++declare float @llvm.SI.fs.interp.constant(i32, i32, i32) readonly ++ ++declare i32 @llvm.SI.packf16(float, float) readnone ++ ++declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll new file mode 100644 index 0000000..dc120bf @@ -25059,15 +24960,3 @@ index 0000000..62cdcf5 +declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32) + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll -index 466b096..d11bb9e 100644 ---- a/test/CodeGen/X86/cvtv2f32.ll -+++ b/test/CodeGen/X86/cvtv2f32.ll -@@ -1,3 +1,7 @@ -+; A bug fix in the DAGCombiner made this test fail, so marking as xfail -+; until this can be investigated further. -+; XFAIL: * -+ - ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s - - define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) { diff --git a/debian/patches/series b/debian/patches/series index b3a0596e..154b4c97 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -30,3 +30,7 @@ polly-c++0x.diff declare_clear_cache.diff #r600-snapshot.diff 31-powerpcspe.diff +0101-LegalizeDAG-Allow-type-promotion-for-scalar-stores.patch +0102-LegalizeDAG-Allow-type-promotion-of-scalar-loads.patch +0103-DAGCombiner-Avoid-generating-illegal-vector-INT_TO_F.patch +
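
Note on the new immediate patterns: the (i64 imm) pattern added to SIInstructions.td above replaces the old S_MOV_IMM_I64 pseudo (and the SILowerLiteralConstants pass) by materializing a 64-bit immediate as two S_MOV_B32 instructions, one per 32-bit half, stitched together with INSERT_SUBREG. A minimal standalone sketch of the LO32/HI32 split that pattern relies on — the constant here is hypothetical, chosen only so both halves are visible, and the printed mnemonics merely mirror the sub0/sub1 placement in the pattern:

#include <cstdint>
#include <cstdio>

int main() {
  // A 64-bit immediate too wide for the 32-bit inline-immediate path.
  uint64_t imm = 0x123456789abcdef0ULL;
  // LO32: low half, moved into sub0 by the first S_MOV_B32.
  uint32_t lo = static_cast<uint32_t>(imm);
  // HI32: high half, moved into sub1 by the second S_MOV_B32.
  uint32_t hi = static_cast<uint32_t>(imm >> 32);
  std::printf("S_MOV_B32 sub0, 0x%08lx\n", static_cast<unsigned long>(lo));
  std::printf("S_MOV_B32 sub1, 0x%08lx\n", static_cast<unsigned long>(hi));
  return 0;
}

Immediates that do fit an inline encoding still take the single-instruction route via the (i64 InlineImm:$imm) -> S_MOV_B64 pattern defined just before the split.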