diff --git a/builtins.cpp b/builtins.cpp
index 43f68833..fa2e7328 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -536,6 +536,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__set_system_isa",
         "__sext_uniform_bool",
         "__sext_varying_bool",
+        "__shift_double",
+        "__shift_float",
+        "__shift_i16",
+        "__shift_i32",
+        "__shift_i64",
+        "__shift_i8",
         "__shuffle2_double",
         "__shuffle2_float",
         "__shuffle2_i16",
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 2a5d1b32..92b7a18e 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -80,6 +80,13 @@ declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
 declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
 declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
 
+declare <WIDTH x i8> @__shift_i8(<WIDTH x i8>, i32) nounwind readnone
+declare <WIDTH x i16> @__shift_i16(<WIDTH x i16>, i32) nounwind readnone
+declare <WIDTH x float> @__shift_float(<WIDTH x float>, i32) nounwind readnone
+declare <WIDTH x i32> @__shift_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x double> @__shift_double(<WIDTH x double>, i32) nounwind readnone
+declare <WIDTH x i64> @__shift_i64(<WIDTH x i64>, i32) nounwind readnone
+
 declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
 declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
                                     <WIDTH x i32>) nounwind readnone
diff --git a/builtins/util.m4 b/builtins/util.m4
index 68fa818b..4cb46310 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -797,6 +797,43 @@ not_const:
   ret <WIDTH x $1> %result
 }
 
+; Lane shift with zero fill: result lane n takes lane n + %1 of %0, or zero if %0 has no such lane.
+define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
+  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
+  %zeropaddedvec = shufflevector <WIDTH x $1> %0, <WIDTH x $1> zeroinitializer,
+                     <eval(2*WIDTH) x i32> < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ')i32 eval(2*WIDTH-1) >
+  br i1 %isc, label %is_const, label %not_const
+
+is_const:
+  ; though verbose, this turns into tight code if %1 is a constant
+forloop(i, 0, eval(WIDTH-1), `  
+  %delta_`'i = add i32 %1, i
+  %delta_clamped_`'i = and i32 %delta_`'i, eval(2*WIDTH-1)
+  %v_`'i = extractelement <eval(2*WIDTH) x $1> %zeropaddedvec, i32 %delta_clamped_`'i')
+  %ret_0 = insertelement <WIDTH x $1> zeroinitializer, $1 %v_0, i32 0
+forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
+')
+  ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
+
+not_const:
+  ; store the vector into memory with a zero vector on either side of it
+  %ptr = alloca <WIDTH x $1>, i32 3
+  %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
+  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
+  %ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
+  store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
+  %ptr2 = getelementptr <WIDTH x $1> * %ptr, i32 2
+  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr2
+  ; %0 sits at element WIDTH of the buffer, so the result starts at element WIDTH + %1 (needs -WIDTH <= %1 <= WIDTH)
+  %offset = add i32 %1, WIDTH
+  %ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(3*WIDTH) x $1] *
+  %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
+  %load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
+  %result = load <WIDTH x $1> * %load_ptr_vec, align $2
+  ret <WIDTH x $1> %result
+}
+
+
 define <WIDTH x $1> @__shuffle_$1(<WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
 forloop(i, 0, eval(WIDTH-1), `  
   %index_`'i = extractelement <WIDTH x i32> %1, i32 i')
diff --git a/opt.cpp b/opt.cpp
index 75eae20c..0146e7cf 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -72,6 +72,7 @@
 #include <llvm/Analysis/ConstantFolding.h>
 #include <llvm/Target/TargetLibraryInfo.h>
 #include <llvm/ADT/Triple.h>
+#include <llvm/ADT/SmallSet.h>
 #include <llvm/Transforms/Scalar.h>
 #include <llvm/Transforms/IPO.h>
 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
@@ -124,6 +125,8 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass();
 
 static llvm::Pass *CreateDebugPass(char * output);
 
+static llvm::Pass *CreateReplaceExtractInsertChainsPass();
+
 #define DEBUG_START_PASS(NAME)                                 \
     if (g->debugPrint &&                                       \
         (getenv("FUNC") == NULL ||                             \
@@ -635,6 +638,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(CreateIsCompileTimeConstantPass(true));
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateInstructionSimplifyPass());
+        optPM.add(CreateReplaceExtractInsertChainsPass());
 
         optPM.add(llvm::createMemCpyOptPass());
         optPM.add(llvm::createSCCPPass());
@@ -4923,3 +4927,136 @@ static llvm::Pass *
 CreatePeepholePass() {
   return new PeepholePass;
 }
+
+///////////////////////////////////////////////////////////////////////////
+// ReplaceExtractInsertChainsPass
+
+/** 
+    We occasionally get chains of ExtractElementInsts followed by
+    InsertElementInsts.  Unfortunately, these can't always be emitted as
+    ShuffleVectorInsts, as we don't yet know that the values involved are constant.
+
+    This pass detects such chains and replaces them with ShuffleVectorInsts
+    if all the appropriate values are constant.
+ */
+
+class ReplaceExtractInsertChainsPass : public llvm::BasicBlockPass {
+public:
+    static char ID;
+    ReplaceExtractInsertChainsPass() : llvm::BasicBlockPass(ID) {}
+
+    /** Human-readable name of this pass. */
+    const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
+    /** Processes one basic block; returns true if the block was modified. */
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+};
+
+char ReplaceExtractInsertChainsPass::ID = 0;
+
+#include <iostream>
+
+/** Returns the value of an integer constant as a sign-extended int64_t.
+    The argument must be a 32- or 64-bit llvm::ConstantInt; this is asserted.
+*/
+static int64_t
+lGetIntValue(llvm::Value *offset) {
+  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
+  Assert(intOffset && (intOffset->getBitWidth() == 32 ||
+                       intOffset->getBitWidth() == 64));
+  return intOffset->getSExtValue();
+}
+
+/** Ends the current insert chain.  A chain of two or more inserts that has
+    a source vector is replaced by one shufflevector of baseVec and zero, with
+    laneMap giving the source lane per destination lane (-1 meaning zero).
+    Resets the chain state; returns true if the IR was changed. */
+static bool
+lFlushInsertChain(llvm::InsertElementInst *&lastInsert, llvm::Value *&baseVec,
+                  int &chainLength, llvm::SmallVectorImpl<int32_t> &laneMap) {
+    const bool replace = (lastInsert != NULL && chainLength > 1 && baseVec != NULL);
+    if (replace) {
+        // Mask values from the source's lane count upwards select zero lanes.
+        llvm::VectorType *baseType = llvm::cast<llvm::VectorType>(baseVec->getType());
+        for (unsigned lane = 0; lane < laneMap.size(); ++lane)
+            if (laneMap[lane] < 0)
+                laneMap[lane] = (int32_t)baseType->getNumElements();
+        llvm::Value *shuffleIdxs = LLVMInt32Vector(&laneMap[0]);
+        llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(baseType);
+        llvm::Value *shuffle = new llvm::ShuffleVectorInst(baseVec, zeroVec, shuffleIdxs,
+                                                           "shiftInZero", lastInsert);
+        // For now, be lazy and let DCE clean up the Extracts/Inserts.
+        lastInsert->replaceAllUsesWith(shuffle);
+    }
+    laneMap.assign(laneMap.size(), -1);
+    lastInsert = NULL;
+    baseVec = NULL;
+    chainLength = 0;
+    return replace;
+}
+
+bool
+ReplaceExtractInsertChainsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("ReplaceExtractInsertChainsPass");
+    bool modifiedAny = false;
+    const int vectorWidth = g->target->getVectorWidth();
+
+    // The chain being followed; every lane of its mask starts out as zero.
+    llvm::InsertElementInst *lastInsert = NULL;
+    llvm::Value *baseVec = NULL;    // the vector the chain's extracts read
+    int chainLength = 0;
+    llvm::SmallVector<int32_t, 16> laneMap((unsigned)vectorWidth, -1);
+
+    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+        llvm::InsertElementInst *ieInst = llvm::dyn_cast<llvm::InsertElementInst>(&*i);
+        if (ieInst == NULL)
+            continue;
+        // An insert into undef/zero starts a chain; one into lastInsert extends it.
+        llvm::Value *base = ieInst->getOperand(0);
+        const bool starts = llvm::isa<llvm::UndefValue>(base) ||
+            llvm::isa<llvm::ConstantAggregateZero>(base);
+        if (!starts && base != lastInsert)
+            continue;
+        if (starts)
+            modifiedAny |= lFlushInsertChain(lastInsert, baseVec, chainLength, laneMap);
+
+        // The mask has vectorWidth lanes: require that width and a valid constant lane.
+        llvm::ConstantInt *toLane = llvm::dyn_cast<llvm::ConstantInt>(ieInst->getOperand(2));
+        const int64_t to = (toLane != NULL) ? lGetIntValue(toLane) : -1;
+        bool usable = (to >= 0 && to < vectorWidth &&
+            (int)llvm::cast<llvm::VectorType>(ieInst->getType())->getNumElements() == vectorWidth);
+
+        // The scalar must be a constant-lane extract from baseVec or a zero constant.
+        llvm::Value *scalar = ieInst->getOperand(1);
+        int64_t from = -1;
+        if (llvm::ExtractElementInst *ee = llvm::dyn_cast<llvm::ExtractElementInst>(scalar)) {
+            llvm::ConstantInt *fromLane = llvm::dyn_cast<llvm::ConstantInt>(ee->getIndexOperand());
+            llvm::Value *src = ee->getVectorOperand();
+            from = (fromLane != NULL) ? lGetIntValue(fromLane) : -1;
+            usable = usable && from >= 0 && (baseVec == NULL || baseVec == src) &&
+                from < (int64_t)llvm::cast<llvm::VectorType>(src->getType())->getNumElements();
+            if (usable)
+                baseVec = src;
+        }
+        else {
+            llvm::Constant *zero = llvm::dyn_cast<llvm::Constant>(scalar);
+            usable = usable && zero != NULL && zero->isNullValue();
+        }
+        if (!usable) {
+            // Anything else ends the chain at the previous insert.
+            modifiedAny |= lFlushInsertChain(lastInsert, baseVec, chainLength, laneMap);
+            continue;
+        }
+        laneMap[(unsigned)to] = (int32_t)from;
+        lastInsert = ieInst;
+        ++chainLength;
+    }
+    modifiedAny |= lFlushInsertChain(lastInsert, baseVec, chainLength, laneMap);
+    DEBUG_END_PASS("ReplaceExtractInsertChainsPass");
+    return modifiedAny;
+}
+
+
+static llvm::Pass *
+CreateReplaceExtractInsertChainsPass() {
+    return new ReplaceExtractInsertChainsPass;  // handed to optPM.add() in Optimize()
+}
diff --git a/stdlib.ispc b/stdlib.ispc
index 9b02d0ba..248f664a 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -170,6 +170,36 @@ static inline int64 rotate(int64 v, uniform int i) {
     return __rotate_i64(v, i);
 }
 
+__declspec(safe) // shift(v, i), all overloads below: program instance n receives v from instance n + i, or 0 if there is no such instance
+static inline float shift(float v, uniform int i) {
+    return __shift_float(v, i);
+}
+
+__declspec(safe) 
+static inline int8 shift(int8 v, uniform int i) {
+    return __shift_i8(v, i);
+}
+
+__declspec(safe) 
+static inline int16 shift(int16 v, uniform int i) {
+    return __shift_i16(v, i);
+}
+
+__declspec(safe) 
+static inline int32 shift(int32 v, uniform int i) {
+    return __shift_i32(v, i);
+}
+
+__declspec(safe) 
+static inline double shift(double v, uniform int i) {
+    return __shift_double(v, i);
+}
+
+__declspec(safe) 
+static inline int64 shift(int64 v, uniform int i) {
+    return __shift_i64(v, i);
+}
+
 __declspec(safe) 
 static inline float shuffle(float v, int i) {
     return __shuffle_float(v, i);