diff --git a/llvmutil.cpp b/llvmutil.cpp
index 75d832c5..cfcdf113 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -39,7 +39,9 @@
 #include "ispc.h"
 #include "type.h"
 #include <llvm/Instructions.h>
+#include <llvm/BasicBlock.h>
 #include <set>
+#include <map>
 
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
@@ -784,6 +786,139 @@ LLVMDumpValue(llvm::Value *v) {
 }
 
 
+/** Helper for LLVMExtractFirstVectorElement() that does the actual work:
+    returns a scalar value equal to the first element of the vector value
+    v, scalarizing the computation that produces it where possible.  New
+    instructions are inserted before insertBefore, with two exceptions for
+    phi nodes: the scalar phi node itself goes at the start of the original
+    phi node's basic block, and anything needed to compute one of its
+    incoming values goes at the end of the corresponding predecessor
+    block.  phiMap records the scalar phi node created for each vector phi
+    node visited so far, so that each one is only scalarized once and
+    cycles of phi nodes don't lead to infinite recursion.
+ */
+static llvm::Value *
+lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
+                           std::map<llvm::PHINode *, llvm::PHINode *> &phiMap) {
+    // If it's not an instruction (e.g. it's a constant), then we can just
+    // emit an extractelement instruction and let the regular optimizer do
+    // the rest.
+    if (llvm::isa<llvm::Instruction>(v) == false)
+        return llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
+                                                insertBefore);
+
+    LLVM_TYPE_CONST llvm::VectorType *vt =
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    Assert(vt != NULL);
+
+    // Build the name as a std::string rather than as an llvm::Twine: a
+    // Twine only refers to its operands, so one stored in a local variable
+    // dangles as soon as the temporaries it was built from are destroyed.
+    std::string newName = v->getName().str() + ".elt0";
+
+    // Rewrite regular binary operators and casts to the scalarized
+    // equivalent.
+    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
+    if (bop != NULL) {
+        llvm::Value *v0 = lExtractFirstVectorElement(bop->getOperand(0),
+                                                     insertBefore, phiMap);
+        llvm::Value *v1 = lExtractFirstVectorElement(bop->getOperand(1),
+                                                     insertBefore, phiMap);
+        return llvm::BinaryOperator::Create(bop->getOpcode(), v0, v1,
+                                            newName, insertBefore);
+    }
+
+    llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
+    if (cast != NULL) {
+        // The element-wise rewrite is only valid when the source is a
+        // vector with the same number of elements as the result; a bitcast
+        // can change the element count (or start from a non-vector value),
+        // in which case we fall through to the generic extractelement.
+        LLVM_TYPE_CONST llvm::VectorType *srcType =
+            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(
+                cast->getOperand(0)->getType());
+        if (srcType != NULL &&
+            srcType->getNumElements() == vt->getNumElements()) {
+            llvm::Value *operand =
+                lExtractFirstVectorElement(cast->getOperand(0), insertBefore,
+                                           phiMap);
+            return llvm::CastInst::Create(cast->getOpcode(), operand,
+                                          vt->getElementType(), newName,
+                                          insertBefore);
+        }
+    }
+
+    llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
+    if (phi != NULL) {
+        // For PHI nodes, recursively scalarize them; if we have already
+        // been here, reuse the scalar PHI node made the first time around.
+        std::map<llvm::PHINode *, llvm::PHINode *>::iterator iter =
+            phiMap.find(phi);
+        if (iter != phiMap.end())
+            return iter->second;
+
+        // We need to create the new scalar PHI node immediately, though,
+        // and put it in the map<>, so that if we come back to this node
+        // via a recursive lExtractFirstVectorElement() call, then we can
+        // return the pointer and not get stuck in an infinite loop.
+        //
+        // The insertion point for the new phi node also has to be the
+        // start of the bblock of the original phi node, which isn't
+        // necessarily the same bblock as insertBefore is in!
+        llvm::Instruction *phiInsertPos = phi->getParent()->begin();
+        llvm::PHINode *scalarPhi =
+            llvm::PHINode::Create(vt->getElementType(),
+                                  phi->getNumIncomingValues(), newName,
+                                  phiInsertPos);
+        phiMap[phi] = scalarPhi;
+
+        for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) {
+            // Each scalar incoming value has to be available at the end of
+            // its predecessor block, which inserting at insertBefore
+            // doesn't ensure (e.g. for a loop-carried phi, the initial
+            // value would end up being computed inside the loop).  The
+            // vector incoming value is available there, so whatever is
+            // needed to scalarize it can go right before that block's
+            // terminator.
+            llvm::BasicBlock *incomingBlock = phi->getIncomingBlock(i);
+            llvm::Value *incoming =
+                lExtractFirstVectorElement(phi->getIncomingValue(i),
+                                           incomingBlock->getTerminator(),
+                                           phiMap);
+            scalarPhi->addIncoming(incoming, incomingBlock);
+        }
+
+        return scalarPhi;
+    }
+
+    // If we have a chain of insertelement instructions, then we can just
+    // flatten them out and grab the value for the first one.
+    // NOTE(review): this presumes LLVMFlattenInsertChain() always fills in
+    // elements[0]; confirm what it stores when the chain never writes
+    // element 0.
+    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
+    if (ie != NULL) {
+        llvm::Value *elements[ISPC_MAX_NVEC];
+        LLVMFlattenInsertChain(ie, vt->getNumElements(), elements);
+        return elements[0];
+    }
+
+    // Worst case, for everything else, just do a regular extract element
+    return llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
+                                            insertBefore);
+}
+
+
+/** See the declaration in llvmutil.h.  Each call starts from an empty map
+    of scalarized phi nodes.
+ */
+llvm::Value *
+LLVMExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore) {
+    std::map<llvm::PHINode *, llvm::PHINode *> phiMap;
+    return lExtractFirstVectorElement(v, insertBefore, phiMap);
+}
+
+
 /** Given two vectors of the same type, concatenate them into a vector that
     has twice as many elements, where the first half has the elements from
     the first vector and the second half has the elements from the second
diff --git a/llvmutil.h b/llvmutil.h
index 1f696904..41f98d96 100644
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -239,6 +239,17 @@ void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
     on. */
 extern void LLVMDumpValue(llvm::Value *v);
 
+/** Given a vector-typed value, this function returns the value of its
+    first element.  Rather than always using a single extractelement
+    instruction to do this, it tries to rewrite the computation of the
+    first element in scalar form (looking through binary operators, casts,
+    phi nodes, and insertelement chains); this is generally more efficient
+    than computing the entire vector's worth of values when only the first
+    element's value is needed.  The new instructions are generally inserted
+    before insertBefore; any new phi nodes go in the original phi's block. */
+extern llvm::Value *LLVMExtractFirstVectorElement(llvm::Value *v, 
+                                              llvm::Instruction *insertBefore);
+
 /** This function takes two vectors, expected to be the same length, and
     returns a new vector of twice the length that represents concatenating
     the two of them. */
diff --git a/opt.cpp b/opt.cpp
index f4adc6bd..e2efe9a5 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2531,9 +2531,11 @@ struct GatherImpInfo {
 static llvm::Value *
 lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
                       llvm::Instruction *insertBefore) {
-    llvm::Value *firstOffset = 
-        llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
-                                         insertBefore);
+    // Only the first element of the offsets vector is used here, so ask
+    // for just that one; LLVMExtractFirstVectorElement() tries to compute
+    // it in scalar form rather than extracting it from the full vector.
+    llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets,
+                                                             insertBefore);
     return lGEPInst(base, firstOffset, "ptr", insertBefore);
 }
 
@@ -3524,9 +3523,8 @@ lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
     // All of the variable offsets values should be the same, due to
     // checking for this in GatherCoalescePass::runOnBasicBlock().  Thus,
     // extract the first value and use that as a scalar.
-    llvm::Value *variable = 
-        llvm::ExtractElementInst::Create(variableOffsets, LLVMInt32(0),
-                                         "variable0", insertBefore);
+    llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets,
+                                                          insertBefore);
     if (variable->getType() == LLVMTypes::Int64Type)
         offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type,
                                          "scale_to64", insertBefore);