diff --git a/ctx.cpp b/ctx.cpp
index 27e2f0f4..9c72fd3d 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1644,7 +1644,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
         return;
     }
 
-    llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+    llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                                  4, bblock);
     AddDebugPos(inst);
 }
 
@@ -1662,7 +1663,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
     // Figure out what kind of store we're doing here
     if (rvalueType->IsUniformType()) {
         // The easy case; a regular store
-        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                                    4, bblock);
         AddDebugPos(si);
     }
     else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
@@ -1673,7 +1675,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
         // Otherwise it is a masked store unless we can determine that the
         // mask is all on...
         llvm::Instruction *si = 
-            new llvm::StoreInst(rvalue, lvalue, name, bblock);
+            new llvm::StoreInst(rvalue, lvalue, false /*not volatile*/, 4, bblock);
         AddDebugPos(si);
     }
     else
diff --git a/opt.cpp b/opt.cpp
index 583e8324..69e75247 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -1131,10 +1131,17 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
         else if (maskAsInt == allOnMask) {
             // The mask is all on, so turn this into a regular store
-            const llvm::Type *ptrType = llvm::PointerType::get(rvalue->getType(), 0);
+            const llvm::Type *rvalueType = rvalue->getType();
+            const llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
+            // Store alignment in bytes; update when int8/int16 are added. NOTE(review): pms64Func gets 4 and unlisted callees get 8 -- confirm.
+            int align = (called == pms32Func || called == pms64Func ||
+                         called == msb32Func) ? 4 : 8;
+
             lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
             lCopyMetadata(lvalue, callInst);
-            llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue);
+            llvm::Instruction *store = 
+                new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                    align);
             lCopyMetadata(store, callInst);
             llvm::ReplaceInstWithInst(callInst, store);
 
diff --git a/stdlib-avx.ll b/stdlib-avx.ll
index 3d125a7e..5ad79adf 100644
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -520,7 +520,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                                    <8 x float> %newAsFloat,
                                                    <8 x float> %mask_as_float)
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
 
diff --git a/stdlib-sse2.ll b/stdlib-sse2.ll
index 654e81f1..a67584f9 100644
--- a/stdlib-sse2.ll
+++ b/stdlib-sse2.ll
@@ -280,7 +280,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
   %val = load <4 x i32> * %0
   %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask) 
-  store <4 x i32> %newval, <4 x i32> * %0
+  store <4 x i32> %newval, <4 x i32> * %0, align 4
   ret void
 }
 
@@ -322,7 +322,7 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }
 
diff --git a/stdlib-sse4.ll b/stdlib-sse4.ll
index f28dc35d..68b8dd90 100644
--- a/stdlib-sse4.ll
+++ b/stdlib-sse4.ll
@@ -195,7 +195,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                                      <4 x float> %newAsFloat,
                                                      <4 x float> %mask_as_float)
   %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
-  store <4 x i32> %blendAsInt, <4 x i32>* %0
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
   ret void
 }
 
@@ -243,6 +243,6 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }
diff --git a/stdlib-sse4x2.ll b/stdlib-sse4x2.ll
index c97fd8ce..39410eca 100644
--- a/stdlib-sse4x2.ll
+++ b/stdlib-sse4x2.ll
@@ -584,7 +584,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
   %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
                <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
 
@@ -651,7 +651,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i64> %final, <8 x i64> * %ptr
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }
 
diff --git a/stdlib.m4 b/stdlib.m4
index b098e131..b437ec19 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -544,7 +544,7 @@ all_on:
   ;; vector load
   %vecptr = bitcast i32 *%startptr to <$1 x i32> *
   %vec_load = load <$1 x i32> *%vecptr, align 4
-  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr
+  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
   ret i32 $1
 
 not_all_on: