Merge branch 'master' of /Users/mmp/git/ispc

Matt Pharr
2011-06-24 05:11:06 -07:00
13 changed files with 84 additions and 181 deletions

ctx.cpp

@@ -1315,8 +1315,21 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
     if (llvm::isa<const llvm::PointerType>(lvalue->getType())) {
         // If the lvalue is a straight up regular pointer, then just issue
-        // a regular load
-        llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load", bblock);
+        // a regular load.  First figure out the alignment; in general we
+        // can just assume the natural alignment (0 here), but for varying
+        // atomic types, we need to make sure that the compiler emits
+        // unaligned vector loads, so we specify a reduced alignment here.
+        int align = 0;
+        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
+        if (atomicType != NULL && atomicType->IsVaryingType())
+            // We actually just want to align to the vector element
+            // alignment, but can't easily get that here, so just tell LLVM
+            // it's totally unaligned.  (This shouldn't make any difference
+            // vs the proper alignment in practice.)
+            align = 1;
+        llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load",
+                                                     false /* not volatile */,
+                                                     align, bblock);
         AddDebugPos(inst);
         return inst;
     }
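
For reference, a minimal sketch of the pattern this hunk adopts. The helper name and surrounding setup below are illustrative (not part of the commit); it assumes the LLVM 2.x-era LoadInst constructor that takes an explicit alignment, where 0 means natural alignment and 1 effectively means unaligned:

    // Sketch only; header paths follow LLVM 2.x conventions.
    #include "llvm/Instructions.h"
    #include "llvm/BasicBlock.h"

    static llvm::Instruction *
    EmitLoadWithAlignment(llvm::Value *ptr, bool isVaryingAtomic,
                          llvm::BasicBlock *insertAtEnd) {
        // 0 = use the type's natural alignment; 1 = the pointer may be
        // arbitrarily aligned, so an unaligned vector load must be emitted.
        unsigned align = isVaryingAtomic ? 1 : 0;
        return new llvm::LoadInst(ptr, "load", /* isVolatile */ false,
                                  align, insertAtEnd);
    }
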
@@ -1644,7 +1657,16 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
         return;
     }
-    llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+    llvm::Instruction *inst;
+    if (llvm::isa<llvm::VectorType>(rvalue->getType()))
+        // Specify an unaligned store, since we don't know that the lvalue
+        // will in fact be aligned to a vector width here.  (Actually
+        // should be aligned to the alignment of the vector elment type...)
+        inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                   1, bblock);
+    else
+        inst = new llvm::StoreInst(rvalue, lvalue, bblock);
     AddDebugPos(inst);
 }
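
For intuition about why alignment 1 is specified for vector-typed stores (and for varying loads above): the natural alignment of the vector type (e.g. 16 bytes for SSE) can exceed the alignment of the memory actually being accessed. A small illustrative C++ analogy using SSE intrinsics (not part of the commit; the helper names are hypothetical):

    #include <xmmintrin.h>

    // p is only guaranteed to be float-aligned (4 bytes), so the unaligned
    // forms must be used; the aligned forms fault if p isn't 16-byte aligned.
    static __m128 load4(const float *p)      { return _mm_loadu_ps(p); }
    static void   store4(float *p, __m128 v) { _mm_storeu_ps(p, v); }
    // The aligned equivalents (_mm_load_ps / _mm_store_ps) correspond to
    // "align 16" in the IR and are only safe with suitably aligned pointers.
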
@@ -1661,8 +1683,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
     // Figure out what kind of store we're doing here
     if (rvalueType->IsUniformType()) {
-        // The easy case; a regular store
-        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+        // The easy case; a regular store, natural alignment is fine
+        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock);
         AddDebugPos(si);
     }
     else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
@@ -1672,9 +1694,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
     else if (storeMask == LLVMMaskAllOn) {
         // Otherwise it is a masked store unless we can determine that the
         // mask is all on...
-        llvm::Instruction *si =
-            new llvm::StoreInst(rvalue, lvalue, name, bblock);
-        AddDebugPos(si);
+        StoreInst(rvalue, lvalue, name);
     }
     else
         maskedStore(rvalue, lvalue, rvalueType, storeMask);


@@ -1970,7 +1970,7 @@ Data Layout
 In general, ``ispc`` tries to ensure that ``struct`` s and other complex
 datatypes are laid out in the same way in memory as they are in C/C++.
-Matching alignment is important for easy interoperability between C/C++
+Matching structure layout is important for easy interoperability between C/C++
 code and ``ispc`` code.
 
 The main complexity in sharing data between ``ispc`` and C/C++ often comes
@@ -2023,11 +2023,6 @@ It can pass ``array`` to a ``ispc`` function defined as:
     export void foo(uniform float array[], uniform int count)
 
-(Though the pointer must be aligned to the compilation target's natural
-vector width; see the discussion of alignment restrictions in `Data
-Alignment and Aliasing`_ and the aligned allocation routines in
-``examples/options/options.cpp`` for example.)
-
 Similarly, ``struct`` s from the application can have embedded pointers.
 This is handled with similar ``[]`` syntax:
@@ -2062,55 +2057,20 @@ vector types from C/C++ application code if possible.
 Data Alignment and Aliasing
 ---------------------------
 
-There are two important constraints that must be adhered to when passing
-pointers from the application to ``ispc`` programs.
+There are are two important constraints that must be adhered to when
+passing pointers from the application to ``ispc`` programs.
 
-The first constraint is alignment: any pointers from the host program that
-are passed to ``ispc`` must be aligned to natural vector alignment of
-system--for example, 16 byte alignment on a target that supports Intel®
-SSE, 32-byte on an Intel® AVX target.  If this constraint isn't met, the
-program may abort at runtime with an unaligned memory access error.
-
-For example, in a ``ispc`` function with the following declaration:
-
-::
-
-    export void foo(uniform float in[], uniform float out[],
-                    int count);
-
-If the application is passing stack-allocated arrays for ``in`` and
-``out``, these C/C++ compiler must be told to align these arrays.
-
-::
-
-    // MSVC, SSE target
-    __declspec(align(16)) float in[16], out[16];
-    foo(in, out, 16);
-
-With the gcc/clang compilers, the syntax for providing alignment is
-slightly different:
-
-::
-
-    float x[16] __attribute__ ((__align__(16)));
-    foo(in, out, 16);
-
-If the data being passed is dynamically allocated, the appropriate system
-aligned memory allocation routine should be used to allocate it (for
-example, ``_aligned_malloc()`` with Windows\*, ``memalign()`` with
-Linux\*; see the ``AllocAligned()`` function in ``examples/rt/rt.cpp`` for
-an example.)
-
-It is also required that it be valid to read memory at the first element of
-any array that is passed to ``ispc``.  In practice, this should just
-happen naturally, but it does mean that it is illegal to pass a ``NULL``
-pointer as a parameter to a ``ispc`` function called from the application.
-
-The second key constraint is that pointers and references in ``ispc``
-programs must not alias.  The ``ispc`` compiler assumes that different
-pointers can't end up pointing to the same memory location, either due to
-having the same initial value, or through array indexing in the program as
-it executed.
+The first is that it is required that it be valid to read memory at the
+first element of any array that is passed to ``ispc``.  In practice, this
+should just happen naturally, but it does mean that it is illegal to pass a
+``NULL`` pointer as a parameter to a ``ispc`` function called from the
+application.
+
+The second constraint is that pointers and references in ``ispc`` programs
+must not alias.  The ``ispc`` compiler assumes that different pointers
+can't end up pointing to the same memory location, either due to having the
+same initial value, or through array indexing in the program as it
+executed.
 
 This aliasing constraint also applies to ``reference`` parameters to
 functions.  Given a function like:
@@ -2127,8 +2087,8 @@ another case of aliasing, and if the caller calls the function as ``func(x,
 x)``, it's not guaranteed that the ``if`` test will evaluate to true, due
 to the compiler's requirement of no aliasing.
 
-(In the future, ``ispc`` will have the ability to work with unaligned
-memory as well as have a mechanism to indicate that pointers may alias.)
+(In the future, ``ispc`` will have a mechanism to indicate that pointers
+may alias.)
 
 Using ISPC Effectively
 ======================
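
To illustrate the net effect of this documentation change on application code, here is a small hedged C++ sketch; the ``foo()`` signature mirrors the export declaration used earlier in this file, and the exact generated header/namespace is not shown in this diff:

    // After this change: no aligned allocation is needed for arrays passed
    // to ispc code.  The remaining requirements are that the pointers be
    // non-NULL and that they not alias each other.
    extern "C" void foo(float in[], float out[], int count);  // ispc export

    int main() {
        const int count = 1024;
        float *in  = new float[count];   // plain new[] is now sufficient
        float *out = new float[count];   // distinct buffer: no aliasing
        for (int i = 0; i < count; ++i)
            in[i] = float(i);
        foo(in, out, count);             // must not be called with NULL
        delete[] in;
        delete[] out;
        return 0;
    }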


@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
 }
 
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
     }
 
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
 
     //
     // Run the ispc path, test_iterations times, and report the minimum


@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
 }
 
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
     }
 
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
 
     ao_ispc(width, height, NSUBSAMPLES, fimg);


@@ -37,9 +37,6 @@
 #include <assert.h>
 #include <math.h>
 #include <algorithm>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif // !__APPLE__
 using std::max;
 
 #include "options_defs.h"
@@ -48,23 +45,6 @@ using std::max;
#include "options_ispc.h" #include "options_ispc.h"
using namespace ispc; using namespace ispc;
// Allocate memory with 64-byte alignment.
float *AllocFloats(int count) {
int size = count * sizeof(float);
#if defined(_WIN32) || defined(_WIN64)
return (float *)_aligned_malloc(size, 64);
#elif defined (__APPLE__)
// Allocate excess memory to ensure an aligned pointer can be returned
void *mem = malloc(size + (64-1) + sizeof(void*));
char *amem = ((char*)mem) + sizeof(void*);
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
((void**)amem)[-1] = mem;
return (float *)amem;
#else
return (float *)memalign(64, size);
#endif
}
extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], extern void black_scholes_serial(float Sa[], float Xa[], float Ta[],
float ra[], float va[], float ra[], float va[],
float result[], int count); float result[], int count);
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
 int main() {
     // Pointers passed to ispc code must have alignment of the target's
     // vector width at minimum.
-    float *S = AllocFloats(N_OPTIONS);
-    float *X = AllocFloats(N_OPTIONS);
-    float *T = AllocFloats(N_OPTIONS);
-    float *r = AllocFloats(N_OPTIONS);
-    float *v = AllocFloats(N_OPTIONS);
-    float *result = AllocFloats(N_OPTIONS);
+    float *S = new float[N_OPTIONS];
+    float *X = new float[N_OPTIONS];
+    float *T = new float[N_OPTIONS];
+    float *r = new float[N_OPTIONS];
+    float *v = new float[N_OPTIONS];
+    float *result = new float[N_OPTIONS];
 
     for (int i = 0; i < N_OPTIONS; ++i) {
         S[i] = 100;  // stock price


@@ -43,9 +43,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <sys/types.h>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif
 
 #include "../timing.h"
 #include "rt_ispc.h"
@@ -53,23 +50,6 @@ using namespace ispc;
 typedef unsigned int uint;
 
-template <typename T>
-T *AllocAligned(int count) {
-    int size = count * sizeof(T);
-#if defined(_WIN32) || defined(_WIN64)
-    return (T *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (T *)amem;
-#else
-    return (T *)memalign(64, size);
-#endif
-}
-
 extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
                             const float camera2world[4][4], float image[],
                             int id[], const LinearBVHNode nodes[],
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
     uint nNodes;
     READ(nNodes, 1);
-    LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
+    LinearBVHNode *nodes = new LinearBVHNode[nNodes];
     for (unsigned int i = 0; i < nNodes; ++i) {
         // Each node is 6x floats for a boox, then an integer for an offset
         // to the second child node, then an integer that encodes the type
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
     // And then read the triangles
     uint nTris;
     READ(nTris, 1);
-    Triangle *triangles = AllocAligned<Triangle>(nTris);
+    Triangle *triangles = new Triangle[nTris];
     for (uint i = 0; i < nTris; ++i) {
         // 9x floats for the 3 vertices
         float v[9];


@@ -38,15 +38,7 @@
 using namespace ispc;
 
 int main() {
-    // Pointers passed to ispc-compiled code are currently required to have
-    // alignment equal to the target's native vector size.  Here we align
-    // to 32 bytes to be safe for both SSE and AVX targets.
-#ifdef _MSC_VER
-    __declspec(align(32)) float vin[16], vout[16];
-#else
-    float vin[16] __attribute__((aligned(32)));
-    float vout[16] __attribute__((aligned(32)));
-#endif
+    float vin[16], vout[16];
 
     // Initialize input buffer
     for (int i = 0; i < 16; ++i)

opt.cpp

@@ -1131,10 +1131,17 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
             }
             else if (maskAsInt == allOnMask) {
                 // The mask is all on, so turn this into a regular store
-                const llvm::Type *ptrType = llvm::PointerType::get(rvalue->getType(), 0);
+                const llvm::Type *rvalueType = rvalue->getType();
+                const llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
+
+                // Need to update this when int8/int16 are added
+                int align = (called == pms32Func || called == pms64Func ||
+                             called == msb32Func) ? 4 : 8;
+
                 lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
                 lCopyMetadata(lvalue, callInst);
-                llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue);
+                llvm::Instruction *store =
+                    new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                        align);
                 lCopyMetadata(store, callInst);
                 llvm::ReplaceInstWithInst(callInst, store);
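
For clarity, a scalar C++ model of the semantics this pass relies on (illustrative only; the names below are not from the commit): when the mask is known to be all-on at compile time, the masked-store call collapses to a plain store, and the store's alignment is taken from the element size (4 bytes for the 32-bit variants, 8 for the 64-bit one):

    #include <stdint.h>

    // Scalar model of a masked store of 32-bit values, 'width' lanes wide.
    static void masked_store_32(int32_t *ptr, const int32_t *val,
                                const int32_t *mask, int width) {
        bool allOn = true;
        for (int i = 0; i < width; ++i)
            allOn = allOn && (mask[i] != 0);
        if (allOn) {
            // The case the pass recognizes: a regular (4-byte aligned) store.
            for (int i = 0; i < width; ++i)
                ptr[i] = val[i];
            return;
        }
        // General case: only lanes with the mask on are written.
        for (int i = 0; i < width; ++i)
            if (mask[i] != 0)
                ptr[i] = val[i];
    }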


@@ -513,14 +513,14 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32>) nounwind alwaysinline {
   %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
                                                    <8 x float> %newAsFloat,
                                                    <8 x float> %mask_as_float)
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
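
For readers unfamiliar with the blend-based masked store, a scalar C++ sketch of what __masked_store_blend_32 computes (illustrative only; the real code operates on whole <8 x i32> vectors, which is why the full-width load and store above now carry a conservative "align 4"):

    #include <stdint.h>

    // Keep old lanes where the mask is off, take the new value where it is
    // on, then write the whole vector back.
    static void masked_store_blend_32(int32_t ptr[8], const int32_t newVal[8],
                                      const int32_t mask[8]) {
        int32_t blended[8];
        for (int i = 0; i < 8; ++i)
            blended[i] = mask[i] ? newVal[i] : ptr[i];  // load + blendvps
        for (int i = 0; i < 8; ++i)
            ptr[i] = blended[i];                        // store all 8 lanes
    }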


@@ -278,15 +278,15 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %val = load <4 x i32> * %0
+  %val = load <4 x i32> * %0, align 4
   %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
-  store <4 x i32> %newval, <4 x i32> * %0
+  store <4 x i32> %newval, <4 x i32> * %0, align 4
   ret void
 }
 
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
 
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
   ; are actually bitcast <2 x i64> values
@@ -322,7 +322,7 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }


@@ -188,21 +188,21 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
   %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0
+  %oldValue = load <4 x i32>* %0, align 4
   %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
   %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
   %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                      <4 x float> %newAsFloat,
                                                      <4 x float> %mask_as_float)
   %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
-  store <4 x i32> %blendAsInt, <4 x i32>* %0
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
   ret void
 }
 
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
   %mask = bitcast <4 x i32> %i32mask to <4 x float>
 
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -243,6 +243,6 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }


@@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
@@ -584,7 +584,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
   %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
@@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
   %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
-  %old = load <8 x i64>* %ptr
+  %old = load <8 x i64>* %ptr, align 8
 
   ; set up the first two 64-bit values
   %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -651,7 +651,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i64> %final, <8 x i64> * %ptr
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }


@@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw
   %ptr16 = bitcast [0 x i32] *%0 to i16 *
   %ptr = getelementptr i16 * %ptr16, i32 %offset
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
-  %val = load i`'eval(16*$1) * %ptr64, align 1
+  %val = load i`'eval(16*$1) * %ptr64, align 2
   %vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
   ; unsigned, so use zero-extent...
@@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
   %oldmasked = and i`'eval(8*$1) %old, %notmask
   %newmasked = and i`'eval(8*$1) %val64, %mask64
   %final = or i`'eval(8*$1) %oldmasked, %newmasked
-  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64
+  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
   ret void
 }
@@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
 
   ;; as above, use mask to do blending with logical ops...
-  %old = load i`'eval(16*$1) * %ptr64, align 1
+  %old = load i`'eval(16*$1) * %ptr64, align 2
   %oldmasked = and i`'eval(16*$1) %old, %notmask
   %newmasked = and i`'eval(16*$1) %val64, %mask64
   %final = or i`'eval(16*$1) %oldmasked, %newmasked
-  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64
+  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
   ret void
 }
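
A small hedged C++ sketch of the and/or blend used by __store_uint16 above (the helper name is illustrative), and of why the load/store alignment becomes 2 rather than 1: the packed value is addressed through 16-bit elements, whose natural alignment is 2 bytes:

    #include <stdint.h>

    // Keep old bits where the mask is off, take new bits where it is on;
    // this models the and/and/or sequence in the IR above for one lane.
    static uint16_t blend16(uint16_t oldv, uint16_t newv, uint16_t mask) {
        return (uint16_t)((oldv & (uint16_t)~mask) | (newv & mask));
    }
    // uint16 data is only guaranteed 2-byte alignment, hence "align 2";
    // the uint8 path keeps "align 1" for the same reason.
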
@@ -544,7 +544,7 @@ all_on:
   ;; vector load
   %vecptr = bitcast i32 *%startptr to <$1 x i32> *
   %vec_load = load <$1 x i32> *%vecptr, align 4
-  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr
+  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
   ret i32 $1
 
 not_all_on: