diff --git a/.gitignore b/.gitignore
index f3d74a9a..4df2d277 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
 *.pyc
 *~
+depend
+ispc
+ispc_test
+objs
diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp
index 1a2eefe5..48cfca42 100644
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
 }
 
 
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
-
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
     }
 
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
 
     //
     // Run the ispc path, test_iterations times, and report the minimum
diff --git a/examples/aobench_instrumented/ao.cpp b/examples/aobench_instrumented/ao.cpp
index 742a0862..5037bdc4 100644
--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
 }
 
 
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
-
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
     }
 
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
 
     ao_ispc(width, height, NSUBSAMPLES, fimg);
 
diff --git a/examples/mandelbrot/.gitignore b/examples/mandelbrot/.gitignore
new file mode 100644
index 00000000..8b48e0db
--- /dev/null
+++ b/examples/mandelbrot/.gitignore
@@ -0,0 +1,3 @@
+mandelbrot
+*.ppm
+objs
diff --git a/examples/options/options.cpp b/examples/options/options.cpp
index 241b32be..86c55dae 100644
--- a/examples/options/options.cpp
+++ b/examples/options/options.cpp
@@ -37,9 +37,6 @@
 #include <assert.h>
 #include <math.h>
 #include <algorithm>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif // !__APPLE__
 using std::max;
 
 #include "options_defs.h"
@@ -48,23 +45,6 @@ using std::max;
 #include "options_ispc.h"
 using namespace ispc;
 
-// Allocate memory with 64-byte alignment.
-float *AllocFloats(int count) {
-    int size = count * sizeof(float);
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
 extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], 
                                  float ra[], float va[], 
                                  float result[], int count);
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
 int main() {
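-    // Pointers passed to ispc code must have alignment of the target's
-    // vector width at minimum.
+    // These buffers come from plain new[], so they carry only the default
+    // allocator alignment.  NOTE(review): confirm ispc needs no more than that.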
-    float *S = AllocFloats(N_OPTIONS);
-    float *X = AllocFloats(N_OPTIONS);
-    float *T = AllocFloats(N_OPTIONS);
-    float *r = AllocFloats(N_OPTIONS);
-    float *v = AllocFloats(N_OPTIONS);
-    float *result = AllocFloats(N_OPTIONS);
+    float *S = new float[N_OPTIONS];
+    float *X = new float[N_OPTIONS];
+    float *T = new float[N_OPTIONS];
+    float *r = new float[N_OPTIONS];
+    float *v = new float[N_OPTIONS];
+    float *result = new float[N_OPTIONS];
 
     for (int i = 0; i < N_OPTIONS; ++i) {
         S[i] = 100;  // stock price
diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp
index e589bd94..d7a19285 100644
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -43,9 +43,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <sys/types.h>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif
 #include "../timing.h"
 #include "rt_ispc.h"
 
@@ -53,23 +50,6 @@ using namespace ispc;
 
 typedef unsigned int uint;
 
-template <typename T> 
-T *AllocAligned(int count) {
-    int size = count * sizeof(T);
-#if defined(_WIN32) || defined(_WIN64)
-    return (T *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (T *)amem;
-#else
-    return (T *)memalign(64, size);
-#endif
-}
-
 extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
                             const float camera2world[4][4], float image[],
                             int id[], const LinearBVHNode nodes[],
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
     uint nNodes;
     READ(nNodes, 1);
 
-    LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
+    LinearBVHNode *nodes = new LinearBVHNode[nNodes];
     for (unsigned int i = 0; i < nNodes; ++i) {
-        // Each node is 6x floats for a boox, then an integer for an offset
+        // Each node is 6x floats for a box, then an integer for an offset
         // to the second child node, then an integer that encodes the type
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
     // And then read the triangles 
     uint nTris;
     READ(nTris, 1);
-    Triangle *triangles = AllocAligned<Triangle>(nTris);
+    Triangle *triangles = new Triangle[nTris];
     for (uint i = 0; i < nTris; ++i) {
         // 9x floats for the 3 vertices
         float v[9];
diff --git a/examples/simple/.gitignore b/examples/simple/.gitignore
new file mode 100644
index 00000000..3a3d5f0a
--- /dev/null
+++ b/examples/simple/.gitignore
@@ -0,0 +1,2 @@
+simple
+objs
diff --git a/stdlib-avx.ll b/stdlib-avx.ll
index 5ad79adf..fff3719f 100644
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -513,7 +513,7 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                            <8 x i32>) nounwind alwaysinline {
   %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
diff --git a/stdlib-sse2.ll b/stdlib-sse2.ll
index a67584f9..c37fdfb5 100644
--- a/stdlib-sse2.ll
+++ b/stdlib-sse2.ll
@@ -278,7 +278,7 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa
 
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, 
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %val = load <4 x i32> * %0
+  %val = load <4 x i32> * %0, align 4
   %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask) 
   store <4 x i32> %newval, <4 x i32> * %0, align 4
   ret void
@@ -286,7 +286,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
 
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
 
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
   ; are actually bitcast <2 x i64> values
diff --git a/stdlib-sse4.ll b/stdlib-sse4.ll
index 68b8dd90..30b6f43b 100644
--- a/stdlib-sse4.ll
+++ b/stdlib-sse4.ll
@@ -188,7 +188,7 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, 
                                      <4 x i32> %mask) nounwind alwaysinline {
   %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0
+  %oldValue = load <4 x i32>* %0, align 4
   %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
   %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
   %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
@@ -202,7 +202,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
 
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
   %mask = bitcast <4 x i32> %i32mask to <4 x float>
 
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
diff --git a/stdlib-sse4x2.ll b/stdlib-sse4x2.ll
index 39410eca..009c1c5b 100644
--- a/stdlib-sse4x2.ll
+++ b/stdlib-sse4x2.ll
@@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                 <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                 <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
@@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 
   %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
 
-  %old = load <8 x i64>* %ptr
+  %old = load <8 x i64>* %ptr, align 8
 
   ; set up the first two 64-bit values
   %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
diff --git a/stdlib.m4 b/stdlib.m4
index b437ec19..500d183c 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw
   %ptr16 = bitcast [0 x i32] *%0 to i16 *
   %ptr = getelementptr i16 * %ptr16, i32 %offset
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
-  %val = load i`'eval(16*$1) * %ptr64, align 1
+  %val = load i`'eval(16*$1) * %ptr64, align 2
 
   %vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
   ; unsigned, so use zero-extent...
@@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
   %oldmasked = and i`'eval(8*$1) %old, %notmask
   %newmasked = and i`'eval(8*$1) %val64, %mask64
   %final = or i`'eval(8*$1) %oldmasked, %newmasked
-  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64
+  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
 
   ret void
 }
@@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
 
   ;; as above, use mask to do blending with logical ops...
-  %old = load i`'eval(16*$1) * %ptr64, align 1
+  %old = load i`'eval(16*$1) * %ptr64, align 2
   %oldmasked = and i`'eval(16*$1) %old, %notmask
   %newmasked = and i`'eval(16*$1) %val64, %mask64
   %final = or i`'eval(16*$1) %oldmasked, %newmasked
-  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64
+  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
 
   ret void
 }