diff --git a/.gitignore b/.gitignore index f3d74a9a..4df2d277 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ *.pyc *~ +depend +ispc +ispc_test +objs diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index 1a2eefe5..48cfca42 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h) } -// Allocate memory with 64-byte alignment. -float * -AllocAligned(int size) { -#if defined(_WIN32) || defined(_WIN64) - return (float *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (float *)amem; -#else - return (float *)memalign(64, size); -#endif -} - - int main(int argc, char **argv) { if (argc != 4) { @@ -136,8 +118,8 @@ int main(int argc, char **argv) } // Allocate space for output images - img = (unsigned char *)AllocAligned(width * height * 3); - fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + img = new unsigned char[width * height * 3]; + fimg = new float[width * height * 3]; // // Run the ispc path, test_iterations times, and report the minimum diff --git a/examples/aobench_instrumented/ao.cpp b/examples/aobench_instrumented/ao.cpp index 742a0862..5037bdc4 100644 --- a/examples/aobench_instrumented/ao.cpp +++ b/examples/aobench_instrumented/ao.cpp @@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h) } -// Allocate memory with 64-byte alignment. -float * -AllocAligned(int size) { -#if defined(_WIN32) || defined(_WIN64) - return (float *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (float *)amem; -#else - return (float *)memalign(64, size); -#endif -} - - int main(int argc, char **argv) { if (argc != 4) { @@ -135,8 +117,8 @@ int main(int argc, char **argv) } // Allocate space for output images - img = (unsigned char *)AllocAligned(width * height * 3); - fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + img = new unsigned char[width * height * 3]; + fimg = new float[width * height * 3]; ao_ispc(width, height, NSUBSAMPLES, fimg); diff --git a/examples/mandelbrot/.gitignore b/examples/mandelbrot/.gitignore new file mode 100644 index 00000000..8b48e0db --- /dev/null +++ b/examples/mandelbrot/.gitignore @@ -0,0 +1,3 @@ +mandelbrot +*.ppm +objs diff --git a/examples/options/options.cpp b/examples/options/options.cpp index 241b32be..86c55dae 100644 --- a/examples/options/options.cpp +++ b/examples/options/options.cpp @@ -37,9 +37,6 @@ #include #include #include -#ifndef __APPLE__ -#include -#endif // !__APPLE__ using std::max; #include "options_defs.h" @@ -48,23 +45,6 @@ using std::max; #include "options_ispc.h" using namespace ispc; -// Allocate memory with 64-byte alignment. -float *AllocFloats(int count) { - int size = count * sizeof(float); -#if defined(_WIN32) || defined(_WIN64) - return (float *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (float *)amem; -#else - return (float *)memalign(64, size); -#endif -} - extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], float ra[], float va[], float result[], int count); @@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[], int main() { // Pointers passed to ispc code must have alignment of the target's // vector width at minimum. - float *S = AllocFloats(N_OPTIONS); - float *X = AllocFloats(N_OPTIONS); - float *T = AllocFloats(N_OPTIONS); - float *r = AllocFloats(N_OPTIONS); - float *v = AllocFloats(N_OPTIONS); - float *result = AllocFloats(N_OPTIONS); + float *S = new float[N_OPTIONS]; + float *X = new float[N_OPTIONS]; + float *T = new float[N_OPTIONS]; + float *r = new float[N_OPTIONS]; + float *v = new float[N_OPTIONS]; + float *result = new float[N_OPTIONS]; for (int i = 0; i < N_OPTIONS; ++i) { S[i] = 100; // stock price diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp index e589bd94..d7a19285 100644 --- a/examples/rt/rt.cpp +++ b/examples/rt/rt.cpp @@ -43,9 +43,6 @@ #include #include #include -#ifndef __APPLE__ -#include -#endif #include "../timing.h" #include "rt_ispc.h" @@ -53,23 +50,6 @@ using namespace ispc; typedef unsigned int uint; -template -T *AllocAligned(int count) { - int size = count * sizeof(T); -#if defined(_WIN32) || defined(_WIN64) - return (T *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (T *)amem; -#else - return (T *)memalign(64, size); -#endif -} - extern void raytrace_serial(int width, int height, const float raster2camera[4][4], const float camera2world[4][4], float image[], int id[], const LinearBVHNode nodes[], @@ -161,7 +141,7 @@ int main(int argc, char *argv[]) { uint nNodes; READ(nNodes, 1); - LinearBVHNode *nodes = AllocAligned(nNodes); + LinearBVHNode *nodes = new LinearBVHNode[nNodes]; for (unsigned int i = 0; i < nNodes; ++i) { // Each node is 6x floats for a boox, then an integer for an offset // to the second child node, then an integer that encodes the type @@ -181,7 +161,7 @@ int main(int argc, char *argv[]) { // And then read the triangles uint nTris; READ(nTris, 1); - Triangle *triangles = AllocAligned(nTris); + Triangle *triangles = new Triangle[nTris]; for (uint i = 0; i < nTris; ++i) { // 9x floats for the 3 vertices float v[9]; diff --git a/examples/simple/.gitignore b/examples/simple/.gitignore new file mode 100644 index 00000000..3a3d5f0a --- /dev/null +++ b/examples/simple/.gitignore @@ -0,0 +1,2 @@ +simple +objs diff --git a/stdlib-avx.ll b/stdlib-avx.ll index 5ad79adf..fff3719f 100644 --- a/stdlib-avx.ll +++ b/stdlib-avx.ll @@ -513,7 +513,7 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>, define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, <8 x i32>) nounwind alwaysinline { %mask_as_float = bitcast <8 x i32> %2 to <8 x float> - %oldValue = load <8 x i32>* %0 + %oldValue = load <8 x i32>* %0, align 4 %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float> %newAsFloat = bitcast <8 x i32> %1 to <8 x float> %blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat, diff --git a/stdlib-sse2.ll b/stdlib-sse2.ll index a67584f9..c37fdfb5 100644 --- a/stdlib-sse2.ll +++ b/stdlib-sse2.ll @@ -278,7 +278,7 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32> %mask) nounwind alwaysinline { - %val = load <4 x i32> * %0 + %val = load <4 x i32> * %0, align 4 %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask) store <4 x i32> %newval, <4 x i32> * %0, align 4 ret void @@ -286,7 +286,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, <4 x i32> %mask) nounwind alwaysinline { - %oldValue = load <4 x i64>* %ptr + %oldValue = load <4 x i64>* %ptr, align 8 ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values ; are actually bitcast <2 x i64> values diff --git a/stdlib-sse4.ll b/stdlib-sse4.ll index 68b8dd90..30b6f43b 100644 --- a/stdlib-sse4.ll +++ b/stdlib-sse4.ll @@ -188,7 +188,7 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32> %mask) nounwind alwaysinline { %mask_as_float = bitcast <4 x i32> %mask to <4 x float> - %oldValue = load <4 x i32>* %0 + %oldValue = load <4 x i32>* %0, align 4 %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> %newAsFloat = bitcast <4 x i32> %1 to <4 x float> %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, @@ -202,7 +202,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, <4 x i32> %i32mask) nounwind alwaysinline { - %oldValue = load <4 x i64>* %ptr + %oldValue = load <4 x i64>* %ptr, align 8 %mask = bitcast <4 x i32> %i32mask to <4 x float> ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values diff --git a/stdlib-sse4x2.ll b/stdlib-sse4x2.ll index 39410eca..009c1c5b 100644 --- a/stdlib-sse4x2.ll +++ b/stdlib-sse4x2.ll @@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, <4 x i32> %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef, <4 x i32> - %oldValue = load <8 x i32>* %0 + %oldValue = load <8 x i32>* %0, align 4 %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float> %newAsFloat = bitcast <8 x i32> %1 to <8 x float> %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef, @@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, %mask_as_float = bitcast <8 x i32> %mask to <8 x float> - %old = load <8 x i64>* %ptr + %old = load <8 x i64>* %ptr, align 8 ; set up the first two 64-bit values %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> diff --git a/stdlib.m4 b/stdlib.m4 index b437ec19..500d183c 100644 --- a/stdlib.m4 +++ b/stdlib.m4 @@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw %ptr16 = bitcast [0 x i32] *%0 to i16 * %ptr = getelementptr i16 * %ptr16, i32 %offset %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) * - %val = load i`'eval(16*$1) * %ptr64, align 1 + %val = load i`'eval(16*$1) * %ptr64, align 2 %vval = bitcast i`'eval(16*$1) %val to <$1 x i16> ; unsigned, so use zero-extent... @@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32, %oldmasked = and i`'eval(8*$1) %old, %notmask %newmasked = and i`'eval(8*$1) %val64, %mask64 %final = or i`'eval(8*$1) %oldmasked, %newmasked - store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64 + store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1 ret void } @@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32 %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) * ;; as above, use mask to do blending with logical ops... - %old = load i`'eval(16*$1) * %ptr64, align 1 + %old = load i`'eval(16*$1) * %ptr64, align 2 %oldmasked = and i`'eval(16*$1) %old, %notmask %newmasked = and i`'eval(16*$1) %val64, %mask64 %final = or i`'eval(16*$1) %oldmasked, %newmasked - store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64 + store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2 ret void }