Merge branch 'master' of github.com:ispc/ispc
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,2 +1,6 @@
|
|||||||
*.pyc
|
*.pyc
|
||||||
*~
|
*~
|
||||||
|
depend
|
||||||
|
ispc
|
||||||
|
ispc_test
|
||||||
|
objs
|
||||||
|
|||||||
@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Allocate memory with 64-byte alignment.
|
|
||||||
float *
|
|
||||||
AllocAligned(int size) {
|
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
|
||||||
return (float *)_aligned_malloc(size, 64);
|
|
||||||
#elif defined (__APPLE__)
|
|
||||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
|
||||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
|
||||||
char *amem = ((char*)mem) + sizeof(void*);
|
|
||||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
|
||||||
((void**)amem)[-1] = mem;
|
|
||||||
return (float *)amem;
|
|
||||||
#else
|
|
||||||
return (float *)memalign(64, size);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
if (argc != 4) {
|
if (argc != 4) {
|
||||||
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Allocate space for output images
|
// Allocate space for output images
|
||||||
img = (unsigned char *)AllocAligned(width * height * 3);
|
img = new unsigned char[width * height * 3];
|
||||||
fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
|
fimg = new float[width * height * 3];
|
||||||
|
|
||||||
//
|
//
|
||||||
// Run the ispc path, test_iterations times, and report the minimum
|
// Run the ispc path, test_iterations times, and report the minimum
|
||||||
|
|||||||
@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Allocate memory with 64-byte alignment.
|
|
||||||
float *
|
|
||||||
AllocAligned(int size) {
|
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
|
||||||
return (float *)_aligned_malloc(size, 64);
|
|
||||||
#elif defined (__APPLE__)
|
|
||||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
|
||||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
|
||||||
char *amem = ((char*)mem) + sizeof(void*);
|
|
||||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
|
||||||
((void**)amem)[-1] = mem;
|
|
||||||
return (float *)amem;
|
|
||||||
#else
|
|
||||||
return (float *)memalign(64, size);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
if (argc != 4) {
|
if (argc != 4) {
|
||||||
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Allocate space for output images
|
// Allocate space for output images
|
||||||
img = (unsigned char *)AllocAligned(width * height * 3);
|
img = new unsigned char[width * height * 3];
|
||||||
fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
|
fimg = new float[width * height * 3];
|
||||||
|
|
||||||
ao_ispc(width, height, NSUBSAMPLES, fimg);
|
ao_ispc(width, height, NSUBSAMPLES, fimg);
|
||||||
|
|
||||||
|
|||||||
3
examples/mandelbrot/.gitignore
vendored
Normal file
3
examples/mandelbrot/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
mandelbrot
|
||||||
|
*.ppm
|
||||||
|
objs
|
||||||
@@ -37,9 +37,6 @@
|
|||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#ifndef __APPLE__
|
|
||||||
#include <malloc.h>
|
|
||||||
#endif // !__APPLE__
|
|
||||||
using std::max;
|
using std::max;
|
||||||
|
|
||||||
#include "options_defs.h"
|
#include "options_defs.h"
|
||||||
@@ -48,23 +45,6 @@ using std::max;
|
|||||||
#include "options_ispc.h"
|
#include "options_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
// Allocate memory with 64-byte alignment.
|
|
||||||
float *AllocFloats(int count) {
|
|
||||||
int size = count * sizeof(float);
|
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
|
||||||
return (float *)_aligned_malloc(size, 64);
|
|
||||||
#elif defined (__APPLE__)
|
|
||||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
|
||||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
|
||||||
char *amem = ((char*)mem) + sizeof(void*);
|
|
||||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
|
||||||
((void**)amem)[-1] = mem;
|
|
||||||
return (float *)amem;
|
|
||||||
#else
|
|
||||||
return (float *)memalign(64, size);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
extern void black_scholes_serial(float Sa[], float Xa[], float Ta[],
|
extern void black_scholes_serial(float Sa[], float Xa[], float Ta[],
|
||||||
float ra[], float va[],
|
float ra[], float va[],
|
||||||
float result[], int count);
|
float result[], int count);
|
||||||
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
|||||||
int main() {
|
int main() {
|
||||||
// Pointers passed to ispc code must have alignment of the target's
|
// Pointers passed to ispc code must have alignment of the target's
|
||||||
// vector width at minimum.
|
// vector width at minimum.
|
||||||
float *S = AllocFloats(N_OPTIONS);
|
float *S = new float[N_OPTIONS];
|
||||||
float *X = AllocFloats(N_OPTIONS);
|
float *X = new float[N_OPTIONS];
|
||||||
float *T = AllocFloats(N_OPTIONS);
|
float *T = new float[N_OPTIONS];
|
||||||
float *r = AllocFloats(N_OPTIONS);
|
float *r = new float[N_OPTIONS];
|
||||||
float *v = AllocFloats(N_OPTIONS);
|
float *v = new float[N_OPTIONS];
|
||||||
float *result = AllocFloats(N_OPTIONS);
|
float *result = new float[N_OPTIONS];
|
||||||
|
|
||||||
for (int i = 0; i < N_OPTIONS; ++i) {
|
for (int i = 0; i < N_OPTIONS; ++i) {
|
||||||
S[i] = 100; // stock price
|
S[i] = 100; // stock price
|
||||||
|
|||||||
@@ -43,9 +43,6 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#ifndef __APPLE__
|
|
||||||
#include <malloc.h>
|
|
||||||
#endif
|
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "rt_ispc.h"
|
#include "rt_ispc.h"
|
||||||
|
|
||||||
@@ -53,23 +50,6 @@ using namespace ispc;
|
|||||||
|
|
||||||
typedef unsigned int uint;
|
typedef unsigned int uint;
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
T *AllocAligned(int count) {
|
|
||||||
int size = count * sizeof(T);
|
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
|
||||||
return (T *)_aligned_malloc(size, 64);
|
|
||||||
#elif defined (__APPLE__)
|
|
||||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
|
||||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
|
||||||
char *amem = ((char*)mem) + sizeof(void*);
|
|
||||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
|
||||||
((void**)amem)[-1] = mem;
|
|
||||||
return (T *)amem;
|
|
||||||
#else
|
|
||||||
return (T *)memalign(64, size);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
|
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
|
||||||
const float camera2world[4][4], float image[],
|
const float camera2world[4][4], float image[],
|
||||||
int id[], const LinearBVHNode nodes[],
|
int id[], const LinearBVHNode nodes[],
|
||||||
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
|
|||||||
uint nNodes;
|
uint nNodes;
|
||||||
READ(nNodes, 1);
|
READ(nNodes, 1);
|
||||||
|
|
||||||
LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
|
LinearBVHNode *nodes = new LinearBVHNode[nNodes];
|
||||||
for (unsigned int i = 0; i < nNodes; ++i) {
|
for (unsigned int i = 0; i < nNodes; ++i) {
|
||||||
// Each node is 6x floats for a boox, then an integer for an offset
|
// Each node is 6x floats for a boox, then an integer for an offset
|
||||||
// to the second child node, then an integer that encodes the type
|
// to the second child node, then an integer that encodes the type
|
||||||
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
|
|||||||
// And then read the triangles
|
// And then read the triangles
|
||||||
uint nTris;
|
uint nTris;
|
||||||
READ(nTris, 1);
|
READ(nTris, 1);
|
||||||
Triangle *triangles = AllocAligned<Triangle>(nTris);
|
Triangle *triangles = new Triangle[nTris];
|
||||||
for (uint i = 0; i < nTris; ++i) {
|
for (uint i = 0; i < nTris; ++i) {
|
||||||
// 9x floats for the 3 vertices
|
// 9x floats for the 3 vertices
|
||||||
float v[9];
|
float v[9];
|
||||||
|
|||||||
2
examples/simple/.gitignore
vendored
Normal file
2
examples/simple/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
simple
|
||||||
|
objs
|
||||||
@@ -513,7 +513,7 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
|
|||||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||||
<8 x i32>) nounwind alwaysinline {
|
<8 x i32>) nounwind alwaysinline {
|
||||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||||
%oldValue = load <8 x i32>* %0
|
%oldValue = load <8 x i32>* %0, align 4
|
||||||
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
||||||
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
|
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
|
||||||
%blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
|
%blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
|
||||||
|
|||||||
@@ -278,7 +278,7 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa
|
|||||||
|
|
||||||
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||||
<4 x i32> %mask) nounwind alwaysinline {
|
<4 x i32> %mask) nounwind alwaysinline {
|
||||||
%val = load <4 x i32> * %0
|
%val = load <4 x i32> * %0, align 4
|
||||||
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
|
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
|
||||||
store <4 x i32> %newval, <4 x i32> * %0, align 4
|
store <4 x i32> %newval, <4 x i32> * %0, align 4
|
||||||
ret void
|
ret void
|
||||||
@@ -286,7 +286,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
|||||||
|
|
||||||
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||||
<4 x i32> %mask) nounwind alwaysinline {
|
<4 x i32> %mask) nounwind alwaysinline {
|
||||||
%oldValue = load <4 x i64>* %ptr
|
%oldValue = load <4 x i64>* %ptr, align 8
|
||||||
|
|
||||||
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
||||||
; are actually bitcast <2 x i64> values
|
; are actually bitcast <2 x i64> values
|
||||||
|
|||||||
@@ -188,7 +188,7 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
|||||||
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||||
<4 x i32> %mask) nounwind alwaysinline {
|
<4 x i32> %mask) nounwind alwaysinline {
|
||||||
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
|
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
|
||||||
%oldValue = load <4 x i32>* %0
|
%oldValue = load <4 x i32>* %0, align 4
|
||||||
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
|
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
|
||||||
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
|
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
|
||||||
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
|
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
|
||||||
@@ -202,7 +202,7 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
|||||||
|
|
||||||
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||||
<4 x i32> %i32mask) nounwind alwaysinline {
|
<4 x i32> %i32mask) nounwind alwaysinline {
|
||||||
%oldValue = load <4 x i64>* %ptr
|
%oldValue = load <4 x i64>* %ptr, align 8
|
||||||
%mask = bitcast <4 x i32> %i32mask to <4 x float>
|
%mask = bitcast <4 x i32> %i32mask to <4 x float>
|
||||||
|
|
||||||
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
||||||
|
|||||||
@@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
|||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
%mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
|
%mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
|
||||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
%oldValue = load <8 x i32>* %0
|
%oldValue = load <8 x i32>* %0, align 4
|
||||||
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
||||||
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
|
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
|
||||||
%old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
|
%old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
|
||||||
@@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
|||||||
|
|
||||||
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
|
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
|
||||||
|
|
||||||
%old = load <8 x i64>* %ptr
|
%old = load <8 x i64>* %ptr, align 8
|
||||||
|
|
||||||
; set up the first two 64-bit values
|
; set up the first two 64-bit values
|
||||||
%old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
|
%old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
|
||||||
|
|||||||
@@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw
|
|||||||
%ptr16 = bitcast [0 x i32] *%0 to i16 *
|
%ptr16 = bitcast [0 x i32] *%0 to i16 *
|
||||||
%ptr = getelementptr i16 * %ptr16, i32 %offset
|
%ptr = getelementptr i16 * %ptr16, i32 %offset
|
||||||
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
|
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
|
||||||
%val = load i`'eval(16*$1) * %ptr64, align 1
|
%val = load i`'eval(16*$1) * %ptr64, align 2
|
||||||
|
|
||||||
%vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
|
%vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
|
||||||
; unsigned, so use zero-extent...
|
; unsigned, so use zero-extent...
|
||||||
@@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
|
|||||||
%oldmasked = and i`'eval(8*$1) %old, %notmask
|
%oldmasked = and i`'eval(8*$1) %old, %notmask
|
||||||
%newmasked = and i`'eval(8*$1) %val64, %mask64
|
%newmasked = and i`'eval(8*$1) %val64, %mask64
|
||||||
%final = or i`'eval(8*$1) %oldmasked, %newmasked
|
%final = or i`'eval(8*$1) %oldmasked, %newmasked
|
||||||
store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64
|
store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
|
||||||
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
@@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
|
|||||||
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
|
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
|
||||||
|
|
||||||
;; as above, use mask to do blending with logical ops...
|
;; as above, use mask to do blending with logical ops...
|
||||||
%old = load i`'eval(16*$1) * %ptr64, align 1
|
%old = load i`'eval(16*$1) * %ptr64, align 2
|
||||||
%oldmasked = and i`'eval(16*$1) %old, %notmask
|
%oldmasked = and i`'eval(16*$1) %old, %notmask
|
||||||
%newmasked = and i`'eval(16*$1) %val64, %mask64
|
%newmasked = and i`'eval(16*$1) %val64, %mask64
|
||||||
%final = or i`'eval(16*$1) %oldmasked, %newmasked
|
%final = or i`'eval(16*$1) %oldmasked, %newmasked
|
||||||
store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64
|
store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
|
||||||
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user