From 89169d55060ae293ef2d6950d489c46dd9736c83 Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Sun, 5 Jan 2014 10:12:45 +0100
Subject: [PATCH] +1

---
 examples_ptx/aobench/.gitignore      |   2 +
 examples_ptx/aobench/Makefile_cpu    |   8 +
 examples_ptx/aobench/Makefile_gpu    |   9 +
 examples_ptx/aobench/ao.cpp          | 204 +++++++++++++++++
 examples_ptx/aobench/ao.ispc         | 272 +++++++++++++++++++++++
 examples_ptx/aobench/ao_serial.cpp   | 314 +++++++++++++++++++++++++++
 examples_ptx/aobench/aobench.vcxproj |  34 +++
 examples_ptx/common_gpu.mk           |   6 +-
 8 files changed, 848 insertions(+), 1 deletion(-)
 create mode 100644 examples_ptx/aobench/.gitignore
 create mode 100644 examples_ptx/aobench/Makefile_cpu
 create mode 100644 examples_ptx/aobench/Makefile_gpu
 create mode 100644 examples_ptx/aobench/ao.cpp
 create mode 100644 examples_ptx/aobench/ao.ispc
 create mode 100644 examples_ptx/aobench/ao_serial.cpp
 create mode 100644 examples_ptx/aobench/aobench.vcxproj
diff --git a/examples_ptx/aobench/.gitignore b/examples_ptx/aobench/.gitignore
new file mode 100644
index 00000000..ad080f43
--- /dev/null
+++ b/examples_ptx/aobench/.gitignore
@@ -0,0 +1,2 @@
+ao
+*.ppm
diff --git a/examples_ptx/aobench/Makefile_cpu b/examples_ptx/aobench/Makefile_cpu
new file mode 100644
index 00000000..28f0f051
--- /dev/null
+++ b/examples_ptx/aobench/Makefile_cpu
@@ -0,0 +1,8 @@
+
+EXAMPLE=ao
+CPP_SRC=ao.cpp ao_serial.cpp
+ISPC_SRC=ao.ispc
+ISPC_IA_TARGETS=avx1-i32x8
+ISPC_ARM_TARGETS=neon
+
+include ../common.mk
diff --git a/examples_ptx/aobench/Makefile_gpu b/examples_ptx/aobench/Makefile_gpu
new file mode 100644
index 00000000..fa58d307
--- /dev/null
+++ b/examples_ptx/aobench/Makefile_gpu
@@ -0,0 +1,9 @@
+PROG=ao_gpu
+ISPC_SRC=ao.ispc
+CXX_SRC=ao.cpp  ao_serial.cpp
+PTXCC_REGMAX=64
+
+include ../common_gpu.mk
+
+
+
diff --git a/examples_ptx/aobench/ao.cpp b/examples_ptx/aobench/ao.cpp
new file mode 100644
index 00000000..b951113c
--- /dev/null
+++ b/examples_ptx/aobench/ao.cpp
@@ -0,0 +1,204 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#ifdef __linux__
+#include <malloc.h>
+#endif
+#include <math.h>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <sys/types.h>
+
+#include "ao_ispc.h"
+
+#include "../timing.h"
+#include "../ispc_malloc.h"
+
+#define NSUBSAMPLES        2
+
+extern void ao_serial(int w, int h, int nsubsamples, float image[]);
+
+static unsigned int test_iterations[] = {3, 7, 1};
+static unsigned int width, height;
+static unsigned char *img;
+static float *fimg;
+
+
+static unsigned char
+clamp(float f)
+{
+    int i = (int)(f * 255.5);
+
+    if (i < 0) i = 0;
+    if (i > 255) i = 255;
+
+    return (unsigned char)i;
+}
+
+
+static void
+savePPM(const char *fname, int w, int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)  {
+            img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]);
+            img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]);
+            img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]);
+        }
+    }
+
+    FILE *fp = fopen(fname, "wb");
+    if (!fp) {
+        perror(fname);
+        exit(1);
+    }
+
+    fprintf(fp, "P6\n");
+    fprintf(fp, "%d %d\n", w, h);
+    fprintf(fp, "255\n");
+    fwrite(img, w * h * 3, 1, fp);
+    fclose(fp);
+    printf("Wrote image file %s\n", fname);
+}
+
+
+int main(int argc, char **argv)
+{
+    if (argc < 3) {
+        printf ("%s\n", argv[0]);
+        printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n");
+        getchar();
+        exit(-1);
+    }
+    else {
+        if (argc == 6) {
+            for (int i = 0; i < 3; i++) {
+                test_iterations[i] = atoi(argv[3 + i]);
+            }
+        }
+        width = atoi (argv[1]);
+        height = atoi (argv[2]);
+    }
+
+    // Allocate space for output images
+#if 0
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
+#else
+    ispc_malloc((void**) &img, sizeof(unsigned char)*width*height*3);
+    ispc_malloc((void**)&fimg, sizeof(        float)*width*height*3);
+#endif
+
+    //
+    // Run the ispc path, test_iterations times, and report the minimum
+    // time for any of them.
+    //
+#ifndef _CUDA_
+    double minTimeISPC = 1e30;
+    memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+    for (unsigned int i = 0; i < test_iterations[0]; i++) {
+        assert(NSUBSAMPLES == 2);
+        reset_and_start_timer();
+        ispc::ao_ispc(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_msec();
+        printf("@time of ISPC run:\t\t\t[%.3f] msec\n", t);
+        minTimeISPC = std::min(minTimeISPC, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc]:\t\t\t[%.3f] msec (%d x %d image)\n", 
+           minTimeISPC, width, height);
+    savePPM("ao-ispc.ppm", width, height); 
+#endif
+
+    //
+    // Run the ispc + tasks path, test_iterations times, and report the
+    // minimum time for any of them.
+    //
+    double minTimeISPCTasks = 1e30;
+    memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+    for (unsigned int i = 0; i < test_iterations[1]; i++) {
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ispc::ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_msec();
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", t);
+        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n", 
+           minTimeISPCTasks, width, height);
+    savePPM("ao-ispc-tasks.ppm", width, height); 
+
+    //
+    // Run the serial path, again test_iteration times, and report the
+    // minimum time.
+    //
+    double minTimeSerial = 1e30;
+    memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+    for (unsigned int i = 0; i < test_iterations[2]; i++) {
+        reset_and_start_timer();
+        ao_serial(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_msec();
+        printf("@time of serial run:\t\t\t\t[%.3f] msec\n", t);
+        minTimeSerial = std::min(minTimeSerial, t);
+    }
+
+    // Report more results, save another image...
+    printf("[aobench serial]:\t\t[%.3f] msec (%d x %d image)\n", minTimeSerial, 
+           width, height);
+#ifndef _CUDA_
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPCTasks);
+#endif
+    savePPM("ao-serial.ppm", width, height); 
+        
+    return 0;
+}
diff --git a/examples_ptx/aobench/ao.ispc b/examples_ptx/aobench/ao.ispc
new file mode 100644
index 00000000..ec234eaf
--- /dev/null
+++ b/examples_ptx/aobench/ao.ispc
@@ -0,0 +1,272 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#define NAO_SAMPLES		8
+#define M_PI 3.1415926535f
+
+typedef float<3> vec;
+
+struct Isect {
+    float      t;
+    vec        p;
+    vec        n;
+    int        hit; 
+};
+
+struct Sphere {
+    vec        center;
+    float      radius;
+};
+
+struct Plane {
+    vec    p;
+    vec    n;
+};
+
+struct Ray {
+    vec org;
+    vec dir;
+};
+
+static inline float dot(vec a, vec b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+static inline vec vcross(vec v0, vec v1) {
+    vec ret;
+    ret.x = v0.y * v1.z - v0.z * v1.y;
+    ret.y = v0.z * v1.x - v0.x * v1.z;
+    ret.z = v0.x * v1.y - v0.y * v1.x;
+    return ret;
+}
+
+static inline void vnormalize(vec &v) {
+    float len2 = dot(v, v);
+    float invlen = rsqrt(len2);
+    v *= invlen;
+}
+
+
+static void
+ray_plane_intersect(Isect &isect, Ray &ray, const uniform Plane &plane) {
+    float d = -dot(plane.p, plane.n);
+    float v = dot(ray.dir, plane.n);
+
+    cif (abs(v) < 1.0e-17) 
+        return;
+    else {
+        float t = -(dot(ray.org, plane.n) + d) / v;
+
+        cif ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + ray.dir * t;
+            isect.n = plane.n;
+        }
+    }
+}
+
+
+static inline void
+ray_sphere_intersect(Isect &isect, Ray &ray, const uniform Sphere &sphere) {
+    vec rs = ray.org - sphere.center;
+
+    float B = dot(rs, ray.dir);
+    float C = dot(rs, rs) - sphere.radius * sphere.radius;
+    float D = B * B - C;
+
+    cif (D > 0.) {
+        float t = -B - sqrt(D);
+
+        cif ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + t * ray.dir;
+            isect.n = isect.p - sphere.center;
+            vnormalize(isect.n);
+        }
+    }
+}
+
+
+static void
+orthoBasis(vec basis[3], vec n) {
+    basis[2] = n;
+    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
+
+    if ((n.x < 0.6) && (n.x > -0.6)) {
+        basis[1].x = 1.0;
+    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+        basis[1].y = 1.0;
+    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+        basis[1].z = 1.0;
+    } else {
+        basis[1].x = 1.0;
+    }
+
+    basis[0] = vcross(basis[1], basis[2]);
+    vnormalize(basis[0]);
+
+    basis[1] = vcross(basis[2], basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+static float
+ambient_occlusion(Isect &isect, const uniform Plane &plane, const uniform Sphere spheres[3],
+                  RNGState &rngstate) {
+    float eps = 0.0001f;
+    vec p, n;
+    vec basis[3];
+    float occlusion = 0.0;
+
+    p = isect.p + eps * isect.n;
+
+    orthoBasis(basis, isect.n);
+
+    static const uniform int ntheta = NAO_SAMPLES;
+    static const uniform int nphi   = NAO_SAMPLES;
+    for (uniform int j = 0; j < ntheta; j++) {
+        for (uniform int i = 0; i < nphi; i++) {
+            Ray ray;
+            Isect occIsect;
+
+            float theta = sqrt(frandom(&rngstate));
+            float phi   = 2.0f * M_PI * frandom(&rngstate);
+            float x = cos(phi) * theta;
+            float y = sin(phi) * theta;
+            float z = sqrt(1.0 - theta * theta);
+
+            // local . global
+            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+            ray.org = p;
+            ray.dir.x = rx;
+            ray.dir.y = ry;
+            ray.dir.z = rz;
+
+            occIsect.t   = 1.0e+17;
+            occIsect.hit = 0;
+
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+            ray_plane_intersect (occIsect, ray, plane); 
+
+            if (occIsect.hit) occlusion += 1.0;
+        }
+    }
+
+    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
+    return occlusion;
+}
+
+
+/* Compute the image for the scanlines from [y0,y1), for an overall image
+   of width w and height h.
+ */
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
+                         uniform int h,  uniform int nsubsamples, 
+                         uniform float image[]) {
+    const static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
+    const static uniform Sphere spheres[3] = {
+        { { -2.0f, 0.0f, -3.5f }, 0.5f },
+        { { -0.5f, 0.0f, -3.0f }, 0.5f },
+        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
+    RNGState rngstate;
+
+    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
+    float invSamples = 1.f / nsubsamples;
+
+    foreach_tiled(y = y0 ... y1, x = 0 ... w, 
+                  u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
+        float du = (float)u * invSamples, dv = (float)v * invSamples;
+
+        // Figure out x,y pixel in NDC
+        float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
+        float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
+        float ret = 0.f;
+        Ray ray;
+        Isect isect;
+
+        ray.org = 0.f;
+
+        // Poor man's perspective projection
+        ray.dir.x = px;
+        ray.dir.y = py;
+        ray.dir.z = -1.0;
+        vnormalize(ray.dir);
+
+        isect.t   = 1.0e+17;
+        isect.hit = 0;
+
+        for (uniform int snum = 0; snum < 3; ++snum)
+            ray_sphere_intersect(isect, ray, spheres[snum]);
+        ray_plane_intersect(isect, ray, plane);
+
+        // Note use of 'coherent' if statement; the set of rays we
+        // trace will often all hit or all miss the scene
+        cif (isect.hit) {
+            ret = ambient_occlusion(isect, plane, spheres, rngstate);
+            ret *= invSamples * invSamples;
+
+            int offset = 3 * (y * w + x);
+            atomic_add_local(&image[offset], ret);
+            atomic_add_local(&image[offset+1], ret);
+            atomic_add_local(&image[offset+2], ret);
+        }
+    }
+}
+
+
+export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, 
+                    uniform float image[]) {
+    ao_scanlines(0, h, w, h, nsubsamples, image);
+}
+
+
+static void task ao_task(uniform int width, uniform int height, 
+                         uniform int nsubsamples, uniform float image[]) {
+    ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
+}
+
+
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
+                          uniform float image[]) {
+    launch[h] ao_task(w, h, nsubsamples, image);
+}
diff --git a/examples_ptx/aobench/ao_serial.cpp b/examples_ptx/aobench/ao_serial.cpp
new file mode 100644
index 00000000..69af2eba
--- /dev/null
+++ b/examples_ptx/aobench/ao_serial.cpp
@@ -0,0 +1,314 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+
+#ifdef _MSC_VER
+static long long drand48_x = 0x1234ABCD330E;
+
+static inline void srand48(int x) {
+    drand48_x = x ^ (x << 16);
+}
+
+static inline double drand48() {
+    drand48_x = drand48_x * 0x5DEECE66D + 0xB;
+    return (drand48_x & 0xFFFFFFFFFFFF) * (1.0 / 281474976710656.0);
+}
+#endif // _MSC_VER
+
+#ifdef _MSC_VER
+__declspec(align(16)) 
+#endif
+struct vec {
+    vec() { x=y=z=pad=0.; }
+    vec(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
+
+    vec operator*(float f) const { return vec(x*f, y*f, z*f); }
+    vec operator+(const vec &f2) const { 
+        return vec(x+f2.x, y+f2.y, z+f2.z); 
+    }
+    vec operator-(const vec &f2) const { 
+        return vec(x-f2.x, y-f2.y, z-f2.z); 
+    }
+    vec operator*(const vec &f2) const { 
+        return vec(x*f2.x, y*f2.y, z*f2.z); 
+    }
+    float x, y, z;
+    float pad;
+}
+#ifndef _MSC_VER
+__attribute__ ((aligned(16)))
+#endif
+;
+inline vec operator*(float f, const vec &v) { return vec(f*v.x, f*v.y, f*v.z); }
+
+
+#define NAO_SAMPLES		8
+
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI 3.1415926535f
+
+struct Isect {
+    float      t;
+    vec        p;
+    vec        n;
+    int        hit; 
+};
+
+struct Sphere {
+    vec        center;
+    float      radius;
+
+};
+
+struct Plane {
+    vec    p;
+    vec    n;
+};
+
+struct Ray {
+    vec org;
+    vec dir;
+};
+
+static inline float dot(const vec &a, const vec &b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+static inline vec vcross(const vec &v0, const vec &v1) {
+    vec ret;
+    ret.x = v0.y * v1.z - v0.z * v1.y;
+    ret.y = v0.z * v1.x - v0.x * v1.z;
+    ret.z = v0.x * v1.y - v0.y * v1.x;
+    return ret;
+}
+
+static inline void vnormalize(vec &v) {
+    float len2 = dot(v, v);
+    float invlen = 1.f / sqrtf(len2);
+    v = v * invlen;
+}
+
+
+static inline void
+ray_plane_intersect(Isect &isect, Ray &ray, 
+                    Plane &plane) {
+    float d = -dot(plane.p, plane.n);
+    float v = dot(ray.dir, plane.n);
+
+    if (fabsf(v) < 1.0e-17f) 
+        return;
+    else {
+        float t = -(dot(ray.org, plane.n) + d) / v;
+
+        if ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + ray.dir * t;
+            isect.n = plane.n;
+        }
+    }
+}
+
+
+static inline void
+ray_sphere_intersect(Isect &isect, Ray &ray, 
+                     Sphere &sphere) {
+    vec rs = ray.org - sphere.center;
+
+    float B = dot(rs, ray.dir);
+    float C = dot(rs, rs) - sphere.radius * sphere.radius;
+    float D = B * B - C;
+
+    if (D > 0.) {
+        float t = -B - sqrtf(D);
+
+        if ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + t * ray.dir;
+            isect.n = isect.p - sphere.center;
+            vnormalize(isect.n);
+        }
+    }
+}
+
+
+static inline void
+orthoBasis(vec basis[3], const vec &n) {
+    basis[2] = n;
+    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
+
+    if ((n.x < 0.6f) && (n.x > -0.6f)) {
+        basis[1].x = 1.0;
+    } else if ((n.y < 0.6f) && (n.y > -0.6f)) {
+        basis[1].y = 1.0;
+    } else if ((n.z < 0.6f) && (n.z > -0.6f)) {
+        basis[1].z = 1.0;
+    } else {
+        basis[1].x = 1.0;
+    }
+
+    basis[0] = vcross(basis[1], basis[2]);
+    vnormalize(basis[0]);
+
+    basis[1] = vcross(basis[2], basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+static float
+ambient_occlusion(Isect &isect, Plane &plane, 
+                  Sphere spheres[3]) {
+    float eps = 0.0001f;
+    vec p, n;
+    vec basis[3];
+    float occlusion = 0.0;
+
+    p = isect.p + eps * isect.n;
+
+    orthoBasis(basis, isect.n);
+
+    static const int ntheta = NAO_SAMPLES;
+    static const int nphi   = NAO_SAMPLES;
+    for (int j = 0; j < ntheta; j++) {
+        for (int i = 0; i < nphi; i++) {
+            Ray ray;
+            Isect occIsect;
+
+            float theta = sqrtf(drand48());
+            float phi   = 2.0f * M_PI * drand48();
+            float x = cosf(phi) * theta;
+            float y = sinf(phi) * theta;
+            float z = sqrtf(1.0f - theta * theta);
+
+            // local . global
+            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+            ray.org = p;
+            ray.dir.x = rx;
+            ray.dir.y = ry;
+            ray.dir.z = rz;
+
+            occIsect.t   = 1.0e+17f;
+            occIsect.hit = 0;
+
+            for (int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+            ray_plane_intersect (occIsect, ray, plane); 
+
+            if (occIsect.hit) occlusion += 1.f;
+        }
+    }
+
+    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
+    return occlusion;
+}
+
+
+/* Compute the image for the scanlines from [y0,y1), for an overall image
+   of width w and height h.
+ */
+static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,
+                         float image[]) {
+    static Plane plane = { vec(0.0f, -0.5f, 0.0f), vec(0.f, 1.f, 0.f) };
+    static Sphere spheres[3] = {
+        { vec(-2.0f, 0.0f, -3.5f), 0.5f },
+        { vec(-0.5f, 0.0f, -3.0f), 0.5f },
+        { vec(1.0f, 0.0f, -2.2f), 0.5f } };
+
+    srand48(y0);
+    
+    for (int y = y0; y < y1; ++y) {
+        for (int x = 0; x < w; ++x)  {
+            int offset = 3 * (y * w + x);
+            for (int u = 0; u < nsubsamples; ++u) {
+                for (int v = 0; v < nsubsamples; ++v) {
+                    float px = (x + (u / (float)nsubsamples) - (w / 2.0f)) / (w / 2.0f);
+                    float py = -(y + (v / (float)nsubsamples) - (h / 2.0f)) / (h / 2.0f);
+                    float ret = 0.f;
+                    Ray ray;
+                    Isect isect;
+
+                    ray.org = vec(0.f, 0.f, 0.f);
+
+                    ray.dir.x = px;
+                    ray.dir.y = py;
+                    ray.dir.z = -1.0f;
+                    vnormalize(ray.dir);
+
+                    isect.t   = 1.0e+17f;
+                    isect.hit = 0;
+
+                    for (int snum = 0; snum < 3; ++snum)
+                        ray_sphere_intersect(isect, ray, spheres[snum]);
+                    ray_plane_intersect(isect, ray, plane);
+
+                    if (isect.hit)
+                        ret = ambient_occlusion(isect, plane, spheres);
+
+                    // Update image for AO for this ray
+                    image[offset+0] += ret;
+                    image[offset+1] += ret;
+                    image[offset+2] += ret;
+                }
+            }
+            // Normalize image pixels by number of samples taken per pixel
+            image[offset+0] /= nsubsamples * nsubsamples;
+            image[offset+1] /= nsubsamples * nsubsamples;
+            image[offset+2] /= nsubsamples * nsubsamples;
+        }
+    }
+}
+
+
+void ao_serial(int w, int h, int nsubsamples, 
+               float image[]) {
+    ao_scanlines(0, h, w, h, nsubsamples, image);
+}
diff --git a/examples_ptx/aobench/aobench.vcxproj b/examples_ptx/aobench/aobench.vcxproj
new file mode 100644
index 00000000..298be2cb
--- /dev/null
+++ b/examples_ptx/aobench/aobench.vcxproj
@@ -0,0 +1,34 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>aobench</RootNamespace>
+    <ISPC_file>ao</ISPC_file>
+    <default_targets>sse2,sse4,avx1-i32x8</default_targets>
+  </PropertyGroup>
+  <Import Project="..\common.props" />
+  <ItemGroup>
+    <ClCompile Include="ao.cpp" />
+    <ClCompile Include="ao_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+</Project>
diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk
index 46aa5190..2a0c4af9 100644
--- a/examples_ptx/common_gpu.mk
+++ b/examples_ptx/common_gpu.mk
@@ -11,7 +11,11 @@ LD=nvcc
 LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
 #
 PTXCC=ptxcc
-PTXCC_FLAGS = -maxrregcount=32 -Xptxas=-v
+PTXCC_FLAGS = -Xptxas=-v
+ifdef PTXCC_REGMAX
+  PTXCC_FLAGS += -maxrregcount=$(PTXCC_REGMAX)
+endif
+
 #
 ISPC=ispc
 ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math