From a7d4a3f922ef5203f109bc64f5f9321c277dee0a Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Sun, 26 Jan 2014 13:15:13 +0100
Subject: [PATCH] fix for __any

---
 builtins.cpp             |  18 ++
 builtins/target-nvptx.ll | 420 +++++++++++++++++++++++++++++++++++++++
 builtins/util-nvptx.m4   | 130 ------------
 stdlib.ispc              |  70 ++++---
 4 files changed, 478 insertions(+), 160 deletions(-)

diff --git a/builtins.cpp b/builtins.cpp
index 57ed3808..29d33ba1 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -353,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
         "__atomic_add_int64_global",
         "__atomic_add_uniform_int32_global",
         "__atomic_add_uniform_int64_global",
+        "__atomic_add_varying_int32_global",
+        "__atomic_add_varying_int64_global",
         "__atomic_and_int32_global",
         "__atomic_and_int64_global",
         "__atomic_and_uniform_int32_global",
         "__atomic_and_uniform_int64_global",
+        "__atomic_and_varying_int32_global",
+        "__atomic_and_varying_int64_global",
         "__atomic_compare_exchange_double_global",
         "__atomic_compare_exchange_float_global",
         "__atomic_compare_exchange_int32_global",
@@ -369,14 +373,22 @@ lSetInternalFunctions(llvm::Module *module) {
         "__atomic_max_uniform_int64_global",
         "__atomic_min_uniform_int32_global",
         "__atomic_min_uniform_int64_global",
+        "__atomic_max_varying_int32_global",
+        "__atomic_max_varying_int64_global",
+        "__atomic_min_varying_int32_global",
+        "__atomic_min_varying_int64_global",
         "__atomic_or_int32_global",
         "__atomic_or_int64_global",
         "__atomic_or_uniform_int32_global",
         "__atomic_or_uniform_int64_global",
+        "__atomic_or_varying_int32_global",
+        "__atomic_or_varying_int64_global",
         "__atomic_sub_int32_global",
         "__atomic_sub_int64_global",
         "__atomic_sub_uniform_int32_global",
         "__atomic_sub_uniform_int64_global",
+        "__atomic_sub_varying_int32_global",
+        "__atomic_sub_varying_int64_global",
         "__atomic_swap_double_global",
         "__atomic_swap_float_global",
         "__atomic_swap_int32_global",
@@ -389,10 +401,16 @@ lSetInternalFunctions(llvm::Module *module) {
         "__atomic_umax_uniform_uint64_global",
         "__atomic_umin_uniform_uint32_global",
         "__atomic_umin_uniform_uint64_global",
+        "__atomic_umax_varying_uint32_global",
+        "__atomic_umax_varying_uint64_global",
+        "__atomic_umin_varying_uint32_global",
+        "__atomic_umin_varying_uint64_global",
         "__atomic_xor_int32_global",
         "__atomic_xor_int64_global",
         "__atomic_xor_uniform_int32_global",
         "__atomic_xor_uniform_int64_global",
+        "__atomic_xor_varying_int32_global",
+        "__atomic_xor_varying_int64_global",
         "__broadcast_double",
         "__broadcast_float",
         "__broadcast_i16",
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index dbfedc0d..d0f39c51 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -1660,3 +1660,423 @@ define i64 @__clock() nounwind alwaysinline {
   ret i64 %r
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_associative
+;; More efficient implementation for atomics that are associative (e.g.,
+;; add, and, ...). If a basic implementation would do something like:
+;; result0 = atomic_op(ptr, val0)
+;; result1 = atomic_op(ptr, val1)
+;; ..
+;; Then instead we can do:
+;; tmp = (val0 op val1 op ...)
+;; result0 = atomic_op(ptr, tmp)
+;; result1 = (result0 op val0)
+;; ..
+;; And more efficiently compute the same result
+;;
+;; Takes five parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;; (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
+;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
+;; add
+define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; sub
+define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %nvalv = sub <1 x i32> <i32 0>, %valv
+  %ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv);
+  ret <1 x i32> %ret;
+}
+;; and
+define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; or
+define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; xor
+define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+
+;;;;;;;;; int64
+define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %nvalv = sub <1 x i64> <i64 0>, %valv
+  %ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv);
+  ret <1 x i64> %ret;
+}
+
+;; and
+define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %andr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %andr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;; or
+define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %orr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %orr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;; xor
+define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %xorr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %xorr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_uniform
+;; Defines the implementation of a function that handles the mapping from
+;; an ispc atomic function to the underlying LLVM intrinsics. This variant
+;; just calls the atomic once, for the given uniform value
+;;
+;; Takes four parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;; (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
+
+define i32 @__get_first_active_lane()
+{
+  %nact = call i32 @__ballot_nvptx(i1 true);
+  %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact)
+  %lane = sub i32 31, %lane1
+  ret i32 %lane
+}
+
+define i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %nval = sub i32 0, %val;
+  %old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval);
+  ret i32 %old;
+}
+define i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+
+
+define i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %nval = sub i64 0, %val;
+  %old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval);
+  ret i64 %old;
+}
+define i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+
+define(`global_atomic_uniform',`
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline
+{
+entry:
+  %addr = ptrtoint $3 * %ptr to i64
+  %active = call i32 @__get_first_active_lane();
+  %lane = call i32 @__laneidx();
+  %c = icmp eq i32 %lane, %active
+  br i1 %c, label %p1, label %p2
+
+p1:
+  %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val);
+  br label %p2;
+
+p2:
+  %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry]
+  %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active)
+  ret $3 %old;
+}
+')
+global_atomic_uniform(1, add, i32, int32)
+global_atomic_uniform(1, sub, i32, int32)
+global_atomic_uniform(1, and, i32, int32)
+global_atomic_uniform(1, or, i32, int32)
+global_atomic_uniform(1, xor, i32, int32)
+global_atomic_uniform(1, min, i32, int32)
+global_atomic_uniform(1, max, i32, int32)
+global_atomic_uniform(1, umin, i32, uint32)
+global_atomic_uniform(1, umax, i32, uint32)
+
+global_atomic_uniform(1, add, i64, int64)
+global_atomic_uniform(1, sub, i64, int64)
+global_atomic_uniform(1, and, i64, int64)
+global_atomic_uniform(1, or, i64, int64)
+global_atomic_uniform(1, xor, i64, int64)
+global_atomic_uniform(1, min, i64, int64)
+global_atomic_uniform(1, max, i64, int64)
+global_atomic_uniform(1, umin, i64, uint64)
+global_atomic_uniform(1, umax, i64, uint64)
+
+define(`global_atomic_varying',`
+define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline
+{
+entry:
+  %addr = bitcast <1 x i64> %ptr to i64
+  %c = bitcast <1 x i1> %maskv to i1
+  br i1 %c, label %p1, label %p2
+
+p1:
+  %sv = bitcast <1 x $3> %val to $3
+  %sptr = inttoptr i64 %addr to $3*
+  %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv);
+  %t0v = bitcast $3 %t0 to <1 x $3>
+  ret <1 x $3> %t0v
+
+p2:
+  ret <1 x $3> %val
+}
+')
+global_atomic_varying(1, add, i32, int32)
+global_atomic_varying(1, sub, i32, int32)
+global_atomic_varying(1, and, i32, int32)
+global_atomic_varying(1, or, i32, int32)
+global_atomic_varying(1, xor, i32, int32)
+global_atomic_varying(1, min, i32, int32)
+global_atomic_varying(1, max, i32, int32)
+global_atomic_varying(1, umin, i32, uint32)
+global_atomic_varying(1, umax, i32, uint32)
+
+global_atomic_varying(1, add, i64, int64)
+global_atomic_varying(1, sub, i64, int64)
+global_atomic_varying(1, and, i64, int64)
+global_atomic_varying(1, or, i64, int64)
+global_atomic_varying(1, xor, i64, int64)
+global_atomic_varying(1, min, i64, int64)
+global_atomic_varying(1, max, i64, int64)
+global_atomic_varying(1, umin, i64, uint64)
+global_atomic_varying(1, umax, i64, uint64)
+
+;; Macro to declare the function that implements the swap atomic.
+;; Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_swap', `
+declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
+')
+
+
+;; Similarly, macro to declare the function that implements the compare/exchange
+;; atomic. Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_atomic_exchange', `
+
+declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
+                      <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
+
+declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
+                      $2 %val) nounwind alwaysinline ;
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; atomics and memory barriers
+
+global_swap(WIDTH, i32, int32)
+global_swap(WIDTH, i64, int64)
+
+declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
+declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
+global_atomic_exchange(WIDTH, i32, int32)
+global_atomic_exchange(WIDTH, i64, int64)
+
+declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
+                      <WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
+declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
+                      <WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
+declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
+                      float %val) nounwind alwaysinline ;
+declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
+                      double %val) nounwind alwaysinline ;
+
+declare void @llvm.nvvm.membar.gl()
+declare void @llvm.nvvm.membar.sys()
+declare void @llvm.nvvm.membar.cta()
+
+define void @__memory_barrier() nounwind readnone alwaysinline {
+  ;; On NVPTX we implement this with a global-scope (device-wide) membar;
+  ;; the membar.cta and membar.sys intrinsics declared above provide
+  ;; narrower (CTA) and wider (system) scopes, respectively, if they are
+  ;; ever needed.
+  call void @llvm.nvvm.membar.gl()
+  ret void
+}
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index 7bb1014b..ede70860 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -768,27 +768,6 @@ shuffles(double, 8)
 shuffles(i64, 8)
 ')
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic_associative
-;; More efficient implementation for atomics that are associative (e.g.,
-;; add, and, ...). If a basic implementation would do sometihng like:
-;; result0 = atomic_op(ptr, val0)
-;; result1 = atomic_op(ptr, val1)
-;; ..
-;; Then instead we can do:
-;; tmp = (val0 op val1 op ...)
-;; result0 = atomic_op(ptr, tmp)
-;; result1 = (result0 op val0)
-;; ..
-;; And more efficiently compute the same result
-;;
-;; Takes five parameters:
-;; $1: vector width of the target
-;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
-;; (add, sub...)
-;; $3: return type of the LLVM atomic (e.g. i32)
-;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
-;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
 
 define(`mask_converts', `
 define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
@@ -875,54 +854,6 @@ define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
 
 mask_converts(WIDTH)
 
-define(`global_atomic_associative', `
-
-declare <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
-                      <$1 x MASK> %m) nounwind alwaysinline ;
-')
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic_uniform
-;; Defines the implementation of a function that handles the mapping from
-;; an ispc atomic function to the underlying LLVM intrinsics. This variant
-;; just calls the atomic once, for the given uniform value
-;;
-;; Takes four parameters:
-;; $1: vector width of the target
-;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
-;; (add, sub...)
-;; $3: return type of the LLVM atomic (e.g. i32)
-;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
-
-define(`global_atomic_uniform', `
-declare $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline ;
-')
-
-;; Macro to declare the function that implements the swap atomic.
-;; Takes three parameters:
-;; $1: vector width of the target
-;; $2: llvm type of the vector elements (e.g. i32)
-;; $3: ispc type of the elements (e.g. int32)
-
-define(`global_swap', `
-declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
-')
-
-
-;; Similarly, macro to declare the function that implements the compare/exchange
-;; atomic. Takes three parameters:
-;; $1: vector width of the target
-;; $2: llvm type of the vector elements (e.g. i32)
-;; $3: ispc type of the elements (e.g. int32)
-
-define(`global_atomic_exchange', `
-
-declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
-                      <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
-
-declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
-                      $2 %val) nounwind alwaysinline ;
-')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; count trailing zeros
@@ -2507,67 +2438,6 @@ define double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
   ret double %r
 }
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; atomics and memory barriers
-
-declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
-                      i1 %storestore, i1 %device)
-
-define void @__memory_barrier() nounwind readnone alwaysinline {
-  ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
-  ;; only get an MFENCE on x86 if "device" is true, but IMHO we should
-  ;; in the case where the first 4 args are true but it is false.
-  ;; So we just always set that to true...
-  call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
-  ret void
-}
-
-global_atomic_associative(WIDTH, add, i32, int32, 0)
-global_atomic_associative(WIDTH, sub, i32, int32, 0)
-global_atomic_associative(WIDTH, and, i32, int32, -1)
-global_atomic_associative(WIDTH, or, i32, int32, 0)
-global_atomic_associative(WIDTH, xor, i32, int32, 0)
-global_atomic_uniform(WIDTH, add, i32, int32)
-global_atomic_uniform(WIDTH, sub, i32, int32)
-global_atomic_uniform(WIDTH, and, i32, int32)
-global_atomic_uniform(WIDTH, or, i32, int32)
-global_atomic_uniform(WIDTH, xor, i32, int32)
-global_atomic_uniform(WIDTH, min, i32, int32)
-global_atomic_uniform(WIDTH, max, i32, int32)
-global_atomic_uniform(WIDTH, umin, i32, uint32)
-global_atomic_uniform(WIDTH, umax, i32, uint32)
-
-global_atomic_associative(WIDTH, add, i64, int64, 0)
-global_atomic_associative(WIDTH, sub, i64, int64, 0)
-global_atomic_associative(WIDTH, and, i64, int64, -1)
-global_atomic_associative(WIDTH, or, i64, int64, 0)
-global_atomic_associative(WIDTH, xor, i64, int64, 0)
-global_atomic_uniform(WIDTH, add, i64, int64)
-global_atomic_uniform(WIDTH, sub, i64, int64)
-global_atomic_uniform(WIDTH, and, i64, int64)
-global_atomic_uniform(WIDTH, or, i64, int64)
-global_atomic_uniform(WIDTH, xor, i64, int64)
-global_atomic_uniform(WIDTH, min, i64, int64)
-global_atomic_uniform(WIDTH, max, i64, int64)
-global_atomic_uniform(WIDTH, umin, i64, uint64)
-global_atomic_uniform(WIDTH, umax, i64, uint64)
-
-global_swap(WIDTH, i32, int32)
-global_swap(WIDTH, i64, int64)
-
-declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
-declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
-global_atomic_exchange(WIDTH, i32, int32)
-global_atomic_exchange(WIDTH, i64, int64)
-
-declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
-                      <WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
-declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
-                      <WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
-declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
-                      float %val) nounwind alwaysinline ;
-declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
-                      double %val) nounwind alwaysinline ;
 
 ')
diff --git a/stdlib.ispc b/stdlib.ispc
index 2d79bf33..a607fab7 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1814,7 +1814,7 @@ static inline void memory_barrier() {
     __memory_barrier();
 }
 
-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
     return ret; \
 } \
@@ -1825,6 +1825,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
     return ret; \
 } \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
+    if (__is_nvptx_target) { \
+        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
+        return ret; \
+    } else { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
     TA ret; \
@@ -1835,6 +1839,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
         ret = insert(ret, i, r); \
     } \
     return ret; \
+    } \
 } \
 
 #define DEFINE_ATOMIC_SWAP(TA,TB) \
@@ -1888,7 +1893,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
     return ret; \
 } \
 
-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     uniform TA oneval = reduce_##OPA(value); \
     TA ret; \
@@ -1903,6 +1908,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
 } \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                        TA value) { \
+    if (__is_nvptx_target) { \
+        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
+        return ret; \
+    } else { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
     TA ret; \
@@ -1913,48 +1922,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
         ret = insert(ret, i, r); \
     } \
     return ret; \
+    } \
 }
 
-DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
 DEFINE_ATOMIC_SWAP(int32,int32)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
 DEFINE_ATOMIC_SWAP(unsigned int32,int32)
 DEFINE_ATOMIC_SWAP(float,float)
 
-DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
 DEFINE_ATOMIC_SWAP(int64,int64)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
 DEFINE_ATOMIC_SWAP(unsigned int64,int64)
 DEFINE_ATOMIC_SWAP(double,double)
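
Note (illustration only, not part of the patch): the global_atomic_uniform and
global_atomic_associative code in target-nvptx.ll above implements the usual
warp-level trick of electing a single active lane to issue one hardware atomic
and then sharing the result with the other lanes. A minimal CUDA sketch of the
same idea follows; it assumes the CUDA 9+ *_sync warp intrinsics, and the
function names warpAtomicAddUniform / warpAggregatedInc are hypothetical
illustrations, not ispc entry points.

    #include <cuda_runtime.h>

    // Case 1: every active lane adds the same (uniform) value. One lane issues
    // the atomic and broadcasts the old value -- cf. __atomic_*_uniform_*_global,
    // __get_first_active_lane and __shfl_*_nvptx in target-nvptx.ll.
    __device__ int warpAtomicAddUniform(int *ptr, int val) {
        unsigned active = __activemask();          // lanes that reached this call
        int leader = __ffs(active) - 1;            // lowest active lane index
        int old = 0;
        if ((threadIdx.x & 31) == leader)
            old = atomicAdd(ptr, val);             // single atom.add for the warp
        return __shfl_sync(active, old, leader);   // broadcast the old value
    }

    // Case 2: the associative optimization from the global_atomic_associative
    // comment, specialized to "each lane adds 1": combine the per-lane values,
    // issue one atomic, then reconstruct each lane's own "old" result.
    __device__ int warpAggregatedInc(int *ctr) {
        unsigned active = __activemask();
        int leader = __ffs(active) - 1;
        int lane = threadIdx.x & 31;
        int rank = __popc(active & ((1u << lane) - 1)); // active lanes below this one
        int base = 0;
        if (lane == leader)
            base = atomicAdd(ctr, __popc(active));      // one atomic for the warp
        base = __shfl_sync(active, base, leader);
        return base + rank;                             // what atomicAdd(ctr, 1) would
                                                        // have returned for this lane
    }

The ispc code picks the last active lane (31 minus the count of leading zeros
of the ballot) rather than the first; either choice works, as long as exactly
one active lane performs the atomic.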