fix for __any

builtins.cpp (22 lines changed)
@@ -353,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_add_int64_global",
"__atomic_add_uniform_int32_global",
"__atomic_add_uniform_int64_global",
+"__atomic_add_varying_int32_global",
+"__atomic_add_varying_int64_global",
"__atomic_and_int32_global",
"__atomic_and_int64_global",
"__atomic_and_uniform_int32_global",
"__atomic_and_uniform_int64_global",
+"__atomic_and_varying_int32_global",
+"__atomic_and_varying_int64_global",
"__atomic_compare_exchange_double_global",
"__atomic_compare_exchange_float_global",
"__atomic_compare_exchange_int32_global",
@@ -369,14 +373,22 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_max_uniform_int64_global",
"__atomic_min_uniform_int32_global",
"__atomic_min_uniform_int64_global",
+"__atomic_max_varying_int32_global",
+"__atomic_max_varying_int64_global",
+"__atomic_min_varying_int32_global",
+"__atomic_min_varying_int64_global",
"__atomic_or_int32_global",
"__atomic_or_int64_global",
"__atomic_or_uniform_int32_global",
"__atomic_or_uniform_int64_global",
+"__atomic_or_varying_int32_global",
+"__atomic_or_varying_int64_global",
"__atomic_sub_int32_global",
"__atomic_sub_int64_global",
"__atomic_sub_uniform_int32_global",
"__atomic_sub_uniform_int64_global",
+"__atomic_sub_varying_int32_global",
+"__atomic_sub_varying_int64_global",
"__atomic_swap_double_global",
"__atomic_swap_float_global",
"__atomic_swap_int32_global",
@@ -389,10 +401,20 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_umax_uniform_uint64_global",
"__atomic_umin_uniform_uint32_global",
"__atomic_umin_uniform_uint64_global",
+"__atomic_umax_varying_uint32_global",
+"__atomic_umax_varying_uint64_global",
+"__atomic_umin_varying_uint32_global",
+"__atomic_umin_varying_uint64_global",
"__atomic_xor_int32_global",
"__atomic_xor_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
+"__atomic_xor_uniform_int32_global",
+"__atomic_xor_uniform_int64_global",
+"__atomic_xor_varying_int32_global",
+"__atomic_xor_varying_int64_global",
+"__atomic_xor_varying_int32_global",
+"__atomic_xor_varying_int64_global",
"__broadcast_double",
"__broadcast_float",
"__broadcast_i16",
@@ -1660,3 +1660,423 @@ define i64 @__clock() nounwind alwaysinline {
ret i64 %r
}

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_associative
+;; More efficient implementation for atomics that are associative (e.g.,
+;; add, and, ...). If a basic implementation would do something like:
+;; result0 = atomic_op(ptr, val0)
+;; result1 = atomic_op(ptr, val1)
+;; ..
+;; Then instead we can do:
+;; tmp = (val0 op val1 op ...)
+;; result0 = atomic_op(ptr, tmp)
+;; result1 = (result0 op val0)
+;; ..
+;; And more efficiently compute the same result
+;;
+;; Takes five parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
+;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
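
The scheme described in the comment block above can be illustrated with a small host-side C++ sketch (illustrative only, not code from this commit): combine all lane values first, issue a single atomic, then reconstruct each lane's result from the returned old value plus an exclusive prefix over the preceding lanes' values.

// Illustrative sketch of the "associative atomic" trick described above
// (plain C++, hypothetical lane count and values; not part of this commit).
#include <atomic>
#include <cstdio>

int main() {
    std::atomic<int> counter{100};
    const int lanes = 4;
    int val[lanes] = {1, 2, 3, 4};

    // Naive form would issue one atomic per lane:
    //   result[i] = counter.fetch_add(val[i]);
    // Associative form: one atomic for the whole gang.
    int tmp = 0;
    for (int i = 0; i < lanes; ++i)
        tmp += val[i];                      // tmp = val0 op val1 op ...
    int result0 = counter.fetch_add(tmp);   // single atomic_op(ptr, tmp)

    // Reconstruct each lane's old value: lane i sees the base value plus
    // the contributions of the lanes before it (an exclusive prefix sum).
    int prefix = 0;
    for (int i = 0; i < lanes; ++i) {
        int result_i = result0 + prefix;
        prefix += val[i];
        printf("lane %d old value = %d\n", i, result_i);
    }
    return 0;
}
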
+;; add
+define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
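
Each of these width-1 wrappers follows the same masked pattern: when the lane's mask bit is set, the PTX atomic (atom.add.u32 here) runs and the old memory contents are returned; when the lane is off, memory is left untouched and the input value is passed back. A rough C++ analogue (illustrative only, not part of this commit):

// Rough C++ analogue of the masked single-lane atomic above
// (illustrative sketch; not the ispc/NVPTX implementation).
#include <atomic>

int atomic_add_int32_global(std::atomic<int> *ptr, int val, bool mask) {
    if (mask)
        return ptr->fetch_add(val);  // "exec" path: returns the old value
    return val;                      // "pass" path: lane is inactive
}
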
+;; sub
+define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %nvalv = sub <1 x i32> <i32 0>, %valv
+  %ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv);
+  ret <1 x i32> %ret;
+}
+;; and
+define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; or
+define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; xor
+define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+
+;;;;;;;;; int64
+define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %nvalv = sub <1 x i64> <i64 0>, %valv
+  %ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv);
+  ret <1 x i64> %ret;
+}
+
+;; and
+define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %andr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %andr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;; or
+define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %orr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %orr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;; xor
+define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %xorr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %xorr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_uniform
+;; Defines the implementation of a function that handles the mapping from
+;; an ispc atomic function to the underlying LLVM intrinsics. This variant
+;; just calls the atomic once, for the given uniform value.
+;;
+;; Takes four parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
+
+define i32 @__get_first_active_lane()
+{
+  %nact = call i32 @__ballot_nvptx(i1 true);
+  %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact)
+  %lane = sub i32 31, %lane1
+  ret i32 %lane
+}
+
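
The helper above derives a lane index from the active-lane ballot: for a 32-bit ballot mask, 31 minus the count of leading zeros gives the index of the highest set bit, i.e. one of the currently active lanes. The bit arithmetic in plain C++ (the ballot value below is a made-up example, not from this commit):

// Bit arithmetic behind __get_first_active_lane, in plain C++.
#include <cstdint>
#include <cstdio>

int lane_from_ballot(uint32_t ballot) {
    // 31 - clz(ballot) is the index of the most significant set bit.
    return 31 - __builtin_clz(ballot);
}

int main() {
    uint32_t ballot = 0x16u;  // bits 1, 2 and 4 set: lanes 1, 2, 4 active
    printf("selected lane = %d\n", lane_from_ballot(ballot));  // prints 4
    return 0;
}
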
+define i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %nval = sub i32 0, %val;
+  %old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval);
+  ret i32 %old;
+}
+define i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+
+
+define i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %nval = sub i64 0, %val;
+  %old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval);
+  ret i64 %old;
+}
+define i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+
+define(`global_atomic_uniform',`
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline
+{
+entry:
+  %addr = ptrtoint $3 * %ptr to i64
+  %active = call i32 @__get_first_active_lane();
+  %lane = call i32 @__laneidx();
+  %c = icmp eq i32 %lane, %active
+  br i1 %c, label %p1, label %p2
+
+p1:
+  %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val);
+  br label %p2;
+
+p2:
+  %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry]
+  %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active)
+  ret $3 %old;
+}
+')
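
The global_atomic_uniform macro above implements a leader-lane pattern: the lane whose index matches __get_first_active_lane() performs the scalar atomic through the _nvptx helper, and the old value is then broadcast with __shfl so every lane returns the same result. A host-side C++ simulation of that control flow, with the gang modelled as a loop (illustrative only; lane count and values are made up):

// Host-side simulation of the leader-lane uniform atomic
// (illustrative sketch; not part of this commit).
#include <atomic>
#include <cstdio>

int main() {
    std::atomic<long> value{10};
    const int lanes = 4;
    bool active[lanes] = {false, true, true, true};

    // Pick one active lane as the leader (here the highest active index,
    // mirroring the 31 - clz(ballot) computation in the IR above).
    int leader = -1;
    for (int i = 0; i < lanes; ++i)
        if (active[i]) leader = i;

    // Only the leader performs the atomic; the result is then "broadcast".
    long old = 0;
    if (leader >= 0)
        old = value.fetch_add(5);       // uniform operand: one atomic total

    for (int i = 0; i < lanes; ++i)
        if (active[i])
            printf("lane %d sees old value %ld\n", i, old);
    return 0;
}
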
+global_atomic_uniform(1, add, i32, int32)
+global_atomic_uniform(1, sub, i32, int32)
+global_atomic_uniform(1, and, i32, int32)
+global_atomic_uniform(1, or, i32, int32)
+global_atomic_uniform(1, xor, i32, int32)
+global_atomic_uniform(1, min, i32, int32)
+global_atomic_uniform(1, max, i32, int32)
+global_atomic_uniform(1, umin, i32, uint32)
+global_atomic_uniform(1, umax, i32, uint32)
+
+global_atomic_uniform(1, add, i64, int64)
+global_atomic_uniform(1, sub, i64, int64)
+global_atomic_uniform(1, and, i64, int64)
+global_atomic_uniform(1, or, i64, int64)
+global_atomic_uniform(1, xor, i64, int64)
+global_atomic_uniform(1, min, i64, int64)
+global_atomic_uniform(1, max, i64, int64)
+global_atomic_uniform(1, umin, i64, uint64)
+global_atomic_uniform(1, umax, i64, uint64)
+
+define(`global_atomic_varying',`
+define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline
+{
+entry:
+  %addr = bitcast <1 x i64> %ptr to i64
+  %c = bitcast <1 x i1> %maskv to i1
+  br i1 %c, label %p1, label %p2
+
+p1:
+  %sv = bitcast <1 x $3> %val to $3
+  %sptr = inttoptr i64 %addr to $3*
+  %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv);
+  %t0v = bitcast $3 %t0 to <1 x $3>
+  ret <1 x $3> %t0v
+
+p2:
+  ret <1 x $3> %val
+}
+')
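
The varying form receives each lane's pointer as a 64-bit integer, converts it back to a typed pointer, and performs the scalar atomic only when the lane's mask bit is set; an inactive lane simply returns its input value. Per lane, the behaviour is roughly this C++ (illustrative sketch, hypothetical names):

// Per-lane view of the varying global atomic (illustrative sketch only).
#include <atomic>
#include <cstdint>

int32_t atomic_add_varying_int32_global(uint64_t ptr_bits, int32_t val, bool mask) {
    if (!mask)
        return val;  // inactive lane: memory untouched, value passed through
    auto *p = reinterpret_cast<std::atomic<int32_t> *>(ptr_bits);
    return p->fetch_add(val);  // active lane: old value at the lane's address
}
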
+global_atomic_varying(1, add, i32, int32)
+global_atomic_varying(1, sub, i32, int32)
+global_atomic_varying(1, and, i32, int32)
+global_atomic_varying(1, or, i32, int32)
+global_atomic_varying(1, xor, i32, int32)
+global_atomic_varying(1, min, i32, int32)
+global_atomic_varying(1, max, i32, int32)
+global_atomic_varying(1, umin, i32, uint32)
+global_atomic_varying(1, umax, i32, uint32)
+
+global_atomic_varying(1, add, i64, int64)
+global_atomic_varying(1, sub, i64, int64)
+global_atomic_varying(1, and, i64, int64)
+global_atomic_varying(1, or, i64, int64)
+global_atomic_varying(1, xor, i64, int64)
+global_atomic_varying(1, min, i64, int64)
+global_atomic_varying(1, max, i64, int64)
+global_atomic_varying(1, umin, i64, uint64)
+global_atomic_varying(1, umax, i64, uint64)
+
+;; Macro to declare the function that implements the swap atomic.
+;; Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_swap', `
+declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
+')
+
+
+;; Similarly, macro to declare the function that implements the compare/exchange
+;; atomic. Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_atomic_exchange', `
+
+declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
+                  <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
+
+declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
+                  $2 %val) nounwind alwaysinline ;
+')
+
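
These two macros only emit declarations; the swap and compare-exchange builtins keep their usual semantics: swap stores the new value and returns the previous contents, and compare-exchange stores the new value only if the current contents equal the comparand, returning what was in memory beforehand in either case. A C++ sketch of those semantics (illustrative, not the ispc/NVPTX implementation):

// Reference semantics for swap and compare-exchange, in plain C++
// (illustrative only).
#include <atomic>
#include <cstdint>

int32_t atomic_swap_uniform_int32(std::atomic<int32_t> *ptr, int32_t val) {
    return ptr->exchange(val);  // returns the old contents
}

int32_t atomic_compare_exchange_uniform_int32(std::atomic<int32_t> *ptr,
                                              int32_t cmp, int32_t val) {
    int32_t expected = cmp;
    // On success memory now holds val; on failure expected is updated to the
    // observed value. Either way, return what was in memory before the call.
    ptr->compare_exchange_strong(expected, val);
    return expected;
}
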
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; atomics and memory barriers
+
+global_swap(WIDTH, i32, int32)
+global_swap(WIDTH, i64, int64)
+
+declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
+declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
+global_atomic_exchange(WIDTH, i32, int32)
+global_atomic_exchange(WIDTH, i64, int64)
+
+declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
+                  <WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
+declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
+                  <WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
+declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
+                  float %val) nounwind alwaysinline ;
+declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
+                  double %val) nounwind alwaysinline ;
+
+declare void @llvm.nvvm.membar.gl()
+declare void @llvm.nvvm.membar.sys()
+declare void @llvm.nvvm.membar.cta()
+
+define void @__memory_barrier() nounwind readnone alwaysinline {
+  ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
+  ;; only get an MFENCE on x86 if "device" is true, but IMHO we should
+  ;; in the case where the first 4 args are true but it is false.
+  ;; So we just always set that to true...
+  call void @llvm.nvvm.membar.gl()
+  ret void
+}
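
On this path __memory_barrier is lowered to the device-wide NVVM barrier intrinsic (llvm.nvvm.membar.gl), taking the place of the llvm.memory.barrier based version that is removed in a later hunk. A loose portable analogue in C++ is a sequentially consistent fence (an analogy only, not a statement about PTX ordering semantics):

// Loose C++ analogy for a full memory barrier (illustrative only).
#include <atomic>

void memory_barrier() {
    // Orders all preceding loads/stores before all following ones,
    // as observed by other threads.
    std::atomic_thread_fence(std::memory_order_seq_cst);
}
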
@@ -768,27 +768,6 @@ shuffles(double, 8)
shuffles(i64, 8)
')

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic_associative
-;; More efficient implementation for atomics that are associative (e.g.,
-;; add, and, ...). If a basic implementation would do sometihng like:
-;; result0 = atomic_op(ptr, val0)
-;; result1 = atomic_op(ptr, val1)
-;; ..
-;; Then instead we can do:
-;; tmp = (val0 op val1 op ...)
-;; result0 = atomic_op(ptr, tmp)
-;; result1 = (result0 op val0)
-;; ..
-;; And more efficiently compute the same result
-;;
-;; Takes five parameters:
-;; $1: vector width of the target
-;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
-;; (add, sub...)
-;; $3: return type of the LLVM atomic (e.g. i32)
-;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
-;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)

define(`mask_converts', `
define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
@@ -875,54 +854,6 @@ define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {

mask_converts(WIDTH)

-define(`global_atomic_associative', `
-
-declare <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
-                  <$1 x MASK> %m) nounwind alwaysinline ;
-')
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic_uniform
-;; Defines the implementation of a function that handles the mapping from
-;; an ispc atomic function to the underlying LLVM intrinsics. This variant
-;; just calls the atomic once, for the given uniform value
-;;
-;; Takes four parameters:
-;; $1: vector width of the target
-;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
-;; (add, sub...)
-;; $3: return type of the LLVM atomic (e.g. i32)
-;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
-
-define(`global_atomic_uniform', `
-declare $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline ;
-')
-
-;; Macro to declare the function that implements the swap atomic.
-;; Takes three parameters:
-;; $1: vector width of the target
-;; $2: llvm type of the vector elements (e.g. i32)
-;; $3: ispc type of the elements (e.g. int32)
-
-define(`global_swap', `
-declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
-')
-
-
-;; Similarly, macro to declare the function that implements the compare/exchange
-;; atomic. Takes three parameters:
-;; $1: vector width of the target
-;; $2: llvm type of the vector elements (e.g. i32)
-;; $3: ispc type of the elements (e.g. int32)
-
-define(`global_atomic_exchange', `
-
-declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
-                  <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
-
-declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
-                  $2 %val) nounwind alwaysinline ;
-')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; count trailing zeros
@@ -2507,67 +2438,6 @@ define double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
ret double %r
}

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; atomics and memory barriers
-
-declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
-                  i1 %storestore, i1 %device)
-
-define void @__memory_barrier() nounwind readnone alwaysinline {
-  ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
-  ;; only get an MFENCE on x86 if "device" is true, but IMHO we should
-  ;; in the case where the first 4 args are true but it is false.
-  ;; So we just always set that to true...
-  call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
-  ret void
-}
-
-global_atomic_associative(WIDTH, add, i32, int32, 0)
-global_atomic_associative(WIDTH, sub, i32, int32, 0)
-global_atomic_associative(WIDTH, and, i32, int32, -1)
-global_atomic_associative(WIDTH, or, i32, int32, 0)
-global_atomic_associative(WIDTH, xor, i32, int32, 0)
-global_atomic_uniform(WIDTH, add, i32, int32)
-global_atomic_uniform(WIDTH, sub, i32, int32)
-global_atomic_uniform(WIDTH, and, i32, int32)
-global_atomic_uniform(WIDTH, or, i32, int32)
-global_atomic_uniform(WIDTH, xor, i32, int32)
-global_atomic_uniform(WIDTH, min, i32, int32)
-global_atomic_uniform(WIDTH, max, i32, int32)
-global_atomic_uniform(WIDTH, umin, i32, uint32)
-global_atomic_uniform(WIDTH, umax, i32, uint32)
-
-global_atomic_associative(WIDTH, add, i64, int64, 0)
-global_atomic_associative(WIDTH, sub, i64, int64, 0)
-global_atomic_associative(WIDTH, and, i64, int64, -1)
-global_atomic_associative(WIDTH, or, i64, int64, 0)
-global_atomic_associative(WIDTH, xor, i64, int64, 0)
-global_atomic_uniform(WIDTH, add, i64, int64)
-global_atomic_uniform(WIDTH, sub, i64, int64)
-global_atomic_uniform(WIDTH, and, i64, int64)
-global_atomic_uniform(WIDTH, or, i64, int64)
-global_atomic_uniform(WIDTH, xor, i64, int64)
-global_atomic_uniform(WIDTH, min, i64, int64)
-global_atomic_uniform(WIDTH, max, i64, int64)
-global_atomic_uniform(WIDTH, umin, i64, uint64)
-global_atomic_uniform(WIDTH, umax, i64, uint64)
-
-global_swap(WIDTH, i32, int32)
-global_swap(WIDTH, i64, int64)
-
-declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
-declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
-global_atomic_exchange(WIDTH, i32, int32)
-global_atomic_exchange(WIDTH, i64, int64)
-
-declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
-                  <WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
-declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
-                  <WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
-declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
-                  float %val) nounwind alwaysinline ;
-declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
-                  double %val) nounwind alwaysinline ;

')

stdlib.ispc (70 lines changed)
@@ -1814,7 +1814,7 @@ static inline void memory_barrier() {
    __memory_barrier();
}

-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    return ret; \
@@ -1825,6 +1825,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
+    if (__is_nvptx_target) { \
+        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
+        return ret; \
+    } else { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    TA ret; \
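
The macro change above adds a TC parameter to DEFINE_ATOMIC_OP, naming the integer type used to cast the varying pointer when handing it to the __atomic_*_varying_*_global builtin on the NVPTX target; other targets keep the existing per-lane loop. The dispatch shape, sketched in C++ (illustrative only; simplified types, not ispc source):

// Shape of the varying-pointer atomic dispatch added above
// (illustrative C++ sketch; simplified types and names).
#include <atomic>
#include <cstdint>

constexpr bool is_nvptx_target = false;   // stand-in for __is_nvptx_target
constexpr int program_count = 4;          // stand-in for programCount

// Stand-in for the gang-wide __atomic_add_varying_int32_global builtin.
void atomic_add_varying_builtin(std::atomic<int32_t> *ptrs[], const int32_t val[],
                                int32_t ret[]) {
    for (int i = 0; i < program_count; ++i)
        ret[i] = ptrs[i]->fetch_add(val[i]);
}

void atomic_add_global(std::atomic<int32_t> *ptrs[], const int32_t val[],
                       int32_t ret[]) {
    if (is_nvptx_target) {
        // New path: one call into the varying builtin for the whole gang.
        atomic_add_varying_builtin(ptrs, val, ret);
    } else {
        // Existing path: loop over the lanes, one uniform atomic per lane.
        for (int i = 0; i < program_count; ++i)
            ret[i] = ptrs[i]->fetch_add(val[i]);
    }
}
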
@@ -1835,6 +1839,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
        ret = insert(ret, i, r); \
    } \
    return ret; \
+    } \
} \

#define DEFINE_ATOMIC_SWAP(TA,TB) \
@@ -1888,7 +1893,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
    return ret; \
} \

-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value); \
    TA ret; \
@@ -1903,6 +1908,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                       TA value) { \
+    if (__is_nvptx_target) { \
+        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
+        return ret; \
+    } else { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    TA ret; \
@@ -1913,48 +1922,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
        ret = insert(ret, i, r); \
    } \
    return ret; \
+    } \
}

-DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int32,int32)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int32,int32)

DEFINE_ATOMIC_SWAP(float,float)

-DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int64,int64)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int64,int64)

DEFINE_ATOMIC_SWAP(double,double)