Remove memory_barrier() calls from atomics.
This was unnecessary overhead to impose on all callers; the user should handle these as needed on their own. Also added some explanatory text to the documentation that highlights that memory_barrier() is only needed across HW threads/cores, not across program instances in a gang.
This commit is contained in:
@@ -3880,6 +3880,11 @@ code.
|
||||
|
||||
void memory_barrier();
|
||||
|
||||
Note that this barrier is *not* needed for coordinating reads and writes
|
||||
among the program instances in a gang; it's only needed for coordinating
|
||||
between multiple hardware threads running on different cores. See the
|
||||
section `Data Races Within a Gang`_ for the guarantees provided about
|
||||
memory read/write ordering across a gang.
|
||||
|
||||
Prefetches
|
||||
----------
|
||||
|
||||
29
stdlib.ispc
29
stdlib.ispc
@@ -1,6 +1,6 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -1588,22 +1588,17 @@ static inline void memory_barrier() {
|
||||
|
||||
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
memory_barrier(); \
|
||||
TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
memory_barrier(); \
|
||||
TA ret; \
|
||||
__foreach_active (i) { \
|
||||
uniform TA * uniform p = ptrArray[i]; \
|
||||
@@ -1611,13 +1606,11 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
|
||||
#define DEFINE_ATOMIC_SWAP(TA,TB) \
|
||||
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform int i = 0; \
|
||||
TA ret[programCount]; \
|
||||
TA memVal; \
|
||||
@@ -1647,20 +1640,16 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
|
||||
/* And the last instance that wanted to swap gets the value we \
|
||||
originally got back from memory... */ \
|
||||
ret[lastSwap] = memVal; \
|
||||
memory_barrier(); \
|
||||
return ret[programIndex]; \
|
||||
} \
|
||||
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
memory_barrier(); \
|
||||
TA ret; \
|
||||
__foreach_active (i) { \
|
||||
uniform TA * uniform p = ptrArray[i]; \
|
||||
@@ -1668,7 +1657,6 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
||||
uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
|
||||
@@ -1676,25 +1664,19 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
uniform TA oneval = reduce_##OPA(value); \
|
||||
TA ret; \
|
||||
if (lanemask() != 0) { \
|
||||
memory_barrier(); \
|
||||
if (lanemask() != 0) \
|
||||
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
|
||||
memory_barrier(); \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||
TA value) { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
memory_barrier(); \
|
||||
TA ret; \
|
||||
__foreach_active (i) { \
|
||||
uniform TA * uniform p = ptrArray[i]; \
|
||||
@@ -1702,7 +1684,6 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
@@ -1757,25 +1738,20 @@ DEFINE_ATOMIC_SWAP(double,double)
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
|
||||
static inline uniform TA atomic_compare_exchange_global( \
|
||||
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = \
|
||||
__atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_compare_exchange_global( \
|
||||
uniform TA * uniform ptr, TA oldval, TA newval) { \
|
||||
memory_barrier(); \
|
||||
TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
|
||||
(MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_compare_exchange_global( \
|
||||
uniform TA * varying ptr, TA oldval, TA newval) { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
memory_barrier(); \
|
||||
TA ret; \
|
||||
__foreach_active (i) { \
|
||||
uniform TA r = \
|
||||
@@ -1784,7 +1760,6 @@ static inline TA atomic_compare_exchange_global( \
|
||||
extract(newval, i)); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user