Remove memory_barrier() calls from atomics.

These calls imposed unnecessary overhead on all callers; users who
actually need the ordering can issue memory_barrier() themselves.

Also added some explanatory text to the documentation that highlights
that memory_barrier() is only needed across HW threads/cores, not
across program instances in a gang.
Matt Pharr
2012-04-10 19:37:03 -07:00
parent acfbe77ffc
commit 2aa61007c6
2 changed files with 7 additions and 27 deletions
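
With the implicit barriers gone, a caller that uses an atomic to publish
data to another hardware thread must fence explicitly. A minimal sketch of
the new calling convention (the function and variable names here are
illustrative, not part of this change):

    // Fill a buffer with plain stores, then bump a flag that a consumer
    // task polls from another core. The explicit memory_barrier() keeps
    // the payload stores ordered before the flag update.
    export void publish(uniform int data[], uniform int * uniform flag,
                        uniform int count) {
        foreach (i = 0 ... count)
            data[i] = i;
        memory_barrier();
        atomic_add_global(flag, 1);
    }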


@@ -3880,6 +3880,11 @@ code.
     void memory_barrier();
+Note that this barrier is *not* needed for coordinating reads and writes
+among the program instances in a gang; it's only needed for coordinating
+between multiple hardware threads running on different cores. See the
+section `Data Races Within a Gang`_ for the guarantees provided about
+memory read/write ordering across a gang.
 Prefetches
 ----------
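
By way of contrast, here is the kind of intra-gang communication the new
paragraph is describing. Per the `Data Races Within a Gang`_ guarantees,
no memory_barrier() is needed; this is an illustrative sketch, not text
from the manual:

    // Each program instance writes its own slot of a gang-shared array,
    // then reads a neighboring instance's slot. Gang execution ordering
    // makes the writes visible to the later reads without any barrier;
    // the same pattern would NOT be safe across separate hardware threads.
    float rotate_left(float value) {
        uniform float buf[programCount];
        buf[programIndex] = value;
        return buf[(programIndex + 1) % programCount];
    }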


@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -1588,22 +1588,17 @@ static inline void memory_barrier() {
 #define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier(); \
     TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
-    memory_barrier(); \
     return ret; \
 } \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
-    memory_barrier(); \
     uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier(); \
     return ret; \
 } \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
-    memory_barrier(); \
     TA ret; \
     __foreach_active (i) { \
         uniform TA * uniform p = ptrArray[i]; \
@@ -1611,13 +1606,11 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
         uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
         ret = insert(ret, i, r); \
     } \
-    memory_barrier(); \
     return ret; \
 } \
 
 #define DEFINE_ATOMIC_SWAP(TA,TB) \
 static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier(); \
     uniform int i = 0; \
     TA ret[programCount]; \
     TA memVal; \
@@ -1647,20 +1640,16 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
     /* And the last instance that wanted to swap gets the value we \
        originally got back from memory... */ \
     ret[lastSwap] = memVal; \
-    memory_barrier(); \
     return ret[programIndex]; \
 } \
 static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
                                             uniform TA value) { \
-    memory_barrier(); \
     uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
-    memory_barrier(); \
     return ret; \
 } \
 static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
-    memory_barrier(); \
     TA ret; \
     __foreach_active (i) { \
         uniform TA * uniform p = ptrArray[i]; \
@@ -1668,7 +1657,6 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
         uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
         ret = insert(ret, i, r); \
     } \
-    memory_barrier(); \
     return ret; \
 } \
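
As the implementation above suggests, a gang of swaps against one uniform
location behaves as if the swaps were applied one program instance at a
time. A hypothetical illustration (`slot` is made up for the example):

    // Each active instance swaps its programIndex into `slot`. Because
    // the swaps chain, the original contents of `slot` come back to
    // exactly one instance, and every other instance receives a value
    // that some other instance swapped in.
    uniform int slot = -1;
    int prev = atomic_swap_global(&slot, programIndex);
    // Afterward `slot` holds one instance's programIndex, and exactly
    // one lane of `prev` is the original -1.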
@@ -1676,25 +1664,19 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     uniform TA oneval = reduce_##OPA(value); \
     TA ret; \
-    if (lanemask() != 0) { \
-        memory_barrier(); \
+    if (lanemask() != 0) \
         ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
-        memory_barrier(); \
-    } \
     return ret; \
 } \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
-    memory_barrier(); \
     uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier(); \
     return ret; \
 } \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                        TA value) { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
-    memory_barrier(); \
     TA ret; \
     __foreach_active (i) { \
         uniform TA * uniform p = ptrArray[i]; \
@@ -1702,7 +1684,6 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
         uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
         ret = insert(ret, i, r); \
     } \
-    memory_barrier(); \
     return ret; \
 }
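
This last macro exploits the fact that when every instance applies a
commutative operation to one uniform location, the gang's operands can be
combined up front and a single hardware atomic issued. A caller-level
sketch of the same idea for add, recovering per-lane results with a scan
(a sketch assuming the stdlib's reduce_add() and exclusive_scan_add();
the helper name is hypothetical):

    // One atomic for the whole gang instead of one per active instance.
    // Inactive-lane masking is ignored here for brevity.
    static inline int atomic_add_batched(uniform int * uniform ptr, int value) {
        uniform int total = (uniform int)reduce_add(value); // gang-wide sum
        uniform int base = atomic_add_global(ptr, total);   // single atomic
        return base + exclusive_scan_add(value);            // per-lane old value
    }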
@@ -1757,25 +1738,20 @@ DEFINE_ATOMIC_SWAP(double,double)
 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
 static inline uniform TA atomic_compare_exchange_global( \
     uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
-    memory_barrier(); \
     uniform TA ret = \
         __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
-    memory_barrier(); \
     return ret; \
 } \
 static inline TA atomic_compare_exchange_global( \
     uniform TA * uniform ptr, TA oldval, TA newval) { \
-    memory_barrier(); \
     TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
                                                      (MASKTYPE)__mask); \
-    memory_barrier(); \
     return ret; \
 } \
 static inline TA atomic_compare_exchange_global( \
     uniform TA * varying ptr, TA oldval, TA newval) { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
-    memory_barrier(); \
     TA ret; \
     __foreach_active (i) { \
         uniform TA r = \
@@ -1784,7 +1760,6 @@ static inline TA atomic_compare_exchange_global( \
                 extract(newval, i)); \
         ret = insert(ret, i, r); \
     } \
-    memory_barrier(); \
     return ret; \
 }
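
Since compare-exchange no longer fences, anything built on it that needs
ordering, such as a lock, must now add the barriers itself. A hypothetical
spin-lock sketch on top of these primitives:

    // Acquire: spin until we swap 0 -> 1, then fence so the critical
    // section's memory accesses cannot move before the acquire.
    static inline void lock_acquire(uniform int * uniform lock) {
        while (atomic_compare_exchange_global(lock, 0, 1) != 0)
            ;
        memory_barrier();
    }

    // Release: fence first so the critical section's writes are visible
    // to the next owner, then store the unlocked value.
    static inline void lock_release(uniform int * uniform lock) {
        memory_barrier();
        atomic_swap_global(lock, 0);
    }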