stdlib updates to take advantage of pointers
The packed_{load,store}_active functions now take a pointer to the
location at which to start loading/storing, rather than an array
base and a uniform index.
Variants of the prefetch functions that take varying pointers
are now available.
There are now variants of the various atomic functions that take
varying pointers (issue #112).
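For example (a minimal usage sketch of the new convention; the indices array
and numStored counter are illustrative names, not part of this patch), a call
site that previously passed an array base plus a uniform start offset now
passes the address of the first element to access:

    // old:  numStored += packed_store_active(indices, numStored, value);
    // new:  pass a pointer to the starting location directly
    numStored += packed_store_active(&indices[numStored], value);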
@@ -2393,11 +2393,10 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
 
 define(`packed_load_and_store', `
 
-define i32 @__packed_load_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> * %val_ptr,
+define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr,
         <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
   %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
-  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
   %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
 
@@ -2448,11 +2447,10 @@ done:
   ret i32 %nextoffset
 }
 
-define i32 @__packed_store_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> %vals,
+define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals,
         <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
   %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
-  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
   %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
 
@@ -238,7 +238,7 @@ IntersectLightsWithTileMinMax(
 
         // Pack and store intersecting lights
         cif (inFrustum) {
-            tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
+            tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
                 lightIndex);
         }
     }
@@ -692,21 +692,21 @@ SplitTileMinMax(
         // Pack and store intersecting lights
         // TODO: Experiment with a loop here instead
         cif (inFrustum[0])
-            subtileLightOffset[0] += packed_store_active(subtileIndices,
-                subtileLightOffset[0],
+            subtileLightOffset[0] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[0]],
                     lightIndex);
         cif (inFrustum[1])
-            subtileLightOffset[1] += packed_store_active(subtileIndices,
-                subtileLightOffset[1],
+            subtileLightOffset[1] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[1]],
                     lightIndex);
         cif (inFrustum[2])
-            subtileLightOffset[2] += packed_store_active(subtileIndices,
-                subtileLightOffset[2],
+            subtileLightOffset[2] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[2]],
                     lightIndex);
         cif (inFrustum[3])
-            subtileLightOffset[3] += packed_store_active(subtileIndices,
-                subtileLightOffset[3],
+            subtileLightOffset[3] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[3]],
                     lightIndex);
     }
 
     subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
stdlib.ispc
@@ -385,23 +385,57 @@ static inline void prefetch_nt(const void * uniform ptr) {
     __prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
 }
 
-#if 0
 static inline void prefetch_l1(const void * varying ptr) {
-    __prefetch_read_varying_1((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_l1(p);
+    }
 }
 
 static inline void prefetch_l2(const void * varying ptr) {
-    __prefetch_read_varying_2((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_l2(p);
+    }
 }
 
 static inline void prefetch_l3(const void * varying ptr) {
-    __prefetch_read_varying_3((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_l3(p);
+    }
 }
 
 static inline void prefetch_nt(const void * varying ptr) {
-    __prefetch_read_varying_nt((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_nt(p);
+    }
 }
-#endif
 
 ///////////////////////////////////////////////////////////////////////////
 // Horizontal ops / reductions
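The varying-pointer prefetch variants above are scalarized: each active lane's
pointer is stored into a uniform array, and the uniform prefetch is then issued
once per lane that is set in lanemask(). A rough usage sketch (data and index
are illustrative declarations, and the implicit conversion of the varying
pointer to const void * varying is assumed):

    uniform float data[1024];
    int index = programIndex * 7;   // per-lane index, illustrative only
    prefetch_l1(&data[index]);      // one prefetch per active program instance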
@@ -602,27 +636,25 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
 // packed load, store
 
 static inline uniform int
-packed_load_active(uniform unsigned int a[], uniform int start,
+packed_load_active(uniform unsigned int * uniform a,
         unsigned int * uniform vals) {
-    return __packed_load_active(&a[0], (unsigned int)start, vals,
-        (unsigned int32)__mask);
+    return __packed_load_active(a, vals, (unsigned int32)__mask);
 }
 
 static inline uniform int
-packed_store_active(uniform unsigned int a[], uniform int start,
+packed_store_active(uniform unsigned int * uniform a,
         unsigned int vals) {
-    return __packed_store_active(&a[0], (unsigned int)start, vals,
-        (unsigned int32)__mask);
+    return __packed_store_active(a, vals, (unsigned int32)__mask);
 }
 
-static inline uniform int packed_load_active(uniform int a[], uniform int start,
-        int * uniform vals) {
-    return __packed_load_active(&a[0], start, vals, (int32)__mask);
+static inline uniform int
+packed_load_active(uniform int * uniform a, int * uniform vals) {
+    return __packed_load_active(a, vals, (int32)__mask);
 }
 
-static inline uniform int packed_store_active(uniform int a[], uniform int start,
-        int vals) {
-    return __packed_store_active(&a[0], start, vals, (int32)__mask);
+static inline uniform int
+packed_store_active(uniform int * uniform a, int vals) {
+    return __packed_store_active(a, vals, (int32)__mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
@@ -649,10 +681,29 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
         uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
+            (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
-}
+} \
+static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount]; \
+    ptrArray[programIndex] = ptr; \
+    memory_barrier(); \
+    TA ret; \
+    uniform int mask = lanemask(); \
+    for (uniform int i = 0; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        uniform TA * uniform p = ptrArray[i]; \
+        uniform TA v = extract(value, i); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
+                (MASKTYPE)__mask); \
+        ret = insert(ret, i, r); \
+    } \
+    memory_barrier(); \
+    return ret; \
+} \
 
 #define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
@@ -660,7 +711,8 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     TA ret; \
     if (lanemask() != 0) { \
         memory_barrier(); \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, (MASKTYPE)__mask); \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
+                (MASKTYPE)__mask); \
         memory_barrier(); \
     } \
     return ret; \
@@ -668,7 +720,27 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
         uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
+            (MASKTYPE)__mask); \
+    memory_barrier(); \
+    return ret; \
+} \
+static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
+        TA value) { \
+    uniform TA * uniform ptrArray[programCount]; \
+    ptrArray[programIndex] = ptr; \
+    memory_barrier(); \
+    TA ret; \
+    uniform int mask = lanemask(); \
+    for (uniform int i = 0; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        uniform TA * uniform p = ptrArray[i]; \
+        uniform TA v = extract(value, i); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
+                (MASKTYPE)__mask); \
+        ret = insert(ret, i, r); \
+    } \
     memory_barrier(); \
     return ret; \
 }
@@ -723,14 +795,17 @@ DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
 static inline TA atomic_compare_exchange_global( \
         uniform TA * uniform ptr, TA oldval, TA newval) { \
     memory_barrier(); \
-    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
+    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
+            (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
 } \
 static inline uniform TA atomic_compare_exchange_global( \
         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
+    uniform TA ret = \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
+                (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
 }
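The varying-pointer atomic overloads generated by these macros let each
program instance target a different address; like the prefetch variants, they
loop over the lanes reported by lanemask() and issue one uniform atomic per
active lane. A hedged usage sketch, assuming atomic_add_global is among the
operations instantiated from DEFINE_ATOMIC_OP and using illustrative names
(counters, bucket):

    uniform int counters[16];
    int bucket = programIndex & 15;              // per-lane target selection
    uniform int * varying p = &counters[bucket];
    int oldValue = atomic_add_global(p, 1);      // scalarized per active lane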
@@ -5,7 +5,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform unsigned int a[programCount];
     a[programIndex] = aFOO[programIndex];
     unsigned int aa;
-    packed_load_active(a, 0, &aa);
+    packed_load_active(a, &aa);
     RET[programIndex] = aa;
 }
 
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa = 15;
     uniform int count = 0;
     if (programIndex < 2)
-        count += packed_load_active(a, 0, &aa);
+        count += packed_load_active(a, &aa);
     RET[programIndex] = aa;
 }
 
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa;
     uniform int count = 0;
     if (programIndex < 2)
-        count += packed_load_active(a, 0, &aa);
+        count += packed_load_active(a, &aa);
     RET[programIndex] = count;
 }
 
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa = 32;
     uniform int count = 0;
     if (programIndex < 2)
-        count += packed_load_active(a, 5, &aa);
+        count += packed_load_active(&a[5], &aa);
     RET[programIndex] = aa;
 }
 
@@ -8,9 +8,9 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa = 32;
     uniform int count = 0;
     if (programIndex & 1)
-        count += packed_load_active(a, 10, &aa);
+        count += packed_load_active(&a[10], &aa);
     if (!(programIndex & 1))
-        count += packed_load_active(a, 10+count, &aa);
+        count += packed_load_active(&a[10+count], &aa);
     RET[programIndex] = aa;
 }
 
@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform int pack[2+programCount];
     for (uniform int i = 0; i < 2+programCount; ++i)
         pack[i] = 0;
-    packed_store_active(pack, 2, a);
+    packed_store_active(&pack[2], a);
     RET[programIndex] = pack[programIndex];
 }
 
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     for (uniform int i = 0; i < 2+programCount; ++i)
         pack[i] = 0;
     if ((int)a & 1)
-        packed_store_active(pack, 2, a);
+        packed_store_active(&pack[2], a);
     RET[programIndex] = pack[programIndex];
 }
 
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
         pack[i] = 0;
     uniform int count = 0;
     if ((int)a & 1)
-        count += packed_store_active(pack, 2, a);
+        count += packed_store_active(&pack[2], a);
     RET[programIndex] = count;
 }
 
@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform unsigned int pack[programCount];
     for (uniform int i = 0; i < programCount; ++i)
         pack[i] = 0;
-    packed_store_active(pack, 0, (unsigned int)a);
+    packed_store_active(pack, (unsigned int)a);
     RET[programIndex] = pack[programIndex];
 }
 