stdlib updates to take advantage of pointers
The packed_{load,store}_active functions now take a pointer to a
location at which to start loading/storing, rather than an array
base and a uniform index.
Variants of the prefetch functions that take varying pointers
are now available.
There are now variants of the various atomic functions that take
varying pointers (issue #112).
This commit is contained in:
@@ -2393,11 +2393,10 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
|
||||
|
||||
define(`packed_load_and_store', `
|
||||
|
||||
define i32 @__packed_load_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> * %val_ptr,
|
||||
define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr,
|
||||
<$1 x i32> %full_mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<$1 x i32> %full_mask)
|
||||
%startptr = getelementptr i32 * %baseptr, i32 %start_offset
|
||||
%mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
|
||||
br i1 %mask_known, label %known_mask, label %unknown_mask
|
||||
|
||||
@@ -2448,11 +2447,10 @@ done:
|
||||
ret i32 %nextoffset
|
||||
}
|
||||
|
||||
define i32 @__packed_store_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> %vals,
|
||||
define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals,
|
||||
<$1 x i32> %full_mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<$1 x i32> %full_mask)
|
||||
%startptr = getelementptr i32 * %baseptr, i32 %start_offset
|
||||
%mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
|
||||
br i1 %mask_known, label %known_mask, label %unknown_mask
|
||||
|
||||
|
||||
@@ -238,7 +238,7 @@ IntersectLightsWithTileMinMax(
|
||||
|
||||
// Pack and store intersecting lights
|
||||
cif (inFrustum) {
|
||||
tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
|
||||
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
|
||||
lightIndex);
|
||||
}
|
||||
}
|
||||
@@ -692,21 +692,21 @@ SplitTileMinMax(
|
||||
// Pack and store intersecting lights
|
||||
// TODO: Experiment with a loop here instead
|
||||
cif (inFrustum[0])
|
||||
subtileLightOffset[0] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[0],
|
||||
lightIndex);
|
||||
subtileLightOffset[0] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[0]],
|
||||
lightIndex);
|
||||
cif (inFrustum[1])
|
||||
subtileLightOffset[1] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[1],
|
||||
lightIndex);
|
||||
subtileLightOffset[1] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[1]],
|
||||
lightIndex);
|
||||
cif (inFrustum[2])
|
||||
subtileLightOffset[2] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[2],
|
||||
lightIndex);
|
||||
subtileLightOffset[2] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[2]],
|
||||
lightIndex);
|
||||
cif (inFrustum[3])
|
||||
subtileLightOffset[3] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[3],
|
||||
lightIndex);
|
||||
subtileLightOffset[3] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[3]],
|
||||
lightIndex);
|
||||
}
|
||||
|
||||
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
|
||||
|
||||
127
stdlib.ispc
127
stdlib.ispc
@@ -385,23 +385,57 @@ static inline void prefetch_nt(const void * uniform ptr) {
|
||||
__prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
|
||||
}
|
||||
|
||||
#if 0
|
||||
static inline void prefetch_l1(const void * varying ptr) {
|
||||
__prefetch_read_varying_1((varying int8 * varying)ptr);
|
||||
const void * uniform ptrArray[programCount];
|
||||
ptrArray[programIndex] = ptr;
|
||||
|
||||
uniform int mask = lanemask();
|
||||
for (uniform int i = 0; i < programCount; ++i) {
|
||||
if ((mask & (1 << i)) == 0)
|
||||
continue;
|
||||
const void * uniform p = ptrArray[i];
|
||||
prefetch_l1(p);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void prefetch_l2(const void * varying ptr) {
|
||||
__prefetch_read_varying_2((varying int8 * varying)ptr);
|
||||
const void * uniform ptrArray[programCount];
|
||||
ptrArray[programIndex] = ptr;
|
||||
|
||||
uniform int mask = lanemask();
|
||||
for (uniform int i = 0; i < programCount; ++i) {
|
||||
if ((mask & (1 << i)) == 0)
|
||||
continue;
|
||||
const void * uniform p = ptrArray[i];
|
||||
prefetch_l2(p);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void prefetch_l3(const void * varying ptr) {
|
||||
__prefetch_read_varying_3((varying int8 * varying)ptr);
|
||||
const void * uniform ptrArray[programCount];
|
||||
ptrArray[programIndex] = ptr;
|
||||
|
||||
uniform int mask = lanemask();
|
||||
for (uniform int i = 0; i < programCount; ++i) {
|
||||
if ((mask & (1 << i)) == 0)
|
||||
continue;
|
||||
const void * uniform p = ptrArray[i];
|
||||
prefetch_l3(p);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void prefetch_nt(const void * varying ptr) {
|
||||
__prefetch_read_varying_nt((varying int8 * varying)ptr);
|
||||
const void * uniform ptrArray[programCount];
|
||||
ptrArray[programIndex] = ptr;
|
||||
|
||||
uniform int mask = lanemask();
|
||||
for (uniform int i = 0; i < programCount; ++i) {
|
||||
if ((mask & (1 << i)) == 0)
|
||||
continue;
|
||||
const void * uniform p = ptrArray[i];
|
||||
prefetch_nt(p);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Horizontal ops / reductions
|
||||
@@ -602,27 +636,25 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
|
||||
// packed load, store
|
||||
|
||||
static inline uniform int
|
||||
packed_load_active(uniform unsigned int a[], uniform int start,
|
||||
packed_load_active(uniform unsigned int * uniform a,
|
||||
unsigned int * uniform vals) {
|
||||
return __packed_load_active(&a[0], (unsigned int)start, vals,
|
||||
(unsigned int32)__mask);
|
||||
return __packed_load_active(a, vals, (unsigned int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int
|
||||
packed_store_active(uniform unsigned int a[], uniform int start,
|
||||
packed_store_active(uniform unsigned int * uniform a,
|
||||
unsigned int vals) {
|
||||
return __packed_store_active(&a[0], (unsigned int)start, vals,
|
||||
(unsigned int32)__mask);
|
||||
return __packed_store_active(a, vals, (unsigned int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int packed_load_active(uniform int a[], uniform int start,
|
||||
int * uniform vals) {
|
||||
return __packed_load_active(&a[0], start, vals, (int32)__mask);
|
||||
static inline uniform int
|
||||
packed_load_active(uniform int * uniform a, int * uniform vals) {
|
||||
return __packed_load_active(a, vals, (int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int packed_store_active(uniform int a[], uniform int start,
|
||||
int vals) {
|
||||
return __packed_store_active(&a[0], start, vals, (int32)__mask);
|
||||
static inline uniform int
|
||||
packed_store_active(uniform int * uniform a, int vals) {
|
||||
return __packed_store_active(a, vals, (int32)__mask);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -649,10 +681,29 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
|
||||
(MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
} \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
memory_barrier(); \
|
||||
TA ret; \
|
||||
uniform int mask = lanemask(); \
|
||||
for (uniform int i = 0; i < programCount; ++i) { \
|
||||
if ((mask & (1 << i)) == 0) \
|
||||
continue; \
|
||||
uniform TA * uniform p = ptrArray[i]; \
|
||||
uniform TA v = extract(value, i); \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
|
||||
(MASKTYPE)__mask); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
@@ -660,7 +711,8 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
TA ret; \
|
||||
if (lanemask() != 0) { \
|
||||
memory_barrier(); \
|
||||
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, (MASKTYPE)__mask); \
|
||||
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
|
||||
(MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
} \
|
||||
return ret; \
|
||||
@@ -668,7 +720,27 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
|
||||
(MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||
TA value) { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
memory_barrier(); \
|
||||
TA ret; \
|
||||
uniform int mask = lanemask(); \
|
||||
for (uniform int i = 0; i < programCount; ++i) { \
|
||||
if ((mask & (1 << i)) == 0) \
|
||||
continue; \
|
||||
uniform TA * uniform p = ptrArray[i]; \
|
||||
uniform TA v = extract(value, i); \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
|
||||
(MASKTYPE)__mask); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
@@ -723,14 +795,17 @@ DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
|
||||
static inline TA atomic_compare_exchange_global( \
|
||||
uniform TA * uniform ptr, TA oldval, TA newval) { \
|
||||
memory_barrier(); \
|
||||
TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
|
||||
TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
|
||||
(MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_compare_exchange_global( \
|
||||
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
|
||||
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = \
|
||||
__atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
|
||||
(MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform unsigned int a[programCount];
|
||||
a[programIndex] = aFOO[programIndex];
|
||||
unsigned int aa;
|
||||
packed_load_active(a, 0, &aa);
|
||||
packed_load_active(a, &aa);
|
||||
RET[programIndex] = aa;
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
int aa = 15;
|
||||
uniform int count = 0;
|
||||
if (programIndex < 2)
|
||||
count += packed_load_active(a, 0, &aa);
|
||||
count += packed_load_active(a, &aa);
|
||||
RET[programIndex] = aa;
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
int aa;
|
||||
uniform int count = 0;
|
||||
if (programIndex < 2)
|
||||
count += packed_load_active(a, 0, &aa);
|
||||
count += packed_load_active(a, &aa);
|
||||
RET[programIndex] = count;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
int aa = 32;
|
||||
uniform int count = 0;
|
||||
if (programIndex < 2)
|
||||
count += packed_load_active(a, 5, &aa);
|
||||
count += packed_load_active(&a[5], &aa);
|
||||
RET[programIndex] = aa;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,9 +8,9 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
int aa = 32;
|
||||
uniform int count = 0;
|
||||
if (programIndex & 1)
|
||||
count += packed_load_active(a, 10, &aa);
|
||||
count += packed_load_active(&a[10], &aa);
|
||||
if (!(programIndex & 1))
|
||||
count += packed_load_active(a, 10+count, &aa);
|
||||
count += packed_load_active(&a[10+count], &aa);
|
||||
RET[programIndex] = aa;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform int pack[2+programCount];
|
||||
for (uniform int i = 0; i < 2+programCount; ++i)
|
||||
pack[i] = 0;
|
||||
packed_store_active(pack, 2, a);
|
||||
packed_store_active(&pack[2], a);
|
||||
RET[programIndex] = pack[programIndex];
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
for (uniform int i = 0; i < 2+programCount; ++i)
|
||||
pack[i] = 0;
|
||||
if ((int)a & 1)
|
||||
packed_store_active(pack, 2, a);
|
||||
packed_store_active(&pack[2], a);
|
||||
RET[programIndex] = pack[programIndex];
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
pack[i] = 0;
|
||||
uniform int count = 0;
|
||||
if ((int)a & 1)
|
||||
count += packed_store_active(pack, 2, a);
|
||||
count += packed_store_active(&pack[2], a);
|
||||
RET[programIndex] = count;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform unsigned int pack[programCount];
|
||||
for (uniform int i = 0; i < programCount; ++i)
|
||||
pack[i] = 0;
|
||||
packed_store_active(pack, 0, (unsigned int)a);
|
||||
packed_store_active(pack, (unsigned int)a);
|
||||
RET[programIndex] = pack[programIndex];
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user