stdlib updates to take advantage of pointers

The packed_{load,store}_active functions now take a pointer to the
location at which to start loading/storing, rather than an array
base and a uniform index.

Variants of the prefetch functions that take varying pointers
are now available.

There are now variants of the various atomic functions that take
varying pointers (issue #112).
This commit is contained in:
Matt Pharr
2011-11-29 15:41:38 -08:00
parent bbb32c0c5d
commit 11547cb950
12 changed files with 126 additions and 53 deletions

View File

@@ -2393,11 +2393,10 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
define(`packed_load_and_store', `
define i32 @__packed_load_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> * %val_ptr,
define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr,
<$1 x i32> %full_mask) nounwind alwaysinline {
entry:
%mask = call i32 @__movmsk(<$1 x i32> %full_mask)
%startptr = getelementptr i32 * %baseptr, i32 %start_offset
%mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
@@ -2448,11 +2447,10 @@ done:
ret i32 %nextoffset
}
define i32 @__packed_store_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> %vals,
define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals,
<$1 x i32> %full_mask) nounwind alwaysinline {
entry:
%mask = call i32 @__movmsk(<$1 x i32> %full_mask)
%startptr = getelementptr i32 * %baseptr, i32 %start_offset
%mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask

View File

@@ -238,7 +238,7 @@ IntersectLightsWithTileMinMax(
// Pack and store intersecting lights
cif (inFrustum) {
tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
lightIndex);
}
}
@@ -692,20 +692,20 @@ SplitTileMinMax(
// Pack and store intersecting lights
// TODO: Experiment with a loop here instead
cif (inFrustum[0])
subtileLightOffset[0] += packed_store_active(subtileIndices,
subtileLightOffset[0],
subtileLightOffset[0] +=
packed_store_active(&subtileIndices[subtileLightOffset[0]],
lightIndex);
cif (inFrustum[1])
subtileLightOffset[1] += packed_store_active(subtileIndices,
subtileLightOffset[1],
subtileLightOffset[1] +=
packed_store_active(&subtileIndices[subtileLightOffset[1]],
lightIndex);
cif (inFrustum[2])
subtileLightOffset[2] += packed_store_active(subtileIndices,
subtileLightOffset[2],
subtileLightOffset[2] +=
packed_store_active(&subtileIndices[subtileLightOffset[2]],
lightIndex);
cif (inFrustum[3])
subtileLightOffset[3] += packed_store_active(subtileIndices,
subtileLightOffset[3],
subtileLightOffset[3] +=
packed_store_active(&subtileIndices[subtileLightOffset[3]],
lightIndex);
}

View File

@@ -385,23 +385,57 @@ static inline void prefetch_nt(const void * uniform ptr) {
__prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
}
#if 0
// Prefetch into L1 for a varying pointer: gather each active lane's
// pointer into a uniform array, then issue one uniform prefetch per
// active lane.  (The leftover direct __prefetch_read_varying_1 call
// from the previous implementation is removed.)
static inline void prefetch_l1(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;
    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        // Skip lanes that are not active in the current execution mask.
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_l1(p);  // dispatches to the uniform-pointer overload
    }
}
// Prefetch into L2 for a varying pointer: scatter lane pointers into a
// uniform array and prefetch each active lane's address individually.
// (The leftover direct __prefetch_read_varying_2 call from the previous
// implementation is removed.)
static inline void prefetch_l2(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;
    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        // Skip lanes that are not active in the current execution mask.
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_l2(p);  // dispatches to the uniform-pointer overload
    }
}
// Prefetch into L3 for a varying pointer: scatter lane pointers into a
// uniform array and prefetch each active lane's address individually.
// (The leftover direct __prefetch_read_varying_3 call from the previous
// implementation is removed.)
static inline void prefetch_l3(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;
    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        // Skip lanes that are not active in the current execution mask.
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_l3(p);  // dispatches to the uniform-pointer overload
    }
}
// Non-temporal prefetch for a varying pointer: scatter lane pointers
// into a uniform array and prefetch each active lane's address
// individually.  (The leftover direct __prefetch_read_varying_nt call
// from the previous implementation is removed.)
static inline void prefetch_nt(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;
    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        // Skip lanes that are not active in the current execution mask.
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_nt(p);  // dispatches to the uniform-pointer overload
    }
}
#endif
///////////////////////////////////////////////////////////////////////////
// Horizontal ops / reductions
@@ -602,27 +636,25 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
// packed load, store
// Load values for the currently-active program instances starting at
// *a, packing them into successive lanes of *vals.  Returns the number
// of values loaded (== number of active lanes).  The old array+offset
// signature line and its return statement are removed; only the
// pointer-based form remains.
static inline uniform int
packed_load_active(uniform unsigned int * uniform a,
                   unsigned int * uniform vals) {
    return __packed_load_active(a, vals, (unsigned int32)__mask);
}
// Store each active lane's value of vals contiguously starting at *a.
// Returns the number of values stored (== number of active lanes).
// The old array+offset signature line and its return statement are
// removed; only the pointer-based form remains.
static inline uniform int
packed_store_active(uniform unsigned int * uniform a,
                    unsigned int vals) {
    return __packed_store_active(a, vals, (unsigned int32)__mask);
}
// Signed-int overload: load values for the active program instances
// starting at *a into *vals; returns the count loaded.  The leftover
// array+offset variant from the previous implementation is removed.
static inline uniform int
packed_load_active(uniform int * uniform a, int * uniform vals) {
    return __packed_load_active(a, vals, (int32)__mask);
}
// Signed-int overload: store each active lane's value contiguously
// starting at *a; returns the count stored.  The leftover array+offset
// variant from the previous implementation is removed.
static inline uniform int
packed_store_active(uniform int * uniform a, int vals) {
    return __packed_store_active(a, vals, (int32)__mask);
}
///////////////////////////////////////////////////////////////////////////
@@ -649,10 +681,29 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
uniform TA value) { \
memory_barrier(); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
(MASKTYPE)__mask); \
memory_barrier(); \
return ret; \
}
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
memory_barrier(); \
TA ret; \
uniform int mask = lanemask(); \
for (uniform int i = 0; i < programCount; ++i) { \
if ((mask & (1 << i)) == 0) \
continue; \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
(MASKTYPE)__mask); \
ret = insert(ret, i, r); \
} \
memory_barrier(); \
return ret; \
} \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
@@ -660,7 +711,8 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
TA ret; \
if (lanemask() != 0) { \
memory_barrier(); \
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, (MASKTYPE)__mask); \
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
(MASKTYPE)__mask); \
memory_barrier(); \
} \
return ret; \
@@ -668,7 +720,27 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
uniform TA value) { \
memory_barrier(); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
(MASKTYPE)__mask); \
memory_barrier(); \
return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
TA value) { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
memory_barrier(); \
TA ret; \
uniform int mask = lanemask(); \
for (uniform int i = 0; i < programCount; ++i) { \
if ((mask & (1 << i)) == 0) \
continue; \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
(MASKTYPE)__mask); \
ret = insert(ret, i, r); \
} \
memory_barrier(); \
return ret; \
}
@@ -723,14 +795,17 @@ DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
static inline TA atomic_compare_exchange_global( \
uniform TA * uniform ptr, TA oldval, TA newval) { \
memory_barrier(); \
TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
(MASKTYPE)__mask); \
memory_barrier(); \
return ret; \
} \
static inline uniform TA atomic_compare_exchange_global( \
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
memory_barrier(); \
uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
uniform TA ret = \
__atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
(MASKTYPE)__mask); \
memory_barrier(); \
return ret; \
}

View File

@@ -5,7 +5,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform unsigned int a[programCount];
a[programIndex] = aFOO[programIndex];
unsigned int aa;
packed_load_active(a, 0, &aa);
packed_load_active(a, &aa);
RET[programIndex] = aa;
}

View File

@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
int aa = 15;
uniform int count = 0;
if (programIndex < 2)
count += packed_load_active(a, 0, &aa);
count += packed_load_active(a, &aa);
RET[programIndex] = aa;
}

View File

@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
int aa;
uniform int count = 0;
if (programIndex < 2)
count += packed_load_active(a, 0, &aa);
count += packed_load_active(a, &aa);
RET[programIndex] = count;
}

View File

@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
int aa = 32;
uniform int count = 0;
if (programIndex < 2)
count += packed_load_active(a, 5, &aa);
count += packed_load_active(&a[5], &aa);
RET[programIndex] = aa;
}

View File

@@ -8,9 +8,9 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
int aa = 32;
uniform int count = 0;
if (programIndex & 1)
count += packed_load_active(a, 10, &aa);
count += packed_load_active(&a[10], &aa);
if (!(programIndex & 1))
count += packed_load_active(a, 10+count, &aa);
count += packed_load_active(&a[10+count], &aa);
RET[programIndex] = aa;
}

View File

@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int pack[2+programCount];
for (uniform int i = 0; i < 2+programCount; ++i)
pack[i] = 0;
packed_store_active(pack, 2, a);
packed_store_active(&pack[2], a);
RET[programIndex] = pack[programIndex];
}

View File

@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
for (uniform int i = 0; i < 2+programCount; ++i)
pack[i] = 0;
if ((int)a & 1)
packed_store_active(pack, 2, a);
packed_store_active(&pack[2], a);
RET[programIndex] = pack[programIndex];
}

View File

@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
pack[i] = 0;
uniform int count = 0;
if ((int)a & 1)
count += packed_store_active(pack, 2, a);
count += packed_store_active(&pack[2], a);
RET[programIndex] = count;
}

View File

@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform unsigned int pack[programCount];
for (uniform int i = 0; i < programCount; ++i)
pack[i] = 0;
packed_store_active(pack, 0, (unsigned int)a);
packed_store_active(pack, (unsigned int)a);
RET[programIndex] = pack[programIndex];
}