diff --git a/builtins.m4 b/builtins.m4
index 998b4820..2b98bd80 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -2393,11 +2393,10 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
 
 define(`packed_load_and_store', `
 
-define i32 @__packed_load_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> * %val_ptr,
+define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr,
                                  <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
   %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
-  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
   %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
 
@@ -2448,11 +2447,10 @@ done:
   ret i32 %nextoffset
 }
 
-define i32 @__packed_store_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> %vals,
+define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals,
                                   <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
   %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
-  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
   %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
 
diff --git a/examples/deferred/kernels.ispc b/examples/deferred/kernels.ispc
index 6ade1d82..fff09602 100644
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -238,7 +238,7 @@ IntersectLightsWithTileMinMax(
 
         // Pack and store intersecting lights
         cif (inFrustum) {
-            tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
+            tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
                                                  lightIndex);
         }
     }
@@ -692,21 +692,21 @@ SplitTileMinMax(
         // Pack and store intersecting lights
         // TODO: Experiment with a loop here instead
         cif (inFrustum[0])
-            subtileLightOffset[0] += packed_store_active(subtileIndices,
-                                                         subtileLightOffset[0],
-                                                         lightIndex);
+            subtileLightOffset[0] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[0]],
+                                    lightIndex);
         cif (inFrustum[1])
-            subtileLightOffset[1] += packed_store_active(subtileIndices,
-                                                         subtileLightOffset[1],
-                                                         lightIndex);
+            subtileLightOffset[1] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[1]],
+                                    lightIndex);
         cif (inFrustum[2])
-            subtileLightOffset[2] += packed_store_active(subtileIndices,
-                                                         subtileLightOffset[2],
-                                                         lightIndex);
+            subtileLightOffset[2] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[2]],
+                                    lightIndex);
         cif (inFrustum[3])
-            subtileLightOffset[3] += packed_store_active(subtileIndices,
-                                                         subtileLightOffset[3],
-                                                         lightIndex);
+            subtileLightOffset[3] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[3]],
+                                    lightIndex);
     }
 
     subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
diff --git a/stdlib.ispc b/stdlib.ispc
index c27dead7..a125d9af 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -385,23 +385,57 @@ static inline void prefetch_nt(const void * uniform ptr) {
     __prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
 }
 
-#if 0
 static inline void prefetch_l1(const void * varying ptr) {
-    __prefetch_read_varying_1((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_l1(p);
+    }
 }
 
 static inline void prefetch_l2(const void * varying ptr) {
-    __prefetch_read_varying_2((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_l2(p);
+    }
 }
 
 static inline void prefetch_l3(const void * varying ptr) {
-    __prefetch_read_varying_3((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_l3(p);
+    }
 }
 
 static inline void prefetch_nt(const void * varying ptr) {
-    __prefetch_read_varying_nt((varying int8 * varying)ptr);
+    const void * uniform ptrArray[programCount];
+    ptrArray[programIndex] = ptr;
+
+    uniform int mask = lanemask();
+    for (uniform int i = 0; i < programCount; ++i) {
+        if ((mask & (1 << i)) == 0)
+            continue;
+        const void * uniform p = ptrArray[i];
+        prefetch_nt(p);
+    }
 }
-#endif
 
 ///////////////////////////////////////////////////////////////////////////
 // Horizontal ops / reductions
@@ -602,27 +636,25 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
 // packed load, store
 
 static inline uniform int
-packed_load_active(uniform unsigned int a[], uniform int start,
+packed_load_active(uniform unsigned int * uniform a,
                    unsigned int * uniform vals) {
-    return __packed_load_active(&a[0], (unsigned int)start, vals,
-                                (unsigned int32)__mask);
+    return __packed_load_active(a, vals, (unsigned int32)__mask);
 }
 
 static inline uniform int
-packed_store_active(uniform unsigned int a[], uniform int start,
+packed_store_active(uniform unsigned int * uniform a,
                     unsigned int vals) {
-    return __packed_store_active(&a[0], (unsigned int)start, vals,
-                                 (unsigned int32)__mask);
+    return __packed_store_active(a, vals, (unsigned int32)__mask);
 }
 
-static inline uniform int packed_load_active(uniform int a[], uniform int start,
-                                             int * uniform vals) {
-    return __packed_load_active(&a[0], start, vals, (int32)__mask);
+static inline uniform int
+packed_load_active(uniform int * uniform a, int * uniform vals) {
+    return __packed_load_active(a, vals, (int32)__mask);
 }
 
-static inline uniform int packed_store_active(uniform int a[], uniform int start,
-                                              int vals) {
-    return __packed_store_active(&a[0], start, vals, (int32)__mask);
+static inline uniform int
+packed_store_active(uniform int * uniform a, int vals) {
+    return __packed_store_active(a, vals, (int32)__mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
@@ -649,10 +681,29 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
+                                                            (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
-}
+} \
+static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount]; \
+    ptrArray[programIndex] = ptr; \
+    memory_barrier(); \
+    TA ret; \
+    uniform int mask = lanemask(); \
+    for (uniform int i = 0; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        uniform TA * uniform p = ptrArray[i]; \
+        uniform TA v = extract(value, i); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
+                                                              (MASKTYPE)__mask); \
+        ret = insert(ret, i, r); \
+    } \
+    memory_barrier(); \
+    return ret; \
+} \
 
 #define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
@@ -660,7 +711,8 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     TA ret; \
     if (lanemask() != 0) { \
         memory_barrier(); \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, (MASKTYPE)__mask); \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
+                                                     (MASKTYPE)__mask); \
         memory_barrier(); \
     } \
     return ret; \
@@ -668,7 +720,27 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
+                                                            (MASKTYPE)__mask); \
+    memory_barrier(); \
+    return ret; \
+} \
+static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
+                                       TA value) { \
+    uniform TA * uniform ptrArray[programCount]; \
+    ptrArray[programIndex] = ptr; \
+    memory_barrier(); \
+    TA ret; \
+    uniform int mask = lanemask(); \
+    for (uniform int i = 0; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        uniform TA * uniform p = ptrArray[i]; \
+        uniform TA v = extract(value, i); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
+                                                              (MASKTYPE)__mask); \
+        ret = insert(ret, i, r); \
+    } \
     memory_barrier(); \
     return ret; \
 }
@@ -723,14 +795,17 @@ DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
 static inline TA atomic_compare_exchange_global( \
     uniform TA * uniform ptr, TA oldval, TA newval) { \
     memory_barrier(); \
-    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
+    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
+                                                     (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
 } \
 static inline uniform TA atomic_compare_exchange_global( \
-    uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
-    memory_barrier(); \
-    uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
+    uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
+    memory_barrier(); \
+    uniform TA ret = \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
+                                                        (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
 }
diff --git a/tests/packed-load-1.ispc b/tests/packed-load-1.ispc
index 7f645d6a..6f0f7009 100644
--- a/tests/packed-load-1.ispc
+++ b/tests/packed-load-1.ispc
@@ -5,7 +5,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform unsigned int a[programCount];
    a[programIndex] = aFOO[programIndex];
     unsigned int aa;
-    packed_load_active(a, 0, &aa);
+    packed_load_active(a, &aa);
     RET[programIndex] = aa;
 }
 
diff --git a/tests/packed-load-2.ispc b/tests/packed-load-2.ispc
index 97a3543e..bdeba7e4 100644
--- a/tests/packed-load-2.ispc
+++ b/tests/packed-load-2.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa = 15;
     uniform int count = 0;
     if (programIndex < 2)
-        count += packed_load_active(a, 0, &aa);
+        count += packed_load_active(a, &aa);
     RET[programIndex] = aa;
 }
 
diff --git a/tests/packed-load-3.ispc b/tests/packed-load-3.ispc
index 826aab38..150fd428 100644
--- a/tests/packed-load-3.ispc
+++ b/tests/packed-load-3.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa;
     uniform int count = 0;
     if (programIndex < 2)
-        count += packed_load_active(a, 0, &aa);
+        count += packed_load_active(a, &aa);
     RET[programIndex] = count;
 }
 
diff --git a/tests/packed-load-4.ispc b/tests/packed-load-4.ispc
index 13e4ce11..ddb1db83 100644
--- a/tests/packed-load-4.ispc
+++ b/tests/packed-load-4.ispc
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa = 32;
     uniform int count = 0;
     if (programIndex < 2)
-        count += packed_load_active(a, 5, &aa);
+        count += packed_load_active(&a[5], &aa);
     RET[programIndex] = aa;
 }
 
diff --git a/tests/packed-load-5.ispc b/tests/packed-load-5.ispc
index e26720ea..ee3dae7b 100644
--- a/tests/packed-load-5.ispc
+++ b/tests/packed-load-5.ispc
@@ -8,9 +8,9 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     int aa = 32;
     uniform int count = 0;
     if (programIndex & 1)
-        count += packed_load_active(a, 10, &aa);
+        count += packed_load_active(&a[10], &aa);
     if (!(programIndex & 1))
-        count += packed_load_active(a, 10+count, &aa);
+        count += packed_load_active(&a[10+count], &aa);
     RET[programIndex] = aa;
 }
 
diff --git a/tests/packed-store-1.ispc b/tests/packed-store-1.ispc
index 5d392379..b7bbf9f5 100644
--- a/tests/packed-store-1.ispc
+++ b/tests/packed-store-1.ispc
@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform int pack[2+programCount];
     for (uniform int i = 0; i < 2+programCount; ++i)
         pack[i] = 0;
-    packed_store_active(pack, 2, a);
+    packed_store_active(&pack[2], a);
     RET[programIndex] = pack[programIndex];
 }
 
diff --git a/tests/packed-store-2.ispc b/tests/packed-store-2.ispc
index c90a2f7b..6654f115 100644
--- a/tests/packed-store-2.ispc
+++ b/tests/packed-store-2.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     for (uniform int i = 0; i < 2+programCount; ++i)
         pack[i] = 0;
     if ((int)a & 1)
-        packed_store_active(pack, 2, a);
+        packed_store_active(&pack[2], a);
     RET[programIndex] = pack[programIndex];
 }
 
diff --git a/tests/packed-store-3.ispc b/tests/packed-store-3.ispc
index b59693a1..8cec64e7 100644
--- a/tests/packed-store-3.ispc
+++ b/tests/packed-store-3.ispc
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
         pack[i] = 0;
     uniform int count = 0;
     if ((int)a & 1)
-        count += packed_store_active(pack, 2, a);
+        count += packed_store_active(&pack[2], a);
     RET[programIndex] = count;
 }
 
diff --git a/tests/packed-store.ispc b/tests/packed-store.ispc
index 3c41f7d7..1863dec4 100644
--- a/tests/packed-store.ispc
+++ b/tests/packed-store.ispc
@@ -6,7 +6,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform unsigned int pack[programCount];
     for (uniform int i = 0; i < programCount; ++i)
         pack[i] = 0;
-    packed_store_active(pack, 0, (unsigned int)a);
+    packed_store_active(pack, (unsigned int)a);
     RET[programIndex] = pack[programIndex];
 }
 
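---
Usage note (not part of the patch; the function and variable names below are illustrative):
packed_load_active() and packed_store_active() no longer take a separate start offset, so a
caller folds the offset into the pointer and advances it by the returned count of active
lanes, as the deferred-shading kernels above now do. A minimal sketch of the new convention:

    // Sketch only: append each active lane's value to a uniform output array
    // and return the caller's updated offset.
    static uniform int
    append_active(uniform int * uniform out, uniform int offset, int value) {
        // old form was:  offset += packed_store_active(out, offset, value);
        return offset + packed_store_active(&out[offset], value);
    }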
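A second sketch, assuming the usual atomic_add_global() instantiation of DEFINE_ATOMIC_OP:
the new varying-pointer overloads let each active lane update a different location, with the
per-lane extract/insert scalarization handled inside the stdlib as shown in the patch.

    // Sketch only: per-lane histogram increment through a varying pointer.
    static void
    bump_bucket(uniform int * uniform counters, int bucket) {
        uniform int * varying slot = &counters[bucket];  // one slot per lane
        atomic_add_global(slot, 1);
    }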