diff --git a/builtins.cpp b/builtins.cpp
index 1c690295..47e0d2af 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -504,6 +504,9 @@ lSetInternalFunctions(llvm::Module *module) {
         "__prefetch_read_uniform_3",
         "__prefetch_read_uniform_nt",
         "__pseudo_prefetch_read_varying_1",
+        "__pseudo_prefetch_read_varying_2",
+        "__pseudo_prefetch_read_varying_3",
+        "__pseudo_prefetch_read_varying_nt",
         "__psubs_vi8",
         "__psubs_vi16",
         "__psubus_vi8",
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index ce4cf70d..19373633 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -372,6 +372,12 @@
 declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
 declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
 declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
+declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int8/int16 builtins
diff --git a/builtins/util.m4 b/builtins/util.m4
index 25868d69..fda60891 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1595,6 +1595,39 @@ define void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask
 }
 
 declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+
+define void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
+  call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1)
+  ')
+  ret void
+}
+
+declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+
+define void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
+  call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1)
+  ')
+  ret void
+}
+
+declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
+
+define void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
+  call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1)
+  ')
+  ret void
+}
+
+declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
 ')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2553,6 +2586,24 @@
 declare void
 @__pseudo_prefetch_read_varying_1_native(i8 *, i32, <WIDTH x i32>,
                                          <WIDTH x MASK>) nounwind
 
+declare void @__pseudo_prefetch_read_varying_2(<WIDTH x i64>, <WIDTH x MASK>) nounwind
+
+declare void
+@__pseudo_prefetch_read_varying_2_native(i8 *, i32, <WIDTH x i32>,
+                                         <WIDTH x MASK>) nounwind
+
+declare void @__pseudo_prefetch_read_varying_3(<WIDTH x i64>, <WIDTH x MASK>) nounwind
+
+declare void
+@__pseudo_prefetch_read_varying_3_native(i8 *, i32, <WIDTH x i32>,
+                                         <WIDTH x MASK>) nounwind
+
+declare void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64>, <WIDTH x MASK>) nounwind
+
+declare void
+@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, <WIDTH x i32>,
+                                          <WIDTH x MASK>) nounwind
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 declare void @__use8()
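Note: the per-lane implementations above differ only in the locality argument to @llvm.prefetch(ptr, rw, locality, cachetype); the new _2, _3, and _nt variants use 2, 1, and 0, and the existing __prefetch_read_varying_1 (whose body is outside this hunk) presumably uses locality 3, completing the sequence. A minimal ISPC sketch of how the stdlib entry points line up with those locality levels; the function and array names are illustrative, not part of this patch:

    // Each varying prefetch lowers to one @llvm.prefetch locality level on
    // targets without a native gather prefetch.
    void touch(uniform float data[], uniform int idx[]) {
        uniform float * varying p = &data[idx[programIndex]]; // one address per lane
        prefetch_l1(p); // locality 3: __prefetch_read_varying_1
        prefetch_l2(p); // locality 2: __prefetch_read_varying_2 (added here)
        prefetch_l3(p); // locality 1: __prefetch_read_varying_3 (added here)
        prefetch_nt(p); // locality 0: __prefetch_read_varying_nt (added here)
    }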
@@ -3063,6 +3114,30 @@ ifelse(HAVE_SCATTER, `1',
                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
     call void @__prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
 
+    call void @__pseudo_prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+    call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0,
+                                                       <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+    call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0,
+                                                <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+    call void @__prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+    call void @__pseudo_prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+    call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0,
+                                                       <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+    call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0,
+                                                <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+    call void @__prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+    call void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+    call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
+                                                        <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+    call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+    call void @__prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
     ret void
 }
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index c8f2cf08..f44c581e 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
 static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
 }
 
+#define PREFETCH_READ_VARYING(CACHE_NUM) \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
+                                                                     __vec16_i32 offsets, __vec16_i1 mask) {} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
 
 ///////////////////////////////////////////////////////////////////////////
 // atomics
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index f5bb233c..26505615 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
 static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
 }
 
+#define PREFETCH_READ_VARYING(CACHE_NUM) \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
+                                                                     __vec32_i32 offsets, __vec32_i1 mask) {} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {}
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
+
 ///////////////////////////////////////////////////////////////////////////
 // atomics
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index a7148c8b..b5caa008 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
 static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
 }
 
+#define PREFETCH_READ_VARYING(CACHE_NUM) \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
+                                                                     __vec64_i32 offsets, __vec64_i1 mask) {} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {}
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
+
 ///////////////////////////////////////////////////////////////////////////
 // atomics
diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index 0e7a849a..d9d9c011 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -2550,16 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
     // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
 }
 
-static FORCEINLINE void __prefetch_read_varying_1_native(uint8_t *base, uint32_t scale,
-                                                         __vec16_i32 offsets, __vec16_i1 mask) {
-    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0);
-    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);
-    __vec16_i1 copy_mask = _mm512_kmov(mask);
-    _mm512_kswapb(mask, copy_mask);
-    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0);
-}
+#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
+                                                                     __vec16_i32 offsets, __vec16_i1 mask) { \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
+    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets); \
+    __vec16_i1 copy_mask = _mm512_kmov(mask); \
+    _mm512_kswapb(mask, copy_mask); \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
+} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}
+
+PREFETCH_READ_VARYING(1, _MM_HINT_T0)
+PREFETCH_READ_VARYING(2, _MM_HINT_T1)
+PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
+
+static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
+                                                         __vec16_i32 offsets, __vec16_i1 mask) {}
+
+static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
 
-static FORCEINLINE void __prefetch_read_varying_1(__vec16_i64 addr, __vec16_i1 mask) {}
 ///////////////////////////////////////////////////////////////////////////
 // atomics
 ///////////////////////////////////////////////////////////////////////////
diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h
index 478ad75a..5eb8ea05 100644
--- a/examples/intrinsics/knc-i1x8.h
+++ b/examples/intrinsics/knc-i1x8.h
@@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
     // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
 }
 
+#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
+                                                                     __vec16_i32 offsets, __vec16_i1 mask) { \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
+    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets); \
+    __vec16_i1 copy_mask = _mm512_kmov(mask); \
+    _mm512_kswapb(mask, copy_mask); \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
+} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}
+
+PREFETCH_READ_VARYING(1, _MM_HINT_T0)
+PREFETCH_READ_VARYING(2, _MM_HINT_T1)
+PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
+
+static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
+                                                         __vec16_i32 offsets, __vec16_i1 mask) {}
+
+static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
+
 ///////////////////////////////////////////////////////////////////////////
 // atomics
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index e3598538..223c5051 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -1926,13 +1926,25 @@ static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) {
     // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
 }
 
-static FORCEINLINE void __prefetch_read_varying_1_native(uint8_t *base, uint32_t scale,
-                                                         __vec16_i32 offsets, __vec16_i1 mask) {
-    _mm512_prefetch_i32gather_ps(offsets, base, scale, _MM_HINT_T0);
-    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), offsets);
-    /* TODO: permutevar mask */
-    _mm512_prefetch_i32gather_ps(offsets, base, scale, _MM_HINT_T0);
-}
+#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
+                                                                     __vec16_i32 offsets, __vec16_i1 mask) { \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
+    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets); \
+    __vec16_i1 copy_mask = _mm512_kmov(mask); \
+    _mm512_kswapb(mask, copy_mask); \
+    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
+} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}
+
+PREFETCH_READ_VARYING(1, _MM_HINT_T0)
+PREFETCH_READ_VARYING(2, _MM_HINT_T1)
+PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
+
+static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
+                                                         __vec16_i32 offsets, __vec16_i1 mask) {}
+
+static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
 
 ///////////////////////////////////////////////////////////////////////////
 // atomics
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index a25af10b..765a931f 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) {
     _mm_prefetch((char *)ptr, _MM_HINT_NTA);
 }
 
+#define PREFETCH_READ_VARYING(CACHE_NUM) \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
+                                                                     __vec4_i32 offsets, __vec4_i1 mask) {} \
+static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {}
+
+PREFETCH_READ_VARYING(1)
+PREFETCH_READ_VARYING(2)
+PREFETCH_READ_VARYING(3)
+PREFETCH_READ_VARYING(nt)
 
 ///////////////////////////////////////////////////////////////////////////
 // atomics
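The opt.cpp hunks below register the new pseudo prefetches with the gather/scatter lowering passes: when the target reports hasVecPrefetch() (the KNC headers above), a varying prefetch whose address decomposes into a uniform base plus varying offsets is rewritten to the four-argument *_native(base, scale, offsets, mask) form so it can use the gather-prefetch intrinsics; otherwise it is lowered to the per-lane __prefetch_read_varying_* loop from util.m4. A hedged ISPC sketch of an access pattern that is a candidate for the native path; the names are illustrative, not part of this patch:

    // Uniform base plus varying 32-bit offsets: after the base+offset rewrite,
    // this prefetch can become a single __prefetch_read_varying_2_native call
    // on a target with a native gather prefetch.
    void prefetch_row(uniform float base[], uniform int stride) {
        prefetch_l2(&base[programIndex * stride]);
    }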
"__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + false, true), + + GSInfo("__pseudo_prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + false, true), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -2394,7 +2415,25 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { "__prefetch_read_varying_1", g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1", - false, true) + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + false, true), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -4334,6 +4373,21 @@ lReplacePseudoGS(llvm::CallInst *callInst) { "__prefetch_read_varying_1", false, true), LowerGSInfo("__pseudo_prefetch_read_varying_1_native", "__prefetch_read_varying_1_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_2", + "__prefetch_read_varying_2", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_2_native", + "__prefetch_read_varying_2_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_3", + "__prefetch_read_varying_3", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_3_native", + "__prefetch_read_varying_3_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_nt", + "__prefetch_read_varying_nt", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_nt_native", + "__prefetch_read_varying_nt_native", false, true), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -4641,7 +4695,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__scatter64_i8", "__scatter64_i16", "__scatter64_i32", "__scatter64_i64", "__scatter64_float", "__scatter64_double", - "__prefetch_read_varying_1", + "__prefetch_read_varying_1", "__prefetch_read_varying_2", + "__prefetch_read_varying_3", "__prefetch_read_varying_nt", "__keep_funcs_live", }; diff --git a/stdlib.ispc b/stdlib.ispc index 1aed23cf..d2111d72 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -824,33 +824,15 @@ static inline void prefetch_l1(const void * varying ptr) { } static inline void prefetch_l2(const void * varying ptr) { - const void * uniform ptrArray[programCount]; - ptrArray[programIndex] = ptr; - - foreach_active (i) { - const void * uniform p = ptrArray[i]; - prefetch_l2(p); - } + __pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask); } static inline void prefetch_l3(const void * varying ptr) { - const void * uniform ptrArray[programCount]; - ptrArray[programIndex] = ptr; - - foreach_active (i) { - const void * uniform p = ptrArray[i]; - prefetch_l3(p); - } + __pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask); } static inline void 
 prefetch_nt(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_nt(p);
-    }
+    __pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
diff --git a/tests/prefetch-varying.ispc b/tests/prefetch-varying.ispc
new file mode 100644
index 00000000..02df84c9
--- /dev/null
+++ b/tests/prefetch-varying.ispc
@@ -0,0 +1,22 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int64 *ptr = &(a[programIndex+zero]);
+    prefetch_l1(ptr);
+    prefetch_l2(ptr);
+    prefetch_l3(ptr);
+    prefetch_nt(ptr);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
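The new test touches each entry point once with an in-bounds varying pointer. In application code, varying prefetches are typically issued a fixed distance ahead of the gather they cover; a minimal usage sketch of that pattern (PREFETCH_DIST and all names here are illustrative, not part of the patch):

    #define PREFETCH_DIST 16

    export uniform float gather_sum(uniform float data[], uniform int idx[],
                                    uniform int n) {
        float sum = 0;
        foreach (i = 0 ... n) {
            // Prefetch the gather target PREFETCH_DIST iterations ahead,
            // clamped so the lookahead never indexes past the end of idx[].
            int ahead = min(i + PREFETCH_DIST, n - 1);
            prefetch_l2(&data[idx[ahead]]);
            sum += data[idx[i]];
        }
        return reduce_add(sum);
    }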