diff --git a/builtins.cpp b/builtins.cpp index 10b9b861..1c690295 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -503,6 +503,7 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_2", "__prefetch_read_uniform_3", "__prefetch_read_uniform_nt", + "__pseudo_prefetch_read_varying_1", "__psubs_vi8", "__psubs_vi16", "__psubus_vi8", diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 225c2681..ce4cf70d 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -370,6 +370,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +declare void @__prefetch_read_varying_1( %addr, %mask) nounwind +declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int8/int16 builtins diff --git a/builtins/util.m4 b/builtins/util.m4 index 3a817a54..25868d69 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1584,6 +1584,17 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) ret void } + +define void @__prefetch_read_varying_1( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2535,6 +2546,13 @@ declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , , ) nounwind + +declare void @__pseudo_prefetch_read_varying_1(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_1_native(i8 *, i32, , + ) nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; declare void @__use8() @@ -3034,6 +3052,17 @@ ifelse(HAVE_SCATTER, `1', %vd, %mask) ') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prefetchs + + call void @__pseudo_prefetch_read_varying_1( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_1( %v64, %mask) + ret void } diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index be9cbd1c..0e7a849a 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -2550,6 +2550,16 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +static FORCEINLINE void __prefetch_read_varying_1_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) { + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets); + __vec16_i1 copy_mask = _mm512_kmov(mask); + _mm512_kswapb(mask, copy_mask); + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); +} + +static FORCEINLINE void __prefetch_read_varying_1(__vec16_i64 addr, __vec16_i1 mask) {} /////////////////////////////////////////////////////////////////////////// // atomics /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 9a66ce22..e3598538 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1926,6 +1926,14 @@ static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) { // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +static FORCEINLINE void __prefetch_read_varying_1_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) { + _mm512_prefetch_i32gather_ps(offsets, base, scale, _MM_HINT_T0); + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), offsets); + /* TODO: permutevar mask */ + _mm512_prefetch_i32gather_ps(offsets, base, scale, _MM_HINT_T0); +} + /////////////////////////////////////////////////////////////////////////// // atomics /////////////////////////////////////////////////////////////////////////// diff --git a/ispc.cpp b/ispc.cpp index 0a0df702..ad1bd455 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_hasTranscendentals(false), m_hasTrigonometry(false), m_hasRsqrtd(false), - m_hasRcpd(false) + m_hasRcpd(false), + m_hasVecPrefetch(false) { if (isa == NULL) { if (cpu != NULL) { @@ -381,6 +382,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTrigonometry = false; this->m_hasGather = this->m_hasScatter = true; this->m_hasRsqrtd = this->m_hasRcpd = true; + // It's set to true, because MIC has hardware vector prefetch instruction + this->m_hasVecPrefetch = true; } else if (!strcasecmp(isa, "generic-32") || !strcasecmp(isa, "generic-x32")) { diff --git a/ispc.h b/ispc.h index 2050ac44..9f9447f8 100644 --- a/ispc.h +++ b/ispc.h @@ -280,6 +280,8 @@ public: bool hasRcpd() const {return m_hasRcpd;} + bool hasVecPrefetch() const {return m_hasVecPrefetch;} + private: /** llvm Target object representing this target. */ @@ -382,6 +384,9 @@ private: /** Indicates whether there is an ISA double precision rcp. */ bool m_hasRcpd; + + /** Indicates whether the target has hardware instruction for vector prefetch. */ + bool m_hasVecPrefetch; }; diff --git a/opt.cpp b/opt.cpp index 3fb68810..e570919e 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1994,8 +1994,8 @@ static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) { struct GSInfo { GSInfo(const char *pgFuncName, const char *pgboFuncName, - const char *pgbo32FuncName, bool ig) - : isGather(ig) { + const char *pgbo32FuncName, bool ig, bool ip) + : isGather(ig), isPrefetch(ip) { func = m->module->getFunction(pgFuncName); baseOffsetsFunc = m->module->getFunction(pgboFuncName); baseOffsets32Func = m->module->getFunction(pgbo32FuncName); @@ -2003,6 +2003,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *func; llvm::Function *baseOffsetsFunc, *baseOffsets32Func; const bool isGather; + const bool isPrefetch; }; GSInfo gsFuncs[] = { @@ -2011,148 +2012,155 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { "__pseudo_gather_factored_base_offsets32_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8", - true), + true, false), GSInfo("__pseudo_gather32_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", - true), + true, false), GSInfo("__pseudo_gather32_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", - true), + true, false), GSInfo("__pseudo_gather32_float", g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSInfo("__pseudo_gather32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSInfo("__pseudo_gather32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSInfo("__pseudo_scatter32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSInfo("__pseudo_scatter32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSInfo("__pseudo_scatter32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSInfo("__pseudo_scatter32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSInfo("__pseudo_scatter32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSInfo("__pseudo_scatter32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), GSInfo("__pseudo_gather64_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" : "__pseudo_gather_factored_base_offsets64_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8", - true), + true, false), GSInfo("__pseudo_gather64_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" : "__pseudo_gather_factored_base_offsets64_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", - true), + true, false), GSInfo("__pseudo_gather64_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" : "__pseudo_gather_factored_base_offsets64_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", - true), + true, false), GSInfo("__pseudo_gather64_float", g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" : "__pseudo_gather_factored_base_offsets64_float", g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSInfo("__pseudo_gather64_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" : "__pseudo_gather_factored_base_offsets64_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSInfo("__pseudo_gather64_double", g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" : "__pseudo_gather_factored_base_offsets64_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSInfo("__pseudo_scatter64_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" : "__pseudo_scatter_factored_base_offsets64_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSInfo("__pseudo_scatter64_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" : "__pseudo_scatter_factored_base_offsets64_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSInfo("__pseudo_scatter64_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" : "__pseudo_scatter_factored_base_offsets64_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSInfo("__pseudo_scatter64_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" : "__pseudo_scatter_factored_base_offsets64_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSInfo("__pseudo_scatter64_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" : "__pseudo_scatter_factored_base_offsets64_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSInfo("__pseudo_scatter64_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" : "__pseudo_scatter_factored_base_offsets64_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), + + GSInfo("__pseudo_prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + false, true), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -2178,7 +2186,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector, callInst); - if (basePtr == NULL || offsetVector == NULL) + if (basePtr == NULL || offsetVector == NULL || + (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false)) // It's actually a fully general gather/scatter with a varying // set of base pointers, so leave it as is and continune onward // to the next instruction... @@ -2193,7 +2202,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *gatherScatterFunc = info->baseOffsetsFunc; if ((info->isGather == true && g->target->hasGather()) || - (info->isGather == false && g->target->hasScatter())) { + (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) || + (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) { + // See if the offsets are scaled by 2, 4, or 8. If so, // extract that scale factor and rewrite the offsets to remove // it. @@ -2207,7 +2218,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { gatherScatterFunc = info->baseOffsets32Func; } - if (info->isGather) { + if (info->isGather || info->isPrefetch) { llvm::Value *mask = callInst->getArgOperand(1); // Generate a new function call to the next pseudo gather @@ -2264,7 +2275,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { gatherScatterFunc = info->baseOffsets32Func; } - if (info->isGather) { + if (info->isGather || info->isPrefetch) { llvm::Value *mask = callInst->getArgOperand(1); // Generate a new function call to the next pseudo gather @@ -2306,13 +2317,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { struct GSBOInfo { - GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig) - : isGather(ig) { + GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip) + : isGather(ig), isPrefetch(ip) { baseOffsetsFunc = m->module->getFunction(pgboFuncName); baseOffsets32Func = m->module->getFunction(pgbo32FuncName); } llvm::Function *baseOffsetsFunc, *baseOffsets32Func; const bool isGather; + const bool isPrefetch; }; GSBOInfo gsFuncs[] = { @@ -2320,63 +2332,69 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { "__pseudo_gather_factored_base_offsets32_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + false, true) }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -4167,149 +4185,155 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) { static bool lReplacePseudoGS(llvm::CallInst *callInst) { struct LowerGSInfo { - LowerGSInfo(const char *pName, const char *aName, bool ig) - : isGather(ig) { + LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip) + : isGather(ig), isPrefetch(ip) { pseudoFunc = m->module->getFunction(pName); actualFunc = m->module->getFunction(aName); } llvm::Function *pseudoFunc; llvm::Function *actualFunc; const bool isGather; + const bool isPrefetch; }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), - LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), - LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), - LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true), - LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true), - LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true), + LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false), + LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false), + LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false), + LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false), + LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false), + LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false), - LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true), - LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true), - LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true), - LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true), - LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), - LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), + LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false), + LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false), + LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false), + LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false), + LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false), + LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", - "__gather_factored_base_offsets32_i8", true), + "__gather_factored_base_offsets32_i8", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", - "__gather_factored_base_offsets32_i16", true), + "__gather_factored_base_offsets32_i16", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", - "__gather_factored_base_offsets32_i32", true), + "__gather_factored_base_offsets32_i32", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", - "__gather_factored_base_offsets32_float", true), + "__gather_factored_base_offsets32_float", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", - "__gather_factored_base_offsets32_i64", true), + "__gather_factored_base_offsets32_i64", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", - "__gather_factored_base_offsets32_double", true), + "__gather_factored_base_offsets32_double", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", - "__gather_factored_base_offsets64_i8", true), + "__gather_factored_base_offsets64_i8", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", - "__gather_factored_base_offsets64_i16", true), + "__gather_factored_base_offsets64_i16", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", - "__gather_factored_base_offsets64_i32", true), + "__gather_factored_base_offsets64_i32", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", - "__gather_factored_base_offsets64_float", true), + "__gather_factored_base_offsets64_float", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", - "__gather_factored_base_offsets64_i64", true), + "__gather_factored_base_offsets64_i64", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", - "__gather_factored_base_offsets64_double", true), + "__gather_factored_base_offsets64_double", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i8", - "__gather_base_offsets32_i8", true), + "__gather_base_offsets32_i8", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i16", - "__gather_base_offsets32_i16", true), + "__gather_base_offsets32_i16", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i32", - "__gather_base_offsets32_i32", true), + "__gather_base_offsets32_i32", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_float", - "__gather_base_offsets32_float", true), + "__gather_base_offsets32_float", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i64", - "__gather_base_offsets32_i64", true), + "__gather_base_offsets32_i64", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_double", - "__gather_base_offsets32_double", true), + "__gather_base_offsets32_double", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i8", - "__gather_base_offsets64_i8", true), + "__gather_base_offsets64_i8", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i16", - "__gather_base_offsets64_i16", true), + "__gather_base_offsets64_i16", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i32", - "__gather_base_offsets64_i32", true), + "__gather_base_offsets64_i32", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_float", - "__gather_base_offsets64_float", true), + "__gather_base_offsets64_float", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i64", - "__gather_base_offsets64_i64", true), + "__gather_base_offsets64_i64", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_double", - "__gather_base_offsets64_double", true), + "__gather_base_offsets64_double", true, false), - LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), - LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), - LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false), - LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false), - LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false), - LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false), + LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false), + LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false), + LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false), + LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false), + LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false), + LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false), - LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false), - LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false), - LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false), - LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false), - LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), - LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false), + LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false), + LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false), + LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false), + LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false), + LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false), + LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", - "__scatter_factored_base_offsets32_i8", false), + "__scatter_factored_base_offsets32_i8", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", - "__scatter_factored_base_offsets32_i16", false), + "__scatter_factored_base_offsets32_i16", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", - "__scatter_factored_base_offsets32_i32", false), + "__scatter_factored_base_offsets32_i32", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", - "__scatter_factored_base_offsets32_float", false), + "__scatter_factored_base_offsets32_float", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", - "__scatter_factored_base_offsets32_i64", false), + "__scatter_factored_base_offsets32_i64", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", - "__scatter_factored_base_offsets32_double", false), + "__scatter_factored_base_offsets32_double", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", - "__scatter_factored_base_offsets64_i8", false), + "__scatter_factored_base_offsets64_i8", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", - "__scatter_factored_base_offsets64_i16", false), + "__scatter_factored_base_offsets64_i16", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", - "__scatter_factored_base_offsets64_i32", false), + "__scatter_factored_base_offsets64_i32", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", - "__scatter_factored_base_offsets64_float", false), + "__scatter_factored_base_offsets64_float", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", - "__scatter_factored_base_offsets64_i64", false), + "__scatter_factored_base_offsets64_i64", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", - "__scatter_factored_base_offsets64_double", false), + "__scatter_factored_base_offsets64_double", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i8", - "__scatter_base_offsets32_i8", false), + "__scatter_base_offsets32_i8", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i16", - "__scatter_base_offsets32_i16", false), + "__scatter_base_offsets32_i16", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i32", - "__scatter_base_offsets32_i32", false), + "__scatter_base_offsets32_i32", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_float", - "__scatter_base_offsets32_float", false), + "__scatter_base_offsets32_float", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i64", - "__scatter_base_offsets32_i64", false), + "__scatter_base_offsets32_i64", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_double", - "__scatter_base_offsets32_double", false), + "__scatter_base_offsets32_double", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i8", - "__scatter_base_offsets64_i8", false), + "__scatter_base_offsets64_i8", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i16", - "__scatter_base_offsets64_i16", false), + "__scatter_base_offsets64_i16", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i32", - "__scatter_base_offsets64_i32", false), + "__scatter_base_offsets64_i32", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_float", - "__scatter_base_offsets64_float", false), + "__scatter_base_offsets64_float", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i64", - "__scatter_base_offsets64_i64", false), + "__scatter_base_offsets64_i64", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_double", - "__scatter_base_offsets64_double", false), + "__scatter_base_offsets64_double", false, false), + + LowerGSInfo("__pseudo_prefetch_read_varying_1", + "__prefetch_read_varying_1", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_1_native", + "__prefetch_read_varying_1_native", false, true), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -4336,7 +4360,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) { if (gotPosition && g->target->getVectorWidth() > 1) { if (info->isGather) PerformanceWarning(pos, "Gather required to load value."); - else + else if (!info->isPrefetch) PerformanceWarning(pos, "Scatter required to store value."); } return true; @@ -4617,6 +4641,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__scatter64_i8", "__scatter64_i16", "__scatter64_i32", "__scatter64_i64", "__scatter64_float", "__scatter64_double", + "__prefetch_read_varying_1", "__keep_funcs_live", }; diff --git a/stdlib.ispc b/stdlib.ispc index a3845ded..1aed23cf 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -820,13 +820,7 @@ static inline void prefetch_nt(const void * uniform ptr) { } static inline void prefetch_l1(const void * varying ptr) { - const void * uniform ptrArray[programCount]; - ptrArray[programIndex] = ptr; - - foreach_active (i) { - const void * uniform p = ptrArray[i]; - prefetch_l1(p); - } + __pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask); } static inline void prefetch_l2(const void * varying ptr) {