Add prefetch support for cache levels 2/3 and the non-temporal hint on all targets

This commit is contained in:
Vsevolod Livinskiy
2014-10-02 16:25:23 +04:00
parent 0a6eb61ad0
commit eb61d5df72
13 changed files with 262 additions and 39 deletions

View File

@@ -504,6 +504,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_3",
"__prefetch_read_uniform_nt",
"__pseudo_prefetch_read_varying_1",
"__pseudo_prefetch_read_varying_2",
"__pseudo_prefetch_read_varying_3",
"__pseudo_prefetch_read_varying_nt",
"__psubs_vi8",
"__psubs_vi16",
"__psubus_vi8",

View File

@@ -372,6 +372,12 @@ declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
;; Varying prefetch builtins. The plain @__prefetch_read_varying_<N> forms
;; take a vector of per-lane 64-bit addresses plus an execution mask; the
;; *_native forms take base + scale + 32-bit offsets, for targets with a
;; hardware gather-prefetch instruction. <N> selects the cache hint:
;; 1, 2, 3, or nt (non-temporal).
declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

View File

@@ -1595,6 +1595,39 @@ define void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask
}
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;; Masked varying prefetch with an L2-level hint. per_lane() (an m4 macro)
;; expands to a loop over the active lanes; for each active lane the i64
;; address is converted back to a pointer and passed to llvm.prefetch with
;; rw=0 (read), locality=2 (one step less temporal than the L1 variant's
;; locality=3), cache type=1 (data cache).
define void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;; Masked varying prefetch with an L3-level hint: same per-lane expansion as
;; the L1/L2 variants, but llvm.prefetch is called with locality=1 (low
;; temporal locality), rw=0 (read), cache type=1 (data cache).
define void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;; Masked varying non-temporal prefetch: llvm.prefetch with locality=0
;; (no temporal locality — do not displace cached data), rw=0 (read),
;; cache type=1 (data cache), issued per active lane.
define void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2553,6 +2586,24 @@ declare void
@__pseudo_prefetch_read_varying_1_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
;; Pseudo-prefetch declarations. The front end emits calls to these
;; __pseudo_* entry points; the optimizer later rewrites them to the
;; target's real __prefetch_read_varying_* implementations (see the
;; LowerGSInfo table in opt.cpp).
declare void @__pseudo_prefetch_read_varying_2(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_2_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_3(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_3_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
declare void @__use8(<WIDTH x i8>)
@@ -3063,6 +3114,30 @@ ifelse(HAVE_SCATTER, `1',
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
ret void
}

View File

@@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
// Varying prefetch is not supported on this 16-wide generic target, so both
// the base+scale+offsets ("native") form and the address-vector form are
// defined as no-op stubs for every cache hint (1, 2, 3, nt).
// NOTE: the macro body must NOT end with a line continuation; a trailing '\'
// would splice the first invocation below into the macro body, leaving an
// unexpandable self-reference in the later expansions.
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
                                                                     __vec16_i32 offsets, __vec16_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}

PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
// Varying prefetch is not supported on this 32-wide generic target, so both
// the base+scale+offsets ("native") form and the address-vector form are
// defined as no-op stubs for every cache hint (1, 2, 3, nt).
// NOTE: the macro body must NOT end with a line continuation; a trailing '\'
// would splice the first invocation below into the macro body, leaving an
// unexpandable self-reference in the later expansions.
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
                                                                     __vec32_i32 offsets, __vec32_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {}

PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
// Varying prefetch is not supported on this 64-wide generic target, so both
// the base+scale+offsets ("native") form and the address-vector form are
// defined as no-op stubs for every cache hint (1, 2, 3, nt).
// NOTE: the macro body must NOT end with a line continuation; a trailing '\'
// would splice the first invocation below into the macro body, leaving an
// unexpandable self-reference in the later expansions.
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
                                                                     __vec64_i32 offsets, __vec64_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {}

PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -2550,16 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
static FORCEINLINE void __prefetch_read_varying_1_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0);
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);
__vec16_i1 copy_mask = _mm512_kmov(mask);
_mm512_kswapb(mask, copy_mask);
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0);
}
// Masked varying gather-prefetch for this 16-wide KNC-style target. The
// gather-prefetch instruction only covers part of the lanes per issue, so it
// is issued twice: the second issue permutes the offset vector and swaps the
// mask halves (via kswapb) so the remaining lanes are covered (presumed from
// the permute pattern — confirm against the KNC ISA reference). HINT selects
// the cache level (_MM_HINT_T0/T1/T2).
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
                                                                     __vec16_i32 offsets, __vec16_i1 mask) { \
    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets); \
    __vec16_i1 copy_mask = _mm512_kmov(mask); \
    _mm512_kswapb(mask, copy_mask); \
    /* bug fix: was hard-coded _MM_HINT_T0 — the second half must honor the
       same HINT as the first, or L2/NT prefetches land in L1 */ \
    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}

PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_1(__vec16_i64 addr, __vec16_i1 mask) {}
///////////////////////////////////////////////////////////////////////////
// atomics
///////////////////////////////////////////////////////////////////////////

View File

@@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
// Masked varying gather-prefetch for this 16-wide target. The gather-prefetch
// instruction only covers part of the lanes per issue, so it is issued twice:
// the second issue permutes the offset vector and swaps the mask halves (via
// kswapb) so the remaining lanes are covered (presumed from the permute
// pattern — confirm against the ISA reference). HINT selects the cache level
// (_MM_HINT_T0/T1/T2).
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
                                                                     __vec16_i32 offsets, __vec16_i1 mask) { \
    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets); \
    __vec16_i1 copy_mask = _mm512_kmov(mask); \
    _mm512_kswapb(mask, copy_mask); \
    /* bug fix: was hard-coded _MM_HINT_T0 — the second half must honor the
       same HINT as the first, or L2/NT prefetches land in L1 */ \
    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}

PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
// The cache-3 variants are no-op stubs — presumably no _MM_HINT level maps
// to an L3-only gather-prefetch on this target; confirm.
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
                                                         __vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1926,13 +1926,25 @@ static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
static FORCEINLINE void __prefetch_read_varying_1_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {
_mm512_prefetch_i32gather_ps(offsets, base, scale, _MM_HINT_T0);
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), offsets);
/* TODO: permutevar mask */
_mm512_prefetch_i32gather_ps(offsets, base, scale, _MM_HINT_T0);
}
// Masked varying gather-prefetch for this 16-wide target (replaces the old
// unmasked _mm512_prefetch_i32gather_ps path). Issued twice so all lanes are
// covered: the second issue permutes the offsets and swaps the mask halves
// via kswapb (presumed from the permute pattern — confirm against the ISA
// reference). HINT selects the cache level (_MM_HINT_T0/T1/T2).
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
                                                                     __vec16_i32 offsets, __vec16_i1 mask) { \
    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
    offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets); \
    __vec16_i1 copy_mask = _mm512_kmov(mask); \
    _mm512_kswapb(mask, copy_mask); \
    /* bug fix: was hard-coded _MM_HINT_T0 — the second half must honor the
       same HINT as the first, or L2/NT prefetches land in L1 */ \
    _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}

PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
// The cache-3 variants are no-op stubs — presumably no _MM_HINT level maps
// to an L3-only gather-prefetch on this target; confirm.
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
                                                         __vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) {
_mm_prefetch((char *)ptr, _MM_HINT_NTA);
}
// Varying (per-lane) prefetch has no hardware support on this 4-wide target
// (only the uniform _mm_prefetch above), so both the base+scale+offsets
// ("native") form and the address-vector form are no-op stubs for every
// cache hint (1, 2, 3, nt).
// NOTE: the macro body must NOT end with a line continuation; a trailing '\'
// would splice the first invocation below into the macro body, leaving an
// unexpandable self-reference in the later expansions.
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
                                                                     __vec4_i32 offsets, __vec4_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {}

PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

59
opt.cpp
View File

@@ -2161,6 +2161,27 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
false, true),
GSInfo("__pseudo_prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
false, true),
GSInfo("__pseudo_prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
false, true),
GSInfo("__pseudo_prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
false, true),
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -2394,7 +2415,25 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
"__prefetch_read_varying_1",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
false, true)
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
false, true),
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -4334,6 +4373,21 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
"__prefetch_read_varying_1", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
"__prefetch_read_varying_1_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_2",
"__prefetch_read_varying_2", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
"__prefetch_read_varying_2_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_3",
"__prefetch_read_varying_3", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
"__prefetch_read_varying_3_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_nt",
"__prefetch_read_varying_nt", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
"__prefetch_read_varying_nt_native", false, true),
};
llvm::Function *calledFunc = callInst->getCalledFunction();
@@ -4641,7 +4695,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
"__scatter64_i8", "__scatter64_i16",
"__scatter64_i32", "__scatter64_i64",
"__scatter64_float", "__scatter64_double",
"__prefetch_read_varying_1",
"__prefetch_read_varying_1", "__prefetch_read_varying_2",
"__prefetch_read_varying_3", "__prefetch_read_varying_nt",
"__keep_funcs_live",
};

View File

@@ -824,33 +824,15 @@ static inline void prefetch_l1(const void * varying ptr) {
}
static inline void prefetch_l2(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l2(p);
}
__pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
}
static inline void prefetch_l3(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l3(p);
}
__pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
}
static inline void prefetch_nt(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_nt(p);
}
__pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
}
///////////////////////////////////////////////////////////////////////////

View File

@@ -0,0 +1,22 @@
// Reports the gang width so the test driver can size its buffers.
export uniform int width() { return programCount; }
// Non-const global used in f_f's index computation below — presumably kept
// as a runtime value so the compiler cannot constant-fold the addressing;
// confirm intent.
int64 zero = 0;
// Test kernel: copies aFOO into a per-lane int64 array, forms a varying
// pointer to each lane's element, exercises all four varying prefetch
// flavors (L1, L2, L3, non-temporal), then loads through the pointer so a
// checkable value is returned via RET.
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
// 'zero' is a runtime global, keeping the index from being constant-folded.
int64 *ptr = &(a[programIndex+zero]);
prefetch_l1(ptr);
prefetch_l2(ptr);
prefetch_l3(ptr);
prefetch_nt(ptr);
// NOTE(review): loading int64 through ptr into a 32-bit int truncates;
// harmless only if aFOO holds small values — confirm against the harness.
int g = *ptr;
RET[programIndex] = g;
}
// Reference results: lane i is expected to produce 1 + i (assumes the test
// driver fills aFOO[i] with i + 1 — confirm against the harness).
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}