Merge pull request #864 from Vsevolod-Livinskij/opt_prefetch
Optimized prefetch
This commit is contained in:
@@ -503,6 +503,10 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__prefetch_read_uniform_2",
|
"__prefetch_read_uniform_2",
|
||||||
"__prefetch_read_uniform_3",
|
"__prefetch_read_uniform_3",
|
||||||
"__prefetch_read_uniform_nt",
|
"__prefetch_read_uniform_nt",
|
||||||
|
"__pseudo_prefetch_read_varying_1",
|
||||||
|
"__pseudo_prefetch_read_varying_2",
|
||||||
|
"__pseudo_prefetch_read_varying_3",
|
||||||
|
"__pseudo_prefetch_read_varying_nt",
|
||||||
"__psubs_vi8",
|
"__psubs_vi8",
|
||||||
"__psubs_vi16",
|
"__psubs_vi16",
|
||||||
"__psubus_vi8",
|
"__psubus_vi8",
|
||||||
|
|||||||
@@ -370,6 +370,14 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
|
|||||||
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
|
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
|
||||||
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
||||||
|
|
||||||
|
declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||||
|
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
|
declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||||
|
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
|
declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||||
|
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
|
declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||||
|
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; int8/int16 builtins
|
;; int8/int16 builtins
|
||||||
|
|
||||||
|
|||||||
104
builtins/util.m4
104
builtins/util.m4
@@ -1584,6 +1584,50 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline {
|
|||||||
call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1)
|
call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||||
|
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||||
|
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||||
|
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||||
|
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1)
|
||||||
|
')
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
|
|
||||||
|
define void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||||
|
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||||
|
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||||
|
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||||
|
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1)
|
||||||
|
')
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
|
|
||||||
|
define void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||||
|
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||||
|
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||||
|
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||||
|
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1)
|
||||||
|
')
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
|
|
||||||
|
define void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||||
|
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||||
|
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||||
|
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||||
|
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1)
|
||||||
|
')
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||||
')
|
')
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
@@ -2535,6 +2579,31 @@ declare void
|
|||||||
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
|
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
|
||||||
<WIDTH x double>, <WIDTH x MASK>) nounwind
|
<WIDTH x double>, <WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
|
||||||
|
declare void @__pseudo_prefetch_read_varying_1(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
declare void
|
||||||
|
@__pseudo_prefetch_read_varying_1_native(i8 *, i32, <WIDTH x i32>,
|
||||||
|
<WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
declare void @__pseudo_prefetch_read_varying_2(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
declare void
|
||||||
|
@__pseudo_prefetch_read_varying_2_native(i8 *, i32, <WIDTH x i32>,
|
||||||
|
<WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
declare void @__pseudo_prefetch_read_varying_3(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
declare void
|
||||||
|
@__pseudo_prefetch_read_varying_3_native(i8 *, i32, <WIDTH x i32>,
|
||||||
|
<WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
declare void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
|
declare void
|
||||||
|
@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, <WIDTH x i32>,
|
||||||
|
<WIDTH x MASK>) nounwind
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
declare void @__use8(<WIDTH x i8>)
|
declare void @__use8(<WIDTH x i8>)
|
||||||
@@ -3034,6 +3103,41 @@ ifelse(HAVE_SCATTER, `1',
|
|||||||
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
|
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
|
||||||
')
|
')
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; prefetchs
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
|
call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
|
||||||
|
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||||
|
call void @__prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||||
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
|
|||||||
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||||
|
__vec16_i32 offsets, __vec16_i1 mask) {} \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||||
|
|
||||||
|
PREFETCH_READ_VARYING(1)
|
||||||
|
PREFETCH_READ_VARYING(2)
|
||||||
|
PREFETCH_READ_VARYING(3)
|
||||||
|
PREFETCH_READ_VARYING(nt)
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// atomics
|
// atomics
|
||||||
|
|
||||||
|
|||||||
@@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
|
|||||||
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||||
|
__vec32_i32 offsets, __vec32_i1 mask) {} \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {} \
|
||||||
|
|
||||||
|
PREFETCH_READ_VARYING(1)
|
||||||
|
PREFETCH_READ_VARYING(2)
|
||||||
|
PREFETCH_READ_VARYING(3)
|
||||||
|
PREFETCH_READ_VARYING(nt)
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// atomics
|
// atomics
|
||||||
|
|
||||||
|
|||||||
@@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
|
|||||||
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||||
|
__vec64_i32 offsets, __vec64_i1 mask) {} \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {} \
|
||||||
|
|
||||||
|
PREFETCH_READ_VARYING(1)
|
||||||
|
PREFETCH_READ_VARYING(2)
|
||||||
|
PREFETCH_READ_VARYING(3)
|
||||||
|
PREFETCH_READ_VARYING(nt)
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// atomics
|
// atomics
|
||||||
|
|
||||||
|
|||||||
@@ -2550,6 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
|
|||||||
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||||
|
__vec16_i32 offsets, __vec16_i1 mask) { \
|
||||||
|
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
|
||||||
|
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
|
||||||
|
__vec16_i1 copy_mask = _mm512_kmov(mask); \
|
||||||
|
_mm512_kswapb(mask, copy_mask); \
|
||||||
|
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
|
||||||
|
} \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||||
|
|
||||||
|
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
|
||||||
|
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
|
||||||
|
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
|
||||||
|
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
|
||||||
|
__vec16_i32 offsets, __vec16_i1 mask) {}
|
||||||
|
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// atomics
|
// atomics
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|||||||
@@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
|
|||||||
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||||
|
__vec16_i32 offsets, __vec16_i1 mask) { \
|
||||||
|
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
|
||||||
|
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
|
||||||
|
__vec16_i1 copy_mask = _mm512_kmov(mask); \
|
||||||
|
_mm512_kswapb(mask, copy_mask); \
|
||||||
|
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
|
||||||
|
} \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||||
|
|
||||||
|
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
|
||||||
|
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
|
||||||
|
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
|
||||||
|
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
|
||||||
|
__vec16_i32 offsets, __vec16_i1 mask) {}
|
||||||
|
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// atomics
|
// atomics
|
||||||
|
|
||||||
|
|||||||
@@ -2218,6 +2218,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) {
|
|||||||
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||||
|
__vec16_i32 offsets, __vec16_i1 mask) { \
|
||||||
|
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
|
||||||
|
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
|
||||||
|
__vec16_i1 copy_mask = _mm512_kmov(mask); \
|
||||||
|
_mm512_kswapb(mask, copy_mask); \
|
||||||
|
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
|
||||||
|
} \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||||
|
|
||||||
|
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
|
||||||
|
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
|
||||||
|
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
|
||||||
|
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
|
||||||
|
__vec16_i32 offsets, __vec16_i1 mask) {}
|
||||||
|
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// atomics
|
// atomics
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|||||||
@@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) {
|
|||||||
_mm_prefetch((char *)ptr, _MM_HINT_NTA);
|
_mm_prefetch((char *)ptr, _MM_HINT_NTA);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||||
|
__vec4_i32 offsets, __vec4_i1 mask) {} \
|
||||||
|
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {} \
|
||||||
|
|
||||||
|
PREFETCH_READ_VARYING(1)
|
||||||
|
PREFETCH_READ_VARYING(2)
|
||||||
|
PREFETCH_READ_VARYING(3)
|
||||||
|
PREFETCH_READ_VARYING(nt)
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// atomics
|
// atomics
|
||||||
|
|
||||||
|
|||||||
5
ispc.cpp
5
ispc.cpp
@@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
m_hasTranscendentals(false),
|
m_hasTranscendentals(false),
|
||||||
m_hasTrigonometry(false),
|
m_hasTrigonometry(false),
|
||||||
m_hasRsqrtd(false),
|
m_hasRsqrtd(false),
|
||||||
m_hasRcpd(false)
|
m_hasRcpd(false),
|
||||||
|
m_hasVecPrefetch(false)
|
||||||
{
|
{
|
||||||
if (isa == NULL) {
|
if (isa == NULL) {
|
||||||
if (cpu != NULL) {
|
if (cpu != NULL) {
|
||||||
@@ -381,6 +382,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
this->m_hasTrigonometry = false;
|
this->m_hasTrigonometry = false;
|
||||||
this->m_hasGather = this->m_hasScatter = true;
|
this->m_hasGather = this->m_hasScatter = true;
|
||||||
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
||||||
|
// It's set to true, because MIC has hardware vector prefetch instruction
|
||||||
|
this->m_hasVecPrefetch = true;
|
||||||
}
|
}
|
||||||
else if (!strcasecmp(isa, "generic-32") ||
|
else if (!strcasecmp(isa, "generic-32") ||
|
||||||
!strcasecmp(isa, "generic-x32")) {
|
!strcasecmp(isa, "generic-x32")) {
|
||||||
|
|||||||
5
ispc.h
5
ispc.h
@@ -280,6 +280,8 @@ public:
|
|||||||
|
|
||||||
bool hasRcpd() const {return m_hasRcpd;}
|
bool hasRcpd() const {return m_hasRcpd;}
|
||||||
|
|
||||||
|
bool hasVecPrefetch() const {return m_hasVecPrefetch;}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
/** llvm Target object representing this target. */
|
/** llvm Target object representing this target. */
|
||||||
@@ -382,6 +384,9 @@ private:
|
|||||||
|
|
||||||
/** Indicates whether there is an ISA double precision rcp. */
|
/** Indicates whether there is an ISA double precision rcp. */
|
||||||
bool m_hasRcpd;
|
bool m_hasRcpd;
|
||||||
|
|
||||||
|
/** Indicates whether the target has hardware instruction for vector prefetch. */
|
||||||
|
bool m_hasVecPrefetch;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
318
opt.cpp
318
opt.cpp
@@ -1994,8 +1994,8 @@ static bool
|
|||||||
lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||||
struct GSInfo {
|
struct GSInfo {
|
||||||
GSInfo(const char *pgFuncName, const char *pgboFuncName,
|
GSInfo(const char *pgFuncName, const char *pgboFuncName,
|
||||||
const char *pgbo32FuncName, bool ig)
|
const char *pgbo32FuncName, bool ig, bool ip)
|
||||||
: isGather(ig) {
|
: isGather(ig), isPrefetch(ip) {
|
||||||
func = m->module->getFunction(pgFuncName);
|
func = m->module->getFunction(pgFuncName);
|
||||||
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
||||||
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
||||||
@@ -2003,6 +2003,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|||||||
llvm::Function *func;
|
llvm::Function *func;
|
||||||
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
||||||
const bool isGather;
|
const bool isGather;
|
||||||
|
const bool isPrefetch;
|
||||||
};
|
};
|
||||||
|
|
||||||
GSInfo gsFuncs[] = {
|
GSInfo gsFuncs[] = {
|
||||||
@@ -2011,148 +2012,176 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|||||||
"__pseudo_gather_factored_base_offsets32_i8",
|
"__pseudo_gather_factored_base_offsets32_i8",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i8",
|
"__pseudo_gather_factored_base_offsets32_i8",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather32_i16",
|
GSInfo("__pseudo_gather32_i16",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i16",
|
"__pseudo_gather_factored_base_offsets32_i16",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i16",
|
"__pseudo_gather_factored_base_offsets32_i16",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather32_i32",
|
GSInfo("__pseudo_gather32_i32",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i32",
|
"__pseudo_gather_factored_base_offsets32_i32",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i32",
|
"__pseudo_gather_factored_base_offsets32_i32",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather32_float",
|
GSInfo("__pseudo_gather32_float",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||||
"__pseudo_gather_factored_base_offsets32_float",
|
"__pseudo_gather_factored_base_offsets32_float",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||||
"__pseudo_gather_factored_base_offsets32_float",
|
"__pseudo_gather_factored_base_offsets32_float",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather32_i64",
|
GSInfo("__pseudo_gather32_i64",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i64",
|
"__pseudo_gather_factored_base_offsets32_i64",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i64",
|
"__pseudo_gather_factored_base_offsets32_i64",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather32_double",
|
GSInfo("__pseudo_gather32_double",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||||
"__pseudo_gather_factored_base_offsets32_double",
|
"__pseudo_gather_factored_base_offsets32_double",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||||
"__pseudo_gather_factored_base_offsets32_double",
|
"__pseudo_gather_factored_base_offsets32_double",
|
||||||
true),
|
true, false),
|
||||||
|
|
||||||
GSInfo("__pseudo_scatter32_i8",
|
GSInfo("__pseudo_scatter32_i8",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter32_i16",
|
GSInfo("__pseudo_scatter32_i16",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter32_i32",
|
GSInfo("__pseudo_scatter32_i32",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter32_float",
|
GSInfo("__pseudo_scatter32_float",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_float",
|
"__pseudo_scatter_factored_base_offsets32_float",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_float",
|
"__pseudo_scatter_factored_base_offsets32_float",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter32_i64",
|
GSInfo("__pseudo_scatter32_i64",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter32_double",
|
GSInfo("__pseudo_scatter32_double",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_double",
|
"__pseudo_scatter_factored_base_offsets32_double",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_double",
|
"__pseudo_scatter_factored_base_offsets32_double",
|
||||||
false),
|
false, false),
|
||||||
|
|
||||||
GSInfo("__pseudo_gather64_i8",
|
GSInfo("__pseudo_gather64_i8",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
|
||||||
"__pseudo_gather_factored_base_offsets64_i8",
|
"__pseudo_gather_factored_base_offsets64_i8",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i8",
|
"__pseudo_gather_factored_base_offsets32_i8",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather64_i16",
|
GSInfo("__pseudo_gather64_i16",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
|
||||||
"__pseudo_gather_factored_base_offsets64_i16",
|
"__pseudo_gather_factored_base_offsets64_i16",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i16",
|
"__pseudo_gather_factored_base_offsets32_i16",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather64_i32",
|
GSInfo("__pseudo_gather64_i32",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
|
||||||
"__pseudo_gather_factored_base_offsets64_i32",
|
"__pseudo_gather_factored_base_offsets64_i32",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i32",
|
"__pseudo_gather_factored_base_offsets32_i32",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather64_float",
|
GSInfo("__pseudo_gather64_float",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
|
||||||
"__pseudo_gather_factored_base_offsets64_float",
|
"__pseudo_gather_factored_base_offsets64_float",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||||
"__pseudo_gather_factored_base_offsets32_float",
|
"__pseudo_gather_factored_base_offsets32_float",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather64_i64",
|
GSInfo("__pseudo_gather64_i64",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
|
||||||
"__pseudo_gather_factored_base_offsets64_i64",
|
"__pseudo_gather_factored_base_offsets64_i64",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i64",
|
"__pseudo_gather_factored_base_offsets32_i64",
|
||||||
true),
|
true, false),
|
||||||
GSInfo("__pseudo_gather64_double",
|
GSInfo("__pseudo_gather64_double",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
|
||||||
"__pseudo_gather_factored_base_offsets64_double",
|
"__pseudo_gather_factored_base_offsets64_double",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||||
"__pseudo_gather_factored_base_offsets32_double",
|
"__pseudo_gather_factored_base_offsets32_double",
|
||||||
true),
|
true, false),
|
||||||
|
|
||||||
GSInfo("__pseudo_scatter64_i8",
|
GSInfo("__pseudo_scatter64_i8",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
|
||||||
"__pseudo_scatter_factored_base_offsets64_i8",
|
"__pseudo_scatter_factored_base_offsets64_i8",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter64_i16",
|
GSInfo("__pseudo_scatter64_i16",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
|
||||||
"__pseudo_scatter_factored_base_offsets64_i16",
|
"__pseudo_scatter_factored_base_offsets64_i16",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter64_i32",
|
GSInfo("__pseudo_scatter64_i32",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
|
||||||
"__pseudo_scatter_factored_base_offsets64_i32",
|
"__pseudo_scatter_factored_base_offsets64_i32",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter64_float",
|
GSInfo("__pseudo_scatter64_float",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
|
||||||
"__pseudo_scatter_factored_base_offsets64_float",
|
"__pseudo_scatter_factored_base_offsets64_float",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_float",
|
"__pseudo_scatter_factored_base_offsets32_float",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter64_i64",
|
GSInfo("__pseudo_scatter64_i64",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
|
||||||
"__pseudo_scatter_factored_base_offsets64_i64",
|
"__pseudo_scatter_factored_base_offsets64_i64",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||||
false),
|
false, false),
|
||||||
GSInfo("__pseudo_scatter64_double",
|
GSInfo("__pseudo_scatter64_double",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
|
||||||
"__pseudo_scatter_factored_base_offsets64_double",
|
"__pseudo_scatter_factored_base_offsets64_double",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_double",
|
"__pseudo_scatter_factored_base_offsets32_double",
|
||||||
false),
|
false, false),
|
||||||
|
|
||||||
|
GSInfo("__pseudo_prefetch_read_varying_1",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||||
|
"__prefetch_read_varying_1",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||||
|
"__prefetch_read_varying_1",
|
||||||
|
false, true),
|
||||||
|
|
||||||
|
GSInfo("__pseudo_prefetch_read_varying_2",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||||
|
"__prefetch_read_varying_2",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||||
|
"__prefetch_read_varying_2",
|
||||||
|
false, true),
|
||||||
|
|
||||||
|
GSInfo("__pseudo_prefetch_read_varying_3",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||||
|
"__prefetch_read_varying_3",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||||
|
"__prefetch_read_varying_3",
|
||||||
|
false, true),
|
||||||
|
|
||||||
|
GSInfo("__pseudo_prefetch_read_varying_nt",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||||
|
"__prefetch_read_varying_nt",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||||
|
"__prefetch_read_varying_nt",
|
||||||
|
false, true),
|
||||||
};
|
};
|
||||||
|
|
||||||
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
||||||
@@ -2178,7 +2207,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|||||||
llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
|
llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
|
||||||
callInst);
|
callInst);
|
||||||
|
|
||||||
if (basePtr == NULL || offsetVector == NULL)
|
if (basePtr == NULL || offsetVector == NULL ||
|
||||||
|
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
|
||||||
// It's actually a fully general gather/scatter with a varying
|
// It's actually a fully general gather/scatter with a varying
|
||||||
// set of base pointers, so leave it as is and continune onward
|
// set of base pointers, so leave it as is and continune onward
|
||||||
// to the next instruction...
|
// to the next instruction...
|
||||||
@@ -2193,7 +2223,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|||||||
llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
|
llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
|
||||||
|
|
||||||
if ((info->isGather == true && g->target->hasGather()) ||
|
if ((info->isGather == true && g->target->hasGather()) ||
|
||||||
(info->isGather == false && g->target->hasScatter())) {
|
(info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
|
||||||
|
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
|
||||||
|
|
||||||
// See if the offsets are scaled by 2, 4, or 8. If so,
|
// See if the offsets are scaled by 2, 4, or 8. If so,
|
||||||
// extract that scale factor and rewrite the offsets to remove
|
// extract that scale factor and rewrite the offsets to remove
|
||||||
// it.
|
// it.
|
||||||
@@ -2207,7 +2239,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|||||||
gatherScatterFunc = info->baseOffsets32Func;
|
gatherScatterFunc = info->baseOffsets32Func;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (info->isGather) {
|
if (info->isGather || info->isPrefetch) {
|
||||||
llvm::Value *mask = callInst->getArgOperand(1);
|
llvm::Value *mask = callInst->getArgOperand(1);
|
||||||
|
|
||||||
// Generate a new function call to the next pseudo gather
|
// Generate a new function call to the next pseudo gather
|
||||||
@@ -2264,7 +2296,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|||||||
gatherScatterFunc = info->baseOffsets32Func;
|
gatherScatterFunc = info->baseOffsets32Func;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (info->isGather) {
|
if (info->isGather || info->isPrefetch) {
|
||||||
llvm::Value *mask = callInst->getArgOperand(1);
|
llvm::Value *mask = callInst->getArgOperand(1);
|
||||||
|
|
||||||
// Generate a new function call to the next pseudo gather
|
// Generate a new function call to the next pseudo gather
|
||||||
@@ -2306,13 +2338,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|||||||
static bool
|
static bool
|
||||||
lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
|
lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
|
||||||
struct GSBOInfo {
|
struct GSBOInfo {
|
||||||
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig)
|
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
|
||||||
: isGather(ig) {
|
: isGather(ig), isPrefetch(ip) {
|
||||||
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
||||||
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
||||||
}
|
}
|
||||||
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
||||||
const bool isGather;
|
const bool isGather;
|
||||||
|
const bool isPrefetch;
|
||||||
};
|
};
|
||||||
|
|
||||||
GSBOInfo gsFuncs[] = {
|
GSBOInfo gsFuncs[] = {
|
||||||
@@ -2320,63 +2353,87 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
|
|||||||
"__pseudo_gather_factored_base_offsets32_i8",
|
"__pseudo_gather_factored_base_offsets32_i8",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i8",
|
"__pseudo_gather_factored_base_offsets32_i8",
|
||||||
true),
|
true, false),
|
||||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i16",
|
"__pseudo_gather_factored_base_offsets32_i16",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i16",
|
"__pseudo_gather_factored_base_offsets32_i16",
|
||||||
true),
|
true, false),
|
||||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i32",
|
"__pseudo_gather_factored_base_offsets32_i32",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i32",
|
"__pseudo_gather_factored_base_offsets32_i32",
|
||||||
true),
|
true, false),
|
||||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||||
"__pseudo_gather_factored_base_offsets32_float",
|
"__pseudo_gather_factored_base_offsets32_float",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||||
"__pseudo_gather_factored_base_offsets32_float",
|
"__pseudo_gather_factored_base_offsets32_float",
|
||||||
true),
|
true, false),
|
||||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i64",
|
"__pseudo_gather_factored_base_offsets32_i64",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||||
"__pseudo_gather_factored_base_offsets32_i64",
|
"__pseudo_gather_factored_base_offsets32_i64",
|
||||||
true),
|
true, false),
|
||||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||||
"__pseudo_gather_factored_base_offsets32_double",
|
"__pseudo_gather_factored_base_offsets32_double",
|
||||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||||
"__pseudo_gather_factored_base_offsets32_double",
|
"__pseudo_gather_factored_base_offsets32_double",
|
||||||
true),
|
true, false),
|
||||||
|
|
||||||
GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||||
false),
|
false, false),
|
||||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||||
false),
|
false, false),
|
||||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||||
false),
|
false, false),
|
||||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_float",
|
"__pseudo_scatter_factored_base_offsets32_float",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_float",
|
"__pseudo_scatter_factored_base_offsets32_float",
|
||||||
false),
|
false, false),
|
||||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||||
false),
|
false, false),
|
||||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_double",
|
"__pseudo_scatter_factored_base_offsets32_double",
|
||||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||||
"__pseudo_scatter_factored_base_offsets32_double",
|
"__pseudo_scatter_factored_base_offsets32_double",
|
||||||
false),
|
false, false),
|
||||||
|
|
||||||
|
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||||
|
"__prefetch_read_varying_1",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||||
|
"__prefetch_read_varying_1",
|
||||||
|
false, true),
|
||||||
|
|
||||||
|
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||||
|
"__prefetch_read_varying_2",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||||
|
"__prefetch_read_varying_2",
|
||||||
|
false, true),
|
||||||
|
|
||||||
|
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||||
|
"__prefetch_read_varying_3",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||||
|
"__prefetch_read_varying_3",
|
||||||
|
false, true),
|
||||||
|
|
||||||
|
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||||
|
"__prefetch_read_varying_nt",
|
||||||
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||||
|
"__prefetch_read_varying_nt",
|
||||||
|
false, true),
|
||||||
};
|
};
|
||||||
|
|
||||||
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
||||||
@@ -4167,149 +4224,170 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
|
|||||||
static bool
|
static bool
|
||||||
lReplacePseudoGS(llvm::CallInst *callInst) {
|
lReplacePseudoGS(llvm::CallInst *callInst) {
|
||||||
struct LowerGSInfo {
|
struct LowerGSInfo {
|
||||||
LowerGSInfo(const char *pName, const char *aName, bool ig)
|
LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)
|
||||||
: isGather(ig) {
|
: isGather(ig), isPrefetch(ip) {
|
||||||
pseudoFunc = m->module->getFunction(pName);
|
pseudoFunc = m->module->getFunction(pName);
|
||||||
actualFunc = m->module->getFunction(aName);
|
actualFunc = m->module->getFunction(aName);
|
||||||
}
|
}
|
||||||
llvm::Function *pseudoFunc;
|
llvm::Function *pseudoFunc;
|
||||||
llvm::Function *actualFunc;
|
llvm::Function *actualFunc;
|
||||||
const bool isGather;
|
const bool isGather;
|
||||||
|
const bool isPrefetch;
|
||||||
};
|
};
|
||||||
|
|
||||||
LowerGSInfo lgsInfo[] = {
|
LowerGSInfo lgsInfo[] = {
|
||||||
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true),
|
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
|
||||||
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true),
|
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
|
||||||
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true),
|
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
|
||||||
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true),
|
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
|
||||||
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true),
|
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
|
||||||
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true),
|
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true),
|
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
|
||||||
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true),
|
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
|
||||||
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true),
|
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
|
||||||
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true),
|
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
|
||||||
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true),
|
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
|
||||||
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true),
|
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
|
||||||
"__gather_factored_base_offsets32_i8", true),
|
"__gather_factored_base_offsets32_i8", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
|
||||||
"__gather_factored_base_offsets32_i16", true),
|
"__gather_factored_base_offsets32_i16", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
|
||||||
"__gather_factored_base_offsets32_i32", true),
|
"__gather_factored_base_offsets32_i32", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
|
||||||
"__gather_factored_base_offsets32_float", true),
|
"__gather_factored_base_offsets32_float", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
|
||||||
"__gather_factored_base_offsets32_i64", true),
|
"__gather_factored_base_offsets32_i64", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
|
||||||
"__gather_factored_base_offsets32_double", true),
|
"__gather_factored_base_offsets32_double", true, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
|
||||||
"__gather_factored_base_offsets64_i8", true),
|
"__gather_factored_base_offsets64_i8", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
|
||||||
"__gather_factored_base_offsets64_i16", true),
|
"__gather_factored_base_offsets64_i16", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
|
||||||
"__gather_factored_base_offsets64_i32", true),
|
"__gather_factored_base_offsets64_i32", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
|
||||||
"__gather_factored_base_offsets64_float", true),
|
"__gather_factored_base_offsets64_float", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
|
||||||
"__gather_factored_base_offsets64_i64", true),
|
"__gather_factored_base_offsets64_i64", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
|
||||||
"__gather_factored_base_offsets64_double", true),
|
"__gather_factored_base_offsets64_double", true, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets32_i8",
|
LowerGSInfo("__pseudo_gather_base_offsets32_i8",
|
||||||
"__gather_base_offsets32_i8", true),
|
"__gather_base_offsets32_i8", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets32_i16",
|
LowerGSInfo("__pseudo_gather_base_offsets32_i16",
|
||||||
"__gather_base_offsets32_i16", true),
|
"__gather_base_offsets32_i16", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets32_i32",
|
LowerGSInfo("__pseudo_gather_base_offsets32_i32",
|
||||||
"__gather_base_offsets32_i32", true),
|
"__gather_base_offsets32_i32", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets32_float",
|
LowerGSInfo("__pseudo_gather_base_offsets32_float",
|
||||||
"__gather_base_offsets32_float", true),
|
"__gather_base_offsets32_float", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets32_i64",
|
LowerGSInfo("__pseudo_gather_base_offsets32_i64",
|
||||||
"__gather_base_offsets32_i64", true),
|
"__gather_base_offsets32_i64", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets32_double",
|
LowerGSInfo("__pseudo_gather_base_offsets32_double",
|
||||||
"__gather_base_offsets32_double", true),
|
"__gather_base_offsets32_double", true, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets64_i8",
|
LowerGSInfo("__pseudo_gather_base_offsets64_i8",
|
||||||
"__gather_base_offsets64_i8", true),
|
"__gather_base_offsets64_i8", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets64_i16",
|
LowerGSInfo("__pseudo_gather_base_offsets64_i16",
|
||||||
"__gather_base_offsets64_i16", true),
|
"__gather_base_offsets64_i16", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets64_i32",
|
LowerGSInfo("__pseudo_gather_base_offsets64_i32",
|
||||||
"__gather_base_offsets64_i32", true),
|
"__gather_base_offsets64_i32", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets64_float",
|
LowerGSInfo("__pseudo_gather_base_offsets64_float",
|
||||||
"__gather_base_offsets64_float", true),
|
"__gather_base_offsets64_float", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets64_i64",
|
LowerGSInfo("__pseudo_gather_base_offsets64_i64",
|
||||||
"__gather_base_offsets64_i64", true),
|
"__gather_base_offsets64_i64", true, false),
|
||||||
LowerGSInfo("__pseudo_gather_base_offsets64_double",
|
LowerGSInfo("__pseudo_gather_base_offsets64_double",
|
||||||
"__gather_base_offsets64_double", true),
|
"__gather_base_offsets64_double", true, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false),
|
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false),
|
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false),
|
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false),
|
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false),
|
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false),
|
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false),
|
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false),
|
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false),
|
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false),
|
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false),
|
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false),
|
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
|
||||||
"__scatter_factored_base_offsets32_i8", false),
|
"__scatter_factored_base_offsets32_i8", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
|
||||||
"__scatter_factored_base_offsets32_i16", false),
|
"__scatter_factored_base_offsets32_i16", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
|
||||||
"__scatter_factored_base_offsets32_i32", false),
|
"__scatter_factored_base_offsets32_i32", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
|
||||||
"__scatter_factored_base_offsets32_float", false),
|
"__scatter_factored_base_offsets32_float", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
|
||||||
"__scatter_factored_base_offsets32_i64", false),
|
"__scatter_factored_base_offsets32_i64", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
|
||||||
"__scatter_factored_base_offsets32_double", false),
|
"__scatter_factored_base_offsets32_double", false, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
|
||||||
"__scatter_factored_base_offsets64_i8", false),
|
"__scatter_factored_base_offsets64_i8", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
|
||||||
"__scatter_factored_base_offsets64_i16", false),
|
"__scatter_factored_base_offsets64_i16", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
|
||||||
"__scatter_factored_base_offsets64_i32", false),
|
"__scatter_factored_base_offsets64_i32", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
|
||||||
"__scatter_factored_base_offsets64_float", false),
|
"__scatter_factored_base_offsets64_float", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
|
||||||
"__scatter_factored_base_offsets64_i64", false),
|
"__scatter_factored_base_offsets64_i64", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
|
||||||
"__scatter_factored_base_offsets64_double", false),
|
"__scatter_factored_base_offsets64_double", false, false),
|
||||||
|
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
|
||||||
"__scatter_base_offsets32_i8", false),
|
"__scatter_base_offsets32_i8", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
|
||||||
"__scatter_base_offsets32_i16", false),
|
"__scatter_base_offsets32_i16", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
|
||||||
"__scatter_base_offsets32_i32", false),
|
"__scatter_base_offsets32_i32", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets32_float",
|
LowerGSInfo("__pseudo_scatter_base_offsets32_float",
|
||||||
"__scatter_base_offsets32_float", false),
|
"__scatter_base_offsets32_float", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
|
||||||
"__scatter_base_offsets32_i64", false),
|
"__scatter_base_offsets32_i64", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets32_double",
|
LowerGSInfo("__pseudo_scatter_base_offsets32_double",
|
||||||
"__scatter_base_offsets32_double", false),
|
"__scatter_base_offsets32_double", false, false),
|
||||||
|
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
|
||||||
"__scatter_base_offsets64_i8", false),
|
"__scatter_base_offsets64_i8", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
|
||||||
"__scatter_base_offsets64_i16", false),
|
"__scatter_base_offsets64_i16", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
|
||||||
"__scatter_base_offsets64_i32", false),
|
"__scatter_base_offsets64_i32", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets64_float",
|
LowerGSInfo("__pseudo_scatter_base_offsets64_float",
|
||||||
"__scatter_base_offsets64_float", false),
|
"__scatter_base_offsets64_float", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
|
||||||
"__scatter_base_offsets64_i64", false),
|
"__scatter_base_offsets64_i64", false, false),
|
||||||
LowerGSInfo("__pseudo_scatter_base_offsets64_double",
|
LowerGSInfo("__pseudo_scatter_base_offsets64_double",
|
||||||
"__scatter_base_offsets64_double", false),
|
"__scatter_base_offsets64_double", false, false),
|
||||||
|
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_1",
|
||||||
|
"__prefetch_read_varying_1", false, true),
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
|
||||||
|
"__prefetch_read_varying_1_native", false, true),
|
||||||
|
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_2",
|
||||||
|
"__prefetch_read_varying_2", false, true),
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
|
||||||
|
"__prefetch_read_varying_2_native", false, true),
|
||||||
|
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_3",
|
||||||
|
"__prefetch_read_varying_3", false, true),
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
|
||||||
|
"__prefetch_read_varying_3_native", false, true),
|
||||||
|
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_nt",
|
||||||
|
"__prefetch_read_varying_nt", false, true),
|
||||||
|
LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
|
||||||
|
"__prefetch_read_varying_nt_native", false, true),
|
||||||
};
|
};
|
||||||
|
|
||||||
llvm::Function *calledFunc = callInst->getCalledFunction();
|
llvm::Function *calledFunc = callInst->getCalledFunction();
|
||||||
@@ -4336,7 +4414,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
|
|||||||
if (gotPosition && g->target->getVectorWidth() > 1) {
|
if (gotPosition && g->target->getVectorWidth() > 1) {
|
||||||
if (info->isGather)
|
if (info->isGather)
|
||||||
PerformanceWarning(pos, "Gather required to load value.");
|
PerformanceWarning(pos, "Gather required to load value.");
|
||||||
else
|
else if (!info->isPrefetch)
|
||||||
PerformanceWarning(pos, "Scatter required to store value.");
|
PerformanceWarning(pos, "Scatter required to store value.");
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@@ -4617,6 +4695,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
|
|||||||
"__scatter64_i8", "__scatter64_i16",
|
"__scatter64_i8", "__scatter64_i16",
|
||||||
"__scatter64_i32", "__scatter64_i64",
|
"__scatter64_i32", "__scatter64_i64",
|
||||||
"__scatter64_float", "__scatter64_double",
|
"__scatter64_float", "__scatter64_double",
|
||||||
|
"__prefetch_read_varying_1", "__prefetch_read_varying_2",
|
||||||
|
"__prefetch_read_varying_3", "__prefetch_read_varying_nt",
|
||||||
"__keep_funcs_live",
|
"__keep_funcs_live",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
32
stdlib.ispc
32
stdlib.ispc
@@ -820,43 +820,19 @@ static inline void prefetch_nt(const void * uniform ptr) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline void prefetch_l1(const void * varying ptr) {
|
static inline void prefetch_l1(const void * varying ptr) {
|
||||||
const void * uniform ptrArray[programCount];
|
__pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
|
||||||
ptrArray[programIndex] = ptr;
|
|
||||||
|
|
||||||
foreach_active (i) {
|
|
||||||
const void * uniform p = ptrArray[i];
|
|
||||||
prefetch_l1(p);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void prefetch_l2(const void * varying ptr) {
|
static inline void prefetch_l2(const void * varying ptr) {
|
||||||
const void * uniform ptrArray[programCount];
|
__pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
|
||||||
ptrArray[programIndex] = ptr;
|
|
||||||
|
|
||||||
foreach_active (i) {
|
|
||||||
const void * uniform p = ptrArray[i];
|
|
||||||
prefetch_l2(p);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void prefetch_l3(const void * varying ptr) {
|
static inline void prefetch_l3(const void * varying ptr) {
|
||||||
const void * uniform ptrArray[programCount];
|
__pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
|
||||||
ptrArray[programIndex] = ptr;
|
|
||||||
|
|
||||||
foreach_active (i) {
|
|
||||||
const void * uniform p = ptrArray[i];
|
|
||||||
prefetch_l3(p);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void prefetch_nt(const void * varying ptr) {
|
static inline void prefetch_nt(const void * varying ptr) {
|
||||||
const void * uniform ptrArray[programCount];
|
__pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
|
||||||
ptrArray[programIndex] = ptr;
|
|
||||||
|
|
||||||
foreach_active (i) {
|
|
||||||
const void * uniform p = ptrArray[i];
|
|
||||||
prefetch_nt(p);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|||||||
22
tests/prefetch-varying.ispc
Normal file
22
tests/prefetch-varying.ispc
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
int64 zero = 0;
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
uniform int64 a[programCount];
|
||||||
|
for (uniform int i = 0; i < programCount; ++i)
|
||||||
|
a[i] = aFOO[i];
|
||||||
|
|
||||||
|
int64 *ptr = &(a[programIndex+zero]);
|
||||||
|
prefetch_l1(ptr);
|
||||||
|
prefetch_l2(ptr);
|
||||||
|
prefetch_l3(ptr);
|
||||||
|
prefetch_nt(ptr);
|
||||||
|
int g = *ptr;
|
||||||
|
RET[programIndex] = g;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = 1 + programIndex;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user