Merge branch 'master' into nvptx_clean_master

This commit is contained in:
evghenii
2014-10-14 14:27:00 +02:00
25 changed files with 1947 additions and 1176 deletions

View File

@@ -110,8 +110,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
if version_LLVM == "trunk":
SVN_PATH="trunk"
if version_LLVM == "3.5":
# SVN_PATH=tags/RELEASE_35/rc1
SVN_PATH="branches/release_35"
SVN_PATH="tags/RELEASE_350/final"
version_LLVM = "3_5"
if version_LLVM == "3.4":
SVN_PATH="tags/RELEASE_34/dot2-final"

View File

@@ -555,6 +555,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_2",
"__prefetch_read_uniform_3",
"__prefetch_read_uniform_nt",
"__pseudo_prefetch_read_varying_1",
"__pseudo_prefetch_read_varying_2",
"__pseudo_prefetch_read_varying_3",
"__pseudo_prefetch_read_varying_nt",
"__psubs_vi8",
"__psubs_vi16",
"__psubus_vi8",
@@ -780,7 +784,11 @@ void
AddBitcodeToModule(const unsigned char *bitcode, int length,
llvm::Module *module, SymbolTable *symbolTable) {
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
#else // LLVM 3.6+
llvm::MemoryBufferRef bcBuf = llvm::MemoryBuffer::getMemBuffer(sb)->getMemBufferRef();
#endif
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
llvm::ErrorOr<llvm::Module *> ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx);
@@ -910,12 +918,23 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
// have the DW_AT_artifical attribute. It's not clear if this
// matters for anything though.
llvm::DIGlobalVariable var =
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
m->diBuilder->createGlobalVariable(file,
name,
name,
file,
0 /* line */,
diType,
true /* static */,
sym->storagePtr);
#else
m->diBuilder->createGlobalVariable(name,
file,
0 /* line */,
diType,
true /* static */,
sym->storagePtr);
#endif
Assert(var.Verify());
}
}
@@ -970,12 +989,23 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
llvm::DIGlobalVariable var =
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
m->diBuilder->createGlobalVariable(file,
sym->name.c_str(),
sym->name.c_str(),
file,
0 /* line */,
diType,
false /* static */,
sym->storagePtr);
#else
m->diBuilder->createGlobalVariable(sym->name.c_str(),
file,
0 /* line */,
diType,
false /* static */,
sym->storagePtr);
#endif
Assert(var.Verify());
}
}

View File

@@ -370,6 +370,14 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

View File

@@ -1584,6 +1584,50 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline {
call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1)
ret void
}
define void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
define void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
define void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
define void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2535,6 +2579,31 @@ declare void
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_1(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_1_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_2(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_2_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_3(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_3_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
declare void @__use8(<WIDTH x i8>)
@@ -3034,6 +3103,41 @@ ifelse(HAVE_SCATTER, `1',
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetchs
call void @__pseudo_prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
ret void
}

View File

@@ -4945,9 +4945,19 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None;
#endif
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
std::string error;
#else // LLVM 3.6+
std::error_code error;
#endif
llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags);
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
if (error.size()) {
#else // LLVM 3.6+
if (error) {
#endif
fprintf(stderr, "Error opening output file \"%s\".\n", fn);
return false;
}

29
ctx.cpp
View File

@@ -745,6 +745,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
// that have executed a 'break' statement:
// breakLanes = breakLanes | mask
AssertPos(currentPos, breakLanesPtr != NULL);
llvm::Value *mask = GetInternalMask();
llvm::Value *breakMask = LoadInst(breakLanesPtr,
"break_mask");
@@ -927,6 +928,16 @@ FunctionEmitContext::RestoreContinuedLanes() {
}
void
FunctionEmitContext::ClearBreakLanes() {
if (breakLanesPtr == NULL)
return;
// breakLanes = 0
StoreInst(LLVMMaskAllOff, breakLanesPtr);
}
void
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
llvm::Value *oldMask = GetInternalMask();
@@ -1636,14 +1647,16 @@ FunctionEmitContext::StartScope() {
llvm::DILexicalBlock lexicalBlock =
m->diBuilder->createLexicalBlock(parentScope, diFile,
currentPos.first_line,
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
#if defined(LLVM_3_5)
// Revision 202736 in LLVM adds support of DWARF discriminator
// to the last argument and revision 202737 in clang adds 0
// for the last argument by default.
currentPos.first_column, 0);
#else
// Revision 216239 in LLVM removes support of DWARF discriminator
// as the last argument
currentPos.first_column);
#endif
#endif // LLVM 3.2, 3.3, 3.4 and 3.6+
AssertPos(currentPos, lexicalBlock.Verify());
debugScopes.push_back(lexicalBlock);
}
@@ -1683,8 +1696,14 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
diType,
true /* preserve through opts */);
AssertPos(currentPos, var.Verify());
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
llvm::DIExpression E = m->diBuilder->createExpression();
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
#else
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
#endif
AddDebugPos(declareInst, &sym->pos, &scope);
}
@@ -1710,8 +1729,14 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) {
flags,
argNum+1);
AssertPos(currentPos, var.Verify());
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
llvm::DIExpression E = m->diBuilder->createExpression();
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
#else
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
#endif
AddDebugPos(declareInst, &sym->pos, &scope);
}

7
ctx.h
View File

@@ -196,6 +196,13 @@ public:
previous iteration. */
void RestoreContinuedLanes();
/** This method is called by code emitting IR for a loop. It clears
any lanes that contained a break since the mask has been updated to take
them into account. This is necessary as all the bail out checks for
breaks are meant to only deal with lanes breaking on the current iteration.
*/
void ClearBreakLanes();
/** Indicates that code generation for a "switch" statement is about to
start. isUniform indicates whether the "switch" value is uniform,
and bbAfterSwitch gives the basic block immediately following the

View File

@@ -160,8 +160,8 @@
<ItemGroup>
<CustomBuild Include='$(ISPC_file).ispc'>
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) -g $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) -g $(flags)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>

View File

@@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec16_i32 offsets, __vec16_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec32_i32 offsets, __vec32_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec64_i32 offsets, __vec64_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1,5 +1,5 @@
/**
Copyright (c) 2010-2013, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <limits.h> // INT_MIN
#include <stdint.h>
#include <math.h>
#include <assert.h>
@@ -525,11 +526,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
*p = v;
}
template <class RetVecType> RetVecType __smear_i1(int i);
template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
template <class RetVecType> static RetVecType __smear_i1(int i);
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
template <class RetVecType> RetVecType __setzero_i1();
template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
template <class RetVecType> static RetVecType __setzero_i1();
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
template <class RetVecType> __vec16_i1 __undef_i1();
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); }
@@ -677,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_
static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; }
static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; }
template <class RetVecType> RetVecType __smear_i32(int32_t i);
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
template <class RetVecType> RetVecType static __smear_i32(int32_t i);
template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0);
@@ -686,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
template <class RetVecType> RetVecType __setzero_i32();
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
template <class RetVecType> static RetVecType __setzero_i32();
template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
template <class RetVecType> RetVecType __undef_i32();
template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
template <class RetVecType> static RetVecType __undef_i32();
template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); }
@@ -742,11 +743,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
}
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
{
return _mm512_load_epi32(p);
}
template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
{
_mm512_store_epi32(p, v);
}
@@ -1017,21 +1018,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
}
#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
{
__m512i v2 = _mm512_load_epi32(p);
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
return __vec16_i64(v2,v1);
}
template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
{
__m512i v1 = v.v2;
__m512i v2 = v.v1;
_mm512_store_epi64(p, v2);
_mm512_store_epi64(((uint8_t*)p)+64, v1);
}
template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
#endif
@@ -1067,14 +1068,14 @@ static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b)
static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; }
static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; }
template <class RetVecType> RetVecType __smear_float(float f);
template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
template <class RetVecType> static RetVecType __smear_float(float f);
template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
template <class RetVecType> RetVecType __setzero_float();
template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
template <class RetVecType> static RetVecType __setzero_float();
template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
template <class RetVecType> RetVecType __undef_float();
template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
template <class RetVecType> static RetVecType __undef_float();
template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index)
{
@@ -1131,12 +1132,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
}
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
{
return _mm512_load_ps(p);
}
/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */
template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
{
_mm512_store_ps(p, v);
}
@@ -1309,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b)
static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; }
static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; }
template <class RetVecType> RetVecType __smear_double(double d);
template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
template <class RetVecType> static RetVecType __smear_double(double d);
template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
template <class RetVecType> RetVecType __setzero_double();
template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
template <class RetVecType> static RetVecType __setzero_double();
template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
template <class RetVecType> RetVecType __undef_double();
template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
template <class RetVecType> static RetVecType __undef_double();
template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
#define CASTD2F(_v_, _v_hi_, _v_lo_) \
__vec16_f _v_hi_, _v_lo_; \
@@ -1390,17 +1391,17 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
#if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
{
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
}
template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
{
_mm512_store_pd(p, v.v1);
_mm512_store_pd(((uint8_t*)p)+64, v.v2);
}
template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
#endif
///////////////////////////////////////////////////////////////////////////
@@ -2162,6 +2163,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t
static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
__vec16_i32 tmp;
while (still_to_do) {
@@ -2172,8 +2174,8 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base,
_MM_UPCONV_EPI32_SINT8, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match,still_to_do);
@@ -2197,6 +2199,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32
static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
// There is no gather instruction with 64-bit offsets in KNC.
// We have to manually iterate over the upper 32 bits ;-)
__vec16_i1 still_to_do = mask;
@@ -2209,8 +2212,8 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
ret = _mm512_mask_i32extgather_epi32(ret, match, signed_offsets, base,
_MM_UPCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match, still_to_do);
@@ -2230,6 +2233,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32
static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
// There is no gather instruction with 64-bit offsets in KNC.
// We have to manually iterate over the upper 32 bits ;-)
__vec16_i1 still_to_do = mask;
@@ -2242,8 +2246,8 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base,
_MM_UPCONV_PS_NONE, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match, still_to_do);
@@ -2339,6 +2343,7 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale,
static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
while (still_to_do) {
@@ -2349,8 +2354,8 @@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
_mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
_mm512_mask_i32extscatter_epi32(base, match, signed_offsets,
value,
_MM_DOWNCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
@@ -2370,6 +2375,7 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal
static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
while (still_to_do) {
@@ -2380,8 +2386,9 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
_mm512_mask_i32extscatter_ps(base, match, offsets.v_lo,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
_mm512_mask_i32extscatter_ps(base, match, signed_offsets,
value,
_MM_DOWNCONV_PS_NONE, scale,
_MM_HINT_NONE);
@@ -2543,6 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec16_i32 offsets, __vec16_i1 mask) { \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
__vec16_i1 copy_mask = _mm512_kmov(mask); \
_mm512_kswapb(mask, copy_mask); \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
///////////////////////////////////////////////////////////////////////////
// atomics
///////////////////////////////////////////////////////////////////////////

View File

@@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec16_i32 offsets, __vec16_i1 mask) { \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
__vec16_i1 copy_mask = _mm512_kmov(mask); \
_mm512_kswapb(mask, copy_mask); \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
///////////////////////////////////////////////////////////////////////////
// atomics

File diff suppressed because it is too large Load Diff

View File

@@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) {
_mm_prefetch((char *)ptr, _MM_HINT_NTA);
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec4_i32 offsets, __vec4_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -257,6 +257,32 @@
./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
.\tests\foreach-double-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.5 cl -O2 *
.\tests\foreach-double-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.5 cl -O2 *
.\tests\foreach-double-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.5 cl -O2 *
@@ -267,7 +293,6 @@
./tests/ptr-22.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/ptr-22.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O2 *
./tests/ptr-22.ispc runfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
@@ -454,3 +479,35 @@
./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 *
./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 *
./tests/psubus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/psubus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/psubus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/psubus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *

View File

@@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_hasTranscendentals(false),
m_hasTrigonometry(false),
m_hasRsqrtd(false),
m_hasRcpd(false)
m_hasRcpd(false),
m_hasVecPrefetch(false)
{
if (isa == NULL) {
if (cpu != NULL) {
@@ -386,6 +387,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasTrigonometry = false;
this->m_hasGather = this->m_hasScatter = true;
this->m_hasRsqrtd = this->m_hasRcpd = true;
// It's set to true, because MIC has hardware vector prefetch instruction
this->m_hasVecPrefetch = true;
}
else if (!strcasecmp(isa, "generic-32") ||
!strcasecmp(isa, "generic-x32")) {

5
ispc.h
View File

@@ -283,6 +283,8 @@ public:
bool hasRcpd() const {return m_hasRcpd;}
bool hasVecPrefetch() const {return m_hasVecPrefetch;}
private:
/** llvm Target object representing this target. */
@@ -385,6 +387,9 @@ private:
/** Indicates whether there is an ISA double precision rcp. */
bool m_hasRcpd;
/** Indicates whether the target has hardware instruction for vector prefetch. */
bool m_hasVecPrefetch;
};

View File

@@ -403,6 +403,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
@@ -424,6 +425,8 @@
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'AND'$(LLVM_VERSION)'!='LLVM_3_5'">LLVMProfileData.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

View File

@@ -604,12 +604,23 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
if (diBuilder) {
llvm::DIFile file = pos.GetDIFile();
llvm::DIGlobalVariable var =
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
diBuilder->createGlobalVariable(file,
name,
name,
file,
pos.first_line,
sym->type->GetDIType(file),
(sym->storageClass == SC_STATIC),
sym->storagePtr);
#else
diBuilder->createGlobalVariable(name,
file,
pos.first_line,
sym->type->GetDIType(file),
(sym->storageClass == SC_STATIC),
sym->storagePtr);
#endif
Assert(var.Verify());
}
}
@@ -1304,18 +1315,33 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
#endif
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
std::string error;
#else // LLVM 3.6+
std::error_code error;
#endif
llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
if (error.size()) {
#else // LLVM 3.6+
if (error) {
#endif
fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
return false;
}
llvm::PassManager pm;
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
#elif defined(LLVM_3_5)
pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else // LLVM 3.6+
llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
dlp->doInitialization(*module);
pm.add(dlp);
#endif
llvm::formatted_raw_ostream fos(of->os());

328
opt.cpp
View File

@@ -479,10 +479,14 @@ Optimize(llvm::Module *module, int optLevel) {
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
optPM.add(targetLibraryInfo);
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
#elif defined(LLVM_3_5)
optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else // LLVM 3.6+
llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
dlp->doInitialization(*module);
optPM.add(dlp);
#endif
llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
@@ -2117,8 +2121,8 @@ static bool
lGSToGSBaseOffsets(llvm::CallInst *callInst) {
struct GSInfo {
GSInfo(const char *pgFuncName, const char *pgboFuncName,
const char *pgbo32FuncName, bool ig)
: isGather(ig) {
const char *pgbo32FuncName, bool ig, bool ip)
: isGather(ig), isPrefetch(ip) {
func = m->module->getFunction(pgFuncName);
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
@@ -2126,6 +2130,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
llvm::Function *func;
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
const bool isGather;
const bool isPrefetch;
};
GSInfo gsFuncs[] = {
@@ -2134,148 +2139,176 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
"__pseudo_gather_factored_base_offsets32_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
"__pseudo_gather_factored_base_offsets32_i8",
true),
true, false),
GSInfo("__pseudo_gather32_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
true),
true, false),
GSInfo("__pseudo_gather32_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
true),
true, false),
GSInfo("__pseudo_gather32_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
true),
true, false),
GSInfo("__pseudo_gather32_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
true),
true, false),
GSInfo("__pseudo_gather32_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
true),
true, false),
GSInfo("__pseudo_scatter32_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
false),
false, false),
GSInfo("__pseudo_scatter32_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
false),
false, false),
GSInfo("__pseudo_scatter32_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
false),
false, false),
GSInfo("__pseudo_scatter32_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
false),
false, false),
GSInfo("__pseudo_scatter32_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
false),
false, false),
GSInfo("__pseudo_scatter32_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
false),
false, false),
GSInfo("__pseudo_gather64_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
"__pseudo_gather_factored_base_offsets64_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
"__pseudo_gather_factored_base_offsets32_i8",
true),
true, false),
GSInfo("__pseudo_gather64_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
"__pseudo_gather_factored_base_offsets64_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
true),
true, false),
GSInfo("__pseudo_gather64_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
"__pseudo_gather_factored_base_offsets64_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
true),
true, false),
GSInfo("__pseudo_gather64_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
"__pseudo_gather_factored_base_offsets64_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
true),
true, false),
GSInfo("__pseudo_gather64_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
"__pseudo_gather_factored_base_offsets64_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
true),
true, false),
GSInfo("__pseudo_gather64_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
"__pseudo_gather_factored_base_offsets64_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
true),
true, false),
GSInfo("__pseudo_scatter64_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
"__pseudo_scatter_factored_base_offsets64_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
false),
false, false),
GSInfo("__pseudo_scatter64_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
"__pseudo_scatter_factored_base_offsets64_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
false),
false, false),
GSInfo("__pseudo_scatter64_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
"__pseudo_scatter_factored_base_offsets64_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
false),
false, false),
GSInfo("__pseudo_scatter64_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
"__pseudo_scatter_factored_base_offsets64_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
false),
false, false),
GSInfo("__pseudo_scatter64_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
"__pseudo_scatter_factored_base_offsets64_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
false),
false, false),
GSInfo("__pseudo_scatter64_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
"__pseudo_scatter_factored_base_offsets64_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
false),
false, false),
GSInfo("__pseudo_prefetch_read_varying_1",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
false, true),
GSInfo("__pseudo_prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
false, true),
GSInfo("__pseudo_prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
false, true),
GSInfo("__pseudo_prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
false, true),
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -2301,7 +2334,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
callInst);
if (basePtr == NULL || offsetVector == NULL)
if (basePtr == NULL || offsetVector == NULL ||
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
// It's actually a fully general gather/scatter with a varying
// set of base pointers, so leave it as is and continune onward
// to the next instruction...
@@ -2316,7 +2350,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
if ((info->isGather == true && g->target->hasGather()) ||
(info->isGather == false && g->target->hasScatter())) {
(info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
// See if the offsets are scaled by 2, 4, or 8. If so,
// extract that scale factor and rewrite the offsets to remove
// it.
@@ -2330,7 +2366,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
gatherScatterFunc = info->baseOffsets32Func;
}
if (info->isGather) {
if (info->isGather || info->isPrefetch) {
llvm::Value *mask = callInst->getArgOperand(1);
// Generate a new function call to the next pseudo gather
@@ -2387,7 +2423,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
gatherScatterFunc = info->baseOffsets32Func;
}
if (info->isGather) {
if (info->isGather || info->isPrefetch) {
llvm::Value *mask = callInst->getArgOperand(1);
// Generate a new function call to the next pseudo gather
@@ -2429,13 +2465,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
static bool
lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
struct GSBOInfo {
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig)
: isGather(ig) {
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
: isGather(ig), isPrefetch(ip) {
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
}
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
const bool isGather;
const bool isPrefetch;
};
GSBOInfo gsFuncs[] = {
@@ -2443,63 +2480,87 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
"__pseudo_gather_factored_base_offsets32_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
"__pseudo_gather_factored_base_offsets32_i8",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
true),
true, false),
GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
false),
false, false),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
false, true),
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -4290,149 +4351,170 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
static bool
lReplacePseudoGS(llvm::CallInst *callInst) {
struct LowerGSInfo {
LowerGSInfo(const char *pName, const char *aName, bool ig)
: isGather(ig) {
LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)
: isGather(ig), isPrefetch(ip) {
pseudoFunc = m->module->getFunction(pName);
actualFunc = m->module->getFunction(aName);
}
llvm::Function *pseudoFunc;
llvm::Function *actualFunc;
const bool isGather;
const bool isPrefetch;
};
LowerGSInfo lgsInfo[] = {
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true),
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true),
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true),
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true),
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true),
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true),
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true),
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true),
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true),
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true),
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true),
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true),
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
"__gather_factored_base_offsets32_i8", true),
"__gather_factored_base_offsets32_i8", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
"__gather_factored_base_offsets32_i16", true),
"__gather_factored_base_offsets32_i16", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
"__gather_factored_base_offsets32_i32", true),
"__gather_factored_base_offsets32_i32", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
"__gather_factored_base_offsets32_float", true),
"__gather_factored_base_offsets32_float", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
"__gather_factored_base_offsets32_i64", true),
"__gather_factored_base_offsets32_i64", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
"__gather_factored_base_offsets32_double", true),
"__gather_factored_base_offsets32_double", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
"__gather_factored_base_offsets64_i8", true),
"__gather_factored_base_offsets64_i8", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
"__gather_factored_base_offsets64_i16", true),
"__gather_factored_base_offsets64_i16", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
"__gather_factored_base_offsets64_i32", true),
"__gather_factored_base_offsets64_i32", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
"__gather_factored_base_offsets64_float", true),
"__gather_factored_base_offsets64_float", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
"__gather_factored_base_offsets64_i64", true),
"__gather_factored_base_offsets64_i64", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
"__gather_factored_base_offsets64_double", true),
"__gather_factored_base_offsets64_double", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i8",
"__gather_base_offsets32_i8", true),
"__gather_base_offsets32_i8", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i16",
"__gather_base_offsets32_i16", true),
"__gather_base_offsets32_i16", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i32",
"__gather_base_offsets32_i32", true),
"__gather_base_offsets32_i32", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_float",
"__gather_base_offsets32_float", true),
"__gather_base_offsets32_float", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i64",
"__gather_base_offsets32_i64", true),
"__gather_base_offsets32_i64", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_double",
"__gather_base_offsets32_double", true),
"__gather_base_offsets32_double", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i8",
"__gather_base_offsets64_i8", true),
"__gather_base_offsets64_i8", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i16",
"__gather_base_offsets64_i16", true),
"__gather_base_offsets64_i16", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i32",
"__gather_base_offsets64_i32", true),
"__gather_base_offsets64_i32", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_float",
"__gather_base_offsets64_float", true),
"__gather_base_offsets64_float", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i64",
"__gather_base_offsets64_i64", true),
"__gather_base_offsets64_i64", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_double",
"__gather_base_offsets64_double", true),
"__gather_base_offsets64_double", true, false),
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false),
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false),
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false),
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false),
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false),
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false),
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false),
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false),
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false),
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false),
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false),
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false),
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
"__scatter_factored_base_offsets32_i8", false),
"__scatter_factored_base_offsets32_i8", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
"__scatter_factored_base_offsets32_i16", false),
"__scatter_factored_base_offsets32_i16", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
"__scatter_factored_base_offsets32_i32", false),
"__scatter_factored_base_offsets32_i32", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
"__scatter_factored_base_offsets32_float", false),
"__scatter_factored_base_offsets32_float", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
"__scatter_factored_base_offsets32_i64", false),
"__scatter_factored_base_offsets32_i64", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
"__scatter_factored_base_offsets32_double", false),
"__scatter_factored_base_offsets32_double", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
"__scatter_factored_base_offsets64_i8", false),
"__scatter_factored_base_offsets64_i8", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
"__scatter_factored_base_offsets64_i16", false),
"__scatter_factored_base_offsets64_i16", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
"__scatter_factored_base_offsets64_i32", false),
"__scatter_factored_base_offsets64_i32", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
"__scatter_factored_base_offsets64_float", false),
"__scatter_factored_base_offsets64_float", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
"__scatter_factored_base_offsets64_i64", false),
"__scatter_factored_base_offsets64_i64", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
"__scatter_factored_base_offsets64_double", false),
"__scatter_factored_base_offsets64_double", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
"__scatter_base_offsets32_i8", false),
"__scatter_base_offsets32_i8", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
"__scatter_base_offsets32_i16", false),
"__scatter_base_offsets32_i16", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
"__scatter_base_offsets32_i32", false),
"__scatter_base_offsets32_i32", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_float",
"__scatter_base_offsets32_float", false),
"__scatter_base_offsets32_float", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
"__scatter_base_offsets32_i64", false),
"__scatter_base_offsets32_i64", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_double",
"__scatter_base_offsets32_double", false),
"__scatter_base_offsets32_double", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
"__scatter_base_offsets64_i8", false),
"__scatter_base_offsets64_i8", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
"__scatter_base_offsets64_i16", false),
"__scatter_base_offsets64_i16", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
"__scatter_base_offsets64_i32", false),
"__scatter_base_offsets64_i32", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_float",
"__scatter_base_offsets64_float", false),
"__scatter_base_offsets64_float", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
"__scatter_base_offsets64_i64", false),
"__scatter_base_offsets64_i64", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_double",
"__scatter_base_offsets64_double", false),
"__scatter_base_offsets64_double", false, false),
LowerGSInfo("__pseudo_prefetch_read_varying_1",
"__prefetch_read_varying_1", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
"__prefetch_read_varying_1_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_2",
"__prefetch_read_varying_2", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
"__prefetch_read_varying_2_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_3",
"__prefetch_read_varying_3", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
"__prefetch_read_varying_3_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_nt",
"__prefetch_read_varying_nt", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
"__prefetch_read_varying_nt_native", false, true),
};
llvm::Function *calledFunc = callInst->getCalledFunction();
@@ -4459,7 +4541,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
if (gotPosition && g->target->getVectorWidth() > 1) {
if (info->isGather)
PerformanceWarning(pos, "Gather required to load value.");
else
else if (!info->isPrefetch)
PerformanceWarning(pos, "Scatter required to store value.");
}
return true;
@@ -4740,6 +4822,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
"__scatter64_i8", "__scatter64_i16",
"__scatter64_i32", "__scatter64_i64",
"__scatter64_float", "__scatter64_double",
"__prefetch_read_varying_1", "__prefetch_read_varying_2",
"__prefetch_read_varying_3", "__prefetch_read_varying_nt",
"__keep_funcs_live",
};

View File

@@ -647,8 +647,8 @@ def run_tests(options1, args, print_version):
options.include_file = "examples/intrinsics/generic-64.h"
options.target = "generic-64"
elif options.target == "knc":
error("No knc #include specified; using examples/intrinsics/knc-i1x16.h\n", 2)
options.include_file = "examples/intrinsics/knc-i1x16.h"
error("No knc #include specified; using examples/intrinsics/knc.h\n", 2)
options.include_file = "examples/intrinsics/knc.h"
if options.compiler_exe == None:
if (options.target == "knc"):

View File

@@ -847,43 +847,19 @@ static inline void prefetch_nt(const void * uniform ptr) {
}
// Prefetch the cache line(s) addressed by a varying (per-lane) pointer into L1.
// NOTE(review): this span is diff residue — it fuses BOTH versions of the body
// without +/- markers: the OLD implementation (scatter the lane pointers into a
// uniform array and issue a scalar prefetch per active lane) and the NEW
// one-line replacement. In the post-commit file only the __pseudo_* call remains.
static inline void prefetch_l1(const void * varying ptr) {
// --- old body (removed by this commit) ---
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l1(p);
}
// --- new body: hand the whole varying pointer vector plus the execution mask
// to the compiler pseudo, lowered later (see the LowerGSInfo table in this
// commit) to __prefetch_read_varying_1 or its _native vector-prefetch form.
__pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
}
// Prefetch the cache line(s) addressed by a varying (per-lane) pointer into L2.
// NOTE(review): diff residue — old per-lane scalarization loop and the new
// __pseudo_prefetch_read_varying_2 call appear together; only the pseudo call
// survives in the post-commit file.
static inline void prefetch_l2(const void * varying ptr) {
// --- old body (removed by this commit) ---
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l2(p);
}
// --- new body: masked varying prefetch pseudo, lowered in opt passes ---
__pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
}
// Prefetch the cache line(s) addressed by a varying (per-lane) pointer into L3.
// NOTE(review): diff residue — old per-lane scalarization loop and the new
// __pseudo_prefetch_read_varying_3 call appear together; only the pseudo call
// survives in the post-commit file.
static inline void prefetch_l3(const void * varying ptr) {
// --- old body (removed by this commit) ---
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l3(p);
}
// --- new body: masked varying prefetch pseudo, lowered in opt passes ---
__pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
}
// Non-temporal prefetch for a varying (per-lane) pointer: hint the data will
// be read once, bypassing normal cache-retention policy.
// NOTE(review): diff residue — old per-lane scalarization loop and the new
// __pseudo_prefetch_read_varying_nt call appear together; only the pseudo call
// survives in the post-commit file.
static inline void prefetch_nt(const void * varying ptr) {
// --- old body (removed by this commit) ---
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_nt(p);
}
// --- new body: masked varying prefetch pseudo, lowered in opt passes ---
__pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
}
///////////////////////////////////////////////////////////////////////////

View File

@@ -712,7 +712,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
}
}
/** Emit code for an if test that checks the mask and the test values and
tries to be smart about jumping over code that doesn't need to be run.
*/
@@ -1101,8 +1100,10 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
// the code for the test. This is only necessary for varying loops;
// 'uniform' loops just jump when they hit a continue statement and
// don't mess with the mask.
if (!uniformTest)
if (!uniformTest) {
ctx->RestoreContinuedLanes();
ctx->ClearBreakLanes();
}
llvm::Value *testValue = testExpr->GetValue(ctx);
if (!testValue)
return;
@@ -1310,6 +1311,8 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
// test code.
ctx->SetCurrentBasicBlock(bstep);
ctx->RestoreContinuedLanes();
ctx->ClearBreakLanes();
if (step)
step->EmitCode(ctx);
ctx->BranchInst(btest);

View File

@@ -0,0 +1,22 @@
// Reports the gang (SIMD program) width so the test harness can size buffers.
export uniform int width() { return programCount; }
// Runtime zero added to the lane index below — presumably to keep the per-lane
// offset opaque to constant folding so a genuinely varying pointer is formed;
// TODO(review): confirm intent.
int64 zero = 0;
// Test kernel for this commit's varying-pointer prefetches: issues all four
// flavors (l1/l2/l3/nt) on a per-lane pointer, then loads through the same
// pointer so RET[i] round-trips aFOO[i] via int64 -> int -> float.
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
// Copy the float inputs into an int64 staging array, one slot per lane.
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
// programIndex + zero (a runtime value) yields a varying pointer, exercising
// the __pseudo_prefetch_read_varying_* path rather than a uniform prefetch.
int64 *ptr = &(a[programIndex+zero]);
prefetch_l1(ptr);
prefetch_l2(ptr);
prefetch_l3(ptr);
prefetch_nt(ptr);
// Prefetches are hints only; this load verifies the addresses were valid and
// that prefetching did not disturb the data.
int g = *ptr;
RET[programIndex] = g;
}
// Golden output checked by the harness: lane i must produce 1 + i — matching
// f_f assuming the harness feeds aFOO[i] == 1 + i (the usual ispc-test input;
// TODO(review): confirm against the runner).
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}