Merge branch 'master' into nvptx_clean_master

evghenii
2014-10-14 14:27:00 +02:00
25 changed files with 1947 additions and 1176 deletions

View File

@@ -110,8 +110,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
if version_LLVM == "trunk":
SVN_PATH="trunk"
if version_LLVM == "3.5":
# SVN_PATH=tags/RELEASE_35/rc1
SVN_PATH="branches/release_35"
SVN_PATH="tags/RELEASE_350/final"
version_LLVM = "3_5"
if version_LLVM == "3.4":
SVN_PATH="tags/RELEASE_34/dot2-final"

View File

@@ -555,6 +555,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_2",
"__prefetch_read_uniform_3",
"__prefetch_read_uniform_nt",
"__pseudo_prefetch_read_varying_1",
"__pseudo_prefetch_read_varying_2",
"__pseudo_prefetch_read_varying_3",
"__pseudo_prefetch_read_varying_nt",
"__psubs_vi8",
"__psubs_vi16",
"__psubus_vi8",
@@ -780,7 +784,11 @@ void
AddBitcodeToModule(const unsigned char *bitcode, int length,
llvm::Module *module, SymbolTable *symbolTable) {
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
#else // LLVM 3.6+
llvm::MemoryBufferRef bcBuf = llvm::MemoryBuffer::getMemBuffer(sb)->getMemBufferRef();
#endif
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
llvm::ErrorOr<llvm::Module *> ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx);
@@ -910,12 +918,23 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
// have the DW_AT_artificial attribute. It's not clear if this
// matters for anything though.
llvm::DIGlobalVariable var =
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
m->diBuilder->createGlobalVariable(file,
name,
name,
file,
0 /* line */,
diType,
true /* static */,
sym->storagePtr);
#else
m->diBuilder->createGlobalVariable(name,
file,
0 /* line */,
diType,
true /* static */,
sym->storagePtr);
#endif
Assert(var.Verify());
}
}
@@ -970,12 +989,23 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
llvm::DIGlobalVariable var =
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
m->diBuilder->createGlobalVariable(file,
sym->name.c_str(),
sym->name.c_str(),
file,
0 /* line */,
diType,
false /* static */,
sym->storagePtr);
#else
m->diBuilder->createGlobalVariable(sym->name.c_str(),
file,
0 /* line */,
diType,
false /* static */,
sym->storagePtr);
#endif
Assert(var.Verify());
}
}
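
The two #if-guarded createGlobalVariable() calls above differ only in which DIBuilder signature they use. A minimal sketch of how they could be folded into one helper (the name lCreateDebugGlobal is hypothetical and not part of this patch; it simply mirrors the calls shown in the diff):

static llvm::DIGlobalVariable
lCreateDebugGlobal(llvm::DIFile file, const char *name, llvm::DIType diType,
                   bool isStatic, llvm::Value *storagePtr) {
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) // LLVM 3.6+
    // 3.6+ takes a scope argument plus separate name / linkage-name strings
    return m->diBuilder->createGlobalVariable(file, name, name, file,
                                              0 /* line */, diType,
                                              isStatic, storagePtr);
#else
    return m->diBuilder->createGlobalVariable(name, file, 0 /* line */,
                                              diType, isStatic, storagePtr);
#endif
}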

View File

@@ -370,6 +370,14 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

View File

@@ -1584,6 +1584,50 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline {
call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1)
ret void
}
define void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
define void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
define void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
define void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1)
')
ret void
}
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
')
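
For reference, a rough scalar C++ model of what the masked varying prefetch defined above expands to, assuming a 16-wide target (illustration only; the real definition is the per_lane/llvm.prefetch expansion shown in this hunk):

#include <cstdint>

static inline void prefetch_read_varying_1_model(const int64_t addr[16],
                                                 uint16_t mask) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1u << lane))
            // rw = 0 (read), locality = 3 (keep in all cache levels)
            __builtin_prefetch((const void *)(uintptr_t)addr[lane], 0, 3);
}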
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2535,6 +2579,31 @@ declare void
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_1(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_1_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_2(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_2_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_3(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_3_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
declare void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
declare void @__use8(<WIDTH x i8>)
@@ -3034,6 +3103,41 @@ ifelse(HAVE_SCATTER, `1',
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetches
call void @__pseudo_prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
ret void
}

View File

@@ -4945,9 +4945,19 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None;
#endif
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
std::string error;
#else // LLVM 3.6+
std::error_code error;
#endif
llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags);
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
if (error.size()) {
#else // LLVM 3.6+
if (error) {
#endif
fprintf(stderr, "Error opening output file \"%s\".\n", fn);
return false;
}

ctx.cpp (29 changed lines)
View File

@@ -745,6 +745,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
// that have executed a 'break' statement:
// breakLanes = breakLanes | mask
AssertPos(currentPos, breakLanesPtr != NULL);
llvm::Value *mask = GetInternalMask();
llvm::Value *breakMask = LoadInst(breakLanesPtr,
"break_mask");
@@ -927,6 +928,16 @@ FunctionEmitContext::RestoreContinuedLanes() {
}
void
FunctionEmitContext::ClearBreakLanes() {
if (breakLanesPtr == NULL)
return;
// breakLanes = 0
StoreInst(LLVMMaskAllOff, breakLanesPtr);
}
void
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
llvm::Value *oldMask = GetInternalMask();
@@ -1636,14 +1647,16 @@ FunctionEmitContext::StartScope() {
llvm::DILexicalBlock lexicalBlock =
m->diBuilder->createLexicalBlock(parentScope, diFile,
currentPos.first_line,
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
#if defined(LLVM_3_5)
// Revision 202736 in LLVM adds support for a DWARF discriminator
// as the last argument, and revision 202737 in clang passes 0
// for that argument by default.
currentPos.first_column, 0);
#else
// Revision 216239 in LLVM removes support for the DWARF discriminator
// as the last argument
currentPos.first_column);
#endif
#endif // LLVM 3.2, 3.3, 3.4 and 3.6+
AssertPos(currentPos, lexicalBlock.Verify());
debugScopes.push_back(lexicalBlock);
}
@@ -1683,8 +1696,14 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
diType,
true /* preserve through opts */);
AssertPos(currentPos, var.Verify());
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
llvm::DIExpression E = m->diBuilder->createExpression();
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
#else
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
#endif
AddDebugPos(declareInst, &sym->pos, &scope);
}
@@ -1710,8 +1729,14 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) {
flags,
argNum+1);
AssertPos(currentPos, var.Verify());
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
llvm::DIExpression E = m->diBuilder->createExpression();
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
#else
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
#endif
AddDebugPos(declareInst, &sym->pos, &scope);
}

ctx.h (7 changed lines)
View File

@@ -196,6 +196,13 @@ public:
previous iteration. */
void RestoreContinuedLanes();
/** This method is called by code emitting IR for a loop. It clears
any lanes that contained a break, since the mask has already been
updated to account for them. This is necessary because the bail-out
checks for breaks are only meant to handle lanes that break on the
current iteration.
*/
void ClearBreakLanes();
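/* Usage sketch (illustrative only, not part of this header): a loop emitter
   would typically clear stale break lanes right after restoring the mask at
   the top of an iteration, e.g.

       ctx->RestoreContinuedLanes();   // fold 'continue' lanes back into the mask
       ctx->ClearBreakLanes();         // forget breaks taken on earlier iterations
       // ...emit the next iteration's body under the restored mask...
*/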
/** Indicates that code generation for a "switch" statement is about to
start. isUniform indicates whether the "switch" value is uniform,
and bbAfterSwitch gives the basic block immediately following the

View File

@@ -160,8 +160,8 @@
<ItemGroup>
<CustomBuild Include='$(ISPC_file).ispc'>
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) -g $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) -g $(flags)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>

View File

@@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec16_i32 offsets, __vec16_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec32_i32 offsets, __vec32_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec64_i32 offsets, __vec64_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1,5 +1,5 @@
/**
Copyright (c) 2010-2013, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <limits.h> // INT_MIN
#include <stdint.h>
#include <math.h>
#include <assert.h>
@@ -525,11 +526,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
*p = v;
}
template <class RetVecType> RetVecType __smear_i1(int i);
template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
template <class RetVecType> static RetVecType __smear_i1(int i);
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
template <class RetVecType> RetVecType __setzero_i1();
template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
template <class RetVecType> static RetVecType __setzero_i1();
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
template <class RetVecType> __vec16_i1 __undef_i1();
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); }
@@ -677,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_
static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; }
static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; }
template <class RetVecType> RetVecType __smear_i32(int32_t i);
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
template <class RetVecType> RetVecType static __smear_i32(int32_t i);
template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0);
@@ -686,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
template <class RetVecType> RetVecType __setzero_i32();
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
template <class RetVecType> static RetVecType __setzero_i32();
template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
template <class RetVecType> RetVecType __undef_i32();
template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
template <class RetVecType> static RetVecType __undef_i32();
template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); }
@@ -742,11 +743,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
}
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
{
return _mm512_load_epi32(p);
}
template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
{
_mm512_store_epi32(p, v);
}
@@ -1017,21 +1018,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
}
#if 0 /* knc::fails: as with _i32 this may generate failures ... so commenting it out */
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
{
__m512i v2 = _mm512_load_epi32(p);
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
return __vec16_i64(v2,v1);
}
template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
{
__m512i v1 = v.v2;
__m512i v2 = v.v1;
_mm512_store_epi64(p, v2);
_mm512_store_epi64(((uint8_t*)p)+64, v1);
}
template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
#endif
@@ -1067,14 +1068,14 @@ static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b)
static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; }
static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; }
template <class RetVecType> RetVecType __smear_float(float f);
template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
template <class RetVecType> static RetVecType __smear_float(float f);
template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
template <class RetVecType> RetVecType __setzero_float();
template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
template <class RetVecType> static RetVecType __setzero_float();
template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
template <class RetVecType> RetVecType __undef_float();
template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
template <class RetVecType> static RetVecType __undef_float();
template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index)
{
@@ -1131,12 +1132,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
}
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
{
return _mm512_load_ps(p);
}
/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */
template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
{
_mm512_store_ps(p, v);
}
@@ -1309,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b)
static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; }
static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; }
template <class RetVecType> RetVecType __smear_double(double d);
template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
template <class RetVecType> static RetVecType __smear_double(double d);
template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
template <class RetVecType> RetVecType __setzero_double();
template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
template <class RetVecType> static RetVecType __setzero_double();
template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
template <class RetVecType> RetVecType __undef_double();
template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
template <class RetVecType> static RetVecType __undef_double();
template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
#define CASTD2F(_v_, _v_hi_, _v_lo_) \
__vec16_f _v_hi_, _v_lo_; \
@@ -1390,17 +1391,17 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
#if 0 /* knc::fails: as with _f this may generate failures ... so commenting it out */
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
{
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
}
template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
{
_mm512_store_pd(p, v.v1);
_mm512_store_pd(((uint8_t*)p)+64, v.v2);
}
template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
#endif
///////////////////////////////////////////////////////////////////////////
@@ -2162,6 +2163,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t
static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
__vec16_i32 tmp;
while (still_to_do) {
@@ -2172,8 +2174,8 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base,
_MM_UPCONV_EPI32_SINT8, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match,still_to_do);
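The signed_offsets / INT_MIN rebasing that this hunk introduces (and that the following gather/scatter hunks repeat) keeps every lane's effective address unchanged while making the 32-bit offsets safe to sign-extend. A small scalar check, assuming scale == 1 (sketch only, not part of the patch):

#include <cassert>
#include <climits>
#include <cstdint>

int main() {
    uint64_t base   = 0x100000000ull;  // arbitrary base address value
    uint32_t offset = 0x90000000u;     // would be misread if sign-extended as i32
    // what the vector code feeds to the signed 32-bit gather:
    int32_t  biased  = (int32_t)(offset + (uint32_t)INT_MIN);
    // the base pointer is compensated by scale * -(long)INT_MIN (scale == 1 here):
    uint64_t rebased = base + (uint64_t)(-(long)INT_MIN);
    // the effective address each lane sees is unchanged:
    assert(rebased + (int64_t)biased == base + offset);
    return 0;
}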
@@ -2197,6 +2199,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32
static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
// There is no gather instruction with 64-bit offsets in KNC.
// We have to manually iterate over the upper 32 bits ;-)
__vec16_i1 still_to_do = mask;
@@ -2209,8 +2212,8 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
ret = _mm512_mask_i32extgather_epi32(ret, match, signed_offsets, base,
_MM_UPCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match, still_to_do);
@@ -2230,6 +2233,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32
static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
// There is no gather instruction with 64-bit offsets in KNC.
// We have to manually iterate over the upper 32 bits ;-)
__vec16_i1 still_to_do = mask;
@@ -2242,8 +2246,8 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base,
_MM_UPCONV_PS_NONE, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match, still_to_do);
@@ -2339,6 +2343,7 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale,
static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
while (still_to_do) {
@@ -2349,8 +2354,8 @@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
_mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
_mm512_mask_i32extscatter_epi32(base, match, signed_offsets,
value,
_MM_DOWNCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
@@ -2370,6 +2375,7 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal
static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask)
{
const __vec16_i64 offsets = _offsets.cvt2hilo();
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
while (still_to_do) {
@@ -2380,8 +2386,9 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
_mm512_mask_i32extscatter_ps(base, match, offsets.v_lo,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
_mm512_mask_i32extscatter_ps(base, match, signed_offsets,
value,
_MM_DOWNCONV_PS_NONE, scale,
_MM_HINT_NONE);
@@ -2543,6 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec16_i32 offsets, __vec16_i1 mask) { \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
__vec16_i1 copy_mask = _mm512_kmov(mask); \
_mm512_kswapb(mask, copy_mask); \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
///////////////////////////////////////////////////////////////////////////
// atomics
///////////////////////////////////////////////////////////////////////////

View File

@@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec16_i32 offsets, __vec16_i1 mask) { \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
__vec16_i1 copy_mask = _mm512_kmov(mask); \
_mm512_kswapb(mask, copy_mask); \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2012, Intel Corporation
/**
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <limits.h> // INT_MIN
#include <stdint.h>
#include <math.h>
#include <assert.h>
@@ -43,6 +44,15 @@
#include <iostream> // for operator<<(m512[i])
#include <iomanip> // for operator<<(m512[i])
#if 0
#define STRING(x) #x
#define TOSTRING(x) STRING(x)
#define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl
#define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl
#define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl
#define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl
#define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl
#endif
#define FORCEINLINE __forceinline
#ifdef _MSC_VER
@@ -75,7 +85,44 @@ typedef int64_t __vec1_i64;
struct __vec16_i32;
#if 0
/* (iw) actually, this *SHOULD* be the right implementation for a
vec16_i1: this one is a class that can have a constructor (which
ISPC sometimes emits for these vectors...). This version might
not work with embree's ISPC bindings, probably because
embree still uses the 'wrong' implementation. */
typedef struct PRE_ALIGN(2) __vec16_i1
{
FORCEINLINE operator __mmask16() const { return v; }
FORCEINLINE __vec16_i1() { }
FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { }
FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3,
bool v4, bool v5, bool v6, bool v7,
bool v8, bool v9, bool v10, bool v11,
bool v12, bool v13, bool v14, bool v15) {
v = ((v0 & 1) |
((v1 & 1) << 1) |
((v2 & 1) << 2) |
((v3 & 1) << 3) |
((v4 & 1) << 4) |
((v5 & 1) << 5) |
((v6 & 1) << 6) |
((v7 & 1) << 7) |
((v8 & 1) << 8) |
((v9 & 1) << 9) |
((v10 & 1) << 10) |
((v11 & 1) << 11) |
((v12 & 1) << 12) |
((v13 & 1) << 13) |
((v14 & 1) << 14) |
((v15 & 1) << 15));
}
__mmask16 v;
} POST_ALIGN(2) __vec16_i1;
#else
typedef __mmask16 POST_ALIGN(2) __vec16_i1;
#endif
typedef struct PRE_ALIGN(64) __vec16_f {
FORCEINLINE operator __m512() const { return v; }
@@ -167,14 +214,14 @@ struct vec16 {
PRE_ALIGN(16) struct __vec16_i8 : public vec16<int8_t> {
FORCEINLINE __vec16_i8() { }
FORCEINLINE __vec16_i8(const __vec16_i8 &o);
FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
FORCEINLINE __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
int8_t v4, int8_t v5, int8_t v6, int8_t v7,
int8_t v8, int8_t v9, int8_t v10, int8_t v11,
int8_t v12, int8_t v13, int8_t v14, int8_t v15)
FORCEINLINE __vec16_i8(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3,
const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7,
const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11,
const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
: vec16<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15) { }
FORCEINLINE __vec16_i8(const __vec16_i8 &o);
FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
} POST_ALIGN(16);
PRE_ALIGN(32) struct __vec16_i16 : public vec16<int16_t> {
@@ -215,6 +262,28 @@ inline std::ostream &operator<<(std::ostream &out, const __m512 &v)
return out;
}
inline std::ostream &operator<<(std::ostream &out, const __vec16_i8 &v)
{
out << "[";
for (int i=0;i<16;i++)
out << (i?",":"") << std::dec << std::setw(8) << (int)((unsigned char*)&v)[i] << std::dec;
// out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec;
out << "]" << std::flush;
return out;
}
inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v)
{
out << "[";
uint32_t *ptr = (uint32_t*)&v;
for (int i=0;i<16;i++) {
uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16];
out << (i?",":"") << ((int*)val);
}
out << "]" << std::flush;
return out;
}
///////////////////////////////////////////////////////////////////////////
// macros...
@@ -299,15 +368,29 @@ static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) {
return (mask & (1 << index)) ? true : false;
}
static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
{
//uint *src = (uint *)&v;
const uint *src = (const uint *)&v;
return src[index+16] | (uint64_t(src[index]) << 32);
}
/*
static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
bool val) {
if (val == false)
vec->v &= ~(1 << index);
else
vec->v |= (1 << index);
}
*/
}
*/
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
const uint16_t *ptr = (const uint16_t *)p;
@@ -321,18 +404,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
*ptr = v;
}
template <class RetVecType> RetVecType __smear_i1(int i);
template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
template <class RetVecType> static RetVecType __smear_i1(int i);
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
return i?0xFFFF:0x0;
}
template <class RetVecType> RetVecType __setzero_i1();
template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
template <class RetVecType> static RetVecType __setzero_i1();
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
return 0;
}
template <class RetVecType> RetVecType __undef_i1();
template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
template <class RetVecType> static RetVecType __undef_i1();
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
return __vec16_i1();
}
@@ -342,7 +425,7 @@ template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
/*
TODO
TODO
*/
@@ -353,7 +436,7 @@ TODO
/*
TODO
TODO
*/
@@ -532,8 +615,8 @@ static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t
((int32_t *)v)[index] = val;
}
template <class RetVecType> RetVecType __smear_i32(int32_t i);
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
template <class RetVecType> static RetVecType __smear_i32(int32_t i);
template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
return _mm512_set1_epi32(i);
}
@@ -542,13 +625,13 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
template <class RetVecType> RetVecType __setzero_i32();
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
template <class RetVecType> static RetVecType __setzero_i32();
template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
return _mm512_setzero_epi32();
}
template <class RetVecType> RetVecType __undef_i32();
template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() {
template <class RetVecType> static RetVecType __undef_i32();
template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() {
return __vec16_i32();
}
@@ -557,9 +640,13 @@ static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
return _mm512_set1_epi32(val);
}
static FORCEINLINE __vec16_i32 __cast_trunc(__vec16_i32, const __vec16_i64 i64) {
return __vec16_i32(i64.v_lo);
}
static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) {
__vec16_i32 idx = __smear_i32<__vec16_i32>(index);
__vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7));
__vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xf));
return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
}
@@ -578,7 +665,7 @@ template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p)
#endif
}
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
return _mm512_load_epi32(p);
}
@@ -591,18 +678,32 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
#endif
}
template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
_mm512_store_epi32(p, v);
}
///////////////////////////////////////////////////////////////////////////
// int64
///////////////////////////////////////////////////////////////////////////
static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
static FORCEINLINE
void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask)
{
uint *src = (uint *)&v;
return src[index+16] | (int64_t(src[index]) << 32);
__m512i v1;
__m512i v2;
v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
v.v_hi);
v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
v.v_lo);
v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
v.v_hi);
v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
v.v_lo);
_mm512_mask_store_epi64(p, mask, v2);
_mm512_mask_store_epi64(((uint8_t*)p)+64, mask>>8, v1);
}
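The two permute pairs above perform an SoA-to-AoS interleave: the lanes' low and high 32-bit halves live in separate vectors (v_lo / v_hi) and must be woven into contiguous 64-bit values before the two masked 64-bit stores. A scalar sketch of the same semantics (illustration only):

static inline void masked_store_i64_model(int64_t *p, const uint32_t lo[16],
                                          const uint32_t hi[16], uint16_t mask) {
    for (int i = 0; i < 16; ++i)
        if (mask & (1u << i))
            p[i] = (int64_t)(((uint64_t)hi[i] << 32) | lo[i]);
}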
static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t val) {
@@ -611,16 +712,16 @@ static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t
}
template <class RetVecType> RetVecType __setzero_i64();
template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() {
template <class RetVecType> static RetVecType __setzero_i64();
template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() {
__vec16_i64 ret;
ret.v_lo = _mm512_setzero_epi32();
ret.v_hi = _mm512_setzero_epi32();
return ret;
}
template <class RetVecType> RetVecType __undef_i64();
template <> static FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() {
template <class RetVecType> static RetVecType __undef_i64();
template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() {
return __vec16_i64();
}
@@ -704,6 +805,13 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) {
return __vec16_i64(lo, hi);
}
static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) {
__vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b),
_mm512_srli_epi32(a.v_lo, 32-b));
__vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b);
return __vec16_i64(lo, hi);
}
static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) {
__vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo);
__vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift));
@@ -724,6 +832,16 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) {
return __vec16_i64(lo, hi);
}
static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) {
__vec16_i32 xfer
= _mm512_slli_epi32(_mm512_and_epi32(a.v_hi,
_mm512_set1_epi32((1<<b)-1)),
32-b);
__vec16_i32 hi = _mm512_srai_epi32(a.v_hi, b);
__vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srli_epi32(a.v_lo, b));
return __vec16_i64(lo, hi);
}
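The constant-count arithmetic shift above assembles a 64-bit shift from two 32-bit halves: the low b bits of the high half cross over into the top of the low half, the high half shifts arithmetically, and the low half logically. A scalar model that can be checked against a native 64-bit shift (sketch; assumes 0 < b < 32, matching the (1<<b)-1 and 32-b terms above):

#include <cassert>
#include <cstdint>

static int64_t ashr64_from_halves(uint32_t lo, int32_t hi, unsigned b) {
    uint32_t xfer   = ((uint32_t)hi & ((1u << b) - 1)) << (32 - b); // bits crossing hi -> lo
    int32_t  new_hi = hi >> b;                                      // arithmetic: keeps the sign
    uint32_t new_lo = xfer | (lo >> b);                             // logical shift of the low half
    return (int64_t)(((uint64_t)(uint32_t)new_hi << 32) | new_lo);
}

int main() {
    int64_t v = -0x123456789abcdef0ll;
    for (unsigned b = 1; b < 32; ++b)
        assert(ashr64_from_halves((uint32_t)v, (int32_t)(v >> 32), b) == (v >> b));
    return 0;
}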
static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
@@ -731,9 +849,9 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i6
static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b,
__vec16_i1 mask) {
__mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
__mmask16 lo_match = _mm512_mask_cmpeq_epi32_mask((__mmask16)mask, a.v_lo,b.v_lo);
__mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
return _mm512_kand(full_match, (__mmask16)mask);
return full_match;
}
static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
@@ -753,7 +871,7 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
return ret;
}
template <class RetVecType> RetVecType __smear_i64(const int64_t &l);
template <class RetVecType> static RetVecType __smear_i64(const int64_t &l);
template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) {
const int *i = (const int*)&l;
return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1]));
@@ -762,10 +880,11 @@ template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l)
template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
__vec16_i32 v1;
__vec16_i32 v2;
v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
const uint8_t*ptr = (const uint8_t*)p;
v2 = _mm512_extloadunpacklo_epi32(v2, ptr, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v2 = _mm512_extloadunpackhi_epi32(v2, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v1 = _mm512_extloadunpacklo_epi32(v1, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v1 = _mm512_extloadunpackhi_epi32(v1, ptr+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
__vec16_i64 ret;
ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -783,7 +902,7 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p)
return ret;
}
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
__m512i v2 = _mm512_load_epi32(p);
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
__vec16_i64 ret;
@@ -802,7 +921,7 @@ template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
return ret;
}
template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) {
template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) {
return __load<64>(p);
}
@@ -827,7 +946,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
_mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
}
template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
__m512i v1;
__m512i v2;
v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
@@ -846,10 +965,72 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
_mm512_store_epi64(((uint8_t*)p)+64, v1);
}
template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
__store<64>(p, v);
}
/*! gather vector of 64-bit ints from addresses pointing to uniform ints
(iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
***UNIFORM*** INT64's/POINTERS. (The problem is that ispc doesn't
expose whether the source is an array of uniform or an array of varying
pointers, so there is no way to tell here - the only thing we can do
is pick one.)
*/
static FORCEINLINE __vec16_i64
__gather_base_offsets32_i64(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_i64 ret;
ret.v_lo = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
base, _MM_UPCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
ret.v_hi = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
base+4, _MM_UPCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
return ret;
}
/*! gather vector of 64-bit ints from addresses pointing to uniform ints
(iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
***UNIFORM*** INT64's/POINTERS. (The problem is that ispc doesn't
expose whether the source is an array of uniform or an array of varying
pointers, so there is no way to tell here - the only thing we can do
is pick one.)
*/
static FORCEINLINE __vec16_i64
__gather64_i64(__vec16_i64 addr, __vec16_i1 mask)
{
__vec16_i64 ret;
// There is no gather instruction with 64-bit offsets in KNC.
// We have to manually iterate over the upper 32 bits ;-)
__vec16_i1 still_to_do = mask;
const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
while (still_to_do) {
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
const uint32_t &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
__smear_i32<__vec16_i32>((int32_t)hi32),
_MM_CMPINT_EQ);
void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
ret.v_lo = _mm512_mask_i32extgather_epi32(ret.v_lo, match, signed_offsets,
base, _MM_UPCONV_EPI32_NONE, 1,
_MM_HINT_NONE);
ret.v_hi = _mm512_mask_i32extgather_epi32(ret.v_hi, match, signed_offsets,
base+4, _MM_UPCONV_EPI32_NONE, 1,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match, still_to_do);
}
return ret;
}
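The loop above (like the other 64-bit-offset gathers and scatters in this file) copes with the missing 64-bit gather by processing one distinct upper-32-bit value per pass: pick the first still-active lane, issue one 32-bit gather for every lane that shares its high word, and clear those lanes from the to-do mask. A scalar model of that control flow (sketch only):

#include <cstdint>

static inline void gather64_model(const int64_t addr[16], uint16_t mask,
                                  int32_t out[16]) {
    uint16_t todo = mask;
    while (todo) {
        int first = __builtin_ctz(todo);              // first still-active lane
        uint32_t hi = (uint64_t)addr[first] >> 32;    // its upper 32 bits
        for (int i = 0; i < 16; ++i)                  // one "gather" per shared hi word
            if ((todo & (1u << i)) && ((uint64_t)addr[i] >> 32) == hi) {
                out[i] = *(const int32_t *)(uintptr_t)addr[i];
                todo &= ~(1u << i);
            }
    }
}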
///////////////////////////////////////////////////////////////////////////
// float
///////////////////////////////////////////////////////////////////////////
@@ -948,18 +1129,18 @@ static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float va
((float *)v)[index] = val;
}
template <class RetVecType> RetVecType __smear_float(float f);
template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) {
template <class RetVecType> static RetVecType __smear_float(float f);
template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) {
return _mm512_set_1to16_ps(f);
}
template <class RetVecType> RetVecType __setzero_float();
template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() {
template <class RetVecType> static RetVecType __setzero_float();
template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() {
return _mm512_setzero_ps();
}
template <class RetVecType> RetVecType __undef_float();
template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() {
template <class RetVecType> static RetVecType __undef_float();
template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() {
return __vec16_f();
}
@@ -983,7 +1164,7 @@ template <int ALIGN> static FORCEINLINE __vec16_f __load(const __vec16_f *p) {
#endif
}
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
return _mm512_load_ps(p);
}
@@ -996,7 +1177,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
#endif
}
template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) {
template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) {
_mm512_store_ps(p, v);
}
@@ -1178,24 +1359,24 @@ static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double v
((double *)v)[index] = val;
}
template <class RetVecType> RetVecType __smear_double(double d);
template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) {
template <class RetVecType> static RetVecType __smear_double(double d);
template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) {
__vec16_d ret;
ret.v1 = _mm512_set1_pd(d);
ret.v2 = _mm512_set1_pd(d);
return ret;
}
template <class RetVecType> RetVecType __setzero_double();
template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() {
template <class RetVecType> static RetVecType __setzero_double();
template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() {
__vec16_d ret;
ret.v1 = _mm512_setzero_pd();
ret.v2 = _mm512_setzero_pd();
return ret;
}
template <class RetVecType> RetVecType __undef_double();
template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() {
template <class RetVecType> static RetVecType __undef_double();
template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() {
return __vec16_d();
}
@@ -1216,14 +1397,14 @@ template <int ALIGN> static FORCEINLINE __vec16_d __load(const __vec16_d *p) {
return ret;
}
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
__vec16_d ret;
ret.v1 = _mm512_load_pd(p);
ret.v2 = _mm512_load_pd(((uint8_t*)p)+64);
return ret;
}
template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) {
template <> FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) {
return __load<64>(p);
}
@@ -1234,12 +1415,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
_mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
}
template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) {
template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) {
_mm512_store_pd(p, v.v1);
_mm512_store_pd(((uint8_t*)p)+64, v.v2);
}
template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) {
template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) {
__store<64>(p, v);
}
@@ -1329,16 +1510,15 @@ static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) {
static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) {
__vec16_d ret;
ret.v2 = _mm512_cvtpslo_pd(val.v);
ret.v1 = _mm512_cvtpslo_pd(val.v);
__vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC);
ret.v1 = _mm512_cvtpslo_pd(other8);
ret.v2 = _mm512_cvtpslo_pd(other8);
return ret;
}
static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) {
__m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
__m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
__m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
__m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA);
}
@@ -1352,11 +1532,41 @@ static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) {
static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) {
return *(__vec16_i64*)&val;
__vec16_i64 ret;
ret.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
_mm512_castpd_si512(val.v2));
ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF,
_mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
_mm512_castpd_si512(val.v1));
ret.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
_mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
_mm512_castpd_si512(val.v2));
ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF,
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
_mm512_castpd_si512(val.v1));
return ret;
}
static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) {
return *(__vec16_d*)&val;
__vec16_d ret;
ret.v2 = _mm512_castsi512_pd(
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
val.v_hi));
ret.v2 = _mm512_castsi512_pd(
_mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v2), 0x5555,
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
val.v_lo));
ret.v1 = _mm512_castsi512_pd(
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
val.v_hi));
ret.v1 = _mm512_castsi512_pd(
_mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v1), 0x5555,
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
val.v_lo));
return ret;
}
///////////////////////////////////////////////////////////////////////////
@@ -1488,12 +1698,14 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
return _mm512_invsqrt_ps(v);
#endif
}
static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
__vec16_d y;
for (int i = 0; i < 16; i++)
__insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
return y;
}
static FORCEINLINE double __rsqrt_uniform_double(double v)
{
return 1.0/v;
@@ -1629,6 +1841,38 @@ static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
#endif
}
static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec16_i1 mask) {
__vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
_mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
}
static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) {
__vec16_i8 ret;
__vec16_i32 tmp = _mm512_mask_extload_epi32(_mm512_undefined_epi32(),mask,p,
_MM_UPCONV_EPI32_SINT8,
_MM_BROADCAST32_NONE, _MM_HINT_NONE);
_mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
return ret;
}
template <int ALIGN> static FORCEINLINE __vec16_i8 __load(const __vec16_i8 *p) {
return *p;
}
template <int ALIGN> static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v) {
*p = v;
}
static FORCEINLINE void
__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
__vec16_i8 val, __vec16_i1 mask)
{
__vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8,
_MM_BROADCAST32_NONE, _MM_HINT_NONE);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp,
_MM_DOWNCONV_EPI32_SINT8, scale,
_MM_HINT_NONE);
}
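
Per lane, the byte scatter above is meant to do no more than the following scalar sketch; the extload/extscatter pair exists because the KNC scatter hardware moves 32-bit elements, so the bytes are widened on the way in and narrowed (_MM_DOWNCONV_EPI32_SINT8) on the way out. A bit-per-lane mask layout is an assumption of the sketch:

#include <cstdint>

// Scalar reference for __scatter_base_offsets32_i8: store one byte per active
// lane at base + scale * offsets[lane] (offsets treated as signed 32-bit).
static void ref_scatter_base_offsets32_i8(uint8_t *base, uint32_t scale,
                                          const int32_t offsets[16],
                                          const int8_t val[16], uint16_t mask) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1u << lane))
            *(int8_t *)(base + (int64_t)offsets[lane] * (int64_t)scale) = val[lane];
}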
static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) {
#ifdef ISPC_FORCE_ALIGNED_MEMORY
_mm512_mask_store_epi32(p, mask, val.v);
@@ -1729,16 +1973,44 @@ static FORCEINLINE __vec16_d
__gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_d ret;
ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
base, _MM_UPCONV_PD_NONE, scale,
_MM_HINT_NONE);
__m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC);
ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
base, _MM_UPCONV_PD_NONE, scale,
_MM_HINT_NONE);
return ret;
}
static FORCEINLINE __vec16_f
__gather64_float(__vec16_i64 addr, __vec16_i1 mask)
{
__vec16_f ret;
// There is no gather instruction with 64-bit offsets in KNC.
// We have to manually iterate over the upper 32 bits ;-)
__vec16_i1 still_to_do = mask;
const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
while (still_to_do) {
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
__smear_i32<__vec16_i32>((int32_t)hi32),
_MM_CMPINT_EQ);
void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets,
base, _MM_UPCONV_PS_NONE, 1,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match, still_to_do);
}
return ret;
}
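
What the loop above reproduces is an ordinary masked gather from full 64-bit addresses; the grouping by identical upper-32-bit words exists only because the hardware gather takes 32-bit offsets. A scalar sketch of the target semantics (bit-per-lane mask assumed):

#include <cstdint>

// Scalar reference for __gather64_float: load a float from each active lane's
// 64-bit address; inactive lanes keep whatever the result vector already held.
static void ref_gather64_float(float out[16], const uint64_t addr[16], uint16_t mask) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1u << lane))
            out[lane] = *(const float *)(uintptr_t)addr[lane];
}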
/*! gather with 64-bit offsets.
\todo add optimization that falls back to 32-bit offset gather if
@@ -1749,6 +2021,8 @@ __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offset
static FORCEINLINE __vec16_f
__gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
__vec16_i1 mask) {
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
// There is no gather instruction with 64-bit offsets in KNC.
// We have to manually iterate over the upper 32 bits ;-)
__vec16_i1 still_to_do = mask;
@@ -1759,10 +2033,10 @@ __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offset
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi,
__smear_i32<__vec16_i32>((int32_t)hi32),
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base,
_MM_UPCONV_PS_NONE, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match, still_to_do);
@@ -1776,6 +2050,8 @@ static FORCEINLINE __vec16_i8
__gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
__vec16_i1 mask)
{
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
__vec16_i32 tmp;
while (still_to_do) {
@@ -1786,8 +2062,8 @@ __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base,
_MM_UPCONV_EPI32_SINT8, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match,still_to_do);
@@ -1802,6 +2078,8 @@ static FORCEINLINE void
__scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
__vec16_f value,
__vec16_i1 mask) {
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
while (still_to_do) {
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
@@ -1811,8 +2089,8 @@ __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offse
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
_mm512_mask_i32extscatter_ps(base, match, offsets.v_lo,
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
_mm512_mask_i32extscatter_ps(base, match, signed_offsets,
value,
_MM_DOWNCONV_PS_NONE, scale,
_MM_HINT_NONE);
@@ -1824,7 +2102,36 @@ static FORCEINLINE void
__scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
__vec16_i32 value,
__vec16_i1 mask) {
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
__vec16_i1 still_to_do = mask;
while (still_to_do) {
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi,
__smear_i32<__vec16_i32>((int32_t)hi32),
_MM_CMPINT_EQ);
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
_mm512_mask_i32extscatter_epi32(base, match, signed_offsets,
value,
_MM_DOWNCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match,still_to_do);
}
}
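
The INT_MIN re-biasing threaded through the 64-bit base+offset paths above is worth spelling out: the hardware i32 gather/scatter interprets the 32-bit offset lanes as signed, so the low offset word is shifted by 2^31 and the base is shifted back by scale*2^31 (that is the scale*(unsigned long)(-(long)INT_MIN) term). A small standalone check of the identity, with hypothetical sample values, in plain C++:

#include <cassert>
#include <cstdint>

// With  signed_off = (int32_t)(off_lo + 0x80000000u)   (wraps modulo 2^32)
// and   base'      = base + scale * 0x80000000ull,
// the hardware address base' + scale*signed_off equals base + scale*off_lo
// for every unsigned 32-bit off_lo, including values >= 2^31.
int main() {
    const uint64_t base  = 0x100000000ull;   // arbitrary sample base
    const uint64_t scale = 4;
    const uint32_t samples[] = { 0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu };
    for (uint32_t off_lo : samples) {
        int32_t  signed_off = (int32_t)(off_lo + 0x80000000u);
        uint64_t rebased    = base + scale * 0x80000000ull;
        uint64_t addr       = rebased + (int64_t)scale * signed_off;
        assert(addr == base + scale * (uint64_t)off_lo);
    }
    return 0;
}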
static FORCEINLINE void // TODO
__scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
__vec16_i8 value,
__vec16_i1 mask) {
__vec16_i1 still_to_do = mask;
__vec16_i32 tmp = _mm512_extload_epi32(&value, _MM_UPCONV_EPI32_SINT8,
_MM_BROADCAST32_NONE, _MM_HINT_NONE);
// _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
while (still_to_do) {
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
@@ -1835,15 +2142,14 @@ __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets
void * base = (void*)((unsigned long)_base +
((scale*(unsigned long)hi32) << 32));
_mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
value,
_MM_DOWNCONV_EPI32_NONE, scale,
tmp,
_MM_DOWNCONV_EPI32_SINT8, scale,
_MM_HINT_NONE);
still_to_do = _mm512_kxor(match,still_to_do);
}
}
static FORCEINLINE __vec16_i32
__gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
__vec16_i1 mask)
@@ -1876,17 +2182,15 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
// packed load/store
///////////////////////////////////////////////////////////////////////////
static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val,
__vec16_i1 mask) {
__vec16_i32 v;
static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) {
__vec16_i32 v = __load<64>(val);
v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v = _mm512_mask_extloadunpackhi_epi32(_mm512_undefined_epi32(), mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
__store<64>(val, v);
return _mm_countbits_32(uint32_t(mask));
}
static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
__vec16_i1 mask) {
static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) {
_mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
return _mm_countbits_32(uint32_t(mask));
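
A scalar sketch of the packed load/store semantics implemented above: values move between consecutive memory slots and the active lanes only, and the return value is the number of active lanes (bit-per-lane mask assumed):

#include <cstdint>

// Scalar reference for __packed_load_active / __packed_store_active.
static int32_t ref_packed_load_active(const uint32_t *p, uint32_t val[16], uint16_t mask) {
    int n = 0;
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1u << lane))
            val[lane] = p[n++];          // inactive lanes keep their old contents
    return n;
}
static int32_t ref_packed_store_active(uint32_t *p, const uint32_t val[16], uint16_t mask) {
    int n = 0;
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1u << lane))
            p[n++] = val[lane];
    return n;
}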
@@ -1918,6 +2222,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) {
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
}
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec16_i32 offsets, __vec16_i1 mask) { \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
__vec16_i1 copy_mask = _mm512_kmov(mask); \
mask = _mm512_kswapb(mask, copy_mask); \
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {}
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
__vec16_i32 offsets, __vec16_i1 mask) {}
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
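
For reference, the effect a varying read prefetch is expected to have, written lane by lane. This is a sketch only: the macro above issues the KNC gather-prefetch instruction instead, the 64-bit-address variants are left as empty stubs here, and _MM_HINT_T0 stands in for whichever cache level CACHE_NUM selects.

#include <xmmintrin.h>
#include <cstdint>

// Scalar sketch: touch the cache line of base + scale*offsets[lane] for every
// active lane (bit-per-lane mask assumed). No value is returned or written.
static void ref_prefetch_read_varying(const uint8_t *base, uint32_t scale,
                                      const int32_t offsets[16], uint16_t mask) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1u << lane))
            _mm_prefetch((const char *)(base + (int64_t)offsets[lane] * (int64_t)scale),
                         _MM_HINT_T0);
}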
///////////////////////////////////////////////////////////////////////////
// atomics
///////////////////////////////////////////////////////////////////////////

View File

@@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) {
_mm_prefetch((char *)ptr, _MM_HINT_NTA);
}
#define PREFETCH_READ_VARYING(CACHE_NUM) \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
__vec4_i32 offsets, __vec4_i1 mask) {} \
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {} \
PREFETCH_READ_VARYING(1)
PREFETCH_READ_VARYING(2)
PREFETCH_READ_VARYING(3)
PREFETCH_READ_VARYING(nt)
///////////////////////////////////////////////////////////////////////////
// atomics

View File

@@ -257,6 +257,32 @@
./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
.\tests\foreach-double-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.5 cl -O2 *
.\tests\foreach-double-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.5 cl -O2 *
.\tests\foreach-double-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.5 cl -O2 *
@@ -267,7 +293,6 @@
./tests/ptr-22.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/ptr-22.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O2 *
./tests/ptr-22.ispc runfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
@@ -454,3 +479,35 @@
./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
./tests/reduce-equal-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 *
./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 *
./tests/psubus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/psubus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/psubus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
./tests/psubus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *

View File

@@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_hasTranscendentals(false),
m_hasTrigonometry(false),
m_hasRsqrtd(false),
m_hasRcpd(false)
m_hasRcpd(false),
m_hasVecPrefetch(false)
{
if (isa == NULL) {
if (cpu != NULL) {
@@ -386,6 +387,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasTrigonometry = false;
this->m_hasGather = this->m_hasScatter = true;
this->m_hasRsqrtd = this->m_hasRcpd = true;
// Set to true because MIC has a hardware vector prefetch instruction
this->m_hasVecPrefetch = true;
}
else if (!strcasecmp(isa, "generic-32") ||
!strcasecmp(isa, "generic-x32")) {

ispc.h
View File

@@ -283,6 +283,8 @@ public:
bool hasRcpd() const {return m_hasRcpd;}
bool hasVecPrefetch() const {return m_hasVecPrefetch;}
private:
/** llvm Target object representing this target. */
@@ -385,6 +387,9 @@ private:
/** Indicates whether there is an ISA double precision rcp. */
bool m_hasRcpd;
/** Indicates whether the target has a hardware instruction for vector prefetch. */
bool m_hasVecPrefetch;
};

View File

@@ -403,6 +403,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
@@ -424,6 +425,8 @@
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'AND'$(LLVM_VERSION)'!='LLVM_3_5'">LLVMProfileData.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

View File

@@ -604,12 +604,23 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
if (diBuilder) {
llvm::DIFile file = pos.GetDIFile();
llvm::DIGlobalVariable var =
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
diBuilder->createGlobalVariable(file,
name,
name,
file,
pos.first_line,
sym->type->GetDIType(file),
(sym->storageClass == SC_STATIC),
sym->storagePtr);
#else
diBuilder->createGlobalVariable(name,
file,
pos.first_line,
sym->type->GetDIType(file),
(sym->storageClass == SC_STATIC),
sym->storagePtr);
#endif
Assert(var.Verify());
}
}
@@ -1304,18 +1315,33 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
#endif
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
std::string error;
#else // LLVM 3.6+
std::error_code error;
#endif
llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
if (error.size()) {
#else // LLVM 3.6+
if (error) {
#endif
fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
return false;
}
llvm::PassManager pm;
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
#elif defined(LLVM_3_5)
pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else // LLVM 3.6+
llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
dlp->doInitialization(*module);
pm.add(dlp);
#endif
llvm::formatted_raw_ostream fos(of->os());

opt.cpp
View File

@@ -479,10 +479,14 @@ Optimize(llvm::Module *module, int optLevel) {
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
optPM.add(targetLibraryInfo);
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
#elif defined(LLVM_3_5)
optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else // LLVM 3.6+
llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
dlp->doInitialization(*module);
optPM.add(dlp);
#endif
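
The same three-way LLVM-version dispatch now appears both here and in Module::writeObjectFileOrAssembly above; if it spreads further it could be folded into one helper along these lines (a sketch only: lAddDataLayoutPass is a hypothetical name, and it assumes the LLVM_3_x macros and g->target->getDataLayout() already used in this file):

// Hypothetical helper consolidating the DataLayout pass setup used above.
static void lAddDataLayoutPass(llvm::PassManagerBase &pm, llvm::Module &module) {
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
    pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
#elif defined(LLVM_3_5)
    pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
#else // LLVM 3.6+
    llvm::DataLayoutPass *dlp = new llvm::DataLayoutPass();
    dlp->doInitialization(module);
    pm.add(dlp);
#endif
}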
llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
@@ -2117,8 +2121,8 @@ static bool
lGSToGSBaseOffsets(llvm::CallInst *callInst) {
struct GSInfo {
GSInfo(const char *pgFuncName, const char *pgboFuncName,
const char *pgbo32FuncName, bool ig)
: isGather(ig) {
const char *pgbo32FuncName, bool ig, bool ip)
: isGather(ig), isPrefetch(ip) {
func = m->module->getFunction(pgFuncName);
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
@@ -2126,6 +2130,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
llvm::Function *func;
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
const bool isGather;
const bool isPrefetch;
};
GSInfo gsFuncs[] = {
@@ -2134,148 +2139,176 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
"__pseudo_gather_factored_base_offsets32_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
"__pseudo_gather_factored_base_offsets32_i8",
true),
true, false),
GSInfo("__pseudo_gather32_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
true),
true, false),
GSInfo("__pseudo_gather32_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
true),
true, false),
GSInfo("__pseudo_gather32_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
true),
true, false),
GSInfo("__pseudo_gather32_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
true),
true, false),
GSInfo("__pseudo_gather32_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
true),
true, false),
GSInfo("__pseudo_scatter32_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
false),
false, false),
GSInfo("__pseudo_scatter32_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
false),
false, false),
GSInfo("__pseudo_scatter32_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
false),
false, false),
GSInfo("__pseudo_scatter32_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
false),
false, false),
GSInfo("__pseudo_scatter32_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
false),
false, false),
GSInfo("__pseudo_scatter32_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
false),
false, false),
GSInfo("__pseudo_gather64_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
"__pseudo_gather_factored_base_offsets64_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
"__pseudo_gather_factored_base_offsets32_i8",
true),
true, false),
GSInfo("__pseudo_gather64_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
"__pseudo_gather_factored_base_offsets64_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
true),
true, false),
GSInfo("__pseudo_gather64_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
"__pseudo_gather_factored_base_offsets64_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
true),
true, false),
GSInfo("__pseudo_gather64_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
"__pseudo_gather_factored_base_offsets64_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
true),
true, false),
GSInfo("__pseudo_gather64_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
"__pseudo_gather_factored_base_offsets64_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
true),
true, false),
GSInfo("__pseudo_gather64_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
"__pseudo_gather_factored_base_offsets64_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
true),
true, false),
GSInfo("__pseudo_scatter64_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
"__pseudo_scatter_factored_base_offsets64_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
false),
false, false),
GSInfo("__pseudo_scatter64_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
"__pseudo_scatter_factored_base_offsets64_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
false),
false, false),
GSInfo("__pseudo_scatter64_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
"__pseudo_scatter_factored_base_offsets64_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
false),
false, false),
GSInfo("__pseudo_scatter64_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
"__pseudo_scatter_factored_base_offsets64_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
false),
false, false),
GSInfo("__pseudo_scatter64_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
"__pseudo_scatter_factored_base_offsets64_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
false),
false, false),
GSInfo("__pseudo_scatter64_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
"__pseudo_scatter_factored_base_offsets64_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
false),
false, false),
GSInfo("__pseudo_prefetch_read_varying_1",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
false, true),
GSInfo("__pseudo_prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
false, true),
GSInfo("__pseudo_prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
false, true),
GSInfo("__pseudo_prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
false, true),
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -2301,7 +2334,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
callInst);
if (basePtr == NULL || offsetVector == NULL)
if (basePtr == NULL || offsetVector == NULL ||
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
// It's actually a fully general gather/scatter with a varying
// set of base pointers, so leave it as is and continue onward
// to the next instruction...
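
The widened bail-out condition above packs three distinct reasons into one expression; a hypothetical predicate (same logic, name invented here, and it would need GSInfo visible at this scope or a lambda inside the function) makes the prefetch case easier to read:

// Hypothetical helper for the condition above: keep the fully general
// (addresses, mask) form when no base/offset split was found, or when this is
// a prefetch on a target without a hardware vector prefetch instruction.
static bool lMustKeepGeneralForm(llvm::Value *basePtr, llvm::Value *offsetVector,
                                 const GSInfo *info) {
    return basePtr == NULL || offsetVector == NULL ||
           (!info->isGather && info->isPrefetch && !g->target->hasVecPrefetch());
}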
@@ -2316,7 +2350,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
if ((info->isGather == true && g->target->hasGather()) ||
(info->isGather == false && g->target->hasScatter())) {
(info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
// See if the offsets are scaled by 2, 4, or 8. If so,
// extract that scale factor and rewrite the offsets to remove
// it.
@@ -2330,7 +2366,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
gatherScatterFunc = info->baseOffsets32Func;
}
if (info->isGather) {
if (info->isGather || info->isPrefetch) {
llvm::Value *mask = callInst->getArgOperand(1);
// Generate a new function call to the next pseudo gather
@@ -2387,7 +2423,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
gatherScatterFunc = info->baseOffsets32Func;
}
if (info->isGather) {
if (info->isGather || info->isPrefetch) {
llvm::Value *mask = callInst->getArgOperand(1);
// Generate a new function call to the next pseudo gather
@@ -2429,13 +2465,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
static bool
lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
struct GSBOInfo {
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig)
: isGather(ig) {
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
: isGather(ig), isPrefetch(ip) {
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
}
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
const bool isGather;
const bool isPrefetch;
};
GSBOInfo gsFuncs[] = {
@@ -2443,63 +2480,87 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
"__pseudo_gather_factored_base_offsets32_i8",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
"__pseudo_gather_factored_base_offsets32_i8",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
"__pseudo_gather_factored_base_offsets32_i16",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
"__pseudo_gather_factored_base_offsets32_i32",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
"__pseudo_gather_factored_base_offsets32_float",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
"__pseudo_gather_factored_base_offsets32_i64",
true),
true, false),
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
"__pseudo_gather_factored_base_offsets32_double",
true),
true, false),
GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
"__pseudo_scatter_factored_base_offsets32_i8",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
"__pseudo_scatter_factored_base_offsets32_i16",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
"__pseudo_scatter_factored_base_offsets32_i32",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
"__pseudo_scatter_factored_base_offsets32_float",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
"__pseudo_scatter_factored_base_offsets32_i64",
false),
false, false),
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
"__pseudo_scatter_factored_base_offsets32_double",
false),
false, false),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
"__prefetch_read_varying_1",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
"__prefetch_read_varying_2",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
"__prefetch_read_varying_3",
false, true),
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
"__prefetch_read_varying_nt",
false, true),
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -4290,149 +4351,170 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
static bool
lReplacePseudoGS(llvm::CallInst *callInst) {
struct LowerGSInfo {
LowerGSInfo(const char *pName, const char *aName, bool ig)
: isGather(ig) {
LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)
: isGather(ig), isPrefetch(ip) {
pseudoFunc = m->module->getFunction(pName);
actualFunc = m->module->getFunction(aName);
}
llvm::Function *pseudoFunc;
llvm::Function *actualFunc;
const bool isGather;
const bool isPrefetch;
};
LowerGSInfo lgsInfo[] = {
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true),
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true),
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true),
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true),
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true),
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true),
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true),
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true),
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true),
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true),
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true),
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true),
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
"__gather_factored_base_offsets32_i8", true),
"__gather_factored_base_offsets32_i8", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
"__gather_factored_base_offsets32_i16", true),
"__gather_factored_base_offsets32_i16", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
"__gather_factored_base_offsets32_i32", true),
"__gather_factored_base_offsets32_i32", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
"__gather_factored_base_offsets32_float", true),
"__gather_factored_base_offsets32_float", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
"__gather_factored_base_offsets32_i64", true),
"__gather_factored_base_offsets32_i64", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
"__gather_factored_base_offsets32_double", true),
"__gather_factored_base_offsets32_double", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
"__gather_factored_base_offsets64_i8", true),
"__gather_factored_base_offsets64_i8", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
"__gather_factored_base_offsets64_i16", true),
"__gather_factored_base_offsets64_i16", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
"__gather_factored_base_offsets64_i32", true),
"__gather_factored_base_offsets64_i32", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
"__gather_factored_base_offsets64_float", true),
"__gather_factored_base_offsets64_float", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
"__gather_factored_base_offsets64_i64", true),
"__gather_factored_base_offsets64_i64", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
"__gather_factored_base_offsets64_double", true),
"__gather_factored_base_offsets64_double", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i8",
"__gather_base_offsets32_i8", true),
"__gather_base_offsets32_i8", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i16",
"__gather_base_offsets32_i16", true),
"__gather_base_offsets32_i16", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i32",
"__gather_base_offsets32_i32", true),
"__gather_base_offsets32_i32", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_float",
"__gather_base_offsets32_float", true),
"__gather_base_offsets32_float", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i64",
"__gather_base_offsets32_i64", true),
"__gather_base_offsets32_i64", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_double",
"__gather_base_offsets32_double", true),
"__gather_base_offsets32_double", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i8",
"__gather_base_offsets64_i8", true),
"__gather_base_offsets64_i8", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i16",
"__gather_base_offsets64_i16", true),
"__gather_base_offsets64_i16", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i32",
"__gather_base_offsets64_i32", true),
"__gather_base_offsets64_i32", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_float",
"__gather_base_offsets64_float", true),
"__gather_base_offsets64_float", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i64",
"__gather_base_offsets64_i64", true),
"__gather_base_offsets64_i64", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_double",
"__gather_base_offsets64_double", true),
"__gather_base_offsets64_double", true, false),
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false),
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false),
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false),
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false),
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false),
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false),
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false),
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false),
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false),
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false),
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false),
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false),
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
"__scatter_factored_base_offsets32_i8", false),
"__scatter_factored_base_offsets32_i8", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
"__scatter_factored_base_offsets32_i16", false),
"__scatter_factored_base_offsets32_i16", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
"__scatter_factored_base_offsets32_i32", false),
"__scatter_factored_base_offsets32_i32", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
"__scatter_factored_base_offsets32_float", false),
"__scatter_factored_base_offsets32_float", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
"__scatter_factored_base_offsets32_i64", false),
"__scatter_factored_base_offsets32_i64", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
"__scatter_factored_base_offsets32_double", false),
"__scatter_factored_base_offsets32_double", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
"__scatter_factored_base_offsets64_i8", false),
"__scatter_factored_base_offsets64_i8", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
"__scatter_factored_base_offsets64_i16", false),
"__scatter_factored_base_offsets64_i16", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
"__scatter_factored_base_offsets64_i32", false),
"__scatter_factored_base_offsets64_i32", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
"__scatter_factored_base_offsets64_float", false),
"__scatter_factored_base_offsets64_float", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
"__scatter_factored_base_offsets64_i64", false),
"__scatter_factored_base_offsets64_i64", false, false),
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
"__scatter_factored_base_offsets64_double", false),
"__scatter_factored_base_offsets64_double", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
"__scatter_base_offsets32_i8", false),
"__scatter_base_offsets32_i8", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
"__scatter_base_offsets32_i16", false),
"__scatter_base_offsets32_i16", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
"__scatter_base_offsets32_i32", false),
"__scatter_base_offsets32_i32", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_float",
"__scatter_base_offsets32_float", false),
"__scatter_base_offsets32_float", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
"__scatter_base_offsets32_i64", false),
"__scatter_base_offsets32_i64", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets32_double",
"__scatter_base_offsets32_double", false),
"__scatter_base_offsets32_double", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
"__scatter_base_offsets64_i8", false),
"__scatter_base_offsets64_i8", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
"__scatter_base_offsets64_i16", false),
"__scatter_base_offsets64_i16", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
"__scatter_base_offsets64_i32", false),
"__scatter_base_offsets64_i32", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_float",
"__scatter_base_offsets64_float", false),
"__scatter_base_offsets64_float", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
"__scatter_base_offsets64_i64", false),
"__scatter_base_offsets64_i64", false, false),
LowerGSInfo("__pseudo_scatter_base_offsets64_double",
"__scatter_base_offsets64_double", false),
"__scatter_base_offsets64_double", false, false),
LowerGSInfo("__pseudo_prefetch_read_varying_1",
"__prefetch_read_varying_1", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
"__prefetch_read_varying_1_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_2",
"__prefetch_read_varying_2", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
"__prefetch_read_varying_2_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_3",
"__prefetch_read_varying_3", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
"__prefetch_read_varying_3_native", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_nt",
"__prefetch_read_varying_nt", false, true),
LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
"__prefetch_read_varying_nt_native", false, true),
};
llvm::Function *calledFunc = callInst->getCalledFunction();
@@ -4459,7 +4541,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
if (gotPosition && g->target->getVectorWidth() > 1) {
if (info->isGather)
PerformanceWarning(pos, "Gather required to load value.");
else
else if (!info->isPrefetch)
PerformanceWarning(pos, "Scatter required to store value.");
}
return true;
@@ -4740,6 +4822,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
"__scatter64_i8", "__scatter64_i16",
"__scatter64_i32", "__scatter64_i64",
"__scatter64_float", "__scatter64_double",
"__prefetch_read_varying_1", "__prefetch_read_varying_2",
"__prefetch_read_varying_3", "__prefetch_read_varying_nt",
"__keep_funcs_live",
};

View File

@@ -647,8 +647,8 @@ def run_tests(options1, args, print_version):
options.include_file = "examples/intrinsics/generic-64.h"
options.target = "generic-64"
elif options.target == "knc":
error("No knc #include specified; using examples/intrinsics/knc-i1x16.h\n", 2)
options.include_file = "examples/intrinsics/knc-i1x16.h"
error("No knc #include specified; using examples/intrinsics/knc.h\n", 2)
options.include_file = "examples/intrinsics/knc.h"
if options.compiler_exe == None:
if (options.target == "knc"):

View File

@@ -847,43 +847,19 @@ static inline void prefetch_nt(const void * uniform ptr) {
}
static inline void prefetch_l1(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l1(p);
}
__pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
}
static inline void prefetch_l2(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l2(p);
}
__pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
}
static inline void prefetch_l3(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l3(p);
}
__pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
}
static inline void prefetch_nt(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_nt(p);
}
__pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
}
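
The varying prefetch overloads above now hand the whole (addresses, active-mask) pair to the __pseudo_prefetch_read_varying_* calls; the optimizer either rewrites those to the *_native base+offset form on targets with hasVecPrefetch(), or lowers them to the generic __prefetch_read_varying_* functions. A lane-by-lane sketch of what such a generic fallback could do, instead of the empty stubs shown earlier (hypothetical implementation; layouts assumed to match knc.h, i.e. 16 64-bit lane addresses and a 16-bit mask):

#include <xmmintrin.h>
#include <cstdint>

// Possible lane-by-lane fallback for __prefetch_read_varying_1 on targets
// without a vector prefetch instruction: one scalar prefetch per active lane.
static void fallback_prefetch_read_varying_1(const uint64_t addr[16], uint16_t mask) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1u << lane))
            _mm_prefetch((const char *)(uintptr_t)addr[lane], _MM_HINT_T0);
}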
///////////////////////////////////////////////////////////////////////////

View File

@@ -712,7 +712,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
}
}
/** Emit code for an if test that checks the mask and the test values and
tries to be smart about jumping over code that doesn't need to be run.
*/
@@ -1101,8 +1100,10 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
// the code for the test. This is only necessary for varying loops;
// 'uniform' loops just jump when they hit a continue statement and
// don't mess with the mask.
if (!uniformTest)
if (!uniformTest) {
ctx->RestoreContinuedLanes();
ctx->ClearBreakLanes();
}
llvm::Value *testValue = testExpr->GetValue(ctx);
if (!testValue)
return;
@@ -1310,6 +1311,8 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
// test code.
ctx->SetCurrentBasicBlock(bstep);
ctx->RestoreContinuedLanes();
ctx->ClearBreakLanes();
if (step)
step->EmitCode(ctx);
ctx->BranchInst(btest);

View File

@@ -0,0 +1,22 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int64 *ptr = &(a[programIndex+zero]);
prefetch_l1(ptr);
prefetch_l2(ptr);
prefetch_l3(ptr);
prefetch_nt(ptr);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}