Merge branch 'master' into nvptx_clean_master
This commit is contained in:
3
alloy.py
3
alloy.py
@@ -110,8 +110,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
|
||||
if version_LLVM == "trunk":
|
||||
SVN_PATH="trunk"
|
||||
if version_LLVM == "3.5":
|
||||
# SVN_PATH=tags/RELEASE_35/rc1
|
||||
SVN_PATH="branches/release_35"
|
||||
SVN_PATH="tags/RELEASE_350/final"
|
||||
version_LLVM = "3_5"
|
||||
if version_LLVM == "3.4":
|
||||
SVN_PATH="tags/RELEASE_34/dot2-final"
|
||||
|
||||
30
builtins.cpp
30
builtins.cpp
@@ -555,6 +555,10 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__prefetch_read_uniform_2",
|
||||
"__prefetch_read_uniform_3",
|
||||
"__prefetch_read_uniform_nt",
|
||||
"__pseudo_prefetch_read_varying_1",
|
||||
"__pseudo_prefetch_read_varying_2",
|
||||
"__pseudo_prefetch_read_varying_3",
|
||||
"__pseudo_prefetch_read_varying_nt",
|
||||
"__psubs_vi8",
|
||||
"__psubs_vi16",
|
||||
"__psubus_vi8",
|
||||
@@ -780,7 +784,11 @@ void
|
||||
AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable) {
|
||||
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
|
||||
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
|
||||
#else // LLVM 3.6+
|
||||
llvm::MemoryBufferRef bcBuf = llvm::MemoryBuffer::getMemBuffer(sb)->getMemBufferRef();
|
||||
#endif
|
||||
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
|
||||
llvm::ErrorOr<llvm::Module *> ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx);
|
||||
@@ -910,12 +918,23 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
// have the DW_AT_artifical attribute. It's not clear if this
|
||||
// matters for anything though.
|
||||
llvm::DIGlobalVariable var =
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
|
||||
m->diBuilder->createGlobalVariable(file,
|
||||
name,
|
||||
name,
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
true /* static */,
|
||||
sym->storagePtr);
|
||||
#else
|
||||
m->diBuilder->createGlobalVariable(name,
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
true /* static */,
|
||||
sym->storagePtr);
|
||||
#endif
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
@@ -970,12 +989,23 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
llvm::DIType diType = sym->type->GetDIType(file);
|
||||
Assert(diType.Verify());
|
||||
llvm::DIGlobalVariable var =
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
|
||||
m->diBuilder->createGlobalVariable(file,
|
||||
sym->name.c_str(),
|
||||
sym->name.c_str(),
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
false /* static */,
|
||||
sym->storagePtr);
|
||||
#else
|
||||
m->diBuilder->createGlobalVariable(sym->name.c_str(),
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
false /* static */,
|
||||
sym->storagePtr);
|
||||
#endif
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -370,6 +370,14 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
||||
|
||||
declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
|
||||
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
|
||||
104
builtins/util.m4
104
builtins/util.m4
@@ -1584,6 +1584,50 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline {
|
||||
call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1)
|
||||
')
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
|
||||
define void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1)
|
||||
')
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
|
||||
define void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1)
|
||||
')
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
|
||||
define void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) alwaysinline {
|
||||
per_lane(WIDTH, <WIDTH x MASK> %mask, `
|
||||
%iptr_LANE_ID = extractelement <WIDTH x i64> %addr, i32 LANE
|
||||
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8*
|
||||
call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1)
|
||||
')
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -2535,6 +2579,31 @@ declare void
|
||||
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
|
||||
<WIDTH x double>, <WIDTH x MASK>) nounwind
|
||||
|
||||
|
||||
declare void @__pseudo_prefetch_read_varying_1(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
declare void
|
||||
@__pseudo_prefetch_read_varying_1_native(i8 *, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind
|
||||
|
||||
declare void @__pseudo_prefetch_read_varying_2(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
declare void
|
||||
@__pseudo_prefetch_read_varying_2_native(i8 *, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind
|
||||
|
||||
declare void @__pseudo_prefetch_read_varying_3(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
declare void
|
||||
@__pseudo_prefetch_read_varying_3_native(i8 *, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind
|
||||
|
||||
declare void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
declare void
|
||||
@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
declare void @__use8(<WIDTH x i8>)
|
||||
@@ -3034,6 +3103,41 @@ ifelse(HAVE_SCATTER, `1',
|
||||
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; prefetchs
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_1(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_2(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_3(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0,
|
||||
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
|
||||
call void @__prefetch_read_varying_nt(<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
10
cbackend.cpp
10
cbackend.cpp
@@ -4945,9 +4945,19 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
|
||||
llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None;
|
||||
#endif
|
||||
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
|
||||
std::string error;
|
||||
#else // LLVM 3.6+
|
||||
std::error_code error;
|
||||
#endif
|
||||
|
||||
llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags);
|
||||
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
|
||||
if (error.size()) {
|
||||
#else // LLVM 3.6+
|
||||
if (error) {
|
||||
#endif
|
||||
fprintf(stderr, "Error opening output file \"%s\".\n", fn);
|
||||
return false;
|
||||
}
|
||||
|
||||
29
ctx.cpp
29
ctx.cpp
@@ -745,6 +745,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
|
||||
// that have executed a 'break' statement:
|
||||
// breakLanes = breakLanes | mask
|
||||
AssertPos(currentPos, breakLanesPtr != NULL);
|
||||
|
||||
llvm::Value *mask = GetInternalMask();
|
||||
llvm::Value *breakMask = LoadInst(breakLanesPtr,
|
||||
"break_mask");
|
||||
@@ -927,6 +928,16 @@ FunctionEmitContext::RestoreContinuedLanes() {
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::ClearBreakLanes() {
|
||||
if (breakLanesPtr == NULL)
|
||||
return;
|
||||
|
||||
// breakLanes = 0
|
||||
StoreInst(LLVMMaskAllOff, breakLanesPtr);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
|
||||
llvm::Value *oldMask = GetInternalMask();
|
||||
@@ -1636,14 +1647,16 @@ FunctionEmitContext::StartScope() {
|
||||
llvm::DILexicalBlock lexicalBlock =
|
||||
m->diBuilder->createLexicalBlock(parentScope, diFile,
|
||||
currentPos.first_line,
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
|
||||
#if defined(LLVM_3_5)
|
||||
// Revision 202736 in LLVM adds support of DWARF discriminator
|
||||
// to the last argument and revision 202737 in clang adds 0
|
||||
// for the last argument by default.
|
||||
currentPos.first_column, 0);
|
||||
#else
|
||||
// Revision 216239 in LLVM removes support of DWARF discriminator
|
||||
// as the last argument
|
||||
currentPos.first_column);
|
||||
#endif
|
||||
#endif // LLVM 3.2, 3.3, 3.4 and 3.6+
|
||||
AssertPos(currentPos, lexicalBlock.Verify());
|
||||
debugScopes.push_back(lexicalBlock);
|
||||
}
|
||||
@@ -1683,8 +1696,14 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
||||
diType,
|
||||
true /* preserve through opts */);
|
||||
AssertPos(currentPos, var.Verify());
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
|
||||
llvm::DIExpression E = m->diBuilder->createExpression();
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
|
||||
#else
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||
#endif
|
||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||
}
|
||||
|
||||
@@ -1710,8 +1729,14 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) {
|
||||
flags,
|
||||
argNum+1);
|
||||
AssertPos(currentPos, var.Verify());
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
|
||||
llvm::DIExpression E = m->diBuilder->createExpression();
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock);
|
||||
#else
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||
#endif
|
||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||
}
|
||||
|
||||
|
||||
7
ctx.h
7
ctx.h
@@ -196,6 +196,13 @@ public:
|
||||
previous iteration. */
|
||||
void RestoreContinuedLanes();
|
||||
|
||||
/** This method is called by code emitting IR for a loop. It clears
|
||||
any lanes that contained a break since the mask has been updated to take
|
||||
them into account. This is necessary as all the bail out checks for
|
||||
breaks are meant to only deal with lanes breaking on the current iteration.
|
||||
*/
|
||||
void ClearBreakLanes();
|
||||
|
||||
/** Indicates that code generation for a "switch" statement is about to
|
||||
start. isUniform indicates whether the "switch" value is uniform,
|
||||
and bbAfterSwitch gives the basic block immediately following the
|
||||
|
||||
@@ -160,8 +160,8 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include='$(ISPC_file).ispc'>
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) $(flags)</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) -g $(flags)</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) -g $(flags)</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out)</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>
|
||||
|
||||
@@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
|
||||
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
||||
}
|
||||
|
||||
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||
__vec16_i32 offsets, __vec16_i1 mask) {} \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||
|
||||
PREFETCH_READ_VARYING(1)
|
||||
PREFETCH_READ_VARYING(2)
|
||||
PREFETCH_READ_VARYING(3)
|
||||
PREFETCH_READ_VARYING(nt)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// atomics
|
||||
|
||||
|
||||
@@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
|
||||
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
||||
}
|
||||
|
||||
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||
__vec32_i32 offsets, __vec32_i1 mask) {} \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {} \
|
||||
|
||||
PREFETCH_READ_VARYING(1)
|
||||
PREFETCH_READ_VARYING(2)
|
||||
PREFETCH_READ_VARYING(3)
|
||||
PREFETCH_READ_VARYING(nt)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// atomics
|
||||
|
||||
|
||||
@@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
|
||||
static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
|
||||
}
|
||||
|
||||
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||
__vec64_i32 offsets, __vec64_i1 mask) {} \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {} \
|
||||
|
||||
PREFETCH_READ_VARYING(1)
|
||||
PREFETCH_READ_VARYING(2)
|
||||
PREFETCH_READ_VARYING(3)
|
||||
PREFETCH_READ_VARYING(nt)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// atomics
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
Copyright (c) 2010-2013, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -31,6 +31,7 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <limits.h> // INT_MIN
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
@@ -525,11 +526,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
|
||||
*p = v;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __smear_i1(int i);
|
||||
template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
|
||||
template <class RetVecType> static RetVecType __smear_i1(int i);
|
||||
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_i1();
|
||||
template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
|
||||
template <class RetVecType> static RetVecType __setzero_i1();
|
||||
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
|
||||
|
||||
template <class RetVecType> __vec16_i1 __undef_i1();
|
||||
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); }
|
||||
@@ -677,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_
|
||||
static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; }
|
||||
static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; }
|
||||
|
||||
template <class RetVecType> RetVecType __smear_i32(int32_t i);
|
||||
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
|
||||
template <class RetVecType> RetVecType static __smear_i32(int32_t i);
|
||||
template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
|
||||
|
||||
static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
|
||||
static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0);
|
||||
@@ -686,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
|
||||
static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
|
||||
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_i32();
|
||||
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
|
||||
template <class RetVecType> static RetVecType __setzero_i32();
|
||||
template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
|
||||
|
||||
template <class RetVecType> RetVecType __undef_i32();
|
||||
template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
|
||||
template <class RetVecType> static RetVecType __undef_i32();
|
||||
template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
|
||||
|
||||
static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); }
|
||||
|
||||
@@ -742,11 +743,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
|
||||
}
|
||||
|
||||
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
|
||||
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
|
||||
template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
|
||||
{
|
||||
return _mm512_load_epi32(p);
|
||||
}
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
|
||||
template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
|
||||
{
|
||||
_mm512_store_epi32(p, v);
|
||||
}
|
||||
@@ -1017,21 +1018,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
|
||||
}
|
||||
|
||||
#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */
|
||||
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
|
||||
template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
|
||||
{
|
||||
__m512i v2 = _mm512_load_epi32(p);
|
||||
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
|
||||
return __vec16_i64(v2,v1);
|
||||
}
|
||||
template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
|
||||
template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
|
||||
template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
|
||||
{
|
||||
__m512i v1 = v.v2;
|
||||
__m512i v2 = v.v1;
|
||||
_mm512_store_epi64(p, v2);
|
||||
_mm512_store_epi64(((uint8_t*)p)+64, v1);
|
||||
}
|
||||
template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
|
||||
template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1067,14 +1068,14 @@ static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b)
|
||||
static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; }
|
||||
static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; }
|
||||
|
||||
template <class RetVecType> RetVecType __smear_float(float f);
|
||||
template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
|
||||
template <class RetVecType> static RetVecType __smear_float(float f);
|
||||
template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_float();
|
||||
template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
|
||||
template <class RetVecType> static RetVecType __setzero_float();
|
||||
template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
|
||||
|
||||
template <class RetVecType> RetVecType __undef_float();
|
||||
template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
|
||||
template <class RetVecType> static RetVecType __undef_float();
|
||||
template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
|
||||
|
||||
static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index)
|
||||
{
|
||||
@@ -1131,12 +1132,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
|
||||
}
|
||||
|
||||
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
|
||||
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
|
||||
template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
|
||||
{
|
||||
return _mm512_load_ps(p);
|
||||
}
|
||||
/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
|
||||
template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
|
||||
{
|
||||
_mm512_store_ps(p, v);
|
||||
}
|
||||
@@ -1309,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b)
|
||||
static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; }
|
||||
static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; }
|
||||
|
||||
template <class RetVecType> RetVecType __smear_double(double d);
|
||||
template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
|
||||
template <class RetVecType> static RetVecType __smear_double(double d);
|
||||
template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_double();
|
||||
template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
|
||||
template <class RetVecType> static RetVecType __setzero_double();
|
||||
template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
|
||||
|
||||
template <class RetVecType> RetVecType __undef_double();
|
||||
template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
|
||||
template <class RetVecType> static RetVecType __undef_double();
|
||||
template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
|
||||
|
||||
#define CASTD2F(_v_, _v_hi_, _v_lo_) \
|
||||
__vec16_f _v_hi_, _v_lo_; \
|
||||
@@ -1390,17 +1391,17 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
|
||||
|
||||
|
||||
#if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */
|
||||
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
|
||||
template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
|
||||
{
|
||||
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
|
||||
}
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
|
||||
template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
|
||||
{
|
||||
_mm512_store_pd(p, v.v1);
|
||||
_mm512_store_pd(((uint8_t*)p)+64, v.v2);
|
||||
}
|
||||
template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
|
||||
template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
|
||||
template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
|
||||
template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -2162,6 +2163,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t
|
||||
static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
|
||||
{
|
||||
const __vec16_i64 offsets = _offsets.cvt2hilo();
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
__vec16_i1 still_to_do = mask;
|
||||
__vec16_i32 tmp;
|
||||
while (still_to_do) {
|
||||
@@ -2172,8 +2174,8 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base,
|
||||
_MM_UPCONV_EPI32_SINT8, scale,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match,still_to_do);
|
||||
@@ -2197,6 +2199,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32
|
||||
static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
|
||||
{
|
||||
const __vec16_i64 offsets = _offsets.cvt2hilo();
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
// There is no gather instruction with 64-bit offsets in KNC.
|
||||
// We have to manually iterate over the upper 32 bits ;-)
|
||||
__vec16_i1 still_to_do = mask;
|
||||
@@ -2209,8 +2212,8 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
ret = _mm512_mask_i32extgather_epi32(ret, match, signed_offsets, base,
|
||||
_MM_UPCONV_EPI32_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match, still_to_do);
|
||||
@@ -2230,6 +2233,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32
|
||||
static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask)
|
||||
{
|
||||
const __vec16_i64 offsets = _offsets.cvt2hilo();
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
// There is no gather instruction with 64-bit offsets in KNC.
|
||||
// We have to manually iterate over the upper 32 bits ;-)
|
||||
__vec16_i1 still_to_do = mask;
|
||||
@@ -2242,8 +2246,8 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base,
|
||||
_MM_UPCONV_PS_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match, still_to_do);
|
||||
@@ -2339,6 +2343,7 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale,
|
||||
static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask)
|
||||
{
|
||||
const __vec16_i64 offsets = _offsets.cvt2hilo();
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
|
||||
__vec16_i1 still_to_do = mask;
|
||||
while (still_to_do) {
|
||||
@@ -2349,8 +2354,8 @@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
_mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
_mm512_mask_i32extscatter_epi32(base, match, signed_offsets,
|
||||
value,
|
||||
_MM_DOWNCONV_EPI32_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
@@ -2370,6 +2375,7 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal
|
||||
static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask)
|
||||
{
|
||||
const __vec16_i64 offsets = _offsets.cvt2hilo();
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
|
||||
__vec16_i1 still_to_do = mask;
|
||||
while (still_to_do) {
|
||||
@@ -2380,8 +2386,9 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
_mm512_mask_i32extscatter_ps(base, match, offsets.v_lo,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
|
||||
_mm512_mask_i32extscatter_ps(base, match, signed_offsets,
|
||||
value,
|
||||
_MM_DOWNCONV_PS_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
@@ -2543,6 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
|
||||
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
||||
}
|
||||
|
||||
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||
__vec16_i32 offsets, __vec16_i1 mask) { \
|
||||
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
|
||||
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
|
||||
__vec16_i1 copy_mask = _mm512_kmov(mask); \
|
||||
_mm512_kswapb(mask, copy_mask); \
|
||||
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
|
||||
} \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||
|
||||
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
|
||||
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
|
||||
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
|
||||
|
||||
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
|
||||
__vec16_i32 offsets, __vec16_i1 mask) {}
|
||||
|
||||
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// atomics
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) {
|
||||
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
||||
}
|
||||
|
||||
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||
__vec16_i32 offsets, __vec16_i1 mask) { \
|
||||
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
|
||||
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
|
||||
__vec16_i1 copy_mask = _mm512_kmov(mask); \
|
||||
_mm512_kswapb(mask, copy_mask); \
|
||||
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
|
||||
} \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||
|
||||
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
|
||||
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
|
||||
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
|
||||
|
||||
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
|
||||
__vec16_i32 offsets, __vec16_i1 mask) {}
|
||||
|
||||
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// atomics
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
/**
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -31,6 +31,7 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <limits.h> // INT_MIN
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
@@ -43,6 +44,15 @@
|
||||
#include <iostream> // for operator<<(m512[i])
|
||||
#include <iomanip> // for operator<<(m512[i])
|
||||
|
||||
#if 0
|
||||
#define STRING(x) #x
|
||||
#define TOSTRING(x) STRING(x)
|
||||
#define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl
|
||||
#define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl
|
||||
#define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl
|
||||
#define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl
|
||||
#define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl
|
||||
#endif
|
||||
|
||||
#define FORCEINLINE __forceinline
|
||||
#ifdef _MSC_VER
|
||||
@@ -75,7 +85,44 @@ typedef int64_t __vec1_i64;
|
||||
|
||||
struct __vec16_i32;
|
||||
|
||||
#if 0
|
||||
/* (iw) actually, this *SHOULD* be the right implementation for a
|
||||
vec16_i1: this one is a class that can have a constructor (which
|
||||
ISPC sometimes emits for these vectors...) This version might
|
||||
not be working with embree's ISPC bindings, probably because
|
||||
embree still uses the 'wrong' implementation */
|
||||
typedef struct PRE_ALIGN(2) __vec16_i1
|
||||
{
|
||||
FORCEINLINE operator __mmask16() const { return v; }
|
||||
FORCEINLINE __vec16_i1() { }
|
||||
FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { }
|
||||
FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3,
|
||||
bool v4, bool v5, bool v6, bool v7,
|
||||
bool v8, bool v9, bool v10, bool v11,
|
||||
bool v12, bool v13, bool v14, bool v15) {
|
||||
v = ((v0 & 1) |
|
||||
((v1 & 1) << 1) |
|
||||
((v2 & 1) << 2) |
|
||||
((v3 & 1) << 3) |
|
||||
((v4 & 1) << 4) |
|
||||
((v5 & 1) << 5) |
|
||||
((v6 & 1) << 6) |
|
||||
((v7 & 1) << 7) |
|
||||
((v8 & 1) << 8) |
|
||||
((v9 & 1) << 9) |
|
||||
((v10 & 1) << 10) |
|
||||
((v11 & 1) << 11) |
|
||||
((v12 & 1) << 12) |
|
||||
((v13 & 1) << 13) |
|
||||
((v14 & 1) << 14) |
|
||||
((v15 & 1) << 15));
|
||||
}
|
||||
__mmask16 v;
|
||||
} POST_ALIGN(2) __vec16_i1;
|
||||
|
||||
#else
|
||||
typedef __mmask16 POST_ALIGN(2) __vec16_i1;
|
||||
#endif
|
||||
|
||||
typedef struct PRE_ALIGN(64) __vec16_f {
|
||||
FORCEINLINE operator __m512() const { return v; }
|
||||
@@ -167,14 +214,14 @@ struct vec16 {
|
||||
|
||||
PRE_ALIGN(16) struct __vec16_i8 : public vec16<int8_t> {
|
||||
FORCEINLINE __vec16_i8() { }
|
||||
FORCEINLINE __vec16_i8(const __vec16_i8 &o);
|
||||
FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
|
||||
FORCEINLINE __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
|
||||
int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
||||
int8_t v8, int8_t v9, int8_t v10, int8_t v11,
|
||||
int8_t v12, int8_t v13, int8_t v14, int8_t v15)
|
||||
FORCEINLINE __vec16_i8(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3,
|
||||
const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7,
|
||||
const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11,
|
||||
const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
|
||||
: vec16<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7,
|
||||
v8, v9, v10, v11, v12, v13, v14, v15) { }
|
||||
FORCEINLINE __vec16_i8(const __vec16_i8 &o);
|
||||
FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
|
||||
} POST_ALIGN(16);
|
||||
|
||||
PRE_ALIGN(32) struct __vec16_i16 : public vec16<int16_t> {
|
||||
@@ -215,6 +262,28 @@ inline std::ostream &operator<<(std::ostream &out, const __m512 &v)
|
||||
return out;
|
||||
}
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &out, const __vec16_i8 &v)
|
||||
{
|
||||
out << "[";
|
||||
for (int i=0;i<16;i++)
|
||||
out << (i?",":"") << std::dec << std::setw(8) << (int)((unsigned char*)&v)[i] << std::dec;
|
||||
// out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec;
|
||||
|
||||
out << "]" << std::flush;
|
||||
return out;
|
||||
}
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v)
|
||||
{
|
||||
out << "[";
|
||||
uint32_t *ptr = (uint32_t*)&v;
|
||||
for (int i=0;i<16;i++) {
|
||||
uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16];
|
||||
out << (i?",":"") << ((int*)val);
|
||||
}
|
||||
out << "]" << std::flush;
|
||||
return out;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// macros...
|
||||
@@ -299,15 +368,29 @@ static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) {
|
||||
return (mask & (1 << index)) ? true : false;
|
||||
}
|
||||
|
||||
|
||||
static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
|
||||
{
|
||||
//uint *src = (uint *)&v;
|
||||
const uint *src = (const uint *)&v;
|
||||
return src[index+16] | (uint64_t(src[index]) << 32);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
|
||||
static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
|
||||
bool val) {
|
||||
if (val == false)
|
||||
vec->v &= ~(1 << index);
|
||||
else
|
||||
vec->v |= (1 << index);
|
||||
}
|
||||
*/
|
||||
}
|
||||
*/
|
||||
|
||||
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
|
||||
const uint16_t *ptr = (const uint16_t *)p;
|
||||
@@ -321,18 +404,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
|
||||
*ptr = v;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __smear_i1(int i);
|
||||
template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
|
||||
template <class RetVecType> static RetVecType __smear_i1(int i);
|
||||
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
|
||||
return i?0xFFFF:0x0;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_i1();
|
||||
template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
|
||||
template <class RetVecType> static RetVecType __setzero_i1();
|
||||
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __undef_i1();
|
||||
template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
|
||||
template <class RetVecType> static RetVecType __undef_i1();
|
||||
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
|
||||
return __vec16_i1();
|
||||
}
|
||||
|
||||
@@ -342,7 +425,7 @@ template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
|
||||
|
||||
/*
|
||||
|
||||
TODO
|
||||
TODO
|
||||
|
||||
*/
|
||||
|
||||
@@ -353,7 +436,7 @@ TODO
|
||||
|
||||
/*
|
||||
|
||||
TODO
|
||||
TODO
|
||||
|
||||
*/
|
||||
|
||||
@@ -532,8 +615,8 @@ static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t
|
||||
((int32_t *)v)[index] = val;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __smear_i32(int32_t i);
|
||||
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
|
||||
template <class RetVecType> static RetVecType __smear_i32(int32_t i);
|
||||
template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
|
||||
return _mm512_set1_epi32(i);
|
||||
}
|
||||
|
||||
@@ -542,13 +625,13 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
|
||||
static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
|
||||
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_i32();
|
||||
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
|
||||
template <class RetVecType> static RetVecType __setzero_i32();
|
||||
template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
|
||||
return _mm512_setzero_epi32();
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __undef_i32();
|
||||
template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() {
|
||||
template <class RetVecType> static RetVecType __undef_i32();
|
||||
template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() {
|
||||
return __vec16_i32();
|
||||
}
|
||||
|
||||
@@ -557,9 +640,13 @@ static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
|
||||
return _mm512_set1_epi32(val);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i32 __cast_trunc(__vec16_i32, const __vec16_i64 i64) {
|
||||
return __vec16_i32(i64.v_lo);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) {
|
||||
__vec16_i32 idx = __smear_i32<__vec16_i32>(index);
|
||||
__vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7));
|
||||
__vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xf));
|
||||
return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
|
||||
}
|
||||
|
||||
@@ -578,7 +665,7 @@ template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p)
|
||||
#endif
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
|
||||
template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
|
||||
return _mm512_load_epi32(p);
|
||||
}
|
||||
|
||||
@@ -591,18 +678,32 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
|
||||
#endif
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
|
||||
template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
|
||||
_mm512_store_epi32(p, v);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// int64
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
|
||||
static FORCEINLINE
|
||||
void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask)
|
||||
{
|
||||
uint *src = (uint *)&v;
|
||||
return src[index+16] | (int64_t(src[index]) << 32);
|
||||
__m512i v1;
|
||||
__m512i v2;
|
||||
v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
|
||||
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
|
||||
v.v_hi);
|
||||
v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
|
||||
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
|
||||
v.v_lo);
|
||||
v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
|
||||
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
|
||||
v.v_hi);
|
||||
v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
|
||||
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
|
||||
v.v_lo);
|
||||
_mm512_mask_store_epi64(p, mask, v2);
|
||||
_mm512_mask_store_epi64(((uint8_t*)p)+64, mask>>8, v1);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t val) {
|
||||
@@ -611,16 +712,16 @@ static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t
|
||||
}
|
||||
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_i64();
|
||||
template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() {
|
||||
template <class RetVecType> static RetVecType __setzero_i64();
|
||||
template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() {
|
||||
__vec16_i64 ret;
|
||||
ret.v_lo = _mm512_setzero_epi32();
|
||||
ret.v_hi = _mm512_setzero_epi32();
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __undef_i64();
|
||||
template <> static FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() {
|
||||
template <class RetVecType> static RetVecType __undef_i64();
|
||||
template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() {
|
||||
return __vec16_i64();
|
||||
}
|
||||
|
||||
@@ -704,6 +805,13 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) {
|
||||
return __vec16_i64(lo, hi);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) {
|
||||
__vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b),
|
||||
_mm512_srli_epi32(a.v_lo, 32-b));
|
||||
__vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b);
|
||||
return __vec16_i64(lo, hi);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) {
|
||||
__vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo);
|
||||
__vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift));
|
||||
@@ -724,6 +832,16 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) {
|
||||
return __vec16_i64(lo, hi);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) {
|
||||
__vec16_i32 xfer
|
||||
= _mm512_slli_epi32(_mm512_and_epi32(a.v_hi,
|
||||
_mm512_set1_epi32((1<<b)-1)),
|
||||
32-b);
|
||||
__vec16_i32 hi = _mm512_srai_epi32(a.v_hi, b);
|
||||
__vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srli_epi32(a.v_lo, b));
|
||||
return __vec16_i64(lo, hi);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
|
||||
const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
|
||||
return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
|
||||
@@ -731,9 +849,9 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i6
|
||||
|
||||
static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b,
|
||||
__vec16_i1 mask) {
|
||||
__mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
|
||||
__mmask16 lo_match = _mm512_mask_cmpeq_epi32_mask((__mmask16)mask, a.v_lo,b.v_lo);
|
||||
__mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
|
||||
return _mm512_kand(full_match, (__mmask16)mask);
|
||||
return full_match;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
|
||||
@@ -753,7 +871,7 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __smear_i64(const int64_t &l);
|
||||
template <class RetVecType> static RetVecType __smear_i64(const int64_t &l);
|
||||
template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) {
|
||||
const int *i = (const int*)&l;
|
||||
return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1]));
|
||||
@@ -762,10 +880,11 @@ template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l)
|
||||
template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
|
||||
__vec16_i32 v1;
|
||||
__vec16_i32 v2;
|
||||
v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
const uint8_t*ptr = (const uint8_t*)p;
|
||||
v2 = _mm512_extloadunpacklo_epi32(v2, ptr, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v2 = _mm512_extloadunpackhi_epi32(v2, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v1 = _mm512_extloadunpacklo_epi32(v1, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v1 = _mm512_extloadunpackhi_epi32(v1, ptr+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
|
||||
__vec16_i64 ret;
|
||||
ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
|
||||
@@ -783,7 +902,7 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p)
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
|
||||
template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
|
||||
__m512i v2 = _mm512_load_epi32(p);
|
||||
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
|
||||
__vec16_i64 ret;
|
||||
@@ -802,7 +921,7 @@ template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) {
|
||||
template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) {
|
||||
return __load<64>(p);
|
||||
}
|
||||
|
||||
@@ -827,7 +946,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
|
||||
_mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
|
||||
template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
|
||||
__m512i v1;
|
||||
__m512i v2;
|
||||
v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
|
||||
@@ -846,10 +965,72 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
|
||||
_mm512_store_epi64(((uint8_t*)p)+64, v1);
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
|
||||
template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
|
||||
__store<64>(p, v);
|
||||
}
|
||||
|
||||
|
||||
/*! gather vector of 64-bit ints from addresses pointing to uniform ints
|
||||
|
||||
(iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
|
||||
***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't
|
||||
expose whether it's from array of uniform or array of varying
|
||||
poitners, so in here there's no way to tell - only thing we can do
|
||||
is pick one...
|
||||
*/
|
||||
static FORCEINLINE __vec16_i64
|
||||
__gather_base_offsets32_i64(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i64 ret;
|
||||
ret.v_lo = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
|
||||
base, _MM_UPCONV_EPI32_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
ret.v_hi = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
|
||||
base+4, _MM_UPCONV_EPI32_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*! gather vector of 64-bit ints from addresses pointing to uniform ints
|
||||
|
||||
(iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
|
||||
***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't
|
||||
expose whether it's from array of uniform or array of varying
|
||||
poitners, so in here there's no way to tell - only thing we can do
|
||||
is pick one...
|
||||
*/
|
||||
static FORCEINLINE __vec16_i64
|
||||
__gather64_i64(__vec16_i64 addr, __vec16_i1 mask)
|
||||
{
|
||||
__vec16_i64 ret;
|
||||
|
||||
// There is no gather instruction with 64-bit offsets in KNC.
|
||||
// We have to manually iterate over the upper 32 bits ;-)
|
||||
__vec16_i1 still_to_do = mask;
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
while (still_to_do) {
|
||||
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
|
||||
const uint32_t &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
|
||||
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
|
||||
__smear_i32<__vec16_i32>((int32_t)hi32),
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
|
||||
ret.v_lo = _mm512_mask_i32extgather_epi32(ret.v_lo, match, signed_offsets,
|
||||
base, _MM_UPCONV_EPI32_NONE, 1,
|
||||
_MM_HINT_NONE);
|
||||
ret.v_hi = _mm512_mask_i32extgather_epi32(ret.v_hi, match, signed_offsets,
|
||||
base+4, _MM_UPCONV_EPI32_NONE, 1,
|
||||
_MM_HINT_NONE);
|
||||
|
||||
still_to_do = _mm512_kxor(match, still_to_do);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
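// The loop above retires the active lanes in groups that share the same upper
// 32 address bits. A hedged scalar equivalent (hypothetical helper, 16 lanes
// and a bit-per-lane mask assumed):
#include <stdint.h>
#include <string.h>
static inline void
lGather64I64Reference(const uint64_t addr[16], uint16_t mask, int64_t out[16]) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1 << lane))
            // each active lane dereferences its own 64-bit address
            memcpy(&out[lane], (const void *)(uintptr_t)addr[lane], sizeof(int64_t));
}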
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// float
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -948,18 +1129,18 @@ static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float va
|
||||
((float *)v)[index] = val;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __smear_float(float f);
|
||||
template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) {
|
||||
template <class RetVecType> static RetVecType __smear_float(float f);
|
||||
template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) {
|
||||
return _mm512_set_1to16_ps(f);
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_float();
|
||||
template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() {
|
||||
template <class RetVecType> static RetVecType __setzero_float();
|
||||
template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() {
|
||||
return _mm512_setzero_ps();
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __undef_float();
|
||||
template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() {
|
||||
template <class RetVecType> static RetVecType __undef_float();
|
||||
template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() {
|
||||
return __vec16_f();
|
||||
}
|
||||
|
||||
@@ -983,7 +1164,7 @@ template <int ALIGN> static FORCEINLINE __vec16_f __load(const __vec16_f *p) {
|
||||
#endif
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
|
||||
template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
|
||||
return _mm512_load_ps(p);
|
||||
}
|
||||
|
||||
@@ -996,7 +1177,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
|
||||
#endif
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) {
|
||||
template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) {
|
||||
_mm512_store_ps(p, v);
|
||||
}
|
||||
|
||||
@@ -1178,24 +1359,24 @@ static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double v
|
||||
((double *)v)[index] = val;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __smear_double(double d);
|
||||
template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) {
|
||||
template <class RetVecType> static RetVecType __smear_double(double d);
|
||||
template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) {
|
||||
__vec16_d ret;
|
||||
ret.v1 = _mm512_set1_pd(d);
|
||||
ret.v2 = _mm512_set1_pd(d);
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __setzero_double();
|
||||
template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() {
|
||||
template <class RetVecType> static RetVecType __setzero_double();
|
||||
template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() {
|
||||
__vec16_d ret;
|
||||
ret.v1 = _mm512_setzero_pd();
|
||||
ret.v2 = _mm512_setzero_pd();
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <class RetVecType> RetVecType __undef_double();
|
||||
template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() {
|
||||
template <class RetVecType> static RetVecType __undef_double();
|
||||
template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() {
|
||||
return __vec16_d();
|
||||
}
|
||||
|
||||
@@ -1216,14 +1397,14 @@ template <int ALIGN> static FORCEINLINE __vec16_d __load(const __vec16_d *p) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
|
||||
template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
|
||||
__vec16_d ret;
|
||||
ret.v1 = _mm512_load_pd(p);
|
||||
ret.v2 = _mm512_load_pd(((uint8_t*)p)+64);
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) {
|
||||
template <> FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) {
|
||||
return __load<64>(p);
|
||||
}
|
||||
|
||||
@@ -1234,12 +1415,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
|
||||
_mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) {
|
||||
template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) {
|
||||
_mm512_store_pd(p, v.v1);
|
||||
_mm512_store_pd(((uint8_t*)p)+64, v.v2);
|
||||
}
|
||||
|
||||
template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) {
|
||||
template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) {
|
||||
__store<64>(p, v);
|
||||
}
|
||||
|
||||
@@ -1329,16 +1510,15 @@ static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) {
|
||||
|
||||
static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) {
|
||||
__vec16_d ret;
|
||||
ret.v2 = _mm512_cvtpslo_pd(val.v);
|
||||
ret.v1 = _mm512_cvtpslo_pd(val.v);
|
||||
__vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC);
|
||||
ret.v1 = _mm512_cvtpslo_pd(other8);
|
||||
ret.v2 = _mm512_cvtpslo_pd(other8);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) {
|
||||
__m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
|
||||
__m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
|
||||
|
||||
__m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
|
||||
__m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
|
||||
return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA);
|
||||
}
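// Hedged scalar semantics of the two conversions above (a sketch only; it
// assumes v1 holds lanes 0..7 and v2 holds lanes 8..15 of a __vec16_d):
static inline void lFpext16Reference(const float in[16], double out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (double)in[i];   // __cast_fpext widens every lane
}
static inline void lFptrunc16Reference(const double in[16], float out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (float)in[i];    // __cast_fptrunc narrows every lane
}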
|
||||
|
||||
@@ -1352,11 +1532,41 @@ static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) {
|
||||
|
||||
|
||||
static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) {
|
||||
return *(__vec16_i64*)&val;
|
||||
__vec16_i64 ret;
|
||||
ret.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
|
||||
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
|
||||
_mm512_castpd_si512(val.v2));
|
||||
ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF,
|
||||
_mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
|
||||
_mm512_castpd_si512(val.v1));
|
||||
ret.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
|
||||
_mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
|
||||
_mm512_castpd_si512(val.v2));
|
||||
ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF,
|
||||
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
|
||||
_mm512_castpd_si512(val.v1));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) {
|
||||
return *(__vec16_d*)&val;
|
||||
__vec16_d ret;
|
||||
ret.v2 = _mm512_castsi512_pd(
|
||||
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
|
||||
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
|
||||
val.v_hi));
|
||||
ret.v2 = _mm512_castsi512_pd(
|
||||
_mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v2), 0x5555,
|
||||
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
|
||||
val.v_lo));
|
||||
ret.v1 = _mm512_castsi512_pd(
|
||||
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
|
||||
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
|
||||
val.v_hi));
|
||||
ret.v1 = _mm512_castsi512_pd(
|
||||
_mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v1), 0x5555,
|
||||
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
|
||||
val.v_lo));
|
||||
return ret;
|
||||
}
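// Hedged illustration of the layout change performed by the two __cast_bits
// overloads above (assumption: __vec16_i64 keeps the low 32-bit halves of all
// 16 lanes in v_lo and the high halves in v_hi, while __vec16_d keeps lanes
// 0..7 in v1 and lanes 8..15 in v2):
#include <stdint.h>
#include <string.h>
static inline void
lCastBitsI64ToDoubleReference(const uint32_t lo[16], const uint32_t hi[16],
                              double out[16]) {
    for (int i = 0; i < 16; ++i) {
        uint64_t bits = ((uint64_t)hi[i] << 32) | lo[i];
        memcpy(&out[i], &bits, sizeof(bits));   // bit-for-bit reinterpretation per lane
    }
}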
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1488,12 +1698,14 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
|
||||
return _mm512_invsqrt_ps(v);
|
||||
#endif
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
|
||||
__vec16_d y;
|
||||
for (int i = 0; i < 16; i++)
|
||||
__insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
|
||||
return y;
|
||||
}
|
||||
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v)
|
||||
{
|
||||
return 1.0/v;
|
||||
@@ -1629,6 +1841,38 @@ static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
|
||||
#endif
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec16_i1 mask) {
|
||||
__vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
|
||||
_mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) {
|
||||
__vec16_i8 ret;
|
||||
__vec16_i32 tmp = _mm512_mask_extload_epi32(_mm512_undefined_epi32(),mask,p,
|
||||
_MM_UPCONV_EPI32_SINT8,
|
||||
_MM_BROADCAST32_NONE, _MM_HINT_NONE);
|
||||
_mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
|
||||
return ret;
|
||||
}
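// Hedged scalar reference for the masked i8 load above (illustrative only; the
// intrinsic version leaves inactive lanes unspecified, this sketch skips them):
#include <stdint.h>
static inline void
lMaskedLoadI8Reference(const int8_t *p, uint16_t mask, int8_t out[16]) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1 << lane))
            out[lane] = p[lane];   // one byte per active lane
}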
|
||||
template <int ALIGN> static FORCEINLINE __vec16_i8 __load(const __vec16_i8 *p) {
|
||||
return *p;
|
||||
}
|
||||
template <int ALIGN> static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v) {
|
||||
*p = v;
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
|
||||
__vec16_i8 val, __vec16_i1 mask)
|
||||
{
|
||||
__vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8,
|
||||
_MM_BROADCAST32_NONE, _MM_HINT_NONE);
|
||||
printf("__scatter_base_offsets32_i8\n");
|
||||
_mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp,
|
||||
_MM_DOWNCONV_EPI32_SINT8, scale,
|
||||
_MM_HINT_NONE);
|
||||
}
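// Hedged scalar reference for the byte scatter above (a sketch assuming 16
// lanes and one byte written at base + scale*offset per active lane):
#include <stdint.h>
static inline void
lScatterBaseOffsets32I8Reference(uint8_t *b, uint32_t scale,
                                 const int32_t offsets[16],
                                 const int8_t val[16], uint16_t mask) {
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1 << lane))
            b[(int64_t)scale * (int64_t)offsets[lane]] = (uint8_t)val[lane];
}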
|
||||
|
||||
static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) {
|
||||
#ifdef ISPC_FORCE_ALIGNED_MEMORY
|
||||
_mm512_mask_store_epi32(p, mask, val.v);
|
||||
@@ -1729,16 +1973,44 @@ static FORCEINLINE __vec16_d
|
||||
__gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_d ret;
|
||||
ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
|
||||
ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
|
||||
base, _MM_UPCONV_PD_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
__m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC);
|
||||
ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
|
||||
ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
|
||||
base, _MM_UPCONV_PD_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f
|
||||
__gather64_float(__vec16_i64 addr, __vec16_i1 mask)
|
||||
{
|
||||
__vec16_f ret;
|
||||
|
||||
// There is no gather instruction with 64-bit offsets in KNC.
|
||||
// We have to manually iterate over the upper 32 bits ;-)
|
||||
__vec16_i1 still_to_do = mask;
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
while (still_to_do) {
|
||||
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
|
||||
const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
|
||||
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
|
||||
__smear_i32<__vec16_i32>((int32_t)hi32),
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
|
||||
|
||||
ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets,
|
||||
base, _MM_UPCONV_PS_NONE, 1,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match, still_to_do);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
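// Why the INT_MIN bias used by these 64-bit gathers works - a sketch, not
// target code: the i32 gather treats its offsets as signed, so the low 32
// offset bits are shifted into signed range and the base pointer is shifted
// back by the same amount; the two adjustments cancel for every offset value.
#include <assert.h>
#include <stdint.h>
static inline void lIntMinBiasCheck(uint8_t *base, uint32_t off) {
    uint8_t *biased_base = base + 0x80000000ull;           // base + (-(long)INT_MIN)
    int32_t  signed_off  = (int32_t)(off + 0x80000000u);   // off + INT_MIN, reinterpreted
    assert(biased_base + signed_off == base + off);        // same final address
}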
|
||||
|
||||
|
||||
/*! gather with 64-bit offsets.
|
||||
|
||||
\todo add optimization that falls back to 32-bit offset gather if
|
||||
@@ -1749,6 +2021,8 @@ __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offset
|
||||
static FORCEINLINE __vec16_f
|
||||
__gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
|
||||
__vec16_i1 mask) {
|
||||
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
// There is no gather instruction with 64-bit offsets in KNC.
|
||||
// We have to manually iterate over the upper 32 bits ;-)
|
||||
__vec16_i1 still_to_do = mask;
|
||||
@@ -1759,10 +2033,10 @@ __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offset
|
||||
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi,
|
||||
__smear_i32<__vec16_i32>((int32_t)hi32),
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
|
||||
ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base,
|
||||
_MM_UPCONV_PS_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match, still_to_do);
|
||||
@@ -1776,6 +2050,8 @@ static FORCEINLINE __vec16_i8
|
||||
__gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
|
||||
__vec16_i1 mask)
|
||||
{
|
||||
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
__vec16_i1 still_to_do = mask;
|
||||
__vec16_i32 tmp;
|
||||
while (still_to_do) {
|
||||
@@ -1786,8 +2062,8 @@ __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base,
|
||||
_MM_UPCONV_EPI32_SINT8, scale,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match,still_to_do);
|
||||
@@ -1802,6 +2078,8 @@ static FORCEINLINE void
|
||||
__scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
|
||||
__vec16_f value,
|
||||
__vec16_i1 mask) {
|
||||
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
__vec16_i1 still_to_do = mask;
|
||||
while (still_to_do) {
|
||||
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
|
||||
@@ -1811,8 +2089,8 @@ __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offse
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
_mm512_mask_i32extscatter_ps(base, match, offsets.v_lo,
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
_mm512_mask_i32extscatter_ps(base, match, signed_offsets,
|
||||
value,
|
||||
_MM_DOWNCONV_PS_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
@@ -1824,7 +2102,36 @@ static FORCEINLINE void
|
||||
__scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
|
||||
__vec16_i32 value,
|
||||
__vec16_i1 mask) {
|
||||
|
||||
const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
|
||||
__vec16_i1 still_to_do = mask;
|
||||
while (still_to_do) {
|
||||
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
|
||||
const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
|
||||
__vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi,
|
||||
__smear_i32<__vec16_i32>((int32_t)hi32),
|
||||
_MM_CMPINT_EQ);
|
||||
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN));
|
||||
_mm512_mask_i32extscatter_epi32(base, match, signed_offsets,
|
||||
value,
|
||||
_MM_DOWNCONV_EPI32_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match,still_to_do);
|
||||
}
|
||||
}
|
||||
|
||||
static FORCEINLINE void // TODO
|
||||
__scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
|
||||
__vec16_i8 value,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i1 still_to_do = mask;
|
||||
|
||||
__vec16_i32 tmp = _mm512_extload_epi32(&value, _MM_UPCONV_EPI32_SINT8,
|
||||
_MM_BROADCAST32_NONE, _MM_HINT_NONE);
|
||||
// _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
|
||||
|
||||
while (still_to_do) {
|
||||
int first_active_lane = _mm_tzcnt_32((int)still_to_do);
|
||||
const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
|
||||
@@ -1835,15 +2142,14 @@ __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets
|
||||
void * base = (void*)((unsigned long)_base +
|
||||
((scale*(unsigned long)hi32) << 32));
|
||||
_mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
|
||||
value,
|
||||
_MM_DOWNCONV_EPI32_NONE, scale,
|
||||
tmp,
|
||||
_MM_DOWNCONV_EPI32_SINT8, scale,
|
||||
_MM_HINT_NONE);
|
||||
still_to_do = _mm512_kxor(match,still_to_do);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static FORCEINLINE __vec16_i32
|
||||
__gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
|
||||
__vec16_i1 mask)
|
||||
@@ -1876,17 +2182,15 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
|
||||
// packed load/store
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i32 v;
|
||||
static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) {
|
||||
__vec16_i32 v = __load<64>(val);
|
||||
v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v = _mm512_mask_extloadunpackhi_epi32(_mm512_undefined_epi32(), mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
__store<64>(val, v);
|
||||
return _mm_countbits_32(uint32_t(mask));
|
||||
}
|
||||
|
||||
static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) {
|
||||
_mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
_mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
return _mm_countbits_32(uint32_t(mask));
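// Hedged scalar semantics of the packed store above (a sketch assuming active
// lanes are compacted to the destination in ascending lane order):
#include <stdint.h>
static inline int32_t
lPackedStoreActiveReference(uint32_t *p, const uint32_t val[16], uint16_t mask) {
    int32_t n = 0;
    for (int lane = 0; lane < 16; ++lane)
        if (mask & (1 << lane))
            p[n++] = val[lane];   // contiguous output, one slot per active lane
    return n;                      // matches _mm_countbits_32(mask)
}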
|
||||
@@ -1918,6 +2222,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) {
|
||||
// _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint
|
||||
}
|
||||
|
||||
#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||
__vec16_i32 offsets, __vec16_i1 mask) { \
|
||||
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \
|
||||
offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\
|
||||
__vec16_i1 copy_mask = _mm512_kmov(mask); \
|
||||
_mm512_kswapb(mask, copy_mask); \
|
||||
_mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \
|
||||
} \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \
|
||||
|
||||
PREFETCH_READ_VARYING(1, _MM_HINT_T0)
|
||||
PREFETCH_READ_VARYING(2, _MM_HINT_T1)
|
||||
PREFETCH_READ_VARYING(nt, _MM_HINT_T2)
|
||||
|
||||
static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale,
|
||||
__vec16_i32 offsets, __vec16_i1 mask) {}
|
||||
|
||||
static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {}
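// A hedged usage sketch for the helpers above (the caller is hypothetical; it
// only shows the intended call shape of the *_native variants on this target):
static FORCEINLINE void
lPrefetchGatherExample(uint8_t *base, __vec16_i32 offsets, __vec16_i1 mask) {
    // warm the cache lines for base + 4*offsets ahead of the gather that
    // will actually read them
    __prefetch_read_varying_1_native(base, /* scale = */ 4, offsets, mask);
}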
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// atomics
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) {
|
||||
_mm_prefetch((char *)ptr, _MM_HINT_NTA);
|
||||
}
|
||||
|
||||
#define PREFETCH_READ_VARYING(CACHE_NUM) \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \
|
||||
__vec4_i32 offsets, __vec4_i1 mask) {} \
|
||||
static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {} \
|
||||
|
||||
PREFETCH_READ_VARYING(1)
|
||||
PREFETCH_READ_VARYING(2)
|
||||
PREFETCH_READ_VARYING(3)
|
||||
PREFETCH_READ_VARYING(nt)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// atomics
|
||||
|
||||
|
||||
59
fail_db.txt
@@ -257,6 +257,32 @@
|
||||
./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
.\tests\foreach-double-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.5 cl -O2 *
|
||||
.\tests\foreach-double-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.5 cl -O2 *
|
||||
.\tests\foreach-double-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.5 cl -O2 *
|
||||
@@ -267,7 +293,6 @@
|
||||
./tests/ptr-22.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/ptr-22.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 *
|
||||
./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O2 *
|
||||
./tests/ptr-22.ispc runfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
|
||||
./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
|
||||
./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
|
||||
./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 *
|
||||
@@ -454,3 +479,35 @@
|
||||
./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
|
||||
./tests/reduce-equal-6.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
|
||||
./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
|
||||
./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 *
|
||||
./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 *
|
||||
./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 *
|
||||
./tests/psubus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/psubus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/psubus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
./tests/psubus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 *
|
||||
|
||||
5
ispc.cpp
@@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
m_hasTranscendentals(false),
|
||||
m_hasTrigonometry(false),
|
||||
m_hasRsqrtd(false),
|
||||
m_hasRcpd(false)
|
||||
m_hasRcpd(false),
|
||||
m_hasVecPrefetch(false)
|
||||
{
|
||||
if (isa == NULL) {
|
||||
if (cpu != NULL) {
|
||||
@@ -386,6 +387,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_hasTrigonometry = false;
|
||||
this->m_hasGather = this->m_hasScatter = true;
|
||||
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
||||
// Set to true because MIC has a hardware vector prefetch instruction
|
||||
this->m_hasVecPrefetch = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-32") ||
|
||||
!strcasecmp(isa, "generic-x32")) {
|
||||
|
||||
5
ispc.h
@@ -283,6 +283,8 @@ public:
|
||||
|
||||
bool hasRcpd() const {return m_hasRcpd;}
|
||||
|
||||
bool hasVecPrefetch() const {return m_hasVecPrefetch;}
|
||||
|
||||
private:
|
||||
|
||||
/** llvm Target object representing this target. */
|
||||
@@ -385,6 +387,9 @@ private:
|
||||
|
||||
/** Indicates whether there is an ISA double precision rcp. */
|
||||
bool m_hasRcpd;
|
||||
|
||||
/** Indicates whether the target has a hardware instruction for vector prefetch. */
|
||||
bool m_hasVecPrefetch;
|
||||
};
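// A minimal sketch of how the new capability bit is meant to be queried (the
// real consumers are in opt.cpp; this helper name is hypothetical):
static const char *
lPickVaryingPrefetchImpl(const Target *target) {
    return target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native"
                                    : "__prefetch_read_varying_1";
}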
|
||||
|
||||
|
||||
|
||||
@@ -403,6 +403,7 @@
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
@@ -424,6 +425,8 @@
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'AND'$(LLVM_VERSION)'!='LLVM_3_5'">LLVMProfileData.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'AND'$(LLVM_VERSION)'!='LLVM_3_4'">LLVMMCDisassembler.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies Condition="'$(LLVM_VERSION)'!='LLVM_3_2'AND'$(LLVM_VERSION)'!='LLVM_3_3'">LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
|
||||
32
module.cpp
@@ -604,12 +604,23 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
|
||||
if (diBuilder) {
|
||||
llvm::DIFile file = pos.GetDIFile();
|
||||
llvm::DIGlobalVariable var =
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+
|
||||
diBuilder->createGlobalVariable(file,
|
||||
name,
|
||||
name,
|
||||
file,
|
||||
pos.first_line,
|
||||
sym->type->GetDIType(file),
|
||||
(sym->storageClass == SC_STATIC),
|
||||
sym->storagePtr);
|
||||
#else
|
||||
diBuilder->createGlobalVariable(name,
|
||||
file,
|
||||
pos.first_line,
|
||||
sym->type->GetDIType(file),
|
||||
(sym->storageClass == SC_STATIC),
|
||||
sym->storagePtr);
|
||||
#endif
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
@@ -1304,18 +1315,33 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
|
||||
std::string error;
|
||||
#else // LLVM 3.6+
|
||||
std::error_code error;
|
||||
#endif
|
||||
|
||||
llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
|
||||
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
|
||||
if (error.size()) {
|
||||
#else // LLVM 3.6+
|
||||
if (error) {
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
|
||||
return false;
|
||||
}
|
||||
|
||||
llvm::PassManager pm;
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
|
||||
pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
|
||||
#else
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
|
||||
pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
|
||||
#elif defined(LLVM_3_5)
|
||||
pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
|
||||
#else // LLVM 3.6+
|
||||
llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
|
||||
dlp->doInitialization(*module);
|
||||
pm.add(dlp);
|
||||
#endif
|
||||
|
||||
llvm::formatted_raw_ostream fos(of->os());
|
||||
|
||||
328
opt.cpp
@@ -479,10 +479,14 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
|
||||
optPM.add(targetLibraryInfo);
|
||||
|
||||
#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
|
||||
optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
|
||||
#else
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
|
||||
optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
|
||||
#elif defined(LLVM_3_5)
|
||||
optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
|
||||
#else // LLVM 3.6+
|
||||
llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
|
||||
dlp->doInitialization(*module);
|
||||
optPM.add(dlp);
|
||||
#endif
|
||||
|
||||
llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
|
||||
@@ -2117,8 +2121,8 @@ static bool
|
||||
lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
struct GSInfo {
|
||||
GSInfo(const char *pgFuncName, const char *pgboFuncName,
|
||||
const char *pgbo32FuncName, bool ig)
|
||||
: isGather(ig) {
|
||||
const char *pgbo32FuncName, bool ig, bool ip)
|
||||
: isGather(ig), isPrefetch(ip) {
|
||||
func = m->module->getFunction(pgFuncName);
|
||||
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
||||
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
||||
@@ -2126,6 +2130,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
llvm::Function *func;
|
||||
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
||||
const bool isGather;
|
||||
const bool isPrefetch;
|
||||
};
|
||||
|
||||
GSInfo gsFuncs[] = {
|
||||
@@ -2134,148 +2139,176 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
"__pseudo_gather_factored_base_offsets32_i8",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
||||
"__pseudo_gather_factored_base_offsets32_i8",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather32_i16",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||
"__pseudo_gather_factored_base_offsets32_i16",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||
"__pseudo_gather_factored_base_offsets32_i16",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather32_i32",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||
"__pseudo_gather_factored_base_offsets32_i32",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||
"__pseudo_gather_factored_base_offsets32_i32",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather32_float",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||
"__pseudo_gather_factored_base_offsets32_float",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||
"__pseudo_gather_factored_base_offsets32_float",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather32_i64",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||
"__pseudo_gather_factored_base_offsets32_i64",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||
"__pseudo_gather_factored_base_offsets32_i64",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather32_double",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||
"__pseudo_gather_factored_base_offsets32_double",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||
"__pseudo_gather_factored_base_offsets32_double",
|
||||
true),
|
||||
true, false),
|
||||
|
||||
GSInfo("__pseudo_scatter32_i8",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter32_i16",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter32_i32",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter32_float",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||
"__pseudo_scatter_factored_base_offsets32_float",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||
"__pseudo_scatter_factored_base_offsets32_float",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter32_i64",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter32_double",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||
"__pseudo_scatter_factored_base_offsets32_double",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||
"__pseudo_scatter_factored_base_offsets32_double",
|
||||
false),
|
||||
false, false),
|
||||
|
||||
GSInfo("__pseudo_gather64_i8",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" :
|
||||
"__pseudo_gather_factored_base_offsets64_i8",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
||||
"__pseudo_gather_factored_base_offsets32_i8",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather64_i16",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" :
|
||||
"__pseudo_gather_factored_base_offsets64_i16",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||
"__pseudo_gather_factored_base_offsets32_i16",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather64_i32",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" :
|
||||
"__pseudo_gather_factored_base_offsets64_i32",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||
"__pseudo_gather_factored_base_offsets32_i32",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather64_float",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" :
|
||||
"__pseudo_gather_factored_base_offsets64_float",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||
"__pseudo_gather_factored_base_offsets32_float",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather64_i64",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" :
|
||||
"__pseudo_gather_factored_base_offsets64_i64",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||
"__pseudo_gather_factored_base_offsets32_i64",
|
||||
true),
|
||||
true, false),
|
||||
GSInfo("__pseudo_gather64_double",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" :
|
||||
"__pseudo_gather_factored_base_offsets64_double",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||
"__pseudo_gather_factored_base_offsets32_double",
|
||||
true),
|
||||
true, false),
|
||||
|
||||
GSInfo("__pseudo_scatter64_i8",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" :
|
||||
"__pseudo_scatter_factored_base_offsets64_i8",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter64_i16",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" :
|
||||
"__pseudo_scatter_factored_base_offsets64_i16",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter64_i32",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" :
|
||||
"__pseudo_scatter_factored_base_offsets64_i32",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter64_float",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" :
|
||||
"__pseudo_scatter_factored_base_offsets64_float",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||
"__pseudo_scatter_factored_base_offsets32_float",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter64_i64",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" :
|
||||
"__pseudo_scatter_factored_base_offsets64_i64",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||
false),
|
||||
false, false),
|
||||
GSInfo("__pseudo_scatter64_double",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" :
|
||||
"__pseudo_scatter_factored_base_offsets64_double",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||
"__pseudo_scatter_factored_base_offsets32_double",
|
||||
false),
|
||||
false, false),
|
||||
|
||||
GSInfo("__pseudo_prefetch_read_varying_1",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||
"__prefetch_read_varying_1",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||
"__prefetch_read_varying_1",
|
||||
false, true),
|
||||
|
||||
GSInfo("__pseudo_prefetch_read_varying_2",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||
"__prefetch_read_varying_2",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||
"__prefetch_read_varying_2",
|
||||
false, true),
|
||||
|
||||
GSInfo("__pseudo_prefetch_read_varying_3",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||
"__prefetch_read_varying_3",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||
"__prefetch_read_varying_3",
|
||||
false, true),
|
||||
|
||||
GSInfo("__pseudo_prefetch_read_varying_nt",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||
"__prefetch_read_varying_nt",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||
"__prefetch_read_varying_nt",
|
||||
false, true),
|
||||
};
|
||||
|
||||
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
||||
@@ -2301,7 +2334,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector,
|
||||
callInst);
|
||||
|
||||
if (basePtr == NULL || offsetVector == NULL)
|
||||
if (basePtr == NULL || offsetVector == NULL ||
|
||||
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false))
|
||||
// It's actually a fully general gather/scatter with a varying
// set of base pointers, so leave it as is and continue onward
// to the next instruction...
|
||||
@@ -2316,7 +2350,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
|
||||
|
||||
if ((info->isGather == true && g->target->hasGather()) ||
|
||||
(info->isGather == false && g->target->hasScatter())) {
|
||||
(info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
|
||||
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
|
||||
|
||||
// See if the offsets are scaled by 2, 4, or 8. If so,
|
||||
// extract that scale factor and rewrite the offsets to remove
|
||||
// it.
|
||||
@@ -2330,7 +2366,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
gatherScatterFunc = info->baseOffsets32Func;
|
||||
}
|
||||
|
||||
if (info->isGather) {
|
||||
if (info->isGather || info->isPrefetch) {
|
||||
llvm::Value *mask = callInst->getArgOperand(1);
|
||||
|
||||
// Generate a new function call to the next pseudo gather
|
||||
@@ -2387,7 +2423,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
gatherScatterFunc = info->baseOffsets32Func;
|
||||
}
|
||||
|
||||
if (info->isGather) {
|
||||
if (info->isGather || info->isPrefetch) {
|
||||
llvm::Value *mask = callInst->getArgOperand(1);
|
||||
|
||||
// Generate a new function call to the next pseudo gather
|
||||
@@ -2429,13 +2465,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
||||
static bool
|
||||
lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
|
||||
struct GSBOInfo {
|
||||
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig)
|
||||
: isGather(ig) {
|
||||
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
|
||||
: isGather(ig), isPrefetch(ip) {
|
||||
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
||||
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
||||
}
|
||||
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
||||
const bool isGather;
|
||||
const bool isPrefetch;
|
||||
};
|
||||
|
||||
GSBOInfo gsFuncs[] = {
|
||||
@@ -2443,63 +2480,87 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
|
||||
"__pseudo_gather_factored_base_offsets32_i8",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
|
||||
"__pseudo_gather_factored_base_offsets32_i8",
|
||||
true),
|
||||
true, false),
|
||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||
"__pseudo_gather_factored_base_offsets32_i16",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
|
||||
"__pseudo_gather_factored_base_offsets32_i16",
|
||||
true),
|
||||
true, false),
|
||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||
"__pseudo_gather_factored_base_offsets32_i32",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
|
||||
"__pseudo_gather_factored_base_offsets32_i32",
|
||||
true),
|
||||
true, false),
|
||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||
"__pseudo_gather_factored_base_offsets32_float",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
|
||||
"__pseudo_gather_factored_base_offsets32_float",
|
||||
true),
|
||||
true, false),
|
||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||
"__pseudo_gather_factored_base_offsets32_i64",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" :
|
||||
"__pseudo_gather_factored_base_offsets32_i64",
|
||||
true),
|
||||
true, false),
|
||||
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||
"__pseudo_gather_factored_base_offsets32_double",
|
||||
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" :
|
||||
"__pseudo_gather_factored_base_offsets32_double",
|
||||
true),
|
||||
true, false),
|
||||
|
||||
GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i8",
|
||||
false),
|
||||
false, false),
|
||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i16",
|
||||
false),
|
||||
false, false),
|
||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i32",
|
||||
false),
|
||||
false, false),
|
||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||
"__pseudo_scatter_factored_base_offsets32_float",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" :
|
||||
"__pseudo_scatter_factored_base_offsets32_float",
|
||||
false),
|
||||
false, false),
|
||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" :
|
||||
"__pseudo_scatter_factored_base_offsets32_i64",
|
||||
false),
|
||||
false, false),
|
||||
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||
"__pseudo_scatter_factored_base_offsets32_double",
|
||||
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" :
|
||||
"__pseudo_scatter_factored_base_offsets32_double",
|
||||
false),
|
||||
false, false),
|
||||
|
||||
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||
"__prefetch_read_varying_1",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" :
|
||||
"__prefetch_read_varying_1",
|
||||
false, true),
|
||||
|
||||
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||
"__prefetch_read_varying_2",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" :
|
||||
"__prefetch_read_varying_2",
|
||||
false, true),
|
||||
|
||||
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||
"__prefetch_read_varying_3",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" :
|
||||
"__prefetch_read_varying_3",
|
||||
false, true),
|
||||
|
||||
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||
"__prefetch_read_varying_nt",
|
||||
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" :
|
||||
"__prefetch_read_varying_nt",
|
||||
false, true),
|
||||
};
|
||||
|
||||
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
||||
@@ -4290,149 +4351,170 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
|
||||
static bool
|
||||
lReplacePseudoGS(llvm::CallInst *callInst) {
|
||||
struct LowerGSInfo {
|
||||
LowerGSInfo(const char *pName, const char *aName, bool ig)
|
||||
: isGather(ig) {
|
||||
LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip)
|
||||
: isGather(ig), isPrefetch(ip) {
|
||||
pseudoFunc = m->module->getFunction(pName);
|
||||
actualFunc = m->module->getFunction(aName);
|
||||
}
|
||||
llvm::Function *pseudoFunc;
|
||||
llvm::Function *actualFunc;
|
||||
const bool isGather;
|
||||
const bool isPrefetch;
|
||||
};
|
||||
|
||||
LowerGSInfo lgsInfo[] = {
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true),
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true),
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true),
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true),
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true),
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true),
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),

LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true),
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true),
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true),
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true),
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true),
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true),
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),

LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8",
"__gather_factored_base_offsets32_i8", true),
"__gather_factored_base_offsets32_i8", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16",
"__gather_factored_base_offsets32_i16", true),
"__gather_factored_base_offsets32_i16", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32",
"__gather_factored_base_offsets32_i32", true),
"__gather_factored_base_offsets32_i32", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_float",
"__gather_factored_base_offsets32_float", true),
"__gather_factored_base_offsets32_float", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64",
"__gather_factored_base_offsets32_i64", true),
"__gather_factored_base_offsets32_i64", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets32_double",
"__gather_factored_base_offsets32_double", true),
"__gather_factored_base_offsets32_double", true, false),

LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8",
"__gather_factored_base_offsets64_i8", true),
"__gather_factored_base_offsets64_i8", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16",
"__gather_factored_base_offsets64_i16", true),
"__gather_factored_base_offsets64_i16", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32",
"__gather_factored_base_offsets64_i32", true),
"__gather_factored_base_offsets64_i32", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_float",
"__gather_factored_base_offsets64_float", true),
"__gather_factored_base_offsets64_float", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64",
"__gather_factored_base_offsets64_i64", true),
"__gather_factored_base_offsets64_i64", true, false),
LowerGSInfo("__pseudo_gather_factored_base_offsets64_double",
"__gather_factored_base_offsets64_double", true),
"__gather_factored_base_offsets64_double", true, false),

LowerGSInfo("__pseudo_gather_base_offsets32_i8",
"__gather_base_offsets32_i8", true),
"__gather_base_offsets32_i8", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i16",
"__gather_base_offsets32_i16", true),
"__gather_base_offsets32_i16", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i32",
"__gather_base_offsets32_i32", true),
"__gather_base_offsets32_i32", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_float",
"__gather_base_offsets32_float", true),
"__gather_base_offsets32_float", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_i64",
"__gather_base_offsets32_i64", true),
"__gather_base_offsets32_i64", true, false),
LowerGSInfo("__pseudo_gather_base_offsets32_double",
"__gather_base_offsets32_double", true),
"__gather_base_offsets32_double", true, false),

LowerGSInfo("__pseudo_gather_base_offsets64_i8",
"__gather_base_offsets64_i8", true),
"__gather_base_offsets64_i8", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i16",
"__gather_base_offsets64_i16", true),
"__gather_base_offsets64_i16", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i32",
"__gather_base_offsets64_i32", true),
"__gather_base_offsets64_i32", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_float",
"__gather_base_offsets64_float", true),
"__gather_base_offsets64_float", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_i64",
"__gather_base_offsets64_i64", true),
"__gather_base_offsets64_i64", true, false),
LowerGSInfo("__pseudo_gather_base_offsets64_double",
"__gather_base_offsets64_double", true),
"__gather_base_offsets64_double", true, false),

LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false),
|
||||
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false),
|
||||
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false),
|
||||
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false),
|
||||
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false),
|
||||
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false),
|
||||
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
|
||||
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
|
||||
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
|
||||
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
|
||||
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
|
||||
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
|
||||
|
||||
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false),
|
||||
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false),
|
||||
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false),
|
||||
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false),
|
||||
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false),
|
||||
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false),
|
||||
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
|
||||
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
|
||||
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
|
||||
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
|
||||
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
|
||||
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
|
||||
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8",
|
||||
"__scatter_factored_base_offsets32_i8", false),
|
||||
"__scatter_factored_base_offsets32_i8", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16",
|
||||
"__scatter_factored_base_offsets32_i16", false),
|
||||
"__scatter_factored_base_offsets32_i16", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32",
|
||||
"__scatter_factored_base_offsets32_i32", false),
|
||||
"__scatter_factored_base_offsets32_i32", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float",
|
||||
"__scatter_factored_base_offsets32_float", false),
|
||||
"__scatter_factored_base_offsets32_float", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64",
|
||||
"__scatter_factored_base_offsets32_i64", false),
|
||||
"__scatter_factored_base_offsets32_i64", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double",
|
||||
"__scatter_factored_base_offsets32_double", false),
|
||||
"__scatter_factored_base_offsets32_double", false, false),
|
||||
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8",
|
||||
"__scatter_factored_base_offsets64_i8", false),
|
||||
"__scatter_factored_base_offsets64_i8", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16",
|
||||
"__scatter_factored_base_offsets64_i16", false),
|
||||
"__scatter_factored_base_offsets64_i16", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32",
|
||||
"__scatter_factored_base_offsets64_i32", false),
|
||||
"__scatter_factored_base_offsets64_i32", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float",
|
||||
"__scatter_factored_base_offsets64_float", false),
|
||||
"__scatter_factored_base_offsets64_float", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64",
|
||||
"__scatter_factored_base_offsets64_i64", false),
|
||||
"__scatter_factored_base_offsets64_i64", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double",
|
||||
"__scatter_factored_base_offsets64_double", false),
|
||||
"__scatter_factored_base_offsets64_double", false, false),
|
||||
|
||||
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i8",
|
||||
"__scatter_base_offsets32_i8", false),
|
||||
"__scatter_base_offsets32_i8", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i16",
|
||||
"__scatter_base_offsets32_i16", false),
|
||||
"__scatter_base_offsets32_i16", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i32",
|
||||
"__scatter_base_offsets32_i32", false),
|
||||
"__scatter_base_offsets32_i32", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets32_float",
|
||||
"__scatter_base_offsets32_float", false),
|
||||
"__scatter_base_offsets32_float", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets32_i64",
|
||||
"__scatter_base_offsets32_i64", false),
|
||||
"__scatter_base_offsets32_i64", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets32_double",
|
||||
"__scatter_base_offsets32_double", false),
|
||||
"__scatter_base_offsets32_double", false, false),
|
||||
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i8",
|
||||
"__scatter_base_offsets64_i8", false),
|
||||
"__scatter_base_offsets64_i8", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i16",
|
||||
"__scatter_base_offsets64_i16", false),
|
||||
"__scatter_base_offsets64_i16", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i32",
|
||||
"__scatter_base_offsets64_i32", false),
|
||||
"__scatter_base_offsets64_i32", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets64_float",
|
||||
"__scatter_base_offsets64_float", false),
|
||||
"__scatter_base_offsets64_float", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets64_i64",
|
||||
"__scatter_base_offsets64_i64", false),
|
||||
"__scatter_base_offsets64_i64", false, false),
|
||||
LowerGSInfo("__pseudo_scatter_base_offsets64_double",
|
||||
"__scatter_base_offsets64_double", false),
|
||||
"__scatter_base_offsets64_double", false, false),
|
||||
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_1",
|
||||
"__prefetch_read_varying_1", false, true),
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_1_native",
|
||||
"__prefetch_read_varying_1_native", false, true),
|
||||
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_2",
|
||||
"__prefetch_read_varying_2", false, true),
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_2_native",
|
||||
"__prefetch_read_varying_2_native", false, true),
|
||||
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_3",
|
||||
"__prefetch_read_varying_3", false, true),
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_3_native",
|
||||
"__prefetch_read_varying_3_native", false, true),
|
||||
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_nt",
|
||||
"__prefetch_read_varying_nt", false, true),
|
||||
LowerGSInfo("__pseudo_prefetch_read_varying_nt_native",
|
||||
"__prefetch_read_varying_nt_native", false, true),
|
||||
};
|
||||
|
||||
llvm::Function *calledFunc = callInst->getCalledFunction();
|
||||
@@ -4459,7 +4541,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
if (gotPosition && g->target->getVectorWidth() > 1) {
if (info->isGather)
PerformanceWarning(pos, "Gather required to load value.");
else
else if (!info->isPrefetch)
PerformanceWarning(pos, "Scatter required to store value.");
}
return true;
@@ -4740,6 +4822,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
"__scatter64_i8", "__scatter64_i16",
"__scatter64_i32", "__scatter64_i64",
"__scatter64_float", "__scatter64_double",
"__prefetch_read_varying_1", "__prefetch_read_varying_2",
"__prefetch_read_varying_3", "__prefetch_read_varying_nt",
"__keep_funcs_live",
};

@@ -647,8 +647,8 @@ def run_tests(options1, args, print_version):
options.include_file = "examples/intrinsics/generic-64.h"
options.target = "generic-64"
elif options.target == "knc":
error("No knc #include specified; using examples/intrinsics/knc-i1x16.h\n", 2)
options.include_file = "examples/intrinsics/knc-i1x16.h"
error("No knc #include specified; using examples/intrinsics/knc.h\n", 2)
options.include_file = "examples/intrinsics/knc.h"

if options.compiler_exe == None:
if (options.target == "knc"):

32
stdlib.ispc
@@ -847,43 +847,19 @@ static inline void prefetch_nt(const void * uniform ptr) {
}

static inline void prefetch_l1(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;

foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l1(p);
}
__pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
}

static inline void prefetch_l2(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;

foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l2(p);
}
__pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
}

static inline void prefetch_l3(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;

foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l3(p);
}
__pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
}

static inline void prefetch_nt(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;

foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_nt(p);
}
__pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
}

///////////////////////////////////////////////////////////////////////////
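
A minimal usage sketch of the varying prefetch overloads above, assuming a gather-heavy loop; the function gather_with_prefetch, the arrays data/idx, and the lookahead distance are hypothetical names for illustration, not part of this change:

    // Hint the cache lines a later gather will touch. The varying prefetch_l1()
    // call expands to __pseudo_prefetch_read_varying_1, which later passes
    // replace with the target's prefetch function (native when available).
    export void gather_with_prefetch(uniform float out[], uniform float data[],
                                     uniform int idx[], uniform int count) {
        uniform int lookahead = 16;
        foreach (i = 0 ... count) {
            if (i + lookahead < count)
                prefetch_l1(&data[idx[i + lookahead]]);
            out[i] = data[idx[i]];
        }
    }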
7
stmt.cpp
@@ -712,7 +712,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
}
}

/** Emit code for an if test that checks the mask and the test values and
tries to be smart about jumping over code that doesn't need to be run.
*/
@@ -1101,8 +1100,10 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
// the code for the test. This is only necessary for varying loops;
// 'uniform' loops just jump when they hit a continue statement and
// don't mess with the mask.
if (!uniformTest)
if (!uniformTest) {
ctx->RestoreContinuedLanes();
ctx->ClearBreakLanes();
}
llvm::Value *testValue = testExpr->GetValue(ctx);
if (!testValue)
return;
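
For illustration only, a hypothetical varying do-loop of the shape this hunk is concerned with, where some lanes continue while others break before the loop test is re-evaluated; count_odds and its arguments are made-up names, not code from this commit:

    // Lanes that 'continue' must be re-enabled at the loop test, while lanes
    // that 'break' must stay disabled -- hence clearing break lanes when the
    // continued lanes are restored.
    void count_odds(uniform int n, uniform int out[]) {
        int i = programIndex;
        int odds = 0;
        do {
            if ((i & 1) == 0) {
                i += programCount;
                continue;
            }
            if (odds == 4)
                break;
            ++odds;
            i += programCount;
        } while (i < n);
        out[programIndex] = odds;
    }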
@@ -1310,6 +1311,8 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
// test code.
ctx->SetCurrentBasicBlock(bstep);
ctx->RestoreContinuedLanes();
ctx->ClearBreakLanes();

if (step)
step->EmitCode(ctx);
ctx->BranchInst(btest);

22
tests/prefetch-varying.ispc
Normal file
@@ -0,0 +1,22 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];

int64 *ptr = &(a[programIndex+zero]);
prefetch_l1(ptr);
prefetch_l2(ptr);
prefetch_l3(ptr);
prefetch_nt(ptr);
int g = *ptr;
RET[programIndex] = g;
}

export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}