diff --git a/alloy.py b/alloy.py index 080033c3..f5c8a7ca 100755 --- a/alloy.py +++ b/alloy.py @@ -110,8 +110,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.5": - # SVN_PATH=tags/RELEASE_35/rc1 - SVN_PATH="branches/release_35" + SVN_PATH="tags/RELEASE_350/final" version_LLVM = "3_5" if version_LLVM == "3.4": SVN_PATH="tags/RELEASE_34/dot2-final" diff --git a/builtins.cpp b/builtins.cpp index 35cf115c..2a1df0eb 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -555,6 +555,10 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_2", "__prefetch_read_uniform_3", "__prefetch_read_uniform_nt", + "__pseudo_prefetch_read_varying_1", + "__pseudo_prefetch_read_varying_2", + "__pseudo_prefetch_read_varying_3", + "__pseudo_prefetch_read_varying_nt", "__psubs_vi8", "__psubs_vi16", "__psubus_vi8", @@ -780,7 +784,11 @@ void AddBitcodeToModule(const unsigned char *bitcode, int length, llvm::Module *module, SymbolTable *symbolTable) { llvm::StringRef sb = llvm::StringRef((char *)bitcode, length); +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb); +#else // LLVM 3.6+ + llvm::MemoryBufferRef bcBuf = llvm::MemoryBuffer::getMemBuffer(sb)->getMemBufferRef(); +#endif #if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+ llvm::ErrorOr ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx); @@ -910,12 +918,23 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module, // have the DW_AT_artifical attribute. It's not clear if this // matters for anything though. llvm::DIGlobalVariable var = +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + m->diBuilder->createGlobalVariable(file, + name, + name, + file, + 0 /* line */, + diType, + true /* static */, + sym->storagePtr); +#else m->diBuilder->createGlobalVariable(name, file, 0 /* line */, diType, true /* static */, sym->storagePtr); +#endif Assert(var.Verify()); } } @@ -970,12 +989,23 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) { llvm::DIType diType = sym->type->GetDIType(file); Assert(diType.Verify()); llvm::DIGlobalVariable var = +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + m->diBuilder->createGlobalVariable(file, + sym->name.c_str(), + sym->name.c_str(), + file, + 0 /* line */, + diType, + false /* static */, + sym->storagePtr); +#else m->diBuilder->createGlobalVariable(sym->name.c_str(), file, 0 /* line */, diType, false /* static */, sym->storagePtr); +#endif Assert(var.Verify()); } } diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index d50f4947..dd9bd428 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -370,6 +370,14 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +declare void @__prefetch_read_varying_1( %addr, %mask) nounwind +declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind +declare void @__prefetch_read_varying_2( %addr, %mask) nounwind +declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind +declare void @__prefetch_read_varying_3( %addr, %mask) 
nounwind +declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind +declare void @__prefetch_read_varying_nt( %addr, %mask) nounwind +declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int8/int16 builtins diff --git a/builtins/util.m4 b/builtins/util.m4 index 58f5a5cd..b265add8 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1584,6 +1584,50 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) ret void } + +define void @__prefetch_read_varying_1( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 3, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind + +define void @__prefetch_read_varying_2( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 2, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind + +define void @__prefetch_read_varying_3( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 1, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind + +define void @__prefetch_read_varying_nt( %addr, %mask) alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %addr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8* + call void @llvm.prefetch(i8 * %ptr_LANE_ID, i32 0, i32 0, i32 1) + ') + ret void +} + +declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, %offsets, %mask) nounwind ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2535,6 +2579,31 @@ declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , , ) nounwind + +declare void @__pseudo_prefetch_read_varying_1(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_1_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_2(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_2_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_3(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_3_native(i8 *, i32, , + ) nounwind + +declare void @__pseudo_prefetch_read_varying_nt(, ) nounwind + +declare void +@__pseudo_prefetch_read_varying_nt_native(i8 *, i32, , + ) nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; declare void @__use8() @@ -3034,6 +3103,41 @@ ifelse(HAVE_SCATTER, `1', %vd, %mask) ') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prefetchs + + call void @__pseudo_prefetch_read_varying_1( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_1_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_1_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_1( %v64, %mask) + + call void 
@__pseudo_prefetch_read_varying_2( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_2_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_2_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_2( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_3( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_3_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_3_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_3( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_nt( %v64, %mask) + + call void @__pseudo_prefetch_read_varying_nt_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_nt_native(i8 * %ptr, i32 0, + %v32, %mask) + call void @__prefetch_read_varying_nt( %v64, %mask) + ret void } diff --git a/cbackend.cpp b/cbackend.cpp index 867385a4..5de99366 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -4945,9 +4945,19 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; #endif +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) std::string error; +#else // LLVM 3.6+ + std::error_code error; +#endif + llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags); + +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) if (error.size()) { +#else // LLVM 3.6+ + if (error) { +#endif fprintf(stderr, "Error opening output file \"%s\".\n", fn); return false; } diff --git a/ctx.cpp b/ctx.cpp index 04ec22f6..f09002c1 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -745,6 +745,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) { // that have executed a 'break' statement: // breakLanes = breakLanes | mask AssertPos(currentPos, breakLanesPtr != NULL); + llvm::Value *mask = GetInternalMask(); llvm::Value *breakMask = LoadInst(breakLanesPtr, "break_mask"); @@ -883,7 +884,7 @@ FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) { finishedLanes = BinaryOperator(llvm::Instruction::Or, finishedLanes, continued, "returned|breaked|continued"); } - + finishedLanes = BinaryOperator(llvm::Instruction::And, finishedLanes, GetFunctionMask(), "finished&func"); @@ -927,6 +928,16 @@ FunctionEmitContext::RestoreContinuedLanes() { } +void +FunctionEmitContext::ClearBreakLanes() { + if (breakLanesPtr == NULL) + return; + + // breakLanes = 0 + StoreInst(LLVMMaskAllOff, breakLanesPtr); +} + + void FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) { llvm::Value *oldMask = GetInternalMask(); @@ -1636,14 +1647,16 @@ FunctionEmitContext::StartScope() { llvm::DILexicalBlock lexicalBlock = m->diBuilder->createLexicalBlock(parentScope, diFile, currentPos.first_line, -#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+ +#if defined(LLVM_3_5) // Revision 202736 in LLVM adds support of DWARF discriminator // to the last argument and revision 202737 in clang adds 0 // for the last argument by default. 
currentPos.first_column, 0); #else + // Revision 216239 in LLVM removes support of DWARF discriminator + // as the last argument currentPos.first_column); -#endif +#endif // LLVM 3.2, 3.3, 3.4 and 3.6+ AssertPos(currentPos, lexicalBlock.Verify()); debugScopes.push_back(lexicalBlock); } @@ -1683,8 +1696,14 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) { diType, true /* preserve through opts */); AssertPos(currentPos, var.Verify()); +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + llvm::DIExpression E = m->diBuilder->createExpression(); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock); +#else llvm::Instruction *declareInst = m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); +#endif AddDebugPos(declareInst, &sym->pos, &scope); } @@ -1710,8 +1729,14 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) { flags, argNum+1); AssertPos(currentPos, var.Verify()); +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + llvm::DIExpression E = m->diBuilder->createExpression(); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, E, bblock); +#else llvm::Instruction *declareInst = m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); +#endif AddDebugPos(declareInst, &sym->pos, &scope); } diff --git a/ctx.h b/ctx.h index c4f9a6aa..cd4db7e8 100644 --- a/ctx.h +++ b/ctx.h @@ -195,6 +195,13 @@ public: 'continue' statement when going through the loop body in the previous iteration. */ void RestoreContinuedLanes(); + + /** This method is called by code emitting IR for a loop. It clears + any lanes that contained a break since the mask has been updated to take + them into account. This is necessary as all the bail out checks for + breaks are meant to only deal with lanes breaking on the current iteration. + */ + void ClearBreakLanes(); /** Indicates that code generation for a "switch" statement is about to start. 
isUniform indicates whether the "switch" value is uniform, diff --git a/examples/common.props b/examples/common.props index 5cfad4fc..08f40f65 100644 --- a/examples/common.props +++ b/examples/common.props @@ -160,8 +160,8 @@ Document - $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags) - $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) $(flags) + $(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) -g $(flags) + $(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) -g $(flags) $(Target_out) $(Target_out) $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags) diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index c8f2cf08..f44c581e 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1540,6 +1540,15 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) { static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) { } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index f5bb233c..26505615 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1624,6 +1624,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) { static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) { } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec32_i32 offsets, __vec32_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec32_i64 addr, __vec32_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) + /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index a7148c8b..b5caa008 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1757,6 +1757,16 @@ static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) { static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) { } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec64_i32 offsets, __vec64_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec64_i64 addr, __vec64_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) + /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 2e6afed5..b09958fa 100644 --- a/examples/intrinsics/knc-i1x16.h +++ 
b/examples/intrinsics/knc-i1x16.h @@ -1,5 +1,5 @@ /** - Copyright (c) 2010-2013, Intel Corporation + Copyright (c) 2010-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,7 +31,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include +#include // INT_MIN +#include #include #include #include @@ -525,11 +526,11 @@ template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v *p = v; } -template RetVecType __smear_i1(int i); -template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } +template static RetVecType __smear_i1(int i); +template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } -template RetVecType __setzero_i1(); -template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } +template static RetVecType __setzero_i1(); +template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } template __vec16_i1 __undef_i1(); template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } @@ -677,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_ static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } -template RetVecType __smear_i32(int32_t i); -template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } +template RetVecType static __smear_i32(int32_t i); +template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); @@ -686,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -template RetVecType __setzero_i32(); -template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } +template static RetVecType __setzero_i32(); +template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } -template RetVecType __undef_i32(); -template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } +template static RetVecType __undef_i32(); +template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } @@ -742,11 +743,11 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 } #if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ -template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) +template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { return _mm512_load_epi32(p); } -template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) +template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { _mm512_store_epi32(p, v); } @@ -1017,21 +1018,21 @@ template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 } #if 0 /* knc::fails as with _i32 this may generate fails 
... so commetining it out */ -template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { __m512i v2 = _mm512_load_epi32(p); __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); return __vec16_i64(v2,v1); } -template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } -template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } +template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) { __m512i v1 = v.v2; __m512i v2 = v.v1; _mm512_store_epi64(p, v2); _mm512_store_epi64(((uint8_t*)p)+64, v1); } -template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } #endif @@ -1067,14 +1068,14 @@ static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } -template RetVecType __smear_float(float f); -template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } +template static RetVecType __smear_float(float f); +template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } -template RetVecType __setzero_float(); -template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } +template static RetVecType __setzero_float(); +template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } -template RetVecType __undef_float(); -template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } +template static RetVecType __undef_float(); +template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) { @@ -1131,12 +1132,12 @@ template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) } #if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ -template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { return _mm512_load_ps(p); } /* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ -template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { _mm512_store_ps(p, v); } @@ -1309,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; } static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; } -template RetVecType __smear_double(double d); -template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } +template static RetVecType __smear_double(double d); +template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } -template RetVecType __setzero_double(); -template <> static FORCEINLINE __vec16_d 
__setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } +template static RetVecType __setzero_double(); +template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } -template RetVecType __undef_double(); -template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } +template static RetVecType __undef_double(); +template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } #define CASTD2F(_v_, _v_hi_, _v_lo_) \ __vec16_f _v_hi_, _v_lo_; \ @@ -1390,17 +1391,17 @@ template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) #if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */ -template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); } -template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { _mm512_store_pd(p, v.v1); _mm512_store_pd(((uint8_t*)p)+64, v.v2); } -template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } -template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } +template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } #endif /////////////////////////////////////////////////////////////////////////// @@ -2162,6 +2163,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); __vec16_i1 still_to_do = mask; __vec16_i32 tmp; while (still_to_do) { @@ -2172,8 +2174,8 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base, _MM_UPCONV_EPI32_SINT8, scale, _MM_HINT_NONE); still_to_do = _mm512_kxor(match,still_to_do); @@ -2197,6 +2199,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); // There is no gather instruction with 64-bit offsets in KNC. 
// We have to manually iterate over the upper 32 bits ;-) __vec16_i1 still_to_do = mask; @@ -2207,10 +2210,10 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, __smear_i32<__vec16_i32>((int32_t)hi32), _MM_CMPINT_EQ); - + void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ret = _mm512_mask_i32extgather_epi32(ret, match, signed_offsets, base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); still_to_do = _mm512_kxor(match, still_to_do); @@ -2230,6 +2233,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); // There is no gather instruction with 64-bit offsets in KNC. // We have to manually iterate over the upper 32 bits ;-) __vec16_i1 still_to_do = mask; @@ -2242,8 +2246,8 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); still_to_do = _mm512_kxor(match, still_to_do); @@ -2339,7 +2343,8 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); - + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; while (still_to_do) { int first_active_lane = _mm_tzcnt_32((int)still_to_do); @@ -2349,8 +2354,8 @@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + _mm512_mask_i32extscatter_epi32(base, match, signed_offsets, value, _MM_DOWNCONV_EPI32_NONE, scale, _MM_HINT_NONE); @@ -2370,7 +2375,8 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); - + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; while (still_to_do) { int first_active_lane = _mm_tzcnt_32((int)still_to_do); @@ -2380,8 +2386,9 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + 
((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + + _mm512_mask_i32extscatter_ps(base, match, signed_offsets, value, _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); @@ -2543,6 +2550,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) { \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\ + __vec16_i1 copy_mask = _mm512_kmov(mask); \ + _mm512_kswapb(mask, copy_mask); \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \ +} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1, _MM_HINT_T0) +PREFETCH_READ_VARYING(2, _MM_HINT_T1) +PREFETCH_READ_VARYING(nt, _MM_HINT_T2) + +static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) {} + +static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} + /////////////////////////////////////////////////////////////////////////// // atomics /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 478ad75a..5eb8ea05 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2606,6 +2606,26 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) { \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\ + __vec16_i1 copy_mask = _mm512_kmov(mask); \ + _mm512_kswapb(mask, copy_mask); \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \ +} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1, _MM_HINT_T0) +PREFETCH_READ_VARYING(2, _MM_HINT_T1) +PREFETCH_READ_VARYING(nt, _MM_HINT_T2) + +static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) {} + +static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} + /////////////////////////////////////////////////////////////////////////// // atomics diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index e674f409..732330b9 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1,5 +1,5 @@ -/* - Copyright (c) 2012, Intel Corporation +/** + Copyright (c) 2010-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include // INT_MIN #include #include #include @@ -43,6 +44,15 @@ #include // for operator<<(m512[i]) #include // for operator<<(m512[i]) +#if 0 + #define STRING(x) #x + #define TOSTRING(x) STRING(x) + #define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl + #define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl + #define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl + #define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl + #define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl +#endif #define FORCEINLINE __forceinline #ifdef _MSC_VER @@ -57,13 +67,13 @@ #define KNC 1 extern "C" { - int printf(const unsigned char *, ...); - int puts(unsigned char *); - unsigned int putchar(unsigned int); - int fflush(void *); - uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); - uint8_t *memset(uint8_t *, uint8_t, uint64_t); - void memset_pattern16(void *, const void *, uint64_t); + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); } typedef float __vec1_f; @@ -75,118 +85,155 @@ typedef int64_t __vec1_i64; struct __vec16_i32; +#if 0 +/* (iw) actually, this *SHOULD* be the right implementation for a +vec16_i1: this one is a class that can have a constructor (which +ISPC sometimes emits for these vectors...) 
This version might +not be working with embree's ISPC bindings, probably because +embree still uses the 'wrong' implementation */ +typedef struct PRE_ALIGN(2) __vec16_i1 +{ + FORCEINLINE operator __mmask16() const { return v; } + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + __mmask16 v; +} POST_ALIGN(2) __vec16_i1; + +#else typedef __mmask16 POST_ALIGN(2) __vec16_i1; +#endif typedef struct PRE_ALIGN(64) __vec16_f { - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} - FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } - FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07, - float v08, float v09, float v10, float v11, - float v12, float v13, float v14, float v15) { - v = _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); - } - __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) { + v = _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); + } + __m512 v; } POST_ALIGN(64) __vec16_f; typedef struct PRE_ALIGN(64) __vec16_d { - FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} - FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} - FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} - FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } - FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, - double v04, double v05, double v06, double v07, - double v08, double v09, double v10, double v11, - double v12, double v13, double v14, double v15) { - v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); - v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); - } - __m512d v1; - __m512d v2; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, 
double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + __m512d v1; + __m512d v2; } POST_ALIGN(64) __vec16_d; typedef struct PRE_ALIGN(64) __vec16_i32 { - FORCEINLINE operator __m512i() const { return v; } - FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {} - FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} - FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} - FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} - FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } - FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, - int32_t v04, int32_t v05, int32_t v06, int32_t v07, - int32_t v08, int32_t v09, int32_t v10, int32_t v11, - int32_t v12, int32_t v13, int32_t v14, int32_t v15) { - v = _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); - } - __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} + FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} + FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) { + v = _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); + } + __m512i v; } POST_ALIGN(64) __vec16_i32; typedef struct PRE_ALIGN(64) __vec16_i64 { - FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} - FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} - FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} - FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; } - FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, - int64_t v04, int64_t v05, int64_t v06, int64_t v07, - int64_t v08, int64_t v09, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) { - __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); - __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - } - __m512i v_hi; - __m512i v_lo; + FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} + FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + 
int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + } + __m512i v_hi; + __m512i v_lo; } POST_ALIGN(64) __vec16_i64; template struct vec16 { - FORCEINLINE vec16() { } - FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; - v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; - v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; - v[12] = v12; v[13] = v13; v[14] = v14; v[15] = v15; - } - T v[16]; + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; + v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; + v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; + v[12] = v12; v[13] = v13; v[14] = v14; v[15] = v15; + } + T v[16]; }; PRE_ALIGN(16) struct __vec16_i8 : public vec16 { - FORCEINLINE __vec16_i8() { } - FORCEINLINE __vec16_i8(const __vec16_i8 &o); - FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o); - FORCEINLINE __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, - int8_t v4, int8_t v5, int8_t v6, int8_t v7, - int8_t v8, int8_t v9, int8_t v10, int8_t v11, - int8_t v12, int8_t v13, int8_t v14, int8_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } + FORCEINLINE __vec16_i8() { } + FORCEINLINE __vec16_i8(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3, + const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7, + const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11, + const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + FORCEINLINE __vec16_i8(const __vec16_i8 &o); + FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o); } POST_ALIGN(16); PRE_ALIGN(32) struct __vec16_i16 : public vec16 { - FORCEINLINE __vec16_i16() { } - FORCEINLINE __vec16_i16(const __vec16_i16 &o); - FORCEINLINE __vec16_i16& operator =(const __vec16_i16 &o); - FORCEINLINE __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, - int16_t v4, int16_t v5, int16_t v6, int16_t v7, - int16_t v8, int16_t v9, int16_t v10, int16_t v11, - int16_t v12, int16_t v13, int16_t v14, int16_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } + FORCEINLINE __vec16_i16() { } + FORCEINLINE __vec16_i16(const __vec16_i16 &o); + FORCEINLINE __vec16_i16& operator =(const __vec16_i16 &o); + FORCEINLINE __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, 
v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } } POST_ALIGN(32); @@ -199,8 +246,8 @@ inline std::ostream &operator<<(std::ostream &out, const __m512i &v) out << "["; for (int i=0;i<16;i++) out << (i?",":"") << std::dec << std::setw(8) << ((int*)&v)[i] << std::dec; - // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec; - + // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec; + out << "]" << std::flush; return out; } @@ -210,11 +257,33 @@ inline std::ostream &operator<<(std::ostream &out, const __m512 &v) out << "["; for (int i=0;i<16;i++) out << (i?",":"") << ((float*)&v)[i]; - + out << "]" << std::flush; return out; } +inline std::ostream &operator<<(std::ostream &out, const __vec16_i8 &v) +{ + out << "["; + for (int i=0;i<16;i++) + out << (i?",":"") << std::dec << std::setw(8) << (int)((unsigned char*)&v)[i] << std::dec; + // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec; + + out << "]" << std::flush; + return out; +} + +inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v) +{ + out << "["; + uint32_t *ptr = (uint32_t*)&v; + for (int i=0;i<16;i++) { + uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16]; + out << (i?",":"") << ((int*)val); + } + out << "]" << std::flush; + return out; +} /////////////////////////////////////////////////////////////////////////// // macros... @@ -237,103 +306,117 @@ FORCEINLINE __vec16_i8& __vec16_i8::operator=(const __vec16_i8 &o) /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { - return _mm512_kmov(mask); + return _mm512_kmov(mask); } static FORCEINLINE bool __any(__vec16_i1 mask) { - return !_mm512_kortestz(mask, mask); + return !_mm512_kortestz(mask, mask); } static FORCEINLINE bool __all(__vec16_i1 mask) { - return _mm512_kortestc(mask, mask); + return _mm512_kortestc(mask, mask); } static FORCEINLINE bool __none(__vec16_i1 mask) { - return _mm512_kortestz(mask, mask); + return _mm512_kortestz(mask, mask); } static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_knot( _mm512_kandn(a, b)); + return _mm512_knot( _mm512_kandn(a, b)); } static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kand(a, b); + return _mm512_kand(a, b); } static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { - return _mm512_knot(a); + return _mm512_knot(a); } static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandn(a, b); + return _mm512_kandn(a, b); } static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandnr(a, b); + return _mm512_kandnr(a, b); } static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxor(a, b); + return _mm512_kxor(a, b); } static FORCEINLINE __vec16_i1 __xnor(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxnor(a, b); + return _mm512_kxnor(a, b); } static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kor(a, b); + return _mm512_kor(a, b); } static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, - __vec16_i1 b) { - return ((a & mask) | (b & ~mask)); - //return __or(__and(a, mask), __andnr(b, mask)); + __vec16_i1 b) { + return ((a & mask) | (b & ~mask)); + //return __or(__and(a, mask), __andnr(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { - return cond ? a : b; + return cond ? 
a : b; } static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) { - return (mask & (1 << index)) ? true : false; + return (mask & (1 << index)) ? true : false; } + +static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index) +{ + //uint *src = (uint *)&v; + const uint *src = (const uint *)&v; + return src[index+16] | (uint64_t(src[index]) << 32); +} + + + + + + + /* -static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, - bool val) { - if (val == false) - vec->v &= ~(1 << index); - else - vec->v |= (1 << index); -} -*/ + static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); + } + */ template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { - const uint16_t *ptr = (const uint16_t *)p; - __vec16_i1 r; - r = *ptr; - return r; + const uint16_t *ptr = (const uint16_t *)p; + __vec16_i1 r; + r = *ptr; + return r; } template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { - uint16_t *ptr = (uint16_t *)p; - *ptr = v; + uint16_t *ptr = (uint16_t *)p; + *ptr = v; } -template RetVecType __smear_i1(int i); -template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { - return i?0xFFFF:0x0; +template static RetVecType __smear_i1(int i); +template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { + return i?0xFFFF:0x0; } -template RetVecType __setzero_i1(); -template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { - return 0; +template static RetVecType __setzero_i1(); +template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { + return 0; } -template RetVecType __undef_i1(); -template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { - return __vec16_i1(); +template static RetVecType __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { + return __vec16_i1(); } /////////////////////////////////////////////////////////////////////////// @@ -342,7 +425,7 @@ template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { /* -TODO + TODO */ @@ -353,7 +436,7 @@ TODO /* -TODO + TODO */ @@ -362,179 +445,179 @@ TODO /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { - return _mm512_add_epi32(a, b); + return _mm512_add_epi32(a, b); } static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sub_epi32(a, b); + return _mm512_sub_epi32(a, b); } static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { - return _mm512_mullo_epi32(a, b); + return _mm512_mullo_epi32(a, b); } static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epu32(a, b); + return _mm512_div_epu32(a, b); } static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epi32(a, b); + return _mm512_div_epi32(a, b); } static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epu32(a, b); + return _mm512_rem_epu32(a, b); } static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epi32(a, b); + return _mm512_rem_epi32(a, b); } static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { - return _mm512_or_epi32(a, b); + return _mm512_or_epi32(a, b); } static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { - return _mm512_and_epi32(a, b); + return _mm512_and_epi32(a, b); } static FORCEINLINE __vec16_i32 
__xor(__vec16_i32 a, __vec16_i32 b) { - return _mm512_xor_epi32(a, b); + return _mm512_xor_epi32(a, b); } static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sllv_epi32(a, b); + return _mm512_sllv_epi32(a, b); } static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srlv_epi32(a, b); + return _mm512_srlv_epi32(a, b); } static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srav_epi32(a, b); + return _mm512_srav_epi32(a, b); } static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { - return _mm512_slli_epi32(a, n); + return _mm512_slli_epi32(a, n); } static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { - return _mm512_srli_epi32(a, n); + return _mm512_srli_epi32(a, n); } static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { - return _mm512_srai_epi32(a, n); + return _mm512_srai_epi32(a, n); } static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { - return _mm512_cmpeq_epi32_mask(a, b); + return _mm512_cmpeq_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpneq_epi32_mask(a, b); + return _mm512_cmpneq_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epu32_mask(a, b); + return _mm512_cmple_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epi32_mask(a, b); + return _mm512_cmple_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epu32_mask(a, b); + return _mm512_cmpge_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epi32_mask(a, b); + return _mm512_cmpge_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epu32_mask(a, b); + return _mm512_cmplt_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 
__unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epi32_mask(a, b); + return _mm512_cmplt_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epu32_mask(a, b); + return _mm512_cmpgt_epu32_mask(a, b); } static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epu32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); } static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epi32_mask(a, b); + return _mm512_cmpgt_epi32_mask(a, b); } static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epi32_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, - __vec16_i32 a, __vec16_i32 b) { - return _mm512_mask_mov_epi32(b.v, mask, a.v); + __vec16_i32 a, __vec16_i32 b) { + return _mm512_mask_mov_epi32(b.v, mask, a.v); } static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { - return cond ? a : b; + return cond ? a : b; } static FORCEINLINE int32_t __extract_element(__vec16_i32 v, uint32_t index) { - return ((int32_t *)&v)[index]; + return ((int32_t *)&v)[index]; } static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { - ((int32_t *)v)[index] = val; + ((int32_t *)v)[index] = val; } -template RetVecType __smear_i32(int32_t i); -template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { - return _mm512_set1_epi32(i); +template static RetVecType __smear_i32(int32_t i); +template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { + return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); @@ -542,462 +625,560 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -template RetVecType __setzero_i32(); -template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { - return _mm512_setzero_epi32(); +template static RetVecType __setzero_i32(); +template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { + return _mm512_setzero_epi32(); } -template RetVecType __undef_i32(); -template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { - return __vec16_i32(); +template static RetVecType __undef_i32(); +template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { + return __vec16_i32(); } static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { - int32_t val = __extract_element(v, index & 0xf); - return _mm512_set1_epi32(val); + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +static FORCEINLINE __vec16_i32 
__cast_trunc(__vec16_i32, const __vec16_i64 i64) { + return __vec16_i32(i64.v_lo); } static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { - __vec16_i32 idx = __smear_i32<__vec16_i32>(index); - __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); - return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xf)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); } static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { - return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); } template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_epi32(p); + return _mm512_load_epi32(p); #else - __vec16_i32 v; - v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - return v; + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; #endif } -template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { - return _mm512_load_epi32(p); +template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { + return _mm512_load_epi32(p); } template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_epi32(p, v); + _mm512_store_epi32(p, v); #else - _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { - _mm512_store_epi32(p, v); +template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { + _mm512_store_epi32(p, v); } /////////////////////////////////////////////////////////////////////////// // int64 /////////////////////////////////////////////////////////////////////////// - -static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index) + static FORCEINLINE +void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask) { - uint *src = (uint *)&v; - return src[index+16] | (int64_t(src[index]) << 32); + __m512i v1; + __m512i v2; + v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); + v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); + v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); + v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); + _mm512_mask_store_epi64(p, mask, v2); + _mm512_mask_store_epi64(((uint8_t*)p)+64, mask>>8, v1); } static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t val) { - ((int32_t *)&v->v_hi)[index] = 
val>>32; - ((int32_t *)&v->v_lo)[index] = val; + ((int32_t *)&v->v_hi)[index] = val>>32; + ((int32_t *)&v->v_lo)[index] = val; } -template RetVecType __setzero_i64(); -template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { - __vec16_i64 ret; - ret.v_lo = _mm512_setzero_epi32(); - ret.v_hi = _mm512_setzero_epi32(); - return ret; +template static RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { + __vec16_i64 ret; + ret.v_lo = _mm512_setzero_epi32(); + ret.v_hi = _mm512_setzero_epi32(); + return ret; } -template RetVecType __undef_i64(); -template <> static FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { - return __vec16_i64(); +template static RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { + return __vec16_i64(); } static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b) { - __mmask16 carry = 0; - __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry); - __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry); - return __vec16_i64(lo, hi); + __mmask16 carry = 0; + __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry); + __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 __sub(const __vec16_i64 &a, const __vec16_i64 &b) { - __mmask16 borrow = 0; - __m512i lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); - __m512i hi = _mm512_sbb_epi32(a.v_hi, borrow, b.v_hi, &borrow); - return __vec16_i64(lo, hi); + __mmask16 borrow = 0; + __m512i lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + __m512i hi = _mm512_sbb_epi32(a.v_hi, borrow, b.v_hi, &borrow); + return __vec16_i64(lo, hi); } /*! 64x32 bit mul -- address computations often use a scale that we - know is 32 bits; and 32x64 is faster than 64x64 */ + know is 32 bits; and 32x64 is faster than 64x64 */ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b) { - return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), - _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), - _mm512_mulhi_epi32(a.v, b.v_lo))); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))); } static FORCEINLINE __vec16_i64 __mul(const __vec16_i64 &a, const __vec16_i64 &b) { - __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); - __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); - __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); - __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); - __mmask16 carry = 0; - __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); - __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); - return __vec16_i64(lo, hi); + __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); + __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry = 0; + __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); + __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 __sdiv(const __vec16_i64 &a, const __vec16_i64 &b) { - __vec16_i64 ret; - for(int i=0; i<16; i++) { - int64_t dividend = __extract_element(a, i); - int64_t divisor = __extract_element(b, i); - int64_t quotient = dividend / divisor; // SVML - __insert_element(&ret, i, quotient); - } - return ret; + __vec16_i64 ret; + for(int i=0; i<16; 
i++) { + int64_t dividend = __extract_element(a, i); + int64_t divisor = __extract_element(b, i); + int64_t quotient = dividend / divisor; // SVML + __insert_element(&ret, i, quotient); + } + return ret; } static FORCEINLINE __vec16_i64 __udiv(const __vec16_i64 &a, const __vec16_i64 &b) { - __vec16_i64 ret; - for(int i=0; i<16; i++) { - uint64_t dividend = __extract_element(a, i); - uint64_t divisor = __extract_element(b, i); - uint64_t quotient = dividend / divisor; // SVML - __insert_element(&ret, i, quotient); - } - return ret; + __vec16_i64 ret; + for(int i=0; i<16; i++) { + uint64_t dividend = __extract_element(a, i); + uint64_t divisor = __extract_element(b, i); + uint64_t quotient = dividend / divisor; // SVML + __insert_element(&ret, i, quotient); + } + return ret; } static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_or_epi32(a.v_lo, b.v_lo), _mm512_or_epi32(a.v_hi, b.v_hi)); + return __vec16_i64(_mm512_or_epi32(a.v_lo, b.v_lo), _mm512_or_epi32(a.v_hi, b.v_hi)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_and_epi32(a.v_lo, b.v_lo), _mm512_and_epi32(a.v_hi, b.v_hi)); + return __vec16_i64(_mm512_and_epi32(a.v_lo, b.v_lo), _mm512_and_epi32(a.v_hi, b.v_hi)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_xor_epi32(a.v_lo, b.v_lo), _mm512_xor_epi32(a.v_hi, b.v_hi)); + return __vec16_i64(_mm512_xor_epi32(a.v_lo, b.v_lo), _mm512_xor_epi32(a.v_hi, b.v_hi)); } static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) { - __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); - __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); - return __vec16_i64(lo, hi); + __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); + __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); + return __vec16_i64(lo, hi); +} + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) { + __vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b), + _mm512_srli_epi32(a.v_lo, 32-b)); + __vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) { - __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); - __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); - //__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - // _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - // _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); - return __vec16_i64(lo, hi); + __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); + __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); + //__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + // _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + // _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(lo, hi); } static FORCEINLINE __vec16_i64 
__ashr(__vec16_i64 a, __vec16_i64 b) { - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); - return __vec16_i64(lo, hi); + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(lo, hi); +} + +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) { + __vec16_i32 xfer + = _mm512_slli_epi32(_mm512_and_epi32(a.v_hi, + _mm512_set1_epi32((1< RetVecType __smear_i64(const int64_t &l); +template static RetVecType __smear_i64(const int64_t &l); template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { - const int *i = (const int*)&l; - return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1])); + const int *i = (const int*)&l; + return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1])); } template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { - __vec16_i32 v1; - __vec16_i32 v2; - v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 v1; + __vec16_i32 v2; + const uint8_t*ptr = (const uint8_t*)p; + v2 = _mm512_extloadunpacklo_epi32(v2, ptr, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, ptr+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __vec16_i64 ret; - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - return ret; + __vec16_i64 ret; + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + return ret; } -template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { - __m512i v2 = _mm512_load_epi32(p); - __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); - __vec16_i64 ret; - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, - 
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - return ret; +template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + __vec16_i64 ret; + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + return ret; } -template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { - return __load<64>(p); +template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { + return __load<64>(p); } template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { - __m512i v1; - __m512i v2; - v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); - v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_lo); - v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_hi); - v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_lo); - _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + __m512i v1; + __m512i v2; + v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); + v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); + v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); + v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); } -template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) { - __m512i v1; - __m512i v2; - v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); - 
v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
-                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
-                                      v.v_lo);
-    v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
-                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
-                                      v.v_hi);
-    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
-                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
-                                      v.v_lo);
-    _mm512_store_epi64(p, v2);
-    _mm512_store_epi64(((uint8_t*)p)+64, v1);
+template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
+    __m512i v1;
+    __m512i v2;
+    v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_hi);
+    v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_lo);
+    v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_hi);
+    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_lo);
+    _mm512_store_epi64(p, v2);
+    _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }

-template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
-    __store<64>(p, v);
+template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
+    __store<64>(p, v);
 }
+
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+
+    (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+    ***UNIFORM*** INT64's/POINTERS. (The problem is that ispc doesn't
+    expose whether the gather is from an array of uniform or an array of
+    varying pointers, so in here there's no way to tell - the only thing
+    we can do is pick one...)
+ */
+static FORCEINLINE __vec16_i64
+__gather_base_offsets32_i64(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
+                            __vec16_i1 mask) {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    ret.v_hi = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base+4, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    return ret;
+}
+
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+
+    (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+    ***UNIFORM*** INT64's/POINTERS. (The problem is that ispc doesn't
+    expose whether the gather is from an array of uniform or an array of
+    varying pointers, so in here there's no way to tell - the only thing
+    we can do is pick one...)
+ */
+static FORCEINLINE __vec16_i64
+__gather64_i64(__vec16_i64 addr, __vec16_i1 mask)
+{
+    __vec16_i64 ret;
+
+    // There is no gather instruction with 64-bit offsets in KNC.
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint32_t &hi32 = ((uint*)&addr.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN)); + ret.v_lo = _mm512_mask_i32extgather_epi32(ret.v_lo, match, signed_offsets, + base, _MM_UPCONV_EPI32_NONE, 1, + _MM_HINT_NONE); + ret.v_hi = _mm512_mask_i32extgather_epi32(ret.v_hi, match, signed_offsets, + base+4, _MM_UPCONV_EPI32_NONE, 1, + _MM_HINT_NONE); + + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} + + + /////////////////////////////////////////////////////////////////////////// // float /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { - return _mm512_add_ps(a, b); + return _mm512_add_ps(a, b); } static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { - return _mm512_sub_ps(a, b); + return _mm512_sub_ps(a, b); } static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { - return _mm512_mul_ps(a, b); + return _mm512_mul_ps(a, b); } static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { - return _mm512_div_ps(a, b); + return _mm512_div_ps(a, b); } static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpeq_ps_mask(a, b); + return _mm512_cmpeq_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpneq_ps_mask(a, b); + return _mm512_cmpneq_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { - return _mm512_cmplt_ps_mask(a, b); + return _mm512_cmplt_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmplt_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmple_ps_mask(a, b); + return _mm512_cmple_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmple_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpnle_ps_mask(a, b); + return _mm512_cmpnle_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpnle_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpnle_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpnlt_ps_mask(a, b); + return _mm512_cmpnlt_ps_mask(a, b); 
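The __vec16_i64 arithmetic earlier in this hunk (__add, __sub, __shl, __lshr, __ashr) works on the v_lo/v_hi pair of 32-bit vectors rather than on native 64-bit lanes. A minimal scalar sketch of that decomposition, using a hypothetical two-word struct in place of the KNC vector types, shows the carry propagation behind _mm512_addsetc_epi32/_mm512_adc_epi32 and the cross-half transfer behind the shifts; it is an illustration, not part of the patch.

    #include <cstdint>

    // Hypothetical scalar stand-in for one lane of __vec16_i64: a 64-bit value
    // kept as two 32-bit halves, as the KNC code above does vector-wide.
    struct u64_pair { uint32_t lo, hi; };

    // Mirrors __add(): add the low halves, then add the high halves plus the
    // carry (what the addsetc/adc intrinsic pair does across all 16 lanes).
    static u64_pair add64(u64_pair a, u64_pair b) {
        u64_pair r;
        r.lo = a.lo + b.lo;
        uint32_t carry = (r.lo < a.lo) ? 1u : 0u;  // unsigned wrap-around detects the carry
        r.hi = a.hi + b.hi + carry;
        return r;
    }

    // Mirrors __shl() for shift counts with 0 < n < 32: bits shifted out of the
    // low half are OR-ed into the high half (the "xfer" vector above).
    static u64_pair shl64(u64_pair a, uint32_t n) {
        u64_pair r;
        uint32_t xfer = a.lo >> (32u - n);
        r.hi = (a.hi << n) | xfer;
        r.lo = a.lo << n;
        return r;
    }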
} static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpnlt_ps_mask(m, a, b); + __vec16_i1 m) { + return _mm512_mask_cmpnlt_ps_mask(m, a, b); } static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpord_ps_mask(a, b); + return _mm512_cmpord_ps_mask(a, b); } static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpunord_ps_mask(a, b); + return _mm512_cmpunord_ps_mask(a, b); } static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { - return _mm512_mask_mov_ps(b, mask, a); + return _mm512_mask_mov_ps(b, mask, a); } static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { - return cond ? a : b; + return cond ? a : b; } static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { - return ((float *)&v)[index]; + return ((float *)&v)[index]; } static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { - ((float *)v)[index] = val; + ((float *)v)[index] = val; } -template RetVecType __smear_float(float f); -template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { - return _mm512_set_1to16_ps(f); +template static RetVecType __smear_float(float f); +template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { + return _mm512_set_1to16_ps(f); } -template RetVecType __setzero_float(); -template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { - return _mm512_setzero_ps(); +template static RetVecType __setzero_float(); +template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { + return _mm512_setzero_ps(); } -template RetVecType __undef_float(); -template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { - return __vec16_f(); +template static RetVecType __undef_float(); +template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { + return __vec16_f(); } static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { - int32_t val = __extract_element(v, index & 0xf); - return _mm512_set1_ps(val); + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_ps(val); } static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { - return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); } template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_ps(p); + return _mm512_load_ps(p); #else - __vec16_f v; - v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - return v; + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; #endif } -template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { - return _mm512_load_ps(p); +template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { + return _mm512_load_ps(p); } template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_ps(p, v); + _mm512_store_ps(p, v); #else - _mm512_extpackstorelo_ps(p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, 
v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_ps(p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { - _mm512_store_ps(p, v); +template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { + _mm512_store_ps(p, v); } @@ -1006,241 +1187,241 @@ template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_add_pd(a.v1, b.v1); - ret.v2 = _mm512_add_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_add_pd(a.v1, b.v1); + ret.v2 = _mm512_add_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_sub_pd(a.v1, b.v1); - ret.v2 = _mm512_sub_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_sub_pd(a.v1, b.v1); + ret.v2 = _mm512_sub_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_mul_pd(a.v1, b.v1); - ret.v2 = _mm512_mul_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_mul_pd(a.v1, b.v1); + ret.v2 = _mm512_mul_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_div_pd(a.v1, b.v1); - ret.v2 = _mm512_div_pd(a.v2, b.v2); - return ret; + __vec16_d ret; + ret.v1 = _mm512_div_pd(a.v1, b.v1); + ret.v2 = _mm512_div_pd(a.v2, b.v2); + return ret; } static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); - __vec16_i1 tmp_m = m; - ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); + __vec16_i1 tmp_m = m; + ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); + ret2 = 
_mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, - 
__vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); } static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { - __vec16_d ret; - __vec16_i1 tmp_m = mask; - ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); - ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); - return ret; + __vec16_d ret; + __vec16_i1 tmp_m = mask; + ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); + return ret; } static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { - return cond ? a : b; + return cond ? 
a : b; } static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { - return ((double *)&v)[index]; + return ((double *)&v)[index]; } static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { - ((double *)v)[index] = val; + ((double *)v)[index] = val; } -template RetVecType __smear_double(double d); -template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { - __vec16_d ret; - ret.v1 = _mm512_set1_pd(d); - ret.v2 = _mm512_set1_pd(d); - return ret; +template static RetVecType __smear_double(double d); +template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { + __vec16_d ret; + ret.v1 = _mm512_set1_pd(d); + ret.v2 = _mm512_set1_pd(d); + return ret; } -template RetVecType __setzero_double(); -template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { - __vec16_d ret; - ret.v1 = _mm512_setzero_pd(); - ret.v2 = _mm512_setzero_pd(); - return ret; +template static RetVecType __setzero_double(); +template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { + __vec16_d ret; + ret.v1 = _mm512_setzero_pd(); + ret.v2 = _mm512_setzero_pd(); + return ret; } -template RetVecType __undef_double(); -template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { - return __vec16_d(); +template static RetVecType __undef_double(); +template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { + return __vec16_d(); } static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { - __vec16_d ret; - double val = __extract_element(v, index & 0xf); - ret.v1 = _mm512_set1_pd(val); - ret.v2 = _mm512_set1_pd(val); - return ret; + __vec16_d ret; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); + return ret; } template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - return ret; + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; } -template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_load_pd(p); - ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); - return ret; +template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_load_pd(p); + ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); + return ret; } -template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { - return __load<64>(p); +template <> FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { + return __load<64>(p); } - + template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { - _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - 
_mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); } -template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { - _mm512_store_pd(p, v.v1); - _mm512_store_pd(((uint8_t*)p)+64, v.v2); +template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); } -template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { - __store<64>(p, v); +template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { + __store<64>(p, v); } /////////////////////////////////////////////////////////////////////////// @@ -1249,12 +1430,12 @@ template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { - return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); + return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); } static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) { - return __vec16_i64(val.v, _mm512_setzero_epi32()); + return __vec16_i64(val.v, _mm512_setzero_epi32()); } static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) @@ -1266,47 +1447,47 @@ static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); } static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) { - return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); } static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) { - return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); } static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) { - return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); + return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); } static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = 
_mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, const __vec16_i8 &v) { @@ -1328,35 +1509,64 @@ static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) { } static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { - __vec16_d ret; - ret.v2 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); - ret.v1 = _mm512_cvtpslo_pd(other8); - return ret; + __vec16_d ret; + ret.v1 = _mm512_cvtpslo_pd(val.v); + __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); + ret.v2 = _mm512_cvtpslo_pd(other8); + return ret; } static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { - __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); - __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - - return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA); + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA); } static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { - return _mm512_castsi512_ps(val); + return _mm512_castsi512_ps(val); } static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { - return _mm512_castps_si512(val); + return _mm512_castps_si512(val); } static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { - return *(__vec16_i64*)&val; + __vec16_i64 ret; + ret.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(val.v2)); + ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(val.v1)); + ret.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(val.v2)); + ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(val.v1)); + return ret; } static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { - return *(__vec16_d*)&val; + __vec16_d ret; + ret.v2 = _mm512_castsi512_pd( + _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + val.v_hi)); + ret.v2 = _mm512_castsi512_pd( + 
_mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v2), 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + val.v_lo)); + ret.v1 = _mm512_castsi512_pd( + _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + val.v_hi)); + ret.v1 = _mm512_castsi512_pd( + _mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v1), 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + val.v_lo)); + return ret; } /////////////////////////////////////////////////////////////////////////// @@ -1364,27 +1574,27 @@ static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { /////////////////////////////////////////////////////////////////////////// static FORCEINLINE float __round_uniform_float(float v) { - return roundf(v); + return roundf(v); } static FORCEINLINE float __floor_uniform_float(float v) { - return floorf(v); + return floorf(v); } static FORCEINLINE float __ceil_uniform_float(float v) { - return ceilf(v); + return ceilf(v); } static FORCEINLINE double __round_uniform_double(double v) { - return round(v); + return round(v); } static FORCEINLINE double __floor_uniform_double(double v) { - return floor(v); + return floor(v); } static FORCEINLINE double __ceil_uniform_double(double v) { - return ceil(v); + return ceil(v); } static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { @@ -1443,30 +1653,30 @@ static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 // sqrt/rsqrt/rcp static FORCEINLINE float __rsqrt_uniform_float(float v) { - return 1.f / sqrtf(v); + return 1.f / sqrtf(v); } static FORCEINLINE float __rcp_uniform_float(float v) { - return 1.f / v; + return 1.f / v; } static FORCEINLINE float __sqrt_uniform_float(float v) { - return sqrtf(v); + return sqrtf(v); } static FORCEINLINE double __sqrt_uniform_double(double v) { - return sqrt(v); + return sqrt(v); } static FORCEINLINE __vec16_f __sqrt_varying_float(__vec16_f v) { - return _mm512_sqrt_ps(v); + return _mm512_sqrt_ps(v); } static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH - return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. 
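The __cast_bits overloads between __vec16_d and __vec16_i64 above no longer reinterpret the bits in memory; they reshuffle each 64-bit lane into the split v_lo/v_hi layout (low 32 bits of every lane in one vector, high 32 bits in the other). A scalar sketch of the same layout change, assuming plain arrays in place of the 512-bit registers, clarifies what those permute patterns compute.

    #include <cstdint>
    #include <cstring>

    // Hypothetical model of __cast_bits(__vec16_i64, __vec16_d): split the bit
    // pattern of each of the 16 doubles into its low and high 32-bit words.
    static void doubles_to_lo_hi(const double d[16], uint32_t lo[16], uint32_t hi[16]) {
        for (int lane = 0; lane < 16; ++lane) {
            uint64_t bits;
            std::memcpy(&bits, &d[lane], sizeof(bits));  // bit-cast, no value conversion
            lo[lane] = (uint32_t)(bits & 0xffffffffu);
            hi[lane] = (uint32_t)(bits >> 32);
        }
    }

    // The reverse direction, corresponding to __cast_bits(__vec16_d, __vec16_i64).
    static void lo_hi_to_doubles(const uint32_t lo[16], const uint32_t hi[16], double d[16]) {
        for (int lane = 0; lane < 16; ++lane) {
            uint64_t bits = ((uint64_t)hi[lane] << 32) | lo[lane];
            std::memcpy(&d[lane], &bits, sizeof(bits));
        }
    }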
#else - return _mm512_recip_ps(v); + return _mm512_recip_ps(v); #endif } static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { @@ -1483,17 +1693,19 @@ static FORCEINLINE double __rcp_uniform_double(double v) static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH - return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy #else - return _mm512_invsqrt_ps(v); + return _mm512_invsqrt_ps(v); #endif } + static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { __vec16_d y; for (int i = 0; i < 16; i++) __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i))); return y; } + static FORCEINLINE double __rsqrt_uniform_double(double v) { return 1.0/v; @@ -1505,20 +1717,20 @@ static FORCEINLINE double __rsqrt_uniform_double(double v) /////////////////////////////////////////////////////////////////////////// static FORCEINLINE int32_t __popcnt_int32(const __vec1_i32 mask) { - return _mm_countbits_32(mask); + return _mm_countbits_32(mask); } static FORCEINLINE int32_t __popcnt_int64(const __vec1_i64 mask) { - return _mm_countbits_64(mask); + return _mm_countbits_64(mask); } static FORCEINLINE int32_t __count_trailing_zeros_i32(const __vec1_i32 mask) { - return _mm_tzcnt_32(mask); + return _mm_tzcnt_32(mask); } static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { - return _mm_tzcnt_64(mask); + return _mm_tzcnt_64(mask); } /////////////////////////////////////////////////////////////////////////// @@ -1542,39 +1754,39 @@ static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { } static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { - return _mm512_reduce_add_epi32(v); + return _mm512_reduce_add_epi32(v); } static FORCEINLINE uint32_t __reduce_min_i32(__vec16_i32 v) { - return _mm512_reduce_min_epi32(v); + return _mm512_reduce_min_epi32(v); } static FORCEINLINE uint32_t __reduce_max_i32(__vec16_i32 v) { - return _mm512_reduce_max_epi32(v); + return _mm512_reduce_max_epi32(v); } static FORCEINLINE float __reduce_add_float(__vec16_f v) { - return _mm512_reduce_add_ps(v); + return _mm512_reduce_add_ps(v); } static FORCEINLINE float __reduce_min_float(__vec16_f v) { - return _mm512_reduce_min_ps(v); + return _mm512_reduce_min_ps(v); } static FORCEINLINE float __reduce_max_float(__vec16_f v) { - return _mm512_reduce_max_ps(v); + return _mm512_reduce_max_ps(v); } static FORCEINLINE float __reduce_add_double(__vec16_d v) { - return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); + return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } static FORCEINLINE float __reduce_min_double(__vec16_d v) { - return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); + return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } static FORCEINLINE float __reduce_max_double(__vec16_d v) { - return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); + return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } /////////////////////////////////////////////////////////////////////////// @@ -1584,110 +1796,142 @@ static FORCEINLINE float __reduce_max_double(__vec16_d v) { // Currently, when a pseudo_gather is converted into a masked load, it has to be unaligned static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_epi32(__vec16_i32(), mask, p); + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); 
#else - __vec16_i32 tmp; - tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __vec16_i32 ret; - return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); #endif } static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); #else - __vec16_f tmp; - tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - __vec16_f ret; - return _mm512_mask_mov_ps(ret.v, mask, tmp.v); + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); #endif } static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); - ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); - return ret; + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; #else - __vec16_d tmp; - tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); - ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); - return ret; + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; #endif } +static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec16_i1 mask) { + __vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _mm512_mask_extstore_epi32(p, mask, tmp, 
_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); +} + +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) { + __vec16_i8 ret; + __vec16_i32 tmp = _mm512_mask_extload_epi32(_mm512_undefined_epi32(),mask,p, + _MM_UPCONV_EPI32_SINT8, + _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +template static FORCEINLINE __vec16_i8 __load(const __vec16_i8 *p) { + return *p; +} +template static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v) { + *p = v; +} + +static FORCEINLINE void +__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets, + __vec16_i8 val, __vec16_i1 mask) +{ + __vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8, + _MM_BROADCAST32_NONE, _MM_HINT_NONE); + printf("__scatter_base_offsets32_i8\n"); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp, + _MM_DOWNCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); +} + static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_epi32(p, mask, val.v); + _mm512_mask_store_epi32(p, mask, val.v); #else - __vec16_i32 tmp; - tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); - _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { + __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_ps(p, mask, val.v); + _mm512_mask_store_ps(p, mask, val.v); #else - __vec16_f tmp; - tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); - _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { + __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - _mm512_mask_store_pd(p, mask, val.v1); - _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + 
_mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); #else - __vec16_d tmp; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); - tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); - _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); #endif } static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, - __vec16_i1 mask) { - __masked_store_i32(p, val, mask); + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); } static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, - __vec16_i1 mask) { - __masked_store_float(p, val, mask); + __vec16_i1 mask) { + __masked_store_float(p, val, mask); } /////////////////////////////////////////////////////////////////////////// @@ -1698,47 +1942,75 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - // (iw): need to temporarily store as int because gathers can only return ints. - __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); - // now, downconverting to chars into temporary char vector - __vec16_i8 ret; - _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + __vec16_i1 mask) { + // (iw): need to temporarily store as int because gathers can only return ints. 
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + __vec16_i1 mask) { + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, - base, _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); + __vec16_i1 mask) { + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { - __vec16_d ret; - ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - return ret; + __vec16_i1 mask) { + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; } +static FORCEINLINE __vec16_f +__gather64_float(__vec16_i64 addr, __vec16_i1 mask) +{ + __vec16_f ret; + + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN)); + + ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets, + base, _MM_UPCONV_PS_NONE, 1, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} + + /*! gather with 64-bit offsets. \todo add optimization that falls back to 32-bit offset gather if @@ -1748,148 +2020,180 @@ __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offset static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) { - // There is no gather instruction with 64-bit offsets in KNC. 
- // We have to manually iterate over the upper 32 bits ;-) - __vec16_i1 still_to_do = mask; - __vec16_f ret; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); - - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, - _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match, still_to_do); - } + __vec16_i1 mask) { - return ret; + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + + ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; } static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) + __vec16_i1 mask) { - __vec16_i1 still_to_do = mask; - __vec16_i32 tmp; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); - - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match,still_to_do); - } - __vec16_i8 ret; - _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_f value, - __vec16_i1 mask) { - __vec16_i1 still_to_do = mask; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = 
((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __vec16_f value, + __vec16_i1 mask) { - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, - value, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match,still_to_do); - } + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + _mm512_mask_i32extscatter_ps(base, match, signed_offsets, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } } static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i32 value, - __vec16_i1 mask) { - __vec16_i1 still_to_do = mask; - while (still_to_do) { - int first_active_lane = _mm_tzcnt_32((int)still_to_do); - const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; - __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); - - void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); - _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, - value, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); - still_to_do = _mm512_kxor(match,still_to_do); - } + __vec16_i32 value, + __vec16_i1 mask) { + + const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + _mm512_mask_i32extscatter_epi32(base, match, signed_offsets, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } } +static FORCEINLINE void // TODO +__scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, + __vec16_i8 value, + __vec16_i1 mask) { + __vec16_i1 still_to_do = mask; + + __vec16_i32 tmp = _mm512_extload_epi32(&value, _MM_UPCONV_EPI32_SINT8, + _MM_BROADCAST32_NONE, _MM_HINT_NONE); + // _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + tmp, + _MM_DOWNCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + 
still_to_do = _mm512_kxor(match,still_to_do); + } +} static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) + __vec16_i1 mask) { - __vec16_f r = __gather_base_offsets64_float(_base,scale,offsets,mask); - return (__vec16_i32&)r; + __vec16_f r = __gather_base_offsets64_float(_base,scale,offsets,mask); + return (__vec16_i32&)r; } // scatter static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, - __vec16_i32 val, __vec16_i1 mask) + __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE void -__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, - __vec16_f val, __vec16_i1 mask) +__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } /////////////////////////////////////////////////////////////////////////// // packed load/store /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, - __vec16_i1 mask) { - __vec16_i32 v; - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(_mm512_undefined_epi32(), mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, - __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); } static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) @@ -1902,241 +2206,261 @@ static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, /////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __prefetch_read_uniform_1(const char *p) { - _mm_prefetch(p, _MM_HINT_T0); // prefetch into L1$ + _mm_prefetch(p, _MM_HINT_T0); // prefetch into L1$ } static FORCEINLINE void __prefetch_read_uniform_2(const char *p) { - _mm_prefetch(p, _MM_HINT_T1); // prefetch into L2$ + _mm_prefetch(p, _MM_HINT_T1); // prefetch into L2$ 
} static FORCEINLINE void __prefetch_read_uniform_3(const char *p) { - // There is no L3$ on KNC, don't want to pollute L2$ unecessarily + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily } static FORCEINLINE void __prefetch_read_uniform_nt(const char *p) { - _mm_prefetch(p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint - // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint + _mm_prefetch(p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } +#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) { \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ + offsets = _mm512_permutevar_epi32(_mm512_set_16to16_pi(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8), offsets);\ + __vec16_i1 copy_mask = _mm512_kmov(mask); \ + _mm512_kswapb(mask, copy_mask); \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, _MM_HINT_T0); \ +} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ + +PREFETCH_READ_VARYING(1, _MM_HINT_T0) +PREFETCH_READ_VARYING(2, _MM_HINT_T1) +PREFETCH_READ_VARYING(nt, _MM_HINT_T2) + +static FORCEINLINE void __prefetch_read_varying_3_native(uint8_t *base, uint32_t scale, + __vec16_i32 offsets, __vec16_i1 mask) {} + +static FORCEINLINE void __prefetch_read_varying_3(__vec16_i64 addr, __vec16_i1 mask) {} + /////////////////////////////////////////////////////////////////////////// // atomics /////////////////////////////////////////////////////////////////////////// static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedAdd((LONG volatile *)p, v) - v; + return InterlockedAdd((LONG volatile *)p, v) - v; #else - return __sync_fetch_and_add(p, v); + return __sync_fetch_and_add(p, v); #endif } static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedAdd((LONG volatile *)p, -v) + v; + return InterlockedAdd((LONG volatile *)p, -v) + v; #else - return __sync_fetch_and_sub(p, v); + return __sync_fetch_and_sub(p, v); #endif } static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedAnd((LONG volatile *)p, v); + return _InterlockedAnd((LONG volatile *)p, v); #else - return __sync_fetch_and_and(p, v); + return __sync_fetch_and_and(p, v); #endif } static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedOr((LONG volatile *)p, v); + return _InterlockedOr((LONG volatile *)p, v); #else - return __sync_fetch_and_or(p, v); + return __sync_fetch_and_or(p, v); #endif } static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedXor((LONG volatile *)p, v); + return _InterlockedXor((LONG volatile *)p, v); #else - return __sync_fetch_and_xor(p, v); + return __sync_fetch_and_xor(p, v); #endif } static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { - int32_t old, min; - do { - old = *((volatile int32_t *)p); - min = (old < (int32_t)v) ? old : (int32_t)v; + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? 
old : (int32_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { - int32_t old, max; - do { - old = *((volatile int32_t *)p); - max = (old > (int32_t)v) ? old : (int32_t)v; + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? old : (int32_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { - uint32_t old, min; - do { - old = *((volatile uint32_t *)p); - min = (old < v) ? old : v; + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { - uint32_t old, max; - do { - old = *((volatile uint32_t *)p); - max = (old > v) ? old : v; + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? 
old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedExchange((LONG volatile *)p, v); + return InterlockedExchange((LONG volatile *)p, v); #else - return __sync_lock_test_and_set(p, v); + return __sync_lock_test_and_set(p, v); #endif } static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, - uint32_t newval) { + uint32_t newval) { #ifdef _MSC_VER - return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); #else - return __sync_val_compare_and_swap(p, cmpval, newval); + return __sync_val_compare_and_swap(p, cmpval, newval); #endif } static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAdd64((LONGLONG volatile *)p, v) - v; + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_add(p, v); + return __sync_fetch_and_add(p, v); #endif } static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; #else - return __sync_fetch_and_sub(p, v); + return __sync_fetch_and_sub(p, v); #endif } static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAnd64((LONGLONG volatile *)p, v) - v; + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_and(p, v); + return __sync_fetch_and_and(p, v); #endif } static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedOr64((LONGLONG volatile *)p, v) - v; + return InterlockedOr64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_or(p, v); + return __sync_fetch_and_or(p, v); #endif } static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedXor64((LONGLONG volatile *)p, v) - v; + return InterlockedXor64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_xor(p, v); + return __sync_fetch_and_xor(p, v); #endif } static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { - int64_t old, min; - do { - old = *((volatile int64_t *)p); - min = (old < (int64_t)v) ? old : (int64_t)v; + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { - int64_t old, max; - do { - old = *((volatile int64_t *)p); - max = (old > (int64_t)v) ? old : (int64_t)v; + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? 
old : (int64_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { - uint64_t old, min; - do { - old = *((volatile uint64_t *)p); - min = (old < v) ? old : v; + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); + } while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { - uint64_t old, max; - do { - old = *((volatile uint64_t *)p); - max = (old > v) ? old : v; + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); + } while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; + return old; } static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedExchange64((LONGLONG volatile *)p, v); + return InterlockedExchange64((LONGLONG volatile *)p, v); #else - return __sync_lock_test_and_set(p, v); + return __sync_lock_test_and_set(p, v); #endif } static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, - uint64_t newval) { + uint64_t newval) { #ifdef _MSC_VER - return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); #else - return __sync_val_compare_and_swap(p, cmpval, newval); + return __sync_val_compare_and_swap(p, cmpval, newval); #endif } @@ -2148,10 +2472,10 @@ static FORCEINLINE uint64_t __clock() { uint32_t low, high; #ifdef __x86_64 __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" - ::: "%rax", "%rbx", "%rcx", "%rdx" ); + ::: "%rax", "%rbx", "%rcx", "%rdx" ); #else __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" - ::: "%eax", "%ebx", "%ecx", "%edx" ); + ::: "%eax", "%ebx", "%ecx", "%edx" ); #endif __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); return (uint64_t)high << 32 | low; diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index a25af10b..765a931f 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3898,6 +3898,15 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *ptr) { _mm_prefetch((char *)ptr, _MM_HINT_NTA); } +#define PREFETCH_READ_VARYING(CACHE_NUM) \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec4_i32 offsets, __vec4_i1 mask) {} \ +static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec4_i64 addr, __vec4_i1 mask) {} \ + +PREFETCH_READ_VARYING(1) +PREFETCH_READ_VARYING(2) +PREFETCH_READ_VARYING(3) +PREFETCH_READ_VARYING(nt) /////////////////////////////////////////////////////////////////////////// // 
atomics diff --git a/fail_db.txt b/fail_db.txt index 9def9b6c..b7c1ad74 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -257,6 +257,32 @@ ./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * .\tests\foreach-double-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.5 cl -O2 * .\tests\foreach-double-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.5 cl -O2 * .\tests\foreach-double-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.5 cl -O2 * @@ -267,7 +293,6 @@ ./tests/ptr-22.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/ptr-22.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.4 -O0 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O2 * -./tests/ptr-22.ispc runfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * ./tests/atomics-1.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * ./tests/atomics-10.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * ./tests/atomics-11.ispc compfail x86-64 knc Linux LLVM 3.4 icpc13.1 -O0 * @@ -454,3 +479,35 @@ ./tests/reduce-equal-5.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 * ./tests/reduce-equal-6.ispc compfail x86-64 knc 
Linux LLVM 3.6 icpc13.1 -O0 * ./tests/reduce-equal-8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/paddus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i32.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/pmulus_i8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/atomics-6.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/atomics-uniform-9.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O0 * +./tests/psubus_vi16.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 knc Linux LLVM 3.6 icpc13.1 -O2 * +./tests/psubus_vi16.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 generic-4 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/psubus_vi16.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * +./tests/psubus_vi8.ispc compfail x86-64 generic-16 Linux LLVM 3.6 clang++3.4 -O2 * diff --git a/ispc.cpp b/ispc.cpp index 00e0faec..6b7d3fce 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -199,7 +199,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_hasTranscendentals(false), m_hasTrigonometry(false), m_hasRsqrtd(false), - m_hasRcpd(false) + m_hasRcpd(false), + m_hasVecPrefetch(false) { if (isa == NULL) { if (cpu != NULL) { @@ -386,6 +387,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTrigonometry = false; this->m_hasGather = this->m_hasScatter = true; this->m_hasRsqrtd = this->m_hasRcpd = true; + // It's set to true, because MIC has hardware vector prefetch instruction + this->m_hasVecPrefetch = true; } else if (!strcasecmp(isa, 
"generic-32") || !strcasecmp(isa, "generic-x32")) { diff --git a/ispc.h b/ispc.h index 01aae562..e11840a6 100644 --- a/ispc.h +++ b/ispc.h @@ -283,6 +283,8 @@ public: bool hasRcpd() const {return m_hasRcpd;} + bool hasVecPrefetch() const {return m_hasVecPrefetch;} + private: /** llvm Target object representing this target. */ @@ -385,6 +387,9 @@ private: /** Indicates whether there is an ISA double precision rcp. */ bool m_hasRcpd; + + /** Indicates whether the target has hardware instruction for vector prefetch. */ + bool m_hasVecPrefetch; }; diff --git a/ispc.vcxproj b/ispc.vcxproj index 4308715a..c4dc4373 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -131,7 +131,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp $(Configuration)/gen-bitcode-sse4-32bit.cpp; $(Configuration)/gen-bitcode-sse4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -141,7 +141,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; $(Configuration)/gen-bitcode-sse4-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -151,7 +151,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; $(Configuration)/gen-bitcode-sse4-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -161,7 +161,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; m4 
-Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll @@ -171,7 +171,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll @@ -181,7 +181,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll @@ -191,7 +191,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp; $(Configuration)/gen-bitcode-avx1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll @@ -201,7 +201,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; 
$(Configuration)/gen-bitcode-avx1-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll @@ -211,7 +211,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll @@ -221,7 +221,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp; $(Configuration)/gen-bitcode-avx11-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll @@ -231,7 +231,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll @@ -241,7 +241,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp 
builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll @@ -251,7 +251,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll @@ -261,7 +261,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll @@ -271,7 +271,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll @@ -281,7 +281,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp; $(Configuration)/gen-bitcode-generic-1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ 
-291,7 +291,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp; $(Configuration)/gen-bitcode-generic-4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -301,7 +301,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp; $(Configuration)/gen-bitcode-generic-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -311,7 +311,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp; $(Configuration)/gen-bitcode-generic-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -321,7 +321,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp; $(Configuration)/gen-bitcode-generic-32-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -331,7 +331,7 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 
builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp; m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp; $(Configuration)/gen-bitcode-generic-64-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll @@ -403,6 +403,7 @@ true $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies) + LLVMMCDisassembler.lib;%(AdditionalDependencies) LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies) @@ -424,10 +425,12 @@ true $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies) + LLVMProfileData.lib;%(AdditionalDependencies) + LLVMMCDisassembler.lib;%(AdditionalDependencies) LLVMOption.lib;LLVMSupport.lib;%(AdditionalDependencies) - + \ No newline at end of file diff --git a/module.cpp b/module.cpp index 2ef89bbd..a20d2297 100644 --- a/module.cpp +++ b/module.cpp @@ -604,12 +604,23 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE if (diBuilder) { llvm::DIFile file = pos.GetDIFile(); llvm::DIGlobalVariable var = +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5)// LLVM 3.6+ + diBuilder->createGlobalVariable(file, + name, + name, + file, + pos.first_line, + sym->type->GetDIType(file), + (sym->storageClass == SC_STATIC), + sym->storagePtr); +#else diBuilder->createGlobalVariable(name, file, pos.first_line, sym->type->GetDIType(file), (sym->storageClass == SC_STATIC), sym->storagePtr); +#endif Assert(var.Verify()); } } @@ -1304,18 +1315,33 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, #endif +#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5) std::string error; 
+#else // LLVM 3.6+
+    std::error_code error;
+#endif
+
     llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
+
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
     if (error.size()) {
+#else // LLVM 3.6+
+    if (error) {
+#endif
+
         fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
         return false;
     }
 
     llvm::PassManager pm;
-#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
-    pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
-#else
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
     pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
+#elif defined(LLVM_3_5)
+    pm.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
+#else // LLVM 3.6+
+    llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
+    dlp->doInitialization(*module);
+    pm.add(dlp);
 #endif
 
     llvm::formatted_raw_ostream fos(of->os());
diff --git a/opt.cpp b/opt.cpp
index a28b5758..135a7c8c 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -479,10 +479,14 @@ Optimize(llvm::Module *module, int optLevel) {
         new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
     optPM.add(targetLibraryInfo);
 
-#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) // LLVM 3.5+
-    optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
-#else
+#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
     optPM.add(new llvm::DataLayout(*g->target->getDataLayout()));
+#elif defined(LLVM_3_5)
+    optPM.add(new llvm::DataLayoutPass(*g->target->getDataLayout()));
+#else // LLVM 3.6+
+    llvm::DataLayoutPass *dlp= new llvm::DataLayoutPass();
+    dlp->doInitialization(*module);
+    optPM.add(dlp);
 #endif
 
     llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
@@ -2117,8 +2121,8 @@ static bool
 lGSToGSBaseOffsets(llvm::CallInst *callInst) {
     struct GSInfo {
         GSInfo(const char *pgFuncName, const char *pgboFuncName,
-               const char *pgbo32FuncName, bool ig)
-            : isGather(ig) {
+               const char *pgbo32FuncName, bool ig, bool ip)
+            : isGather(ig), isPrefetch(ip) {
             func = m->module->getFunction(pgFuncName);
             baseOffsetsFunc = m->module->getFunction(pgboFuncName);
             baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
@@ -2126,6 +2130,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
         llvm::Function *func;
         llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
         const bool isGather;
+        const bool isPrefetch;
     };
 
     GSInfo gsFuncs[] = {
@@ -2134,148 +2139,176 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
                                         "__pseudo_gather_factored_base_offsets32_i8",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" :
                                         "__pseudo_gather_factored_base_offsets32_i8",
-               true),
+               true, false),
         GSInfo("__pseudo_gather32_i16",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
                                         "__pseudo_gather_factored_base_offsets32_i16",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" :
                                         "__pseudo_gather_factored_base_offsets32_i16",
-               true),
+               true, false),
         GSInfo("__pseudo_gather32_i32",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
                                         "__pseudo_gather_factored_base_offsets32_i32",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" :
                                         "__pseudo_gather_factored_base_offsets32_i32",
-               true),
+               true, false),
         GSInfo("__pseudo_gather32_float",
                g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" :
                                         "__pseudo_gather_factored_base_offsets32_float",
                g->target->hasGather() ?
"__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSInfo("__pseudo_gather32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSInfo("__pseudo_gather32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSInfo("__pseudo_scatter32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSInfo("__pseudo_scatter32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSInfo("__pseudo_scatter32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSInfo("__pseudo_scatter32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSInfo("__pseudo_scatter32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSInfo("__pseudo_scatter32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), GSInfo("__pseudo_gather64_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" : "__pseudo_gather_factored_base_offsets64_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8", - true), + true, false), GSInfo("__pseudo_gather64_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16" : "__pseudo_gather_factored_base_offsets64_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", - true), + true, false), GSInfo("__pseudo_gather64_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32" : "__pseudo_gather_factored_base_offsets64_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", - true), + true, false), GSInfo("__pseudo_gather64_float", g->target->hasGather() ? "__pseudo_gather_base_offsets64_float" : "__pseudo_gather_factored_base_offsets64_float", g->target->hasGather() ? 
"__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSInfo("__pseudo_gather64_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64" : "__pseudo_gather_factored_base_offsets64_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSInfo("__pseudo_gather64_double", g->target->hasGather() ? "__pseudo_gather_base_offsets64_double" : "__pseudo_gather_factored_base_offsets64_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSInfo("__pseudo_scatter64_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8" : "__pseudo_scatter_factored_base_offsets64_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSInfo("__pseudo_scatter64_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16" : "__pseudo_scatter_factored_base_offsets64_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSInfo("__pseudo_scatter64_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32" : "__pseudo_scatter_factored_base_offsets64_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSInfo("__pseudo_scatter64_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float" : "__pseudo_scatter_factored_base_offsets64_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSInfo("__pseudo_scatter64_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64" : "__pseudo_scatter_factored_base_offsets64_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSInfo("__pseudo_scatter64_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double" : "__pseudo_scatter_factored_base_offsets64_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), + + GSInfo("__pseudo_prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + false, true), + + GSInfo("__pseudo_prefetch_read_varying_2", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + false, true), + + GSInfo("__pseudo_prefetch_read_varying_3", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + false, true), + + GSInfo("__pseudo_prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? 
"__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + false, true), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -2301,7 +2334,8 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector, callInst); - if (basePtr == NULL || offsetVector == NULL) + if (basePtr == NULL || offsetVector == NULL || + (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false)) // It's actually a fully general gather/scatter with a varying // set of base pointers, so leave it as is and continune onward // to the next instruction... @@ -2316,7 +2350,9 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *gatherScatterFunc = info->baseOffsetsFunc; if ((info->isGather == true && g->target->hasGather()) || - (info->isGather == false && g->target->hasScatter())) { + (info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) || + (info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) { + // See if the offsets are scaled by 2, 4, or 8. If so, // extract that scale factor and rewrite the offsets to remove // it. @@ -2330,7 +2366,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { gatherScatterFunc = info->baseOffsets32Func; } - if (info->isGather) { + if (info->isGather || info->isPrefetch) { llvm::Value *mask = callInst->getArgOperand(1); // Generate a new function call to the next pseudo gather @@ -2387,7 +2423,7 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { gatherScatterFunc = info->baseOffsets32Func; } - if (info->isGather) { + if (info->isGather || info->isPrefetch) { llvm::Value *mask = callInst->getArgOperand(1); // Generate a new function call to the next pseudo gather @@ -2429,13 +2465,14 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { struct GSBOInfo { - GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig) - : isGather(ig) { + GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip) + : isGather(ig), isPrefetch(ip) { baseOffsetsFunc = m->module->getFunction(pgboFuncName); baseOffsets32Func = m->module->getFunction(pgbo32FuncName); } llvm::Function *baseOffsetsFunc, *baseOffsets32Func; const bool isGather; + const bool isPrefetch; }; GSBOInfo gsFuncs[] = { @@ -2443,63 +2480,87 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { "__pseudo_gather_factored_base_offsets32_i8", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16" : "__pseudo_gather_factored_base_offsets32_i16", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32" : "__pseudo_gather_factored_base_offsets32_i32", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", g->target->hasGather() ? "__pseudo_gather_base_offsets32_float" : "__pseudo_gather_factored_base_offsets32_float", - true), + true, false), GSBOInfo(g->target->hasGather() ? 
"__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64" : "__pseudo_gather_factored_base_offsets32_i64", - true), + true, false), GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", g->target->hasGather() ? "__pseudo_gather_base_offsets32_double" : "__pseudo_gather_factored_base_offsets32_double", - true), + true, false), GSBOInfo( g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8" : "__pseudo_scatter_factored_base_offsets32_i8", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16" : "__pseudo_scatter_factored_base_offsets32_i16", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32" : "__pseudo_scatter_factored_base_offsets32_i32", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float" : "__pseudo_scatter_factored_base_offsets32_float", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64" : "__pseudo_scatter_factored_base_offsets32_i64", - false), + false, false), GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double" : "__pseudo_scatter_factored_base_offsets32_double", - false), + false, false), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : + "__prefetch_read_varying_1", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : + "__prefetch_read_varying_2", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : + "__prefetch_read_varying_3", + false, true), + + GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + g->target->hasVecPrefetch() ? 
"__pseudo_prefetch_read_varying_nt_native" : + "__prefetch_read_varying_nt", + false, true), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -4290,149 +4351,170 @@ lReplacePseudoMaskedStore(llvm::CallInst *callInst) { static bool lReplacePseudoGS(llvm::CallInst *callInst) { struct LowerGSInfo { - LowerGSInfo(const char *pName, const char *aName, bool ig) - : isGather(ig) { + LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip) + : isGather(ig), isPrefetch(ip) { pseudoFunc = m->module->getFunction(pName); actualFunc = m->module->getFunction(aName); } llvm::Function *pseudoFunc; llvm::Function *actualFunc; const bool isGather; + const bool isPrefetch; }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), - LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), - LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), - LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true), - LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true), - LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true), + LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false), + LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false), + LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false), + LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false), + LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false), + LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false), - LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true), - LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true), - LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true), - LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true), - LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), - LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), + LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false), + LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false), + LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false), + LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false), + LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false), + LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", - "__gather_factored_base_offsets32_i8", true), + "__gather_factored_base_offsets32_i8", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", - "__gather_factored_base_offsets32_i16", true), + "__gather_factored_base_offsets32_i16", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", - "__gather_factored_base_offsets32_i32", true), + "__gather_factored_base_offsets32_i32", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", - "__gather_factored_base_offsets32_float", true), + "__gather_factored_base_offsets32_float", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", - "__gather_factored_base_offsets32_i64", true), + "__gather_factored_base_offsets32_i64", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", - "__gather_factored_base_offsets32_double", true), + "__gather_factored_base_offsets32_double", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", - "__gather_factored_base_offsets64_i8", true), + "__gather_factored_base_offsets64_i8", true, false), 
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", - "__gather_factored_base_offsets64_i16", true), + "__gather_factored_base_offsets64_i16", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", - "__gather_factored_base_offsets64_i32", true), + "__gather_factored_base_offsets64_i32", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", - "__gather_factored_base_offsets64_float", true), + "__gather_factored_base_offsets64_float", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", - "__gather_factored_base_offsets64_i64", true), + "__gather_factored_base_offsets64_i64", true, false), LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", - "__gather_factored_base_offsets64_double", true), + "__gather_factored_base_offsets64_double", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i8", - "__gather_base_offsets32_i8", true), + "__gather_base_offsets32_i8", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i16", - "__gather_base_offsets32_i16", true), + "__gather_base_offsets32_i16", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i32", - "__gather_base_offsets32_i32", true), + "__gather_base_offsets32_i32", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_float", - "__gather_base_offsets32_float", true), + "__gather_base_offsets32_float", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_i64", - "__gather_base_offsets32_i64", true), + "__gather_base_offsets32_i64", true, false), LowerGSInfo("__pseudo_gather_base_offsets32_double", - "__gather_base_offsets32_double", true), + "__gather_base_offsets32_double", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i8", - "__gather_base_offsets64_i8", true), + "__gather_base_offsets64_i8", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i16", - "__gather_base_offsets64_i16", true), + "__gather_base_offsets64_i16", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i32", - "__gather_base_offsets64_i32", true), + "__gather_base_offsets64_i32", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_float", - "__gather_base_offsets64_float", true), + "__gather_base_offsets64_float", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_i64", - "__gather_base_offsets64_i64", true), + "__gather_base_offsets64_i64", true, false), LowerGSInfo("__pseudo_gather_base_offsets64_double", - "__gather_base_offsets64_double", true), + "__gather_base_offsets64_double", true, false), - LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), - LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), - LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false), - LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false), - LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false), - LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false), + LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false), + LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false), + LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false), + LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false), + LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false), + LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false), - LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false), - LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false), - LowerGSInfo("__pseudo_scatter64_i32", 
"__scatter64_i32", false), - LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false), - LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), - LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false), + LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false), + LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false), + LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false), + LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false), + LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false), + LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", - "__scatter_factored_base_offsets32_i8", false), + "__scatter_factored_base_offsets32_i8", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", - "__scatter_factored_base_offsets32_i16", false), + "__scatter_factored_base_offsets32_i16", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", - "__scatter_factored_base_offsets32_i32", false), + "__scatter_factored_base_offsets32_i32", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", - "__scatter_factored_base_offsets32_float", false), + "__scatter_factored_base_offsets32_float", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", - "__scatter_factored_base_offsets32_i64", false), + "__scatter_factored_base_offsets32_i64", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", - "__scatter_factored_base_offsets32_double", false), + "__scatter_factored_base_offsets32_double", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", - "__scatter_factored_base_offsets64_i8", false), + "__scatter_factored_base_offsets64_i8", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", - "__scatter_factored_base_offsets64_i16", false), + "__scatter_factored_base_offsets64_i16", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", - "__scatter_factored_base_offsets64_i32", false), + "__scatter_factored_base_offsets64_i32", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", - "__scatter_factored_base_offsets64_float", false), + "__scatter_factored_base_offsets64_float", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", - "__scatter_factored_base_offsets64_i64", false), + "__scatter_factored_base_offsets64_i64", false, false), LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", - "__scatter_factored_base_offsets64_double", false), + "__scatter_factored_base_offsets64_double", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i8", - "__scatter_base_offsets32_i8", false), + "__scatter_base_offsets32_i8", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i16", - "__scatter_base_offsets32_i16", false), + "__scatter_base_offsets32_i16", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i32", - "__scatter_base_offsets32_i32", false), + "__scatter_base_offsets32_i32", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_float", - "__scatter_base_offsets32_float", false), + "__scatter_base_offsets32_float", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_i64", - "__scatter_base_offsets32_i64", false), + "__scatter_base_offsets32_i64", false, false), LowerGSInfo("__pseudo_scatter_base_offsets32_double", - "__scatter_base_offsets32_double", false), 
+ "__scatter_base_offsets32_double", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i8", - "__scatter_base_offsets64_i8", false), + "__scatter_base_offsets64_i8", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i16", - "__scatter_base_offsets64_i16", false), + "__scatter_base_offsets64_i16", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i32", - "__scatter_base_offsets64_i32", false), + "__scatter_base_offsets64_i32", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_float", - "__scatter_base_offsets64_float", false), + "__scatter_base_offsets64_float", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_i64", - "__scatter_base_offsets64_i64", false), + "__scatter_base_offsets64_i64", false, false), LowerGSInfo("__pseudo_scatter_base_offsets64_double", - "__scatter_base_offsets64_double", false), + "__scatter_base_offsets64_double", false, false), + + LowerGSInfo("__pseudo_prefetch_read_varying_1", + "__prefetch_read_varying_1", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_1_native", + "__prefetch_read_varying_1_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_2", + "__prefetch_read_varying_2", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_2_native", + "__prefetch_read_varying_2_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_3", + "__prefetch_read_varying_3", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_3_native", + "__prefetch_read_varying_3_native", false, true), + + LowerGSInfo("__pseudo_prefetch_read_varying_nt", + "__prefetch_read_varying_nt", false, true), + LowerGSInfo("__pseudo_prefetch_read_varying_nt_native", + "__prefetch_read_varying_nt_native", false, true), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -4459,7 +4541,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) { if (gotPosition && g->target->getVectorWidth() > 1) { if (info->isGather) PerformanceWarning(pos, "Gather required to load value."); - else + else if (!info->isPrefetch) PerformanceWarning(pos, "Scatter required to store value."); } return true; @@ -4740,6 +4822,8 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__scatter64_i8", "__scatter64_i16", "__scatter64_i32", "__scatter64_i64", "__scatter64_float", "__scatter64_double", + "__prefetch_read_varying_1", "__prefetch_read_varying_2", + "__prefetch_read_varying_3", "__prefetch_read_varying_nt", "__keep_funcs_live", }; diff --git a/run_tests.py b/run_tests.py index d48b1a2f..c540c020 100755 --- a/run_tests.py +++ b/run_tests.py @@ -647,8 +647,8 @@ def run_tests(options1, args, print_version): options.include_file = "examples/intrinsics/generic-64.h" options.target = "generic-64" elif options.target == "knc": - error("No knc #include specified; using examples/intrinsics/knc-i1x16.h\n", 2) - options.include_file = "examples/intrinsics/knc-i1x16.h" + error("No knc #include specified; using examples/intrinsics/knc.h\n", 2) + options.include_file = "examples/intrinsics/knc.h" if options.compiler_exe == None: if (options.target == "knc"): diff --git a/stdlib.ispc b/stdlib.ispc index 2ec3859e..01aae815 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -847,43 +847,19 @@ static inline void prefetch_nt(const void * uniform ptr) { } static inline void prefetch_l1(const void * varying ptr) { - const void * uniform ptrArray[programCount]; - ptrArray[programIndex] = ptr; - - foreach_active (i) { - const void * uniform p = ptrArray[i]; - prefetch_l1(p); - } + 
__pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
 }
 
 static inline void prefetch_l2(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_l2(p);
-    }
+    __pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
 }
 
 static inline void prefetch_l3(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_l3(p);
-    }
+    __pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
 }
 
 static inline void prefetch_nt(const void * varying ptr) {
-    const void * uniform ptrArray[programCount];
-    ptrArray[programIndex] = ptr;
-
-    foreach_active (i) {
-        const void * uniform p = ptrArray[i];
-        prefetch_nt(p);
-    }
+    __pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
diff --git a/stmt.cpp b/stmt.cpp
index a551b3f7..586cb0fe 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -712,7 +712,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
     }
 }
 
-
 /** Emit code for an if test that checks the mask and the test values and
     tries to be smart about jumping over code that doesn't need to be run.
  */
@@ -1101,8 +1100,10 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
     // the code for the test. This is only necessary for varying loops;
     // 'uniform' loops just jump when they hit a continue statement and
     // don't mess with the mask.
-    if (!uniformTest)
+    if (!uniformTest) {
         ctx->RestoreContinuedLanes();
+        ctx->ClearBreakLanes();
+    }
     llvm::Value *testValue = testExpr->GetValue(ctx);
     if (!testValue)
         return;
@@ -1310,6 +1311,8 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
     // test code.
     ctx->SetCurrentBasicBlock(bstep);
     ctx->RestoreContinuedLanes();
+    ctx->ClearBreakLanes();
+
     if (step)
         step->EmitCode(ctx);
     ctx->BranchInst(btest);
diff --git a/tests/prefetch-varying.ispc b/tests/prefetch-varying.ispc
new file mode 100644
index 00000000..02df84c9
--- /dev/null
+++ b/tests/prefetch-varying.ispc
@@ -0,0 +1,22 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int64 *ptr = &(a[programIndex+zero]);
+    prefetch_l1(ptr);
+    prefetch_l2(ptr);
+    prefetch_l3(ptr);
+    prefetch_nt(ptr);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
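
Note on the varying prefetch path: with this patch, prefetch_l1/l2/l3/nt on a varying pointer turn into __pseudo_prefetch_read_varying_1/2/3/nt calls carrying the pointer vector and the current execution mask, and the optimizer later rewrites those either to the __pseudo_..._native form (targets with hasVecPrefetch()) or to the plain __prefetch_read_varying_* built-ins. The C++ below is only a rough model of what a scalar, per-lane fallback for such a built-in looks like; the function name, the bitmask encoding of the mask, and the cache-level-to-locality mapping are illustrative assumptions, not ispc source, and __builtin_prefetch stands in for the llvm.prefetch intrinsic.

    // Illustrative sketch, not ispc source: per-lane fallback for a varying
    // read prefetch.  Inactive lanes are skipped; each active lane's address
    // is prefetched with a locality hint chosen from the requested cache level.
    #include <cstdint>

    static void prefetch_read_varying(const int64_t *addrs, int width,
                                      uint64_t activeMask, int cacheLevel) {
        for (int lane = 0; lane < width; ++lane) {
            if ((activeMask >> lane) & 1) {        // skip inactive lanes
                const void *p = reinterpret_cast<const void *>(addrs[lane]);
                // __builtin_prefetch(addr, rw, locality) requires constant
                // rw/locality arguments, hence the switch over literals.
                switch (cacheLevel) {
                case 1:  __builtin_prefetch(p, 0, 3); break;  // keep closest (L1)
                case 2:  __builtin_prefetch(p, 0, 2); break;
                case 3:  __builtin_prefetch(p, 0, 1); break;
                default: __builtin_prefetch(p, 0, 0); break;  // non-temporal
                }
            }
        }
    }

Targets that do report hasVecPrefetch() keep the _native pseudo form instead, so that a single hardware vector-prefetch instruction can be emitted rather than this lane-by-lane loop.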
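
The isPrefetch flag threaded through GSInfo and GSBOInfo changes two decisions in lGSToGSBaseOffsets(): whether a prefetch pseudo-call may be decomposed into the base-plus-offsets form at all, and whether the undecorated (non-factored) offsets variant is selected. The sketch below restates those two conditions as standalone C++ predicates; the struct and function names are invented for illustration and do not appear in the patch.

    // Illustrative sketch, not ispc source: the two predicates added to the
    // gather/scatter/prefetch base+offsets transformation.
    struct TargetCaps {
        bool hasGather;
        bool hasScatter;
        bool hasVecPrefetch;   // target provides a vector prefetch instruction
    };

    // Leave the call in its fully general form when no base/offset split was
    // found, or when it is a prefetch but the target has no vector prefetch
    // (the per-lane fallback handles it later).
    static bool keepGeneralForm(bool isGather, bool isPrefetch,
                                bool haveBaseAndOffsets, const TargetCaps &t) {
        return !haveBaseAndOffsets ||
               (!isGather && isPrefetch && !t.hasVecPrefetch);
    }

    // Use the non-factored base+offsets variant only when the target has a
    // native instruction for this kind of access.
    static bool useNativeBaseOffsets(bool isGather, bool isPrefetch,
                                     const TargetCaps &t) {
        return (isGather && t.hasGather) ||
               (!isGather && !isPrefetch && t.hasScatter) ||
               (!isGather && isPrefetch && t.hasVecPrefetch);
    }

In the rewrite that follows, prefetches take the gather-shaped branch (info->isGather || info->isPrefetch), since like gathers they carry the mask as an argument rather than a value to store.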
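
lReplacePseudoGS() stays table-driven: each __pseudo_* symbol maps to the concrete built-in it is rewritten to, and the new isPrefetch flag's only effect there is to suppress the "Scatter required to store value" performance warning for prefetches. A simplified, LLVM-free C++ sketch of that lookup pattern follows; the names, the string matching, and the fprintf stand-ins for PerformanceWarning are illustrative only, since the real code works on llvm::Function pointers and rewrites a llvm::CallInst in place.

    // Illustrative sketch, not ispc source: table-driven lowering of pseudo
    // gather/scatter/prefetch calls to their concrete built-ins.
    #include <cstdio>
    #include <cstring>

    struct LowerInfo {
        const char *pseudoName;   // placeholder emitted during code generation
        const char *actualName;   // built-in substituted at lowering time
        bool isGather;
        bool isPrefetch;          // prefetches never trigger a scatter warning
    };

    static const LowerInfo kTable[] = {
        { "__pseudo_gather32_i32",            "__gather32_i32",            true,  false },
        { "__pseudo_scatter32_i32",           "__scatter32_i32",           false, false },
        { "__pseudo_prefetch_read_varying_1", "__prefetch_read_varying_1", false, true  },
    };

    // Returns the replacement symbol for a pseudo call, or nullptr if the
    // called function is not one of the known pseudo built-ins.
    static const char *lowerPseudoCall(const char *calledName) {
        for (const LowerInfo &info : kTable) {
            if (std::strcmp(calledName, info.pseudoName) != 0)
                continue;
            if (info.isGather)
                std::fprintf(stderr, "perf: gather required to load value\n");
            else if (!info.isPrefetch)
                std::fprintf(stderr, "perf: scatter required to store value\n");
            return info.actualName;
        }
        return nullptr;
    }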