From d665e2e85b1c609c129af7669ead85ea4b98c3a7 Mon Sep 17 00:00:00 2001 From: Gerrit Code Review Date: Wed, 24 Oct 2012 09:53:29 -0700 Subject: [PATCH 1/6] Initial empty repository From e57801a5d16b5641bb0e801e2f34b478336201cb Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Wed, 31 Oct 2012 15:25:26 -0400 Subject: [PATCH 2/6] Typo Fix --- examples/intrinsics/sse4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index e3a4e277..b0365c86 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -274,7 +274,7 @@ static FORCEINLINE bool __any(__vec4_i1 mask) { } static FORCEINLINE bool __all(__vec4_i1 mask) { - return (_mm_movemask_ps(mask.v)=0xF); + return (_mm_movemask_ps(mask.v)==0xF); } static FORCEINLINE bool __none(__vec4_i1 mask) { From 810784da1f2e797410847cfe5a404b74297c4d92 Mon Sep 17 00:00:00 2001 From: ptu1 Date: Tue, 13 Nov 2012 12:35:45 -0800 Subject: [PATCH 3/6] Set the ScalarReplAggregate maximum structure size based on target vector width. --- opt.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/opt.cpp b/opt.cpp index bb5ba955..8c6f7b06 100644 --- a/opt.cpp +++ b/opt.cpp @@ -471,8 +471,14 @@ Optimize(llvm::Module *module, int optLevel) { } optPM.add(llvm::createDeadInstEliminationPass()); + // Max struct size threshold for scalar replacement is + // 1) 4 fields (r,g,b,w) + // 2) field size: vectorWidth * sizeof(float) + const int field_limit = 4; + int sr_threshold = g->target.vectorWidth * sizeof(float) * field_limit; + // On to more serious optimizations - optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createPromoteMemoryToRegisterPass()); @@ -494,7 +500,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); - optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createTailCallEliminationPass()); @@ -540,7 +546,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); - optPM.add(llvm::createScalarReplAggregatesPass(-1, false)); + optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); From 2129b1e27d30b8a481eee04346e2bec91bc303b1 Mon Sep 17 00:00:00 2001 From: Jean-Luc Duprat Date: Wed, 21 Nov 2012 15:40:35 -0800 Subject: [PATCH 4/6] knc.h: Fixed __rsqrt_varying_float() to use _mm512_invsqrt_ps() instead of _mm512_invsqrt_pd() This was a typo. --- examples/intrinsics/knc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index b884ea82..08d2e8bf 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1959,7 +1959,7 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy #else - return _mm512_invsqrt_pd(v); + return _mm512_invsqrt_ps(v); #endif } From 16b0806d4091cb49a9e78710235825b68de5efd4 Mon Sep 17 00:00:00 2001 From: Peng Tu Date: Wed, 21 Nov 2012 19:09:10 -0800 Subject: [PATCH 5/6] Fix LLVM TOT build issue. --- Makefile | 4 ++++ ispc.h | 4 ++-- main.cpp | 2 ++ module.cpp | 6 +++--- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index fc132e59..17855931 100644 --- a/Makefile +++ b/Makefile @@ -129,6 +129,10 @@ objs/cbackend.o: cbackend.cpp @echo Compiling $< @$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $< +objs/opt.o: opt.cpp + @echo Compiling $< + @$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $< + objs/%.o: objs/%.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< diff --git a/ispc.h b/ispc.h index 045916ab..a52e51c1 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.3.1dev" -#if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2) -#error "Only LLVM 3.0, 3.1, and the 3.2 development branch are supported" +#if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) +#error "Only LLVM 3.0, 3.1, 3.2 and the 3.3 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) diff --git a/main.cpp b/main.cpp index 8076456f..70dfb5b6 100644 --- a/main.cpp +++ b/main.cpp @@ -68,6 +68,8 @@ lPrintVersion() { "3.1" #elif defined(LLVM_3_2) "3.2" +#elif defined(LLVM_3_3) + "3.3" #else #error "Unhandled LLVM version" #endif diff --git a/module.cpp b/module.cpp index 45c5fae7..be6796af 100644 --- a/module.cpp +++ b/module.cpp @@ -1757,9 +1757,9 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); #else - clang::DiagnosticOptions diagOptions; + clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = - new clang::TextDiagnosticPrinter(stderrRaw, &diagOptions); + new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); #endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); #if defined(LLVM_3_0) || defined(LLVM_3_1) @@ -1767,7 +1767,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre new clang::DiagnosticsEngine(diagIDs, diagPrinter); #else clang::DiagnosticsEngine *diagEngine = - new clang::DiagnosticsEngine(diagIDs, &diagOptions, diagPrinter); + new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); #endif inst.setDiagnostics(diagEngine); From 24087ff3cc08c9ef050165566d25257067744e10 Mon Sep 17 00:00:00 2001 From: Jean-Luc Duprat Date: Tue, 27 Nov 2012 13:38:28 -0800 Subject: [PATCH 6/6] Expose none() in the ISPC standard library. On KNC: all(), any() and none() do not generate a redundant movmsk instruction. --- docs/ispc.rst | 11 +++++++---- examples/intrinsics/knc.h | 2 +- stdlib.ispc | 23 +++++++++++++++++------ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 609b7838..9412e649 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3701,15 +3701,18 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v`` Reductions ---------- -A number routines are available to evaluate conditions across the running -program instances. For example, ``any()`` returns ``true`` if the given -value ``v`` is ``true`` for any of the SPMD program instances currently -running, and ``all()`` returns ``true`` if it true for all of them. +A number routines are available to evaluate conditions across the +running program instances. For example, ``any()`` returns ``true`` if +the given value ``v`` is ``true`` for any of the SPMD program +instances currently running, ``all()`` returns ``true`` if it true +for all of them, and ``none()`` returns ``true`` if ``v`` is always +``false``. :: uniform bool any(bool v) uniform bool all(bool v) + uniform bool none(bool v) You can also compute a variety of reductions across the program instances. For example, the values of the given value in each of the active program diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 08d2e8bf..dba551fe 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -482,7 +482,7 @@ static FORCEINLINE bool __all(__vec16_i1 mask) { } static FORCEINLINE bool __none(__vec16_i1 mask) { - return !__any(mask); + return _mm512_kortestz(mask, mask); } static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { diff --git a/stdlib.ispc b/stdlib.ispc index 80f44e91..81ebac70 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -340,9 +340,9 @@ static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. #ifdef ISPC_TARGET_GENERIC - return __movmsk(v & __mask) != 0; + return __any(v | !__mask); #else - return __movmsk(__sext_varying_bool(v) & __mask) != 0; + return __any(__sext_varying_bool(v) | !__mask); #endif } @@ -350,13 +350,24 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes + #ifdef ISPC_TARGET_GENERIC - bool match = ((v & __mask) == __mask); + return __all(v | !__mask); #else - int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); + return __all(__sext_varying_bool(v) | !__mask); +#endif +} + +__declspec(safe) +static inline uniform bool none(bool v) { + // As with any(), we need to explicitly mask v with the current program mask + // so we're only looking at the current lanes + +#ifdef ISPC_TARGET_GENERIC + return __none(v | !__mask); +#else + return __none(__sext_varying_bool(v) | !__mask); #endif - return __movmsk(match) == ((programCount == 64) ? ~0ull : - ((1ull << programCount) - 1)); } __declspec(safe)