diff --git a/Makefile b/Makefile index fc132e59..17855931 100644 --- a/Makefile +++ b/Makefile @@ -129,6 +129,10 @@ objs/cbackend.o: cbackend.cpp @echo Compiling $< @$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $< +objs/opt.o: opt.cpp + @echo Compiling $< + @$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $< + objs/%.o: objs/%.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< diff --git a/docs/ispc.rst b/docs/ispc.rst index 609b7838..9412e649 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3701,15 +3701,18 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v`` Reductions ---------- -A number routines are available to evaluate conditions across the running -program instances. For example, ``any()`` returns ``true`` if the given -value ``v`` is ``true`` for any of the SPMD program instances currently -running, and ``all()`` returns ``true`` if it true for all of them. +A number of routines are available to evaluate conditions across the +running program instances. For example, ``any()`` returns ``true`` if +the given value ``v`` is ``true`` for any of the SPMD program +instances currently running, ``all()`` returns ``true`` if it is true +for all of them, and ``none()`` returns ``true`` if ``v`` is ``false`` +for all of them. :: uniform bool any(bool v) uniform bool all(bool v) + uniform bool none(bool v) You can also compute a variety of reductions across the program instances. 
For example, the values of the given value in each of the active program diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index b884ea82..dba551fe 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -482,7 +482,7 @@ static FORCEINLINE bool __all(__vec16_i1 mask) { } static FORCEINLINE bool __none(__vec16_i1 mask) { - return !__any(mask); + return _mm512_kortestz(mask, mask); } static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { @@ -1959,7 +1959,7 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy #else - return _mm512_invsqrt_pd(v); + return _mm512_invsqrt_ps(v); #endif } diff --git a/ispc.h b/ispc.h index 40436d7c..a52e51c1 100644 --- a/ispc.h +++ b/ispc.h @@ -41,7 +41,7 @@ #define ISPC_VERSION "1.3.1dev" #if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) -#error "Only LLVM 3.0, 3.1, 3.2, and the 3.3 development branch are supported" +#error "Only LLVM 3.0, 3.1, 3.2 and the 3.3 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) diff --git a/module.cpp b/module.cpp index 45c5fae7..be6796af 100644 --- a/module.cpp +++ b/module.cpp @@ -1757,9 +1757,9 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); #else - clang::DiagnosticOptions diagOptions; + clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = - new clang::TextDiagnosticPrinter(stderrRaw, &diagOptions); + new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); #endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); #if defined(LLVM_3_0) || defined(LLVM_3_1) @@ -1767,7 +1767,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre 
new clang::DiagnosticsEngine(diagIDs, diagPrinter); #else clang::DiagnosticsEngine *diagEngine = - new clang::DiagnosticsEngine(diagIDs, &diagOptions, diagPrinter); + new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); #endif inst.setDiagnostics(diagEngine); diff --git a/opt.cpp b/opt.cpp index bb5ba955..8c6f7b06 100644 --- a/opt.cpp +++ b/opt.cpp @@ -471,8 +471,14 @@ Optimize(llvm::Module *module, int optLevel) { } optPM.add(llvm::createDeadInstEliminationPass()); + // Max struct size threshold for scalar replacement is + // 1) 4 fields (r,g,b,w) + // 2) field size: vectorWidth * sizeof(float) + const int field_limit = 4; + int sr_threshold = g->target.vectorWidth * sizeof(float) * field_limit; + // On to more serious optimizations - optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createPromoteMemoryToRegisterPass()); @@ -494,7 +500,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); - optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createTailCallEliminationPass()); @@ -540,7 +546,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); - optPM.add(llvm::createScalarReplAggregatesPass(-1, false)); + optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); diff --git a/stdlib.ispc b/stdlib.ispc index 80f44e91..81ebac70 100644 --- a/stdlib.ispc +++ b/stdlib.ispc 
@@ -340,9 +340,9 @@ static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. #ifdef ISPC_TARGET_GENERIC - return __movmsk(v & __mask) != 0; + return __any(v & __mask); #else - return __movmsk(__sext_varying_bool(v) & __mask) != 0; + return __any(__sext_varying_bool(v) & __mask); #endif } @@ -350,13 +350,24 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes + #ifdef ISPC_TARGET_GENERIC - bool match = ((v & __mask) == __mask); + return __all(v | !__mask); #else - int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); + return __all(__sext_varying_bool(v) | !__mask); +#endif +} + +__declspec(safe) +static inline uniform bool none(bool v) { + // As with any(), AND v with the current program mask so that inactive + // lanes read as false and only the running lanes can affect the result + +#ifdef ISPC_TARGET_GENERIC + return __none(v & __mask); +#else + return __none(__sext_varying_bool(v) & __mask); #endif - return __movmsk(match) == ((programCount == 64) ? ~0ull : - ((1ull << programCount) - 1)); } __declspec(safe)