This commit is contained in:
Matt Pharr
2012-12-02 14:32:52 -08:00
7 changed files with 43 additions and 19 deletions

View File

@@ -129,6 +129,10 @@ objs/cbackend.o: cbackend.cpp
@echo Compiling $<
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
objs/opt.o: opt.cpp
@echo Compiling $<
@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<
objs/%.o: objs/%.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<

View File

@@ -3701,15 +3701,18 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``
Reductions
----------
A number of routines are available to evaluate conditions across the running
program instances. For example, ``any()`` returns ``true`` if the given
value ``v`` is ``true`` for any of the SPMD program instances currently
running, and ``all()`` returns ``true`` if it is ``true`` for all of them.
A number of routines are available to evaluate conditions across the
running program instances. For example, ``any()`` returns ``true`` if
the given value ``v`` is ``true`` for any of the SPMD program
instances currently running, ``all()`` returns ``true`` if it is ``true``
for all of them, and ``none()`` returns ``true`` if ``v`` is ``false``
for all of them.
::
uniform bool any(bool v)
uniform bool all(bool v)
uniform bool none(bool v)
You can also compute a variety of reductions across the program instances.
For example, the values of the given value in each of the active program

View File

@@ -482,7 +482,7 @@ static FORCEINLINE bool __all(__vec16_i1 mask) {
}
static FORCEINLINE bool __none(__vec16_i1 mask) {
return !__any(mask);
return _mm512_kortestz(mask, mask);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
@@ -1959,7 +1959,7 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
#ifdef ISPC_FAST_MATH
return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy
#else
return _mm512_invsqrt_pd(v);
return _mm512_invsqrt_ps(v);
#endif
}

2
ispc.h
View File

@@ -41,7 +41,7 @@
#define ISPC_VERSION "1.3.1dev"
#if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3)
#error "Only LLVM 3.0, 3.1, 3.2, and the 3.3 development branch are supported"
#error "Only LLVM 3.0, 3.1, 3.2 and the 3.3 development branch are supported"
#endif
#if defined(_WIN32) || defined(_WIN64)

View File

@@ -1757,9 +1757,9 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
clang::TextDiagnosticPrinter *diagPrinter =
new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
#else
clang::DiagnosticOptions diagOptions;
clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions();
clang::TextDiagnosticPrinter *diagPrinter =
new clang::TextDiagnosticPrinter(stderrRaw, &diagOptions);
new clang::TextDiagnosticPrinter(stderrRaw, diagOptions);
#endif
llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagIDs(new clang::DiagnosticIDs);
#if defined(LLVM_3_0) || defined(LLVM_3_1)
@@ -1767,7 +1767,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
new clang::DiagnosticsEngine(diagIDs, diagPrinter);
#else
clang::DiagnosticsEngine *diagEngine =
new clang::DiagnosticsEngine(diagIDs, &diagOptions, diagPrinter);
new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter);
#endif
inst.setDiagnostics(diagEngine);

12
opt.cpp
View File

@@ -471,8 +471,14 @@ Optimize(llvm::Module *module, int optLevel) {
}
optPM.add(llvm::createDeadInstEliminationPass());
// Max struct size threshold for scalar replacement is
// 1) 4 fields (r,g,b,w)
// 2) field size: vectorWidth * sizeof(float)
const int field_limit = 4;
int sr_threshold = g->target.vectorWidth * sizeof(float) * field_limit;
// On to more serious optimizations
optPM.add(llvm::createScalarReplAggregatesPass());
optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createPromoteMemoryToRegisterPass());
@@ -494,7 +500,7 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createJumpThreadingPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createScalarReplAggregatesPass());
optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createTailCallEliminationPass());
@@ -540,7 +546,7 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createFunctionInliningPass());
optPM.add(llvm::createArgumentPromotionPass());
optPM.add(llvm::createScalarReplAggregatesPass(-1, false));
optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false));
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createReassociatePass());

View File

@@ -340,9 +340,9 @@ static inline uniform bool any(bool v) {
// We only care about whether "any" is true for the active program instances,
// so we have to mask v with the current program mask.
#ifdef ISPC_TARGET_GENERIC
return __movmsk(v & __mask) != 0;
return __any(v | !__mask);
#else
return __movmsk(__sext_varying_bool(v) & __mask) != 0;
return __any(__sext_varying_bool(v) | !__mask);
#endif
}
@@ -350,13 +350,24 @@ __declspec(safe)
static inline uniform bool all(bool v) {
// As with any(), we need to explicitly mask v with the current program mask
// so we're only looking at the current lanes
#ifdef ISPC_TARGET_GENERIC
bool match = ((v & __mask) == __mask);
return __all(v | !__mask);
#else
int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
return __all(__sext_varying_bool(v) | !__mask);
#endif
}
__declspec(safe)
static inline uniform bool none(bool v) {
// As with any(), we need to explicitly mask v with the current program mask
// so we're only looking at the current lanes
#ifdef ISPC_TARGET_GENERIC
return __none(v | !__mask);
#else
return __none(__sext_varying_bool(v) | !__mask);
#endif
return __movmsk(match) == ((programCount == 64) ? ~0ull :
((1ull << programCount) - 1));
}
__declspec(safe)