From d665e2e85b1c609c129af7669ead85ea4b98c3a7 Mon Sep 17 00:00:00 2001
From: Gerrit Code Review <gerrit@fmygit6001.fm.intel.com>
Date: Wed, 24 Oct 2012 09:53:29 -0700
Subject: [PATCH 1/6] Initial empty repository


From e57801a5d16b5641bb0e801e2f34b478336201cb Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Wed, 31 Oct 2012 15:25:26 -0400
Subject: [PATCH 2/6] Typo Fix

---
 examples/intrinsics/sse4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index e3a4e277..b0365c86 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -274,7 +274,7 @@ static FORCEINLINE bool __any(__vec4_i1 mask) {
 }
 
 static FORCEINLINE bool __all(__vec4_i1 mask) {
-    return (_mm_movemask_ps(mask.v)=0xF);
+    return (_mm_movemask_ps(mask.v)==0xF);
 }
 
 static FORCEINLINE bool __none(__vec4_i1 mask) {

From 810784da1f2e797410847cfe5a404b74297c4d92 Mon Sep 17 00:00:00 2001
From: ptu1 <peng.tu@intel.com>
Date: Tue, 13 Nov 2012 12:35:45 -0800
Subject: [PATCH 3/6] Set the ScalarReplAggregate maximum structure size based
 on target vector width.

---
 opt.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index bb5ba955..8c6f7b06 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -471,8 +471,14 @@ Optimize(llvm::Module *module, int optLevel) {
         }
         optPM.add(llvm::createDeadInstEliminationPass());
 
+        // Max struct size threshold for scalar replacement is
+        //    1) 4 fields (r,g,b,w)
+        //    2) field size: vectorWidth * sizeof(float)
+        const int field_limit = 4;
+        int sr_threshold = g->target.vectorWidth * sizeof(float) * field_limit;
+
         // On to more serious optimizations
-        optPM.add(llvm::createScalarReplAggregatesPass());
+        optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());
         optPM.add(llvm::createPromoteMemoryToRegisterPass());
@@ -494,7 +500,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createJumpThreadingPass());
         optPM.add(llvm::createCFGSimplificationPass());
-        optPM.add(llvm::createScalarReplAggregatesPass());
+        optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createTailCallEliminationPass());
 
@@ -540,7 +546,7 @@ Optimize(llvm::Module *module, int optLevel) {
 
         optPM.add(llvm::createFunctionInliningPass());
         optPM.add(llvm::createArgumentPromotionPass());   
-        optPM.add(llvm::createScalarReplAggregatesPass(-1, false));
+        optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false));
         optPM.add(llvm::createInstructionCombiningPass());  
         optPM.add(llvm::createCFGSimplificationPass());     
         optPM.add(llvm::createReassociatePass());           

From 2129b1e27d30b8a481eee04346e2bec91bc303b1 Mon Sep 17 00:00:00 2001
From: Jean-Luc Duprat <jld@acm.org>
Date: Wed, 21 Nov 2012 15:40:35 -0800
Subject: [PATCH 4/6] knc.h: Fixed __rsqrt_varying_float() to use
 _mm512_invsqrt_ps() instead of _mm512_invsqrt_pd() This was a typo.

---
 examples/intrinsics/knc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index b884ea82..08d2e8bf 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -1959,7 +1959,7 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
 #ifdef ISPC_FAST_MATH
     return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy
 #else 
-    return _mm512_invsqrt_pd(v);
+    return _mm512_invsqrt_ps(v);
 #endif
 }
 

From 16b0806d4091cb49a9e78710235825b68de5efd4 Mon Sep 17 00:00:00 2001
From: Peng Tu <peng.tu1@gmail.com>
Date: Wed, 21 Nov 2012 19:09:10 -0800
Subject: [PATCH 5/6] Fix LLVM TOT build issue.

---
 Makefile   | 4 ++++
 ispc.h     | 4 ++--
 main.cpp   | 2 ++
 module.cpp | 6 +++---
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index fc132e59..17855931 100644
--- a/Makefile
+++ b/Makefile
@@ -129,6 +129,10 @@ objs/cbackend.o: cbackend.cpp
 	@echo Compiling $<
 	@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
 
+objs/opt.o: opt.cpp
+	@echo Compiling $<
+	@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<
+
 objs/%.o: objs/%.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
diff --git a/ispc.h b/ispc.h
index 045916ab..a52e51c1 100644
--- a/ispc.h
+++ b/ispc.h
@@ -40,8 +40,8 @@
 
 #define ISPC_VERSION "1.3.1dev"
 
-#if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2)
-#error "Only LLVM 3.0, 3.1, and the 3.2 development branch are supported"
+#if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3)
+#error "Only LLVM 3.0, 3.1, 3.2 and the 3.3 development branch are supported"
 #endif
 
 #if defined(_WIN32) || defined(_WIN64)
diff --git a/main.cpp b/main.cpp
index 8076456f..70dfb5b6 100644
--- a/main.cpp
+++ b/main.cpp
@@ -68,6 +68,8 @@ lPrintVersion() {
            "3.1"
 #elif defined(LLVM_3_2)
            "3.2"
+#elif defined(LLVM_3_3)
+           "3.3"
 #else
 #error "Unhandled LLVM version"
 #endif 
diff --git a/module.cpp b/module.cpp
index 45c5fae7..be6796af 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1757,9 +1757,9 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
     clang::TextDiagnosticPrinter *diagPrinter =
         new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
 #else
-    clang::DiagnosticOptions diagOptions;
+    clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions();
     clang::TextDiagnosticPrinter *diagPrinter =
-        new clang::TextDiagnosticPrinter(stderrRaw, &diagOptions);
+        new clang::TextDiagnosticPrinter(stderrRaw, diagOptions);
 #endif
     llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagIDs(new clang::DiagnosticIDs);
 #if defined(LLVM_3_0) || defined(LLVM_3_1)
@@ -1767,7 +1767,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
         new clang::DiagnosticsEngine(diagIDs, diagPrinter);
 #else
     clang::DiagnosticsEngine *diagEngine = 
-        new clang::DiagnosticsEngine(diagIDs, &diagOptions, diagPrinter);
+        new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter);
 #endif
     inst.setDiagnostics(diagEngine);
 

From 24087ff3cc08c9ef050165566d25257067744e10 Mon Sep 17 00:00:00 2001
From: Jean-Luc Duprat <jld@acm.org>
Date: Tue, 27 Nov 2012 13:38:28 -0800
Subject: [PATCH 6/6] Expose none() in the ISPC standard library. On KNC:
 all(), any() and none() do not generate a redundant movmsk instruction.

---
 docs/ispc.rst             | 11 +++++++----
 examples/intrinsics/knc.h |  2 +-
 stdlib.ispc               | 23 +++++++++++++++++------
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/docs/ispc.rst b/docs/ispc.rst
index 609b7838..9412e649 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3701,15 +3701,18 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``
 Reductions
 ----------
 
-A number routines are available to evaluate conditions across the running
-program instances.  For example, ``any()`` returns ``true`` if the given
-value ``v`` is ``true`` for any of the SPMD program instances currently
-running, and ``all()`` returns ``true`` if it true for all of them.
+A number routines are available to evaluate conditions across the
+running program instances.  For example, ``any()`` returns ``true`` if
+the given value ``v`` is ``true`` for any of the SPMD program
+instances currently running, ``all()`` returns ``true`` if it true
+for all of them, and ``none()`` returns ``true`` if ``v`` is always
+``false``.
 
 ::
 
     uniform bool any(bool v)
     uniform bool all(bool v)
+    uniform bool none(bool v)
 
 You can also compute a variety of reductions across the program instances.
 For example, the values of the given value in each of the active program
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 08d2e8bf..dba551fe 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -482,7 +482,7 @@ static FORCEINLINE bool __all(__vec16_i1 mask) {
 }
 
 static FORCEINLINE bool __none(__vec16_i1 mask) {
-    return !__any(mask);
+    return _mm512_kortestz(mask, mask);
 }
 
 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
diff --git a/stdlib.ispc b/stdlib.ispc
index 80f44e91..81ebac70 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -340,9 +340,9 @@ static inline uniform bool any(bool v) {
     // We only care about whether "any" is true for the active program instances,
     // so we have to make v with the current program mask.
 #ifdef ISPC_TARGET_GENERIC
-    return __movmsk(v & __mask) != 0;
+    return __any(v | !__mask);
 #else
-    return __movmsk(__sext_varying_bool(v) & __mask) != 0;
+    return __any(__sext_varying_bool(v) | !__mask);
 #endif
 }
 
@@ -350,13 +350,24 @@ __declspec(safe)
 static inline uniform bool all(bool v) {
     // As with any(), we need to explicitly mask v with the current program mask
     // so we're only looking at the current lanes
+
 #ifdef ISPC_TARGET_GENERIC
-    bool match = ((v & __mask) == __mask);
+    return __all(v | !__mask);
 #else
-    int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
+    return __all(__sext_varying_bool(v) | !__mask);
+#endif
+}
+
+__declspec(safe)
+static inline uniform bool none(bool v) {
+    // As with any(), we need to explicitly mask v with the current program mask
+    // so we're only looking at the current lanes
+
+#ifdef ISPC_TARGET_GENERIC
+    return __none(v | !__mask);
+#else
+    return __none(__sext_varying_bool(v) | !__mask);
 #endif
-    return __movmsk(match) == ((programCount == 64) ? ~0ull : 
-                               ((1ull << programCount) - 1));
 }
 
 __declspec(safe)