From fe7717ab671c8518b2ffd505672af58740c2a6dc Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sat, 2 Jul 2011 08:39:19 +0100
Subject: [PATCH] Added shuffle() variant to the standard library that takes
 two varying values and a permutation index that spans the concatenation of
 the two of them (along the lines of SHUFPS...)

---
 builtins.cpp          | 25 ------------------------
 docs/ReleaseNotes.txt |  6 ++++++
 docs/ispc.txt         | 23 ++++++++++++++++++----
 opt.cpp               | 31 +++++++++++++++++++----------
 stdlib.ispc           | 16 +++++++++++++++
 stdlib.m4             | 45 +++++++++++++++++++++++++++++++++++++++++--
 tests/shuffle2-1.ispc | 13 +++++++++++++
 tests/shuffle2-2.ispc | 13 +++++++++++++
 tests/shuffle2-3.ispc | 13 +++++++++++++
 tests/shuffle2-4.ispc | 13 +++++++++++++
 tests/shuffle2-5.ispc | 13 +++++++++++++
 11 files changed, 170 insertions(+), 41 deletions(-)
 create mode 100644 tests/shuffle2-1.ispc
 create mode 100644 tests/shuffle2-2.ispc
 create mode 100644 tests/shuffle2-3.ispc
 create mode 100644 tests/shuffle2-4.ispc
 create mode 100644 tests/shuffle2-5.ispc
diff --git a/builtins.cpp b/builtins.cpp
index d2a49c7e..9537dbf8 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -176,30 +176,6 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
     }
 }
 
-/** Declare the function symbol 'bool __is_compile_time_constant_mask(mask type)'.  
-    This function will never be defined; it's just a placeholder
-    that will be handled during the optimization process.  See the
-    discussion of the implementation of CompileTimeConstantResolvePass for
-    more details.
- */
-static void
-lDeclareCompileTimeConstant(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
-
-    std::vector<const llvm::Type *> argTypes;
-    argTypes.push_back(LLVMTypes::MaskType);
-
-    llvm::FunctionType *fType = 
-        llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false);
-    llvm::Function *func =
-        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                               "__is_compile_time_constant_mask", module);
-    func->setOnlyReadsMemory(true);
-    func->setDoesNotThrow(true);
-}
-
-
 /** Declare the 'pseudo-gather' functions.  When the ispc front-end needs
     to perform a gather, it generates a call to one of these functions,
     which have signatures:
@@ -583,7 +559,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
 
     // Declare various placeholder functions that the optimizer will later
     // find and replace with something more useful.
-    lDeclareCompileTimeConstant(module);
     lDeclarePseudoGathers(module);
     lDeclarePseudoScatters(module);
     lDeclarePseudoMaskedStore(module);
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index e77ccf61..d2aef0a8 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -6,6 +6,12 @@ initialize their members; they now must be initialized with initializer
 lists in braces (or initialized after of the initializer with a loop over
 array elements, etc.)
 
+Added another shuffle() function to the standard library: 
+"<type> shuffle(<type> v0, <type> v1, int permute)", where the permutation
+vector indexes over the concatenation of the two vectors (e.g. the value
+0 corresponds to the first element of v0, the value 2*programCount-1
+corresponds to the last element of v1, etc.)
+
 === v1.0.2 === (1 July 2011)
 
 Floating-point hexidecimal constants are now parsed correctly on Windows
diff --git a/docs/ispc.txt b/docs/ispc.txt
index 12a1ea85..1106b62f 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -1704,10 +1704,11 @@ provided offset value can be positive or negative, and may be greater than
     int64 rotate(int64 value, uniform int offset)
 
 
-Finally, ``shuffle()`` allows fully general shuffling of values among the
-program instances.  Each program instance's value of permutation gives the
-program instance from which to get the value of ``value``.  The provided
-values for ``permutation`` must all be between 0 and ``programCount-1``.
+Finally, the ``shuffle()`` functions allow two variants of fully general
+shuffling of values among the program instances.  For the first version,
+each program instance's value of permutation gives the program instance
+from which to get the value of ``value``.  The provided values for
+``permutation`` must all be between 0 and ``programCount-1``.
 
 ::
 
@@ -1716,6 +1717,20 @@ values for ``permutation`` must all be between 0 and ``programCount-1``.
     double shuffle(double value, int permutation)
     int64 shuffle(int64 value, int permutation)
 
+
+The second variant of ``shuffle()`` permutes over the extended vector that
+is the concatenation of the two provided values.  In other words, a value
+of 0 in an element of ``permutation`` corresponds to the first element of
+``value0``, the value ``2*programCount-1`` corresponds to the last element
+of ``value1``, etc.
+
+::
+
+    float shuffle(float value0, float value1, int permutation)
+    int32 shuffle(int32 value0, int32 value1, int permutation)
+    double shuffle(double value0, double value1, int permutation)
+    int64 shuffle(int64 value0, int64 value1, int permutation)
+
 The various variants of ``popcnt()`` return the population count--the
 number of bits set in the given value.
 
diff --git a/opt.cpp b/opt.cpp
index efda1d2a..61c7606b 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2116,12 +2116,11 @@ CreateLowerGatherScatterPass() {
 // IsCompileTimeConstantPass
 
 /** LLVM IR implementations of target-specific functions may include calls
-    to the functions "bool __is_compile_time_constant_mask(mask type)" and
-    "bool __is_compile_time_constant_int32(i32)"; these allow them to have
-    specialied code paths for where the corresponding value is known at
-    compile time.  For masks, for example, this allows them to not incur
-    the cost of a MOVMSK call at runtime to compute its value in cases
-    where the mask value isn't known until runtime.
+    to the functions "bool __is_compile_time_constant_*(...)"; these allow
+    them to have specialized code paths for where the corresponding value is
+    known at compile time.  For masks, for example, this allows them to not
+    incur the cost of a MOVMSK call at runtime to compute its value in
+    cases where the mask value isn't known until runtime.
 
     This pass resolves these calls into either 'true' or 'false' values so
     that later optimization passes can operate with these as constants.
@@ -2149,8 +2148,11 @@ llvm::RegisterPass<IsCompileTimeConstantPass>
 
 bool
 IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *maskFunc = m->module->getFunction("__is_compile_time_constant_mask");
-    llvm::Function *int32Func = m->module->getFunction("__is_compile_time_constant_int32");
+    llvm::Function *funcs[] = {
+        m->module->getFunction("__is_compile_time_constant_mask"),
+        m->module->getFunction("__is_compile_time_constant_uniform_int32"),
+        m->module->getFunction("__is_compile_time_constant_varying_int32")
+    };
 
     bool modifiedAny = false;
  restart:
@@ -2158,8 +2160,17 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         // Iterate through the instructions looking for calls to the
         // __is_compile_time_constant_*() functions
         llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
-        if (!callInst || (callInst->getCalledFunction() != maskFunc &&
-                          callInst->getCalledFunction() != int32Func))
+        if (callInst == NULL)
+            continue;
+
+        int j;
+        int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
+        for (j = 0; j < nFuncs; ++j) {
+            if (callInst->getCalledFunction() == funcs[j]) 
+                break;
+        }
+        if (j == nFuncs)
+            // not a __is_compile_time_constant_* function
             continue;
 
         // This optimization pass can be disabled with the (poorly named)
diff --git a/stdlib.ispc b/stdlib.ispc
index b9770da4..8ba5410b 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -129,6 +129,22 @@ static inline int64 shuffle(int64 v, int i) {
     return __shuffle_int64(v, i);
 }
 
+static inline float shuffle(float v0, float v1, int i) {  // two-source shuffle: i indexes the concatenation of v0 and v1
+    return __shuffle2_float(v0, v1, i);  // thin wrapper; see @__shuffle2_$3 in stdlib.m4
+}
+
+static inline int32 shuffle(int32 v0, int32 v1, int i) {  // two-source shuffle: i indexes the concatenation of v0 and v1
+    return __shuffle2_int32(v0, v1, i);  // thin wrapper; see @__shuffle2_$3 in stdlib.m4
+}
+
+static inline double shuffle(double v0, double v1, int i) {  // two-source shuffle: i indexes the concatenation of v0 and v1
+    return __shuffle2_double(v0, v1, i);  // thin wrapper; see @__shuffle2_$3 in stdlib.m4
+}
+
+static inline int64 shuffle(int64 v0, int64 v1, int i) {  // two-source shuffle: i indexes the concatenation of v0 and v1
+    return __shuffle2_int64(v0, v1, i);  // thin wrapper; see @__shuffle2_$3 in stdlib.m4
+}
+
 // x[i]
 static inline uniform float extract(float x, uniform int i) {
     return __extract(x, i);
diff --git a/stdlib.m4 b/stdlib.m4
index bc7cfc19..82cb471c 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -34,7 +34,7 @@
 ;; builtins for various targets can use macros from this file to simplify
 ;; generating code for their implementations of those builtins.
 
-declare i1 @__is_compile_time_constant_int32(i32)
+declare i1 @__is_compile_time_constant_uniform_int32(i32)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -319,7 +319,7 @@ forloop(i, 1, eval($1-1), `  %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2
 }
 
 define internal <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline {
-  %isc = call i1 @__is_compile_time_constant_int32(i32 %1)
+  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
   br i1 %isc, label %is_const, label %not_const
 
 is_const:
@@ -363,12 +363,53 @@ forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1)
   ret <$1 x $2> %ret_`'eval($1-1)
 }
 
+define internal <$1 x $2> @__shuffle2_$3(<$1 x $2>, <$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline {  ; two-source shuffle: %2 selects lanes from the concatenation of %0 and %1
+  %v2 = shufflevector <$1 x $2> %0, <$1 x $2> %1, <eval(2*$1) x i32> <
+      forloop(i, 0, eval(2*$1-2), `i32 i, ') i32 eval(2*$1-1)
+  >  ; identity mask, so %v2 is the double-width concatenation of both inputs
+forloop(i, 0, eval($1-1), `  
+  %index_`'i = extractelement <$1 x i32> %2, i32 i')
+
+  %isc = call i1 @__is_compile_time_constant_varying_int32(<$1 x i32> %2)  ; placeholder call that IsCompileTimeConstantPass resolves to true or false
+  br i1 %isc, label %is_const, label %not_const
+
+is_const:
+  ; extract from the requested lanes and insert into the result; LLVM turns
+  ; this into good code in the end
+forloop(i, 0, eval($1-1), `  
+  %v_`'i = extractelement <eval(2*$1) x $2> %v2, i32 %index_`'i')
+
+  %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0
+forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i
+')
+  ret <$1 x $2> %ret_`'eval($1-1)
+
+not_const:
+  ; otherwise store the two vectors onto the stack and then use the given
+  ; permutation vector to get indices into that array (indices are not range-checked)
+  %ptr = alloca <eval(2*$1) x $2>  ; stack slot for the concatenated vector
+  store <eval(2*$1) x $2> %v2, <eval(2*$1) x $2> * %ptr
+  %baseptr = bitcast <eval(2*$1) x $2> * %ptr to $2 *  ; address the slot as an array of scalars
+
+  %ptr_0 = getelementptr $2 * %baseptr, i32 %index_0
+  %val_0 = load $2 * %ptr_0
+  %result_0 = insertelement <$1 x $2> undef, $2 %val_0, i32 0
+
+forloop(i, 1, eval($1-1), `  
+  %ptr_`'i = getelementptr $2 * %baseptr, i32 %index_`'i
+  %val_`'i = load $2 * %ptr_`'i
+  %result_`'i = insertelement <$1 x $2> %result_`'eval(i-1), $2 %val_`'i, i32 i
+')
+
+  ret <$1 x $2> %result_`'eval($1-1)
+}
 ')
 
 
 define(`stdlib_core', `
 
 declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
+declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
diff --git a/tests/shuffle2-1.ispc b/tests/shuffle2-1.ispc
new file mode 100644
index 00000000..5d33cdf9
--- /dev/null
+++ b/tests/shuffle2-1.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }  // exposes programCount to the caller
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {  // two-source shuffle of int32 values with one index shared by all instances
+    int32 aa = aFOO[programIndex]; // float input converted to int32 on assignment
+    int32 bb = aa + programCount;
+    int32 shuf = shuffle(aa, bb, programCount + 1);  // index programCount+1 is element 1 of bb (second half of the concatenation)
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + programCount;  // i.e. bb[1]; presumes aFOO[1] == 2 -- harness input is not shown here, TODO confirm
+}
diff --git a/tests/shuffle2-2.ispc b/tests/shuffle2-2.ispc
new file mode 100644
index 00000000..56426201
--- /dev/null
+++ b/tests/shuffle2-2.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }  // exposes programCount to the caller
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {  // two-source shuffle of int32 values with a per-instance index
+    int32 aa = aFOO[programIndex]; // float input converted to int32 on assignment
+    int32 bb = aa + programCount;
+    int32 shuf = shuffle(aa, bb, programIndex + 2);  // instance k reads concatenated element k+2, spilling into bb for the last two instances
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3 + programIndex;  // presumes concatenated element j holds j+1, i.e. aFOO[j] == j+1 -- harness input not shown, TODO confirm
+}
diff --git a/tests/shuffle2-3.ispc b/tests/shuffle2-3.ispc
new file mode 100644
index 00000000..97040bab
--- /dev/null
+++ b/tests/shuffle2-3.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }  // exposes programCount to the caller
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {  // float two-source shuffle whose index depends on the runtime argument b
+    float aa = aFOO[programIndex]; // per-instance load of the input
+    float bb = aa + programCount;
+    float shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);  // equals programIndex+2 only when b == 5 -- presumably the harness passes 5, TODO confirm
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3 + programIndex;  // same expectation as an index of programIndex+2; presumes aFOO[j] == j+1 -- TODO confirm
+}
diff --git a/tests/shuffle2-4.ispc b/tests/shuffle2-4.ispc
new file mode 100644
index 00000000..04a7c3a4
--- /dev/null
+++ b/tests/shuffle2-4.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }  // exposes programCount to the caller
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {  // double two-source shuffle whose index depends on the runtime argument b
+    double aa = aFOO[programIndex]; // float input widened to double
+    double bb = aa + programCount;
+    double shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);  // equals 2*programIndex only when b == 5 -- presumably the harness passes 5, TODO confirm
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + 2*programIndex;  // every other element of the concatenation; presumes aFOO[j] == j+1 -- TODO confirm
+}
diff --git a/tests/shuffle2-5.ispc b/tests/shuffle2-5.ispc
new file mode 100644
index 00000000..fd1bc299
--- /dev/null
+++ b/tests/shuffle2-5.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }  // exposes programCount to the caller
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {  // double two-source shuffle with an index built only from programIndex
+    double aa = aFOO[programIndex]; // float input widened to double
+    double bb = aa + programCount;
+    double shuf = shuffle(aa, bb, 2*programIndex);  // instance k reads concatenated element 2k; the maximum index is 2*programCount-2
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + 2*programIndex;  // every other element of the concatenation; presumes aFOO[j] == j+1 -- TODO confirm
+}