Added shuffle() variant to the standard library that takes two varying
values and a permutation index that spans the concatenation of the two of
them (along the lines of SHUFPS...)
2011-07-02 08:39:19 +01:00
parent a9540b7c18
commit fe7717ab67
11 changed files with 170 additions and 41 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -176,30 +176,6 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
    }
 }

-/** Declare the function symbol 'bool __is_compile_time_constant_mask(mask type)'.  
-    This function will never be defined; it's just a placeholder
-    that will be handled during the optimization process.  See the
-    discussion of the implementation of CompileTimeConstantResolvePass for
-    more details.
- */
-static void
-lDeclareCompileTimeConstant(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
-
-    std::vector<const llvm::Type *> argTypes;
-    argTypes.push_back(LLVMTypes::MaskType);
-
-    llvm::FunctionType *fType = 
-        llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false);
-    llvm::Function *func =
-        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                               "__is_compile_time_constant_mask", module);
-    func->setOnlyReadsMemory(true);
-    func->setDoesNotThrow(true);
-}
-
-
 /** Declare the 'pseudo-gather' functions.  When the ispc front-end needs
    to perform a gather, it generates a call to one of these functions,
    which have signatures:
@@ -583,7 +559,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod

    // Declare various placeholder functions that the optimizer will later
    // find and replace with something more useful.
-    lDeclareCompileTimeConstant(module);
    lDeclarePseudoGathers(module);
    lDeclarePseudoScatters(module);
    lDeclarePseudoMaskedStore(module);
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -6,6 +6,12 @@ initialize their members; they now must be initialized with initializer
 lists in braces (or initialized after of the initializer with a loop over
 array elements, etc.)

+Added another shuffle() function to the standard library: 
+"<type> shuffle(<type> v0, <type> v1, int permute)", where the permutation
+vector indexes over the concatenation of the two vectors (e.g. the value
+0 corresponds to the first element of v0, the value 2*programCount-1
+corresponds to the last element of v1, etc.)
+
 === v1.0.2 === (1 July 2011)

 Floating-point hexidecimal constants are now parsed correctly on Windows
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -1704,10 +1704,11 @@ provided offset value can be positive or negative, and may be greater than
    int64 rotate(int64 value, uniform int offset)


-Finally, ``shuffle()`` allows fully general shuffling of values among the
-program instances.  Each program instance's value of permutation gives the
-program instance from which to get the value of ``value``.  The provided
-values for ``permutation`` must all be between 0 and ``programCount-1``.
+Finally, the ``shuffle()`` functions allow two variants of fully general
+shuffling of values among the program instances.  For the first version,
+each program instance's value of ``permutation`` gives the program instance
+from which to get the value of ``value``.  The provided values for
+``permutation`` must all be between 0 and ``programCount-1``.

 ::

@@ -1716,6 +1717,20 @@ values for ``permutation`` must all be between 0 and ``programCount-1``.
    double shuffle(double value, int permutation)
    int64 shuffle(int64 value, int permutation)

+
+The second variant of ``shuffle()`` permutes over the extended vector that
+is the concatenation of the two provided values.  In other words, a value
+of 0 in an element of ``permutation`` corresponds to the first element of
+``value0``, the value ``2*programCount-1`` corresponds to the last element
+of ``value1``, etc.
+
+::
+
+    float shuffle(float value0, float value1, int permutation)
+    int32 shuffle(int32 value0, int32 value1, int permutation)
+    double shuffle(double value0, double value1, int permutation)
+    int64 shuffle(int64 value0, int64 value1, int permutation)
+
 The various variants of ``popcnt()`` return the population count--the
 number of bits set in the given value.

--- a/opt.cpp
+++ b/opt.cpp
@@ -2116,12 +2116,11 @@ CreateLowerGatherScatterPass() {
 // IsCompileTimeConstantPass

 /** LLVM IR implementations of target-specific functions may include calls
-    to the functions "bool __is_compile_time_constant_mask(mask type)" and
-    "bool __is_compile_time_constant_int32(i32)"; these allow them to have
-    specialied code paths for where the corresponding value is known at
-    compile time.  For masks, for example, this allows them to not incur
-    the cost of a MOVMSK call at runtime to compute its value in cases
-    where the mask value isn't known until runtime.
+    to the functions "bool __is_compile_time_constant_*(...)"; these allow
+    them to have specialized code paths for where the corresponding value is
+    known at compile time.  For masks, for example, this allows them to not
+    incur the cost of a MOVMSK call at runtime to compute its value in
+    cases where the mask value isn't known until runtime.

    This pass resolves these calls into either 'true' or 'false' values so
    that later optimization passes can operate with these as constants.
@@ -2149,8 +2148,11 @@ llvm::RegisterPass<IsCompileTimeConstantPass>

 bool
 IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *maskFunc = m->module->getFunction("__is_compile_time_constant_mask");
-    llvm::Function *int32Func = m->module->getFunction("__is_compile_time_constant_int32");
+    llvm::Function *funcs[] = {
+        m->module->getFunction("__is_compile_time_constant_mask"),
+        m->module->getFunction("__is_compile_time_constant_uniform_int32"),
+        m->module->getFunction("__is_compile_time_constant_varying_int32")
+    };

    bool modifiedAny = false;
 restart:
@@ -2158,8 +2160,17 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        // Iterate through the instructions looking for calls to the
        // __is_compile_time_constant_*() functions
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
-        if (!callInst || (callInst->getCalledFunction() != maskFunc &&
-                          callInst->getCalledFunction() != int32Func))
+        if (callInst == NULL)
+            continue;
+
+        int j;
+        int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
+        for (j = 0; j < nFuncs; ++j) {
+            if (callInst->getCalledFunction() == funcs[j]) 
+                break;
+        }
+        if (j == nFuncs)
+            // not a __is_compile_time_constant_* function
            continue;

        // This optimization pass can be disabled with the (poorly named)
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -129,6 +129,22 @@ static inline int64 shuffle(int64 v, int i) {
    return __shuffle_int64(v, i);
 }

+static inline float shuffle(float v0, float v1, int i) {
+    return __shuffle2_float(v0, v1, i);
+}
+
+static inline int32 shuffle(int32 v0, int32 v1, int i) {
+    return __shuffle2_int32(v0, v1, i);
+}
+
+static inline double shuffle(double v0, double v1, int i) {
+    return __shuffle2_double(v0, v1, i);
+}
+
+static inline int64 shuffle(int64 v0, int64 v1, int i) {
+    return __shuffle2_int64(v0, v1, i);
+}
+
 // x[i]
 static inline uniform float extract(float x, uniform int i) {
    return __extract(x, i);
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -34,7 +34,7 @@
 ;; builtins for various targets can use macros from this file to simplify
 ;; generating code for their implementations of those builtins.

-declare i1 @__is_compile_time_constant_int32(i32)
+declare i1 @__is_compile_time_constant_uniform_int32(i32)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -319,7 +319,7 @@ forloop(i, 1, eval($1-1), `  %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2
 }

 define internal <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline {
-  %isc = call i1 @__is_compile_time_constant_int32(i32 %1)
+  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
  br i1 %isc, label %is_const, label %not_const

 is_const:
@@ -363,12 +363,53 @@ forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1)
  ret <$1 x $2> %ret_`'eval($1-1)
 }

+define internal <$1 x $2> @__shuffle2_$3(<$1 x $2>, <$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline {
+  %v2 = shufflevector <$1 x $2> %0, <$1 x $2> %1, <eval(2*$1) x i32> <
+      forloop(i, 0, eval(2*$1-2), `i32 i, ') i32 eval(2*$1-1)
+  >
+forloop(i, 0, eval($1-1), `  
+  %index_`'i = extractelement <$1 x i32> %2, i32 i')
+
+  %isc = call i1 @__is_compile_time_constant_varying_int32(<$1 x i32> %2)
+  br i1 %isc, label %is_const, label %not_const
+
+is_const:
+  ; extract from the requested lanes and insert into the result; LLVM turns
+  ; this into good code in the end
+forloop(i, 0, eval($1-1), `  
+  %v_`'i = extractelement <eval(2*$1) x $2> %v2, i32 %index_`'i')
+
+  %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0
+forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i
+')
+  ret <$1 x $2> %ret_`'eval($1-1)
+
+not_const:
+  ; otherwise store the two vectors onto the stack and then use the given
+  ; permutation vector to get indices into that array...
+  %ptr = alloca <eval(2*$1) x $2>
+  store <eval(2*$1) x $2> %v2, <eval(2*$1) x $2> * %ptr
+  %baseptr = bitcast <eval(2*$1) x $2> * %ptr to $2 *
+
+  %ptr_0 = getelementptr $2 * %baseptr, i32 %index_0
+  %val_0 = load $2 * %ptr_0
+  %result_0 = insertelement <$1 x $2> undef, $2 %val_0, i32 0
+
+forloop(i, 1, eval($1-1), `  
+  %ptr_`'i = getelementptr $2 * %baseptr, i32 %index_`'i
+  %val_`'i = load $2 * %ptr_`'i
+  %result_`'i = insertelement <$1 x $2> %result_`'eval(i-1), $2 %val_`'i, i32 i
+')
+
+  ret <$1 x $2> %result_`'eval($1-1)
+}
 ')


 define(`stdlib_core', `

 declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
+declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
--- a/tests/shuffle2-1.ispc
+++ b/tests/shuffle2-1.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int32 aa = aFOO[programIndex]; 
+    int32 bb = aa + programCount;
+    int32 shuf = shuffle(aa, bb, programCount + 1);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + programCount;
+}
--- a/tests/shuffle2-2.ispc
+++ b/tests/shuffle2-2.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int32 aa = aFOO[programIndex]; 
+    int32 bb = aa + programCount;
+    int32 shuf = shuffle(aa, bb, programIndex + 2);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3 + programIndex;
+}
--- a/tests/shuffle2-3.ispc
+++ b/tests/shuffle2-3.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float aa = aFOO[programIndex]; 
+    float bb = aa + programCount;
+    float shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3 + programIndex;
+}
--- a/tests/shuffle2-4.ispc
+++ b/tests/shuffle2-4.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    double aa = aFOO[programIndex]; 
+    double bb = aa + programCount;
+    double shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + 2*programIndex;
+}
--- a/tests/shuffle2-5.ispc
+++ b/tests/shuffle2-5.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    double aa = aFOO[programIndex]; 
+    double bb = aa + programCount;
+    double shuf = shuffle(aa, bb, 2*programIndex);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + 2*programIndex;
+}