diff --git a/Makefile b/Makefile index 6be3182f..3e54d55f 100644 --- a/Makefile +++ b/Makefile @@ -94,9 +94,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -$(STDLIB_SRC): stdlib.m4 - -objs/stdlib-%.cpp: stdlib-%.ll +objs/stdlib-%.cpp: stdlib-%.ll stdlib.m4 @echo Creating C++ source from stdlib file $< @m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@ diff --git a/docs/ispc.txt b/docs/ispc.txt index 993d2bb7..74dff7c1 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -74,7 +74,8 @@ Contents: + `Math Functions`_ + `Output Functions`_ - + `Cross-Lane Operations`_ + + `Cross-Program Instance Operations`_ + + `Packed Load and Store Operations`_ + `Low-Level Bits`_ * `Interoperability with the Application`_ @@ -1659,14 +1660,14 @@ values for the inactive program instances aren't printed. (In other cases, they may have garbage values or be otherwise undefined.) -Cross-Lane Operations ---------------------- +Cross-Program Instance Operations +--------------------------------- -Usually, ``ispc`` code expresses independent computation on separate data -elements. There are, however, a number of cases where it's useful for the -program instances to be able to cooperate in computing results. The -cross-lane operations described in this section provide primitives for -communication between the running program instances. +Usually, ``ispc`` code expresses independent programs performing +computation on separate data elements. There are, however, a number of +cases where it's useful for the program instances to be able to cooperate +in computing results. The cross-lane operations described in this section +provide primitives for communication between the running program instances. A few routines that evaluate conditions across the running program instances. For example, ``any()`` returns ``true`` if the given value @@ -1678,6 +1679,47 @@ and ``all()`` returns ``true`` if it true for all of them. 
uniform bool any(bool v) uniform bool all(bool v) +To broadcast a value from one program instance to all of the others, a +``broadcast()`` function is available. It broadcasts the value of the +``value`` parameter for the program instance given by ``index`` to all of +the running program instances. + +:: + + float broadcast(float value, uniform int index) + int32 broadcast(int32 value, uniform int index) + double broadcast(double value, uniform int index) + int64 broadcast(int64 value, uniform int index) + +The ``rotate()`` function allows each program instance to find the value of +the given value that their neighbor ``offset`` steps away has. For +example, on an 8-wide target, if ``value`` has the value (1, 2, 3, 4, 5, +6, 7, 8) in each of the running program instances, then ``rotate(value, +-1)`` causes the first program instance to get the value 8, the second +program instance to get the value 1, the third 2, and so forth. The +provided offset value can be positive or negative, and may be greater than +``programCount`` (it is masked to ensure valid offsets). + +:: + + float rotate(float value, uniform int offset) + int32 rotate(int32 value, uniform int offset) + double rotate(double value, uniform int offset) + int64 rotate(int64 value, uniform int offset) + + +Finally, ``shuffle()`` allows fully general shuffling of values among the +program instances. Each program instance's value of ``permutation`` gives the +program instance from which to get the value of ``value``. The provided +values for ``permutation`` must all be between 0 and ``programCount-1``. + +:: + + float shuffle(float value, int permutation) + int32 shuffle(int32 value, int permutation) + double shuffle(double value, int permutation) + int64 shuffle(int64 value, int permutation) + The various variants of ``popcnt()`` return the population count--the number of bits set in the given value. @@ -1719,8 +1761,12 @@ given value across all of the currently-executing vector lanes. 
uniform unsigned int reduce_max(unsigned int a, unsigned int b) -Finally, there are routines for writing out and reading in values from -linear memory locations for the active program instances. + +Packed Load and Store Operations +-------------------------------- + +The standard library also offers routines for writing out and reading in +values from linear memory locations for the active program instances. ``packed_load_active()`` loads consecutive values from the given array, starting at ``a[offset]``, loading one value for each currently-executing program instance and storing it into that program instance's ``val`` @@ -2280,21 +2326,11 @@ elements to work with and then proceeds with the computation. Communicating Between SPMD Program Instances -------------------------------------------- -The ``programIndex`` built-in variable (see `Mapping Data To Program -Instances`_) can be used to communicate between the set of executing -program instances. Consider the following code, which shows all of the -program instances writing into unique locations in an array. - -:: - - float x = ...; - uniform float allX[programCount]; - allX[programIndex] = x; - -In this code, a program instance that reads ``allX[0]`` finds the value of -``x`` that was computed by the first of the running program instances, and -so forth. Program instances can communicate with their neighbor instances -with indexing like ``allX[(programIndex+1)%programCount]``. +The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library +routines provide a variety of mechanisms for the running program instances +to communicate values to each other during execution. See the section +`Cross-Program Instance Operations`_ for more information about their +operation. 
Gather and Scatter diff --git a/opt.cpp b/opt.cpp index 69e75247..efda1d2a 100644 --- a/opt.cpp +++ b/opt.cpp @@ -2116,11 +2116,12 @@ CreateLowerGatherScatterPass() { // IsCompileTimeConstantPass /** LLVM IR implementations of target-specific functions may include calls - to a function "bool __is_compile_time_constant_mask(mask type)"; this - allows them to have specialied code paths for where the mask is known - at compile time but not incurring the cost of a MOVMSK call at runtime - to compute its value in cases where the mask value isn't known until - runtime. + to the functions "bool __is_compile_time_constant_mask(mask type)" and + "bool __is_compile_time_constant_int32(i32)"; these allow them to have + specialized code paths for where the corresponding value is known at + compile time. For masks, for example, this allows them to not incur + the cost of a MOVMSK call at runtime to compute its value in cases + where the mask value isn't known until runtime. This pass resolves these calls into either 'true' or 'false' values so that later optimization passes can operate with these as constants. @@ -2148,17 +2149,17 @@ llvm::RegisterPass bool IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) { - llvm::Function *func = m->module->getFunction("__is_compile_time_constant_mask"); - if (!func) - return false; + llvm::Function *maskFunc = m->module->getFunction("__is_compile_time_constant_mask"); + llvm::Function *int32Func = m->module->getFunction("__is_compile_time_constant_int32"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) { - // Iterate through the instructions looking for calls to - // __is_compile_time_constant_mask(). 
+ // Iterate through the instructions looking for calls to the + // __is_compile_time_constant_*() functions llvm::CallInst *callInst = llvm::dyn_cast(&*i); - if (!callInst || callInst->getCalledFunction() != func) + if (!callInst || (callInst->getCalledFunction() != maskFunc && + callInst->getCalledFunction() != int32Func)) continue; // This optimization pass can be disabled with the (poorly named) @@ -2171,8 +2172,8 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) { // Is it a constant? Bingo, turn the call's value into a constant // true value. - llvm::Value *mask = callInst->getArgOperand(0); - if (llvm::isa(mask)) { + llvm::Value *operand = callInst->getArgOperand(0); + if (llvm::isa(operand)) { llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue); modifiedAny = true; goto restart; diff --git a/stdlib.ispc b/stdlib.ispc index 05abfd1a..b9770da4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -81,6 +81,54 @@ static inline uniform unsigned int64 intbits(uniform double d) { return __intbits_uniform_double(d); } +static inline float broadcast(float v, uniform int i) { + return __broadcast_float(v, i); +} + +static inline int32 broadcast(int32 v, uniform int i) { + return __broadcast_int32(v, i); +} + +static inline double broadcast(double v, uniform int i) { + return __broadcast_double(v, i); +} + +static inline int64 broadcast(int64 v, uniform int i) { + return __broadcast_int64(v, i); +} + +static inline float rotate(float v, uniform int i) { + return __rotate_float(v, i); +} + +static inline int32 rotate(int32 v, uniform int i) { + return __rotate_int32(v, i); +} + +static inline double rotate(double v, uniform int i) { + return __rotate_double(v, i); +} + +static inline int64 rotate(int64 v, uniform int i) { + return __rotate_int64(v, i); +} + +static inline float shuffle(float v, int i) { + return __shuffle_float(v, i); +} + +static inline int32 shuffle(int32 v, int i) { + return __shuffle_int32(v, i); +} + +static inline 
double shuffle(double v, int i) { + return __shuffle_double(v, i); +} + +static inline int64 shuffle(int64 v, int i) { + return __shuffle_int64(v, i); +} + // x[i] static inline uniform float extract(float x, uniform int i) { return __extract(x, i); diff --git a/stdlib.m4 b/stdlib.m4 index 500d183c..bc7cfc19 100644 --- a/stdlib.m4 +++ b/stdlib.m4 @@ -34,6 +34,8 @@ ;; builtins for various targets can use macros from this file to simplify ;; generating code for their implementations of those builtins. +declare i1 @__is_compile_time_constant_int32(i32) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -284,6 +286,22 @@ ret <8 x float> %ret ' ) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; forloop macro + +divert(`-1') +# forloop(var, from, to, stmt) - improved version: +# works even if VAR is not a strict macro name +# performs sanity check that FROM is not larger than TO +# allows complex numerical expressions in TO and FROM +define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1', + `pushdef(`$1', eval(`$2'))_$0(`$1', + eval(`$3'), `$4')popdef(`$1')')') +define(`_forloop', + `$3`'ifelse(indir(`$1'), `$2', `', + `define(`$1', incr(indir(`$1')))$0($@)')') +divert`'dnl + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib_core ;; @@ -291,8 +309,67 @@ ret <8 x float> %ret ;; target's vector width, which it takes as its first parameter. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +define(`shuffles', ` +define internal <$1 x $2> @__broadcast_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { + %v = extractelement <$1 x $2> %0, i32 %1 + %r_0 = insertelement <$1 x $2> undef, $2 %v, i32 0 +forloop(i, 1, eval($1-1), ` %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2 %v, i32 i +') + ret <$1 x $2> %r_`'eval($1-1) +} + +define internal <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { + %isc = call i1 @__is_compile_time_constant_int32(i32 %1) + br i1 %isc, label %is_const, label %not_const + +is_const: + ; though verbose, this turns into tight code if %1 is a constant +forloop(i, 0, eval($1-1), ` + %delta_`'i = add i32 %1, i + %delta_clamped_`'i = and i32 %delta_`'i, eval($1-1) + %v_`'i = extractelement <$1 x $2> %0, i32 %delta_clamped_`'i') + + %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 +forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i +') + ret <$1 x $2> %ret_`'eval($1-1) + +not_const: + ; store two instances of the vector into memory + %ptr = alloca <$1 x $2>, i32 2 + %ptr0 = getelementptr <$1 x $2> * %ptr, i32 0 + store <$1 x $2> %0, <$1 x $2> * %ptr0 + %ptr1 = getelementptr <$1 x $2> * %ptr, i32 1 + store <$1 x $2> %0, <$1 x $2> * %ptr1 + + ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector + %offset = and i32 %1, eval($1-1) + %ptr_as_elt_array = bitcast <$1 x $2> * %ptr to [eval(2*$1) x $2] * + %load_ptr = getelementptr [eval(2*$1) x $2] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr_vec = bitcast $2 * %load_ptr to <$1 x $2> * + %result = load <$1 x $2> * %load_ptr_vec, align $4 + ret <$1 x $2> %result +} + +define internal <$1 x $2> @__shuffle_$3(<$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { +forloop(i, 0, eval($1-1), ` + %index_`'i = extractelement <$1 x i32> %1, i32 i') +forloop(i, 0, eval($1-1), ` + %v_`'i = extractelement <$1 x $2> %0, 
i32 %index_`'i') + + %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 +forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i +') + ret <$1 x $2> %ret_`'eval($1-1) +} + +') + + define(`stdlib_core', ` +declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops @@ -307,6 +384,10 @@ define internal <$1 x float> @__insert(<$1 x float>, i32, ret <$1 x float> %insert } +shuffles($1, float, float, 4) +shuffles($1, i32, int32, 4) +shuffles($1, double, double, 8) +shuffles($1, i64, int64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; various bitcasts from one type to another @@ -524,7 +605,6 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32 ;; FIXME: use the per_lane macro, defined below, to implement these! define(`packed_load_and_store', ` -declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) define i32 @__packed_load_active([0 x i32] *, i32 %start_offset, <$1 x i32> * %val_ptr, <$1 x i32> %full_mask) nounwind alwaysinline { @@ -661,19 +741,6 @@ done: ;; Inside this code, any instances of the text "LANE" are replaced ;; with an i32 value that represents the current lane number -divert(`-1') -# forloop(var, from, to, stmt) - improved version: -# works even if VAR is not a strict macro name -# performs sanity check that FROM is larger than TO -# allows complex numerical expressions in TO and FROM -define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1', - `pushdef(`$1', eval(`$2'))_$0(`$1', - eval(`$3'), `$4')popdef(`$1')')') -define(`_forloop', - `$3`'ifelse(indir(`$1'), `$2', `', - `define(`$1', incr(indir(`$1')))$0($@)')') -divert`'dnl - ; num lanes, mask, code block to do per lane define(`per_lane', ` br label %pl_entry diff --git a/tests/broadcast-1.ispc b/tests/broadcast-1.ispc new file mode 100644 index 00000000..7cefce7a --- /dev/null +++ 
b/tests/broadcast-1.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + int br = broadcast(a, (uniform int)b-2); + RET[programIndex] = br; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 4; +} diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc new file mode 100644 index 00000000..e45bbf90 --- /dev/null +++ b/tests/broadcast.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = broadcast(a, 2); + RET[programIndex] = b; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 3; +} diff --git a/tests/rotate-1.ispc b/tests/rotate-1.ispc new file mode 100644 index 00000000..8a06b566 --- /dev/null +++ b/tests/rotate-1.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + int rot = rotate(a, -1); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount; +} diff --git a/tests/rotate-2.ispc b/tests/rotate-2.ispc new file mode 100644 index 00000000..e1ee32cb --- /dev/null +++ b/tests/rotate-2.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + uniform int delta = b - 6; // -1 + int rot = rotate(a, delta); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount; +} diff --git a/tests/rotate-3.ispc b/tests/rotate-3.ispc new file mode 100644 index 00000000..ab50a4d9 --- /dev/null +++ b/tests/rotate-3.ispc @@ -0,0 
+1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int64 a = aFOO[programIndex]; + uniform int delta = b - 6; // -1 + int64 rot = rotate(a, delta); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount; +} diff --git a/tests/rotate-4.ispc b/tests/rotate-4.ispc new file mode 100644 index 00000000..cafd0a3c --- /dev/null +++ b/tests/rotate-4.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int64 a = aFOO[programIndex]; + int64 rot = rotate(a, -1); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount; +} diff --git a/tests/rotate.ispc b/tests/rotate.ispc new file mode 100644 index 00000000..eab7a5f3 --- /dev/null +++ b/tests/rotate.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + int rot = rotate(a, 2); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + (programIndex + 2) % programCount; +} diff --git a/tests/shuffle-1.ispc b/tests/shuffle-1.ispc new file mode 100644 index 00000000..11474886 --- /dev/null +++ b/tests/shuffle-1.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float a = aFOO[programIndex]; + int reverse = programCount - 1 - programIndex; + float shuf = shuffle(a, reverse); + RET[programIndex] = shuf; +} + +export void result(uniform float RET[]) { + RET[programIndex] = programCount - programIndex; +} diff --git a/tests/shuffle-2.ispc b/tests/shuffle-2.ispc new 
file mode 100644 index 00000000..2cf2a91e --- /dev/null +++ b/tests/shuffle-2.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float a = aFOO[programIndex]; + int reverse = programCount - 1 - programIndex + (int)b - 5; + float shuf = shuffle(a, reverse); + RET[programIndex] = shuf; +} + +export void result(uniform float RET[]) { + RET[programIndex] = programCount - programIndex; +} diff --git a/tests/shuffle.ispc b/tests/shuffle.ispc new file mode 100644 index 00000000..a17b1309 --- /dev/null +++ b/tests/shuffle.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int32 a = aFOO[programIndex]; + int32 shuf = shuffle(a, 1); + RET[programIndex] = shuf; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 2; +}