Added shuffle() variant to the standard library that takes two

varying values and a permutation index that spans the concatenation
of the two of them (along the lines of SHUFPS...)
This commit is contained in:
Matt Pharr
2011-07-02 08:39:19 +01:00
parent a9540b7c18
commit fe7717ab67
11 changed files with 170 additions and 41 deletions

View File

@@ -176,30 +176,6 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
}
}
/** Declare the function symbol 'bool __is_compile_time_constant_mask(mask type)'.
    This function will never be defined; it's just a placeholder
    that will be handled during the optimization process.  See the
    discussion of the implementation of IsCompileTimeConstantPass for
    more details.

    @param module  Module that receives the external declaration.
*/
static void
lDeclareCompileTimeConstant(llvm::Module *module) {
    // Signature: BoolType (MaskType), not variadic.
    std::vector<const llvm::Type *> argTypes;
    argTypes.push_back(LLVMTypes::MaskType);
    llvm::FunctionType *fType =
        llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false);

    // External linkage and no body: only the declaration is emitted here.
    llvm::Function *func =
        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                               "__is_compile_time_constant_mask", module);
    func->setOnlyReadsMemory(true);
    func->setDoesNotThrow(true);
}
/** Declare the 'pseudo-gather' functions. When the ispc front-end needs
to perform a gather, it generates a call to one of these functions,
which have signatures:
@@ -583,7 +559,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// Declare various placeholder functions that the optimizer will later
// find and replace with something more useful.
lDeclareCompileTimeConstant(module);
lDeclarePseudoGathers(module);
lDeclarePseudoScatters(module);
lDeclarePseudoMaskedStore(module);

View File

@@ -6,6 +6,12 @@ initialize their members; they now must be initialized with initializer
lists in braces (or initialized after of the initializer with a loop over
array elements, etc.)
Added another shuffle() function to the standard library:
"<type> shuffle(<type> v0, <type> v1, int permute)", where the permutation
vector indexes over the concatenation of the two vectors (e.g. the value
0 corresponds to the first element of v0, the value 2*programCount-1
corresponds to the last element of v1, etc.)
=== v1.0.2 === (1 July 2011)
Floating-point hexadecimal constants are now parsed correctly on Windows

View File

@@ -1704,10 +1704,11 @@ provided offset value can be positive or negative, and may be greater than
int64 rotate(int64 value, uniform int offset)
Finally, ``shuffle()`` allows fully general shuffling of values among the
program instances. Each program instance's value of permutation gives the
program instance from which to get the value of ``value``. The provided
values for ``permutation`` must all be between 0 and ``programCount-1``.
Finally, the ``shuffle()`` functions allow two variants of fully general
shuffling of values among the program instances. For the first version,
each program instance's value of permutation gives the program instance
from which to get the value of ``value``. The provided values for
``permutation`` must all be between 0 and ``programCount-1``.
::
@@ -1716,6 +1717,20 @@ values for ``permutation`` must all be between 0 and ``programCount-1``.
double shuffle(double value, int permutation)
int64 shuffle(int64 value, int permutation)
The second variant of ``shuffle()`` permutes over the extended vector that
is the concatenation of the two provided values. In other words, a value
of 0 in an element of ``permutation`` corresponds to the first element of
``value0``, the value ``2*programCount-1`` corresponds to the last element
of ``value1``, etc.
::
float shuffle(float value0, float value1, int permutation)
int32 shuffle(int32 value0, int32 value1, int permutation)
double shuffle(double value0, double value1, int permutation)
int64 shuffle(int64 value0, int64 value1, int permutation)
The various variants of ``popcnt()`` return the population count--the
number of bits set in the given value.

31
opt.cpp
View File

@@ -2116,12 +2116,11 @@ CreateLowerGatherScatterPass() {
// IsCompileTimeConstantPass
/** LLVM IR implementations of target-specific functions may include calls
to the functions "bool __is_compile_time_constant_mask(mask type)" and
"bool __is_compile_time_constant_int32(i32)"; these allow them to have
specialized code paths for where the corresponding value is known at
compile time. For masks, for example, this allows them to not incur
the cost of a MOVMSK call at runtime to compute its value in cases
where the mask value isn't known until runtime.
to the functions "bool __is_compile_time_constant_*(...)"; these allow
them to have specialized code paths for where the corresponding value is
known at compile time. For masks, for example, this allows them to not
incur the cost of a MOVMSK call at runtime to compute its value in
cases where the mask value isn't known until runtime.
This pass resolves these calls into either 'true' or 'false' values so
that later optimization passes can operate with these as constants.
@@ -2149,8 +2148,11 @@ llvm::RegisterPass<IsCompileTimeConstantPass>
bool
IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Function *maskFunc = m->module->getFunction("__is_compile_time_constant_mask");
llvm::Function *int32Func = m->module->getFunction("__is_compile_time_constant_int32");
llvm::Function *funcs[] = {
m->module->getFunction("__is_compile_time_constant_mask"),
m->module->getFunction("__is_compile_time_constant_uniform_int32"),
m->module->getFunction("__is_compile_time_constant_varying_int32")
};
bool modifiedAny = false;
restart:
@@ -2158,8 +2160,17 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// Iterate through the instructions looking for calls to the
// __is_compile_time_constant_*() functions
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
if (!callInst || (callInst->getCalledFunction() != maskFunc &&
callInst->getCalledFunction() != int32Func))
if (callInst == NULL)
continue;
int j;
int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
for (j = 0; j < nFuncs; ++j) {
if (callInst->getCalledFunction() == funcs[j])
break;
}
if (j == nFuncs)
// not a __is_compile_time_constant_* function
continue;
// This optimization pass can be disabled with the (poorly named)

View File

@@ -129,6 +129,22 @@ static inline int64 shuffle(int64 v, int i) {
return __shuffle_int64(v, i);
}
// Two-source shuffle: the permutation value i indexes over the
// concatenation of v0 and v1, so 0 selects the first element of v0 and
// 2*programCount-1 selects the last element of v1.  Each overload simply
// forwards to the __shuffle2_* builtin for its element type.
static inline float shuffle(float v0, float v1, int i) {
return __shuffle2_float(v0, v1, i);
}
// int32 overload of the two-source shuffle.
static inline int32 shuffle(int32 v0, int32 v1, int i) {
return __shuffle2_int32(v0, v1, i);
}
// double overload of the two-source shuffle.
static inline double shuffle(double v0, double v1, int i) {
return __shuffle2_double(v0, v1, i);
}
// int64 overload of the two-source shuffle.
static inline int64 shuffle(int64 v0, int64 v1, int i) {
return __shuffle2_int64(v0, v1, i);
}
// x[i]
static inline uniform float extract(float x, uniform int i) {
return __extract(x, i);

View File

@@ -34,7 +34,7 @@
;; builtins for various targets can use macros from this file to simplify
;; generating code for their implementations of those builtins.
declare i1 @__is_compile_time_constant_int32(i32)
declare i1 @__is_compile_time_constant_uniform_int32(i32)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -319,7 +319,7 @@ forloop(i, 1, eval($1-1), ` %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2
}
define internal <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline {
%isc = call i1 @__is_compile_time_constant_int32(i32 %1)
%isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
br i1 %isc, label %is_const, label %not_const
is_const:
@@ -363,12 +363,53 @@ forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1)
ret <$1 x $2> %ret_`'eval($1-1)
}
;; Two-vector shuffle.  Operands 0 and 1 are the source vectors and operand 2
;; holds one i32 lane number per output element.  Lane numbers address the
;; double-width concatenation of the two sources.
define internal <$1 x $2> @__shuffle2_$3(<$1 x $2>, <$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline {
; concatenate both sources into a single double-width vector
%v2 = shufflevector <$1 x $2> %0, <$1 x $2> %1, <eval(2*$1) x i32> <
forloop(i, 0, eval(2*$1-2), `i32 i, ') i32 eval(2*$1-1)
>
; pull each requested lane number out of the permutation vector
forloop(i, 0, eval($1-1), `
%index_`'i = extractelement <$1 x i32> %2, i32 i')
; placeholder call that the optimizer later resolves to true or false
%isc = call i1 @__is_compile_time_constant_varying_int32(<$1 x i32> %2)
br i1 %isc, label %is_const, label %not_const
is_const:
; extract from the requested lanes and insert into the result; LLVM turns
; this into good code in the end
forloop(i, 0, eval($1-1), `
%v_`'i = extractelement <eval(2*$1) x $2> %v2, i32 %index_`'i')
%ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0
forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i
')
ret <$1 x $2> %ret_`'eval($1-1)
not_const:
; otherwise store the two vectors onto the stack and then use the given
; permutation vector to get indices into that array...
%ptr = alloca <eval(2*$1) x $2>
store <eval(2*$1) x $2> %v2, <eval(2*$1) x $2> * %ptr
; view the stored vector as a flat array of scalars for per-lane loads
%baseptr = bitcast <eval(2*$1) x $2> * %ptr to $2 *
%ptr_0 = getelementptr $2 * %baseptr, i32 %index_0
%val_0 = load $2 * %ptr_0
%result_0 = insertelement <$1 x $2> undef, $2 %val_0, i32 0
forloop(i, 1, eval($1-1), `
%ptr_`'i = getelementptr $2 * %baseptr, i32 %index_`'i
%val_`'i = load $2 * %ptr_`'i
%result_`'i = insertelement <$1 x $2> %result_`'eval(i-1), $2 %val_`'i, i32 i
')
ret <$1 x $2> %result_`'eval($1-1)
}
')
define(`stdlib_core', `
declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector ops

13
tests/shuffle2-1.ispc Normal file
View File

@@ -0,0 +1,13 @@
// Two-source shuffle test with a permutation value that is the same in
// every program instance.
export uniform int width() { return programCount; }
// Every instance requests element programCount+1 of the concatenation of
// aa and bb, i.e. bb[1] == aa[1] + programCount.
// NOTE(review): the expected value below implies aFOO[1] == 2, presumably
// because the harness fills aFOO[i] with i + 1 -- confirm.
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int32 aa = aFOO[programIndex];
int32 bb = aa + programCount;
int32 shuf = shuffle(aa, bb, programCount + 1);
RET[programIndex] = shuf;
}
// Expected output: the same value, 2 + programCount, in every instance.
export void result(uniform float RET[]) {
RET[programIndex] = 2 + programCount;
}

13
tests/shuffle2-2.ispc Normal file
View File

@@ -0,0 +1,13 @@
// Two-source shuffle test with a per-instance permutation value.
export uniform int width() { return programCount; }
// Instance i requests element i + 2 of the concatenation of aa and bb, so
// the last two instances read from bb rather than aa.
// NOTE(review): the expected value below assumes aFOO[i] == i + 1, so that
// the concatenation holds k + 1 at position k -- confirm against harness.
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int32 aa = aFOO[programIndex];
int32 bb = aa + programCount;
int32 shuf = shuffle(aa, bb, programIndex + 2);
RET[programIndex] = shuf;
}
// Expected output: 3 + programIndex in each instance.
export void result(uniform float RET[]) {
RET[programIndex] = 3 + programIndex;
}

13
tests/shuffle2-3.ispc Normal file
View File

@@ -0,0 +1,13 @@
// Two-source shuffle test on floats whose permutation value depends on the
// runtime argument b.
export uniform int width() { return programCount; }
// The permutation value is programIndex + 2 + (int)b - 5.
// NOTE(review): the expected value below matches a permutation of
// programIndex + 2, which presumably means the harness passes b == 5 and
// aFOO[i] == i + 1 -- confirm.
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float aa = aFOO[programIndex];
float bb = aa + programCount;
float shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
RET[programIndex] = shuf;
}
// Expected output: 3 + programIndex in each instance.
export void result(uniform float RET[]) {
RET[programIndex] = 3 + programIndex;
}

13
tests/shuffle2-4.ispc Normal file
View File

@@ -0,0 +1,13 @@
// Two-source shuffle test on doubles whose permutation value depends on the
// runtime argument b.
export uniform int width() { return programCount; }
// The permutation value is 2*programIndex + (int)b - 5, so the requested
// elements stride across both aa and bb.
// NOTE(review): the expected value below matches a permutation of
// 2*programIndex, which presumably means the harness passes b == 5 and
// aFOO[i] == i + 1 -- confirm.
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
double aa = aFOO[programIndex];
double bb = aa + programCount;
double shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);
RET[programIndex] = shuf;
}
// Expected output: 1 + 2*programIndex in each instance.
export void result(uniform float RET[]) {
RET[programIndex] = 1 + 2*programIndex;
}

13
tests/shuffle2-5.ispc Normal file
View File

@@ -0,0 +1,13 @@
// Two-source shuffle test on doubles with a permutation value that depends
// only on programIndex.
export uniform int width() { return programCount; }
// Instance i requests element 2*i of the concatenation of aa and bb, so the
// upper half of the instances read from bb.
// NOTE(review): the expected value below assumes aFOO[i] == i + 1, so that
// the concatenation holds k + 1 at position k -- confirm against harness.
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
double aa = aFOO[programIndex];
double bb = aa + programCount;
double shuf = shuffle(aa, bb, 2*programIndex);
RET[programIndex] = shuf;
}
// Expected output: 1 + 2*programIndex in each instance.
export void result(uniform float RET[]) {
RET[programIndex] = 1 + 2*programIndex;
}