diff --git a/builtins.cpp b/builtins.cpp
index 43f68833..fa2e7328 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -536,6 +536,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__set_system_isa",
         "__sext_uniform_bool",
         "__sext_varying_bool",
+        "__shift_double",
+        "__shift_float",
+        "__shift_i16",
+        "__shift_i32",
+        "__shift_i64",
+        "__shift_i8",
         "__shuffle2_double",
         "__shuffle2_float",
         "__shuffle2_i16",
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 2a5d1b32..92b7a18e 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -80,6 +80,13 @@ declare @__rotate_i32(, i32) nounwind readnone
 declare @__rotate_double(, i32) nounwind readnone
 declare @__rotate_i64(, i32) nounwind readnone
 
+declare @__shift_i8(, i32) nounwind readnone
+declare @__shift_i16(, i32) nounwind readnone
+declare @__shift_float(, i32) nounwind readnone
+declare @__shift_i32(, i32) nounwind readnone
+declare @__shift_double(, i32) nounwind readnone
+declare @__shift_i64(, i32) nounwind readnone
+
 declare @__shuffle_i8(, ) nounwind readnone
 declare @__shuffle2_i8(, ,
                        ) nounwind readnone
diff --git a/builtins/util.m4 b/builtins/util.m4
index 68fa818b..4cb46310 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -797,6 +797,43 @@ not_const:
   ret %result
 }
 
+define @__shift_$1(, i32) nounwind readnone alwaysinline {
+  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
+  %zeropaddedvec = shufflevector %0, zeroinitializer,
+        < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ')i32 eval(2*WIDTH-1) >
+  br i1 %isc, label %is_const, label %not_const
+
+is_const:
+  ; though verbose, this turns into tight code if %1 is a constant
+forloop(i, 0, eval(WIDTH-1), `
+  %delta_`'i = add i32 %1, i
+  %delta_clamped_`'i = and i32 %delta_`'i, eval(2*WIDTH-1)
+  %v_`'i = extractelement %zeropaddedvec, i32 %delta_clamped_`'i')
+  %ret_0 = insertelement zeroinitializer, $1 %v_0, i32 0
+forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i
+')
+  ret %ret_`'eval(WIDTH-1)
+
+not_const:
+  ; store three vectors into memory: zeros, the input vector, zeros
+  %ptr = alloca , i32 3
+  %ptr0 = getelementptr * %ptr, i32 0
+  store zeroinitializer, * %ptr0
+  %ptr1 = getelementptr * %ptr, i32 1
+  store %0, * %ptr1
+  %ptr2 = getelementptr * %ptr, i32 2
+  store zeroinitializer, * %ptr2
+
+  ; bias by WIDTH to skip the leading zero vector; the load stays in bounds for %1 in [-WIDTH, WIDTH]
+  %offset = add i32 %1, WIDTH
+  %ptr_as_elt_array = bitcast * %ptr to [eval(3*WIDTH) x $1] *
+  %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
+  %load_ptr_vec = bitcast $1 * %load_ptr to *
+  %result = load * %load_ptr_vec, align $2
+  ret %result
+}
+
+
 define @__shuffle_$1(, ) nounwind readnone alwaysinline {
 forloop(i, 0, eval(WIDTH-1), `
   %index_`'i = extractelement %1, i32 i')
diff --git a/opt.cpp b/opt.cpp
index 75eae20c..0146e7cf 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -72,6 +72,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -124,6 +125,8 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass();
 
 static llvm::Pass *CreateDebugPass(char * output);
 
+static llvm::Pass *CreateReplaceExtractInsertChainsPass();
+
 #define DEBUG_START_PASS(NAME)                                 \
     if (g->debugPrint &&                                       \
         (getenv("FUNC") == NULL ||                             \
@@ -635,6 +638,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(CreateIsCompileTimeConstantPass(true));
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateInstructionSimplifyPass());
+        optPM.add(CreateReplaceExtractInsertChainsPass());
 
         optPM.add(llvm::createMemCpyOptPass());
         optPM.add(llvm::createSCCPPass());
@@ -4923,3 +4927,139 @@ static llvm::Pass *
 CreatePeepholePass() {
     return new PeepholePass;
 }
+
+///////////////////////////////////////////////////////////////////////////
+// ReplaceExtractInsertChainsPass
+
+/**
+   We occasionally get chains of ExtractElementInsts followed by
+   InsertElementInsts.  Unfortunately, all of these can't be replaced by
+   ShuffleVectorInsts as we don't know that things are constant at the time.
+
+   This Pass will detect such chains, and replace them with ShuffleVectorInsts
+   if all the appropriate values are constant.
+ */
+
+class ReplaceExtractInsertChainsPass : public llvm::BasicBlockPass {
+public:
+    static char ID;
+    ReplaceExtractInsertChainsPass() : BasicBlockPass(ID) {
+    }
+
+    const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+
+};
+
+char ReplaceExtractInsertChainsPass::ID = 0;
+
+#include
+
+/** Given an llvm::Value known to be an integer, return its value as
+    an int64_t.
+*/
+static int64_t
+lGetIntValue(llvm::Value *offset) {
+    llvm::ConstantInt *intOffset = llvm::dyn_cast(offset);
+    Assert(intOffset && (intOffset->getBitWidth() == 32 ||
+                         intOffset->getBitWidth() == 64));
+    return intOffset->getSExtValue();
+}
+
+bool
+ReplaceExtractInsertChainsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("ReplaceExtractInsertChainsPass");
+    bool modifiedAny = false;
+
+    // Initialize our mapping to the first spot in the zero vector
+    int vectorWidth = g->target->getVectorWidth();
+    int shuffleMap[vectorWidth];
+    for (int i = 0; i < vectorWidth; i++) {
+        shuffleMap[i] = vectorWidth;
+    }
+
+    // Hack-y.  16 is likely the upper limit for now.
+    llvm::SmallSet inserts;
+
+    // save the last Insert in the chain
+    llvm::Value * lastInsert = NULL;
+
+    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+        // Iterate through the instructions looking for InsertElementInsts
+        llvm::InsertElementInst *ieInst = llvm::dyn_cast(&*i);
+        if (ieInst == NULL) {
+            // These aren't the instructions you're looking for.
+            continue;
+        }
+
+        llvm::Value * base = ieInst->getOperand(0);
+        if (  (llvm::isa(base))
+           || (llvm::isa(base))
+           || (base == lastInsert)) {
+            // if source for insert scalar is 0 or an EEInst, add insert
+            llvm::Value *scalar = ieInst->getOperand(1);
+            if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast(scalar)) {
+                // We're only going to deal with Inserts into a Constant vector lane
+                if (llvm::isa(eeInst->getOperand(1))) {
+                    inserts.insert(ieInst);
+                    lastInsert = ieInst;
+                }
+            }
+            else if (llvm::ConstantInt *ci = llvm::dyn_cast(scalar)) {
+                if (ci->isZero()) {
+                    inserts.insert(ieInst);
+                    lastInsert = ieInst;
+                }
+            }
+            else {
+                lastInsert = NULL;
+            }
+        }
+    }
+
+    // Look for chains, not insert/shuffle sequences
+    if (inserts.size() > 1) {
+        // The vector from which we're extracting elements
+        llvm::Value * baseVec = NULL;
+        llvm::Value *ee = llvm::cast((*inserts.begin()))->getOperand(1);
+        if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast(ee)) {
+            baseVec = eeInst->getOperand(0);
+        }
+
+        bool sameBase = true;
+        for (llvm::SmallSet::iterator i = inserts.begin(); i != inserts.end(); i++) {
+            llvm::InsertElementInst *ie = llvm::cast(*i);
+            if (llvm::ExtractElementInst *ee = llvm::dyn_cast(ie->getOperand(1))) {
+                if (ee->getOperand(0) != baseVec) {
+                    sameBase = false;
+                    break;
+                }
+                int64_t from = lGetIntValue(ee->getIndexOperand());
+                int64_t to = lGetIntValue(ie->getOperand(2));
+                shuffleMap[to] = from;
+            }
+        }
+        // baseVec is NULL when the first set entry inserts a zero constant,
+        // and lastInsert is NULL when the final insert in the block broke the
+        // chain; building the shuffle would dereference NULL in either case.
+        if (sameBase && baseVec != NULL && lastInsert != NULL) {
+            llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleMap);
+            llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shuffleIdxs->getType());
+            llvm::Value *shuffle = new llvm::ShuffleVectorInst(baseVec, zeroVec, shuffleIdxs, "shiftInZero", llvm::cast(lastInsert));
+            // For now, be lazy and let DCE clean up the Extracts/Inserts.
+            lastInsert->replaceAllUsesWith(shuffle);
+
+            modifiedAny = true;
+        }
+    }
+
+    DEBUG_END_PASS("ReplaceExtractInsertChainsPass");
+
+    return modifiedAny;
+}
+
+
+static llvm::Pass *
+CreateReplaceExtractInsertChainsPass() {
+    return new ReplaceExtractInsertChainsPass();
+}
diff --git a/stdlib.ispc b/stdlib.ispc
index 9b02d0ba..248f664a 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -170,6 +170,36 @@ static inline int64 rotate(int64 v, uniform int i) {
     return __rotate_i64(v, i);
 }
 
+__declspec(safe)
+static inline float shift(float v, uniform int i) {
+    return __shift_float(v, i);
+}
+
+__declspec(safe)
+static inline int8 shift(int8 v, uniform int i) {
+    return __shift_i8(v, i);
+}
+
+__declspec(safe)
+static inline int16 shift(int16 v, uniform int i) {
+    return __shift_i16(v, i);
+}
+
+__declspec(safe)
+static inline int32 shift(int32 v, uniform int i) {
+    return __shift_i32(v, i);
+}
+
+__declspec(safe)
+static inline double shift(double v, uniform int i) {
+    return __shift_double(v, i);
+}
+
+__declspec(safe)
+static inline int64 shift(int64 v, uniform int i) {
+    return __shift_i64(v, i);
+}
+
 __declspec(safe)
 static inline float shuffle(float v, int i) {
     return __shuffle_float(v, i);