diff --git a/builtins.m4 b/builtins.m4
index 268825e1..f83bdbff 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -883,21 +883,22 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x
 ; varying int32 __pseudo_gather(varying int32 *, mask)
 ; varying int64 __pseudo_gather(varying int64 *, mask)
 ;
-; These functions are never actually implemented; the
-; GatherScatterFlattenOpt optimization pass finds them and then converts
-; them to make calls to the following functions, which represent gathers
-; from a common base pointer with offsets.  This approach allows the
-; front-end to be relatively simple in how it emits address calculation
-; for gathers.
+; The GatherScatterFlattenOpt optimization pass finds these calls and then
+; converts them to make calls to the following functions (when appropriate);
+; these represent gathers from a common base pointer with offsets.  The
+; offset_scale factor scales the offsets before they are added to the base
+; pointer--it should have the value 1, 2, 4, or 8.  (It can always just be 1.)
+; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8
+; scaling available in x86 addressing calculations.
 ;
-; varying int8  __pseudo_gather_base_offsets_8(uniform int8 *base,
-;                      int32 offsets, mask)
-; varying int16 __pseudo_gather_base_offsets_16(uniform int16 *base,
-;                      int32 offsets, mask)
-; varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base,
-;                      int32 offsets, mask)
-; varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base,
-;                      int64 offsets, mask)
+; varying int8  __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
+; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
+; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
+; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
+;                      int{32,64} offsets, int32 offset_scale, mask)
 ;
 ; Then, the GSImprovementsPass optimization finds these and either
 ; converts them to native gather functions or converts them to vector
@@ -913,15 +914,23 @@ declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readon
 declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly
 declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly

-declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, i32,
+                                                    <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, i32,
+                                                      <$1 x i32>) nounwind readonly

-declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, i32,
+                                                    <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, i32,
+                                                      <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32,
+                                                      <$1 x i32>) nounwind readonly

 ; Similarly to the pseudo-gathers defined above, we also declare undefined
 ; pseudo-scatter instructions with signatures:
@@ -934,14 +943,14 @@ declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i3
 ; The GatherScatterFlattenOpt optimization pass also finds these and
 ; transforms them to scatters like:
 ;
-; void __pseudo_scatter_base_offsets_8(uniform int8 *base,
-;             varying int32 offsets, varying int8 values, mask)
-; void __pseudo_scatter_base_offsets_16(uniform int16 *base,
-;             varying int32 offsets, varying int16 values, mask)
-; void __pseudo_scatter_base_offsets_32(uniform int32 *base,
-;             varying int32 offsets, varying int32 values, mask)
-; void __pseudo_scatter_base_offsets_64(uniform int64 *base,
-;             varying int32 offsets, varying int64 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int8 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int16 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int32 values, mask)
+; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
+;             varying int{32,64} offsets, int32 offset_scale, varying int64 values, mask)
 ;
 ; And the GSImprovementsPass in turn converts these to actual native
 ; scatters or masked stores.
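As a rough sketch of how the new signatures are meant to be used (illustrative only, not part of the patch; it assumes a 4-wide target, and %base, %index, %mask, and %v are made-up value names), a gather of int32 values from base + 4*index would be expected to end up as a call along these lines once GatherScatterFlattenOpt has peeled the 4x element scaling out of the offsets:

    %v = call <4 x i32> @__pseudo_gather_base_offsets32_32(i8 * %base, <4 x i32> %index,
                                                           i32 4, <4 x i32> %mask)

GSImprovementsPass then consumes calls of this form, using the plain i32 offset_scale operand when it rebuilds the addresses (see the opt.cpp changes below).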
@@ -956,22 +965,22 @@ declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind

-declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>, i32,
                                                 <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>, i32,
                                                  <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>, i32,
                                                  <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>, i32,
                                                  <$1 x i64>, <$1 x i32>) nounwind

-declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>, i32,
                                                 <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>, i32,
                                                  <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>, i32,
                                                  <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>,
+declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>, i32,
                                                  <$1 x i64>, <$1 x i32>) nounwind

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2732,34 +2741,43 @@ pl_done:
 define(`gen_gather', `
 ;; Define the utility function to do the gather operation for a single element
 ;; of the type
-define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret,
-                                    i32 %lane) nounwind readonly alwaysinline {
+define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
   ; compute address for this one from the base
   %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
-  %ptroffset = getelementptr i8 * %ptr, i32 %offset32
-  %ptrcast = bitcast i8 * %ptroffset to $2 *
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset64 = sext i32 %offset32 to i64
+  %scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset

   ; load value and insert into returned value
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
   %val = load $2 *%ptrcast
   %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
   ret <$1 x $2> %updatedret
 }

-define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, <$1 x $2> %ret,
-                                    i32 %lane) nounwind readonly alwaysinline {
+define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
   ; compute address for this one from the base
-  %offset32 = extractelement <$1 x i64> %offsets, i32 %lane
-  %ptroffset = getelementptr i8 * %ptr, i64 %offset32
-  %ptrcast = bitcast i8 * %ptroffset to $2 *
+  %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset_scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %offset_scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset

   ; load value and insert into returned value
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
   %val = load $2 *%ptrcast
   %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
   ret <$1 x $2> %updatedret
 }

-define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
+define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
                                              <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
@@ -2773,15 +2791,15 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
   %newOffsets = load <$1 x i32> * %offsetsPtr

   %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
-                                            <$1 x $2> undef, i32 0)
+                                            i32 %offset_scale, <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1),
          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
-                                <$1 x i32> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
+                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
   ret <$1 x $2> %ret`'eval($1-1)
 }

-define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
+define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
                                              <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
@@ -2795,10 +2813,10 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
   %newOffsets = load <$1 x i64> * %offsetsPtr

   %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
-                                            <$1 x $2> undef, i32 0)
+                                            i32 %offset_scale, <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1),
          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
-                                <$1 x i64> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
+                                <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
   ret <$1 x $2> %ret`'eval($1-1)
 }
@@ -2848,42 +2866,52 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
 define(`gen_scatter', `
 ;; Define the function that describes the work to do to scatter a single
 ;; value
-define void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
-                                i32 %lane) nounwind alwaysinline {
+define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
+                                <$1 x $2> %values, i32 %lane) nounwind alwaysinline {
   %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
-  %offset64 = zext i32 %offset32 to i64
-  %ptrdelta = add i64 %ptr64, %offset64
-  %ptr = inttoptr i64 %ptrdelta to $2 *
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %offset64 = sext i32 %offset32 to i64
+  %scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset
+
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
   %storeval = extractelement <$1 x $2> %values, i32 %lane
-  store $2 %storeval, $2 * %ptr
+  store $2 %storeval, $2 * %ptrcast
   ret void
 }

-define void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values,
-                                i32 %lane) nounwind alwaysinline {
+define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
+                                <$1 x $2> %values, i32 %lane) nounwind alwaysinline {
   %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
-  %ptrdelta = add i64 %ptr64, %offset64
-  %ptr = inttoptr i64 %ptrdelta to $2 *
+  ; the order and details of the next 4 lines are important--they match LLVM's
+  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
+  %scale64 = sext i32 %offset_scale to i64
+  %offset = mul i64 %offset64, %scale64
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset
+  %ptrcast = bitcast i8 * %ptroffset to $2 *
+
   %storeval = extractelement <$1 x $2> %values, i32 %lane
-  store $2 %storeval, $2 * %ptr
+  store $2 %storeval, $2 * %ptrcast
   ret void
 }

-define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
-                                         <$1 x i32> %mask) nounwind alwaysinline {
+define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
+                                         <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
   ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  %ptr64 = ptrtoint i8 * %base to i64
   per_lane($1, <$1 x i32> %mask, `
-      call void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
+      call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %values, i32 LANE)')
   ret void
 }

-define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, <$1 x $2> %values,
-                                         <$1 x i32> %mask) nounwind alwaysinline {
+define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
+                                         <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
   ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  %ptr64 = ptrtoint i8 * %base to i64
   per_lane($1, <$1 x i32> %mask, `
-      call void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values, i32 LANE)')
+      call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
+                                    <$1 x $2> %values, i32 LANE)')
   ret void
 }

diff --git a/opt.cpp b/opt.cpp
index 25c950cb..c77a76f7 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -979,6 +979,81 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets) {
 }

+/* Returns true if the given value is a constant vector of integers with
+   the value 2, 4, or 8 in all of the elements.  (Returns the splatted value
+   in *splat, if so). */
+static bool
+lIs248Splat(llvm::Value *v, int *splat) {
+    llvm::ConstantVector *cvec = llvm::dyn_cast<llvm::ConstantVector>(v);
+    if (cvec == NULL)
+        return false;
+
+    llvm::ConstantInt *ci =
+        llvm::dyn_cast<llvm::ConstantInt>(cvec->getSplatValue());
+    if (ci == NULL)
+        return false;
+
+    int64_t splatVal = ci->getSExtValue();
+    if (splatVal != 2 && splatVal != 4 && splatVal != 8)
+        return false;
+
+    *splat = (int)splatVal;
+    return true;
+}
+
+
+/** Given a vector of integer offsets to a base pointer being used for a
+    gather or a scatter, see if its root operation is a multiply of some
+    value by a vector of all 2s/4s/8s.  If not, return NULL.
+
+    If it is, return an i32 value of 2, 4, or 8 from the function and modify
+    *vec so that it points to the operand that is being multiplied by
+    2/4/8.
+
+    We go through all this trouble so that we can pass the i32 scale factor
+    to the {gather,scatter}_base_offsets function as a separate scale
+    factor for the offsets.  This in turn is used so that the LLVM x86 code
+    generator matches it to apply x86's free scale by 2x, 4x, or 8x to one
+    of two registers being added together for an addressing calculation.
+ */
+static llvm::Value *
+lExtractOffsetVector248Scale(llvm::Value **vec) {
+    llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
+    if (sext != NULL) {
+        llvm::Value *sextOp = sext->getOperand(0);
+        // Check the sext target.
+        llvm::Value *scale = lExtractOffsetVector248Scale(&sextOp);
+        if (scale == NULL)
+            return NULL;
+
+        // make a new sext instruction so that we end up with the right
+        // type
+        *vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
+        return scale;
+    }
+
+    // If we don't have a multiply, then just return a scale of 1
+    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
+    if (bop == NULL || bop->getOpcode() != llvm::Instruction::Mul)
+        return LLVMInt32(1);
+
+    // Check each operand for being one of the scale factors we care about.
+    llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
+    int splat;
+    if (lIs248Splat(op0, &splat)) {
+        *vec = op1;
+        return LLVMInt32(splat);
+    }
+    else if (lIs248Splat(op1, &splat)) {
+        *vec = op0;
+        return LLVMInt32(splat);
+    }
+    else
+        return LLVMInt32(1);
+}
+
+
 struct GSInfo {
     GSInfo(const char *pgFuncName, const char *pgboFuncName,
            const char *pgbo32FuncName, bool ig)
@@ -1067,6 +1142,8 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             // to the next instruction...
             continue;

+        llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
+
         // Cast the base pointer to a void *, since that's what the
         // __pseudo_*_base_offsets_* functions want.
         basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType,
@@ -1100,37 +1177,38 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             // llvm::Instruction to llvm::CallInst::Create; this means that
             // the instruction isn't inserted into a basic block and that
             // way we can then call ReplaceInstWithInst().
-            llvm::Value *newArgs[3] = { basePtr, offsetVector, mask };
+            llvm::Value *newArgs[4] = { basePtr, offsetVector, offsetScale, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[3]);
+            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
             llvm::Instruction *newCall =
                 llvm::CallInst::Create(gatherScatterFunc, newArgArray, "newgather",
                                        (llvm::Instruction *)NULL);
 #else
             llvm::Instruction *newCall =
-                llvm::CallInst::Create(gatherScatterFunc, &newArgs[0], &newArgs[3],
+                llvm::CallInst::Create(gatherScatterFunc, &newArgs[0], &newArgs[4],
                                        "newgather");
 #endif
             lCopyMetadata(newCall, callInst);
             llvm::ReplaceInstWithInst(callInst, newCall);
         }
         else {
+            llvm::Value *storeValue = callInst->getArgOperand(1);
             llvm::Value *mask = callInst->getArgOperand(2);
-            llvm::Value *rvalue = callInst->getArgOperand(1);

             // Generate a new function call to the next pseudo scatter
             // base+offsets instruction.  See above for why passing NULL
             // for the Instruction * is intended.
-            llvm::Value *newArgs[4] = { basePtr, offsetVector, rvalue, mask };
+            llvm::Value *newArgs[5] = { basePtr, offsetVector, offsetScale,
+                                        storeValue, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
+            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[5]);
             llvm::Instruction *newCall =
                 llvm::CallInst::Create(gatherScatterFunc, newArgArray, "",
                                        (llvm::Instruction *)NULL);
 #else
             llvm::Instruction *newCall =
                 llvm::CallInst::Create(gatherScatterFunc, &newArgs[0],
-                                       &newArgs[4]);
+                                       &newArgs[5]);
 #endif
             lCopyMetadata(newCall, callInst);
             llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1893,7 +1971,20 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         llvm::Value *base = callInst->getArgOperand(0);
         llvm::Value *offsets = callInst->getArgOperand(1);
-        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
+        llvm::Value *offsetScale = callInst->getArgOperand(2);
+        llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(3) : NULL;
+        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
+
+        llvm::ConstantInt *offsetScaleInt =
+            llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
+        assert(offsetScaleInt != NULL);
+
+        if (offsets->getType() == LLVMTypes::Int64VectorType)
+            // offsetScale is an i32, so sext it so that if we use it in a
+            // multiply below, it has the same type as the i64 offset used
+            // as the other operand...
+            offsetScale = new llvm::SExtInst(offsetScale, LLVMTypes::Int64Type,
+                                             "offset_sext", callInst);

         {
         std::vector<llvm::PHINode *> seenPhis;
@@ -1901,10 +1992,18 @@
             // If all the offsets are equal, then compute the single
             // pointer they all represent based on the first one of them
             // (arbitrarily).
+
+            // FIXME: the code from here to where ptr is computed is highly
+            // redundant with the case for a vector linear below.
+
             llvm::Value *firstOffset =
                 llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
                                                  callInst);
-            llvm::Value *indices[1] = { firstOffset };
+            llvm::Value *scaledOffset =
+                llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
+                                             offsetScale, "scaled_offset", callInst);
+
+            llvm::Value *indices[1] = { scaledOffset };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
             llvm::Value *ptr =
@@ -1945,9 +2044,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
                 Warning(pos, "Undefined behavior: all program instances are "
                         "writing to the same location!");

-                llvm::Value *rvalue = callInst->getArgOperand(2);
                 llvm::Value *first =
                     llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first",
                                                      callInst);
                 lCopyMetadata(first, callInst);
                 ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
                                             "ptr2rvalue_type", callInst);
@@ -1965,8 +2063,11 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
         }

         int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
+        step /= (int)offsetScaleInt->getZExtValue();
+
         std::vector<llvm::PHINode *> seenPhis;
-        if (lVectorIsLinear(offsets, g->target.vectorWidth, step, seenPhis)) {
+        if (step > 0 && lVectorIsLinear(offsets, g->target.vectorWidth,
+                                        step, seenPhis)) {
             // We have a linear sequence of memory locations being accessed
             // starting with the location given by the offset from
             // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
@@ -1976,7 +2077,11 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
             llvm::Value *firstOffset =
                 llvm::ExtractElementInst::Create(offsets, LLVMInt32(0), "first_offset",
                                                  callInst);
-            llvm::Value *indices[1] = { firstOffset };
+            llvm::Value *scaledOffset =
+                llvm::BinaryOperator::Create(llvm::Instruction::Mul, firstOffset,
+                                             offsetScale, "scaled_offset", callInst);
+
+            llvm::Value *indices[1] = { scaledOffset };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
             llvm::Value *ptr =
@@ -2006,11 +2111,10 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb)
         }
         else {
             Debug(pos, "Transformed scatter to unaligned vector store!");
-            llvm::Value *rvalue = callInst->getArgOperand(2);

             ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
                                         callInst);
-            llvm::Value *args[3] = { ptr, rvalue, mask };
+            llvm::Value *args[3] = { ptr, storeValue, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
             llvm::Instruction *newCall =
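To make the opt.cpp side concrete, here is the kind of IR shape lExtractOffsetVector248Scale() is aimed at (illustrative only; the 4-wide vectors and the %index, %offsets, %base, %mask, and %v names are assumptions, not taken from the patch). When GatherScatterFlattenOpt has computed an offset vector whose root operation is a multiply by a splat of 4, for example

    %offsets = mul <4 x i32> %index, <i32 4, i32 4, i32 4, i32 4>

the helper hands back an i32 constant 4 and repoints the offset operand at %index, so the flattened call that gets built looks like

    %v = call <4 x i32> @__pseudo_gather_base_offsets32_32(i8 * %base, <4 x i32> %index,
                                                           i32 4, <4 x i32> %mask)

rather than a call that passes the multiplied offsets with an offset_scale of 1. (A sext wrapped around such a multiply is looked through, and a new sext of the peeled operand is created in its place.) The sext/mul/getelementptr sequence emitted in the __gather_elt32_* and __scatter_elt32_* helpers then recomputes base + 4*index in the form that, per the comments above, the x86 code generator can fold into its free 2x/4x/8x scaled addressing.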