diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index c21ae323..7f737626 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -353,13 +353,6 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
-
 ; no masked load instruction for i8 and i16 types??
 masked_load(i8, 1)
 masked_load(i16, 2)
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index 31e20f0b..3cd76516 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -334,12 +334,6 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
 
 ; no masked load instruction for i8 and i16 types??
 masked_load(i8, 1)
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index 6a98257f..5e82b4f1 100755
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -21,12 +21,6 @@ gen_masked_store(i64)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
 
 masked_load(i8, 1)
 masked_load(i16, 2)
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index ab642c9f..9cedf4e4 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -230,12 +230,6 @@ declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
 
 declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
 declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index 0829f2dd..0260971a 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -433,12 +433,6 @@ reduce_equal(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
 
 masked_load(i8, 1)
 masked_load(i16, 2)
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 4b0804ab..5f40d1eb 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -562,12 +562,6 @@ gen_masked_store(i64)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
 
 masked_load(i8, 1)
 masked_load(i16, 2)
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index eb7e3db8..ef3a7746 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -360,12 +360,6 @@ reduce_equal(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
 
 masked_load(i8, 1)
 masked_load(i16, 2)
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index 2342f182..ee57f6bd 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -461,12 +461,6 @@ masked_store_float_double()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
-load_and_broadcast(i8)
-load_and_broadcast(i16)
-load_and_broadcast(i32)
-load_and_broadcast(float)
-load_and_broadcast(i64)
-load_and_broadcast(double)
 
 masked_load(i8, 1)
 masked_load(i16, 2)
diff --git a/builtins/util.m4 b/builtins/util.m4
index c0401336..0cac6718 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1796,19 +1796,6 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
   %mld = call <WIDTH x double> @__masked_load_double(i8 * %ptr, <WIDTH x MASK> %mask)
   call void @__usedouble(<WIDTH x double> %mld)
 
-  %lb8 = call <WIDTH x i8> @__load_and_broadcast_i8(i8 * %ptr, <WIDTH x MASK> %mask)
-  call void @__use8(<WIDTH x i8> %lb8)
-  %lb16 = call <WIDTH x i16> @__load_and_broadcast_i16(i8 * %ptr, <WIDTH x MASK> %mask)
-  call void @__use16(<WIDTH x i16> %lb16)
-  %lb32 = call <WIDTH x i32> @__load_and_broadcast_i32(i8 * %ptr, <WIDTH x MASK> %mask)
-  call void @__use32(<WIDTH x i32> %lb32)
-  %lbf = call <WIDTH x float> @__load_and_broadcast_float(i8 * %ptr, <WIDTH x MASK> %mask)
-  call void @__usefloat(<WIDTH x float> %lbf)
-  %lb64 = call <WIDTH x i64> @__load_and_broadcast_i64(i8 * %ptr, <WIDTH x MASK> %mask)
-  call void @__use64(<WIDTH x i64> %lb64)
-  %lbd = call <WIDTH x double> @__load_and_broadcast_double(i8 * %ptr, <WIDTH x MASK> %mask)
-  call void @__usedouble(<WIDTH x double> %lbd)
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stores
   %pv8 = bitcast i8 * %ptr to <WIDTH x i8> *
@@ -2680,23 +2667,6 @@ i64minmax(WIDTH,min,uint64,ult)
 i64minmax(WIDTH,max,uint64,ugt)
 ')
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; Emit code to safely load a scalar value and broadcast it across the
-;; elements of a vector.  Parameter:
-;; $1: element type for which to emit the function (i32, i64, ...)
-
-define(`load_and_broadcast', `
-define <WIDTH x $1> @__load_and_broadcast_$1(i8 *, <WIDTH x MASK> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to $1 *
-  %val = load $1 * %ptr
-
-  %ret0 = insertelement <WIDTH x $1> undef, $1 %val, i32 0
-  forloop(i, 1, eval(WIDTH-1), `
-  %ret`'i = insertelement <WIDTH x $1> %ret`'eval(i-1), $1 %val, i32 i')
-  ret <WIDTH x $1> %ret`'eval(WIDTH-1)
-}
-')
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Emit general-purpose code to do a masked load for targets that dont have
 ;; an instruction to do that.  Parameters:
diff --git a/opt.cpp b/opt.cpp
index a082382c..ee431baf 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2266,19 +2266,18 @@ char GSToLoadStorePass::ID = 0;
 
 struct GatherImpInfo {
-    GatherImpInfo(const char *pName, const char *lbName, const char *lmName,
+    GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st,
                   int a)
         : align(a) {
         pseudoFunc = m->module->getFunction(pName);
-        loadBroadcastFunc = m->module->getFunction(lbName);
         loadMaskedFunc = m->module->getFunction(lmName);
-
-        Assert(pseudoFunc != NULL && loadBroadcastFunc != NULL &&
-               loadMaskedFunc != NULL);
+        Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
+        scalarType = st;
     }
+
     llvm::Function *pseudoFunc;
-    llvm::Function *loadBroadcastFunc;
     llvm::Function *loadMaskedFunc;
+    llvm::Type *scalarType;
     const int align;
 };
@@ -2312,30 +2311,30 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("GSToLoadStorePass");
 
     GatherImpInfo gInfo[] = {
-        GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__load_and_broadcast_i8",
-                      "__masked_load_i8", 1),
-        GatherImpInfo("__pseudo_gather_base_offsets32_i16", "__load_and_broadcast_i16",
-                      "__masked_load_i16", 2),
-        GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__load_and_broadcast_i32",
-                      "__masked_load_i32", 4),
-        GatherImpInfo("__pseudo_gather_base_offsets32_float", "__load_and_broadcast_float",
-                      "__masked_load_float", 4),
-        GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__load_and_broadcast_i64",
-                      "__masked_load_i64", 8),
-        GatherImpInfo("__pseudo_gather_base_offsets32_double", "__load_and_broadcast_double",
-                      "__masked_load_double", 8),
-        GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__load_and_broadcast_i8",
-                      "__masked_load_i8", 1),
-        GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__load_and_broadcast_i16",
-                      "__masked_load_i16", 2),
-        GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__load_and_broadcast_i32",
-                      "__masked_load_i32", 4),
-        GatherImpInfo("__pseudo_gather_base_offsets64_float", "__load_and_broadcast_float",
-                      "__masked_load_float", 4),
-        GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__load_and_broadcast_i64",
-                      "__masked_load_i64", 8),
-        GatherImpInfo("__pseudo_gather_base_offsets64_double", "__load_and_broadcast_double",
-                      "__masked_load_double", 8)
+        GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__masked_load_i8",
+                      LLVMTypes::Int8Type, 1),
+        GatherImpInfo("__pseudo_gather_base_offsets32_i16", "__masked_load_i16",
+                      LLVMTypes::Int16Type, 2),
+        GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__masked_load_i32",
+                      LLVMTypes::Int32Type, 4),
+        GatherImpInfo("__pseudo_gather_base_offsets32_float", "__masked_load_float",
+                      LLVMTypes::FloatType, 4),
+        GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__masked_load_i64",
+                      LLVMTypes::Int64Type, 8),
+        GatherImpInfo("__pseudo_gather_base_offsets32_double", "__masked_load_double",
+                      LLVMTypes::DoubleType, 8),
+        GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__masked_load_i8",
+                      LLVMTypes::Int8Type, 1),
+        GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__masked_load_i16",
+                      LLVMTypes::Int16Type, 2),
+        GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__masked_load_i32",
+                      LLVMTypes::Int32Type, 4),
+        GatherImpInfo("__pseudo_gather_base_offsets64_float", "__masked_load_float",
+                      LLVMTypes::FloatType, 4),
+        GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__masked_load_i64",
+                      LLVMTypes::Int64Type, 8),
+        GatherImpInfo("__pseudo_gather_base_offsets64_double", "__masked_load_double",
+                      LLVMTypes::DoubleType, 8)
     };
     ScatterImpInfo sInfo[] = {
         ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8",
@@ -2443,17 +2442,23 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
             if (gatherInfo != NULL) {
                 // A gather with everyone going to the same location is
                 // handled as a scalar load and broadcast across the lanes.
-                // Note that we do still have to pass the mask to the
-                // __load_and_broadcast_* functions, since they shouldn't
-                // access memory if the mask is all off (the location may
-                // be invalid in that case).
                 Debug(pos, "Transformed gather to scalar load and broadcast!");
 
-                llvm::Instruction *newCall =
-                    lCallInst(gatherInfo->loadBroadcastFunc, ptr, mask,
-                              LLVMGetName(callInst, "_broadcast"));
-                lCopyMetadata(newCall, callInst);
-                llvm::ReplaceInstWithInst(callInst, newCall);
+                ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(gatherInfo->scalarType, 0),
+                                            ptr->getName(), callInst);
+                llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
+                llvm::Value *vecValue = llvm::UndefValue::get(callInst->getType());
+                for (int i = 0; i < g->target.vectorWidth; ++i) {
+                    if (i < g->target.vectorWidth - 1)
+                        vecValue = llvm::InsertElementInst::Create(vecValue, scalarValue, LLVMInt32(i),
+                                                                   callInst->getName(), callInst);
+                    else
+                        vecValue = llvm::InsertElementInst::Create(vecValue, scalarValue, LLVMInt32(i),
+                                                                   callInst->getName());
+                }
+                lCopyMetadata(vecValue, callInst);
+                llvm::ReplaceInstWithInst(callInst,
+                                          llvm::dyn_cast<llvm::Instruction>(vecValue));
                 modifiedAny = true;
                 goto restart;
             }
@@ -3894,9 +3899,6 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
         "__gather_elt64_i8", "__gather_elt64_i16",
         "__gather_elt64_i32", "__gather_elt64_i64",
         "__gather_elt64_float", "__gather_elt64_double",
-        "__load_and_broadcast_i8", "__load_and_broadcast_i16",
-        "__load_and_broadcast_i32", "__load_and_broadcast_i64",
-        "__load_and_broadcast_float", "__load_and_broadcast_double",
         "__masked_load_i8", "__masked_load_i16",
         "__masked_load_i32", "__masked_load_i64",
         "__masked_load_float", "__masked_load_double",
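
Note on the replacement (illustrative commentary, not part of the patch): with the __load_and_broadcast_* builtins removed, GSToLoadStorePass now emits the scalar-load-and-broadcast sequence directly at the call site, building one InsertElementInst per lane up to g->target.vectorWidth. The IR it constructs is roughly what the removed m4 macro used to expand to. The sketch below assumes a 4-wide target and a float-typed gather, and the function name is purely hypothetical:

; Sketch only -- assumes WIDTH = 4 and a float element type; this is
; approximately the IR the pass now emits inline in place of a call to
; @__load_and_broadcast_float.
define <4 x float> @load_and_broadcast_example(i8 * %ptr) nounwind alwaysinline {
  %fptr = bitcast i8 * %ptr to float *   ; cast the generic pointer to the scalar type
  %val = load float * %fptr              ; single scalar load
  %r0 = insertelement <4 x float> undef, float %val, i32 0
  %r1 = insertelement <4 x float> %r0, float %val, i32 1
  %r2 = insertelement <4 x float> %r1, float %val, i32 2
  %r3 = insertelement <4 x float> %r2, float %val, i32 3   ; value broadcast to all lanes
  ret <4 x float> %r3
}

Because this chain is now generated per call site by the pass itself, the per-type builtins no longer need to be defined in the target .ll files, referenced in __keep_funcs_live, or listed in MakeInternalFuncsStaticPass, which is what the builtins/ and util.m4 hunks above remove.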