diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index 69b9d381..ddb9f095 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -353,10 +353,10 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(16, i8, 8)
-load_and_broadcast(16, i16, 16)
-load_and_broadcast(16, i32, 32)
-load_and_broadcast(16, i64, 64)
+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)

 ; no masked load instruction for i8 and i16 types??
 masked_load(16, i8, 8, 1)
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index bc8c64a7..856e4453 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -334,10 +334,10 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(8, i8, 8)
-load_and_broadcast(8, i16, 16)
-load_and_broadcast(8, i32, 32)
-load_and_broadcast(8, i64, 64)
+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)

 ; no masked load instruction for i8 and i16 types??
 masked_load(8, i8, 8, 1)
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index 39870c75..13343495 100755
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -21,10 +21,10 @@ gen_masked_store(1, i64, 64)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(1, i8, 8)
-load_and_broadcast(1, i16, 16)
-load_and_broadcast(1, i32, 32)
-load_and_broadcast(1, i64, 64)
+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)

 masked_load(1, i8, 8, 1)
 masked_load(1, i16, 16, 2)
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 6bf90d95..4a7b5c2a 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -230,16 +230,17 @@ declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(WIDTH, i8, 8)
-load_and_broadcast(WIDTH, i16, 16)
-load_and_broadcast(WIDTH, i32, 32)
-load_and_broadcast(WIDTH, i64, 64)
 declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
 declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
 declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
 declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly

+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)
+
 declare void @__masked_store_8(<WIDTH x i8> * nocapture, <WIDTH x i8>,
                                <WIDTH x MASK>) nounwind
 declare void @__masked_store_16(<WIDTH x i16> * nocapture, <WIDTH x i16>,
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index 56145a0f..fb8568fd 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -433,15 +433,15 @@ reduce_equal(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(8, i8, 8)
-load_and_broadcast(8, i16, 16)
-load_and_broadcast(8, i32, 32)
-load_and_broadcast(8, i64, 64)
-
 masked_load(8, i8, 8, 1)
 masked_load(8, i16, 16, 2)
 masked_load(8, i32, 32, 4)
 masked_load(8, i64, 64, 8)

+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index e6eb7390..c1182767 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -560,10 +560,10 @@ gen_masked_store(4, i64, 64)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(4, i8, 8)
-load_and_broadcast(4, i16, 16)
-load_and_broadcast(4, i32, 32)
-load_and_broadcast(4, i64, 64)
+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)

 masked_load(4, i8, 8, 1)
 masked_load(4, i16, 16, 2)
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index 99e66e36..48a14b70 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -360,15 +360,15 @@ reduce_equal(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(8, i8, 8)
-load_and_broadcast(8, i16, 16)
-load_and_broadcast(8, i32, 32)
-load_and_broadcast(8, i64, 64)
-
 masked_load(8, i8, 8, 1)
 masked_load(8, i16, 16, 2)
 masked_load(8, i32, 32, 4)
 masked_load(8, i64, 64, 8)

+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index 98426b24..2bf3104d 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -459,10 +459,10 @@ gen_masked_store(4, i64, 64)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(4, i8, 8)
-load_and_broadcast(4, i16, 16)
-load_and_broadcast(4, i32, 32)
-load_and_broadcast(4, i64, 64)
+load_and_broadcast(i8)
+load_and_broadcast(i16)
+load_and_broadcast(i32)
+load_and_broadcast(i64)

 masked_load(4, i8, 8, 1)
 masked_load(4, i16, 16, 2)
diff --git a/builtins/util.m4 b/builtins/util.m4
index c53851e5..deb2fac8 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1701,13 +1701,13 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
   %ml64 = call <WIDTH x i64> @__masked_load_64(i8 * %ptr, <WIDTH x MASK> %mask)
   call void @__use64(<WIDTH x i64> %ml64)

-  %lb8 = call <WIDTH x i8> @__load_and_broadcast_8(i8 * %ptr, <WIDTH x MASK> %mask)
+  %lb8 = call <WIDTH x i8> @__load_and_broadcast_i8(i8 * %ptr, <WIDTH x MASK> %mask)
   call void @__use8(<WIDTH x i8> %lb8)
-  %lb16 = call <WIDTH x i16> @__load_and_broadcast_16(i8 * %ptr, <WIDTH x MASK> %mask)
+  %lb16 = call <WIDTH x i16> @__load_and_broadcast_i16(i8 * %ptr, <WIDTH x MASK> %mask)
   call void @__use16(<WIDTH x i16> %lb16)
-  %lb32 = call <WIDTH x i32> @__load_and_broadcast_32(i8 * %ptr, <WIDTH x MASK> %mask)
+  %lb32 = call <WIDTH x i32> @__load_and_broadcast_i32(i8 * %ptr, <WIDTH x MASK> %mask)
   call void @__use32(<WIDTH x i32> %lb32)
-  %lb64 = call <WIDTH x i64> @__load_and_broadcast_64(i8 * %ptr, <WIDTH x MASK> %mask)
+  %lb64 = call <WIDTH x i64> @__load_and_broadcast_i64(i8 * %ptr, <WIDTH x MASK> %mask)
   call void @__use64(<WIDTH x i64> %lb64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2489,20 +2489,18 @@ i64minmax(WIDTH,max,uint64,ugt)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Emit code to safely load a scalar value and broadcast it across the
-;; elements of a vector. Parameters:
-;; $1: target vector width
-;; $2: element type for which to emit the function (i32, i64, ...)
-;; $3: suffix for function name (32, 64, ...)
+;; elements of a vector. Parameter:
+;; $1: element type for which to emit the function (i32, i64, ...)

 define(`load_and_broadcast', `
-define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to $2 *
-  %val = load $2 * %ptr
+define <WIDTH x $1> @__load_and_broadcast_$1(i8 *, <WIDTH x MASK> %mask) nounwind alwaysinline {
+  %ptr = bitcast i8 * %0 to $1 *
+  %val = load $1 * %ptr

-  %ret0 = insertelement <$1 x $2> undef, $2 %val, i32 0
-  forloop(i, 1, eval($1-1), `
-  %ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
-  ret <$1 x $2> %ret`'eval($1-1)
+  %ret0 = insertelement <WIDTH x $1> undef, $1 %val, i32 0
+  forloop(i, 1, eval(WIDTH-1), `
+  %ret`'i = insertelement <WIDTH x $1> %ret`'eval(i-1), $1 %val, i32 i')
+  ret <WIDTH x $1> %ret`'eval(WIDTH-1)
 }
 ')
diff --git a/opt.cpp b/opt.cpp
index ce455d6f..b8e1d3f9 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2281,21 +2281,21 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("GSToLoadStorePass");

     GatherImpInfo gInfo[] = {
-        GatherImpInfo("__pseudo_gather_base_offsets32_8", "__load_and_broadcast_8",
+        GatherImpInfo("__pseudo_gather_base_offsets32_8", "__load_and_broadcast_i8",
                       "__masked_load_8", 1),
-        GatherImpInfo("__pseudo_gather_base_offsets32_16", "__load_and_broadcast_16",
+        GatherImpInfo("__pseudo_gather_base_offsets32_16", "__load_and_broadcast_i16",
                       "__masked_load_16", 2),
-        GatherImpInfo("__pseudo_gather_base_offsets32_32", "__load_and_broadcast_32",
+        GatherImpInfo("__pseudo_gather_base_offsets32_32", "__load_and_broadcast_i32",
                       "__masked_load_32", 4),
-        GatherImpInfo("__pseudo_gather_base_offsets32_64", "__load_and_broadcast_64",
+        GatherImpInfo("__pseudo_gather_base_offsets32_64", "__load_and_broadcast_i64",
                       "__masked_load_64", 8),
-        GatherImpInfo("__pseudo_gather_base_offsets64_8", "__load_and_broadcast_8",
+        GatherImpInfo("__pseudo_gather_base_offsets64_8", "__load_and_broadcast_i8",
                       "__masked_load_8", 1),
-        GatherImpInfo("__pseudo_gather_base_offsets64_16", "__load_and_broadcast_16",
+        GatherImpInfo("__pseudo_gather_base_offsets64_16", "__load_and_broadcast_i16",
                       "__masked_load_16", 2),
-        GatherImpInfo("__pseudo_gather_base_offsets64_32", "__load_and_broadcast_32",
+        GatherImpInfo("__pseudo_gather_base_offsets64_32", "__load_and_broadcast_i32",
                       "__masked_load_32", 4),
-        GatherImpInfo("__pseudo_gather_base_offsets64_64", "__load_and_broadcast_64",
+        GatherImpInfo("__pseudo_gather_base_offsets64_64", "__load_and_broadcast_i64",
                       "__masked_load_64", 8)
     };
     ScatterImpInfo sInfo[] = {
@@ -3815,14 +3815,14 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
         "__gather_elt32_i32", "__gather_elt32_i64",
         "__gather_elt64_i8", "__gather_elt64_i16",
         "__gather_elt64_i32", "__gather_elt64_i64",
-        "__load_and_broadcast_8", "__load_and_broadcast_16",
-        "__load_and_broadcast_32", "__load_and_broadcast_64",
         "__masked_load_8", "__masked_load_16",
         "__masked_load_32", "__masked_load_64",
         "__masked_store_8", "__masked_store_16",
         "__masked_store_32", "__masked_store_64",
         "__masked_store_blend_8", "__masked_store_blend_16",
         "__masked_store_blend_32", "__masked_store_blend_64",
+        "__load_and_broadcast_i8", "__load_and_broadcast_i16",
+        "__load_and_broadcast_i32", "__load_and_broadcast_i64",
         "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16",
         "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64",
         "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",