From 29859e81bae957486b21841bd70fa64113e1861e Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 5 Mar 2015 12:51:28 +0300 Subject: [PATCH] New LLVM IR for getelementptr instruction --- builtins/target-avx-x2.ll | 16 +-- builtins/target-avx.ll | 4 +- builtins/util-nvptx.m4 | 38 +++---- builtins/util.m4 | 213 ++++++++++++++++++++------------------ 4 files changed, 141 insertions(+), 130 deletions(-) diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 5bdc547c..d5d608f8 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -427,7 +427,7 @@ define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinli %val0 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask0) %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef, <8 x i32> - %ptr1 = getelementptr i8 * %0, i32 32 ;; 8x4 bytes = 32 + %ptr1 = getelementptr PTR_OP_ARGS(`i8', `%0, i32 32 ') ;; 8x4 bytes = 32 %val1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %ptr1, <8 x float> %mask1) %retval = shufflevector <8 x float> %val0, <8 x float> %val1, @@ -454,11 +454,11 @@ define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinli %mask3d = bitcast <8 x i32> %mask3 to <4 x double> %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d) - %ptr1 = getelementptr i8 * %0, i32 32 + %ptr1 = getelementptr PTR_OP_ARGS(`i8', `%0, i32 32') %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d) - %ptr2 = getelementptr i8 * %0, i32 64 + %ptr2 = getelementptr PTR_OP_ARGS(`i8', `%0, i32 64') %val2d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr2, <4 x double> %mask2d) - %ptr3 = getelementptr i8 * %0, i32 96 + %ptr3 = getelementptr PTR_OP_ARGS(`i8', `%0, i32 96') %val3d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr3, <4 x double> %mask3d) %val01 = shufflevector <4 x double> %val0d, <4 x double> %val1d, @@ -504,7 
+504,7 @@ define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>, <8 x i32> call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask0, <8 x float> %val0) - %ptr1 = getelementptr i8 * %ptr, i32 32 + %ptr1 = getelementptr PTR_OP_ARGS(`i8', `%ptr, i32 32') call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr1, <8 x float> %mask1, <8 x float> %val1) ret void @@ -539,11 +539,11 @@ define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>, <4 x i32> call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0) - %ptr1 = getelementptr i8 * %ptr, i32 32 + %ptr1 = getelementptr PTR_OP_ARGS(`i8', `%ptr, i32 32') call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1) - %ptr2 = getelementptr i8 * %ptr, i32 64 + %ptr2 = getelementptr PTR_OP_ARGS(`i8', `%ptr, i32 64') call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr2, <4 x double> %mask2d, <4 x double> %val2) - %ptr3 = getelementptr i8 * %ptr, i32 96 + %ptr3 = getelementptr PTR_OP_ARGS(`i8', `%ptr, i32 96') call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr3, <4 x double> %mask3d, <4 x double> %val3) ret void diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index aa120260..91277a1c 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -420,7 +420,7 @@ define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline %mask1d = bitcast <8 x i32> %mask1 to <4 x double> %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d) - %ptr1 = getelementptr i8 * %0, i32 32 + %ptr1 = getelementptr PTR_OP_ARGS(`i8', `%0, i32 32') %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d) %vald = shufflevector <4 x double> %val0d, <4 x double> %val1d, @@ -469,7 +469,7 @@ define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>, <4 x i32> call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0) - 
%ptr1 = getelementptr i8 * %ptr, i32 32 + %ptr1 = getelementptr PTR_OP_ARGS(`i8', `%ptr, i32 32') call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1) ret void } diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 index 764872a2..6510cf26 100644 --- a/builtins/util-nvptx.m4 +++ b/builtins/util-nvptx.m4 @@ -2157,7 +2157,7 @@ define @__new_varying32_32rt( %size, % %sz_LANE_ID = extractelement %size, i32 LANE %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i32 %sz_LANE_ID, i32 %alignment) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load * %ret @@ -2213,7 +2213,7 @@ define @__new_varying32_64rt( %size, % %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64 %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load * %ret @@ -2231,7 +2231,7 @@ define @__new_varying64_64rt( %size, % %sz64_LANE_ID = extractelement %size, i32 LANE %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load * %ret @@ -2433,7 +2433,7 @@ define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline %v1_`'i = extractelement <$1 x i64> %1, i32 i %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i - %ptr_`'i = getelementptr i64 * %r64ptr, i32 i + %ptr_`'i = getelementptr 
PTR_OP_ARGS(`i64', `%r64ptr, i32 i') store i64 %v_`'i, i64 * %ptr_`'i ') @@ -2499,9 +2499,9 @@ load_lane: ; yes! do the load and store the result into the appropriate place in the ; allocaed memory above %ptr32 = bitcast i8 * %0 to $1 * - %lane_ptr = getelementptr $1 * %ptr32, i32 %lane + %lane_ptr = getelementptr PTR_OP_ARGS(`$1', `%ptr32, i32 %lane') %val = load $1 * %lane_ptr - %store_ptr = getelementptr $1 * %retptr32, i32 %lane + %store_ptr = getelementptr PTR_OP_ARGS(`$1', `%retptr32, i32 %lane') store $1 %val, $1 * %store_ptr br label %lane_done @@ -2525,7 +2525,7 @@ return: define(`gen_masked_store', ` define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { per_lane(WIDTH, %2, ` - %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE + %ptr_LANE_ID = getelementptr PTR_OP_ARGS(`', `%0, i32 0, i32 LANE') %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') ret void @@ -2759,7 +2759,7 @@ entry: if.then: ; preds = %entry %idxprom = ashr i64 %call, 32 - %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %arrayidx = getelementptr inbounds PTR_OP_ARGS(`i32', `%startptr, i64 %idxprom') %val = load i32* %arrayidx, align 4 %valvec = insertelement <1 x i32> undef, i32 %val, i32 0 store <1 x i32> %valvec, <1 x i32>* %val_ptr, align 4 @@ -2780,7 +2780,7 @@ entry: if.then: ; preds = %entry %idxprom = ashr i64 %call, 32 - %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %arrayidx = getelementptr inbounds PTR_OP_ARGS(`i32', `%startptr, i64 %idxprom') %val = extractelement <1 x i32> %vals, i32 0 store i32 %val, i32* %arrayidx, align 4 br label %if.end @@ -2986,7 +2986,7 @@ define @__gather32_$1( %ptrs, %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + %store_ptr_LANE_ID = getelementptr PTR_OP_ARGS(`', `%ret_ptr, i32 0, i32 LANE') store $1
%val_LANE_ID, $1 * %store_ptr_LANE_ID ') @@ -3002,7 +3002,7 @@ define @__gather64_$1( %ptrs, %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + %store_ptr_LANE_ID = getelementptr PTR_OP_ARGS(`', `%ret_ptr, i32 0, i32 LANE') store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID ') @@ -3025,11 +3025,11 @@ define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %o %offset64 = sext i32 %offset32 to i64 %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 %offset') %delta = extractelement %offset_delta, i32 %lane %delta64 = sext i32 %delta to i64 - %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') ; load value and insert into returned value %ptrcast = bitcast i8 * %finalptr to $1 * @@ -3047,10 +3047,10 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %offset_scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %offset_scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 %offset') %delta64 = extractelement %offset_delta, i32 %lane - %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') ; load value and insert into returned value %ptrcast = bitcast i8 * %finalptr to $1 * @@ -3180,11 +3180,11 @@ define void @__scatter_elt32_$1(i8 * %ptr, %offsets, i32 %offset_s %offset64 = sext i32 %offset32 to i64 %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 
%offset') %delta = extractelement %offset_delta, i32 %lane %delta64 = sext i32 %delta to i64 - %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') %ptrcast = bitcast i8 * %finalptr to $1 * %storeval = extractelement %values, i32 %lane @@ -3200,10 +3200,10 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 %offset') %delta64 = extractelement %offset_delta, i32 %lane - %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') %ptrcast = bitcast i8 * %finalptr to $1 * %storeval = extractelement %values, i32 %lane diff --git a/builtins/util.m4 b/builtins/util.m4 index 39be8b80..32fc84ab 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,17 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; LLVM has different IR for different versions since 3.7 + +define(`PTR_OP_ARGS', + ifelse(LLVM_VERSION, LLVM_3_7, + ``$1 , $1 * $2'', + ``$1 * $2'' + ) +) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; vector convertation utilities ;; convert vector of one width into vector of other width ;; @@ -1193,15 +1204,15 @@ forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eva not_const: ; store two instances of the vector into memory %ptr = alloca , i32 2 - %ptr0 = getelementptr * %ptr, i32 0 + %ptr0 = getelementptr PTR_OP_ARGS(`', `%ptr, i32 0') store %0, * %ptr0 - %ptr1 = getelementptr * %ptr, i32 1 + %ptr1 = getelementptr PTR_OP_ARGS(`', `%ptr, i32 1') store %0, * %ptr1 ; compute offset in [0,vectorwidth-1], then index into the doubled-up 
vector %offset = and i32 %1, eval(WIDTH-1) %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * - %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr = getelementptr PTR_OP_ARGS(`[eval(2*WIDTH) x $1]', `%ptr_as_elt_array, i32 0, i32 %offset') %load_ptr_vec = bitcast $1 * %load_ptr to * %result = load * %load_ptr_vec, align $2 ret %result @@ -1209,16 +1220,16 @@ not_const: define @__shift_$1(, i32) nounwind readnone alwaysinline { %ptr = alloca , i32 3 - %ptr0 = getelementptr * %ptr, i32 0 + %ptr0 = getelementptr PTR_OP_ARGS(`', `%ptr, i32 0') store zeroinitializer, * %ptr0 - %ptr1 = getelementptr * %ptr, i32 1 + %ptr1 = getelementptr PTR_OP_ARGS(`', `%ptr, i32 1') store %0, * %ptr1 - %ptr2 = getelementptr * %ptr, i32 2 + %ptr2 = getelementptr PTR_OP_ARGS(`', `%ptr, i32 2') store zeroinitializer, * %ptr2 %offset = add i32 %1, WIDTH %ptr_as_elt_array = bitcast * %ptr to [eval(3*WIDTH) x $1] * - %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr = getelementptr PTR_OP_ARGS(`[eval(3*WIDTH) x $1]', `%ptr_as_elt_array, i32 0, i32 %offset') %load_ptr_vec = bitcast $1 * %load_ptr to * %result = load * %load_ptr_vec, align $2 ret %result @@ -1265,12 +1276,12 @@ not_const: store %v2, * %ptr %baseptr = bitcast * %ptr to $1 * - %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0 + %ptr_0 = getelementptr PTR_OP_ARGS(`$1', `%baseptr, i32 %index_0') %val_0 = load $1 * %ptr_0 %result_0 = insertelement undef, $1 %val_0, i32 0 forloop(i, 1, eval(WIDTH-1), ` - %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i + %ptr_`'i = getelementptr PTR_OP_ARGS(`$1', `%baseptr, i32 %index_`'i') %val_`'i = load $1 * %ptr_`'i %result_`'i = insertelement %result_`'eval(i-1), $1 %val_`'i, i32 i ') @@ -1511,7 +1522,7 @@ define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp, ',` %r_LANE_ID = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst ') - %rp_LANE_ID = 
getelementptr $2 * %rptr32, i32 LANE + %rp_LANE_ID = getelementptr PTR_OP_ARGS(`$2', `%rptr32, i32 LANE') store $2 %r_LANE_ID, $2 * %rp_LANE_ID') %r = load <$1 x $2> * %rptr @@ -1779,13 +1790,13 @@ define void ;; Similarly for the output pointers %out0a = bitcast <8 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 1') %out1a = bitcast <8 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') %out2a = bitcast <8 x float> * %out2 to <4 x float> * - %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') %out3a = bitcast <8 x float> * %out3 to <4 x float> * - %out3b = getelementptr <4 x float> * %out3a, i32 1 + %out3b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 1') ;; Do the first part--given input vectors like ;; , @@ -1828,13 +1839,13 @@ define void <4 x i32> %out0a = bitcast <8 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 1') %out1a = bitcast <8 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') %out2a = bitcast <8 x float> * %out2 to <4 x float> * - %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') %out3a = bitcast <8 x float> * %out3 to <4 x float> * - %out3b = getelementptr <4 x float> * %out3a, i32 1 + %out3b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 1') ;; First part--given input vectors ;; @@ -1871,11 +1882,11 @@ define void <4 x i32> %out0a = bitcast <8 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `
%out0a, i32 1') %out1a = bitcast <8 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') %out2a = bitcast <8 x float> * %out2 to <4 x float> * - %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b, <4 x float> %v1a, <4 x float> * %out0a, <4 x float> * %out1a, @@ -1905,11 +1916,11 @@ define void <4 x i32> %out0a = bitcast <8 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 1') %out1a = bitcast <8 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') %out2a = bitcast <8 x float> * %out2 to <4 x float> * - %out2b = getelementptr <4 x float> * %out2a, i32 1 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a, <4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b, @@ -1961,21 +1972,21 @@ define void <4 x i32> %out0a = bitcast <16 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 - %out0c = getelementptr <4 x float> * %out0a, i32 2 - %out0d = getelementptr <4 x float> * %out0a, i32 3 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 1') + %out0c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 2') + %out0d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 3') %out1a = bitcast <16 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 - %out1c = getelementptr <4 x float> * %out1a, i32 2 - %out1d = getelementptr <4 x float> * %out1a, i32 3 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') + %out1c = getelementptr PTR_OP_ARGS(`<4 x float>',
`%out1a, i32 2') + %out1d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 3') %out2a = bitcast <16 x float> * %out2 to <4 x float> * - %out2b = getelementptr <4 x float> * %out2a, i32 1 - %out2c = getelementptr <4 x float> * %out2a, i32 2 - %out2d = getelementptr <4 x float> * %out2a, i32 3 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') + %out2c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 2') + %out2d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 3') %out3a = bitcast <16 x float> * %out3 to <4 x float> * - %out3b = getelementptr <4 x float> * %out3a, i32 1 - %out3c = getelementptr <4 x float> * %out3a, i32 2 - %out3d = getelementptr <4 x float> * %out3a, i32 3 + %out3b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 1') + %out3c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 2') + %out3d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 3') call void @__aos_to_soa4_float4(<4 x float> %v0a, <4 x float> %v0b, <4 x float> %v0c, <4 x float> %v0d, <4 x float> * %out0a, @@ -2032,21 +2043,21 @@ define void <4 x i32> %out0a = bitcast <16 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 - %out0c = getelementptr <4 x float> * %out0a, i32 2 - %out0d = getelementptr <4 x float> * %out0a, i32 3 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 1') + %out0c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 2') + %out0d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 3') %out1a = bitcast <16 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 - %out1c = getelementptr <4 x float> * %out1a, i32 2 - %out1d = getelementptr <4 x float> * %out1a, i32 3 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') + %out1c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 2') + %out1d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 3') %out2a = bitcast <16 x float> * %out2 to <4 x float>
* - %out2b = getelementptr <4 x float> * %out2a, i32 1 - %out2c = getelementptr <4 x float> * %out2a, i32 2 - %out2d = getelementptr <4 x float> * %out2a, i32 3 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') + %out2c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 2') + %out2d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 3') %out3a = bitcast <16 x float> * %out3 to <4 x float> * - %out3b = getelementptr <4 x float> * %out3a, i32 1 - %out3c = getelementptr <4 x float> * %out3a, i32 2 - %out3d = getelementptr <4 x float> * %out3a, i32 3 + %out3b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 1') + %out3c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 2') + %out3d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out3a, i32 3') call void @__soa_to_aos4_float4(<4 x float> %v0a, <4 x float> %v1a, <4 x float> %v2a, <4 x float> %v3a, <4 x float> * %out0a, @@ -2094,17 +2105,17 @@ define void <4 x i32> %out0a = bitcast <16 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 - %out0c = getelementptr <4 x float> * %out0a, i32 2 - %out0d = getelementptr <4 x float> * %out0a, i32 3 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 1') + %out0c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 2') + %out0d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 3') %out1a = bitcast <16 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 - %out1c = getelementptr <4 x float> * %out1a, i32 2 - %out1d = getelementptr <4 x float> * %out1a, i32 3 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') + %out1c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 2') + %out1d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 3') %out2a = bitcast <16 x float> * %out2 to <4 x float> * - %out2b = getelementptr <4 x float> * %out2a, i32 1 - %out2c = getelementptr <4 x float> * %out2a, i32 2 - %out2d = getelementptr <4 x
float> * %out2a, i32 3 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') + %out2c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 2') + %out2d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 3') call void @__aos_to_soa3_float4(<4 x float> %v0a, <4 x float> %v0b, <4 x float> %v0c, <4 x float> * %out0a, <4 x float> * %out1a, @@ -2152,17 +2163,17 @@ define void <4 x i32> %out0a = bitcast <16 x float> * %out0 to <4 x float> * - %out0b = getelementptr <4 x float> * %out0a, i32 1 - %out0c = getelementptr <4 x float> * %out0a, i32 2 - %out0d = getelementptr <4 x float> * %out0a, i32 3 + %out0b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 1') + %out0c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 2') + %out0d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out0a, i32 3') %out1a = bitcast <16 x float> * %out1 to <4 x float> * - %out1b = getelementptr <4 x float> * %out1a, i32 1 - %out1c = getelementptr <4 x float> * %out1a, i32 2 - %out1d = getelementptr <4 x float> * %out1a, i32 3 + %out1b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 1') + %out1c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 2') + %out1d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out1a, i32 3') %out2a = bitcast <16 x float> * %out2 to <4 x float> * - %out2b = getelementptr <4 x float> * %out2a, i32 1 - %out2c = getelementptr <4 x float> * %out2a, i32 2 - %out2d = getelementptr <4 x float> * %out2a, i32 3 + %out2b = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 1') + %out2c = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 2') + %out2d = getelementptr PTR_OP_ARGS(`<4 x float>', `%out2a, i32 3') call void @__soa_to_aos3_float4(<4 x float> %v0a, <4 x float> %v1a, <4 x float> %v2a, <4 x float> * %out0a, <4 x float> * %out0b, @@ -2188,11 +2199,11 @@ define void nounwind alwaysinline { %p0 = bitcast float * %p to * %v0 = load * %p0, align 4 - %p1 = getelementptr * %p0, i32 1 + %p1 = getelementptr PTR_OP_ARGS(`', `%p0, i32
1') %v1 = load * %p1, align 4 - %p2 = getelementptr * %p0, i32 2 + %p2 = getelementptr PTR_OP_ARGS(`', `%p0, i32 2') %v2 = load * %p2, align 4 - %p3 = getelementptr * %p0, i32 3 + %p3 = getelementptr PTR_OP_ARGS(`', `%p0, i32 3') %v3 = load * %p3, align 4 call void @__aos_to_soa4_float`'WIDTH ( %v0, %v1, %v2, %v3, * %out0, @@ -2205,9 +2216,9 @@ define void @__soa_to_aos4_float( %v0, %v1, %v2, %v3, float * noalias %p) nounwind alwaysinline { %out0 = bitcast float * %p to * - %out1 = getelementptr * %out0, i32 1 - %out2 = getelementptr * %out0, i32 2 - %out3 = getelementptr * %out0, i32 3 + %out1 = getelementptr PTR_OP_ARGS(`', `%out0, i32 1') + %out2 = getelementptr PTR_OP_ARGS(`', `%out0, i32 2') + %out3 = getelementptr PTR_OP_ARGS(`', `%out0, i32 3') call void @__soa_to_aos4_float`'WIDTH ( %v0, %v1, %v2, %v3, * %out0, * %out1, * %out2, * %out3) @@ -2221,9 +2232,9 @@ define void * %out2) nounwind alwaysinline { %p0 = bitcast float * %p to * %v0 = load * %p0, align 4 - %p1 = getelementptr * %p0, i32 1 + %p1 = getelementptr PTR_OP_ARGS(`', `%p0, i32 1') %v1 = load * %p1, align 4 - %p2 = getelementptr * %p0, i32 2 + %p2 = getelementptr PTR_OP_ARGS(`', `%p0, i32 2') %v2 = load * %p2, align 4 call void @__aos_to_soa3_float`'WIDTH ( %v0, %v1, %v2, * %out0, * %out1, @@ -2236,8 +2247,8 @@ define void @__soa_to_aos3_float( %v0, %v1, %v2, float * noalias %p) nounwind alwaysinline { %out0 = bitcast float * %p to * - %out1 = getelementptr * %out0, i32 1 - %out2 = getelementptr * %out0, i32 2 + %out1 = getelementptr PTR_OP_ARGS(`', `%out0, i32 1') + %out2 = getelementptr PTR_OP_ARGS(`', `%out0, i32 2') call void @__soa_to_aos3_float`'WIDTH ( %v0, %v1, %v2, * %out0, * %out1, * %out2) @@ -3400,7 +3411,7 @@ define @__new_varying32_32rt( %size, % per_lane(WIDTH, %mask, ` %sz_LANE_ID = extractelement %size, i32 LANE - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') %ptr_LANE_ID = bitcast i64* 
%store_LANE_ID to i8** %call_LANE_ID = call i32 @posix_memalign(i8** %ptr_LANE_ID, i32 %alignment, i32 %sz_LANE_ID)') @@ -3457,7 +3468,7 @@ define @__new_varying32_64rt( %size, % per_lane(WIDTH, %mask, ` %sz_LANE_ID = extractelement %size, i32 LANE %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64 - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') %ptr_LANE_ID = bitcast i64* %store_LANE_ID to i8** %call_LANE_ID = call i32 @posix_memalign(i8** %ptr_LANE_ID, i64 %alignment64, i64 %sz64_LANE_ID)') @@ -3474,7 +3485,7 @@ define @__new_varying64_64rt( %size, % per_lane(WIDTH, %mask, ` %sz64_LANE_ID = extractelement %size, i32 LANE - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') %ptr_LANE_ID = bitcast i64* %store_LANE_ID to i8** %call_LANE_ID = call i32 @posix_memalign(i8** %ptr_LANE_ID, i64 %alignment64, i64 %sz64_LANE_ID)') @@ -3537,7 +3548,7 @@ define @__new_varying32_32rt( %size, % %sz_LANE_ID = extractelement %size, i32 LANE %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i32 %sz_LANE_ID, i32 %alignment) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load * %ret @@ -3593,7 +3604,7 @@ define @__new_varying32_64rt( %size, % %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64 %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load * %ret @@ -3611,7 +3622,7 @@ define @__new_varying64_64rt( %size, % %sz64_LANE_ID = extractelement %size, i32 LANE %ptr_LANE_ID = call noalias i8 
* @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + %store_LANE_ID = getelementptr PTR_OP_ARGS(`i64', `%ret64, i32 LANE') store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') %r = load * %ret @@ -3929,7 +3940,7 @@ define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline %v1_`'i = extractelement <$1 x i64> %1, i32 i %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i - %ptr_`'i = getelementptr i64 * %r64ptr, i32 i + %ptr_`'i = getelementptr PTR_OP_ARGS(`i64', `%r64ptr, i32 i') store i64 %v_`'i, i64 * %ptr_`'i ') @@ -3995,9 +4006,9 @@ load_lane: ; yes! do the load and store the result into the appropriate place in the ; allocaed memory above %ptr32 = bitcast i8 * %0 to $1 * - %lane_ptr = getelementptr $1 * %ptr32, i32 %lane + %lane_ptr = getelementptr PTR_OP_ARGS(`$1', `%ptr32, i32 %lane') %val = load $1 * %lane_ptr - %store_ptr = getelementptr $1 * %retptr32, i32 %lane + %store_ptr = getelementptr PTR_OP_ARGS(`$1', `%retptr32, i32 %lane') store $1 %val, $1 * %store_ptr br label %lane_done @@ -4021,7 +4032,7 @@ return: define(`gen_masked_store', ` define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { per_lane(WIDTH, %2, ` - %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE + %ptr_LANE_ID = getelementptr PTR_OP_ARGS(`', `%0, i32 0, i32 LANE') %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') ret void @@ -4174,10 +4185,10 @@ loop: br i1 %do_load, label %load, label %loopend load: - %loadptr = getelementptr i32 *%startptr, i32 %offset + %loadptr = getelementptr PTR_OP_ARGS(`i32', `%startptr, i32 %offset') %loadval = load i32 *%loadptr %val_ptr_i32 = bitcast * %val_ptr to i32 * - %storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane + %storeptr = getelementptr PTR_OP_ARGS(`i32', `%val_ptr_i32, i32 %lane') store i32 %loadval, i32
*%storeptr %offset1 = add i32 %offset, 1 br label %loopend @@ -4226,7 +4237,7 @@ loop: store: %storeval = extractelement %vals, i32 %lane - %storeptr = getelementptr i32 *%startptr, i32 %offset + %storeptr = getelementptr PTR_OP_ARGS(`i32', `%startptr, i32 %offset') store i32 %storeval, i32 *%storeptr %offset1 = add i32 %offset, 1 br label %loopend @@ -4272,9 +4283,9 @@ loop: ;; zero or sign extending it, while zero extend is free. Also do nothing for ;; i64 MASK, as we need i64 value. ifelse(MASK, `i64', -` %storeptr = getelementptr i32 *%startptr, MASK %offset', +` %storeptr = getelementptr PTR_OP_ARGS(`i32', `%startptr, MASK %offset')', ` %offset1 = zext MASK %offset to i64 - %storeptr = getelementptr i32 *%startptr, i64 %offset1') + %storeptr = getelementptr PTR_OP_ARGS(`i32', `%startptr, i64 %offset1')') store i32 %storeval, i32 *%storeptr %mull_mask = extractelement %full_mask, i32 %i @@ -4537,7 +4548,7 @@ define @__gather32_$1( %ptrs, %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + %store_ptr_LANE_ID = getelementptr PTR_OP_ARGS(`', `%ret_ptr, i32 0, i32 LANE') store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID ') @@ -4553,7 +4564,7 @@ define @__gather64_$1( %ptrs, %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + %store_ptr_LANE_ID = getelementptr PTR_OP_ARGS(`', `%ret_ptr, i32 0, i32 LANE') store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID ') @@ -4576,11 +4587,11 @@ define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %o %offset64 = sext i32 %offset32 to i64 %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 %offset') %delta =
extractelement %offset_delta, i32 %lane %delta64 = sext i32 %delta to i64 - %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') ; load value and insert into returned value %ptrcast = bitcast i8 * %finalptr to $1 * @@ -4598,10 +4609,10 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %offset_scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %offset_scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 %offset') %delta64 = extractelement %offset_delta, i32 %lane - %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') ; load value and insert into returned value %ptrcast = bitcast i8 * %finalptr to $1 * @@ -4731,11 +4742,11 @@ define void @__scatter_elt32_$1(i8 * %ptr, %offsets, i32 %offset_s %offset64 = sext i32 %offset32 to i64 %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 %offset') %delta = extractelement %offset_delta, i32 %lane %delta64 = sext i32 %delta to i64 - %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') %ptrcast = bitcast i8 * %finalptr to $1 * %storeval = extractelement %values, i32 %lane @@ -4751,10 +4762,10 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 - %ptroffset = getelementptr i8 * %ptr, i64 %offset + %ptroffset = getelementptr PTR_OP_ARGS(`i8', `%ptr, i64 %offset') %delta64 = extractelement %offset_delta, i32 %lane - %finalptr = 
getelementptr i8 * %ptroffset, i64 %delta64 + %finalptr = getelementptr PTR_OP_ARGS(`i8', `%ptroffset, i64 %delta64') %ptrcast = bitcast i8 * %finalptr to $1 * %storeval = extractelement %values, i32 %lane