merged with master

2014-08-11 10:04:54 +02:00
parent d607b9cb31 9b053c5518
commit 8745888ce9
37 changed files with 1199 additions and 617 deletions
--- a/builtins/dispatch.ll
+++ b/builtins/dispatch.ll
@@ -46,8 +46,6 @@
 ;; corresponding to one of the Target::ISA enumerant values that gives the
 ;; most capable ISA that the curremt system can run.
 ;;
-;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum
-;; backwards compatibility for anyone building ispc with LLVM 3.1
 ;;
 ;; #include <stdint.h>
 ;; #include <stdlib.h>
--- a/builtins/target-avx11-i64x4.ll
+++ b/builtins/target-avx11-i64x4.ll
@@ -31,8 +31,7 @@

 include(`target-avx1-i64x4base.ll')

-ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
-       `rdrand_definition()')
+rdrand_definition()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
--- a/builtins/target-avx11-x2.ll
+++ b/builtins/target-avx11-x2.ll
@@ -31,9 +31,7 @@

 include(`target-avx-x2.ll')

-ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
-       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
-       `rdrand_definition()')
+rdrand_definition()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
@@ -75,9 +73,6 @@ gen_gather(double)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float/half conversions

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-;; nothing to define...
-', `
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
 ; 0 is round nearest even
 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -128,5 +123,3 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
 }
-'
-)
--- a/builtins/target-avx11.ll
+++ b/builtins/target-avx11.ll
@@ -31,10 +31,7 @@

 include(`target-avx.ll')

-ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
-       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
-       `rdrand_definition()')
-
+rdrand_definition()
 saturation_arithmetic()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -77,9 +74,6 @@ gen_gather(double)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float/half conversions

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-;; nothing to define...
-', `
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
 ; 0 is round nearest even
 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -114,4 +108,3 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
 }
-')
--- a/builtins/target-avx2-i64x4.ll
+++ b/builtins/target-avx2-i64x4.ll
@@ -29,13 +29,11 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

-ifelse(LLVM_VERSION, `LLVM_3_1', `',
-       `define(`HAVE_GATHER', `1')')
+define(`HAVE_GATHER', `1')

 include(`target-avx1-i64x4base.ll')

-ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
-       `rdrand_definition()')
+rdrand_definition()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
@@ -126,15 +124,6 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {

 declare void @llvm.trap() noreturn nounwind

-
-ifelse(LLVM_VERSION, `LLVM_3_1', `
-gen_gather_factored(i8)
-gen_gather_factored(i16)
-gen_gather_factored(i32)
-gen_gather_factored(float)
-gen_gather_factored(i64)
-gen_gather_factored(double)', `
-
 gen_gather(i8)
 gen_gather(i16)

@@ -351,5 +340,3 @@ define <4 x double> @__gather64_double(<4 x i64> %ptrs,

  ret <4 x double> %v
 }
-
-')
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -29,15 +29,11 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

-ifelse(LLVM_VERSION, `LLVM_3_0', `',
-       LLVM_VERSION, `LLVM_3_1', `',
-       `define(`HAVE_GATHER', `1')')
+define(`HAVE_GATHER', `1')

 include(`target-avx-x2.ll')

-ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
-       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
-       `rdrand_definition()')
+rdrand_definition()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
@@ -74,9 +70,6 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float/half conversions

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-;; nothing to define...
-', `
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
 ; 0 is round nearest even
 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -127,7 +120,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
 }
-')
+

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
@@ -176,20 +169,6 @@ define(`assemble_4s', `
  assemble_8s($1, $2, $2_1, $2_2)
 ')

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-gen_gather_factored(i8)
-gen_gather_factored(i16)
-gen_gather_factored(i32)
-gen_gather_factored(float)
-gen_gather_factored(i64)
-gen_gather_factored(double)',
-LLVM_VERSION, `LLVM_3_1', `
-gen_gather_factored(i8)
-gen_gather_factored(i16)
-gen_gather_factored(i32)
-gen_gather_factored(float)
-gen_gather_factored(i64)
-gen_gather_factored(double)', `

 gen_gather(i8)
 gen_gather(i16)
@@ -557,5 +536,3 @@ define <16 x double> @__gather64_double(<16 x i64> %ptrs,

  ret <16 x double> %v
 }
-
-')
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -29,16 +29,11 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

-ifelse(LLVM_VERSION, `LLVM_3_0', `',
-       LLVM_VERSION, `LLVM_3_1', `',
-       `define(`HAVE_GATHER', `1')')
+define(`HAVE_GATHER', `1')

 include(`target-avx.ll')

-ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
-       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
-       `rdrand_definition()')
-
+rdrand_definition()
 saturation_arithmetic()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -76,9 +71,6 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float/half conversions

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-;; nothing to define...
-', `
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
 ; 0 is round nearest even
 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -113,7 +105,6 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
 }
-')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
@@ -125,21 +116,6 @@ define(`extract_4s', `
  %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ')

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-gen_gather_factored(i8)
-gen_gather_factored(i16)
-gen_gather_factored(i32)
-gen_gather_factored(float)
-gen_gather_factored(i64)
-gen_gather_factored(double)',
-LLVM_VERSION, `LLVM_3_1', `
-gen_gather_factored(i8)
-gen_gather_factored(i16)
-gen_gather_factored(i32)
-gen_gather_factored(float)
-gen_gather_factored(i64)
-gen_gather_factored(double)', `
-
 gen_gather(i8)
 gen_gather(i16)

@@ -431,5 +407,3 @@ define <8 x double> @__gather64_double(<8 x i64> %ptrs,

  ret <8 x double> %v
 }
-
-')
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -275,20 +275,7 @@ declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
 declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
                                    <WIDTH x i1> %mask) nounwind 

-ifelse(LLVM_VERSION, `LLVM_3_0', `
-declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
-                                      <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
-                                       <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
-                                       <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>, 
-                                       <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
-                                       <WIDTH x i1> %mask) nounwind 
-declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
-                                       <WIDTH x i1> %mask) nounwind 
-', `
+
 define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
                                     <WIDTH x i1>) nounwind alwaysinline {
  %v = load <WIDTH x i8> * %0
@@ -336,7 +323,6 @@ define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
  store <WIDTH x double> %v1, <WIDTH x double> * %0
  ret void
 }
-')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1497,13 +1497,17 @@ define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
  per_lane($1, <$1 x MASK> %mask, `
   %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE
   %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
+
+  ;; The 3.5 and 3.6 branches below are identical; they are duplicated because m4 has no OR/AND operators
  ifelse(LLVM_VERSION,LLVM_3_5,`
    %r_LANE_ID_t = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst seq_cst
    %r_LANE_ID = extractvalue { $2, i1 } %r_LANE_ID_t, 0
+  ',LLVM_VERSION,LLVM_3_6,` 
+    %r_LANE_ID_t = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst seq_cst
+    %r_LANE_ID = extractvalue { $2, i1 } %r_LANE_ID_t, 0
  ',`
    %r_LANE_ID = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst
  ')
-
   %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
   store $2 %r_LANE_ID, $2 * %rp_LANE_ID')

@@ -1512,10 +1516,14 @@ define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
 }

 define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
-                                                       $2 %val) nounwind alwaysinline {
+                                                       $2 %val) nounwind alwaysinline {                                                           
+  ;; The 3.5 and 3.6 branches below are identical; they are duplicated because m4 has no OR/AND operators
  ifelse(LLVM_VERSION,LLVM_3_5,`
   %r_t = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst seq_cst
   %r = extractvalue { $2, i1 } %r_t, 0
+  ',LLVM_VERSION,LLVM_3_6,`
+   %r_t = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst seq_cst
+   %r = extractvalue { $2, i1 } %r_t, 0
  ',`
   %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst
  ')
@@ -3914,23 +3922,10 @@ define(`masked_store_blend_8_16_by_4', `
 define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
                                     <4 x i32>) nounwind alwaysinline {
  %old = load <4 x i8> * %0, align 1
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old32 = bitcast <4 x i8> %old to i32
-    %new32 = bitcast <4 x i8> %1 to i32
-
-    %mask8 = trunc <4 x i32> %2 to <4 x i8>
-    %mask32 = bitcast <4 x i8> %mask8 to i32
-    %notmask32 = xor i32 %mask32, -1
-
-    %newmasked = and i32 %new32, %mask32
-    %oldmasked = and i32 %old32, %notmask32
-    %result = or i32 %newmasked, %oldmasked
-
-    %resultvec = bitcast i32 %result to <4 x i8>
-  ',`
-    %m = trunc <4 x i32> %2 to <4 x i1>
-    %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
-  ')
+  
+  %m = trunc <4 x i32> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
+ 
  store <4 x i8> %resultvec, <4 x i8> * %0, align 1
  ret void
 }
@@ -3938,23 +3933,10 @@ define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
 define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
                                      <4 x i32>) nounwind alwaysinline {
  %old = load <4 x i16> * %0, align 2
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old64 = bitcast <4 x i16> %old to i64
-    %new64 = bitcast <4 x i16> %1 to i64
+  
+  %m = trunc <4 x i32> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old

-    %mask16 = trunc <4 x i32> %2 to <4 x i16>
-    %mask64 = bitcast <4 x i16> %mask16 to i64
-    %notmask64 = xor i64 %mask64, -1
-
-    %newmasked = and i64 %new64, %mask64
-    %oldmasked = and i64 %old64, %notmask64
-    %result = or i64 %newmasked, %oldmasked
-
-    %resultvec = bitcast i64 %result to <4 x i16>
-  ',`
-    %m = trunc <4 x i32> %2 to <4 x i1>
-    %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
-  ')
  store <4 x i16> %resultvec, <4 x i16> * %0, align 2
  ret void
 }
@@ -3964,23 +3946,10 @@ define(`masked_store_blend_8_16_by_4_mask64', `
 define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
                                     <4 x i64>) nounwind alwaysinline {
  %old = load <4 x i8> * %0, align 1
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old32 = bitcast <4 x i8> %old to i32
-    %new32 = bitcast <4 x i8> %1 to i32

-    %mask8 = trunc <4 x i64> %2 to <4 x i8>
-    %mask32 = bitcast <4 x i8> %mask8 to i32
-    %notmask32 = xor i32 %mask32, -1
-
-    %newmasked = and i32 %new32, %mask32
-    %oldmasked = and i32 %old32, %notmask32
-    %result = or i32 %newmasked, %oldmasked
-
-    %resultvec = bitcast i32 %result to <4 x i8>
-  ',`
-    %m = trunc <4 x i64> %2 to <4 x i1>
-    %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
-  ')
+  %m = trunc <4 x i64> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
+  
  store <4 x i8> %resultvec, <4 x i8> * %0, align 1
  ret void
 }
@@ -3988,23 +3957,10 @@ define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
 define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
                                      <4 x i64>) nounwind alwaysinline {
  %old = load <4 x i16> * %0, align 2
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old64 = bitcast <4 x i16> %old to i64
-    %new64 = bitcast <4 x i16> %1 to i64
-
-    %mask16 = trunc <4 x i64> %2 to <4 x i16>
-    %mask64 = bitcast <4 x i16> %mask16 to i64
-    %notmask64 = xor i64 %mask64, -1
-
-    %newmasked = and i64 %new64, %mask64
-    %oldmasked = and i64 %old64, %notmask64
-    %result = or i64 %newmasked, %oldmasked
-
-    %resultvec = bitcast i64 %result to <4 x i16>
-  ',`
-    %m = trunc <4 x i64> %2 to <4 x i1>
-    %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
-  ')
+  
+  %m = trunc <4 x i64> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
+  
  store <4 x i16> %resultvec, <4 x i16> * %0, align 2
  ret void
 }
@@ -4014,23 +3970,10 @@ define(`masked_store_blend_8_16_by_8', `
 define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
                                     <8 x i32>) nounwind alwaysinline {
  %old = load <8 x i8> * %0, align 1
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old64 = bitcast <8 x i8> %old to i64
-    %new64 = bitcast <8 x i8> %1 to i64
-
-    %mask8 = trunc <8 x i32> %2 to <8 x i8>
-    %mask64 = bitcast <8 x i8> %mask8 to i64
-    %notmask64 = xor i64 %mask64, -1
-
-    %newmasked = and i64 %new64, %mask64
-    %oldmasked = and i64 %old64, %notmask64
-    %result = or i64 %newmasked, %oldmasked
-
-    %resultvec = bitcast i64 %result to <8 x i8>
-  ',`
-    %m = trunc <8 x i32> %2 to <8 x i1>
-    %resultvec = select <8 x i1> %m, <8 x i8> %1, <8 x i8> %old
-  ')
+  
+  %m = trunc <8 x i32> %2 to <8 x i1>
+  %resultvec = select <8 x i1> %m, <8 x i8> %1, <8 x i8> %old
+  
  store <8 x i8> %resultvec, <8 x i8> * %0, align 1
  ret void
 }
@@ -4038,23 +3981,10 @@ define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
 define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>,
                                      <8 x i32>) nounwind alwaysinline {
  %old = load <8 x i16> * %0, align 2
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old128 = bitcast <8 x i16> %old to i128
-    %new128 = bitcast <8 x i16> %1 to i128
-
-    %mask16 = trunc <8 x i32> %2 to <8 x i16>
-    %mask128 = bitcast <8 x i16> %mask16 to i128
-    %notmask128 = xor i128 %mask128, -1
-
-    %newmasked = and i128 %new128, %mask128
-    %oldmasked = and i128 %old128, %notmask128
-    %result = or i128 %newmasked, %oldmasked
-
-    %resultvec = bitcast i128 %result to <8 x i16>
-  ',`
-    %m = trunc <8 x i32> %2 to <8 x i1>
-    %resultvec = select <8 x i1> %m, <8 x i16> %1, <8 x i16> %old
-  ')
+  
+  %m = trunc <8 x i32> %2 to <8 x i1>
+  %resultvec = select <8 x i1> %m, <8 x i16> %1, <8 x i16> %old
+  
  store <8 x i16> %resultvec, <8 x i16> * %0, align 2
  ret void
 }
@@ -4065,23 +3995,10 @@ define(`masked_store_blend_8_16_by_16', `
 define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
                                     <16 x i32>) nounwind alwaysinline {
  %old = load <16 x i8> * %0, align 1
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old128 = bitcast <16 x i8> %old to i128
-    %new128 = bitcast <16 x i8> %1 to i128

-    %mask8 = trunc <16 x i32> %2 to <16 x i8>
-    %mask128 = bitcast <16 x i8> %mask8 to i128
-    %notmask128 = xor i128 %mask128, -1
+  %m = trunc <16 x i32> %2 to <16 x i1>
+  %resultvec = select <16 x i1> %m, <16 x i8> %1, <16 x i8> %old

-    %newmasked = and i128 %new128, %mask128
-    %oldmasked = and i128 %old128, %notmask128
-    %result = or i128 %newmasked, %oldmasked
-
-    %resultvec = bitcast i128 %result to <16 x i8>
-  ',`
-    %m = trunc <16 x i32> %2 to <16 x i1>
-    %resultvec = select <16 x i1> %m, <16 x i8> %1, <16 x i8> %old
-  ')
  store <16 x i8> %resultvec, <16 x i8> * %0, align 1
  ret void
 }
@@ -4089,23 +4006,10 @@ define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
 define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
                                      <16 x i32>) nounwind alwaysinline {
  %old = load <16 x i16> * %0, align 2
-  ifelse(LLVM_VERSION,LLVM_3_0,`
-    %old256 = bitcast <16 x i16> %old to i256
-    %new256 = bitcast <16 x i16> %1 to i256

-    %mask16 = trunc <16 x i32> %2 to <16 x i16>
-    %mask256 = bitcast <16 x i16> %mask16 to i256
-    %notmask256 = xor i256 %mask256, -1
+  %m = trunc <16 x i32> %2 to <16 x i1>
+  %resultvec = select <16 x i1> %m, <16 x i16> %1, <16 x i16> %old

-    %newmasked = and i256 %new256, %mask256
-    %oldmasked = and i256 %old256, %notmask256
-    %result = or i256 %newmasked, %oldmasked
-
-    %resultvec = bitcast i256 %result to <16 x i16>
-  ',`
-    %m = trunc <16 x i32> %2 to <16 x i1>
-    %resultvec = select <16 x i1> %m, <16 x i16> %1, <16 x i16> %old
-  ')
  store <16 x i16> %resultvec, <16 x i16> * %0, align 2
  ret void
 }