Fix __load_masked_{32,64} to properly obey the mask. Fixes issue #28.

Fixed the implementations of these builtin functions for targets that don't have native masked load instructions: they now do no loads at all if the vector mask is all off, do a single (unaligned) vector load if both the first and last elements of the mask are on, and otherwise serialize, doing scalar loads for only the active lanes.  This fixes a number of potential sources of crashes due to accessing invalid memory.
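
By way of illustration, the new fallback strategy for a 4-wide 32-bit load is roughly equivalent to the following C sketch (the name and signature are illustrative only; the actual implementation is the generated LLVM IR in the load_masked macro below):

    #include <stdint.h>
    #include <string.h>

    /* Rough C equivalent of the generated 4-wide __load_masked_32 fallback. */
    static void load_masked_i32_sketch(int32_t result[4], const int32_t *ptr,
                                       const int32_t mask[4]) {
        /* If the first and last lanes are both on, the whole 16-byte range
           must be readable, so a single unaligned vector load is safe. */
        if (mask[0] != 0 && mask[3] != 0) {
            memcpy(result, ptr, 4 * sizeof(int32_t));
            return;
        }
        /* Otherwise serialize: do scalar loads for the active lanes only.
           If the mask is all off, this loop touches no memory at all. */
        for (int i = 0; i < 4; ++i)
            if (mask[i] != 0)
                result[i] = ptr[i];
    }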
Matt Pharr
2011-07-08 11:21:11 +01:00
parent 092d288aef
commit e156651190
3 changed files with 92 additions and 123 deletions

@@ -401,82 +401,10 @@ define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounw
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
%ptr = bitcast i8 * %0 to i32 *
%val = load i32 * %ptr
%ret0 = insertelement <4 x i32> undef, i32 %val, i32 0
%ret1 = insertelement <4 x i32> %ret0, i32 %val, i32 1
%ret2 = insertelement <4 x i32> %ret1, i32 %val, i32 2
%ret3 = insertelement <4 x i32> %ret2, i32 %val, i32 3
ret <4 x i32> %ret3
skip:
ret <4 x i32> undef
}
define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
%ptr = bitcast i8 * %0 to i64 *
%val = load i64 * %ptr
%ret0 = insertelement <4 x i64> undef, i64 %val, i32 0
%ret1 = insertelement <4 x i64> %ret0, i64 %val, i32 1
%ret2 = insertelement <4 x i64> %ret1, i64 %val, i32 2
%ret3 = insertelement <4 x i64> %ret2, i64 %val, i32 3
ret <4 x i64> %ret3
skip:
ret <4 x i64> undef
}
define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
; if any mask lane is on, just load all of the values
; FIXME: there is a lurking bug here if we straddle a page boundary, the
; next page is invalid to read, but the mask bits are set so that we
; aren't supposed to be reading those elements...
%ptr = bitcast i8 * %0 to <4 x i32> *
%val = load <4 x i32> * %ptr, align 4
ret <4 x i32> %val
skip:
ret <4 x i32> undef
}
define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
; if any mask lane is on, just load all of the values
; FIXME: there is a lurking bug here if we straddle a page boundary, the
; next page is invalid to read, but the mask bits are set so that we
; aren't supposed to be reading those elements...
%ptr = bitcast i8 * %0 to <4 x i64> *
%val = load <4 x i64> * %ptr, align 8
ret <4 x i64> %val
skip:
ret <4 x i64> undef
}
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

@@ -463,53 +463,10 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
; FIXME: I think this and the next one need to verify that the mask isn't
; all off before doing the load!!! (See e.g. stdlib-sse.ll)
define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to i32 *
%val = load i32 * %ptr
%ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
%ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
%ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
%ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
%ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
%ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
%ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
ret <8 x i32> %ret7
}
define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to i64 *
%val = load i64 * %ptr
%ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
%ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
%ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
%ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
%ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
%ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
%ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
%ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
ret <8 x i64> %ret7
}
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to <8 x i32> *
%val = load <8 x i32> * %ptr, align 4
ret <8 x i32> %val
}
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to <8 x i64> *
%val = load <8 x i64> * %ptr, align 8
ret <8 x i64> %val
}
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
load_masked(8, i32, 32, 4)
load_masked(8, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

@@ -1055,6 +1055,90 @@ skip:
'
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit code to safely load a scalar value and broadcast it across the
;; elements of a vector. Parameters:
;; $1: target vector width
;; $2: element type for which to emit the function (i32, i64, ...)
;; $3: suffix for function name (32, 64, ...)
define(`load_and_broadcast', `
define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
%ptr = bitcast i8 * %0 to $2 *
%val = load $2 * %ptr
%ret0 = insertelement <$1 x $2> undef, $2 %val, i32 0
forloop(i, 1, eval($1-1), `
%ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
ret <$1 x $2> %ret`'eval($1-1)
skip:
ret <$1 x $2> undef
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit general-purpose code to do a masked load for targets that don't have
;; an instruction to do that. Parameters:
;; $1: target vector width
;; $2: element type for which to emit the function (i32, i64, ...)
;; $3: suffix for function name (32, 64, ...)
;; $4: alignment for elements of type $2 (4, 8, ...)
define(`load_masked', `
define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
entry:
%mm = call i32 @__movmsk(<$1 x i32> %mask)
; if the first lane and the last lane are on, then it is safe to do a vector load
; of the whole thing--what the lanes in the middle want turns out not to matter...
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
; if we are not able to do a single vector load, we will accumulate lanes in this memory...
%retptr = alloca <$1 x $2>
%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
br i1 %can_vload, label %load, label %loop
load:
%ptr = bitcast i8 * %0 to <$1 x $2> *
%valall = load <$1 x $2> * %ptr, align $4
ret <$1 x $2> %valall
loop:
; loop over the lanes and see if each one is on...
%lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
%lanemask = shl i32 1, %lane
%mask_and = and i32 %mm, %lanemask
%do_lane = icmp ne i32 %mask_and, 0
br i1 %do_lane, label %load_lane, label %lane_done
load_lane:
; yes! do the load and store the result into the appropriate place in the
; alloca'd memory above
%ptr32 = bitcast i8 * %0 to $2 *
%lane_ptr = getelementptr $2 * %ptr32, i32 %lane
%val = load $2 * %lane_ptr
%store_ptr = getelementptr $2 * %retptr32, i32 %lane
store $2 %val, $2 * %store_ptr
br label %lane_done
lane_done:
%next_lane = add i32 %lane, 1
%done = icmp eq i32 %lane, eval($1-1)
br i1 %done, label %return, label %loop
return:
%r = load <$1 x $2> * %retptr
ret <$1 x $2> %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; packed load and store functions
;;