Fix __load_masked_{32,64} to properly obey the mask. Fixes issue #28.

Fixed the implementations of these builtin functions for targets that don't have native masked load instructions so that they do no loads if the vector mask is all off, and only do an (unaligned) vector load if both the first and last element of the mask are on.  Otherwise they serialize and do scalar loads for only the active lanes.  This fixes a number of potential sources of crashes due to accessing invalid memory.
This commit is contained in:
Matt Pharr
2011-07-08 11:21:11 +01:00
parent 092d288aef
commit e156651190
3 changed files with 92 additions and 123 deletions

View File

@@ -401,82 +401,10 @@ define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounw
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
;; Load one i32 from the address in the first argument and replicate it into
;; all four lanes of the result.  When no lane of %mask is active the
;; address is never dereferenced (it may be invalid) and the result is undef.
define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
entry:
  ; a zero result from @__movmsk is treated as "every lane is off"
  %lane_bits = call i32 @__movmsk(<4 x i32> %mask)
  %all_off = icmp eq i32 %lane_bits, 0
  br i1 %all_off, label %no_load, label %do_load

no_load:
  ; nothing is active, so the pointer must not be touched
  ret <4 x i32> undef

do_load:
  %scalar_ptr = bitcast i8 * %0 to i32 *
  %scalar = load i32 * %scalar_ptr
  ; splat: place the scalar in lane 0, then replicate lane 0 into every lane
  %seed = insertelement <4 x i32> undef, i32 %scalar, i32 0
  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
;; Load one i64 from the address in the first argument and replicate it into
;; all four lanes of the result.  When no lane of %mask is active the
;; address is never dereferenced (it may be invalid) and the result is undef.
define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
entry:
  ; a zero result from @__movmsk is treated as "every lane is off"
  %lane_bits = call i32 @__movmsk(<4 x i32> %mask)
  %all_off = icmp eq i32 %lane_bits, 0
  br i1 %all_off, label %no_load, label %do_load

no_load:
  ; nothing is active, so the pointer must not be touched
  ret <4 x i64> undef

do_load:
  %scalar_ptr = bitcast i8 * %0 to i64 *
  %scalar = load i64 * %scalar_ptr
  ; splat: place the scalar in lane 0, then replicate lane 0 into every lane
  %seed = insertelement <4 x i64> undef, i64 %scalar, i32 0
  %splat = shufflevector <4 x i64> %seed, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %splat
}
;; Masked load of four consecutive i32 values starting at the address in the
;; first argument.  Only memory belonging to active lanes is guaranteed to be
;; readable, so:
;;   - if lanes 0 and 3 are both on, every byte in between lies between two
;;     addresses the caller is allowed to read, and a single vector load
;;     with 4-byte alignment is safe;
;;   - otherwise each active lane is loaded individually with a scalar load
;;     and inactive lanes are never touched (no loads at all when the mask
;;     is entirely off).
;; Lanes whose mask is off hold undefined values in the result.
;; NOTE(review): assumes @__movmsk sets bit i of its result exactly when
;; lane i of the mask is on -- confirm against its definition.
define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
entry:
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  ; 9 == (1 << 0) | (1 << 3): the bits for the first and the last lane
  %end_bits = and i32 %mm, 9
  %can_vload = icmp eq i32 %end_bits, 9
  ; scratch vector that collects the per-lane results on the slow path;
  ; lanes that are never written stay undefined
  %result_ptr = alloca <4 x i32>
  %result_elts = bitcast <4 x i32> * %result_ptr to i32 *
  %src_elts = bitcast i8 * %0 to i32 *
  br i1 %can_vload, label %vector_load, label %lane_loop

vector_load:
  %src_vec = bitcast i8 * %0 to <4 x i32> *
  %wide = load <4 x i32> * %src_vec, align 4
  ret <4 x i32> %wide

lane_loop:
  ; visit lanes 0..3 and test the mask bit of each one
  %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
  %lane_bit = shl i32 1, %lane
  %lane_and = and i32 %mm, %lane_bit
  %lane_on = icmp ne i32 %lane_and, 0
  br i1 %lane_on, label %lane_load, label %lane_done

lane_load:
  ; active lane: read just this element and stash it in the scratch vector
  %src = getelementptr i32 * %src_elts, i32 %lane
  %elt = load i32 * %src, align 4
  %dst = getelementptr i32 * %result_elts, i32 %lane
  store i32 %elt, i32 * %dst
  br label %lane_done

lane_done:
  %next_lane = add i32 %lane, 1
  %more = icmp ult i32 %next_lane, 4
  br i1 %more, label %lane_loop, label %gathered

gathered:
  %ret = load <4 x i32> * %result_ptr
  ret <4 x i32> %ret
}
;; Masked load of four consecutive i64 values starting at the address in the
;; first argument.  Only memory belonging to active lanes is guaranteed to be
;; readable, so:
;;   - if lanes 0 and 3 are both on, every byte in between lies between two
;;     addresses the caller is allowed to read, and a single vector load
;;     with 8-byte alignment is safe;
;;   - otherwise each active lane is loaded individually with a scalar load
;;     and inactive lanes are never touched (no loads at all when the mask
;;     is entirely off).
;; Lanes whose mask is off hold undefined values in the result.
;; NOTE(review): assumes @__movmsk sets bit i of its result exactly when
;; lane i of the mask is on -- confirm against its definition.
define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
entry:
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  ; 9 == (1 << 0) | (1 << 3): the bits for the first and the last lane
  %end_bits = and i32 %mm, 9
  %can_vload = icmp eq i32 %end_bits, 9
  ; scratch vector that collects the per-lane results on the slow path;
  ; lanes that are never written stay undefined
  %result_ptr = alloca <4 x i64>
  %result_elts = bitcast <4 x i64> * %result_ptr to i64 *
  %src_elts = bitcast i8 * %0 to i64 *
  br i1 %can_vload, label %vector_load, label %lane_loop

vector_load:
  %src_vec = bitcast i8 * %0 to <4 x i64> *
  %wide = load <4 x i64> * %src_vec, align 8
  ret <4 x i64> %wide

lane_loop:
  ; visit lanes 0..3 and test the mask bit of each one
  %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
  %lane_bit = shl i32 1, %lane
  %lane_and = and i32 %mm, %lane_bit
  %lane_on = icmp ne i32 %lane_and, 0
  br i1 %lane_on, label %lane_load, label %lane_done

lane_load:
  ; active lane: read just this element and stash it in the scratch vector
  %src = getelementptr i64 * %src_elts, i32 %lane
  %elt = load i64 * %src, align 8
  %dst = getelementptr i64 * %result_elts, i32 %lane
  store i64 %elt, i64 * %dst
  br label %lane_done

lane_done:
  %next_lane = add i32 %lane, 1
  %more = icmp ult i32 %next_lane, 4
  br i1 %more, label %lane_loop, label %gathered

gathered:
  %ret = load <4 x i64> * %result_ptr
  ret <4 x i64> %ret
}
;; Instantiate the shared load+broadcast and masked-load helpers for this
;; 4-wide target; presumably these expand to @__load_and_broadcast_{32,64}
;; and @__load_masked_{32,64}.
;; NOTE(review): the arguments look like (vector width, element type, name
;; suffix, and for the masked loads the alignment in bytes) -- confirm
;; against the macro definitions, which are not visible here.
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter