;; Patch summary (two files under builtins/).
;;
;; builtins/target-nvptx.ll -- @__movmsk
;;   Removes the call to @__ballot_nvptx and instead returns the single i1
;;   element extracted from the <1 x i1> argument, zero-extended to i64.
;;   The ballot-based variant is kept only as commented-out lines between the
;;   ";; if 0" / ";; else" / ";; endif" marker comments, with a note that it
;;   fails ./tests/popcnt-4.ispc and others. Those markers are plain comments,
;;   not preprocessor directives.
;;
;; builtins/util-nvptx.m4 -- @__new_uniform_32rt, @__new_varying32_32rt,
;;                           @__delete_uniform_32rt, @__delete_varying_32rt
;;   Removes the bodies of these four functions (which called @posix_memalign
;;   and @free) and leaves only "declare" lines for them.
;;   NOTE(review): where the definitions are expected to come from after this
;;   change is not visible in this patch -- confirm.
;;
;; NOTE(review): in this copy several signatures have no type before the "@"
;; name or before the %size / %mask / %ptr operands (for example
;; "define @__new_varying32_32rt( %size, %mask)"). Presumably angle-bracketed
;; vector types were lost in extraction -- confirm against the original patch
;; before applying it.
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index 9151adf8..1e8d0ae5 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -722,9 +722,16 @@ svml_stubs(double,d,WIDTH) define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %call = call i32 @__ballot_nvptx(i1 zeroext %v) - %v64 = zext i32 %call to i64 - ret i64 %v64 +;; if 0 + ;; this one fails with ./tests/popcnt-4.ispc and others ... +;; %v0 = call i32 @__ballot_nvptx(i1 %v) +;; %v64 = zext i32 %v0 to i64 + +;; else + ;; this one just copies mask + %v64 = zext i1 %v to i64 +;; endif + ret i64 %v64 } define i1 @__any(<1 x i1>) nounwind readnone alwaysinline { diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 index 04c63758..93870f15 100644 --- a/builtins/util-nvptx.m4 +++ b/builtins/util-nvptx.m4 @@ -2693,44 +2693,10 @@ declare i8* @malloc(i32) declare i32 @posix_memalign(i8**, i32, i32) declare void @free(i8 *) -define noalias i8 * @__new_uniform_32rt(i64 %size) { - %ptr = alloca i8* - %conv = trunc i64 %size to i32 - %alignment = load i32* @memory_alignment - %call1 = call i32 @posix_memalign(i8** %ptr, i32 %alignment, i32 %conv) - %ptr_val = load i8** %ptr - ret i8* %ptr_val -} - -define @__new_varying32_32rt( %size, %mask) { - %ret = alloca - store zeroinitializer, * %ret - %ret64 = bitcast * %ret to i64 * - %alignment = load i32* @memory_alignment - - per_lane(WIDTH, %mask, ` - %sz_LANE_ID = extractelement %size, i32 LANE - %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE - %ptr_LANE_ID = bitcast i64* %store_LANE_ID to i8** - %call_LANE_ID = call i32 @posix_memalign(i8** %ptr_LANE_ID, i32 %alignment, i32 %sz_LANE_ID)') - - %r = load * %ret - ret %r -} - -define void @__delete_uniform_32rt(i8 * %ptr) { - call void @free(i8 * %ptr) - ret void -} - -define void @__delete_varying_32rt( %ptr, %mask) { - per_lane(WIDTH, %mask, ` - %iptr_LANE_ID = extractelement %ptr, i32 LANE - %ptr_LANE_ID = inttoptr i64 
%iptr_LANE_ID to i8 * - call void @free(i8 * %ptr_LANE_ID) - ') - ret void -} +declare noalias i8 * @__new_uniform_32rt(i64 %size); +declare @__new_varying32_32rt( %size, %mask); +declare void @__delete_uniform_32rt(i8 * %ptr); +declare void @__delete_varying_32rt( %ptr, %mask); ', RUNTIME, `64',