diff --git a/builtins/builtins.c b/builtins/builtins.c index 36498e1a..8e1a5624 100644 --- a/builtins/builtins.c +++ b/builtins/builtins.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2010-2011, Intel Corporation + Copyright (c) 2010-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -70,7 +70,7 @@ typedef int Bool; putchar('['); \ for (int i = 0; i < width; ++i) { \ /* only print the value if the current lane is executing */ \ - if (mask & (1< @__min_varying_float(<16 x float>, declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone -define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <16 x i32> %0 to <16 x float> %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef, <8 x i32> @@ -186,7 +186,8 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline { %v1shift = shl i32 %v1, 8 %v = or i32 %v1shift, %v0 - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 53659b7c..608d2dcd 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -175,10 +175,11 @@ define <8 x float> @__min_varying_float(<8 x float>, declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone -define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <8 x i32> %0 to <8 x float> %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index ad911e64..5ced9da9 100755 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -186,14 +186,14 @@ define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>, ret void } -define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline { %item = extractelement <1 x i32> %0, i32 0 %v = lshr i32 %item, 31 - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding ;; diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 50daf23e..6bf90d95 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. 
;; ;; Redistribution and use in source and binary forms, with or without @@ -201,7 +201,7 @@ declare @__svml_pow(, ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions -declare i32 @__movmsk() nounwind readnone +declare i64 @__movmsk() nounwind readnone declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 2e6d1bdc..65d30939 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; first do two 4-wide movmsk calls %floatmask = bitcast <8 x i32> %0 to <8 x float> %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef, @@ -309,7 +309,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; of the second one %v1s = shl i32 %v1, 4 %v = or i32 %v0, %v1s - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } define <4 x float> @__vec4_add_float(<4 x float> %v0, diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 21ffb267..e6eb7390 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -239,10 +239,11 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <4 x i32> %0 to <4 x float> %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 5a467ec2..1ac6b3e5 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. 
;; ;; Redistribution and use in source and binary forms, with or without @@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; first do two 4-wide movmsk calls %floatmask = bitcast <8 x i32> %0 to <8 x float> %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef, @@ -251,7 +251,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; of the second one %v1s = shl i32 %v1, 4 %v = or i32 %v0, %v1s - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 9dfe9db7..98426b24 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -271,10 +271,11 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <4 x i32> %0 to <4 x float> %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone diff --git a/builtins/util.m4 b/builtins/util.m4 index 023ca411..59185942 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -38,6 +38,18 @@ declare i1 @__is_compile_time_constant_uniform_int32(i32) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets... +define(`ALL_ON_MASK', +`ifelse(WIDTH, `64', `-1', + WIDTH, `32', `4294967295', + `eval((1< %mask) declare i1 @__is_compile_time_constant_varying_int32() @@ -2096,12 +2108,12 @@ ok: define void @__do_assert_varying(i8 *%str, %test, - %mask) { + %mask) { %nottest = xor %test, < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > %nottest_and_mask = and %nottest, %mask - %mm = call i32 @__movmsk( %nottest_and_mask) - %all_ok = icmp eq i32 %mm, 0 + %mm = call i64 @__movmsk( %nottest_and_mask) + %all_ok = icmp eq i64 %mm, 0 br i1 %all_ok, label %ok, label %fail fail: @@ -2505,12 +2517,16 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwa define(`masked_load', ` define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline { entry: - %mm = call i32 @__movmsk(<$1 x MASK> %mask) + %mm = call i64 @__movmsk(<$1 x MASK> %mask) ; if the first lane and the last lane are on, then it is safe to do a vector load ; of the whole thing--what the lanes in the middle want turns out to not matter... 
- %mm_and = and i32 %mm, eval(1 | (1<<($1-1))) - %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1))) + %mm_and_low = and i64 %mm, 1 + %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON + %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1) + %mm_and_low_i1 = trunc i64 %mm_and_low to i1 + %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1 + %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1 %fast32 = call i32 @__fast_masked_vload() %fast_i1 = trunc i32 %fast32 to i1 @@ -2529,9 +2545,10 @@ load: loop: ; loop over the lanes and see if each one is on... %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ] - %lanemask = shl i32 1, %lane - %mask_and = and i32 %mm, %lanemask - %do_lane = icmp ne i32 %mask_and, 0 + %lane64 = zext i32 %lane to i64 + %lanemask = shl i64 1, %lane64 + %mask_and = and i64 %mm, %lanemask + %do_lane = icmp ne i64 %mask_and, 0 br i1 %do_lane, label %load_lane, label %lane_done load_lane: @@ -2743,12 +2760,12 @@ define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk( %full_mask) + %mask = call i64 @__movmsk( %full_mask) %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) + %allon = icmp eq i64 %mask, ALL_ON_MASK br i1 %allon, label %all_on, label %unknown_mask all_on: @@ -2764,12 +2781,12 @@ unknown_mask: loop: %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ] - %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] + %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ] ; is the current lane on? - %and = and i32 %mask, %lanemask - %do_load = icmp eq i32 %and, %lanemask + %and = and i64 %mask, %lanemask + %do_load = icmp eq i64 %and, %lanemask br i1 %do_load, label %load, label %loopend load: @@ -2784,7 +2801,7 @@ load: loopend: %nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ] %nextlane = add i32 %lane, 1 - %nextlanemask = mul i32 %lanemask, 2 + %nextlanemask = mul i64 %lanemask, 2 ; are we done yet? %test = icmp ne i32 %nextlane, WIDTH @@ -2795,14 +2812,14 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk( %full_mask) + %mask = call i64 @__movmsk( %full_mask) %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) + %allon = icmp eq i64 %mask, ALL_ON_MASK br i1 %allon, label %all_on, label %unknown_mask all_on: @@ -2815,12 +2832,12 @@ unknown_mask: loop: %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ] - %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] + %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ] ; is the current lane on? 
- %and = and i32 %mask, %lanemask - %do_store = icmp eq i32 %and, %lanemask + %and = and i64 %mask, %lanemask + %do_store = icmp eq i64 %and, %lanemask br i1 %do_store, label %store, label %loopend store: @@ -2833,7 +2850,7 @@ store: loopend: %nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ] %nextlane = add i32 %lane, 1 - %nextlanemask = mul i32 %lanemask, 2 + %nextlanemask = mul i64 %lanemask, 2 ; are we done yet? %test = icmp ne i32 %nextlane, WIDTH @@ -2857,14 +2874,15 @@ define(`reduce_equal_aux', ` define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, <$1 x MASK> %mask) nounwind alwaysinline { entry: - %mm = call i32 @__movmsk(<$1 x MASK> %mask) - %allon = icmp eq i32 %mm, eval((1<<$1)-1) + %mm = call i64 @__movmsk(<$1 x MASK> %mask) + %allon = icmp eq i64 %mm, ALL_ON_MASK br i1 %allon, label %check_neighbors, label %domixed domixed: ; First, figure out which lane is the first active one - %first = call i32 @llvm.cttz.i32(i32 %mm) - %baseval = extractelement <$1 x $2> %v, i32 %first + %first = call i64 @llvm.cttz.i64(i64 %mm) + %first32 = trunc i64 %first to i32 + %baseval = extractelement <$1 x $2> %v, i32 %first32 %basev1 = bitcast $2 %baseval to <1 x $2> ; get a vector that is that value smeared across all elements %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef, @@ -2895,9 +2913,9 @@ check_neighbors: %eq = $5 eq <$1 x $2> %vec, %vr ifelse(MASK,i32, ` %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i32 @__movmsk(<$1 x MASK> %eq)') - %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1) + %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` ; But for 64-bit elements, it turns out to be more efficient to just @@ -3010,14 +3028,14 @@ define(`per_lane', ` br label %pl_entry pl_entry: - %pl_mask = call i32 @__movmsk($2) + %pl_mask = call i64 @__movmsk($2) %pl_mask_known = call i1 @__is_compile_time_constant_mask($2) br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask pl_known_mask: ;; the mask is known at compile time; see if it is something we can ;; handle more efficiently - %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1) + %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask pl_all_on: @@ -3039,11 +3057,11 @@ pl_unknown_mask: pl_loop: ;; Loop over each lane and see if we want to do the work for this lane %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ] - %pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] + %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] ; is the current lane on? if so, goto do work, otherwise to end of loop - %pl_and = and i32 %pl_mask, %pl_lanemask - %pl_doit = icmp eq i32 %pl_and, %pl_lanemask + %pl_and = and i64 %pl_mask, %pl_lanemask + %pl_doit = icmp eq i64 %pl_and, %pl_lanemask br i1 %pl_doit, label %pl_dolane, label %pl_loopend pl_dolane: @@ -3054,7 +3072,7 @@ pl_dolane: pl_loopend: %pl_nextlane = add i32 %pl_lane, 1 - %pl_nextlanemask = mul i32 %pl_lanemask, 2 + %pl_nextlanemask = mul i64 %pl_lanemask, 2 ; are we done yet? 
%pl_test = icmp ne i32 %pl_nextlane, $1 diff --git a/ctx.cpp b/ctx.cpp index 4e357873..11957ae2 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1254,16 +1254,19 @@ llvm::Value * FunctionEmitContext::Any(llvm::Value *mask) { llvm::Value *mmval = LaneMask(mask); return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval, - LLVMInt32(0), LLVMGetName(mask, "_any")); + LLVMInt64(0), LLVMGetName(mask, "_any")); } llvm::Value * FunctionEmitContext::All(llvm::Value *mask) { llvm::Value *mmval = LaneMask(mask); + llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ? + LLVMInt64(~0ull) : + LLVMInt64((1ull << g->target.vectorWidth) - 1); + return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval, - LLVMInt32((1<target.vectorWidth)-1), - LLVMGetName(mask, "_all")); + allOnMaskValue, LLVMGetName(mask, "_all")); } @@ -1271,14 +1274,14 @@ llvm::Value * FunctionEmitContext::None(llvm::Value *mask) { llvm::Value *mmval = LaneMask(mask); return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval, - LLVMInt32(0), LLVMGetName(mask, "_none")); + LLVMInt64(0), LLVMGetName(mask, "_none")); } llvm::Value * FunctionEmitContext::LaneMask(llvm::Value *v) { // Call the target-dependent movmsk function to turn the vector mask - // into an i32 value + // into an i64 value std::vector mm; m->symbolTable->LookupFunction("__movmsk", &mm); if (g->target.maskBitCount == 1) @@ -1396,7 +1399,7 @@ FunctionEmitContext::AddInstrumentationPoint(const char *note) { args.push_back(lGetStringAsValue(bblock, note)); // arg 3: line number args.push_back(LLVMInt32(currentPos.first_line)); - // arg 4: current mask, movmsk'ed down to an int32 + // arg 4: current mask, movmsk'ed down to an int64 args.push_back(LaneMask(GetFullMask())); llvm::Function *finst = m->module->getFunction("ISPCInstrument"); @@ -3196,10 +3199,12 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // pointer to be called. llvm::Value *currentMask = LoadInst(maskPtr); llvm::Function *cttz = - m->module->getFunction("__count_trailing_zeros_i32"); + m->module->getFunction("__count_trailing_zeros_i64"); AssertPos(currentPos, cttz != NULL); - llvm::Value *firstLane = CallInst(cttz, NULL, LaneMask(currentMask), - "first_lane"); + llvm::Value *firstLane64 = CallInst(cttz, NULL, LaneMask(currentMask), + "first_lane64"); + llvm::Value *firstLane = + TruncInst(firstLane64, LLVMTypes::Int32Type, "first_lane32"); // Get the pointer to the function we're going to call this // time through: ftpr = func[firstLane] diff --git a/ctx.h b/ctx.h index 304f8af1..10a22115 100644 --- a/ctx.h +++ b/ctx.h @@ -276,7 +276,7 @@ public: llvm::Value *None(llvm::Value *mask); /** Given a boolean mask value of type LLVMTypes::MaskType, return an - i32 value wherein the i'th bit is on if and only if the i'th lane + i64 value wherein the i'th bit is on if and only if the i'th lane of the mask is on. */ llvm::Value *LaneMask(llvm::Value *mask); diff --git a/docs/perfguide.rst b/docs/perfguide.rst index 6e8555bf..b8e65893 100644 --- a/docs/perfguide.rst +++ b/docs/perfguide.rst @@ -624,7 +624,7 @@ gathers happen.) 
extern "C" { void ISPCInstrument(const char *fn, const char *note, - int line, int mask); + int line, uint64_t mask); } This function is passed the file name of the ``ispc`` file running, a short @@ -637,7 +637,7 @@ as follows: :: - ISPCInstrument("foo.ispc", "function entry", 55, 0xf); + ISPCInstrument("foo.ispc", "function entry", 55, 0xfull); This call indicates that at the currently executing program has just entered the function defined at line 55 of the file ``foo.ispc``, with a diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 57eba63f..80c2635c 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -311,8 +311,8 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) { - return mask.v; +static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { + return (uint64_t)mask.v; } static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) { diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 5fe22b78..9f301bb7 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2010-2011, Intel Corporation + Copyright (c) 2010-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -224,8 +224,8 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) { - return _mm_movemask_ps(mask.v); +static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) { + return (uint64_t)_mm_movemask_ps(mask.v); } static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) { diff --git a/ispc.h b/ispc.h index d0837110..4cbbce7d 100644 --- a/ispc.h +++ b/ispc.h @@ -61,7 +61,7 @@ /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation targets. */ -#define ISPC_MAX_NVEC 32 +#define ISPC_MAX_NVEC 64 // Forward declarations of a number of widely-used LLVM types namespace llvm { diff --git a/module.cpp b/module.cpp index b5afc875..d16916be 100644 --- a/module.cpp +++ b/module.cpp @@ -1228,7 +1228,7 @@ Module::writeHeader(const char *fn) { if (g->emitInstrumentation) { fprintf(f, "#define ISPC_INSTRUMENTATION 1\n"); fprintf(f, "extern \"C\" {\n"); - fprintf(f, " void ISPCInstrument(const char *fn, const char *note, int line, int mask);\n"); + fprintf(f, " void ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask);\n"); fprintf(f, "}\n"); } diff --git a/opt.cpp b/opt.cpp index df4dd572..ce455d6f 100644 --- a/opt.cpp +++ b/opt.cpp @@ -269,12 +269,12 @@ lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name, execution mask, convert it to a bitvector where the 0th bit corresponds to the first vector value and so forth. */ -static uint32_t +static uint64_t lConstElementsToMask(const llvm::SmallVector &elements) { - Assert(elements.size() <= 32); + Assert(elements.size() <= 64); - uint32_t mask = 0; + uint64_t mask = 0; for (unsigned int i = 0; i < elements.size(); ++i) { llvm::APInt intMaskValue; // SSE has the "interesting" approach of encoding blending @@ -293,7 +293,7 @@ lConstElementsToMask(const llvm::SmallVector 0) - mask |= (1 << i); + mask |= (1ull << i); } return mask; } @@ -306,7 +306,7 @@ lConstElementsToMask(const llvm::SmallVector, we have 0b1001 = 9. 
*/ static bool -lGetMask(llvm::Value *factor, uint32_t *mask) { +lGetMask(llvm::Value *factor, uint64_t *mask) { #ifndef LLVM_3_0 llvm::ConstantDataVector *cdv = llvm::dyn_cast(factor); if (cdv != NULL) { @@ -364,7 +364,7 @@ enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN }; */ static MaskStatus lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { - uint32_t bits; + uint64_t bits; if (lGetMask(mask, &bits) == false) return UNKNOWN; @@ -373,7 +373,7 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { if (vecWidth == -1) vecWidth = g->target.vectorWidth; - Assert(vecWidth <= 32); + Assert(vecWidth <= 64); for (int i = 0; i < vecWidth; ++i) { if ((bits & (1ull << i)) == 0) @@ -601,12 +601,12 @@ private: instruction for this optimization pass. */ struct BlendInstruction { - BlendInstruction(llvm::Function *f, uint32_t ao, int o0, int o1, int of) + BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of) : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { } /** Function pointer for the blend instruction */ llvm::Function *function; /** Mask value for an "all on" mask for this instruction */ - uint32_t allOnMask; + uint64_t allOnMask; /** The operand number in the llvm CallInst corresponds to the first operand to blend with. */ int op0; @@ -728,7 +728,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { goto restart; } - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { llvm::Value *value = NULL; if (mask == 0) @@ -748,12 +748,13 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { } else if (matchesMaskInstruction(callInst->getCalledFunction())) { llvm::Value *factor = callInst->getArgOperand(0); - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { // If the vector-valued mask has a known value, replace it // with the corresponding integer mask from its elements // high bits. - llvm::Value *value = LLVMInt32(mask); + llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ? + LLVMInt32(mask) : LLVMInt64(mask); llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value); modifiedAny = true; @@ -763,7 +764,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { else if (callInst->getCalledFunction() == avxMaskedLoad32 || callInst->getCalledFunction() == avxMaskedLoad64) { llvm::Value *factor = callInst->getArgOperand(1); - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { if (mask == 0) { // nothing being loaded, replace with undef value @@ -802,7 +803,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { callInst->getCalledFunction() == avxMaskedStore64) { // NOTE: mask is the 2nd parameter, not the 3rd one!! 
llvm::Value *factor = callInst->getArgOperand(1); - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { if (mask == 0) { // nothing actually being stored, just remove the inst @@ -931,7 +932,7 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) continue; - uint32_t mask; + uint64_t mask; if (lGetMask(callInst->getArgOperand(0), &mask) == true) { #if 0 fprintf(stderr, "mask %d\n", mask); @@ -939,7 +940,7 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { fprintf(stderr, "-----------\n"); #endif llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt32(mask)); + iter, LLVMInt64(mask)); modifiedAny = true; goto restart; } diff --git a/stdlib.ispc b/stdlib.ispc index 9b2fe17d..4cfcdea4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -355,7 +355,8 @@ static inline uniform bool all(bool v) { #else int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); #endif - return __movmsk(match) == (1 << programCount) - 1; + return __movmsk(match) == ((programCount == 64) ? ~0ull : + ((1ull << programCount) - 1)); } __declspec(safe) @@ -388,14 +389,14 @@ __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes #ifdef ISPC_TARGET_GENERIC - return __popcnt_int32(__movmsk(v & __mask)); + return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); #endif } __declspec(safe) -static inline uniform int lanemask() { +static inline uniform unsigned int64 lanemask() { return __movmsk(__mask); } @@ -1615,12 +1616,12 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ TA ret[programCount]; \ TA memVal; \ uniform int lastSwap; \ - uniform int mask = lanemask(); \ + uniform unsigned int64 mask = lanemask(); \ /* First, have the first running program instance (if any) perform \ the swap with memory with its value of "value"; record the \ value returned. */ \ for (; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ + if ((mask & (1ull << i)) == 0) \ continue; \ memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \ lastSwap = i; \ @@ -1632,7 +1633,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ current instance had executed a hardware atomic swap right before \ the last one that did a swap. */ \ for (; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ + if ((mask & (1ull << i)) == 0) \ continue; \ ret[lastSwap] = extract(value, i); \ lastSwap = i; \ diff --git a/stmt.cpp b/stmt.cpp index 6a6f58e5..11cc94ea 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -2843,7 +2843,7 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) { pos); // Compute the per lane mask to test the mask bits against: (1 << iter) - ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, 1, + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, 1ll, iterSym->pos); Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr, pos); @@ -2863,4 +2863,3 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) { // And return a for loop that wires it all together. return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos); } -
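
Note on the instrumentation ABI: the docs/perfguide.rst and module.cpp hunks above widen the mask argument of the ISPCInstrument callback from int to uint64_t so that all lanes of a 64-wide target are visible to the callback. The following is a minimal sketch of a host-side callback matching the updated prototype; the lane-counting logic and the activeLaneHistogram name are illustrative assumptions, not part of this patch.

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>

    // Hypothetical per-call-site tally of active lanes (not in the patch).
    static std::map<std::string, uint64_t> activeLaneHistogram;

    extern "C" void ISPCInstrument(const char *fn, const char *note,
                                   int line, uint64_t mask) {
        // Count active lanes; with the widened mask this works for targets
        // with up to 64 program instances.
        int active = 0;
        for (int i = 0; i < 64; ++i)
            if (mask & (1ull << i))
                ++active;
        activeLaneHistogram[std::string(fn) + ":" + note] += (uint64_t)active;
        std::printf("%s(%d) %s: %d lanes active (mask 0x%016llx)\n",
                    fn, line, note, active, (unsigned long long)mask);
    }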
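
Note on the all-on mask: the ctx.cpp All() hunk and the ALL_ON_MASK m4 macro both special-case the 64-wide target because the old expression (1 << width) - 1 cannot be evaluated for width == 64 (a 64-bit shift by 64 is undefined). A sketch of the same computation in plain C++ is below; allOnMask is a hypothetical helper name used only for illustration.

    #include <cstdint>

    // Mirrors the logic added in FunctionEmitContext::All() and in the
    // ALL_ON_MASK m4 define: 64-wide targets get ~0, narrower targets get
    // the low `vectorWidth` bits set.
    static inline uint64_t allOnMask(int vectorWidth) {
        return (vectorWidth == 64) ? ~0ull : ((1ull << vectorWidth) - 1);
    }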
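
Note on per-lane iteration: the stdlib.ispc and util.m4 hunks replace 32-bit lane bits (1 << i, i32 shl/phi values) with 64-bit ones (1ull << i, i64 shl/phi values) so that lanes 32 through 63 are reachable. A hedged sketch of that iteration pattern over the widened __movmsk result follows; forEachActiveLane is a hypothetical name, not something defined by the patch.

    #include <cstdint>

    // Call fn(lane) for every lane whose bit is set in the 64-bit mask.
    template <typename Fn>
    static inline void forEachActiveLane(uint64_t mask, int width, Fn &&fn) {
        for (int lane = 0; lane < width; ++lane) {
            // 1ull (not 1) keeps the shift well-defined beyond lane 31,
            // matching the 1ull << i change in the atomic_swap loops.
            if (mask & (1ull << lane))
                fn(lane);
        }
    }

    // Usage sketch: forEachActiveLane(mask, 64, [](int lane) { /* ... */ });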