diff --git a/builtins/builtins.c b/builtins/builtins.c index 36498e1a..8e1a5624 100644 --- a/builtins/builtins.c +++ b/builtins/builtins.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2010-2011, Intel Corporation + Copyright (c) 2010-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -70,7 +70,7 @@ typedef int Bool; putchar('['); \ for (int i = 0; i < width; ++i) { \ /* only print the value if the current lane is executing */ \ - if (mask & (1< @__min_varying_float(<16 x float>, declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone -define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <16 x i32> %0 to <16 x float> %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef, <8 x i32> @@ -186,7 +186,8 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline { %v1shift = shl i32 %v1, 8 %v = or i32 %v1shift, %v0 - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 53659b7c..608d2dcd 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -175,10 +175,11 @@ define <8 x float> @__min_varying_float(<8 x float>, declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone -define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <8 x i32> %0 to <8 x float> %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index ad911e64..5ced9da9 100755 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -186,14 +186,14 @@ define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>, ret void } -define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline { %item = extractelement <1 x i32> %0, i32 0 %v = lshr i32 %item, 31 - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding ;; diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 50daf23e..6bf90d95 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. 
;; ;; Redistribution and use in source and binary forms, with or without @@ -201,7 +201,7 @@ declare @__svml_pow(, ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions -declare i32 @__movmsk() nounwind readnone +declare i64 @__movmsk() nounwind readnone declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 2e6d1bdc..65d30939 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; first do two 4-wide movmsk calls %floatmask = bitcast <8 x i32> %0 to <8 x float> %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef, @@ -309,7 +309,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; of the second one %v1s = shl i32 %v1, 4 %v = or i32 %v0, %v1s - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } define <4 x float> @__vec4_add_float(<4 x float> %v0, diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 21ffb267..e6eb7390 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -239,10 +239,11 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <4 x i32> %0 to <4 x float> %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 5a467ec2..1ac6b3e5 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. 
;; ;; Redistribution and use in source and binary forms, with or without @@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; first do two 4-wide movmsk calls %floatmask = bitcast <8 x i32> %0 to <8 x float> %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef, @@ -251,7 +251,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline { ; of the second one %v1s = shl i32 %v1, 4 %v = or i32 %v0, %v1s - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 9dfe9db7..98426b24 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -271,10 +271,11 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone -define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { +define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { %floatmask = bitcast <4 x i32> %0 to <4 x float> %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone - ret i32 %v + %v64 = zext i32 %v to i64 + ret i64 %v64 } declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone diff --git a/builtins/util.m4 b/builtins/util.m4 index 023ca411..59185942 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -38,6 +38,18 @@ declare i1 @__is_compile_time_constant_uniform_int32(i32) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets... +define(`ALL_ON_MASK', +`ifelse(WIDTH, `64', `-1', + WIDTH, `32', `4294967295', + `eval((1< %mask) declare i1 @__is_compile_time_constant_varying_int32() @@ -2096,12 +2108,12 @@ ok: define void @__do_assert_varying(i8 *%str, %test, - %mask) { + %mask) { %nottest = xor %test, < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > %nottest_and_mask = and %nottest, %mask - %mm = call i32 @__movmsk( %nottest_and_mask) - %all_ok = icmp eq i32 %mm, 0 + %mm = call i64 @__movmsk( %nottest_and_mask) + %all_ok = icmp eq i64 %mm, 0 br i1 %all_ok, label %ok, label %fail fail: @@ -2505,12 +2517,16 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwa define(`masked_load', ` define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline { entry: - %mm = call i32 @__movmsk(<$1 x MASK> %mask) + %mm = call i64 @__movmsk(<$1 x MASK> %mask) ; if the first lane and the last lane are on, then it is safe to do a vector load ; of the whole thing--what the lanes in the middle want turns out to not matter... 
- %mm_and = and i32 %mm, eval(1 | (1<<($1-1))) - %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1))) + %mm_and_low = and i64 %mm, 1 + %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON + %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1) + %mm_and_low_i1 = trunc i64 %mm_and_low to i1 + %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1 + %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1 %fast32 = call i32 @__fast_masked_vload() %fast_i1 = trunc i32 %fast32 to i1 @@ -2529,9 +2545,10 @@ load: loop: ; loop over the lanes and see if each one is on... %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ] - %lanemask = shl i32 1, %lane - %mask_and = and i32 %mm, %lanemask - %do_lane = icmp ne i32 %mask_and, 0 + %lane64 = zext i32 %lane to i64 + %lanemask = shl i64 1, %lane64 + %mask_and = and i64 %mm, %lanemask + %do_lane = icmp ne i64 %mask_and, 0 br i1 %do_lane, label %load_lane, label %lane_done load_lane: @@ -2743,12 +2760,12 @@ define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk( %full_mask) + %mask = call i64 @__movmsk( %full_mask) %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) + %allon = icmp eq i64 %mask, ALL_ON_MASK br i1 %allon, label %all_on, label %unknown_mask all_on: @@ -2764,12 +2781,12 @@ unknown_mask: loop: %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ] - %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] + %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ] ; is the current lane on? - %and = and i32 %mask, %lanemask - %do_load = icmp eq i32 %and, %lanemask + %and = and i64 %mask, %lanemask + %do_load = icmp eq i64 %and, %lanemask br i1 %do_load, label %load, label %loopend load: @@ -2784,7 +2801,7 @@ load: loopend: %nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ] %nextlane = add i32 %lane, 1 - %nextlanemask = mul i32 %lanemask, 2 + %nextlanemask = mul i64 %lanemask, 2 ; are we done yet? %test = icmp ne i32 %nextlane, WIDTH @@ -2795,14 +2812,14 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk( %full_mask) + %mask = call i64 @__movmsk( %full_mask) %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) + %allon = icmp eq i64 %mask, ALL_ON_MASK br i1 %allon, label %all_on, label %unknown_mask all_on: @@ -2815,12 +2832,12 @@ unknown_mask: loop: %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ] - %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] + %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ] %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ] ; is the current lane on? 
- %and = and i32 %mask, %lanemask - %do_store = icmp eq i32 %and, %lanemask + %and = and i64 %mask, %lanemask + %do_store = icmp eq i64 %and, %lanemask br i1 %do_store, label %store, label %loopend store: @@ -2833,7 +2850,7 @@ store: loopend: %nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ] %nextlane = add i32 %lane, 1 - %nextlanemask = mul i32 %lanemask, 2 + %nextlanemask = mul i64 %lanemask, 2 ; are we done yet? %test = icmp ne i32 %nextlane, WIDTH @@ -2857,14 +2874,15 @@ define(`reduce_equal_aux', ` define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, <$1 x MASK> %mask) nounwind alwaysinline { entry: - %mm = call i32 @__movmsk(<$1 x MASK> %mask) - %allon = icmp eq i32 %mm, eval((1<<$1)-1) + %mm = call i64 @__movmsk(<$1 x MASK> %mask) + %allon = icmp eq i64 %mm, ALL_ON_MASK br i1 %allon, label %check_neighbors, label %domixed domixed: ; First, figure out which lane is the first active one - %first = call i32 @llvm.cttz.i32(i32 %mm) - %baseval = extractelement <$1 x $2> %v, i32 %first + %first = call i64 @llvm.cttz.i64(i64 %mm) + %first32 = trunc i64 %first to i32 + %baseval = extractelement <$1 x $2> %v, i32 %first32 %basev1 = bitcast $2 %baseval to <1 x $2> ; get a vector that is that value smeared across all elements %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef, @@ -2895,9 +2913,9 @@ check_neighbors: %eq = $5 eq <$1 x $2> %vec, %vr ifelse(MASK,i32, ` %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i32 @__movmsk(<$1 x MASK> %eq)') - %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1) + %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` ; But for 64-bit elements, it turns out to be more efficient to just @@ -3010,14 +3028,14 @@ define(`per_lane', ` br label %pl_entry pl_entry: - %pl_mask = call i32 @__movmsk($2) + %pl_mask = call i64 @__movmsk($2) %pl_mask_known = call i1 @__is_compile_time_constant_mask($2) br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask pl_known_mask: ;; the mask is known at compile time; see if it is something we can ;; handle more efficiently - %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1) + %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask pl_all_on: @@ -3039,11 +3057,11 @@ pl_unknown_mask: pl_loop: ;; Loop over each lane and see if we want to do the work for this lane %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ] - %pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] + %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] ; is the current lane on? if so, goto do work, otherwise to end of loop - %pl_and = and i32 %pl_mask, %pl_lanemask - %pl_doit = icmp eq i32 %pl_and, %pl_lanemask + %pl_and = and i64 %pl_mask, %pl_lanemask + %pl_doit = icmp eq i64 %pl_and, %pl_lanemask br i1 %pl_doit, label %pl_dolane, label %pl_loopend pl_dolane: @@ -3054,7 +3072,7 @@ pl_dolane: pl_loopend: %pl_nextlane = add i32 %pl_lane, 1 - %pl_nextlanemask = mul i32 %pl_lanemask, 2 + %pl_nextlanemask = mul i64 %pl_lanemask, 2 ; are we done yet? 
%pl_test = icmp ne i32 %pl_nextlane, $1 diff --git a/ctx.cpp b/ctx.cpp index 4e357873..11957ae2 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1254,16 +1254,19 @@ llvm::Value * FunctionEmitContext::Any(llvm::Value *mask) { llvm::Value *mmval = LaneMask(mask); return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval, - LLVMInt32(0), LLVMGetName(mask, "_any")); + LLVMInt64(0), LLVMGetName(mask, "_any")); } llvm::Value * FunctionEmitContext::All(llvm::Value *mask) { llvm::Value *mmval = LaneMask(mask); + llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ? + LLVMInt64(~0ull) : + LLVMInt64((1ull << g->target.vectorWidth) - 1); + return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval, - LLVMInt32((1<target.vectorWidth)-1), - LLVMGetName(mask, "_all")); + allOnMaskValue, LLVMGetName(mask, "_all")); } @@ -1271,14 +1274,14 @@ llvm::Value * FunctionEmitContext::None(llvm::Value *mask) { llvm::Value *mmval = LaneMask(mask); return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval, - LLVMInt32(0), LLVMGetName(mask, "_none")); + LLVMInt64(0), LLVMGetName(mask, "_none")); } llvm::Value * FunctionEmitContext::LaneMask(llvm::Value *v) { // Call the target-dependent movmsk function to turn the vector mask - // into an i32 value + // into an i64 value std::vector mm; m->symbolTable->LookupFunction("__movmsk", &mm); if (g->target.maskBitCount == 1) @@ -1396,7 +1399,7 @@ FunctionEmitContext::AddInstrumentationPoint(const char *note) { args.push_back(lGetStringAsValue(bblock, note)); // arg 3: line number args.push_back(LLVMInt32(currentPos.first_line)); - // arg 4: current mask, movmsk'ed down to an int32 + // arg 4: current mask, movmsk'ed down to an int64 args.push_back(LaneMask(GetFullMask())); llvm::Function *finst = m->module->getFunction("ISPCInstrument"); @@ -3196,10 +3199,12 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // pointer to be called. llvm::Value *currentMask = LoadInst(maskPtr); llvm::Function *cttz = - m->module->getFunction("__count_trailing_zeros_i32"); + m->module->getFunction("__count_trailing_zeros_i64"); AssertPos(currentPos, cttz != NULL); - llvm::Value *firstLane = CallInst(cttz, NULL, LaneMask(currentMask), - "first_lane"); + llvm::Value *firstLane64 = CallInst(cttz, NULL, LaneMask(currentMask), + "first_lane64"); + llvm::Value *firstLane = + TruncInst(firstLane64, LLVMTypes::Int32Type, "first_lane32"); // Get the pointer to the function we're going to call this // time through: ftpr = func[firstLane] diff --git a/ctx.h b/ctx.h index 304f8af1..10a22115 100644 --- a/ctx.h +++ b/ctx.h @@ -276,7 +276,7 @@ public: llvm::Value *None(llvm::Value *mask); /** Given a boolean mask value of type LLVMTypes::MaskType, return an - i32 value wherein the i'th bit is on if and only if the i'th lane + i64 value wherein the i'th bit is on if and only if the i'th lane of the mask is on. */ llvm::Value *LaneMask(llvm::Value *mask); diff --git a/docs/perfguide.rst b/docs/perfguide.rst index 6e8555bf..b8e65893 100644 --- a/docs/perfguide.rst +++ b/docs/perfguide.rst @@ -624,7 +624,7 @@ gathers happen.) 
extern "C" { void ISPCInstrument(const char *fn, const char *note, - int line, int mask); + int line, uint64_t mask); } This function is passed the file name of the ``ispc`` file running, a short @@ -637,7 +637,7 @@ as follows: :: - ISPCInstrument("foo.ispc", "function entry", 55, 0xf); + ISPCInstrument("foo.ispc", "function entry", 55, 0xfull); This call indicates that at the currently executing program has just entered the function defined at line 55 of the file ``foo.ispc``, with a diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 57eba63f..80c2635c 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -311,8 +311,8 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) { - return mask.v; +static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { + return (uint64_t)mask.v; } static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) { diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 5fe22b78..9f301bb7 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2010-2011, Intel Corporation + Copyright (c) 2010-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -224,8 +224,8 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) { - return _mm_movemask_ps(mask.v); +static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) { + return (uint64_t)_mm_movemask_ps(mask.v); } static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) { diff --git a/ispc.h b/ispc.h index d0837110..4cbbce7d 100644 --- a/ispc.h +++ b/ispc.h @@ -61,7 +61,7 @@ /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation targets. */ -#define ISPC_MAX_NVEC 32 +#define ISPC_MAX_NVEC 64 // Forward declarations of a number of widely-used LLVM types namespace llvm { diff --git a/module.cpp b/module.cpp index b5afc875..d16916be 100644 --- a/module.cpp +++ b/module.cpp @@ -1228,7 +1228,7 @@ Module::writeHeader(const char *fn) { if (g->emitInstrumentation) { fprintf(f, "#define ISPC_INSTRUMENTATION 1\n"); fprintf(f, "extern \"C\" {\n"); - fprintf(f, " void ISPCInstrument(const char *fn, const char *note, int line, int mask);\n"); + fprintf(f, " void ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask);\n"); fprintf(f, "}\n"); } diff --git a/opt.cpp b/opt.cpp index df4dd572..ce455d6f 100644 --- a/opt.cpp +++ b/opt.cpp @@ -269,12 +269,12 @@ lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name, execution mask, convert it to a bitvector where the 0th bit corresponds to the first vector value and so forth. */ -static uint32_t +static uint64_t lConstElementsToMask(const llvm::SmallVector &elements) { - Assert(elements.size() <= 32); + Assert(elements.size() <= 64); - uint32_t mask = 0; + uint64_t mask = 0; for (unsigned int i = 0; i < elements.size(); ++i) { llvm::APInt intMaskValue; // SSE has the "interesting" approach of encoding blending @@ -293,7 +293,7 @@ lConstElementsToMask(const llvm::SmallVector 0) - mask |= (1 << i); + mask |= (1ull << i); } return mask; } @@ -306,7 +306,7 @@ lConstElementsToMask(const llvm::SmallVector, we have 0b1001 = 9. 
*/ static bool -lGetMask(llvm::Value *factor, uint32_t *mask) { +lGetMask(llvm::Value *factor, uint64_t *mask) { #ifndef LLVM_3_0 llvm::ConstantDataVector *cdv = llvm::dyn_cast(factor); if (cdv != NULL) { @@ -364,7 +364,7 @@ enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN }; */ static MaskStatus lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { - uint32_t bits; + uint64_t bits; if (lGetMask(mask, &bits) == false) return UNKNOWN; @@ -373,7 +373,7 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { if (vecWidth == -1) vecWidth = g->target.vectorWidth; - Assert(vecWidth <= 32); + Assert(vecWidth <= 64); for (int i = 0; i < vecWidth; ++i) { if ((bits & (1ull << i)) == 0) @@ -601,12 +601,12 @@ private: instruction for this optimization pass. */ struct BlendInstruction { - BlendInstruction(llvm::Function *f, uint32_t ao, int o0, int o1, int of) + BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of) : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { } /** Function pointer for the blend instruction */ llvm::Function *function; /** Mask value for an "all on" mask for this instruction */ - uint32_t allOnMask; + uint64_t allOnMask; /** The operand number in the llvm CallInst corresponds to the first operand to blend with. */ int op0; @@ -728,7 +728,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { goto restart; } - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { llvm::Value *value = NULL; if (mask == 0) @@ -748,12 +748,13 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { } else if (matchesMaskInstruction(callInst->getCalledFunction())) { llvm::Value *factor = callInst->getArgOperand(0); - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { // If the vector-valued mask has a known value, replace it // with the corresponding integer mask from its elements // high bits. - llvm::Value *value = LLVMInt32(mask); + llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ? + LLVMInt32(mask) : LLVMInt64(mask); llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value); modifiedAny = true; @@ -763,7 +764,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { else if (callInst->getCalledFunction() == avxMaskedLoad32 || callInst->getCalledFunction() == avxMaskedLoad64) { llvm::Value *factor = callInst->getArgOperand(1); - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { if (mask == 0) { // nothing being loaded, replace with undef value @@ -802,7 +803,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { callInst->getCalledFunction() == avxMaskedStore64) { // NOTE: mask is the 2nd parameter, not the 3rd one!! 
llvm::Value *factor = callInst->getArgOperand(1); - uint32_t mask; + uint64_t mask; if (lGetMask(factor, &mask) == true) { if (mask == 0) { // nothing actually being stored, just remove the inst @@ -931,7 +932,7 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) continue; - uint32_t mask; + uint64_t mask; if (lGetMask(callInst->getArgOperand(0), &mask) == true) { #if 0 fprintf(stderr, "mask %d\n", mask); @@ -939,7 +940,7 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { fprintf(stderr, "-----------\n"); #endif llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt32(mask)); + iter, LLVMInt64(mask)); modifiedAny = true; goto restart; } diff --git a/stdlib.ispc b/stdlib.ispc index 9b2fe17d..4cfcdea4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -355,7 +355,8 @@ static inline uniform bool all(bool v) { #else int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); #endif - return __movmsk(match) == (1 << programCount) - 1; + return __movmsk(match) == ((programCount == 64) ? ~0ull : + ((1ull << programCount) - 1)); } __declspec(safe) @@ -388,14 +389,14 @@ __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes #ifdef ISPC_TARGET_GENERIC - return __popcnt_int32(__movmsk(v & __mask)); + return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); #endif } __declspec(safe) -static inline uniform int lanemask() { +static inline uniform unsigned int64 lanemask() { return __movmsk(__mask); } @@ -1615,12 +1616,12 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ TA ret[programCount]; \ TA memVal; \ uniform int lastSwap; \ - uniform int mask = lanemask(); \ + uniform unsigned int64 mask = lanemask(); \ /* First, have the first running program instance (if any) perform \ the swap with memory with its value of "value"; record the \ value returned. */ \ for (; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ + if ((mask & (1ull << i)) == 0) \ continue; \ memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \ lastSwap = i; \ @@ -1632,7 +1633,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ current instance had executed a hardware atomic swap right before \ the last one that did a swap. */ \ for (; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ + if ((mask & (1ull << i)) == 0) \ continue; \ ret[lastSwap] = extract(value, i); \ lastSwap = i; \ diff --git a/stmt.cpp b/stmt.cpp index 6a6f58e5..11cc94ea 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -2843,7 +2843,7 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) { pos); // Compute the per lane mask to test the mask bits against: (1 << iter) - ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, 1, + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, 1ll, iterSym->pos); Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr, pos); @@ -2863,4 +2863,3 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) { // And return a for loop that wires it all together. return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos); } -
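
Note on the instrumentation ABI: the docs/perfguide.rst and module.cpp hunks above widen the mask argument of the ISPCInstrument callback from int to uint64_t so that all lanes of a 64-wide target are visible to the callback. The following is a minimal sketch of a host-side callback matching the updated prototype; the lane-counting logic and the activeLaneHistogram name are illustrative assumptions, not part of this patch.

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>

    // Hypothetical per-call-site tally of active lanes (not in the patch).
    static std::map<std::string, uint64_t> activeLaneHistogram;

    extern "C" void ISPCInstrument(const char *fn, const char *note,
                                   int line, uint64_t mask) {
        // Count active lanes; with the widened mask this works for targets
        // with up to 64 program instances.
        int active = 0;
        for (int i = 0; i < 64; ++i)
            if (mask & (1ull << i))
                ++active;
        activeLaneHistogram[std::string(fn) + ":" + note] += (uint64_t)active;
        std::printf("%s(%d) %s: %d lanes active (mask 0x%016llx)\n",
                    fn, line, note, active, (unsigned long long)mask);
    }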
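
Note on the all-on mask: the ctx.cpp All() hunk and the ALL_ON_MASK m4 macro both special-case the 64-wide target because the old expression (1 << width) - 1 cannot be evaluated for width == 64 (a 64-bit shift by 64 is undefined). A sketch of the same computation in plain C++ is below; allOnMask is a hypothetical helper name used only for illustration.

    #include <cstdint>

    // Mirrors the logic added in FunctionEmitContext::All() and in the
    // ALL_ON_MASK m4 define: 64-wide targets get ~0, narrower targets get
    // the low `vectorWidth` bits set.
    static inline uint64_t allOnMask(int vectorWidth) {
        return (vectorWidth == 64) ? ~0ull : ((1ull << vectorWidth) - 1);
    }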
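
Note on per-lane iteration: the stdlib.ispc and util.m4 hunks replace 32-bit lane bits (1 << i, i32 shl/phi values) with 64-bit ones (1ull << i, i64 shl/phi values) so that lanes 32 through 63 are reachable. A hedged sketch of that iteration pattern over the widened __movmsk result follows; forEachActiveLane is a hypothetical name, not something defined by the patch.

    #include <cstdint>

    // Call fn(lane) for every lane whose bit is set in the 64-bit mask.
    template <typename Fn>
    static inline void forEachActiveLane(uint64_t mask, int width, Fn &&fn) {
        for (int lane = 0; lane < width; ++lane) {
            // 1ull (not 1) keeps the shift well-defined beyond lane 31,
            // matching the 1ull << i change in the atomic_swap loops.
            if (mask & (1ull << lane))
                fn(lane);
        }
    }

    // Usage sketch: forEachActiveLane(mask, 64, [](int lane) { /* ... */ });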