diff --git a/Makefile b/Makefile
index 43f41e09..92debe4f 100644
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+TARGETS=avx-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
 	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 ifneq ($(ARM_ENABLED), 0)
@@ -160,7 +160,7 @@ BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
 OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
-	stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
+	stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \
 	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))

 default: ispc
@@ -268,20 +268,25 @@ objs/builtins-c-64.cpp: builtins/builtins.c

 objs/stdlib_mask1_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask1
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 	    python stdlib2cpp.py mask1 > $@

 objs/stdlib_mask8_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask8
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 	    python stdlib2cpp.py mask8 > $@

 objs/stdlib_mask16_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask16
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 	    python stdlib2cpp.py mask16 > $@

 objs/stdlib_mask32_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask32
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 	    python stdlib2cpp.py mask32 > $@
+
+objs/stdlib_mask64_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for mask64
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
+	    python stdlib2cpp.py mask64 > $@
diff --git a/builtins.cpp b/builtins.cpp
index 816d4d78..f8d4136e 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -302,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
         // check the llvm.x86.* intrinsics for now...
         if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
             llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
+            if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str());
             Assert(id != 0);
             llvm::Type *intrinsicType =
                 llvm::Intrinsic::getType(*g->ctx, id);
@@ -936,10 +937,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         switch (g->target->getVectorWidth()) {
         case 4:
             if (runtime32) {
-                EXPORT_MODULE(builtins_bitcode_avxh_32bit);
+                EXPORT_MODULE(builtins_bitcode_avx_i64x4_32bit);
             }
             else {
-                EXPORT_MODULE(builtins_bitcode_avxh_64bit);
+                EXPORT_MODULE(builtins_bitcode_avx_i64x4_64bit);
             }
             break;
         case 8:
@@ -1105,7 +1106,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         // serialized version of the stdlib.ispc file to get its
         // definitions added.
         extern char stdlib_mask1_code[], stdlib_mask8_code[];
-        extern char stdlib_mask16_code[], stdlib_mask32_code[];
+        extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[];
         if (g->target->getISA() == Target::GENERIC &&
             g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib
             yy_scan_string(stdlib_mask32_code);
@@ -1124,6 +1125,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
             case 32:
                 yy_scan_string(stdlib_mask32_code);
                 break;
+            case 64:
+                yy_scan_string(stdlib_mask64_code);
+                break;
             default:
                 FATAL("Unhandled mask bit size for stdlib.ispc");
             }
diff --git a/builtins/target-avxh.ll b/builtins/target-avx-i64x4.ll
similarity index 98%
rename from builtins/target-avxh.ll
rename to builtins/target-avx-i64x4.ll
index 98c9111d..d7dbb6bd 100644
--- a/builtins/target-avxh.ll
+++ b/builtins/target-avx-i64x4.ll
@@ -29,7 +29,7 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-include(`target-avx-h.ll')
+include(`target-avx-i64x4base.ll')

 rdrand_decls()
diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-i64x4base.ll
similarity index 78%
rename from builtins/target-avx-h.ll
rename to builtins/target-avx-i64x4base.ll
index 283eaddd..05bf178d 100644
--- a/builtins/target-avx-h.ll
+++ b/builtins/target-avx-i64x4base.ll
@@ -33,7 +33,7 @@
 ;; Basic 4-wide definitions

 define(`WIDTH',`4')
-define(`MASK',`i32')
+define(`MASK',`i64')

 include(`util.m4')

 stdlib_core()
@@ -185,32 +185,32 @@ define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind reado
 ; horizontal ops

 ;; sse intrinsic
-declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

-define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %v64 = zext i32 %v to i64
   ret i64 %v64
 }

-define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %cmp = icmp ne i32 %v, 0
   ret i1 %cmp
 }

-define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %cmp = icmp eq i32 %v, 15
   ret i1 %cmp
 }

-define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %cmp = icmp eq i32 %v, 0
   ret i1 %cmp
 }
@@ -392,7 +392,8 @@ masked_load(i16, 2)
 declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)

-define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
+  %mask = trunc <4 x i64> %mask64 to <4 x i32>
   %floatmask = bitcast <4 x i32> %mask to <4 x float>
   %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
   %retval = bitcast <4 x float> %floatval to <4 x i32>
@@ -400,18 +401,11 @@ define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline
 }

-define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  ; double up masks, bitcast to doubles
-  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
-     <8 x i32>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-
-  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
-
-  %vald = shufflevector <4 x double> %val0d, <4 x double> undef,
-     <4 x i32>
-  %val = bitcast <4 x double> %vald to <4 x i64>
-  ret <4 x i64> %val
+define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
+  %doublemask = bitcast <4 x i64> %mask to <4 x double>
+  %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
+  %retval = bitcast <4 x double> %doubleval to <4 x i64>
+  ret <4 x i64> %retval
 }

 masked_load_float_double()
@@ -428,83 +422,62 @@ declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)

 define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
-                                <4 x i32>) nounwind alwaysinline {
-  %ptr = bitcast <4 x i32> * %0 to i8 *
-  %val = bitcast <4 x i32> %1 to <4 x float>
-  %mask = bitcast <4 x i32> %2 to <4 x float>
+                                <4 x i64>) nounwind alwaysinline {
+  %mask32 = trunc <4 x i64> %2 to <4 x i32>
+
+  %ptr = bitcast <4 x i32> * %0 to i8 *
+  %val = bitcast <4 x i32> %1 to <4 x float>
+  %mask = bitcast <4 x i32> %mask32 to <4 x float>
   call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
   ret void
 }

 define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
-                                <4 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast <4 x i64> * %0 to i8 *
-  %val = bitcast <4 x i64> %1 to <4 x double>
-
-  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
-     <8 x i32>
-
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-
-  %val0 = shufflevector <4 x double> %val, <4 x double> undef,
-     <4 x i32>
-
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+                                <4 x i64>) nounwind alwaysinline {
+  %ptr = bitcast <4 x i64> * %0 to i8 *
+  %val = bitcast <4 x i64> %1 to <4 x double>
+  %mask = bitcast <4 x i64> %2 to <4 x double>
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
   ret void
 }

-masked_store_blend_8_16_by_4()
+masked_store_blend_8_16_by_4_mask64()

 ;; sse intrinsic
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                              <4 x float>) nounwind readnone
-
 define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
-                                      <4 x i32> %mask) nounwind alwaysinline {
+                                      <4 x i64>) nounwind alwaysinline {
+  %mask = trunc <4 x i64> %2 to <4 x i32>
   %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0, align 4
-  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
-  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
-  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
-                                                     <4 x float> %newAsFloat,
-                                                     <4 x float> %mask_as_float)
+  %oldValue = load <4 x i32>* %0, align 4
+  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
+  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
+  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
+                                                     <4 x float> %newAsFloat,
+                                                     <4 x float> %mask_as_float)
   %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
   store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
   ret void
 }

 ;; avx intrinsic
-declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
-                                                <8 x float>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone

-define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
-                                      <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr, align 8
-  %mask = bitcast <4 x i32> %i32mask to <4 x float>
-
-  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
-  ; are actually bitcast <4 x i64> values
-  ;
-  ; set up the first four 64-bit values
-  %old01 = bitcast <4 x i64> %oldValue to <4 x i64>
-  %old01f = bitcast <4 x i64> %old01 to <8 x float>
-  %new01 = bitcast <4 x i64> %new to <4 x i64>
-  %new01f = bitcast <4 x i64> %new01 to <8 x float>
-  ; compute mask--note that the indices are all doubled-up
-  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
-     <8 x i32>
-  ; and blend them
-  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
-                                                            <8 x float> %new01f,
-                                                            <8 x float> %mask01)
-  %result01 = bitcast <8 x float> %result01f to <4 x i64>
-
-
-  %final = bitcast <4 x i64> %result01 to <4 x i64>
-  store <4 x i64> %final, <4 x i64> * %ptr, align 8
+define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>,
+                                      <4 x i64>) nounwind alwaysinline {
+  %mask_as_double = bitcast <4 x i64> %2 to <4 x double>
+  %oldValue = load <4 x i64>* %0, align 4
+  %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
+  %newAsDouble = bitcast <4 x i64> %1 to <4 x double>
+  %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
+                                                         <4 x double> %newAsDouble,
+                                                         <4 x double> %mask_as_double)
+  %blendAsInt = bitcast <4 x double> %blend to <4 x i64>
+  store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
   ret void
 }
diff --git a/builtins/util.m4 b/builtins/util.m4
index 6c90c821..68fa818b 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -947,6 +947,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
   %r = sext <$1 x i32> %0 to <$1 x i64>
   ret <$1 x i64> %r
 }
+
+define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i8>
+  ret <$1 x i8> %r
+}
+define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i16>
+  ret <$1 x i16> %r
+}
+define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i32>
+  ret <$1 x i32> %r
+}
+define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
+  ret <$1 x i64> %0
+}
 ')

 mask_converts(WIDTH)
@@ -2689,9 +2705,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
 }

 define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
-  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
-         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
-          ret <WIDTH x i32> %se')
+;;  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
+;;         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
+;;          ret <WIDTH x i32> %se')
+  ifelse(MASK,i32, `%se = bitcast <WIDTH x MASK> %0 to <WIDTH x i32>',
+         MASK,i64, `%se = trunc <WIDTH x MASK> %0 to <WIDTH x i32>',
+                   `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>')
+  ret <WIDTH x i32> %se
 }
@@ -3508,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
 }
 ')

+define(`masked_store_blend_8_16_by_4_mask64', `
+define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
+                                     <4 x i64>) nounwind alwaysinline {
+  %old = load <4 x i8> * %0, align 1
+  ifelse(LLVM_VERSION,LLVM_3_0,`
+  %old32 = bitcast <4 x i8> %old to i32
+  %new32 = bitcast <4 x i8> %1 to i32
+
+  %mask8 = trunc <4 x i64> %2 to <4 x i8>
+  %mask32 = bitcast <4 x i8> %mask8 to i32
+  %notmask32 = xor i32 %mask32, -1
+
+  %newmasked = and i32 %new32, %mask32
+  %oldmasked = and i32 %old32, %notmask32
+  %result = or i32 %newmasked, %oldmasked
+
+  %resultvec = bitcast i32 %result to <4 x i8>
+  ',`
+  %m = trunc <4 x i64> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
+  ')
+  store <4 x i8> %resultvec, <4 x i8> * %0, align 1
+  ret void
+}
+
+define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
+                                      <4 x i64>) nounwind alwaysinline {
+  %old = load <4 x i16> * %0, align 2
+  ifelse(LLVM_VERSION,LLVM_3_0,`
+  %old64 = bitcast <4 x i16> %old to i64
+  %new64 = bitcast <4 x i16> %1 to i64
+
+  %mask16 = trunc <4 x i64> %2 to <4 x i16>
+  %mask64 = bitcast <4 x i16> %mask16 to i64
+  %notmask64 = xor i64 %mask64, -1
+
+  %newmasked = and i64 %new64, %mask64
+  %oldmasked = and i64 %old64, %notmask64
+  %result = or i64 %newmasked, %oldmasked
+
+  %resultvec = bitcast i64 %result to <4 x i16>
+  ',`
+  %m = trunc <4 x i64> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
+  ')
+  store <4 x i16> %resultvec, <4 x i16> * %0, align 2
+  ret void
+}
+')
+
 define(`masked_store_blend_8_16_by_8', `
 define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
                                      <8 x i32>) nounwind alwaysinline {
diff --git a/ispc.cpp b/ispc.cpp
index 02c23568..046c64c4 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,14 +446,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
     }
-    else if (!strcasecmp(isa, "avxh") ) {
-        fprintf(stderr, " ISA is avxh \n");
+    else if (!strcasecmp(isa, "avx-i64x4") ) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 4;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
         this->m_maskingIsFree = false;
-        this->m_maskBitCount = 32;
+        this->m_maskBitCount = 64;
     }
     else if (!strcasecmp(isa, "avx-x2") ||
              !strcasecmp(isa, "avx1-x2") ||
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 180c8676..64691498 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -132,6 +132,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
             llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth());
         break;
+    case 64:
+        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
+            llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth());
+        break;
     default:
         FATAL("Unhandled mask width for initializing MaskType");
     }
@@ -183,6 +187,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
                                         true /*signed*/); // 0xffffffff
         break;
+    case 64:
+        onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
+                                        true /*signed*/); // 0xffffffffffffffff
+        break;
     default:
         FATAL("Unhandled mask width for onMask");
     }
@@ -210,6 +218,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
                                          true /*signed*/);
         break;
+    case 64:
+        offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0,
+                                         true /*signed*/);
+        break;
     default:
         FATAL("Unhandled mask width for offMask");
     }
@@ -480,7 +492,10 @@ LLVMUInt64Vector(const uint64_t *ivec) {
 llvm::Constant *
 LLVMBoolVector(bool b) {
     llvm::Constant *v;
-    if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+    if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
+        v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0,
+                                   false /*unsigned*/);
+    else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
         v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0,
                                    false /*unsigned*/);
     else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
@@ -506,7 +521,10 @@ LLVMBoolVector(const bool *bvec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target->getVectorWidth(); ++i) {
         llvm::Constant *v;
-        if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+        if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
+            v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0,
+                                       false /*unsigned*/);
+        else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
             v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0,
                                        false /*unsigned*/);
         else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
diff --git a/parse.yy b/parse.yy
index 5fc01cb0..9a2b4fc3 100644
--- a/parse.yy
+++ b/parse.yy
@@ -2183,6 +2183,9 @@ static void lAddMaskToSymbolTable(SourcePos pos) {
     case 32:
         t = AtomicType::VaryingUInt32;
         break;
+    case 64:
+        t = AtomicType::VaryingUInt64;
+        break;
     default:
         FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable");
     }
diff --git a/stdlib.ispc b/stdlib.ispc
index db9d7f36..6d7ee051 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -50,6 +50,9 @@
 #elif (ISPC_MASK_BITS == 32)
   #define IntMaskType int32
   #define UIntMaskType unsigned int32
+#elif (ISPC_MASK_BITS == 64)
+  #define IntMaskType int64
+  #define UIntMaskType unsigned int64
 #else
   #error Unknown value of ISPC_MASK_BITS
 #endif