diff --git a/Makefile b/Makefile
index 34839f86..182cff26 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@
 
 # If you have your own special version of llvm and/or clang, change
 # these variables to match.
-LLVM_CONFIG=$(shell which /home/evghenii/usr/local/llvm/bin-trunk/bin/llvm-config)
+LLVM_CONFIG:=$(shell which /home/evghenii/usr/local/llvm/bin-3.2/bin/llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
 
 # Enable ARM by request
diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll
index 8bceace7..fa719ba9 100644
--- a/builtins/target-nvptx64.ll
+++ b/builtins/target-nvptx64.ll
@@ -647,15 +647,15 @@ define  float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline
 define  float @__reduce_min_float(<1 x float>) nounwind readnone {
   %value = extractelement <1 x float> %0, i32 0
   %call = tail call float @__shfl_xor_float(float %value, i32 16)
-  %call1 = tail call float @__fminf(float %value, float %call) #4
+  %call1 = tail call float @__fminf(float %value, float %call)
   %call.1 = tail call float @__shfl_xor_float(float %call1, i32 8)
-  %call1.1 = tail call float @__fminf(float %call1, float %call.1) #4
+  %call1.1 = tail call float @__fminf(float %call1, float %call.1)
   %call.2 = tail call float @__shfl_xor_float(float %call1.1, i32 4)
-  %call1.2 = tail call float @__fminf(float %call1.1, float %call.2) #4
+  %call1.2 = tail call float @__fminf(float %call1.1, float %call.2)
   %call.3 = tail call float @__shfl_xor_float(float %call1.2, i32 2)
-  %call1.3 = tail call float @__fminf(float %call1.2, float %call.3) #4
+  %call1.3 = tail call float @__fminf(float %call1.2, float %call.3)
   %call.4 = tail call float @__shfl_xor_float(float %call1.3, i32 1)
-  %call1.4 = tail call float @__fminf(float %call1.3, float %call.4) #4
+  %call1.4 = tail call float @__fminf(float %call1.3, float %call.4)
   ret float %call1.4
 }
 
diff --git a/builtins/util_ptx.m4 b/builtins/util_ptx.m4
index ca61f84e..d60a09dd 100644
--- a/builtins/util_ptx.m4
+++ b/builtins/util_ptx.m4
@@ -2917,7 +2917,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  %call2 = tail call noalias i8* @malloc(i64 %size) #3
+  %call2 = tail call noalias i8* @malloc(i64 %size)
   %phitmp = ptrtoint i8* %call2 to i64
   br label %if.end
 
@@ -2946,7 +2946,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  tail call void @free(i8* %ptr) #3
+  tail call void @free(i8* %ptr)
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
diff --git a/examples_cuda/aobench/ao1.ispc b/examples_cuda/aobench/ao1.ispc
index 24659535..b01523d0 100644
--- a/examples_cuda/aobench/ao1.ispc
+++ b/examples_cuda/aobench/ao1.ispc
@@ -298,4 +298,5 @@ export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
   const uniform int ntilex = (w+TILEX-1)/TILEX;
   const uniform int ntiley = (h+TILEY-1)/TILEY;
   launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image);
+  sync;
 }
diff --git a/examples_cuda/deferred/Makefile_gpu b/examples_cuda/deferred/Makefile_gpu
index 83b45531..6e7739fa 100644
--- a/examples_cuda/deferred/Makefile_gpu
+++ b/examples_cuda/deferred/Makefile_gpu
@@ -41,8 +41,9 @@ $(PROG): $(CXX_OBJ) kernel.ptx
 	$(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc_nvptx64.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm
 
 %.ptx: %.bc
-	$(LLVM32DIS) $<
-	$(PTXGEN)  `basename $< .bc`.ll > $@
+	$(PTXGEN)  $< > $@
+# Disabled two-step flow, kept for reference ($(PTXGEN) now takes the .bc directly):
+#   $(LLVM32DIS) $<   then   $(PTXGEN)  `basename $< .bc`.ll > $@
 
 kernel.ptx: $(PTXSRC)
 	cat $^ > kernel.ptx
diff --git a/examples_cuda/options/Makefile_gpu b/examples_cuda/options/Makefile_gpu
index 03c63d50..768da3d9 100644
--- a/examples_cuda/options/Makefile_gpu
+++ b/examples_cuda/options/Makefile_gpu
@@ -42,8 +42,9 @@ $(PROG): $(CXX_OBJ) kernel.ptx
 	$(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc_nvptx64.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm
 
 %.ptx: %.bc
-	$(LLVM32DIS) $<
-	$(PTXGEN)  `basename $< .bc`.ll > $@
+	$(PTXGEN)  $< > $@
+# Disabled two-step flow, kept for reference ($(PTXGEN) now takes the .bc directly):
+#   $(LLVM32DIS) $<   then   $(PTXGEN)  `basename $< .bc`.ll > $@
 
 kernel.ptx: $(PTXSRC)
 	cat $^ > kernel.ptx