diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index d29d1c1c..e6385d10 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -124,23 +124,23 @@ define i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
 ;; i32
 define internal i32 @__shfl_i32_nvptx(i32, i32) nounwind readnone alwaysinline
 {
-  %shfl = tail call i32 asm sideeffect "shfl.idx.b32  $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline
+  %shfl = tail call i32 asm sideeffect "shfl.idx.b32  $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) 
   ret i32 %shfl
 }
 define internal i32 @__shfl_xor_i32_nvptx(i32, i32) nounwind readnone alwaysinline
 {
-  %shfl = tail call i32 asm sideeffect "shfl.bfly.b32  $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline
+  %shfl = tail call i32 asm sideeffect "shfl.bfly.b32  $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) 
   ret i32 %shfl
 }
 ;; float
 define internal float @__shfl_float_nvptx(float, i32) nounwind readnone alwaysinline
 {
-  %shfl = tail call float asm sideeffect "shfl.idx.b32  $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline
+  %shfl = tail call float asm sideeffect "shfl.idx.b32  $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1)
   ret float %shfl
 }
 define internal float @__shfl_xor_float_nvptx(float, i32) nounwind readnone alwaysinline
 {
-  %shfl = tail call float asm sideeffect "shfl.bfly.b32  $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline
+  %shfl = tail call float asm sideeffect "shfl.bfly.b32  $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) 
   ret float %shfl
 }
 
@@ -148,12 +148,12 @@ define internal float @__shfl_xor_float_nvptx(float, i32) nounwind readnone alwa
 ;; float/double
 define internal float @__fminf_nvptx(float,float) nounwind readnone alwaysinline
 {
-  %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline
+  %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1)
   ret float %min
 }
 define internal float @__fmaxf_nvptx(float,float) nounwind readnone alwaysinline
 {
-  %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline
+  %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1)
   ret float %max
 }
 
@@ -245,12 +245,12 @@ define internal i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline
       "{ .reg .pred %p1; 
          setp.ne.u32 %p1, $1, 0; 
          vote.ballot.b32  $0, %p1; 
-      }", "=r,r"(i32 %conv) nounwind readnone alwaysinline
+      }", "=r,r"(i32 %conv) 
   ret i32 %res
 }
 define internal i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline
 {
-  %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() nounwind readnone alwaysinline
+  %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() 
   ret i32 %mask
 }
 
@@ -418,7 +418,7 @@ define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline
       "{ .reg .f16 tmp; 
         mov.b16 tmp, $1;
         cvt.f32.f16 $0, tmp;
-     }", "=f,h"(i16 %v) nounwind readnone alwaysinline
+     }", "=f,h"(i16 %v) 
   ret float %res
 }
 define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline
@@ -429,7 +429,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline
       "{ .reg .f16 tmp; 
         cvt.rn.f16.f32 tmp, $1;
         mov.b16 $0, tmp;
-     }", "=h,f"(float %v) nounwind readnone alwaysinline
+     }", "=h,f"(float %v) 
   ret i16 %half
 }
 define <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone alwaysinline
@@ -473,7 +473,7 @@ define internal float @__round_uniform_float_ptx(float) nounwind readnone always
            cvt.rzi.f32.f32	f9, f4;
 BB2_2:
            mov.f32 $0, f9;
-        }", "=f,f"(float %0) nounwind readnone alwaysinline
+        }", "=f,f"(float %0) 
   ret float %2
 }
 define  float @__round_uniform_float(float) nounwind readonly alwaysinline {
@@ -490,12 +490,12 @@ define  float @__round_uniform_float(float) nounwind readonly alwaysinline {
 }
 define float @__floor_uniform_float(float) nounwind readnone alwaysinline
 {
-  %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone
+  %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) 
   ret float %2
 }
 define float @__ceil_uniform_float(float) nounwind readnone alwaysinline
 {
-  %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone
+  %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, $1;", "=f,f"(float %0)
   ret float %2
 }
 
@@ -534,17 +534,17 @@ define double @__round_uniform_double(double) nounwind readnone alwaysinline
 
 BB5_2:
           mov.f64	$0, fd8;
-        }", "=d,d"(double %0) nounwind readnone alwaysinline
+        }", "=d,d"(double %0)
   ret double %2
 }
 define double @__floor_uniform_double(double) nounwind readnone alwaysinline
 {
-  %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone
+  %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=f,f"(double %0)
   ret double %2
 }
 define double @__ceil_uniform_double(double) nounwind readnone alwaysinline
 {
-  %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone
+  %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=f,f"(double %0)
   ret double %2
 }
 
@@ -763,7 +763,7 @@ declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 define  i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
-;;  %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) nounwind readnone alwaysinline
+;;  %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0)
  ;; ret i32 %res
 }
 
@@ -1117,7 +1117,7 @@ define internal i32 @__shfl_reduce_and_step_i32_nvptx(i32, i32) nounwind readnon
         shfl.bfly.b32  r0|p, $1, $2, 0;
         @p and.b32 r0, r0, $3;
         mov.u32 $0, r0;
-      }", "=r,r,r,r"(i32 %0, i32 %1, i32 %0) nounwind readnone alwaysinline
+      }", "=r,r,r,r"(i32 %0, i32 %1, i32 %0)
   ret i32 %shfl
 }
 shfl64(__shfl_reduce_and_step, i64)
@@ -1298,7 +1298,7 @@ define internal i32 @__shfl_scan_add_step_i32(i32 %partial, i32 %up_offset) noun
        shfl.up.b32 r0|p, $1, $2, 0;
        @p add.u32 r0, r0, $3;
        mov.u32 $0, r0;
-       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline
+       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial)
   ret i32 %result;
 }
 define <1 x i32> @__exclusive_scan_add_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline
@@ -1325,7 +1325,7 @@ define internal i32 @__shfl_scan_or_step_i32(i32 %partial, i32 %up_offset) nounw
        shfl.up.b32 r0|p, $1, $2, 0;
        @p or.b32 r0, r0, $3;
        mov.u32 $0, r0;
-       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind alwaysinline
+       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial)
   ret i32 %result;
 }
 define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline
@@ -1341,7 +1341,7 @@ define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone
         shfl.up.b32 r0|p, $1, 1, 0;
         @!p mov.u32 r0, 0;
         mov.u32 $0, r0;
-      }","=r,r"(i32 %v1); alwaysinline nounwind
+      }","=r,r"(i32 %v1)
   
   %s1 = tail call i32 @__shfl_scan_or_step_i32(i32 %v,  i32  1);
   %s2 = tail call i32 @__shfl_scan_or_step_i32(i32 %s1, i32  2);
@@ -1360,7 +1360,7 @@ define internal i32 @__shfl_scan_and_step_i32(i32 %partial, i32 %up_offset) noun
        shfl.up.b32 r0|p, $1, $2, 0;
        @p and.b32 r0, r0, $3;
        mov.u32 $0, r0;
-       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) alwaysinline
+       }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial)
   ret i32 %result;
 }
 define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline
@@ -1376,7 +1376,7 @@ define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnon
         shfl.up.b32 r0|p, $1, 1, 0;
         @!p mov.u32 r0, -1;
         mov.u32 $0, r0;
-      }","=r,r"(i32 %v1); alwaysinline
+      }","=r,r"(i32 %v1)
 
   %s1 = call i32 @__shfl_scan_and_step_i32(i32 %v,  i32  1);
   %s2 = call i32 @__shfl_scan_and_step_i32(i32 %s1, i32  2);
@@ -1395,7 +1395,7 @@ define internal float @__shfl_scan_add_step_float(float %partial, i32 %up_offset
        shfl.up.b32 f0|p, $1, $2, 0;
        @p add.f32 f0, f0, $3;
        mov.f32 $0, f0;
-       }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial) nounwind readnone alwaysinline
+       }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial)
   ret float %result;
 }
 define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind readnone alwaysinline
@@ -1427,7 +1427,7 @@ define internal double @__shfl_scan_add_step_double(double %partial, i32 %up_off
        mov.b64 fd0, {r3,r4};
        @p add.f64 fd0, fd0, $3;
        mov.f64 $0, fd0;
-       }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) nounwind readnone alwaysinline
+       }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial)
   ret double %result;
 }
 define <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline
@@ -1460,7 +1460,7 @@ define internal i64 @__shfl_scan_add_step_i64(i64 %partial, i32 %up_offset) noun
        mov.b64 rl0, {r3,r4};
        @p add.s64 rl0, rl0, $3;
        mov.s64 $0, rl0;
-       }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) nounwind readnone alwaysinline
+       }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) 
   ret i64 %result;
 }
 define <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
diff --git a/examples/portable/common_ptx.mk b/examples/portable/common_ptx.mk
index ed4b11cf..cfaa0b02 100644
--- a/examples/portable/common_ptx.mk
+++ b/examples/portable/common_ptx.mk
@@ -21,14 +21,15 @@ ifdef PTXCC_REGMAX
 endif
 
 #
-ISPC=ispc
+ISPC=$(ISPC_HOME)/ispc
 ISPC_FLAGS+=-O3 --math-lib=fast --target=nvptx --opt=fast-math
 #
 #
 #
 ISPC_LLVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.o)
 ISPC_NVVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.o)
-ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc)
+#ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc)
+ISPC_LLS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.ll)
 ISPC_LLVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.ptx)
 ISPC_NVVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.ptx)
 ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.h)
@@ -42,13 +43,13 @@ CXX_OBJS+=objs_ptx/ispc_malloc_gcc.o
 PTXGEN = $(ISPC_HOME)/ptxtools/ptxgen
 PTXGEN += --use_fast_math
 
-LLVM32=$(HOME)/usr/local/llvm/bin-3.2
-LLVM32DIS=$(LLVM32)/bin/llvm-dis
+#LLVM32=$(HOME)/usr/local/llvm/bin-3.2
+#LLVM32DIS=$(LLVM32)/bin/llvm-dis
 
-LLC=$(HOME)/usr/local/llvm/bin-trunk/bin/llc
+LLC=$(LLVM_ROOT)/bin/llc
 LLC_FLAGS=-march=nvptx64 -mcpu=sm_35
 
-# .SUFFIXES: .bc .o .cu 
+# .SUFFIXES: .bc .o .cu  .ll
 
 ifdef LLVM_GPU
   OBJSptx_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) 
@@ -74,7 +75,7 @@ endif
 all: dirs  \
 	$(PROGptx_nvvm)  \
 	$(PROGptx_llvm)  \
-	$(PROGcu) $(ISPC_BC)  $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX)
+	$(PROGcu) $(ISPC_BCS) $(ISPC_LLS)  $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX)
 
 dirs:
 	/bin/mkdir -p objs_ptx/
@@ -109,15 +110,21 @@ objs_ptx/%_nvcc.o: %.cu
 	$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
 
 # compile ISPC to LLVM BC
-objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc 
-	$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $<
+#objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc 
+#	$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $<
+objs_ptx/%_ispc.h objs_ptx/%_ispc.ll: %.ispc 
+	$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.ll $<
 
 # generate PTX from LLVM BC
-objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc
+#objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc
+#	$(LLC) $(LLC_FLAGS) -o $@ $<
+objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.ll
 	$(LLC) $(LLC_FLAGS) -o $@ $<
-objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc
-	$(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll
-	$(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@
+#objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc
+#	$(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll
+#	$(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@
+objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.ll
+	$(PTXGEN) $< -o $@
 
 # generate an object file from PTX
 objs_ptx/%_ispc.o: objs_ptx/%_ispc.ptx
diff --git a/module.cpp b/module.cpp
index db378591..fc5c6437 100644
--- a/module.cpp
+++ b/module.cpp
@@ -58,6 +58,7 @@
 #include <set>
 #include <sstream>
 #include <iostream>
+#include <map>
 #ifdef ISPC_IS_WINDOWS
 #include <windows.h>
 #include <io.h>
@@ -71,6 +72,7 @@
   #include <llvm/Instructions.h>
   #include <llvm/Intrinsics.h>
   #include <llvm/DerivedTypes.h>
+  #include "llvm/Assembly/AssemblyAnnotationWriter.h"
 #else
   #include <llvm/IR/LLVMContext.h>
   #include <llvm/IR/Module.h>
@@ -78,6 +80,7 @@
   #include <llvm/IR/Instructions.h>
   #include <llvm/IR/Intrinsics.h>
   #include <llvm/IR/DerivedTypes.h>
+  #include "llvm/Assembly/AssemblyAnnotationWriter.h"
 #endif
 #include <llvm/PassManager.h>
 #include <llvm/PassRegistry.h>
@@ -1034,8 +1037,14 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
                 fileType = "assembly";
             break;
         case Bitcode:
-            if (strcasecmp(suffix, "bc"))
-                fileType = "LLVM bitcode";
+            if (g->target->getISA() != Target::NVPTX)
+            {
+              if (strcasecmp(suffix, "bc"))
+                  fileType = "LLVM bitcode";
+            }
+            else
+              if (strcasecmp(suffix, "ll"))
+                  fileType = "LLVM assembly";
             break;
         case Object:
             if (strcasecmp(suffix, "o") && strcasecmp(suffix, "obj"))
@@ -1104,6 +1113,73 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
         return writeObjectFileOrAssembly(outputType, outFileName);
 }
 
+typedef std::vector<std::string> vecString_t;
+/* Splits 's' into its whitespace-separated tokens, in order. */
+static vecString_t lSplitString(const std::string &s)
+{
+  vecString_t tokens;
+  std::stringstream tokenStream(s);
+  for (std::string token; tokenStream >> token; ) tokens.push_back(token);
+  return tokens;
+}
+
+/* Converts LLVM 3.3+ textual IR lines in 'src' into 3.2-style lines in 'dst':
+ * "attributes #N = { ... }" definitions are dropped and every "#N" token is
+ * replaced by the attribute list of that group.  readnone/readonly tokens
+ * already present on a line are blanked.  Each line is rebuilt from its
+ * whitespace-separated tokens, every token followed by a single space. */
+static void 
+lFixAttributes(const vecString_t &src, vecString_t &dst)
+{
+  dst.clear();
+  typedef std::map<std::string, std::string> attributeMap_t;
+  /* Pass 1: record the body of each attribute group, keyed by its "#N". */
+  attributeMap_t attributeMap;
+  for (vecString_t::const_iterator it = src.begin();  it != src.end(); ++it)
+  {
+    const vecString_t words = lSplitString(*it);
+    /* words[1] may only be inspected when a second token exists. */
+    if (words.size() < 2 || words[0] != "attributes" || words[1][0] != '#')
+      continue;
+    const int nWords = words.size();
+    assert(nWords > 3);
+    assert(words[2       ] == "=");
+    assert(words[3       ] == "{");
+    assert(words[nWords-1] == "}");
+    std::string attributes;
+    for (int w = 4; w < nWords-1; w++)
+      attributes += words[w] + " ";
+    attributeMap[words[1]] = attributes;
+  }
+  /* These attributes cannot be used in function parameters; built once. */
+  attributeMap_t attributeSet;
+  attributeSet["readnone"]  = " ";
+  attributeSet["readonly"]  = " ";
+  attributeSet["readnone,"] = ",";
+  attributeSet["readonly,"] = ",";
+  /* Pass 2: emit every non-"attributes" line with its tokens patched. */
+  for (vecString_t::const_iterator it = src.begin();  it != src.end(); ++it)
+  {
+    vecString_t words = lSplitString(*it);
+    if (!words.empty() && words[0] == "attributes")
+      continue;
+    std::string s;
+    for (vecString_t::iterator w = words.begin(); w != words.end(); ++w)
+    {
+      const attributeMap_t::const_iterator removed = attributeSet.find(*w);
+      if (removed != attributeSet.end())
+        *w = removed->second;
+      if ((*w)[0] == '#')
+      {
+        const attributeMap_t::const_iterator m = attributeMap.find(*w);
+        assert (m != attributeMap.end());
+        *w = (m != attributeMap.end()) ? m->second : std::string();
+      }
+      s += *w + " ";
+    }
+    dst.push_back(s);
+  }
+}
 
 bool
 Module::writeBitcode(llvm::Module *module, const char *outFileName) {
@@ -1128,12 +1204,44 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
     }
 
     llvm::raw_fd_ostream fos(fd, (fd != 1), false);
-    if (g->target->getISA() == Target::NVPTX)
+    if (g->target->getISA() != Target::NVPTX)
     {
+     llvm::WriteBitcodeToFile(module, fos);
+    }
+    else
+    {
+      /* When using the "nvptx" target, emit patched/hacked assembly.
+       * NVPTX only accepts 3.2-style LLVM assembly, where attributes
+       * must be inlined, rather than referenced by an attribute-group id (#N).
+       * As soon as NVVM supports 3.3/3.4-style assembly this fix won't be needed.
+       */
       const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
       module->setDataLayout(dl_string);
+
+      std::string s;
+      llvm::raw_string_ostream out(s);
+      llvm::OwningPtr<llvm::AssemblyAnnotationWriter> Annotator;
+      module->print(out, Annotator.get());
+      std::istringstream iss(s);
+
+      vecString_t input,output;
+      while (std::getline(iss,s))
+        input.push_back(s);
+      output = input;
+
+#if !(defined(LLVM_3_1) || defined(LLVM_3_2))
+      /* do not fix attributes with LLVM 3.2, everything is fine there */
+      lFixAttributes(input,output);
+#endif
+
+      for (vecString_t::iterator it = output.begin(); it != output.end(); it++)
+      {
+        *it += "\n";
+        fos << *it;
+      }
     }
-    llvm::WriteBitcodeToFile(module, fos);
+
+
     return true;
 }
 
diff --git a/ptxtools/runtest_ptxcc.sh b/ptxtools/runtest_ptxcc.sh
index 67d9ccd7..52eccad0 100755
--- a/ptxtools/runtest_ptxcc.sh
+++ b/ptxtools/runtest_ptxcc.sh
@@ -7,9 +7,12 @@ TMPDIR=/tmp
 fbname=`basename $1`
 if [ "$NVVM" == "1" ];
 then
-  LLVM32=$HOME/usr/local/llvm/bin-3.2
-  LLVMDIS=$LLVM32/bin/llvm-dis
-  $($LLVMDIS $1 -o $TMPDIR/$fbname.ll) && $($PTXGEN $TMPDIR/$fbname.ll -o $TMPDIR/$fbname.ptx) && \
+#  LLVM32=$HOME/usr/local/llvm/bin-3.2
+#  LLVM34=$HOME/usr/local/llvm/bin-3.4
+#  LLVMAS=$LLVM34/bin/llvm-as
+#  LLVMDIS=$LLVM32/bin/llvm-dis
+#  $($LLVMAS $1 -o $TMPDIR/$fbname.bc) && $($LLVMDIS $TMPDIR/$fbname.bc -o $TMPDIR/$fbname.ll) && $($PTXGEN $TMPDIR/$fbname.ll -o $TMPDIR/$fbname.ptx) && \
+  $($PTXGEN $1 -o $TMPDIR/$fbname.ptx) && \
   $($PTXCC $TMPDIR/$fbname.ptx -o $TMPDIR/$fbname.o -Xnvcc="-G") && \
   $(nvcc test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS);
 else