diff --git a/Makefile b/Makefile
index 835f8e15..043ab4cf 100644
--- a/Makefile
+++ b/Makefile
@@ -137,7 +137,7 @@ BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

 OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
-	stdlib_generic_ispc.o stdlib_x86_ispc.o \
+	stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
 	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))

 default: ispc

@@ -243,12 +243,23 @@ objs/builtins-c-64.cpp: builtins/builtins.c
 	@echo Creating C++ source from builtins definition file $<
 	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@

-objs/stdlib_generic_ispc.cpp: stdlib.ispc
-	@echo Creating C++ source from $< for generic
-	@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
-		python stdlib2cpp.py generic > $@
+objs/stdlib_mask1_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for mask1
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py mask1 > $@
+
+objs/stdlib_mask8_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for mask8
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py mask8 > $@
+
+objs/stdlib_mask16_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for mask16
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py mask16 > $@
+
+objs/stdlib_mask32_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for mask32
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py mask32 > $@

-objs/stdlib_x86_ispc.cpp: stdlib.ispc
-	@echo Creating C++ source from $< for x86
-	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
-		python stdlib2cpp.py x86 > $@
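Each of these rules preprocesses stdlib.ispc with a different ISPC_MASK_BITS value and serializes the result through stdlib2cpp.py. The exact output of stdlib2cpp.py is not shown in this diff; here is a minimal sketch of what each generated objs/stdlib_maskN_ispc.cpp presumably looks like (the array names are real -- builtins.cpp below declares them extern -- but the embedded contents are illustrative):

```cpp
// Sketch of a generated objs/stdlib_mask8_ispc.cpp (illustrative contents).
// The preprocessed stdlib source is embedded as a C string that
// builtins.cpp hands to the parser via yy_scan_string().
char stdlib_mask8_code[] =
    "__declspec(safe) static inline uniform bool any(bool v) { /* ... */ }\n"
    /* ...rest of stdlib.ispc, preprocessed with ISPC_MASK_BITS == 8... */
    ;
```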
diff --git a/builtins.cpp b/builtins.cpp
index 3e03de10..d3bbaa6a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
         return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;

     // varying
-    if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
-        t == LLVMTypes::MaskType)
-        return AtomicType::VaryingBool;
-    else if (t == LLVMTypes::Int8VectorType)
+    if (t == LLVMTypes::Int8VectorType)
         return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
     else if (t == LLVMTypes::Int16VectorType)
         return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
@@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
         return AtomicType::VaryingDouble;
     else if (t == LLVMTypes::Int64VectorType)
         return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
+    else if (t == LLVMTypes::MaskType)
+        return AtomicType::VaryingBool;

     // pointers to uniform
     else if (t == LLVMTypes::Int8PointerType)
@@ -1038,16 +1037,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         // If the user wants the standard library to be included, parse the
         // serialized version of the stdlib.ispc file to get its
        // definitions added.
+        extern char stdlib_mask1_code[], stdlib_mask8_code[];
+        extern char stdlib_mask16_code[], stdlib_mask32_code[];
         if (g->target->getISA() == Target::GENERIC &&
-            g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib
-            extern char stdlib_generic_code[];
-            yy_scan_string(stdlib_generic_code);
-            yyparse();
+            g->target->getVectorWidth() == 1) { // 1-wide generic uses the mask32 stdlib
+            yy_scan_string(stdlib_mask32_code);
         }
         else {
-            extern char stdlib_x86_code[];
-            yy_scan_string(stdlib_x86_code);
-            yyparse();
+            switch (g->target->getMaskBitCount()) {
+            case 1:
+                yy_scan_string(stdlib_mask1_code);
+                break;
+            case 8:
+                yy_scan_string(stdlib_mask8_code);
+                break;
+            case 16:
+                yy_scan_string(stdlib_mask16_code);
+                break;
+            case 32:
+                yy_scan_string(stdlib_mask32_code);
+                break;
+            default:
+                FATAL("Unhandled mask bit size for stdlib.ispc");
+            }
         }
+        yyparse();
     }
 }
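The reordering in lLLVMTypeToISPCType above matters because, with 8/16/32-bit masks, LLVMTypes::MaskType now aliases one of the integer vector types. A minimal standalone model of the new dispatch order (the enum and names are stand-ins, not ispc's actual types):

```cpp
#include <cassert>
#include <string>

// Stand-ins for LLVMTypes::*VectorType. On a 16-bit-mask target, the mask
// type is the *same* type object as the i16 vector type.
enum VecType { I1Vec, I8Vec, I16Vec, I32Vec, I64Vec };
const VecType MaskType = I16Vec;

const char *toISPCType(VecType t) {
    // Integer interpretations are tried first, so a mask that aliases an
    // integer vector type is imported as that integer type...
    if (t == I8Vec)  return "varying int8";
    if (t == I16Vec) return "varying int16";
    if (t == I32Vec) return "varying int32";
    if (t == I64Vec) return "varying int64";
    // ...and only an i1 mask (MaskType == I1Vec, which aliases none of the
    // integer vector types above) would fall through to the bool case.
    if (t == MaskType) return "varying bool";
    return "unknown";
}

int main() { assert(std::string(toISPCType(MaskType)) == "varying int16"); }
```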
diff --git a/builtins/util.m4 b/builtins/util.m4
index c19d4930..d6f3e5c3 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -690,6 +690,75 @@ shuffles(i64, 8)
 ;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
 ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
+define(`mask_converts', `
+define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
+  %r = sext <$1 x i1> %0 to <$1 x i8>
+  ret <$1 x i8> %r
+}
+define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) {
+  %r = sext <$1 x i1> %0 to <$1 x i16>
+  ret <$1 x i16> %r
+}
+define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) {
+  %r = sext <$1 x i1> %0 to <$1 x i32>
+  ret <$1 x i32> %r
+}
+define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) {
+  %r = sext <$1 x i1> %0 to <$1 x i64>
+  ret <$1 x i64> %r
+}
+
+define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) {
+  ret <$1 x i8> %0
+}
+define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) {
+  %r = sext <$1 x i8> %0 to <$1 x i16>
+  ret <$1 x i16> %r
+}
+define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) {
+  %r = sext <$1 x i8> %0 to <$1 x i32>
+  ret <$1 x i32> %r
+}
+define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) {
+  %r = sext <$1 x i8> %0 to <$1 x i64>
+  ret <$1 x i64> %r
+}
+
+define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) {
+  %r = trunc <$1 x i16> %0 to <$1 x i8>
+  ret <$1 x i8> %r
+}
+define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) {
+  ret <$1 x i16> %0
+}
+define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) {
+  %r = sext <$1 x i16> %0 to <$1 x i32>
+  ret <$1 x i32> %r
+}
+define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) {
+  %r = sext <$1 x i16> %0 to <$1 x i64>
+  ret <$1 x i64> %r
+}
+
+define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) {
+  %r = trunc <$1 x i32> %0 to <$1 x i8>
+  ret <$1 x i8> %r
+}
+define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) {
+  %r = trunc <$1 x i32> %0 to <$1 x i16>
+  ret <$1 x i16> %r
+}
+define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) {
+  ret <$1 x i32> %0
+}
+define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
+  %r = sext <$1 x i32> %0 to <$1 x i64>
+  ret <$1 x i64> %r
+}
+')
+
+mask_converts(WIDTH)
+
 define(`global_atomic_associative', `

 define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
@@ -697,17 +766,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
   ; first, for any lanes where the mask is off, compute a vector where those lanes
   ; hold the identity value..
-  ; for the bit tricks below, we need the mask to be sign extended to be
-  ; the size of the element type.
-  ifelse(
-    MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>',
-    $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>',
-    $3,i32, `
-      ; silly workaround to do %mask = %m, which is not possible directly..
-      %maskmem = alloca <$1 x i32>
-      store <$1 x i32> %m, <$1 x i32> * %maskmem
-      %mask = load <$1 x i32> * %maskmem'
-  )
+  ; for the bit tricks below, we need the mask to have
+  ; the same element size as the element type.
+  %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m)
+
   ; zero out any lanes that are off
   %valoff = and <$1 x $3> %val, %mask
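A scalar model of the bit trick the rewritten code relies on: after convertmask_*, each mask lane is all-ones or all-zeros at the element's width, so lanes can be blended with plain AND/OR and no branches. The identity-blend step for nonzero identities is implied by the macro's $5 parameter rather than shown in this excerpt, so treat this as a sketch:

```cpp
#include <cassert>
#include <cstdint>

// One lane of the masked-atomic setup: keep the value where the mask is on,
// substitute the operator's identity (e.g. 0 for add, -1 for AND) where off.
int32_t maskedLane(int32_t val, int32_t identity, bool active) {
    int32_t m = active ? -1 : 0;         // what convertmask_* yields per lane
    return (val & m) | (identity & ~m);  // val if active, identity if not
}

int main() {
    assert(maskedLane(7, 0, true) == 7);   // active lane contributes its value
    assert(maskedLane(7, 0, false) == 0);  // inactive lane contributes add-identity
}
```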
@@ -2440,13 +2502,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
 }

 define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
-  ifelse(MASK,i1, `
-  %se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
-  ret <WIDTH x i32> %se
-  ', `
-  ret <WIDTH x i32> %0')
+  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
+         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
+          ret <WIDTH x i32> %se')
 }

+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; memcpy/memmove/memset

@@ -3201,8 +3262,8 @@ return:
 ;; $1: llvm type of elements (and suffix for function name)

 define(`gen_masked_store', `
-define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x i32>) nounwind alwaysinline {
-  per_lane(WIDTH, <WIDTH x i32> %2, `
+define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x MASK>) nounwind alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %2, `
       %ptr_LANE_ID = getelementptr <WIDTH x $1> * %0, i32 0, i32 LANE
       %storeval_LANE_ID = extractelement <WIDTH x $1> %1, i32 LANE
       store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID')

@@ -3378,10 +3439,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
 define(`packed_load_and_store', `

 define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
-                                 <WIDTH x i32> %full_mask) nounwind alwaysinline {
+                                 <WIDTH x MASK> %full_mask) nounwind alwaysinline {
 entry:
-  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
-  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
+  %mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
+  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask

 known_mask:
@@ -3432,10 +3493,10 @@ done:
 }

 define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
-                                  <WIDTH x i32> %full_mask) nounwind alwaysinline {
+                                  <WIDTH x MASK> %full_mask) nounwind alwaysinline {
 entry:
-  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
-  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
+  %mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
+  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask

 known_mask:
@@ -3544,10 +3605,10 @@ check_neighbors:
   %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1)
   %vr = bitcast <$1 x $4> %castvr to <$1 x $2>
   %eq = $5 $7 <$1 x $2> %vec, %vr
-  ifelse(MASK,i32, `
-    %eq32 = sext <$1 x i1> %eq to <$1 x i32>
-    %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
-    %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
+  ifelse(MASK,i1, `
+    %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', `
+    %eqm = sext <$1 x i1> %eq to <$1 x MASK>
+    %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)')
   %alleq = icmp eq i64 %eqmm, ALL_ON_MASK
   br i1 %alleq, label %all_equal, label %not_all_equal
 ', `
@@ -3722,9 +3783,9 @@ pl_done:
 define(`gen_gather_general', `
 ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
 define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
-                                   <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
+                                   <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
   %ret_ptr = alloca <WIDTH x $1>
-  per_lane(WIDTH, <WIDTH x i32> %vecmask, `
+  per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
   %iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
   %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
   %val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3738,9 +3799,9 @@ define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,

 ; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
 define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
-                                   <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
+                                   <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
   %ret_ptr = alloca <WIDTH x $1>
-  per_lane(WIDTH, <WIDTH x i32> %vecmask, `
+  per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
   %iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
   %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
   %val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3804,7 +3865,7 @@ define <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %o
 define <WIDTH x $1>
 @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
                                      <WIDTH x i32> %offset_delta,
-                                     <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
+                                     <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
   ; legal to read from (and we do indeed require that, given the benefits!)
@@ -3813,13 +3874,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr,
   store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %offsetsPtr
   call void @__masked_store_blend_i32(<WIDTH x i32> * %offsetsPtr, <WIDTH x i32> %offsets,
-                                      <WIDTH x i32> %vecmask)
+                                      <WIDTH x MASK> %vecmask)
   %newOffsets = load <WIDTH x i32> * %offsetsPtr

   %deltaPtr = alloca <WIDTH x i32>
   store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %deltaPtr
   call void @__masked_store_blend_i32(<WIDTH x i32> * %deltaPtr, <WIDTH x i32> %offset_delta,
-                                      <WIDTH x i32> %vecmask)
+                                      <WIDTH x MASK> %vecmask)
   %newDelta = load <WIDTH x i32> * %deltaPtr

   %ret0 = call <WIDTH x $1> @__gather_elt32_$1(i8 * %ptr, <WIDTH x i32> %newOffsets,
@@ -3835,7 +3896,7 @@ define <WIDTH x $1>
 @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
                                      <WIDTH x i64> %offset_delta,
-                                     <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
+                                     <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
   ; legal to read from (and we do indeed require that, given the benefits!)
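The "clever" trick the comment describes, as a scalar model (illustrative only; the real code does the blend with __masked_store_blend_* into a stack slot, as the hunks above and below show): inactive lanes get offset 0, which is required to be readable, so the gather can run unconditionally with no per-lane branching.

```cpp
#include <cstdint>

// One lane of the branch-free gather: clamp inactive lanes to offset 0
// (the blend with zeroinitializer), then load unconditionally; the dummy
// result is discarded when results are blended by the mask later.
int32_t gatherLane(const int32_t *base, int32_t offset, bool active) {
    int32_t safeOffset = active ? offset : 0;
    return base[safeOffset];
}
```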
@@ -3844,13 +3905,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr,
   store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %offsetsPtr
   call void @__masked_store_blend_i64(<WIDTH x i64> * %offsetsPtr, <WIDTH x i64> %offsets,
-                                      <WIDTH x i32> %vecmask)
+                                      <WIDTH x MASK> %vecmask)
   %newOffsets = load <WIDTH x i64> * %offsetsPtr

   %deltaPtr = alloca <WIDTH x i64>
   store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %deltaPtr
   call void @__masked_store_blend_i64(<WIDTH x i64> * %deltaPtr, <WIDTH x i64> %offset_delta,
-                                      <WIDTH x i32> %vecmask)
+                                      <WIDTH x MASK> %vecmask)
   %newDelta = load <WIDTH x i64> * %deltaPtr

   %ret0 = call <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %newOffsets,
@@ -3876,27 +3937,27 @@ gen_gather_factored($1)

 define <WIDTH x $1> @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale,
                                                 <WIDTH x i32> %offsets,
-                                                <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
+                                                <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
   %scale_vec = bitcast i32 %offset_scale to <1 x i32>
   %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef,
      < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
   %scaled_offsets = mul <WIDTH x i32> %smear_scale, %offsets
   %v = call <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %scaled_offsets, i32 1,
-                                                              <WIDTH x i32> zeroinitializer, <WIDTH x i32> %vecmask)
+                                                              <WIDTH x i32> zeroinitializer, <WIDTH x MASK> %vecmask)
   ret <WIDTH x $1> %v
 }

 define <WIDTH x $1> @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale,
                                                 <WIDTH x i64> %offsets,
-                                                <WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
+                                                <WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
   %scale64 = zext i32 %offset_scale to i64
   %scale_vec = bitcast i64 %scale64 to <1 x i64>
   %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef,
      < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
   %scaled_offsets = mul <WIDTH x i64> %smear_scale, %offsets
   %v = call <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %scaled_offsets,
-                                                              i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x i32> %vecmask)
+                                                              i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x MASK> %vecmask)
   ret <WIDTH x $1> %v
 }

@@ -3955,9 +4016,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_s

 define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
                                                   <WIDTH x i32> %offset_delta, <WIDTH x $1> %values,
-                                                  <WIDTH x i32> %mask) nounwind alwaysinline {
+                                                  <WIDTH x MASK> %mask) nounwind alwaysinline {
   ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  per_lane(WIDTH, <WIDTH x i32> %mask, `
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
   call void @__scatter_elt32_$1(i8 * %base, <WIDTH x i32> %offsets, i32 %offset_scale,
                                 <WIDTH x i32> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
   ret void
 }
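per_lane itself is defined elsewhere in util.m4 and its expansion is not part of this diff; behaviorally it runs its body once for each lane whose mask element is on, roughly like the following model (an assumption: the mask is shown pre-reduced to one bit per lane, as __movmsk does):

```cpp
#include <cstdint>

// Behavioral model of per_lane(WIDTH, %mask, body): execute the body for
// each lane whose mask bit is set.
template <typename Body>
void perLane(int width, uint64_t movmsk, Body body) {
    for (int lane = 0; lane < width; ++lane)
        if (movmsk & (1ull << lane))
            body(lane);
}
```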
@@ -3965,9 +4026,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offs

 define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
                                                   <WIDTH x i64> %offset_delta, <WIDTH x $1> %values,
-                                                  <WIDTH x i32> %mask) nounwind alwaysinline {
+                                                  <WIDTH x MASK> %mask) nounwind alwaysinline {
   ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  per_lane(WIDTH, <WIDTH x i32> %mask, `
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
   call void @__scatter_elt64_$1(i8 * %base, <WIDTH x i64> %offsets, i32 %offset_scale,
                                 <WIDTH x i64> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
   ret void
 }
@@ -3975,8 +4036,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offs

 ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s
 define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
-                            <WIDTH x i32> %mask) nounwind alwaysinline {
-  per_lane(WIDTH, <WIDTH x i32> %mask, `
+                            <WIDTH x MASK> %mask) nounwind alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
   %iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
   %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
   %val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE
@@ -3987,8 +4048,8 @@ define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,

 ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s
 define void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
-                            <WIDTH x i32> %mask) nounwind alwaysinline {
-  per_lane(WIDTH, <WIDTH x i32> %mask, `
+                            <WIDTH x MASK> %mask) nounwind alwaysinline {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
   %iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
   %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
   %val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE
diff --git a/ctx.cpp b/ctx.cpp
index 1e79c97b..c50d22f9 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
         for (unsigned int i = 0; i < at->getNumElements(); ++i) {
             llvm::Value *elt = ExtractInst(b, i);
             llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType,
-                                         LLVMGetName(elt, "_to_boolvec32"));
+                                         LLVMGetName(elt, "_to_boolvec"));
             ret = InsertInst(ret, sext, i);
         }
         return ret;
     }
     else
-        return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32"));
+        return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec"));
 }

@@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,

     // Figure out if we need an 8, 16, 32 or 64-bit masked store.
     llvm::Function *maskedStoreFunc = NULL;
+    llvm::Type *llvmValueType = value->getType();

     const PointerType *pt = CastType<PointerType>(valueType);
     if (pt != NULL) {
@@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
         else
             maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64");
     }
-    else if (Type::Equal(valueType, AtomicType::VaryingBool) &&
-             g->target->getMaskBitCount() == 1) {
+    else if (llvmValueType == LLVMTypes::Int1VectorType) {
         llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask,
                                               LLVMMaskAllOn, "~mask");
         llvm::Value *old = LoadInst(ptr);
@@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
         StoreInst(final, ptr);
         return;
     }
-    else if (Type::Equal(valueType, AtomicType::VaryingDouble)) {
+    else if (llvmValueType == LLVMTypes::DoubleVectorType) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double");
     }
-    else if (Type::Equal(valueType, AtomicType::VaryingInt64) ||
-             Type::Equal(valueType, AtomicType::VaryingUInt64)) {
+    else if (llvmValueType == LLVMTypes::Int64VectorType) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64");
     }
-    else if (Type::Equal(valueType, AtomicType::VaryingFloat)) {
+    else if (llvmValueType == LLVMTypes::FloatVectorType) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float");
     }
-    else if (Type::Equal(valueType, AtomicType::VaryingBool) ||
-             Type::Equal(valueType, AtomicType::VaryingInt32) ||
-             Type::Equal(valueType, AtomicType::VaryingUInt32) ||
-             CastType<EnumType>(valueType) != NULL) {
+    else if (llvmValueType == LLVMTypes::Int32VectorType) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32");
     }
-    else if (Type::Equal(valueType, AtomicType::VaryingInt16) ||
-             Type::Equal(valueType, AtomicType::VaryingUInt16)) {
+    else if (llvmValueType == LLVMTypes::Int16VectorType) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16");
     }
-    else if (Type::Equal(valueType, AtomicType::VaryingInt8) ||
-             Type::Equal(valueType, AtomicType::VaryingUInt8)) {
+    else if (llvmValueType == LLVMTypes::Int8VectorType) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8");
     }
     AssertPos(currentPos, maskedStoreFunc != NULL);
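The ctx.cpp rewrite keys the dispatch off the value's LLVM storage type instead of its ispc type, so a varying bool stored as, say, <W x i16> on a 16-bit-mask target reuses the ordinary i16 path. A sketch of the mapping this reduces to (a stand-in function, not code from the tree):

```cpp
#include <string>

// Which __pseudo_masked_store_* the new dispatch selects, by element type.
// <W x i1> is excluded: it is handled separately with a load/blend/store.
std::string pseudoMaskedStoreFor(int elementBits, bool isFloatingPoint) {
    if (isFloatingPoint)
        return elementBits == 64 ? "__pseudo_masked_store_double"
                                 : "__pseudo_masked_store_float";
    return "__pseudo_masked_store_i" + std::to_string(elementBits);
}
```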
diff --git a/expr.cpp b/expr.cpp
index 3baaabaf..6bde2acb 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -6161,9 +6161,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-            // If we have a bool vector of i32 elements, first truncate
-            // down to a single bit
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
+            // If we have a bool vector of non-i1 elements, first
+            // truncate down to a single bit.
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         // And then do an unsigned int->float cast
         cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                              exprVal, targetType, cOpName);
@@ -6205,8 +6205,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-            // truncate i32 bool vector values to i1s
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
+            // truncate bool vector values to i1s
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double
                              exprVal, targetType, cOpName);
@@ -6243,7 +6243,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6279,7 +6279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6321,7 +6321,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6361,7 +6361,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6407,7 +6407,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6447,7 +6447,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6493,7 +6493,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6531,7 +6531,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
     switch (fromType->basicType) {
     case AtomicType::TYPE_BOOL:
         if (fromType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
             exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
         cast = ctx->ZExtInst(exprVal, targetType, cOpName);
         break;
@@ -6625,12 +6625,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,

     if (fromType->IsUniformType()) {
         if (toType->IsVaryingType() &&
-            LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) {
-            // extend out to i32 bool values from i1 here.  then we'll
-            // turn into a vector below, the way it does for everyone
-            // else...
+            LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) {
+            // extend out to a bool as an i8/i16/i32 from the i1 here.
+            // Then we'll turn that into a vector below, the way it
+            // does for everyone else...
             cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(),
-                                 LLVMGetName(cast, "to_i32bool"));
+                                 LLVMGetName(cast, "to_i_bool"));
         }
     }
     else
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 26c18bf5..180c8676 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
     LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
     LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);

-    if (target.getMaskBitCount() == 1)
+    switch (target.getMaskBitCount()) {
+    case 1:
         LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
             llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth());
-    else {
-        Assert(target.getMaskBitCount() == 32);
+        break;
+    case 8:
+        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
+            llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth());
+        break;
+    case 16:
+        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
+            llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth());
+        break;
+    case 32:
         LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
             llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth());
+        break;
+    default:
+        FATAL("Unhandled mask width for initializing MaskType");
     }

     LLVMTypes::Int1VectorType =
@@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {

     std::vector<llvm::Constant *> maskOnes;
     llvm::Constant *onMask = NULL;
-    if (target.getMaskBitCount() == 1)
+    switch (target.getMaskBitCount()) {
+    case 1:
         onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1,
                                         false /*unsigned*/); // 0x1
-    else
+        break;
+    case 8:
+        onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1,
+                                        true /*signed*/); // 0xff
+        break;
+    case 16:
+        onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1,
+                                        true /*signed*/); // 0xffff
+        break;
+    case 32:
         onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
                                         true /*signed*/); // 0xffffffff
+        break;
+    default:
+        FATAL("Unhandled mask width for onMask");
+    }

     for (int i = 0; i < target.getVectorWidth(); ++i)
         maskOnes.push_back(onMask);
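The all-on element values chosen above, pulled out as a standalone table for reference (this mirrors the switch and its 0x1/0xff/0xffff/0xffffffff comments; the function name is ours, not ispc's):

```cpp
#include <cassert>
#include <cstdint>

// All-on mask element per mask width: i1 masks use 1, wider masks use the
// all-ones bit pattern of their element width.
uint64_t onMaskElement(int maskBitCount) {
    switch (maskBitCount) {
    case 1:  return 0x1;
    case 8:  return 0xff;
    case 16: return 0xffff;
    case 32: return 0xffffffff;
    default: assert(false && "unhandled mask width"); return 0;
    }
}
```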
@@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {

     std::vector<llvm::Constant *> maskZeros;
     llvm::Constant *offMask = NULL;
-    if (target.getMaskBitCount() == 1)
+    switch (target.getMaskBitCount()) {
+    case 1:
         offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0,
                                          true /*signed*/);
-    else
+        break;
+    case 8:
+        offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0,
+                                         true /*signed*/);
+        break;
+    case 16:
+        offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0,
+                                         true /*signed*/);
+        break;
+    case 32:
         offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
                                          true /*signed*/);
-
+        break;
+    default:
+        FATAL("Unhandled mask width for offMask");
+    }
     for (int i = 0; i < target.getVectorWidth(); ++i)
         maskZeros.push_back(offMask);
     LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros);
@@ -444,9 +483,14 @@ LLVMBoolVector(bool b) {
     if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
         v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0,
                                    false /*unsigned*/);
+    else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
+        v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0,
+                                   false /*unsigned*/);
+    else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType)
+        v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0,
+                                   false /*unsigned*/);
     else {
-        Assert(LLVMTypes::BoolVectorType->getElementType() ==
-               llvm::Type::getInt1Ty(*g->ctx));
+        Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType);
         v = b ? LLVMTrue : LLVMFalse;
     }

@@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) {
         if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
             v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0,
                                        false /*unsigned*/);
+        else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
+            v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0,
+                                       false /*unsigned*/);
+        else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType)
+            v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0,
+                                       false /*unsigned*/);
         else {
-            Assert(LLVMTypes::BoolVectorType->getElementType() ==
-                   llvm::Type::getInt1Ty(*g->ctx));
+            Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType);
            v = bvec[i] ? LLVMTrue : LLVMFalse;
        }

diff --git a/parse.yy b/parse.yy
index 3ad815cf..488c864a 100644
--- a/parse.yy
+++ b/parse.yy
@@ -2148,8 +2148,24 @@ lAddFunctionParams(Declarator *decl) {

 /** Add a symbol for the built-in mask variable to the symbol table */
 static void
 lAddMaskToSymbolTable(SourcePos pos) {
-    const Type *t = g->target->getMaskBitCount() == 1 ?
-        AtomicType::VaryingBool : AtomicType::VaryingUInt32;
+    const Type *t;
+    switch (g->target->getMaskBitCount()) {
+    case 1:
+        t = AtomicType::VaryingBool;
+        break;
+    case 8:
+        t = AtomicType::VaryingUInt8;
+        break;
+    case 16:
+        t = AtomicType::VaryingUInt16;
+        break;
+    case 32:
+        t = AtomicType::VaryingUInt32;
+        break;
+    default:
+        FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable");
+    }
+    t = t->GetAsConstType();
     Symbol *maskSymbol = new Symbol("__mask", pos, t);
     m->symbolTable->AddVariable(maskSymbol);
diff --git a/stdlib.ispc b/stdlib.ispc
index b8ed2057..8ad5aa49 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -38,12 +38,20 @@
    ispc code
 */

-#ifdef ISPC_TARGET_GENERIC
-#define IntMaskType bool
-#define UIntMaskType bool
+#if (ISPC_MASK_BITS == 1)
+  #define IntMaskType bool
+  #define UIntMaskType bool
+#elif (ISPC_MASK_BITS == 8)
+  #define IntMaskType int8
+  #define UIntMaskType unsigned int8
+#elif (ISPC_MASK_BITS == 16)
+  #define IntMaskType int16
+  #define UIntMaskType unsigned int16
+#elif (ISPC_MASK_BITS == 32)
+  #define IntMaskType int32
+  #define UIntMaskType unsigned int32
 #else
-#define IntMaskType int32
-#define UIntMaskType unsigned int32
+  #error Unknown value of ISPC_MASK_BITS
 #endif

 ///////////////////////////////////////////////////////////////////////////

@@ -335,14 +343,15 @@ static inline int32 sign_extend(bool v) {
     return __sext_varying_bool(v);
 }

+
 __declspec(safe)
 static inline uniform bool any(bool v) {
     // We only care about whether "any" is true for the active program instances,
     // so we have to mask v with the current program mask.
-#ifdef ISPC_TARGET_GENERIC
+#if (ISPC_MASK_BITS == 1)
     return __any(v & __mask);
 #else
-    return __any(__sext_varying_bool(v) & __mask);
+    return __any((UIntMaskType)__sext_varying_bool(v) & __mask);
 #endif
 }

@@ -350,11 +359,10 @@ __declspec(safe)
 static inline uniform bool all(bool v) {
     // As with any(), we need to explicitly mask v with the current program mask
     // so we're only looking at the current lanes
-
-#ifdef ISPC_TARGET_GENERIC
+#if (ISPC_MASK_BITS == 1)
     return __all(v | !__mask);
 #else
-    return __all(__sext_varying_bool(v) | !__mask);
+    return __all((UIntMaskType)__sext_varying_bool(v) | !__mask);
 #endif
 }

@@ -362,11 +370,10 @@ __declspec(safe)
 static inline uniform bool none(bool v) {
     // As with any(), we need to explicitly mask v with the current program mask
     // so we're only looking at the current lanes
-
-#ifdef ISPC_TARGET_GENERIC
+#if (ISPC_MASK_BITS == 1)
     return __none(v & __mask);
 #else
-    return __none(__sext_varying_bool(v) & __mask);
+    return __none((UIntMaskType)__sext_varying_bool(v) & __mask);
 #endif
 }

@@ -399,10 +406,10 @@ static inline int popcnt(int64 v) {

 __declspec(safe)
 static inline uniform int popcnt(bool v) {
     // As with any() and all(), only count across the active lanes
-#ifdef ISPC_TARGET_GENERIC
+#if (ISPC_MASK_BITS == 1)
     return __popcnt_int64(__movmsk(v & __mask));
 #else
-    return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask));
+    return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
 #endif
 }
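Why any()/all()/none() combine v with __mask: inactive lanes must not influence the reduction, so they are forced to 0 for any()/none() and to 1 for all(). Modeled below with one bit per lane, in the spirit of __movmsk (the real code works on sign-extended lane values of the mask's element width, so this is a simplification):

```cpp
#include <cassert>
#include <cstdint>

// vBits/maskBits hold one bit per lane; unused high bits are assumed 0.
bool anyActive(uint32_t vBits, uint32_t maskBits)  { return (vBits & maskBits) != 0; }
bool noneActive(uint32_t vBits, uint32_t maskBits) { return (vBits & maskBits) == 0; }
bool allActive(uint32_t vBits, uint32_t maskBits)  { return (vBits | ~maskBits) == ~0u; }

int main() {
    // A true value in an inactive lane (mask bit 0) does not make any() true.
    assert(!anyActive(0b0100, 0b0011));
    // A false value in an inactive lane does not break all().
    assert(allActive(0b0011, 0b0011));
}
```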