Add support for mask vectors of 8 and 16-bit element types.

There were a number of places throughout the system that assumed that the
execution mask would only have either 32-bit or 1-bit elements.  This
commit makes it possible to have a target with an 8- or 16-bit mask.
Matt Pharr
2013-07-23 16:38:10 -07:00
parent 83e1630fbc
commit e7abf3f2ea
8 changed files with 284 additions and 133 deletions
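As a rough illustration of what the change enables (a sketch in LLVM IR; the 8-wide target with a 16-bit mask is purely an example), this is the mask representation that the InitLLVMUtil and lAddMaskToSymbolTable changes below set up; the 1-, 8-, and 32-bit cases follow the same pattern with i1, i8, and i32 elements:

  ; hypothetical target with vector width 8 and getMaskBitCount() == 16:
  ; MaskType/BoolVectorType become <8 x i16>; an "on" lane holds the
  ; sign-extended true value (0xffff) and an "off" lane holds 0
  @example_mask_all_on  = constant <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1,
                                              i16 -1, i16 -1, i16 -1, i16 -1>
  @example_mask_all_off = constant <8 x i16> zeroinitializer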

@@ -137,7 +137,7 @@ BISON_SRC=parse.yy
FLEX_SRC=lex.ll
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
stdlib_generic_ispc.o stdlib_x86_ispc.o \
stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
default: ispc
@@ -243,12 +243,23 @@ objs/builtins-c-64.cpp: builtins/builtins.c
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@
objs/stdlib_generic_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for generic
@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py generic > $@
objs/stdlib_mask1_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask1
@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py mask1 > $@
objs/stdlib_mask8_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask8
@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py mask8 > $@
objs/stdlib_mask16_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask16
@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py mask16 > $@
objs/stdlib_mask32_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask32
@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py mask32 > $@
objs/stdlib_x86_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for x86
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py x86 > $@

@@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
// varying
if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
t == LLVMTypes::MaskType)
return AtomicType::VaryingBool;
else if (t == LLVMTypes::Int8VectorType)
if (t == LLVMTypes::Int8VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
else if (t == LLVMTypes::Int16VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
@@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return AtomicType::VaryingDouble;
else if (t == LLVMTypes::Int64VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
else if (t == LLVMTypes::MaskType)
return AtomicType::VaryingBool;
// pointers to uniform
else if (t == LLVMTypes::Int8PointerType)
@@ -1038,16 +1037,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// If the user wants the standard library to be included, parse the
// serialized version of the stdlib.ispc file to get its
// definitions added.
extern char stdlib_mask1_code[], stdlib_mask8_code[];
extern char stdlib_mask16_code[], stdlib_mask32_code[];
if (g->target->getISA() == Target::GENERIC &&
g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib
extern char stdlib_generic_code[];
yy_scan_string(stdlib_generic_code);
yyparse();
g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib
yy_scan_string(stdlib_mask32_code);
}
else {
extern char stdlib_x86_code[];
yy_scan_string(stdlib_x86_code);
switch (g->target->getMaskBitCount()) {
case 1:
yy_scan_string(stdlib_mask1_code);
break;
case 8:
yy_scan_string(stdlib_mask8_code);
break;
case 16:
yy_scan_string(stdlib_mask16_code);
break;
case 32:
yy_scan_string(stdlib_mask32_code);
break;
default:
FATAL("Unhandled mask bit size for stdlib.ispc");
}
}
yyparse();
}
}
}

@@ -690,6 +690,75 @@ shuffles(i64, 8)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
define(`mask_converts', `
define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) {
%r = sext <$1 x i1> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) {
ret <$1 x i8> %0
}
define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) {
%r = sext <$1 x i8> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) {
%r = trunc <$1 x i16> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) {
ret <$1 x i16> %0
}
define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) {
%r = sext <$1 x i16> %0 to <$1 x i32>
ret <$1 x i32> %r
}
define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) {
%r = sext <$1 x i16> %0 to <$1 x i64>
ret <$1 x i64> %r
}
define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) {
%r = trunc <$1 x i32> %0 to <$1 x i8>
ret <$1 x i8> %r
}
define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) {
%r = trunc <$1 x i32> %0 to <$1 x i16>
ret <$1 x i16> %r
}
define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) {
ret <$1 x i32> %0
}
define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
%r = sext <$1 x i32> %0 to <$1 x i64>
ret <$1 x i64> %r
}
')
mask_converts(WIDTH)
define(`global_atomic_associative', `
define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
@@ -697,17 +766,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
; first, for any lanes where the mask is off, compute a vector where those lanes
; hold the identity value..
; for the bit tricks below, we need the mask to be sign extended to be
; the size of the element type.
ifelse(
MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>',
$3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>',
$3,i32, `
; silly workaround to do %mask = %m, which is not possible directly..
%maskmem = alloca <$1 x i32>
store <$1 x i32> %m, <$1 x i32> * %maskmem
%mask = load <$1 x i32> * %maskmem'
)
; for the bit tricks below, we need the mask to have
; the same element size as the element type.
%mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m)
; zero out any lanes that are off
%valoff = and <$1 x $3> %val, %mask
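For concreteness, on a hypothetical 8-wide target with a 16-bit mask and a 32-bit element type, the two lines above would expand roughly as follows (a sketch of the m4 expansion, not the literal generated code):

  ; widen the <8 x i16> execution mask to match the i32 element type
  %mask = call <8 x i32> @convertmask_i16_i32_8(<8 x i16> %m)
  ; zero out any lanes that are off
  %valoff = and <8 x i32> %val, %mask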
@@ -2440,13 +2502,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
}
define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
ifelse(MASK,i1, `
%se = sext <WIDTH x i1> %0 to <WIDTH x i32>
ret <WIDTH x i32> %se
', `
ret <WIDTH x i32> %0')
ifelse(MASK,i32, `ret <WIDTH x i32> %0',
`%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
ret <WIDTH x i32> %se')
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; memcpy/memmove/memset
@@ -3201,8 +3262,8 @@ return:
;; $1: llvm type of elements (and suffix for function name)
define(`gen_masked_store', `
define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x i32>) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x i32> %2, `
define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x MASK>) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %2, `
%ptr_LANE_ID = getelementptr <WIDTH x $1> * %0, i32 0, i32 LANE
%storeval_LANE_ID = extractelement <WIDTH x $1> %1, i32 LANE
store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID')
@@ -3378,10 +3439,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
define(`packed_load_and_store', `
define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
<WIDTH x i32> %full_mask) nounwind alwaysinline {
<WIDTH x MASK> %full_mask) nounwind alwaysinline {
entry:
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
%mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
known_mask:
@@ -3432,10 +3493,10 @@ done:
}
define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
<WIDTH x i32> %full_mask) nounwind alwaysinline {
<WIDTH x MASK> %full_mask) nounwind alwaysinline {
entry:
%mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
%mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask)
%mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask)
br i1 %mask_known, label %known_mask, label %unknown_mask
known_mask:
@@ -3544,10 +3605,10 @@ check_neighbors:
%castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1)
%vr = bitcast <$1 x $4> %castvr to <$1 x $2>
%eq = $5 $7 <$1 x $2> %vec, %vr
ifelse(MASK,i32, `
%eq32 = sext <$1 x i1> %eq to <$1 x i32>
%eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
%eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
ifelse(MASK,i1, `
%eqmm = call i64 @__movmsk(<$1 x MASK> %eq)',
`%eqm = sext <$1 x i1> %eq to <$1 x MASK>
%eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)')
%alleq = icmp eq i64 %eqmm, ALL_ON_MASK
br i1 %alleq, label %all_equal, label %not_all_equal
', `
@@ -3722,9 +3783,9 @@ pl_done:
define(`gen_gather_general', `
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3738,9 +3799,9 @@ define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
; fully general 64-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
per_lane(WIDTH, <WIDTH x MASK> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
@@ -3804,7 +3865,7 @@ define <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %o
define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
; legal to read from (and we do indeed require that, given the benefits!)
@@ -3813,13 +3874,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32
%offsetsPtr = alloca <WIDTH x i32>
store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %offsetsPtr
call void @__masked_store_blend_i32(<WIDTH x i32> * %offsetsPtr, <WIDTH x i32> %offsets,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newOffsets = load <WIDTH x i32> * %offsetsPtr
%deltaPtr = alloca <WIDTH x i32>
store <WIDTH x i32> zeroinitializer, <WIDTH x i32> * %deltaPtr
call void @__masked_store_blend_i32(<WIDTH x i32> * %deltaPtr, <WIDTH x i32> %offset_delta,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newDelta = load <WIDTH x i32> * %deltaPtr
%ret0 = call <WIDTH x $1> @__gather_elt32_$1(i8 * %ptr, <WIDTH x i32> %newOffsets,
@@ -3835,7 +3896,7 @@ define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32
define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
; legal to read from (and we do indeed require that, given the benefits!)
@@ -3844,13 +3905,13 @@ define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64
%offsetsPtr = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %offsetsPtr
call void @__masked_store_blend_i64(<WIDTH x i64> * %offsetsPtr, <WIDTH x i64> %offsets,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newOffsets = load <WIDTH x i64> * %offsetsPtr
%deltaPtr = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %deltaPtr
call void @__masked_store_blend_i64(<WIDTH x i64> * %deltaPtr, <WIDTH x i64> %offset_delta,
<WIDTH x i32> %vecmask)
<WIDTH x MASK> %vecmask)
%newDelta = load <WIDTH x i64> * %deltaPtr
%ret0 = call <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %newOffsets,
@@ -3876,27 +3937,27 @@ gen_gather_factored($1)
define <WIDTH x $1>
@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i32> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%scale_vec = bitcast i32 %offset_scale to <1 x i32>
%smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i32> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %scaled_offsets, i32 1,
<WIDTH x i32> zeroinitializer, <WIDTH x i32> %vecmask)
<WIDTH x i32> zeroinitializer, <WIDTH x MASK> %vecmask)
ret <WIDTH x $1> %v
}
define <WIDTH x $1>
@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i64> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
<WIDTH x MASK> %vecmask) nounwind readonly alwaysinline {
%scale64 = zext i32 %offset_scale to i64
%scale_vec = bitcast i64 %scale64 to <1 x i64>
%smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i64> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %scaled_offsets,
i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x i32> %vecmask)
i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x MASK> %vecmask)
ret <WIDTH x $1> %v
}
@@ -3955,9 +4016,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_s
define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
<WIDTH x MASK> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane(WIDTH, <WIDTH x i32> %mask, `
per_lane(WIDTH, <WIDTH x MASK> %mask, `
call void @__scatter_elt32_$1(i8 * %base, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
ret void
@@ -3965,9 +4026,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offs
define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
<WIDTH x MASK> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane(WIDTH, <WIDTH x i32> %mask, `
per_lane(WIDTH, <WIDTH x MASK> %mask, `
call void @__scatter_elt64_$1(i8 * %base, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta, <WIDTH x $1> %values, i32 LANE)')
ret void
@@ -3975,8 +4036,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offs
; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s
define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x i32> %mask, `
<WIDTH x MASK> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE
@@ -3987,8 +4048,8 @@ define void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s
define void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x i32> %mask, `
<WIDTH x MASK> %mask) nounwind alwaysinline {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = extractelement <WIDTH x $1> %values, i32 LANE

ctx.cpp

@@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
for (unsigned int i = 0; i < at->getNumElements(); ++i) {
llvm::Value *elt = ExtractInst(b, i);
llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType,
LLVMGetName(elt, "_to_boolvec32"));
LLVMGetName(elt, "_to_boolvec"));
ret = InsertInst(ret, sext, i);
}
return ret;
}
else
return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32"));
return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec"));
}
@@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
// Figure out if we need an 8, 16, 32 or 64-bit masked store.
llvm::Function *maskedStoreFunc = NULL;
llvm::Type *llvmValueType = value->getType();
const PointerType *pt = CastType<PointerType>(valueType);
if (pt != NULL) {
@@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
else
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64");
}
else if (Type::Equal(valueType, AtomicType::VaryingBool) &&
g->target->getMaskBitCount() == 1) {
else if (llvmValueType == LLVMTypes::Int1VectorType) {
llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask,
LLVMMaskAllOn, "~mask");
llvm::Value *old = LoadInst(ptr);
@@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
StoreInst(final, ptr);
return;
}
else if (Type::Equal(valueType, AtomicType::VaryingDouble)) {
else if (llvmValueType == LLVMTypes::DoubleVectorType) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double");
}
else if (Type::Equal(valueType, AtomicType::VaryingInt64) ||
Type::Equal(valueType, AtomicType::VaryingUInt64)) {
else if (llvmValueType == LLVMTypes::Int64VectorType) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64");
}
else if (Type::Equal(valueType, AtomicType::VaryingFloat)) {
else if (llvmValueType == LLVMTypes::FloatVectorType) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float");
}
else if (Type::Equal(valueType, AtomicType::VaryingBool) ||
Type::Equal(valueType, AtomicType::VaryingInt32) ||
Type::Equal(valueType, AtomicType::VaryingUInt32) ||
CastType<EnumType>(valueType) != NULL) {
else if (llvmValueType == LLVMTypes::Int32VectorType) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32");
}
else if (Type::Equal(valueType, AtomicType::VaryingInt16) ||
Type::Equal(valueType, AtomicType::VaryingUInt16)) {
else if (llvmValueType == LLVMTypes::Int16VectorType) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16");
}
else if (Type::Equal(valueType, AtomicType::VaryingInt8) ||
Type::Equal(valueType, AtomicType::VaryingUInt8)) {
else if (llvmValueType == LLVMTypes::Int8VectorType) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8");
}
AssertPos(currentPos, maskedStoreFunc != NULL);

@@ -6161,9 +6161,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
// If we have a bool vector of i32 elements, first truncate
// down to a single bit
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
// If we have a bool vector of non-i1 elements, first
// truncate down to a single bit.
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
// And then do an unsigned int->float cast
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
@@ -6205,8 +6205,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
// truncate i32 bool vector values to i1s
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
// truncate bool vector values to i1s
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double
exprVal, targetType, cOpName);
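At the IR level, converting a varying bool to floating point on a target whose bool vector is wider than i1 is now a two-step sequence; a sketch for a hypothetical 8-wide, 16-bit-mask target converting to double:

  ; the varying bool arrives as <8 x i16> (0 or -1 per lane); first drop to one bit per lane
  %b1 = trunc <8 x i16> %boolvec to <8 x i1>
  ; then the unsigned int -> double conversion gives 0.0 or 1.0 per lane
  %d = uitofp <8 x i1> %b1 to <8 x double>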
@@ -6243,7 +6243,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6279,7 +6279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6321,7 +6321,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6361,7 +6361,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6407,7 +6407,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6447,7 +6447,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6493,7 +6493,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6531,7 +6531,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
cast = ctx->ZExtInst(exprVal, targetType, cOpName);
break;
@@ -6625,12 +6625,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
if (fromType->IsUniformType()) {
if (toType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) {
// extend out to i32 bool values from i1 here. then we'll
// turn into a vector below, the way it does for everyone
// else...
LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) {
// extend out to a bool as an i8/i16/i32 from the i1 here.
// Then we'll turn that into a vector below, the way it
// does for everyone else...
cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(),
LLVMGetName(cast, "to_i32bool"));
LLVMGetName(cast, "to_i_bool"));
}
}
else

@@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
if (target.getMaskBitCount() == 1)
switch (target.getMaskBitCount()) {
case 1:
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth());
else {
Assert(target.getMaskBitCount() == 32);
break;
case 8:
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth());
break;
case 16:
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth());
break;
case 32:
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth());
break;
default:
FATAL("Unhandled mask width for initializing MaskType");
}
LLVMTypes::Int1VectorType =
@@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
std::vector<llvm::Constant *> maskOnes;
llvm::Constant *onMask = NULL;
if (target.getMaskBitCount() == 1)
switch (target.getMaskBitCount()) {
case 1:
onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1,
false /*unsigned*/); // 0x1
else
break;
case 8:
onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1,
true /*signed*/); // 0xff
break;
case 16:
onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1,
true /*signed*/); // 0xffff
break;
case 32:
onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
true /*signed*/); // 0xffffffff
break;
default:
FATAL("Unhandled mask width for onMask");
}
for (int i = 0; i < target.getVectorWidth(); ++i)
maskOnes.push_back(onMask);
@@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
std::vector<llvm::Constant *> maskZeros;
llvm::Constant *offMask = NULL;
if (target.getMaskBitCount() == 1)
switch (target.getMaskBitCount()) {
case 1:
offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0,
true /*signed*/);
else
break;
case 8:
offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0,
true /*signed*/);
break;
case 16:
offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0,
true /*signed*/);
break;
case 32:
offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
true /*signed*/);
break;
default:
FATAL("Unhandled mask width for offMask");
}
for (int i = 0; i < target.getVectorWidth(); ++i)
maskZeros.push_back(offMask);
LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros);
@@ -444,9 +483,14 @@ LLVMBoolVector(bool b) {
if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0,
false /*unsigned*/);
else {
Assert(LLVMTypes::BoolVectorType->getElementType() ==
llvm::Type::getInt1Ty(*g->ctx));
Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType);
v = b ? LLVMTrue : LLVMFalse;
}
@@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) {
if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0,
false /*unsigned*/);
else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType)
v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0,
false /*unsigned*/);
else {
Assert(LLVMTypes::BoolVectorType->getElementType() ==
llvm::Type::getInt1Ty(*g->ctx));
Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType);
v = bvec[i] ? LLVMTrue : LLVMFalse;
}

@@ -2148,8 +2148,24 @@ lAddFunctionParams(Declarator *decl) {
/** Add a symbol for the built-in mask variable to the symbol table */
static void lAddMaskToSymbolTable(SourcePos pos) {
const Type *t = g->target->getMaskBitCount() == 1 ?
AtomicType::VaryingBool : AtomicType::VaryingUInt32;
const Type *t;
switch (g->target->getMaskBitCount()) {
case 1:
t = AtomicType::VaryingBool;
break;
case 8:
t = AtomicType::VaryingUInt8;
break;
case 16:
t = AtomicType::VaryingUInt16;
break;
case 32:
t = AtomicType::VaryingUInt32;
break;
default:
FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable");
}
t = t->GetAsConstType();
Symbol *maskSymbol = new Symbol("__mask", pos, t);
m->symbolTable->AddVariable(maskSymbol);

@@ -38,12 +38,20 @@
ispc code
*/
#ifdef ISPC_TARGET_GENERIC
#define IntMaskType bool
#define UIntMaskType bool
#if (ISPC_MASK_BITS == 1)
#define IntMaskType bool
#define UIntMaskType bool
#elif (ISPC_MASK_BITS == 8)
#define IntMaskType int8
#define UIntMaskType unsigned int8
#elif (ISPC_MASK_BITS == 16)
#define IntMaskType int16
#define UIntMaskType unsigned int16
#elif (ISPC_MASK_BITS == 32)
#define IntMaskType int32
#define UIntMaskType unsigned int32
#else
#define IntMaskType int32
#define UIntMaskType unsigned int32
#error Unknown value of ISPC_MASK_BITS
#endif
///////////////////////////////////////////////////////////////////////////
@@ -335,14 +343,15 @@ static inline int32 sign_extend(bool v) {
return __sext_varying_bool(v);
}
__declspec(safe)
static inline uniform bool any(bool v) {
// We only care about whether "any" is true for the active program instances,
// so we have to mask v with the current program mask.
#ifdef ISPC_TARGET_GENERIC
#if (ISPC_MASK_BITS == 1)
return __any(v & __mask);
#else
return __any(__sext_varying_bool(v) & __mask);
return __any((UIntMaskType)__sext_varying_bool(v) & __mask);
#endif
}
@@ -350,11 +359,10 @@ __declspec(safe)
static inline uniform bool all(bool v) {
// As with any(), we need to explicitly mask v with the current program mask
// so we're only looking at the current lanes
#ifdef ISPC_TARGET_GENERIC
#if (ISPC_MASK_BITS == 1)
return __all(v | !__mask);
#else
return __all(__sext_varying_bool(v) | !__mask);
return __all((UIntMaskType)__sext_varying_bool(v) | !__mask);
#endif
}
@@ -362,11 +370,10 @@ __declspec(safe)
static inline uniform bool none(bool v) {
// As with any(), we need to explicitly mask v with the current program mask
// so we're only looking at the current lanes
#ifdef ISPC_TARGET_GENERIC
#if (ISPC_MASK_BITS == 1)
return __none(v & __mask);
#else
return __none(__sext_varying_bool(v) & __mask);
return __none((UIntMaskType)__sext_varying_bool(v) & __mask);
#endif
}
@@ -399,10 +406,10 @@ static inline int popcnt(int64 v) {
__declspec(safe)
static inline uniform int popcnt(bool v) {
// As with any() and all(), only count across the active lanes
#ifdef ISPC_TARGET_GENERIC
#if (ISPC_MASK_BITS == 1)
return __popcnt_int64(__movmsk(v & __mask));
#else
return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask));
return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
#endif
}
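As a hedged sketch of how the wider mask flows through these reductions (using the __movmsk signature shown earlier in this diff; the 8-wide, 16-bit-mask combination is again just an example), an any()-style test reduces to roughly:

  ; %bool_as_mask holds 0 or -1 per lane in the mask element type (i16 here)
  %lanes = and <8 x i16> %bool_as_mask, %mask
  ; __movmsk packs one bit per lane into an i64; any() is then a nonzero test
  %mm  = call i64 @__movmsk(<8 x i16> %lanes)
  %any = icmp ne i64 %mm, 0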