Add SSE4 target optimized for computation with 8-bit datatypes.

This change adds a new 'sse4-8' target, where programCount is 16 and
the mask element size is 8-bits.  (i.e. the most appropriate sizing of
the mask for SIMD computation with 8-bit datatypes.)
This commit is contained in:
Matt Pharr
2013-07-23 17:30:32 -07:00
parent 15a3ef370a
commit 53414f12e6
7 changed files with 578 additions and 7 deletions

View File

@@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
# These files need to be compiled in two versions - 32 and 64 bits.
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
# These are files to be compiled in single version.

View File

@@ -868,6 +868,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
}
break;
case 16:
Assert(g->target->getMaskBitCount() == 8);
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_sse4_8_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_sse4_8_64bit);
}
break;
default:
FATAL("logic error in DefineStdlib");
}

444
builtins/target-sse4-8.ll Normal file
View File

@@ -0,0 +1,444 @@
;; Copyright (c) 2013, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Google, Inc. nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 4-wide stuff
define(`WIDTH',`16')
define(`MASK',`i8')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <16 x float> %0, %call
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <16 x float> %call, %two_minus
ret <16 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <16 x float> %v, %is
%v_is_is = fmul <16 x float> %v_is, %is
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <16 x float> %is, %three_sub
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <16 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <16 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind
alwaysinline {
unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <16 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to16(%0, 8)
}
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to16(%0, 9)
}
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to16(%0, 10)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
; XXXround2to4double(%0, 8)
; FIXME: need round2to16double in util.m4...
ret <16 x double> undef
}
define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
; XXXround2to4double(%0, 9)
ret <16 x double> undef
}
define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
; XXXround2to4double(%0, 10)
ret <16 x double> undef
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <16 x float> %call
}
define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <16 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %call
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %call
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <16 x double> %ret
}
define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <16 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME
declare <16 x float> @__svml_sin(<16 x float>)
declare <16 x float> @__svml_cos(<16 x float>)
declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
declare <16 x float> @__svml_tan(<16 x float>)
declare <16 x float> @__svml_atan(<16 x float>)
declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
declare <16 x float> @__svml_exp(<16 x float>)
declare <16 x float> @__svml_log(<16 x float>)
declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%m64 = zext i32 %m to i64
ret i64 %m64
}
define i1 @__any(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%mne = icmp ne i32 %m, 0
ret i1 %mne
}
define i1 @__all(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, ALL_ON_MASK
ret i1 %meq
}
define i1 @__none(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, 0
ret i1 %meq
}
define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) {
%r = fadd <16 x float> %0, %1
ret <16 x float> %r
}
define internal float @__add_uniform_float(float, float) {
%r = fadd float %0, %1
ret float %r
}
define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
reduce16(float, @__add_varying_float, @__add_uniform_float)
}
define float @__reduce_min_float(<16 x float>) nounwind readnone {
reduce16(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<16 x float>) nounwind readnone {
reduce16(float, @__max_varying_float, @__max_uniform_float)
}
define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) {
%r = add <16 x i32> %0, %1
ret <16 x i32> %r
}
define internal i32 @__add_uniform_int32(i32, i32) {
%r = add i32 %0, %1
ret i32 %r
}
define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) {
%r = fadd <16 x double> %0, %1
ret <16 x double> %r
}
define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}
define double @__reduce_add_double(<16 x double>) nounwind readnone {
reduce16(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<16 x double>) nounwind readnone {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<16 x double>) nounwind readnone {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}
define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) {
%r = add <16 x i64> %0, %1
ret <16 x i64> %r
}
define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}
define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x i8> %mask) nounwind
alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i64>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old
store <16 x i64> %blend, <16 x i64>* %0, align 4
ret void
}
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i32>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old
store <16 x i32> %blend, <16 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i16>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old
store <16 x i16> %blend, <16 x i16>* %0, align 4
ret void
}
define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i8>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old
store <16 x i8> %blend, <16 x i8>* %0, align 4
ret void
}
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

View File

@@ -411,6 +411,42 @@ define(`unary2to8', `
'
)
define(`unary2to16', `
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
%$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2)
%$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3)
%$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4)
%$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5)
%$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6)
%$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7)
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)
;; Maps an 2-wide binary function to two 8-wide vector operands
;; $1: name of variable into which the final result should go
;; $2: scalar type of the vector elements
@@ -432,12 +468,58 @@ define(`binary2to8', `
%$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
'
)
define(`binary2to16', `
%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 0, i32 1>
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 2, i32 3>
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 4, i32 5>
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b)
%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 6, i32 7>
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)
%$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 8, i32 9>
%v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b)
%$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 10, i32 11>
%v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b)
%$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 12, i32 13>
%v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b)
%$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> <i32 14, i32 15>
%v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b)
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
%$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
'
)
@@ -460,6 +542,26 @@ ret <8 x float> %ret
'
)
define(`round4to16', `
%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2)
%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2)
%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2)
%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2)
%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %ret
'
)
define(`round8to16', `
%v0 = shufflevector <16 x float> $1, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

View File

@@ -3123,6 +3123,10 @@ static llvm::Value *
lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test,
llvm::Value *expr1, llvm::Value *expr2,
const Type *type) {
#if !defined(LLVM_3_1)
test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
return ctx->SelectInst(test, expr1, expr2, "select");
#else
llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp");
// Don't need to worry about masking here
ctx->StoreInst(expr2, resultPtr);
@@ -3131,6 +3135,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test,
PointerType::GetUniform(type)->LLVMType(g->ctx));
ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type));
return ctx->LoadInst(resultPtr, "selectexpr_final");
#endif // !LLVM_3_1
}

View File

@@ -310,6 +310,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse4-8")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 16;
this->m_vectorWidth = 16;
this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
this->m_maskingIsFree = false;
this->m_maskBitCount = 8;
}
else if (!strcasecmp(isa, "generic-4")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 4;

13
opt.cpp
View File

@@ -670,14 +670,17 @@ IntrinsicsOpt::IntrinsicsOpt()
// All of the mask instructions we may encounter. Note that even if
// compiling for AVX, we may still encounter the regular 4-wide SSE
// MOVMSK instruction.
llvm::Function *sseMovmsk =
llvm::Function *ssei8Movmsk =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128);
maskInstructions.push_back(ssei8Movmsk);
llvm::Function *sseFloatMovmsk =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps);
maskInstructions.push_back(sseMovmsk);
maskInstructions.push_back(sseFloatMovmsk);
maskInstructions.push_back(m->module->getFunction("__movmsk"));
llvm::Function *avxMovmsk =
llvm::Function *avxFloatMovmsk =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256);
Assert(avxMovmsk != NULL);
maskInstructions.push_back(avxMovmsk);
Assert(avxFloatMovmsk != NULL);
maskInstructions.push_back(avxFloatMovmsk);
// And all of the blend instructions
blendInstructions.push_back(BlendInstruction(