Files
ispc/builtins-sse2-common.ll
Matt Pharr cabe358c0a Workaround change to linker behavior in LLVM 3.1
Now, the Linker::LinkModules() call doesn't link in any functions
marked as 'internal', which is problematic, since we'd like to have
just about all of the builtins marked as internal so that they are
eliminated after they've been inlined when they are in fact used.

This change removes all of the internal qualifiers in the builtins
and adds a lSetInternalFunctions() routine to builtins.cpp that
sets this property on the functions that need it after they've
been linked in by LinkModules().
2011-11-05 16:57:26 -07:00

267 lines
9.1 KiB
LLVM

;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr
; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret
}
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare double @round(double)
declare double @floor(double)
declare double @ceil(double)
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @round(double %0)
ret double %r
}
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @floor(double %0)
ret double %r
}
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @ceil(double %0)
ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%val = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %val
}
define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
%val = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %val
}