Big rewrite / improvement of target handling.

If no CPU is specified, use the host CPU type, not just a default of "nehalem".
Provide better feature strings to the LLVM target machinery.
 -> Thus ensuring that LLVM doesn't generate SSE>2 instructions for the SSE2
    target (Fixes issue #82).
 -> Slight code improvements from using cmovs in generated code now
Use the LLVM popcnt intrinsic for the SSE2 target (it no longer generates
  code that calls the POPCNT instruction, now that we properly tell LLVM
  which instructions are and aren't available for SSE2.)
This commit is contained in:
Matt Pharr
2011-08-26 09:54:45 -07:00
parent c340ff3893
commit b67498766e
7 changed files with 286 additions and 184 deletions

View File

@@ -277,41 +277,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
; FIXME: this is very inefficient, loops over all 32 bits...
; we could use the LLVM intrinsic declare i32 @llvm.ctpop.i32(i32),
; although that currently ends up generating a POPCNT instruction even
; if we give --target=sse2 on the command line. We probably need to
; pipe through the 'sse2' request to LLVM via the 'features' string
; at codegen time... (If e.g. --cpu=penryn is also passed along, then
; it does generate non-POPCNT code and in particular better code than
; the below does.)
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
; Population count of a uniform 32-bit integer: returns the number of bits
; set in the i32 argument (%0).
;
; Implemented with the @llvm.ctpop.i32 intrinsic (declared earlier in this
; file) instead of a hand-written shift-and-add loop over the bits, leaving
; instruction selection to the LLVM backend.
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %val = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %val
}
; Population count of a uniform 64-bit integer: returns, as an i32, the
; number of bits set in the i64 argument (%0).
;
; Implemented with the @llvm.ctpop.i64 intrinsic (declared earlier in this
; file) rather than splitting the value into two 32-bit halves and summing
; their individual counts.
define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
  %val = call i64 @llvm.ctpop.i64(i64 %0)
  ; The count is at most 64, so narrowing the result to i32 loses nothing.
  %val32 = trunc i64 %val to i32
  ret i32 %val32
}