Add clock() function to standard library.

Also corrected the declaration of num_cores() to return a
uniform value.
This commit is contained in:
Matt Pharr
2012-01-22 13:05:27 -08:00
parent 1f0f2ec05f
commit d805e8b183
3 changed files with 39 additions and 4 deletions

View File

@@ -1811,6 +1811,22 @@ ok:
ret void ret void
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; read hw clock
;; Reads the hardware cycle counter and returns it as one 64-bit integer.
;; A cpuid instruction (with eax zeroed) is issued ahead of rdtsc; the
;; stdlib documentation describes clock() as flushing the processor
;; pipeline, which is presumably the purpose of that cpuid.
define i64 @__clock() nounwind uwtable ssp {
entry:
  ;; cpuid results are not used; the registers appear only as clobbers
  tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
  ;; rdtsc yields two 32-bit outputs: element 0 bound to ax, element 1 to dx
  %tsc = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
  %lo32 = extractvalue { i32, i32 } %tsc, 0
  %hi32 = extractvalue { i32, i32 } %tsc, 1
  ;; widen both halves, then place the dx half in the upper 32 bits
  %lo64 = zext i32 %lo32 to i64
  %hi64 = zext i32 %hi32 to i64
  %hi_upper = shl nuw i64 %hi64, 32
  %cycles = or i64 %hi_upper, %lo64
  ret i64 %cycles
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; stdlib transcendentals ;; stdlib transcendentals
;; ;;

View File

@@ -3444,12 +3444,27 @@ pointer types.
System Information System Information
------------------ ------------------
A routine is available to find the number of CPU cores available in the The value of a high-precision hardware clock counter is returned by the
system: ``clock()`` routine; its value increments by one each processor cycle.
Thus, taking the difference between the values returned by ``clock()`` at
different points in program execution gives the number of cycles between
those points in the program.
:: ::
int num_cores() uniform int64 clock()
Note that ``clock()`` flushes the processor pipeline. It has an overhead
of a hundred or so cycles, so for very fine-grained measurements, it may be
worthwhile to measure the cost of calling ``clock()`` and subtracting that
value from reported results.
A routine is also available to find the number of CPU cores available in
the system:
::
uniform int num_cores()
This value can be useful for adapting the granularity of parallel task This value can be useful for adapting the granularity of parallel task
decomposition depending on the number of processors in the system. decomposition depending on the number of processors in the system.

View File

@@ -787,10 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// System information // System information
// Reports the number of CPU cores available in the system by forwarding
// to the __num_cores() builtin.
static inline int num_cores() { static inline uniform int num_cores() {
return __num_cores(); return __num_cores();
} }
// Returns the current reading of the hardware clock counter as a uniform
// 64-bit integer; the value comes directly from the __clock() builtin.
// Subtracting two readings gives the cycles elapsed between the calls.
static inline uniform int64 clock() {
return __clock();
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Atomics and memory barriers // Atomics and memory barriers