Add clock() function to standard library.
Also corrected the declaration of num_cores() to return a uniform value.
This commit is contained in:
@@ -1811,6 +1811,22 @@ ok:
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; read hw clock
|
||||
|
||||
;; __clock: returns a 64-bit counter value read via the x86 rdtsc instruction.
;; Takes no arguments; the i64 result is assembled from the two 32-bit
;; values that the rdtsc asm statement produces through its ax/dx outputs.
define i64 @__clock() nounwind uwtable ssp {
|
||||
entry:
|
||||
;; Zero eax and execute cpuid before reading the counter.  The asm is marked
;; sideeffect and declares rax/rbx/rcx/rdx as clobbered.
;; NOTE(review): presumably used as a serializing barrier -- the accompanying
;; documentation says clock() flushes the processor pipeline; confirm.
tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
|
||||
;; Execute rdtsc; output 0 is bound to ax and output 1 to dx.
%0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
|
||||
;; %asmresult  = value from the ax output (becomes the low 32 bits)
%asmresult = extractvalue { i32, i32 } %0, 0
|
||||
;; %asmresult1 = value from the dx output (becomes the high 32 bits)
%asmresult1 = extractvalue { i32, i32 } %0, 1
|
||||
;; Widen the dx half to 64 bits and move it into bits 32..63.
%conv = zext i32 %asmresult1 to i64
|
||||
%shl = shl nuw i64 %conv, 32
|
||||
;; Widen the ax half and merge: result = (dx << 32) | ax.
%conv2 = zext i32 %asmresult to i64
|
||||
%or = or i64 %shl, %conv2
|
||||
ret i64 %or
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; stdlib transcendentals
|
||||
;;
|
||||
|
||||
@@ -3444,12 +3444,27 @@ pointer types.
|
||||
System Information
|
||||
------------------
|
||||
|
||||
A routine is available to find the number of CPU cores available in the
|
||||
system:
|
||||
The value of a high-precision hardware clock counter is returned by the
|
||||
``clock()`` routine; its value increments by one each processor cycle.
|
||||
Thus, taking the difference between the values returned by ``clock()`` at
|
||||
different points in program execution gives the number of cycles between
|
||||
those points in the program.
|
||||
|
||||
::
|
||||
|
||||
int num_cores()
|
||||
uniform int64 clock()
|
||||
|
||||
Note that ``clock()`` flushes the processor pipeline. It has an overhead
|
||||
of a hundred or so cycles, so for very fine-grained measurements, it may be
|
||||
worthwhile to measure the cost of calling ``clock()`` and subtract that
|
||||
value from reported results.
|
||||
|
||||
A routine is also available to find the number of CPU cores available in
|
||||
the system:
|
||||
|
||||
::
|
||||
|
||||
uniform int num_cores()
|
||||
|
||||
This value can be useful for adapting the granularity of parallel task
|
||||
decomposition depending on the number of processors in the system.
|
||||
|
||||
@@ -787,10 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// System information
|
||||
|
||||
// Returns the value produced by the __num_cores() builtin.
// The two signature lines below are the before/after forms shown by this
// change: the return type is changed from "int" to "uniform int".
static inline int num_cores() {
|
||||
static inline uniform int num_cores() {
|
||||
return __num_cores();
|
||||
}
|
||||
|
||||
// Returns the value produced by the __clock() builtin as a uniform int64.
// Takes no arguments; all work is delegated to __clock().
static inline uniform int64 clock() {
|
||||
return __clock();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Atomics and memory barriers
|
||||
|
||||
|
||||
Reference in New Issue
Block a user