Add clock() function to standard library.
Also corrected the declaration of num_cores() to return a uniform value.
This commit is contained in:
@@ -1811,6 +1811,22 @@ ok:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; read hw clock: execute cpuid, then rdtsc, and return the 64-bit counter value
|
||||||
|
|
||||||
|
define i64 @__clock() nounwind uwtable ssp {
|
||||||
|
entry:
|
||||||
|
;; Zero eax and execute cpuid before sampling the counter.  The clobber list
;; marks rax/rbx/rcx/rdx and the flags as overwritten by this asm.
tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
|
||||||
|
;; rdtsc produces two 32-bit outputs: element 0 is bound to ax, element 1 to dx.
%0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
|
||||||
|
%asmresult = extractvalue { i32, i32 } %0, 0
|
||||||
|
%asmresult1 = extractvalue { i32, i32 } %0, 1
|
||||||
|
;; Assemble the i64 result as (dx << 32) | ax.
%conv = zext i32 %asmresult1 to i64
|
||||||
|
%shl = shl nuw i64 %conv, 32
|
||||||
|
%conv2 = zext i32 %asmresult to i64
|
||||||
|
%or = or i64 %shl, %conv2
|
||||||
|
ret i64 %or
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; stdlib transcendentals
|
;; stdlib transcendentals
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -3444,12 +3444,27 @@ pointer types.
|
|||||||
System Information
|
System Information
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
A routine is available to find the number of CPU cores available in the
|
The value of a high-precision hardware clock counter is returned by the
|
||||||
system:
|
``clock()`` routine; its value increments by one each processor cycle.
|
||||||
|
Thus, taking the difference between the values returned by ``clock()`` at
|
||||||
|
different points in program execution gives the number of cycles between
|
||||||
|
those points in the program.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
int num_cores()
|
uniform int64 clock()
|
||||||
|
|
||||||
|
Note that ``clock()`` flushes the processor pipeline. It has an overhead
|
||||||
|
of a hundred or so cycles, so for very fine-grained measurements, it may be
|
||||||
|
worthwhile to measure the cost of calling ``clock()`` and subtract that
|
||||||
|
value from reported results.
|
||||||
|
|
||||||
|
A routine is also available to find the number of CPU cores available in
|
||||||
|
the system:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
uniform int num_cores()
|
||||||
|
|
||||||
This value can be useful for adapting the granularity of parallel task
|
This value can be useful for adapting the granularity of parallel task
|
||||||
decomposition depending on the number of processors in the system.
|
decomposition depending on the number of processors in the system.
|
||||||
|
|||||||
@@ -787,10 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// System information
|
// System information
|
||||||
|
|
||||||
// num_cores(): forwards to the __num_cores() builtin and returns its result.
// This hunk shows both sides of the change: the return type goes from
// "int" to "uniform int".
static inline int num_cores() {
|
static inline uniform int num_cores() {
|
||||||
return __num_cores();
|
return __num_cores();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// clock(): returns the hardware clock counter as a uniform int64 by
// forwarding to the __clock() builtin.
static inline uniform int64 clock() {
|
||||||
|
return __clock();
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Atomics and memory barriers
|
// Atomics and memory barriers
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user