From d805e8b18386e5635e617a2d5188a13ce8c40c31 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sun, 22 Jan 2012 13:05:27 -0800 Subject: [PATCH] Add clock() function to standard library. Also corrected the declaration of num_cores() to return a uniform value. --- builtins/util.m4 | 16 ++++++++++++++++ docs/ispc.rst | 21 ++++++++++++++++++--- stdlib.ispc | 6 +++++- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 883cfb4c..64e3a130 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1811,6 +1811,22 @@ ok: ret void } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; read hw clock + +define i64 @__clock() nounwind uwtable ssp { +entry: + tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind + %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind + %asmresult = extractvalue { i32, i32 } %0, 0 + %asmresult1 = extractvalue { i32, i32 } %0, 1 + %conv = zext i32 %asmresult1 to i64 + %shl = shl nuw i64 %conv, 32 + %conv2 = zext i32 %asmresult to i64 + %or = or i64 %shl, %conv2 + ret i64 %or +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib transcendentals ;; diff --git a/docs/ispc.rst b/docs/ispc.rst index ddac4bcd..c0dcd6df 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3444,12 +3444,27 @@ pointer types. System Information ------------------ -A routine is available to find the number of CPU cores available in the -system: +The value of a high-precision hardware clock counter is returned by the +``clock()`` routine; its value increments by one each processor cycle. +Thus, taking the difference between the values returned by ``clock()`` at +different points in program execution gives the number of cycles between +those points in the program. 
:: - int num_cores() + uniform int64 clock() + +Note that ``clock()`` flushes the processor pipeline. It has an overhead +of a hundred or so cycles, so for very fine-grained measurements, it may be +worthwhile to measure the cost of calling ``clock()`` and subtract that +value from reported results. + +A routine is also available to find the number of CPU cores available in +the system: + +:: + + uniform int num_cores() This value can be useful for adapting the granularity of parallel task decomposition depending on the number of processors in the system. diff --git a/stdlib.ispc b/stdlib.ispc index cae63abe..8a7daf49 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -787,10 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) { /////////////////////////////////////////////////////////////////////////// // System information -static inline int num_cores() { +static inline uniform int num_cores() { return __num_cores(); } +static inline uniform int64 clock() { + return __clock(); +} + /////////////////////////////////////////////////////////////////////////// // Atomics and memory barriers