NOTE(review): this patch was recovered from a copy in which line breaks and
indentation were collapsed and the <...> targets of #include directives were
stripped.
  * Line structure and indentation below are reconstructed; verify the
    context and '-' lines against the target tree before applying.
  * In the first hunk of mandelbrot_tasks.cpp the targets of the five bare
    "#include" lines could not be recovered and are left as found; restore
    them from the original patch.
  * The "index" lines still carry the blob ids of the original patch.

diff --git a/examples_ptx/ispc_malloc.h b/examples_ptx/ispc_malloc.h
new file mode 100644
index 00000000..6eb206db
--- /dev/null
+++ b/examples_ptx/ispc_malloc.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// Allocation helpers for the examples.
+//  - When _CUDA_ is defined, ispc_malloc/ispc_free are only declared here
+//    and must be defined in another translation unit.
+//  - Otherwise they are thin inline wrappers around malloc()/free().
+
+#include <stdlib.h>  // size_t, malloc, free (needed by both branches)
+
+#ifdef _CUDA_
+extern void ispc_malloc(void **ptr, const size_t size);
+extern void ispc_free(void *ptr);
+#else
+// Stores the result of malloc(size) in *ptr (NULL if the allocation fails).
+static inline void ispc_malloc(void **ptr, const size_t size)
+{
+  *ptr = malloc(size);
+}
+// Releases memory obtained from ispc_malloc(); forwards to free().
+static inline void ispc_free(void *ptr)
+{
+  free(ptr);
+}
+
+#endif
diff --git a/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp
index 682987ae..61678cdf 100644
--- a/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp
+++ b/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp
@@ -40,11 +40,11 @@
 
 #include
 #include
+#include
 #include
-#include
 #include "../timing.h"
+#include "../ispc_malloc.h"
 #include "mandelbrot_tasks_ispc.h"
-using namespace ispc;
 
 extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
                               int width, int height, int maxIterations,
@@ -103,25 +103,35 @@ int main(int argc, char *argv[]) {
 
 
     int maxIterations = 512;
-    int *buf = new int[width*height];
+    // Allocate the image through ispc_malloc() (see ../ispc_malloc.h);
+    // the size argument is in bytes, hence the sizeof(int) factor.
+    void *raw = NULL;
+    ispc_malloc(&raw, sizeof(int) * width * height);
+    if (raw == NULL) {
+        fprintf(stderr, "ispc_malloc failed\n");
+        return 1;
+    }
+    int *buf = static_cast<int *>(raw);
+
+    for (unsigned int i = 0; i < width * height; ++i)
+        buf[i] = 0;
 
     //
     // Compute the image using the ispc implementation; report the minimum
     // time of three runs.
     //
     double minISPC = 1e30;
-    for (int i = 0; i < test_iterations[0]; ++i) {
-        // Clear out the buffer
-        for (unsigned int i = 0; i < width * height; ++i)
-            buf[i] = 0;
-        reset_and_start_timer();
-        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
-        double dt = get_elapsed_mcycles();
-        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt);
-        minISPC = std::min(minISPC, dt);
+    for (int i = 0; i < test_iterations[0]; ++i)
+    {
+      // The buffer was cleared once above; it is not re-cleared per run.
+      reset_and_start_timer();
+      ispc::mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
+      double dt = get_elapsed_msec();
+      printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
+      minISPC = std::min(minISPC, dt);
     }
 
-    printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
+    printf("[mandelbrot ispc+tasks]:\t[%.3f] msec\n", minISPC);
     writePPM(buf, width, height, "mandelbrot-ispc.ppm");
 
 
@@ -129,22 +139,26 @@ int main(int argc, char *argv[]) {
     // And run the serial implementation 3 times, again reporting the
     // minimum time.
     //
+    // Clear out the buffer
+    for (unsigned int i = 0; i < width * height; ++i)
+      buf[i] = 0;
+
     double minSerial = 1e30;
-    for (int i = 0; i < test_iterations[1]; ++i) {
-        // Clear out the buffer
-        for (unsigned int i = 0; i < width * height; ++i)
-            buf[i] = 0;
-        reset_and_start_timer();
-        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
-        double dt = get_elapsed_mcycles();
-        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt);
-        minSerial = std::min(minSerial, dt);
+    for (int i = 0; i < test_iterations[1]; ++i)
+    {
+      reset_and_start_timer();
+      mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
+      double dt = get_elapsed_msec();
+      printf("@time of serial run:\t\t\t[%.3f] msec\n", dt);
+      minSerial = std::min(minSerial, dt);
     }
 
-    printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
+    printf("[mandelbrot serial]:\t\t[%.3f] msec\n", minSerial);
     writePPM(buf, width, height, "mandelbrot-serial.ppm");
 
     printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
 
+    ispc_free(buf);
+
     return 0;
 }
diff --git a/examples_ptx/timing.h b/examples_ptx/timing.h
new file mode 100644
index 00000000..7a44c531
--- /dev/null
+++ b/examples_ptx/timing.h
@@ -0,0 +1,129 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stddef.h>    // NULL
+#include <stdint.h>    // uint32_t, uint64_t
+#include <sys/time.h>  // gettimeofday, struct timeval
+
+// Returns the current time of day in seconds, as reported by
+// gettimeofday(), including the microsecond fraction.
+static inline double rtc(void)
+{
+  struct timeval Tvalue;
+  double etime;
+
+  // POSIX leaves a non-null timezone argument unspecified, so pass NULL.
+  gettimeofday(&Tvalue, NULL);
+  etime = (double) Tvalue.tv_sec +
+    1.e-6*((double) Tvalue.tv_usec);
+  return etime;
+}
+
+#ifdef __arm__
+#include <sys/time.h>
+// There's no easy way to get a hardware clock counter on ARM, so instead
+// we'll pretend it's a 1GHz processor and then compute pretend cycles
+// based on elapsed time from gettimeofday().
+__inline__ uint64_t rdtsc() {
+    static bool first = true;
+    static struct timeval tv_start;
+    if (first) {
+        gettimeofday(&tv_start, NULL);
+        first = false;
+        return 0;
+    }
+
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    tv.tv_sec -= tv_start.tv_sec;
+    tv.tv_usec -= tv_start.tv_usec;
+    return (1000000ull * tv.tv_sec + tv.tv_usec) * 1000ull;
+}
+
+#else // __arm__
+
+#ifdef WIN32
+#include <intrin.h>  // __rdtsc
+#define rdtsc __rdtsc
+#else // WIN32
+// Reads the processor time-stamp counter via the rdtsc instruction,
+// issuing cpuid immediately beforehand.
+__inline__ uint64_t rdtsc() {
+    uint32_t low, high;
+#ifdef __x86_64
+    __asm__ __volatile__ ("xorl %%eax,%%eax \n    cpuid"
+                          ::: "%rax", "%rbx", "%rcx", "%rdx" );
+#else
+    __asm__ __volatile__ ("xorl %%eax,%%eax \n    cpuid"
+                          ::: "%eax", "%ebx", "%ecx", "%edx" );
+#endif
+    __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
+    return (uint64_t)high << 32 | low;
+}
+#endif // !WIN32
+#endif // !__arm__
+
+// rdtsc() counts and rtc() times sampled by the timer functions below.
+static uint64_t start, end;
+static double tstart, tend;
+
+// Records the starting rdtsc() count and rtc() time for a measurement.
+static inline void reset_and_start_timer()
+{
+    start = rdtsc();
+    tstart = rtc();
+}
+
+/* Returns the rtc() time, in milliseconds, elapsed since the last
+   reset_and_start_timer() call. */
+static inline double get_elapsed_msec()
+{
+    tend = rtc();
+    return (tend - tstart)*1e3;
+}
+
+/* Despite its name, this currently returns elapsed rtc() time in
+   milliseconds since the last reset_and_start_timer() call: the
+   cycle-based computation is compiled out by the #if 0 below. */
+static inline double get_elapsed_mcycles()
+{
+    end = rdtsc();
+    tend = rtc();
+#if 0
+    return (end-start) / (1024. * 1024.);
+#else
+    return (tend - tstart)*1e3;
+#endif
+}