diff --git a/examples_cuda/intrinsics/knc-i1x16.h b/examples_cuda/intrinsics/knc-i1x16.h index ea15df5d..8c3f80e6 100644 --- a/examples_cuda/intrinsics/knc-i1x16.h +++ b/examples_cuda/intrinsics/knc-i1x16.h @@ -739,7 +739,7 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 #endif } -#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ +#if 1 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { return _mm512_load_epi32(p); @@ -1007,7 +1007,7 @@ template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 #endif } -#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */ +#if 1 /* knc::fails as with _i32 this may generate fails ... so commetining it out */ template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { __m512i v2 = _mm512_load_epi32(p); @@ -1119,7 +1119,7 @@ template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) #endif } -#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ +#if 1 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { return _mm512_load_ps(p); @@ -1397,7 +1397,7 @@ template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) } -#if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */ +#if 1 /* knc::fails as with _f this may generate fails ... so commetining it out */ template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); diff --git a/examples_cuda/mandelbrot_tasks/Makefile b/examples_cuda/mandelbrot_tasks/Makefile index 1a565ffd..1663a92e 100644 --- a/examples_cuda/mandelbrot_tasks/Makefile +++ b/examples_cuda/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=avx ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples_cuda/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples_cuda/mandelbrot_tasks/mandelbrot_tasks.cpp index 698daf0f..24e72115 100644 --- a/examples_cuda/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples_cuda/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -42,105 +42,123 @@ #include #include #include "../timing.h" -#include "mandelbrot_tasks_ispc.h" +#include "mandelbrot_ispc.h" using namespace ispc; +#include + + +double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} + extern void mandelbrot_serial(float x0, float y0, float x1, float y1, - int width, int height, int maxIterations, - int output[]); + int width, int height, int maxIterations, + int output[]); /* Write a PPM image file with the image of the Mandelbrot set */ static void writePPM(int *buf, int width, int height, const char *fn) { - FILE *fp = fopen(fn, "wb"); - fprintf(fp, "P6\n"); - fprintf(fp, "%d %d\n", width, height); - fprintf(fp, "255\n"); - for (int i = 0; i < width*height; ++i) { - // Map the iteration count to colors by just alternating between - // two greys. - char c = (buf[i] & 0x1) ? 240 : 20; - for (int j = 0; j < 3; ++j) - fputc(c, fp); - } - fclose(fp); - printf("Wrote image file %s\n", fn); + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); + printf("Wrote image file %s\n", fn); } static void usage() { - fprintf(stderr, "usage: mandelbrot [--scale=]\n"); - exit(1); + fprintf(stderr, "usage: mandelbrot [--scale=]\n"); + exit(1); } int main(int argc, char *argv[]) { - unsigned int width = 1536; - unsigned int height = 1024; - float x0 = -2; - float x1 = 1; - float y0 = -1; - float y1 = 1; + unsigned int width = 1536; + unsigned int height = 1024; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; - if (argc == 1) - ; - else if (argc == 2) { - if (strncmp(argv[1], "--scale=", 8) == 0) { - float scale = atof(argv[1] + 8); - if (scale == 0.f) - usage(); - width *= scale; - height *= scale; - // round up to multiples of 16 - width = (width + 0xf) & ~0xf; - height = (height + 0xf) & ~0xf; - } - else - usage(); - } - else + if (argc == 1) + ; + else if (argc == 2) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + if (scale == 0.f) usage(); - - int maxIterations = 512; - int *buf = new int[width*height]; - - // - // Compute the image using the ispc implementation; report the minimum - // time of three runs. - // - double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { - // Clear out the buffer - for (unsigned int i = 0; i < width * height; ++i) - buf[i] = 0; - reset_and_start_timer(); - mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); - double dt = get_elapsed_mcycles(); - minISPC = std::min(minISPC, dt); + width *= scale; + height *= scale; + // round up to multiples of 16 + width = (width + 0xf) & ~0xf; + height = (height + 0xf) & ~0xf; } + else + usage(); + } + else + usage(); - printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); - writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + int maxIterations = 512; + int *buf = new int[width*height]; + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + double t0 = rtc(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = rtc() - t0; //get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { - // Clear out the buffer - for (unsigned int i = 0; i < width * height; ++i) - buf[i] = 0; - reset_and_start_timer(); - mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); - double dt = get_elapsed_mcycles(); - minSerial = std::min(minSerial, dt); - } + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minSerial = 1e30; +#if 0 + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } - printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); - writePPM(buf, width, height, "mandelbrot-serial.ppm"); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); +#endif - printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); - return 0; + return 0; } diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp index 9cbb966a..ffad92d0 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp @@ -42,7 +42,7 @@ #include #include #include "../timing.h" -#include "mandelbrot_tasks3d_ispc.h" +#include "mandelbrot_ispc.h" using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, @@ -127,6 +127,7 @@ int main(int argc, char *argv[]) { // minimum time. // double minSerial = 1e30; +#if 0 for (int i = 0; i < 3; ++i) { // Clear out the buffer for (unsigned int i = 0; i < width * height; ++i) @@ -139,6 +140,7 @@ int main(int argc, char *argv[]) { printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); +#endif printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); diff --git a/examples_cuda/stencil/stencil.cpp b/examples_cuda/stencil/stencil.cpp index dec9aee2..2dd09535 100644 --- a/examples_cuda/stencil/stencil.cpp +++ b/examples_cuda/stencil/stencil.cpp @@ -45,107 +45,129 @@ #include "stencil_ispc.h" using namespace ispc; +#include + + +double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} + extern void loop_stencil_serial(int t0, int t1, int x0, int x1, - int y0, int y1, int z0, int z1, - int Nx, int Ny, int Nz, - const double coef[5], - const double vsq[], - double Aeven[], double Aodd[]); + int y0, int y1, int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[5], + const double vsq[], + double Aeven[], double Aodd[]); void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); - A[1][offset] = 0; - vsq[offset] = x*y*z / double(Nx * Ny * Nz); - } + int offset = 0; +#pragma omp parallel for collapse(2) private(offset) + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); + A[1][offset] = 0; + vsq[offset] = x*y*z / double(Nx * Ny * Nz); + } } int main() { - int Nx = 256, Ny = 256, Nz = 256; - int width = 4; - double *Aserial[2], *Aispc[2]; - Aserial[0] = new double [Nx * Ny * Nz]; - Aserial[1] = new double [Nx * Ny * Nz]; - Aispc[0] = new double [Nx * Ny * Nz]; - Aispc[1] = new double [Nx * Ny * Nz]; - double *vsq = new double [Nx * Ny * Nz]; + int Nx = 256, Ny = 256, Nz = 256; + int width = 4; + double *Aserial[2], *Aispc[2]; + Aserial[0] = new double [Nx * Ny * Nz]; + Aserial[1] = new double [Nx * Ny * Nz]; + Aispc[0] = new double [Nx * Ny * Nz]; + Aispc[1] = new double [Nx * Ny * Nz]; + double *vsq = new double [Nx * Ny * Nz]; - double coeff[4] = { 0.5, -.25, .125, -.0625 }; + double coeff[4] = { 0.5, -.25, .125, -.0625 }; - InitData(Nx, Ny, Nz, Aispc, vsq); +// InitData(Nx, Ny, Nz, Aispc, vsq); - // - // Compute the image using the ispc implementation on one core; report - // the minimum time of three runs. - // - double minTimeISPC = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = get_elapsed_mcycles(); - minTimeISPC = std::min(minTimeISPC, dt); - } + // + // Compute the image using the ispc implementation on one core; report + // the minimum time of three runs. + // + double minTimeISPC = 1e30; +#if 0 + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_mcycles(); + minTimeISPC = std::min(minTimeISPC, dt); + } - printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); + printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); +#endif - InitData(Nx, Ny, Nz, Aispc, vsq); + fprintf(stderr, " -- init -- \n"); + InitData(Nx, Ny, Nz, Aispc, vsq); + fprintf(stderr, " -- done init -- \n"); - // - // Compute the image using the ispc implementation with tasks; report - // the minimum time of three runs. - // - double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = get_elapsed_mcycles(); - minTimeISPCTasks = std::min(minTimeISPCTasks, dt); - } + // + // Compute the image using the ispc implementation with tasks; report + // the minimum time of three runs. + // + double minTimeISPCTasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + const double t0 = rtc(); + loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = rtc() - t0; //get_elapsed_mcycles(); + minTimeISPCTasks = std::min(minTimeISPCTasks, dt); + } - printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); + fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); - InitData(Nx, Ny, Nz, Aserial, vsq); - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aserial[0], Aserial[1]); - double dt = get_elapsed_mcycles(); - minTimeSerial = std::min(minTimeSerial, dt); - } + InitData(Nx, Ny, Nz, Aserial, vsq); - printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minTimeSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aserial[0], Aserial[1]); + double dt = get_elapsed_mcycles(); + minTimeSerial = std::min(minTimeSerial, dt); + } - printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", - minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); - // Check for agreement - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / - Aserial[1][offset]); - if (error > 1e-4) - printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", - x, y, z, Aispc[1][offset], Aserial[1][offset]); - } + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); - return 0; + // Check for agreement + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / + Aserial[1][offset]); + if (error > 1e-4) + printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", + x, y, z, Aispc[1][offset], Aserial[1][offset]); + } + + return 0; } diff --git a/examples_cuda/tasksys.cpp b/examples_cuda/tasksys.cpp index 6bc60129..55cbccd5 100644 --- a/examples_cuda/tasksys.cpp +++ b/examples_cuda/tasksys.cpp @@ -59,7 +59,9 @@ #define ISPC_USE_PTHREADS #define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define ISPC_USE_CILK +*/ #define ISPC_USE_OMP +/* #define ISPC_USE_TBB_TASK_GROUP #define ISPC_USE_TBB_PARALLEL_FOR @@ -943,7 +945,7 @@ InitTaskSystem() { inline void TaskGroup::Launch(int baseIndex, int count) { -#pragma omp parallel for +#pragma omp parallel for schedule(dynamic) for(int i = 0; i < count; i++) { TaskInfo *ti = GetTaskInfo(baseIndex + i); diff --git a/examples_cuda/volume_rendering/volume.cpp b/examples_cuda/volume_rendering/volume.cpp index 17f0fe7b..fac567f6 100644 --- a/examples_cuda/volume_rendering/volume.cpp +++ b/examples_cuda/volume_rendering/volume.cpp @@ -179,10 +179,10 @@ int main(int argc, char *argv[]) { double dt = get_elapsed_mcycles(); minISPC = std::min(minISPC, dt); } -#endif printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC); writePPM(image, width, height, "volume-ispc-1core.ppm"); +#endif // Clear out the buffer for (int i = 0; i < width * height; ++i) @@ -214,6 +214,7 @@ int main(int argc, char *argv[]) { // minimum time. // double minSerial = 1e30; +#if 0 for (int i = 0; i < 3; ++i) { reset_and_start_timer(); volume_serial(density, n, raster2camera, camera2world, @@ -224,6 +225,7 @@ int main(int argc, char *argv[]) { printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(image, width, height, "volume-serial.ppm"); +#endif printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minSerial/minISPC, minSerial / minISPCtasks);