AVX2

update bison
A an array of tests
2020-02-27 21:09:45 -08:00 · 2019-07-14 12:48:50 -07:00 · 2017-05-12 12:33:51 -04:00 · 2017-05-11 21:16:08 -04:00 · 2017-05-11 16:28:11 -04:00 · 2017-05-11 16:25:07 -04:00
14 changed files with 534 additions and 23 deletions
--- a/func.cpp
+++ b/func.cpp
@@ -635,7 +635,7 @@ Function::GenerateIR() {
 const bool
 Function::IsPolyFunction() const {
    for (size_t i = 0; i < args.size(); i++) {
-        if (args[i]->type->IsPolymorphicType()) {
+        if (args[i] && args[i]->type->IsPolymorphicType()) {
            return true;
        }
    }
--- a/module.cpp
+++ b/module.cpp
@@ -1061,8 +1061,6 @@ Module::AddFunctionDeclaration(const std::string &name,

                const Type *ret = eft->GetReturnType();
                if (Type::EqualForReplacement(ret, pt)) {
-                    printf("Replaced return type %s\n",
-                           ret->GetString().c_str());
                    ret = PolyType::ReplaceType(ret, *te);
                }

@@ -1998,11 +1996,13 @@ lPrintPolyFunctionWrappers(FILE *file, const std::vector<std::string> &funcs) {
        for (size_t j=0; j<poly.size(); j++) {
            const FunctionType *ftype = CastType<FunctionType>(poly[j]->type);
            Assert(ftype);
-            std::string decl = ftype->GetCDeclaration(funcs[i]);
-            fprintf(file, "    %s {\n", decl.c_str());
+            if (ftype->isExported || ftype->isExternC) {
+                std::string decl = ftype->GetCDeclaration(funcs[i]);
+                fprintf(file, "    %s {\n", decl.c_str());

-            std::string call = ftype->GetCCall(poly[j]->name);
-            fprintf(file, "        return %s;\n    }\n", call.c_str());
+                std::string call = ftype->GetCCall(poly[j]->name);
+                fprintf(file, "        return %s;\n    }\n", call.c_str());
+            }
        }
    }

--- a/parse.yy
+++ b/parse.yy
@@ -37,7 +37,7 @@
 /* one for 'if', one for 'cif' */
 %expect 2

-%error-verbose
+%define parse.error verbose

 %code requires {

--- a/tests_ispcpp/CycleTimer.h
+++ b/tests_ispcpp/CycleTimer.h
@@ -0,0 +1,177 @@
+#ifndef _SYRAH_CYCLE_TIMER_H_
+#define _SYRAH_CYCLE_TIMER_H_
+
+#if defined(__APPLE__)
+  #if defined(__x86_64__)
+    #include <sys/sysctl.h>
+  #else
+    #include <mach/mach.h>
+    #include <mach/mach_time.h>
+  #endif // __x86_64__ or not
+
+  #include <stdio.h>  // fprintf
+  #include <stdlib.h> // exit
+
+#elif _WIN32
+#  include <windows.h>
+#  include <time.h>
+#else
+#  include <stdio.h>
+#  include <stdlib.h>
+#  include <string.h>
+#  include <sys/time.h>
+#endif
+
+
+  // This uses the cycle counter of the processor.  Different
+  // processors in the system will have different values for this.  If
+  // your process moves across processors, then the delta time you
+  // measure will likely be incorrect.  This is mostly for fine
+  // grained measurements where the process is likely to be on the
+  // same processor.  For more global things you should use the
+  // Time interface.
+
+  // Also note that if your processors' speeds change (i.e. processor
+  // scaling) or if you are in a heterogeneous environment, you will
+  // likely get spurious results.
+  class CycleTimer {
+  public:
+    typedef unsigned long long SysClock;
+
+    //////////
+    // Return the current CPU time, in terms of clock ticks.
+    // Time zero is at some arbitrary point in the past.
+    static SysClock currentTicks() {
+#if defined(__APPLE__) && !defined(__x86_64__)
+      return mach_absolute_time();
+#elif defined(_WIN32)
+      LARGE_INTEGER qwTime;
+      QueryPerformanceCounter(&qwTime);
+      return qwTime.QuadPart;
+#elif defined(__x86_64__)
+      unsigned int a, d;
+      asm volatile("rdtsc" : "=a" (a), "=d" (d));
+      return static_cast<unsigned long long>(a) |
+        (static_cast<unsigned long long>(d) << 32);
+#elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
+      unsigned int val;
+      asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
+      return val;
+#else
+      timespec spec;
+      clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
+      return static_cast<SysClock>(spec.tv_sec) * 1000000000ULL + static_cast<SysClock>(spec.tv_nsec); // exact integer ns; a float cast would round tv_nsec
+#endif
+    }
+
+    //////////
+    // Return the current CPU time, in terms of seconds.
+    // This is slower than currentTicks().  Time zero is at
+    // some arbitrary point in the past.
+    static double currentSeconds() {
+      return currentTicks() * secondsPerTick();
+    }
+
+    //////////
+    // Return the conversion from seconds to ticks.
+    static double ticksPerSecond() {
+      return 1.0/secondsPerTick();
+    }
+
+    static const char* tickUnits() { // unit label reported for currentTicks() values
+#if defined(__APPLE__) && !defined(__x86_64__)
+      return "ns";
+#elif defined(_WIN32) || defined(__x86_64__) // same Windows macro as the other methods
+      return "cycles";
+#else
+      return "ns"; // clock_gettime
+#endif
+    }
+
+    //////////
+    // Return the conversion from ticks to seconds.
+    static double secondsPerTick() {
+      static bool initialized = false;
+      static double secondsPerTick_val;
+      if (initialized) return secondsPerTick_val; // computed once, then cached
+#if defined(__APPLE__)
+  #ifdef __x86_64__
+      int args[] = {CTL_HW, HW_CPU_FREQ};
+      unsigned int Hz;
+      size_t len = sizeof(Hz);
+      if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
+         fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
+         exit(-1);
+      }
+      secondsPerTick_val = 1.0 / (double) Hz;
+  #else
+      mach_timebase_info_data_t time_info;
+      mach_timebase_info(&time_info);
+
+      // numer/denom scales ticks to nanoseconds; 1e-9 converts those to seconds
+      secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
+        static_cast<double>(time_info.denom);
+  #endif // x86_64 or not
+#elif defined(_WIN32)
+      LARGE_INTEGER qwTicksPerSec;
+      QueryPerformanceFrequency(&qwTicksPerSec);
+      secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
+#else
+      FILE *fp = fopen("/proc/cpuinfo","r");
+      char input[1024];
+      if (!fp) {
+         fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
+         exit(-1);
+      }
+      // In case we don't find it, e.g. on the N900
+      secondsPerTick_val = 1e-9;
+      while (!feof(fp) && fgets(input, 1024, fp)) {
+        // NOTE(boulos): Because reading cpuinfo depends on dynamic
+        // frequency scaling it's better to read the @ sign first
+        float GHz, MHz;
+        if (strstr(input, "model name")) {
+          char* at_sign = strstr(input, "@");
+          if (at_sign) {
+            char* after_at = at_sign + 1;
+            char* GHz_str = strstr(after_at, "GHz");
+            char* MHz_str = strstr(after_at, "MHz");
+            if (GHz_str) {
+              *GHz_str = '\0';
+              if (1 == sscanf(after_at, "%f", &GHz)) {
+                //printf("GHz = %f\n", GHz);
+                secondsPerTick_val = 1e-9f / GHz;
+                break;
+              }
+            } else if (MHz_str) {
+              *MHz_str = '\0';
+              if (1 == sscanf(after_at, "%f", &MHz)) {
+                //printf("MHz = %f\n", MHz);
+                secondsPerTick_val = 1e-6f / MHz; // was "/ GHz", which is never set on this branch
+                break;
+              }
+            }
+          }
+        } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
+          //printf("MHz = %f\n", MHz);
+          secondsPerTick_val = 1e-6f / MHz;
+          break;
+        }
+      }
+      fclose(fp);
+#endif
+
+      initialized = true;
+      return secondsPerTick_val;
+    }
+
+    //////////
+    // Return the conversion from ticks to milliseconds.
+    static double msPerTick() {
+      return secondsPerTick() * 1000.0;
+    }
+
+  private:
+    CycleTimer(); // private constructor: the class is used only through its static members
+  };
+
+#endif // #ifndef _SYRAH_CYCLE_TIMER_H_
--- a/tests_ispcpp/Makefile
+++ b/tests_ispcpp/Makefile
@@ -1,12 +1,15 @@
 CXX=g++
-CXXFLAGS=-std=c++11 -O2
+CXXFLAGS=-std=c++11 -O3 -lm -lpthread

 ISPC=../ispc
-ISPCFLAGS=--target=sse4-x2 -O2 --arch=x86-64
+ISPCFLAGS=--target=avx2 -O3 --arch=x86-64

-%.out : %.cpp %.o
+%.out : %.cpp %.o tasksys.o
 	$(CXX) $(CXXFLAGS) -o $@ $^

+tasksys.o : ../examples/tasksys.cpp
+	$(CXX) $(CXXFLAGS) -c -o $@ $^
+
 $ : $.o

 %.o : %.ispc
--- a/tests_ispcpp/array.ispc
+++ b/tests_ispcpp/array.ispc
@@ -0,0 +1,13 @@
+export void array(uniform int N, uniform integer * uniform X) {
+    integer *A = new integer[N/2];  // scratch buffer of N/2 elements, released below
+
+    foreach (i = 0 ... N/2) {
+        A[i] = X[i] + X[N/2 + i];  // sum of element i and its counterpart N/2 positions later
+    }
+
+    foreach (i = 0 ... N) {
+        X[i] = A[i/2];  // NOTE(review): for odd N, i = N-1 indexes A[N/2], one past the allocation - presumably N is even; confirm
+    }
+
+    delete[] A;
+}
--- a/tests_ispcpp/error_5.ispc
+++ b/tests_ispcpp/error_5.ispc
@@ -0,0 +1,17 @@
+//@error
+
+floating$0 mult(floating$0 x, floating$1 y) {
+    return x * y;
+}
+
+export void saxpy(uniform int N,
+                  uniform floating$0 scale,
+                  uniform floating$1 X[],
+                  uniform floating$1 Y[],
+                  uniform floating$2 result[])
+{
+    foreach (i = 0 ... N) {
+        floating$ tmp = mult(scale, X[i]) + Y[i];
+        result[i] = tmp;
+    }
+}
--- a/tests_ispcpp/error_6.ispc
+++ b/tests_ispcpp/error_6.ispc
@@ -0,0 +1,14 @@
+number pow(number b, int a) {
+    number out = b;
+    for (int i = 1; i<a; i++) {
+        out *= b;
+    }
+
+    return out;
+}
+
+export void square(uniform int N, uniform number$-1 b[], uniform number$-1 out[]) {
+    foreach (i = 0 ... N) {
+        out[i] = pow(b[i], 2);
+    }
+}
--- a/tests_ispcpp/error_7.ispc
+++ b/tests_ispcpp/error_7.ispc
@@ -0,0 +1,13 @@
+floating foo(floating a, floating b) {
+    floating d = a / b;
+    if (d < 0.)
+        return 0.;
+
+    return d;
+}
+
+export void bar(uniform integer * uniform X, uniform int N) {
+    foreach (i = 0 ... N-1) {
+        X[i] = foo(X[i], X[i+1]);
+    }
+}
--- a/tests_ispcpp/function.ispc
+++ b/tests_ispcpp/function.ispc
@@ -1,15 +1,15 @@
 floating saxpy_helper(floating scale,
-                      floating<0> x,
-                      floating<0> y) {
+                      floating$3 x,
+                      floating$3 y) {
    return scale * x + y;
 }


 export void saxpy(uniform int N,
-                  uniform floating<0> scale,
-                  uniform floating<1> X[],
-                  uniform floating<1> Y[],
-                  uniform floating<2> result[])
+                  uniform floating$0 scale,
+                  uniform floating$1 X[],
+                  uniform floating$1 Y[],
+                  uniform floating$2 result[])
 {
    foreach (i = 0 ... N) {
        result[i] = saxpy_helper(scale, X[i], Y[i]);
--- a/tests_ispcpp/sqrt.cpp
+++ b/tests_ispcpp/sqrt.cpp
@@ -0,0 +1,212 @@
+#include <stdio.h>
+#include <algorithm>
+#include <pthread.h>
+#include <math.h>
+#include <cmath>
+
+#include "CycleTimer.h"
+#include "sqrt.h"
+
+using namespace ispc;
+
+void sqrtSerial(int N,
+                float initialGuess,
+                float values[],
+                float output[])
+{
+    // Writes an approximation of sqrt(values[i]) to output[i] for each i in [0, N).
+    static const float kThreshold = 0.00001f;
+
+    for (int i=0; i<N; i++) {
+
+        float x = values[i];
+        float guess = initialGuess;
+
+        float error = fabs(guess * guess * x - 1.f);
+
+        while (error > kThreshold) {  // loop until |guess^2 * x - 1| <= kThreshold
+            guess = (3.f * guess - x * guess * guess * guess) * 0.5f;  // Newton step for 1/sqrt(x)
+            error = fabs(guess * guess * x - 1.f);
+        }
+
+        output[i] = x * guess;  // x * (1/sqrt(x)) == sqrt(x)
+    }
+}
+
+void sqrtSerial(int N,
+                double initialGuess,
+                double values[],
+                double output[])
+{
+    // Double-precision overload: writes an approximation of sqrt(values[i]) to output[i] for each i in [0, N).
+    static const double kThreshold = 0.00001f;  // NOTE(review): float literal widened to double, so the value equals the float overload's threshold
+
+    for (int i=0; i<N; i++) {
+
+        double x = values[i];
+        double guess = initialGuess;
+
+        double error = std::abs(guess * guess * x - 1.);
+
+        while (error > kThreshold) {  // loop until |guess^2 * x - 1| <= kThreshold
+            guess = (3. * guess - x * guess * guess * guess) * 0.5;  // Newton step for 1/sqrt(x)
+            error = std::abs(guess * guess * x - 1.);
+        }
+
+        output[i] = x * guess;  // x * (1/sqrt(x)) == sqrt(x)
+    }
+}
+
+static void verifyResult(int N, float* result, float* gold) {  // compares result[0..N) with gold[0..N)
+    for (int i=0; i<N; i++) {
+        if (fabs(result[i] - gold[i]) > 1e-4) {  // absolute tolerance; a mismatch is only printed, nothing is returned
+            printf("Error: [%d] Got %f expected %f\n", i, result[i], gold[i]);
+        }
+    }
+}
+
+static void verifyResult(int N, double* result, double* gold) {  // double overload: compares result[0..N) with gold[0..N)
+    for (int i=0; i<N; i++) {
+        if (std::abs(result[i] - gold[i]) > 1e-4) {  // absolute tolerance; a mismatch is only printed, nothing is returned
+            printf("Error: [%d] Got %f expected %f\n", i, result[i], gold[i]);
+        }
+    }
+}
+
+int main() {
+    // Times the serial, ISPC and ISPC-task sqrt for float and double, checking each result against sqrt().
+    const unsigned int N = 20 * 1000 * 1000;
+    const float initialGuess = 1.0f;
+    const double dinitialGuess = 1.0;
+
+    float* values = new float[N];
+    float* output = new float[N];
+    float* gold = new float[N];
+
+    double* dvalues = new double[N];
+    double* doutput = new double[N];
+    double* dgold = new double[N];
+
+    for (unsigned int i=0; i<N; i++)
+    {
+        // random input values
+        values[i] = .001f + 2.998f * static_cast<float>(rand()) / RAND_MAX;
+        dvalues[i] = .001 + 2.998 * static_cast<double>(rand()) / RAND_MAX;
+        output[i] = 0.f;
+        doutput[i] = 0.;
+    }
+
+    // generate a gold version to check results
+    for (unsigned int i=0; i<N; i++) {
+        gold[i] = sqrt(values[i]);
+        dgold[i] = sqrt(dvalues[i]);
+    }
+
+    //
+    // Run the float serial implementation 5 times, reporting the
+    // minimum time.
+    //
+    double minSerial = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        double startTime = CycleTimer::currentSeconds();
+        sqrtSerial(N, initialGuess, values, output);
+        double endTime = CycleTimer::currentSeconds();
+        minSerial = std::min(minSerial, endTime - startTime);
+    }
+
+    printf("[sqrt float serial]:\t\t[%.3f] ms\n", minSerial * 1000);
+    verifyResult(N, output, gold);
+
+    double minDSerial = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        double startTime = CycleTimer::currentSeconds();
+        sqrtSerial(N, dinitialGuess, dvalues, doutput);
+        double endTime = CycleTimer::currentSeconds();
+        minDSerial = std::min(minDSerial, endTime - startTime);
+    }
+
+    printf("[sqrt double serial]:\t\t[%.3f] ms\n", minDSerial * 1000);
+    verifyResult(N, doutput, dgold);
+
+    // Clear out the buffer
+    for (unsigned int i = 0; i < N; ++i) {
+        output[i] = 0;
+        doutput[i] = 0;
+    }
+
+
+    //
+    // Run the float ispc implementation; report the minimum
+    // time of five runs.
+    //
+    double minISPC = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        double startTime = CycleTimer::currentSeconds();
+        ispc::sqrt_ispc(N, initialGuess, values, output);
+        double endTime = CycleTimer::currentSeconds();
+        minISPC = std::min(minISPC, endTime - startTime);
+    }
+
+    printf("[sqrt float ispc]:\t\t[%.3f] ms\n", minISPC * 1000);
+
+    verifyResult(N, output, gold);
+
+    double minDISPC = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        double startTime = CycleTimer::currentSeconds();
+        ispc::sqrt_ispc(N, dinitialGuess, dvalues, doutput);
+        double endTime = CycleTimer::currentSeconds();
+        minDISPC = std::min(minDISPC, endTime - startTime);
+    }
+
+    printf("[sqrt double ispc]:\t\t[%.3f] ms\n", minDISPC * 1000);
+
+    verifyResult(N, doutput, dgold);
+
+    // Clear out the buffer
+    for (unsigned int i = 0; i < N; ++i) {
+        output[i] = 0;
+        doutput[i] = 0;
+    }
+
+    //
+    // Tasking version of the ISPC code (minimum time of three runs)
+    //
+    double minTaskISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        double startTime = CycleTimer::currentSeconds();
+        ispc::sqrt_ispc_withtasks(N, initialGuess, values, output);
+        double endTime = CycleTimer::currentSeconds();
+        minTaskISPC = std::min(minTaskISPC, endTime - startTime);
+    }
+
+    printf("[sqrt float task ispc]:\t\t[%.3f] ms\n", minTaskISPC * 1000);
+
+    verifyResult(N, output, gold);
+
+    double minDTaskISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        double startTime = CycleTimer::currentSeconds();
+        ispc::sqrt_ispc_withtasks(N, dinitialGuess, dvalues, doutput);
+        double endTime = CycleTimer::currentSeconds();
+        minDTaskISPC = std::min(minDTaskISPC, endTime - startTime);
+    }
+
+    printf("[sqrt double task ispc]:\t[%.3f] ms\n", minDTaskISPC * 1000);
+
+    verifyResult(N, doutput, dgold);  // check the double buffers the run above wrote, not the float ones
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC float)\n", minSerial/minISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC double)\n", minDSerial/minDISPC);
+    printf("\t\t\t\t(%.2fx speedup from task ISPC float)\n", minSerial/minTaskISPC);
+    printf("\t\t\t\t(%.2fx speedup from task ISPC double)\n", minDSerial/minDTaskISPC);
+
+    delete[] values;
+    delete[] output;
+    delete[] gold;
+    delete[] dvalues;
+    delete[] doutput;
+    delete[] dgold;
+
+    return 0;
+}
--- a/tests_ispcpp/sqrt.ispc
+++ b/tests_ispcpp/sqrt.ispc
@@ -0,0 +1,62 @@
+
+static const float kThreshold = 0.00001f;
+
+export void sqrt_ispc(uniform int N,
+                      uniform floating initialGuess,
+                      uniform floating values[],
+                      uniform floating output[])
+{
+    foreach (i = 0 ... N) {
+        // Writes an approximation of sqrt(values[i]) to output[i], starting from initialGuess.
+        floating x = values[i];
+        floating guess = initialGuess;
+
+        floating pred = abs(guess * guess * x - 1.f);
+
+        while (pred > kThreshold) {  // loop until |guess^2 * x - 1| <= kThreshold
+            guess = (3.f * guess - x * guess * guess * guess) * 0.5f;  // Newton step for 1/sqrt(x)
+            pred = abs(guess * guess * x - 1.f);
+        }
+
+        output[i] = x * guess;  // x * (1/sqrt(x)) == sqrt(x)
+
+    }
+}
+
+task void sqrt_ispc_task(uniform int N,
+                         uniform int span,
+                         uniform floating initialGuess,
+                         uniform floating values[],
+                         uniform floating output[])
+{
+    // Each task handles the index range [taskIndex * span, min(N, taskIndex * span + span)).
+    uniform int indexStart = taskIndex * span;
+    uniform int indexEnd = min(N, indexStart + span);  // clamp so the last task stops at N
+
+    foreach (i = indexStart ... indexEnd) {
+
+        floating x = values[i];
+        floating guess = initialGuess;
+
+        floating pred = abs(guess * guess * x - 1.f);
+
+        while (pred > kThreshold) {  // loop until |guess^2 * x - 1| <= kThreshold
+            guess = (3.f * guess - x * guess * guess * guess) * 0.5f;  // Newton step for 1/sqrt(x)
+            pred = abs(guess * guess * x - 1.f);
+        }
+
+        output[i] = x * guess;  // x * (1/sqrt(x)) == sqrt(x)
+
+    }
+}
+
+export void sqrt_ispc_withtasks(uniform int N,
+                                uniform floating initialGuess,
+                                uniform floating values[],
+                                uniform floating output[])
+{
+    // Split [0, N) into at most 64 tasks; both divisions round up so span is never 0 and the tail is covered.
+    uniform int span = max(1, (N + 63) / 64);
+
+    launch[(N + span - 1) / span] sqrt_ispc_task(N, span, initialGuess, values, output);
+}
--- a/tests_ispcpp/varying.cpp
+++ b/tests_ispcpp/varying.cpp
@@ -6,7 +6,7 @@
 int main() {
    float A[256];
    double B[256];
-    double outA[256];
+    float outA[256];
    double outB[256];


@@ -15,7 +15,7 @@ int main() {
        B[i] = 1. / (i+1);
    }

-    ispc::square(256, (float*)&A, (double*)&outA);
+    ispc::square(256, (float*)&A, (float*)&outA);

    ispc::square(256, (double*)&B, (double*)&outB);

--- a/tests_ispcpp/varying.ispc
+++ b/tests_ispcpp/varying.ispc
@@ -1,5 +1,5 @@
-floating foo(const uniform int a, floating b) {
-    floating out = b;
+number pow(number b, int a) {
+    number out = b;
    for (int i = 1; i<a; i++) {
        out *= b;
    }
@@ -7,8 +7,8 @@ floating foo(const uniform int a, floating b) {
    return out;
 }

-export void square(uniform int N, uniform floating b[], uniform double out[]) {
+export void square(uniform int N, uniform number b[], uniform number out[]) {
    foreach (i = 0 ... N) {
-        out[i] = foo(2, b[i]);
+        out[i] = pow(b[i], 2);
    }
 }
Author	SHA1	Message	Date
Aaron Gutierrez	1633d50b34	AVX2	2020-02-27 21:09:45 -08:00
Aaron Gutierrez	b8453b4a3a	update bison	2019-07-14 12:48:50 -07:00
Aaron Gutierrez	1eb64a13e1	A an array of tests	2017-05-12 12:33:51 -04:00
Aaron Gutierrez	2921430e45	Cleaning up tests and printing for demo	2017-05-11 21:16:08 -04:00
Aaron Gutierrez	34d26554bf	use correct abs function for doubles	2017-05-11 16:28:11 -04:00
Aaron Gutierrez	5c0911c2a8	add missing dependency for timing test	2017-05-11 16:25:07 -04:00
Aaron Gutierrez	f513e085ea	Add sqrt tests from assignment 1	2017-05-11 16:22:18 -04:00
Aaron Gutierrez	a47cab4dfa	Replicates all needed state between expanded functions commit `5e6f06cf59` Author: Aaron Gutierrez <gutierrez.aaron.m@gmail.com> Date: Thu May 11 15:42:11 2017 -0400 Fixed issue with aliasing local variables ISPC++ now produces valid code, or an appropriate error message, for all of my test cases. commit `bfe723e1b7` Author: Aaron Gutierrez <gutierrez.aaron.m@gmail.com> Date: Thu May 11 03:09:38 2017 -0400 Actually copy the AST. Type replacement works except for function parameters. commit `f65b3e6300` Author: Aaron Gutierrez <gutierrez.aaron.m@gmail.com> Date: Thu May 11 01:19:50 2017 -0400 [WIP] Remove cases for ForeachStmt and SymbolExpr commit `2e28640860` Merge: `6a91c5d` `d020107` Author: Aaron Gutierrez <gutierrez.aaron.m@gmail.com> Date: Wed May 10 23:13:40 2017 -0400 Merge branch 'master' into copy_ast commit `6a91c5d5ac` Author: Aaron Gutierrez <gutierrez.aaron.m@gmail.com> Date: Wed May 10 11:11:39 2017 -0400 Attempt to replicate AST when expanding polytypes	2017-05-11 15:43:29 -04:00