From 0ac4f7b6207136eefe1db53a9ef215c04de731c5 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Wed, 3 Aug 2011 12:07:30 -0700
Subject: [PATCH] Add various prefetch functions to the standard library.

---
 Makefile      |  5 +++--
 builtins.cpp  | 47 +++++++++++++++++++++++++++++++++-----------
 builtins.m4   | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++
 docs/ispc.txt | 34 ++++++++++++++++++++++++++++++++
 stdlib.ispc   | 37 +++++++++++++++++++++++++++++++++++
 type.cpp      |  7 ++++---
 6 files changed, 168 insertions(+), 16 deletions(-)
diff --git a/Makefile b/Makefile
index 46e60dad..0966c8c2 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,8 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
 
 LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
 LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
 
 BUILD_DATE=$(shell date +%Y%m%d)
 BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
@@ -105,7 +106,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 
 objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll
 	@echo Creating C++ source from builtin definitions file $<
-	@m4 builtins.m4 $< | ./bitcode2cpp.py $< > $@
+	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
 
 objs/builtins-%.o: objs/builtins-%.cpp
 	@echo Compiling $<
diff --git a/builtins.cpp b/builtins.cpp
index 2a02ceab..5b4f4d2e 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -170,6 +170,27 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
 }
 
 
+/** Adds to symbolTable an ispc function symbol named name that is bound to
+    func; each of the ftype->getNumParams() parameters gets a NULL default. */
+static void
+lCreateSymbol(const std::string &name, const Type *returnType,
+              const std::vector<const Type *> &argTypes,
+              const llvm::FunctionType *ftype, llvm::Function *func,
+              SymbolTable *symbolTable) {
+    SourcePos stdlibPos;
+    stdlibPos.name = "__stdlib";
+
+    // One NULL default-argument entry for every parameter of the LLVM function.
+    std::vector<ConstExpr *> noDefaults(ftype->getNumParams(), NULL);
+    FunctionType *signature = new FunctionType(returnType, argTypes, stdlibPos);
+    signature->SetArgumentDefaults(noDefaults);
+
+    Symbol *symbol = new Symbol(name, stdlibPos, signature);
+    symbol->function = func;
+    symbolTable->AddFunction(symbol);
+}
+
+
 /** Given an LLVM function declaration, synthesize the equivalent ispc
     symbol for the function (if possible).  Returns true on success, false
     on failure.
@@ -221,7 +242,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
 
         // Iterate over the arguments and try to find their equivalent ispc
         // types.  Track if any of the arguments has an integer type.
-        bool anyIntArgs = false;
+        bool anyIntArgs = false, anyReferenceArgs = false;
         std::vector<const Type *> argTypes;
         for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
             const llvm::Type *llvmArgType = ftype->getParamType(j);
@@ -230,22 +251,26 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
                 return false;
             anyIntArgs |= 
                 (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
+            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
             argTypes.push_back(type);
         }
 
         // Always create the symbol the first time through, in particular
         // so that we get symbols for things with no integer types!
-        if (i == 0 || anyIntArgs == true) {
-            FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
-            // set NULL default arguments
-            std::vector<ConstExpr *> defaults;
-            for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
-                defaults.push_back(NULL);
-            funcType->SetArgumentDefaults(defaults);
+        if (i == 0 || anyIntArgs == true)
+            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
 
-            Symbol *sym = new Symbol(name, noPos, funcType);
-            sym->function = func;
-            symbolTable->AddFunction(sym);
+        // If there are any reference types, also make a variant of the
+        // symbol that has all of them as const references.  This obviously
+        // doesn't make sense for many builtins, but we'll give the stdlib
+        // the option to call one if it needs one.
+        if (anyReferenceArgs == true) {
+            for (unsigned int j = 0; j < argTypes.size(); ++j)
+                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
+                    argTypes[j] = argTypes[j]->GetAsConstType();
+            // Register it once, after every reference has been made const.
+            lCreateSymbol(name + "_refsconst", returnType, argTypes,
+                          ftype, func, symbolTable);
         }
     }
 
diff --git a/builtins.m4 b/builtins.m4
index 661d9ba7..47158292 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -557,6 +557,41 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $
 }
 ')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetch definitions
+
+; prefetch has a new parameter in LLVM 3.0, to distinguish between instruction
+; and data caches--the declaration is now:
+; declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality,
+;                             i32 %cachetype)  (cachetype 1 == data cache)
+; however, the older three-parameter version below seems to still work...
+
+declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)
+
+; Helper that emits a single read prefetch function.  Its four arguments
+; are, in order: the cache name used in the symbol, the type name used in
+; the symbol, the LLVM type being pointed to, and the locality value that
+; is handed to llvm.prefetch as its last operand.  The pointer is cast to
+; i8 * before the call.
+define(`prefetch_read_one', `
+define internal void @__prefetch_read_$1_$2($3 *) alwaysinline {
+  %ptr8 = bitcast $3 * %0 to i8 *
+  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 $4)
+  ret void
+}
+')
+
+; Emits the four read prefetch functions for type name $1 and LLVM type $2.
+; The 1, 2, 3 and nt variants use locality values 3, 2, 1 and 0; the second
+; operand of every call is 0, which requests a prefetch for reading.
+define(`prefetch_read', `
+prefetch_read_one(1, $1, $2, 3)
+prefetch_read_one(2, $1, $2, 2)
+prefetch_read_one(3, $1, $2, 1)
+prefetch_read_one(nt, $1, $2, 0)
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 define(`stdlib_core', `
 
@@ -779,6 +814,25 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al
   ret <$1 x i32> %0
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetching
+
+prefetch_read(uniform_bool, i1)
+prefetch_read(uniform_int8, i8)
+prefetch_read(uniform_int16, i16)
+prefetch_read(uniform_int32, i32)
+prefetch_read(uniform_int64, i64)
+prefetch_read(uniform_float, float)
+prefetch_read(uniform_double, double)
+
+prefetch_read(varying_bool, <$1 x i32>)
+prefetch_read(varying_int8, <$1 x i8>)
+prefetch_read(varying_int16, <$1 x i16>)
+prefetch_read(varying_int32, <$1 x i32>)
+prefetch_read(varying_int64, <$1 x i64>)
+prefetch_read(varying_float, <$1 x float>)
+prefetch_read(varying_double, <$1 x double>)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib transcendentals
 ;;
diff --git a/docs/ispc.txt b/docs/ispc.txt
index b72a4849..3ba925dd 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -79,6 +79,7 @@ Contents:
   + `Packed Load and Store Operations`_
   + `Conversions To and From Half-Precision Floats`_
   + `Atomic Operations and Memory Fences`_
+  + `Prefetches`_
   + `Low-Level Bits`_
 
 * `Interoperability with the Application`_
@@ -1990,6 +1991,39 @@ code.
     void memory_barrier();
 
 
+Prefetches
+----------
+
+The standard library has a variety of functions to prefetch data into the
+processor's cache.  While modern CPUs have automatic prefetchers that do a
+reasonable job of bringing data into the cache before it's needed, high
+performance applications may find it helpful to prefetch data explicitly
+as well.
+
+For example, this code shows how to prefetch data to the processor's L1
+cache while iterating over the items in an array.  
+
+::
+
+   uniform int32 array[...];
+   for (uniform int i = 0; i < count; ++i) {
+       // do computation with array[i]
+       prefetch_l1(array[i+32]);
+   }
+
+The standard library has routines to prefetch to the L1, L2, and L3
+caches.  It also has a variant, ``prefetch_nt()``, that indicates that the
+value being prefetched isn't expected to be used more than once (so should
+be high priority to be evicted from the cache).
+
+::
+
+    void prefetch_{l1,l2,l3,nt}(const reference TYPE)
+
+These functions are available for both uniform and varying values of the
+``int8``, ``int16``, ``int32``, ``int64``, ``float``, and ``double`` types.
+
+
 Low-Level Bits
 --------------
 
diff --git a/stdlib.ispc b/stdlib.ispc
index bbd9515b..9907904f 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -315,6 +315,39 @@ static inline uniform int lanemask() {
     return __movmsk(__mask);
 }
 
+///////////////////////////////////////////////////////////////////////////
+// Prefetching
+
+// PREFETCH_FN defines prefetch_<CACHE> for TYPE as a call to the const
+// reference variant of the __prefetch_read_<LEVEL>_<NAME> builtin;
+// PREFETCHES defines the l1, l2, l3 and nt versions for one type.
+#define PREFETCH_FN(CACHE, LEVEL, NAME, TYPE)                   \
+static inline void prefetch_##CACHE(const reference TYPE ptr) { \
+    __prefetch_read_##LEVEL##_##NAME##_refsconst(ptr);          \
+}
+#define PREFETCHES(NAME, TYPE)                                  \
+    PREFETCH_FN(l1, 1, NAME, TYPE)                              \
+    PREFETCH_FN(l2, 2, NAME, TYPE)                              \
+    PREFETCH_FN(l3, 3, NAME, TYPE)                              \
+    PREFETCH_FN(nt, nt, NAME, TYPE)
+
+PREFETCHES(uniform_int8, uniform int8)
+PREFETCHES(uniform_int16, uniform int16)
+PREFETCHES(uniform_int32, uniform int32)
+PREFETCHES(uniform_int64, uniform int64)
+PREFETCHES(uniform_float, uniform float)
+PREFETCHES(uniform_double, uniform double)
+
+PREFETCHES(varying_int8, int8)
+PREFETCHES(varying_int16, int16)
+PREFETCHES(varying_int32, int32)
+PREFETCHES(varying_int64, int64)
+PREFETCHES(varying_float, float)
+PREFETCHES(varying_double, double)
+
+#undef PREFETCHES
+#undef PREFETCH_FN
+
 ///////////////////////////////////////////////////////////////////////////
 // Horizontal ops / reductions
 
@@ -522,6 +555,8 @@ DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
 
 DEFINE_ATOMIC_OP(double,double,swap,swap)
 
+#undef DEFINE_ATOMIC_OP
+
 #define ATOMIC_DECL_CMPXCHG(TA, TB)                                        \
 static inline TA atomic_compare_exchange_global(                           \
          uniform reference TA ref, TA oldval, TA newval) {                 \
@@ -538,6 +573,8 @@ ATOMIC_DECL_CMPXCHG(int64, int64)
 ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
 ATOMIC_DECL_CMPXCHG(double, double)
 
+#undef ATOMIC_DECL_CMPXCHG
+
 ///////////////////////////////////////////////////////////////////////////
 // Floating-Point Math
 
diff --git a/type.cpp b/type.cpp
index 5b419c88..053a37d3 100644
--- a/type.cpp
+++ b/type.cpp
@@ -1541,7 +1541,7 @@ StructType::GetElementNumber(const std::string &n) const {
 // ReferenceType
 
 ReferenceType::ReferenceType(const Type *t, bool ic) 
-    : isConst(ic), targetType(t) {
+    : isConst(ic), targetType(t->GetAsNonConstType()) {
 }
 
 
@@ -2136,8 +2136,9 @@ Type::Equal(const Type *a, const Type *b) {
     const ReferenceType *rta = dynamic_cast<const ReferenceType *>(a);
     const ReferenceType *rtb = dynamic_cast<const ReferenceType *>(b);
     if (rta != NULL && rtb != NULL)
-        return Type::Equal(rta->GetReferenceTarget(),
-                           rtb->GetReferenceTarget());
+        return ((rta->IsConstType() == rtb->IsConstType()) &&
+                Type::Equal(rta->GetReferenceTarget(),
+                            rtb->GetReferenceTarget()));
 
     const FunctionType *fta = dynamic_cast<const FunctionType *>(a);
     const FunctionType *ftb = dynamic_cast<const FunctionType *>(b);