diff --git a/Makefile b/Makefile
index 6be3182f..3e54d55f 100644
--- a/Makefile
+++ b/Makefile
@@ -94,9 +94,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
 
-$(STDLIB_SRC): stdlib.m4
-
-objs/stdlib-%.cpp: stdlib-%.ll
+objs/stdlib-%.cpp: stdlib-%.ll stdlib.m4  # stdlib.m4 is read by the m4 recipe below, so editing it must regenerate the output
 	@echo Creating C++ source from stdlib file $<
 	@m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@
 
diff --git a/docs/ispc.txt b/docs/ispc.txt
index 993d2bb7..74dff7c1 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -74,7 +74,8 @@ Contents:
 
   + `Math Functions`_
   + `Output Functions`_
-  + `Cross-Lane Operations`_
+  + `Cross-Program Instance Operations`_
+  + `Packed Load and Store Operations`_
   + `Low-Level Bits`_
 
 * `Interoperability with the Application`_
@@ -1659,14 +1660,14 @@ values for the inactive program instances aren't printed.  (In other cases,
 they may have garbage values or be otherwise undefined.)
 
 
-Cross-Lane Operations
----------------------
+Cross-Program Instance Operations
+---------------------------------
 
-Usually, ``ispc`` code expresses independent computation on separate data
-elements.  There are, however, a number of cases where it's useful for the
-program instances to be able to cooperate in computing results.  The
-cross-lane operations described in this section provide primitives for
-communication between the running program instances.
+Usually, ``ispc`` code expresses independent programs performing
+computation on separate data elements.  There are, however, a number of
+cases where it's useful for the program instances to be able to cooperate
+in computing results.  The cross-lane operations described in this section
+provide primitives for communication between the running program instances.
  
 A few routines that evaluate conditions across the running program
 instances.  For example, ``any()`` returns ``true`` if the given value
@@ -1678,6 +1679,47 @@ and ``all()`` returns ``true`` if it true for all of them.
     uniform bool any(bool v)
     uniform bool all(bool v)
 
+To broadcast a value from one program instance to all of the others, a
+``broadcast()`` function is available.  It broadcasts the value of the
+``value`` parameter for the program instance given by ``index`` to all of
+the running program instances.
+
+::
+
+    float broadcast(float value, uniform int index)
+    int32 broadcast(int32 value, uniform int index)
+    double broadcast(double value, uniform int index)
+    int64 broadcast(int64 value, uniform int index)
+
+The ``rotate()`` function allows each program instance to find the value of
+the given value that their neighbor ``offset`` steps away has.  For
+example, on an 8-wide target, if ``value`` has the value (1, 2, 3, 4, 5,
+6, 7, 8) in each of the running program instances, then ``rotate(value,
+-1)`` causes the first program instance to get the value 8, the second
+program instance to get the value 1, the third 2, and so forth.  The
+provided offset value can be positive or negative, and may be greater than
+``programCount`` (it is masked to ensure valid offsets).
+
+::
+
+    float rotate(float value, uniform int offset)
+    int32 rotate(int32 value, uniform int offset)
+    double rotate(double value, uniform int offset)
+    int64 rotate(int64 value, uniform int offset)
+
+
+Finally, ``shuffle()`` allows fully general shuffling of values among the
+program instances.  Each program instance's value of ``permutation`` gives the
+program instance from which to get the value of ``value``.  The provided
+values for ``permutation`` must all be between 0 and ``programCount-1``.
+
+::
+
+    float shuffle(float value, int permutation)
+    int32 shuffle(int32 value, int permutation)
+    double shuffle(double value, int permutation)
+    int64 shuffle(int64 value, int permutation)
+
 The various variants of ``popcnt()`` return the population count--the
 number of bits set in the given value.
 
@@ -1719,8 +1761,12 @@ given value across all of the currently-executing vector lanes.
     uniform unsigned int reduce_max(unsigned int a, unsigned int b)
 
 
-Finally, there are routines for writing out and reading in values from
-linear memory locations for the active program instances.
+
+Packed Load and Store Operations
+--------------------------------
+
+The standard library also offers routines for writing out and reading in
+values from linear memory locations for the active program instances.
 ``packed_load_active()`` loads consecutive values from the given array,
 starting at ``a[offset]``, loading one value for each currently-executing
 program instance and storing it into that program instance's ``val``
@@ -2280,21 +2326,11 @@ elements to work with and then proceeds with the computation.
 Communicating Between SPMD Program Instances
 --------------------------------------------
 
-The ``programIndex`` built-in variable (see `Mapping Data To Program
-Instances`_) can be used to communicate between the set of executing
-program instances.  Consider the following code, which shows all of the
-program instances writing into unique locations in an array.
-
-::
-
-    float x = ...;
-    uniform float allX[programCount];
-    allX[programIndex] = x;
-
-In this code, a program instance that reads ``allX[0]`` finds the value of
-``x`` that was computed by the first of the running program instances, and
-so forth.  Program instances can communicate with their neighbor instances
-with indexing like ``allX[(programIndex+1)%programCount]``.
+The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
+routines provide a variety of mechanisms for the running program instances
+to communicate values to each other during execution.  See the section
+`Cross-Program Instance Operations`_ for more information about their
+operation.
 
 
 Gather and Scatter
diff --git a/opt.cpp b/opt.cpp
index 69e75247..efda1d2a 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2116,11 +2116,12 @@ CreateLowerGatherScatterPass() {
 // IsCompileTimeConstantPass
 
 /** LLVM IR implementations of target-specific functions may include calls
-    to a function "bool __is_compile_time_constant_mask(mask type)"; this
-    allows them to have specialied code paths for where the mask is known
-    at compile time but not incurring the cost of a MOVMSK call at runtime
-    to compute its value in cases where the mask value isn't known until
-    runtime.
+    to the functions "bool __is_compile_time_constant_mask(mask type)" and
+    "bool __is_compile_time_constant_int32(i32)"; these allow them to have
+    specialized code paths for where the corresponding value is known at
+    compile time.  For masks, for example, this allows them to not incur
+    the cost of a MOVMSK call at runtime to compute its value in cases
+    where the mask value isn't known until runtime.
 
     This pass resolves these calls into either 'true' or 'false' values so
     that later optimization passes can operate with these as constants.
@@ -2148,17 +2149,17 @@ llvm::RegisterPass<IsCompileTimeConstantPass>
 
 bool
 IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *func = m->module->getFunction("__is_compile_time_constant_mask");
-    if (!func)
-        return false;
+    llvm::Function *maskFunc = m->module->getFunction("__is_compile_time_constant_mask");
+    llvm::Function *int32Func = m->module->getFunction("__is_compile_time_constant_int32");
 
     bool modifiedAny = false;
  restart:
     for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
-        // Iterate through the instructions looking for calls to
-        // __is_compile_time_constant_mask().
+        // Find direct calls to the __is_compile_time_constant_*() functions.
+        // Either lookup may be NULL, so a NULL callee (indirect call) is rejected first.
         llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
-        if (!callInst || callInst->getCalledFunction() != func)
+        llvm::Function *callee = callInst ? callInst->getCalledFunction() : NULL;
+        if (callee == NULL || (callee != maskFunc && callee != int32Func))
             continue;
 
         // This optimization pass can be disabled with the (poorly named)
@@ -2171,8 +2172,8 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
 
         // Is it a constant?  Bingo, turn the call's value into a constant
         // true value.
-        llvm::Value *mask = callInst->getArgOperand(0);
-        if (llvm::isa<llvm::Constant>(mask)) {
+        llvm::Value *operand = callInst->getArgOperand(0);
+        if (llvm::isa<llvm::Constant>(operand)) {
             llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
             modifiedAny = true;
             goto restart;
diff --git a/stdlib.ispc b/stdlib.ispc
index 05abfd1a..b9770da4 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -81,6 +81,54 @@ static inline uniform unsigned int64 intbits(uniform double d) {
     return __intbits_uniform_double(d);
 }
 
+static inline float broadcast(float v, uniform int i) {
+    return __broadcast_float(v, i);
+}
+// broadcast(): every overload forwards (v, i) to the __broadcast_<type>() builtin matching the value type.
+static inline int32 broadcast(int32 v, uniform int i) {
+    return __broadcast_int32(v, i);
+}
+
+static inline double broadcast(double v, uniform int i) {
+    return __broadcast_double(v, i);
+}
+
+static inline int64 broadcast(int64 v, uniform int i) {
+    return __broadcast_int64(v, i);
+}
+// rotate(): thin wrappers over the __rotate_<type>() builtins; the offset i is uniform.
+static inline float rotate(float v, uniform int i) {
+    return __rotate_float(v, i);
+}
+
+static inline int32 rotate(int32 v, uniform int i) {
+    return __rotate_int32(v, i);
+}
+
+static inline double rotate(double v, uniform int i) {
+    return __rotate_double(v, i);
+}
+
+static inline int64 rotate(int64 v, uniform int i) {
+    return __rotate_int64(v, i);
+}
+// shuffle(): thin wrappers over the __shuffle_<type>() builtins; the index i is not uniform (one per program instance).
+static inline float shuffle(float v, int i) {
+    return __shuffle_float(v, i);
+}
+
+static inline int32 shuffle(int32 v, int i) {
+    return __shuffle_int32(v, i);
+}
+
+static inline double shuffle(double v, int i) {
+    return __shuffle_double(v, i);
+}
+
+static inline int64 shuffle(int64 v, int i) {
+    return __shuffle_int64(v, i);
+}
+
 // x[i]
 static inline uniform float extract(float x, uniform int i) {
     return __extract(x, i);
diff --git a/stdlib.m4 b/stdlib.m4
index 500d183c..bc7cfc19 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -34,6 +34,8 @@
 ;; builtins for various targets can use macros from this file to simplify
 ;; generating code for their implementations of those builtins.
 
+declare i1 @__is_compile_time_constant_int32(i32)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 
@@ -284,6 +286,22 @@ ret <8 x float> %ret
 '
 )
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; forloop macro
+
+divert(`-1')
+# forloop(var, from, to, stmt) - improved version:
+#   works even if VAR is not a strict macro name
+#   performs sanity check that TO is not smaller than FROM
+#   allows complex numerical expressions in TO and FROM
+define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1',
+  `pushdef(`$1', eval(`$2'))_$0(`$1',
+    eval(`$3'), `$4')popdef(`$1')')')
+define(`_forloop',
+  `$3`'ifelse(indir(`$1'), `$2', `',
+    `define(`$1', incr(indir(`$1')))$0($@)')')
+divert`'dnl
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib_core
 ;;
@@ -291,8 +309,67 @@ ret <8 x float> %ret
 ;; target's vector width, which it takes as its first parameter.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+define(`shuffles', `
+define internal <$1 x $2> @__broadcast_$3(<$1 x $2>, i32) nounwind readnone alwaysinline {
+  %v = extractelement <$1 x $2> %0, i32 %1
+  %r_0 = insertelement <$1 x $2> undef, $2 %v, i32 0
+forloop(i, 1, eval($1-1), `  %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2 %v, i32 i
+')
+  ret <$1 x $2> %r_`'eval($1-1)
+}
+;; rotate: result element k is input element ((k + offset) AND (width - 1)); assumes width is a power of two - TODO confirm
+define internal <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline {
+  %isc = call i1 @__is_compile_time_constant_int32(i32 %1)
+  br i1 %isc, label %is_const, label %not_const
+
+is_const:
+  ; though verbose, this turns into tight code if %1 is a constant
+forloop(i, 0, eval($1-1), `  
+  %delta_`'i = add i32 %1, i
+  %delta_clamped_`'i = and i32 %delta_`'i, eval($1-1)
+  %v_`'i = extractelement <$1 x $2> %0, i32 %delta_clamped_`'i')
+
+  %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0
+forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i
+')
+  ret <$1 x $2> %ret_`'eval($1-1)
+
+not_const:
+  ; store two instances of the vector into memory
+  %ptr = alloca <$1 x $2>, i32 2
+  %ptr0 = getelementptr <$1 x $2> * %ptr, i32 0
+  store <$1 x $2> %0, <$1 x $2> * %ptr0
+  %ptr1 = getelementptr <$1 x $2> * %ptr, i32 1
+  store <$1 x $2> %0, <$1 x $2> * %ptr1
+
+  ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector
+  %offset = and i32 %1, eval($1-1)
+  %ptr_as_elt_array = bitcast <$1 x $2> * %ptr to [eval(2*$1) x $2] *
+  %load_ptr = getelementptr [eval(2*$1) x $2] * %ptr_as_elt_array, i32 0, i32 %offset
+  %load_ptr_vec = bitcast $2 * %load_ptr to <$1 x $2> *
+  %result = load <$1 x $2> * %load_ptr_vec, align $4
+  ret <$1 x $2> %result
+}
+;; shuffle: result element k is the input element selected by entry k of the second operand; entries are used unmasked
+define internal <$1 x $2> @__shuffle_$3(<$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline {
+forloop(i, 0, eval($1-1), `  
+  %index_`'i = extractelement <$1 x i32> %1, i32 i')
+forloop(i, 0, eval($1-1), `  
+  %v_`'i = extractelement <$1 x $2> %0, i32 %index_`'i')
+
+  %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0
+forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i
+')
+  ret <$1 x $2> %ret_`'eval($1-1)
+}
+;; arguments of this macro: vector width, LLVM element type, suffix for the function names, alignment of the rotate fallback load
+')
+
+
 define(`stdlib_core', `
 
+declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
 
@@ -307,6 +384,10 @@ define internal <$1 x float> @__insert(<$1 x float>, i32,
   ret <$1 x float> %insert
 }
 
+shuffles($1, float, float, 4)
+shuffles($1, i32, int32, 4)
+shuffles($1, double, double, 8)
+shuffles($1, i64, int64, 8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; various bitcasts from one type to another
@@ -524,7 +605,6 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
 ;; FIXME: use the per_lane macro, defined below, to implement these!
 
 define(`packed_load_and_store', `
-declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
 
 define i32 @__packed_load_active([0 x i32] *, i32 %start_offset, <$1 x i32> * %val_ptr,
                                  <$1 x i32> %full_mask) nounwind alwaysinline {
@@ -661,19 +741,6 @@ done:
 ;;       Inside this code, any instances of the text "LANE" are replaced
 ;;       with an i32 value that represents the current lane number
 
-divert(`-1')
-# forloop(var, from, to, stmt) - improved version:
-#   works even if VAR is not a strict macro name
-#   performs sanity check that FROM is larger than TO
-#   allows complex numerical expressions in TO and FROM
-define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1',
-  `pushdef(`$1', eval(`$2'))_$0(`$1',
-    eval(`$3'), `$4')popdef(`$1')')')
-define(`_forloop',
-  `$3`'ifelse(indir(`$1'), `$2', `',
-    `define(`$1', incr(indir(`$1')))$0($@)')')
-divert`'dnl
-
 ; num lanes, mask, code block to do per lane
 define(`per_lane', `
   br label %pl_entry
diff --git a/tests/broadcast-1.ispc b/tests/broadcast-1.ispc
new file mode 100644
index 00000000..7cefce7a
--- /dev/null
+++ b/tests/broadcast-1.ispc
@@ -0,0 +1,12 @@
+// Test: broadcast() of an int with an index computed at run time from b ((uniform int)b - 2); result() expects 4 in every lane.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int br = broadcast(a, (uniform int)b-2);
+    RET[programIndex] = br;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 4;
+}
diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc
new file mode 100644
index 00000000..e45bbf90
--- /dev/null
+++ b/tests/broadcast.ispc
@@ -0,0 +1,12 @@
+// Test: broadcast() of a float from program instance 2 (literal index); result() expects 3 in every lane.
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = broadcast(a, 2);
+    RET[programIndex] = b;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3;
+}
diff --git a/tests/rotate-1.ispc b/tests/rotate-1.ispc
new file mode 100644
index 00000000..8a06b566
--- /dev/null
+++ b/tests/rotate-1.ispc
@@ -0,0 +1,12 @@
+// Test: rotate() of an int by the literal offset -1; result() expects 1 + (programIndex + programCount - 1) % programCount.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int rot = rotate(a, -1);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
diff --git a/tests/rotate-2.ispc b/tests/rotate-2.ispc
new file mode 100644
index 00000000..e1ee32cb
--- /dev/null
+++ b/tests/rotate-2.ispc
@@ -0,0 +1,13 @@
+// Test: rotate() of an int by an offset computed at run time from b (b - 6); expected values are those of a rotate by -1.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    uniform int delta = b - 6; // -1
+    int rot = rotate(a, delta);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
diff --git a/tests/rotate-3.ispc b/tests/rotate-3.ispc
new file mode 100644
index 00000000..ab50a4d9
--- /dev/null
+++ b/tests/rotate-3.ispc
@@ -0,0 +1,13 @@
+// Test: rotate() of an int64 by an offset computed at run time from b (b - 6); expected values are those of a rotate by -1.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int64 a = aFOO[programIndex]; 
+    uniform int delta = b - 6; // -1
+    int64 rot = rotate(a, delta);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
diff --git a/tests/rotate-4.ispc b/tests/rotate-4.ispc
new file mode 100644
index 00000000..cafd0a3c
--- /dev/null
+++ b/tests/rotate-4.ispc
@@ -0,0 +1,12 @@
+// Test: rotate() of an int64 by the literal offset -1; result() expects 1 + (programIndex + programCount - 1) % programCount.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int64 a = aFOO[programIndex]; 
+    int64 rot = rotate(a, -1);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
diff --git a/tests/rotate.ispc b/tests/rotate.ispc
new file mode 100644
index 00000000..eab7a5f3
--- /dev/null
+++ b/tests/rotate.ispc
@@ -0,0 +1,12 @@
+// Test: rotate() of an int by the literal offset 2; result() expects 1 + (programIndex + 2) % programCount.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int rot = rotate(a, 2);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + 2) % programCount;
+}
diff --git a/tests/shuffle-1.ispc b/tests/shuffle-1.ispc
new file mode 100644
index 00000000..11474886
--- /dev/null
+++ b/tests/shuffle-1.ispc
@@ -0,0 +1,13 @@
+// Test: shuffle() of a float with the reversing permutation programCount - 1 - programIndex; result() expects programCount - programIndex.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    int reverse = programCount - 1 - programIndex;
+    float shuf = shuffle(a, reverse);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount - programIndex;
+}
diff --git a/tests/shuffle-2.ispc b/tests/shuffle-2.ispc
new file mode 100644
index 00000000..2cf2a91e
--- /dev/null
+++ b/tests/shuffle-2.ispc
@@ -0,0 +1,13 @@
+// Test: shuffle() of a float with a reversing permutation that also depends on b; the (int)b - 5 term is presumably zero, since the expected values are a plain reversal.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    int reverse = programCount - 1 - programIndex + (int)b - 5;
+    float shuf = shuffle(a, reverse);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount - programIndex;
+}
diff --git a/tests/shuffle.ispc b/tests/shuffle.ispc
new file mode 100644
index 00000000..a17b1309
--- /dev/null
+++ b/tests/shuffle.ispc
@@ -0,0 +1,12 @@
+// Test: shuffle() of an int32 where every program instance passes permutation 1; result() expects 2 in every lane.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int32 a = aFOO[programIndex]; 
+    int32 shuf = shuffle(a, 1);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2;
+}