diff --git a/docs/ispc.txt b/docs/ispc.txt
index 6376b025..26d582e3 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -1852,7 +1852,8 @@ example.
 
 Here are the declarations of the ``int32`` variants of these functions.
 There are also ``int64`` equivalents as well as variants that take
-``unsigned`` ``int32`` and ``int64`` values.
+``unsigned`` ``int32`` and ``int64`` values.  (The ``atomic_swap_global()``
+function can be used with ``float`` and ``double`` types as well.)
 
 ::
 
@@ -1869,7 +1870,8 @@ There is also an atomic "compare and exchange" function; it atomically
 compares the value in "val" to "compare"--if they match, it assigns
 "newval" to "val".  In either case, the old value of "val" is returned.
 (As with the other atomic operations, there are also ``unsigned`` and
-64-bit variants of this function.)
+64-bit variants of this function, as well as ``float`` and ``double``
+variants, which compare values by their bit patterns.)
 
 ::
 
diff --git a/stdlib.ispc b/stdlib.ispc
index d6cfd12a..ec94c4c8 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -425,6 +425,8 @@ DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
 DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
 DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
 
+DEFINE_ATOMIC_OP(float,float,swap,swap)  // float atomic swap; presumably binds to __atomic_swap_float_global (confirm against the macro)
+
 DEFINE_ATOMIC_OP(int64,int64,add,add)
 DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
 DEFINE_ATOMIC_OP(int64,int64,min,min)
@@ -445,6 +447,8 @@ DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
 DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
 DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
 
+DEFINE_ATOMIC_OP(double,double,swap,swap)  // double atomic swap; presumably binds to __atomic_swap_double_global (confirm against the macro)
+
 #define ATOMIC_DECL_CMPXCHG(TA, TB)                                        \
 static inline TA atomic_compare_exchange_global(                           \
          uniform reference TA ref, TA oldval, TA newval) {                 \
@@ -456,8 +460,10 @@ static inline TA atomic_compare_exchange_global(                           \
 
 ATOMIC_DECL_CMPXCHG(int32, int32)
 ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
+ATOMIC_DECL_CMPXCHG(float, float)  // declares atomic_compare_exchange_global() taking and returning float
 ATOMIC_DECL_CMPXCHG(int64, int64)
 ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
+ATOMIC_DECL_CMPXCHG(double, double)  // declares atomic_compare_exchange_global() taking and returning double
 
 ///////////////////////////////////////////////////////////////////////////
 // Load/store from/to 8/16-bit types
diff --git a/stdlib.m4 b/stdlib.m4
index 6b781b17..50fbc9cc 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -800,9 +800,48 @@ global_atomic($1, umax, i64, uint64)
 global_swap($1, i32, int32)
 global_swap($1, i64, int64)
 
+define internal <$1 x float> @__atomic_swap_float_global(float * %ptr, <$1 x float> %val,
+        <$1 x i32> %mask) nounwind alwaysinline {  ; float swap, forwarded to the i32 swap
+  %ptr_i32 = bitcast float * %ptr to i32 *                 ; same address, i32 pointee type
+  %val_bits = bitcast <$1 x float> %val to <$1 x i32>      ; bit pattern of each float, unchanged
+  %out_bits = call <$1 x i32> @__atomic_swap_int32_global(i32 * %ptr_i32, <$1 x i32> %val_bits, <$1 x i32> %mask)
+  %out = bitcast <$1 x i32> %out_bits to <$1 x float>      ; view the returned bits as floats again
+  ret <$1 x float> %out
+}
+
+define internal <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x double> %val,
+        <$1 x i32> %mask) nounwind alwaysinline {  ; double swap, forwarded to the i64 swap
+  %ptr_i64 = bitcast double * %ptr to i64 *                ; same address, i64 pointee type
+  %val_bits = bitcast <$1 x double> %val to <$1 x i64>     ; bit pattern of each double, unchanged
+  %out_bits = call <$1 x i64> @__atomic_swap_int64_global(i64 * %ptr_i64, <$1 x i64> %val_bits, <$1 x i32> %mask)
+  %out = bitcast <$1 x i64> %out_bits to <$1 x double>     ; view the returned bits as doubles again
+  ret <$1 x double> %out
+}
+
 global_atomic_exchange($1, i32, int32)
 global_atomic_exchange($1, i64, int64)
 
+define internal <$1 x float> @__atomic_compare_exchange_float_global(float * %ptr, <$1 x float> %cmp,
+        <$1 x float> %val, <$1 x i32> %mask) nounwind alwaysinline {  ; float wrapper over the i32 version
+  %ptr_i32 = bitcast float * %ptr to i32 *                 ; same address, i32 pointee type
+  %cmp_bits = bitcast <$1 x float> %cmp to <$1 x i32>      ; comparand handed over as raw bits
+  %val_bits = bitcast <$1 x float> %val to <$1 x i32>      ; replacement handed over as raw bits
+  %out_bits = call <$1 x i32> @__atomic_compare_exchange_int32_global(i32 * %ptr_i32,
+                        <$1 x i32> %cmp_bits, <$1 x i32> %val_bits, <$1 x i32> %mask)
+  %out = bitcast <$1 x i32> %out_bits to <$1 x float>      ; view the returned bits as floats again
+  ret <$1 x float> %out
+}
+
+define internal <$1 x double> @__atomic_compare_exchange_double_global(double * %ptr, <$1 x double> %cmp,
+        <$1 x double> %val, <$1 x i32> %mask) nounwind alwaysinline {  ; double wrapper over the i64 version
+  %ptr_i64 = bitcast double * %ptr to i64 *                ; same address, i64 pointee type
+  %cmp_bits = bitcast <$1 x double> %cmp to <$1 x i64>     ; comparand handed over as raw bits
+  %val_bits = bitcast <$1 x double> %val to <$1 x i64>     ; replacement handed over as raw bits
+  %out_bits = call <$1 x i64> @__atomic_compare_exchange_int64_global(i64 * %ptr_i64,
+                        <$1 x i64> %cmp_bits, <$1 x i64> %val_bits, <$1 x i32> %mask)
+  %out = bitcast <$1 x i64> %out_bits to <$1 x double>     ; view the returned bits as doubles again
+  ret <$1 x double> %out
+}
 ')