From 3b95452481010dfe0ccee9bda22562f53cd5ba3a Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 5 Mar 2012 16:09:00 -0800 Subject: [PATCH] Add memcpy(), memmove() and memset() to the standard library. Issue #183. --- builtins/util.m4 | 49 +++++++++++++++ docs/ispc.rst | 57 ++++++++++++++++- stdlib.ispc | 125 +++++++++++++++++++++++++++++++++++++ tests/memcpy-uniform.ispc | 17 +++++ tests/memcpy-varying.ispc | 21 +++++++ tests/memmove-uniform.ispc | 16 +++++ tests/memmove-varying.ispc | 19 ++++++ tests/memset-uniform.ispc | 16 +++++ tests/memset-varying.ispc | 21 +++++++ 9 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 tests/memcpy-uniform.ispc create mode 100644 tests/memcpy-varying.ispc create mode 100644 tests/memmove-uniform.ispc create mode 100644 tests/memmove-varying.ispc create mode 100644 tests/memset-uniform.ispc create mode 100644 tests/memset-varying.ispc diff --git a/builtins/util.m4 b/builtins/util.m4 index 96f3544a..26cbfafb 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1768,6 +1768,55 @@ define @__sext_varying_bool() nounwind readnone alwa ret %0') } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; memcpy/memmove/memset + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +define void @__memcpy32(i8 * %dst, i8 * %src, i32 %len) alwaysinline { + call void @llvm.memcpy.p0i8.p0i8.i32(i8 * %dst, i8 * %src, i32 %len, i32 0, i1 0) + ret void +} + +define void @__memcpy64(i8 * %dst, i8 * %src, i64 %len) alwaysinline { + call void @llvm.memcpy.p0i8.p0i8.i64(i8 * %dst, i8 * %src, i64 %len, i32 0, i1 0) + ret void +} + +declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memmove.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +define void 
@__memmove32(i8 * %dst, i8 * %src, i32 %len) alwaysinline { + call void @llvm.memmove.p0i8.p0i8.i32(i8 * %dst, i8 * %src, i32 %len, i32 0, i1 0) + ret void +} + +define void @__memmove64(i8 * %dst, i8 * %src, i64 %len) alwaysinline { + call void @llvm.memmove.p0i8.p0i8.i64(i8 * %dst, i8 * %src, i64 %len, i32 0, i1 0) + ret void +} + + +declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i32 %align, + i1 %isvolatile) +declare void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %len, i32 %align, + i1 %isvolatile) + +define void @__memset32(i8 * %dst, i8 %val, i32 %len) alwaysinline { + call void @llvm.memset.p0i8.i32(i8 * %dst, i8 %val, i32 %len, i32 0, i1 0) + ret void +} + +define void @__memset64(i8 * %dst, i8 %val, i64 %len) alwaysinline { + call void @llvm.memset.p0i8.i64(i8 * %dst, i8 %val, i64 %len, i32 0, i1 0) + ret void +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; assert diff --git a/docs/ispc.rst b/docs/ispc.rst index 442c36bf..61f4a21a 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -134,9 +134,13 @@ Contents: * `Reductions`_ - + `Data Conversions And Storage`_ + + `Data Movement`_ + * `Setting and Copying Values In Memory`_ * `Packed Load and Store Operations`_ + + + `Data Conversions`_ + * `Converting Between Array-of-Structures and Structure-of-Arrays Layout`_ * `Conversions To and From Half-Precision Floats`_ @@ -3387,8 +3391,52 @@ program instances into a compact output buffer is `discussed in the FAQ`_. .. _discussed in the FAQ: faq.html#how-can-a-gang-of-program-instances-generate-variable-amounts-of-output-efficiently -Data Conversions And Storage ----------------------------- +Data Movement +------------- + +Setting and Copying Values In Memory +------------------------------------ + +There are a few functions for copying blocks of memory and initializing +values in memory. 
Along the lines of the equivalently-named routines in +the C Standard Library, ``memcpy`` copies a given number of bytes starting +from a source location in memory to a destination location, where the two +regions of memory are guaranteed by the caller to be non-overlapping. +Alternatively, ``memmove`` can be used to copy data if the buffers may +overlap. + +:: + + void memcpy(void * uniform dst, void * uniform src, uniform int32 count) + void memmove(void * uniform dst, void * uniform src, uniform int32 count) + void memcpy(void * varying dst, void * varying src, int32 count) + void memmove(void * varying dst, void * varying src, int32 count) + +Note that there are variants of these functions that take both ``uniform`` +and ``varying`` pointers. + +To initialize values in memory, the ``memset`` routine can be used. (It +also behaves like the function of the same name in the C Standard Library.) +It sets the given number of bytes of memory starting at the given location +to the value provided. + +:: + + void memset(void * uniform ptr, uniform int8 val, uniform int32 count) + void memset(void * varying ptr, int8 val, int32 count) + +There are also variants of all of these functions that take 64-bit values +for the number of bytes of memory to operate on: + +:: + + void memcpy64(void * uniform dst, void * uniform src, uniform int64 count) + void memcpy64(void * varying dst, void * varying src, int64 count) + void memmove64(void * uniform dst, void * uniform src, uniform int64 count) + void memmove64(void * varying dst, void * varying src, int64 count) + void memset64(void * uniform ptr, uniform int8 val, uniform int64 count) + void memset64(void * varying ptr, int8 val, int64 count) + Packed Load and Store Operations -------------------------------- @@ -3447,6 +3495,9 @@ of four negative values, and initializes the first four elements of indices where ``a[i]`` was less than zero. 
+Data Conversions +---------------- + Converting Between Array-of-Structures and Structure-of-Arrays Layout --------------------------------------------------------------------- diff --git a/stdlib.ispc b/stdlib.ispc index 4727ec6e..5de14778 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -335,6 +335,131 @@ static inline uniform int lanemask() { return __movmsk(__mask); } +/////////////////////////////////////////////////////////////////////////// +// memcpy/memmove/memset + +static inline void memcpy(void * uniform dst, void * uniform src, + uniform int32 count) { + __memcpy32((int8 * uniform)dst, (int8 * uniform)src, count); +} + +static inline void memcpy64(void * uniform dst, void * uniform src, + uniform int64 count) { + __memcpy64((int8 * uniform)dst, (int8 * uniform)src, count); +} + +static inline void memcpy(void * varying dst, void * varying src, + int32 count) { + void * uniform da[programCount]; + void * uniform sa[programCount]; + + da[programIndex] = dst; + sa[programIndex] = src; + + uniform int mask = lanemask(); + for (uniform int i = 0; i < programCount; ++i) { + if ((mask & (1 << i)) == 0) + continue; + void * uniform d = da[i], * uniform s = sa[i]; + __memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); + } +} + +static inline void memcpy64(void * varying dst, void * varying src, + int64 count) { + void * uniform da[programCount]; + void * uniform sa[programCount]; + + da[programIndex] = dst; + sa[programIndex] = src; + + uniform int mask = lanemask(); + for (uniform int i = 0; i < programCount; ++i) { + if ((mask & (1 << i)) == 0) + continue; + void * uniform d = da[i], * uniform s = sa[i]; + __memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); + } +} + +static inline void memmove(void * uniform dst, void * uniform src, + uniform int32 count) { + __memmove32((int8 * uniform)dst, (int8 * uniform)src, count); +} + +static inline void memmove64(void * uniform dst, void * uniform src, + uniform int64 count) { + 
__memmove64((int8 * uniform)dst, (int8 * uniform)src, count); +} + +static inline void memmove(void * varying dst, void * varying src, + int32 count) { + void * uniform da[programCount]; + void * uniform sa[programCount]; + + da[programIndex] = dst; + sa[programIndex] = src; + + uniform int mask = lanemask(); + for (uniform int i = 0; i < programCount; ++i) { + if ((mask & (1 << i)) == 0) + continue; + void * uniform d = da[i], * uniform s = sa[i]; + __memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); + } +} + +static inline void memmove64(void * varying dst, void * varying src, + int64 count) { + void * uniform da[programCount]; + void * uniform sa[programCount]; + + da[programIndex] = dst; + sa[programIndex] = src; + + uniform int mask = lanemask(); + for (uniform int i = 0; i < programCount; ++i) { + if ((mask & (1 << i)) == 0) + continue; + void * uniform d = da[i], * uniform s = sa[i]; + __memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); + } +} + +static inline void memset(void * uniform ptr, uniform int8 val, + uniform int32 count) { + __memset32((int8 * uniform)ptr, val, count); +} + +static inline void memset64(void * uniform ptr, uniform int8 val, + uniform int64 count) { + __memset64((int8 * uniform)ptr, val, count); +} + +static inline void memset(void * varying ptr, int8 val, int32 count) { + void * uniform pa[programCount]; + pa[programIndex] = ptr; + + uniform int mask = lanemask(); + for (uniform int i = 0; i < programCount; ++i) { + if ((mask & (1 << i)) == 0) + continue; + __memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i)); + } +} + +static inline void memset64(void * varying ptr, int8 val, int64 count) { + void * uniform pa[programCount]; + pa[programIndex] = ptr; + + uniform int mask = lanemask(); + for (uniform int i = 0; i < programCount; ++i) { + if ((mask & (1 << i)) == 0) + continue; + __memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i)); + } +} + 
/////////////////////////////////////////////////////////////////////////// // count leading/trailing zeros diff --git a/tests/memcpy-uniform.ispc b/tests/memcpy-uniform.ispc new file mode 100644 index 00000000..075ff729 --- /dev/null +++ b/tests/memcpy-uniform.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int32 * uniform src = uniform new int32[1024]; + int32 * uniform dst = uniform new int32[1024]; + + foreach (i = 0 ... 1024) + src[i] = i; + + memcpy(&dst[32], src, (1024-32)*sizeof(uniform int)); + RET[programIndex] = dst[64+programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 32 + programIndex; +} diff --git a/tests/memcpy-varying.ispc b/tests/memcpy-varying.ispc new file mode 100644 index 00000000..309d2d8d --- /dev/null +++ b/tests/memcpy-varying.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int32 *src = new int32[1024]; + int32 *dst = new int32[1024]; + + for (uniform int i = 0; i < 1024; ++i) + src[i] = programIndex * 10000 + i; + + if (programIndex == 2) + memcpy(dst, src, programCount*sizeof(uniform int)); + else + memcpy(dst, src, programCount*sizeof(uniform int)); + + RET[programIndex] = dst[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 10000 * programIndex + programIndex; +} diff --git a/tests/memmove-uniform.ispc b/tests/memmove-uniform.ispc new file mode 100644 index 00000000..92f0b1b5 --- /dev/null +++ b/tests/memmove-uniform.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int32 * uniform buf = uniform new int32[1024]; + + foreach (i = 0 ... 
1024) + buf[i] = i; + + memmove(&buf[1], buf, (1024-1)*sizeof(uniform int)); + RET[programIndex] = buf[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = max(0, programIndex-1); +} diff --git a/tests/memmove-varying.ispc b/tests/memmove-varying.ispc new file mode 100644 index 00000000..b0f64054 --- /dev/null +++ b/tests/memmove-varying.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int32 *buf = new int32[1024]; + + for (uniform int i = 0; i < 1024; ++i) + buf[i] = programIndex * 10000 + i; + + if (programIndex == 2) + memmove(buf, buf+programCount/2, programCount*sizeof(uniform int)); + + RET[programIndex] = buf[0]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 10000 * programIndex; + RET[2] = 10000 * 2 + programCount/2; +} diff --git a/tests/memset-uniform.ispc b/tests/memset-uniform.ispc new file mode 100644 index 00000000..e0692a54 --- /dev/null +++ b/tests/memset-uniform.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int32 * uniform buf = uniform new int32[1024]; + buf[0] = 0; + memset(buf+1, 0x7f, (1024-1)*sizeof(uniform int32)); /* buf+1 leaves 1023 elements; fill only those so the write stays inside the 1024-element allocation */ + + int v = buf[programIndex]; + RET[programIndex] = (v == 0x7f7f7f7f); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; + RET[0] = 0; +} diff --git a/tests/memset-varying.ispc b/tests/memset-varying.ispc new file mode 100644 index 00000000..b837bcca --- /dev/null +++ b/tests/memset-varying.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int32 * varying buf = varying new int32[1024*(programIndex+1)]; + + if (programIndex & 1) { + memset(buf, 0xff, 1024*(programIndex+1)*sizeof(uniform int32)); + } + else { + memset(buf, 0x01, 1024*(programIndex+1)*sizeof(uniform int32)); + } +
+ int v = buf[0]; + int expected = (programIndex & 1) ? 0xffffffff : 0x01010101; + RET[programIndex] = (v == expected); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +}