Edit:
By adding the restrict keyword I was able to get my memcpy up to speed with the library implementation (and in this particular test, exceeding the library implementations speed). New results:
Test case | mem_cpy |
mem_cpy_naive |
memcpy |
---|---|---|---|
big string (1000 bytes) | 2.584988s | 3.936075s | 3.952187s |
small string (8 bytes) | 0.025931s | 0.051899s | 0.025807s |
Note: I tested also it as a part of a bigger implementation I had been working on. Previously I gained about 20% performance by swapping the libc memcpy in place of my own, now there was no difference.
Updated code:
static void
copy_words(void *restrict dst, const void *restrict src, size_t words)
{
const uint64_t *restrict src64;
uint64_t *restrict dst64;
uint64_t pages;
uint64_t offset;
pages = words / 8;
offset = words - pages * 8;
src64 = (const uint64_t *restrict)src;
dst64 = (uint64_t *restrict)dst;
while (pages--)
{
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
}
while (offset--)
*dst64++ = *src64++;
}
static void
copy_small(void *restrict dst, const void *restrict src, size_t size)
{
const uint64_t *restrict src64;
uint64_t *restrict dst64;
src64 = (const uint64_t *restrict)src;
dst64 = (uint64_t *restrict)dst;
*dst64 = *src64;
}
void
*mem_cpy(void *restrict dst, const void *restrict src, const size_t size)
{
const uint8_t *restrict src8;
uint8_t *restrict dst8;
size_t offset;
size_t words;
size_t aligned_size;
if (!src || !dst)
return (NULL);
if (size <= 8)
{
copy_small(dst, src, size);
return (dst);
}
words = size / 8;
aligned_size = words * 8;
offset = size - aligned_size;
copy_words(dst, src, words);
if (offset)
{
src8 = (const uint8_t *restrict)src;
src8 = &src8(aligned_size);
dst8 = (uint8_t *restrict)dst;
dst8 = &dst8(aligned_size);
while (offset--)
*dst8++ = *src8++;
}
return (dst);
}
As a practice in optimization I’m trying to get my memcpy re-creation as close in speed to the libc one as I can. I have used the following techniques to optimize my memcpy:
- Casting the data to as big a datatype as possible for copying.
- Unrolling the main loop 8 times.
- For data <= 8 bytes I bypass the main loop.
My results (I have added a naive 1 byte at a time memcpy for reference):
Test case | mem_cpy |
mem_cpy_naive |
memcpy |
---|---|---|---|
big string (1000 bytes) | 12.452919s | 212.728906s | 0.935605s |
small string (8 bytes) | 0.367271s | 1.413559s | 0.149886s |
I feel I have exhausted the “low hanging fruit” in terms of optimization. I understand that the libc function could be optimized on a level not accessible to me writing only C, but I wonder if there’s still something to be done here or is the next step to write it in assembly. To give a bit more clarification as to my motive for this. I study programming in a school that has performance constrains on our projects, but as of now we are only able to use standard C, so I can’t go optimizing on assembly level yet. We are also not allowed to use libc and have to create our own versions of the standard functions we want to use so making my memcpy as fast as possible helps me hitting the performance goals in my projects. And it’s great for learning obviously. I welcome any ideas!
Here is the code including the tests, can be compiled as is:
#include <time.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
const size_t iters = 100000000;
//-----------------------------------------------------------------------------
// Optimized memcpy
//
static void copy_words(void *dst, const void *src, size_t words)
{
const uint64_t *src64;
uint64_t *dst64;
uint64_t pages;
uint64_t offset;
pages = words / 8;
offset = words - pages * 8;
src64 = (const uint64_t *)src;
dst64 = (uint64_t *)dst;
while (pages--)
{
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
*dst64++ = *src64++;
}
while (offset--)
*dst64++ = *src64++;
}
static void copy_small(void *dst, const void *src, size_t size)
{
const uint64_t *src64;
uint64_t *dst64;
src64 = (const uint64_t *)src;
dst64 = (uint64_t *)dst;
*dst64 = *src64;
}
void *mem_cpy(void *dst, const void *src, const size_t size)
{
const uint8_t *src8;
uint8_t *dst8;
size_t offset;
size_t words;
size_t aligned_size;
if (!src || !dst)
return (NULL);
if (size <= 8)
{
copy_small(dst, src, size);
return (dst);
}
words = size / 8;
aligned_size = words * 8;
offset = size - aligned_size;
copy_words(dst, src, words);
if (offset)
{
src8 = (const uint8_t *)src;
src8 = &src8(aligned_size);
dst8 = (uint8_t *)dst;
dst8 = &dst8(aligned_size);
while (offset--)
*dst8++ = *src8++;
}
return (dst);
}
//-----------------------------------------------------------------------------
// Naive memcpy
//
void *mem_cpy_naive(void *dst, const void *src, size_t n)
{
const uint8_t *src8;
uint8_t *dst8;
if (src == NULL)
return (NULL);
src8 = (const uint8_t *)src;
dst8 = (uint8_t *)dst;
while (n--)
*dst8++ = *src8++;
return (dst);
}
//-----------------------------------------------------------------------------
// Tests
//
int test(int (*f)(), char *test_name)
{
clock_t begin = clock();
f();
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%s: %fn", test_name, time_spent);
return (1);
}
char *big_data()
{
char *out;
size_t i;
out = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < 1000)
{
out(i) = 'a';
i++;
}
return (out);
}
int test1()
{
char *src;
char *dst;
size_t i;
src = big_data();
dst = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < iters)
{
mem_cpy(dst, src, 1000);
i++;
}
return (1);
}
int test2()
{
char *src;
char *dst;
size_t i;
src = big_data();
dst = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < iters)
{
mem_cpy_naive(dst, src, 1000);
i++;
}
return (1);
}
int test3()
{
char *src;
char *dst;
size_t i;
src = big_data();
dst = (char *)malloc(sizeof(char) * 1000);
i = 0;
while (i < iters)
{
memcpy(dst, src, 1000);
i++;
}
return (1);
}
int test4()
{
char *src;
char *dst;
size_t i;
src = "12345678";
dst = (char *)malloc(sizeof(char) * 8);
i = 0;
while (i < iters)
{
mem_cpy(dst, src, 8);
i++;
}
return (1);
}
int test5()
{
char *src;
char *dst;
size_t i;
src = "12345678";
dst = (char *)malloc(sizeof(char) * 8);
i = 0;
while (i < iters)
{
mem_cpy_naive(dst, src, 8);
i++;
}
return (1);
}
int test6()
{
char *src;
char *dst;
size_t i;
src = "12345678";
dst = (char *)malloc(sizeof(char) * 8);
i = 0;
while (i < iters)
{
memcpy(dst, src, 8);
i++;
}
return (1);
}
int main(void)
{
test(test1, "User memcpy (big string)");
test(test2, "User memcpy naive (big string)");
test(test3, "Libc memcpy (big string)");
test(test4, "User memcpy");
test(test5, "USer memcpy naive");
test(test6, "Libc memcpy");
}
I won’t paste the assembly, since I think it’s more convenient to just put a link to compiler explorer:
https://godbolt.org/z/Yva9EaPrP