[lonetix/smallbytecopy] Add optimized routines for small byte buffer copies.

This commit is contained in:
Lorenzo Cogotti 2021-10-18 11:57:17 +02:00
parent bfdde40031
commit d7c7060cb7
1 changed files with 143 additions and 0 deletions

143
lonetix/smallbytecopy.h Normal file
View File

@ -0,0 +1,143 @@
// SPDX-License-Identifier: LGPL-3.0-or-later
/**
* \file smallbytecopy.h
*
* Optimized routines for tiny buffer copy (<= 64 bytes).
*
* Whenever possible, avoid these routines and use regular
* memcpy()/memmove().
* These routines are recommended only when:
* - you are absolutely sure of the maximum size of your data;
* - the compiler cannot possibly estimate that size statically
*   (when it can, the compiler does a much better job at
*   optimizing a plain memcpy() on its own).
*
* \copyright The DoubleFourteen Code Forge (C) All Rights Reserved
* \author Lorenzo Cogotti
*/
#ifndef DF_SMALLBYTECOPY_H_
#define DF_SMALLBYTECOPY_H_
#include "xpt.h"
#if defined(__i386__) || defined(__x86_64__)
#include <assert.h>
// Fixed-size copy primitives (1, 2, 4 and 8 bytes), alignment is ignored.
//
// They expand to a constant-size __builtin_memcpy() rather than to a
// store through a casted integer pointer: dereferencing e.g. a `Uint32 *`
// obtained from an arbitrary byte pointer breaks the C aliasing and
// alignment rules (undefined behavior), and drops `const` from the source.
// A constant-size builtin copy has no such problem and is still expected to
// be emitted inline, as a plain load/store pair, by GCC-compatible compilers.
#define _bytecopy1(d, s) ((void) __builtin_memcpy((d), (s), 1))
#define _bytecopy2(d, s) ((void) __builtin_memcpy((d), (s), 2))
#define _bytecopy4(d, s) ((void) __builtin_memcpy((d), (s), 4))
#define _bytecopy8(d, s) ((void) __builtin_memcpy((d), (s), 8))
INLINE void _smallbytecopy4(void *__restrict dest, const void *__restrict src, size_t n)
{
Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src;
assert(n <= 4);
switch (n) {
case 4: _bytecopy4(d, s);
break;
case 3: _bytecopy2(d + 1, s + 1);
/*FALLTHROUGH*/
case 1: _bytecopy1(d, s);
break;
case 2: _bytecopy2(d, s);
break;
case 0: break;
default: UNREACHABLE; break;
}
}
INLINE void _smallbytecopy8(void *__restrict dest, const void *__restrict src, size_t n)
{
Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src;
assert(n <= 8);
switch (n) {
case 8: _bytecopy8(d, s);
break;
case 7: _bytecopy4(d + 3, s + 3);
/*FALLTHROUGH*/
case 3: _bytecopy2(d + 1, s + 1);
/*FALLTHROUGH*/
case 1: _bytecopy1(d, s);
break;
case 6: _bytecopy4(d + 2, s + 2);
/*FALLTHROUGH*/
case 2: _bytecopy2(d, s);
break;
case 5: _bytecopy1(d + 4, s + 4);
/*FALLTHROUGH*/
case 4: _bytecopy4(d, s);
break;
case 0: break;
default: UNREACHABLE; break;
}
}
INLINE void _smallbytecopy16(void *__restrict dest, const void *__restrict src, size_t n)
{
Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src;
assert(n <= 16);
if (n > 8) {
_bytecopy8(d, s);
d += 8, s += 8, n -= 8;
}
_smallbytecopy8(d, s, n);
}
INLINE void _smallbytecopy32(void *__restrict dest, const void *__restrict src, size_t n)
{
Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src;
assert(n <= 32);
if (n > 16) {
_bytecopy8(d, s);
_bytecopy8(d + 8, s + 8);
d += 16, s += 16, n -= 16;
}
_smallbytecopy16(d, s, n);
}
INLINE void _smallbytecopy64(void *__restrict dest, const void *__restrict src, size_t n)
{
Uint8 *d = (Uint8 *) dest, *s = (Uint8 *) src;
assert(n <= 64);
if (n > 32) {
_bytecopy8(d, s);
_bytecopy8(d + 8, s + 8);
_bytecopy8(d + 16, s + 16);
_bytecopy8(d + 24, s + 24);
d += 32, s += 32, n -= 32;
}
_smallbytecopy32(d, s, n);
}
// The fixed-size copy macros are only needed by the routines above,
// undefine them so they are not visible past this point.
#undef _bytecopy1
#undef _bytecopy2
#undef _bytecopy4
#undef _bytecopy8
#else
#include <string.h>
// Generic variant: every size class simply forwards to memcpy(),
// the result of the call is discarded.
#define _smallbytecopy4(dst, src, len)  ((void) memcpy((dst), (src), (len)))
#define _smallbytecopy8(dst, src, len)  ((void) memcpy((dst), (src), (len)))
#define _smallbytecopy16(dst, src, len) ((void) memcpy((dst), (src), (len)))
#define _smallbytecopy32(dst, src, len) ((void) memcpy((dst), (src), (len)))
#define _smallbytecopy64(dst, src, len) ((void) memcpy((dst), (src), (len)))
#endif
#endif