Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[arm] Add optimised string functions for 64-bit ARM
Signed-off-by: Michael Brown <mcb30@ipxe.org>
- Loading branch information
Showing
3 changed files
with
355 additions
and
0 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,249 @@ | ||
/* | ||
* Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>. | ||
* | ||
* This program is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU General Public License as | ||
* published by the Free Software Foundation; either version 2 of the | ||
* License, or any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, but | ||
* WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | ||
* 02110-1301, USA. | ||
* | ||
* You can also choose to distribute this program under the terms of | ||
* the Unmodified Binary Distribution Licence (as given in the file | ||
* COPYING.UBDL), provided that you have satisfied its requirements. | ||
*/ | ||
|
||
/** @file | ||
* | ||
* Optimised string operations | ||
* | ||
*/ | ||
|
||
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); | ||
|
||
#include <string.h> | ||
|
||
/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 *
 * Note that, unlike memcpy(), this function returns nothing: the
 * memcpy() wrapper in bits/string.h supplies the conventional return
 * value itself.  Source and destination must not overlap.
 */
void arm64_memcpy ( void *dest, const void *src, size_t len ) {
	void *discard_dest;
	void *discard_end;
	const void *discard_src;
	size_t discard_offset;
	unsigned long discard_data;
	unsigned long discard_low;
	unsigned long discard_high;

	/* If length is too short for an "ldp"/"stp" instruction pair,
	 * then just copy individual bytes.
	 */
	if ( len < 16 ) {
		/* %0 counts down from len to zero and is used as a
		 * byte offset into both regions, so the copy runs
		 * from the highest address downwards (safe only
		 * because memcpy() regions may not overlap).
		 * Operands: %0 = offset, %1 = data byte,
		 * %2 = dest, %3 = src.
		 */
		__asm__ __volatile__ ( "cbz %0, 2f\n\t"
				       "\n1:\n\t"
				       "sub %0, %0, #1\n\t"
				       "ldrb %w1, [%3, %0]\n\t"
				       "strb %w1, [%2, %0]\n\t"
				       "cbnz %0, 1b\n\t"
				       "\n2:\n\t"
				       : "=&r" ( discard_offset ),
					 "=&r" ( discard_data )
				       : "r" ( dest ), "r" ( src ), "0" ( len )
				       : "memory" );
		return;
	}

	/* Use "ldp"/"stp" to copy 16 bytes at a time: one initial
	 * potentially unaligned access, multiple destination-aligned
	 * accesses, one final potentially unaligned access.
	 *
	 * Operands: %0 = destination cursor (starts at dest),
	 * %1 = source cursor (starts at src), %2 = aligned end of
	 * destination, %3/%4 = 16-byte data pair (%3 doubles as the
	 * misalignment scratch), %5 = dest + len, %6 = src + len.
	 *
	 * After the first (possibly unaligned) 16-byte copy, both
	 * cursors are wound back by the destination's misalignment so
	 * that all subsequent destination accesses are 16-byte
	 * aligned (re-copying up to 15 bytes).  The aligned loop runs
	 * up to ( ( dest + len ) & ~15 ), and the final pair copies
	 * the last 16 bytes ending exactly at dest + len.
	 */
	__asm__ __volatile__ ( "ldp %3, %4, [%1], #16\n\t"
			       "stp %3, %4, [%0], #16\n\t"
			       "and %3, %0, #15\n\t"
			       "sub %0, %0, %3\n\t"
			       "sub %1, %1, %3\n\t"
			       "bic %2, %5, #15\n\t"
			       "b 2f\n\t"
			       "\n1:\n\t"
			       "ldp %3, %4, [%1], #16\n\t"
			       "stp %3, %4, [%0], #16\n\t"
			       "\n2:\n\t"
			       "cmp %0, %2\n\t"
			       "bne 1b\n\t"
			       "ldp %3, %4, [%6, #-16]\n\t"
			       "stp %3, %4, [%5, #-16]\n\t"
			       : "=&r" ( discard_dest ),
				 "=&r" ( discard_src ),
				 "=&r" ( discard_end ),
				 "=&r" ( discard_low ),
				 "=&r" ( discard_high )
			       : "r" ( dest + len ), "r" ( src + len ),
				 "0" ( dest ), "1" ( src )
			       : "memory", "cc" );
}
|
||
/**
 * Zero memory region
 *
 * @v dest		Destination region
 * @v len		Length
 */
void arm64_bzero ( void *dest, size_t len ) {
	size_t discard_offset;
	void *discard_dest;
	void *discard_end;

	/* If length is too short for an "stp" instruction, then just
	 * zero individual bytes.
	 */
	if ( len < 16 ) {
		/* %0 counts down from len to zero and is used as a
		 * byte offset from %1 (= dest), storing the
		 * hardwired-zero register wzr at each byte.
		 */
		__asm__ __volatile__ ( "cbz %0, 2f\n\t"
				       "\n1:\n\t"
				       "sub %0, %0, #1\n\t"
				       "strb wzr, [%1, %0]\n\t"
				       "cbnz %0, 1b\n\t"
				       "\n2:\n\t"
				       : "=&r" ( discard_offset )
				       : "r" ( dest ), "0" ( len )
				       : "memory" );
		return;
	}

	/* Use "stp" to zero 16 bytes at a time: one initial
	 * potentially unaligned access, multiple aligned accesses,
	 * one final potentially unaligned access.
	 *
	 * Operands: %0 = destination cursor (starts at dest),
	 * %1 = aligned end of region, %2 = dest + len.  After the
	 * first (possibly unaligned) 16-byte store, the cursor is
	 * rounded down to 16-byte alignment (re-zeroing up to 15
	 * bytes); the aligned loop runs up to ( ( dest + len ) & ~15 )
	 * and the final store covers the last 16 bytes ending exactly
	 * at dest + len.
	 */
	__asm__ __volatile__ ( "stp xzr, xzr, [%0], #16\n\t"
			       "bic %0, %0, #15\n\t"
			       "bic %1, %2, #15\n\t"
			       "b 2f\n\t"
			       "\n1:\n\t"
			       "stp xzr, xzr, [%0], #16\n\t"
			       "\n2:\n\t"
			       "cmp %0, %1\n\t"
			       "bne 1b\n\t"
			       "stp xzr, xzr, [%2, #-16]\n\t"
			       : "=&r" ( discard_dest ),
				 "=&r" ( discard_end )
			       : "r" ( dest + len ), "0" ( dest )
			       : "memory", "cc" );
}
|
||
/**
 * Fill memory region
 *
 * @v dest		Destination region
 * @v len		Length
 * @v character		Fill character
 *
 * The unusual parameter order (with the fill character last, rather
 * than matching memset()) allows the common zeroing case to be
 * handed off to arm64_bzero() as an efficient tail call, with the
 * first two argument registers already in place.
 */
void arm64_memset ( void *dest, size_t len, int character ) {
	size_t discard_offset;

	/* Use optimised zeroing code if applicable */
	if ( character == 0 ) {
		arm64_bzero ( dest, len );
		return;
	}

	/* Fill one byte at a time.  Calling memset() with a non-zero
	 * value is relatively rare and unlikely to be
	 * performance-critical.
	 *
	 * %0 counts down from len to zero and is used as a byte
	 * offset from %1 (= dest); %2 holds the fill character.
	 */
	__asm__ __volatile__ ( "cbz %0, 2f\n\t"
			       "\n1:\n\t"
			       "sub %0, %0, #1\n\t"
			       "strb %w2, [%1, %0]\n\t"
			       "cbnz %0, 1b\n\t"
			       "\n2:\n\t"
			       : "=&r" ( discard_offset )
			       : "r" ( dest ), "r" ( character ), "0" ( len )
			       : "memory" );
}
|
||
/**
 * Copy (possibly overlapping) memory region forwards
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 *
 * Copying forwards (from the lowest address upwards) is safe for
 * overlapping regions only when the destination does not lie above
 * the source; the arm64_memmove() dispatcher enforces this.
 */
void arm64_memmove_forwards ( void *dest, const void *src, size_t len ) {
	void *discard_dest;
	const void *discard_src;
	unsigned long discard_data;

	/* Assume memmove() is not performance-critical, and perform a
	 * bytewise copy for simplicity.
	 *
	 * Operands: %0 = destination cursor, %1 = source cursor,
	 * %2 = data byte, %3 = dest + len (loop terminator).  The
	 * initial branch to the comparison handles len == 0.
	 */
	__asm__ __volatile__ ( "b 2f\n\t"
			       "\n1:\n\t"
			       "ldrb %w2, [%1], #1\n\t"
			       "strb %w2, [%0], #1\n\t"
			       "\n2:\n\t"
			       "cmp %0, %3\n\t"
			       "bne 1b\n\t"
			       : "=&r" ( discard_dest ),
				 "=&r" ( discard_src ),
				 "=&r" ( discard_data )
			       : "r" ( dest + len ), "0" ( dest ), "1" ( src )
			       : "memory" );
}
|
||
/**
 * Copy (possibly overlapping) memory region backwards
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 *
 * Copying backwards (from the highest address downwards) is safe for
 * overlapping regions when the destination lies above the source;
 * the arm64_memmove() dispatcher enforces this.
 */
void arm64_memmove_backwards ( void *dest, const void *src, size_t len ) {
	size_t discard_offset;
	unsigned long discard_data;

	/* Assume memmove() is not performance-critical, and perform a
	 * bytewise copy for simplicity.
	 *
	 * %0 counts down from len to zero and is used as a byte
	 * offset into both regions; %1 = data byte, %2 = dest,
	 * %3 = src.
	 */
	__asm__ __volatile__ ( "cbz %0, 2f\n\t"
			       "\n1:\n\t"
			       "sub %0, %0, #1\n\t"
			       "ldrb %w1, [%3, %0]\n\t"
			       "strb %w1, [%2, %0]\n\t"
			       "cbnz %0, 1b\n\t"
			       "\n2:\n\t"
			       : "=&r" ( discard_offset ),
				 "=&r" ( discard_data )
			       : "r" ( dest ), "r" ( src ), "0" ( len )
			       : "memory" );
}
|
||
/**
 * Copy (possibly overlapping) memory region
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 *
 * Selects the copy direction that is safe for the actual relative
 * placement of the two regions.
 */
void arm64_memmove ( void *dest, const void *src, size_t len ) {

	/* A backwards copy is required only when the destination
	 * starts strictly above the source; in every other case
	 * (including identical regions) a forwards copy is safe.
	 */
	if ( ( const void * ) dest > src ) {
		arm64_memmove_backwards ( dest, src, len );
	} else {
		arm64_memmove_forwards ( dest, src, len );
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#ifndef BITS_STRING_H | ||
#define BITS_STRING_H | ||
|
||
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); | ||
|
||
/** @file | ||
* | ||
* String functions | ||
* | ||
*/ | ||
|
||
extern void arm64_bzero ( void *dest, size_t len ); | ||
extern void arm64_memset ( void *dest, size_t len, int character ); | ||
extern void arm64_memcpy ( void *dest, const void *src, size_t len ); | ||
extern void arm64_memmove_forwards ( void *dest, const void *src, size_t len ); | ||
extern void arm64_memmove_backwards ( void *dest, const void *src, size_t len ); | ||
extern void arm64_memmove ( void *dest, const void *src, size_t len ); | ||
|
||
/**
 * Fill memory region
 *
 * @v dest		Destination region
 * @v character		Fill character
 * @v len		Length
 * @ret dest		Destination region
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int character, size_t len ) {

	if ( __builtin_constant_p ( character ) && ( character == 0 ) ) {
		/* Zeroing: for small, constant lengths let gcc
		 * generate inline "stX xzr" instructions; otherwise
		 * use the optimised variable-length zeroing code.
		 */
		if ( __builtin_constant_p ( len ) && ( len <= 64 ) ) {
			__builtin_memset ( dest, 0, len );
		} else {
			arm64_bzero ( dest, len );
		}
	} else {
		/* Not necessarily zeroing: use basic variable-length
		 * code.
		 */
		arm64_memset ( dest, len, character );
	}
	return dest;
}
|
||
/**
 * Copy memory region
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 * @ret dest		Destination region
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {

	/* For lengths that are not build-time constants (or are too
	 * long to be worth inlining), use the variable-length copy
	 * routine.
	 */
	if ( ! ( __builtin_constant_p ( len ) && ( len <= 64 ) ) ) {
		arm64_memcpy ( dest, src, len );
		return dest;
	}

	/* For small, constant lengths, allow gcc to generate inline
	 * "ldX"/"stX" instructions.
	 */
	__builtin_memcpy ( dest, src, len );
	return dest;
}
|
||
/**
 * Copy (possibly overlapping) memory region
 *
 * @v dest		Destination region
 * @v src		Source region
 * @v len		Length
 * @ret dest		Destination region
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t offset = ( dest - src );

	/* If the direction of copy cannot be determined at build
	 * time, use the ambidirectional copy routine.
	 */
	if ( ! __builtin_constant_p ( offset ) ) {
		arm64_memmove ( dest, src, len );
		return dest;
	}

	/* Direction is known at build time: call the appropriate
	 * forwards/backwards copy directly.  A backwards copy is
	 * needed only when the destination lies above the source.
	 */
	if ( offset > 0 ) {
		arm64_memmove_backwards ( dest, src, len );
	} else {
		arm64_memmove_forwards ( dest, src, len );
	}
	return dest;
}
|
||
#endif /* BITS_STRING_H */ |