188 lines
4.3 KiB
C
Executable File
188 lines
4.3 KiB
C
Executable File
// SPDX-License-Identifier: LGPL-3.0-or-later
|
|
|
|
/**
|
|
* \file utf/rune.c
|
|
*
|
|
* Implements `char`s to `Rune` conversion functions and back.
|
|
*
|
|
* \copyright The DoubleFourteen Code Forge (C) All Rights Reserved
|
|
* \author Lorenzo Cogotti
|
|
*/
|
|
|
|
/*
|
|
* This work is derived from Plan 9 libutf, see utf/utf.h for details.
|
|
* Original license terms follow:
|
|
*
|
|
* The authors of this software are Rob Pike and Ken Thompson.
|
|
* Copyright (c) 2002 by Lucent Technologies.
|
|
* Permission to use, copy, modify, and distribute this software for any
|
|
* purpose without fee is hereby granted, provided that this entire notice
|
|
* is included in all copies of any software which is or includes a copy
|
|
* or modification of this software and in all copies of the supporting
|
|
* documentation for such software.
|
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
|
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
|
*
|
|
* The original libutf library is available at: https://9fans.github.io/plan9port/unix/libutf.tgz
|
|
*/
|
|
|
|
#include "utf/utf.h"
|
|
|
|
// Number of payload bits carried by each kind of UTF-8 byte:
// BIT1 for a single-byte sequence, BITX for a continuation byte,
// BITn for the lead byte of an n-byte sequence.
#define BIT1 7
|
|
#define BITX 6
|
|
#define BIT2 5
|
|
#define BIT3 4
|
|
#define BIT4 3
|
|
#define BIT5 2
|
|
|
|
// Fixed high-bit tags: Tn marks the lead byte of an n-byte sequence, TX marks
// a continuation byte. Each is 0xFF with its low (BIT + 1) bits cleared.
#define T1 (((1u << (BIT1+1)) - 1u) ^ 0xffu) /* 0000 0000 */
|
|
#define TX (((1u << (BITX+1)) - 1u) ^ 0xffu) /* 1000 0000 */
|
|
#define T2 (((1u << (BIT2+1)) - 1u) ^ 0xffu) /* 1100 0000 */
|
|
#define T3 (((1u << (BIT3+1)) - 1u) ^ 0xffu) /* 1110 0000 */
|
|
#define T4 (((1u << (BIT4+1)) - 1u) ^ 0xffu) /* 1111 0000 */
|
|
#define T5 (((1u << (BIT5+1)) - 1u) ^ 0xffu) /* 1111 1000 */
|
|
|
|
// RUNEn is the value with every payload bit of an n-byte sequence set,
// i.e. the largest value an n-byte sequence can carry.
#define RUNE1 ((U32_C(1) << (BIT1 + 0*BITX)) - 1u) /* 0000 0000 0000 0000 0111 1111 */
|
|
#define RUNE2 ((U32_C(1) << (BIT2 + 1*BITX)) - 1u) /* 0000 0000 0000 0111 1111 1111 */
|
|
#define RUNE3 ((U32_C(1) << (BIT3 + 2*BITX)) - 1u) /* 0000 0000 1111 1111 1111 1111 */
|
|
#define RUNE4 ((U32_C(1) << (BIT4 + 3*BITX)) - 1u) /* 0001 1111 1111 1111 1111 1111 */
|
|
|
|
// MASKX selects the BITX payload bits of a continuation byte;
// TESTX selects the two tag bits above them.
#define MASKX ((1u << BITX) - 1u) /* 0011 1111 */
|
|
#define TESTX (MASKX ^ 0xFFu) /* 1100 0000 */
|
|
|
|
/**
 * Decode the UTF-8 sequence starting at `str` into `*dest`.
 *
 * Bytes are examined lazily: a trailing byte is only read once every byte
 * before it has been accepted. A NUL byte never passes the continuation
 * test, so decoding stops at the terminator of a NUL-terminated string.
 *
 * Overlong encodings, malformed continuation bytes, invalid lead bytes and
 * values above MAXRUNE are all treated as decoding errors.
 *
 * NOTE(review): values in the surrogate range (U+D800..U+DFFF) are accepted
 * by the three byte path; confirm whether callers rely on that.
 *
 * \param dest Receives the decoded rune, or RUNE_ERR on a decoding error.
 * \param str  Bytes to decode.
 *
 * \return Number of bytes consumed from `str`: 1 to 4 on success,
 *         1 on a decoding error.
 */
size_t chartorune(Rune *dest, const char *str)
{
    const Uint8 *p = (const Uint8 *) str;

    // Single byte
    // 00000-0007F => T1
    Rune lead = p[0];
    if (lead < TX) {
        *dest = lead;
        return 1;
    }

    // Every longer form needs a continuation byte right after the lead.
    // XOR with TX strips the expected tag; leftover tag bits mean a bad byte.
    Rune t1 = p[1] ^ TX;
    if ((t1 & TESTX) != 0)
        goto invalid;

    // Two bytes
    // 0080-07FF => T2 Tx
    if (lead < T3) {
        Rune cp = ((lead << BITX) | t1) & RUNE2;

        // lead < T2 is a stray continuation byte, cp <= RUNE1 is overlong.
        if (lead >= T2 && cp > RUNE1) {
            *dest = cp;
            return 2;
        }
        goto invalid;
    }

    // Three bytes
    // 0800-FFFF => T3 Tx Tx
    Rune t2 = p[2] ^ TX;
    if ((t2 & TESTX) != 0)
        goto invalid;

    if (lead < T4) {
        Rune cp = ((lead << 2*BITX) | (t1 << 1*BITX) | t2) & RUNE3;

        // Anything that would have fit in two bytes is overlong.
        if (cp > RUNE2) {
            *dest = cp;
            return 3;
        }
        goto invalid;
    }

    // Four bytes
    // 10000-10FFFF => T4 Tx Tx Tx
    Rune t3 = p[3] ^ TX;
    if ((t3 & TESTX) != 0)
        goto invalid;

    if (lead < T5) {
        Rune cp = ((lead << 3*BITX) | (t1 << 2*BITX) | (t2 << 1*BITX) | t3) & RUNE4;

        // Reject overlong forms and values beyond the largest valid rune.
        if (cp > RUNE3 && cp <= MAXRUNE) {
            *dest = cp;
            return 4;
        }
    }

    // Lead bytes >= T5 and every rejected sequence end up here.
invalid:
    *dest = RUNE_ERR;
    return 1;
}
|
|
|
|
/**
 * Encode rune `r` as UTF-8 into `dest`.
 *
 * Runes above MAXRUNE are replaced by RUNE_ERR before being encoded.
 * No terminator is written.
 *
 * \param dest Output buffer; up to 4 bytes are written.
 * \param r    Rune to encode.
 *
 * \return Number of bytes written to `dest` (1 to 4).
 */
size_t runetochar(char *dest, Rune r)
{
    // 00000-0007F => 00-7F, the rune is its own encoding
    if (r <= RUNE1) {
        dest[0] = r;
        return 1;
    }

    // Pick the sequence length and the matching lead byte tag.
    size_t   len;
    unsigned tag;

    if (r <= RUNE2) {
        // 00080-007FF => T2 Tx
        len = 2;
        tag = T2;
    } else {
        if (r > MAXRUNE)
            r = RUNE_ERR;

        if (r <= RUNE3) {
            // 00800-0FFFF => T3 Tx Tx
            len = 3;
            tag = T3;
        } else {
            // 010000-1FFFFF => T4 Tx Tx Tx
            len = 4;
            tag = T4;
        }
    }

    // Emit continuation bytes from last to first, peeling BITX bits each time;
    // whatever is left afterwards belongs to the lead byte.
    for (size_t i = len - 1; i > 0; i--) {
        dest[i] = TX | (r & MASKX);
        r >>= BITX;
    }
    dest[0] = tag | r;

    return len;
}
|
|
|
|
/**
 * Compute how many bytes are needed to encode rune `r`.
 *
 * Runes above MAXRUNE count as 3 bytes.
 *
 * \param r Rune to measure.
 *
 * \return Encoded length in bytes (1 to 4).
 */
size_t runelen(Rune r)
{
    if (r <= RUNE1)
        return 1;
    if (r <= RUNE2)
        return 2;
    if (r <= RUNE3 || r > MAXRUNE)
        return 3;

    return 4;
}
|
|
|
|
/**
 * Compute the total number of bytes needed to encode the first `n` runes
 * of array `r`.
 *
 * \param r Array of runes to measure.
 * \param n Number of runes to take from `r`.
 *
 * \return Sum of `runelen()` over the `n` runes; 0 when `n` is 0.
 */
size_t runenlen(const Rune *r, size_t n)
{
    size_t total = 0;

    for (size_t i = 0; i < n; i++)
        total += runelen(r[i]);

    return total;
}
|
|
|
|
Boolean fullrune(const char *str, size_t n)
|
|
{
|
|
if (n == 0)
|
|
return FALSE;
|
|
|
|
Rune c = *(const Uint8 *) str;
|
|
if (c < TX)
|
|
return TRUE;
|
|
if (c < T3)
|
|
return n >= 2;
|
|
if (c < T4)
|
|
return n >= 3;
|
|
|
|
return n >= 4;
|
|
}
|