diff --git a/include/lib.h b/include/lib.h index db9921f..55afbed 100644 --- a/include/lib.h +++ b/include/lib.h @@ -3,6 +3,9 @@ #define listen pm_listen #define sleep ksleep #define wakeup kwakeup +#ifdef strtod +#undef strtod +#endif #define strtod fmtstrtod /* conflicts on some os's */ @@ -27,7 +30,7 @@ typedef unsigned int p9_ulong; typedef int p9_long; typedef signed char p9_schar; typedef unsigned short p9_ushort; -typedef unsigned short Rune; +typedef unsigned int Rune; typedef unsigned int p9_u32int; typedef p9_u32int mpdigit; @@ -50,10 +53,12 @@ typedef p9_u32int mpdigit; enum { - UTFmax = 3, /* maximum bytes per rune */ + UTFmax = 4, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80 /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* 21-bit rune */ + Runemask = 0x1FFFFF, /* bits used by runes (see grep) */ }; /* diff --git a/libc/rune.c b/libc/rune.c index b62da9e..cccd3a5 100644 --- a/libc/rune.c +++ b/libc/rune.c @@ -1,26 +1,24 @@ #include #include +#define Bit(i) (7-(i)) +/* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */ +#define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) +/* 0000 0000 0000 0111 1111 1111 */ +#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) + enum { - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - Bit4 = 3, + Bitx = Bit(1), - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + Tx = T(1), /* 1000 0000 */ + Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Maskx = (1< T1 + * 00080-007FF => T2 Tx + * 00800-0FFFF => T3 Tx Tx + * 10000-10FFFF => T4 Tx Tx Tx */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; + + c[0] = *(uchar*)(str); + if(c[0] < Tx){ + *rune = c[0]; return 1; } + l = c[0]; - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) + for(i = 1; i < UTFmax; i++) { + c[i] = *(uchar*)(str+i); + c[i] ^= Tx; + if(c[i] & Testx) goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; - } - - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - *rune = l; - return 3; + l = (l << Bitx) | c[i]; + if(c[0] < T(i + 2)) { + l &= RuneX(i + 1); + if(i == 1) { + if(c[0] < T(2) || l <= Rune1) + goto bad; + } else if(l <= RuneX(i) || l > Runemax) + goto bad; + if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) + goto bad; + *rune = l; + return i + 1; + } } /* @@ -84,12 +75,9 @@ bad: int runetochar(char *str, Rune *rune) { - long c; + int i, j; + Rune c; - /* - * one character sequence - * 00000-0007F => 00-7F - */ c = *rune; if(c <= Rune1) { str[0] = c; @@ -97,23 +85,35 @@ runetochar(char *str, Rune *rune) } /* + * one character sequence + * 00000-0007F => 00-7F * two character sequence * 0080-07FF => T2 Tx - */ - if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); - str[1] = Tx | (c & Maskx); - return 2; - } - - /* * three character sequence * 0800-FFFF => T3 Tx Tx + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + * If the Rune is out of range or a surrogate half, + * convert it to the error rune. + * Do this test when i==3 because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + for(i = 2; i < UTFmax + 1; i++){ + if(i == 3){ + if(c > Runemax) + c = Runeerror; + if(SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; + } + if (c <= RuneX(i) || i == UTFmax ) { + str[0] = T(i) | (c >> (i - 1)*Bitx); + for(j = 1; j < i; j++) + str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); + return i; + } + } + return UTFmax; } int @@ -129,18 +129,21 @@ runelen(long c) int runenlen(Rune *r, int nrune) { - int nb, c; + int nb, i; + Rune c; nb = 0; while(nrune--) { c = *r++; - if(c <= Rune1) + if(c <= Rune1){ nb++; - else - if(c <= Rune2) - nb += 2; - else - nb += 3; + } else { + for(i = 2; i < UTFmax + 1; i++) + if(c <= RuneX(i) || i == UTFmax){ + nb += i; + break; + } + } } return nb; } @@ -148,15 +151,16 @@ runenlen(Rune *r, int nrune) int fullrune(char *str, int n) { - int c; + int i; + Rune c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + for(i = 3; i < UTFmax + 1; i++) + if(c < T(i)) + return n >= i - 1; + return n >= UTFmax; } diff --git a/libc/utf.h b/libc/utf.h index 623bfda..8b6c014 100644 --- a/libc/utf.h +++ b/libc/utf.h @@ -1,14 +1,16 @@ #ifndef _UTFH_ #define _UTFH_ 1 -typedef unsigned short Rune; /* 16 bits */ +typedef unsigned int Rune; /* 32 bits */ enum { - UTFmax = 3, /* maximum bytes per rune */ + UTFmax = 4, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* 21-bit rune */ + Runemask = 0x1FFFFF, /* bits used by runes (see grep) */ }; /*