/* -------------------------------------------------------------------------- * Operations on Chars. * * Extended to Unicode by Dimitry Golubovsky . * * The Hugs 98 system is Copyright (c) Mark P Jones, Alastair Reid, the * Yale Haskell Group, and the OGI School of Science & Engineering at OHSU, * 1994-2005, All rights reserved. It is distributed as free software under * the license in the file "License", which is included in the distribution. * ------------------------------------------------------------------------*/ #include "prelude.h" #include "storage.h" #include "connect.h" #include "errors.h" #include "char.h" #include /* -------------------------------------------------------------------------- * Unicode character properties (cf http://www.unicode.org/ucd/) * ------------------------------------------------------------------------*/ /* Unicode general categories, listed in the same order as in the Unicode * standard -- this must be the same order as in Data.Char. */ enum { GENCAT_Lu, /* Letter, Uppercase */ GENCAT_Ll, /* Letter, Lowercase */ GENCAT_Lt, /* Letter, Titlecase */ GENCAT_Lm, /* Letter, Modifier */ GENCAT_Lo, /* Letter, Other */ GENCAT_Mn, /* Mark, Non-Spacing */ GENCAT_Mc, /* Mark, Spacing Combining */ GENCAT_Me, /* Mark, Enclosing */ GENCAT_Nd, /* Number, Decimal */ GENCAT_Nl, /* Number, Letter */ GENCAT_No, /* Number, Other */ GENCAT_Pc, /* Punctuation, Connector */ GENCAT_Pd, /* Punctuation, Dash */ GENCAT_Ps, /* Punctuation, Open */ GENCAT_Pe, /* Punctuation, Close */ GENCAT_Pi, /* Punctuation, Initial quote */ GENCAT_Pf, /* Punctuation, Final quote */ GENCAT_Po, /* Punctuation, Other */ GENCAT_Sm, /* Symbol, Math */ GENCAT_Sc, /* Symbol, Currency */ GENCAT_Sk, /* Symbol, Modifier */ GENCAT_So, /* Symbol, Other */ GENCAT_Zs, /* Separator, Space */ GENCAT_Zl, /* Separator, Line */ GENCAT_Zp, /* Separator, Paragraph */ GENCAT_Cc, /* Other, Control */ GENCAT_Cf, /* Other, Format */ GENCAT_Cs, /* Other, Surrogate */ GENCAT_Co, /* Other, Private Use */ GENCAT_Cn /* Other, Not Assigned */ }; #if UNICODE_CHARS /* properties of a Unicode character */ struct CharProperties { int category; /* Unicode general category */ int upper_offset; /* offset of the result of toUpper */ int lower_offset; /* offset of the result of toUpper */ int title_offset; /* offset of the result of toUpper */ }; /* A contiguous block of characters with the same properties */ struct CharBlock { Char blk_start; /* first character in the block */ Char blk_length; /* number of characters in the block */ const struct CharProperties *blk_properties; /* properties shared by these characters, */ /* and possibly by other blocks too. */ }; /* property table automatically generated from UnicodeData file */ #include "unitable.c" /* properties for an character not covered by the table */ static const struct CharProperties null_properties = { GENCAT_Cn, 0, 0, 0 }; #endif /* -------------------------------------------------------------------------- * Local function prototypes: * ------------------------------------------------------------------------*/ static Void local initConsCharTable Args((Void)); static Void local markConsCharTable Args((Void)); #if UNICODE_CHARS static Void local freeConsCharTable Args((Void)); static const struct CharProperties * local get_properties Args((Char)); #endif #if CHAR_ENCODING_UTF8 static Int local utfseqlen Args((Int)); static Int local utfcodelen Args((Char)); #endif /* -------------------------------------------------------------------------- * Character set handling: * * Hugs follows Haskell 1.3 in assuming that input uses the ISO-8859-1 * (Latin-1) character set. The following code provides methods for * classifying input characters according to the lexical structure * specified by the report. Hugs should still accept older programs * because ASCII is just a subset of the Latin-1 character set. * ------------------------------------------------------------------------*/ Bool charTabBuilt; unsigned char charTable[NUM_LAT1_CHARS]; Void initCharTab() { /* Initialize char decode table */ #define setRange(x,f,t) {Int i=f; while (i<=t) charTable[i++] |=x;} #define setChar(x,c) charTable[c] |= (x) #define setChars(x,s) {char *p=s; while (*p) charTable[(Int)*p++]|=x;} #define setCopy(x,c) {Int i; \ for (i=0; i?@\\^|-~"); setChar (IDAFTER, '\''); /* Characters in identifier */ setCopy (IDAFTER, (DIGIT|SMALL|LARGE)); setChar (SPACE, ' '); /* ASCII space character */ setChar (SPACE, 160); /* Latin-1 non breaking space */ setRange(SPACE, 9,13); /* special whitespace: \t\n\v\f\r */ setRange(PRINT, 32,126); setRange(PRINT, 160,172); setRange(PRINT, 174,255); /* (omits soft hyphen) */ #undef setRange #undef setChar #undef setChars #undef setCopy charTabBuilt = TRUE; } /* -------------------------------------------------------------------------- * Char primitives. * ------------------------------------------------------------------------*/ #if UNICODE_CHARS #define UPPER_MASK ((1<category==GENCAT_Ll; } Bool isUpper(Char c) { return isLatin1(c) ? isUpperLat1(c) : ((1<category)&UPPER_MASK)!=0; } Bool isAlpha(Char c) { return isLatin1(c) ? isAlphaLat1(c) : (get_properties(c)->category<=GENCAT_Lo); } Bool isAlphaNum(Char c) { return isLatin1(c) ? isAlphaNumLat1(c) : get_properties(c)->category<=GENCAT_No; } Bool isPrint(Char c) { return isLatin1(c) ? isPrintLat1(c) : get_properties(c)->category<=GENCAT_Zs; } Char toUpper(Char c) { return c + get_properties(c)->upper_offset; } Char toLower(Char c) { return c + get_properties(c)->lower_offset; } Char toTitle(Char c) { return c + get_properties(c)->title_offset; } Int uni_gencat(Char c) { return get_properties(c)->category; } /* binary search of the properties table */ static const struct CharProperties * local get_properties(Char c) { Int lo, hi, mid; lo = 0; hi = NUM_BLOCKS-1; while (lo!=hi) { /* i <= lo => char_block[i].blk_start <= c */ /* i > hi => char_block[i].blk_start > c */ mid = (lo+hi+1)/2; /* lo < mid <= hi */ if (char_block[mid].blk_start<=c) lo = mid; else hi = mid-1; } /* i <= lo => char_block[i].blk_start <= c */ /* i > lo => char_block[i].blk_start > c */ return c' */ /* '?' */ GENCAT_Sm, GENCAT_Sm, GENCAT_Sm, GENCAT_Po, /* '@' */ /* 'A' */ /* 'B' */ /* 'C' */ GENCAT_Po, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* 'D' */ /* 'E' */ /* 'F' */ /* 'G' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* 'H' */ /* 'I' */ /* 'J' */ /* 'K' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* 'L' */ /* 'M' */ /* 'N' */ /* 'O' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* 'P' */ /* 'Q' */ /* 'R' */ /* 'S' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* 'T' */ /* 'U' */ /* 'V' */ /* 'W' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* 'X' */ /* 'Y' */ /* 'Z' */ /* '[' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Ps, /* '\\' */ /* ']' */ /* '^' */ /* '_' */ GENCAT_Po, GENCAT_Pe, GENCAT_Sk, GENCAT_Pc, /* '`' */ /* 'a' */ /* 'b' */ /* 'c' */ GENCAT_Sk, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* 'd' */ /* 'e' */ /* 'f' */ /* 'g' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* 'h' */ /* 'i' */ /* 'j' */ /* 'k' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* 'l' */ /* 'm' */ /* 'n' */ /* 'o' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* 'p' */ /* 'q' */ /* 'r' */ /* 's' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* 't' */ /* 'u' */ /* 'v' */ /* 'w' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* 'x' */ /* 'y' */ /* 'z' */ /* '{' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ps, /* '|' */ /* '}' */ /* '~' */ /* '\DEL' */ GENCAT_Sm, GENCAT_Pe, GENCAT_Sm, GENCAT_Cc, /* '\128' */ /* '\129' */ /* '\130' */ /* '\131' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\132' */ /* '\133' */ /* '\134' */ /* '\135' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\136' */ /* '\137' */ /* '\138' */ /* '\139' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\140' */ /* '\141' */ /* '\142' */ /* '\143' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\144' */ /* '\145' */ /* '\146' */ /* '\147' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\148' */ /* '\149' */ /* '\150' */ /* '\151' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\152' */ /* '\153' */ /* '\154' */ /* '\155' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\156' */ /* '\157' */ /* '\158' */ /* '\159' */ GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, GENCAT_Cc, /* '\160' */ /* '\161' */ /* '\162' */ /* '\163' */ GENCAT_Zs, GENCAT_Po, GENCAT_Sc, GENCAT_Sc, /* '\164' */ /* '\165' */ /* '\166' */ /* '\167' */ GENCAT_Sc, GENCAT_Sc, GENCAT_So, GENCAT_So, /* '\168' */ /* '\169' */ /* '\170' */ /* '\171' */ GENCAT_Sk, GENCAT_So, GENCAT_Ll, GENCAT_Pi, /* '\172' */ /* '\173' */ /* '\174' */ /* '\175' */ GENCAT_Sm, GENCAT_Cf, GENCAT_So, GENCAT_Sk, /* '\176' */ /* '\177' */ /* '\178' */ /* '\179' */ GENCAT_So, GENCAT_Sm, GENCAT_No, GENCAT_No, /* '\180' */ /* '\181' */ /* '\182' */ /* '\183' */ GENCAT_Sk, GENCAT_Ll, GENCAT_So, GENCAT_Po, /* '\184' */ /* '\185' */ /* '\186' */ /* '\187' */ GENCAT_Sk, GENCAT_No, GENCAT_Ll, GENCAT_Pf, /* '\188' */ /* '\189' */ /* '\190' */ /* '\191' */ GENCAT_No, GENCAT_No, GENCAT_No, GENCAT_Po, /* '\192' */ /* '\193' */ /* '\194' */ /* '\195' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* '\196' */ /* '\197' */ /* '\198' */ /* '\199' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* '\200' */ /* '\201' */ /* '\202' */ /* '\203' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* '\204' */ /* '\205' */ /* '\206' */ /* '\207' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* '\208' */ /* '\209' */ /* '\210' */ /* '\211' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* '\212' */ /* '\213' */ /* '\214' */ /* '\215' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Sm, /* '\216' */ /* '\217' */ /* '\218' */ /* '\219' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, /* '\220' */ /* '\221' */ /* '\222' */ /* '\223' */ GENCAT_Lu, GENCAT_Lu, GENCAT_Lu, GENCAT_Ll, /* '\224' */ /* '\225' */ /* '\226' */ /* '\227' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* '\228' */ /* '\229' */ /* '\230' */ /* '\231' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* '\232' */ /* '\233' */ /* '\234' */ /* '\235' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* '\236' */ /* '\237' */ /* '\238' */ /* '\239' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* '\240' */ /* '\241' */ /* '\242' */ /* '\243' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* '\244' */ /* '\245' */ /* '\246' */ /* '\247' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Sm, /* '\248' */ /* '\249' */ /* '\250' */ /* '\251' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, /* '\252' */ /* '\253' */ /* '\254' */ /* '\255' */ GENCAT_Ll, GENCAT_Ll, GENCAT_Ll, GENCAT_Ll }; Int uni_gencat(Char c) { return char_category[c]; } #endif /* -------------------------------------------------------------------------- * Multibyte character incodings. * ------------------------------------------------------------------------*/ #if CHAR_ENCODING int fputc_mb(Char c, FILE *f) { char buf[MAX_CHAR_ENCODING]; String s = buf; addc_mb(c, &s); return fwrite(buf, s-buf, 1, f)==1 ? c : EOF; } #endif /* CHAR_ENCODING */ #if CHAR_ENCODING_LOCALE /* Using the encoding specified by the current locale (LC_CTYPE). * Note that ISO C does not permit both byte-oriented and wchar I/O * on the same stream, so we use byte-oriented I/O, and do the conversion * ourselves using mbrtowc/wcrtomb. */ Bool charIsRepresentable(Char c) { char buf[MAX_CHAR_ENCODING]; size_t n; wchar_t wc; mbstate_t st; if (c == '\0') return TRUE; memset(&st, 0, sizeof(st)); n = wcrtomb(buf, c, &st); if (n == (size_t)(-1)) errno = 0; else { memset(&st, 0, sizeof(st)); if (mbrtowc(&wc, buf, n, &st) == n) return wc==c; } return FALSE; } /* Read a Char encoded as a multi-byte sequence. */ int fgetc_mb(FILE *f) { char buf[MAX_CHAR_ENCODING]; Int n = 0; size_t size; wchar_t wc; mbstate_t st; for (;;) { int c = fgetc(f); if (c == EOF) return n == 0 ? EOF : BAD_CHAR; buf[n++] = c; memset(&st, 0, sizeof(st)); size = mbrtowc(&wc, buf, n, &st); switch (size) { case (size_t)(-1): /* decoding error */ errno = 0; return BAD_CHAR; case (size_t)(-2): /* incomplete sequence */ if (n == MAX_CHAR_ENCODING) return BAD_CHAR; break; case 0: /* null character */ return 0; default: /* successful decoding */ if (size < n) /* If the encoding uses lookahead, we have to read extra * bytes to detect the end of an encoding. If it's * just one byte (size == n-1) we can push it back onto * the input. This won't work for encodings that need * more than 1 byte of lookahead, but I don't think * there are any. */ ungetc(c, f); return wc; } } } /* Add a Char to a multi-byte encoded string, moving the pointer. */ Void addc_mb(Char c, String *sp) { size_t size; mbstate_t st; memset(&st, 0, sizeof(st)); size = wcrtomb(*sp, c, &st); if (size == (size_t)-1) { /* encoding error */ *(*sp)++ = '?'; errno = 0; } else *sp += size; } /* Get a Char from a multi-byte encoded string, moving the pointer. */ Char extc_mb(String *sp) { wchar_t c = '\0'; size_t size; mbstate_t st; memset(&st, 0, sizeof(st)); size = mbrtowc(&c, *sp, MAX_CHAR_ENCODING, &st); switch (size) { case (size_t)(-1): /* decoding error */ errno = 0; /* fall through to ... */ case (size_t)(-2): /* incomplete sequence */ c = BAD_CHAR; (*sp)++; break; case 0: /* string starts with \0 */ (*sp)++; break; default: /* successful decoding */ *sp += size; } return c; } #elif CHAR_ENCODING_UTF8 /* * The UTF-FSS (aka UTF-8) encoding of UCS, as described in the following * quote from Ken Thompson's utf-fss.c: * * Bits Hex Min Hex Max Byte Sequence in Binary * 7 00000000 0000007f 0vvvvvvv * 11 00000080 000007FF 110vvvvv 10vvvvvv * 16 00000800 0000FFFF 1110vvvv 10vvvvvv 10vvvvvv * 21 00010000 001FFFFF 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv * 26 00200000 03FFFFFF 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv * 31 04000000 7FFFFFFF 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv * * The UCS value is just the concatenation of the v bits in the multibyte * encoding. When there are multiple ways to encode a value, for example * UCS 0, only the shortest encoding is legal. */ #define UTFhead(b) (((b)&0xC0)==0xC0) /* head of a multibyte sequence */ #define UTFtail(b) (((b)&0xC0)==0x80) /* tail of a multibyte sequence */ #define UTFmask(n) ((0xFF00>>(n))&0xFF) /* length bits of a head byte */ /* Note also that since ASCII, head and tail chars are disjoint in UTF-8, * it is possible to resynchronize a stream after a coding error, but we * don't do that. */ int fgetc_mb(FILE *f) { unsigned char buf[MAX_CHAR_ENCODING]; Int b; Char c; Int size, i; if ((b = fgetc(f))==EOF) return EOF; if (b<0x80) /* ASCII character */ return b; if (!UTFhead(b)) return BAD_CHAR; size = utfseqlen(b); c = b&~UTFmask(size); for (i=1 ; i=0; i--) { (*sp)[i] = (cn%64)|((i==0)?UTFmask(noct):0x80); cn /= 64; } *sp += noct; } } Char extc_mb(String *sp) { Char c = *(unsigned char *)*sp; if (c<0x80) (*sp)++; else if (!UTFhead(c)) c = BAD_CHAR; else { int i, size = utfseqlen(c); /* how many octets to expect */ c &= ~UTFmask(size); for (i=1; i= 0); return consCharArray[c]; } static void local initConsCharTable() { Int i; for (i=0; i