Changeset 2103 for libswish3/trunk/src/libswish3/utf8.c
- Timestamp:
- 03/27/08 23:35:21 (10 months ago)
- Files:
-
- libswish3/trunk/src/libswish3/utf8.c (modified) (25 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/libswish3/utf8.c
r1913 r2103 15 15 16 16 /* convert UTF-8 data to wide character */ 17 static int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz); 17 static int u8_toucs( 18 u_int32_t * dest, 19 int sz, 20 char *src, 21 int srcsz 22 ); 18 23 19 24 /* the opposite conversion */ 20 static int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz); 25 static int u8_toutf8( 26 char *dest, 27 int sz, 28 u_int32_t * src, 29 int srcsz 30 ); 21 31 22 32 /* single character to UTF-8 */ 23 static int u8_wc_toutf8(char *dest, u_int32_t ch); 33 static int u8_wc_toutf8( 34 char *dest, 35 u_int32_t ch 36 ); 24 37 25 38 /* character number to byte offset */ 26 static int u8_offset(char *str, int charnum); 39 static int u8_offset( 40 char *str, 41 int charnum 42 ); 27 43 28 44 /* byte offset to character number */ 29 static int u8_charnum(char *s, int offset); 45 static int u8_charnum( 46 char *s, 47 int offset 48 ); 30 49 31 50 /* return next character, updating an index variable */ 32 static u_int32_t u8_nextchar(char *s, int *i); 51 static u_int32_t u8_nextchar( 52 char *s, 53 int *i 54 ); 33 55 34 56 /* move to next character */ 35 static void u8_inc(char *s, int *i); 57 static void u8_inc( 58 char *s, 59 int *i 60 ); 36 61 37 62 /* move to previous character */ 38 static void u8_dec(char *s, int *i); 63 static void u8_dec( 64 char *s, 65 int *i 66 ); 39 67 40 68 /* returns length of next utf-8 sequence */ 41 static int u8_seqlen(char *s); 69 static int u8_seqlen( 70 char *s 71 ); 42 72 43 73 /* assuming src points to the character after a backslash, read an 44 74 escape sequence, storing the result in dest and returning the number of 45 75 input characters processed */ 46 static int u8_read_escape_sequence(char *src, u_int32_t *dest); 76 static int u8_read_escape_sequence( 77 char *src, 78 u_int32_t * dest 79 ); 47 80 48 81 /* given a wide character, convert it to an ASCII escape sequence stored in 49 82 buf, where buf is "sz" bytes. returns the number of characters output. */ 50 static int u8_escape_wchar(char *buf, int sz, u_int32_t ch); 83 static int u8_escape_wchar( 84 char *buf, 85 int sz, 86 u_int32_t ch 87 ); 51 88 52 89 /* convert a string "src" containing escape sequences to UTF-8 */ 53 static int u8_unescape(char *buf, int sz, char *src); 90 static int u8_unescape( 91 char *buf, 92 int sz, 93 char *src 94 ); 54 95 55 96 /* convert UTF-8 "src" to ASCII with escape sequences. 56 97 if escape_quotes is nonzero, quote characters will be preceded by 57 98 backslashes as well. */ 58 static int u8_escape(char *buf, int sz, char *src, int escape_quotes); 99 static int u8_escape( 100 char *buf, 101 int sz, 102 char *src, 103 int escape_quotes 104 ); 59 105 60 106 /* utility predicates used by the above */ 61 static int octal_digit(char c); 62 static int hex_digit(char c); 107 static int octal_digit( 108 char c 109 ); 110 static int hex_digit( 111 char c 112 ); 63 113 64 114 /* return a pointer to the first occurrence of ch in s, or NULL if not 65 115 found. character index of found character returned in *charn. */ 66 static char *u8_strchr(char *s, u_int32_t ch, int *charn); 116 static char *u8_strchr( 117 char *s, 118 u_int32_t ch, 119 int *charn 120 ); 67 121 68 122 /* same as the above, but searches a buffer of a given size instead of 69 123 a NUL-terminated string. */ 70 static char *u8_memchr(char *s, u_int32_t ch, size_t sz, int *charn); 124 static char *u8_memchr( 125 char *s, 126 u_int32_t ch, 127 size_t sz, 128 int *charn 129 ); 71 130 72 131 /* count the number of characters in a UTF-8 string */ 73 static int u8_strlen(char *s); 74 75 static int u8_is_locale_utf8(char *locale); 132 static int u8_strlen( 133 char *s 134 ); 135 136 static int u8_is_locale_utf8( 137 char *locale 138 ); 76 139 77 140 /* printf where the format string and arguments may be in UTF-8. 78 141 you can avoid this function and just use ordinary printf() if the current 79 142 locale is UTF-8. */ 80 static int u8_vprintf(char *fmt, va_list ap); 81 static int u8_printf(char *fmt, ...); 82 143 static int u8_vprintf( 144 char *fmt, 145 va_list ap 146 ); 147 static int u8_printf( 148 char *fmt, 149 ... 150 ); 83 151 84 152 /* http://cprogramming.com/tutorial/utf8.c */ 85 86 153 87 154 /* … … 108 175 #endif 109 176 110 111 177 static const u_int32_t offsetsFromUTF8[6] = { 112 178 0x00000000UL, 0x00003080UL, 0x000E2080UL, … … 115 181 116 182 static const char trailingBytesForUTF8[256] = { 117 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 118 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 119 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 120 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 121 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 122 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 123 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 124 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 184 0, 0, 0, 0, 0, 0, 0, 185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 186 0, 0, 0, 0, 0, 0, 0, 187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 188 0, 0, 0, 0, 0, 0, 0, 189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 190 0, 0, 0, 0, 0, 0, 0, 191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 192 0, 0, 0, 0, 0, 0, 0, 193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 194 0, 0, 0, 0, 0, 0, 0, 195 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 196 1, 1, 1, 1, 1, 1, 1, 197 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 198 4, 4, 4, 5, 5, 5, 5 125 199 }; 126 200 127 201 /* returns length of next utf-8 sequence */ 128 static int u8_seqlen(char *s) 202 static int 203 u8_seqlen( 204 char *s 205 ) 129 206 { 130 207 return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1; … … 141 218 if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. 142 219 */ 143 static int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz) 220 static int 221 u8_toucs( 222 u_int32_t * dest, 223 int sz, 224 char *src, 225 int srcsz 226 ) 144 227 { 145 228 u_int32_t ch; 146 229 char *src_end = src + srcsz; 147 230 int nb; 148 int i =0;149 150 while (i < sz -1) {231 int i = 0; 232 233 while (i < sz - 1) { 151 234 nb = trailingBytesForUTF8[(unsigned char)*src]; 152 235 if (srcsz == -1) { … … 160 243 ch = 0; 161 244 switch (nb) { 162 /* these fall through deliberately */ 163 case 3: ch += (unsigned char)*src++; ch <<= 6; 164 case 2: ch += (unsigned char)*src++; ch <<= 6; 165 case 1: ch += (unsigned char)*src++; ch <<= 6; 166 case 0: ch += (unsigned char)*src++; 245 /* 246 these fall through deliberately 247 */ 248 case 3: 249 ch += (unsigned char)*src++; 250 ch <<= 6; 251 case 2: 252 ch += (unsigned char)*src++; 253 ch <<= 6; 254 case 1: 255 ch += (unsigned char)*src++; 256 ch <<= 6; 257 case 0: 258 ch += (unsigned char)*src++; 167 259 } 168 260 ch -= offsetsFromUTF8[nb]; 169 261 dest[i++] = ch; 170 262 } 171 done_toucs:263 done_toucs: 172 264 dest[i] = 0; 173 265 return i; … … 186 278 the destination string will never be bigger than the source string. 187 279 */ 188 static int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) 280 static int 281 u8_toutf8( 282 char *dest, 283 int sz, 284 u_int32_t * src, 285 int srcsz 286 ) 189 287 { 190 288 u_int32_t ch; … … 192 290 char *dest_end = dest + sz; 193 291 194 while (srcsz <0 ? src[i]!=0 : i < srcsz) {292 while (srcsz < 0 ? src[i] != 0 : i < srcsz) { 195 293 ch = src[i]; 196 294 if (ch < 0x80) { … … 200 298 } 201 299 else if (ch < 0x800) { 202 if (dest >= dest_end -1)300 if (dest >= dest_end - 1) 203 301 return i; 204 *dest++ = (ch >>6) | 0xC0;302 *dest++ = (ch >> 6) | 0xC0; 205 303 *dest++ = (ch & 0x3F) | 0x80; 206 304 } 207 305 else if (ch < 0x10000) { 208 if (dest >= dest_end -2)306 if (dest >= dest_end - 2) 209 307 return i; 210 *dest++ = (ch >>12) | 0xE0;211 *dest++ = ((ch >>6) & 0x3F) | 0x80;308 *dest++ = (ch >> 12) | 0xE0; 309 *dest++ = ((ch >> 6) & 0x3F) | 0x80; 212 310 *dest++ = (ch & 0x3F) | 0x80; 213 311 } 214 312 else if (ch < 0x110000) { 215 if (dest >= dest_end -3)313 if (dest >= dest_end - 3) 216 314 return i; 217 *dest++ = (ch >>18) | 0xF0;218 *dest++ = ((ch >>12) & 0x3F) | 0x80;219 *dest++ = ((ch >>6) & 0x3F) | 0x80;315 *dest++ = (ch >> 18) | 0xF0; 316 *dest++ = ((ch >> 12) & 0x3F) | 0x80; 317 *dest++ = ((ch >> 6) & 0x3F) | 0x80; 220 318 *dest++ = (ch & 0x3F) | 0x80; 221 319 } … … 227 325 } 228 326 229 static int u8_wc_toutf8(char *dest, u_int32_t ch) 327 static int 328 u8_wc_toutf8( 329 char *dest, 330 u_int32_t ch 331 ) 230 332 { 231 333 if (ch < 0x80) { … … 234 336 } 235 337 if (ch < 0x800) { 236 dest[0] = (ch >>6) | 0xC0;338 dest[0] = (ch >> 6) | 0xC0; 237 339 dest[1] = (ch & 0x3F) | 0x80; 238 340 return 2; 239 341 } 240 342 if (ch < 0x10000) { 241 dest[0] = (ch >>12) | 0xE0;242 dest[1] = ((ch >>6) & 0x3F) | 0x80;343 dest[0] = (ch >> 12) | 0xE0; 344 dest[1] = ((ch >> 6) & 0x3F) | 0x80; 243 345 dest[2] = (ch & 0x3F) | 0x80; 244 346 return 3; 245 347 } 246 348 if (ch < 0x110000) { 247 dest[0] = (ch >>18) | 0xF0;248 dest[1] = ((ch >>12) & 0x3F) | 0x80;249 dest[2] = ((ch >>6) & 0x3F) | 0x80;349 dest[0] = (ch >> 18) | 0xF0; 350 dest[1] = ((ch >> 12) & 0x3F) | 0x80; 351 dest[2] = ((ch >> 6) & 0x3F) | 0x80; 250 352 dest[3] = (ch & 0x3F) | 0x80; 251 353 return 4; … … 255 357 256 358 /* charnum => byte offset */ 257 static int u8_offset(char *str, int charnum) 258 { 259 int offs=0; 359 static int 360 u8_offset( 361 char *str, 362 int charnum 363 ) 364 { 365 int offs = 0; 260 366 261 367 while (charnum > 0 && str[offs]) { 262 (void)(isutf(str[++offs]) || isutf(str[++offs]) || 263 isutf(str[++offs])|| ++offs);368 (void)(isutf(str[++offs]) || isutf(str[++offs]) || isutf(str[++offs]) 369 || ++offs); 264 370 charnum--; 265 371 } … … 268 374 269 375 /* byte offset => charnum */ 270 static int u8_charnum(char *s, int offset) 271 { 272 int charnum = 0, offs=0; 376 static int 377 u8_charnum( 378 char *s, 379 int offset 380 ) 381 { 382 int charnum = 0, offs = 0; 273 383 274 384 while (offs < offset && s[offs]) { 275 (void)(isutf(s[++offs]) || isutf(s[++offs]) || 276 isutf(s[++offs])|| ++offs);385 (void)(isutf(s[++offs]) || isutf(s[++offs]) || isutf(s[++offs]) 386 || ++offs); 277 387 charnum++; 278 388 } … … 281 391 282 392 /* number of characters */ 283 static int u8_strlen(char *s) 393 static int 394 u8_strlen( 395 char *s 396 ) 284 397 { 285 398 int count = 0; … … 293 406 294 407 /* reads the next utf-8 sequence out of a string, updating an index */ 295 static u_int32_t u8_nextchar(char *s, int *i) 408 static u_int32_t 409 u8_nextchar( 410 char *s, 411 int *i 412 ) 296 413 { 297 414 u_int32_t ch = 0; … … 303 420 sz++; 304 421 } while (s[*i] && !isutf(s[*i])); 305 ch -= offsetsFromUTF8[sz -1];422 ch -= offsetsFromUTF8[sz - 1]; 306 423 307 424 return ch; 308 425 } 309 426 310 static void u8_inc(char *s, int *i) 311 { 312 (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || 313 isutf(s[++(*i)]) || ++(*i)); 314 } 315 316 static void u8_dec(char *s, int *i) 317 { 318 (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || 319 isutf(s[--(*i)]) || --(*i)); 320 } 321 322 static int octal_digit(char c) 427 static void 428 u8_inc( 429 char *s, 430 int *i 431 ) 432 { 433 (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || isutf(s[++(*i)]) || ++(*i)); 434 } 435 436 static void 437 u8_dec( 438 char *s, 439 int *i 440 ) 441 { 442 (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || isutf(s[--(*i)]) || --(*i)); 443 } 444 445 static int 446 octal_digit( 447 char c 448 ) 323 449 { 324 450 return (c >= '0' && c <= '7'); 325 451 } 326 452 327 static int hex_digit(char c) 328 { 329 return ((c >= '0' && c <= '9') || 330 (c >= 'A' && c <= 'F') || 331 (c >= 'a' && c <= 'f')); 453 static int 454 hex_digit( 455 char c 456 ) 457 { 458 return ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') 459 || (c >= 'a' && c <= 'f')); 332 460 } 333 461 334 462 /* assumes that src points to the character after a backslash 335 463 returns number of input characters processed */ 336 static int u8_read_escape_sequence(char *str, u_int32_t *dest) 464 static int 465 u8_read_escape_sequence( 466 char *str, 467 u_int32_t * dest 468 ) 337 469 { 338 470 u_int32_t ch; 339 char digs[9] ="\0\0\0\0\0\0\0\0\0";340 int dno =0, i=1;341 342 ch = (u_int32_t) str[0]; /* take literal character */471 char digs[9] = "\0\0\0\0\0\0\0\0\0"; 472 int dno = 0, i = 1; 473 474 ch = (u_int32_t) str[0]; /* take literal character */ 343 475 if (str[0] == 'n') 344 476 ch = L'\n'; … … 391 523 example: u8_unescape(mybuf, 256, "hello\\u220e") 392 524 note the double backslash is needed if called on a C string literal */ 393 static int u8_unescape(char *buf, int sz, char *src) 394 { 395 int c=0, amt; 525 static int 526 u8_unescape( 527 char *buf, 528 int sz, 529 char *src 530 ) 531 { 532 int c = 0, amt; 396 533 u_int32_t ch; 397 534 char temp[4]; … … 403 540 } 404 541 else { 405 ch = (u_int32_t) *src;542 ch = (u_int32_t) * src; 406 543 amt = 1; 407 544 } 408 545 src += amt; 409 546 amt = u8_wc_toutf8(temp, ch); 410 if (amt > sz -c)547 if (amt > sz - c) 411 548 break; 412 549 memcpy(&buf[c], temp, amt); … … 418 555 } 419 556 420 static int u8_escape_wchar(char *buf, int sz, u_int32_t ch) 557 static int 558 u8_escape_wchar( 559 char *buf, 560 int sz, 561 u_int32_t ch 562 ) 421 563 { 422 564 if (ch == L'\n') … … 439 581 return snprintf(buf, sz, "\\x%hhX", (unsigned char)ch); 440 582 else if (ch > 0xFFFF) 441 return snprintf(buf, sz, "\\U%.8X", (u_int32_t) ch);583 return snprintf(buf, sz, "\\U%.8X", (u_int32_t) ch); 442 584 else if (ch >= 0x80 && ch <= 0xFFFF) 443 585 return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch); … … 446 588 } 447 589 448 static int u8_escape(char *buf, int sz, char *src, int escape_quotes) 449 { 450 int c=0, i=0, amt; 590 static int 591 u8_escape( 592 char *buf, 593 int sz, 594 char *src, 595 int escape_quotes 596 ) 597 { 598 int c = 0, i = 0, amt; 451 599 452 600 while (src[i] && c < sz) { … … 466 614 } 467 615 468 static char *u8_strchr(char *s, u_int32_t ch, int *charn) 469 { 470 int i = 0, lasti=0; 616 static char * 617 u8_strchr( 618 char *s, 619 u_int32_t ch, 620 int *charn 621 ) 622 { 623 int i = 0, lasti = 0; 471 624 u_int32_t c; 472 625 … … 483 636 } 484 637 485 static char *u8_memchr(char *s, u_int32_t ch, size_t sz, int *charn) 486 { 487 int i = 0, lasti=0; 638 static char * 639 u8_memchr( 640 char *s, 641 u_int32_t ch, 642 size_t sz, 643 int *charn 644 ) 645 { 646 int i = 0, lasti = 0; 488 647 u_int32_t c; 489 648 int csz; … … 497 656 csz++; 498 657 } while (i < sz && !isutf(s[i])); 499 c -= offsetsFromUTF8[csz -1];658 c -= offsetsFromUTF8[csz - 1]; 500 659 501 660 if (c == ch) { … … 508 667 } 509 668 510 static int u8_is_locale_utf8(char *locale) 511 { 512 /* this code based on libutf8 */ 513 const char* cp = locale; 669 static int 670 u8_is_locale_utf8( 671 char *locale 672 ) 673 { 674 /* 675 this code based on libutf8 676 */ 677 const char *cp = locale; 514 678 515 679 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) { 516 680 if (*cp == '.') { 517 const char* encoding = ++cp; 518 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) 519 ; 520 if ((cp-encoding == 5 && !strncmp(encoding, "UTF-8", 5)) 521 || (cp-encoding == 4 && !strncmp(encoding, "utf8", 4))) 522 return 1; /* it's UTF-8 */ 681 const char *encoding = ++cp; 682 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++); 683 if ((cp - encoding == 5 && !strncmp(encoding, "UTF-8", 5)) 684 || (cp - encoding == 4 && !strncmp(encoding, "utf8", 4))) 685 return 1; /* it's UTF-8 */ 523 686 break; 524 687 } … … 527 690 } 528 691 529 static int u8_vprintf(char *fmt, va_list ap) 530 { 531 int cnt, sz=0; 692 static int 693 u8_vprintf( 694 char *fmt, 695 va_list ap 696 ) 697 { 698 int cnt, sz = 0; 532 699 char *buf; 533 700 u_int32_t *wcs; 534 701 535 702 sz = 512; 536 buf = (char *)alloca(sz);537 try_print:703 buf = (char *)alloca(sz); 704 try_print: 538 705 cnt = vsnprintf(buf, sz, fmt, ap); 539 706 if (cnt >= sz) { 540 buf = (char *)alloca(cnt - sz + 1);707 buf = (char *)alloca(cnt - sz + 1); 541 708 sz = cnt + 1; 542 709 goto try_print; 543 710 } 544 wcs = (u_int32_t *)alloca((cnt+1) * sizeof(u_int32_t));545 cnt = u8_toucs(wcs, cnt +1, buf, cnt);546 printf("%ls", (wchar_t *)wcs);711 wcs = (u_int32_t *) alloca((cnt + 1) * sizeof(u_int32_t)); 712 cnt = u8_toucs(wcs, cnt + 1, buf, cnt); 713 printf("%ls", (wchar_t *) wcs); 547 714 return cnt; 548 715 } 549 716 550 static int u8_printf(char *fmt, ...) 717 static int 718 u8_printf( 719 char *fmt, 720 ... 721 ) 551 722 { 552 723 int cnt;
