Changeset 2141
- Timestamp:
- 04/30/08 00:03:02 (2 weeks ago)
- Files:
-
- libswish3/trunk/src/libswish3/config.c (modified) (2 diffs)
- libswish3/trunk/src/libswish3/libswish3.h (modified) (4 diffs)
- libswish3/trunk/src/libswish3/string.c (modified) (11 diffs)
- libswish3/trunk/src/libswish3/words.c (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/libswish3/config.c
r2140  r2141
   19     19   */
   20     20
   21         /* parse XML-style config files
   22          *
   23          * based on http://www.yolinux.com/TUTORIALS/GnomeLibXml2.html
   24          *
   25          */
   26
   27     21  #include <sys/param.h>
   28     22  #include <stdio.h>
    …      …
  196    190  flags->meta_ids = swish_init_hash(8);
  197    191  flags->prop_ids = swish_init_hash(8);
  198         flags->contexts = swish_init_hash(8);
         192  //flags->contexts = swish_init_hash(8);
  199    193
  200    194  return flags;
libswish3/trunk/src/libswish3/libswish3.h
r2140  r2141
  236    236  xmlHashTablePtr meta_ids;
  237    237  xmlHashTablePtr prop_ids;
  238         xmlHashTablePtr contexts;
         238  //xmlHashTablePtr contexts;
  239    239  };
  240    240
    …      …
  477    477   */
  478    478  void swish_verify_utf8_locale();
  479         int swish_is_ascii( xmlChar *str );
         479  boolean swish_is_ascii( xmlChar *str );
         480  int swish_bytes_in_wchar( int wchar );
  480    481  int swish_utf8_chr_len( xmlChar *utf8 );
  481    482  int swish_utf8_codepoint( xmlChar *utf8 );
    …      …
  487    488  wchar_t * swish_wstr_tolower(wchar_t *s);
  488    489  xmlChar * swish_str_tolower(xmlChar *s );
         490  xmlChar * swish_utf8_str_tolower(xmlChar *s);
         491  xmlChar * swish_ascii_str_tolower(xmlChar *s);
  489    492  xmlChar * swish_str_skip_ws(xmlChar *s);
  490    493  void swish_str_trim_ws(xmlChar *string);
    …      …
  609    612  swish_Token * swish_next_token( swish_TokenIterator *it );
  610    613  int swish_tokenize3( swish_3 *s3,
         614  swish_TokenList * tl,
         615  xmlChar *buf,
         616  swish_MetaName *meta,
         617  xmlChar *context );
         618  int swish_tokenize3_ascii(
         619  swish_3 *s3,
         620  swish_TokenList * tl,
         621  xmlChar *buf,
         622  swish_MetaName *meta,
         623  xmlChar *context );
         624  int swish_tokenize3_utf8(
         625  swish_3 *s3,
  611    626  swish_TokenList * tl,
  612    627  xmlChar *buf,
libswish3/trunk/src/libswish3/string.c
r2140  r2141
   40     40  static xmlChar *getword(
   41     41  xmlChar **in_buf
   42         );
   43         static xmlChar *utf8_str_tolower(
   44         xmlChar *s
   45         );
   46         static xmlChar *ascii_str_tolower(
   47         xmlChar *s
   48     42  );
   49     43  static xmlChar *findlast(
    …      …
  302    296   */
  303    297
  304         int
         298  boolean
  305    299  swish_is_ascii(
  306    300  xmlChar *str
    …      …
  351    345
  352    346  /* a bit about encodings: libxml2 takes whatever encoding the input XML is
  353          * (latin1, ascii, utf8, etc) and standardizes it using iconv in xmlChar as
  354          * UTF-8. However, we must ensure we have UTF-8 locale because all the mb* and wc*
  355          * routines rely on the locale to correctly interpret chars. */
         347   * (latin1, ascii, utf8, etc) and standardizes it using iconv in xmlChar as
         348   * UTF-8. However, we must ensure we have UTF-8 locale because all the mb* and wc*
         349   * routines rely on the locale to correctly interpret chars.
         350   */
  356    351
  357    352  /* use LC_CTYPE specifically: http://mail.nl.linux.org/linux-utf8/2001-09/msg00030.html */
    …      …
  462    457
  463    458  if (swish_is_ascii(s))
  464         return ascii_str_tolower(s);
         459  return swish_ascii_str_tolower(s);
  465    460  else
  466         return utf8_str_tolower(s);
         461  return swish_utf8_str_tolower(s);
  467    462
  468    463  }
    …      …
  473    468  and free the wchar
  474    469  */
  475         static xmlChar *
  476         utf8_str_tolower(
         470  xmlChar *
         471  swish_utf8_str_tolower(
  477    472  xmlChar *s
  478    473  )
    …      …
  496    491
  497    492  /* based on swstring.c in Swish-e */
  498         static xmlChar *
  499         ascii_str_tolower(
         493  xmlChar *
         494  swish_ascii_str_tolower(
  500    495  xmlChar *s
  501    496  )
    …      …
  516    511  -- 2001-01-30 rasc
  517    512
  518         should be utf8 safe, unless a continuation byte evals true to isspace()
         513  TODO make utf8 safe.
  519    514  */
  520    515
    …      …
  534    529  **************************************/
  535    530
         531  // TODO make utf8 safe
  536    532  void
  537    533  swish_str_trim_ws(
    …      …
  572    568  }
  573    569
         570  /* returns the number of UTF-8 char* needed to hold the codepoint
         571  represented by 'ch'.
         572  similar to swish_utf8_chr_len() except that the arg is already
         573  a 4-byte container and we want to know how many of the 4 bytes
         574  we really need.
         575  */
         576  int
         577  swish_bytes_in_wchar(
         578  int ch
         579  )
         580  {
         581  int len = 0;
         582
         583  if (ch < 0x80) {
         584  len = 1;
         585  }
         586  if (ch < 0x800) {
         587  len = 2;
         588  }
         589  if (ch < 0x10000) {
         590  len = 3;
         591  }
         592  if (ch < 0x110000) {
         593  len = 4;
         594  }
         595
         596  if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
         597  SWISH_DEBUG_MSG(" %lc is %d bytes long", ch, len);
         598
         599  return len;
         600  }
         601
         602
  574    603  /* from http://www.triptico.com/software/unicode.html */
  575    604  wchar_t *
    …      …
  587    616  len = mblen((const char *)str, 4);
  588    617
  589         /* a size of -1 is triggered by an error in encoding; never happen in ISO-8859-*
  590          * locales, but possible in UTF-8 */
  591         if (s == -1) {
  592         SWISH_WARN("error converting mbs to wide str: %s", str);
  593         return (0);
  594         }
         618  /* a size of -1 is triggered by an error in encoding;
         619   * never happen in ISO-8859-* locales, but possible in UTF-8
         620   */
         621  if (s == -1)
         622  SWISH_CROAK("error converting mbs to wide str: %s", str);
         623
  595    624
  596    625  /* malloc the necessary space */
    …      …
  621    650  /* a size of -1 means there are characters that could not be converted to current
  622    651   * locale */
  623         if (s == -1) {
  624         warn("error converting wide chars to mbs: %ls", str);
  625         return (0);
  626         }
         652  if (s == -1)
         653  SWISH_CROAK("error converting wide chars to mbs: %ls", str);
  627    654
  628    655  /* malloc the necessary space */
libswish3/trunk/src/libswish3/words.c
r2140  r2141
   49     49  int c
   50     50  );
   51         static int bytes_in_chr(
   52         int ch
   53         );
   54     51  static void make_ascii_tables(
   55     52  );
    …      …
  248    245  }
  249    246
  250         /* returns the number of UTF-8 char* needed to hold the codepoint
  251         represented by 'ch'.
  252         similar to swish_utf8_chr_len() except that the arg is already
  253         a 4-byte container and we want to know how many of the 4 bytes
  254         we really need.
  255         */
  256         static int
  257         bytes_in_chr(
  258         int ch
  259         )
  260         {
  261         int len = 0;
  262
  263         if (ch < 0x80) {
  264         len = 1;
  265         }
  266         if (ch < 0x800) {
  267         len = 2;
  268         }
  269         if (ch < 0x10000) {
  270         len = 3;
  271         }
  272         if (ch < 0x110000) {
  273         len = 4;
  274         }
  275
  276         if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
  277         SWISH_DEBUG_MSG(" %lc is %d bytes long", ch, len);
  278
  279         return len;
  280         }
  281    247
  282    248  swish_WordList *
    …      …
  319    285
  320    286  for (i = 0; wide[i] != '\0'; i++) {
  321         c = ( int)towlower(wide[i]);
  322         nextc = ( int)towlower(wide[i + 1]);
  323         byte_count += bytes_in_chr((wint_t)c);
         287  c = (wchar_t)towlower(wide[i]);
         288  nextc = (wchar_t)towlower(wide[i + 1]);
         289  byte_count += swish_bytes_in_wchar(c);
  324    290
  325    291  if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
