Changeset 1922
- Timestamp: 03/15/07 23:24:22 (1 year ago)
- Files:
- libswish3/trunk/src/libswish3/words.c (modified) (13 diffs)
- libswish3/trunk/src/ucdata (deleted)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/libswish3/words.c
r1921 r1922 39 39 static int is_ignore_end(wint_t c); 40 40 static int is_ignore_word(wint_t c); 41 static int bytes_in_char(wint_t c); 41 42 static void set_debug(); 42 43 … … 146 147 return 0; 147 148 148 if( !c ||149 return ( !c || 149 150 isspace(c) || 150 151 iscntrl(c) || 151 152 ispunct(c) 152 153 ) 153 return 1; 154 155 return 0; 154 ? 1 : 0; 156 155 } 157 156 … … 204 203 205 204 static int 206 bytes_in_char(wchar_t c) 207 { 208 int len; 209 char *mb; 205 bytes_in_char(wint_t ch) 206 { 207 int len = 0; 210 208 211 mb = swish_xmalloc(sizeof(xmlChar)*sizeof(wchar_t)); 212 wctomb(mb, c); 213 len = mblen(mb, sizeof(wchar_t)); 214 swish_xfree(mb); 209 if (ch < 0x80) { 210 len = 1; 211 } 212 if (ch < 0x800) { 213 len = 2; 214 } 215 if (ch < 0x10000) { 216 len = 3; 217 } 218 if (ch < 0x110000) { 219 len = 4; 220 } 215 221 216 222 if( WORD_DEBUG > 5 ) 217 swish_debug_msg(" %lc is %d bytes long", c , len);223 swish_debug_msg(" %lc is %d bytes long", ch, len); 218 224 219 225 return len; 220 226 221 227 } 228 222 229 223 230 static swish_WordList * … … 236 243 xmlChar * utf8_str; 237 244 238 /* convert xmlChar str into a widechar string for comparing against tables */ 245 /* convert xmlChar str into a widechar string for comparing against isw*() functions. 246 * the returned pointer must be freed eventually. 
247 */ 239 248 wchar_t *wide = swish_locale_to_wchar(str); 240 249 … … 259 268 c = (int) towlower(wide[i]); 260 269 nextc = (int) towlower(wide[i + 1]); 261 byte_count += bytes_in_char( c);270 byte_count += bytes_in_char((wint_t)c); 262 271 263 272 if (WORD_DEBUG > 10) … … 345 354 346 355 /* turn off flag */ 347 in_word = 0;356 in_word = 0; 348 357 349 word[w] = '\0';350 wl = strip_wide_chars(word, w);351 utf8_str = swish_wchar_to_locale((wchar_t *) word);358 word[w] = '\0'; 359 wl = strip_wide_chars(word, w); 360 utf8_str = swish_wchar_to_locale((wchar_t *) word); 352 361 353 362 if (wl >= minwordlen) … … 396 405 } 397 406 407 /************************************************ 408 * mimic the Swish-e WordCharacters lookup tables 409 * using the default is*() functions. 410 *************************************************/ 411 412 static int ascii_tables_created = 0; 413 static char ascii_word_table[128]; 414 static char ascii_start_table[128]; 415 static char ascii_end_table[128]; 416 417 static void 418 make_ascii_tables() 419 { 420 int i; 421 for (i = 0; i < 127; i++) 422 { 423 if (is_ignore_word_ascii(i)) 424 ascii_word_table[i] = 0; 425 else 426 ascii_word_table[i] = 1; 427 428 if (is_ignore_end_ascii(i)) 429 ascii_end_table[i] = 0; 430 else 431 ascii_end_table[i] = 1; 432 433 if (is_ignore_start_ascii(i)) 434 ascii_start_table[i] = 0; 435 else 436 ascii_start_table[i] = 1; 437 438 } 439 ascii_tables_created = 1; 440 } 441 442 443 398 444 /************************************************************ 399 * no attempt is made here to support the old Swish-e 400 * WordCharacters et al method of a 256-char lookup table. 401 * Instead we just use the native 402 * is*() functions -- the ascii versions, not the isw*() wide 403 * versions. This should save some overhead for the common case 404 * of all ascii text in a string. 445 * This should save some overhead compared to the utf8 version 446 * for the common case of all ascii text in a string. 
405 447 * Just like all the Swish3 tokenizing code, this is just a sane 406 448 * fallback function. We expect and encourage users to write their … … 408 450 * 409 451 **************************************************************/ 452 410 453 411 454 static swish_WordList * … … 433 476 swish_debug_msg("parsing string: '%s' into words", str); 434 477 435 for (i = 0; str[i] != '\0'; i++) 436 { 437 c = (int) tolower(str[i]); 438 nextc = (int) tolower(str[i + 1]); 478 479 /* build tables if this is first time through */ 480 if (!ascii_tables_created) 481 make_ascii_tables(); 482 483 484 for (i = 0; str[i] != NULL; i++) 485 { 486 c = (int) tolower(str[i]); 487 nextc = (int) tolower(str[i + 1]); 439 488 byte_count++; 440 489 … … 455 504 456 505 457 if ( is_ignore_word_ascii(c))506 if (!ascii_word_table[(int)c]) 458 507 { 459 508 … … 468 517 /* add NULL */ 469 518 word[w] = NULL; 470 wl = strip_ascii_chars(word, w);519 wl = strip_ascii_chars(word, w); 471 520 472 521 if (wl >= minwordlen) … … 697 746 { 698 747 699 if ( is_ignore_end_ascii(word[i]))700 { 701 word[i] = '\0';748 if (!ascii_end_table[word[i]]) 749 { 750 word[i] = NULL; 702 751 end++; 703 752 } … … 714 763 { 715 764 k = i; 716 if ( !is_ignore_start_ascii(word[k]))765 if (ascii_start_table[word[k]]) 717 766 { 718 767 break;
