Changeset 2162
- Timestamp: 09/20/08 15:37:55 (4 months ago)
- Files:
  - libswish3/trunk/src/libswish3/libswish3.h (modified) (3 diffs)
  - libswish3/trunk/src/libswish3/parser.c (modified) (2 diffs)
  - libswish3/trunk/src/libswish3/tokenizer.c (modified) (17 diffs)
  - libswish3/trunk/src/swish_tokenize.c (modified) (5 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/libswish3/libswish3.h
r2159 r2162
  303   303    {
  304   304    swish_TokenList *tl;
  305        - swish_Config *config;
        305  + swish_3 *s3;
  306   306    unsigned int pos; // position in iteration
  307   307    int ref_cnt;
    …     …
  330   330    unsigned int minwordlen; // min word length
  331   331    boolean tokenize; // should we parse into TokenList
  332        - int (*tokenizer) (swish_3*, xmlChar*, swish_TokenList*, swish_MetaName*, xmlChar*);
        332  + int (*tokenizer) (swish_TokenIterator*, xmlChar*, swish_MetaName*, xmlChar*);
  333   333    xmlChar* (*stemmer) (xmlChar*);
  334   334    unsigned int lc; // should tokens be lowercased
    …     …
  537   537    swish_Token * swish_init_token();
  538   538    void swish_free_token( swish_Token *t );
  539        - swish_TokenIterator *swish_init_token_iterator( swish_Config *config, swish_TokenList *tl);
        539  + swish_TokenIterator *swish_init_token_iterator( swish_3 *s3 );
  540   540    void swish_free_token_iterator( swish_TokenIterator *ti );
  541   541    swish_Token * swish_next_token( swish_TokenIterator *it );
  542        - int swish_tokenize3( swish_3 *s3,
        542  + int swish_tokenize3( swish_TokenIterator *ti,
  543   543    xmlChar *buf,
  544        - swish_TokenList *tl,
  545   544    swish_MetaName *meta,
  546   545    xmlChar *context );
  547   546    int swish_tokenize3_ascii(
  548        - swish_3 *s3,
        547  + swish_TokenIterator *ti,
  549   548    xmlChar *buf,
  550        - swish_TokenList * tl,
  551   549    swish_MetaName *meta,
  552   550    xmlChar *context );
  553   551    int swish_tokenize3_utf8(
  554        - swish_3 *s3,
        552  + swish_TokenIterator *ti,
  555   553    xmlChar *buf,
  556        - swish_TokenList * tl,
  557   554    swish_MetaName *meta,
  558   555    xmlChar *context );
libswish3/trunk/src/libswish3/parser.c
r2158 r2162
 1018  1018
 1019  1019    ptr->tag = NULL;
 1020        - ptr->token_iterator = swish_init_token_iterator(s3 ->config, swish_init_token_list());
       1020  + ptr->token_iterator = swish_init_token_iterator(s3);
 1021  1021    ptr->token_iterator->ref_cnt++;
 1022  1022    ptr->properties = swish_init_nb(s3->config->properties);
    …     …
 2056  2056
 2057  2057    parser_data->docinfo->nwords +=
 2058        - (*parser_data->s3->analyzer->tokenizer) (parser_data->s3, string,
 2059        - parser_data->token_iterator->tl,
 2060        - meta, context);
       2058  + (*parser_data->s3->analyzer->tokenizer) (parser_data->token_iterator,
       2059  + string, meta, context);
 2061  2060    return;
 2062  2061
libswish3/trunk/src/libswish3/tokenizer.c
r2160 r2162
  501   501    swish_TokenIterator *
  502   502    swish_init_token_iterator(
  503        - swish_Config *config,
  504        - swish_TokenList *tl
        503  + swish_3 *s3
  505   504    )
  506   505    {
  507   506    swish_TokenIterator *it;
  508   507    it = swish_xmalloc(sizeof(swish_TokenIterator));
  509        - it-> config = config;
  510        - it-> config->ref_cnt++;
        508  + it->s3 = s3;
        509  + it->s3->ref_cnt++;
  511   510    it->pos = 0;
  512        - it->tl = tl;
        511  + it->tl = swish_init_token_list();
  513   512    it->tl->ref_cnt++;
  514   513    it->ref_cnt = 0;
    …     …
  524   523    SWISH_WARN("freeing TokenIterator with ref_cnt != 0 (%d)", it->ref_cnt);
  525   524    }
  526        - it->config->ref_cnt--;
  527        - if (it->config->ref_cnt == 0)
  528        - swish_free_config(it->config);
        525  +
        526  + it->s3->ref_cnt--;
  529   527
  530   528    it->tl->ref_cnt--;
    …     …
  557   555    int
  558   556    swish_tokenize3(
  559        - swish_3 *s3,
  560        - xmlChar *buf,
  561        - swish_TokenList *tl,
        557  + swish_TokenIterator *ti,
        558  + xmlChar *buf,
  562   559    swish_MetaName *meta,
  563   560    xmlChar *context
    …     …
  565   562    {
  566   563    if (swish_is_ascii(buf)) {
  567        - return swish_tokenize3_ascii( s3, buf, tl, meta, context);
        564  + return swish_tokenize3_ascii(ti, buf, meta, context);
  568   565    }
  569   566    else {
  570        - return swish_tokenize3_utf8( s3, buf, tl, meta, context);
        567  + return swish_tokenize3_utf8(ti, buf, meta, context);
  571   568    }
  572   569    }
    …     …
  574   571    int
  575   572    swish_tokenize3_utf8(
  576        - swish_3 *s3,
  577        - xmlChar *buf,
  578        - swish_TokenList *tl,
        573  + swish_TokenIterator *ti,
        574  + xmlChar *buf,
  579   575    swish_MetaName *meta,
  580   576    xmlChar *context
  581   577    )
  582   578    {
  583        - int nstart, byte_pos, prev_pos, i, chr_len, cp, token_len;
        579  + int nstart, byte_pos, prev_pos, i, chr_len, cp, token_len, maxwordlen, minwordlen;
        580  + swish_TokenList *tl;
  584   581    boolean inside_token;
  585   582    xmlChar chr[5]; /* max len of UCS32 plus NULL */
  586   583    xmlChar *token, *copy, *buf_lower;
  587        - token = swish_xmalloc(sizeof(xmlChar) * s3->analyzer->maxwordlen);
  588        - buf_lower = swish_utf8_str_tolower(buf);
  589        -
  590        - nstart = tl->n;
        584  +
        585  + tl = ti->tl;
        586  + maxwordlen = ti->s3->analyzer->maxwordlen;
        587  + minwordlen = ti->s3->analyzer->minwordlen;
        588  + token = swish_xmalloc(sizeof(xmlChar) * maxwordlen);
        589  + buf_lower = swish_utf8_str_tolower(buf);
        590  + nstart = tl->n;
  591   591    inside_token = 0;
  592        - byte_pos = 0;
  593        - prev_pos = 0;
  594        - token_len = 0;
        592  + byte_pos = 0;
        593  + prev_pos = 0;
        594  + token_len = 0;
  595   595
  596   596    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
    …     …
  640   640    token_len = strip_utf8_chrs(token, token_len);
  641   641
  642        - if (token[0] != '\0' && token_len >= s3->analyzer->minwordlen) {
        642  + if (token[0] != '\0' && token_len >= minwordlen) {
  643   643
  644   644    swish_add_token(tl, token, token_len, meta, context);
    …     …
  684   684
  685   685    /* edge case */
  686        - if ((chr_len + token_len) > s3->analyzer->maxwordlen) {
        686  + if ((chr_len + token_len) > maxwordlen) {
  687   687    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
  688   688    SWISH_DEBUG_MSG("token_len = %d forcing end of token: '%s'",
    …     …
  698   698    token_len += chr_len;
  699   699
  700        - if (token_len >= s3->analyzer->maxwordlen || buf[byte_pos] == '\0') {
        700  + if (token_len >= maxwordlen || buf[byte_pos] == '\0') {
  701   701
  702   702    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
    …     …
  710   710    token_len = strip_utf8_chrs(token, token_len);
  711   711
  712        - if (token[0] != '\0' && token_len >= s3->analyzer->minwordlen) {
        712  + if (token[0] != '\0' && token_len >= minwordlen) {
  713   713
  714   714    swish_add_token(tl, token, token_len, meta, context);
    …     …
  743   743    inside_token = 1; /* turn on flag */
  744   744    /* edge case */
  745        - if (chr_len > s3->analyzer->maxwordlen)
        745  + if (chr_len > maxwordlen)
  746   746    continue;
  747   747
    …     …
  751   751
  752   752    /* special case for one-character tokens */
  753        - if (buf_lower[prev_pos] == '\0' && s3->analyzer->minwordlen == 1) {
        753  + if (buf_lower[prev_pos] == '\0' && minwordlen == 1) {
  754   754    inside_token = 0;
  755   755    token[token_len++] = '\0';
    …     …
  778   778    int
  779   779    swish_tokenize3_ascii(
  780        - swish_3 *s3,
  781        - xmlChar *buf,
  782        - swish_TokenList *tl,
        780  + swish_TokenIterator *ti,
        781  + xmlChar *buf,
  783   782    swish_MetaName *meta,
  784   783    xmlChar *context
    …     …
  787   786    char c, nextc;
  788   787    boolean inside_token;
  789        - int i, token_len, nstart ;
        788  + int i, token_len, nstart, maxwordlen, minwordlen;
  790   789    xmlChar *token, *copy;
  791        - token = swish_xmalloc(sizeof(xmlChar) * s3->analyzer->maxwordlen);
  792        -
  793        - nstart = tl->n;
  794        - token_len = 0;
  795        - token[0] = '\0';
  796        - inside_token = 0;
        790  + swish_TokenList *tl;
        791  +
        792  + tl = ti->tl;
        793  + maxwordlen = ti->s3->analyzer->maxwordlen;
        794  + minwordlen = ti->s3->analyzer->minwordlen;
        795  + token = swish_xmalloc(sizeof(xmlChar) * maxwordlen);
        796  + nstart = tl->n;
        797  + token_len = 0;
        798  + token[0] = '\0';
        799  + inside_token = 0;
  797   800
  798   801    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
    …     …
  818   821    token_len = strip_ascii_chrs(token, token_len);
  819   822
  820        - if (token[0] != '\0' && token_len >= s3->analyzer->minwordlen) {
        823  + if (token[0] != '\0' && token_len >= minwordlen) {
  821   824    swish_add_token(tl, token, token_len, meta, context);
  822   825    }
    …     …
  862   865    token[token_len++] = c;
  863   866
  864        - if (token_len >= s3->analyzer->maxwordlen || nextc == '\0') {
        867  + if (token_len >= maxwordlen || nextc == '\0') {
  865   868
  866   869    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
    …     …
  873   876    token_len = strip_ascii_chrs(token, token_len);
  874   877
  875        - if (token[0] != '\0' && token_len >= s3->analyzer->minwordlen) {
        878  + if (token[0] != '\0' && token_len >= minwordlen) {
  876   879    swish_add_token(tl, token, token_len, meta, context);
  877   880    }
    …     …
  905   908
  906   909    /* special case for one-character tokens */
  907        - if (nextc == '\0' && s3->analyzer->minwordlen == 1) {
        910  + if (nextc == '\0' && minwordlen == 1) {
  908   911    inside_token = 0;
  909   912    token[token_len++] = '\0';
libswish3/trunk/src/swish_tokenize.c
r2148 r2162
   71    71    extern int optind;
   72    72    xmlChar *string;
   73        - swish_TokenList *list;
   74    73    swish_TokenIterator *iterator;
   75    74    xmlChar *meta;
    …     …
   81    80
   82    81    s3 = swish_init_swish3(NULL, NULL);
   83        - list = swish_init_token_list();
   84        - iterator = swish_init_token_iterator(s3->config, list);
         82  + iterator = swish_init_token_iterator(s3);
   85    83
   86    84    while ((ch = getopt_long(argc, argv, "f:h", longopts, &option_index)) != -1) {
    …     …
  114   112    for (; i < argc; i++) {
  115   113    ntokens =
  116        - swish_tokenize3( s3, (xmlChar *)argv[i], list,
        114  + swish_tokenize3(iterator, (xmlChar *)argv[i],
  117   115    swish_hash_fetch(s3->config->metanames, meta), meta);
  118   116    printf("parsed %d tokens: %s\n", ntokens, argv[i]);
    …     …
  122   120    if (string != NULL) {
  123   121    ntokens =
  124        - swish_tokenize3( s3, string, list,
        122  + swish_tokenize3(iterator, string,
  125   123    swish_hash_fetch(s3->config->metanames, meta), meta);
  126   124    swish_debug_token_list(iterator);
    …     …
  129   127
  130   128    swish_free_token_iterator(iterator);
  131        - swish_free_token_list(list);
  132   129    swish_free_swish3(s3);
  133   130
