Changeset 2158
- Timestamp:
- 09/19/08 07:16:36 (4 months ago)
- Files:
-
- libswish3/trunk/src/Makefile.am (modified) (1 diff)
- libswish3/trunk/src/libswish3/Makefile.am (modified) (1 diff)
- libswish3/trunk/src/libswish3/analyzer.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/libswish3.h (modified) (6 diffs)
- libswish3/trunk/src/libswish3/parser.c (modified) (9 diffs)
- libswish3/trunk/src/libswish3/swish.c (modified) (3 diffs)
- libswish3/trunk/src/libswish3/words.c (deleted)
- libswish3/trunk/src/swish_lint.c (modified) (4 diffs)
- libswish3/trunk/src/swish_words.c (deleted)
- libswish3/trunk/src/t/001-wordcount.t (modified) (2 diffs)
- libswish3/trunk/src/xapian/swish_xapian.cpp (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/Makefile.am
r2140 r2158 11 11 # -pg is for profiling -- don't use in production 12 12 13 bin_PROGRAMS = swish_lint swish_ words swish_tokenize swish_isw utf8test swish_header13 bin_PROGRAMS = swish_lint swish_tokenize swish_isw utf8test swish_header 14 14 check_PROGRAMS = swish_lint swish_header 15 15 swish_lint_SOURCES = swish_lint.c $(myheaders) 16 swish_words_SOURCES = swish_words.c $(myheaders)17 16 swish_tokenize_SOURCES = swish_tokenize.c $(myheaders) 18 17 swish_isw_SOURCES = swish_isw.c libswish3/trunk/src/libswish3/Makefile.am
r2140 r2158 23 23 string.c \ 24 24 times.c \ 25 words.c \26 25 swish.c \ 27 26 analyzer.c \ libswish3/trunk/src/libswish3/analyzer.c
r2150 r2158 41 41 a->ref_cnt = 0; 42 42 a->tokenize = config->flags->tokenize; 43 a->tokenlist = 0; // use wordlist by default44 43 45 44 if (!a->tokenize && SWISH_DEBUG) 46 SWISH_DEBUG_MSG("skipping WordList");45 SWISH_DEBUG_MSG("skipping tokenizer"); 47 46 48 47 /* tokenizer set in the parse* function */ libswish3/trunk/src/libswish3/libswish3.h
r2155 r2158 150 150 SWISH_DEBUG_DOCINFO = 1, 151 151 SWISH_DEBUG_TOKENIZER = 2, 152 SWISH_DEBUG_ WORDLIST = 4,152 SWISH_DEBUG_TOKENLIST = 4, 153 153 SWISH_DEBUG_PARSER = 8, 154 154 SWISH_DEBUG_CONFIG = 16, … … 182 182 typedef struct swish_MetaName swish_MetaName; 183 183 typedef struct swish_Property swish_Property; 184 typedef struct swish_Word swish_Word;185 typedef struct swish_WordList swish_WordList;186 184 typedef struct swish_Token swish_Token; 187 185 typedef struct swish_TokenList swish_TokenList; … … 281 279 }; 282 280 283 struct swish_Word284 {285 unsigned int position; // word position in doc286 xmlChar *metaname; // immediate metaname287 xmlChar *context; // metaname ancestry288 xmlChar *word; // the word itself (NOTE stored as multibyte not wchar)289 unsigned int start_offset; // start byte290 unsigned int end_offset; // end byte291 struct swish_Word *next; // pointer to next swish_Word292 struct swish_Word *prev; // pointer to prev swish_Word293 };294 295 struct swish_WordList296 {297 swish_Word *head;298 swish_Word *tail;299 swish_Word *current; // for iterating300 unsigned int nwords;301 unsigned int ref_cnt; // for bindings302 };303 304 281 struct swish_Token 305 282 { … … 351 328 unsigned int maxwordlen; // max word length 352 329 unsigned int minwordlen; // min word length 353 boolean tokenize; // should we parse into WordList 354 boolean tokenlist; // use new tokenizer 330 boolean tokenize; // should we parse into TokenList 355 331 int (*tokenizer) (swish_3*, xmlChar*, ...); 356 332 xmlChar* (*stemmer) (xmlChar*); … … 378 354 boolean is_html; // shortcut flag for html parser 379 355 boolean bump_word; // boolean for moving word position/adding space 380 unsigned int word_pos; // word position in document381 356 unsigned int offset; // current offset position 382 357 swish_TagStack *metastack; // stacks for tracking the tag => metaname 383 358 swish_TagStack *propstack; // stacks for tracking the tag => property 384 359 xmlParserCtxtPtr ctxt; // 
so we can free at end 385 swish_WordList *wordlist; // linked list of words386 360 swish_TokenIterator *token_iterator; // alternative tokenizer 387 361 swish_NamedBuffer *properties; // buffer all properties … … 549 523 =head2 Token Functions 550 524 */ 551 void swish_init_words();552 swish_WordList * swish_init_wordlist();553 void swish_free_wordlist(swish_WordList * list);554 int swish_tokenize( swish_3 * s3, xmlChar * str, ... );555 556 int swish_tokenize_utf8_string(557 swish_3 * s3,558 xmlChar * str,559 swish_WordList * wl,560 unsigned int offset,561 unsigned int word_pos,562 xmlChar * metaname,563 xmlChar * context564 );565 566 int swish_tokenize_ascii_string(567 swish_3 * s3,568 xmlChar * str,569 swish_WordList * wl,570 unsigned int offset,571 unsigned int word_pos,572 xmlChar * metaname,573 xmlChar * context574 );575 576 int swish_tokenize_regex(577 swish_3 * s3,578 xmlChar * str,579 swish_WordList * wl,580 unsigned int offset,581 unsigned int word_pos,582 xmlChar * metaname,583 xmlChar * context584 );585 586 size_t swish_add_to_wordlist( swish_WordList * list,587 xmlChar * word,588 xmlChar * metaname,589 xmlChar * context,590 int word_pos,591 int offset );592 593 int swish_add_to_wordlist_len(594 swish_WordList * list,595 xmlChar * str,596 int len,597 xmlChar * metaname,598 xmlChar * context,599 int word_pos,600 int offset );601 602 void swish_debug_wordlist( swish_WordList * list );603 604 525 swish_TokenList * swish_init_token_list(); 605 526 void swish_free_token_list( swish_TokenList *tl ); libswish3/trunk/src/libswish3/parser.c
r2155 r2158 342 342 343 343 /* 344 * need to bump word_posso we don't match across block *344 * need to bump token position so we don't match across block * 345 345 * elements 346 346 */ … … 501 501 502 502 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) 503 SWISH_DEBUG_MSG("buffer is >>%s<< before flush, word_pos = %d", 504 xmlBufferContent(parser_data->meta_buf), parser_data->word_pos); 505 506 /* 507 * since we only flush the buffer when metaname changes, and we do 508 * not want to match across metanames, bump the word_pos here before 509 * parsing the string and making the tmp wordlist 510 */ 511 if (parser_data->word_pos) 512 parser_data->word_pos++; 503 SWISH_DEBUG_MSG("buffer is >>%s<< before flush", 504 xmlBufferContent(parser_data->meta_buf)); 513 505 514 506 /* … … 524 516 * Disabling this for now, as it ought to be up the handler() to decide 525 517 * to index a token under multiple metanames, and we associate context 526 * with the WordList518 * with the TokenList 527 519 */ 528 520 … … 1026 1018 1027 1019 ptr->tag = NULL; 1028 ptr->wordlist = swish_init_wordlist();1029 ptr->wordlist->ref_cnt++;1030 1020 ptr->token_iterator = swish_init_token_iterator(s3->config, swish_init_token_list()); 1031 1021 ptr->token_iterator->ref_cnt++; … … 1036 1026 1037 1027 /* 1038 * pick atokenizer if one has not been explicitly set1028 * set tokenizer if one has not been explicitly set 1039 1029 */ 1040 1030 if (s3->analyzer->tokenizer == NULL) { 1041 if (s3->analyzer->tokenlist) { 1042 s3->analyzer->tokenizer = (&swish_tokenize3); 1043 } 1044 else { 1045 s3->analyzer->tokenizer = (&swish_tokenize); 1046 } 1031 s3->analyzer->tokenizer = (&swish_tokenize3); 1047 1032 } 1048 1033 … … 1085 1070 1086 1071 /* 1087 * must be zero so that ++ works ok on first word1088 */1089 ptr->word_pos = 0;1090 1091 /*1092 1072 * always start at first byte 1093 1073 */ … … 1195 1175 SWISH_DEBUG_MSG("swish_ParserData libxml2 parser ctxt already freed"); 1196 1176 1197 }1198 1199 if (ptr->wordlist != 
NULL) {1200 1201 if (SWISH_DEBUG & SWISH_DEBUG_PARSER)1202 SWISH_DEBUG_MSG("free swish_ParserData wordList");1203 1204 ptr->wordlist->ref_cnt--;1205 swish_free_wordlist(ptr->wordlist);1206 1177 } 1207 1178 … … 1943 1914 SWISH_DEBUG_MSG("txt parser encoding: %s", parser_data->docinfo->encoding); 1944 1915 1945 if ( parser_data->docinfo->encoding != (xmlChar *)SWISH_DEFAULT_ENCODING) {1916 if (!xmlStrEqual(parser_data->docinfo->encoding, (xmlChar *)SWISH_DEFAULT_ENCODING)) { 1946 1917 SWISH_WARN("%s docinfo->encoding %s != %s", 1947 1918 parser_data->docinfo->uri, parser_data->docinfo->encoding, SWISH_DEFAULT_ENCODING); … … 2084 2055 context = parser_data->metastack->head->context; 2085 2056 2086 swish_WordList *tmplist; 2087 2088 if (parser_data->s3->analyzer->tokenlist) { 2089 2090 /* 2091 * array buffer (token_iterator) tokenizer 2092 */ 2093 2094 parser_data->docinfo->nwords += 2057 parser_data->docinfo->nwords += 2095 2058 (*parser_data->s3->analyzer->tokenizer) (parser_data->s3, string, 2096 2059 parser_data->token_iterator->tl, 2097 2060 meta, context); 2098 return; 2099 2100 } 2101 else { 2102 2103 /* 2104 * linked-list (wordlist) tokenizer 2105 */ 2106 2107 tmplist = swish_init_wordlist(); 2108 tmplist->ref_cnt++; 2109 parser_data->docinfo->nwords += 2110 (*parser_data->s3->analyzer->tokenizer) (parser_data->s3, string, tmplist, 2111 parser_data->offset, 2112 parser_data->word_pos, metaname, 2113 context); 2114 2115 if (tmplist->nwords == 0) { 2116 tmplist->ref_cnt--; 2117 swish_free_wordlist(tmplist); 2118 return; 2119 } 2120 2121 /* 2122 * append tmplist to master list 2123 */ 2124 parser_data->word_pos += tmplist->nwords; 2125 2126 if (parser_data->wordlist->head == 0) { 2127 swish_xfree(parser_data->wordlist); 2128 parser_data->wordlist = tmplist; 2129 } 2130 else { 2131 2132 /* 2133 * point tmp list first word's prev at current last word 2134 */ 2135 tmplist->head->prev = parser_data->wordlist->tail; 2136 2137 /* 2138 * point current last word's 'next' 
at first word of tmp list 2139 */ 2140 parser_data->wordlist->tail->next = tmplist->head; 2141 2142 /* 2143 * point current last word at last word of tmp list 2144 */ 2145 parser_data->wordlist->tail = tmplist->tail; 2146 2147 parser_data->wordlist->nwords += tmplist->nwords; 2148 2149 swish_xfree(tmplist); 2150 } 2151 2152 /* 2153 * global offset is now the same as the tail end_offset 2154 */ 2155 parser_data->offset = parser_data->wordlist->tail->end_offset; 2156 2157 } 2061 return; 2158 2062 2159 2063 } libswish3/trunk/src/libswish3/swish.c
r2140 r2158 91 91 setenv("SWISH_DEBUG_CONFIG", "0", 0); 92 92 setenv("SWISH_DEBUG_DOCINFO", "0", 0); 93 setenv("SWISH_DEBUG_WORDLIST", "0", 0); 93 setenv("SWISH_DEBUG_TOKENLIST", "0", 0); 94 94 setenv("SWISH_DEBUG_TOKENIZER", "0", 0); 95 95 setenv("SWISH_DEBUG_PARSER", "0", 0); … … 111 111 SWISH_DEBUG += SWISH_DEBUG_DOCINFO; 112 112 } 113 if (swish_string_to_int(getenv("SWISH_DEBUG_WORDLIST"))) { 114 SWISH_DEBUG += SWISH_DEBUG_WORDLIST; 113 if (swish_string_to_int(getenv("SWISH_DEBUG_TOKENLIST"))) { 114 SWISH_DEBUG += SWISH_DEBUG_TOKENLIST; 115 115 } 116 116 if (swish_string_to_int(getenv("SWISH_DEBUG_TOKENIZER"))) { … … 133 133 */ 134 134 LIBXML_TEST_VERSION swish_init_memory(); 135 swish_init_words(); 136 135 swish_verify_utf8_locale(); 137 136 libswish3/trunk/src/swish_lint.c
r2149 r2158 56 56 {"debug", required_argument, 0, 'd'}, 57 57 {"help", no_argument, 0, 'h'}, 58 {"tokenize3", no_argument, 0, 't'},59 58 {"verbose", no_argument, 0, 'v'}, 60 59 {0, 0, 0, 0} … … 89 88 printf("\tSWISH_DEBUG_DOCINFO 1\n"); 90 89 printf("\tSWISH_DEBUG_TOKENIZER 2\n"); 91 printf("\tSWISH_DEBUG_ WORDLIST 4\n");90 printf("\tSWISH_DEBUG_TOKENLIST 4\n"); 92 91 printf("\tSWISH_DEBUG_PARSER 8\n"); 93 92 printf("\tSWISH_DEBUG_CONFIG 16\n"); … … 129 128 swish_debug_docinfo(parser_data->docinfo); 130 129 131 if (SWISH_DEBUG & SWISH_DEBUG_ WORDLIST130 if (SWISH_DEBUG & SWISH_DEBUG_TOKENLIST 132 131 || verbose 133 132 ) { 134 if (parser_data->s3->analyzer->tokenlist) { 135 swish_debug_token_list(parser_data->token_iterator); 136 } 137 else { 138 swish_debug_wordlist(parser_data->wordlist); 139 } 133 swish_debug_token_list(parser_data->token_iterator); 140 134 } 141 135 … … 195 189 break; 196 190 197 case 't':198 s3->analyzer->tokenlist = 1;199 break;200 201 191 case 'v': 202 192 verbose = 1; libswish3/trunk/src/t/001-wordcount.t
r2150 r2158 50 50 sub words { 51 51 my $file = shift; 52 my $o = join( ' ', `./swish_lint -t test_docs/$file` ); 52 my $o = join( ' ', `./swish_lint test_docs/$file` ); 53 53 my ($count) = ( $o =~ m/nwords: (\d+)/ ); 54 54 return $count || 0; … … 57 57 sub fromstdin { 58 58 my $file = shift; 59 my $o = join( ' ', `./swish_lint -t - < test_stdin/$file` ); 59 my $o = join( ' ', `./swish_lint - < test_stdin/$file` ); 60 60 my ($count) = ( $o =~ m/total words: (\d+)/ ); 61 61 return $count || 0; libswish3/trunk/src/xapian/swish_xapian.cpp
r2129 r2158 310 310 swish_debug_docinfo(parser_data->docinfo); 311 311 } 312 if (SWISH_DEBUG & SWISH_DEBUG_WORDLIST) { 313 swish_debug_wordlist(parser_data->wordlist); 312 if (SWISH_DEBUG & SWISH_DEBUG_TOKENLIST) { 313 swish_debug_token_list(parser_data->token_iterator); 314 314 } 315 315 if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) {
