Changeset 2140 for libswish3/trunk
- Timestamp:
- 04/28/08 22:02:04 (4 months ago)
- Files:
-
- libswish3/trunk/src/Makefile.am (modified) (1 diff)
- libswish3/trunk/src/libswish3/Makefile.am (modified) (1 diff)
- libswish3/trunk/src/libswish3/analyzer.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/config.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/libswish3.h (modified) (11 diffs)
- libswish3/trunk/src/libswish3/parser.c (modified) (4 diffs)
- libswish3/trunk/src/libswish3/string.c (modified) (10 diffs)
- libswish3/trunk/src/libswish3/swish.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/words.c (modified) (7 diffs)
- libswish3/trunk/src/swish_lint.c (modified) (6 diffs)
- libswish3/trunk/src/swish_tokenize.c (added)
- libswish3/trunk/src/utf8test.c (added)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/Makefile.am
r2133 r2140 11 11 # -pg is for profiling -- don't use in production 12 12 13 bin_PROGRAMS = swish_lint swish_words swish_isw swish_header 14 #bin_PROGRAMS = swish_lint swish_words swish_isw utf8test swish_header 13 bin_PROGRAMS = swish_lint swish_words swish_tokenize swish_isw utf8test swish_header 15 14 check_PROGRAMS = swish_lint swish_header 16 15 swish_lint_SOURCES = swish_lint.c $(myheaders) 17 16 swish_words_SOURCES = swish_words.c $(myheaders) 17 swish_tokenize_SOURCES = swish_tokenize.c $(myheaders) 18 18 swish_isw_SOURCES = swish_isw.c 19 19 swish_header_SOURCES = swish_header.c $(myheaders) 20 #utf8test_SOURCES = utf8test.c $(myheaders)20 utf8test_SOURCES = utf8test.c $(myheaders) 21 21 22 22 TESTS = $(check_PROGRAMS) test.sh libswish3/trunk/src/libswish3/Makefile.am
r2042 r2140 29 29 metaname.c \ 30 30 header.c \ 31 tokenizer.c \ 31 32 $(myheaders) 32 33 libswish3/trunk/src/libswish3/analyzer.c
r2104 r2140 41 41 a->ref_cnt = 0; 42 42 a->tokenize = config->flags->tokenize; 43 a->tokenlist = 0; 43 44 44 45 if (!a->tokenize && SWISH_DEBUG) libswish3/trunk/src/libswish3/config.c
r2133 r2140 196 196 flags->meta_ids = swish_init_hash(8); 197 197 flags->prop_ids = swish_init_hash(8); 198 flags->contexts = swish_init_hash(8); 198 199 199 200 return flags; libswish3/trunk/src/libswish3/libswish3.h
r2132 r2140 32 32 #define SWISH_VERSION "3.0.0" 33 33 #define SWISH_BUFFER_CHUNK_SIZE 16384 34 #define SWISH_TOKEN_LIST_SIZE 1024 34 35 #define SWISH_MAXSTRLEN 2048 35 36 #define SWISH_MAX_HEADERS 6 … … 172 173 typedef char boolean; 173 174 typedef struct swish_3 swish_3; 174 typedef struct swish_Token swish_Token;175 175 typedef struct swish_StringList swish_StringList; 176 176 typedef struct swish_Config swish_Config; … … 185 185 typedef struct swish_Word swish_Word; 186 186 typedef struct swish_WordList swish_WordList; 187 typedef struct swish_Token swish_Token; 188 typedef struct swish_TokenList swish_TokenList; 189 typedef struct swish_TokenIterator swish_TokenIterator; 187 190 typedef struct swish_ParserData swish_ParserData; 188 191 typedef struct swish_Tag swish_Tag; … … 203 206 swish_Analyzer *analyzer; 204 207 swish_Parser *parser; 205 };206 207 struct swish_Token208 {209 xmlChar *start_ptr;210 int tok_bytes;211 int start;212 int end;213 xmlChar *meta;214 xmlChar *ctxt;215 unsigned int wpos;216 unsigned int offset;217 swish_Analyzer *analyzer;218 swish_WordList *list;219 208 }; 220 209 … … 247 236 xmlHashTablePtr meta_ids; 248 237 xmlHashTablePtr prop_ids; 238 xmlHashTablePtr contexts; 249 239 }; 250 240 … … 313 303 }; 314 304 305 struct swish_Token 306 { 307 unsigned int pos; 308 swish_MetaName *meta; 309 xmlChar *value; 310 xmlChar *context; // TODO refactor this into array of ints 311 unsigned int start_byte; 312 unsigned int len; 313 int ref_cnt; 314 }; 315 316 struct swish_TokenList 317 { 318 unsigned int n; 319 xmlBufferPtr buf; 320 swish_Token** tokens; 321 int ref_cnt; 322 }; 323 324 struct swish_TokenIterator 325 { 326 swish_TokenList *tl; 327 swish_Config *config; 328 unsigned int pos; 329 int ref_cnt; 330 }; 331 315 332 struct swish_Tag 316 333 { … … 335 352 unsigned int minwordlen; // min word length 336 353 boolean tokenize; // should we parse into WordList 354 boolean tokenlist; // use new tokenizer 337 355 swish_WordList* (*tokenizer) (swish_Analyzer*, xmlChar*, ...); 338 356 xmlChar* (*stemmer) (xmlChar*); … … 357 375 xmlChar *tag; // current tag name 358 376 swish_DocInfo *docinfo; // document-specific properties 359 boolean no_index; // toggle flag for special comments377 boolean no_index; // toggle flag. should buffer be indexed. 360 378 boolean is_html; // shortcut flag for html parser 361 379 boolean bump_word; // boolean for moving word position/adding space … … 366 384 xmlParserCtxtPtr ctxt; // so we can free at end 367 385 swish_WordList *wordlist; // linked list of words 386 swish_TokenIterator *token_iterator; // alternative tokenizer 368 387 swish_NamedBuffer *properties; // buffer all properties 369 388 swish_NamedBuffer *metanames; // buffer all metanames … … 460 479 int swish_is_ascii( xmlChar *str ); 461 480 int swish_utf8_chr_len( xmlChar *utf8 ); 481 int swish_utf8_codepoint( xmlChar *utf8 ); 482 int swish_utf8_num_chrs( xmlChar *utf8 ); 483 void swish_utf8_next_chr( xmlChar *s, int *i ); 484 void swish_utf8_prev_chr( xmlChar *s, int *i ); 462 485 wchar_t * swish_locale_to_wchar(xmlChar * str); 463 486 xmlChar * swish_wchar_to_locale(wchar_t * str); … … 572 595 573 596 void swish_debug_wordlist( swish_WordList * list ); 597 598 swish_TokenList * swish_init_token_list(); 599 void swish_free_token_list( swish_TokenList *tl ); 600 int swish_add_token( swish_TokenList *tl, 601 xmlChar *token, 602 int token_len, 603 swish_MetaName *meta, 604 xmlChar *context ); 605 swish_Token * swish_init_token(); 606 void swish_free_token( swish_Token *t ); 607 swish_TokenIterator *swish_init_token_iterator( swish_Config *config, swish_TokenList *tl ); 608 void swish_free_token_iterator( swish_TokenIterator *ti ); 609 swish_Token * swish_next_token( swish_TokenIterator *it ); 610 int swish_tokenize3( swish_3 *s3, 611 swish_TokenList * tl, 612 xmlChar *buf, 613 swish_MetaName *meta, 614 xmlChar *context ); 615 void swish_debug_token_list( swish_TokenIterator *it ); 616 void swish_debug_token( swish_Token *t ); 617 574 618 /* 575 619 =cut libswish3/trunk/src/libswish3/parser.c
r2132 r2140 1019 1019 ptr->wordlist = swish_init_wordlist(); 1020 1020 ptr->wordlist->ref_cnt++; 1021 ptr->token_iterator = swish_init_token_iterator(s3->config, swish_init_token_list()); 1022 ptr->token_iterator->ref_cnt++; 1021 1023 ptr->properties = swish_init_nb(s3->config->properties); 1022 1024 ptr->properties->ref_cnt++; … … 1181 1183 ptr->wordlist->ref_cnt--; 1182 1184 swish_free_wordlist(ptr->wordlist); 1185 } 1186 1187 if (ptr->token_iterator != NULL) { 1188 1189 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) 1190 SWISH_DEBUG_MSG("free swish_ParserData TokenIterator"); 1191 1192 ptr->token_iterator->tl->ref_cnt--; 1193 swish_free_token_list(ptr->token_iterator->tl); 1194 ptr->token_iterator->ref_cnt--; 1195 swish_free_token_iterator(ptr->token_iterator); 1183 1196 } 1184 1197 … … 2028 2041 ) 2029 2042 { 2030 2043 swish_MetaName *meta; 2044 2045 meta = swish_hash_fetch(parser_data->s3->config->metanames, metaname); 2046 2031 2047 if (len == 0) 2032 2048 return; … … 2040 2056 swish_WordList *tmplist; 2041 2057 2042 if (parser_data->s3->analyzer->tokenizer == NULL) { 2058 if (parser_data->s3->analyzer->tokenlist) { 2059 2060 parser_data->docinfo->nwords += 2061 swish_tokenize3(parser_data->s3, 2062 parser_data->token_iterator->tl, 2063 string, 2064 meta, 2065 context 2066 ); 2067 2068 return; 2069 2070 } 2071 else if (parser_data->s3->analyzer->tokenizer == NULL) { 2043 2072 2044 2073 /* libswish3/trunk/src/libswish3/string.c
r2133 r2140 35 35 36 36 #include "libswish3.h" 37 #include "utf8.c"38 37 39 38 extern int SWISH_DEBUG; … … 56 55 ); 57 56 57 /* originally based on libutf8; this version (and other u8_* functions) 58 are from http://cprogramming.com/tutorial/unicode.html 59 */ 60 static int 61 u8_is_locale_utf8( 62 char *locale 63 ); 64 65 /* move to next character */ 66 static void u8_inc( 67 char *s, 68 int *i 69 ); 70 71 /* move to previous character */ 72 static void u8_dec( 73 char *s, 74 int *i 75 ); 76 77 78 /* is c the start of a utf8 sequence? */ 79 #define isutf(c) (((c)&0xC0)!=0x80) 80 81 static void 82 u8_inc( 83 char *s, 84 int *i 85 ) 86 { 87 (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || isutf(s[++(*i)]) || ++(*i)); 88 } 89 90 static void 91 u8_dec( 92 char *s, 93 int *i 94 ) 95 { 96 (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || isutf(s[--(*i)]) || --(*i)); 97 } 98 99 58 100 /* these string conversion functions based on code from xapian-omega */ 59 101 #define BUFSIZE 100 … … 63 105 int ret;\ 64 106 str = swish_xmalloc(BUFSIZE);\ 65 ret = snprintf( str, BUFSIZE, (FMT), val);\107 ret = snprintf((char*)str, BUFSIZE, (FMT), val);\ 66 108 if (ret<0) SWISH_CROAK("snprintf failed with %d", ret);\ 67 109 return str; 68 110 69 int swish_string_to_int( 111 int 112 swish_string_to_int( 70 113 char *buf 71 ) 114 ) 72 115 { 73 116 long i; 74 117 errno = 0; 75 118 i = strtol(buf, (char **)NULL, 10); 76 /* Check for various possible errors */ 119 /* 120 Check for various possible errors 121 */ 77 122 if ((errno == ERANGE && (i == LONG_MAX || i == LONG_MIN)) 78 || (errno != 0 && i == 0)) {79 perror("strtol");80 exit(EXIT_FAILURE);123 || (errno != 0 && i == 0)) { 124 perror("strtol"); 125 exit(EXIT_FAILURE); 81 126 } 82 127 return (int)i; … … 137 182 abort(); /* Uh-oh, buffer overrun */ 138 183 #endif 139 return swish_xstrdup((xmlChar *)buf);184 return swish_xstrdup((xmlChar *)buf); 140 185 } 141 186 … … 166 211 */ 167 212 213 /* returns the UCS32 value for a UTF8 string -- the character's Unicode value. 214 see http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-AppendixA 215 */ 216 int 217 swish_utf8_codepoint( 218 xmlChar *utf8 219 ) 220 { 221 int len; 222 len = swish_utf8_chr_len(utf8); 223 224 switch (len) { 225 226 case 1: 227 return utf8[0]; 228 229 case 2: 230 return (utf8[0] - 192) * 64 + utf8[1] - 128; 231 232 case 3: 233 return (utf8[0] - 224) * 4096 + (utf8[1] - 128) * 64 + utf8[2] - 128; 234 235 case 4: 236 default: 237 return (utf8[0] - 240) * 262144 + (utf8[1] - 128) * 4096 + (utf8[2] - 128) * 64 + 238 utf8[3] - 128; 239 240 } 241 } 242 243 void 244 swish_utf8_next_chr( 245 xmlChar *s, 246 int *i 247 ) 248 { 249 u8_inc((char *)s, i); 250 } 251 252 void 253 swish_utf8_prev_chr( 254 xmlChar *s, 255 int *i 256 ) 257 { 258 u8_dec((char *)s, i); 259 } 260 261 168 262 /* returns length of a UTF8 character, based on first byte (see below) */ 169 263 int … … 172 266 ) 173 267 { 174 return u8_seqlen((char *)utf8); 268 int n; 269 n = xmlUTF8Size(utf8); 270 if (n == -1) 271 SWISH_CROAK("Bad UTF8 string: %s", utf8); 272 273 return n; 274 } 275 276 /* returns the number of UCS32 codepoints (characters) in a UTF8 string */ 277 int 278 swish_utf8_num_chrs( 279 xmlChar *utf8 280 ) 281 { 282 int n; 283 n = xmlUTF8Strlen(utf8); 284 if (n == -1) 285 SWISH_CROAK("Bad UTF8 string: %s", utf8); 286 287 return n; 175 288 } 176 289 … … 208 321 } 209 322 323 static int 324 u8_is_locale_utf8( 325 char *locale 326 ) 327 { 328 // this code based on libutf8 329 const char *cp = locale; 330 331 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) { 332 if (*cp == '.') { 333 const char *encoding = ++cp; 334 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++); 335 if ((cp - encoding == 5 && !strncmp(encoding, "UTF-8", 5)) 336 || (cp - encoding == 4 && !strncmp(encoding, "utf8", 4))) 337 return 1; // it's UTF-8 338 break; 339 } 340 } 341 return 0; 342 } 343 344 210 345 void 211 346 swish_verify_utf8_locale( … … 530 665 void 531 666 swish_merge_stringlists( 532 swish_StringList * sl1,533 swish_StringList * sl2667 swish_StringList * sl1, 668 swish_StringList * sl2 534 669 ) 535 670 { 536 671 int i; 537 672 // add sl1 -> sl2 538 sl2->word = (xmlChar **)swish_xrealloc(sl2->word, (sl1->n + sl2->n) * sizeof(xmlChar *) + 1); 539 for(i=0; i<sl1->n; i++) { 673 sl2->word = 674 (xmlChar **)swish_xrealloc(sl2->word, (sl1->n + sl2->n) * sizeof(xmlChar *) + 1); 675 for (i = 0; i < sl1->n; i++) { 540 676 // copy is a little overhead, but keeps mem count simple 541 sl2->word[sl2->n++] = swish_xstrdup( sl1->word[i]);677 sl2->word[sl2->n++] = swish_xstrdup(sl1->word[i]); 542 678 } 543 679 swish_free_stringlist(sl1); … … 546 682 swish_StringList * 547 683 swish_copy_stringlist( 548 swish_StringList * sl684 swish_StringList * sl 549 685 ) 550 686 { … … 553 689 s2 = swish_init_stringlist(); 554 690 s2->word = (xmlChar **)swish_xrealloc(s2->word, sl->n * sizeof(xmlChar *) + 1); 555 for (i=0; i<sl->n; i++) {556 s2->word[i] = swish_xstrdup( sl->word[i]);691 for (i = 0; i < sl->n; i++) { 692 s2->word[i] = swish_xstrdup(sl->word[i]); 557 693 } 558 694 s2->n = sl->n; libswish3/trunk/src/libswish3/swish.c
r2135 r2140 114 114 SWISH_DEBUG += SWISH_DEBUG_WORDLIST; 115 115 } 116 if (swish_string_to_int(getenv("SWISH_DEBUG_TOKENIZER"))) { 117 SWISH_DEBUG += SWISH_DEBUG_TOKENIZER; 118 } 116 119 if (swish_string_to_int(getenv("SWISH_DEBUG_PARSER"))) { 117 120 SWISH_DEBUG += SWISH_DEBUG_PARSER; libswish3/trunk/src/libswish3/words.c
r2103 r2140 20 20 /* word tokenizer(s) */ 21 21 22 #include <libxml/hash.h>23 22 #include <wchar.h> 24 23 #include <string.h> … … 31 30 32 31 extern int SWISH_DEBUG; 32 33 33 static int is_ignore_start_ascii( 34 34 char c … … 41 41 ); 42 42 static int is_ignore_start( 43 wint_t c43 int c 44 44 ); 45 45 static int is_ignore_end( 46 wint_t c46 int c 47 47 ); 48 48 static int is_ignore_word( 49 wint_t c49 int c 50 50 ); 51 51 static int bytes_in_chr( 52 wint_t ch52 int ch 53 53 ); 54 54 static void make_ascii_tables( … … 214 214 static int 215 215 is_ignore_start( 216 wint_t c 216 int c 217 ) 218 { 219 return (!c || iswspace((wint_t)c) || iswcntrl(c) || iswpunct(c) 220 ) 221 ? 1 : 0; 222 223 } 224 225 static int 226 is_ignore_end( 227 int c 217 228 ) 218 229 { … … 220 231 ) 221 232 ? 1 : 0; 222 223 }224 225 static int226 is_ignore_end(227 wint_t c228 )229 {230 return (!c || iswspace(c) || iswcntrl(c) || iswpunct(c)231 )232 ? 1 : 0;233 233 } 234 234 235 235 static int 236 236 is_ignore_word( 237 wint_t c237 int c 238 238 ) 239 239 { … … 256 256 static int 257 257 bytes_in_chr( 258 wint_t ch258 int ch 259 259 ) 260 260 { … … 706 706 break; 707 707 } 708 708 709 } 709 710 libswish3/trunk/src/swish_lint.c
r2131 r2140 55 55 {"debug", required_argument, 0, 'd'}, 56 56 {"help", no_argument, 0, 'h'}, 57 {"tokenize3", no_argument, 0, 't'}, 57 58 {0, 0, 0, 0} 58 59 }; … … 92 93 printf("\tSWISH_DEBUG_NAMEDBUFFER 64\n"); 93 94 printf("Set SWISH_PARSER_WARNINGS=1 to see libxml2 errors and warnings\n"); 95 printf("Set SWISH_WARNINGS=0 to turn off libswish3 warnings\n"); 94 96 printf("stdin headers:\n"); 95 97 printf("\tContent-Length\n"); … … 125 127 swish_debug_docinfo(parser_data->docinfo); 126 128 127 if (SWISH_DEBUG & SWISH_DEBUG_WORDLIST) 129 if (SWISH_DEBUG & SWISH_DEBUG_WORDLIST) { 130 if (parser_data->s3->analyzer->tokenlist) { 131 swish_debug_token_list(parser_data->token_iterator); 132 } 133 else { 128 134 swish_debug_wordlist(parser_data->wordlist); 135 } 136 } 129 137 130 138 if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) { … … 145 153 int option_index; 146 154 int files; 147 int overwrite;148 155 char *etime; 149 156 double start_time; … … 153 160 option_index = 0; 154 161 files = 0; 155 overwrite = 0;156 162 start_time = swish_time_elapsed(); 157 163 s3 = swish_init_swish3(&handler, NULL); 158 164 159 while ((ch = getopt_long(argc, argv, "c:d:f:h ", longopts, &option_index)) != -1) {165 while ((ch = getopt_long(argc, argv, "c:d:f:ht", longopts, &option_index)) != -1) { 160 166 161 167 switch (ch) { … … 184 190 SWISH_DEBUG = swish_string_to_int(optarg); 185 191 break; 186 187 case ' o':188 overwrite= 1;189 break; 190 192 193 case 't': 194 s3->analyzer->tokenlist = 1; 195 break; 196 191 197 case '?': 192 198 case 'h':
