Changeset 1930
- Timestamp:
- 04/30/07 23:08:43 (1 year ago)
- Files:
-
- libswish3/trunk/bindings/perl/3.xs (modified) (12 diffs)
- libswish3/trunk/bindings/perl/typemap (modified) (1 diff)
- libswish3/trunk/src/libswish3/Makefile.am (modified) (1 diff)
- libswish3/trunk/src/libswish3/libswish3.h (modified) (12 diffs)
- libswish3/trunk/src/libswish3/mem.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/namedbuffer.c (added)
- libswish3/trunk/src/libswish3/parser.c (modified) (23 diffs)
- libswish3/trunk/src/libswish3/properties.c (deleted)
- libswish3/trunk/src/libswish3/words.c (modified) (8 diffs)
- libswish3/trunk/src/swish_lint.c (modified) (1 diff)
- libswish3/trunk/src/swish_words.c (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/bindings/perl/3.xs
r1929 r1930 72 72 */ 73 73 74 /* private package vars*/74 /* all XS stuff is prefixed with 'sp_' for Swish Perl */ 75 75 76 76 #define DEFAULT_BASE_CLASS "SWISH::3::Parser" 77 77 #define CONFIG_CLASS "SWISH::3::Config" 78 78 #define ANALYZER_CLASS "SWISH::3::Analyzer" 79 #define CONFIG_KEY " config"80 #define ANALYZER_KEY " analyzer"81 #define HANDLER_KEY " handler"79 #define CONFIG_KEY "sp_config" 80 #define ANALYZER_KEY "sp_analyzer" 81 #define HANDLER_KEY "sp_handler" 82 82 83 83 … … 340 340 swish_debug_docinfo( parse_data->docinfo ); 341 341 swish_debug_wordlist( parse_data->wordlist ); 342 swish_debug_PropHash( parse_data->propHash ); 342 swish_debug_nb( parse_data->properties, "Property" ); 343 swish_debug_nb( parse_data->metanames, "MetaName" ); 343 344 warn("\n"); 344 345 } … … 377 378 SV *token_re; 378 379 swish_WordList *list; 380 swish_WordList *(*token_handler)(xmlChar * start_ptr, int tok_bytes, int start, int end, ...); 379 381 va_list args; 380 382 va_start(args, str); … … 392 394 xmlChar *str_end = str_start + str_len; 393 395 394 token_re = analyzer->regex; /* TODO is this right ?? */ 396 token_re = analyzer->regex; 397 token_handler = analyzer->stash; 395 398 396 399 /* extract regexp struct from qr// entity */ … … 408 411 SvREADONLY_on(wrapper); 409 412 SvLEN(wrapper) = 0; 410 SvUTF8_on(wrapper); /* do UTF8 matching -- TODO conditional on swish_is_ascii() ??*/413 SvUTF8_on(wrapper); /* do UTF8 matching -- we trust str is already utf-8 encoded. */ 411 414 412 415 /* wrap the string in an SV to please the regex engine */ … … 415 418 SvPOK_on(wrapper); 416 419 417 list = swish_init_ WordList();420 list = swish_init_wordlist(); 418 421 num_code_points = 0; 419 422 … … 423 426 xmlChar * end_ptr = str + rx->endp[0]; 424 427 int start, end, tok_bytes, tok_pts; 425 xmlChar * token;426 428 427 429 /* get start and end offsets in Unicode code points */ … … 446 448 tok_pts = end - start; 447 449 tok_bytes = end_ptr - start_ptr; 448 449 /* TODO lc() ? */ 450 451 if (tok_pts < analyzer->minwordlen) 452 continue; 450 451 (*token_handler)( start_ptr, 452 tok_bytes, 453 start, 454 end, 455 meta, 456 ctxt, 457 ++wpos, 458 (tok_bytes + offset - 1), 459 analyzer, 460 list 461 ); 462 463 } 464 465 return list; 466 } 467 468 /* 469 default token handler is just to append to WordList 470 */ 471 swish_WordList* 472 sp_token_handler(xmlChar * start_ptr, int tok_bytes, int start, int end, ...) 473 { 474 xmlChar *meta, *ctxt; 475 int wpos, offset; 476 swish_Analyzer *analyzer; 477 swish_WordList *list; 478 va_list args; 479 va_start(args, end); 480 meta = va_arg(args, xmlChar *); 481 ctxt = va_arg(args, xmlChar *); 482 wpos = va_arg(args, unsigned int); 483 offset = va_arg(args, unsigned int); 484 analyzer = va_arg(args, swish_Analyzer*); 485 list = va_arg(args, swish_WordList*); 486 va_end(args); 487 488 if ((end - start) < analyzer->minwordlen) 489 return list; 453 490 454 if (tok_pts > analyzer->maxwordlen) 455 continue; 456 457 token = xmlStrndup(start_ptr, tok_bytes); 458 swish_add_to_wordlist( list, token, meta, ctxt, ++wpos, (tok_bytes + offset - 1) ); 459 460 if (SWISH_DEBUG) 461 { 462 warn("%s (%d %d)\n", token, start + 1, end); 463 } 464 465 free(token); 466 } 491 if ((end - start) > analyzer->maxwordlen) 492 return list; 493 494 495 /* TODO: lc() and stem() */ 496 497 swish_add_to_wordlist_len( list, 498 start_ptr, 499 tok_bytes, 500 meta, 501 ctxt, 502 wpos, 503 offset 504 ); 467 505 468 506 return list; 469 507 } 470 471 508 472 509 /******************************************************************************* … … 980 1017 if (self->ref_cnt < 1) 981 1018 { 982 swish_free_ WordList(self);1019 swish_free_wordlist(self); 983 1020 } 984 1021 … … 1030 1067 1031 1068 CODE: 1032 buf = xmlHashLookup(self->prop Hash,p);1069 buf = xmlHashLookup(self->properties->hash, p); 1033 1070 RETVAL = newSVpvn((char*)xmlBufferContent(buf), xmlBufferLength(buf)); 1034 1071 … … 1210 1247 RETVAL->regex = (void*)SvREFCNT_inc( regex ); 1211 1248 RETVAL->tokenizer = &sp_tokenize; 1249 RETVAL->stash = &sp_token_handler; 1212 1250 1213 1251 OUTPUT: … … 1296 1334 # TODO: get/set methods, including way to set tokenizer func ref 1297 1335 1336 1337 # tokenize_isw() uses native libswish3 tokenizer 1338 swish_WordList * 1339 tokenize_isw(self, str, ...) 1340 SV * self; 1341 SV * str; 1342 1343 PREINIT: 1344 char * CLASS; 1345 xmlChar * metaname = SWISH_DEFAULT_METANAME; 1346 xmlChar * context = SWISH_DEFAULT_METANAME; 1347 unsigned int word_pos = 0; 1348 unsigned int offset = 0; 1349 xmlChar * buf = SvPV(str, PL_na); 1350 1351 CODE: 1352 CLASS = sp_which_class("WordList"); 1353 1354 if (!SvUTF8(str)) 1355 { 1356 if (swish_is_ascii(buf)) 1357 SvUTF8_on(str); /* flags original SV ?? */ 1358 else 1359 croak("%s is not flagged as a UTF-8 string and is not ASCII", buf); 1360 } 1361 1362 if ( items > 2 ) 1363 { 1364 word_pos = (int)SvIV(ST(2)); 1365 1366 if ( items > 3 ) 1367 offset = (int)SvIV(ST(3)); 1368 1369 if ( items > 4 ) 1370 metaname = SvPV(ST(4), PL_na); 1371 1372 if ( items > 5 ) 1373 context = SvPV(ST(5), PL_na); 1374 1375 } 1376 1377 swish_init_words(); /* in case it wasn't initialized elsewhere... */ 1378 RETVAL = swish_tokenize( 1379 (swish_Analyzer*)sp_ptr_from_object(self), 1380 buf, 1381 word_pos, 1382 offset, 1383 metaname, 1384 context 1385 ); 1386 1387 RETVAL->ref_cnt++; 1388 1389 /* TODO do we need to worry about free()ing metaname and context ?? */ 1390 1391 OUTPUT: 1392 RETVAL 1393 libswish3/trunk/bindings/perl/typemap
r1927 r1930 11 11 swish_Analyzer * O_OBJECT 12 12 swish_Parser * O_OBJECT 13 swish_NamedBuffer * O_OBJECT 13 14 14 15 INPUT libswish3/trunk/src/libswish3/Makefile.am
r1927 r1930 20 20 mime_types.c \ 21 21 parser.c \ 22 properties.c \22 namedbuffer.c \ 23 23 string.c \ 24 24 times.c \ libswish3/trunk/src/libswish3/libswish3.h
r1928 r1930 52 52 #define SWISH_INCLUDE_FILE "IncludeConfigFile" 53 53 #define SWISH_PROP "PropertyNames" 54 #define SWISH_PROP_ASIS "nostripchars" 54 55 #define SWISH_PROP_MAX "PropertyNamesMaxLength" 55 56 #define SWISH_PROP_SORT "PropertyNamesSortKeyLength" … … 129 130 #define SWISH_DEBUG_TOKENIZER 5 130 131 #define SWISH_DEBUG_PARSER 9 132 #define SWISH_DEBUG_NAMEDBUFFER 15 131 133 132 134 #ifdef __cplusplus … … 170 172 void swish_mem_debug(); 171 173 xmlChar * swish_xstrdup( const xmlChar * ptr ); 174 xmlChar * swish_xstrndup( const xmlChar * ptr, int len ); 172 175 173 176 /* time functions */ … … 205 208 struct swish_Config 206 209 { 207 int ref_cnt; /* for scripting languages */208 void * stash; /* alsofor scripting languages */210 int ref_cnt; /* for scripting languages */ 211 void *stash; /* for scripting languages */ 209 212 xmlHashTablePtr conf; /* the meat */ 210 213 }; … … 259 262 typedef struct swish_Analyzer swish_Analyzer; 260 263 typedef struct swish_Parser swish_Parser; 264 typedef struct swish_NamedBuffer swish_NamedBuffer; 265 266 struct swish_NamedBuffer 267 { 268 int ref_cnt; /* for scripting languages */ 269 void *stash; /* for scripting languages */ 270 xmlHashTablePtr hash; /* the meat */ 271 }; 261 272 262 273 struct swish_DocInfo … … 299 310 xmlChar *name; 300 311 struct swish_Tag *next; 301 unsigned int n;312 unsigned int n; 302 313 }; 303 314 … … 321 332 void *stash; // for script bindings 322 333 void *regex; // optional regex 323 int ref_cnt; // for script bindings334 int ref_cnt; // for script bindings 324 335 }; 325 336 … … 336 347 struct swish_ParseData 337 348 { 338 xmlBufferPtr buf_ptr; // t extbuffer339 xmlBufferPtr prop_buf; // Property buffer349 xmlBufferPtr buf_ptr; // tmp text (MetaName) buffer 350 xmlBufferPtr prop_buf; // tmp Property buffer 340 351 xmlChar *tag; // current tag name 341 352 swish_DocInfo *docinfo; // document-specific properties 342 353 swish_Config *config; // global config 354 unsigned int context_as_meta; // index tokens under all applicable MetaNames 343 355 unsigned int no_index; // toggle flag for special comments 344 356 unsigned int is_html; // shortcut flag for html parser … … 348 360 swish_TagStack *metastack; // stacks for tracking the tag => metaname 349 361 swish_TagStack *propstack; // stacks for tracking the tag => property 350 xmlParserCtxtPtr ctxt; 362 xmlParserCtxtPtr ctxt; // so we can free at end 351 363 swish_WordList *wordlist; // linked list of words 352 xmlHashTablePtr propHash; // hash of Props, one for each property 364 swish_NamedBuffer *properties; // buffer all properties 365 swish_NamedBuffer *metanames; // buffer all metanames 353 366 swish_Analyzer *analyzer; // Analyzer struct 354 void *stash; // for script bindings367 void *stash; // for script bindings 355 368 }; 356 369 … … 374 387 375 388 376 /* utility buffers */377 void swish_append_buffer(xmlBufferPtr buf, const xmlChar * txt, int txtlen);378 379 380 381 389 /* word functions */ 382 390 void swish_init_words(); 383 swish_WordList * swish_init_ WordList();384 void swish_free_ WordList(swish_WordList * list);391 swish_WordList * swish_init_wordlist(); 392 void swish_free_wordlist(swish_WordList * list); 385 393 swish_WordList * swish_tokenize( swish_Analyzer * analyzer, xmlChar * str, ... ); 386 394 … … 418 426 int word_pos, 419 427 int offset ); 428 429 int swish_add_to_wordlist_len( 430 swish_WordList * list, 431 xmlChar * str, 432 int len, 433 xmlChar * metaname, 434 xmlChar * context, 435 int word_pos, 436 int offset ); 420 437 421 438 void swish_debug_wordlist( swish_WordList * list ); … … 429 446 void swish_free_docinfo( swish_DocInfo * ptr ); 430 447 int swish_check_docinfo(swish_DocInfo * docinfo, swish_Config * config); 431 int swish_docinfo_from_filesystem( xmlChar *filename, swish_DocInfo * i, swish_ParseData *parse_data ); 448 int swish_docinfo_from_filesystem( xmlChar *filename, 449 swish_DocInfo * i, 450 swish_ParseData *parse_data ); 432 451 void swish_debug_docinfo( swish_DocInfo * docinfo ); 433 452 434 453 435 /* Property functions */ 436 xmlHashTablePtr swish_init_PropHash( swish_Config * config); 437 void swish_free_PropHash( xmlHashTablePtr prophash); 438 void swish_debug_PropHash(xmlHashTablePtr propHash); 454 /* NamedBuffer functions */ 455 456 swish_NamedBuffer * swish_init_nb( swish_Config * config, xmlChar * configKey ); 457 void swish_free_nb( swish_NamedBuffer * nb ); 458 void swish_debug_nb( swish_NamedBuffer * nb, xmlChar * label ); 459 void swish_add_buf_to_nb( swish_NamedBuffer *nb, 460 xmlChar * name, 461 xmlBufferPtr buf, 462 xmlChar * joiner, 463 int cleanwsp, 464 int autovivify); 465 void swish_add_str_to_nb( swish_NamedBuffer * nb, 466 xmlChar * name, 467 xmlChar * str, 468 unsigned int len, 469 xmlChar * joiner, 470 int cleanwsp, 471 int autovivify); 472 void swish_append_buffer( xmlBufferPtr buf, xmlChar * txt, int len ); 473 439 474 440 475 libswish3/trunk/src/libswish3/mem.c
r1927 r1930 76 76 } 77 77 78 xmlChar * swish_xstrndup( const xmlChar * ptr, int len ) 79 { 80 memcount++; 81 if ( SWISH_DEBUG > 20 ) 82 swish_debug_msg( "memcount = %ld", memcount); 83 return( xmlStrndup( ptr, len ) ); 84 } 85 78 86 void swish_xfree( void *ptr ) 79 87 { libswish3/trunk/src/libswish3/parser.c
r1928 r1930 64 64 static void get_env_vars(); 65 65 66 static void flush_buffer( swish_ParseData * parse_data, xmlChar * metaname);67 static void add_to_prop_buf(xmlBufferPtr buf_ptr, 68 xmlHashTablePtr propHash,69 xmlChar * propName); 66 static void flush_buffer( swish_ParseData * parse_data, 67 xmlChar * metaname, xmlChar * context 68 ); 69 70 70 static void tokenize( swish_ParseData * parse_data, 71 71 xmlChar * string, … … 186 186 build_tag(swish_ParseData * parse_data, xmlChar * tag, xmlChar ** atts) 187 187 { 188 int i, is_html_tag;189 xmlChar *swishtag, *alias, *metaname, *metacontent;190 191 metaname = NULL;188 int i, is_html_tag; 189 xmlChar *swishtag, *alias, *metaname, *metacontent; 190 191 metaname = NULL; 192 192 metacontent = NULL; 193 193 194 194 /* normalize all tags */ 195 195 swishtag = swish_str_tolower(tag); 196 197 196 198 197 /* html tags */ … … 278 277 279 278 280 if (SWISH_DEBUG > 2)281 { 282 fprintf(stderr, " >>> startElement(%s (%s) ", tag, parse_data->tag);279 if (SWISH_DEBUG == SWISH_DEBUG_PARSER) 280 { 281 fprintf(stderr, " >>> build_tag (%s (%s) ", tag, parse_data->tag); 283 282 if (atts != 0) 284 283 { … … 293 292 } 294 293 fprintf(stderr, ")\n"); 295 296 294 } 297 295 298 296 299 297 /* change our internal name for this tag if it is aliased in config */ 300 alias = swish_get_config_value(parse_data->config, (xmlChar*)SWISH_ALIAS, parse_data->tag);298 alias = swish_get_config_value(parse_data->config, (xmlChar*)SWISH_ALIAS, swishtag); 301 299 if (alias) 302 300 { 301 //swish_debug_msg("%s alias -> %s", swishtag, alias); 303 302 swish_xfree(swishtag); 304 303 swishtag = swish_xstrdup(alias); … … 309 308 } 310 309 311 void312 swish_append_buffer(xmlBufferPtr buf, const xmlChar * txt, int txtlen)313 {314 int ret;315 316 if (txtlen == 0)317 /* shouldn't happen */318 return;319 320 if (buf == NULL)321 {322 swish_fatal_err("bad news. buf ptr is NULL");323 324 }325 326 ret = xmlBufferAdd( buf, txt, txtlen );327 if (ret)328 {329 swish_fatal_err("problem adding \n>>%s<<\n length %d to buffer. Err: %d",330 txt, txtlen, ret);331 }332 333 }334 310 335 311 static void 336 add_to_prop_buf(xmlBufferPtr buf, xmlHashTablePtr propHash, xmlChar * propName) 337 { 338 339 xmlChar * nowhitesp; 340 xmlBufferPtr propBuf = xmlHashLookup(propHash, propName); 341 342 if (propBuf && xmlBufferLength(propBuf)) 343 { 344 /* swish_debug_msg("adding %s to propBuf", propName); */ 345 346 /* if the propBuf already exists and we're about to add more, append the 347 * connect string */ 348 if (xmlBufferLength(buf)) 349 { 350 swish_append_buffer(propBuf, 351 (const xmlChar *) SWISH_PROP_CONNECTOR, 352 xmlStrlen((xmlChar *) SWISH_PROP_CONNECTOR)); 353 } 354 355 nowhitesp = swish_str_skip_ws((xmlChar *)xmlBufferContent(buf)); 356 swish_str_trim_ws(nowhitesp); 357 358 swish_append_buffer(propBuf, (const xmlChar *) nowhitesp, xmlStrlen(nowhitesp)); 359 } 360 361 } 362 363 static void 364 flush_buffer(swish_ParseData * parse_data, xmlChar * metaname) 365 { 366 312 flush_buffer(swish_ParseData * parse_data, xmlChar * metaname, xmlChar * context) 313 { 314 swish_TagStack *s = parse_data->metastack; 315 367 316 if (SWISH_DEBUG > 10) 368 317 swish_debug_msg("buffer is >>%s<< before flush, word_pos = %d", … … 375 324 if (parse_data->word_pos) 376 325 parse_data->word_pos++; 377 378 379 tokenize( parse_data, 380 (xmlChar *)xmlBufferContent(parse_data->buf_ptr), 381 xmlBufferLength(parse_data->buf_ptr), 382 parse_data->metastack->head->name, 383 metaname 326 327 /* add buf_ptr as-is to metanames buffer under current tag. 328 this gives us both tokens and raw text de-tagged but organized by metaname. 329 */ 330 swish_add_buf_to_nb( parse_data->metanames, 331 metaname, 332 parse_data->buf_ptr, '\0', 0, 1); 333 334 if (parse_data->context_as_meta) 335 { 336 for (s->temp = s->head; s->temp != NULL; s->temp = s->temp->next) 337 { 338 if (xmlStrEqual(s->temp->name, metaname)) /* just added above */ 339 continue; 340 341 swish_add_buf_to_nb(parse_data->metanames, 342 s->temp->name, 343 parse_data->buf_ptr, '\0', 0, 1); 344 } 345 } 346 347 if (parse_data->analyzer->tokenize) 348 { 349 350 tokenize( parse_data, 351 (xmlChar *)xmlBufferContent(parse_data->buf_ptr), 352 xmlBufferLength(parse_data->buf_ptr), 353 metaname, 354 context 384 355 ); 356 } 385 357 386 358 xmlBufferEmpty(parse_data->buf_ptr); … … 409 381 swish_debug_msg("endDocument()"); 410 382 411 flush_buffer(parse_data, NULL); /* whatever's left */ 383 /* whatever's left */ 384 flush_buffer(parse_data, (xmlChar*)SWISH_DEFAULT_METANAME, (xmlChar*)SWISH_DEFAULT_METANAME); 412 385 413 386 } … … 466 439 467 440 468 469 if (SWISH_DEBUG > 8) 441 if (SWISH_DEBUG == SWISH_DEBUG_PARSER) 470 442 swish_debug_msg("checking config for '%s' in watched tags", parse_data->tag); 471 443 472 444 473 445 /* set property if this tag is configured for it */ 474 if (swish_config_value_exists(parse_data->config, (xmlChar *)SWISH_PROP, parse_data->tag))475 { 476 if (SWISH_DEBUG > 8)446 if (swish_config_value_exists(parse_data->config, (xmlChar*)SWISH_PROP, parse_data->tag)) 447 { 448 if (SWISH_DEBUG == SWISH_DEBUG_PARSER) 477 449 swish_debug_msg(" %s = new property", parse_data->tag); 478 450 … … 486 458 487 459 /* likewise for metastack */ 488 if (swish_config_value_exists(parse_data->config, (xmlChar *)SWISH_META, parse_data->tag))489 { 490 if (SWISH_DEBUG > 8)460 if (swish_config_value_exists(parse_data->config, (xmlChar*)SWISH_META, parse_data->tag)) 461 { 462 if (SWISH_DEBUG == SWISH_DEBUG_PARSER) 491 463 swish_debug_msg(" %s = new metaname", parse_data->tag); 492 493 flush_buffer( parse_data, NULL);464 465 flush_buffer( parse_data, parse_data->metastack->head->name, parse_data->metastack->flat ); 494 466 495 467 parse_data->metastack = push_tag_stack(parse_data->metastack, parse_data->tag); 496 468 } 497 469 498 if (SWISH_DEBUG > 8)470 if (SWISH_DEBUG == SWISH_DEBUG_PARSER) 499 471 swish_debug_msg("config check for '%s' done", parse_data->tag); 500 472 … … 505 477 close_tag(void *data, const xmlChar * tag) 506 478 { 507 xmlChar *metaname;479 xmlChar *context; 508 480 swish_ParseData *parse_data; 509 481 parse_data = (swish_ParseData *) data; … … 519 491 swish_debug_msg(" endElement(%s) (%s)", (xmlChar *) tag, parse_data->tag); 520 492 521 if (( metaname= pop_tag_stack_on_match(parse_data->propstack, parse_data->tag)) != NULL)522 { 523 //swish_debug_msg("popped %s from propstack", parse_data->tag);493 if ((context = pop_tag_stack_on_match(parse_data->propstack, parse_data->tag)) != NULL) 494 { 495 //swish_debug_msg("popped %s from propstack", context); 524 496 add_stack_to_prop_buf(parse_data->tag, parse_data); 525 497 xmlBufferEmpty(parse_data->prop_buf); 526 swish_xfree( metaname);527 } 528 529 if (( metaname= pop_tag_stack_on_match(parse_data->metastack, parse_data->tag)) != NULL)498 swish_xfree(context); 499 } 500 501 if ((context = pop_tag_stack_on_match(parse_data->metastack, parse_data->tag)) != NULL) 530 502 { 531 503 /* swish_debug_msg("popped %s from metastack", parse_data->tag); */ 532 flush_buffer(parse_data, metaname);533 swish_xfree( metaname);504 flush_buffer(parse_data, parse_data->tag, context); 505 swish_xfree(context); 534 506 } 535 507 … … 572 544 573 545 if (parse_data->bump_word && xmlBufferLength(parse_data->prop_buf)) 546 { 547 //swish_debug_msg(" appending ' ' to prop_buf"); 574 548 swish_append_buffer(parse_data->prop_buf, (xmlChar *) " ", 1); 575 549 } 550 551 //swish_debug_msg(" appending '%s' to prop_buf", output); 576 552 swish_append_buffer(parse_data->prop_buf, output, len); 577 553 … … 771 747 772 748 ptr->tag = NULL; 773 ptr->wordlist = swish_init_WordList(); 774 ptr->propHash = swish_init_PropHash(config); 749 ptr->wordlist = swish_init_wordlist(); 750 ptr->properties = swish_init_nb(config, (xmlChar*)SWISH_PROP); 751 ptr->metanames = swish_init_nb(config, (xmlChar*)SWISH_META); 775 752 776 753 /* prime the stacks */ … … 781 758 ptr->metastack->flat = NULL; 782 759 ptr->metastack->count = 0; 783 ptr->metastack = push_tag_stack(ptr->metastack, (xmlChar *)SWISH_DEFAULT_METANAME);760 ptr->metastack = push_tag_stack(ptr->metastack, (xmlChar*)SWISH_DEFAULT_METANAME); 784 761 785 762 ptr->propstack = (swish_TagStack *) swish_xmalloc(sizeof(swish_TagStack)); … … 789 766 ptr->propstack->flat = NULL; 790 767 ptr->propstack->count = 0; 791 ptr->propstack = push_tag_stack(ptr->propstack, (xmlChar *) "_"); /* no such property --792 *just to seed stack */768 ptr->propstack = push_tag_stack(ptr->propstack, (xmlChar*)"_"); 769 /* no such property just to seed stack */ 793 770 794 771 /* gets toggled per-tag */ … … 807 784 ptr->offset = 0; 808 785 786 /* TODO make this configurable */ 787 ptr->context_as_meta = 1; 809 788 810 789 /* pointer to the xmlParserCtxt since we want to free it only after we're … … 857 836 858 837 if (SWISH_DEBUG > 9) 859 swish_debug_msg("freeing swish_ParseData propHash"); 860 861 swish_free_PropHash(ptr->propHash); 838 swish_debug_msg("freeing swish_ParseData properties"); 839 840 swish_free_nb(ptr->properties); 841 842 if (SWISH_DEBUG > 9) 843 swish_debug_msg("freeing swish_ParseData metanames"); 844 845 swish_free_nb(ptr->metanames); 846 862 847 863 848 if (SWISH_DEBUG > 9) … … 905 890 swish_debug_msg("free swish_ParseData wordList"); 906 891 907 swish_free_ WordList(ptr->wordlist);892 swish_free_wordlist(ptr->wordlist); 908 893 } 909 894 … … 1672 1657 1673 1658 parse_data->metastack = push_tag_stack( parse_data->metastack, 1674 (xmlChar *)SWISH_DEFAULT_METANAME);1659 (xmlChar*)SWISH_DEFAULT_METANAME); 1675 1660 1676 1661 if (SWISH_DEBUG > 2) … … 1678 1663 1679 1664 chars_to_words(parse_data, buffer, size); 1680 flush_buffer(parse_data, NULL);1665 flush_buffer(parse_data, (xmlChar*)SWISH_DEFAULT_METANAME, (xmlChar*)SWISH_DEFAULT_METANAME); 1681 1666 1682 1667 if (out != NULL) … … 1745 1730 ) 1746 1731 { 1747 1748 if (parse_data->analyzer->tokenize == 0)1749 return;1750 1732 1751 1733 if (len == 0) … … 1796 1778 if (tmplist->nwords == 0) 1797 1779 { 1798 swish_free_ WordList(tmplist);1780 swish_free_wordlist(tmplist); 1799 1781 return; 1800 1782 } … … 1904 1886 add_stack_to_prop_buf(xmlChar * tag, swish_ParseData * parse_data) 1905 1887 { 1906 swish_TagStack *s = parse_data->propstack; 1888 swish_TagStack *s = parse_data->propstack; 1889 int cleanwsp = 1; 1890 xmlHashTablePtr props = swish_subconfig_hash( parse_data->config, (xmlChar*)SWISH_PROP ); 1891 1892 /* should we strip whitespace from this particular property ? */ 1893 if( xmlStrEqual(xmlHashLookup(props, tag), (xmlChar*)SWISH_PROP_ASIS) ) 1894 cleanwsp = 0; 1895 1896 //swish_debug_msg(" add_stack_to_prop_buf: '%s'", xmlBufferContent(parse_data->prop_buf)); 1907 1897 1908 1898 if (tag != NULL) 1909 add_to_prop_buf(parse_data->prop_buf, parse_data->propHash, tag); 1899 swish_add_buf_to_nb(parse_data->properties, 1900 tag, 1901 parse_data->prop_buf, 1902 (xmlChar*)SWISH_PROP_CONNECTOR, 1903 cleanwsp, 0); 1910 1904 1911 1905 for (s->temp = s->head; s->temp != NULL; s->temp = s->temp->next) 1912 1906 { 1913 add_to_prop_buf(parse_data->prop_buf, parse_data->propHash, s->temp->name); 1907 if (xmlStrEqual(s->temp->name, "_")) /* top of the stack is just a placeholder */ 1908 continue; 1909 1910 swish_add_buf_to_nb(parse_data->properties, 1911 s->temp->name, 1912 parse_data->prop_buf, 1913 (xmlChar*)SWISH_PROP_CONNECTOR, 1914 cleanwsp, 0); 1914 1915 } 1915 1916 libswish3/trunk/src/libswish3/words.c
r1927 r1930 42 42 static int bytes_in_chr(wint_t c); 43 43 static void make_ascii_tables(); 44 44 static int add_to_wordlist( 45 swish_WordList * list, 46 xmlChar * word, 47 int len, 48 xmlChar * metaname, 49 xmlChar * context, 50 int word_pos, 51 int offset 52 ); 45 53 46 54 static int initialized = 0; … … 60 68 61 69 swish_WordList * 62 swish_init_ WordList()70 swish_init_wordlist() 63 71 { 64 72 swish_WordList *wl = (swish_WordList *) swish_xmalloc(sizeof(swish_WordList)); … … 72 80 73 81 void 74 swish_free_ WordList(swish_WordList * list)82 swish_free_wordlist(swish_WordList * list) 75 83 { 76 84 swish_Word *t; … … 133 141 { 134 142 135 swish_WordList *list = swish_init_ WordList();143 swish_WordList *list = swish_init_wordlist(); 136 144 137 145 … … 270 278 271 279 int byte_count = 0; 272 swish_WordList *list = swish_init_ WordList();280 swish_WordList *list = swish_init_wordlist(); 273 281 xmlChar * utf8_str; 274 282 … … 492 500 char c, nextc, in_word; 493 501 int i, w, wl, byte_count; 494 swish_WordList * list = swish_init_ WordList();502 swish_WordList * list = swish_init_wordlist(); 495 503 xmlChar * word = swish_xmalloc(sizeof(xmlChar*) * analyzer->maxwordlen); 496 504 … … 824 832 * 825 833 ***********************************************/ 834 static int 835 add_to_wordlist( 836 swish_WordList * list, 837 xmlChar * word, 838 int len, 839 xmlChar * metaname, 840 xmlChar * context, 841 int word_pos, 842 int offset 843 ) 844 { 845 swish_Word *thisword = (swish_Word *) swish_xmalloc(sizeof(swish_Word)); 846 847 if (SWISH_DEBUG == SWISH_DEBUG_TOKENIZER) 848 { 849 swish_debug_msg(" >>>>>>>>swish_Word<<<<<<<<: %s", word); 850 swish_debug_msg(" --METANAME--: %s", metaname); 851 swish_debug_msg(" --CONTEXT---: %s", context); 852 swish_debug_msg(" --POSITION--: %d", word_pos); 853 swish_debug_msg(" --OFFSET----: %d", offset); 854 swish_debug_msg(" --WORD LEN--: %d", len); 855 } 856 857 /* add to wordlist */ 858 859 thisword->word = word; 860 thisword->position = word_pos; 861 862 if (metaname != NULL) 863 thisword->metaname = swish_xstrdup(metaname); 864 else 865 thisword->metaname = swish_xstrdup((xmlChar*)SWISH_DEFAULT_METANAME); 866 867 if (context != NULL) 868 thisword->context = swish_xstrdup(context); 869 else 870 thisword->context = swish_xstrdup((xmlChar*)SWISH_DEFAULT_METANAME); 871 872 thisword->end_offset = offset; 873 thisword->start_offset = offset - len + 1; /* +1 because want the first byte */ 874 875 /* add thisword to list */ 876 if (list->head == 0) 877 { 878 list->head = thisword; 879 thisword->prev = 0; 880 } 881 else 882 { 883 list->tail->next = thisword; 884 thisword->prev = list->tail; 885 } 886 887 list->tail = thisword; 888 thisword->next = 0; 889 890 /* increment total count */ 891 list->nwords++; 892 893 return len; 894 } 895 826 896 size_t 827 897 swish_add_to_wordlist( … … 835 905 { 836 906 837 swish_Word *thisword = (swish_Word *) swish_xmalloc(sizeof(swish_Word)); 838 size_t len = xmlStrlen(word); 839 840 if (SWISH_DEBUG == SWISH_DEBUG_TOKENIZER) 841 { 842 swish_debug_msg(" >>>>>>>>swish_Word<<<<<<<<: %s", word); 843 swish_debug_msg(" --METANAME--: %s", metaname); 844 swish_debug_msg(" --CONTEXT---: %s", context); 845 swish_debug_msg(" --POSITION--: %d", word_pos); 846 swish_debug_msg(" --OFFSET----: %d", offset); 847 swish_debug_msg(" --WORD LEN--: %d", (int)len); 848
