Changeset 2018
- Timestamp:
- 02/13/08 00:48:19 (3 months ago)
- Files:
-
- libswish3/trunk/bindings/perl/3.xs (modified) (5 diffs)
- libswish3/trunk/bindings/perl/lib/SWISH/3.pm (modified) (3 diffs)
- libswish3/trunk/bindings/perl/t/08-handler.t (modified) (1 diff)
- libswish3/trunk/bindings/perl/t/10tokenize.t (modified) (1 diff)
- libswish3/trunk/bindings/perl/xs_helpers.c (modified) (11 diffs)
- libswish3/trunk/src/libswish3/words.c (modified) (7 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/bindings/perl/3.xs
r2015 r2018 23 23 HV* stash; 24 24 HV* analyzer_stash; 25 swish_3* s3; 25 26 26 27 CODE: 27 28 stash = newHV(); 28 RETVAL= swish_init_swish3( &sp_handler, newRV_inc((SV*)stash) );29 RETVAL->ref_cnt = 1;29 s3 = swish_init_swish3( &sp_handler, newRV_inc((SV*)stash) ); 30 s3->ref_cnt = 1; 30 31 31 32 sp_hv_store(stash, DATA_CLASS_KEY, newSVpv(DATA_CLASS, 0)); … … 34 35 sp_hv_store(stash, PARSER_CLASS_KEY, newSVpv(PARSER_CLASS, 0)); 35 36 36 //sp_describe_object( RETVAL->stash);37 //sp_describe_object(newRV_noinc((SV*) RETVAL->stash));38 39 RETVAL->analyzer->ref_cnt = 1;40 RETVAL->analyzer->tokenizer = &sp_tokenize;37 //sp_describe_object(s3->stash); 38 //sp_describe_object(newRV_noinc((SV*)s3->stash)); 39 40 s3->analyzer->ref_cnt = 1; 41 s3->analyzer->tokenizer = &sp_tokenize; 41 42 analyzer_stash = newHV(); 42 RETVAL->analyzer->stash = newRV_inc((SV*)analyzer_stash);43 44 RETVAL->config->ref_cnt = 1;45 RETVAL->parser->ref_cnt = 1;46 47 //SvREFCNT_inc(RETVAL);48 43 s3->analyzer->stash = newRV_inc((SV*)analyzer_stash); 44 45 s3->config->ref_cnt = 1; 46 s3->parser->ref_cnt = 1; 47 48 RETVAL = s3; 49 49 50 OUTPUT: 50 51 RETVAL … … 97 98 CODE: 98 99 file = SvPV(filename, PL_na); 99 //SvREFCNT_inc((SV*)self); 100 101 //warn("parse_file %s", file); 102 103 // TODO self is broken. SV = UNKNOWN. 104 // and yet, handler works... 105 106 //Perl_sv_dump((SV*)self); 107 //sp_describe_object(self->stash); 108 100 109 101 # need to swap return values to make it Perlish 110 RETVAL = swish_parse_file( self, (xmlChar*)file ) ? 0 : 1; 111 112 //SvREFCNT_dec((SV*)self); 102 RETVAL = swish_parse_file( self, (xmlChar*)file ) ? 0 : 1; 113 103 114 104 OUTPUT: … … 126 116 CODE: 127 117 buf = SvPV(buffer, PL_na); 128 SvREFCNT_inc((SV*)self); 129 130 118 131 119 # need to swap return values to make it Perlish 132 120 RETVAL = swish_parse_buffer( self, (xmlChar*)buf ) ? 0 : 1; 133 134 SvREFCNT_dec((SV*)self);135 121 136 122 OUTPUT: … … 527 513 PREINIT: 528 514 char* CLASS; 529 xmlChar* metaname = (xmlChar*)SWISH_DEFAULT_METANAME; 530 xmlChar* context = (xmlChar*)SWISH_DEFAULT_METANAME; 531 unsigned int word_pos = 0; 532 unsigned int offset = 0; 533 xmlChar* buf = (xmlChar*)SvPV(str, PL_na); 534 535 CODE: 536 CLASS = WORDLIST_CLASS; 515 xmlChar* metaname; 516 xmlChar* context; 517 unsigned int word_pos; 518 unsigned int offset; 519 xmlChar* buf; 520 521 CODE: 522 CLASS = WORDLIST_CLASS; 523 metaname = (xmlChar*)SWISH_DEFAULT_METANAME; 524 context = (xmlChar*)SWISH_DEFAULT_METANAME; 525 word_pos = 0; 526 offset = 0; 527 buf = (xmlChar*)SvPV(str, PL_na); 537 528 538 529 // TODO reimplement as hashref arg libswish3/trunk/bindings/perl/lib/SWISH/3.pm
r2015 r2018 21 21 *config = \&get_config; 22 22 *analyzer = \&get_analyzer; 23 *regex = \&get_regex; 24 *parser = \&get_parser; 23 25 24 26 sub new { … … 97 99 print "Doc\n"; 98 100 for my $d ( SWISH_DOC_FIELDS() ) { 99 100 #printf("%15s: %s\n", $d, $data->doc->$d); 101 printf( "%15s: %s\n", $d, $data->doc->$d ); 101 102 } 102 103 103 104 print "WordList\n"; 104 105 while ( my $swishword = $data->wordlist->next ) { 106 print '-' x 50, "\n"; 105 107 for my $w ( SWISH_WORD_FIELDS() ) { 106 107 108 printf( "%15s: %s\n", $w, $swishword->$w ); 108 109 } … … 123 124 config => 'path/to/config.xml', 124 125 handler => \&my_handler, 125 tokenizer=> qr/\w+(?:'\w+)*/,126 regex => qr/\w+(?:'\w+)*/, 126 127 ); 127 128 $swish3->parse( 'path/to/file.xml' ) libswish3/trunk/bindings/perl/t/08-handler.t
r2015 r2018 5 5 ok( my $s3 = SWISH::3->new(), "new parser" ); 6 6 ok( $s3->parse("t/test.html"), "parse HTML" ); 7 ok( $s3->parse("t/test.xml"), "parse XML" );7 ok( $s3->parse("t/test.xml"), "parse XML" ); 8 8 libswish3/trunk/bindings/perl/t/10tokenize.t
r2015 r2018 9 9 ok( my $wlist = $analyzer->tokenize( 10 10 "now is the time, ain't it? or when else might it be!", 11 5, 14, 'foo', 'bar'11 14, 5, 'foo', 'bar' 12 12 ), 13 13 "wordlist" libswish3/trunk/bindings/perl/xs_helpers.c
r2015 r2018 37 37 static void sp_token_handler( swish_Token *token ); 38 38 static void sp_SV_is_qr( SV *qr ); 39 static void sp_debug_token( swish_Token *token ); 39 40 40 41 static void … … 398 399 the Perl code. 399 400 */ 400 void401 static void 401 402 sp_handler( swish_ParseData* parse_data ) 402 403 { … … 435 436 } 436 437 437 438 static void 438 439 sp_call_token_handler( swish_Token *token, SV *method ) 439 440 { … … 452 453 453 454 /* this regex wizardry cribbed from KS - thanks Marvin! */ 454 s wish_WordList *455 static swish_WordList * 455 456 sp_tokenize(swish_Analyzer* analyzer, xmlChar* str, ...) 456 457 { … … 472 473 va_list args; 473 474 va_start(args, str); 474 wpos= va_arg(args, unsigned int);475 offset= va_arg(args, unsigned int);475 offset = va_arg(args, unsigned int); 476 wpos = va_arg(args, unsigned int); 476 477 meta = va_arg(args, xmlChar *); 477 478 ctxt = va_arg(args, xmlChar *); … … 525 526 526 527 527 // TODO 5.10 API528 529 528 while ( pregexec(rx, (char*)str, (char*)str_end, (char*)str, 1, wrapper, 1) ) 530 529 { 531 xmlChar* start_ptr = str + rx->startp[0];532 xmlChar* end_ptr = str + rx->endp[0];533 530 int start, end, tok_bytes, tok_pts; 531 xmlChar* start_ptr; 532 xmlChar* end_ptr; 533 534 #if ((PERL_VERSION > 9) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) 535 start_ptr = str + rx->offs[0].start; 536 end_ptr = str + rx->offs[0].end; 537 #else 538 start_ptr = str + rx->startp[0]; 539 end_ptr = str + rx->endp[0]; 540 #endif 541 534 542 535 543 /* get start and end offsets in Unicode code points */ … … 560 568 s3_token->end = end; 561 569 s3_token->wpos = ++wpos; 562 570 /* increment for next iteration */ 571 s3_token->offset += tok_bytes; // TODO this isn't any better than libswish3 algorithm 563 572 564 573 if (token_handler) { … … 568 577 } 569 578 570 /* increment for next iteration */ 571 s3_token->offset += tok_bytes; 579 572 580 } 573 581 … … 580 588 default token handler is just to append to WordList 581 589 */ 582 void590 static void 583 591 sp_token_handler( swish_Token *token ) 584 592 { … … 592 600 593 601 /* TODO: lc() and stem() */ 602 if (SWISH_DEBUG == SWISH_DEBUG_TOKENIZER) 603 sp_debug_token( token ); 594 604 595 605 swish_add_to_wordlist_len( token->list, … … 603 613 604 614 } 615 616 static void 617 sp_debug_token( swish_Token *token ) 618 { 619 warn("-------------------------------------\n"); 620 warn("start_ptr = %s\n", token->start_ptr); 621 warn("tok_bytes = %d\n", token->tok_bytes); 622 warn("meta = %s\n", token->meta); 623 warn("ctxt = %s\n", token->ctxt); 624 warn("wpos = %d\n", token->wpos); 625 warn("offset = %d\n", token->offset); 626 warn("start = %d\n", token->start); 627 warn("end = %d\n", token->end); 628 } libswish3/trunk/src/libswish3/words.c
r1952 r2018 518 518 519 519 520 for (i = 0; str[i] != NULL; i++)520 for (i = 0; str[i] != '\0'; i++) 521 521 { 522 522 c = (int) tolower(str[i]); … … 552 552 553 553 /* add NULL */ 554 word[w] = NULL;554 word[w] = '\0'; 555 555 wl = strip_ascii_chars(word, w); 556 556 … … 594 594 595 595 /* end the word if we've reached our limit or the end of the string */ 596 if (w >= analyzer->maxwordlen || nextc == NULL)596 if (w >= analyzer->maxwordlen || nextc == '\0') 597 597 { 598 598 … … 604 604 in_word = 0; 605 605 606 word[w] = NULL;606 word[w] = '\0'; 607 607 wl = strip_ascii_chars(word, w); 608 608 … … 783 783 if (!ascii_end_table[word[i]]) 784 784 { 785 word[i] = NULL;785 word[i] = '\0'; 786 786 end++; 787 787 } … … 818 818 819 819 /* Add the NULL */ 820 word[j] = NULL;820 word[j] = '\0'; 821 821 } 822 822 … … 939 939 { 940 940 SWISH_DEBUG_MSG(" ---------- WORD --------- "); 941 SWISH_DEBUG_MSG(" word : %s", list->current->word);942 SWISH_DEBUG_MSG(" meta: %s", list->current->metaname);943 SWISH_DEBUG_MSG(" context: %s", list->current->context);944 SWISH_DEBUG_MSG(" pos: %d", list->current->position);945 SWISH_DEBUG_MSG("soffset : %d", list->current->start_offset);946 SWISH_DEBUG_MSG("eoffset : %d", list->current->end_offset);941 SWISH_DEBUG_MSG(" word : %s", list->current->word); 942 SWISH_DEBUG_MSG(" meta : %s", list->current->metaname); 943 SWISH_DEBUG_MSG("context : %s", list->current->context); 944 SWISH_DEBUG_MSG(" pos : %d", list->current->position); 945 SWISH_DEBUG_MSG("soffset : %d", list->current->start_offset); 946 SWISH_DEBUG_MSG("eoffset : %d", list->current->end_offset); 947 947 948 948 list->current = list->current->next;
