Changeset 2161 for libswish3/trunk/bindings/perl
- Timestamp: 09/20/08 01:05:54 (4 months ago)
- Files:
- libswish3/trunk/bindings/perl/3.xs (modified) (7 diffs)
- libswish3/trunk/bindings/perl/XS/Analyzer.xs (modified) (3 diffs)
- libswish3/trunk/bindings/perl/XS/Constants.xs (modified) (1 diff)
- libswish3/trunk/bindings/perl/XS/Data.xs (modified) (3 diffs)
- libswish3/trunk/bindings/perl/XS/Token.xs (modified) (1 diff)
- libswish3/trunk/bindings/perl/XS/TokenIterator.xs (added)
- libswish3/trunk/bindings/perl/XS/Word.xs (deleted)
- libswish3/trunk/bindings/perl/XS/WordList.xs (deleted)
- libswish3/trunk/bindings/perl/lib/SWISH/3.pm (modified) (2 diffs)
- libswish3/trunk/bindings/perl/macros.h (modified) (2 diffs)
- libswish3/trunk/bindings/perl/t/06constants.t (modified) (1 diff)
- libswish3/trunk/bindings/perl/t/10tokenize.t (modified) (1 diff)
- libswish3/trunk/bindings/perl/t/15-analyzer.t (modified) (2 diffs)
- libswish3/trunk/bindings/perl/typemap (modified) (2 diffs)
- libswish3/trunk/bindings/perl/xs_helpers.c (modified) (7 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/bindings/perl/3.xs
r2151 r2161 40 40 //sp_describe_object( (SV*)s3->stash ); 41 41 42 // hardcode this till we can get ENV var or similar 43 s3->analyzer->tokenlist = 0; 44 45 if (s3->analyzer->tokenlist) { 46 s3->analyzer->tokenizer = (&sp_tokenize3); 47 } 48 else { 49 s3->analyzer->tokenizer = (&sp_tokenize); 50 } 42 s3->analyzer->tokenizer = (&sp_tokenize3); 43 51 44 s3->analyzer->stash = sp_Stash_new(); 52 45 sp_Stash_set_char( s3->analyzer->stash, SELF_CLASS_KEY, ANALYZER_CLASS ); 46 53 47 s3->config->stash = sp_Stash_new(); 54 48 sp_Stash_set_char( s3->config->stash, SELF_CLASS_KEY, CONFIG_CLASS ); … … 357 351 # utility methods 358 352 353 void 354 describe(self, obj) 355 SV* self; 356 SV* obj; 357 358 CODE: 359 sp_describe_object(obj); 360 361 362 363 359 364 # tokenize() from Perl space uses same C func as tokenizer callback 360 swish_ WordList*365 swish_TokenIterator * 361 366 tokenize(self, str, ...) 362 SV* self;367 swish_3* self; 363 368 SV* str; 364 369 365 370 PREINIT: 366 371 char* CLASS; 367 swish_WordList* list; 368 xmlChar* metaname; 372 swish_TokenIterator* ti; 373 swish_TokenList* tl; 374 swish_MetaName* metaname; 375 xmlChar* meta; 369 376 xmlChar* context; 370 unsigned int word_pos;371 unsigned int offset;372 377 xmlChar* buf; 373 int numtokens;374 375 CODE:376 CLASS = WORDLIST_CLASS;377 list = swish_init_wordlist();378 list->ref_cnt++;379 meta name= (xmlChar*)SWISH_DEFAULT_METANAME;378 379 CODE: 380 CLASS = TOKENITERATOR_CLASS; 381 tl = swish_init_token_list(); 382 ti = swish_init_token_iterator(self->config, tl); 383 ti->ref_cnt++; 384 meta = (xmlChar*)SWISH_DEFAULT_METANAME; 380 385 context = (xmlChar*)SWISH_DEFAULT_METANAME; 381 word_pos = 0;382 offset = 0;383 386 buf = (xmlChar*)SvPV(str, PL_na); 384 387 385 388 // TODO reimplement as hashref arg 386 389 390 // TODO why this check?? 
387 391 if (!SvUTF8(str)) 388 392 { … … 394 398 395 399 if ( items > 2 ) 396 { 397 word_pos = (int)SvIV(ST(2));398 400 { 401 meta = (xmlChar*)SvPV(ST(2), PL_na); 402 399 403 if ( items > 3 ) 400 offset = (int)SvIV(ST(3)); 401 402 if ( items > 4 ) 403 metaname = (xmlChar*)SvPV(ST(4), PL_na); 404 405 if ( items > 5 ) 406 context = (xmlChar*)SvPV(ST(5), PL_na); 407 408 //warn ("word_pos %d offset %d metaname %s context %s\n", word_pos, offset, metaname, context ); 409 410 } 404 context = (xmlChar*)SvPV(ST(3), PL_na); 405 406 //warn ("metaname %s context %s\n", metaname, context ); 407 408 } 409 410 metaname = swish_init_metaname(meta); 411 metaname->ref_cnt++; 411 412 412 numtokens = sp_tokenize( 413 (swish_3*)sp_extract_ptr(self), 414 buf, 415 list, 416 word_pos, 417 offset, 418 metaname, 419 context 420 ); 421 422 RETVAL = list; 423 /* TODO do we need to worry about free()ing metaname and context ?? */ 413 sp_tokenize3( self, buf, tl, metaname, context ); 414 415 RETVAL = ti; 416 424 417 425 418 OUTPUT: … … 428 421 429 422 430 # tokenize_isw() usesnative libswish3 tokenizer431 swish_ WordList*432 tokenize_ isw(self, str, ...)433 SV* self;423 # native libswish3 tokenizer 424 swish_TokenIterator * 425 tokenize_native(self, str, ...) 
426 swish_3* self; 434 427 SV* str; 435 428 436 429 PREINIT: 437 430 char* CLASS; 438 swish_WordList* list; 439 xmlChar* metaname; 431 swish_TokenIterator* ti; 432 swish_TokenList* tl; 433 swish_MetaName* metaname; 434 xmlChar* meta; 440 435 xmlChar* context; 441 unsigned int word_pos;442 unsigned int offset;443 436 xmlChar* buf; 444 int numwords;445 446 CODE:447 CLASS = WORDLIST_CLASS;448 list = swish_init_wordlist();449 list->ref_cnt++;450 meta name= (xmlChar*)SWISH_DEFAULT_METANAME;451 context = (xmlChar*)SWISH_DEFAULT_METANAME;452 word_pos = 0;453 offset = 0;454 buf = (xmlChar*)SvPV(str, PL_na);455 437 438 CODE: 439 CLASS = TOKENITERATOR_CLASS; 440 tl = swish_init_token_list(); 441 ti = swish_init_token_iterator(self->config, tl); 442 ti->ref_cnt++; 443 meta = (xmlChar*)SWISH_DEFAULT_METANAME; 444 context = (xmlChar*)SWISH_DEFAULT_METANAME; 445 buf = (xmlChar*)SvPV(str, PL_na); 446 447 // TODO reimplement as hashref arg 448 456 449 if (!SvUTF8(str)) 457 450 { … … 463 456 464 457 if ( items > 2 ) 465 { 466 word_pos = (int)SvIV(ST(2));467 458 { 459 meta = (xmlChar*)SvPV(ST(2), PL_na); 460 468 461 if ( items > 3 ) 469 offset = (int)SvIV(ST(3)); 470 471 if ( items > 4 ) 472 metaname = (xmlChar*)SvPV(ST(4), PL_na); 473 474 if ( items > 5 ) 475 context = (xmlChar*)SvPV(ST(5), PL_na); 476 477 } 478 479 swish_init_words(); /* in case it wasn't initialized elsewhere... */ 480 numwords = swish_tokenize( 481 (swish_3*)sp_extract_ptr(self), 482 buf, 483 list, 484 word_pos, 485 offset, 486 metaname, 487 context 488 ); 489 490 RETVAL = list; 491 492 /* TODO do we need to worry about free()ing metaname and context ?? 
*/ 462 context = (xmlChar*)SvPV(ST(3), PL_na); 463 464 //warn ("metaname %s context %s\n", metaname, context ); 465 466 } 467 468 metaname = swish_init_metaname(meta); 469 metaname->ref_cnt++; 470 471 swish_tokenize3( self, buf, tl, metaname, context ); 472 473 RETVAL = ti; 474 493 475 494 476 OUTPUT: … … 500 482 INCLUDE: XS/Config.xs 501 483 INCLUDE: XS/Analyzer.xs 502 INCLUDE: XS/WordList.xs503 INCLUDE: XS/Word.xs504 484 INCLUDE: XS/Doc.xs 505 485 INCLUDE: XS/Data.xs … … 511 491 INCLUDE: XS/xml2Hash.xs 512 492 INCLUDE: XS/Token.xs 513 493 INCLUDE: XS/TokenIterator.xs 494 libswish3/trunk/bindings/perl/XS/Analyzer.xs
r2151 r2161 25 25 set_regex = 1 26 26 get_regex = 2 27 set_token_handler = 328 get_token_handler = 429 27 PREINIT: 30 28 SV* stash; … … 46 44 break; 47 45 48 // set token handler49 case 3: sp_Stash_replace(self->stash, TOKEN_HANDLER_KEY, ST(1));50 break;51 52 // get token handler53 case 4: if (!sp_hvref_exists(self->stash, TOKEN_HANDLER_KEY)) {54 croak("no token handler set");55 }56 57 RETVAL = sp_Stash_get(self->stash, TOKEN_HANDLER_KEY);58 break;59 46 60 47 END_SET_OR_GET_SWITCH … … 68 55 CODE: 69 56 self->ref_cnt--; 70 71 if (sp_hvref_exists(self->stash, TOKEN_HANDLER_KEY)) { 72 //warn("token handler set"); 73 } 74 57 75 58 if (SWISH_DEBUG) { 76 59 warn("DESTROYing swish_Analyzer object %s [%d] [ref_cnt = %d]", libswish3/trunk/bindings/perl/XS/Constants.xs
r2045 r2161 22 22 newCONSTSUB(stash, "SWISH_PROP_MTIME", newSVpv(SWISH_PROP_MTIME, 0)); 23 23 newCONSTSUB(stash, "SWISH_PROP_DESCRIPTION",newSVpv(SWISH_PROP_DESCRIPTION, 0)); 24 newCONSTSUB(stash, "SWISH_ PROP_CONNECTOR", newSVpv(SWISH_PROP_CONNECTOR, 0));24 newCONSTSUB(stash, "SWISH_TOKENPOS_BUMPER", newSVpv(SWISH_TOKENPOS_BUMPER, 0)); 25 25 newCONSTSUB(stash, "SWISH_PROP_STRING", newSViv(SWISH_PROP_STRING)); 26 26 newCONSTSUB(stash, "SWISH_PROP_DATE", newSViv(SWISH_PROP_DATE)); libswish3/trunk/bindings/perl/XS/Data.xs
r2151 r2161 104 104 CODE: 105 105 CLASS = DOC_CLASS; 106 self->docinfo->ref_cnt++; 106 107 RETVAL = self->docinfo; 107 108 … … 110 111 111 112 112 swish_ WordList*113 wordlist(self)113 swish_TokenIterator * 114 tokens(self) 114 115 swish_ParserData* self 115 116 … … 118 119 119 120 CODE: 120 CLASS = WORDLIST_CLASS; 121 122 # MUST increment refcnt 2x so that SWISH::3::WordList::DESTROY 123 # does not free it. 124 //self->wordlist->ref_cnt += 2; 125 self->wordlist->ref_cnt++; 126 RETVAL = self->wordlist; 121 CLASS = TOKENITERATOR_CLASS; 122 self->token_iterator->ref_cnt++; // TODO needed? 123 RETVAL = self->token_iterator; 127 124 128 125 OUTPUT: libswish3/trunk/bindings/perl/XS/Token.xs
r2151 r2161 3 3 PROTOTYPES: enable 4 4 5 SV* 6 value (self) 7 swish_Token * self; 8 9 PREINIT: 10 xmlChar *value; 11 12 CODE: 13 value = swish_get_token_value(self); 14 RETVAL = newSVpvn( (char*)value, strlen((char*)value) ); 15 16 OUTPUT: 17 RETVAL 18 19 20 swish_MetaName* 21 meta (self) 22 swish_Token * self; 23 24 PREINIT: 25 char* CLASS; 26 27 CODE: 28 CLASS = METANAME_CLASS; 29 RETVAL = self->meta; 30 31 OUTPUT: 32 RETVAL 33 34 SV* 35 context (self) 36 swish_Token * self; 37 CODE: 38 RETVAL = newSVpvn( (char*)self->context, strlen((char*)self->context) ); 39 40 OUTPUT: 41 RETVAL 42 43 44 SV* 45 pos (self) 46 swish_Token * self; 47 CODE: 48 RETVAL = newSViv( self->pos ); 49 50 OUTPUT: 51 RETVAL 52 53 SV* 54 start_byte (self) 55 swish_Token * self; 56 CODE: 57 RETVAL = newSViv( self->start_byte ); 58 59 OUTPUT: 60 RETVAL 61 62 SV* 63 len(self) 64 swish_Token * self; 65 CODE: 66 RETVAL = newSViv( self->len ); 67 68 OUTPUT: 69 RETVAL 70 71 5 72 void 6 debug(self)7 s p_Token* self;73 DESTROY(self) 74 swish_Token* self 8 75 9 76 CODE: 10 sp_debug_token(self); 77 self->ref_cnt--; 78 79 if (SWISH_DEBUG) { 80 warn("DESTROYing swish_Token object %s [%d] [ref_cnt = %d]", 81 SvPV(ST(0), PL_na), self, self->ref_cnt); 82 } 11 83 84 if (self->ref_cnt < 1) { 85 swish_free_token(self); 86 } 12 87 libswish3/trunk/bindings/perl/lib/SWISH/3.pm
r2151 r2161 19 19 use constant SWISH_DOC_FIELDS => 20 20 qw( mtime size encoding mime uri nwords ext parser ); 21 use constant SWISH_ WORD_FIELDS =>22 qw( word position metaname context start_offset end_offset);21 use constant SWISH_TOKEN_FIELDS => 22 qw( pos meta value context start_byte len ); 23 23 24 24 # load the XS at runtime, since we need $VERSION … … 138 138 while ( my $swishword = $wordlist->next ) { 139 139 print '-' x 50, "\n"; 140 for my $w (SWISH_ WORD_FIELDS) {140 for my $w (SWISH_TOKEN_FIELDS) { 141 141 printf( "%15s: %s\n", $w, $swishword->$w ); 142 142 } libswish3/trunk/bindings/perl/macros.h
r2045 r2161 11 11 #define DATA_CLASS_KEY "sp_data_class" 12 12 #define TOKEN_CLASS "SWISH::3::Token" 13 #define WORDLIST_CLASS "SWISH::3::WordList" 14 #define WORD_CLASS "SWISH::3::Word" 13 #define TOKENITERATOR_CLASS "SWISH::3::TokenIterator" 15 14 #define DOC_CLASS "SWISH::3::Doc" 16 15 #define PROPERTY_CLASS "SWISH::3::Property" … … 25 24 #define TOKENIZER_KEY "sp_tokenizer" 26 25 #define PARSER_KEY "sp_parser" 27 #define TOKEN_HANDLER_KEY "sp_token_handler"28 26 #define SELF_CLASS_KEY "sp_self_class" 29 27 libswish3/trunk/bindings/perl/t/06constants.t
r2019 r2161 5 5 is( SWISH_MIME, 'MIME', SWISH_MIME ); 6 6 is( SWISH_PROP, 'PropertyNames', SWISH_PROP ); 7 is( scalar(SWISH_ WORD_FIELDS), 6, 'SWISH_WORD_FIELDS' );7 is( scalar(SWISH_TOKEN_FIELDS), 6, 'SWISH_TOKEN_FIELDS' ); libswish3/trunk/bindings/perl/t/10tokenize.t
r2151 r2161 4 4 5 5 ok( my $s3 = SWISH::3->new, "new s3" ); 6 ok( my $ wlist= $s3->tokenize(6 ok( my $tokens = $s3->tokenize( 7 7 "now is the time, ain't it? or when else might it be!", 8 14, 5,'foo', 'bar'8 'foo', 'bar' 9 9 ), 10 10 "wordlist" 11 11 ); 12 12 13 ok( $ wlist->isa('SWISH::3::WordList'), 'isa wordlist' );13 ok( $tokens->isa('SWISH::3::TokenIterator'), 'isa TokenIterator' ); 14 14 15 while ( my $swishword = $wlist->next ) { 15 #$s3->describe($tokens); 16 16 17 my $word = $swishword->word; 17 while ( my $token = $tokens->next ) { 18 19 #$s3->describe($token); 20 21 my $word = $token->value; 18 22 if ( $word eq 'now' ) { 19 is( $ swishword->position, 6, "now position" );23 is( $token->pos, 1, "now position" ); 20 24 } 21 25 if ( $word eq 'time' ) { 22 is( $ swishword->position, 9, "time position" );26 is( $token->pos, 4, "time position" ); 23 27 } 24 28 if ( $word eq 'be' ) { 25 is( $ swishword->position, 17, "be position" );29 is( $token->pos, 12, "be position" ); 26 30 } 27 31 28 32 #diag( '=' x 60 ); 29 for my $w (SWISH_ WORD_FIELDS) {33 for my $w (SWISH_TOKEN_FIELDS) { 30 34 31 #diag( sprintf( "%15s: %s\n", $w, $ swishword->$w ) );35 #diag( sprintf( "%15s: %s\n", $w, $token->$w ) ); 32 36 33 37 } 34 38 } 35 39 36 #undef $analyzer;37 40 #undef $wlist; 38 41 #undef $s3; libswish3/trunk/bindings/perl/t/15-analyzer.t
r2151 r2161 1 use Test::More tests => 5;1 use Test::More tests => 3; 2 2 3 3 use SWISH::3; … … 7 7 #ok( my $analyzer = $s3->analyzer, "get analyzer" ); 8 8 9 eval { my $handler = $s3->analyzer->get_token_handler };10 11 ok( $@, "get token handler: $@" );12 13 9 like( 'foo', $s3->analyzer->get_regex, 'get regex' ); 14 15 ok( !$s3->analyzer->set_token_handler( sub { $_[0]->debug } ),16 "set token handler" );17 10 18 11 ok( $s3->tokenize('foo bar baz'), "tokenize" ); libswish3/trunk/bindings/perl/typemap
r2151 r2161 6 6 swish_ParserData* O_OBJECT 7 7 xmlBufferPtr T_IV 8 swish_WordList* O_OBJECT 8 swish_TokenIterator* O_OBJECT 9 swish_Token* O_OBJECT 9 10 swish_DocInfo* O_OBJECT 10 swish_Word* O_OBJECT11 11 swish_Analyzer* O_OBJECT 12 12 swish_Parser* O_OBJECT … … 15 15 swish_Property* O_OBJECT 16 16 swish_MetaName* O_OBJECT 17 sp_Token* O_OBJECT 17 18 18 19 19 INPUT libswish3/trunk/bindings/perl/xs_helpers.c
r2151 r2161 5 5 6 6 /* C code to make writing XS easier */ 7 8 9 // TODO replace this with swish_Token10 typedef struct sp_Token sp_Token;11 12 struct sp_Token13 {14 xmlChar *start_ptr;15 int tok_bytes;16 int start;17 int end;18 xmlChar *meta;19 xmlChar *ctxt;20 unsigned int wpos;21 unsigned int offset;22 swish_Analyzer *analyzer;23 swish_WordList *list;24 };25 26 7 27 8 static AV* sp_hv_keys(HV* hash); … … 55 36 static void sp_test_handler( swish_ParserData* parse_data ); 56 37 static void sp_handler( swish_ParserData* parse_data ); 57 static int sp_tokenize( swish_3* s3, xmlChar* str, ... ); 58 static int sp_tokenize3( swish_3* s3, xmlChar* str, ... ); 59 static void sp_token_handler( sp_Token *token ); 38 static int sp_tokenize3( swish_3 *s3, 39 xmlChar *buf, 40 swish_TokenList * tl, 41 swish_MetaName *meta, 42 xmlChar *context ); 60 43 static void sp_SV_is_qr( SV *qr ); 61 static void sp_debug_token( sp_Token *token );62 44 63 45 /* implement nearly all methods for SWISH::3::Stash, a private class */ … … 599 581 warn("handler called!\n"); 600 582 swish_debug_docinfo( parse_data->docinfo ); 601 swish_debug_ wordlist( parse_data->wordlist);583 swish_debug_token_list( parse_data->token_iterator ); 602 584 swish_debug_nb( parse_data->properties, (xmlChar*)"Property" ); 603 585 swish_debug_nb( parse_data->metanames, (xmlChar*)"MetaName" ); … … 647 629 } 648 630 649 static void650 sp_call_token_handler( sp_Token *token, SV *method )651 {652 dTHX;653 dSP;654 655 SV* obj;656 obj = sp_bless_ptr( TOKEN_CLASS, (IV)token );657 658 PUSHMARK(SP);659 XPUSHs(obj);660 PUTBACK;661 662 call_sv(method, G_DISCARD);663 }664 665 static int666 sp_tokenize3(swish_3* s3, xmlChar *str, ...)667 {668 669 670 return 0;671 }672 673 631 /* this regex wizardry cribbed from KS - thanks Marvin! */ 674 632 static int 675 sp_tokenize(swish_3* s3, xmlChar* str, ...) 
676 { 677 dTHX; 678 679 unsigned int wpos, offset, num_code_points; 680 swish_WordList *list; 681 sp_Token *s3_token; 633 sp_tokenize3( 634 swish_3 *s3, 635 xmlChar *buf, 636 swish_TokenList *tl, 637 swish_MetaName *meta, 638 xmlChar *context 639 ) 640 { 641 dTHX; 642 643 /* declare */ 644 unsigned int num_tokens; 682 645 MAGIC *mg; 683 646 REGEXP *rx; … … 686 649 int str_len; 687 650 xmlChar *str_end; 688 xmlChar *meta, *ctxt;689 651 SV *token_re; 690 SV *token_handler; 691 692 va_list args; 693 va_start(args, str); 694 list = va_arg(args, swish_WordList*); 695 offset = va_arg(args, unsigned int); 696 wpos = va_arg(args, unsigned int); 697 meta = va_arg(args, xmlChar *); 698 ctxt = va_arg(args, xmlChar *); 699 va_end(args); 700 701 //warn("wpos %d offset %d meta %s ctxt %s\n", wpos, offset, meta, ctxt); 702 703 s3_token = swish_xmalloc(sizeof(sp_Token)); 652 653 /* initialize */ 654 num_tokens = 0; 704 655 mg = NULL; 705 656 rx = NULL; 706 657 wrapper = sv_newmortal(); 707 str_start = str;708 str_len = strlen((char*) str);658 str_start = buf; 659 str_len = strlen((char*)buf); 709 660 str_end = str_start + str_len; 710 661 token_re = s3->analyzer->regex; 711 token_handler = sp_hvref_exists( s3->analyzer->stash, TOKEN_HANDLER_KEY ) 712 ? sp_hvref_fetch( s3->analyzer->stash, TOKEN_HANDLER_KEY ) 713 : NULL; 714 715 716 /* extract regexp struct from qr// entity */ 662 663 664 /* extract regexp struct from qr// entity */ 717 665 if (SvROK(token_re)) { 718 666 SV *sv = SvRV(token_re); … … 725 673 rx = (REGEXP*)mg->mg_obj; 726 674 727 /* fake up an SV wrapper to feed to the regex engine */675 /* fake up an SV wrapper to feed to the regex engine */ 728 676 sv_upgrade(wrapper, SVt_PV); 729 677 SvREADONLY_on(wrapper); … … 731 679 SvUTF8_on(wrapper); /* do UTF8 matching -- we trust str is already utf-8 encoded. 
*/ 732 680 733 /* wrap the string in an SV to please the regex engine */681 /* wrap the string in an SV to please the regex engine */ 734 682 SvPVX(wrapper) = (char*)str_start; 735 683 SvCUR_set(wrapper, str_len); 736 684 SvPOK_on(wrapper); 737 738 num_code_points = 0; 739 740 /* some things remain true for each iteration of regex match */ 741 s3_token->meta = meta; 742 s3_token->ctxt = ctxt; 743 s3_token->analyzer = s3->analyzer; 744 s3_token->list = list; 745 s3_token->offset = offset; // gets incremented 746 747 748 while ( pregexec(rx, (char*)str, (char*)str_end, (char*)str, 1, wrapper, 1) ) 749 { 750 int start, end, tok_bytes, tok_pts; 685 686 while ( pregexec(rx, (char*)buf, (char*)str_end, (char*)buf, 1, wrapper, 1) ) 687 { 688 int token_len; 751 689 xmlChar* start_ptr; 752 690 xmlChar* end_ptr; 753 691 754 692 #if ((PERL_VERSION > 9) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5)) 755 start_ptr = str+ rx->offs[0].start;756 end_ptr = str+ rx->offs[0].end;693 start_ptr = buf + rx->offs[0].start; 694 end_ptr = buf + rx->offs[0].end; 757 695 #else 758 start_ptr = str+ rx->startp[0];759 end_ptr = str+ rx->endp[0];696 start_ptr = buf + rx->startp[0]; 697 end_ptr = buf + rx->endp[0]; 760 698 #endif 761 762 763 /* get start and end offsets in Unicode code points */ 764 for( ; str < start_ptr; num_code_points++) 765 { 766 str += swish_utf8_chr_len(str); 767 if (str > str_end) 768 croak("scanned past end of '%s'", str_start); 769 } 770 771 start = num_code_points; 772 773 for( ; str < end_ptr; num_code_points++) 774 { 775 str += swish_utf8_chr_len(str); 776 if (str > str_end) 777 croak("scanned past end of '%s'", str_start); 778 } 779 780 end = num_code_points; /* characters (codepoints) */ 781 tok_pts = end - start; // TODO what is this for?? 
782 tok_bytes = end_ptr - start_ptr; 783 784 s3_token->start_ptr = start_ptr; 785 s3_token->tok_bytes = tok_bytes; 786 s3_token->start = start; 787 s3_token->end = end; 788 s3_token->wpos = ++wpos; 789 /* increment for next iteration */ 790 s3_token->offset += tok_bytes; // TODO this isn't any better than libswish3 algorithm 791 792 if (token_handler) { 793 sp_call_token_handler( s3_token, token_handler ); 794 } else { 795 sp_token_handler( s3_token ); 796 } 797 798 799 } 800 801 swish_xfree( s3_token ); 802 803 return list->nwords; 804 } 805 806 /* 807 default token handler is just to append to WordList 808 */ 809 static void 810 sp_token_handler( sp_Token *token ) 811 { 812 813 if ((token->end - token->start) < token->analyzer->minwordlen) 814 return; 815 816 if ((token->end - token->start) > token->analyzer->maxwordlen) 817 return; 818 819 820 /* TODO: lc() and stem() */ 821 if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER) 822 sp_debug_token( token ); 823 824 swish_add_to_wordlist_len( token->list, 825 token->start_ptr, 826 token->tok_bytes, 827 token->meta, 828 token->ctxt, 829 token->wpos, 830 token->offset 831 ); 832 833 } 834 835 static void 836 sp_debug_token( sp_Token *token ) 837 { 838 warn("-------------------------------------\n"); 839 warn("start_ptr = %s\n", token->start_ptr); 840 warn("tok_bytes = %d\n", token->tok_bytes); 841 warn("meta = %s\n", token->meta); 842 warn("ctxt = %s\n", token->ctxt); 843 warn("wpos = %d\n", token->wpos); 844 warn("offset = %d\n", token->offset); 845 warn("start = %d\n", token->start); 846 warn("end = %d\n", token->end); 847 } 848 699 700 buf = end_ptr; 701 702 //warn("Token: %s", start_ptr); 703 704 token_len = (end_ptr - start_ptr) + 1; 705 swish_add_token(tl, start_ptr, token_len, meta, context); 706 num_tokens++; 707 708 } 709 710 return num_tokens; 711 } 712
