Changeset 2151
- Timestamp: 07/29/08 21:37:14 (5 months ago)
- Files:
- libswish3/trunk/bindings/perl/3.xs (modified) (6 diffs)
- libswish3/trunk/bindings/perl/XS/Analyzer.xs (modified) (2 diffs)
- libswish3/trunk/bindings/perl/XS/Data.xs (modified) (2 diffs)
- libswish3/trunk/bindings/perl/XS/PropertyHash.xs (modified) (1 diff)
- libswish3/trunk/bindings/perl/XS/Token.xs (added)
- libswish3/trunk/bindings/perl/lib/SWISH/3.pm (modified) (1 diff)
- libswish3/trunk/bindings/perl/t/10tokenize.t (modified) (1 diff)
- libswish3/trunk/bindings/perl/t/15-analyzer.t (modified) (2 diffs)
- libswish3/trunk/bindings/perl/t/20metanames.t (modified) (2 diffs)
- libswish3/trunk/bindings/perl/t/bumper.html (added)
- libswish3/trunk/bindings/perl/t/test.html (modified) (1 diff)
- libswish3/trunk/bindings/perl/t/test.xml (modified) (1 diff)
- libswish3/trunk/bindings/perl/typemap (modified) (1 diff)
- libswish3/trunk/bindings/perl/xs_helpers.c (modified) (18 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/bindings/perl/3.xs
r2045 r2151 39 39 40 40 //sp_describe_object( (SV*)s3->stash ); 41 42 s3->analyzer->tokenizer = &sp_tokenize; 41 42 // hardcode this till we can get ENV var or similar 43 s3->analyzer->tokenlist = 0; 44 45 if (s3->analyzer->tokenlist) { 46 s3->analyzer->tokenizer = (&sp_tokenize3); 47 } 48 else { 49 s3->analyzer->tokenizer = (&sp_tokenize); 50 } 43 51 s3->analyzer->stash = sp_Stash_new(); 44 52 sp_Stash_set_char( s3->analyzer->stash, SELF_CLASS_KEY, ANALYZER_CLASS ); … … 186 194 187 195 // get_config 188 case 2: self->config->ref_cnt++; 196 case 2: if (GIMME_V != G_VOID) 197 self->config->ref_cnt++; 189 198 class = sp_Stash_get_char(self->stash, CONFIG_CLASS_KEY); 190 199 sp_Stash_set_char( self->config->stash, SELF_CLASS_KEY, class ); … … 212 221 213 222 // get_analyzer 214 case 4: self->analyzer->ref_cnt++; 223 case 4: if (GIMME_V != G_VOID) 224 self->analyzer->ref_cnt++; 215 225 class = sp_Stash_get_char(self->stash, ANALYZER_CLASS_KEY); 216 226 sp_Stash_set_char( self->analyzer->stash, SELF_CLASS_KEY, class ); … … 235 245 236 246 // get_parser 237 case 6: self->parser->ref_cnt++; 247 case 6: if (GIMME_V != G_VOID) 248 self->parser->ref_cnt++; 238 249 class = sp_Stash_get_char(self->stash, PARSER_CLASS_KEY); 239 250 RETVAL = sp_bless_ptr(class, (IV)self->parser); … … 344 355 345 356 346 357 # utility methods 358 359 # tokenize() from Perl space uses same C func as tokenizer callback 360 swish_WordList * 361 tokenize(self, str, ...) 
362 SV* self; 363 SV* str; 364 365 PREINIT: 366 char* CLASS; 367 swish_WordList* list; 368 xmlChar* metaname; 369 xmlChar* context; 370 unsigned int word_pos; 371 unsigned int offset; 372 xmlChar* buf; 373 int numtokens; 374 375 CODE: 376 CLASS = WORDLIST_CLASS; 377 list = swish_init_wordlist(); 378 list->ref_cnt++; 379 metaname = (xmlChar*)SWISH_DEFAULT_METANAME; 380 context = (xmlChar*)SWISH_DEFAULT_METANAME; 381 word_pos = 0; 382 offset = 0; 383 buf = (xmlChar*)SvPV(str, PL_na); 384 385 // TODO reimplement as hashref arg 386 387 if (!SvUTF8(str)) 388 { 389 if (swish_is_ascii(buf)) 390 SvUTF8_on(str); /* flags original SV ?? */ 391 else 392 croak("%s is not flagged as a UTF-8 string and is not ASCII", buf); 393 } 394 395 if ( items > 2 ) 396 { 397 word_pos = (int)SvIV(ST(2)); 398 399 if ( items > 3 ) 400 offset = (int)SvIV(ST(3)); 401 402 if ( items > 4 ) 403 metaname = (xmlChar*)SvPV(ST(4), PL_na); 404 405 if ( items > 5 ) 406 context = (xmlChar*)SvPV(ST(5), PL_na); 407 408 //warn ("word_pos %d offset %d metaname %s context %s\n", word_pos, offset, metaname, context ); 409 410 } 411 412 numtokens = sp_tokenize( 413 (swish_3*)sp_extract_ptr(self), 414 buf, 415 list, 416 word_pos, 417 offset, 418 metaname, 419 context 420 ); 421 422 RETVAL = list; 423 /* TODO do we need to worry about free()ing metaname and context ?? */ 424 425 OUTPUT: 426 RETVAL 427 428 429 430 # tokenize_isw() uses native libswish3 tokenizer 431 swish_WordList * 432 tokenize_isw(self, str, ...) 
433 SV* self; 434 SV* str; 435 436 PREINIT: 437 char* CLASS; 438 swish_WordList* list; 439 xmlChar* metaname; 440 xmlChar* context; 441 unsigned int word_pos; 442 unsigned int offset; 443 xmlChar* buf; 444 int numwords; 445 446 CODE: 447 CLASS = WORDLIST_CLASS; 448 list = swish_init_wordlist(); 449 list->ref_cnt++; 450 metaname = (xmlChar*)SWISH_DEFAULT_METANAME; 451 context = (xmlChar*)SWISH_DEFAULT_METANAME; 452 word_pos = 0; 453 offset = 0; 454 buf = (xmlChar*)SvPV(str, PL_na); 455 456 if (!SvUTF8(str)) 457 { 458 if (swish_is_ascii(buf)) 459 SvUTF8_on(str); /* flags original SV ?? */ 460 else 461 croak("%s is not flagged as a UTF-8 string and is not ASCII", buf); 462 } 463 464 if ( items > 2 ) 465 { 466 word_pos = (int)SvIV(ST(2)); 467 468 if ( items > 3 ) 469 offset = (int)SvIV(ST(3)); 470 471 if ( items > 4 ) 472 metaname = (xmlChar*)SvPV(ST(4), PL_na); 473 474 if ( items > 5 ) 475 context = (xmlChar*)SvPV(ST(5), PL_na); 476 477 } 478 479 swish_init_words(); /* in case it wasn't initialized elsewhere... */ 480 numwords = swish_tokenize( 481 (swish_3*)sp_extract_ptr(self), 482 buf, 483 list, 484 word_pos, 485 offset, 486 metaname, 487 context 488 ); 489 490 RETVAL = list; 491 492 /* TODO do we need to worry about free()ing metaname and context ?? */ 493 494 OUTPUT: 495 RETVAL 496 497 347 498 348 499 # include the other .xs files … … 359 510 INCLUDE: XS/MetaNameHash.xs 360 511 INCLUDE: XS/xml2Hash.xs 361 362 512 INCLUDE: XS/Token.xs 513 libswish3/trunk/bindings/perl/XS/Analyzer.xs
r2045 r2151 68 68 CODE: 69 69 self->ref_cnt--; 70 71 if (sp_hvref_exists(self->stash, TOKEN_HANDLER_KEY)) { 72 //warn("token handler set"); 73 } 70 74 71 75 if (SWISH_DEBUG) { … … 79 83 } 80 84 81 82 83 84 85 # tokenize() from Perl space uses same C func as tokenizer callback86 swish_WordList *87 tokenize(self, str, ...)88 SV* self;89 SV* str;90 91 PREINIT:92 char* CLASS;93 xmlChar* metaname;94 xmlChar* context;95 unsigned int word_pos;96 unsigned int offset;97 xmlChar* buf;98 99 CODE:100 CLASS = WORDLIST_CLASS;101 metaname = (xmlChar*)SWISH_DEFAULT_METANAME;102 context = (xmlChar*)SWISH_DEFAULT_METANAME;103 word_pos = 0;104 offset = 0;105 buf = (xmlChar*)SvPV(str, PL_na);106 107 // TODO reimplement as hashref arg108 109 if (!SvUTF8(str))110 {111 if (swish_is_ascii(buf))112 SvUTF8_on(str); /* flags original SV ?? */113 else114 croak("%s is not flagged as a UTF-8 string and is not ASCII", buf);115 }116 117 if ( items > 2 )118 {119 word_pos = (int)SvIV(ST(2));120 121 if ( items > 3 )122 offset = (int)SvIV(ST(3));123 124 if ( items > 4 )125 metaname = (xmlChar*)SvPV(ST(4), PL_na);126 127 if ( items > 5 )128 context = (xmlChar*)SvPV(ST(5), PL_na);129 130 //warn ("word_pos %d offset %d metaname %s context %s\n", word_pos, offset, metaname, context );131 132 }133 134 RETVAL = sp_tokenize(135 (swish_Analyzer*)sp_extract_ptr(self),136 buf,137 word_pos,138 offset,139 metaname,140 context141 );142 143 /* TODO do we need to worry about free()ing metaname and context ?? 
*/144 145 OUTPUT:146 RETVAL147 148 149 150 # tokenize_isw() uses native libswish3 tokenizer151 swish_WordList *152 tokenize_isw(self, str, ...)153 SV* self;154 SV* str;155 156 PREINIT:157 char* CLASS;158 xmlChar* metaname = (xmlChar*)SWISH_DEFAULT_METANAME;159 xmlChar* context = (xmlChar*)SWISH_DEFAULT_METANAME;160 unsigned int word_pos = 0;161 unsigned int offset = 0;162 xmlChar* buf = (xmlChar*)SvPV(str, PL_na);163 164 CODE:165 CLASS = WORDLIST_CLASS;166 167 if (!SvUTF8(str))168 {169 if (swish_is_ascii(buf))170 SvUTF8_on(str); /* flags original SV ?? */171 else172 croak("%s is not flagged as a UTF-8 string and is not ASCII", buf);173 }174 175 if ( items > 2 )176 {177 word_pos = (int)SvIV(ST(2));178 179 if ( items > 3 )180 offset = (int)SvIV(ST(3));181 182 if ( items > 4 )183 metaname = (xmlChar*)SvPV(ST(4), PL_na);184 185 if ( items > 5 )186 context = (xmlChar*)SvPV(ST(5), PL_na);187 188 }189 190 swish_init_words(); /* in case it wasn't initialized elsewhere... */191 RETVAL = swish_tokenize(192 (swish_Analyzer*)sp_extract_ptr(self),193 buf,194 word_pos,195 offset,196 metaname,197 context198 );199 200 RETVAL->ref_cnt++;201 202 /* TODO do we need to worry about free()ing metaname and context ?? */203 204 OUTPUT:205 RETVAL206 libswish3/trunk/bindings/perl/XS/Data.xs
r2030 r2151 50 50 51 51 CODE: 52 buf = xmlHashLookup(self->properties->hash, p);52 buf = swish_hash_fetch(self->properties->hash, p); 53 53 RETVAL = newSVpvn((char*)xmlBufferContent(buf), xmlBufferLength(buf)); 54 54 … … 65 65 66 66 CODE: 67 buf = xmlHashLookup(self-> properties->hash, m);67 buf = xmlHashLookup(self->metanames->hash, m); 68 68 RETVAL = newSVpvn((char*)xmlBufferContent(buf), xmlBufferLength(buf)); 69 69 libswish3/trunk/bindings/perl/XS/PropertyHash.xs
r2045 r2151 22 22 23 23 void 24 set(self, prop)24 set(self, prop) 25 25 xmlHashTablePtr self; 26 26 swish_Property* prop; libswish3/trunk/bindings/perl/lib/SWISH/3.pm
r2030 r2151 116 116 print '~' x 80, "\n"; 117 117 118 my $props = $data->config->properties; 118 my $props = $data->properties; 119 my $prop_hash = $data->config->get_properties; 119 120 120 121 print "Properties\n"; 121 for my $p ( keys %$props ) { 122 my $v = $data->property($p); 123 my $type = $props->{$p}; 124 print " <$p type='$type'>$v</$p>\n"; 122 for my $p ( sort keys %$props ) { 123 print " key: $p\n"; 124 my $prop_value = $props->{$p}; 125 print " value: " . Data::Dump::dump($prop_value) . "\n"; 126 my $prop = $prop_hash->get($p); 127 printf( " <%s type='%s'>%s</%s>\n", 128 $prop->name, $prop->type, $data->property($p), $prop->name ); 125 129 } 126 130 libswish3/trunk/bindings/perl/t/10tokenize.t
r2029 r2151 1 use Test::More tests => 7;1 use Test::More tests => 6; 2 2 3 3 use SWISH::3 qw( :constants ); 4 4 5 5 ok( my $s3 = SWISH::3->new, "new s3" ); 6 ok( my $analyzer = $s3->analyzer, "new tokenizer" ); 7 8 ok( my $wlist = $analyzer->tokenize( 6 ok( my $wlist = $s3->tokenize( 9 7 "now is the time, ain't it? or when else might it be!", 10 8 14, 5, 'foo', 'bar' libswish3/trunk/bindings/perl/t/15-analyzer.t
r2045 r2151 1 use Test::More tests => 4;1 use Test::More tests => 5; 2 2 3 3 use SWISH::3; … … 5 5 ok( my $s3 = SWISH::3->new(), "new s3 object" ); 6 6 7 ok( my $analyzer = $s3->analyzer, "get analyzer" );7 #ok( my $analyzer = $s3->analyzer, "get analyzer" ); 8 8 9 eval { my $handler = $ analyzer->get_token_handler };9 eval { my $handler = $s3->analyzer->get_token_handler }; 10 10 11 ok( $@, "get token handler " );11 ok( $@, "get token handler: $@" ); 12 12 13 like( 'foo', $analyzer->get_regex, 'get regex' ); 13 like( 'foo', $s3->analyzer->get_regex, 'get regex' ); 14 15 ok( !$s3->analyzer->set_token_handler( sub { $_[0]->debug } ), 16 "set token handler" ); 17 18 ok( $s3->tokenize('foo bar baz'), "tokenize" ); 19 libswish3/trunk/bindings/perl/t/20metanames.t
r2045 r2151 1 use Test::More tests => 22; 1 use strict; 2 use warnings; 3 4 use Test::More tests => 28; 2 5 use Data::Dump qw( dump ); 3 6 … … 32 35 33 36 } 37 38 ok( $s3 = SWISH::3->new( 39 config => '<swish><MetaNames><foo /></MetaNames></swish>', 40 handler => \&metacheck 41 ), 42 "new s3" 43 ); 44 ok( $s3->parse_file("t/bumper.html"), "parse bumper.html" ); 45 46 sub metacheck { 47 my $data = shift; 48 my $meta = $data->metanames; 49 my $prop = $data->properties; 50 51 #dump $meta; 52 #dump $prop; 53 54 cmp_ok( $meta->{'foo'}->[0], 'eq', 'one two', "first foo meta" ); 55 cmp_ok( $meta->{'foo'}->[1], 'eq', 'three four', "second foo meta" ); 56 cmp_ok( 57 $meta->{'swishdefault'}->[0], 58 'eq', 59 'this is para one', 60 "first swishdefault meta" 61 ); 62 cmp_ok( 63 $meta->{'swishdefault'}->[1], 64 'eq', 65 'this is para two', 66 "second swishdefault meta" 67 ); 68 69 } 70 71 # TODO this ends with -177 mem err libswish3/trunk/bindings/perl/t/test.html
r2014 r2151 2 2 <head> 3 3 <title>This is the title of the test.html doc</title> 4 <meta name="foo" content="metaname bar" /> 4 <meta name="foo" content="metaname foo" /> 5 <meta name="bar" content="metaname bar" /> 5 6 </head> 6 7 <body> 7 8 <p>some words</p> 9 <p>more words</p> 8 10 </body> 9 11 </html> libswish3/trunk/bindings/perl/t/test.xml
r2014 r2151 1 1 <html> 2 <body> 2 3 <p>some words</p> 3 4 <swishtitle>title here</swishtitle> 5 <swishtitle>more title here</swishtitle> 6 </body> 4 7 </html> libswish3/trunk/bindings/perl/typemap
r2045 r2151 15 15 swish_Property* O_OBJECT 16 16 swish_MetaName* O_OBJECT 17 sp_Token* O_OBJECT 17 18 18 19 INPUT libswish3/trunk/bindings/perl/xs_helpers.c
r2045 r2151 5 5 6 6 /* C code to make writing XS easier */ 7 8 9 // TODO replace this with swish_Token 10 typedef struct sp_Token sp_Token; 11 12 struct sp_Token 13 { 14 xmlChar *start_ptr; 15 int tok_bytes; 16 int start; 17 int end; 18 xmlChar *meta; 19 xmlChar *ctxt; 20 unsigned int wpos; 21 unsigned int offset; 22 swish_Analyzer *analyzer; 23 swish_WordList *list; 24 }; 25 7 26 8 27 static AV* sp_hv_keys(HV* hash); … … 36 55 static void sp_test_handler( swish_ParserData* parse_data ); 37 56 static void sp_handler( swish_ParserData* parse_data ); 38 static swish_WordList* sp_tokenize( swish_Analyzer* analyzer, xmlChar* str, ... ); 39 static void sp_token_handler( swish_Token *token ); 57 static int sp_tokenize( swish_3* s3, xmlChar* str, ... ); 58 static int sp_tokenize3( swish_3* s3, xmlChar* str, ... ); 59 static void sp_token_handler( sp_Token *token ); 40 60 static void sp_SV_is_qr( SV *qr ); 41 static void sp_debug_token( s wish_Token *token );61 static void sp_debug_token( sp_Token *token ); 42 62 43 63 /* implement nearly all methods for SWISH::3::Stash, a private class */ … … 52 72 static void sp_Stash_destroy( SV *stash ); 53 73 static void sp_Stash_dec_values( SV *stash ); 74 54 75 55 76 static SV* … … 536 557 const xmlChar *str = xmlBufferContent(buf); 537 558 const xmlChar *tmp; 538 int bump = strlen(SWISH_ META_CONNECTOR);559 int bump = strlen(SWISH_TOKENPOS_BUMPER); 539 560 int len; 540 541 /* analogous to @strings = split(/SWISH_ META_CONNECTOR/, str) */542 while((tmp = xmlStrstr(str, (xmlChar*)SWISH_ META_CONNECTOR)) != NULL)561 562 /* analogous to @strings = split(/SWISH_TOKENPOS_BUMPER/, str) */ 563 while((tmp = xmlStrstr(str, (xmlChar*)SWISH_TOKENPOS_BUMPER)) != NULL) 543 564 { 544 565 len = tmp - str; … … 549 570 } 550 571 551 /* if there was only one string, make sure it's in array */ 552 if (xmlBufferLength(buf) && av_len(strings) == -1) 553 { 554 av_push(strings, 555 newSVpvn((char*)xmlBufferContent(buf), 556 xmlBufferLength(buf))); 572 /* 
no match and/or last match */ 573 if (!xmlStrstr(str, (xmlChar*)SWISH_TOKENPOS_BUMPER)) { 574 av_push(strings, newSVpvn((char*)str, strlen((char*)str))); 557 575 } 558 576 … … 630 648 631 649 static void 632 sp_call_token_handler( s wish_Token *token, SV *method )650 sp_call_token_handler( sp_Token *token, SV *method ) 633 651 { 634 652 dTHX; … … 645 663 } 646 664 665 static int 666 sp_tokenize3(swish_3* s3, xmlChar *str, ...) 667 { 668 669 670 return 0; 671 } 672 647 673 /* this regex wizardry cribbed from KS - thanks Marvin! */ 648 static swish_WordList *649 sp_tokenize(swish_ Analyzer* analyzer, xmlChar* str, ...)674 static int 675 sp_tokenize(swish_3* s3, xmlChar* str, ...) 650 676 { 651 677 dTHX; 652 678 653 679 unsigned int wpos, offset, num_code_points; 654 swish_Token *s3_token; 680 swish_WordList *list; 681 sp_Token *s3_token; 655 682 MAGIC *mg; 656 683 REGEXP *rx; … … 661 688 xmlChar *meta, *ctxt; 662 689 SV *token_re; 663 swish_WordList *list;664 690 SV *token_handler; 665 691 666 692 va_list args; 667 693 va_start(args, str); 694 list = va_arg(args, swish_WordList*); 668 695 offset = va_arg(args, unsigned int); 669 696 wpos = va_arg(args, unsigned int); … … 674 701 //warn("wpos %d offset %d meta %s ctxt %s\n", wpos, offset, meta, ctxt); 675 702 676 s3_token = swish_xmalloc(sizeof(s wish_Token));703 s3_token = swish_xmalloc(sizeof(sp_Token)); 677 704 mg = NULL; 678 705 rx = NULL; … … 681 708 str_len = strlen((char*)str); 682 709 str_end = str_start + str_len; 683 token_re = analyzer->regex;684 token_handler = sp_hvref_exists( analyzer->stash, TOKEN_HANDLER_KEY )685 ? sp_hvref_fetch( analyzer->stash, TOKEN_HANDLER_KEY)710 token_re = s3->analyzer->regex; 711 token_handler = sp_hvref_exists( s3->analyzer->stash, TOKEN_HANDLER_KEY ) 712 ? 
sp_hvref_fetch( s3->analyzer->stash, TOKEN_HANDLER_KEY ) 686 713 : NULL; 687 714 … … 695 722 if (!mg) 696 723 croak("regex is not a qr// entity"); 724 697 725 rx = (REGEXP*)mg->mg_obj; 698 726 … … 708 736 SvPOK_on(wrapper); 709 737 710 list = swish_init_wordlist();711 list->ref_cnt++;712 738 num_code_points = 0; 713 739 … … 715 741 s3_token->meta = meta; 716 742 s3_token->ctxt = ctxt; 717 s3_token->analyzer = analyzer;743 s3_token->analyzer = s3->analyzer; 718 744 s3_token->list = list; 719 745 s3_token->offset = offset; // gets incremented … … 752 778 } 753 779 754 end = num_code_points; /* characters (codepoints) */ 755 756 tok_pts = end - start; // TODO what is this for?? 757 tok_bytes = end_ptr - start_ptr; 780 end = num_code_points; /* characters (codepoints) */ 781 tok_pts = end - start; // TODO what is this for?? 782 tok_bytes = end_ptr - start_ptr; 758 783 759 784 s3_token->start_ptr = start_ptr; … … 776 801 swish_xfree( s3_token ); 777 802 778 return list ;803 return list->nwords; 779 804 } 780 805 … … 783 808 */ 784 809 static void 785 sp_token_handler( s wish_Token *token )810 sp_token_handler( sp_Token *token ) 786 811 { 787 812 … … 794 819 795 820 /* TODO: lc() and stem() */ 796 if (SWISH_DEBUG ==SWISH_DEBUG_TOKENIZER)821 if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER) 797 822 sp_debug_token( token ); 798 823 … … 809 834 810 835 static void 811 sp_debug_token( s wish_Token *token )836 sp_debug_token( sp_Token *token ) 812 837 { 813 838 warn("-------------------------------------\n");
