| 346 | | |
|---|
| | 357 | # utility methods |
|---|
| | 358 | |
|---|
# tokenize() from Perl space uses the same C function (sp_tokenize) as the
# tokenizer callback.
# Usage: $list = $obj->tokenize($str [, $word_pos [, $offset [, $metaname [, $context]]]])
# $str must either be flagged as UTF-8 or pass swish_is_ascii(); otherwise this croaks.
# A $str that passes swish_is_ascii() gets its UTF-8 flag turned on as a side effect.
# Defaults: word_pos 0, offset 0, metaname and context SWISH_DEFAULT_METANAME.
# Returns the freshly created swish_WordList with CLASS set to WORDLIST_CLASS.
swish_WordList *
tokenize(self, str, ...)
    SV* self;
    SV* str;

  PREINIT:
    char*           CLASS;
    swish_3*        s3;
    swish_WordList* list;
    xmlChar*        metaname;
    xmlChar*        context;
    unsigned int    word_pos;
    unsigned int    offset;
    xmlChar*        buf;

  CODE:
    CLASS    = WORDLIST_CLASS;
    metaname = (xmlChar*)SWISH_DEFAULT_METANAME;
    context  = (xmlChar*)SWISH_DEFAULT_METANAME;
    word_pos = 0;
    offset   = 0;

    /* SvPV_nolen replaces SvPV(sv, PL_na): the length is never needed here */
    buf = (xmlChar*)SvPV_nolen(str);

    /* TODO reimplement as hashref arg */

    /* Validate before anything is allocated so that croak() cannot leak. */
    if (!SvUTF8(str))
    {
        if (swish_is_ascii(buf))
            SvUTF8_on(str);     /* NOTE: this flags the caller's original SV */
        else
            croak("%s is not flagged as a UTF-8 string and is not ASCII", buf);
    }

    /* Optional positional arguments; each one overrides its default. */
    if ( items > 2 )
        word_pos = (unsigned int)SvIV(ST(2));

    if ( items > 3 )
        offset = (unsigned int)SvIV(ST(3));

    if ( items > 4 )
        metaname = (xmlChar*)SvPV_nolen(ST(4));

    if ( items > 5 )
        context = (xmlChar*)SvPV_nolen(ST(5));

    /* Resolve the C object before allocating, for the same no-leak reason. */
    s3 = (swish_3*)sp_extract_ptr(self);

    /* Allocate last: nothing between here and RETVAL is expected to croak. */
    list = swish_init_wordlist();
    list->ref_cnt++;

    /* The token count returned by sp_tokenize() is not reported to Perl. */
    (void)sp_tokenize(
                s3,
                buf,
                list,
                word_pos,
                offset,
                metaname,
                context
                );

    RETVAL = list;

    /* metaname and context are not freed here: they point either at
     * SWISH_DEFAULT_METANAME (presumably a string literal -- confirm) or into
     * string buffers owned by the caller's SVs, which Perl manages.
     */

  OUTPUT:
    RETVAL
|---|
| | 427 | |
|---|
| | 428 | |
|---|
| | 429 | |
|---|
# tokenize_isw() uses the native libswish3 tokenizer (swish_tokenize).
# Usage: $list = $obj->tokenize_isw($str [, $word_pos [, $offset [, $metaname [, $context]]]])
# $str must either be flagged as UTF-8 or pass swish_is_ascii(); otherwise this croaks.
# A $str that passes swish_is_ascii() gets its UTF-8 flag turned on as a side effect.
# Defaults: word_pos 0, offset 0, metaname and context SWISH_DEFAULT_METANAME.
# Returns the freshly created swish_WordList with CLASS set to WORDLIST_CLASS.
swish_WordList *
tokenize_isw(self, str, ...)
    SV* self;
    SV* str;

  PREINIT:
    char*           CLASS;
    swish_3*        s3;
    swish_WordList* list;
    xmlChar*        metaname;
    xmlChar*        context;
    unsigned int    word_pos;
    unsigned int    offset;
    xmlChar*        buf;

  CODE:
    CLASS    = WORDLIST_CLASS;
    metaname = (xmlChar*)SWISH_DEFAULT_METANAME;
    context  = (xmlChar*)SWISH_DEFAULT_METANAME;
    word_pos = 0;
    offset   = 0;

    /* SvPV_nolen replaces SvPV(sv, PL_na): the length is never needed here */
    buf = (xmlChar*)SvPV_nolen(str);

    /* Validate before anything is allocated so that croak() cannot leak. */
    if (!SvUTF8(str))
    {
        if (swish_is_ascii(buf))
            SvUTF8_on(str);     /* NOTE: this flags the caller's original SV */
        else
            croak("%s is not flagged as a UTF-8 string and is not ASCII", buf);
    }

    /* Optional positional arguments; each one overrides its default. */
    if ( items > 2 )
        word_pos = (unsigned int)SvIV(ST(2));

    if ( items > 3 )
        offset = (unsigned int)SvIV(ST(3));

    if ( items > 4 )
        metaname = (xmlChar*)SvPV_nolen(ST(4));

    if ( items > 5 )
        context = (xmlChar*)SvPV_nolen(ST(5));

    /* Resolve the C object before allocating, for the same no-leak reason. */
    s3 = (swish_3*)sp_extract_ptr(self);

    /* Allocate last: nothing between here and RETVAL is expected to croak. */
    list = swish_init_wordlist();
    list->ref_cnt++;

    swish_init_words(); /* in case it wasn't initialized elsewhere... */

    /* The word count returned by swish_tokenize() is not reported to Perl. */
    (void)swish_tokenize(
                s3,
                buf,
                list,
                word_pos,
                offset,
                metaname,
                context
                );

    RETVAL = list;

    /* metaname and context are not freed here: they point either at
     * SWISH_DEFAULT_METANAME (presumably a string literal -- confirm) or into
     * string buffers owned by the caller's SVs, which Perl manages.
     */

  OUTPUT:
    RETVAL
|---|
| | 496 | |
|---|
| | 497 | |
|---|