Changeset 2148
- Timestamp: 07/21/08 23:51:24 (4 months ago)
- Files:
- libswish3/trunk/src/libswish3/libswish3.h (modified) (5 diffs)
- libswish3/trunk/src/libswish3/parser.c (modified) (92 diffs)
- libswish3/trunk/src/libswish3/tokenizer.c (modified) (28 diffs)
- libswish3/trunk/src/libswish3/words.c (modified) (18 diffs)
- libswish3/trunk/src/swish_tokenize.c (modified) (2 diffs)
- libswish3/trunk/src/swish_words.c (modified) (2 diffs)
- libswish3/trunk/src/t/001-wordcount.t (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/libswish3/libswish3.h
r2142 r2148 353 353 boolean tokenize; // should we parse into WordList 354 354 boolean tokenlist; // use new tokenizer 355 swish_WordList* (*tokenizer) (swish_Analyzer*, xmlChar*, ...);355 int (*tokenizer) (swish_3*, xmlChar*, ...); 356 356 xmlChar* (*stemmer) (xmlChar*); 357 357 unsigned int lc; // should tokens be lowercased … … 552 552 swish_WordList * swish_init_wordlist(); 553 553 void swish_free_wordlist(swish_WordList * list); 554 swish_WordList * swish_tokenize( swish_Analyzer * analyzer, xmlChar * str, ... );555 556 swish_WordList *swish_tokenize_utf8_string(557 swish_ Analyzer * analyzer,554 int swish_tokenize( swish_3 * s3, xmlChar * str, ... ); 555 556 int swish_tokenize_utf8_string( 557 swish_3 * s3, 558 558 xmlChar * str, 559 swish_WordList * wl, 559 560 unsigned int offset, 560 561 unsigned int word_pos, … … 563 564 ); 564 565 565 swish_WordList *swish_tokenize_ascii_string(566 swish_ Analyzer * analyzer,566 int swish_tokenize_ascii_string( 567 swish_3 * s3, 567 568 xmlChar * str, 569 swish_WordList * wl, 568 570 unsigned int offset, 569 571 unsigned int word_pos, … … 572 574 ); 573 575 574 swish_WordList *swish_tokenize_regex(575 swish_ Analyzer * analyzer,576 int swish_tokenize_regex( 577 swish_3 * s3, 576 578 xmlChar * str, 579 swish_WordList * wl, 577 580 unsigned int offset, 578 581 unsigned int word_pos, … … 618 621 void swish_free_token_iterator( swish_TokenIterator *ti ); 619 622 swish_Token * swish_next_token( swish_TokenIterator *it ); 620 int swish_tokenize3( swish_3 *s3, 621 swish_TokenList * tl, 622 xmlChar *buf, 623 swish_MetaName *meta, 624 xmlChar *context ); 623 int swish_tokenize3( swish_3 *s3, xmlChar *buf, ... 
); 625 624 int swish_tokenize3_ascii( 626 625 swish_3 *s3, 626 xmlChar *buf, 627 627 swish_TokenList * tl, 628 xmlChar *buf,629 628 swish_MetaName *meta, 630 629 xmlChar *context ); 631 630 int swish_tokenize3_utf8( 632 631 swish_3 *s3, 633 swish_TokenList * tl,634 632 xmlChar *buf, 633 swish_TokenList * tl, 635 634 swish_MetaName *meta, 636 635 xmlChar *context ); libswish3/trunk/src/libswish3/parser.c
r2140 r2148 16 16 * along with libswish3; if not, write to the Free Software 17 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 */18 */ 19 19 20 20 /* … … 30 30 * all the mb*() functions rely on locale to recognize multi-byte strings 31 31 * 32 */32 */ 33 33 34 34 #include <stdio.h> … … 93 93 /* 94 94 * SAX2 support 95 */95 */ 96 96 static void mystartElementNs( 97 97 void *parser_data, … … 181 181 /* 182 182 * parsing fh/buffer headers 183 */183 */ 184 184 typedef struct 185 185 { … … 208 208 ); 209 209 210 211 210 /* tag tracker */ 212 211 static xmlChar *flatten_tag_stack( … … 237 236 * end prototypes 238 237 ***********************************************************************/ 239 238 240 239 swish_Parser * 241 240 swish_init_parser( … … 248 247 p->ref_cnt = 0; 249 248 250 /*249 /* 251 250 * libxml2 stuff 252 */251 */ 253 252 xmlInitParser(); 254 253 xmlSubstituteEntitiesDefault(1); /* resolve text entities */ 255 254 256 /*255 /* 257 256 * debugging help 258 */257 */ 259 258 get_env_vars(); 260 259 … … 286 285 * turn the literal xml/html tag into a swish tag for matching against 287 286 * metanames and properties 288 */287 */ 289 288 static xmlChar * 290 289 build_tag( … … 314 313 metacontent = NULL; 315 314 316 /*315 /* 317 316 * normalize all tags 318 */317 */ 319 318 swishtag = swish_str_tolower(tag); 320 319 321 /*320 /* 322 321 * html tags 323 */322 */ 324 323 if (parser_data->is_html) { 325 324 326 /*325 /* 327 326 TODO config features about img tags and a/href tags 328 */327 */ 329 328 if (xmlStrEqual(swishtag, (xmlChar *)"br") 330 329 || xmlStrEqual(swishtag, (xmlChar *)"img")) { … … 339 338 else if (!element->isinline) { 340 339 341 /*340 /* 342 341 * need to bump word_pos so we don't match across block * 343 342 * elements 344 */343 */ 345 344 346 345 } 347 346 } 348 347 349 /*348 /* 350 349 * is this an HTML <meta> tag? 
treat 'name' attribute as a tag * 351 350 * and 'content' attribute as the tag content * we assume 'name' 352 351 * and 'content' are always in english. 353 */352 */ 354 353 355 354 if (atts != 0) { … … 361 360 if (xmlStrEqual(atts[i], (xmlChar *)"name")) { 362 361 363 /*362 /* 364 363 * SWISH_DEBUG_MSG("found name: %s", atts[i+1]); 365 */364 */ 366 365 metaname = (xmlChar *)atts[i + 1]; 367 366 } … … 369 368 else if (xmlStrEqual(atts[i], (xmlChar *)"content")) { 370 369 371 /*370 /* 372 371 * SWISH_DEBUG_MSG("found content: %s", atts[i+1]); 373 */372 */ 374 373 metacontent = (xmlChar *)atts[i + 1]; 375 374 } … … 382 381 SWISH_DEBUG_MSG("found HTML meta: %s => %s", metaname, metacontent); 383 382 384 /*383 /* 385 384 * do not match across metas 386 */385 */ 387 386 parser_data->bump_word = 1; 388 387 open_tag(parser_data, metaname, NULL); … … 395 394 } 396 395 397 /*396 /* 398 397 * xml tags 399 */398 */ 400 399 else { 401 400 402 /*401 /* 403 402 * TODO make this configurable ala swish2 404 */403 */ 405 404 406 405 parser_data->bump_word = 1; … … 422 421 attr_val_lower = swish_str_tolower(atts[i + 1]); 423 422 424 /*423 /* 425 424 is it one of ours? 
426 */425 */ 427 426 for (j = 0; j < strlist->n; j++) { 428 427 if (xmlStrEqual(strlist->word[j], attr_lower)) { … … 430 429 SWISH_DEBUG_MSG("found %s: %s", attr_lower, attr_val_lower); 431 430 432 // eligible attribute name 433 size = xmlStrlen(swishtag) + xmlStrlen(attr_val_lower) + 2; / / dot + NULL431 /* eligible attribute name */ 432 size = xmlStrlen(swishtag) + xmlStrlen(attr_val_lower) + 2; /* dot + NULL */ 434 433 metaname = swish_xmalloc(size + 1); 435 434 snprintf((char *)metaname, size, "%s.%s", (char *)swishtag, … … 449 448 } 450 449 451 /*450 /* 452 451 * change our internal name for this tag if it is aliased in config 453 */452 */ 454 453 alias = swish_hash_fetch(parser_data->s3->config->tag_aliases, swishtag); 455 454 if (alias) { 456 455 457 /*456 /* 458 457 * SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias); 459 */458 */ 460 459 swish_xfree(swishtag); 461 460 swishtag = swish_xstrdup(alias); … … 482 481 xmlBufferContent(parser_data->meta_buf), parser_data->word_pos); 483 482 484 /*483 /* 485 484 * since we only flush the buffer when metaname changes, and we do 486 485 * not want to match across metanames, bump the word_pos here before 487 486 * parsing the string and making the tmp wordlist 488 */487 */ 489 488 if (parser_data->word_pos) 490 489 parser_data->word_pos++; 491 490 492 /*491 /* 493 492 * add meta_buf as-is to metanames buffer under current tag. this 494 493 * gives us both tokens and raw text de-tagged but organized by 495 494 * metaname. 496 */495 */ 497 496 swish_add_buf_to_nb(parser_data->metanames, metaname, parser_data->meta_buf, 498 497 (xmlChar *)SWISH_META_CONNECTOR, 0, 1); 499 498 500 /*499 /* 501 500 * add to every metaname on the stack. 
502 501 * Disabling this for now, as it ought to be up the handler() to decide 503 502 * to index a token under multiple metanames, and we associate context 504 503 * with the WordList 505 */504 */ 506 505 507 506 if (parser_data->s3->config->flags->context_as_meta) { 508 507 for (s->temp = s->head; s->temp != NULL; s->temp = s->temp->next) { 509 if (xmlStrEqual(s->temp->baked, metaname)) // already added508 if (xmlStrEqual(s->temp->baked, metaname)) /* already added */ 510 509 continue; 511 510 … … 527 526 /* 528 527 * SAX2 callback 529 */528 */ 530 529 static void 531 530 mystartDocument( … … 534 533 { 535 534 536 /*535 /* 537 536 * swish_ParserData *parser_data = (swish_ParserData *) data; 538 */537 */ 539 538 540 539 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) … … 545 544 /* 546 545 * SAX2 callback 547 */546 */ 548 547 static void 549 548 myendDocument( … … 555 554 SWISH_DEBUG_MSG("endDocument()"); 556 555 557 /*556 /* 558 557 * whatever's left 559 */558 */ 560 559 flush_buffer(parser_data, (xmlChar *)SWISH_DEFAULT_METANAME, 561 560 (xmlChar *)SWISH_DEFAULT_METANAME); … … 565 564 /* 566 565 * SAX1 callback 567 */566 */ 568 567 static void 569 568 mystartElement( … … 578 577 /* 579 578 * SAX1 callback 580 */579 */ 581 580 static void 582 581 myendElement( … … 590 589 /* 591 590 * SAX2 handler 592 */591 */ 593 592 static void 594 593 mystartElementNs( … … 630 629 for (i = 0; (atts[i] != NULL); i += 2) { 631 630 SWISH_DEBUG_MSG(" att: %s=%s", atts[i], atts[i + 1]); 632 //SWISH_DEBUG_MSG(" att: %s=", atts[i++], atts[i] || ""); 631 /* SWISH_DEBUG_MSG(" att: %s=", atts[i++], atts[i] || ""); */ 633 632 } 634 633 } … … 644 643 /* 645 644 * SAX2 handler 646 */645 */ 647 646 static void 648 647 myendElementNs( … … 674 673 SWISH_DEBUG_MSG("checking config for '%s' in watched tags", parser_data->tag); 675 674 676 /*675 /* 677 676 * set property if this tag is configured for it 678 */677 */ 679 678 if (swish_hash_exists(parser_data->s3->config->properties, parser_data->tag)) { 
680 679 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) 681 680 SWISH_DEBUG_MSG(" %s = new property", parser_data->tag); 682 681 683 add_stack_to_prop_buf(NULL, parser_data); /* NULL means all properties in the stack are added */682 add_stack_to_prop_buf(NULL, parser_data); /* NULL means all properties in the stack are added */ 684 683 xmlBufferEmpty(parser_data->prop_buf); 685 684 … … 690 689 } 691 690 692 /*691 /* 693 692 * likewise for metastack 694 */693 */ 695 694 if (swish_hash_exists(parser_data->s3->config->metanames, parser_data->tag)) { 696 695 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) … … 719 718 parser_data = (swish_ParserData *)data; 720 719 721 /*720 /* 722 721 * lowercase all names for comparison against metanames (which are 723 722 * also * lowercased) 724 */723 */ 725 724 if (parser_data->tag != NULL) 726 725 swish_xfree(parser_data->tag); … … 744 743 } 745 744 746 /*745 /* 747 746 * turn flag off so next open_tag() can evaluate 748 */747 */ 749 748 parser_data->bump_word = 0; 750 749 … … 753 752 /* 754 753 * handle all characters in doc 755 */754 */ 756 755 static void 757 756 buffer_characters( … … 765 764 xmlBufferPtr buf = parser_data->meta_buf; 766 765 767 /*766 /* 768 767 * why not wchar_t ? 
len is number of bytes, not number of 769 768 * characters, so xmlChar (i.e., char) works 770 */771 772 /*769 */ 770 771 /* 773 772 * SWISH_DEBUG_MSG( "sizeof output buf is %d; len was %d\n", sizeof(output), 774 773 * len ); 775 */776 777 /*774 */ 775 776 /* 778 777 * SWISH_DEBUG_MSG( "characters"); 779 */778 */ 780 779 781 780 for (i = 0; i < len; i++) { 782 781 783 /*782 /* 784 783 * fprintf(stderr, "%c", ch[i]); 785 */784 */ 786 785 output[i] = ch[i]; 787 786 } … … 795 794 if (parser_data->bump_word && xmlBufferLength(parser_data->prop_buf)) { 796 795 797 /*796 /* 798 797 * SWISH_DEBUG_MSG(" appending ' ' to prop_buf"); 799 */798 */ 800 799 swish_append_buffer(parser_data->prop_buf, (xmlChar *)" ", 1); 801 800 } 802 801 803 /*802 /* 804 803 * SWISH_DEBUG_MSG(" appending '%s' to prop_buf", output); 805 */804 */ 806 805 swish_append_buffer(parser_data->prop_buf, output, len); 807 806 … … 810 809 /* 811 810 * SAX2 callback 812 */811 */ 813 812 static void 814 813 mycharacters( … … 826 825 /* 827 826 * SAX2 callback 828 */827 */ 829 828 static void 830 829 mycomments( … … 835 834 int len = strlen((char *)(char *)ch); 836 835 837 /*836 /* 838 837 * TODO: make comments indexing optional 839 */840 841 /*838 */ 839 840 /* 842 841 * TODO: enable noindex option 843 */842 */ 844 843 return; 845 844 … … 849 848 /* 850 849 * SAX2 callback 851 */850 */ 852 851 static void 853 852 myerr( … … 860 859 va_list args; 861 860 char str[1000]; 862 861 863 862 if (!SWISH_PARSER_WARNINGS) 864 863 return; 865 864 866 865 parser_data = (swish_ParserData *)data; 867 866 868 867 SWISH_WARN("libxml2 error for %s:", parser_data->docinfo->uri); 869 868 870 869 va_start(args, msg); 871 870 vsnprintf((char *)str, 1000, (char *)msg, args); … … 876 875 /* 877 876 * SAX2 callback 878 */877 */ 879 878 static void 880 879 mywarn( … … 887 886 va_list args; 888 887 char str[1000]; 889 888 890 889 if (!SWISH_PARSER_WARNINGS) 891 890 return; 892 891 893 892 parser_data = (swish_ParserData *)user_data; 
894 893 895 894 SWISH_WARN("libxml2 warning for %s:", parser_data->docinfo->uri); 896 895 897 896 va_start(args, msg); 898 897 vsnprintf((char *)str, 1000, (char *)msg, args); … … 903 902 /* 904 903 * SAX2 handler struct for html and xml parsing 905 */904 */ 906 905 907 906 xmlSAXHandler my_parser = { … … 965 964 SWISH_DEBUG_MSG("%s -- using %s parser", parser_data->docinfo->uri, parser); 966 965 967 /*966 /* 968 967 * slurp file if not already in memory 969 */968 */ 970 969 if (filename && !buffer) { 971 970 buffer = swish_slurp_file_len(filename, (long)parser_data->docinfo->size); … … 989 988 if (filename) { 990 989 991 /*990 /* 992 991 * SWISH_DEBUG_MSG( "freeing buffer"); 993 */992 */ 994 993 swish_xfree(buffer); 995 994 } … … 1025 1024 ptr->metanames = swish_init_nb(s3->config->metanames); 1026 1025 ptr->metanames->ref_cnt++; 1027 1028 /* 1026 1027 /* 1028 * pick a tokenizer if one has not been explicitly set 1029 */ 1030 if (s3->analyzer->tokenizer == NULL) { 1031 if (s3->analyzer->tokenlist) { 1032 s3->analyzer->tokenizer = (&swish_tokenize3); 1033 } 1034 else { 1035 s3->analyzer->tokenizer = (&swish_tokenize); 1036 } 1037 } 1038 1039 /* 1029 1040 * prime the stacks 1030 */1041 */ 1031 1042 ptr->metastack = (swish_TagStack *)swish_xmalloc(sizeof(swish_TagStack)); 1032 1043 ptr->metastack->name = "MetaStack"; … … 1044 1055 push_tag_stack(ptr->propstack, (xmlChar *)"_", (xmlChar *)"_"); 1045 1056 1046 /*1057 /* 1047 1058 * no such property just to seed stack 1048 */1049 1050 /*1059 */ 1060 1061 /* 1051 1062 * gets toggled per-tag 1052 */1063 */ 1053 1064 ptr->bump_word = 1; 1054 1065 1055 /*1066 /* 1056 1067 * toggle 1057 */1068 */ 1058 1069 ptr->no_index = 0; 1059 1070 1060 /*1071 /* 1061 1072 * shortcut rather than looking parser up in hash for each tag event 1062 */1073 */ 1063 1074 ptr->is_html = 0; 1064 1075 1065 /*1076 /* 1066 1077 * must be zero so that ++ works ok on first word 1067 */1078 */ 1068 1079 ptr->word_pos = 0; 1069 1080 1070 /*1081 /* 1071 
1082 * always start at first byte 1072 */1083 */ 1073 1084 ptr->offset = 0; 1074 1085 1075 /*1086 /* 1076 1087 * pointer to the xmlParserCtxt since we want to free it only after 1077 1088 * we're completely done with it. NOTE this is a change per libxml2 1078 1089 * vers > 2.6.16 1079 */1090 */ 1080 1091 ptr->ctxt = NULL; 1081 1092 … … 1097 1108 SWISH_DEBUG_MSG("freeing swish_ParserData"); 1098 1109 1099 /*1110 /* 1100 1111 * dec ref count for shared ptr 1101 */1112 */ 1102 1113 ptr->s3->ref_cnt--; 1103 1114 1104 /*1115 /* 1105 1116 * Pop the stacks 1106 */1117 */ 1107 1118 while ((st = pop_tag_stack(ptr->metastack)) != NULL) { 1108 1119 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) … … 1184 1195 swish_free_wordlist(ptr->wordlist); 1185 1196 } 1186 1197 1187 1198 if (ptr->token_iterator != NULL) { 1188 1199 … … 1238 1249 while (j < SWISH_MAX_HEADERS && i <= SWISH_MAXSTRLEN) { 1239 1250 1240 /*1251 /* 1241 1252 * SWISH_DEBUG_MSG( "i = %d j = %d k = %d", i, j, k); 1242 */1253 */ 1243 1254 1244 1255 if (buf[k] == '\n') { … … 1247 1258 line[i] = buf[k]; 1248 1259 1249 /*1260 /* 1250 1261 * fprintf(stderr, "%c", line[i]); 1251 */1262 */ 1252 1263 i++; 1253 1264 k++; … … 1259 1270 h->nlines++; 1260 1271 1261 /*1272 /* 1262 1273 * get to the next char no matter what, then check if == '\n' 1263 */1274 */ 1264 1275 k++; 1265 1276 1266 1277 if (buf[k] == '\n') { 1267 1278 1268 /*1279 /* 1269 1280 * fprintf(stderr, "found blank line at byte %d\n", k); 1270 */1281 */ 1271 1282 h->body_start = k + 1; 1272 1283 break; … … 1403 1414 SWISH_WARN("Failed to find path name in Content-Type header '%s'", line); 1404 1415 1405 /*1416 /* 1406 1417 * TODO: get encoding out of this line too if 1407 1418 * present. example: text/xml; charset=ISO-8859-1 1408 */1419 */ 1409 1420 1410 1421 if (info->mime != NULL) … … 1431 1442 } 1432 1443 1433 /*1444 /* 1434 1445 * TODO update mode is a vers2 btree feature. 
still unclear if 1435 1446 * we'll actually support it 1436 */1447 */ 1437 1448 if (!xmlStrncasecmp(line, (const xmlChar *)"Update-Mode", 11)) { 1438 1449 … … 1450 1461 } 1451 1462 1452 /*1463 /* 1453 1464 * if we get here, unrecognized header line 1454 */1465 */ 1455 1466 SWISH_WARN("Unknown header line: '%s'\n", line); 1456 1467 … … 1470 1481 { 1471 1482 1472 /*1483 /* 1473 1484 * init the global env vars, but don't override if already set 1474 */1485 */ 1475 1486 1476 1487 setenv("SWISH_PARSER_WARNINGS", "0", 0); … … 1512 1523 xmlBufferCreateSize((SWISH_MAX_HEADERS * SWISH_MAXSTRLEN) + SWISH_MAX_HEADERS); 1513 1524 1514 /*1525 /* 1515 1526 * based on extprog.c 1516 */1527 */ 1517 1528 while (fgets((char *)ln, SWISH_MAXSTRLEN, fh) != 0) { 1518 1529 1519 /*1530 /* 1520 1531 * we don't use fgetws() because we don't care about * indiv 1521 1532 * characters yet 1522 */1533 */ 1523 1534 1524 1535 xmlChar *end; … … 1528 1539 end = (xmlChar *)strrchr((char *)line, '\n'); 1529 1540 1530 /*1541 /* 1531 1542 * trim any white space at end of doc, including \n 1532 */1543 */ 1533 1544 if (end) { 1534 1545 while (end > line && isspace((int)*(end - 1))) … … 1540 1551 if (nheaders >= min_headers && xmlStrlen(line) == 0) { 1541 1552 1542 /*1553 /* 1543 1554 * blank line indicates body 1544 */1555 */ 1545 1556 curTime = swish_time_elapsed(); 1546 1557 parser_data = init_parser_data(s3); … … 1555 1566 read_buffer = swish_slurp_fh(fh, parser_data->docinfo->size); 1556 1567 1557 /*1568 /* 1558 1569 * parse 1559 */1570 */ 1560 1571 xmlErr = 1561 1572 docparser(parser_data, NULL, read_buffer, parser_data->docinfo->size); … … 1575 1586 SWISH_DEBUG_MSG("passing to handler"); 1576 1587 1577 /*1588 /* 1578 1589 * pass to callback function 1579 */1590 */ 1580 1591 (*s3->parser->handler) (parser_data); 1581 1592 … … 1583 1594 SWISH_DEBUG_MSG("handler done"); 1584 1595 1585 /*1596 /* 1586 1597 * reset everything for next time 1587 */1598 */ 1588 1599 1589 1600 swish_xfree(read_buffer); … … 
1593 1604 nheaders = 0; 1594 1605 1595 /*1606 /* 1596 1607 * count the file 1597 */1608 */ 1598 1609 file_cnt++; 1599 1610 … … 1604 1615 } 1605 1616 1606 /*1617 /* 1607 1618 * timer 1608 */1619 */ 1609 1620 curTime = swish_time_elapsed(); 1610 1621 … … 1621 1632 else { 1622 1633 1623 /*1634 /* 1624 1635 * we are reading headers 1625 */1636 */ 1626 1637 if (xmlBufferAdd(head_buf, line, -1)) 1627 1638 SWISH_CROAK("error adding header to buffer"); … … 1661 1672 /* 1662 1673 * PUBLIC 1663 */1674 */ 1664 1675 1665 1676 /* 1666 1677 * pass in a string including headers. like parsing fh, but only for one 1667 1678 * doc 1668 */1679 */ 1669 1680 int 1670 1681 swish_parse_buffer( … … 1689 1700 swish_check_docinfo(parser_data->docinfo, s3->config); 1690 1701 1691 /*1702 /* 1692 1703 * reposition buf pointer at start of body (just past head) 1693 */1704 */ 1694 1705 1695 1706 buf += head->body_start; … … 1697 1708 res = docparser(parser_data, 0, buf, xmlStrlen(buf)); 1698 1709 1699 /*1710 /* 1700 1711 * pass to callback function 1701 */1712 */ 1702 1713 (*s3->parser->handler) (parser_data); 1703 1714 … … 1709 1720 } 1710 1721 1711 /*1722 /* 1712 1723 * free buffers 1713 */1724 */ 1714 1725 free_head(head); 1715 1726 free_parser_data(parser_data); … … 1728 1739 /* 1729 1740 * PUBLIC 1730 */1741 */ 1731 1742 int 1732 1743 swish_parse_file( … … 1752 1763 res = docparser(parser_data, filename, 0, 0); 1753 1764 1754 /*1765 /* 1755 1766 * pass to callback function 1756 */1767 */ 1757 1768 (*s3->parser->handler) (parser_data); 1758 1769 … … 1764 1775 } 1765 1776 1766 /*1777 /* 1767 1778 * free buffers 1768 */1779 */ 1769 1780 free_parser_data(parser_data); 1770 1781 … … 1805 1816 ctxt->sax2 = 1; 1806 1817 1807 /*1818 /* 1808 1819 * always use sax2 -- this pulled from xmlDetextSAX2() 1809 */1820 */ 1810 1821 ctxt->str_xml = xmlDictLookup(ctxt->dict, BAD_CAST "xml", 3); 1811 1822 ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5); … … 1814 1825 || (ctxt->str_xml_ns == 
NULL)) { 1815 1826 1816 /*1827 /* 1817 1828 * xmlErrMemory is/was not a public func but is in 1818 1829 * parserInternals.h * basically, this is a bad, fatal error, so 1819 1830 * we'll just die 1820 */1821 1822 /*1831 */ 1832 1833 /* 1823 1834 * xmlErrMemory(ctxt, NULL); 1824 */1835 */ 1825 1836 SWISH_CROAK("Fatal libxml2 memory error"); 1826 1837 } … … 1913 1924 enc = (xmlChar *)getenv("SWISH_ENCODING"); 1914 1925 1915 /*1926 /* 1916 1927 * TODO better encoding detection. for now we assume unknown text 1917 1928 * files are latin1 1918 */1929 */ 1919 1930 set_encoding(parser_data, buffer); 1920 1931 … … 1958 1969 } 1959 1970 1960 /*1971 /* 1961 1972 * we obviously haven't any tags on which to trigger our metanames, 1962 1973 * so set default 1963 1974 * TODO get title somehow? 1964 1975 * TODO check config to determine if we should buffer swish_prop_description etc 1965 */1976 */ 1966 1977 1967 1978 push_tag_stack(parser_data->metastack, (xmlChar *)SWISH_DEFAULT_METANAME, … … 1992 2003 { 1993 2004 1994 /*2005 /* 1995 2006 * this feels like it doesn't work ... would iconv() be better ? 1996 */2007 */ 1997 2008 1998 2009 swish_xfree(parser_data->docinfo->encoding); … … 2022 2033 else { 2023 2034 2024 /*2035 /* 2025 2036 * if we get here, we didn't error with bad encoding via SAX,
