Changeset 2150 for libswish3/trunk/src/libswish3
- Timestamp:
- 07/29/08 21:35:42 (5 months ago)
- Files:
-
- libswish3/trunk/src/libswish3/analyzer.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/io.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/libswish3.h (modified) (1 diff)
- libswish3/trunk/src/libswish3/namedbuffer.c (modified) (1 diff)
- libswish3/trunk/src/libswish3/parser.c (modified) (83 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/src/libswish3/analyzer.c
r2142 r2150 41 41 a->ref_cnt = 0; 42 42 a->tokenize = config->flags->tokenize; 43 a->tokenlist = 1; 43 a->tokenlist = 0; // use wordlist by default 44 44 45 45 if (!a->tokenize && SWISH_DEBUG) libswish3/trunk/src/libswish3/io.c
r2108 r2150 55 55 j++; 56 56 } 57 if (buffer[i] == SWISH_META_CONNECTOR[0] 58 || buffer[i] == SWISH_PROP_CONNECTOR[0] 59 ) { 57 if (buffer[i] == SWISH_TOKENPOS_BUMPER[0]) { 60 58 buffer[i] = '\n'; 61 59 j++; libswish3/trunk/src/libswish3/libswish3.h
r2148 r2150 99 99 #define SWISH_PROP_MTIME "swishlastmodified" 100 100 #define SWISH_PROP_DESCRIPTION "swishdescription" 101 #define SWISH_PROP_CONNECTOR " " 102 #define SWISH_META_CONNECTOR "\3" 101 #define SWISH_TOKENPOS_BUMPER "\3" 103 102 104 103 /* built-in id values */ libswish3/trunk/src/libswish3/namedbuffer.c
r2126 r2150 124 124 125 125 buf = xmlBufferContent(buffer); 126 while ((substr = xmlStrstr(buf, (const xmlChar *)SWISH_META_CONNECTOR)) != NULL) { 126 while ((substr = xmlStrstr(buf, (const xmlChar *)SWISH_TOKENPOS_BUMPER)) != NULL) { 127 127 sub_len = substr - buf; 128 128 SWISH_DEBUG_MSG("%d <%s> substr: %s", sub_len, name, xmlStrsub(buf, 0, sub_len) ); libswish3/trunk/src/libswish3/parser.c
r2148 r2150 1 1 /* 2 * This file is part of libswish33 * Copyright (C) 2007 Peter Karman4 *5 * libswish3 is free software; you can redistribute it and/or modify6 * it under the terms of the GNU General Public License as published by7 * the Free Software Foundation; either version 2 of the License, or8 * (at your option) any later version.9 *10 * libswish3 is distributed in the hope that it will be useful,11 * but WITHOUT ANY WARRANTY; without even the implied warranty of12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the13 * GNU General Public License for more details.14 *15 * You should have received a copy of the GNU General Public License16 * along with libswish3; if not, write to the Free Software17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA2 * This file is part of libswish3 3 * Copyright (C) 2007 Peter Karman 4 * 5 * libswish3 is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * libswish3 is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 
14 * 15 * You should have received a copy of the GNU General Public License 16 * along with libswish3; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 18 */ 19 19 20 20 /* 21 * parse XML doc from memory using libxml2 SAX2 based on tutorial at22 * http://www.jamesh.id.au/articles/libxml-sax/libxml-sax.html23 *24 * save all character() data to buffer, flushing on new metanames25 * flush should split buffer into words, skipping nonwordchars/space, and26 * lowercase all27 *28 * see iswlower(3) man page, etc.29 *30 * all the mb*() functions rely on locale to recognize multi-byte strings31 *21 * parse XML doc from memory using libxml2 SAX2 based on tutorial at 22 * http://www.jamesh.id.au/articles/libxml-sax/libxml-sax.html 23 * 24 * save all character() data to buffer, flushing on new metanames 25 * flush should split buffer into words, skipping nonwordchars/space, and 26 * lowercase all 27 * 28 * see iswlower(3) man page, etc. 
29 * 30 * all the mb*() functions rely on locale to recognize multi-byte strings 31 * 32 32 */ 33 33 … … 92 92 93 93 /* 94 * SAX2 support94 * SAX2 support 95 95 */ 96 96 static void mystartElementNs( … … 180 180 181 181 /* 182 * parsing fh/buffer headers182 * parsing fh/buffer headers 183 183 */ 184 184 typedef struct … … 234 234 235 235 /*********************************************************************** 236 * end prototypes237 ***********************************************************************/236 * end prototypes 237 ***********************************************************************/ 238 238 239 239 swish_Parser * … … 248 248 249 249 /* 250 * libxml2 stuff250 * libxml2 stuff 251 251 */ 252 252 xmlInitParser(); … … 254 254 255 255 /* 256 * debugging help256 * debugging help 257 257 */ 258 258 get_env_vars(); … … 283 283 284 284 /* 285 * turn the literal xml/html tag into a swish tag for matching against286 * metanames and properties285 * turn the literal xml/html tag into a swish tag for matching against 286 * metanames and properties 287 287 */ 288 288 static xmlChar * … … 314 314 315 315 /* 316 * normalize all tags316 * normalize all tags 317 317 */ 318 318 swishtag = swish_str_tolower(tag); 319 319 320 320 /* 321 * html tags321 * html tags 322 322 */ 323 323 if (parser_data->is_html) { … … 339 339 340 340 /* 341 * need to bump word_pos so we don't match across block * 342 * elements 343 */ 341 * need to bump word_pos so we don't match across block * 342 * elements 343 */ 344 parser_data->bump_word = 1; 344 345 345 346 } 346 } 347 348 /* 349 * is this an HTML <meta> tag? treat 'name' attribute as a tag * 350 * and 'content' attribute as the tag content * we assume 'name' 351 * and 'content' are always in english. 347 else { 348 349 parser_data->bump_word = 0; 350 351 } 352 } 353 354 /* 355 * is this an HTML <meta> tag? 
treat 'name' attribute as a tag * 356 * and 'content' attribute as the tag content * we assume 'name' 357 * and 'content' are always in english. 352 358 */ 353 359 … … 361 367 362 368 /* 363 * SWISH_DEBUG_MSG("found name: %s", atts[i+1]);369 * SWISH_DEBUG_MSG("found name: %s", atts[i+1]); 364 370 */ 365 371 metaname = (xmlChar *)atts[i + 1]; … … 369 375 370 376 /* 371 * SWISH_DEBUG_MSG("found content: %s", atts[i+1]);377 * SWISH_DEBUG_MSG("found content: %s", atts[i+1]); 372 378 */ 373 379 metacontent = (xmlChar *)atts[i + 1]; … … 377 383 } 378 384 379 if (metaname != NULL && metacontent != NULL) { 380 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) 381 SWISH_DEBUG_MSG("found HTML meta: %s => %s", metaname, metacontent); 382 383 /* 384 * do not match across metas 385 */ 386 parser_data->bump_word = 1; 387 open_tag(parser_data, metaname, NULL); 388 buffer_characters(parser_data, metacontent, xmlStrlen(metacontent)); 389 close_tag(parser_data, metaname); 390 swish_xfree(swishtag); 391 return NULL; 392 } 393 394 } 395 396 /* 397 * xml tags 385 if (metaname != NULL) { 386 if (metacontent != NULL) { 387 if (SWISH_DEBUG & SWISH_DEBUG_PARSER) 388 SWISH_DEBUG_MSG("found HTML meta: %s => %s", metaname, metacontent); 389 390 /* 391 * do not match across metas 392 */ 393 parser_data->bump_word = 1; 394 open_tag(parser_data, metaname, NULL); 395 buffer_characters(parser_data, metacontent, xmlStrlen(metacontent)); 396 close_tag(parser_data, metaname); 397 swish_xfree(swishtag); 398 return NULL; 399 400 } 401 else { 402 SWISH_WARN("No content for meta tag '%s'", metaname); 403 } 404 } 405 406 } 407 408 /* 409 * xml tags 398 410 */ 399 411 else { 400 412 401 413 /* 402 * TODO make this configurable ala swish2414 * TODO make this configurable ala swish2 403 415 */ 404 416 … … 449 461 450 462 /* 451 * change our internal name for this tag if it is aliased in config463 * change our internal name for this tag if it is aliased in config 452 464 */ 453 465 alias = 
swish_hash_fetch(parser_data->s3->config->tag_aliases, swishtag); … … 455 467 456 468 /* 457 * SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias);469 * SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias); 458 470 */ 459 471 swish_xfree(swishtag); … … 482 494 483 495 /* 484 * since we only flush the buffer when metaname changes, and we do485 * not want to match across metanames, bump the word_pos here before486 * parsing the string and making the tmp wordlist496 * since we only flush the buffer when metaname changes, and we do 497 * not want to match across metanames, bump the word_pos here before 498 * parsing the string and making the tmp wordlist 487 499 */ 488 500 if (parser_data->word_pos) … … 490 502 491 503 /* 492 * add meta_buf as-is to metanames buffer under current tag. this493 * gives us both tokens and raw text de-tagged but organized by494 * metaname.504 * add meta_buf as-is to metanames buffer under current tag. this 505 * gives us both tokens and raw text de-tagged but organized by 506 * metaname. 495 507 */ 496 508 swish_add_buf_to_nb(parser_data->metanames, metaname, parser_data->meta_buf, 497 (xmlChar *)SWISH_ META_CONNECTOR, 0, 1);498 499 /* 500 * add to every metaname on the stack.501 * Disabling this for now, as it ought to be up the handler() to decide502 * to index a token under multiple metanames, and we associate context503 * with the WordList509 (xmlChar *)SWISH_TOKENPOS_BUMPER, 0, 1); 510 511 /* 512 * add to every metaname on the stack. 
513 * Disabling this for now, as it ought to be up the handler() to decide 514 * to index a token under multiple metanames, and we associate context 515 * with the WordList 504 516 */ 505 517 … … 510 522 511 523 swish_add_buf_to_nb(parser_data->metanames, s->temp->baked, 512 parser_data->meta_buf, (xmlChar *)SWISH_ META_CONNECTOR, 0,513 1);524 parser_data->meta_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 525 0, 1); 514 526 } 515 527 } … … 525 537 526 538 /* 527 * SAX2 callback539 * SAX2 callback 528 540 */ 529 541 static void … … 534 546 535 547 /* 536 * swish_ParserData *parser_data = (swish_ParserData *) data;548 * swish_ParserData *parser_data = (swish_ParserData *) data; 537 549 */ 538 550 … … 543 555 544 556 /* 545 * SAX2 callback557 * SAX2 callback 546 558 */ 547 559 static void … … 555 567 556 568 /* 557 * whatever's left569 * whatever's left 558 570 */ 559 571 flush_buffer(parser_data, (xmlChar *)SWISH_DEFAULT_METANAME, … … 563 575 564 576 /* 565 * SAX1 callback577 * SAX1 callback 566 578 */ 567 579 static void … … 576 588 577 589 /* 578 * SAX1 callback590 * SAX1 callback 579 591 */ 580 592 static void … … 588 600 589 601 /* 590 * SAX2 handler602 * SAX2 handler 591 603 */ 592 604 static void … … 642 654 643 655 /* 644 * SAX2 handler656 * SAX2 handler 645 657 */ 646 658 static void … … 674 686 675 687 /* 676 * set property if this tag is configured for it688 * set property if this tag is configured for it 677 689 */ 678 690 if (swish_hash_exists(parser_data->s3->config->properties, parser_data->tag)) { … … 690 702 691 703 /* 692 * likewise for metastack704 * likewise for metastack 693 705 */ 694 706 if (swish_hash_exists(parser_data->s3->config->metanames, parser_data->tag)) { … … 719 731 720 732 /* 721 * lowercase all names for comparison against metanames (which are722 * also * lowercased)733 * lowercase all names for comparison against metanames (which are 734 * also * lowercased) 723 735 */ 724 736 if (parser_data->tag != NULL) … … 743 755 } 744 756 745 /*746 
* turn flag off so next open_tag() can evaluate747 */748 parser_data->bump_word = 0;749 750 757 } 751 758 752 759 /* 753 * handle all characters in doc760 * handle all characters in doc 754 761 */ 755 762 static void … … 765 772 766 773 /* 767 * why not wchar_t ? len is number of bytes, not number of768 * characters, so xmlChar (i.e., char) works769 */ 770 771 /* 772 * SWISH_DEBUG_MSG( "sizeof output buf is %d; len was %d\n", sizeof(output),773 * len );774 */ 775 776 /* 777 * SWISH_DEBUG_MSG( "characters");774 * why not wchar_t ? len is number of bytes, not number of 775 * characters, so xmlChar (i.e., char) works 776 */ 777 778 /* 779 * SWISH_DEBUG_MSG( "sizeof output buf is %d; len was %d\n", sizeof(output), 780 * len ); 781 */ 782 783 /* 784 * SWISH_DEBUG_MSG( "characters"); 778 785 */ 779 786 780 787 for (i = 0; i < len; i++) { 781 782 /*783 * fprintf(stderr, "%c", ch[i]);784 */785 788 output[i] = ch[i]; 786 789 } 787 790 output[i] = (xmlChar)NULL; 788 791 789 if (parser_data->bump_word && xmlBufferLength(buf)) 790 swish_append_buffer(buf, (xmlChar *)" ", 1); 791 792 if (parser_data->bump_word && xmlBufferLength(buf)) { 793 swish_append_buffer(buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1); 794 } 795 792 796 swish_append_buffer(buf, output, len); 793 797 794 798 if (parser_data->bump_word && xmlBufferLength(parser_data->prop_buf)) { 795 796 /* 797 * SWISH_DEBUG_MSG(" appending ' ' to prop_buf"); 798 */ 799 swish_append_buffer(parser_data->prop_buf, (xmlChar *)" ", 1); 800 } 801 802 /* 803 * SWISH_DEBUG_MSG(" appending '%s' to prop_buf", output); 804 */ 799 swish_append_buffer(parser_data->prop_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1); 800 } 801 else if (xmlBufferLength(parser_data->prop_buf)) { 802 swish_append_buffer(parser_data->prop_buf, (xmlChar*)" ", 1); 803 } 804 805 805 swish_append_buffer(parser_data->prop_buf, output, len); 806 807 806 } 808 807 809 808 /* 810 * SAX2 callback809 * SAX2 callback 811 810 */ 812 811 static void … … 824 823 825 824 /* 826 * 
SAX2 callback825 * SAX2 callback 827 826 */ 828 827 static void … … 835 834 836 835 /* 837 * TODO: make comments indexing optional838 */ 839 840 /* 841 * TODO: enable noindex option836 * TODO: make comments indexing optional 837 */ 838 839 /* 840 * TODO: enable noindex option 842 841 */ 843 842 return; … … 847 846 848 847 /* 849 * SAX2 callback848 * SAX2 callback 850 849 */ 851 850 static void … … 874 873 875 874 /* 876 * SAX2 callback875 * SAX2 callback 877 876 */ 878 877 static void … … 901 900 902 901 /* 903 * SAX2 handler struct for html and xml parsing902 * SAX2 handler struct for html and xml parsing 904 903 */ 905 904 … … 965 964 966 965 /* 967 * slurp file if not already in memory966 * slurp file if not already in memory 968 967 */ 969 968 if (filename && !buffer) { … … 989 988 990 989 /* 991 * SWISH_DEBUG_MSG( "freeing buffer");990 * SWISH_DEBUG_MSG( "freeing buffer"); 992 991 */ 993 992 swish_xfree(buffer); … … 1024 1023 ptr->metanames = swish_init_nb(s3->config->metanames); 1025 1024 ptr->metanames->ref_cnt++; 1026 1025 1027 1026 /* 1028 1027 * pick a tokenizer if one has not been explicitly set … … 1038 1037 1039 1038 /* 1040 * prime the stacks1039 * prime the stacks 1041 1040 */ 1042 1041 ptr->metastack = (swish_TagStack *)swish_xmalloc(sizeof(swish_TagStack)); … … 1056 1055 1057 1056 /* 1058 * no such property just to seed stack1059 */ 1060 1061 /* 1062 * gets toggled per-tag1057 * no such property just to seed stack 1058 */ 1059 1060 /* 1061 * gets toggled per-tag 1063 1062 */ 1064 1063 ptr->bump_word = 1; 1065 1064 1066 1065 /* 1067 * toggle1066 * toggle 1068 1067 */ 1069 1068 ptr->no_index = 0; 1070 1069 1071 1070 /* 1072 * shortcut rather than looking parser up in hash for each tag event1071 * shortcut rather than looking parser up in hash for each tag event 1073 1072 */ 1074 1073 ptr->is_html = 0; 1075 1074 1076 1075 /* 1077 * must be zero so that ++ works ok on first word1076 * must be zero so that ++ works ok on first word 1078 1077 */ 1079 
1078 ptr->word_pos = 0; 1080 1079 1081 1080 /* 1082 * always start at first byte1081 * always start at first byte 1083 1082 */ 1084 1083 ptr->offset = 0; 1085 1084 1086 1085 /* 1087 * pointer to the xmlParserCtxt since we want to free it only after1088 * we're completely done with it. NOTE this is a change per libxml21089 * vers > 2.6.161086 * pointer to the xmlParserCtxt since we want to free it only after 1087 * we're completely done with it. NOTE this is a change per libxml2 1088 * vers > 2.6.16 1090 1089 */ 1091 1090 ptr->ctxt = NULL; … … 1109 1108 1110 1109 /* 1111 * dec ref count for shared ptr1110 * dec ref count for shared ptr 1112 1111 */ 1113 1112 ptr->s3->ref_cnt--; 1114 1113 1115 1114 /* 1116 * Pop the stacks1115 * Pop the stacks 1117 1116 */ 1118 1117 while ((st = pop_tag_stack(ptr->metastack)) != NULL) { … … 1250 1249 1251 1250 /* 1252 * SWISH_DEBUG_MSG( "i = %d j = %d k = %d", i, j, k);1251 * SWISH_DEBUG_MSG( "i = %d j = %d k = %d", i, j, k); 1253 1252 */ 1254 1253 … … 1259 1258 1260 1259 /* 1261 * fprintf(stderr, "%c", line[i]);1260 * fprintf(stderr, "%c", line[i]); 1262 1261 */ 1263 1262 i++; … … 1271 1270 1272 1271 /* 1273 * get to the next char no matter what, then check if == '\n'1272 * get to the next char no matter what, then check if == '\n' 1274 1273 */ 1275 1274 k++; … … 1278 1277 1279 1278 /* 1280 * fprintf(stderr, "found blank line at byte %d\n", k);1279 * fprintf(stderr, "found blank line at byte %d\n", k); 1281 1280 */ 1282 1281 h->body_start = k + 1; … … 1415 1414 1416 1415 /* 1417 * TODO: get encoding out of this line too if1418 * present. example: text/xml; charset=ISO-8859-11416 * TODO: get encoding out of this line too if 1417 * present. example: text/xml; charset=ISO-8859-1 1419 1418 */ 1420 1419 … … 1443 1442 1444 1443 /* 1445 * TODO update mode is a vers2 btree feature. still unclear if1446 * we'll actually support it1444 * TODO update mode is a vers2 btree feature. 
still unclear if 1445 * we'll actually support it 1447 1446 */ 1448 1447 if (!xmlStrncasecmp(line, (const xmlChar *)"Update-Mode", 11)) { … … 1462 1461 1463 1462 /* 1464 * if we get here, unrecognized header line1463 * if we get here, unrecognized header line 1465 1464 */ 1466 1465 SWISH_WARN("Unknown header line: '%s'\n", line); … … 1482 1481 1483 1482 /* 1484 * init the global env vars, but don't override if already set1483 * init the global env vars, but don't override if already set 1485 1484 */ 1486 1485 … … 1524 1523 1525 1524 /* 1526 * based on extprog.c1525 * based on extprog.c 1527 1526 */ 1528 1527 while (fgets((char *)ln, SWISH_MAXSTRLEN, fh) != 0) { 1529 1528 1530 1529 /* 1531 * we don't use fgetws() because we don't care about * indiv1532 * characters yet1530 * we don't use fgetws() because we don't care about * indiv 1531 * characters yet 1533 1532 */ 1534 1533 … … 1540 1539 1541 1540 /* 1542 * trim any white space at end of doc, including \n1541 * trim any white space at end of doc, including \n 1543 1542 */ 1544 1543 if (end) { … … 1552 1551 1553 1552 /* 1554 * blank line indicates body1553 * blank line indicates body 1555 1554 */ 1556 1555 curTime = swish_time_elapsed(); … … 1567 1566 1568 1567 /* 1569 * parse1568 * parse 1570 1569 */ 1571 1570 xmlErr = … … 1587 1586 1588 1587 /* 1589 * pass to callback function1588 * pass to callback function 1590 1589 */ 1591 1590 (*s3->parser->handler) (parser_data); … … 1595 1594 1596 1595 /* 1597 * reset everything for next time1596 * reset everything for next time 1598 1597 */ 1599 1598 … … 1605 1604 1606 1605 /* 1607 * count the file1606 * count the file 1608 1607 */ 1609 1608 file_cnt++; … … 1616 1615 1617 1616 /* 1618 * timer1617 * timer 1619 1618 */ 1620 1619 curTime = swish_time_elapsed(); … … 1633 1632 1634 1633 /* 1635 * we are reading headers1634 * we are reading headers 1636 1635 */ 1637 1636 if (xmlBufferAdd(head_buf, line, -1)) … … 1671 1670 1672 1671 /* 1673 * PUBLIC1672 * PUBLIC 1674 1673 */ 1675 
1674 1676 1675 /* 1677 * pass in a string including headers. like parsing fh, but only for one1678 * doc1676 * pass in a string including headers. like parsing fh, but only for one 1677 * doc 1679 1678 */ 1680 1679 int … … 1701 1700 1702 1701 /* 1703 * reposition buf pointer at start of body (just past head)1702 * reposition buf pointer at start of body (just past head) 1704 1703 */ 1705 1704 … … 1709 1708 1710 1709 /* 1711 * pass to callback function1710 * pass to callback function 1712 1711 */ 1713 1712 (*s3->parser->handler) (parser_data); … … 1721 1720 1722 1721 /* 1723 * free buffers1722 * free buffers 1724 1723 */ 1725 1724 free_head(head); … … 1738 1737 1739 1738 /* 1740 * PUBLIC1739 * PUBLIC 1741 1740 */ 1742 1741 int … … 1764 1763 1765 1764 /* 1766 * pass to callback function1765 * pass to callback function 1767 1766 */ 1768 1767 (*s3->parser->handler) (parser_data); … … 1776 1775 1777 1776 /* 1778 * free buffers1777 * free buffers 1779 1778 */ 1780 1779 free_parser_data(parser_data); … … 1791 1790 1792 1791 /** 1793 * based on libxml2 xmlSAXUserParseMemory in parser.c1794 * which we don't use directly so that we can get encoding1792 * based on libxml2 xmlSAXUserParseMemory in parser.c 1793 * which we don't use directly so that we can get encoding 1795 1794 */ 1796 1795 static int … … 1817 1816 1818 1817 /* 1819 * always use sax2 -- this pulled from xmlDetextSAX2()1818 * always use sax2 -- this pulled from xmlDetextSAX2() 1820 1819 */ 1821 1820 ctxt->str_xml = xmlDictLookup(ctxt->dict, BAD_CAST "xml", 3); … … 1826 1825 1827 1826 /* 1828 * xmlErrMemory is/was not a public func but is in1829 * parserInternals.h * basically, this is a bad, fatal error, so1830 * we'll just die1831 */ 1832 1833 /* 1834 * xmlErrMemory(ctxt, NULL);1827 * xmlErrMemory is/was not a public func but is in 1828 * parserInternals.h * basically, this is a bad, fatal error, so 1829 * we'll just die 1830 */ 1831 1832 /* 1833 * xmlErrMemory(ctxt, NULL); 1835 1834 */ 1836 1835 
SWISH_CROAK("Fatal libxml2 memory error"); … … 1925 1924 1926 1925 /* 1927 * TODO better encoding detection. for now we assume unknown text1928 * files are latin11926 * TODO better encoding detection. for now we assume unknown text 1927 * files are latin1 1929 1928 */ 1930 1929 set_encoding(parser_data, buffer); … … 1970 1969 1971 1970 /* 1972 * we obviously haven't any tags on which to trigger our metanames,1973 * so set default1974 * TODO get title somehow?1975 * TODO check config to determine if we should buffer swish_prop_description etc1971 * we obviously haven't any tags on which to trigger our metanames, 1972 * so set default 1973 * TODO get title somehow? 1974 * TODO check config to determine if we should buffer swish_prop_description etc 1976 1975 */ 1977 1976 … … 2004 2003 2005 2004 /* 2006 * this feels like it doesn't work ... would iconv() be better ?2005 * this feels like it doesn't work ... would iconv() be better ? 2007 2006 */ 2008 2007 … … 2034 2033 2035 2034 /* 2036 * if we get here, we didn't error with bad encoding via SAX,2037 * so assume it's UTF-82035 * if we get here, we didn't error with bad encoding via SAX, 2036 * so assume it's UTF-8 2038 2037 */ 2039 2038 enc = swish_xstrdup((xmlChar *)SWISH_DEFAULT_ENCODING); … … 2070 2069 2071 2070 /* 2072 * array buffer (token_iterator) tokenizer2071 * array buffer (token_iterator) tokenizer 2073 2072 */ 2074 2073 … … 2084 2083 2085 2084 /* 2086 * linked-list (wordlist) tokenizer2085 * linked-list (wordlist) tokenizer 2087 2086 */ 2088 2087 … … 2102 2101 2103 2102 /* 2104 * append tmplist to master list2103 * append tmplist to master list 2105 2104 */ 2106 2105 parser_data->word_pos += tmplist->nwords; … … 2113 2112 2114 2113 /* 2115 * point tmp list first word's prev at current last word2114 * point tmp list first word's prev at current last word 2116 2115 */ 2117 2116 tmplist->head->prev = parser_data->wordlist->tail; 2118 2117 2119 2118 /* 2120 * point current last word's 'next' at first word of 
tmp list2119 * point current last word's 'next' at first word of tmp list 2121 2120 */ 2122 2121 parser_data->wordlist->tail->next = tmplist->head; 2123 2122 2124 2123 /* 2125 * point current last word at last word of tmp list2124 * point current last word at last word of tmp list 2126 2125 */ 2127 2126 parser_data->wordlist->tail = tmplist->tail; … … 2133 2132 2134 2133 /* 2135 * global offset is now the same as the tail end_offset2134 * global offset is now the same as the tail end_offset 2136 2135 */ 2137 2136 parser_data->offset = parser_data->wordlist->tail->end_offset; … … 2166 2165 2167 2166 /* 2168 * return stack as single string of space-separated names2167 * return stack as single string of space-separated names 2169 2168 */ 2170 2169 static xmlChar * … … 2229 2228 if (baked != NULL) { 2230 2229 prop = swish_hash_fetch(parser_data->s3->config->properties, baked); 2231 2232 /*2233 * should we strip whitespace from this particular property ?2234 */2235 2230 if (prop->verbatim) 2236 2231 cleanwsp = 0; 2237 2232 2238 2233 swish_add_buf_to_nb(parser_data->properties, baked, parser_data->prop_buf, 2239 (xmlChar *)SWISH_ PROP_CONNECTOR, cleanwsp, 0);2234 (xmlChar *)SWISH_TOKENPOS_BUMPER, cleanwsp, 0); 2240 2235 2241 2236 } … … 2248 2243 2249 2244 swish_add_buf_to_nb(parser_data->properties, stack->temp->baked, 2250 parser_data->prop_buf, (xmlChar *)SWISH_ PROP_CONNECTOR,2245 parser_data->prop_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 2251 2246 cleanwsp, 0); 2252 2247 } … … 2363 2358 2364 2359 /* 2365 * returns top of the stack if the current tag matches.2360 * returns top of the stack if the current tag matches. 2366 2361 */ 2367 2362 static swish_Tag * … … 2389 2384 2390 2385 /* 2391 * more than default meta2386 * more than default meta 2392 2387 */ 2393 2388 if ((st = pop_tag_stack(stack)) != NULL) { … … 2403 2398 2404 2399 /* 2405 * only tag on stack. TODO do we ever get here?2400 * only tag on stack. TODO do we ever get here? 
2406 2401 */ 2407 2402 else if (stack->count) {
