Changeset 2150

Show
Ignore:
Timestamp:
07/29/08 21:35:42 (4 months ago)
Author:
karpet
Message:

ditch SWISH_META_CONNECTOR and SWISH_PROP_CONNECTOR in favor of SWISH_TOKENPOS_BUMPER; add tests showing differences in inline parsing for xml vs html

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • libswish3/trunk/src/libswish3/analyzer.c

    r2142 r2150  
    4141    a->ref_cnt = 0; 
    4242    a->tokenize = config->flags->tokenize; 
    43     a->tokenlist = 1; 
     43    a->tokenlist = 0;   // use wordlist by default 
    4444 
    4545    if (!a->tokenize && SWISH_DEBUG) 
  • libswish3/trunk/src/libswish3/io.c

    r2108 r2150  
    5555                j++; 
    5656            } 
    57             if (buffer[i] == SWISH_META_CONNECTOR[0] 
    58                 || buffer[i] == SWISH_PROP_CONNECTOR[0] 
    59                 ) { 
     57            if (buffer[i] == SWISH_TOKENPOS_BUMPER[0]) { 
    6058                buffer[i] = '\n'; 
    6159                j++; 
  • libswish3/trunk/src/libswish3/libswish3.h

    r2148 r2150  
    9999#define SWISH_PROP_MTIME           "swishlastmodified" 
    100100#define SWISH_PROP_DESCRIPTION     "swishdescription" 
    101 #define SWISH_PROP_CONNECTOR       " " 
    102 #define SWISH_META_CONNECTOR       "\3" 
     101#define SWISH_TOKENPOS_BUMPER      "\3" 
    103102 
    104103/* built-in id values */ 
  • libswish3/trunk/src/libswish3/namedbuffer.c

    r2126 r2150  
    124124     
    125125    buf = xmlBufferContent(buffer); 
    126     while ((substr = xmlStrstr(buf, (const xmlChar *)SWISH_META_CONNECTOR)) != NULL) { 
     126    while ((substr = xmlStrstr(buf, (const xmlChar *)SWISH_TOKENPOS_BUMPER)) != NULL) { 
    127127        sub_len = substr - buf; 
    128128        SWISH_DEBUG_MSG("%d <%s> substr: %s", sub_len, name, xmlStrsub(buf, 0, sub_len) ); 
  • libswish3/trunk/src/libswish3/parser.c

    r2148 r2150  
    11/*  
    2  * This file is part of libswish3 
    3  * Copyright (C) 2007 Peter Karman 
    4 
    5  *  libswish3 is free software; you can redistribute it and/or modify 
    6  *  it under the terms of the GNU General Public License as published by 
    7  *  the Free Software Foundation; either version 2 of the License, or 
    8  *  (at your option) any later version. 
    9 
    10  *  libswish3 is distributed in the hope that it will be useful, 
    11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of 
    12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
    13  *  GNU General Public License for more details. 
    14 
    15  *  You should have received a copy of the GNU General Public License 
    16  *  along with libswish3; if not, write to the Free Software 
    17  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
     2* This file is part of libswish3 
     3* Copyright (C) 2007 Peter Karman 
     4
     5*  libswish3 is free software; you can redistribute it and/or modify 
     6*  it under the terms of the GNU General Public License as published by 
     7*  the Free Software Foundation; either version 2 of the License, or 
     8*  (at your option) any later version. 
     9
     10*  libswish3 is distributed in the hope that it will be useful, 
     11*  but WITHOUT ANY WARRANTY; without even the implied warranty of 
     12*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
     13*  GNU General Public License for more details. 
     14
     15*  You should have received a copy of the GNU General Public License 
     16*  along with libswish3; if not, write to the Free Software 
     17*  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
    1818*/ 
    1919 
    2020/*  
    21  * parse XML doc from memory using libxml2 SAX2 based on tutorial at 
    22  * http://www.jamesh.id.au/articles/libxml-sax/libxml-sax.html 
    23 
    24  * save all character() data to buffer, flushing on new metanames 
    25  * flush should split buffer into words, skipping nonwordchars/space, and 
    26  * lowercase all 
    27 
    28  * see iswlower(3) man page, etc. 
    29 
    30  * all the mb*() functions rely on locale to recognize multi-byte strings 
    31 
     21* parse XML doc from memory using libxml2 SAX2 based on tutorial at 
     22* http://www.jamesh.id.au/articles/libxml-sax/libxml-sax.html 
     23
     24* save all character() data to buffer, flushing on new metanames 
     25* flush should split buffer into words, skipping nonwordchars/space, and 
     26* lowercase all 
     27
     28* see iswlower(3) man page, etc. 
     29
     30* all the mb*() functions rely on locale to recognize multi-byte strings 
     31
    3232*/ 
    3333 
     
    9292 
    9393/*  
    94  * SAX2 support  
     94* SAX2 support  
    9595*/ 
    9696static void mystartElementNs( 
     
    180180 
    181181/*  
    182  * parsing fh/buffer headers  
     182* parsing fh/buffer headers  
    183183*/ 
    184184typedef struct 
     
    234234 
    235235/*********************************************************************** 
    236  *                end prototypes 
    237  ***********************************************************************/ 
     236*                end prototypes 
     237***********************************************************************/ 
    238238 
    239239swish_Parser * 
     
    248248 
    249249/* 
    250      * libxml2 stuff  
     250* libxml2 stuff  
    251251*/ 
    252252    xmlInitParser(); 
     
    254254 
    255255/* 
    256      * debugging help  
     256* debugging help  
    257257*/ 
    258258    get_env_vars(); 
     
    283283 
    284284/*  
    285  * turn the literal xml/html tag into a swish tag for matching against 
    286  * metanames and properties  
     285* turn the literal xml/html tag into a swish tag for matching against 
     286* metanames and properties  
    287287*/ 
    288288static xmlChar * 
     
    314314 
    315315/* 
    316      * normalize all tags  
     316* normalize all tags  
    317317*/ 
    318318    swishtag = swish_str_tolower(tag); 
    319319 
    320320/* 
    321      * html tags  
     321* html tags  
    322322*/ 
    323323    if (parser_data->is_html) { 
     
    339339 
    340340/* 
    341                  * need to bump word_pos so we don't match across block * 
    342                  * elements  
    343 */ 
     341* need to bump word_pos so we don't match across block * 
     342* elements  
     343*/ 
     344                parser_data->bump_word = 1; 
    344345 
    345346            } 
    346         } 
    347  
    348 /* 
    349          * is this an HTML <meta> tag? treat 'name' attribute as a tag * 
    350          * and 'content' attribute as the tag content * we assume 'name' 
    351          * and 'content' are always in english.  
     347            else { 
     348             
     349                parser_data->bump_word = 0; 
     350             
     351            } 
     352        } 
     353 
     354/* 
     355* is this an HTML <meta> tag? treat 'name' attribute as a tag * 
     356* and 'content' attribute as the tag content * we assume 'name' 
     357* and 'content' are always in english.  
    352358*/ 
    353359 
     
    361367 
    362368/* 
    363                      * SWISH_DEBUG_MSG("found name: %s", atts[i+1]);  
     369* SWISH_DEBUG_MSG("found name: %s", atts[i+1]);  
    364370*/ 
    365371                    metaname = (xmlChar *)atts[i + 1]; 
     
    369375 
    370376/* 
    371                      * SWISH_DEBUG_MSG("found content: %s", atts[i+1]);  
     377* SWISH_DEBUG_MSG("found content: %s", atts[i+1]);  
    372378*/ 
    373379                    metacontent = (xmlChar *)atts[i + 1]; 
     
    377383        } 
    378384 
    379         if (metaname != NULL && metacontent != NULL) { 
    380             if (SWISH_DEBUG & SWISH_DEBUG_PARSER) 
    381                 SWISH_DEBUG_MSG("found HTML meta: %s => %s", metaname, metacontent); 
    382  
    383 /* 
    384              * do not match across metas  
    385 */ 
    386             parser_data->bump_word = 1; 
    387             open_tag(parser_data, metaname, NULL); 
    388             buffer_characters(parser_data, metacontent, xmlStrlen(metacontent)); 
    389             close_tag(parser_data, metaname); 
    390             swish_xfree(swishtag); 
    391             return NULL; 
    392         } 
    393  
    394     } 
    395  
    396 /* 
    397      * xml tags  
     385        if (metaname != NULL) { 
     386            if (metacontent != NULL) { 
     387                if (SWISH_DEBUG & SWISH_DEBUG_PARSER) 
     388                    SWISH_DEBUG_MSG("found HTML meta: %s => %s", metaname, metacontent); 
     389 
     390/* 
     391* do not match across metas  
     392*/ 
     393                parser_data->bump_word = 1; 
     394                open_tag(parser_data, metaname, NULL); 
     395                buffer_characters(parser_data, metacontent, xmlStrlen(metacontent)); 
     396                close_tag(parser_data, metaname); 
     397                swish_xfree(swishtag); 
     398                return NULL; 
     399 
     400            } 
     401            else { 
     402                SWISH_WARN("No content for meta tag '%s'", metaname); 
     403            } 
     404        } 
     405 
     406    } 
     407 
     408/* 
     409* xml tags  
    398410*/ 
    399411    else { 
    400412 
    401413/* 
    402          * TODO make this configurable ala swish2  
     414* TODO make this configurable ala swish2  
    403415*/ 
    404416 
     
    449461 
    450462/* 
    451      * change our internal name for this tag if it is aliased in config  
     463* change our internal name for this tag if it is aliased in config  
    452464*/ 
    453465    alias = swish_hash_fetch(parser_data->s3->config->tag_aliases, swishtag); 
     
    455467 
    456468/* 
    457          * SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias);  
     469* SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias);  
    458470*/ 
    459471        swish_xfree(swishtag); 
     
    482494 
    483495/* 
    484      * since we only flush the buffer when metaname changes, and we do 
    485      * not want to match across metanames, bump the word_pos here before  
    486      * parsing the string and making the tmp wordlist  
     496* since we only flush the buffer when metaname changes, and we do 
     497* not want to match across metanames, bump the word_pos here before  
     498* parsing the string and making the tmp wordlist  
    487499*/ 
    488500    if (parser_data->word_pos) 
     
    490502 
    491503/* 
    492      * add meta_buf as-is to metanames buffer under current tag. this 
    493      * gives us both tokens and raw text de-tagged but organized by 
    494      * metaname.  
     504* add meta_buf as-is to metanames buffer under current tag. this 
     505* gives us both tokens and raw text de-tagged but organized by 
     506* metaname.  
    495507*/ 
    496508    swish_add_buf_to_nb(parser_data->metanames, metaname, parser_data->meta_buf, 
    497                         (xmlChar *)SWISH_META_CONNECTOR, 0, 1); 
    498  
    499 /* 
    500      *  add to every metaname on the stack. 
    501      *  Disabling this for now, as it ought to be up the handler() to decide 
    502      *  to index a token under multiple metanames, and we associate context 
    503      *  with the WordList 
     509                        (xmlChar *)SWISH_TOKENPOS_BUMPER, 0, 1); 
     510 
     511/* 
     512*  add to every metaname on the stack. 
     513*  Disabling this for now, as it ought to be up the handler() to decide 
     514*  to index a token under multiple metanames, and we associate context 
     515*  with the WordList 
    504516*/ 
    505517 
     
    510522 
    511523            swish_add_buf_to_nb(parser_data->metanames, s->temp->baked, 
    512                                 parser_data->meta_buf, (xmlChar *)SWISH_META_CONNECTOR, 0,
    513                                 1); 
     524                                parser_data->meta_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER,
     525                                0, 1); 
    514526        } 
    515527    } 
     
    525537 
    526538/*  
    527  * SAX2 callback  
     539* SAX2 callback  
    528540*/ 
    529541static void 
     
    534546 
    535547/* 
    536      * swish_ParserData *parser_data = (swish_ParserData *) data;  
     548* swish_ParserData *parser_data = (swish_ParserData *) data;  
    537549*/ 
    538550 
     
    543555 
    544556/*  
    545  * SAX2 callback  
     557* SAX2 callback  
    546558*/ 
    547559static void 
     
    555567 
    556568/* 
    557      * whatever's left  
     569* whatever's left  
    558570*/ 
    559571    flush_buffer(parser_data, (xmlChar *)SWISH_DEFAULT_METANAME, 
     
    563575 
    564576/*  
    565  * SAX1 callback  
     577* SAX1 callback  
    566578*/ 
    567579static void 
     
    576588 
    577589/*  
    578  * SAX1 callback  
     590* SAX1 callback  
    579591*/ 
    580592static void 
     
    588600 
    589601/*  
    590  * SAX2 handler  
     602* SAX2 handler  
    591603*/ 
    592604static void 
     
    642654 
    643655/*  
    644  * SAX2 handler  
     656* SAX2 handler  
    645657*/ 
    646658static void 
     
    674686 
    675687/* 
    676      * set property if this tag is configured for it  
     688* set property if this tag is configured for it  
    677689*/ 
    678690    if (swish_hash_exists(parser_data->s3->config->properties, parser_data->tag)) { 
     
    690702 
    691703/* 
    692      * likewise for metastack  
     704* likewise for metastack  
    693705*/ 
    694706    if (swish_hash_exists(parser_data->s3->config->metanames, parser_data->tag)) { 
     
    719731 
    720732/* 
    721      * lowercase all names for comparison against metanames (which are 
    722      * also * lowercased)  
     733* lowercase all names for comparison against metanames (which are 
     734* also * lowercased)  
    723735*/ 
    724736    if (parser_data->tag != NULL) 
     
    743755    } 
    744756 
    745 /* 
    746      * turn flag off so next open_tag() can evaluate  
    747 */ 
    748     parser_data->bump_word = 0; 
    749  
    750757} 
    751758 
    752759/*  
    753  * handle all characters in doc  
     760* handle all characters in doc  
    754761*/ 
    755762static void 
     
    765772 
    766773/* 
    767      * why not wchar_t ? len is number of bytes, not number of 
    768      * characters, so xmlChar (i.e., char) works 
    769 */ 
    770  
    771 /* 
    772      * SWISH_DEBUG_MSG( "sizeof output buf is %d; len was %d\n", sizeof(output), 
    773      * len ); 
    774 */ 
    775  
    776 /* 
    777      * SWISH_DEBUG_MSG( "characters");  
     774* why not wchar_t ? len is number of bytes, not number of 
     775* characters, so xmlChar (i.e., char) works 
     776*/ 
     777 
     778/* 
     779* SWISH_DEBUG_MSG( "sizeof output buf is %d; len was %d\n", sizeof(output), 
     780* len ); 
     781*/ 
     782 
     783/* 
     784* SWISH_DEBUG_MSG( "characters");  
    778785*/ 
    779786 
    780787    for (i = 0; i < len; i++) { 
    781  
    782 /* 
    783          * fprintf(stderr, "%c", ch[i]);  
    784 */ 
    785788        output[i] = ch[i]; 
    786789    } 
    787790    output[i] = (xmlChar)NULL; 
    788791 
    789     if (parser_data->bump_word && xmlBufferLength(buf)) 
    790         swish_append_buffer(buf, (xmlChar *)" ", 1); 
    791  
     792    if (parser_data->bump_word && xmlBufferLength(buf)) { 
     793        swish_append_buffer(buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1); 
     794    } 
     795     
    792796    swish_append_buffer(buf, output, len); 
    793797 
    794798    if (parser_data->bump_word && xmlBufferLength(parser_data->prop_buf)) { 
    795  
    796 /* 
    797          * SWISH_DEBUG_MSG(" appending ' ' to prop_buf");  
    798 */ 
    799         swish_append_buffer(parser_data->prop_buf, (xmlChar *)" ", 1); 
    800     } 
    801  
    802 /* 
    803      * SWISH_DEBUG_MSG(" appending '%s' to prop_buf", output);  
    804 */ 
     799        swish_append_buffer(parser_data->prop_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1); 
     800    } 
     801    else if (xmlBufferLength(parser_data->prop_buf)) { 
     802        swish_append_buffer(parser_data->prop_buf, (xmlChar*)" ", 1); 
     803    } 
     804 
    805805    swish_append_buffer(parser_data->prop_buf, output, len); 
    806  
    807806} 
    808807 
    809808/*  
    810  * SAX2 callback  
     809* SAX2 callback  
    811810*/ 
    812811static void 
     
    824823 
    825824/*  
    826  * SAX2 callback  
     825* SAX2 callback  
    827826*/ 
    828827static void 
     
    835834 
    836835/* 
    837      * TODO: make comments indexing optional  
    838 */ 
    839  
    840 /* 
    841      * TODO: enable noindex option  
     836* TODO: make comments indexing optional  
     837*/ 
     838 
     839/* 
     840* TODO: enable noindex option  
    842841*/ 
    843842    return; 
     
    847846 
    848847/*  
    849  * SAX2 callback  
     848* SAX2 callback  
    850849*/ 
    851850static void 
     
    874873 
    875874/*  
    876  * SAX2 callback  
     875* SAX2 callback  
    877876*/ 
    878877static void 
     
    901900 
    902901/*  
    903  * SAX2 handler struct for html and xml parsing  
     902* SAX2 handler struct for html and xml parsing  
    904903*/ 
    905904 
     
    965964 
    966965/* 
    967      * slurp file if not already in memory  
     966* slurp file if not already in memory  
    968967*/ 
    969968    if (filename && !buffer) { 
     
    989988 
    990989/* 
    991          * SWISH_DEBUG_MSG( "freeing buffer");  
     990* SWISH_DEBUG_MSG( "freeing buffer");  
    992991*/ 
    993992        swish_xfree(buffer); 
     
    10241023    ptr->metanames = swish_init_nb(s3->config->metanames); 
    10251024    ptr->metanames->ref_cnt++; 
    1026      
     1025 
    10271026/* 
    10281027*   pick a tokenizer if one has not been explicitly set 
     
    10381037 
    10391038/* 
    1040      * prime the stacks  
     1039* prime the stacks  
    10411040*/ 
    10421041    ptr->metastack = (swish_TagStack *)swish_xmalloc(sizeof(swish_TagStack)); 
     
    10561055 
    10571056/* 
    1058      * no such property just to seed stack  
    1059 */ 
    1060  
    1061 /* 
    1062      * gets toggled per-tag  
     1057* no such property just to seed stack  
     1058*/ 
     1059 
     1060/* 
     1061* gets toggled per-tag  
    10631062*/ 
    10641063    ptr->bump_word = 1; 
    10651064 
    10661065/* 
    1067      * toggle  
     1066* toggle  
    10681067*/ 
    10691068    ptr->no_index = 0; 
    10701069 
    10711070/* 
    1072      * shortcut rather than looking parser up in hash for each tag event  
     1071* shortcut rather than looking parser up in hash for each tag event  
    10731072*/ 
    10741073    ptr->is_html = 0; 
    10751074 
    10761075/* 
    1077      * must be zero so that ++ works ok on first word  
     1076* must be zero so that ++ works ok on first word  
    10781077*/ 
    10791078    ptr->word_pos = 0; 
    10801079 
    10811080/* 
    1082      * always start at first byte  
     1081* always start at first byte  
    10831082*/ 
    10841083    ptr->offset = 0; 
    10851084 
    10861085/* 
    1087      * pointer to the xmlParserCtxt since we want to free it only after 
    1088      * we're completely done with it. NOTE this is a change per libxml2 
    1089      * vers > 2.6.16  
     1086* pointer to the xmlParserCtxt since we want to free it only after 
     1087* we're completely done with it. NOTE this is a change per libxml2 
     1088* vers > 2.6.16  
    10901089*/ 
    10911090    ptr->ctxt = NULL; 
     
    11091108 
    11101109/* 
    1111      * dec ref count for shared ptr  
     1110* dec ref count for shared ptr  
    11121111*/ 
    11131112    ptr->s3->ref_cnt--; 
    11141113 
    11151114/* 
    1116      * Pop the stacks  
     1115* Pop the stacks  
    11171116*/ 
    11181117    while ((st = pop_tag_stack(ptr->metastack)) != NULL) { 
     
    12501249 
    12511250/* 
    1252          * SWISH_DEBUG_MSG( "i = %d j = %d k = %d", i, j, k);  
     1251* SWISH_DEBUG_MSG( "i = %d j = %d k = %d", i, j, k);  
    12531252*/ 
    12541253 
     
    12591258 
    12601259/* 
    1261          * fprintf(stderr, "%c", line[i]);  
     1260* fprintf(stderr, "%c", line[i]);  
    12621261*/ 
    12631262        i++; 
     
    12711270 
    12721271/* 
    1273              * get to the next char no matter what, then check if == '\n'  
     1272* get to the next char no matter what, then check if == '\n'  
    12741273*/ 
    12751274            k++; 
     
    12781277 
    12791278/* 
    1280                  * fprintf(stderr, "found blank line at byte %d\n", k);  
     1279* fprintf(stderr, "found blank line at byte %d\n", k);  
    12811280*/ 
    12821281                h->body_start = k + 1; 
     
    14151414 
    14161415/* 
    1417              * TODO: get encoding out of this line too if 
    1418              * present. example:   text/xml; charset=ISO-8859-1 
     1416* TODO: get encoding out of this line too if 
     1417* present. example:   text/xml; charset=ISO-8859-1 
    14191418*/ 
    14201419 
     
    14431442 
    14441443/* 
    1445          * TODO update mode is a vers2 btree feature. still unclear if 
    1446          * we'll actually support it 
     1444* TODO update mode is a vers2 btree feature. still unclear if 
     1445* we'll actually support it 
    14471446*/ 
    14481447        if (!xmlStrncasecmp(line, (const xmlChar *)"Update-Mode", 11)) { 
     
    14621461 
    14631462/* 
    1464          * if we get here, unrecognized header line  
     1463* if we get here, unrecognized header line  
    14651464*/ 
    14661465        SWISH_WARN("Unknown header line: '%s'\n", line); 
     
    14821481 
    14831482/* 
    1484      * init the global env vars, but don't override if already set  
     1483* init the global env vars, but don't override if already set  
    14851484*/ 
    14861485 
     
    15241523 
    15251524/* 
    1526      * based on extprog.c  
     1525* based on extprog.c  
    15271526*/ 
    15281527    while (fgets((char *)ln, SWISH_MAXSTRLEN, fh) != 0) { 
    15291528 
    15301529/* 
    1531          * we don't use fgetws() because we don't care about * indiv 
    1532          * characters yet  
     1530* we don't use fgetws() because we don't care about * indiv 
     1531* characters yet  
    15331532*/ 
    15341533 
     
    15401539 
    15411540/* 
    1542          * trim any white space at end of doc, including \n  
     1541* trim any white space at end of doc, including \n  
    15431542*/ 
    15441543        if (end) { 
     
    15521551 
    15531552/* 
    1554              * blank line indicates body  
     1553* blank line indicates body  
    15551554*/ 
    15561555            curTime = swish_time_elapsed(); 
     
    15671566 
    15681567/* 
    1569              * parse  
     1568* parse  
    15701569*/ 
    15711570            xmlErr = 
     
    15871586 
    15881587/* 
    1589              * pass to callback function  
     1588* pass to callback function  
    15901589*/ 
    15911590            (*s3->parser->handler) (parser_data); 
     
    15951594 
    15961595/* 
    1597              * reset everything for next time  
     1596* reset everything for next time  
    15981597*/ 
    15991598 
     
    16051604 
    16061605/* 
    1607              * count the file  
     1606* count the file  
    16081607*/ 
    16091608            file_cnt++; 
     
    16161615 
    16171616/* 
    1618              * timer  
     1617* timer  
    16191618*/ 
    16201619            curTime = swish_time_elapsed(); 
     
    16331632 
    16341633/* 
    1635              * we are reading headers  
     1634* we are reading headers  
    16361635*/ 
    16371636            if (xmlBufferAdd(head_buf, line, -1)) 
     
    16711670 
    16721671/*  
    1673  * PUBLIC  
     1672* PUBLIC  
    16741673*/ 
    16751674 
    16761675/*  
    1677  * pass in a string including headers. like parsing fh, but only for one 
    1678  * doc 
     1676* pass in a string including headers. like parsing fh, but only for one 
     1677* doc 
    16791678*/ 
    16801679int 
     
    17011700 
    17021701/* 
    1703      * reposition buf pointer at start of body (just past head)  
     1702* reposition buf pointer at start of body (just past head)  
    17041703*/ 
    17051704 
     
    17091708 
    17101709/* 
    1711      * pass to callback function  
     1710* pass to callback function  
    17121711*/ 
    17131712    (*s3->parser->handler) (parser_data); 
     
    17211720 
    17221721/* 
    1723      * free buffers  
     1722* free buffers  
    17241723*/ 
    17251724    free_head(head); 
     
    17381737 
    17391738/*  
    1740  * PUBLIC  
     1739* PUBLIC  
    17411740*/ 
    17421741int 
     
    17641763 
    17651764/* 
    1766      * pass to callback function  
     1765* pass to callback function  
    17671766*/ 
    17681767    (*s3->parser->handler) (parser_data); 
     
    17761775 
    17771776/* 
    1778      * free buffers  
     1777* free buffers  
    17791778*/ 
    17801779    free_parser_data(parser_data); 
     
    17911790 
    17921791/** 
    1793  * based on libxml2 xmlSAXUserParseMemory in parser.c 
    1794  * which we don't use directly so that we can get encoding 
     1792* based on libxml2 xmlSAXUserParseMemory in parser.c 
     1793* which we don't use directly so that we can get encoding 
    17951794*/ 
    17961795static int 
     
    18171816 
    18181817/* 
    1819      * always use sax2 -- this pulled from xmlDetextSAX2()  
     1818* always use sax2 -- this pulled from xmlDetextSAX2()  
    18201819*/ 
    18211820    ctxt->str_xml = xmlDictLookup(ctxt->dict, BAD_CAST "xml", 3); 
     
    18261825 
    18271826/* 
    1828          * xmlErrMemory is/was not a public func but is in 
    1829          * parserInternals.h * basically, this is a bad, fatal error, so 
    1830          * we'll just die  
    1831 */ 
    1832  
    1833 /* 
    1834          * xmlErrMemory(ctxt, NULL);  
     1827* xmlErrMemory is/was not a public func but is in 
     1828* parserInternals.h * basically, this is a bad, fatal error, so 
     1829* we'll just die  
     1830*/ 
     1831 
     1832/* 
     1833* xmlErrMemory(ctxt, NULL);  
    18351834*/ 
    18361835        SWISH_CROAK("Fatal libxml2 memory error"); 
     
    19251924 
    19261925/* 
    1927      * TODO better encoding detection. for now we assume unknown text 
    1928      * files are latin1  
     1926* TODO better encoding detection. for now we assume unknown text 
     1927* files are latin1  
    19291928*/ 
    19301929    set_encoding(parser_data, buffer); 
     
    19701969 
    19711970/* 
    1972      * we obviously haven't any tags on which to trigger our metanames, 
    1973      * so set default 
    1974      * TODO get title somehow? 
    1975      * TODO check config to determine if we should buffer swish_prop_description etc 
     1971* we obviously haven't any tags on which to trigger our metanames, 
     1972* so set default 
     1973* TODO get title somehow? 
     1974* TODO check config to determine if we should buffer swish_prop_description etc 
    19761975*/ 
    19771976 
     
    20042003 
    20052004/* 
    2006      * this feels like it doesn't work ... would iconv() be better ?  
     2005* this feels like it doesn't work ... would iconv() be better ?  
    20072006*/ 
    20082007 
     
    20342033 
    20352034/* 
    2036          * if we get here, we didn't error with bad encoding via SAX, 
    2037          * so assume it's UTF-8 
     2035* if we get here, we didn't error with bad encoding via SAX, 
     2036* so assume it's UTF-8 
    20382037*/ 
    20392038        enc = swish_xstrdup((xmlChar *)SWISH_DEFAULT_ENCODING); 
     
    20702069 
    20712070/* 
    2072  * array buffer (token_iterator) tokenizer 
     2071* array buffer (token_iterator) tokenizer 
    20732072*/ 
    20742073 
     
    20842083 
    20852084/* 
    2086  * linked-list (wordlist) tokenizer 
     2085* linked-list (wordlist) tokenizer 
    20872086*/ 
    20882087 
     
    21022101 
    21032102/* 
    2104  *  append tmplist to master list  
     2103*  append tmplist to master list  
    21052104*/ 
    21062105        parser_data->word_pos += tmplist->nwords; 
     
    21132112 
    21142113/* 
    2115          * point tmp list first word's prev at current last word  
     2114* point tmp list first word's prev at current last word  
    21162115*/ 
    21172116            tmplist->head->prev = parser_data->wordlist->tail; 
    21182117 
    21192118/* 
    2120          * point current last word's 'next' at first word of tmp list  
     2119* point current last word's 'next' at first word of tmp list  
    21212120*/ 
    21222121            parser_data->wordlist->tail->next = tmplist->head; 
    21232122 
    21242123/* 
    2125          * point current last word at last word of tmp list  
     2124* point current last word at last word of tmp list  
    21262125*/ 
    21272126            parser_data->wordlist->tail = tmplist->tail; 
     
    21332132 
    21342133/* 
    2135      * global offset is now the same as the tail end_offset  
     2134* global offset is now the same as the tail end_offset  
    21362135*/ 
    21372136        parser_data->offset = parser_data->wordlist->tail->end_offset; 
     
    21662165 
    21672166/*  
    2168  * return stack as single string of space-separated names  
     2167* return stack as single string of space-separated names  
    21692168*/ 
    21702169static xmlChar * 
     
    22292228    if (baked != NULL) { 
    22302229        prop = swish_hash_fetch(parser_data->s3->config->properties, baked); 
    2231  
    2232 /* 
    2233          * should we strip whitespace from this particular property ?  
    2234 */ 
    22352230        if (prop->verbatim) 
    22362231            cleanwsp = 0; 
    22372232 
    22382233        swish_add_buf_to_nb(parser_data->properties, baked, parser_data->prop_buf, 
    2239                             (xmlChar *)SWISH_PROP_CONNECTOR, cleanwsp, 0); 
     2234                            (xmlChar *)SWISH_TOKENPOS_BUMPER, cleanwsp, 0); 
    22402235 
    22412236    } 
     
    22482243 
    22492244        swish_add_buf_to_nb(parser_data->properties, stack->temp->baked, 
    2250                             parser_data->prop_buf, (xmlChar *)SWISH_PROP_CONNECTOR, 
     2245                            parser_data->prop_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 
    22512246                            cleanwsp, 0); 
    22522247    } 
     
    23632358 
    23642359/*  
    2365  * returns top of the stack if the current tag matches. 
     2360* returns top of the stack if the current tag matches. 
    23662361*/ 
    23672362static swish_Tag * 
     
    23892384 
    23902385/* 
    2391          * more than default meta  
     2386* more than default meta  
    23922387*/ 
    23932388        if ((st = pop_tag_stack(stack)) != NULL) { 
     
    24032398 
    24042399/* 
    2405          * only tag on stack. TODO do we ever get here?  
     2400* only tag on stack. TODO do we ever get here?  
    24062401*/ 
    24072402        else if (stack->count) { 
  • libswish3/trunk/src/t/001-wordcount.t

    r2148 r2150  
    33use strict; 
    44use warnings; 
    5 use Test::More tests => 23;
     5use Test::More tests => 25;
    66use SwishTestUtils; 
    77 
     
    1717    'nested_meta.xml'  => '18', 
    1818    't.html'           => '6', 
    19     'testutf.xml'      => '8746',    #'8685', 
     19    'testutf.xml'      => '8754', 
    2020    'utf.xml'          => '30', 
    2121    'words.txt'        => '55', 
     
    2828    'html_broken.html' => '2', 
    2929    'properties.html'  => 19, 
     30    'inline.xml'       => 12, 
     31    'inline.html'      => 9, 
    3032 
    3133); 
    3234 
    3335my %stdindocs = ( 
    34     'doc.xml' => '8407', 
    35     'test.txt'   => 1, 
     36    'doc.xml' => '8407', 
     37    'test.txt' => 1, 
    3638 
    3739);