Changeset 2141 for libswish3/trunk

Show
Ignore:
Timestamp:
04/30/08 00:03:02 (2 months ago)
Author:
karpet
Message:

port the ascii optimizations in words.c to tokenizer.c and expose some previously private functions to the public API

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • libswish3/trunk/src/libswish3/config.c

    r2140 r2141  
    1919*/ 
    2020 
    21 /* parse XML-style config files 
    22  * 
    23  * based on http://www.yolinux.com/TUTORIALS/GnomeLibXml2.html 
    24  * 
    25 */ 
    26  
    2721#include <sys/param.h> 
    2822#include <stdio.h> 
     
    196190    flags->meta_ids = swish_init_hash(8); 
    197191    flags->prop_ids = swish_init_hash(8); 
    198     flags->contexts = swish_init_hash(8); 
     192    //flags->contexts = swish_init_hash(8); 
    199193 
    200194    return flags; 
  • libswish3/trunk/src/libswish3/libswish3.h

    r2140 r2141  
    236236    xmlHashTablePtr meta_ids; 
    237237    xmlHashTablePtr prop_ids; 
    238     xmlHashTablePtr contexts; 
     238    //xmlHashTablePtr contexts; 
    239239}; 
    240240 
     
    477477*/ 
    478478void                swish_verify_utf8_locale(); 
    479 int                 swish_is_ascii( xmlChar *str ); 
     479boolean             swish_is_ascii( xmlChar *str ); 
     480int                 swish_bytes_in_wchar( int wchar ); 
    480481int                 swish_utf8_chr_len( xmlChar *utf8 ); 
    481482int                 swish_utf8_codepoint( xmlChar *utf8 ); 
     
    487488wchar_t *           swish_wstr_tolower(wchar_t *s); 
    488489xmlChar *           swish_str_tolower(xmlChar *s ); 
     490xmlChar *           swish_utf8_str_tolower(xmlChar *s); 
     491xmlChar *           swish_ascii_str_tolower(xmlChar *s); 
    489492xmlChar *           swish_str_skip_ws(xmlChar *s); 
    490493void                swish_str_trim_ws(xmlChar *string); 
     
    609612swish_Token *       swish_next_token( swish_TokenIterator *it ); 
    610613int                 swish_tokenize3(    swish_3 *s3,  
     614                                        swish_TokenList * tl,  
     615                                        xmlChar *buf,  
     616                                        swish_MetaName *meta, 
     617                                        xmlChar *context ); 
     618int                 swish_tokenize3_ascii(     
     619                                        swish_3 *s3,  
     620                                        swish_TokenList * tl,  
     621                                        xmlChar *buf,  
     622                                        swish_MetaName *meta, 
     623                                        xmlChar *context ); 
     624int                 swish_tokenize3_utf8(     
     625                                        swish_3 *s3,  
    611626                                        swish_TokenList * tl,  
    612627                                        xmlChar *buf,  
  • libswish3/trunk/src/libswish3/string.c

    r2140 r2141  
    4040static xmlChar *getword( 
    4141    xmlChar **in_buf 
    42 ); 
    43 static xmlChar *utf8_str_tolower( 
    44     xmlChar *s 
    45 ); 
    46 static xmlChar *ascii_str_tolower( 
    47     xmlChar *s 
    4842); 
    4943static xmlChar *findlast( 
     
    302296*/ 
    303297 
    304 int 
     298boolean 
    305299swish_is_ascii( 
    306300    xmlChar *str 
     
    351345 
    352346/* a bit about encodings: libxml2 takes whatever encoding the input XML is 
    353      * (latin1, ascii, utf8, etc) and standardizes it using iconv in xmlChar as 
    354      * UTF-8. However, we must ensure we have UTF-8 locale because all the mb* and wc* 
    355      * routines rely on the locale to correctly interpret chars. */ 
     347 * (latin1, ascii, utf8, etc) and standardizes it using iconv in xmlChar as 
     348 * UTF-8. However, we must ensure we have UTF-8 locale because all the mb* and wc* 
     349 * routines rely on the locale to correctly interpret chars.  
     350 */ 
    356351 
    357352/* use LC_CTYPE specifically: http://mail.nl.linux.org/linux-utf8/2001-09/msg00030.html */ 
     
    462457 
    463458    if (swish_is_ascii(s)) 
    464         return ascii_str_tolower(s); 
     459        return swish_ascii_str_tolower(s); 
    465460    else 
    466         return utf8_str_tolower(s); 
     461        return swish_utf8_str_tolower(s); 
    467462 
    468463} 
     
    473468   and free the wchar 
    474469*/ 
    475 static xmlChar * 
    476 utf8_str_tolower( 
     470xmlChar * 
     471swish_utf8_str_tolower( 
    477472    xmlChar *s 
    478473) 
     
    496491 
    497492/* based on swstring.c in Swish-e */ 
    498 static xmlChar * 
    499 ascii_str_tolower( 
     493xmlChar * 
     494swish_ascii_str_tolower( 
    500495    xmlChar *s 
    501496) 
     
    516511  -- 2001-01-30  rasc 
    517512 
    518   should be utf8 safe, unless a continuation byte evals true to isspace() 
     513  TODO make utf8 safe.  
    519514*/ 
    520515 
     
    534529**************************************/ 
    535530 
     531// TODO make utf8 safe 
    536532void 
    537533swish_str_trim_ws( 
     
    572568} 
    573569 
     570/* returns the number of UTF-8 char* needed to hold the codepoint 
     571   represented by 'ch'. 
     572   similar to swish_utf8_chr_len() except that the arg is already 
     573   a 4-byte container and we want to know how many of the 4 bytes 
     574   we really need. 
     575*/ 
     576int 
     577swish_bytes_in_wchar( 
     578    int ch 
     579) 
     580{ 
     581    int len = 0; 
     582 
     583    if (ch < 0x80) { 
     584        len = 1; 
     585    } 
     586    if (ch < 0x800) { 
     587        len = 2; 
     588    } 
     589    if (ch < 0x10000) { 
     590        len = 3; 
     591    } 
     592    if (ch < 0x110000) { 
     593        len = 4; 
     594    } 
     595 
     596    if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER) 
     597        SWISH_DEBUG_MSG(" %lc is %d bytes long", ch, len); 
     598 
     599    return len; 
     600} 
     601 
     602 
    574603/* from http://www.triptico.com/software/unicode.html */ 
    575604wchar_t * 
     
    587616    len = mblen((const char *)str, 4); 
    588617 
    589 /* a size of -1 is triggered by an error in encoding; never happen in ISO-8859-* 
    590      * locales, but possible in UTF-8 */ 
    591     if (s == -1) { 
    592         SWISH_WARN("error converting mbs to wide str: %s", str); 
    593         return (0); 
    594     } 
     618/* a size of -1 is triggered by an error in encoding;  
     619 * never happen in ISO-8859-* locales, but possible in UTF-8  
     620 */ 
     621    if (s == -1) 
     622        SWISH_CROAK("error converting mbs to wide str: %s", str); 
     623 
    595624 
    596625/* malloc the necessary space */ 
     
    621650/* a size of -1 means there are characters that could not be converted to current 
    622651     * locale */ 
    623     if (s == -1) { 
    624         warn("error converting wide chars to mbs: %ls", str); 
    625         return (0); 
    626     } 
     652    if (s == -1) 
     653        SWISH_CROAK("error converting wide chars to mbs: %ls", str); 
    627654 
    628655/* malloc the necessary space */ 
  • libswish3/trunk/src/libswish3/words.c

    r2140 r2141  
    4949    int c 
    5050); 
    51 static int bytes_in_chr( 
    52     int ch 
    53 ); 
    5451static void make_ascii_tables( 
    5552); 
     
    248245} 
    249246 
    250 /* returns the number of UTF-8 char* needed to hold the codepoint 
    251    represented by 'ch'. 
    252    similar to swish_utf8_chr_len() except that the arg is already 
    253    a 4-byte container and we want to know how many of the 4 bytes 
    254    we really need. 
    255 */ 
    256 static int 
    257 bytes_in_chr( 
    258     int ch 
    259 ) 
    260 { 
    261     int len = 0; 
    262  
    263     if (ch < 0x80) { 
    264         len = 1; 
    265     } 
    266     if (ch < 0x800) { 
    267         len = 2; 
    268     } 
    269     if (ch < 0x10000) { 
    270         len = 3; 
    271     } 
    272     if (ch < 0x110000) { 
    273         len = 4; 
    274     } 
    275  
    276     if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER) 
    277         SWISH_DEBUG_MSG(" %lc is %d bytes long", ch, len); 
    278  
    279     return len; 
    280 } 
    281247 
    282248swish_WordList * 
     
    319285 
    320286    for (i = 0; wide[i] != '\0'; i++) { 
    321         c = (int)towlower(wide[i]); 
    322         nextc = (int)towlower(wide[i + 1]); 
    323         byte_count += bytes_in_chr((wint_t) c); 
     287        c = (wchar_t)towlower(wide[i]); 
     288        nextc = (wchar_t)towlower(wide[i + 1]); 
     289        byte_count += swish_bytes_in_wchar(c); 
    324290 
    325291        if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)