Changeset 1934

Show
Ignore:
Timestamp:
05/07/07 22:11:18 (1 year ago)
Author:
karpet
Message:

change stdin to any filehandle pointer and add more POD

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • libswish3/trunk/doc/libswish3.3.pod.in

    r1933 r1934  
    3636=item 
    3737 
    38 swish_parse_stdin() 
     38swish_parse_fh() 
    3939 
    4040=item 
     
    6262headers and the full text of the document. 
    6363 
     64=item 
     65 
     66swish_parse_fh() takes a filehandle pointer; if the pointer is NULL, 
     67input is read from stdin. 
     68 
    6469=back 
    6570 
    6671See the L<Headers API> section for more 
    67 information on using swish_parse_stdin() and 
     72information on using swish_parse_fh() and 
    6873swish_parse_buffer(). 
    6974 
    70 The I<handler> function is called after a document is completely parsed. 
    71 The I<handler> function will receive one argument: 
    72 the B<swish_ParseData> struct object, which contains all the words, metadata, and  
    73 properties for the document. 
    74 After the I<handler> is called, all allocated memory in the B<swish_ParseData> 
    75 struct is freed automatically. 
    76  
    77 The I<handler> function allows you to store extracted words and document properties 
    78 in an index or database. See the L<Structures API> section for details of the  
    79 B<swish_ParseData> struct. 
     75See L<The I<handler> Function> for more information on how 
     76to deal with the data extracted by each of the swish_parse_* functions. 
    8077 
    8178 
     
    226223B<libswish3> data structures. 
    227224 
    228  TODO 
     225For more details on any of these structures, see the SYNOPSIS. 
     226 
     227=head2 swish_Config 
     228 
     229A configuration object. This object is required for initializing both a C<swish_Analyzer> 
     230object and a C<swish_Parser> object. 
    229231 
    230232=head2 swish_Parser 
    231233 
     234A parser object. Required for executing any of the three C<swish_parse_*> functions. 
     235 
    232236=head2 swish_ParseData 
    233237 
     238A parser data object. This object is passed around internally by the libxml2 
     239SAX2 handlers, and is eventually the object passed to the I<handler> function pointer. 
     240See L<The I<handler> Function>. 
     241 
    234242=head2 swish_WordList 
    235243 
     244A list of words or tokens. The object contains a linked list of swish_Word objects. 
     245You can iterate over the contents of the WordList like this: 
     246 
     247 swish_debug_msg("%d words in list", list->nwords); 
     248 list->current = list->head; 
     249 while (list->current != NULL) 
     250 { 
     251        swish_debug_msg("   ---------- WORD ---------  "); 
     252        swish_debug_msg("word  : %s", list->current->word); 
     253        swish_debug_msg(" meta : %s", list->current->metaname); 
     254        swish_debug_msg(" context : %s", list->current->context); 
     255        swish_debug_msg("  pos : %d", list->current->position); 
     256        swish_debug_msg("soffset: %d", list->current->start_offset); 
     257        swish_debug_msg("eoffset: %d", list->current->end_offset); 
     258             
     259        list->current = list->current->next; 
     260 } 
     261 
    236262=head2 swish_Word 
    237263 
     264An object representing one word or token in a document. The word's start and end offset, 
     265position relative to other words, tag context and MetaName are all available in the object. 
     266 
    238267=head2 swish_DocInfo 
    239268 
     269An object describing metadata about the document itself: URI, MIME type, size, etc. 
     270 
    240271=head2 swish_Analyzer 
    241272 
     273The Analyzer object controls how the character content of a document is parsed: whether 
     274a WordList is created with a tokenizer, whether the words (tokens) are lowercased or  
     275stemmed, etc. 
     276 
    242277=head1 The I<handler> Function 
    243278 
    244  TODO 
     279The I<handler> function pointer is the final link in the parsing chain. The function 
     280pointer is set in the Parser object constructor, and is called by each of the  
     281swish_parse_* functions after the entire document has been parsed and (optionally) 
     282tokenized. 
     283 
     284The I<handler> receives one argument: a swish_ParseData object containing all the metadata 
     285and words in the document. 
     286 
     287If all you wanted to do was print out a report about each document as it was parsed, 
     288your I<handler> function might be as simple as: 
     289 
     290 void 
     291 my_handler( swish_ParseData * parse_data ) 
     292 { 
     293    swish_debug_docinfo( parse_data->docinfo ); 
     294    swish_debug_wordlist( parse_data->wordlist ); 
     295    swish_debug_nb( parse_data->properties, "Property" ); 
     296    swish_debug_nb( parse_data->metanames, "MetaName" ); 
     297 } 
    245298  
     299B<IMPORTANT:> After the I<handler> function is called, all the structures referenced 
     300by the swish_ParseData object are automatically freed, so if you intend to keep any of the 
     301data for storing in an index, you will need to strdup() words, properties, docinfo, etc. 
     302as part of your indexing code. 
     303 
     304See the example C<swish_lint.c> file for how to create and pass in a I<handler> 
     305function pointer to the swish_Parser constructor. 
    246306 
    247307=head1 Configuration API 
  • libswish3/trunk/src/libswish3/io.c

    r1913 r1934  
    5454                j++; 
    5555            } 
     56            if (    buffer[i] == SWISH_META_CONNECTOR[0] 
     57                ||  buffer[i] == SWISH_META_CONNECTOR[0] 
     58                ) 
     59            { 
     60                buffer[i] = '\n'; 
     61                j++; 
     62            } 
    5663        } 
    5764 
    5865        if (j) 
    5966            swish_warn_err( 
    60                     "Substituted %d embedded null character(s) in file '%s' with newline(s)\n", 
     67                    "Substituted %d embedded null or connector character(s) in file '%s' with newline(s)\n", 
    6168                     j, filename); 
    6269    } 
     
    6673 
    6774xmlChar        * 
    68 swish_slurp_stdin(long flen) 
     75swish_slurp_fh(FILE * fh, long flen) 
    6976{ 
    7077 
     
    7784    *buffer = '\0'; 
    7885 
    79     bytes_read = fread(buffer, sizeof(xmlChar), flen, stdin); 
     86    bytes_read = fread(buffer, sizeof(xmlChar), flen, fh); 
    8087 
    8188    if (bytes_read != flen) 
     
    8794    /* printf("read %d bytes from stdin\n", bytes_read); */ 
    8895 
    89     no_nulls((xmlChar*)"stdin", buffer, (int)bytes_read); 
     96    no_nulls((xmlChar*)"filehandle", buffer, (int)bytes_read); 
    9097 
    9198    return buffer; 
  • libswish3/trunk/src/libswish3/libswish3.h

    r1933 r1934  
    301301=head2 I/O Functions 
    302302*/ 
    303 xmlChar *   swish_slurp_stdin( long flen ); 
     303xmlChar *   swish_slurp_fh( FILE * fh, long flen ); 
    304304xmlChar *   swish_slurp_file_len( xmlChar *filename, long flen ); 
    305305xmlChar *   swish_slurp_file( xmlChar *filename ); 
     
    414414                        xmlChar *filename, 
    415415                        void * stash ); 
    416 int swish_parse_stdin(  swish_Parser * parser, 
     416int swish_parse_fh(     swish_Parser * parser, 
     417                        FILE * fh, 
    417418                        void * stash  ); 
    418419int swish_parse_buffer( swish_Parser * parser, 
  • libswish3/trunk/src/libswish3/parser.c

    r1931 r1934  
    126126static void     free_parse_data(swish_ParseData * parse_data); 
    127127 
    128 /* parsing stdin/buffer headers */ 
     128/* parsing fh/buffer headers */ 
    129129typedef struct 
    130130{ 
     
    11861186/* TODO there's a memory leak somewhere in here. one more malloc than free */ 
    11871187int 
    1188 swish_parse_stdin
     1188swish_parse_fh
    11891189    swish_Parser * parser, 
     1190    FILE * fh, 
    11901191    void * stash  
    11911192) 
     
    12081209    min_headers = 2; 
    12091210     
     1211    if (fh == NULL) 
     1212        fh = stdin; 
     1213     
    12101214    swish_mem_debug(); 
    12111215 
     
    12161220     
    12171221    /* based on extprog.c */ 
    1218     while (fgets((char *) ln, SWISH_MAXSTRLEN, stdin) != 0) 
     1222    while (fgets((char *) ln, SWISH_MAXSTRLEN, fh) != 0) 
    12191223    {             
    12201224     
     
    12501254 
    12511255            if (SWISH_DEBUG > 9) 
    1252                 swish_debug_msg("reading %ld bytes from stdin\n",  
     1256                swish_debug_msg("reading %ld bytes from filehandle\n",  
    12531257                                (long int) parse_data->docinfo->size); 
    12541258 
    1255             read_buffer = swish_slurp_stdin(parse_data->docinfo->size); 
     1259            read_buffer = swish_slurp_fh(fh, parse_data->docinfo->size); 
    12561260 
    12571261            /* parse */ 
     
    13011305 
    13021306            if (SWISH_DEBUG) 
    1303                 swish_debug_msg("\n================ stdin done with file ===================\n"); 
     1307                swish_debug_msg("\n================ filehandle - done with file ===================\n"); 
    13041308 
    13051309 
     
    13071311        else if (xmlStrlen(line) == 0) 
    13081312        { 
    1309             swish_fatal_err("Not enough header lines reading from stdin"); 
     1313            swish_fatal_err("Not enough header lines reading from filehandle"); 
    13101314 
    13111315 
     
    13591363 
    13601364/* 
    1361  * pass in a string including headers. like parsing stdin, but only for one 
     1365 * pass in a string including headers. like parsing fh, but only for one 
    13621366 * doc 
    13631367 */ 
  • libswish3/trunk/src/swish_lint.c

    r1930 r1934  
    205205 
    206206            printf("reading from stdin\n"); 
    207             files = swish_parse_stdin(parser, NULL); 
     207            files = swish_parse_fh(parser, NULL, NULL); 
    208208 
    209209        }