Changeset 1934
- Timestamp:
- 05/07/07 22:11:18 (1 year ago)
- Files:
-
- libswish3/trunk/doc/libswish3.3.pod.in (modified) (3 diffs)
- libswish3/trunk/src/libswish3/io.c (modified) (4 diffs)
- libswish3/trunk/src/libswish3/libswish3.h (modified) (2 diffs)
- libswish3/trunk/src/libswish3/parser.c (modified) (8 diffs)
- libswish3/trunk/src/swish_lint.c (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/doc/libswish3.3.pod.in
r1933 r1934 36 36 =item 37 37 38 swish_parse_ stdin()38 swish_parse_fh() 39 39 40 40 =item … … 62 62 headers and the full text of the document. 63 63 64 =item 65 66 swish_parse_fh() takes a filehandle pointer, which if set to NULL, 67 defaults to stdin. 68 64 69 =back 65 70 66 71 See the L<Headers API> section for more 67 information on using swish_parse_ stdin() and72 information on using swish_parse_fh() and 68 73 swish_parse_buffer(). 69 74 70 The I<handler> function is called after a document is completely parsed. 71 The I<handler> function will receive one argument: 72 the B<swish_ParseData> struct object, which contains all the words, metadata, and 73 properties for the document. 74 After the I<handler> is called, all allocated memory in the B<swish_ParseData> 75 struct is freed automatically. 76 77 The I<handler> function allows you to store extracted words and document properties 78 in an index or database. See the L<Structures API> section for details of the 79 B<swish_ParseData> struct. 75 See the L<The I<handler> Function> section for more information on how 76 to deal with the data extracted by each of the swish_parse_* functions. 80 77 81 78 … … 226 223 B<libswish3> data structures. 227 224 228 TODO 225 For more details on any of these structures, see the SYNOPSIS. 226 227 =head2 swish_Config 228 229 A configuration object. This object is required for initializing both a C<swish_Analyzer> 230 object and a C<swish_Parser> object. 229 231 230 232 =head2 swish_Parser 231 233 234 A parser object. Required for executing any of the three C<swish_parse_*> functions. 235 232 236 =head2 swish_ParseData 233 237 238 A parser data object. This object is passed around internally by the libxml2 239 SAX2 handlers, and is eventually the object passed to the I<handler> function pointer. 240 See L<The I<handler> Function>. 241 234 242 =head2 swish_WordList 235 243 244 A list of words or tokens. The object contains a linked list of swish_Word objects. 
245 You can iterate over the contents of the WordList like this: 246 247 swish_debug_msg("%d words in list", list->nwords); 248 list->current = list->head; 249 while (list->current != NULL) 250 { 251 swish_debug_msg(" ---------- WORD --------- "); 252 swish_debug_msg("word : %s", list->current->word); 253 swish_debug_msg(" meta : %s", list->current->metaname); 254 swish_debug_msg(" context : %s", list->current->context); 255 swish_debug_msg(" pos : %d", list->current->position); 256 swish_debug_msg("soffset: %d", list->current->start_offset); 257 swish_debug_msg("eoffset: %d", list->current->end_offset); 258 259 list->current = list->current->next; 260 } 261 236 262 =head2 swish_Word 237 263 264 An object representing one word or token in a document. The word's start and end offsets, 265 position relative to other words, tag context and MetaName are all available in the object. 266 238 267 =head2 swish_DocInfo 239 268 269 An object describing metadata about the document itself: URI, MIME type, size, etc. 270 240 271 =head2 swish_Analyzer 241 272 273 The Analyzer object controls how the character content of a document is parsed: whether 274 or not a WordList is created with a tokenizer, whether the words (tokens) are lowercased or 275 stemmed, etc. 276 242 277 =head1 The I<handler> Function 243 278 244 TODO 279 The I<handler> function pointer is the final link in the parsing chain. The function 280 pointer is set in the Parser object constructor, and is called by each of the 281 swish_parse_* functions after the entire document has been parsed and (optionally) 282 tokenized. 283 284 The I<handler> receives one argument: a swish_ParseData object containing all the metadata 285 and words in the document. 
286 287 If all you want to do is print out a report about each document as it is parsed, 288 your I<handler> function might be as simple as: 289 290 void 291 my_handler( swish_ParseData * parse_data ) 292 { 293 swish_debug_docinfo( parse_data->docinfo ); 294 swish_debug_wordlist( parse_data->wordlist ); 295 swish_debug_nb( parse_data->properties, "Property" ); 296 swish_debug_nb( parse_data->metanames, "MetaName" ); 297 } 245 298 299 B<IMPORTANT:> After the I<handler> function is called, all the structures referenced 300 by the swish_ParseData object are automatically freed, so if you intend to keep any of the 301 data for storing in an index, you will need to copy (e.g. with strdup()) the words, properties, docinfo values, etc. 302 as part of your indexing code. 303 304 See the example C<swish_lint.c> file for how to create and pass in a I<handler> 305 function pointer to the swish_Parser constructor. 246 306 247 307 =head1 Configuration API libswish3/trunk/src/libswish3/io.c
r1913 r1934 54 54 j++; 55 55 } 56 if ( buffer[i] == SWISH_META_CONNECTOR[0] 57 || buffer[i] == SWISH_META_CONNECTOR[0] 58 ) 59 { 60 buffer[i] = '\n'; 61 j++; 62 } 56 63 } 57 64 58 65 if (j) 59 66 swish_warn_err( 60 "Substituted %d embedded null character(s) in file '%s' with newline(s)\n",67 "Substituted %d embedded null or connector character(s) in file '%s' with newline(s)\n", 61 68 j, filename); 62 69 } … … 66 73 67 74 xmlChar * 68 swish_slurp_ stdin(long flen)75 swish_slurp_fh(FILE * fh, long flen) 69 76 { 70 77 … … 77 84 *buffer = '\0'; 78 85 79 bytes_read = fread(buffer, sizeof(xmlChar), flen, stdin);86 bytes_read = fread(buffer, sizeof(xmlChar), flen, fh); 80 87 81 88 if (bytes_read != flen) … … 87 94 /* printf("read %d bytes from stdin\n", bytes_read); */ 88 95 89 no_nulls((xmlChar*)" stdin", buffer, (int)bytes_read);96 no_nulls((xmlChar*)"filehandle", buffer, (int)bytes_read); 90 97 91 98 return buffer; libswish3/trunk/src/libswish3/libswish3.h
r1933 r1934 301 301 =head2 I/O Functions 302 302 */ 303 xmlChar * swish_slurp_ stdin(long flen );303 xmlChar * swish_slurp_fh( FILE * fh, long flen ); 304 304 xmlChar * swish_slurp_file_len( xmlChar *filename, long flen ); 305 305 xmlChar * swish_slurp_file( xmlChar *filename ); … … 414 414 xmlChar *filename, 415 415 void * stash ); 416 int swish_parse_stdin( swish_Parser * parser, 416 int swish_parse_fh( swish_Parser * parser, 417 FILE * fh, 417 418 void * stash ); 418 419 int swish_parse_buffer( swish_Parser * parser, libswish3/trunk/src/libswish3/parser.c
r1931 r1934 126 126 static void free_parse_data(swish_ParseData * parse_data); 127 127 128 /* parsing stdin/buffer headers */128 /* parsing fh/buffer headers */ 129 129 typedef struct 130 130 { … … 1186 1186 /* TODO there's a memory leak somewhere in here. one more malloc than free */ 1187 1187 int 1188 swish_parse_ stdin(1188 swish_parse_fh( 1189 1189 swish_Parser * parser, 1190 FILE * fh, 1190 1191 void * stash 1191 1192 ) … … 1208 1209 min_headers = 2; 1209 1210 1211 if (fh == NULL) 1212 fh = stdin; 1213 1210 1214 swish_mem_debug(); 1211 1215 … … 1216 1220 1217 1221 /* based on extprog.c */ 1218 while (fgets((char *) ln, SWISH_MAXSTRLEN, stdin) != 0)1222 while (fgets((char *) ln, SWISH_MAXSTRLEN, fh) != 0) 1219 1223 { 1220 1224 … … 1250 1254 1251 1255 if (SWISH_DEBUG > 9) 1252 swish_debug_msg("reading %ld bytes from stdin\n",1256 swish_debug_msg("reading %ld bytes from filehandle\n", 1253 1257 (long int) parse_data->docinfo->size); 1254 1258 1255 read_buffer = swish_slurp_ stdin(parse_data->docinfo->size);1259 read_buffer = swish_slurp_fh(fh, parse_data->docinfo->size); 1256 1260 1257 1261 /* parse */ … … 1301 1305 1302 1306 if (SWISH_DEBUG) 1303 swish_debug_msg("\n================ stdindone with file ===================\n");1307 swish_debug_msg("\n================ filehandle - done with file ===================\n"); 1304 1308 1305 1309 … … 1307 1311 else if (xmlStrlen(line) == 0) 1308 1312 { 1309 swish_fatal_err("Not enough header lines reading from stdin");1313 swish_fatal_err("Not enough header lines reading from filehandle"); 1310 1314 1311 1315 … … 1359 1363 1360 1364 /* 1361 * pass in a string including headers. like parsing stdin, but only for one1365 * pass in a string including headers. like parsing fh, but only for one 1362 1366 * doc 1363 1367 */ libswish3/trunk/src/swish_lint.c
r1930 r1934 205 205 206 206 printf("reading from stdin\n"); 207 files = swish_parse_ stdin(parser, NULL);207 files = swish_parse_fh(parser, NULL, NULL); 208 208 209 209 }
