root/libswish3/trunk/src/swish_lint.c

Revision 2140, 6.2 kB (checked in by karpet, 3 months ago)

alternate utf8-savvy tokenizer with iterator. initial naive benchmark shows it is about as fast, with far fewer malloc/free calls. could likely speed it up some by refactoring how "context" is stored internally

Line 
1 /*
2  * This file is part of libswish3
3  * Copyright (C) 2007 Peter Karman
4  *
5  *  libswish3 is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  libswish3 is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with libswish3; if not, write to the Free Software
17  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18  */
19
20 /* swish_lint.c -- test libswish3 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <stdarg.h>
25 #include <err.h>
26 #include <string.h>
27 #include <wctype.h>
28 #include <ctype.h>
29 #include <getopt.h>
30
31 #include "libswish3.h"
32
33 int debug = 0;
34
35 int main(
36     int argc,
37     char **argv
38 );
39 void usage(
40 );
41 void handler(
42     swish_ParserData *parser_data
43 );
44 void libxml2_version(
45 );
46 void swish_version(
47 );
48
49 int twords = 0;
50
51 extern int SWISH_DEBUG;
52
53 static struct option longopts[] = {
54     {"config", required_argument, 0, 'c'},
55     {"debug", required_argument, 0, 'd'},
56     {"help", no_argument, 0, 'h'},
57     {"tokenize3", no_argument, 0, 't'},
58     {0, 0, 0, 0}
59 };
60
61 void
62 libxml2_version(
63 )
64 {
65     printf("  libxml2 version:\t%s\n", LIBXML_DOTTED_VERSION);
66 }
67
68 void
69 swish_version(
70 )
71 {
72     printf("libswish3 version:\t%s\n", SWISH_LIB_VERSION);
73     printf("    swish version:\t%s\n", SWISH_VERSION);
74 }
75
/*
 * Print the command-line help text to stdout: synopsis, options,
 * debugging environment variables, the headers recognised on stdin,
 * and finally the libxml2 / libswish3 / swish version strings.
 *
 * Takes no arguments and returns nothing; it does not exit.
 */
void
usage(
)
{

    const char *descr = "swish_lint is an example program for using libswish3\n";
    printf("swish_lint [opts] [- | file(s)]\n");
    /*
     * Keep this list in sync with longopts[]: --debug takes a required
     * argument and --tokenize3 is an accepted flag.
     */
    printf("opts:\n --config conf_file.xml\n --debug lvl\n --help\n --tokenize3\n");
    printf("\n%s\n", descr);
    printf("Debugging env vars:\n");
    printf("\tSWISH_DEBUG <-- takes sum of ints below\n");
    printf("\tSWISH_DEBUG_DOCINFO      1\n");
    printf("\tSWISH_DEBUG_TOKENIZER    2\n");
    printf("\tSWISH_DEBUG_WORDLIST     4\n");
    printf("\tSWISH_DEBUG_PARSER       8\n");
    printf("\tSWISH_DEBUG_CONFIG      16\n");
    printf("\tSWISH_DEBUG_MEMORY      32\n");
    printf("\tSWISH_DEBUG_NAMEDBUFFER 64\n");
    printf("Set SWISH_PARSER_WARNINGS=1 to see libxml2 errors and warnings\n");
    printf("Set SWISH_WARNINGS=0 to turn off libswish3 warnings\n");
    printf("stdin headers:\n");
    printf("\tContent-Length\n");
    printf("\tLast-Modified\n");
    printf("\tContent-Location\n");
    printf("\tParser-Type\n");
    printf("\tContent-Type\n");
    printf("\tEncoding\n");
    printf("\tUpdate-Mode\n");
    libxml2_version();
    swish_version();

}
108
109 void
110 handler(
111     swish_ParserData *parser_data
112 )
113 {
114
115     /*
116        return;
117      */
118
119     printf("nwords: %d\n", parser_data->docinfo->nwords);
120
121     if (SWISH_DEBUG & SWISH_DEBUG_MEMORY)
122         swish_mem_debug();
123
124     twords += parser_data->docinfo->nwords;
125
126     if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO)
127         swish_debug_docinfo(parser_data->docinfo);
128
129     if (SWISH_DEBUG & SWISH_DEBUG_WORDLIST) {
130       if (parser_data->s3->analyzer->tokenlist) {
131         swish_debug_token_list(parser_data->token_iterator);
132       }
133       else {
134         swish_debug_wordlist(parser_data->wordlist);
135       }
136     }
137
138     if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) {
139         swish_debug_nb(parser_data->properties, (xmlChar *)"Property");
140         swish_debug_nb(parser_data->metanames, (xmlChar *)"MetaName");
141     }
142 }
143
144 int
145 main(
146     int argc,
147     char **argv
148 )
149 {
150     int i, ch;
151     extern char *optarg;
152     extern int optind;
153     int option_index;
154     int files;
155     char *etime;
156     double start_time;
157     xmlChar *config_file = NULL;
158     swish_3 *s3;
159
160     option_index = 0;
161     files = 0;
162     start_time = swish_time_elapsed();
163     s3 = swish_init_swish3(&handler, NULL);
164
165     while ((ch = getopt_long(argc, argv, "c:d:f:ht", longopts, &option_index)) != -1) {
166
167         switch (ch) {
168         case 0:                /* If this option set a flag, do nothing else now. */
169             if (longopts[option_index].flag != 0)
170                 break;
171             printf("option %s", longopts[option_index].name);
172             if (optarg)
173                 printf(" with arg %s", optarg);
174             printf("\n");
175             break;
176
177         case 'c':              /* should we set up default config first ? then override
178                                  * here ? */
179
180             //printf("optarg = %s\n", optarg);
181             config_file = swish_xstrdup((xmlChar *)optarg);
182             break;
183
184         case 'd':
185             printf("turning on debug mode: %s\n", optarg);
186
187             if (!isdigit(optarg[0]))
188                 err(1, "-d option requires a positive integer as argument\n");
189
190             SWISH_DEBUG = swish_string_to_int(optarg);
191             break;
192            
193         case 't':
194             s3->analyzer->tokenlist = 1;
195             break;
196            
197         case '?':
198         case 'h':
199         default:
200             usage();
201             exit(0);
202
203         }
204
205     }
206
207     if (config_file != NULL) {
208         s3->config = swish_add_config(config_file, s3->config);
209     }
210
211     i = optind;
212
213     if (!i || i >= argc) {
214         usage();
215     }
216     else {
217         if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
218             swish_debug_config(s3->config);
219         }
220
221         for (; i < argc; i++) {
222
223             if (argv[i][0] != '-') {
224
225                 printf("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
226                 printf("parse_file for %s\n", argv[i]);
227                 if (!swish_parse_file(s3, (unsigned char *)argv[i]))
228                     files++;
229
230             }
231             else if (argv[i][0] == '-' && !argv[i][1]) {
232
233                 printf("reading from stdin\n");
234                 files = swish_parse_fh(s3, NULL);
235
236             }
237
238         }
239
240         printf("\n\n%d files indexed\n", files);
241         printf("total words: %d\n", twords);
242
243         etime = swish_print_time(swish_time_elapsed() - start_time);
244         printf("%s total time\n\n", etime);
245         swish_xfree(etime);
246
247     }
248
249     if (config_file != NULL)
250         swish_xfree(config_file);
251
252     swish_free_swish3(s3);
253
254     return (0);
255 }
Note: See TracBrowser for help on using the browser.