root/libswish3/trunk/src/swish_tokenize.c

Revision 2140, 3.2 kB (checked in by karpet, 3 months ago)

Alternate UTF-8-savvy tokenizer with iterator. An initial naive benchmark shows it is about as fast, with far fewer malloc/free calls. Could likely speed it up some by refactoring how "context" is stored internally.

Line 
1 /*
2  * This file is part of libswish3
3  * Copyright (C) 2007 Peter Karman
4  *
5  *  libswish3 is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  libswish3 is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with libswish3; if not, write to the Free Software
17  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18  */
19
20 /* test utf8 tokenizer */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <stdarg.h>
25 #include <err.h>
26 #include <string.h>
27 #include <wctype.h>
28 #include <ctype.h>
29 #include <getopt.h>
30
31 #include "libswish3.h"
32
/*
 * Long-option table passed to getopt_long() in main().
 * --file requires an argument and is reported as 'f';
 * --help takes no argument and is reported as 'h'.
 * The all-zero entry terminates the table.
 */
static struct option longopts[] = {
    {.name = "file", .has_arg = required_argument, .flag = NULL, .val = 'f'},
    {.name = "help", .has_arg = no_argument, .flag = NULL, .val = 'h'},
    {.name = NULL, .has_arg = 0, .flag = NULL, .val = 0}
};
38
/* Forward declarations for the two functions defined in this file. */
int main(
    int argc,
    char **argv
);
/* (void) makes this a real prototype, so a call with arguments is diagnosed. */
int usage(
    void
);

/*
 * Declared here but not referenced by the code visible in this file.
 * NOTE(review): presumably the library-wide debug level -- confirm against libswish3.h.
 */
extern int SWISH_DEBUG;
47
/*
 * Print a short usage summary to stdout and terminate the process.
 *
 * This function does not return: it always ends with exit(1). The int
 * return type is kept only to match the existing declaration.
 */
int
usage(
    void
)
{

    char *descr =
        "swish_tokenize is an example program for testing the libswish3 tokenizer\n";
    /* the synopsis used to name a different program (swish_words) */
    printf("swish_tokenize [opts] [string(s)]\n");
    printf("opts:\n --file file.txt\n");
    printf("\n%s\n\n", descr);
    exit(1);
}
60
61 int
62 main(
63     int argc,
64     char **argv
65 )
66 {
67     int i, ch;
68     int option_index;
69     int ntokens;
70     extern char *optarg;
71     extern int optind;
72     xmlChar *string;
73     swish_TokenList *list;
74     swish_TokenIterator *iterator;
75     xmlChar *meta;
76     swish_3 *s3;
77
78     meta = (xmlChar *)SWISH_DEFAULT_METANAME;
79     option_index = 0;
80     string = NULL;
81
82     s3 = swish_init_swish3(NULL, NULL);
83     list = swish_init_token_list();
84     iterator = swish_init_token_iterator(s3->config, list);
85
86     while ((ch = getopt_long(argc, argv, "f:h", longopts, &option_index)) != -1) {
87
88         switch (ch) {
89         case 0:                /* If this option set a flag, do nothing else now. */
90             if (longopts[option_index].flag != 0)
91                 break;
92             printf("option %s", longopts[option_index].name);
93             if (optarg)
94                 printf(" with arg %s", optarg);
95             printf("\n");
96             break;
97
98         case 'f':
99             printf("reading %s\n", optarg);
100             string = swish_slurp_file((xmlChar *)optarg);
101             break;
102
103         case '?':
104         case 'h':
105         default:
106             usage();
107
108         }
109
110     }
111
112     i = optind;
113
114     for (; i < argc; i++) {
115         ntokens =
116             swish_tokenize3(s3, list, (xmlChar *)argv[i],
117                             swish_hash_fetch(s3->config->metanames, meta), meta);
118         printf("parsed %d tokens: %s\n", ntokens, argv[i]);
119         swish_debug_token_list(iterator);
120     }
121
122     if (string != NULL) {
123         ntokens =
124             swish_tokenize3(s3, list, string,
125                             swish_hash_fetch(s3->config->metanames, meta), meta);
126         swish_debug_token_list(iterator);
127         swish_xfree(string);
128     }
129
130     swish_free_token_iterator(iterator);
131     swish_free_token_list(list);
132     swish_free_swish3(s3);
133
134     return (0);
135 }
Note: See TracBrowser for help on using the browser.