root/libswish3/trunk/src/utf8test.c

Revision 2171, 4.9 kB (checked in by karpet, 2 months ago)

Get rid of the circular reference in TokenList/Token; make swish_init() a separate call from swish_init_swish3() so it can be called as a class method rather than once per object; clean up some ref_cnt logic and memory debugging.

Line 
1 /* test utf8 functions */
2
3 #include <stdio.h>
4 #include <assert.h>
5 #include <libxml/hash.h>
6 #include <wchar.h>
7 #include <ctype.h>
8 #include <wctype.h>
9 #include <ctype.h>
10 #include <string.h>
11 #include <stdlib.h>
12 #include <locale.h>
13 #include <err.h>
14 #include <getopt.h>
15 #include "libswish3.h"
16
/*
 * Long options understood by utf8test. Each entry names the short option
 * character that getopt_long() returns for it; the all-zero entry marks
 * the end of the table.
 */
static struct option longopts[] = {
    { .name = "file", .has_arg = required_argument, .flag = NULL, .val = 'f' },
    { .name = "help", .has_arg = no_argument,       .flag = NULL, .val = 'h' },
    { .name = "loop", .has_arg = required_argument, .flag = NULL, .val = 'l' },
    { .name = NULL,   .has_arg = 0,                 .flag = NULL, .val = 0 }
};
23
24 int main(
25     int argc,
26     char **argv
27 );
28 int usage(
29 );
30 void iterate(
31     xmlChar *utf8
32 );
33 int char_report(
34     xmlChar *ptr
35 );
36 void seq_by_seq(
37     xmlChar *ptr
38 );
/*
 * Character-class names handed to wctype() when reporting on a codepoint,
 * in the order they are printed.
 */
char *types[] = {
    "alnum",
    "cntrl",
    "ideogram",
    "print",
    "special",
    "alpha",
    "digit",
    "lower",
    "punct",
    "upper",
    "blank",
    "graph",
    "phonogram",
    "space",
    "xdigit"
};

/* Number of entries in types[]. */
int ntypes = (int)(sizeof(types) / sizeof(types[0]));
46
/*
 * Print a short help message to stdout and terminate the program.
 *
 * Never returns: the process exits with status 1. The int return type is
 * kept only so the definition matches the existing prototype.
 *
 * The option list mirrors the longopts[] table (--file, --loop, --help);
 * the previous text advertised a --debug option that is not handled
 * anywhere in this program.
 */
int
usage(void)
{
    static const char descr[] =
        "utf8test is an example program for testing the utf8 functions\n";

    printf("utf8test [opts] [string(s)]\n");
    printf("opts:\n --file file.txt\n --loop N\n --help\n");
    printf("\n%s\n\n", descr);
    exit(1);
}
58
59 int
60 main(
61     int argc,
62     char **argv
63 )
64 {
65     int i, ch, loop;
66     int option_index = 0;
67     extern char *optarg;
68     extern int optind;
69     xmlChar *string;
70     string = NULL;
71     loop = 1;
72
73     swish_init();   // always call first
74
75     while ((ch = getopt_long(argc, argv, "d:f:h", longopts, &option_index)) != -1) {
76
77         switch (ch) {
78         case 0:                /* If this option set a flag, do nothing else now. */
79             if (longopts[option_index].flag != 0)
80                 break;
81             printf("option %s", longopts[option_index].name);
82             if (optarg)
83                 printf(" with arg %s", optarg);
84             printf("\n");
85             break;
86
87         case 'f':
88             printf("reading %s\n", optarg);
89
90             string = swish_slurp_file((xmlChar *)optarg);
91
92             break;
93
94         case 'l':
95             loop = swish_string_to_int(optarg);
96
97         case '?':
98         case 'h':
99         default:
100             usage();
101
102         }
103
104     }
105
106     i = optind;
107
108     for (; i < argc; i++) {
109         printf("utf8: %s\n", argv[i]);
110         iterate((xmlChar *)argv[i]);
111     }
112
113     if (string != NULL) {
114         printf("parsing file\n");
115         while (loop--)
116             iterate(string);
117         swish_xfree(string);
118     }
119
120     return (0);
121 }
122
123 void
124 iterate(
125     xmlChar *utf8
126 )
127 {
128     int n_bytes;
129     xmlChar *ptr;
130
131     ptr = utf8;
132     n_bytes = xmlStrlen(utf8);
133
134     printf("%s\n", utf8);
135     //printf("iterate over %d characters %d bytes\n", n_chars, n_bytes);
136
137     if (utf8 == NULL) {
138         printf("first byte in utf8 string is NULL\n");
139         return;
140     }
141
142     seq_by_seq(utf8);
143
144     printf("----------------------------------------------------------\n");
145
146     /*
147        get first seq, then loop until done
148      */
149     ptr += char_report(ptr);
150
151     while (xmlStrlen(ptr)) {
152         ptr += char_report(ptr);
153     }
154
155 }
156
157 int
158 char_report(
159     xmlChar *ptr
160 )
161 {
162     xmlChar buf[5];             /* max length of ucs32 char plus NULL */
163     int sl, i, cp, j;
164
165     cp = swish_utf8_codepoint(ptr);
166     sl = swish_utf8_chr_len(ptr);
167     printf("clen = %d ", sl);
168     for (i = 0; i < sl; i++) {
169         buf[i] = ptr[i];
170         printf("0x%02x %d ", buf[i], buf[i]);
171     }
172     buf[i] = '\0';              /* terminate */
173     printf(" -> %s ", buf);
174
175     // get codepoint val
176     printf("[0x%x] [%d]\n", cp, cp);
177
178     printf("   %lc ", cp);
179
180     for (j = 0; j < ntypes; j++) {
181         printf(" %10s => %d\n", types[j], iswctype(cp, wctype(types[j])));
182     }
183
184     printf("\n");
185     return sl;
186 }
187
188 void
189 seq_by_seq(
190     xmlChar *ptr
191 )
192 {
193     xmlChar buf[5];             /* max length of ucs32 char plus NULL */
194     int byte_pos = 0;
195     int prev_pos = 0;
196     int clen, i;
197
198     /*
199        forward
200      */
201     for (byte_pos = 0; ptr[prev_pos] != '\0'; swish_utf8_next_chr(ptr, &byte_pos)) {
202         clen = byte_pos - prev_pos;
203         if (!clen) {
204             prev_pos = byte_pos;
205             continue;
206         }
207
208         printf("clen = %d ", clen);
209         for (i = 0; i < clen; i++) {
210             buf[i] = ptr[prev_pos + i];
211             printf("0x%02x ", buf[i]);
212         }
213         buf[i] = '\0';
214         printf(" -> %s ", buf);
215
216         // get codepoint val
217         printf("[%d]", swish_utf8_codepoint(buf));
218         printf("\n");
219         prev_pos = byte_pos;
220     }
221
222     return;
223
224     // the rest is optional
225
226     /*
227        reverse
228      */
229     byte_pos -= 2;              /* back past NULL */
230     for (; byte_pos >= 0; swish_utf8_prev_chr(ptr, &byte_pos)) {
231         clen = prev_pos - byte_pos;
232         if (!clen) {
233             prev_pos = byte_pos;
234             continue;
235         }
236         printf("clen = %d ", clen);
237         for (i = 0; i < clen; i++) {
238             buf[i] = ptr[byte_pos + i];
239             printf("0x%02x ", buf[i]);
240         }
241         buf[i] = '\0';
242         printf(" -> %s ", buf);
243
244         // get codepoint val
245         printf("[%d]", swish_utf8_codepoint(buf));
246         printf("\n");
247         prev_pos = byte_pos;
248     }
249
250 }
Note: See TracBrowser for help on using the browser.