root/libswish3/trunk/src/libswish3/utf8.c

Revision 2103, 15.8 kB (checked in by karpet, 8 months ago)

whitespace only. again.

I am now using gnu indent rather than the original bsd version. My opts are below:

--no-blank-lines-after-declarations
--blank-lines-after-procedures
--no-blank-lines-after-commas
--break-before-boolean-operator
//--break-function-decl-args
//--break-function-decl-args-end
// long options above do not work. use short below instead.
-bfda
-bfde
--braces-on-if-line
--brace-indent4
--braces-after-struct-decl-line
--dont-cuddle-else
--comment-delimiters-on-blank-lines
--else-endif-column1
--no-space-after-casts
--declaration-indentation4
--paren-indentation4
--dont-format-first-column-comments
--dont-format-comments
--ignore-newlines

--line-length80
--indent-level4
--parameter-indentation5
--continue-at-parentheses
--no-space-after-function-call-names
--no-space-after-parentheses
--procnames-start-lines
--space-after-for
--space-after-if
--space-after-while
--dont-star-comments
--swallow-optional-blank-lines
--no-tabs

-TxmlChar
-Tswish_ParserData
-Tswish_Config
-Tswish_3
-Tswish_Analyzer
-Tswish_Parser
-Tswish_DocInfo
-Tswish_TagStack
-Tswish_MetaName
-Tswish_Property

Line 
1 /* see http://cprogramming.com/tutorial/unicode.html
2
3 this file is a simple UTF-8 string handling library based on the url above.
4 the .h and .c file have been combined and all functions labeled 'static'
5 so you must include utf8.c to get the library.
6 We include in string.c.
7 */
8
9 /* http://cprogramming.com/tutorial/utf8.h */
10
11 #include <stdarg.h>
12
/* nonzero when byte c is NOT a UTF-8 continuation byte (bit pattern
   10xxxxxx), i.e. when it can begin a sequence */
#define isutf(c) (((c) & 0xC0) != 0x80)
15
/*
 * Forward declarations.  Every routine is file-local (static); the
 * definitions follow later in this file.
 */

/* decode UTF-8 bytes into an array of wide characters */
static int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz);

/* encode an array of wide characters as UTF-8 bytes */
static int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz);

/* encode one wide character as UTF-8 */
static int u8_wc_toutf8(char *dest, u_int32_t ch);

/* translate a character index into a byte offset */
static int u8_offset(char *str, int charnum);

/* translate a byte offset into a character index */
static int u8_charnum(char *s, int offset);

/* decode the character at s[*i] and advance *i past it */
static u_int32_t u8_nextchar(char *s, int *i);

/* advance *i to the following character */
static void u8_inc(char *s, int *i);

/* move *i back to the preceding character */
static void u8_dec(char *s, int *i);

/* byte length of the sequence that starts at s */
static int u8_seqlen(char *s);

/* src points at the character right after a backslash; the decoded
   escape is stored in *dest and the number of input characters consumed
   is returned */
static int u8_read_escape_sequence(char *src, u_int32_t *dest);

/* write wide character ch into buf (sz bytes) as an ASCII escape
   sequence; returns the number of characters output */
static int u8_escape_wchar(char *buf, int sz, u_int32_t ch);

/* expand the escape sequences found in src, producing UTF-8 in buf */
static int u8_unescape(char *buf, int sz, char *src);

/* turn UTF-8 src into ASCII with escape sequences; when escape_quotes
   is nonzero, double-quote characters get a leading backslash too */
static int u8_escape(char *buf, int sz, char *src, int escape_quotes);

/* character-class helpers used by the escape routines */
static int octal_digit(char c);
static int hex_digit(char c);

/* pointer to the first occurrence of ch in s, or NULL when absent;
   the character index reached is stored in *charn */
static char *u8_strchr(char *s, u_int32_t ch, int *charn);

/* like u8_strchr, but scans a buffer of sz bytes rather than a
   NUL-terminated string */
static char *u8_memchr(char *s, u_int32_t ch, size_t sz, int *charn);

/* number of characters in a UTF-8 string */
static int u8_strlen(char *s);

/* nonzero when the locale name selects a UTF-8 encoding */
static int u8_is_locale_utf8(char *locale);

/* printf variants whose format string and arguments may be UTF-8.
   plain printf() is enough when the current locale is already UTF-8. */
static int u8_vprintf(char *fmt, va_list ap);
static int u8_printf(char *fmt, ...);
151
152 /* http://cprogramming.com/tutorial/utf8.c */
153
154 /*
155   Basic UTF-8 manipulation routines
156   by Jeff Bezanson
157   placed in the public domain Fall 2005
158
159   This code is designed to provide the utilities you need to manipulate
160   UTF-8 as an internal string encoding. These functions do not perform the
161   error checking normally needed when handling UTF-8 data, so if you happen
162   to be from the Unicode Consortium you will want to flay me alive.
163   I do this because error checking can be performed at the boundaries (I/O),
164   with these routines reserved for higher performance on data known to be
165   valid.
166 */
167 #include <stdlib.h>
168 #include <stdio.h>
169 #include <string.h>
170 #include <stdarg.h>
171 #ifdef WIN32
172 #include <malloc.h>
173 #else
174 #include <alloca.h>
175 #endif
176
/* Correction subtracted after the bytes of a sequence have been summed
   (shifting left by 6 between bytes).  Indexed by the number of trailing
   bytes; each entry is what the lead/continuation marker bits add up to,
   e.g. entry 1 is (0xC0 << 6) + 0x80. */
static const u_int32_t offsetsFromUTF8[6] = {
    0x00000000UL,               /* no trailing bytes */
    0x00003080UL,               /* 1 trailing byte   */
    0x000E2080UL,               /* 2 trailing bytes  */
    0x03C82080UL,               /* 3 trailing bytes  */
    0xFA082080UL,               /* 4 trailing bytes  */
    0x82082080UL                /* 5 trailing bytes  */
};
181
/* Number of bytes that follow a given first byte of a sequence, indexed
   by the value of that byte.  One row per high nibble. */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/* Byte length of the sequence whose first byte is s[0]: one for the
   first byte itself plus the trailing-byte count from the table. */
static int
u8_seqlen(char *s)
{
    const unsigned char first = (unsigned char)s[0];

    return 1 + trailingBytesForUTF8[first];
}
209
210 /* conversions without error checking
211    only works for valid UTF-8, i.e. no 5- or 6-byte sequences
212    srcsz = source size in bytes, or -1 if 0-terminated
213    sz = dest size in # of wide characters
214
215    returns # characters converted
216    dest will always be L'\0'-terminated, even if there isn't enough room
217    for all the characters.
218    if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
219 */
220 static int
221 u8_toucs(
222     u_int32_t * dest,
223     int sz,
224     char *src,
225     int srcsz
226 )
227 {
228     u_int32_t ch;
229     char *src_end = src + srcsz;
230     int nb;
231     int i = 0;
232
233     while (i < sz - 1) {
234         nb = trailingBytesForUTF8[(unsigned char)*src];
235         if (srcsz == -1) {
236             if (*src == 0)
237                 goto done_toucs;
238         }
239         else {
240             if (src + nb >= src_end)
241                 goto done_toucs;
242         }
243         ch = 0;
244         switch (nb) {
245             /*
246                these fall through deliberately
247              */
248         case 3:
249             ch += (unsigned char)*src++;
250             ch <<= 6;
251         case 2:
252             ch += (unsigned char)*src++;
253             ch <<= 6;
254         case 1:
255             ch += (unsigned char)*src++;
256             ch <<= 6;
257         case 0:
258             ch += (unsigned char)*src++;
259         }
260         ch -= offsetsFromUTF8[nb];
261         dest[i++] = ch;
262     }
263   done_toucs:
264     dest[i] = 0;
265     return i;
266 }
267
/* Encode wide characters as UTF-8.

   dest   output buffer
   sz     size of dest in bytes; values <= 0 mean "no room"
   src    wide characters to encode
   srcsz  number of source characters, or negative when src is
          0-terminated

   Returns the number of source characters consumed.  Conversion stops at
   the first character whose encoding does not fit completely.  dest is
   '\0'-terminated only when every character fitted and one more byte is
   free.  Values of 0x110000 and above produce no output but are still
   counted as consumed.

   Space is tracked as a byte count, so no pointer outside dest is ever
   formed, whatever sz is. */
static int
u8_toutf8(
    char *dest,
    int sz,
    u_int32_t * src,
    int srcsz
)
{
    int room = sz > 0 ? sz : 0; /* bytes still free in dest */
    int i = 0;

    while (srcsz < 0 ? src[i] != 0 : i < srcsz) {
        const u_int32_t ch = src[i];
        int need;

        if (ch < 0x80)
            need = 1;
        else if (ch < 0x800)
            need = 2;
        else if (ch < 0x10000)
            need = 3;
        else if (ch < 0x110000)
            need = 4;
        else
            need = 0;           /* not encodable: skipped */

        if (need > room)
            return i;

        switch (need) {
        case 1:
            *dest++ = (char)ch;
            break;
        case 2:
            *dest++ = (char)((ch >> 6) | 0xC0);
            *dest++ = (char)((ch & 0x3F) | 0x80);
            break;
        case 3:
            *dest++ = (char)((ch >> 12) | 0xE0);
            *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
            *dest++ = (char)((ch & 0x3F) | 0x80);
            break;
        case 4:
            *dest++ = (char)((ch >> 18) | 0xF0);
            *dest++ = (char)(((ch >> 12) & 0x3F) | 0x80);
            *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
            *dest++ = (char)((ch & 0x3F) | 0x80);
            break;
        default:
            break;
        }
        room -= need;
        i++;
    }
    if (room > 0)
        *dest = '\0';
    return i;
}
326
/* Encode the single wide character ch into dest as UTF-8.  Returns the
   number of bytes written (1 to 4), or 0 when ch is 0x110000 or above,
   in which case dest is left untouched.  No terminator is appended. */
static int
u8_wc_toutf8(
    char *dest,
    u_int32_t ch
)
{
    int len, k;
    u_int32_t lead;

    if (ch >= 0x110000)
        return 0;
    if (ch < 0x80) {
        dest[0] = (char)ch;
        return 1;
    }

    if (ch < 0x800) {
        len = 2;
        lead = 0xC0;
    }
    else if (ch < 0x10000) {
        len = 3;
        lead = 0xE0;
    }
    else {
        len = 4;
        lead = 0xF0;
    }

    /* fill the continuation bytes back to front, six bits at a time */
    for (k = len - 1; k > 0; k--) {
        dest[k] = (char)((ch & 0x3F) | 0x80);
        ch >>= 6;
    }
    /* the bits that remain go into the lead byte */
    dest[0] = (char)(ch | lead);
    return len;
}
357
/* Character index -> byte offset.  Walks charnum characters from the
   start of str (stopping early at the terminating NUL) and returns the
   byte offset reached.  A character is taken to be one byte followed by
   at most three continuation bytes (10xxxxxx). */
static int
u8_offset(
    char *str,
    int charnum
)
{
    int pos = 0;

    for (; charnum > 0 && str[pos] != '\0'; charnum--) {
        int step = 0;

        /* after the fourth byte we stop without inspecting the next one */
        do {
            pos++;
            step++;
        } while (step < 4 && (str[pos] & 0xC0) == 0x80);
    }
    return pos;
}
374
/* Byte offset -> character index.  Counts the characters that start
   before byte position offset in s, stopping early at the terminating
   NUL.  A character is taken to be one byte followed by at most three
   continuation bytes (10xxxxxx). */
static int
u8_charnum(
    char *s,
    int offset
)
{
    int pos = 0;
    int seen = 0;

    while (pos < offset && s[pos] != '\0') {
        int step = 0;

        /* after the fourth byte we stop without inspecting the next one */
        do {
            pos++;
            step++;
        } while (step < 4 && (s[pos] & 0xC0) == 0x80);
        seen++;
    }
    return seen;
}
391
/* Number of characters in the NUL-terminated UTF-8 string s.

   Each character is one byte plus whatever continuation bytes (10xxxxxx)
   follow it.  The scan tests for the terminator itself and never reads
   beyond it.  No decoding is done, so for valid UTF-8 the result is the
   number of code points. */
static int
u8_strlen(
    char *s
)
{
    int count = 0;
    int i = 0;

    while (s[i] != '\0') {
        i++;                    /* first byte of the character */
        while ((s[i] & 0xC0) == 0x80)
            i++;                /* its continuation bytes; NUL ends this too */
        count++;
    }
    return count;
}
406
407 /* reads the next utf-8 sequence out of a string, updating an index */
408 static u_int32_t
409 u8_nextchar(
410     char *s,
411     int *i
412 )
413 {
414     u_int32_t ch = 0;
415     int sz = 0;
416
417     do {
418         ch <<= 6;
419         ch += (unsigned char)s[(*i)++];
420         sz++;
421     } while (s[*i] && !isutf(s[*i]));
422     ch -= offsetsFromUTF8[sz - 1];
423
424     return ch;
425 }
426
/* Advance *i from the start of one character to the start of the next.
   Moves at least one byte and at most four; continuation bytes
   (10xxxxxx) following the first byte are skipped. */
static void
u8_inc(
    char *s,
    int *i
)
{
    int pos = *i;
    int step = 0;

    /* after the fourth byte we stop without inspecting the next one */
    do {
        pos++;
        step++;
    } while (step < 4 && (s[pos] & 0xC0) == 0x80);

    *i = pos;
}
435
/* Move *i back to the start of the preceding character.
   Steps back at least one byte and at most four, stopping as soon as a
   byte that is not a continuation byte (10xxxxxx) is reached.  The index
   never goes below 0, so nothing before s[0] is read; with *i already
   at 0 the call does nothing. */
static void
u8_dec(
    char *s,
    int *i
)
{
    int step;

    for (step = 1; step <= 4 && *i > 0; step++) {
        --(*i);
        /* the fourth byte back is accepted without being inspected */
        if (step < 4 && (s[*i] & 0xC0) != 0x80)
            break;
    }
}
444
/* 1 when c is one of '0'..'7', otherwise 0 */
static int
octal_digit(
    char c
)
{
    if (c < '0')
        return 0;
    if (c > '7')
        return 0;
    return 1;
}
452
/* 1 when c is a hexadecimal digit in either case ('0'..'9', 'A'..'F',
   'a'..'f'), otherwise 0 */
static int
hex_digit(
    char c
)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'A' && c <= 'F')
        return 1;
    if (c >= 'a' && c <= 'f')
        return 1;
    return 0;
}
461
/* Decode one escape sequence.

   str   points at the character right after the backslash
   dest  receives the decoded character

   Returns the number of input characters consumed, counted from str.

   Recognised forms: the single letters n t r b f v a, one to three octal
   digits, x with up to 2 hex digits, u with up to 4 and U with up to 8.
   x/u/U without any hex digit, and every other character, decode to the
   character itself (one character consumed).

   If str[0] is the terminating NUL (a backslash at the very end of the
   input), a literal backslash is stored and 0 is returned, so the
   caller's position never moves past the terminator. */
static int
u8_read_escape_sequence(
    char *str,
    u_int32_t * dest
)
{
    u_int32_t ch;
    char digs[9] = "\0\0\0\0\0\0\0\0\0";        /* up to 8 digits + NUL */
    int dno = 0, i = 1;
    int maxhex = 0;             /* > 0 for the x / u / U forms */

    if (str[0] == '\0') {
        *dest = (u_int32_t) '\\';
        return 0;
    }

    ch = (u_int32_t) str[0];    /* default: take the character literally */
    switch (str[0]) {
    case 'n':
        ch = '\n';
        break;
    case 't':
        ch = '\t';
        break;
    case 'r':
        ch = '\r';
        break;
    case 'b':
        ch = '\b';
        break;
    case 'f':
        ch = '\f';
        break;
    case 'v':
        ch = '\v';
        break;
    case 'a':
        ch = '\a';
        break;
    case 'x':
        maxhex = 2;
        break;
    case 'u':
        maxhex = 4;
        break;
    case 'U':
        maxhex = 8;
        break;
    default:
        if (octal_digit(str[0])) {
            /* the digits start at str[0] itself, so restart the index */
            i = 0;
            do {
                digs[dno++] = str[i++];
            } while (octal_digit(str[i]) && dno < 3);
            ch = strtol(digs, NULL, 8);
        }
        break;
    }

    if (maxhex > 0) {
        /* i is 1 here: the digits follow the x / u / U letter */
        while (hex_digit(str[i]) && dno < maxhex) {
            digs[dno++] = str[i++];
        }
        if (dno > 0)
            ch = strtol(digs, NULL, 16);
    }
    *dest = ch;

    return i;
}
521
/* Convert a string containing backslash escapes (\n, \x41, \uxxxx,
   \Uxxxxxxxx, ...) to UTF-8.

   buf  output buffer
   sz   size of buf in bytes
   src  NUL-terminated input

   example: u8_unescape(mybuf, 256, "hello\\u220e")
   (the doubled backslash is only needed inside a C string literal)

   Returns the number of bytes written to buf.  Conversion stops when
   the next character would not fit; buf is '\0'-terminated only when a
   byte is left over for the terminator.

   A backslash that is the last character of src is copied through as a
   literal backslash; the scan never moves past the terminator.

   NOTE(review): bytes outside an escape are converted with a plain
   char -> u_int32_t cast, so for bytes >= 0x80 the result depends on
   whether char is signed.  Confirm callers only pass ASCII plus escapes. */
static int
u8_unescape(
    char *buf,
    int sz,
    char *src
)
{
    int c = 0, amt;
    u_int32_t ch;
    char temp[4];               /* u8_wc_toutf8 writes at most 4 bytes */

    while (*src && c < sz) {
        if (*src == '\\') {
            src++;
            if (*src == '\0') {
                /* dangling backslash: keep it, consume nothing more */
                ch = (u_int32_t) '\\';
                amt = 0;
            }
            else {
                amt = u8_read_escape_sequence(src, &ch);
            }
        }
        else {
            ch = (u_int32_t) * src;
            amt = 1;
        }
        src += amt;
        amt = u8_wc_toutf8(temp, ch);
        if (amt > sz - c)
            break;
        memcpy(&buf[c], temp, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
556
/* Write wide character ch into buf as printable ASCII.

   buf  output buffer
   sz   size of buf in bytes
   ch   character to escape

   Output forms:
     \n \t \r \b \f \v \a \\   for those characters
     \xHH                      other values below 32, and 0x7f
     \uHHHH                    0x80 .. 0xFFFF
     \UHHHHHHHH                above 0xFFFF
     the character itself      everything else

   The \x form always has exactly two hex digits.  With a variable width,
   0x01 followed by the letter 'A' would come out as "\x1A", which reads
   back as the single character 0x1A.

   Returns what snprintf() returns: the length the escape needs, which is
   >= sz when the output was truncated. */
static int
u8_escape_wchar(
    char *buf,
    int sz,
    u_int32_t ch
)
{
    switch (ch) {
    case '\n':
        return snprintf(buf, sz, "\\n");
    case '\t':
        return snprintf(buf, sz, "\\t");
    case '\r':
        return snprintf(buf, sz, "\\r");
    case '\b':
        return snprintf(buf, sz, "\\b");
    case '\f':
        return snprintf(buf, sz, "\\f");
    case '\v':
        return snprintf(buf, sz, "\\v");
    case '\a':
        return snprintf(buf, sz, "\\a");
    case '\\':
        return snprintf(buf, sz, "\\\\");
    default:
        break;
    }

    if (ch < 32 || ch == 0x7f)
        return snprintf(buf, sz, "\\x%.2hhX", (unsigned char)ch);
    if (ch > 0xFFFF)
        return snprintf(buf, sz, "\\U%.8X", (u_int32_t) ch);
    if (ch >= 0x80)
        return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch);

    return snprintf(buf, sz, "%c", (char)ch);
}
589
/* Convert UTF-8 src to ASCII with escape sequences.

   buf            output buffer
   sz             size of buf in bytes
   src            NUL-terminated UTF-8 input
   escape_quotes  when nonzero, '"' is written as \"

   Returns the number of bytes written to buf, not counting the
   terminator.

   An escape is only kept when it fits in the remaining space together
   with the terminating '\0'.  When the next one does not fit (or the
   formatting call reports an error), conversion stops there: the partial
   escape is discarded, buf stays terminated and neither the write
   position nor the count ever moves past the end of buf.  Nothing is
   written when sz <= 0. */
static int
u8_escape(
    char *buf,
    int sz,
    char *src,
    int escape_quotes
)
{
    int c = 0, i = 0, amt;

    while (src[i] && c < sz) {
        const int room = sz - c;        /* >= 1 inside the loop */

        if (escape_quotes && src[i] == '"') {
            amt = snprintf(buf, room, "\\\"");
            i++;
        }
        else {
            amt = u8_escape_wchar(buf, room, u8_nextchar(src, &i));
        }
        if (amt < 0 || amt >= room) {
            /* truncated or failed: drop the half-written escape */
            *buf = '\0';
            break;
        }
        c += amt;
        buf += amt;
    }
    if (c < sz)
        *buf = '\0';
    return c;
}
615
/* Find the first occurrence of character ch in the NUL-terminated UTF-8
   string s.  Returns a pointer to the first byte of the match, or NULL
   when there is none.  *charn receives the number of characters that
   were stepped over: the index of the match, or the total number of
   characters scanned when nothing matched. */
static char *
u8_strchr(
    char *s,
    u_int32_t ch,
    int *charn
)
{
    int pos = 0;

    *charn = 0;
    while (s[pos] != '\0') {
        const int start = pos;  /* where the character about to be read begins */

        if (u8_nextchar(s, &pos) == ch)
            return s + start;
        ++*charn;
    }
    return NULL;
}
637
638 static char *
639 u8_memchr(
640     char *s,
641     u_int32_t ch,
642     size_t sz,
643     int *charn
644 )
645 {
646     int i = 0, lasti = 0;
647     u_int32_t c;
648     int csz;
649
650     *charn = 0;
651     while (i < sz) {
652         c = csz = 0;
653         do {
654             c <<= 6;
655             c += (unsigned char)s[i++];
656             csz++;
657         } while (i < sz && !isutf(s[i]));
658         c -= offsetsFromUTF8[csz - 1];
659
660         if (c == ch) {
661             return &s[lasti];
662         }
663         lasti = i;
664         (*charn)++;
665     }
666     return NULL;
667 }
668
/* Decide whether a locale name selects UTF-8 (logic based on libutf8).

   The part before the first '.' is skipped; what follows, up to the next
   '@', '+', ',' or the end of the string, is the encoding.  Returns 1
   when that encoding is exactly "UTF-8" or "utf8", otherwise 0.  Also 0
   when '@', '+' or ',' appears before any '.', or when there is no '.'. */
static int
u8_is_locale_utf8(
    char *locale
)
{
    const char *p = locale;
    const char *enc;
    size_t len;

    /* look for the '.' that introduces the encoding */
    while (*p != '.') {
        if (*p == '\0' || *p == '@' || *p == '+' || *p == ',')
            return 0;
        p++;
    }

    /* measure the encoding name */
    enc = ++p;
    while (*p != '\0' && *p != '@' && *p != '+' && *p != ',')
        p++;
    len = (size_t)(p - enc);

    if (len == 5 && strncmp(enc, "UTF-8", 5) == 0)
        return 1;
    if (len == 4 && strncmp(enc, "utf8", 4) == 0)
        return 1;
    return 0;
}
691
/* vprintf() for a format string and arguments that may contain UTF-8.

   The text is formatted into a byte buffer, decoded to wide characters
   with u8_toucs() and printed with printf("%ls").

   Returns the number of wide characters produced, or a negative value
   when formatting or memory allocation fails (nothing is printed then).

   - Output of up to 511 bytes uses a buffer on the stack; longer output
     uses a heap buffer of exactly the size vsnprintf() reported.
   - The argument list is duplicated with va_copy() before the first
     formatting pass, because a va_list may not be traversed twice.

   NOTE(review): the decoded u_int32_t array is handed to "%ls" as
   wchar_t, which assumes wchar_t is a 32-bit type on the target --
   confirm for every supported platform. */
static int
u8_vprintf(
    char *fmt,
    va_list ap
)
{
    char stackbuf[512];
    char *buf = stackbuf;
    u_int32_t *wcs = NULL;
    va_list retry_ap;
    int cnt;
    int nchars = -1;

    va_copy(retry_ap, ap);
    cnt = vsnprintf(stackbuf, sizeof stackbuf, fmt, ap);
    if (cnt >= (int)sizeof stackbuf) {
        /* did not fit: format again into a buffer of the exact size */
        buf = malloc((size_t)cnt + 1);
        if (buf != NULL)
            cnt = vsnprintf(buf, (size_t)cnt + 1, fmt, retry_ap);
        else
            cnt = -1;
    }
    va_end(retry_ap);

    if (cnt >= 0) {
        /* at most one wide character per byte, plus the terminator */
        wcs = malloc(((size_t)cnt + 1) * sizeof *wcs);
        if (wcs != NULL) {
            nchars = u8_toucs(wcs, cnt + 1, buf, cnt);
            printf("%ls", (wchar_t *) wcs);
        }
    }

    free(wcs);
    if (buf != stackbuf)
        free(buf);
    return nchars;
}
716
/* printf() counterpart of u8_vprintf(): packages the variable arguments
   into a va_list, forwards them together with fmt, and returns whatever
   u8_vprintf() returned. */
static int
u8_printf(
    char *fmt,
    ...
)
{
    va_list ap;
    int result;

    va_start(ap, fmt);
    result = u8_vprintf(fmt, ap);
    va_end(ap);

    return result;
}
Note: See TracBrowser for help on using the browser.