root/libswish3/trunk/src/libswish3/string.c

Revision 2178, 19.0 kB (checked in by karpet, 2 months ago)

Some versions of the HTML parser were passing through extra whitespace.
This seems to be a specific libxml2 issue. In any case, added a new
whitespace check in both add-to-buf methods and in the Perl bindings
(the latter because t/20-metanames.t was failing due to extra whitespace).

Line 
1 /*
2  * This file is part of libswish3
3  * Copyright (C) 2007 Peter Karman
4  *
5  *  libswish3 is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  libswish3 is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with libswish3; if not, write to the Free Software
17  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18 */
19
20 /* string.c -- handle xmlChar and wchar_t strings
21  * much of this module based on swstring.c in swish-e vers 2
22  * but re-written for UTF-8 support
23 */
24
25 #include <assert.h>
26 #include <wchar.h>
27 #include <wctype.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <stdlib.h>
31 #include <locale.h>
32 #include <err.h>
33 #include <limits.h>
34 #include <errno.h>
35
36 #include "libswish3.h"
37
38 extern int SWISH_DEBUG;
39
40 static xmlChar *getword(
41     xmlChar **in_buf
42 );
43 static xmlChar *findlast(
44     xmlChar *str,
45     xmlChar *set
46 );
47 static xmlChar *lastptr(
48     xmlChar *str
49 );
50
51 /* originally based on libutf8; this version (and other u8_* functions)
52    are from http://cprogramming.com/tutorial/unicode.html
53  */
54 static int
55 u8_is_locale_utf8(
56     char *locale
57 );
58
59 /* move to next character */
60 static void u8_inc(
61     char *s,
62     int *i
63 );
64
65 /* move to previous character */
66 static void u8_dec(
67     char *s,
68     int *i
69 );
70
71
/* true when byte c is NOT a UTF-8 continuation byte (10xxxxxx),
 * i.e. it starts a sequence (or is plain ASCII / NUL)
 */
#define isutf(c) (((c)&0xC0)!=0x80)

/* Advance byte offset *i in s to the start of the next character.
 * Steps at most 4 bytes: the first three steps stop as soon as the byte
 * landed on is not a continuation byte; the fourth step is unconditional.
 */
static void
u8_inc(
    char *s,
    int *i
)
{
    int step;

    for (step = 0; step < 3; step++) {
        ++(*i);
        if (isutf(s[*i]))
            return;
    }
    ++(*i);
}

/* Move byte offset *i in s back to the start of the previous character.
 * Mirror image of u8_inc(): at most 4 bytes backwards.
 */
static void
u8_dec(
    char *s,
    int *i
)
{
    int step;

    for (step = 0; step < 3; step++) {
        --(*i);
        if (isutf(s[*i]))
            return;
    }
    --(*i);
}
92
93
/* these string conversion functions based on code from xapian-omega */
#define BUFSIZE 100

/* Function-body macro: formats the enclosing function's `val` parameter
 * with FMT into a freshly allocated BUFSIZE-byte buffer and returns it.
 * The caller of the enclosing function owns the returned buffer.
 */
#define CONVERT_TO_STRING(FMT) \
    xmlChar *str;\
    int ret;\
    str = swish_xmalloc(BUFSIZE);\
    ret = snprintf((char *)str, BUFSIZE, (FMT), val);\
    if (ret < 0)\
        SWISH_CROAK("snprintf failed with %d", ret);\
    return str;
104
/* Parse buf as a base-10 integer with strtol().
 * On a range error, or any errno with a zero result, prints the strtol
 * error via perror() and exits the process with EXIT_FAILURE.
 * Trailing non-numeric text is ignored; the long result is narrowed to int.
 */
int
swish_string_to_int(
    char *buf
)
{
    long parsed;
    int out_of_range;
    int failed;

    errno = 0;
    parsed = strtol(buf, NULL, 10);

    out_of_range = (errno == ERANGE) && (parsed == LONG_MAX || parsed == LONG_MIN);
    failed = (errno != 0) && (parsed == 0);

    if (out_of_range || failed) {
        perror("strtol");
        exit(EXIT_FAILURE);
    }

    return (int)parsed;
}
123
124 xmlChar *
125 swish_int_to_string(
126     int val
127 )
128 {
129     CONVERT_TO_STRING("%d")
130 }
131
132 xmlChar *
133 swish_long_to_string(
134     long val
135 )
136 {
137     CONVERT_TO_STRING("%ld")
138 }
139
140 xmlChar *
141 swish_double_to_string(
142     double val
143 )
144 {
145     CONVERT_TO_STRING("%f")
146 }
147
148 xmlChar *
149 swish_date_to_string(
150     int y,
151     int m,
152     int d
153 )
154 {
155     char buf[11];
156     if (y < 0)
157         y = 0;
158     else if (y > 9999)
159         y = 9999;
160     if (m < 1)
161         m = 1;
162     else if (m > 12)
163         m = 12;
164     if (d < 1)
165         d = 1;
166     else if (d > 31)
167         d = 31;
168 #ifdef SNPRINTF
169     int len = SNPRINTF(buf, sizeof(buf), "%04d%02d%02d", y, m, d);
170     if (len == -1 || len > BUFSIZE)
171         buf[BUFSIZE + 1] = '\0';
172 #else
173     buf[BUFSIZE + 1] = '\0';
174     sprintf(buf, "%04d%02d%02d", y, m, d);
175     if (buf[BUFSIZE + 1])
176         abort();                /* Uh-oh, buffer overrun */
177 #endif
178     return swish_xstrdup((xmlChar *)buf);
179 }
180
181 /* TODO need these ??
182 inline uint32_t
183 binary_string_to_int(
184     const std::string & s
185 )
186 {
187     if (s.size() != 4)
188         return (uint32_t) - 1;
189     uint32_t
190         v;
191     memcpy(&v, s.data(), 4);
192     return ntohl(v);
193 }
194
195 inline
196     std::string
197 int_to_binary_string(
198     uint32_t v
199 )
200 {
201     v = htonl(v);
202     return std::string(reinterpret_cast < const char *>(&v), 4);
203 }
204
205 */
206
207 /* returns the UCS32 value for a UTF8 string -- the character's Unicode value.
208    see http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-AppendixA
209 */
210 int
211 swish_utf8_codepoint(
212     xmlChar *utf8
213 )
214 {
215     int len;
216     len = swish_utf8_chr_len(utf8);
217
218     switch (len) {
219
220     case 1:
221         return utf8[0];
222
223     case 2:
224         return (utf8[0] - 192) * 64 + utf8[1] - 128;
225
226     case 3:
227         return (utf8[0] - 224) * 4096 + (utf8[1] - 128) * 64 + utf8[2] - 128;
228
229     case 4:
230     default:
231         return (utf8[0] - 240) * 262144 + (utf8[1] - 128) * 4096 + (utf8[2] - 128) * 64 +
232             utf8[3] - 128;
233
234     }
235 }
236
237 void
238 swish_utf8_next_chr(
239     xmlChar *s,
240     int *i
241 )
242 {
243     u8_inc((char *)s, i);
244 }
245
246 void
247 swish_utf8_prev_chr(
248     xmlChar *s,
249     int *i
250 )
251 {
252     u8_dec((char *)s, i);
253 }
254
255
256 /* returns length of a UTF8 character, based on first byte (see below) */
257 int
258 swish_utf8_chr_len(
259     xmlChar *utf8
260 )
261 {
262     int n;
263     n = xmlUTF8Size(utf8);
264     if (n == -1)
265         SWISH_CROAK("Bad UTF8 string: %s", utf8);
266        
267     return n;
268 }
269
270 /* returns the number of UCS32 codepoints (characters) in a UTF8 string */
271 int
272 swish_utf8_num_chrs(
273     xmlChar *utf8
274 )
275 {
276     int n;
277     n = xmlUTF8Strlen(utf8);
278     if (n == -1)
279         SWISH_CROAK("Bad UTF8 string: %s", utf8);
280        
281     return n;
282 }
283
284 /* returns true if all bytes in the *str are in the ascii range.
285  * this helps speed up string handling when we don't need to worry
286  * about multi-byte chars.
287 */
288
289 /* from the libxml2 xmlstring.c file:
290      * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
291      * are as follows (in "bit format"):
292      *    0xxxxxxx                                      valid 1-byte
293      *    110xxxxx 10xxxxxx                             valid 2-byte
294      *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
295      *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
296 */
297
298 boolean
299 swish_is_ascii(
300     xmlChar *str
301 )
302 {
303     int i;
304     int len = xmlStrlen(str);
305
306     if (!len || str == NULL)
307         return 0;
308
309     for (i = 0; i < len; i++) {
310         if (str[i] >= 0x80)
311             return 0;
312
313     }
314     return 1;
315 }
316
/* Returns 1 when the codeset part of a locale name (the text after the
 * first '.' and before any '@', '+' or ',' modifier) is exactly "UTF-8"
 * or "utf8"; returns 0 otherwise, including when there is no '.' part.
 * This code is based on libutf8.
 */
static int
u8_is_locale_utf8(
    char *locale
)
{
    const char *encoding;
    size_t head_len;
    size_t enc_len;

    /* a '.' only counts if it appears before any modifier separator */
    head_len = strcspn(locale, "@+,.");
    if (locale[head_len] != '.')
        return 0;

    encoding = locale + head_len + 1;
    enc_len = strcspn(encoding, "@+,");

    if (enc_len == 5 && strncmp(encoding, "UTF-8", 5) == 0)
        return 1;
    if (enc_len == 4 && strncmp(encoding, "utf8", 4) == 0)
        return 1;

    return 0;
}
337
338
339 void
340 swish_verify_utf8_locale(
341 )
342 {
343     char *loc;
344     const xmlChar *enc;
345
346 /* a bit about encodings: libxml2 takes whatever encoding the input XML is
347  * (latin1, ascii, utf8, etc) and standardizes it using iconv in xmlChar as
348  * UTF-8. However, we must ensure we have UTF-8 locale because all the mb* and wc*
349  * routines rely on the locale to correctly interpret chars.
350  */
351
352 /* use LC_CTYPE specifically: http://mail.nl.linux.org/linux-utf8/2001-09/msg00030.html */
353
354     loc = setlocale(LC_CTYPE, "");
355
356     enc = xmlStrchr((xmlChar *)loc, (xmlChar)'.');
357
358     if (enc != NULL) {
359         enc++;
360         if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
361             SWISH_DEBUG_MSG("encoding = %s", enc);
362     }
363     else {
364         if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
365             SWISH_DEBUG_MSG("no encoding in %s, using %s", loc, SWISH_DEFAULT_ENCODING);
366
367         enc = (xmlChar *)SWISH_DEFAULT_ENCODING;
368     }
369
370     setenv("SWISH_ENCODING", (char *)enc, 0);   /* remember in env var, if not already set */
371
372     if (!loc) {
373         SWISH_WARN("can't get locale via setlocale()");
374     }
375
376     if (u8_is_locale_utf8(loc)) {
377 /* a-ok */
378
379     }
380     else {
381 /* must be UTF-8 charset since libxml2 converts everything to UTF-8 */
382         if (SWISH_DEBUG)
383             SWISH_DEBUG_MSG
384                 ("Your locale (%s) was not UTF-8 so internally we are using %s", loc,
385                  SWISH_LOCALE);
386
387         setlocale(LC_CTYPE, SWISH_LOCALE);
388
389     }
390
391     if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
392         SWISH_DEBUG_MSG("locale set to %s", loc);
393
394 }
395
/* based on swstring.c  */

/* qsort() comparator for wchar_t elements.
 * Returns -1, 0 or 1; uses comparisons rather than subtraction so the
 * result is right whether wchar_t is signed or unsigned.
 */
int
swish_wchar_t_comp(
    const void *s1,
    const void *s2
)
{
    const wchar_t a = *(const wchar_t *)s1;
    const wchar_t b = *(const wchar_t *)s2;

    return (a > b) - (a < b);
}

/* Sort a wide string in place and drop duplicate characters.
 *
 * On return s holds each distinct character once, in ascending order,
 * NUL-terminated. Returns the number of characters left in s.
 *
 * The previous version nested the dedupe loop inside a stray outer loop,
 * left j uninitialised for an empty string, read past the terminator for a
 * one-character string, skipped the last character and never terminated
 * the shortened string.
 */
int
swish_sort_wchar(
    wchar_t * s
)
{
    size_t len, i, j;

    len = wcslen(s);
    if (len == 0)
        return 0;

    qsort(s, len, sizeof(wchar_t), swish_wchar_t_comp);

    /* s[0..j-1] is the deduped prefix; s[0] is always kept */
    for (i = 1, j = 1; i < len; i++) {
        if (s[i] != s[j - 1]) {
            s[j++] = s[i];
        }
    }
    s[j] = L'\0';

    return (int)j;
}
433
/* based on swstring.c in Swish-e but handles wide char strings instead */

/* Lowercase a wide string in place with towlower() and return s itself. */
wchar_t *
swish_wstr_tolower(
    wchar_t * s
)
{
    size_t i;

    for (i = 0; s[i] != L'\0'; i++) {
        s[i] = (wchar_t) towlower(s[i]);
    }

    return s;
}
448
449 /* convert a string to lowercase.
450  * returns a new malloc'd string, so should be freed eventually
451 */
452 xmlChar *
453 swish_str_tolower(
454     xmlChar *s
455 )
456 {
457
458     if (swish_is_ascii(s))
459         return swish_ascii_str_tolower(s);
460     else
461         return swish_utf8_str_tolower(s);
462
463 }
464
465 /* convert utf8 to wchar,
466    lowercase the wchar,
467    then convert back to utf8
468    and free the wchar
469 */
470 xmlChar *
471 swish_utf8_str_tolower(
472     xmlChar *s
473 )
474 {
475     xmlChar *str;
476     wchar_t *wstr;
477
478 /* convert mb to wide -- must free */
479     wstr = swish_locale_to_wchar(s);
480
481 /* convert wide tolower */
482     swish_wstr_tolower(wstr);
483
484 /* convert wide back to mb */
485     str = swish_wchar_to_locale(wstr);
486
487     swish_xfree(wstr);
488
489     return str;
490 }
491
492 /* based on swstring.c in Swish-e */
493 xmlChar *
494 swish_ascii_str_tolower(
495     xmlChar *s
496 )
497 {
498     xmlChar *copy = swish_xstrdup(s);
499     xmlChar *p = copy;
500     while (*p) {
501         *p = tolower(*p);
502         p++;
503     }
504     return copy;
505 }
506
507 /*
508   -- Skip white spaces...
509   -- position to non space character
510   -- return: ptr. to non space char or \0
511   -- 2001-01-30  rasc
512
513   TODO make utf8 safe.
514 */
515
516 xmlChar *
517 swish_str_skip_ws(
518     xmlChar *s
519 )
520 {
521     while (*s && isspace((int)(xmlChar)*s))
522         s++;
523     return s;
524 }
525
526 /*************************************
527 * Trim trailing white space
528 * Returns void
529 **************************************/
530
531 // TODO make utf8 safe
532 void
533 swish_str_trim_ws(
534     xmlChar *s
535 )
536 {
537     int i = xmlStrlen(s);
538
539     while (i && isspace((int)s[i - 1]))
540         s[--i] = '\0';
541 }
542
543 boolean
544 swish_str_all_ws(
545     xmlChar *s
546 )
547 {
548     return swish_str_all_ws_len(s, xmlStrlen(s));
549 }
550
551 boolean
552 swish_str_all_ws_len(
553     xmlChar * s,
554     int len
555 )
556 {
557     int i;
558     for (i = 0; i < len; i++) {
559         if (!isspace((int)s[i])) {
560             return 0;
561         }
562     }
563     return 1;
564 }
565
566
/* Debug aid: print one line per wide character to stdout showing the
 * character itself, its value in decimal and its value in hex.
 */
void
swish_debug_wchars(
    const wchar_t * widechars
)
{
    const wchar_t *wc;

    for (wc = widechars; *wc != 0; wc++) {
        printf(" >%lc< %ld %#lx \n", (wint_t) *wc, (long int)*wc,
               (long unsigned int)*wc);
    }
}
578
579 /* returns the number of UTF-8 char* needed to hold the codepoint
580    represented by 'ch'.
581    similar to swish_utf8_chr_len() except that the arg is already
582    a 4-byte container and we want to know how many of the 4 bytes
583    we really need.
584 */
585 int
586 swish_bytes_in_wchar(
587     int ch
588 )
589 {
590     int len = 0;
591
592     if (ch < 0x80) {
593         len = 1;
594     }
595     if (ch < 0x800) {
596         len = 2;
597     }
598     if (ch < 0x10000) {
599         len = 3;
600     }
601     if (ch < 0x110000) {
602         len = 4;
603     }
604
605     if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
606         SWISH_DEBUG_MSG(" %lc is %d bytes long", ch, len);
607
608     return len;
609 }
610
611
612 /* from http://www.triptico.com/software/unicode.html */
613 wchar_t *
614 swish_locale_to_wchar(
615     xmlChar *str
616 )
617 {
618     wchar_t *ptr;
619     size_t s;
620     int len;
621
622 /* first arg == 0 means 'calculate needed space' */
623     s = mbstowcs(0, (const char *)str, 0);
624
625     len = mblen((const char *)str, 4);
626
627 /* a size of -1 is triggered by an error in encoding;
628  * never happen in ISO-8859-* locales, but possible in UTF-8
629  */
630     if (s == -1)
631         SWISH_CROAK("error converting mbs to wide str: %s", str);
632
633
634 /* malloc the necessary space */
635     ptr = swish_xmalloc((s + 1) * sizeof(wchar_t));
636
637 /* really do it */
638     s = mbstowcs(ptr, (const char *)str, s);
639
640 /* ensure NULL termination */
641     ptr[s] = '\0';
642
643 /* remember to free() ptr when done */
644     return (ptr);
645 }
646
647 /* from http://www.triptico.com/software/unicode.html */
648 xmlChar *
649 swish_wchar_to_locale(
650     wchar_t * str
651 )
652 {
653     xmlChar *ptr;
654     size_t s;
655
656 /* first arg == 0 means 'calculate needed space' */
657     s = wcstombs(0, str, 0);
658
659 /* a size of -1 means there are characters that could not be converted to current
660      * locale */
661     if (s == -1)
662         SWISH_CROAK("error converting wide chars to mbs: %ls", str);
663
664 /* malloc the necessary space */
665     ptr = (xmlChar *)swish_xmalloc(s + 1);
666
667 /* really do it */
668     s = wcstombs((char *)ptr, (const wchar_t *)str, s);
669
670 /* ensure NULL termination */
671     ptr[s] = '\0';
672
673 /* remember to free() ptr when done */
674     return (ptr);
675 }
676
677 /* StringList functions derived from swish-e vers 2 */
678 swish_StringList *
679 swish_init_stringlist(
680 )
681 {
682     swish_StringList *sl = swish_xmalloc(sizeof(swish_StringList));
683     sl->n = 0;
684     sl->word = swish_xmalloc(2 * sizeof(xmlChar *));
685 /* 2 to allow for NULL-terminate */
686     return sl;
687 }
688
689 void
690 swish_free_stringlist(
691     swish_StringList * sl
692 )
693 {
694     while (sl->n)
695         swish_xfree(sl->word[--sl->n]);
696
697     swish_xfree(sl->word);
698     swish_xfree(sl);
699 }
700
701 void
702 swish_merge_stringlists(
703     swish_StringList * sl1,
704     swish_StringList * sl2
705 )
706 {
707     int i;
708     // add sl1 -> sl2
709     sl2->word =
710         (xmlChar **)swish_xrealloc(sl2->word, (sl1->n + sl2->n) * sizeof(xmlChar *) + 1);
711     for (i = 0; i < sl1->n; i++) {
712         // copy is a little overhead, but keeps mem count simple
713         sl2->word[sl2->n++] = swish_xstrdup(sl1->word[i]);
714     }
715     swish_free_stringlist(sl1);
716 }
717
718 swish_StringList *
719 swish_copy_stringlist(
720     swish_StringList * sl
721 )
722 {
723     swish_StringList *s2;
724     int i;
725     s2 = swish_init_stringlist();
726     s2->word = (xmlChar **)swish_xrealloc(s2->word, sl->n * sizeof(xmlChar *) + 1);
727     for (i = 0; i < sl->n; i++) {
728         s2->word[i] = swish_xstrdup(sl->word[i]);
729     }
730     s2->n = sl->n;
731     return s2;
732 }
733
734 swish_StringList *
735 swish_make_stringlist(
736     xmlChar *line
737 )
738 {
739     swish_StringList *sl;
740     int cursize, maxsize;
741     xmlChar *p;
742
743     if (!line)
744         return (NULL);
745
746     sl = swish_init_stringlist();
747     p = (xmlChar *)strchr((const char *)line, '\n');
748     if (p != NULL)
749         *p = '\0';
750
751     cursize = 0;
752     maxsize = 2;
753
754     p = line;
755
756     while (&line && (p = getword(&line))) {
757 /* getword returns "" when not null, so need to free it if we are not using it */
758         if (!*p) {
759             swish_xfree(p);
760             break;
761         }
762
763         if (cursize == maxsize) {
764             sl->word =
765                 (xmlChar **)swish_xrealloc(sl->word, (maxsize *= 2) * sizeof(xmlChar *));
766         }
767
768         sl->word[cursize++] = (xmlChar *)p;
769     }
770     sl->n = cursize;
771
772 /* Add an extra NULL */
773     if (cursize == maxsize) {
774         sl->word =
775             (xmlChar **)swish_xrealloc(sl->word, (maxsize += 1) * sizeof(xmlChar *));
776     }
777
778     sl->word[cursize] = NULL;
779
780     return sl;
781 }
782
783 /* Gets the next word in a line. If the word's in quotes,
784  * include blank spaces in the word or phrase.
785  * should be utf-8 compatible; only pitfall would be if a continuation byte
786  * returns true for isspace().
787 */
788
789 static xmlChar *
790 getword(
791     xmlChar **in_buf
792 )
793 {
794     xmlChar quotechar;
795     xmlChar uc;
796     xmlChar *s = *in_buf;
797     xmlChar *start = *in_buf;
798     xmlChar buf[SWISH_MAX_WORD_LEN + 1];
799     xmlChar *cur_char = buf;
800     int backslash = 0;
801
802     quotechar = '\0';
803
804     s = swish_str_skip_ws(s);
805
806 /* anything to read? */
807     if (!*s) {
808         *in_buf = s;
809         return swish_xstrdup((xmlChar *)"\0");
810     }
811
812     if (*s == '\"' || *s == '\'')
813         quotechar = *s++;
814
815 /* find end of "more words" or word */
816
817     while (*s) {
818         uc = (xmlChar)*s;
819
820         if (uc == '\\' && !backslash && quotechar)
821 /* only enable backslash
822          * inside of quotes */
823         {
824             s++;
825             backslash++;
826             continue;
827         }
828
829 /* Can't see why we would need to escape these, can you? - always fed a
830          * single line */
831         if (uc == '\n' || uc == '\r') {
832             s++;
833             break;
834         }
835
836         if (!backslash) {
837 /* break on ending quote or unquoted space */
838
839             if (uc == quotechar || (!quotechar && isspace((int)uc))) {
840                 s++;            /* past quote or space char. */
841                 break;
842             }
843
844         }
845         else {
846             backslash = 0;
847         }
848
849         *cur_char++ = *s++;
850
851         if (cur_char - buf > SWISH_MAX_WORD_LEN) {
852             SWISH_WARN("Parsed word '%s' exceeded max length of %d", start,
853                        SWISH_MAX_WORD_LEN);
854         }
855
856     }
857
858     if (backslash)
859         *cur_char++ = '\\';
860
861     *cur_char = '\0';
862
863     *in_buf = s;
864
865     return swish_xstrdup(buf);
866
867 }
868
869 /* parse a URL to determine file ext */
870 /* inspired by http://www.tug.org/tex-archive/tools/zoo/ by Rahul Dhesi */
871 xmlChar *
872 swish_get_file_ext(
873     xmlChar *url
874 )
875 {
876     xmlChar *p;
877
878 /*    if (strlen(url) < 3)
879         return url;
880 */
881
882     if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
883         SWISH_DEBUG_MSG("parsing url %s for extension", url);
884
885     p = findlast(url, (xmlChar *)SWISH_EXT_SEP);        /* look for . or /         */
886
887     if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
888         SWISH_DEBUG_MSG("p = %s", p);
889
890     if (p == NULL)
891         return p;
892
893     if (p != NULL && *p != SWISH_EXT_CH)        /* found .?                     */
894         return NULL;            /* ... if not, ignore / */
895
896     if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
897         SWISH_DEBUG_MSG("p = %s", p);
898
899     if (*p == SWISH_EXT_CH)
900         p++;                    /* skip to next char after . */
901
902     if (SWISH_DEBUG & SWISH_DEBUG_TOKENIZER)
903         SWISH_DEBUG_MSG("ext is %s", p);
904
905     return swish_str_tolower(p);
906 }
907
908 /*******************/
909 /*
910 findlast() finds last occurrence in provided string of any of the characters
911 except the null character in the provided set.
912
913 If found, return value is pointer to character found, else it is NULL.
914 */
915
916 static xmlChar *
917 findlast(
918     xmlChar *str,
919     xmlChar *set
920 )
921 {
922     xmlChar *p;
923
924     if (str == NULL || set == NULL || *str == '\0' || *set == '\0')
925         return (NULL);
926
927     p = lastptr(str);           /* pointer to last char of string */
928     assert(p != NULL);
929
930     while (p != str && xmlStrchr(set, *p) == NULL) {
931         --p;
932     }
933
934 /* either p == str or we found a character or both */
935     if (xmlStrchr(set, *p) == NULL)
936         return (NULL);
937     else
938         return (p);
939 }
940
941 /*
942 lastptr() returns a pointer to the last non-null character in the string, if
943 any.  If the string is null it returns NULL
944 */
945
946 static xmlChar *
947 lastptr(
948     xmlChar *str
949 )
950 {
951     xmlChar *p;
952     if (str == NULL)
953         SWISH_CROAK("received null pointer while looking for last NULL");
954     if (*str == '\0')
955         return (NULL);
956     p = str;
957     while (*p != '\0')          /* find trailing null char */
958         ++p;
959     --p;                        /* point to just before it */
960     return (p);
961 }
Note: See TracBrowser for help on using the browser.