root/libswish3/trunk/src/libswish3/parser.c

Revision 2188, 57.3 kB (checked in by karpet, 2 weeks ago)

first pass at the dom-specific property and metaname feature.

Line 
1 /*
2 * This file is part of libswish3
3 * Copyright (C) 2007 Peter Karman
4 *
5 *  libswish3 is free software; you can redistribute it and/or modify
6 *  it under the terms of the GNU General Public License as published by
7 *  the Free Software Foundation; either version 2 of the License, or
8 *  (at your option) any later version.
9 *
10 *  libswish3 is distributed in the hope that it will be useful,
11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 *  GNU General Public License for more details.
14 *
15 *  You should have received a copy of the GNU General Public License
16 *  along with libswish3; if not, write to the Free Software
17 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18 */
19
20 /*
21 * parse XML doc from memory using libxml2 SAX2 based on tutorial at
22 * http://www.jamesh.id.au/articles/libxml-sax/libxml-sax.html
23 *
24 * save all character() data to buffer, flushing on new metanames
25 * flush should split buffer into words, skipping nonwordchars/space, and
26 * lowercase all
27 *
28 * see iswlower(3) man page, etc.
29 *
30 * all the mb*() functions rely on locale to recognize multi-byte strings
31 *
32 */
33
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <locale.h>
37 #include <stdarg.h>
38 #include <err.h>
39 #include <string.h>
40 #include <ctype.h>
41 #include <wctype.h>
42 #include <errno.h>
43 #include <libxml/parserInternals.h>
44 #include <libxml/parser.h>
45 #include <libxml/HTMLparser.h>
46 #include <libxml/globals.h>
47 #include <libxml/xmlerror.h>
48 #include <libxml/tree.h>
49 #include <libxml/debugXML.h>
50 #include <libxml/xmlmemory.h>
51
52 #include "libswish3.h"
53
/*
 * errno comes from <errno.h> (included above). It may be defined as a
 * macro, so it is not redeclared here.
 */
extern int SWISH_DEBUG;

/* when non-zero, libxml2 warnings and errors are passed on via SWISH_WARN() */
int SWISH_PARSER_WARNINGS = 0;
59
60 static void get_env_vars(
61 );
62
63 static void flush_buffer(
64     swish_ParserData *parser_data,
65     xmlChar *metaname,
66     xmlChar *context
67 );
68
69 static void tokenize(
70     swish_ParserData *parser_data,
71     xmlChar *string,
72     int len,
73     xmlChar *metaname,
74     xmlChar *content
75 );
76
77 static void mystartDocument(
78     void *parser_data
79 );
80 static void myendDocument(
81     void *parser_data
82 );
83 static void mystartElement(
84     void *parser_data,
85     const xmlChar *name,
86     const xmlChar **atts
87 );
88 static void myendElement(
89     void *parser_data,
90     const xmlChar *name
91 );
92
93 /*
94 * SAX2 support
95 */
96 static void mystartElementNs(
97     void *parser_data,
98     const xmlChar *localname,
99     const xmlChar *prefix,
100     const xmlChar *URI,
101     int nb_namespaces,
102     const xmlChar **namespaces,
103     int nb_attributes,
104     int nb_defaulted,
105     const xmlChar **attributes
106 );
107
108 static void myendElementNs(
109     void *ctx ATTRIBUTE_UNUSED,
110     const xmlChar *localname,
111     const xmlChar *prefix,
112     const xmlChar *URI
113 );
114
115 static void buffer_characters(
116     swish_ParserData *parser_data,
117     const xmlChar *ch,
118     int len
119 );
120 static void mycharacters(
121     void *parser_data,
122     const xmlChar *ch,
123     int len
124 );
125 static void mycomments(
126     void *parser_data,
127     const xmlChar *ch
128 );
129 static void myerr(
130     void *user_data,
131     xmlChar *msg,
132     ...
133 );
134
135 static void open_tag(
136     void *data,
137     const xmlChar *tag,
138     xmlChar **atts
139 );
140 static void close_tag(
141     void *data,
142     const xmlChar *tag
143 );
144 static xmlChar *bake_tag(
145     swish_ParserData *parser_data,
146     xmlChar *tag,
147     xmlChar **atts
148 );
149
150 static int docparser(
151     swish_ParserData *parser_data,
152     xmlChar *filename,
153     xmlChar *buffer,
154     int size
155 );
156 static int xml_parser(
157     xmlSAXHandlerPtr sax,
158     void *user_data,
159     xmlChar *buffer,
160     int size
161 );
162 static int html_parser(
163     xmlSAXHandlerPtr sax,
164     void *user_data,
165     xmlChar *buffer,
166     int size
167 );
168 static int txt_parser(
169     swish_ParserData *parser_data,
170     xmlChar *buffer,
171     int size
172 );
173
174 static swish_ParserData *init_parser_data(
175     swish_3 *s3
176 );
177 static void free_parser_data(
178     swish_ParserData *parser_data
179 );
180
181 /*
182 * parsing fh/buffer headers
183 */
184 typedef struct              /* header block produced by buf_to_head() and released by free_head() */
185 {
186     xmlChar **lines;        /* array of strings; presumably one entry per header line -- TODO confirm in buf_to_head() */
187     int body_start;         /* NOTE(review): looks like the position where the document body begins -- confirm units */
188     int nlines;             /* presumably the number of entries in lines -- TODO confirm */
189 } HEAD;
190
191 static HEAD *buf_to_head(
192     xmlChar *buf
193 );
194 static void free_head(
195     HEAD * h
196 );
197 static swish_DocInfo *head_to_docinfo(
198     HEAD * h
199 );
200
201 static xmlChar *document_encoding(
202     xmlParserCtxtPtr ctxt
203 );
204
205 static void set_encoding(
206     swish_ParserData *parser_data,
207     xmlChar *buffer
208 );
209
210 /* tag tracker */
211 static xmlChar *flatten_tag_stack(
212     xmlChar *baked,
213     swish_TagStack *stack,
214     char flatten_join
215 );
216 static void add_stack_to_prop_buf(
217     xmlChar *baked,
218     swish_ParserData *parser_data
219 );
220 static void push_tag_stack(
221     swish_TagStack *stack,
222     xmlChar *raw,
223     xmlChar *baked,
224     char flatten_join
225 );
226 static swish_Tag *pop_tag_stack(
227     swish_TagStack *stack
228 );
229 static swish_Tag *pop_tag_stack_on_match(
230     swish_TagStack *stack,
231     xmlChar *raw
232 );
233 static void free_swishTag(
234     swish_Tag * st
235 );
236 static void
237 free_swishTagStack(
238     swish_TagStack *stack
239 );
240
241 /***********************************************************************
242 *                end prototypes
243 ***********************************************************************/
244
245 swish_Parser *
246 swish_init_parser(
247     void (*handler) (swish_ParserData *)
248 )
249 {
250     swish_Parser *p = (swish_Parser *)swish_xmalloc(sizeof(swish_Parser));
251
252     p->handler = handler;
253     p->ref_cnt = 0;
254
255 /*
256 * libxml2 stuff
257 */
258     xmlInitParser();
259     xmlSubstituteEntitiesDefault(1);    /* resolve text entities */
260
261 /*
262 * debugging help
263 */
264     get_env_vars();
265
266     if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
267         SWISH_DEBUG_MSG("parser ptr 0x%x", (long int)p);
268     }
269
270     return p;
271 }
272
273 void
274 swish_free_parser(
275     swish_Parser *p
276 )
277 {
278     if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
279         SWISH_DEBUG_MSG("freeing parser");
280         swish_mem_debug();
281     }
282     if (p->ref_cnt != 0) {
283         SWISH_WARN("parser ref_cnt != 0: %d\n", p->ref_cnt);
284     }
285     xmlCleanupParser();
286     xmlMemoryDump();
287     swish_xfree(p);
288 }
289
290 /*
291 * turn the literal xml/html tag into a swish tag for matching against
292 * metanames and properties
293 */
294 static xmlChar *
295 bake_tag(
296     swish_ParserData *parser_data,
297     xmlChar *tag,
298     xmlChar **atts
299 )
300 {
301     int i, j, is_html_tag, size;
302     xmlChar *swishtag, *attr_lower, *attr_val_lower, *alias, *metaname, *metacontent;
303     swish_StringList *strlist;
304
305     if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
306         SWISH_DEBUG_MSG(" tag: %s   parser->tag: %s ", tag, parser_data->tag);
307         if (atts != NULL) {
308             SWISH_DEBUG_MSG(" has attributes [%d]", xmlStrlen((xmlChar *)atts));
309             for (i = 0; (atts[i] != NULL); i += 2) {
310                 SWISH_DEBUG_MSG(" att: %s=", atts[i]);
311                 if (atts[i + 1] != NULL) {
312                     SWISH_DEBUG_MSG(" '%s'", atts[i + 1]);
313                 }
314             }
315         }
316     }
317
318     metaname = NULL;
319     metacontent = NULL;
320
321 /*
322 * normalize all tags
323 */
324     swishtag = swish_str_tolower(tag);
325
326 /*
327 * html tags
328 */
329     if (parser_data->is_html) {
330
331 /*
332            TODO config features about img tags and a/href tags
333 */
334         if (xmlStrEqual(swishtag, (xmlChar *)"br")
335             || xmlStrEqual(swishtag, (xmlChar *)"img")) {
336            
337             if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
338                 SWISH_DEBUG_MSG("found html tag '%s' ... bump_word = 1", swishtag);
339             parser_data->bump_word = 1;
340         }
341         else {
342             const htmlElemDesc *element = htmlTagLookup(swishtag);
343
344             if (!element)
345                 is_html_tag = 0;        /* flag that this might be a meta * name */
346
347             else if (!element->isinline) {
348
349 /*
350 * need to bump token position so we don't match across block *
351 * elements
352 */
353                 if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
354                     SWISH_DEBUG_MSG("found html !inline tag '%s' ... bump_word = 1", swishtag);
355                 parser_data->bump_word = 1;
356
357             }
358             else {
359            
360                 if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
361                     SWISH_DEBUG_MSG("found html inline tag '%s' ... bump_word = 0", swishtag);
362                 parser_data->bump_word = 0;
363            
364             }
365         }
366
367 /*
368 * is this an HTML <meta> tag? treat 'name' attribute as a tag *
369 * and 'content' attribute as the tag content * we assume 'name'
370 * and 'content' are always in english.
371 */
372
373         if (atts != 0) {
374             for (i = 0; (atts[i] != 0); i++) {
375
376                 if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
377                     SWISH_DEBUG_MSG("%d HTML attr: %s", i, atts[i]);
378
379                 if (xmlStrEqual(atts[i], (xmlChar *)"name")) {
380
381 /*
382 * SWISH_DEBUG_MSG("found name: %s", atts[i+1]);
383 */
384                     metaname = (xmlChar *)atts[i + 1];
385                 }
386
387                 else if (xmlStrEqual(atts[i], (xmlChar *)"content")) {
388
389 /*
390 * SWISH_DEBUG_MSG("found content: %s", atts[i+1]);
391 */
392                     metacontent = (xmlChar *)atts[i + 1];
393                 }
394
395             }
396         }
397
398         if (metaname != NULL) {
399             if (metacontent != NULL) {
400                 if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
401                     SWISH_DEBUG_MSG("found HTML meta: %s => %s", metaname, metacontent);
402
403 /*
404 * do not match across metas
405 */
406                 if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
407                     SWISH_DEBUG_MSG("found html meta tag '%s' ... bump_word = 1", metaname);
408                 parser_data->bump_word = 1;
409                 open_tag(parser_data, metaname, NULL);
410                 buffer_characters(parser_data, metacontent, xmlStrlen(metacontent));
411                 close_tag(parser_data, metaname);
412                 swish_xfree(swishtag);
413                 return NULL;
414
415             }
416             else {
417                 SWISH_WARN("No content for meta tag '%s'", metaname);
418             }
419         }
420
421     }
422
423 /*
424 * xml tags
425 */
426     else {
427
428 /*
429 * TODO make this configurable ala swish2
430 */
431
432         if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
433             SWISH_DEBUG_MSG("found xml tag '%s' ... bump_word = 1", swishtag);
434         parser_data->bump_word = 1;
435
436         if (atts != NULL
437             && swish_hash_exists(parser_data->s3->config->stringlists,
438                                  (xmlChar *)SWISH_CLASS_ATTRIBUTES)) {
439             strlist =
440                 swish_hash_fetch(parser_data->s3->config->stringlists,
441                                  (xmlChar *)SWISH_CLASS_ATTRIBUTES);
442
443             for (i = 0; (atts[i] != NULL); i += 2) {
444
445                 if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
446                     SWISH_DEBUG_MSG(" %d XML attr: %s=%s [%d]", i, atts[i], atts[i + 1],
447                                     xmlStrlen(atts[i + 1]));
448
449                 attr_lower = swish_str_tolower(atts[i]);
450                 attr_val_lower = swish_str_tolower(atts[i + 1]);
451
452 /*
453                    is it one of ours?
454 */
455                 for (j = 0; j < strlist->n; j++) {
456                     if (xmlStrEqual(strlist->word[j], attr_lower)) {
457                         if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
458                             SWISH_DEBUG_MSG("found %s: %s", attr_lower, attr_val_lower);
459
460 /*  eligible attribute name */
461                         size = xmlStrlen(swishtag) + xmlStrlen(attr_val_lower) + 2;     /*  dot + NULL */
462                         metaname = swish_xmalloc(size + 1);
463                         snprintf((char *)metaname, size, "%s.%s", (char *)swishtag,
464                                  (char *)attr_val_lower);
465
466                         swish_xfree(swishtag);
467                         swishtag = metaname;
468                     }
469                 }
470
471                 swish_xfree(attr_lower);
472                 swish_xfree(attr_val_lower);
473
474             }
475         }
476
477     }
478
479 /*
480 * change our internal name for this tag if it is aliased in config
481 */
482     alias = swish_hash_fetch(parser_data->s3->config->tag_aliases, swishtag);
483     if (alias) {
484
485 /*
486 * SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias);
487 */
488         swish_xfree(swishtag);
489         swishtag = swish_xstrdup(alias);
490     }
491
492     if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
493         SWISH_DEBUG_MSG(" swishtag = %s", swishtag);
494     }
495
496     return swishtag;
497 }
498
499 static void
500 flush_buffer(
501     swish_ParserData *parser_data,
502     xmlChar *metaname,
503     xmlChar *context
504 )
505 {
506     swish_TagStack *s = parser_data->metastack;
507
508     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
509         SWISH_DEBUG_MSG("buffer is >>%s<< before flush",
510                         xmlBufferContent(parser_data->meta_buf));
511
512 /*
513 * add meta_buf as-is to metanames buffer under current tag. this
514 * gives us both tokens and raw text de-tagged but organized by
515 * metaname.
516 */
517     swish_add_buf_to_nb(parser_data->metanames, metaname, parser_data->meta_buf,
518                         (xmlChar *)SWISH_TOKENPOS_BUMPER, 0, 1);
519
520 /*
521 *  add to every metaname on the stack.
522 *  Disabling this for now, as it ought to be up the handler() to decide
523 *  to index a token under multiple metanames, and we associate context
524 *  with the TokenList
525 */
526
527     if (parser_data->s3->config->flags->context_as_meta) {
528         for (s->temp = s->head; s->temp != NULL; s->temp = s->temp->next) {
529             if (xmlStrEqual(s->temp->baked, metaname))  /*  already added */
530                 continue;
531
532             swish_add_buf_to_nb(parser_data->metanames, s->temp->baked,
533                                 parser_data->meta_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER,
534                                 0, 1);
535         }
536     }
537
538     if (parser_data->s3->analyzer->tokenize) {
539         tokenize(parser_data, (xmlChar *)xmlBufferContent(parser_data->meta_buf),
540                  xmlBufferLength(parser_data->meta_buf), metaname, context);
541     }
542
543     xmlBufferEmpty(parser_data->meta_buf);
544
545 }
546
547 /*
548 * SAX2 callback
549 */
550 static void
551 mystartDocument(
552     void *data
553 )
554 {
555
556 /*
557 * swish_ParserData *parser_data = (swish_ParserData *) data;
558 */
559
560     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
561         SWISH_DEBUG_MSG("startDocument()");
562
563 }
564
565 /*
566 * SAX2 callback
567 */
568 static void
569 myendDocument(
570     void *parser_data
571 )
572 {
573
574     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
575         SWISH_DEBUG_MSG("endDocument()");
576
577 /*
578 * whatever's left
579 */
580     flush_buffer(parser_data, (xmlChar *)SWISH_DEFAULT_METANAME,
581                  (xmlChar *)SWISH_DEFAULT_METANAME);
582
583 }
584
585 /*
586 * SAX1 callback
587 */
588 static void
589 mystartElement(
590     void *data,
591     const xmlChar *name,
592     const xmlChar **atts
593 )
594 {
595     open_tag(data, name, (xmlChar **)atts);
596 }
597
598 /*
599 * SAX1 callback
600 */
601 static void
602 myendElement(
603     void *data,
604     const xmlChar *name
605 )
606 {
607     close_tag(data, name);
608 }
609
610 /*
611 * SAX2 handler
612 */
613 static void
614 mystartElementNs(
615     void *data,
616     const xmlChar *localname,
617     const xmlChar *prefix,
618     const xmlChar *URI,
619     int nb_namespaces,
620     const xmlChar **namespaces,
621     int nb_attributes,
622     int nb_defaulted,
623     const xmlChar **attributes
624 )
625 {
626     int i, j, len;
627     xmlChar **atts;
628     atts = NULL;
629
630     if (nb_attributes > 0) {
631         atts = swish_xmalloc(((nb_attributes * 2) + 1) * sizeof(xmlChar *));
632         j = 0;
633         for (i = 0; i < nb_attributes * 5; i += 5) {
634             atts[j] = (xmlChar *)attributes[i];
635             len = (int)(attributes[i + 4] - attributes[i + 3]);
636             if (len > 0) {
637                 atts[j + 1] = xmlStrsub(attributes[i + 3], 0, len);
638             }
639             else {
640                 atts[j] = NULL;
641             }
642             j += 2;
643         }
644         atts[j] = NULL;
645     }
646
647     if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
648         SWISH_DEBUG_MSG(" tag: %s nb_attributes %d", localname, nb_attributes);
649         if (atts != NULL) {
650             for (i = 0; (atts[i] != NULL); i += 2) {
651                 SWISH_DEBUG_MSG(" att: %s=%s", atts[i], atts[i + 1]);
652 /* SWISH_DEBUG_MSG(" att: %s=", atts[i++], atts[i] || ""); */
653             }
654         }
655     }
656
657     open_tag(data, localname, atts);
658
659     if (atts != NULL) {
660         swish_xfree(atts);
661     }
662 }
663
664 /*
665 * SAX2 handler
666 */
667 static void
668 myendElementNs(
669     void *data,
670     const xmlChar *localname,
671     const xmlChar *prefix,
672     const xmlChar *URI
673 )
674 {
675     close_tag(data, localname);
676 }
677
678 static void
679 open_tag(
680     void *data,
681     const xmlChar *tag,
682     xmlChar **atts
683 )
684 {
685     swish_ParserData *parser_data;
686     xmlChar *baked;
687    
688     parser_data = (swish_ParserData *)data;
689    
690     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
691         SWISH_DEBUG_MSG("<%s>", tag);
692
693     if (parser_data->tag != NULL)
694         swish_xfree(parser_data->tag);
695
696     parser_data->tag = bake_tag(parser_data, (xmlChar *)tag, (xmlChar **)atts);
697        
698     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
699         SWISH_DEBUG_MSG("checking config for '%s' in watched tags", parser_data->tag);
700
701 /* all tags on domstack */
702
703     if (parser_data->tag == NULL) {
704         push_tag_stack(parser_data->domstack, (xmlChar *)tag, (xmlChar *)tag, SWISH_DOT);
705     }
706     else {
707         push_tag_stack(parser_data->domstack, (xmlChar *)tag, parser_data->tag, SWISH_DOT);
708     }
709    
710 /*
711 * set property if this tag is configured for it
712 */
713     if (swish_hash_exists(parser_data->s3->config->properties, parser_data->tag)
714         ||
715         swish_hash_exists(parser_data->s3->config->properties, parser_data->domstack->head->context)
716     ) {
717         if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
718             SWISH_DEBUG_MSG(" %s = new property", parser_data->tag);
719
720         add_stack_to_prop_buf(NULL, parser_data);       /* NULL means all properties in the stack are added */
721         xmlBufferEmpty(parser_data->prop_buf);
722        
723         if (swish_hash_exists(parser_data->s3->config->properties, parser_data->domstack->head->context))
724             baked = parser_data->domstack->head->context;
725         else
726             baked = parser_data->tag;
727
728         push_tag_stack(parser_data->propstack, (xmlChar *)tag, baked, SWISH_SPACE);
729
730         if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
731             SWISH_DEBUG_MSG("%s pushed ok unto propstack", parser_data->tag);
732     }
733
734 /*
735 * likewise for metastack
736 */
737     if (swish_hash_exists(parser_data->s3->config->metanames, parser_data->tag)
738         ||
739         swish_hash_exists(parser_data->s3->config->metanames, parser_data->domstack->head->context)
740     ) {
741         if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
742             SWISH_DEBUG_MSG(" %s = new metaname", parser_data->tag);
743
744         flush_buffer(parser_data, parser_data->metastack->head->baked,
745                      parser_data->metastack->head->context);
746                      
747         if (swish_hash_exists(parser_data->s3->config->properties, parser_data->domstack->head->context))
748             baked = parser_data->domstack->head->context;
749         else
750             baked = parser_data->tag;
751
752         push_tag_stack(parser_data->metastack, (xmlChar *)tag, baked, SWISH_SPACE);
753     }
754
755     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
756         SWISH_DEBUG_MSG("config check for '%s' done", parser_data->tag);
757
758 }
759
760 static void
761 close_tag(
762     void *data,
763     const xmlChar *tag
764 )
765 {
766     swish_ParserData *parser_data;
767     swish_Tag *st;
768
769     parser_data = (swish_ParserData *)data;
770
771     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
772         SWISH_DEBUG_MSG("</%s>", tag);
773        
774 /*
775 * lowercase all names for comparison against metanames (which are
776 * also * lowercased)
777 */
778     if (parser_data->tag != NULL)
779         swish_xfree(parser_data->tag);
780
781     parser_data->tag = bake_tag(parser_data, (xmlChar *)tag, NULL);
782
783     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
784         SWISH_DEBUG_MSG(" endElement(%s) (%s)", (xmlChar *)tag, parser_data->tag);
785        
786     if (parser_data->tag == NULL)
787         return;
788
789     if ((st = pop_tag_stack_on_match(parser_data->propstack, (xmlChar *)tag)) != NULL) {
790
791         add_stack_to_prop_buf(st->baked, parser_data);
792         xmlBufferEmpty(parser_data->prop_buf);
793         free_swishTag(st);
794     }
795
796     if ((st = pop_tag_stack_on_match(parser_data->metastack, (xmlChar *)tag)) != NULL) {
797
798         flush_buffer(parser_data, st->baked, st->context);
799         free_swishTag(st);
800     }
801    
802     // always pop the raw domstack
803     st = pop_tag_stack(parser_data->domstack);
804     free_swishTag(st);
805
806 }
807
808 /*
809 * handle all characters in doc
810 */
811 static void
812 buffer_characters(
813     swish_ParserData *parser_data,
814     const xmlChar *ch,
815     int len
816 )
817 {
818     int i;
819     xmlChar output[len];
820     xmlBufferPtr buf = parser_data->meta_buf;
821
822 /*
823 * why not wchar_t ? len is number of bytes, not number of
824 * characters, so xmlChar (i.e., char) works
825 */
826
827 /*
828 * SWISH_DEBUG_MSG( "sizeof output buf is %d; len was %d\n", sizeof(output),
829 * len );
830 */
831
832 /*
833 * SWISH_DEBUG_MSG( "characters");
834 */
835
836     for (i = 0; i < len; i++) {
837         output[i] = ch[i];
838     }
839     output[i] = (xmlChar)NULL;
840
841     if (parser_data->bump_word && xmlBufferLength(buf)) {
842         swish_append_buffer(buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1);
843     }
844    
845     swish_append_buffer(buf, output, len);
846
847     if (parser_data->bump_word && xmlBufferLength(parser_data->prop_buf)) {
848         swish_append_buffer(parser_data->prop_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER, 1);
849     }
850     else if (xmlBufferLength(parser_data->prop_buf)) {
851         swish_append_buffer(parser_data->prop_buf, (xmlChar*)" ", 1);
852     }
853
854     swish_append_buffer(parser_data->prop_buf, output, len);
855 }
856
857 /*
858 * SAX2 callback
859 */
860 static void
861 mycharacters(
862     void *parser_data,
863     const xmlChar *ch,
864     int len
865 )
866 {
867     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
868         SWISH_DEBUG_MSG(" >> mycharacters()");
869
870     buffer_characters(parser_data, ch, len);
871 }
872
873 /*
874 * SAX2 callback
875 */
876 static void
877 mycomments(
878     void *parser_data,
879     const xmlChar *ch
880 )
881 {
882     int len = strlen((char *)(char *)ch);
883
884 /*
885 * TODO: make comments indexing optional
886 */
887
888 /*
889 * TODO: enable noindex option
890 */
891     return;
892
893     buffer_characters(parser_data, ch, len);
894 }
895
896 /*
897 * SAX2 callback
898 */
899 static void
900 myerr(
901     void *data,
902     xmlChar *msg,
903     ...
904 )
905 {
906     swish_ParserData *parser_data;
907     va_list args;
908     char str[1000];
909
910     if (!SWISH_PARSER_WARNINGS)
911         return;
912
913     parser_data = (swish_ParserData *)data;
914
915     SWISH_WARN("libxml2 error for %s:", parser_data->docinfo->uri);
916
917     va_start(args, msg);
918     vsnprintf((char *)str, 1000, (char *)msg, args);
919     xmlParserError(parser_data->ctxt, (char *)str);
920     va_end(args);
921 }
922
923 /*
924 * SAX2 callback
925 */
926 static void
927 mywarn(
928     void *user_data,
929     xmlChar *msg,
930     ...
931 )
932 {
933     swish_ParserData *parser_data;
934     va_list args;
935     char str[1000];
936
937     if (!SWISH_PARSER_WARNINGS)
938         return;
939
940     parser_data = (swish_ParserData *)user_data;
941
942     SWISH_WARN("libxml2 warning for %s:", parser_data->docinfo->uri);
943
944     va_start(args, msg);
945     vsnprintf((char *)str, 1000, (char *)msg, args);
946     xmlParserWarning(parser_data->ctxt, (char *)str);
947     va_end(args);
948 }
949
950 /*
951 * SAX2 handler struct for html and xml parsing
952 */
953
954 xmlSAXHandler my_parser = {
955     NULL,                       /* internalSubset */
956     NULL,                       /* isStandalone */
957     NULL,                       /* hasInternalSubset */
958     NULL,                       /* hasExternalSubset */
959     NULL,                       /* resolveEntity */
960     NULL,                       /* getEntity */
961     NULL,                       /* entityDecl */
962     NULL,                       /* notationDecl */
963     NULL,                       /* attributeDecl */
964     NULL,                       /* elementDecl */
965     NULL,                       /* unparsedEntityDecl */
966     NULL,                       /* setDocumentLocator */
967     mystartDocument,            /* startDocument */
968     myendDocument,              /* endDocument */
969     mystartElement,             /* startElement */
970     myendElement,               /* endElement */
971     NULL,                       /* reference */
972     mycharacters,               /* characters */
973     NULL,                       /* ignorableWhitespace */
974     NULL,                       /* processingInstruction */
975     mycomments,                 /* comment */
976     (warningSAXFunc) & mywarn,  /* xmlParserWarning */
977     (errorSAXFunc) & myerr,     /* xmlParserError */
978     (fatalErrorSAXFunc) & myerr,        /* xmlfatalParserError */
979     NULL,                       /* getParameterEntity */
980     NULL,                       /* cdataBlock -- should we handle this too *
981                                  * ?? */
982     NULL,                       /* externalSubset; */
983     XML_SAX2_MAGIC,
984     NULL,
985     mystartElementNs,           /* startElementNs */
986     myendElementNs,             /* endElementNs */
987     NULL                        /* xmlStructuredErrorFunc */
988 };
989
990 xmlSAXHandlerPtr my_parser_ptr = &my_parser;
991
992 static int
993 docparser(
994     swish_ParserData *parser_data,
995     xmlChar *filename,
996     xmlChar *buffer,
997     int size
998 )
999 {
1000
1001     int ret;
1002     xmlChar *mime = (xmlChar *)parser_data->docinfo->mime;
1003     xmlChar *parser = (xmlChar *)parser_data->docinfo->parser;
1004
1005     if (!size && !xmlStrlen(buffer) && !parser_data->docinfo->size) {
1006         SWISH_WARN("%s appears to be empty -- can't parse it", parser_data->docinfo->uri);
1007
1008         return 1;
1009     }
1010
1011     if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
1012         SWISH_DEBUG_MSG("%s -- using %s parser", parser_data->docinfo->uri, parser);
1013
1014 /*
1015 * slurp file if not already in memory
1016 */
1017     if (filename && !buffer) {
1018         buffer = swish_slurp_file_len(filename, (long)parser_data->docinfo->size);
1019         size = parser_data->docinfo->size;
1020     }
1021
1022     if (parser[0] == 'H') {
1023         parser_data->is_html = 1;
1024         ret = html_parser(my_parser_ptr, parser_data, buffer, size);
1025     }
1026
1027     else if (parser[0] == 'X')
1028         ret = xml_parser(my_parser_ptr, parser_data, buffer, size);
1029
1030     else if (parser[0] == 'T')
1031         ret = txt_parser(parser_data, (xmlChar *)buffer, size);