Changeset 530
- Timestamp:
- 07/26/01 10:40:48 (7 years ago)
- Files:
-
- trunk/swish-e/MANIFEST (modified) (2 diffs)
- trunk/swish-e/doc/bin/toc_file (modified) (previous)
- trunk/swish-e/example/swish.cgi (modified) (13 diffs)
- trunk/swish-e/pod/CHANGES.pod (modified) (1 diff)
- trunk/swish-e/prog-bin/spider.pl (modified) (3 diffs)
- trunk/swish-e/src/Makefile.in (modified) (5 diffs)
- trunk/swish-e/src/docprop.c (modified) (2 diffs)
- trunk/swish-e/src/expat (added)
- trunk/swish-e/src/expat/COPYING (added)
- trunk/swish-e/src/expat/xmlparse (added)
- trunk/swish-e/src/expat/xmlparse/xmlparse.c (added)
- trunk/swish-e/src/expat/xmlparse/xmlparse.h (added)
- trunk/swish-e/src/expat/xmltok (added)
- trunk/swish-e/src/expat/xmltok/ascii.h (added)
- trunk/swish-e/src/expat/xmltok/asciitab.h (added)
- trunk/swish-e/src/expat/xmltok/iasciitab.h (added)
- trunk/swish-e/src/expat/xmltok/latin1tab.h (added)
- trunk/swish-e/src/expat/xmltok/nametab.h (added)
- trunk/swish-e/src/expat/xmltok/utf8tab.h (added)
- trunk/swish-e/src/expat/xmltok/xmldef.h (added)
- trunk/swish-e/src/expat/xmltok/xmlrole.c (added)
- trunk/swish-e/src/expat/xmltok/xmlrole.h (added)
- trunk/swish-e/src/expat/xmltok/xmltok.c (added)
- trunk/swish-e/src/expat/xmltok/xmltok.h (added)
- trunk/swish-e/src/expat/xmltok/xmltok_impl.c (added)
- trunk/swish-e/src/expat/xmltok/xmltok_impl.h (added)
- trunk/swish-e/src/expat/xmltok/xmltok_ns.c (added)
- trunk/swish-e/src/index.c (modified) (2 diffs)
- trunk/swish-e/src/parse_conffile.c (modified) (3 diffs)
- trunk/swish-e/src/xml.c (modified) (2 diffs)
- trunk/swish-e/src/xml.h (modified) (1 diff)
- trunk/swish-e/tests/test.config (modified) (1 diff)
- trunk/swish-e/tests/test.xml (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/swish-e/MANIFEST
r456 r530 113 113 src/db_native.c 114 114 src/db_native.h 115 src/db_berkeley_db.c 116 src/db_berkeley_db.h 115 117 src/dump.c 116 118 src/dump.h … … 121 123 src/error.c 122 124 src/error.h 125 src/expat/COPYING 126 src/expat/xmlparse/xmlparse.c 127 src/expat/xmlparse/xmlparse.h 128 src/expat/xmltok/ascii.h 129 src/expat/xmltok/asciitab.h 130 src/expat/xmltok/iasciitab.h 131 src/expat/xmltok/latin1tab.h 132 src/expat/xmltok/nametab.h 133 src/expat/xmltok/utf8tab.h 134 src/expat/xmltok/xmldef.h 135 src/expat/xmltok/xmlrole.c 136 src/expat/xmltok/xmlrole.h 137 src/expat/xmltok/xmltok.c 138 src/expat/xmltok/xmltok.h 139 src/expat/xmltok/xmltok_impl.c 140 src/expat/xmltok/xmltok_impl.h 141 src/expat/xmltok/xmltok_ns.c 123 142 src/extprog.c 124 143 src/extprog.h trunk/swish-e/example/swish.cgi
r513 r530 1 1 #!/usr/local/bin/perl -w 2 use strict; 3 use lib '.'; 4 use CGI; 5 use Symbol; 2 6 3 7 4 #################################################################################### … … 34 31 # 35 32 #################################################################################### 36 37 use vars qw/%CONFIG $NotAWord/; 38 39 33 package SwishSearch; 34 use strict; 35 use lib '.'; 36 use CGI; 37 use Symbol; 38 39 use vars qw/$NotAWord/; 40 41 # Run the script 42 handler() unless $ENV{MOD_PERL}; 43 44 45 #================================================================================== 46 47 # This is written this was so the script can be used as a CGI script or a mod_perl 48 # module without any code changes. 49 50 sub handler { 51 my $r = shift; 52 53 54 # Taint issues 55 $ENV{PATH} = '/usr/bin'; # For taint checking 56 57 40 58 ##### Configuration Parameters ######### 41 42 43 %CONFIG = ( 59 60 my %CONFIG = ( 44 61 title => 'Search the Swish-e list', # Title of your choice. 45 62 swish_binary => './swish-e', # Location of swish-e binary … … 76 93 # property specified above as the description_prop (normally, 'swishdescription'). 77 94 78 95 79 96 highlight_words => 1, # enable highlighting 80 97 show_words => 12, # Number of swish words+non-swish words to show around highlighted word … … 85 102 highlight_on => '<font style="background:#FFFF99">', 86 103 highlight_off => '</font>', 87 104 88 105 89 106 # Property names listed here will be displayed in a table below each result … … 148 165 149 166 150 # disable "date_ranges" in the above example 167 # disable "date_ranges" in the above example -- need additional module for that feature 151 168 delete $CONFIG{date_ranges}; 152 169 … … 161 178 162 179 process_request( \%CONFIG ); 180 181 return Apache::Constants::OK() if $ENV{MOD_PERL}; 182 } 163 183 164 184 #============================================================================ … … 409 429 # 410 430 431 my $mod_perl = $ENV{MOD_PERL} 432 ? 
'<br><small>Response brought to you by <em>MOD_PERL</em> <a href="http://perl.apache.org">perl.apache.org</a></small>' 433 : ''; 434 411 435 sub footer { 412 436 return <<EOF; … … 414 438 <hr> 415 439 <small>Powered by <em>Swish-e</em> <a href="http://swish-e.org">swish-e.org</a></small> 440 $mod_perl 416 441 </body> 417 442 </html> … … 920 945 921 946 922 return ( 947 948 return $ENV{MOD_PERL} 949 ? ( 950 qr/([^$wc]+)/, # regexp for splitting into swish-words 951 qr/^$ignoref([$wc]+?)$ignorel$/i, # regexp for extracting out the words to compare 952 qr/^(?:$match_string)$/, # regexp for comparing extracted words to query 953 # Must force lower case before testing 954 ) 955 956 : ( 923 957 qr/([^$wc]+)/o, # regexp for splitting into swish-words 924 qr/^$ignoref([$wc]+?)$ignorel$/ io, # regexp for extracting out the words to compare958 qr/^$ignoref([$wc]+?)$ignorel$/oi, # regexp for extracting out the words to compare 925 959 qr/^(?:$match_string)$/o, # regexp for comparing extracted words to query 926 960 # Must force lower case before testing … … 1239 1273 } 1240 1274 1275 1; 1276 1241 1277 1242 1278 __END__ … … 1263 1299 Due to the forking nature of this program and its use of signals, 1264 1300 this script probably will not run under Windows without some modifications. 1301 1302 This script can be run under mod_perl. See below for details. 1265 1303 1266 1304 =head1 INSTALLATION … … 1466 1504 description_prop=> 'swishdescription', 1467 1505 highlight_words => 1, 1468 1469 1506 ); 1470 1507 … … 1478 1515 1479 1516 http://www.myserver.name/cgi-bin/swish.cgi 1517 1518 =head1 MOD_PERL 1519 1520 This script can be run under MOD_PERL. This will improve the response time of the 1521 script compared to running under CGI. 1522 1523 Configuration is simple. In your httpd.conf or your startup.pl file you need to 1524 load the script. 
For example, in httpd.conf you can use a perl section: 1525 1526 <perl> 1527 use lib '/usr/local/apache/cgi-bin'; 1528 require "swish.cgi"; 1529 </perl> 1530 1531 This loads the script into mod_perl. Then to configure the script to run: 1532 1533 <location /search> 1534 allow from all 1535 SetHandler perl-script 1536 PerlHandler SwishSearch 1537 </location> 1538 1539 Unlike CGI, mod_perl does not change dir to the location of the perl module, so 1540 your settings for the swish binary and the path to your index files must be absolute 1541 paths (or relative to the server root). 1542 1543 Please post to the swish-e discussion list if you have any questions about running this 1544 script under mod_perl. 1545 1480 1546 1481 1547 =head1 DEBUGGING trunk/swish-e/pod/CHANGES.pod
r527 r530 146 146 for more information. 147 147 148 =item * String properties are concatenated 149 150 Multiple I<string> properties of the same name in a document are now 151 concatenated into one property. A space character is added between the strings 152 if needed. A warning will be generated if multiple numeric or date properties are 153 found in the same document, and the additional properties will be ignored. 154 155 Previously, properties of the same name were added to the index, but could not be 156 retrieved. 157 158 To do: remove the C<next> pointer, and allow a user-defined character to be placed between 159 properties. 160 161 =item * New XML Parser 162 163 Swish now uses James Clark's expat XML parser library. 164 148 165 =back 149 166 trunk/swish-e/prog-bin/spider.pl
r381 r530 163 163 164 164 165 eval { process_link( $server, $uri ) };165 eval { process_link( $server, $uri->canonical ) }; 166 166 print STDERR $@ if $@; 167 167 … … 191 191 #----------- Process a url and recurse ----------------------- 192 192 sub process_link { 193 my ( $server, $uri ) = @_; 193 my ( $server, $url ) = @_; 194 195 my $uri = URI->new( $url ); 194 196 195 197 die if $abort || $server->{abort}; … … 472 474 $u->authority( $server->{authority} ); # Force all the same host name 473 475 474 push @links, $u; 476 my $z = $u->as_string; 477 478 push @links, $z; 475 479 print STDERR qq[ ++ <$tag $_="$u"> Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS; 476 480 $found++; trunk/swish-e/src/Makefile.in
r455 r530 25 25 libdir = @libdir@ 26 26 27 # Flags for C compiler28 CFLAGS = -Wall @CFLAGS@ @DEFS@ -DSWISH_VERSION=\"$(VERSION)\"29 27 30 28 … … 35 33 ARFLAGS = cr 36 34 RANLIB = @RANLIB@ 35 37 36 38 37 # … … 46 45 WEB_OBJS =$(HTTP_OBJS) 47 46 47 48 # James Clark's Expat 49 50 XPDIR = expat 51 52 XP_OBJ = $(XPDIR)/xmltok/xmltok.o \ 53 $(XPDIR)/xmltok/xmlrole.o \ 54 $(XPDIR)/xmlparse/xmlparse.o 55 56 XP_INC = -I$(XPDIR)/xmltok -I$(XPDIR)/xmlparse 57 58 XP_H = $(XPDIR)/xmltok/*.h $(XPDIR)/xmlparse/*.h 59 60 # Flags for C compiler 61 CFLAGS = -Wall @CFLAGS@ @DEFS@ -DSWISH_VERSION=\"$(VERSION)\" $(XP_INC) 62 63 64 48 65 OBJS= check.o file.o index.o search.o error.o methods.o\ 49 66 hash.o list.o mem.o string.o merge.o swish2.o stemmer.o \ 50 67 soundex.o docprop.o compress.o xml.o txt.o html.o\ 51 68 metanames.o result_output.o parse_conffile.o result_sort.o\ 52 filter.o lst.osearch_alt.o keychar_out.o date_time.o \69 filter.o search_alt.o keychar_out.o date_time.o \ 53 70 extprog.o entities.o no_better_place_module.o \ 54 71 db.o dump.o db_native.o swish_words.o proplimit.o swish_qsort.o \ 55 72 ramdisk.o \ 56 $(FILESYSTEM_OBJS) $(HTTP_OBJS) 73 $(FILESYSTEM_OBJS) $(HTTP_OBJS) $(XP_OBJ) 57 74 58 .SUFFIXES:59 .SUFFIXES : .o .c60 .c.o :61 $(CC) $(CFLAGS) -c $<62 75 63 .SUFFIXES : .a .c 64 .c.a : 65 $(CC) $(CFLAGS) -c $< 66 $(AR) $(ARFLAGS) $@ $*.o 76 #.SUFFIXES: 77 #.SUFFIXES : .o .c 67 78 79 .c.o: 80 $(CC) $(CFLAGS) -c -o $@ $< 81 68 82 all: $(NAME) swish-search 69 83 70 $(NAME): $(OBJS)libswish-e.a swish.o84 $(NAME): libswish-e.a swish.o 71 85 $(CC) -o $@ $(CFLAGS) swish.o libswish-e.a $(LIBS) 72 86 chmod 755 $@ 73 87 74 88 libswish-e.a: $(OBJS) 75 $(AR) $(ARFLAGS) libswish-e.a$(OBJS)89 $(AR) $(ARFLAGS) $@ $(OBJS) 76 90 $(RANLIB) libswish-e.a 77 91 … … 80 94 81 95 clean: 82 rm -f ../tests/*.index ../tests/core ./core ./swish-e ./swish-search ./*.o ./index.swish ./libswish-e.a96 rm -f ../tests/*.index ../tests/core core swish-e swish-search swish.o $(OBJS) libswish-e.a 
83 97 84 98 realclean: clean … … 126 140 *.c: *.h 127 141 142 $(XPDIR)/xmltok/*.o: $(XPDIR)/xmltok/*.h 143 $(XPDIR)/xmlparse/*.o: $(XPDIR)/xmlparse/*.h 128 144 145 trunk/swish-e/src/docprop.c
r528 r530 597 597 } 598 598 599 /******************************************************************* 600 * Appends a string onto a current property 601 * 602 * Call with: 603 * *propEntry 604 * *string 605 * length of string 606 * 607 *******************************************************************/ 608 propEntry *append_property( struct metaEntry *meta_entry, propEntry *p, char *str, int length ) 609 { 610 int newlen = p->propLen + length; 611 char *new_str = emalloc( newlen + 2 ); 612 propEntry *new_prop; 613 int error_flag; 614 int i,j; 615 616 /* Join the two strings */ 617 for (i=0,j=0; i < p->propLen; i++ ) 618 new_str[j++] = p->propValue[i]; 619 620 new_str[j++] = ' '; 621 622 for (i=0; i < length; i++) 623 if ( !isspace( str[i] ) ) 624 break; 625 626 for (; i < length; i++) 627 new_str[j++] = str[i]; 628 629 new_str[j++] = '\0'; 630 631 632 new_prop = CreateProperty( meta_entry, new_str, j, 0, &error_flag ); 633 634 635 freeProperty( p ); 636 efree( new_str ); 637 return new_prop; 638 } 639 640 641 599 642 600 643 /******************************************************************* … … 650 693 dp->n = meta_entry->metaID + 1; 651 694 } 695 } 696 697 /* Un-encoded STRINGS get appended to existing properties */ 698 if ( dp->propEntry[meta_entry->metaID] && !preEncoded ) 699 { 700 if ( is_meta_string(meta_entry) ) 701 { 702 dp->propEntry[meta_entry->metaID] = append_property( meta_entry, dp->propEntry[meta_entry->metaID], propValue, propLen ); 703 return 1; 704 } 705 else // Will this come back and bite me? 706 { 707 progwarn("Warning: Attempt to add duplicate property." ); 708 return 0; 709 } 652 710 } 653 711 trunk/swish-e/src/index.c
r517 r530 126 126 #include "xml.h" 127 127 #include "txt.h" 128 #include "lst.h"128 // #include "lst.h" 129 129 #include "metanames.h" 130 130 #include "result_sort.h" … … 439 439 break; 440 440 441 /* 441 442 case LST: 442 443 strcpy(strType,"LST"); 443 444 countwords = countwords_LST; 444 445 break; 446 */ 445 447 446 448 case WML: trunk/swish-e/src/parse_conffile.c
r517 r530 209 209 if (strcasecmp(w0, "IndexComments") == 0) 210 210 { 211 if (sl->n == 2) 212 { 213 sw->indexComments = atoi(sl->word[1]); 214 } 215 else 216 progerr("%s: IndexComments requires one value", w0); 217 211 sw->indexComments = getYesNoOrAbort(sl, 1, 1); 218 212 continue; 219 213 } … … 653 647 } 654 648 655 649 650 /* $$$ this needs fixing */ 656 651 if (strcasecmp(w0, "StoreDescription") == 0) 657 652 { … … 852 847 if (n < sl->n) 853 848 { 854 if (!strcasecmp(sl->word[n], "yes") )849 if (!strcasecmp(sl->word[n], "yes") || !strcasecmp(sl->word[n], "1") ) 855 850 return 1; 856 if (!strcasecmp(sl->word[n], "no")) 851 852 if (!strcasecmp(sl->word[n], "no") || !strcasecmp(sl->word[n], "0")) 857 853 return 0; 858 854 } 859 progerr("%s requires as %d. parameter\"Yes\" or \"No\" value", sl->word[0], n);855 progerr("%s requires parameter #%d of \"Yes\" or \"No\" value", sl->word[0], n); 860 856 return 0; 861 857 } trunk/swish-e/src/xml.c
r518 r530 1 1 /* 2 2 $Id$ 3 ** 4 ** 5 ** This program and library is free software; you can redistribute it and/or 6 ** modify it under the terms of the GNU (Library) General Public License 7 ** as published by the Free Software Foundation; either version 2 8 ** of the License, or any later version. 9 ** 10 ** This program is distributed in the hope that it will be useful, 11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 ** GNU (Library) General Public License for more details. 3 14 ** 4 15 ** … … 6 17 ** was: compatibility issue to v 1.x.x 7 18 ** 2001-05-09 rasc entities changed (new module) 19 ** 20 ** 2001-07-25 moseley complete rewrite to use James Clark's Expat parser 8 21 */ 9 22 10 23 #include "swish.h" 11 #include "xml.h"12 #include "html.h"13 24 #include "merge.h" 14 25 #include "mem.h" 15 26 #include "string.h" 16 #include "check.h"17 27 #include "docprop.h" 18 28 #include "error.h" 19 #include "compress.h"20 29 #include "index.h" 21 #include "file.h"22 30 #include "metanames.h" 23 #include "entities.h" 24 25 struct metaEntry *getXMLField(IndexFILE * indexf, char *tag, int applyautomaticmetanames, int verbose, int OkNoMeta, char **parsed_tag, 26 char *filename) 27 { 28 unsigned char *temp, 29 *temp2, 30 c; 31 int isendtag; 32 struct metaEntry *e; 33 34 temp = (unsigned char *) tag; 35 36 if (!temp) 37 return NULL; 38 39 /* Get to the beginning of the word disreguarding blanks */ 40 while (*temp) 41 { 42 if (isspace((int) (*(unsigned char *) temp))) 43 temp++; 44 else 45 break; 46 } 47 48 /* Are we at the start or end tag ? 
*/ 49 if (*temp == '/') 50 { 51 temp++; 52 isendtag = 1; 53 } 31 32 #include "xmlparse.h" // James Clark's Expat 33 34 typedef struct { 35 int *metas; // array of current metaIDs in use 36 int meta_cnt; // current end pointer + 1 37 int metasize; 38 39 int *props; // array of current propIDs in use 40 int prop_cnt; 41 int propsize; 42 43 char *buffer; // text buffer for summary 44 int buffmax; // size of buffer 45 int buffend; // end of buffer. 46 struct metaEntry *summeta; // summary metaEntry 47 48 char *ignore_tag; // tag that triggered ignore (currently used for both) 49 int total_words; 50 int word_pos; 51 int filenum; 52 XML_Parser *parser; 53 INDEXDATAHEADER *header; 54 SWISH *sw; 55 FileProp *fprop; 56 struct file *thisFileEntry; 57 58 } PARSE_DATA; 59 60 61 /* Prototypes */ 62 int _countwords_XML(SWISH * sw, FileProp * fprop, char *buffer, int start, int size); 63 static void start_hndl(void *data, const char *el, const char **attr); 64 static void end_hndl(void *data, const char *el); 65 static void char_hndl(void *data, const char *txt, int txtlen); 66 static void comment_hndl(void *data, const char *txt); 67 static char *isIgnoreMetaName(SWISH * sw, char *tag); 68 static void add_meta( PARSE_DATA *parse_data, struct metaEntry *m ); 69 static void add_prop( PARSE_DATA *parse_data, struct metaEntry *m ); 70 static void append_summary_text( PARSE_DATA *parse_data, char *buf, int len); 71 static void write_summary( PARSE_DATA *parse_data ); 72 73 74 75 int countwords_XML(SWISH * sw, FileProp * fprop, char *buffer) 76 { 77 return _countwords_XML(sw, fprop, buffer, 0, fprop->fsize); 78 } 79 80 81 /********************************************************************* 82 * Entry to index an XML file. 
83 * 84 * Creates an XML_Parser object and parses buffer 85 * 86 * Returns: 87 * Count of words indexed 88 * 89 * 90 *********************************************************************/ 91 92 int _countwords_XML(SWISH *sw, FileProp *fprop, char *buffer, int start, int size) 93 { 94 PARSE_DATA parse_data; 95 XML_Parser p = XML_ParserCreate(NULL); 96 IndexFILE *indexf = sw->indexlist; 97 struct MOD_Index *idx = sw->Index; 98 99 /* I have no idea why addtofilelist doesn't do this! */ 100 idx->filenum++; 101 102 /* Set defaults */ 103 memset(&parse_data, 0, sizeof(parse_data)); 104 105 parse_data.header = &indexf->header; 106 parse_data.parser = p; 107 parse_data.sw = sw; 108 parse_data.fprop = fprop; 109 parse_data.filenum = idx->filenum; 110 parse_data.word_pos= 1; /* compress doesn't like zero */ 111 112 113 addtofilelist(sw, indexf, fprop->real_path, &(parse_data.thisFileEntry) ); 114 addCommonProperties(sw, indexf, fprop->mtime, NULL,NULL, start, size); 115 116 117 118 if (!p) 119 progerr("Failed to create XML parser object for '%s'", fprop->real_path ); 120 121 122 123 /* allocate some space */ 124 parse_data.propsize = parse_data.metasize = parse_data.header->metaCounter + 100; 125 parse_data.props = (int *) emalloc( sizeof( int *) * parse_data.propsize ); 126 parse_data.metas = (int *) emalloc( sizeof( int *) * parse_data.metasize ); 127 128 129 /* Set event handlers */ 130 XML_SetUserData( p, (void *)&parse_data ); // local data to pass around 131 XML_SetElementHandler(p, start_hndl, end_hndl); 132 XML_SetCharacterDataHandler(p, char_hndl); 133 134 if( sw->indexComments ) 135 XML_SetCommentHandler( p, comment_hndl ); 136 137 //XML_SetProcessingInstructionHandler(p, proc_hndl); 138 139 if ( !XML_Parse(p, buffer, size, 1) ) 140 progwarn("XML parse error in file '%s' line %d. 
Error: %s", 141 fprop->real_path, XML_GetCurrentLineNumber(p),XML_ErrorString(XML_GetErrorCode(p))); 142 143 144 /* clean up */ 145 XML_ParserFree(p); 146 147 return parse_data.total_words; 148 } 149 150 /********************************************************************* 151 * Start Tag Event Handler 152 * 153 * These routines check to see if a given meta tag should be indexed 154 * and if the tags should be added as a property 155 * 156 * To Do: 157 * deal with attributes! 158 * 159 *********************************************************************/ 160 161 162 static void start_hndl(void *data, const char *el, const char **attr) 163 { 164 PARSE_DATA *parse_data = (PARSE_DATA *)data; 165 struct metaEntry *m; 166 SWISH *sw = parse_data->sw; 167 char *tag = estrdup( (char *)el ); 168 struct StoreDescription *stordesc = parse_data->fprop->stordesc; 169 170 strtolower( tag ); 171 172 173 /* Check for store description */ 174 if ( stordesc && !parse_data->buffer && ( strcmp(stordesc->field, tag) == 0) 175 && (parse_data->summeta = getPropNameByName(parse_data->header, AUTOPROPERTY_SUMMARY) )) 176 { 177 parse_data->buffmax = stordesc->size < RD_BUFFER_SIZE 178 ? 
stordesc->size 179 : RD_BUFFER_SIZE; 180 181 parse_data->buffer = (char *) emalloc( parse_data->buffmax + 1 ); 182 parse_data->buffend = 0; 183 } 184 185 186 187 /* return if within an ignore block */ 188 if ( parse_data->ignore_tag ) 189 return; 190 191 /* Bump on all meta names, unless overridden */ 192 /* Done before the ignore tag check since still need to bump */ 193 194 if (!isDontBumpMetaName(sw, tag)) 195 parse_data->word_pos++; 196 197 198 /* check for ignore tag (should propably remove char handler for speed) */ 199 if ( (parse_data->ignore_tag = isIgnoreMetaName( sw, tag ))) 200 return; 201 202 203 204 205 /* Check for metaNames */ 206 207 if ( (m = getMetaNameByName( parse_data->header, tag)) ) 208 add_meta( parse_data, m ); 209 54 210 else 55 isendtag = 0; 56 57 /* XML is case sensitive - Do not convert to lowercase !!! */ 58 /* Jump spaces */ 59 for (temp2 = temp; *temp2 && !isspace((int) (*(unsigned char *) temp2)); temp2++); 60 61 if (temp == temp2) 62 return NULL; 63 64 /* Check for empty xml tag . Eg: <mytag/> */ 65 if (!isendtag && (*(temp2 - 1)) == '/') 66 return NULL; 67 68 c = *temp2; 69 *temp2 = '\0'; 70 71 /* Go lowercase as discussed even if we are in xml */ 72 /* Use Rainer's routine */ 73 strtolower(temp); 74 75 *parsed_tag = estrdup(temp); 76 77 if ((e = getMetaNameByName(&indexf->header, temp))) 78 { 79 *temp2 = c; 80 return e; 81 } 82 83 84 if (applyautomaticmetanames && temp && *temp) 85 { 86 if (verbose) 87 printf("Adding automatic MetaName '%s' found in file '%s'\n", temp, filename); 88 89 return addMetaEntry(&indexf->header, temp, META_INDEX, 0); 90 } 91 92 93 if (!OkNoMeta) 94 progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", temp, filename); 95 96 *temp2 = c; 97 return NULL; 98 99 } 100 101 102 /* Indexes all the words in a XML file and adds the appropriate information 103 ** to the appropriate structures. 
104 */ 105 106 int countwords_XML(SWISH * sw, FileProp * fprop, char *buffer) 107 { 108 return _countwords_XML(sw, fprop, buffer, 0, fprop->fsize); 109 } 110 111 int _countwords_XML(SWISH * sw, FileProp * fprop, char *buffer, int start, int size) 112 { 113 int ftotalwords; 114 int *metaID; 115 int metaIDlen; 116 int positionMeta; /* Position of word in file */ 117 int position_no_meta = 1; /* Counter for words in file (excluding metanames) */ 118 int position_meta = 1; /* Counter for words in metanames */ 119 int currentmetanames; 120 unsigned char *newp, 121 *p, 122 *tag, 123 *endtag = NULL, 124 *tempprop; 125 int structure; 126 struct file *thisFileEntry = NULL; 127 struct metaEntry *metaNameXML, 128 *metaNameXML2; 129 int i; 130 IndexFILE *indexf = sw->indexlist; 131 struct MOD_Index *idx = sw->Index; 132 char *summary = NULL; 133 int in_junk = 0; 134 135 idx->filenum++; 136 137 if (fprop->stordesc) 138 summary = parseXmlSummary(buffer, fprop->stordesc->field, fprop->stordesc->size); 139 140 addtofilelist(sw, indexf, fprop->real_path, &thisFileEntry); 141 addCommonProperties(sw, indexf, fprop->mtime, "", summary, start, size); 142 143 144 /* Init meta info */ 145 metaID = (int *) emalloc((metaIDlen = 1) * sizeof(int)); 146 147 currentmetanames = ftotalwords = 0; 148 structure = IN_FILE | IN_META; /* Assume everything is within a meta tag for xml? 
*/ 149 metaID[0] = 1; 150 positionMeta = 1; 151 152 for (p = buffer; p && *p;) 153 { 154 if ((tag = strchr(p, '<'))) 155 { /* Look for '<' */ 156 /* Index up to the tag */ 157 *tag++ = '\0'; 158 if ((currentmetanames || (!currentmetanames && !sw->ReqMetaName)) && !in_junk) 211 { 212 if (sw->applyautomaticmetanames) 213 { 214 if (sw->verbose) 215 printf("Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path); 216 217 add_meta( parse_data, addMetaEntry( parse_data->header, tag, META_INDEX, 0)); 218 } 219 220 221 /* If set to "error" on undefined meta tags, then error */ 222 if (!sw->OkNoMeta) 223 progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path); 224 } 225 226 227 /* Check property names */ 228 229 if ( (m = getPropNameByName( parse_data->header, tag)) ) 230 add_prop( parse_data, m ); 231 232 233 /* Check for store description */ 234 235 236 efree( tag ); 237 } 238 239 /* kind of ugly duplication */ 240 static void add_meta( PARSE_DATA *parse_data, struct metaEntry *m ) 241 { 242 if ( parse_data->meta_cnt >= parse_data->metasize ) 243 { 244 parse_data->metasize += 100; 245 parse_data->metas = (int *) erealloc( parse_data->metas, sizeof(int *) * parse_data->metasize); 246 } 247 parse_data->metas[ parse_data->meta_cnt++ ] = m->metaID; 248 } 249 250 static void add_prop( PARSE_DATA *parse_data, struct metaEntry *m ) 251 { 252 if ( parse_data->prop_cnt >= parse_data->propsize ) 253 { 254 parse_data->propsize += 100; 255 parse_data->props = (int *) erealloc( parse_data->props, sizeof(int *) * parse_data->propsize); 256 } 257 parse_data->props[ parse_data->prop_cnt++ ] = m->metaID; 258 } 259 260 261 262 /********************************************************************* 263 * End Tag Event Handler 264 * 265 * This routine will pop the meta/property tag off the stack 266 * 267 * 268 
*********************************************************************/ 269 270 271 static void end_hndl(void *data, const char *el) 272 { 273 PARSE_DATA *parse_data = (PARSE_DATA *)data; 274 char *tag = estrdup( (char *)el ); 275 276 277 strtolower( tag ); 278 279 /* Check for store description */ 280 if ( parse_data->buffer && ( strcmp(parse_data->fprop->stordesc->field, tag) == 0)) 281 write_summary( parse_data ); 282 283 284 if ( parse_data->ignore_tag ) 285 { 286 if (strcmp( parse_data->ignore_tag, tag ) == 0) 287 { 288 efree( parse_data->ignore_tag ); 289 parse_data->ignore_tag = NULL; 290 } 291 return; 292 } 293 294 /* Tags must be ballanced, of course. */ 295 296 /* Exiting a metaID? */ 297 if ( parse_data->meta_cnt && getMetaNameByName( parse_data->header, tag) ) 298 parse_data->meta_cnt--; 299 300 301 /* Exiting a propID? */ 302 if ( parse_data->prop_cnt && getPropNameByName( parse_data->header, tag) ) 303 parse_data->prop_cnt--; 304 305 } 306 307 /********************************************************************* 308 * Character Data Event Handler 309 * 310 * This does the actual adding of text to the index and adding properties 311 * if any tags have been found to index 312 * 313 * 314 *********************************************************************/ 315 316 static void char_hndl(void *data, const char *txt, int txtlen) 317 { 318 PARSE_DATA *parse_data = (PARSE_DATA *)data; 319 SWISH *sw = parse_data->sw; 320 int i; 321 char *buf = (char *)emalloc( txtlen + 1 ); 322 323 strncpy( buf, txt, txtlen ); 324 buf[txtlen] = '\0'; 325 326 327 /* Add text to summary */ 328 if ( parse_data->buffer && parse_data->buffend < parse_data->fprop->stordesc->size ) 329 append_summary_text( parse_data, buf, txtlen); 330 331 332 333 334 335 /* If currently in an ignore block, then return */ 336 if ( parse_data->ignore_tag ) 337 return; 338 339 340 /* If outside all meta tags then add default */ 341 if ( !parse_data->meta_cnt && sw->OkNoMeta ) 342 { 343 struct 
metaEntry *m = getMetaNameByName( parse_data->header, AUTOPROPERTY_DEFAULT ); 344 if ( m ) 345 add_meta( parse_data, m ); 346 } 347 348 349 /* Index the text */ 350 if ( parse_data->meta_cnt ) 351 parse_data->total_words += 352 indexstring( 353 sw, 354 buf, 355 parse_data->filenum, 356 IN_FILE | IN_META, 357 parse_data->meta_cnt, 358 parse_data->metas, 359 &(parse_data->word_pos) 360 ); 361 362 /* Now store the properties -- will concat any existing property */ 363 364 for ( i = 0; i < parse_data->prop_cnt; i++ ) 365 { 366 struct metaEntry *m = getPropNameByID( parse_data->header, parse_data->props[i]); 367 if (!addDocProperty(&(parse_data->thisFileEntry->docProperties), m, buf, txtlen, 0)) 368 progwarn("property '%s' not added for document '%s'\n", m->metaName, parse_data->fprop->real_path); 369 } 370 371 efree( buf ); 372 373 } 374 375 /********************************************************************* 376 * Add characters to summary 377 * 378 * This REALLY shouldn't be here. 379 * Could do better with general purpose properties:size 380 * 381 * 382 *********************************************************************/ 383 384 static void append_summary_text( PARSE_DATA *parse_data, char *buf, int len) 385 { 386 int j; 387 388 /* trim trailing space */ 389 while ( isspace( buf[len-1] && len > 0 )) 390 len--; 391 392 393 /* skip leading space */ 394 for ( j=0; j < len && isspace( buf[j] ); j++ ); 395 396 397 if ( j >= len ) 398 return; 399 400 401 /* Add space between */ 402 if ( parse_data->buffend ) 403 parse_data->buffer[parse_data->buffend++] = ' '; 404 405 406 while ( j < len ) 407 { 408 /* Check for max size reached */ 409 if ( parse_data->buffend >= parse_data->fprop->stordesc->size ) 410 { 411 if ( !isspace( buf[j] ) && !isspace( parse_data->buffer[parse_data->buffend-1] )) 159 412 { 160 161 newp = sw_ConvHTMLEntities2ISO(sw, p); 162 163 ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &positionMeta); 164 if (newp 
!= p) 165 efree(newp); 413 while ( parse_data->buffend && !isspace( parse_data->buffer[--parse_data->buffend] )); 414 parse_data->buffer[parse_data->buffend] = '\0'; 166 415 } 167 416 168 /* Now let us look for '>' */ 169 if ((endtag = strchr(tag, '>'))) 170 { 171 172 *endtag++ = '\0'; 173 174 if ((tag[0] != '!') && (tag[0] != '/')) 175 { 176 char *parsed_tag = NULL; 177 178 if ( 179 (metaNameXML = 180 getXMLField(indexf, tag, sw->applyautomaticmetanames, sw->verbose, sw->OkNoMeta, &parsed_tag, fprop->real_path))) 181 { 182 /* If the data must be indexed add the metaName to the currentlist of metaNames */ 183 if (!in_junk) 184 { 185 /* realloc memory if needed */ 186 i
