Changeset 933
- Timestamp:
- 08/13/02 18:45:11 (6 years ago)
- Files:
-
- trunk/swish-e/MANIFEST (modified) (1 diff)
- trunk/swish-e/conf/example4.config (modified) (2 diffs)
- trunk/swish-e/example/swish.cgi (modified) (1 diff)
- trunk/swish-e/pod/CHANGES.pod (modified) (1 diff)
- trunk/swish-e/pod/SWISH-CONFIG.pod (modified) (2 diffs)
- trunk/swish-e/prog-bin/spider.pl (modified) (1 diff)
- trunk/swish-e/src/parse_conffile.c (modified) (1 diff)
- trunk/swish-e/src/parser.c (modified) (8 diffs)
- trunk/swish-e/src/swish.h (modified) (1 diff)
- trunk/swish-e/src/swish2.c (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/swish-e/MANIFEST
r914 r933 231 231 src/txt.c 232 232 src/txt.h 233 src/vsnprintf.c 233 234 src/vms/acconfig.h_vms 234 235 src/vms/build_swish-e.com trunk/swish-e/conf/example4.config
r861 r933 63 63 64 64 65 # Put yes to ignore the total number of words in the file 66 # when calculating ranking. Often better with merges and 67 # small files. Default is no. 68 69 IgnoreTotalWordCountWhenRanking yes 70 71 72 65 # Set the stopwords (words to ignore when searching and when indexing) 73 66 # Carefully think about this feature before using a list of stopwords … … 83 76 # swish will determine what stopwords to use. But please 84 77 # read the documentation before using the IgnoreLimit directive. 85 86 87 # Since comments are not shown when viewing a document, we decide 88 # not to index the text of the comments. The default is to index 89 # html comments. 90 91 IndexComments 0 78 # It can be slow, and may not work with other options. 92 79 93 80 trunk/swish-e/example/swish.cgi
r920 r933 1370 1370 @properties = ( 'swishreccount', @properties ); 1371 1371 1372 $self->swish_command( -x => join( '\t', map { "<$_>" } @properties ) . '\n' ); 1372 my @props = map { 1373 $_ eq 'swishlastmodified' ? "$_ fmt='%Y-%m-%d'" : $_ 1374 } @properties; 1375 1376 $self->swish_command( -x => join( '\t', map { "<$_>" } @props ) . '\n' ); 1377 1373 1378 $self->swish_command( -H => 9 ); 1374 1379 trunk/swish-e/pod/CHANGES.pod
r868 r933 362 362 with libxml2 parser. 363 363 364 =item * New directive: IndexAltTagMetaName 365 366 Allows indexing of image ALT tags. Only available when using the libxml2 parser. 367 364 368 =item * New directive: AbsoluteLinks 365 369 trunk/swish-e/pod/SWISH-CONFIG.pod
r908 r933 245 245 246 246 L<IndexAdmin|/"item_IndexAdmin"> *text* 247 248 =item * 249 250 L<IndexAltTagMetaName/"item_IndexAltTagMetaName"> *tagname*|as-text 247 251 248 252 =item * … … 1022 1026 ImageLinksMetaName swishdefault 1023 1027 1024 This feature is only available with the libxml2 HTML parser. 1028 This feature is only available with the libxml2 HTML parser. 1029 1030 1031 =item IndexAltTagMetaName *tagname*|as-text 1032 1033 Allows indexing of images <IMG> ALT tag text. Specify either a tag name which will be 1034 used as a metaname, or the special text "as-text" which says to index the ALT text as 1035 if it were plain text at the current location. 1036 1037 For example, by specifying a tag name: 1038 1039 IndexAltTagMetaName bar 1040 1041 would make this markup: 1042 1043 <foo> 1044 <img src="/someimage.png" alt="Alt text here"> 1045 </foo> 1046 1047 appear like 1048 1049 <foo> 1050 <bar>Alt text here</bar> 1051 </foo> 1052 1053 Then the normal rules (C<MetaNames> and C<PropertyNames>) apply to how that text is indexed. 1054 1055 If you use the special tag "as-text" then 1056 1057 <foo> 1058 <img src="/someimage.png" alt="Alt text here"> 1059 </foo> 1060 1061 simply becomes 1062 1063 <foo> 1064 Alt text here 1065 </foo> 1066 1067 This feature is only available when using the libxml2 parser (HTML2 and XML2). 1068 1025 1069 1026 1070 =item AbsoluteLinks [yes|NO] trunk/swish-e/prog-bin/spider.pl
r888 r933 256 256 eval { spider( $server, $uri ) }; 257 257 print STDERR $@ if $@; 258 259 260 # provide a way to call a function in the config file when all done 261 check_user_function( 'spider_done', undef, $server ); 262 258 263 259 264 return if $server->{quiet}; trunk/swish-e/src/parse_conffile.c
r915 r933 596 596 } 597 597 598 599 /* What to do with IMG ALT tags? */ 600 if (strcasecmp(w0, "IndexAltTagMetaName") == 0) 601 { 602 if (sl->n <= 1) 603 progerr("%s: requires one value", w0); 604 605 if ( strcasecmp( sl->word[1], "as-text" ) == 0) 606 { 607 sw->IndexAltTag = 1; 608 if ( sw->IndexAltTagMeta ) 609 { 610 efree( sw->IndexAltTagMeta ); 611 sw->IndexAltTagMeta = NULL; 612 } 613 } 614 else 615 { 616 sw->IndexAltTag = 1; 617 if ( sw->IndexAltTagMeta ) 618 { 619 efree( sw->IndexAltTagMeta ); 620 sw->IndexAltTagMeta = NULL; 621 } 622 sw->IndexAltTagMeta = estrdup( sl->word[1] ); 623 } 624 continue; 625 } 626 627 628 598 629 599 630 /* Meta name to extract out <img src> links */ trunk/swish-e/src/parser.c
r929 r933 159 159 int abort; // flag to stop parsing 160 160 char *baseURL; // for fixing up relative links 161 int swish_noindex; // swishindex swishnoindex 161 int swish_noindex; // swishindex swishnoindex -- for hiding blocks with comments 162 162 } PARSE_DATA; 163 163 … … 183 183 static void Convert_to_latin1( PARSE_DATA *parse_data, char *txt, int txtlen ); 184 184 static int parse_chunks( PARSE_DATA *parse_data ); 185 186 static void index_alt_tab( PARSE_DATA *parse_data, const char **attr ); 185 187 static char *extract_html_links( PARSE_DATA *parse_data, const char **attr, struct metaEntry *meta_entry, char *tag ); 186 188 static int read_next_chunk( FileProp *fprop, char *buf, int buf_size, int max_size ); … … 631 633 /* Extract out links from images */ 632 634 else if ( strcmp( tag, "img") == 0 ) 635 { 636 if (parse_data->sw->IndexAltTag) 637 index_alt_tab( parse_data, attr ); 638 633 639 extract_html_links( parse_data, attr, parse_data->sw->images_meta, "src" ); 640 } 634 641 635 642 … … 900 907 * parse_data 901 908 * tag = tag to look for as a metaname/property 902 * endtag = tag to look for as the ending tag 909 * endtag = tag to look for as the ending tag (since might be different from start tag) 903 910 * meta_append = if zero, tells push that this is a new meta 904 911 * prop_append otherwise, says it's a sibling of a previous call … … 929 936 { 930 937 /* shouldn't need to flush buffer since it's just blocking out a section and should be balanced */ 938 /* but need to due to the weird way the char buffer is used (and shared with props) and how metatags are assigned to the buffer */ 939 /* basically, since flush_buffer looks at the ignore flag and always clears the buffer, need to do it now */ 940 /* flush_buffer really should not be in the business of checking the ignore flag, and rather we need to keep two buffers -- or maybe just always flush with any change */ 941 942 flush_buffer( parse_data, 1 ); 943 931 944 push_stack( 
&parse_data->meta_stack, endtag, NULL, meta_append, 1 ); 932 945 push_stack( &parse_data->prop_stack, endtag, NULL, prop_append, 1 ); … … 1473 1486 char *c; 1474 1487 1475 1476 1488 /* anything to do? */ 1477 1489 if ( !buf->cur ) … … 1511 1523 { 1512 1524 /* Index the text */ 1513 if ( !parse_data->meta_stack.ignore_flag ) 1525 if ( !parse_data->meta_stack.ignore_flag ) // this really is wrong -- should not check ignore here. Fix should be to use two buffers 1514 1526 parse_data->total_words += 1515 1527 indexstring( sw, c, parse_data->filenum, structure, 0, NULL, &(parse_data->word_pos) ); … … 1651 1663 1652 1664 /********************************************************************* 1665 * Index ALT tabs 1666 * 1667 * 1668 *********************************************************************/ 1669 static void index_alt_tab( PARSE_DATA *parse_data, const char **attr ) 1670 { 1671 int meta_append = 0; 1672 int prop_append = 0; 1673 char *tagbuf = parse_data->sw->IndexAltTagMeta; 1674 char *alt_text = extract_html_links( parse_data, attr, NULL, "alt"); 1675 1676 1677 if ( !alt_text ) 1678 return; 1679 1680 /* Index as regular text? */ 1681 if ( !parse_data->sw->IndexAltTagMeta ) 1682 { 1683 char_hndl( parse_data, alt_text, strlen( alt_text ) ); 1684 return; 1685 } 1686 1687 flush_buffer( parse_data, 1 ); 1688 start_metaTag( parse_data, tagbuf, tagbuf, &meta_append, &prop_append, 0 ); 1689 char_hndl( parse_data, alt_text, strlen( alt_text ) ); 1690 end_metaTag( parse_data, tagbuf, 0 ); 1691 } 1692 1693 1694 1695 1696 /********************************************************************* 1653 1697 * Extract out links for indexing 1654 1698 * trunk/swish-e/src/swish.h
r923 r933 879 879 struct metaEntry *images_meta; 880 880 881 882 /* if allocated the meta name to store alt tags as */ 883 int IndexAltTag; 884 char *IndexAltTagMeta; // use this meta-tag, if set 885 881 886 /* for converting relative links in href's and img src tags to absolute */ 882 887 int AbsoluteLinks; trunk/swish-e/src/swish2.c
r915 r933 174 174 /* FileRules?? */ 175 175 176 /* meta name for ALT tags */ 177 if ( sw->IndexAltTagMeta ) 178 { 179 efree( sw->IndexAltTagMeta ); 180 sw->IndexAltTagMeta = NULL; 181 } 182 183 176 184 177 185 while (tmpindexlist) {
