root/swish_website/etc/spider.config

Revision 1673, 6.9 kB (checked in by whmoseley, 3 years ago)

And make the redirect script work on sunsite.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 # Configuration for swish-e site.
2 #
3 # a few custom callbacks are located after the @servers definition section
4 # these are used to split files into sections.
5
6 use URI;
7 use warnings;
8 use strict;
9
10 use vars '@servers';
11
12
# Site root to spider.  It must come from the environment; there is no default.
my $base_path = $ENV{SWISH_SITE}
    or die "must set \$ENV{SWISH_SITE} (e.g SWISH_SITE=http://swish-e.org)";

# Drop one trailing slash so the joins below never produce "//".
$base_path =~ s{/$}{};

# Path component of the site root; test_url refuses anything outside it.
my $top_path = URI->new( $base_path . '/' )->path;

# Document the spider starts from, relative to the site root.
my $start_file = $ENV{START_FILE} || 'index.html';
18
19
20
# Spider configuration: a single server entry describing how the site is
# crawled.  The callbacks receive their arguments positionally from the spider.
@servers = (
    {
        base_url        => "$base_path/$start_file",

        keep_alive      => 1,         # enable keep alives requests
        email           => 'swish@domain.invalid',

        delay_sec       => 0,  # why wait?

        use_md5         => 1,


        # Decide whether a URL should be fetched at all.
        # $_[0] is an object with a ->path method; returns 1 to fetch, 0 to skip.
        test_url => sub {
            my $path = $_[0]->path;

            # Skip images and gzip files by extension.  The alternatives must
            # not contain a leading "." -- the dot is matched once, up front.
            return 0 if $path =~ /\.(?:gif|jpe?g|png|gz)$/i;

            return 0 if $path =~ m!/archive/!;  # don't index the list archive again
            return 0 if $path =~ m!/search_archive/!;  # and the old search script

            # Don't follow any links above the base_url
            return 0 unless $path =~ /^\Q$top_path\E/;
            return 1;
        },

        # Only index text/html -- do we have any text/plain?
        test_response   => sub { return $_[2]->content_type =~ m[text/html] },

        # split content - comment out to disable splitting
        filter_content  => \&split_page,
    },

);
54
55
56 #===============================================================================
57 # split_page -
58 #
59 # This is based on HTML::Parser.  More accurate than the regex method, but slower
60 #
61 #-------------------------------------------------------------------------------
62
# Callback installed as filter_content in @servers.
#
# Arguments (positional): uri, server, response, content.  The content
# argument is dereferenced as a scalar reference by Swish::Split->new.
#
# Returns the logical negation of Swish::Split::process: true when the page
# was not split (process reports the file still needs indexing), false when
# process handled the page itself.  Returns empty if no Swish::Split object
# could be built.
sub split_page {

    my %params = ( found => 0 );
    @params{ qw/ uri server response content / } = @_;

    my $doc = Swish::Split->new( \%params )
        or return;

    my $was_split = $doc->process;

    # Release the parsed tree explicitly before handing back the verdict.
    $doc->tree->delete;

    return !$was_split;
}
79
80 #---------------------------------------------------------------------------------
81
82 package Swish::Split;
83 use warnings;
84 use strict;
85 use HTML::TreeBuilder;
86 use HTML::Element;
87
# Constructor.
#
# $params is a hash reference whose {content} entry is a reference to the
# page's HTML.  The same hash is blessed and returned (not a copy), after
# gaining three entries: tree (the parsed HTML::TreeBuilder tree), head (the
# first <head> element found in it) and page_length (length of the HTML).
sub new {
    my ( $class, $params ) = @_;

    my $html = ${ $params->{content} };

    # Parse the HTML into a tree
    my $tree = HTML::TreeBuilder->new;
    $tree->store_comments(1);  # let swish decide about indexing comments
    $tree->parse( $html );
    $tree->eof;

    $params->{tree} = $tree;

    # Find the head section
    $params->{head} = $tree->look_down( '_tag', 'head' );

    $params->{page_length} = length $html;

    # Every key stored above (and those supplied by the caller) becomes a method.
    my $self = bless $params, $class;
    $self->accessorize;

    return $self;
}
108
# Install a getter method for each key of the object's hash.
#
# Keys for which the object already ->can a method are left alone, so
# existing methods are never replaced.  The subs are created through
# symbolic glob assignment, hence the local "no strict 'refs'".
sub accessorize {
    my ( $self ) = @_;

    my @fields = grep { !$self->can( $_ ) } keys %$self;

    no strict 'refs';
    foreach my $field ( @fields ) {
        *{$field} = sub {
            my ( $obj ) = @_;
            return $obj->{$field};
        };
    }
}
118
119
120 #==================================================================================
121 # Process the document tree
122 #
123 # Returns: true if tree was processed.  False means still need to index file.
124 #
125 #---------------------------------------------------------------------------------
126
# Split the parsed page and emit the pieces through create_page.
#
# Looks for <div id="main-copy"> in the tree.  If it is missing, warns and
# returns empty (false) so the caller falls back to indexing the whole page.
# Otherwise each <div class="sub-section"> inside it is emitted as its own
# page with a cloned head; if there are none, the main-copy div is emitted
# as a single page.  Returns 1 whenever at least one page was emitted.
sub process {
    my ( $self ) = @_;

    my $uri = $self->uri;

    warn "\nProcessing $uri\n" if $ENV{VERBOSE};


    my $content_section = $self->tree->look_down( qw[ _tag div id main-copy ] );

    unless ( $content_section ) {
        # The lookup above is by id, so report the id (not a class) here.
        warn qq[Failed to find <div id="main-copy"> in $uri. Indexing full content\n];
        return;  # Return false indicating spider to index the page as normal
    }

    # Now look for content divided into sections
    my @sub_sections = $content_section->look_down( qw[ _tag div class sub-section ] );

    unless ( @sub_sections ) {
        warn qq[Failed to find <div class="sub-section"> in $uri.  Indexing full content\n]
            if $ENV{VERBOSE};

        $self->create_page( $content_section, $self->head, $self->uri );

    } else {

        # A named loop variable: $_ would be exposed to everything the
        # method calls below do.
        for my $section ( @sub_sections ) {
            my ( $new_head, $new_uri ) = $self->new_head( $section );
            $self->create_page( $section, $new_head, $new_uri );
            $new_head->delete;
        }
    }

    return 1;  # says we were successful -- so spider should not index the page

}
163
164 #================================================================================
165 # new_head() -- clones the head section and returns an array of a new head and uri
166 #
167 #--------------------------------------------------------------------------------
# Build the head and URI for one section of the page.
#
# $section is the element holding the section.  Returns a two-element list:
# a clone of the page head and a clone of the page URI.  When the section
# contains a heading (h + digit), its text is appended to the cloned title
# (or a title is created), and the name= of an <a> inside the heading
# becomes the URI fragment.  Missing pieces are reported with warn.
sub new_head {
    my ( $self, $section ) = @_;

    my $head     = $self->head->clone;
    my $uri      = $self->uri->clone;
    my $fragment = '';

    # Look for the first <h> tag, e.g.
    #  <h3><a name="So, is Swish-e a search engine?"></a>So, is Swish-e a search engine?</h3>
    my $heading = $section->look_down( '_tag', qr/^h\d$/ );

    if ( !$heading ) {
        warn "Failed to find <h\\d> in one of the sections of $uri\n";
    }
    else {
        my $description = $heading->as_text || 'missing description';  # for title

        # grab the name= text for the fragment
        my $anchor = $heading->look_down( '_tag', 'a', sub { defined($_[0]->attr('name')) } );

        if ( $anchor ) {
            ( $fragment = $anchor->attr('name') ) =~ s/\n/ /g;
            $uri->fragment( $fragment );
        }
        else {
            warn "Failed to find <a name> target for a section in $uri\n";
        }

        # Extend the existing title, or create one if the head has none
        my $title = $head->look_down('_tag', 'title');

        if ( $title ) {
            $title->push_content( ": $description" );
        }
        else {
            my $new_title = HTML::Element->new('title');
            $new_title->push_content( $description );
            $head->push_content( $new_title );
        }
    }

    warn "  -> #$fragment\n" if $ENV{VERBOSE};

    return ( $head, $uri );
}
214
215
216
217 #=================================================================================
218 # create_page() -- creates a new HTML page and indexes it.
219 #
220 #---------------------------------------------------------------------------------
221
# Assemble a standalone HTML document from $head and $section and pass it to
# main::output_content for indexing under $uri.
#
# Two <meta> elements are appended to $head first:
#   section - "devel" or "docs" depending on how $uri matches, else "website"
#   pagelen - length of the complete original page, not of this section
# Both $head and $section end up inside the temporary document, which is
# deleted before returning.
sub create_page {
    my ( $self, $section, $head, $uri ) = @_;

    # Classify the document by its URI so searches can be limited by type
    my $section_type = 'website';
    if ( $uri =~ m!(?:(devel)_)?(docs)/! ) {
        $section_type = $1 || $2;
    }

    my $section_meta = HTML::Element->new( 'meta',
        name    => 'section',
        content => $section_type,
    );
    $head->push_content( $section_meta );

    # Add the total document length, which is different than the section length
    my $length_meta = HTML::Element->new( 'meta',
        name    => 'pagelen',
        content => $self->page_length,
    );
    $head->push_content( $length_meta );

    # Wrap the section in a body and join head + body under a new root
    my $body = HTML::Element->new('body');
    my $doc  = HTML::Element->new('html');

    $body->push_content( $section );
    $doc->push_content( $head, $body );

    my $new_content = $doc->as_HTML(undef,"\t");

    # Fix up title - probably should get this from template
    $new_content =~ s/<title>Swish-e ::\s+/<title>/;

    # This calls code in the spider function.
    main::output_content( $self->server, \$new_content, $uri, $self->response );

    $doc->delete;
}
261
262
263
264 1;
265
Note: See TracBrowser for help on using the browser.