| 1 |
# Configuration for swish-e site. |
|---|
| 2 |
# |
|---|
| 3 |
# a few custom callbacks are located after the @servers definition section |
|---|
| 4 |
# these are used to split files into sections. |
|---|
| 5 |
|
|---|
| 6 |
use URI; |
|---|
| 7 |
use warnings; |
|---|
| 8 |
use strict; |
|---|
| 9 |
|
|---|
| 10 |
use vars '@servers'; |
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 |
# Root URL of the site to spider.  It must be supplied through the
# SWISH_SITE environment variable; there is no default.
my $base_path = $ENV{SWISH_SITE};
die "must set \$ENV{SWISH_SITE} (e.g SWISH_SITE=http://swish-e.org)"
    unless $base_path;

# Remove one trailing slash so the URL can be joined with "/..." below.
$base_path =~ s{/$}{};

# Path component of the site root.  test_url refuses any URL whose path
# does not begin with this, so the spider never climbs above it.
my $top_path = URI->new("$base_path/")->path;

# Document the spider starts from; START_FILE in the environment overrides it.
my $start_file = $ENV{START_FILE} || 'index.html';
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
# Spider configuration: a list with one hash per site to crawl.
# The callbacks below decide which URLs are fetched, which responses are
# indexed, and how fetched content is post-processed.
@servers = (
    {
        # Where the crawl starts.
        base_url => "$base_path/$start_file",

        keep_alive => 1,    # enable keep alives requests
        email      => 'swish@domain.invalid',

        delay_sec => 0,     # why wait?

        use_md5 => 1,

        # test_url: called with the URL object as the first argument.
        # Returns 0 to reject the URL and 1 to accept it.
        test_url => sub {
            my $path = $_[0]->path;

            # Skip images and compressed files by extension.  Each extension
            # is a literal here: an unescaped "." inside the alternation
            # would match any character and let plain ".png"/".gz" through.
            return 0 if $path =~ /\.(?:gif|jpe?g|png|gz)$/i;

            return 0 if $path =~ m!/archive/!;           # don't index the list archive again
            return 0 if $path =~ m!/search_archive/!;    # and the old search script

            # Don't follow any links above the base_url
            return 0 unless $path =~ /^\Q$top_path/;
            return 1;
        },

        # Only index text/html -- do we have any text/plain?
        # The third argument is the object whose content_type is inspected.
        test_response => sub { return $_[2]->content_type =~ m[text/html] },

        # split content - comment out to disable splitting
        filter_content => \&split_page,
    },

);
|---|
| 54 |
|
|---|
| 55 |
|
|---|
| 56 |
#=============================================================================== |
|---|
| 57 |
# split_page - |
|---|
| 58 |
# |
|---|
| 59 |
# This is based on HTML::Parser. More accurate than the regex method, but slower |
|---|
| 60 |
# |
|---|
| 61 |
#------------------------------------------------------------------------------- |
|---|
| 62 |
|
|---|
| 63 |
# split_page( $uri, $server, $response, $content )
#
# filter_content callback.  Hands the fetched page to Swish::Split, which
# tries to break it into sections and output each one separately.
# $content is a reference to the page text.
#
# Returns the logical negation of Swish::Split::process: true when the page
# was NOT split (so it still needs normal indexing), false when the split
# pages were already output.  Returns nothing if no splitter object could be
# built.
sub split_page {
    my ( $uri, $server, $response, $content ) = @_;

    my $splitter = Swish::Split->new(
        {
            uri      => $uri,
            server   => $server,
            response => $response,
            content  => $content,
            found    => 0,
        }
    );

    return unless $splitter;

    my $still_needs_indexing = !$splitter->process;

    # Release the parse tree before handing control back.
    $splitter->tree->delete;

    return $still_needs_indexing;
}
|---|
| 79 |
|
|---|
| 80 |
#--------------------------------------------------------------------------------- |
|---|
| 81 |
|
|---|
| 82 |
package Swish::Split; |
|---|
| 83 |
use warnings; |
|---|
| 84 |
use strict; |
|---|
| 85 |
use HTML::TreeBuilder; |
|---|
| 86 |
use HTML::Element; |
|---|
| 87 |
|
|---|
| 88 |
# new( $class, \%params )
#
# Parses the page referenced by $params->{content} (a scalar reference) into
# an HTML::TreeBuilder tree, then blesses the SAME hash the caller passed in
# after adding three entries to it:
#   tree        - the parsed tree
#   head        - the <head> element found in that tree
#   page_length - length of the original content string
# An accessor method is generated for every key before returning the object.
sub new {
    my ( $class, $params ) = @_;

    my $content_ref = $params->{content};

    # Build the tree; comments are kept so swish can decide whether to
    # index them.
    my $tree = HTML::TreeBuilder->new;
    $tree->store_comments(1);
    $tree->parse( $$content_ref );
    $tree->eof;

    $params->{page_length} = length $$content_ref;
    $params->{tree}        = $tree;
    $params->{head}        = $tree->look_down( _tag => 'head' );

    my $self = bless $params, $class;
    $self->accessorize;

    return $self;
}
|---|
| 108 |
|
|---|
| 109 |
# accessorize( $self )
#
# Generates a read-only accessor for each key currently in the object's
# hash.  A key is skipped when the object can already respond to a method of
# that name, so existing methods are never overwritten.  The glob name is
# not package-qualified, so each accessor is installed in the package this
# sub is compiled in.
sub accessorize {
    my ( $self ) = @_;

    no strict 'refs';    # needed to install subs by name
    foreach my $field ( keys %$self ) {
        unless ( $self->can( $field ) ) {
            *{$field} = sub { $_[0]->{$field} };
        }
    }
}
|---|
| 118 |
|
|---|
| 119 |
|
|---|
| 120 |
#================================================================================== |
|---|
| 121 |
# Process the document tree |
|---|
| 122 |
# |
|---|
| 123 |
# Returns: true if tree was processed. False means still need to index file. |
|---|
| 124 |
# |
|---|
| 125 |
#--------------------------------------------------------------------------------- |
|---|
| 126 |
|
|---|
| 127 |
# process( $self )
#
# Locates the main content of the parsed page and outputs it, either as a
# single page or as one page per sub-section.
#
# Returns 1 when output was produced here, so the spider should not index
# the page itself.  Returns nothing (false) when <div id="main-copy"> is
# missing, telling the caller the page still needs normal indexing.
sub process {
    my ( $self ) = @_;

    my $uri = $self->uri;

    warn "\nProcessing $uri\n" if $ENV{VERBOSE};

    # The main content is located by id, and the warning below says so.
    my $content_section = $self->tree->look_down( qw[ _tag div id main-copy ] );

    unless ( $content_section ) {
        warn qq[Failed to find <div id="main-copy"> in $uri. Indexing full content\n];
        return;    # Return false indicating spider to index the page as normal
    }

    # Now look for content divided into sections
    my @sub_sections = $content_section->look_down( qw[ _tag div class sub-section ] );

    unless ( @sub_sections ) {
        warn qq[Failed to find <div class="sub-section"> in $uri. Indexing full content\n]
            if $ENV{VERBOSE};

        # No sub-sections: output the whole main section under the page's
        # own head and URI.
        $self->create_page( $content_section, $self->head, $self->uri );
        return 1;
    }

    # One output page per sub-section, each with its own cloned head and a
    # URI from new_head.  A named loop variable is used so the section
    # cannot be disturbed by anything the called methods do with $_.
    for my $section ( @sub_sections ) {
        my ( $new_head, $new_uri ) = $self->new_head( $section );
        $self->create_page( $section, $new_head, $new_uri );
        $new_head->delete;
    }

    return 1;    # says we were successful -- so spider should not index the page
}
|---|
| 163 |
|
|---|
| 164 |
#================================================================================ |
|---|
| 165 |
# new_head() -- clones the head section and returns an array of a new head and uri |
|---|
| 166 |
# |
|---|
| 167 |
#-------------------------------------------------------------------------------- |
|---|
| 168 |
# new_head( $self, $section )
#
# Builds the head and URI used for one section of the page.  Both are clones,
# so the originals held by the object are left untouched.
#
# The first <hN> element in the section supplies:
#   - the description appended to (or used to create) the <title>, and
#   - via its <a name="..."> child, the URI fragment for the section.
# Example heading:
#   <h3><a name="So, is Swish-e a search engine?"></a>So, is Swish-e a search engine?</h3>
#
# Returns the list ( $head, $uri ).
sub new_head {
    my ( $self, $section ) = @_;

    my $head     = $self->head->clone;
    my $uri      = $self->uri->clone;
    my $fragment = '';

    my $heading = $section->look_down( '_tag', qr/^h\d$/ );

    if ( !$heading ) {
        warn "Failed to find <h\\d> in one of the sections of $uri\n";
    }
    else {
        # Text of the heading, used for the title.
        my $description = $heading->as_text || 'missing description';

        # The anchor's name= text becomes the fragment; newlines inside it
        # are turned into spaces.
        my $anchor = $heading->look_down( '_tag', 'a', sub { defined($_[0]->attr('name')) } );
        if ( $anchor ) {
            $fragment = $anchor->attr('name');
            $fragment =~ s/\n/ /g;
            $uri->fragment( $fragment );
        }
        else {
            warn "Failed to find <a name> target for a section in $uri\n";
        }

        # Extend the existing title, or create one if the head has none.
        if ( my $existing_title = $head->look_down('_tag', 'title') ) {
            $existing_title->push_content( ": $description" );
        }
        else {
            my $created_title = HTML::Element->new('title');
            $created_title->push_content( $description );
            $head->push_content( $created_title );
        }
    }

    warn " -> #$fragment\n" if $ENV{VERBOSE};

    return ( $head, $uri );
}
|---|
| 214 |
|
|---|
| 215 |
|
|---|
| 216 |
|
|---|
| 217 |
#================================================================================= |
|---|
| 218 |
# create_page() -- creates a new HTML page and indexes it. |
|---|
| 219 |
# |
|---|
| 220 |
#--------------------------------------------------------------------------------- |
|---|
| 221 |
|
|---|
| 222 |
# create_page( $self, $section, $head, $uri )
#
# Assembles a standalone HTML document from $head and $section, serializes
# it, and passes it to main::output_content together with the server,
# $uri and response held by the object.
#
# Two <meta> tags are appended to $head first (the head passed in is
# modified in place):
#   section - "devel" or "docs" when the URI contains "devel_docs/" or
#             "docs/" respectively, otherwise "website"
#   pagelen - length of the complete original page, which is different from
#             the section length
sub create_page {
    my ( $self, $section, $head, $uri ) = @_;

    # Classify the document so searches can be limited by type.
    my $doc_type = $uri =~ m!(?:(devel)_)?(docs)/!
        ? ( $1 || $2 )
        : 'website';

    $head->push_content(
        HTML::Element->new( 'meta', name => 'section', content => $doc_type ),
    );
    $head->push_content(
        HTML::Element->new( 'meta', name => 'pagelen', content => $self->page_length ),
    );

    # Wrap the section in <body> and combine with the head under <html>.
    my $doc  = HTML::Element->new('html');
    my $body = HTML::Element->new('body');
    $body->push_content( $section );
    $doc->push_content( $head, $body );

    my $new_content = $doc->as_HTML( undef, "\t" );

    # Fix up title - probably should get this from template
    $new_content =~ s/<title>Swish-e ::\s+/<title>/;

    # This calls code in the spider function.
    main::output_content(
        $self->server,
        \$new_content,
        $uri,
        $self->response,
    );

    $doc->delete;
}
|---|
| 261 |
|
|---|
| 262 |
|
|---|
| 263 |
|
|---|
| 264 |
1; |
|---|
| 265 |
|
|---|