root/swish_website/bin/index_hypermail.pl

Revision 1889, 7.8 kB (checked in by moseley, 1 year ago)

More updates getting ready to move to the new host
Combined the two search scripts into one.

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4
5 ## See documentation below.  Script will require customization
6
7 use File::Find;
8 use Date::Parse;
9 use HTML::TreeBuilder;
10 use Data::Dumper;
11
12
13 my $dumb_spamblock = '(at)not-real.'# String that is removed from email address
14
15 my $dir = shift || die "must specfy directory to search";
16 if ( $dir eq 'debug' ) {
17     debug(@ARGV);
18 }
19
20 $dir .= '/' unless $dir  =~ /\/$/;   # make path relative below
21 # Do all the work
22 find( { wanted => \&wanted }, $dir );
23
24 sub wanted {
25     return if -d;  # don't need to process directories
26     return unless /^\d+\.html$/;
27
28     #output_file( $File::Find::name,  parse_file($_) );
29     output_file( $File::Find::name,  fast_parse($_) );
30 }
31
32
33 sub output_file {
34     my ( $file, $data ) = @_;
35
36     #$file =~ s/$dir//;  # make path relative to top level
37
38     local $SIG{__WARN__} = sub { "$file: @_" };
39
40     # Get last_mod date
41     my $date = str2time( $data->{comments}{received} );
42     unless ( $data ) {
43         warn "Failed to parse received date in $file\n";
44         $date = str2time( $data->{comments}{send} );
45         unless ( $date ) {
46             warn "Failed to parse any dates: skipping $file\n";
47             return;
48         }
49     }
50
51     $data->{received} = $date;
52
53     my $comments = $data->{comments};
54
55     $comments->{email} =~ s/\Q$dumb_spamblock/-blabla-/;
56
57     my $metas = join "\n", map { qq[<meta name="$_" content="$comments->{$_}">] }
58         sort keys %{$data->{comments}};
59
60     my $title = $comments->{subject} || '';
61
62     my $html = <<EOF;
63 <html>
64 <head>
65 <title>$title</title>
66 $metas
67 <meta name="section" content="archive">
68 </head>
69 <body>
70     $data->{body}
71 </body>
72 </html>
73 EOF
74
75     my $bytecount = length pack 'C0a*', $html;
76
77     print <<EOF;
78 Path-Name: $file
79 Content-Length: $bytecount
80 Last-Mtime: $date
81 Document-Type: HTML*
82
83 EOF
84
85     print $html;
86
87 }
88
89
90 sub parse_file {
91     my ( $file ) = @_;
92
93     my %data;  # Return hash of data
94
95     my $tree = HTML::TreeBuilder->new;
96     $tree->store_comments(1);  # meta data is in the comments
97     $tree->warn(1);
98
99     $tree->parse_file( $file );
100
101     my %comments;
102     # Extract out metadata
103     for ( $tree->look_down( '_tag', '~comment' )) {
104         my $comment = $_->attr("text");
105         $comments{$1} = $2 if $comment =~ /(\w+)="([^"]+)/;
106     }
107
108     $data{comments} = \%comments if %comments;  # should die here if not.
109
110     # Extract out the searchable content
111     my $body = $tree->look_down('_tag', 'div', 'class', 'mail');
112     unless ( $body ) {
113         warn "$file: failed to find <div class='mail'>\n";
114         return;
115     }
116
117     # Remove some sub-nodes we don't care about
118     $body->look_down('_tag', 'address', 'class', 'headers')->delete;
119     $body->look_down('_tag', 'span', 'id', 'received')->delete;
120
121     $data{body} = $body->as_HTML;
122
123
124     $tree->delete;
125
126     return \%data;
127
128 }
129
130 sub fast_parse {
131     my $file = shift;
132     local $_;
133
134     unless ( open FH, "<$file" ) {
135         warn "Failed to open '$file'. Error: $!";
136         return;
137     }
138
139     my %data;
140     my %comments;
141
142
143     # First parse out the comments
144     while (<FH>) {
145
146         if ( my( $tag, $content) = /<!-- ([^=]+)="(.+)" -->$/ ) {
147
148             unless ( $content ) {
149                 warn "File '$file' tag '$tag' empty content\n";
150                 next;
151             }
152             last if $tag eq 'body'# no more comments in this section
153             $comments{$tag} = $content;
154         }
155     }
156
157     $data{comments} = \%comments;
158
159
160     # Now grab the content
161
162
163     my $end_str;  # for skipping sections
164     my $body = '';
165
166     while ( <FH> ) {
167         # loo for ending tag, or maybe even the signature
168
169         last if /<!-- body="end" -->/ || /^-- $/ || /^--$/ || /^(_|-){40,}\s*$/;
170
171         # Look for ending tag for a skipped tag set
172
173         if ( $end_str ) {
174             $end_str = '' if /\Q$end_str/;
175             next;
176         }
177
178
179         # These are sections to skip
180         if ( /\Q<address class="headers"/ ) {
181             $end_str = "</address>";
182             next;
183         }
184
185         if (/\Q<span id="received"/ ) {
186             $end_str = '</div>';
187             next;
188         }
189
190         $body .= $_;
191     }
192
193     $data{body} = $body;
194     return \%data;
195 }
196
197
198
199 sub debug {
200     for ( @_ ) {
201         print STDERR "Debugging [$_]\n";
202         my $result = parse_file( $_, 1 );
203         print STDERR Dumper $result;
204         output_file( $_, $result );
205     }
206     exit;
207 }
208 __END__
209
210 =head1 NAME
211
212 index_hypermail.pl - Parse Hypermail archive for indexing with Swish-e
213
214 =head1 SYNOPSIS
215
216 Using an example data structure like this:
217
218     hypermail/
219         archive/
220         search/
221
222 Create the hypermail archive:
223
224     $ cd hypermail
225     $ hypermail -i -d archive < messages.mbox
226
227 Create a swish-e config file:
228
229     $ search
230     $ cat swish.conf
231
232     # config for indexing hypermail v2.1.8 archives
233
234     IndexDir index_hypermail.pl
235     SwishProgParameters ../archive
236
237     MetaNames swishtitle name email
238     PropertyNames name email
239     IndexContents HTML* .html
240     StoreDescription HTML* <body> 100000
241     UndefinedMetaTags  ignore
242
243 Index the documents:
244
245     $ swish-e -c swish.conf -S prog
246
247 Now create the search interface:
248
249     $ cp /usr/local/lib/swish-e/swish.cgi .
250     $ cat .swishcgi.conf
251
252     $ENV{TZ} = 'UTC'; # display dates in UTC format
253
254     return {
255         title           => "Search the Foo List Archive",
256         display_props   => [qw/ name email swishlastmodified /],
257         sorts           => [qw/swishrank swishtitle email swishlastmodified/],
258         metanames       => [qw/swishdefault swishtitle name email/],
259         name_labels     => {
260             swishrank           =>  'Rank',
261             swishtitle          =>  'Subject Only',
262             name                =>  "Poster's Name",
263             email               =>  "Poster's Email",
264             swishlastmodified   =>  'Message Date',
265             swishdefault        =>  'Subject & Body',
266         },
267
268         highlight       => {
269             package         => 'SWISH::PhraseHighlight',
270
271             xhighlight_on    => '<font style="background:#FFFF99">',
272             xhighlight_off   => '</font>',
273
274             meta_to_prop_map => {   # this maps search metatags to display properties
275                 swishdefault    => [ qw/swishtitle swishdescription/ ],
276                 swishtitle      => [ qw/swishtitle/ ],
277                 email           => [ qw/email/ ],
278                 name            => [ qw/name/ ],
279                 swishdocpath    => [ qw/swishdocpath/ ],
280             },
281         },
282     };
283
284 Setup web server (OS/web server dependent):
285
286     /var/www # ln -s /path/to/hypermail/search
287     /var/www # ln -s /path/to/hypermail/archive
288
289 and maybe tell apache to run the script:
290
291     $ cat .htaccess
292     Deny from all
293     <files swish.cgi>
294         Allow from all
295         SetHandler cgi-script
296         Options +ExecCGI
297     </files>
298
299
300 =head1 DESCRIPTION
301
302 This script is used to parse files produced by hypermail.
303 Last tested with hypermail pre-2.1.9.
304
305 It scans the directory passed as the first parameter for files matching \d+\.html
306 and then extracts out the content, email, name and subject.  This is then passed to
307 swish-e for indexing.
308
309 The swish.cgi script is used for searching the resulting index.  Configuration settings
310 are stored in the .swish.cgi file located in the current directory.  By default, swish.cgi
311 expects the current working directory to be the location of the cgi script.  On other web
312 servers this may not be the case and you will need to edit swish.cgi to use absolute path
313 names for .swishcgi.conf and the index files.
314
315
316 =head1 USAGE
317
318 See the SYNOPSIS above.
319
320 =head1 COPYRIGHT
321
322 This library is free software; you can redistribute it
323 and/or modify it under the same terms as Perl itself.
324
325 =head1 SEE ALSO
326
327 Hypermail can be downloaded from:
328
329    http://hypermail.org
330
331 =head1 AUTHOR
332
333 Bill Moseley moseley@hank.org. 2004
334
335 =head1 SUPPORT
336
337 Please contact the Swish-e discussion email list for support with this module
338 or with Swish-e.  Please do not contact the developers directly.
339
Note: See TracBrowser for help on using the browser.