root/libswish3/trunk/perl/IndexerUtils.pm

Revision 2114, 0.8 kB (checked in by karpet, 3 months ago)

oops. forgot indexer needs this too, till we can integrate with libswish3 perl bindings

Line 
1 package IndexerUtils;
2 use strict;
3 use warnings;
4 use Carp;
5 use File::Find;
6 use File::Slurp;
7
# Recursively collect the paths of indexable documents.
#
# Arguments:
#   @where - the files and/or directories to search; handed unchanged to
#            File::Find::find() as its list of starting points.
#
# Returns the list of regular files whose names end in .htm, .html, .sgm,
# .sgml, .xml or .txt (case-insensitive). Because find() is run with
# no_chdir => 1, each entry is the path as File::Find built it from the
# starting point, not a bare file name. In scalar context the usual Perl
# array rule applies and the number of matches is returned.
sub aggregate {
    my @where  = @_;
    my $ext_re = qr{html?|sgml?|xml|txt}i;
    my @filenames;

    find(
        {   wanted => sub {

                # \z rather than $ so that a name with a trailing newline
                # after the extension is not treated as a match.
                return unless m/\.$ext_re\z/;

                # The name test alone would also accept a directory called
                # e.g. "docs.html"; only regular files can be indexed.
                # With no_chdir, $_ is the full path, so -f resolves
                # correctly from the current working directory.
                return unless -f $_;

                push @filenames, $_;
            },
            no_chdir => 1,
        },
        @where
    );
    return @filenames;
}
25
# Read a document and reduce it to plain text, or to a list of words.
#
# Arguments:
#   $file    - path of the document to read (required).
#   $verbose - optional; when true, progress messages are printed to
#              STDOUT.
#
# Return value depends on the calling context:
#   scalar or void - the file contents with everything between "<" and the
#                    next ">" removed, as a single string.
#   list           - the whitespace-separated words of that string.
#
# Croaks with "file required" when $file is missing or empty. The file is
# read with read_file() (imported from File::Slurp at the top of this
# module); failures while reading are reported however read_file() does.
sub normalize {
    my $file = shift;

    # Check definedness and length rather than truth, so that a file
    # literally named "0" is accepted instead of being reported missing.
    croak "file required" unless defined($file) && length($file);
    my $verbose = shift || 0;

    $verbose and print "indexing $file ...\n";
    my $buf = read_file($file);

    # strip any markup; /s lets a single tag span several lines
    $buf =~ s{<.+?>}{}gs;
    return $buf if !wantarray;

    # naive tokenizer: split ' ' splits on runs of whitespace and discards
    # the empty leading field that text starting with whitespace would
    # otherwise produce
    my @words = split ' ', $buf;

    $verbose and print scalar(@words), " words in $file\n";
    return @words;
}
43
44 1;
Note: See TracBrowser for help on using the browser.