|
Revision 2114, 0.8 kB
(checked in by karpet, 3 months ago)
|
Oops: forgot that the indexer needs this too, until we can integrate with the libswish3 Perl bindings.
|
| Line | |
|---|
| 1 |
package IndexerUtils; |
|---|
| 2 |
use strict; |
|---|
| 3 |
use warnings; |
|---|
| 4 |
use Carp; |
|---|
| 5 |
use File::Find; |
|---|
| 6 |
use File::Slurp; |
|---|
| 7 |
|
|---|
| 8 |
# aggregate(@where)
#
# Walk every starting point in @where with File::Find and collect the paths
# of plain files whose names end in one of the recognized document
# extensions: .htm/.html, .sgm/.sgml, .xml or .txt (case-insensitive).
#
# Returns the list of matching paths (the element count in scalar context).
# The traversal runs with no_chdir => 1, so each returned path includes the
# starting point it was found under.
sub aggregate {
    my @where = @_;

    # Leading dot and end-of-string anchor live in the compiled pattern;
    # \z (not $) so a name ending in a newline is not accepted.
    my $ext_re = qr{\.(?:html?|sgml?|xml|txt)\z}i;
    my @filenames;

    find(
        {   wanted => sub {
                my $path = $_;    # full path, because of no_chdir
                return unless $path =~ $ext_re;

                # A directory (or dangling symlink) may carry a document-like
                # name, e.g. "archive.xml/"; only real files can be read later.
                return unless -f $path;

                push @filenames, $path;
            },
            no_chdir => 1,
        },
        @where
    );

    return @filenames;
}
|---|
| 25 |
|
|---|
| 26 |
# normalize($file, $verbose)
#
# Read $file, strip anything that looks like a markup tag, and return the
# remaining text.
#
#   $file    - path of the file to read (required; croaks with
#              "file required" when undefined or empty)
#   $verbose - optional; when true, progress lines are printed to the
#              currently selected output handle
#
# Return value depends on calling context:
#   list context   - the whitespace-separated words of the stripped text
#   otherwise      - the stripped text as a single string
sub normalize {
    my $file = shift;

    # Test for presence rather than truth so that a file named "0" is valid.
    croak "file required" unless defined $file and length $file;
    my $verbose = shift || 0;

    $verbose and print "indexing $file ...\n";
    my $buf = read_file($file);

    # Remove <...> spans, including ones that cross line breaks (/s).
    # Nothing is put in their place, so text on either side of a tag is
    # joined directly ("a<br>b" becomes "ab").
    $buf =~ s,<.+?>,,sg;
    return $buf if !wantarray;

    # split ' ' skips leading whitespace and never yields empty fields.
    my @w = split ' ', $buf;

    $verbose and print scalar(@w), " words in $file\n";
    return @w;
}
|---|
| 43 |
|
|---|
| 44 |
1; |
|---|