root/libswish3/trunk/perl/countwords.pl

Revision 1913, 1.4 kB (checked in by karpet, 1 year ago)

for all the world to see

Line 
#!/usr/bin/perl -w
#
# count instances of words in a swish-e index
# and report on NUM number of top instances
#
# usage: countwords [NUM [INDEX]]
#
#   NUM    how many of the most frequent words to print (default 50)
#   INDEX  path to the swish-e index file (default 'index.swish-e')
#
# Copyright 2005 karpet@peknet.com
# released under same terms as Perl
#
# TODO: swish-e ought to contain this feature natively
#

use strict;
use warnings;
use Text::FormatTable;

my ( $num, $index ) = @ARGV;

# defaults (an empty or zero argument also falls back to the default)
$index ||= 'index.swish-e';
$num   ||= 50;

# the index must exist and be non-empty
unless ( -s $index ) {
    die "no such index: $index\n";
}

# NUM must be a positive integer; anything else would make the
# "stop after NUM rows" comparison below never match.
unless ( $num =~ /\A\d+\z/ and $num > 0 ) {
    die "need a number of words to output\n";
}

# Keep the command as a list so it is executed without a shell: $index
# comes straight from @ARGV and may contain spaces or shell metacharacters.
my @cmd = ( 'swish-e', '-f', $index, '-T', 'INDEX_WORDS' );

# echo the command to STDERR so the user can see what is being run
warn "@cmd\n";

open( my $swish, '-|', @cmd )
    or die "can't exec '@cmd': $!\n";

# %count maps word => [ total of per-entry counts, number of entries ]
my %count;

# NOTE(review): the parsing below presumes each output line is a word
# followed by entries introduced by "[<digits> ", with the count as the
# second whitespace-separated field of each entry -- confirm against the
# swish-e -T INDEX_WORDS output format.
while ( my $line = <$swish> ) {
    chomp $line;
    my ( $word, @insts ) = split /\[\d+ /, $line;
    next unless defined $word;    # blank line: nothing to count

    INST: for my $inst (@insts) {
        next INST if !$inst;
        my ( undef, $cnt ) = split /\s+/, $inst;
        $count{$word}[0] += $cnt;
        $count{$word}[1]++;
    }
}

# Closing a pipe reaps the child; a false return means either an I/O error
# ($! set) or a non-zero exit status (in $?). Warn instead of dying so that
# whatever was read is still reported.
unless ( close $swish ) {
    warn $!
        ? "error closing pipe from '@cmd': $!\n"
        : "'@cmd' exited with status " . ( $? >> 8 ) . "\n";
}

# print results, stopping at $num
# use FormatTable for pretty ASCII

my $tbl = Text::FormatTable->new('r  r  l  l');
$tbl->head( '#', 'word', 'count', 'unique docs' );
$tbl->rule('=');

# highest total count first; ties broken alphabetically so that the
# output is the same from run to run
my @ranked =
    sort { $count{$b}[0] <=> $count{$a}[0] or $a cmp $b } keys %count;

my $rank = 0;
for my $word (@ranked) {
    my ( $cnt, $docs ) = @{ $count{$word} };
    $tbl->row( ++$rank, $word, $cnt, $docs );
    last if $rank == $num;
}

print $tbl->render(60);

Note: See TracBrowser for help on using the browser.