root/Swishetest/trunk/BuildIndex.pm

Revision 2043, 2.7 kB (checked in by joshr, 5 months ago)

actually show the command we're running when env var TEST_VERBOSE is set,
not the output of the command.

Line 
1 # BuildIndex.pm
2 package BuildIndex;
3 use strict;
4 use warnings;
5
# Builds a swish-e index from an input directory and returns the statistics
# that swish-e reported while indexing.
#
# Arguments:
#   $input         - input directory, passed to swish-e with -i
#   $index         - index file to build, passed to swish-e with -f
#   $config        - config file, passed with -c; defaults to
#                    conf/basic-libxml2.conf when false/omitted
#   $extra_options - optional string appended verbatim to the command line
#
# Returns the hash produced by parse_indexing_output().
# Dies if the command produced no output on STDOUT.
sub build_index_from_directory {
    my ($input, $index, $config, $extra_options) = @_;
    $config = "conf/basic-libxml2.conf" unless $config;
    $extra_options = "" unless $extra_options;

    # we assume swish-e is in the PATH!
    # -v 1 is important, we use it to test the indexer
    my $cmd = "swish-e -c $config -i '$input' -f '$index' -v 1 $extra_options";

    # Announce the command *before* running it, so that it is visible even
    # when swish-e hangs or takes a long time. This is the same order that
    # build_index_from_external_program uses.
    print STDERR "$0: Running '$cmd'\n" if $ENV{TEST_VERBOSE};

    my $output = `$cmd`;
    die "$0: Didn't get any output from $cmd\n" unless $output;
    return parse_indexing_output( $output );
}
21
# Builds a swish-e index from the output of an external program, which is
# piped into swish-e ('-S prog -i stdin').
#
# Arguments:
#   $external_program - shell command whose STDOUT is piped into swish-e
#   $index            - index file to build, passed to swish-e with -f
#   $config           - config file, passed with -c; defaults to
#                       conf/basic-libxml2.conf when false/omitted
#   $extra_options    - optional string appended verbatim to the command line
#
# Returns the hash produced by parse_indexing_output().
# Dies if the pipeline produced no output on STDOUT.
sub build_index_from_external_program {
    my ($external_program, $index, $config, $extra_options) = @_;

    $config        ||= "conf/basic-libxml2.conf";
    $extra_options ||= "";

    # swish-e is expected to be found via the PATH.
    # '-v 1' must stay: the indexer's summary output is what gets parsed.
    my $cmd = "$external_program | swish-e -c $config -i stdin -f '$index' -v 1 -S prog $extra_options";

    if ($ENV{TEST_VERBOSE}) {
        print STDERR "$0: Running '$cmd'\n";
    }

    my $output = `$cmd`;
    if (!$output) {
        die "$0: Didn't get any output from $cmd\n";
    }

    return parse_indexing_output( $output );
}
36
# Extracts the summary figures from the text printed by a swish-e indexing
# run (verbosity '-v 1' or greater).
#
# Argument:
#   the complete output of the indexing run as a single string; lines may be
#   separated by \n and/or \r.
#
# Returns a hash (as a flat list) with exactly these keys, each holding the
# number found in the output: unique, properties, files, bytes, words.
# Dies, quoting whatever was found and the full output, unless all five
# figures were located.
sub parse_indexing_output {
    my ($raw) = @_;

    # swish-e emits both \n and \r (progress lines), so break on either one.
    my @lines = split /\r|\n/, $raw;

    my $numreg = '([0-9]+)';

    # Result key => pattern that captures its value in $1.
    my @field_patterns = (
        [ unique     => qr/^\s*($numreg)\s+unique\s+words?\s+indexed/ ],
        [ properties => qr/^\s*($numreg)\s+properties/ ],
        [ files      => qr/^\s*($numreg)\s+files?\s+indexed/ ],
        [ bytes      => qr/\s($numreg)\s+total\s+byte/ ],
        [ words      => qr/\s($numreg)\s+total\s+word/ ],
    );

    my %stats;    # the figures gathered so far
    foreach my $line (@lines) {
        # $line aliases the array element, so the cleaned-up text is also
        # what appears in the error message below.
        chomp $line;
        $line =~ s/,//g;    # thousands separators only get in the way

        if (defined $ENV{TEST_VERBOSE} && $ENV{TEST_VERBOSE} > 1) {
            print "PROCESSING: $line\n";
        }

        # One line can carry several figures, so try every pattern.
        foreach my $entry (@field_patterns) {
            my ($field, $pattern) = @$entry;
            $stats{$field} = $1 if $line =~ $pattern;
        }
    }

    if (keys(%stats) != 5) {
        my $found = join(", ", map { "$_ = {$stats{$_}}" } keys(%stats));
        my $seen  = join("\n", @lines);
        die "Couldn't get data from swish-e index build, got " .
            $found . "\n(output was " . $seen . ")";
    }

    return %stats;
}
62
63 1;
Note: See TracBrowser for help on using the browser.