Changeset 1965
- Timestamp:
- 11/30/07 17:00:59 (6 months ago)
- Files:
-
- Swishetest/trunk/BuildIndex.pm (modified) (3 diffs)
- Swishetest/trunk/Changes (modified) (1 diff)
- Swishetest/trunk/DoSearch.pm (modified) (2 diffs)
- Swishetest/trunk/GetDictionaryWords.pm (modified) (1 diff)
- Swishetest/trunk/MinMax.pm (modified) (1 diff)
- Swishetest/trunk/NotRand.pm (modified) (1 diff)
- Swishetest/trunk/Swishetest.pm (modified) (1 diff)
- Swishetest/trunk/make_collection (modified) (10 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Swishetest/trunk/BuildIndex.pm
r1963 r1965 7 7 # returns a hash of info parsed from the indexing 8 8 sub build_index_from_directory { 9 my ($input, $index, $config, $extra_options) = @_;10 $config = "conf/basic-libxml2.conf" unless $config;11 $extra_options = "" unless $extra_options;12 my $output = `/usr/local/bin/swish-e -c $config -i '$input' -f '$index' -v 1 $extra_options`; 13 print STDERR "$0: Running '$output'\n" if $ENV{TEST_VERBOSE};14 # -v 1 is important, we use it to test the indexer15 return parse_indexing_output( $output );9 my ($input, $index, $config, $extra_options) = @_; 10 $config = "conf/basic-libxml2.conf" unless $config; 11 $extra_options = "" unless $extra_options; 12 my $output = `/usr/local/bin/swish-e -c $config -i '$input' -f '$index' -v 1 $extra_options`; 13 print STDERR "$0: Running '$output'\n" if $ENV{TEST_VERBOSE}; 14 # -v 1 is important, we use it to test the indexer 15 return parse_indexing_output( $output ); 16 16 } 17 17 … … 19 19 # returns a hash of info parsed from the indexing 20 20 sub build_index_from_external_program { 21 my ($external_program, $index, $config, $extra_options) = @_;22 $config = "conf/basic-libxml2.conf" unless $config;23 $extra_options = "" unless $extra_options;24 my $cmd = "$external_program | /usr/local/bin/swish-e -c $config -i stdin -f '$index' -v 1 -S prog $extra_options";25 # -v 1 is important, we use it to test the indexer26 print STDERR "$0: Running '$cmd'\n" if $ENV{TEST_VERBOSE};27 my $output = `$cmd`; 28 return parse_indexing_output( $output );21 my ($external_program, $index, $config, $extra_options) = @_; 22 $config = "conf/basic-libxml2.conf" unless $config; 23 $extra_options = "" unless $extra_options; 24 my $cmd = "$external_program | /usr/local/bin/swish-e -c $config -i stdin -f '$index' -v 1 -S prog $extra_options"; 25 # -v 1 is important, we use it to test the indexer 26 print STDERR "$0: Running '$cmd'\n" if $ENV{TEST_VERBOSE}; 27 my $output = `$cmd`; 28 return parse_indexing_output( $output ); 29 29 } 30 30 … … 32 32 # returns a hash of name->value pairs gleaned from the swish-e output 33 33 sub parse_indexing_output { 34 my $output = shift;35 my @output = split(/\r|\n/, $output);# both \n's and \r's are in $output.36 # yup, @output and $output.37 my %out;# the hash of index output data that we'll return38 my $numreg = '([0-9]+)';39 for(@output) {40 chomp(); 41 s/,//g;# remove all commas, they made parsing harder.34 my $output = shift; 35 my @output = split(/\r|\n/, $output); # both \n's and \r's are in $output. 36 # yup, @output and $output. 37 my %out; # the hash of index output data that we'll return 38 my $numreg = '([0-9]+)'; 39 for(@output) { 40 chomp(); 41 s/,//g; # remove all commas, they made parsing harder. 42 42 43 print "PROCESSING: $_\n" if defined($ENV{TEST_VERBOSE}) && $ENV{TEST_VERBOSE} > 1; 43 print "PROCESSING: $_\n" if defined($ENV{TEST_VERBOSE}) && $ENV{TEST_VERBOSE} > 1; 44 44 45 $out{unique} = $1if /^\s*($numreg)\s+unique\s+words?\s+indexed/;46 $out{properties} = $1if /^\s*($numreg)\s+properties/;47 $out{files} = $1 if /^\s*($numreg)\s+files?\s+indexed/;48 $out{bytes} = $1 if /\s($numreg)\s+total\s+byte/;49 $out{words} = $1 if /\s($numreg)\s+total\s+word/;50 }51 die "Couldn't get data from swish-e index build, got " .45 $out{unique} = $1 if /^\s*($numreg)\s+unique\s+words?\s+indexed/; 46 $out{properties} = $1 if /^\s*($numreg)\s+properties/; 47 $out{files} = $1 if /^\s*($numreg)\s+files?\s+indexed/; 48 $out{bytes} = $1 if /\s($numreg)\s+total\s+byte/; 49 $out{words} = $1 if /\s($numreg)\s+total\s+word/; 50 } 51 die "Couldn't get data from swish-e index build, got " . 52 52 join(", ", map { "$_ = {$out{$_}}" } keys(%out)) . "\n(output was " . join("\n", @output) . ")" 53 unless (scalar(keys(%out)) == 5);54 return %out;53 unless (scalar(keys(%out)) == 5); 54 return %out; 55 55 } 56 56 Swishetest/trunk/Changes
r1961 r1965 5 5 - updated copyright dates through 2007 6 6 0.04 Thu Nov 29 16:30:00 2007 7 - there are 5 properties now, not 48 - use Carp to explain warnings and errors7 - there are 5 properties now, not 4 8 - use Carp to explain warnings and errors 9 9 - make each .pm file an actual module. 10 - comment updates10 - comment updates 11 11 - package up for Peter Karman 12 12 0.03 Tue Apr 12 20:08:00 2005 13 - major rewrite, code made more C-like.14 - Test XML, Text and HTML parsers on complete FC1 and OSX dictionaries (t/030*)15 - t/030* respects MAX_INDEX_FILES env variable for max index size13 - major rewrite, code made more C-like. 14 - Test XML, Text and HTML parsers on complete FC1 and OSX dictionaries (t/030*) 15 - t/030* respects MAX_INDEX_FILES env variable for max index size 16 16 0.02 Tue Dec 21 10:10:00 2004 17 - started structure with dirs qw(conf data index)18 - wrote collection data/C001-trivial-txt/(a|b|c).txt19 - wrote collection data/C002-trivial-html/(a|b|c).html20 - wrote collection data/C003-trivial-xml/(a|b|c).xml21 - copied OS X's /usr/share/dict/words to data/C020-words-txt/words.txt22 - setup ./mkmanifest script to update manifest based on . and t/*.t23 - wrote Swishtest.pm ::do_search() and ::build_index(), and docs24 - wrote 000-basicload.t, 001-SWISHAPI.t,25 - wrote t/010*.t and t/010*.t, which test building indices from data/C010-trivial-txt/*26 - wrote t/011-C011-trivial-html.t, t/012-C012-trivial-xml.t, &27 t/020-C020-wordsbasic-txt.t, each for its respective C0##28 - wrote make_collection (see ./make_collection -h)29 - wrote t/030-C030-extprog-xml.t , which uses data/C030- (from make_collection)30 - wrote t/040-C040-swishedocs-html.t which indexes swish-e docs at data/C040-...17 - started structure with dirs qw(conf data index) 18 - wrote collection data/C001-trivial-txt/(a|b|c).txt 19 - wrote collection data/C002-trivial-html/(a|b|c).html 20 - wrote collection data/C003-trivial-xml/(a|b|c).xml 21 - copied OS X's /usr/share/dict/words to data/C020-words-txt/words.txt 22 - setup ./mkmanifest script to update manifest based on . and t/*.t 23 - wrote Swishtest.pm ::do_search() and ::build_index(), and docs 24 - wrote 000-basicload.t, 001-SWISHAPI.t, 25 - wrote t/010*.t and t/010*.t, which test building indices from data/C010-trivial-txt/* 26 - wrote t/011-C011-trivial-html.t, t/012-C012-trivial-xml.t, & 27 t/020-C020-wordsbasic-txt.t, each for its respective C0## 28 - wrote make_collection (see ./make_collection -h) 29 - wrote t/030-C030-extprog-xml.t , which uses data/C030- (from make_collection) 30 - wrote t/040-C040-swishedocs-html.t which indexes swish-e docs at data/C040-... 31 31 0.01 Mon Dec 20 12:42:02 2004 32 - original version; created by h2xs 1.22 with options33 -n Swishetest -v 0.01 -XA32 - original version; created by h2xs 1.22 with options 33 -n Swishetest -v 0.01 -XA 34 34 Swishetest/trunk/DoSearch.pm
r1959 r1965 5 5 use warnings; 6 6 7 my %swishes = (); # map of filename -> SWISH::API7 my %swishes = (); # map of filename -> SWISH::API 8 8 9 9 # given an index filename and a query, opens the index if needed, … … 12 12 13 13 sub open_index { 14 my $index = shift;15 if (!exists($swishes{$index})) {16 my $swish = $swishes{$index} = SWISH::API->new( $index );17 die "$0: index $index could not be opened.\n" unless $swish;18 print STDERR "Index $index opened\n" if $ENV{TEST_VERBOSE};19 }20 return 1;14 my $index = shift; 15 if (!exists($swishes{$index})) { 16 my $swish = $swishes{$index} = SWISH::API->new( $index ); 17 die "$0: index $index could not be opened.\n" unless $swish; 18 print STDERR "Index $index opened\n" if $ENV{TEST_VERBOSE}; 19 } 20 return 1; 21 21 } 22 22 23 23 sub close_index { 24 my $index = shift;25 if (exists($swishes{$index})) {26 delete $swishes{$index};# remove from the hash, should close it27 } else {28 die "$0: index $index was not open.\n";29 }24 my $index = shift; 25 if (exists($swishes{$index})) { 26 delete $swishes{$index}; # remove from the hash, should close it 27 } else { 28 die "$0: index $index was not open.\n"; 29 } 30 30 } 31 31 sub do_search { 32 my ($index, $query) = @_; 33 my @r = ();34 #return @r unless $query;35 my $swish;36 eval {37 if (exists($swishes{$index})) {38 $swish = $swishes{$index};39 } else {40 #$swish = $swishes{$index} = SWISH::API->new( $index );41 die "$0: index $index was not opened.\n";42 }43 #print STDERR "Searching for $query\n" if $ENV{TEST_VERBOSE};44 my $results = $swish->Query( $query );45 my @props = map { $_->Name } ($swish->PropertyList( $index ) );46 if ($swish->Error()) {47 #print STDERR "$0: Error searching for $query: " . $swish->ErrorString();48 return @r;49 }32 my ($index, $query) = @_; 33 my @r = (); 34 #return @r unless $query; 35 my $swish; 36 eval { 37 if (exists($swishes{$index})) { 38 $swish = $swishes{$index}; 39 } else { 40 #$swish = $swishes{$index} = SWISH::API->new( $index ); 41 die "$0: index $index was not opened.\n"; 42 } 43 #print STDERR "Searching for $query\n" if $ENV{TEST_VERBOSE}; 44 my $results = $swish->Query( $query ); 45 my @props = map { $_->Name } ($swish->PropertyList( $index ) ); 46 if ($swish->Error()) { 47 #print STDERR "$0: Error searching for $query: " . $swish->ErrorString(); 48 return @r; 49 } 50 50 51 while ( my $result = $results->NextResult() ) {52 my %h;53 for my $p (@props) { $h{$p} = $result->Property($p); }54 push(@r, \%h);55 }56 };# end eval{}57 if ($@) {58 my $str = "$0: test failed: $@";59 if ($swish && $swish->Error()) {60 $str .= " (" . $swish->ErrorString() . ")";61 }62 }63 return @r;51 while ( my $result = $results->NextResult() ) { 52 my %h; 53 for my $p (@props) { $h{$p} = $result->Property($p); } 54 push(@r, \%h); 55 } 56 }; # end eval{} 57 if ($@) { 58 my $str = "$0: test failed: $@"; 59 if ($swish && $swish->Error()) { 60 $str .= " (" . $swish->ErrorString() . ")"; 61 } 62 } 63 return @r; 64 64 } 65 65 Swishetest/trunk/GetDictionaryWords.pm
r1959 r1965 8 8 # returns ref to array of the read words and ref to hash of word->count 9 9 sub get_dictionary_words { 10 my $dict = shift;11 my ($case_sensitive) = (shift || 0); 12 my ($max_words) = (shift || 0);# 0 means don't limit13 my @words;14 my %word_count;15 # Load words. Repeats are OK16 print STDERR "Loading dictionary...\n" if $ENV{TEST_VERBOSE};17 open (my $fh, "<", $dict)|| die "$0: Couldn't open $dict: $!";18 #for($num_words = 0; $words[$num_words] = <$fh>; ) {19 while( defined( my $word = <$fh> ) && ($max_words == 0 || scalar(@words) < $max_words)) {20 $word =~ s/[-',.<>]//g;# strip stuff21 chomp $word;# strip newline22 $word =~ s/^\s+//;23 $word =~ s/\s+$//;24 if($word =~ /^$/) { 25 warn "Skipping empty non-word '$word'\n";26 next;27 }28 push(@words, $word);29 my ($counted_word) = ($case_sensitive ? $word : lc($word));30 $word_count{$counted_word}++;31 }32 close $fh || die "$0: Couldn't close $dict: $!";33 return (\@words, \%word_count);10 my $dict = shift; 11 my ($case_sensitive) = (shift || 0); 12 my ($max_words) = (shift || 0); # 0 means don't limit 13 my @words; 14 my %word_count; 15 # Load words. Repeats are OK 16 print STDERR "Loading dictionary...\n" if $ENV{TEST_VERBOSE}; 17 open (my $fh, "<", $dict)|| die "$0: Couldn't open $dict: $!"; 18 #for($num_words = 0; $words[$num_words] = <$fh>; ) { 19 while( defined( my $word = <$fh> ) && ($max_words == 0 || scalar(@words) < $max_words)) { 20 $word =~ s/[-',.<>]//g; # strip stuff 21 chomp $word; # strip newline 22 $word =~ s/^\s+//; 23 $word =~ s/\s+$//; 24 if($word =~ /^$/) { 25 warn "Skipping empty non-word '$word'\n"; 26 next; 27 } 28 push(@words, $word); 29 my ($counted_word) = ($case_sensitive ? $word : lc($word)); 30 $word_count{$counted_word}++; 31 } 32 close $fh || die "$0: Couldn't close $dict: $!"; 33 return (\@words, \%word_count); 34 34 } 35 35 Swishetest/trunk/MinMax.pm
r1959 r1965 4 4 5 5 sub max { 6 my $max = shift;7 for(@_) { $max = $_ if $_ > $max; }8 return $max;6 my $max = shift; 7 for(@_) { $max = $_ if $_ > $max; } 8 return $max; 9 9 } 10 10 sub min { 11 my $min = shift;12 for(@_) { $min = $_ if $_ < $min; }13 return $min;11 my $min = shift; 12 for(@_) { $min = $_ if $_ < $min; } 13 return $min; 14 14 } 15 15 1; Swishetest/trunk/NotRand.pm
r1959 r1965 14 14 use vars qw( $last ); 15 15 sub not_rand { 16 my $max = $_[0] || 1;# if no value is passed, we return '0' or '1'17 use integer;# is it faster not to use integer? No, it's faster to USE int.18 $last = 1 unless defined($last);19 $last = ($last*21+1); 20 # from "Advanced Perl Programming", 4.4 Using Closures.21 # We truncate to 30 bits to preclude system overflow and thereby be more portable22 # that would be 'mod 2 ** 30' (1,073,741,824), which makes sense, 4.2G over 423 $last %= 1_073_741_824;# that's 2 to the 30th24 #print "rand of $max is " . abs($last % $max) . "\n";25 return abs($last % $max);# abs isn't needed16 my $max = $_[0] || 1; # if no value is passed, we return '0' or '1' 17 use integer; # is it faster not to use integer? No, it's faster to USE int. 18 $last = 1 unless defined($last); 19 $last = ($last*21+1); 20 # from "Advanced Perl Programming", 4.4 Using Closures. 21 # We truncate to 30 bits to preclude system overflow and thereby be more portable 22 # that would be 'mod 2 ** 30' (1,073,741,824), which makes sense, 4.2G over 4 23 $last %= 1_073_741_824; # that's 2 to the 30th 24 #print "rand of $max is " . abs($last % $max) . "\n"; 25 return abs($last % $max); # abs isn't needed 26 26 } 27 27 Swishetest/trunk/Swishetest.pm
r1960 r1965 33 33 use Swishetest qw(build_index do_search); 34 34 use Data::Dumper qw(Dumper); 35 35 36 36 my %info = build_index( "input/data", "out/myindex.index"); 37 # 3rd & 4th params 'configfile' and 'extraoptions' are optional37 # 3rd & 4th params 'configfile' and 'extraoptions' are optional 38 38 print Dumper( \%info ); 39 39 open_index("myindex.index"); 40 my @rows = do_search( "myindex.index", "this is the search" ); 41 # returns a list of hashrefs40 my @rows = do_search( "myindex.index", "this is the search" ); 41 # returns a list of hashrefs 42 42 close_index("myindex.index"); 43 43 print Dumper( \@rows ); Swishetest/trunk/make_collection
r1964 r1965 19 19 my $min_words_per_file=1; 20 20 my $max_words_per_file=1; 21 my $num_files=0; # 0 means one file for each word in dictionary22 my $num_words; # should be scalar(@words)23 my $base_dir = ""; # empty base_dir means be an -S prog external program24 my $randommode = 1; 25 # in randommode, words are randomly chosen, otherwise words are sequential from the dict26 my $englishify = 0; # insert commas, periods, and caps?27 my $filetype = "xml"; # type of file to create. can also be 'html' or 'txt'21 my $num_files=0; # 0 means one file for each word in dictionary 22 my $num_words; # should be scalar(@words) 23 my $base_dir = ""; # empty base_dir means be an -S prog external program 24 my $randommode = 1; 25 # in randommode, words are randomly chosen, otherwise words are sequential from the dict 26 my $englishify = 0; # insert commas, periods, and caps? 27 my $filetype = "xml"; # type of file to create. can also be 'html' or 'txt' 28 28 my $verbose = 0; 29 29 my $progress = 1; … … 31 31 32 32 sub Usage { 33 return "make_collection: [--dict=words.txt] [--base_dir=/your/location]\n" .34 "[--min_words_per_file=#] [--max_words_per_file=#] [--num_files=#]\n" .35 "[--verbose] [--englishify] [--filetype=(txt|html|xml)] [--(no)randommode]\n" .36 " Makes a set of (possibly random) xml, html, or txt files based on a dict.\n";33 return "make_collection: [--dict=words.txt] [--base_dir=/your/location]\n" . 34 "[--min_words_per_file=#] [--max_words_per_file=#] [--num_files=#]\n" . 35 "[--verbose] [--englishify] [--filetype=(txt|html|xml)] [--(no)randommode]\n" . 36 " Makes a set of (possibly random) xml, html, or txt files based on a dict.\n"; 37 37 } 38 38 … … 67 67 my ($words, $word_counts) = GetDictionaryWords::get_dictionary_words( $dict ); 68 68 69 if ($num_files == 0) { 69 if ($num_files == 0) { 70 70 $num_files = scalar(@$words); 71 71 print STDERR "$prog: set num_files to $num_files\n" if $verbose; … … 82 82 } 83 83 #if (($i+1) % 1000 == 0) { print STDERR "** working on file $i"; } 84 my $this_file_words = # choose how many words will be in the file84 my $this_file_words = # choose how many words will be in the file 85 85 int( not_rand( $max_words_per_file - $min_words_per_file + 1 ) ) + $min_words_per_file; 86 86 my $doc =""; 87 my $toCap = 1; # should we Capitalize the coming word?87 my $toCap = 1; # should we Capitalize the coming word? 88 88 for(my $j = 0; $j < $this_file_words; $j++, $wordcounter++) 89 89 { … … 92 92 93 93 if ($toCap) { $toadd = "\u$toadd"; $toCap = 0; } 94 if (!defined($toadd)) { next; } 94 if (!defined($toadd)) { next; } 95 95 $doc .= $toadd; 96 96 if ($englishify) { 97 my $r = int(not_rand(10000)); # random number we use to plop in punctuation & line breaks97 my $r = int(not_rand(10000)); # random number we use to plop in punctuation & line breaks 98 98 if ($j == $this_file_words-1 || $r % 9 == 0) { $doc .= ". "; $toCap = 1; } 99 99 elsif ($r % 7 == 0) { $doc .= ","; } … … 107 107 $doc = simple_xmlify( $doc ); 108 108 } elsif ($filetype =~ /^html$/i) { 109 $doc = simple_htmlify( extract_title($doc), $doc ); # title, content109 $doc = simple_htmlify( extract_title($doc), $doc ); # title, content 110 110 } else { 111 111 $doc = simple_txtify( $doc ); … … 126 126 # one block of text in xml 127 127 sub simple_xmlify { 128 return qq{<?xml version="1.0" encoding="ISO-8859-1"?>\n<swishdefault>\n} .129 $_[0] . "\n</swishdefault>\n\n";128 return qq{<?xml version="1.0" encoding="ISO-8859-1"?>\n<swishdefault>\n} . 129 $_[0] . "\n</swishdefault>\n\n"; 130 130 } 131 131 132 132 # one block of text in txt 133 133 sub simple_txtify { 134 return "$_[0]\n";134 return "$_[0]\n"; 135 135 } 136 136 … … 138 138 # one block of text, with a title, in html 139 139 sub simple_htmlify { 140 my ($title, $content) = @_;141 my $html = <<EOF;140 my ($title, $content) = @_; 141 my $html = <<EOF; 142 142 <html> 143 143 <head> … … 153 153 154 154 EOF 155 return $html;155 return $html; 156 156 } 157 157 … … 172 172 # return a title up to N characters based on the first 10 words. 173 173 sub extract_title { 174 my $doc = shift;175 my $maxtitlewords = 10;176 my $maxtitlelen = 25;177 my @w = split(' ', $doc, $maxtitlewords + 1);178 my $title = ""; 179 for(my $i=0; $i < scalar(@w) && length($title)+length($w[$i]) <= $maxtitlelen; $i++) {180 $title .= "$w[$i] ";181 }182 chop($title);# remove the ' ', sloppy and simple183 return $title;174 my $doc = shift; 175 my $maxtitlewords = 10; 176 my $maxtitlelen = 25; 177 my @w = split(' ', $doc, $maxtitlewords + 1); 178 my $title = ""; 179 for(my $i=0; $i < scalar(@w) && length($title)+length($w[$i]) <= $maxtitlelen; $i++) { 180 $title .= "$w[$i] "; 181 } 182 chop($title); # remove the ' ', sloppy and simple 183 return $title; 184 184 } 185 185 186 186 # given an extension, choose a parser 187 187 sub choose_parser { 188 my $ext = $_[0];189 if ($ext =~ /^xml$/i) {190 return "XML2";191 } elsif ($ext =~ /^html?$/i) {192 return "HTML2";193 }194 return "TXT";188 my $ext = $_[0]; 189 if ($ext =~ /^xml$/i) { 190 return "XML2"; 191 } elsif ($ext =~ /^html?$/i) { 192 return "HTML2"; 193 } 194 return "TXT"; 195 195 } 196 196
