root/Swishetest/trunk/make_collection

Revision 2092, 6.7 kB (checked in by joshr, 6 months ago)

If TEST_VERBOSE is set, print to STDERR the number of files
we're going to create.

  • Property svn:executable set to
Line 
1 #!/usr/bin/perl -w
2 #$Id: make_collection,v 1.10 2007/11/29 23:57:40 joshr Exp $
3 ## Copyright 2004-2007 Josh Rabinowitz
4
5 # script to create random collections for swish-e from a file like /usr/dict/words (one word per line)
6
7 use strict;
8 use warnings;
9
10 use Getopt::Long;
11 use GetDictionaryWords;
12 use NotRand qw(not_rand);
13
# File-level settings. Most of these are overridden from the command line by
# GetOptions() in main(); the values here are the defaults.

# Name used as the prefix of the diagnostics this script prints.
my $prog = "make_collection";

# Dict file with words. One word per line.
my $dict='data/C020-words-txt/words-linux-fc1.txt';     # 45,000 word dictionary (408K)

my $min_words_per_file=100;
my $max_words_per_file=100;
my $num_files=1000;    # 0 means one file for each word in dictionary
my $num_words;      # should be scalar(@words)
my $base_dir = "";  # empty base_dir means be an -S prog external program
my $randommode = 1;
    # in randommode, words are randomly chosen, otherwise words are sequential from the dict
my $englishify = 0; # insert commas, periods, and caps?
my $filetype = "xml";   # type of file to create. can also be 'html' or 'txt'
my $verbose = 0;
my $progress = 0;           # when true, main() prints periodic progress lines to STDERR
my $progress_seconds = 60;  # minimum number of seconds between two progress lines
my $lastprogresstime = 0;   # time() of the most recent progress line
32
# Build the command-line help text.
#
# Returns the usage message as a single newline-terminated string. The
# current values of $min_words_per_file, $max_words_per_file and $num_files
# are interpolated so the message shows the settings in effect.
sub Usage {
    my @help_lines = (
        "make_collection: [--dict=words.txt] [--base_dir=/your/location]",
        "  [--min_words_per_file=$min_words_per_file] [--max_words_per_file=$max_words_per_file] [--num_files=$num_files]",
        "  [--verbose] [--englishify] [--filetype=(txt|html|xml)] [--(no)randommode]:",
        "   Makes a set of (possibly random) xml, html, or txt files based on a dict.",
        "   If you dont set a --base_dir, then it outputs data like a swish-e prog.",
    );
    return join("\n", @help_lines) . "\n";
}
40
41 main();
42
# Entry point: parse the command line, load the dictionary and generate
# $num_files documents.
#
# With --base_dir each document is written to "$base_dir/<n>.<filetype>".
# Without it the script behaves as a swish-e external ("prog") program and
# writes every document, preceded by its headers, to STDOUT. Because STDOUT
# is the data stream in that mode, all diagnostics go to STDERR.
#
# Dies with a message (and, for option errors, the usage text) when the
# options are invalid, the dictionary yields no words, or an output file
# cannot be opened or closed.
sub main {
    GetOptions(
        "min_words_per_file=i" => \$min_words_per_file,
        "max_words_per_file=i" => \$max_words_per_file,
        "num_files=i" => \$num_files,
        "base_dir=s" => \$base_dir,
        "dict=s" => \$dict,
        "englishify!" => \$englishify,
        "randommode!" => \$randommode,
        "filetype=s" => \$filetype,
        "verbose!"    => \$verbose
    ) || die Usage();

    die "$prog: Error: Filetype '$filetype' not understood\n" . Usage() unless $filetype =~ /^(txt|xml|html?)$/i;
    if ($verbose) {
        warn "$prog: Warning: No --base_dir option, running as swish-e external program\n" unless $base_dir;
    }

    my $parser = choose_parser($filetype);

    # Equal bounds are valid (that is the default), so only max < min is an error.
    if ($max_words_per_file < $min_words_per_file) {
        die "$prog: max_words_per_file must not be smaller than min_words_per_file\n";
    }

    # ref to wordlist (the count hash that is also returned is not needed here)
    my ($words) = GetDictionaryWords::get_dictionary_words( $dict );
    my $dict_size = $words ? scalar(@$words) : 0;

    # Without words there is nothing to pick from, and the sequential mode
    # below would otherwise fail with "Illegal modulus zero".
    die "$prog: Error: no words found in dictionary '$dict'\n" unless $dict_size;

    if ($num_files == 0) {
        $num_files = $dict_size;
        print STDERR "$prog: set num_files to $num_files\n" if $verbose;
    }

    print STDERR "$0: Outputting $num_files files...\n" if $ENV{TEST_VERBOSE};

    my $wordcounter = 0;
    # STDERR, not STDOUT: in external-program mode STDOUT carries the documents.
    print STDERR "Creating files...\n" if $verbose;
    for my $i (0 .. $num_files - 1) {
        # NOTE: $progress is not set by any option parsed above, so this
        # branch only runs if it is enabled in the source.
        if ($i && $progress && time() - $lastprogresstime >= $progress_seconds) {
            my $percent = sprintf("%1.1f", $i / $num_files * 100);
            print STDERR "$prog: $filetype: on file $i of $num_files ($percent%)\n";
            $lastprogresstime = time();
        }

        my $this_file_words =   # choose how many words will be in the file
            int( not_rand( $max_words_per_file - $min_words_per_file + 1 ) ) + $min_words_per_file;
        my $doc = "";
        my $toCap = 1;  # should we Capitalize the coming word?

        # C-style loop on purpose: $wordcounter must advance even when a
        # word is skipped with 'next'.
        for (my $j = 0; $j < $this_file_words; $j++, $wordcounter++) {
            # choose the next word, either randomly, or sequentially
            my $toadd = $randommode
                ? $$words[ not_rand( $dict_size ) ]
                : $$words[ $wordcounter % $dict_size ];

            # Check before capitalizing: "\u" on undef would yield a defined
            # empty string and the word would no longer be skipped.
            next unless defined $toadd;

            if ($englishify && $toCap) { $toadd = "\u$toadd"; $toCap = 0; }
            $doc .= $toadd;
            if ($englishify) {
                my $r = int(not_rand(10000));   # random number we use to plop in punctuation & line breaks
                if ($j == $this_file_words-1 || $r % 9 == 0) { $doc .= ". "; $toCap = 1; }
                elsif ($r % 7 == 0) { $doc .= ","; }
                if (($j+$i+$r+1) % 5) { $doc .= " "; } else { $doc .= "\n"; }
            } else {
                $doc .= ($j+1) % 7 ? " " : "\n";
            }
        }

        # Wrap the raw text for the requested type. 'htm' is accepted by the
        # validation above and mapped to the HTML parser, so it must be
        # wrapped as HTML here as well.
        if ($filetype =~ /^xml$/i) {
            $doc = simple_xmlify( $doc );
        } elsif ($filetype =~ /^html?$/i) {
            $doc = simple_htmlify( extract_title($doc), $doc ); # title, content
        } else {
            $doc = simple_txtify( $doc );
        }

        if ($base_dir) {
            my $path = "$base_dir/$i.$filetype";
            open(my $out, ">", $path) or die "$prog: Couldn't open $path: $!";
            print {$out} $doc;
            close($out) or die "$prog: Couldn't close $path: $!";
            print STDERR "$prog: created $path...\n" if ($verbose && $i % 1000 == 0);
        } else {
            # act like a swish-e external program. This prints directly to stdout.
            # Last-Mtime is passed as epoch seconds, which is what swish-e
            # documents for that header, rather than a localtime() string.
            simple_swishe_progify($parser, "$i.$filetype", $doc, time());
        }
    }
}
128
# Wrap one block of text in a minimal XML document.
#
# Takes the text as its only argument and returns it enclosed in a single
# <swishdefault> element, preceded by an XML declaration and followed by a
# blank line. The text is inserted as-is (no escaping).
sub simple_xmlify {
    my ($text) = @_;

    # we should test with other encodings. This tests with ISO-8859-1
    my $declaration = qq{<?xml version="1.0" encoding="ISO-8859-1"?>\n};

    return join("", $declaration, "<swishdefault>\n", $text, "\n</swishdefault>\n\n");
}
135
# Turn one block of text into a plain-text document.
#
# Takes the text as its only argument and returns it with a single newline
# appended.
sub simple_txtify {
    my ($text) = @_;
    return "$text\n";
}
140
141
# Wrap one block of text, with a title, in a minimal HTML page.
#
# Arguments: ($title, $content). Both are inserted as-is (no escaping).
# Returns the page as one string: the title on its own indented line inside
# <title>, the content inside <body>, and a whitespace-only line after the
# closing </html> tag.
sub simple_htmlify {
    my ($title, $content) = @_;

    my @page_lines = (
        '<html>',
        '<head>',
        '<meta http-equiv="Content-Type" content="text/html" />',
        '<title>',
        "    $title",
        '</title>',
        '</head>',
        '<body>',
        $content,
        '</body>',
        '</html>',
        ' ' x 11,   # the page ends with a line that holds only spaces
    );

    return join("\n", @page_lines) . "\n";
}
161
# Print one document to the currently selected output handle in the format
# of a swish-e external ("prog") program: four header lines, a blank line,
# then the document body.
#
# Arguments, in order:
#   $parser   - value written as the Document-Type header
#   $path     - value written as the Path-Name header
#   $content  - the document body; its length() is the Content-Length header
#   $lasttime - value written verbatim as the Last-Mtime header
#
# Returns whatever print returns.
sub simple_swishe_progify {
    my ($parser, $path, $content, $lasttime) = @_;

    my @header_lines = (
        "Content-Length: " . length($content),
        "Last-Mtime: $lasttime",
        "Path-Name: $path",
        "Document-Type: $parser",
    );

    # The empty line after the headers separates them from the body.
    my $header = join("\n", @header_lines) . "\n\n";

    print $header, $content;
}
176
# Given a document, return a short title built from its leading words.
#
# The title is made of the first words of $doc, joined by single spaces. It
# uses at most 10 words and stops at the first word that would push the
# running length (each word counted with one separator) past 25 characters.
# Returns "" for an undefined, empty or whitespace-only document, or when
# the first word alone is too long.
sub extract_title {
    my $doc = shift;
    my $maxtitlewords = 10;
    my $maxtitlelen = 25;

    return "" unless defined $doc && length $doc;

    # Split off one field more than needed so a long document is not split
    # in full. That extra field is the unsplit remainder, so drop it.
    my @words = split(' ', $doc, $maxtitlewords + 1);
    $#words = $maxtitlewords - 1 if @words > $maxtitlewords;

    # With a positive limit, split keeps a trailing empty field when the
    # document ends in whitespace; it is not a word.
    @words = grep { length } @words;

    my @picked;
    my $used = 0;   # length of the picked words, each followed by one space
    for my $word (@words) {
        last if $used + length($word) > $maxtitlelen;
        push @picked, $word;
        $used += length($word) + 1;
    }

    return join(' ', @picked);
}
191
# Map a file extension to the name of the parser reported for it.
#
# Takes the extension (matched case-insensitively) and returns "XML2" for
# xml, "HTML2" for htm or html, and "TXT" for anything else.
sub choose_parser {
    my ($extension) = @_;

    return $extension =~ /^xml$/i   ? "XML2"
         : $extension =~ /^html?$/i ? "HTML2"
         :                            "TXT";
}
202
Note: See TracBrowser for help on using the browser.