| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
|
|---|
| 6 |
|
|---|
| 7 |
use strict; |
|---|
| 8 |
use warnings; |
|---|
| 9 |
|
|---|
| 10 |
use Getopt::Long; |
|---|
| 11 |
use GetDictionaryWords; |
|---|
| 12 |
use NotRand qw(not_rand); |
|---|
| 13 |
|
|---|
| 14 |
# Program name used as the prefix of diagnostic messages.
my $prog = "make_collection";

# Word list the generated documents are built from (--dict).
my $dict='data/C020-words-txt/words-linux-fc1.txt';

# Size of the collection. Every file gets between min and max words;
# main() treats --num_files=0 as "one file per dictionary word".
my ($min_words_per_file, $max_words_per_file) = (100, 100);
my $num_files=1000;
my $num_words;                # declared here; not assigned by any code in view

# Where and how documents are produced.
my $base_dir = "";            # empty: documents are printed as swish-e prog records
my $randommode = 1;           # true: words picked via not_rand(); false: dictionary order
my $englishify = 0;           # true: add capitalisation and punctuation
my $filetype = "xml";         # one of txt, html, xml

# Reporting switches and state.
my $verbose = 0;
my $progress = 0;             # no command-line option in view sets this
my $progress_seconds = 60;    # minimum seconds between two progress lines
my $lastprogresstime = 0;     # time() of the most recent progress line
|---|
| 32 |
|
|---|
| 33 |
# Usage()
#
# Build the command-line help text. The current values of
# $min_words_per_file, $max_words_per_file and $num_files are interpolated
# into the option summary.
#
# Returns the help text as one string; nothing is printed here.
sub Usage {
    my @help_lines = (
        "make_collection: [--dict=words.txt] [--base_dir=/your/location]\n",
        " [--min_words_per_file=$min_words_per_file] [--max_words_per_file=$max_words_per_file] [--num_files=$num_files]\n",
        " [--verbose] [--englishify] [--filetype=(txt|html|xml)] [--(no)randommode]:\n",
        " Makes a set of (possibly random) xml, html, or txt files based on a dict.\n",
        " If you dont set a --base_dir, then it outputs data like a swish-e prog.\n",
    );
    return join("", @help_lines);
}
|---|
| 40 |
|
|---|
| 41 |
main();

# main()
#
# Entry point. Parses the command line into the file-level option variables,
# loads the dictionary, then generates $num_files documents of between
# $min_words_per_file and $max_words_per_file words each.
#
# Each document is wrapped according to $filetype (xml, html/htm, or txt).
# With --base_dir it is written to "$base_dir/<n>.$filetype"; without it the
# document is printed on STDOUT as a swish-e external-program record.
# Because STDOUT may be that document stream, all diagnostics go to STDERR.
#
# Dies on bad options, an unknown file type, max < min words per file,
# an empty dictionary, or any failure to open/write/close an output file.
sub main {
    GetOptions(
        "min_words_per_file=i" => \$min_words_per_file,
        "max_words_per_file=i" => \$max_words_per_file,
        "num_files=i"          => \$num_files,
        "base_dir=s"           => \$base_dir,
        "dict=s"               => \$dict,
        "englishify!"          => \$englishify,
        "randommode!"          => \$randommode,
        "filetype=s"           => \$filetype,
        "verbose!"             => \$verbose
    ) || die Usage();

    die "$prog: Error: Filetype '$filetype' not understood\n" . Usage()
        unless $filetype =~ /^(txt|xml|html?)$/i;
    if ($verbose) {
        warn "$prog: Warning: No --base_dir option, running as swish-e external program\n" unless $base_dir;
    }

    my $parser = choose_parser($filetype);

    # Equal values are fine (the defaults are both 100); only max < min is an error.
    if ($max_words_per_file < $min_words_per_file) {
        die "$prog: max_words_per_file must not be smaller than min_words_per_file";
    }

    my ($words, $word_counts) = GetDictionaryWords::get_dictionary_words( $dict );

    # Without this guard the sequential mode dies with "Illegal modulus zero"
    # and the random mode silently emits whitespace-only documents.
    die "$prog: Error: no words loaded from dictionary '$dict'\n"
        unless $words && @$words;
    my $dict_size = scalar(@$words);

    if ($num_files == 0) {
        $num_files = $dict_size;
        print STDERR "$prog: set num_files to $num_files\n" if $verbose;
    }

    print STDERR "$0: Outputting $num_files files...\n" if $ENV{TEST_VERBOSE};

    my $wordcounter = 0;
    # STDERR, not STDOUT: STDOUT carries the documents when --base_dir is unset.
    print STDERR "Creating files...\n" if $verbose;
    for (my $i = 0; $i < $num_files; $i++) {
        # NOTE(review): no option in view ever sets $progress, so this
        # report is currently unreachable from the command line.
        if ($i && $progress && time() - $lastprogresstime >= $progress_seconds) {
            my $percent = sprintf("%1.1f", $i / $num_files * 100);
            print STDERR "$prog: $filetype: on file $i of $num_files ($percent%)\n";
            $lastprogresstime = time();
        }

        my $this_file_words =
            int( not_rand( $max_words_per_file - $min_words_per_file + 1 ) ) + $min_words_per_file;
        my $doc = "";
        my $toCap = 1;    # true when the next word starts a sentence
        for (my $j = 0; $j < $this_file_words; $j++, $wordcounter++) {
            my $toadd = $randommode
                ? $$words[ not_rand( $dict_size ) ]
                : $$words[ $wordcounter % $dict_size ];

            # Test definedness BEFORE capitalising: "\u$toadd" would turn
            # undef into "" (with a warning) and defeat the check.
            next unless defined $toadd;

            if ($englishify && $toCap) { $toadd = "\u$toadd"; $toCap = 0; }
            $doc .= $toadd;
            if ($englishify) {
                # One draw per word decides the punctuation and the line break.
                my $r = int(not_rand(10000));
                if ($j == $this_file_words-1 || $r % 9 == 0) { $doc .= ". "; $toCap = 1; }
                elsif ($r % 7 == 0) { $doc .= ","; }
                if (($j+$i+$r+1) % 5) { $doc .= " "; } else { $doc .= "\n"; }
            } else {
                # Plain mode: seven words per line.
                $doc .= ($j+1) % 7 ? " " : "\n";
            }
        }

        if ($filetype =~ /^xml$/i) {
            $doc = simple_xmlify( $doc );
        } elsif ($filetype =~ /^html?$/i) {
            # "htm" passes validation above and maps to the HTML2 parser in
            # choose_parser(), so it must get HTML markup here as well.
            $doc = simple_htmlify( extract_title($doc), $doc );
        } else {
            $doc = simple_txtify( $doc );
        }

        if ($base_dir) {
            my $path = "$base_dir/$i.$filetype";
            open(my $out, ">", $path) || die "$prog: Couldn't open $path: $!";
            print {$out} $doc or die "$prog: Couldn't write $path: $!";
            close($out) || die "$prog: Couldn't close $path: $!";
            print STDERR "$prog: created $path...\n" if ($verbose && $i % 1000 == 0);
        } else {
            # NOTE(review): the swish-e -S prog documentation describes
            # Last-Mtime as an epoch timestamp, yet a localtime() string is
            # sent here. Left as is because changing it alters the generated
            # output -- confirm the intent before touching it.
            simple_swishe_progify($parser, "$i.$filetype", $doc, scalar(localtime(time())));
        }
    }
}
|---|
| 128 |
|
|---|
| 129 |
|
|---|
| 130 |
# simple_xmlify($text)
#
# Wrap $text in an XML declaration (ISO-8859-1) and a <swishdefault> root
# element. The text is inserted verbatim; no escaping is applied.
#
# Returns the complete XML document as a string.
sub simple_xmlify {
    my ($body) = @_;
    my $prolog = qq{<?xml version="1.0" encoding="ISO-8859-1"?>\n<swishdefault>\n};
    my $epilog = "\n</swishdefault>\n\n";
    return $prolog . $body . $epilog;
}
|---|
| 135 |
|
|---|
| 136 |
|
|---|
| 137 |
# simple_txtify($text)
#
# Plain-text "wrapper": returns $text followed by a single newline.
sub simple_txtify {
    my ($text) = @_;
    return $text . "\n";
}
|---|
| 140 |
|
|---|
| 141 |
|
|---|
| 142 |
|
|---|
| 143 |
# simple_htmlify($title, $content)
#
# Build a minimal HTML page: a <head> holding a Content-Type meta tag and
# <title>$title</title>, and a <body> holding $content. Both values are
# inserted verbatim on their own lines, with no escaping.
#
# Returns the page as a string ending in a blank line.
sub simple_htmlify {
    my ($title, $content) = @_;
    my @page_lines = (
        '<html>',
        '<head>',
        '<meta http-equiv="Content-Type" content="text/html" />',
        '<title>',
        $title,
        '</title>',
        '</head>',
        '<body>',
        $content,
        '</body>',
        '</html>',
        '',            # trailing blank line
    );
    my $html = join('', map { "$_\n" } @page_lines);
    return $html;
}
|---|
| 161 |
|
|---|
| 162 |
# simple_swishe_progify($doc_type, $path_name, $body, $mtime)
#
# Print one document record to the currently selected output handle:
# four header lines (Content-Length, Last-Mtime, Path-Name, Document-Type),
# a blank line, then $body. Content-Length is length($body).
#
# Returns whatever print returns.
sub simple_swishe_progify {
    my ($doc_type, $path_name, $body, $mtime) = @_;
    my $content_length = length($body);
    my $header = "Content-Length: $content_length\n"
               . "Last-Mtime: $mtime\n"
               . "Path-Name: $path_name\n"
               . "Document-Type: $doc_type\n"
               . "\n";
    print $header, $body;
}
|---|
| 176 |
|
|---|
| 177 |
|
|---|
| 178 |
|
|---|
| 179 |
# extract_title($doc)
#
# Derive a short title from the leading words of $doc.
#
# The title is made of whole whitespace-separated words taken from the start
# of $doc, limited to at most 10 words and at most 25 characters. Collection
# stops at the first word that would not fit.
#
# Returns the title without trailing space; "" when $doc is empty/undef or
# its first word alone is longer than the length limit.
sub extract_title {
    my $doc = shift;
    $doc = "" unless defined $doc;

    my $maxtitlewords = 10;
    my $maxtitlelen   = 25;

    # Split with limit N+1 so the unsplit remainder of the document lands in
    # one extra trailing element instead of the whole text being tokenised.
    my @w = split(' ', $doc, $maxtitlewords + 1);

    # Discard that remainder: it is not a single word and must never be
    # appended, otherwise the title could exceed $maxtitlewords words.
    $#w = $maxtitlewords - 1 if @w > $maxtitlewords;

    my $title = "";
    for my $word (@w) {
        # length($title) already counts the separator after the last word.
        last if length($title) + length($word) > $maxtitlelen;
        $title .= "$word ";
    }
    chop($title);    # remove the trailing separator (no-op on "")
    return $title;
}
|---|
| 191 |
|
|---|
| 192 |
|
|---|
| 193 |
# choose_parser($ext)
#
# Map a file type to the parser name sent in the Document-Type header:
# "xml" gives "XML2", "htm"/"html" gives "HTML2" (both case-insensitive),
# and anything else gives "TXT".
sub choose_parser {
    my ($ext) = @_;
    return "XML2"  if $ext =~ /^xml$/i;
    return "HTML2" if $ext =~ /^html?$/i;
    return "TXT";
}
|---|
| 202 |
|
|---|