| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
use strict; |
|---|
| 4 |
use warnings; |
|---|
| 5 |
use SWISH::Prog::Headers; |
|---|
| 6 |
use Search::Tools::XML; |
|---|
| 7 |
use Term::ProgressBar; |
|---|
| 8 |
|
|---|
| 9 |
my $usage = "$0 [max_files] [utf_factor]\n"; |
|---|
| 10 |
|
|---|
| 11 |
die $usage unless @ARGV; |
|---|
| 12 |
|
|---|
| 13 |
my $docmaker = SWISH::Prog::Headers->new; |
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
|
|---|
| 22 |
my $dict = "/usr/share/dict/words"; |
|---|
| 23 |
|
|---|
| 24 |
my $min_words_per_file = 3; |
|---|
| 25 |
my $max_words_per_file = 9999; |
|---|
| 26 |
my $max_files = shift @ARGV; |
|---|
| 27 |
die $usage if $max_files =~ m/h/i; |
|---|
| 28 |
|
|---|
| 29 |
my $utf_factor = shift @ARGV; |
|---|
| 30 |
$utf_factor = 10 |
|---|
| 31 |
unless |
|---|
| 32 |
defined $utf_factor; |
|---|
| 33 |
|
|---|
| 34 |
my ( $num_words, @words ); |
|---|
| 35 |
|
|---|
| 36 |
binmode STDOUT, ":utf8"; |
|---|
| 37 |
|
|---|
| 38 |
|
|---|
| 39 |
open DICT, "<$dict" or die "can't open $dict: $!\n"; |
|---|
| 40 |
|
|---|
| 41 |
for ( $num_words = 0; $words[$num_words] = <DICT>; $num_words++ ) { |
|---|
| 42 |
chomp $words[$num_words]; |
|---|
| 43 |
|
|---|
| 44 |
|
|---|
| 45 |
if ( $utf_factor > 0 && !$num_words % $utf_factor ) { |
|---|
| 46 |
no bytes; |
|---|
| 47 |
|
|---|
| 48 |
my $utf_word = ''; |
|---|
| 49 |
for my $c ( split( //, $words[$num_words] ) ) { |
|---|
| 50 |
my $u = chr( ord($c) + 30000 ) |
|---|
| 51 |
; |
|---|
| 52 |
$utf_word .= $u; |
|---|
| 53 |
} |
|---|
| 54 |
|
|---|
| 55 |
|
|---|
| 56 |
$words[$num_words] = $utf_word; |
|---|
| 57 |
} |
|---|
| 58 |
} |
|---|
| 59 |
|
|---|
| 60 |
close DICT; |
|---|
| 61 |
|
|---|
| 62 |
srand; |
|---|
| 63 |
|
|---|
| 64 |
my $i = 0; |
|---|
| 65 |
my $progress |
|---|
| 66 |
= Term::ProgressBar->new( { term_width => 80, count => $max_files } ); |
|---|
| 67 |
|
|---|
| 68 |
|
|---|
| 69 |
my $doc = ' ' x ( $max_words_per_file * 10 ); |
|---|
| 70 |
my $xml = $doc; |
|---|
| 71 |
while ( $i++ < $max_files ) { |
|---|
| 72 |
my $this_file_words |
|---|
| 73 |
= int( rand( $max_words_per_file - $min_words_per_file + 1 ) ) |
|---|
| 74 |
+ $min_words_per_file; |
|---|
| 75 |
$doc = ''; |
|---|
| 76 |
my $word_cnt = 0; |
|---|
| 77 |
while ( $word_cnt++ < $this_file_words ) { |
|---|
| 78 |
$doc .= $words[ int( rand( $num_words - 1 ) ) ] . ' '; |
|---|
| 79 |
} |
|---|
| 80 |
Search::Tools::XML->escape($doc); |
|---|
| 81 |
$xml = qq(<?xml version="1.0" encoding="utf-8"?> |
|---|
| 82 |
<doc> |
|---|
| 83 |
$doc |
|---|
| 84 |
</doc> |
|---|
| 85 |
); |
|---|
| 86 |
|
|---|
| 87 |
my $header = $docmaker->head( |
|---|
| 88 |
$xml, |
|---|
| 89 |
{ url => $i, |
|---|
| 90 |
mtime => time(), |
|---|
| 91 |
mime => 'text/xml' |
|---|
| 92 |
} |
|---|
| 93 |
); |
|---|
| 94 |
|
|---|
| 95 |
print $header, $xml; |
|---|
| 96 |
|
|---|
| 97 |
$progress->update($i); |
|---|
| 98 |
|
|---|
| 99 |
} |
|---|
| 100 |
|
|---|
| 101 |
warn "\n"; |
|---|
| 102 |
|
|---|