Changeset 1955 for libswish3/trunk/perl/docmaker.pl
- Timestamp:
- 11/13/07 23:31:51 (1 year ago)
- Files:
-
- libswish3/trunk/perl/docmaker.pl (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libswish3/trunk/perl/docmaker.pl
r1948 r1955 5 5 use SWISH::Prog::Headers; 6 6 use Search::Tools::XML; 7 use Term::ProgressBar; 7 8 8 9 my $usage = "$0 [max_files] [utf_factor]\n"; … … 26 27 my $utf_factor = shift @ARGV; 27 28 $utf_factor = 10 28 unless29 defined $utf_factor;# every Nth word gets converted to random UTF string29 unless 30 defined $utf_factor; # every Nth word gets converted to random UTF string 30 31 31 my $counter = 0; 32 33 my ($num_words, @words, $i, $j); 32 my ( $num_words, @words ); 34 33 35 34 binmode STDOUT, ":utf8"; … … 38 37 open DICT, "<$dict" or die "can't open $dict: $!\n"; 39 38 40 for ($num_words = 0 ; $words[$num_words] = <DICT> ; $num_words++) 41 { 39 for ( $num_words = 0; $words[$num_words] = <DICT>; $num_words++ ) { 42 40 chomp $words[$num_words]; 43 41 44 42 # utf hack: convert every Nth word up a factor of $num_words > 1 45 if ($utf_factor > 0 && !$num_words % $utf_factor) 46 { 43 if ( $utf_factor > 0 && !$num_words % $utf_factor ) { 47 44 no bytes; # so ord() and chr() work as expected 48 45 #warn ">> $num_words: $words[$num_words]\n"; 49 46 my $utf_word = ''; 50 for my $c (split(//, $words[$num_words])) 51 { 52 my $u = 53 chr(ord($c) + 30000); # 30000 puts it in Chinese range, I think... 47 for my $c ( split( //, $words[$num_words] ) ) { 48 my $u = chr( ord($c) + 30000 ) 49 ; # 30000 puts it in Chinese range, I think... 54 50 $utf_word .= $u; 55 51 } … … 64 60 srand; 65 61 66 for ($i = 0 ; $i < $max_files ; $i++) 67 { 68 my $this_file_words = 69 int(rand($max_words_per_file - $min_words_per_file + 1)) + 70 $min_words_per_file; 71 my $doc = ""; 72 for ($j = 0 ; $j < $this_file_words ; $j++) 73 { 74 $doc .= $words[int(rand($num_words - 1))] . " "; 62 my $i = 0; 63 my $progress 64 = Term::ProgressBar->new( { term_width => 80, count => $max_files } ); 65 66 # preallocate memory (doesn't really matter after all...) 67 my $doc = ' ' x ( $max_words_per_file * 10 ); 68 my $xml = $doc; 69 while ( $i++ < $max_files ) { 70 my $this_file_words 71 = int( rand( $max_words_per_file - $min_words_per_file + 1 ) ) 72 + $min_words_per_file; 73 $doc = ''; 74 my $word_cnt = 0; 75 while ( $word_cnt++ < $this_file_words ) { 76 $doc .= $words[ int( rand( $num_words - 1 ) ) ] . ' '; 75 77 } 76 78 Search::Tools::XML->escape($doc); 77 $doc = <<EOF 78 <?xml version="1.0" encoding="utf-8"?> 79 $xml = qq(<?xml version="1.0" encoding="utf-8"?> 79 80 <doc> 80 81 $doc 81 82 </doc> 82 EOF 83 ; 83 ); 84 84 85 #print SWISH::Prog::Headers->head( $doc, { url=>$counter++, mtime=>time(), mime=>'text/xml' } ) . $doc; 86 print SWISH::Prog::Headers->head( 87 $doc, 88 { 89 url => $counter++, 90 mtime => time(), 91 mime => 'text/xml' 92 } 93 ) 94 . $doc; 85 my $header = SWISH::Prog::Headers->head( 86 $xml, 87 { url => $i, 88 mtime => time(), 89 mime => 'text/xml' 90 } 91 ); 92 93 print $header, $xml; 94 95 $progress->update($i); 95 96 96 97 }
