root/libswish3/trunk/perl/docmaker.pl

Revision 2097, 2.4 kB (checked in by karpet, 4 months ago)

write header

Line 
1 #!/usr/bin/perl
2
3 use strict;
4 use warnings;
5 use SWISH::Prog::Headers;
6 use Search::Tools::XML;
7 use Term::ProgressBar;
8
9 my $usage = "$0 [max_files] [utf_factor]\n";
10
11 die $usage unless @ARGV;
12
13 my $docmaker = SWISH::Prog::Headers->new;
14
15 #$ENV{SWISH3} = 1;
16
17 # based on
18 # http://sourceforge.net/mailarchive/message.php?msg_id=10319975
19 # and modified to make UTF-8 compliant and run under 'use strict'
20
21 # Dict file with words. One word per line.
22 my $dict = "/usr/share/dict/words";
23
24 my $min_words_per_file = 3;
25 my $max_words_per_file = 9999;
26 my $max_files          = shift @ARGV;
27 die $usage if $max_files =~ m/h/i;
28
29 my $utf_factor = shift @ARGV;
30 $utf_factor = 10
31     unless
32     defined $utf_factor;  # every Nth word gets converted to random UTF string
33
34 my ( $num_words, @words );
35
36 binmode STDOUT, ":utf8";
37
38 # Load words
39 open DICT, "<$dict" or die "can't open $dict: $!\n";
40
41 for ( $num_words = 0; $words[$num_words] = <DICT>; $num_words++ ) {
42     chomp $words[$num_words];
43
44     # utf hack: convert every Nth word up a factor of $num_words > 1
45     if ( $utf_factor > 0 && !$num_words % $utf_factor ) {
46         no bytes;    # so ord() and chr() work as expected
47                      #warn ">> $num_words: $words[$num_words]\n";
48         my $utf_word = '';
49         for my $c ( split( //, $words[$num_words] ) ) {
50             my $u = chr( ord($c) + 30000 )
51                 ;    # 30000 puts it in Chinese range, I think...
52             $utf_word .= $u;
53         }
54
55         #warn "utf: $utf_word\n";
56         $words[$num_words] = $utf_word;
57     }
58 }
59
60 close DICT;
61
62 srand;
63
64 my $i = 0;
65 my $progress
66     = Term::ProgressBar->new( { term_width => 80, count => $max_files } );
67
68 # preallocate memory (doesn't really matter after all...)
69 my $doc = ' ' x ( $max_words_per_file * 10 );
70 my $xml = $doc;
71 while ( $i++ < $max_files ) {
72     my $this_file_words
73         = int( rand( $max_words_per_file - $min_words_per_file + 1 ) )
74         + $min_words_per_file;
75     $doc = '';
76     my $word_cnt = 0;
77     while ( $word_cnt++ < $this_file_words ) {
78         $doc .= $words[ int( rand( $num_words - 1 ) ) ] . ' ';
79     }
80     Search::Tools::XML->escape($doc);
81     $xml = qq(<?xml version="1.0" encoding="utf-8"?>
82 <doc>
83 $doc
84 </doc>
85 );
86
87     my $header = $docmaker->head(
88         $xml,
89         {   url   => $i,
90             mtime => time(),
91             mime  => 'text/xml'
92         }
93     );
94
95     print $header, $xml;
96
97     $progress->update($i);
98
99 }
100
101 warn "\n";
102
Note: See TracBrowser for help on using the browser.