root/libswish3/trunk/src/xapian/swish_xapian.cpp

Revision 2129, 17.7 kB (checked in by karpet, 5 months ago)

use our string_to_int instead of strtol() directly

Line 
1 /*
2  * This file is part of libswish3
3  * Copyright (C) 2008 Peter Karman
4  *
5  *  libswish3 is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  libswish3 is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with libswish3; if not, write to the Free Software
17  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18  */
19
20 /* 
21     example Swish3 program using Xapian IR backend.
22     many of the string conversion functions and the index_document() code
23     come nearly verbatim from the xapian-omega distribution.
24
25 */
26
27 #include <algorithm>
28 #include <fstream>
29 #include <iostream>
30 #include <string>
31 #include <map>
32 #include <vector>
33
34 #include <time.h>
35
36 #include <sys/types.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <stdarg.h>
40 #include <err.h>
41 #include <string.h>
42 #include <wctype.h>
43 #include <ctype.h>
44 #include <getopt.h>
45
46 #include <xapian.h>
47
48 #include <libxml/hash.h>
49 #include "libswish3.h"
50
51 using namespace std;
52
53 /* prototypes */
54 int main(
55     int argc,
56     char **argv
57 );
58 int usage(
59 );
60 void handler(
61     swish_ParserData *parser_data
62 );
63 int open_writeable_index(
64     char *dbpath
65 );
66 int open_readable_index(
67     char *dbpath
68 );
69 int search(
70     char *query
71 );
72
73 /* global vars */
74 static int debug = 0;
75 static
76     Xapian::WritableDatabase
77     wdb;
78 static
79     Xapian::Database::Database
80     rdb;
81 static
82     Xapian::Stem
83 stemmer(
84     "english"
85 );                              // TODO make this configurable
86 static
87     Xapian::TermGenerator
88     indexer;
89 static int
90     twords = 0;
91 static int
92     skip_duplicates = 0;
93 static int
94     overwrite = 0;
95 static
96     vector <
97     bool >
98     updated;
99 static swish_3 *
100     s3;
101
102 extern int
103     SWISH_DEBUG;
104
105 static struct option
106     longopts[] = {
107     {"config", required_argument, 0, 'c'},
108     {"debug", required_argument, 0, 'd'},
109     {"help", no_argument, 0, 'h'},
110     {"index", required_argument, 0, 'i'},
111     {"skip-duplicates", no_argument, 0, 's'},
112     {"overwrite", no_argument, 0, 'o'},
113     {"query", required_argument, 0, 'q'},
114     {0, 0, 0, 0}
115 };
116
// Size of the scratch buffer used by CONVERT_TO_STRING; meant to be large
// enough for every conversion that uses the macro.
#define BUFSIZE 100

// CONVERT_TO_STRING(FMT) expands to the complete body of a conversion
// function.  The enclosing function must have a parameter named `val`; it is
// formatted with the printf-style FMT and the text is returned as a string.
// Note that the expansion contains the `return` statements.
#ifdef SNPRINTF
// Variant used when the build defines SNPRINTF: the write is bounded by
// BUFSIZE, and the whole buffer is returned when SNPRINTF reports -1 or a
// length above BUFSIZE.
#define CONVERT_TO_STRING(FMT) \
    char buf[BUFSIZE]; \
    int len = SNPRINTF(buf, BUFSIZE, (FMT), val); \
    if (len != -1 && len <= BUFSIZE) \
        return std::string(buf, len); \
    return std::string(buf, BUFSIZE);
#else
// Fallback variant: sprintf() cannot be bounded, so the last byte of the
// buffer is used as a sentinel and the process aborts if it was overwritten.
#define CONVERT_TO_STRING(FMT) \
    char buf[BUFSIZE]; \
    buf[BUFSIZE - 1] = '\0'; \
    sprintf(buf, (FMT), val); \
    if (buf[BUFSIZE - 1] != '\0') \
        abort(); \
    return std::string(buf);
#endif
134
/*
 * Convert the leading decimal number in s to an int by handing its C string
 * to atoi().  Text with no leading number converts to 0.
 */
int
string_to_int(
    const std::string & s
)
{
    const char *text = s.c_str();
    const int value = atoi(text);
    return value;
}
142
/*
 * Format val as decimal text ("%d").
 *
 * Uses a bounded snprintf() directly rather than the CONVERT_TO_STRING
 * macro, whose fallback path formats with an unbounded sprintf() and whose
 * expansion hides the function's return statement.
 */
std::string
int_to_string(
    int val
)
{
    char buf[32];               // ample for any int, sign included
    const int len = snprintf(buf, sizeof(buf), "%d", val);
    if (len < 0 || len >= (int)sizeof(buf))
        abort();                // formatting failed or did not fit
    return std::string(buf, len);
}
150
/*
 * Format val as decimal text ("%ld").
 *
 * Uses a bounded snprintf() directly rather than the CONVERT_TO_STRING
 * macro, whose fallback path formats with an unbounded sprintf() and whose
 * expansion hides the function's return statement.
 */
std::string
long_to_string(
    long val
)
{
    char buf[32];               // ample for a 64-bit long, sign included
    const int len = snprintf(buf, sizeof(buf), "%ld", val);
    if (len < 0 || len >= (int)sizeof(buf))
        abort();                // formatting failed or did not fit
    return std::string(buf, len);
}
158
/*
 * Format val in fixed notation with six decimals ("%f").
 *
 * "%f" output for a large double runs to more than 300 characters, so a
 * fixed 100-byte buffer is not enough.  The value is formatted into a small
 * stack buffer first; if snprintf() reports a longer result, it is
 * formatted again into a buffer of exactly the required size.
 */
std::string
double_to_string(
    double val
)
{
    char small[64];
    const int len = snprintf(small, sizeof(small), "%f", val);
    if (len < 0)
        abort();                // formatting error
    if (len < (int)sizeof(small))
        return std::string(small, len);

    // Did not fit: len is the length that is needed (without the NUL).
    std::vector<char> big(len + 1);
    snprintf(&big[0], big.size(), "%f", val);
    return std::string(&big[0], len);
}
166
/*
 * Build an 8-character "YYYYMMDD" string from a year, month and day.
 *
 * Out-of-range parts are clamped rather than rejected: the year to
 * 0..9999, the month to 1..12 and the day to 1..31.  (The day is not
 * checked against the length of the month.)
 *
 * The write is bounded by the real size of the local buffer.  The previous
 * code compared against, and copied, BUFSIZE (100) bytes from this 11-byte
 * buffer on the SNPRINTF error path.
 */
std::string
date_to_string(
    int y,
    int m,
    int d
)
{
    y = (y < 0) ? 0 : (y > 9999 ? 9999 : y);
    m = (m < 1) ? 1 : (m > 12 ? 12 : m);
    d = (d < 1) ? 1 : (d > 31 ? 31 : d);

    char buf[11];               // 8 digits + NUL, with a little slack
    const int len = snprintf(buf, sizeof(buf), "%04d%02d%02d", y, m, d);
    if (len < 0 || len >= (int)sizeof(buf))
        abort();                // cannot happen after clamping
    return std::string(buf, len);
}
202
/*
 * Decode a 4-byte big-endian (network order) string into a uint32_t.
 * Returns (uint32_t)-1 when s is not exactly four bytes long.
 *
 * The bytes are combined explicitly, so the result does not depend on the
 * host byte order and the function does not need ntohl(), whose header
 * this file does not include.
 */
inline
    uint32_t
binary_string_to_int(
    const std::string & s
)
{
    if (s.size() != 4)
        return (uint32_t) - 1;
    const unsigned char *b = reinterpret_cast < const unsigned char *>(s.data());
    return ((uint32_t)b[0] << 24)
        | ((uint32_t)b[1] << 16)
        | ((uint32_t)b[2] << 8)
        | (uint32_t)b[3];
}
215
/*
 * Encode v as a 4-byte big-endian (network order) string, most significant
 * byte first.
 *
 * The bytes are extracted explicitly, so the result does not depend on the
 * host byte order and the function does not need htonl(), whose header
 * this file does not include.
 */
inline std::string
int_to_binary_string(
    uint32_t v
)
{
    char bytes[4];
    bytes[0] = (char)((v >> 24) & 0xff);
    bytes[1] = (char)((v >> 16) & 0xff);
    bytes[2] = (char)((v >> 8) & 0xff);
    bytes[3] = (char)(v & 0xff);
    return std::string(bytes, 4);
}
224
225 static
226     string
227 get_prefix(
228     xmlChar *metaname,
229     swish_Config *config
230 )
231 {
232     string
233         prefix;
234     swish_MetaName *
235         meta = (swish_MetaName *)swish_hash_fetch(config->metanames, metaname);
236     prefix = int_to_string(meta->id);
237     return prefix + string((const char *)":");
238 }
239
240 static void
241 add_prefix(
242     swish_MetaName *meta,
243     Xapian::QueryParser qp,
244     xmlChar *name
245 )
246 {
247     qp.add_prefix(string((const char *)name),
248                   int_to_string(meta->id) + string((const char *)":"));
249 }
250
251 static unsigned int
252 get_weight(
253     xmlChar *metaname,
254     swish_Config *config
255 )
256 {
257     unsigned int
258         w;
259     swish_MetaName *
260         meta = (swish_MetaName *)swish_hash_fetch(config->metanames, metaname);
261     return meta->bias > 0 ? meta->bias : 1;     // TODO need to account for negative values.
262 }
263
264 static void
265 add_metanames(
266     xmlBufferPtr buffer,
267     void *config,
268     xmlChar *metaname
269 )
270 {
271     // lookup weight and prefix
272     string
273         prefix = get_prefix(metaname, (swish_Config *)config);
274     unsigned int
275         weight = get_weight(metaname, (swish_Config *)config);
276     indexer.index_text((const char *)xmlBufferContent(buffer), weight, prefix);
277     // index swishdefault and swishtitle without any prefix too
278     if (xmlStrEqual(metaname, BAD_CAST SWISH_DEFAULT_METANAME)
279         || xmlStrEqual(metaname, BAD_CAST SWISH_TITLE_METANAME)
280         ) {
281         indexer.index_text((const char *)xmlBufferContent(buffer), weight);
282     }
283 }
284
285 static void
286 add_properties(
287     xmlBufferPtr buffer,
288     Xapian::Document doc,
289     xmlChar *name
290 )
291 {
292     swish_Property *
293         prop;
294     prop = (swish_Property *)swish_hash_fetch(s3->config->properties, name);
295     //SWISH_DEBUG_MSG("adding property %s [%d]: %s", name, prop->id,
296     //                xmlBufferContent(buffer));
297     doc.add_value(prop->id, (const char *)xmlBufferContent(buffer));
298 }
299
300 void
301 handler(
302     swish_ParserData *parser_data
303 )
304 {
305     //printf("nwords: %d\n", parser_data->docinfo->nwords);
306
307     twords += parser_data->docinfo->nwords;
308
309     if (SWISH_DEBUG & SWISH_DEBUG_DOCINFO) {
310         swish_debug_docinfo(parser_data->docinfo);
311     }
312     if (SWISH_DEBUG & SWISH_DEBUG_WORDLIST) {
313         swish_debug_wordlist(parser_data->wordlist);
314     }
315     if (SWISH_DEBUG & SWISH_DEBUG_NAMEDBUFFER) {
316         swish_debug_nb(parser_data->properties, BAD_CAST "Property");
317         swish_debug_nb(parser_data->metanames, BAD_CAST "MetaName");
318     }
319
320     // Put the data in the document
321     Xapian::Document newdocument;
322     xmlChar *
323         title = BAD_CAST swish_nb_get_value(parser_data->properties,
324                                               BAD_CAST SWISH_PROP_TITLE);
325     //printf("title = %s", (char *)title);
326     string
327         unique_id = SWISH_PREFIX_URL + string((const char *)parser_data->docinfo->uri);
328     string
329         record = "url=" + string((const char *)parser_data->docinfo->uri);
330     record += "\ntitle=" + string((const char *)title);
331     record += "\ntype=" + string((const char *)parser_data->docinfo->mime);
332     record += "\nmodtime=" + long_to_string(parser_data->docinfo->mtime);
333     record += "\nsize=" + long_to_string(parser_data->docinfo->size);
334     newdocument.set_data(record);
335
336     // Index the title, document text, and keywords.
337     indexer.set_document(newdocument);
338     indexer.increase_termpos(100);
339     newdocument.add_term(SWISH_PREFIX_MTIME +
340                          long_to_string(parser_data->docinfo->mtime));
341     newdocument.add_term(unique_id);
342
343     struct tm *
344         tm = localtime(&(parser_data->docinfo->mtime));
345     string
346         date_term = "D" + date_to_string(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
347     newdocument.add_term(date_term);    // Date (YYYYMMDD)
348     date_term.resize(7);
349     date_term[0] = 'M';
350     newdocument.add_term(date_term);    // Month (YYYYMM)
351     date_term.resize(5);
352     date_term[0] = 'Y';
353     newdocument.add_term(date_term);    // Year (YYYY)
354
355     // add all docinfo values as properties
356     newdocument.add_value(SWISH_PROP_MTIME_ID,
357                           long_to_string(parser_data->docinfo->mtime));
358     newdocument.add_value(SWISH_PROP_DOCPATH_ID,
359                           string((const char *)parser_data->docinfo->uri));
360     newdocument.add_value(SWISH_PROP_SIZE_ID, long_to_string(parser_data->docinfo->size));
361     newdocument.add_value(SWISH_PROP_MIME_ID,
362                           string((const char *)parser_data->docinfo->mime));
363     newdocument.add_value(SWISH_PROP_PARSER_ID,
364                           string((const char *)parser_data->docinfo->parser));
365     newdocument.add_value(SWISH_PROP_NWORDS_ID,
366                           long_to_string(parser_data->docinfo->nwords));
367
368     // title is special value
369     newdocument.add_value(SWISH_PROP_TITLE_ID, string((const char *)title));
370
371     // add all metanames and properties
372     xmlHashScan(parser_data->metanames->hash, (xmlHashScanner)add_metanames, s3->config);
373     xmlHashScan(parser_data->properties->hash, (xmlHashScanner)add_properties, &newdocument);
374
375     if (!skip_duplicates) {
376         // If this document has already been indexed, update the existing
377         // entry.
378         try {
379             Xapian::docid did = wdb.replace_document(unique_id, newdocument);
380             if (did < updated.size()) {
381                 updated[did] = true;
382                 cout << "        .... updated." << endl;
383             }
384             else {
385                 cout << "        .... added." << endl;
386             }
387         }
388         catch(...) {
389             // FIXME: is this ever actually needed?
390             wdb.add_document(newdocument);
391             cout << "added (failed re-seek for duplicate)." << endl;
392         }
393     }
394     else {
395         // If this were a duplicate, we'd have skipped it above.
396         wdb.add_document(newdocument);
397         cout << "added." << endl;
398     }
399 }
400
401 int
402 open_writeable_index(
403     char *dbpath
404 )
405 {
406     int
407         exitcode = 1;
408     string
409         header;
410     try {
411         if (!overwrite) {
412             wdb = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
413             if (!skip_duplicates) {
414                 // + 1 so that wdb.get_lastdocid() is a valid subscript.
415                 updated.resize(wdb.get_lastdocid() + 1);
416             }
417         }
418         else {
419             wdb = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
420         }
421
422         indexer.set_stemmer(stemmer);
423
424         // read header if it exists
425         header =
426             dbpath + string((const char *)SWISH_PATH_SEP) +
427             string((const char *)SWISH_HEADER_FILE);
428         if (swish_file_exists(BAD_CAST header.c_str())) {
429             swish_merge_config_with_header((char *)header.c_str(), s3->config);
430         }
431
432         wdb.flush();
433
434         // cout << "\n\nNow we have " << wdb.get_doccount() << " documents.\n";
435         exitcode = 0;
436     }
437     catch(const Xapian::Error & e
438     )
439     {
440         cout << "Exception: " << e.get_msg() << endl;
441     } catch(const string & s
442     )
443     {
444         cout << "Exception: " << s << endl;
445     } catch(const char *s
446     )
447     {
448         cout << "Exception: " << s << endl;
449     } catch(...) {
450         cout << "Caught unknown exception" << endl;
451     }
452
453     return exitcode;
454 }
455
456 int
457 open_readable_index(
458     char *dbpath
459 )
460 {
461     int
462         exitcode = 1;
463     string
464         header;
465     try {
466         rdb = Xapian::Database::Database(dbpath);
467
468         header =
469             dbpath + string((const char *)SWISH_PATH_SEP) +
470             string((const char *)SWISH_HEADER_FILE);
471         if (swish_file_exists(BAD_CAST header.c_str())) {
472             swish_merge_config_with_header((char *)header.c_str(), s3->config);
473         }
474
475         exitcode = 0;
476     } catch(const Xapian::Error & e
477     )
478     {
479         cout << "Exception: " << e.get_msg() << endl;
480     } catch(const string & s
481     )
482     {
483         cout << "Exception: " << s << endl;
484     } catch(const char *s
485     )
486     {
487         cout << "Exception: " << s << endl;
488     } catch(...) {
489         cout << "Caught unknown exception" << endl;
490     }
491
492     return exitcode;
493
494 }
495
496 int
497 search(
498     char *qstr
499 )
500 {
501     int
502         total_matches;
503     Xapian::Enquire * enquire;
504     Xapian::Query query;
505     Xapian::QueryParser qparser;
506     Xapian::MSet mset;
507     Xapian::MSetIterator iterator;
508     Xapian::Document doc;
509
510     total_matches = 0;
511     qparser.set_stemmer(stemmer);       // TODO make this configurable
512     qparser.set_database(rdb);
513
514     // map all human metanames to internal prefix
515     xmlHashScan(s3->config->metanames, (xmlHashScanner)add_prefix, &qparser);
516
517     // TODO boolean_prefix?
518
519     try {
520         query = qparser.parse_query(string(qstr));
521     }
522     catch(Xapian::QueryParserError & e) {
523         SWISH_CROAK("query parser error: %s", e.get_msg().c_str());
524     }
525
526     // this is very simplistic. swish-e does paging etc.
527     enquire = new Xapian::Enquire(rdb);
528     enquire->set_query(query);
529     mset = enquire->get_mset(0, 100);
530     printf("# %d estimated matches\n", mset.get_matches_estimated());
531     cout << "# " + query.get_description() << endl;
532     iterator = mset.begin();
533    
534     // output format is simple, not as flexible as swish-e.
535     // But hey. It's an example.
536     for (; iterator != mset.end(); ++iterator) {
537         doc = iterator.get_document();
538         printf("%3d0 %s \"%s\" %s\n", iterator.get_percent(),
539                doc.get_value(SWISH_PROP_DOCPATH_ID).c_str(),
540                doc.get_value(SWISH_PROP_TITLE_ID).c_str(),
541                doc.get_value(SWISH_PROP_SIZE_ID).c_str()
542             );
543         total_matches++;
544     }
545    
546     //printf("# %d total matches\n", total_matches);
547 }
548
/*
 * Print the command-line help to stdout and terminate the process with
 * exit status 0.  This function does not return to its caller.
 */
int
usage(
)
{
    // const: a string literal must not be bound to a plain char *
    const char *descr =
        "swish_xapian is an example program for using libswish3 with Xapian\n";
    printf("swish_xapian [opts] [- | file(s)]\n");
    printf("opts:\n --config conf_file.xml\n --query <query>\n --debug [lvl]\n --help\n");
    printf(" --index path/to/index\n --skip-duplicates\n --overwrite\n");
    printf("\n%s\n", descr);
    exit(0);
}
562
563 int
564 main(
565     int argc,
566     char **argv
567 )
568 {
569     int
570         i,
571         ch;
572     extern char *
573         optarg;
574     extern int
575         optind;
576     int
577         option_index;
578     int
579         files;
580     char *
581         etime;
582     char *
583         query;
584     char *
585         dbpath;
586     string
587         header;
588     double
589         start_time;
590     xmlChar *
591         config_file;
592
593     config_file = NULL;
594     option_index = 0;
595     files = 0;
596     query = NULL;
597     dbpath = NULL;
598     start_time = swish_time_elapsed();
599     s3 = swish_init_swish3(&handler, NULL);
600
601     while ((ch = getopt_long(argc, argv, "c:d:f:i:q:soh", longopts, &option_index)) != -1) {
602
603         switch (ch) {
604         case 0:                /* If this option set a flag, do nothing else now. */
605             if (longopts[option_index].flag != 0)
606                 break;
607             printf("option %s", longopts[option_index].name);
608             if (optarg)
609                 printf(" with arg %s", optarg);
610             printf("\n");
611             break;
612
613         case 'c':
614             //printf("optarg = %s\n", optarg);
615             config_file = swish_xstrdup(BAD_CAST optarg);
616             break;
617
618         case 'd':
619             printf("turning on debug mode: %s\n", optarg);
620
621             if (!isdigit(optarg[0]))
622                 err(1, "-d option requires a positive integer as argument\n");
623
624             SWISH_DEBUG = swish_string_to_int(optarg);
625             break;
626
627         case 'o':
628             overwrite = 1;
629             break;
630
631         case 'i':
632             dbpath = (char *)swish_xstrdup(BAD_CAST optarg);
633             break;
634
635         case 's':
636             skip_duplicates = swish_string_to_int(optarg);
637             break;
638
639         case 'q':
640             query = (char *)swish_xstrdup(BAD_CAST optarg);
641             break;
642
643         case '?':
644         case 'h':
645         default:
646             usage();
647
648         }
649
650     }
651
652     if (config_file != NULL) {
653         s3->config = swish_add_config(config_file, s3->config);
654     }
655
656     i = optind;
657
658     /*
659        die with no args
660      */
661     if ((!i || i >= argc) && !query) {
662         swish_free_swish3(s3);
663         usage();
664
665     }
666
667     if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
668         swish_debug_config(s3->config);
669     }
670
671     if (!dbpath) {
672         dbpath = (char *)swish_xstrdup(BAD_CAST SWISH_INDEX_FILENAME);
673     }
674
675     // indexing mode
676     if (!query) {
677
678         open_writeable_index(dbpath);
679
680         for (; i < argc; i++) {
681             if (argv[i][0] != '-') {
682                 //printf("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
683                 printf("parse_file for %s", argv[i]);
684                 if (!swish_parse_file(s3, (unsigned char *)argv[i]))
685                     files++;
686
687             }
688             else if (argv[i][0] == '-' && !argv[i][1]) {
689                 printf("reading from stdin\n");
690                 files = swish_parse_fh(s3, NULL);
691             }
692
693         }
694
695         printf("\n\n%d files indexed\n", files);
696         printf("total words: %d\n", twords);
697
698         // how do we know when to write a header file?
699         // it's legitimate to re-write if the config was defined
700         // but also if it is not (defaults).
701         // so we re-write every time we have a writeable db.
702         header =
703             dbpath + string((const char *)SWISH_PATH_SEP) +
704             string((const char *)SWISH_HEADER_FILE);
705         swish_write_header((char *)header.c_str(), s3->config);
706
707     }
708
709     // searching mode
710     else {
711         open_readable_index(dbpath);
712         search(query);
713         swish_xfree(BAD_CAST query);
714     }
715
716     etime = swish_print_time(swish_time_elapsed() - start_time);
717     printf("# %s total time\n\n", etime);
718     swish_xfree(etime);
719     swish_xfree(dbpath);
720     swish_free_swish3(s3);
721
722     if (config_file != NULL)
723         swish_xfree(config_file);
724
725     return (0);
726 }
Note: See TracBrowser for help on using the browser.