Changeset 2107
- Timestamp:
- 03/31/08 14:39:58 (2 months ago)
- Files:
-
- swish-e/branches/2.6/src/compress.c (modified) (1 diff)
- swish-e/branches/2.6/src/compress.h (modified) (1 diff)
- swish-e/branches/2.6/src/db_write.c (modified) (1 diff)
- swish-e/branches/2.6/src/dump.c (modified) (5 diffs)
- swish-e/branches/2.6/src/merge.c (modified) (3 diffs)
- swish-e/branches/2.6/src/search.c (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
swish-e/branches/2.6/src/compress.c
/* compress.c diff, r2079 -> r2107 (hunk starts at line 936) */

/* 2002/09 jmruiz
** This routine replaces the longs in worddata with shorter compressed
** numbers.
**
** There are two reasons for using compressed numbers in worddata
** instead of longs:
**  - Compressed numbers are more portable: longs are usually 4 bytes
**    long on a 32 bit machine but on a 64 bit alpha they are 8 bytes
**    long (this is a waste of space).
**  - The obvious one is that compressed numbers use less disk space.
**
** BTW, any change in worddata will also affect dump.c, merge.c and search.c
** (getfileinfo routine).
**
** worddata has the following format before entering the routine:
**   <tfreq><metaID><nextposmetaID><data><metaID><nextposmetaID><data>...
**
** Entering this routine, nextposmetaID is the offset to the next metaID
** in bytes, counted from the beginning of worddata.
** It is a packed long number (sizeof(long) bytes).
**
** Exiting this routine, nextposmetaID has been replaced by the size of
** the data block, stored as a compressed number.
**
** In other words, worddata has the following format on exit:
**   <tfreq><metaID><data_len><data><metaID><data_len><data>...
**
** Parameters:
**   worddata    - buffer to rewrite in place
**   sz_worddata - in: size of worddata in bytes; out: new (smaller) size
**
** Every stored offset is validated against the buffer bounds before it is
** used, so a corrupt offset is reported through progerr() instead of
** wrapping data_len around and moving memory outside the buffer.
** NOTE(review): progerr() is assumed not to return - confirm.
*/
void remove_worddata_longs(unsigned char *worddata,int *sz_worddata)
{
    unsigned char *src, *dst;       /* read and write cursors into worddata */
    unsigned int metaID, data_len;
    unsigned long nextposmetaID;
    unsigned long src_off;          /* offset of src from worddata */
    unsigned long total;            /* size of worddata on entry */

    /* Nothing to rewrite in an empty buffer */
    if (worddata == NULL || sz_worddata == NULL || *sz_worddata <= 0)
        return;

    total = (unsigned long)*sz_worddata;
    src = worddata;

    /* Jump over tfrequency and the first metaID; both stay where they are */
    (void)uncompress2(&src);    /* tfrequency */
    (void)uncompress2(&src);    /* metaID */
    dst = src;

    for (;;)
    {
        src_off = (unsigned long)(src - worddata);

        /* The packed offset itself must lie completely inside the buffer */
        if (src_off > total || total - src_off < sizeof(long))
            progerr("Internal error in remove_worddata_longs");

        /* Get offset to next one */
        nextposmetaID = UNPACKLONG2(src);
        src += sizeof(long);
        src_off += sizeof(long);

        /* The offset must point at or after the start of this data block */
        /* and must not run past the end of worddata */
        if (nextposmetaID < src_off || nextposmetaID > total)
            progerr("Internal error in remove_worddata_longs");

        /* Compute data length for this metaID */
        data_len = (unsigned int)(nextposmetaID - src_off);

        /* Store data_len as a compressed number */
        dst = compress3(data_len, dst);

        /* This must not happen. Anyway check it */
        if (dst > src)
            progerr("Internal error in remove_worddata_longs");

        /* dst may be smaller than src, so the regions can overlap: */
        /* memmove is required here, memcpy would be undefined behavior */
        memmove(dst, src, data_len);

        /* Increase pointers */
        src += data_len;
        dst += data_len;

        /* Check if we are at the end of the buffer */
        if ((unsigned long)(src - worddata) >= total)
            break;      /* End of worddata */

        /* Get next metaID */
        metaID = uncompress2(&src);
        dst = compress3(metaID, dst);
    }

    /* Adjust to new size */
    *sz_worddata = (int)(dst - worddata);
}

/* Next diff section: swish-e/branches/2.6/src/compress.h */
r1736 r2107 63 63 int compress_worddata(unsigned char *, int, int ); 64 64 void uncompress_worddata(unsigned char **,int *, int); 65 void remove_worddata_longs(unsigned char *,int *);66 65 67 66 /* Here is the worst case size for a compressed number swish-e/branches/2.6/src/db_write.c
r1944 r2107 306 306 /* 04/2002 jmruiz 307 307 ** New simpler routine to write worddata 308 **309 ** 10/2002 jmruiz310 ** Add extra compression for worddata. Call to remove_worddata_longs311 308 */ 312 309 void write_worddata(SWISH * sw, ENTRY * ep, IndexFILE * indexf ) 313 310 { 314 311 int zlib_size; 315 316 /* Get some extra compression */317 remove_worddata_longs(sw->Index->worddata_buffer,&sw->Index->sz_worddata_buffer);318 312 319 313 if(sw->compressPositions) swish-e/branches/2.6/src/dump.c
r1944 r2107 190 190 unsigned int *posdata; 191 191 int metadata_length; 192 long r_nextposmeta; 192 193 char word[2]; 193 194 char *resultword; … … 248 249 tmpval = uncompress2(&s); /* tfrequency */ 249 250 metaID = uncompress2(&s); /* metaID */ 250 metadata_length = uncompress2(&s); 251 252 r_nextposmeta = UNPACKLONG2(s); 253 s += sizeof(long); 254 metadata_length = (int)r_nextposmeta - (s - worddata); 251 255 252 256 filenum = 0; … … 274 278 filenum = 0; 275 279 metaID = uncompress2(&s); 276 metadata_length = uncompress2(&s); 280 281 r_nextposmeta = UNPACKLONG2(s); 282 s += sizeof(long); 283 metadata_length = (int)r_nextposmeta - (s - worddata); 284 277 285 start = s; 278 286 } … … 331 339 tmpval = uncompress2(&s); /* tfrequency */ 332 340 metaID = uncompress2(&s); /* metaID */ 333 metadata_length = uncompress2(&s); 341 342 r_nextposmeta = UNPACKLONG2(s); 343 s += sizeof(long); 344 metadata_length = (int)r_nextposmeta - (s - worddata); 345 334 346 335 347 filenum = 0; … … 428 440 filenum = 0; 429 441 metaID = uncompress2(&s); 430 metadata_length = uncompress2(&s); 442 443 r_nextposmeta = UNPACKLONG2(s); 444 s += sizeof(long); 445 metadata_length = (int)r_nextposmeta - (s - worddata); 446 431 447 start = s; 432 448 } swish-e/branches/2.6/src/merge.c
r1976 r2107 93 93 DB_WORDID *wordID, *tmp; 94 94 int metadata_length = 0; 95 long r_nextposmeta; 95 96 unsigned char *worddata; 96 97 unsigned char *s, *start; … … 230 231 if (metaID) 231 232 { 232 metadata_length = uncompress2(&s); 233 r_nextposmeta = UNPACKLONG2(s); 234 s += sizeof(long); 235 metadata_length = (int)r_nextposmeta - (s - worddata); 233 236 } 234 237 … … 279 282 filenum = 0; 280 283 metaID = uncompress2(&s); 281 metadata_length = uncompress2(&s); 284 285 r_nextposmeta = UNPACKLONG2(s); 286 s += sizeof(long); 287 metadata_length = (int)r_nextposmeta - (s - worddata); 288 282 289 start = s; 283 290 } swish-e/branches/2.6/src/search.c
r1944 r2107 1509 1509 DB_WORDID *wordID, *tmp; 1510 1510 int metadata_length; 1511 long r_nextposmeta; 1511 1512 char *p; 1512 1513 int tfrequency = 0; … … 1728 1729 while (curmetaID) 1729 1730 { 1730 metadata_length = uncompress2(&s); 1731 r_nextposmeta = UNPACKLONG2(s); 1732 s += sizeof(long); 1733 1734 metadata_length = (int)r_nextposmeta - (s - buffer); 1731 1735 1732 1736 if (curmetaID >= metaID)
