| 219 | | DB_ReadWordData(sw_input, wordID->wordID, &worddata, &sz_worddata, &saved_bytes, cur_index->DB); |
|---|
| 220 | | uncompress_worddata(&worddata,&sz_worddata,saved_bytes); |
|---|
| 221 | | |
|---|
| 222 | | /* Now, parse word's data */ |
|---|
| 223 | | s = worddata; |
|---|
| 224 | | tmpval = uncompress2(&s); /* tfrequency */ |
|---|
| 225 | | metaID = uncompress2(&s); /* metaID */ |
|---|
| 226 | | |
|---|
| 227 | | if (metaID) |
|---|
| | 219 | for(tmp = wordID; tmp ; tmp = tmp->next) |
|---|
| 229 | | metadata_length = uncompress2(&s); |
|---|
| | 221 | |
|---|
| | 222 | DB_ReadWordData(sw_input, tmp->wordID, &worddata, &sz_worddata, &saved_bytes, cur_index->DB); |
|---|
| | 223 | uncompress_worddata(&worddata,&sz_worddata,saved_bytes); |
|---|
| | 224 | |
|---|
| | 225 | /* Now, parse word's data */ |
|---|
| | 226 | s = worddata; |
|---|
| | 227 | tmpval = uncompress2(&s); /* tfrequency */ |
|---|
| | 228 | metaID = uncompress2(&s); /* metaID */ |
|---|
| | 229 | |
|---|
| | 230 | if (metaID) |
|---|
| | 231 | { |
|---|
| | 232 | metadata_length = uncompress2(&s); |
|---|
| | 233 | } |
|---|
| | 234 | |
|---|
| | 235 | filenum = 0; |
|---|
| | 236 | start = s; |
|---|
| | 237 | |
|---|
| | 238 | while(1) |
|---|
| | 239 | { /* Read on all items */ |
|---|
| | 240 | uncompress_location_values(&s,&flag,&tmpval,&frequency); |
|---|
| | 241 | filenum += tmpval; |
|---|
| | 242 | /* Use stack array when possible to avoid malloc/free overhead */ |
|---|
| | 243 | if(frequency > MAX_STACK_POSITIONS) |
|---|
| | 244 | posdata = (unsigned int *) emalloc(frequency * sizeof(int)); |
|---|
| | 245 | else |
|---|
| | 246 | posdata = local_posdata; |
|---|
| | 247 | |
|---|
| | 248 | /* Read the positions */ |
|---|
| | 249 | uncompress_location_positions(&s,flag,frequency,posdata); |
|---|
| | 250 | |
|---|
| | 251 | |
|---|
| | 252 | /* now we have the word data */ |
|---|
| | 253 | for (i = 0; i < frequency; i++, loc_count++) |
|---|
| | 254 | write_word_pos( cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]); |
|---|
| | 255 | |
|---|
| | 256 | if(e->tfrequency) |
|---|
| | 257 | { |
|---|
| | 258 | /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time |
|---|
| | 259 | ** to time to help addentry. |
|---|
| | 260 | ** If we do not do this, addentry routine will have to run linked lists |
|---|
| | 261 | ** of positions with thousands of elements and makes the merge process |
|---|
| | 262 | ** very slow |
|---|
| | 263 | */ |
|---|
| | 264 | if(!(loc_count % 100)) |
|---|
| | 265 | CompressCurrentLocEntry(sw_output, e); |
|---|
| | 266 | } |
|---|
| | 267 | |
|---|
| | 268 | |
|---|
| | 269 | if(posdata != local_posdata) |
|---|
| | 270 | efree(posdata); |
|---|
| | 271 | |
|---|
| | 272 | /* Check for end of worddata */ |
|---|
| | 273 | if ((s - worddata) == sz_worddata) |
|---|
| | 274 | break; /* End of worddata */ |
|---|
| | 275 | |
|---|
| | 276 | /* Check for end of current metaID data */ |
|---|
| | 277 | if ( metadata_length == (s - start)) |
|---|
| | 278 | { |
|---|
| | 279 | filenum = 0; |
|---|
| | 280 | metaID = uncompress2(&s); |
|---|
| | 281 | metadata_length = uncompress2(&s); |
|---|
| | 282 | start = s; |
|---|
| | 283 | } |
|---|
| | 284 | } |
|---|
| | 285 | |
|---|
| | 286 | if(e->tfrequency) |
|---|
| | 287 | CompressCurrentLocEntry(sw_output, e); |
|---|
| | 288 | |
|---|
| | 289 | efree(worddata); |
|---|
| 231 | | |
|---|
| 232 | | filenum = 0; |
|---|
| 233 | | start = s; |
|---|
| 234 | | |
|---|
| 235 | | while(1) |
|---|
| 236 | | { /* Read on all items */ |
|---|
| 237 | | uncompress_location_values(&s,&flag,&tmpval,&frequency); |
|---|
| 238 | | filenum += tmpval; |
|---|
| 239 | | /* Use stack array when possible to avoid malloc/free overhead */ |
|---|
| 240 | | if(frequency > MAX_STACK_POSITIONS) |
|---|
| 241 | | posdata = (unsigned int *) emalloc(frequency * sizeof(int)); |
|---|
| 242 | | else |
|---|
| 243 | | posdata = local_posdata; |
|---|
| 244 | | |
|---|
| 245 | | /* Read the positions */ |
|---|
| 246 | | uncompress_location_positions(&s,flag,frequency,posdata); |
|---|
| 247 | | |
|---|
| 248 | | |
|---|
| 249 | | /* now we have the word data */ |
|---|
| 250 | | for (i = 0; i < frequency; i++, loc_count++) |
|---|
| 251 | | write_word_pos( cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]); |
|---|
| 252 | | |
|---|
| 253 | | if(e->tfrequency) |
|---|
| 254 | | { |
|---|
| 255 | | /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time |
|---|
| 256 | | ** to time to help addentry. |
|---|
| 257 | | ** If we do not do this, addentry routine will have to run linked lists |
|---|
| 258 | | ** of positions with thousands of elements and makes the merge process |
|---|
| 259 | | ** very slow |
|---|
| 260 | | */ |
|---|
| 261 | | if(!(loc_count % 100)) |
|---|
| 262 | | CompressCurrentLocEntry(sw_output, e); |
|---|
| 263 | | } |
|---|
| 264 | | |
|---|
| 265 | | |
|---|
| 266 | | if(posdata != local_posdata) |
|---|
| 267 | | efree(posdata); |
|---|
| 268 | | |
|---|
| 269 | | /* Check for end of worddata */ |
|---|
| 270 | | if ((s - worddata) == sz_worddata) |
|---|
| 271 | | break; /* End of worddata */ |
|---|
| 272 | | |
|---|
| 273 | | /* Check for end of current metaID data */ |
|---|
| 274 | | if ( metadata_length == (s - start)) |
|---|
| 275 | | { |
|---|
| 276 | | filenum = 0; |
|---|
| 277 | | metaID = uncompress2(&s); |
|---|
| 278 | | metadata_length = uncompress2(&s); |
|---|
| 279 | | start = s; |
|---|
| 280 | | } |
|---|
| 281 | | } |
|---|
| 282 | | |
|---|
| 283 | | if(e->tfrequency) |
|---|
| 284 | | CompressCurrentLocEntry(sw_output, e); |
|---|
| 285 | | |
|---|
| 286 | | efree(worddata); |
|---|
| 1000 | | for(j=0;j<256;j++) |
|---|
| 1001 | | { |
|---|
| 1002 | | |
|---|
| 1003 | | word[0] = (unsigned char) j; word[1] = '\0'; |
|---|
| 1004 | | DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
|---|
| 1005 | | |
|---|
| 1006 | | while(wordID) |
|---|
| | 1003 | word[0] = '\0'; |
|---|
| | 1004 | previousword = estrdup(word); |
|---|
| | 1005 | DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
|---|
| | 1006 | |
|---|
| | 1007 | while(wordID) |
|---|
| | 1008 | { |
|---|
| | 1009 | /* Add resultword to output if a new word is found */ |
|---|
| | 1010 | /* The word index can contain duplicates */ |
|---|
| | 1011 | if(strcmp(previousword,resultword) != 0) |
|---|
| 1011 | | DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
|---|
| 1012 | | word_count++; |
|---|
| 1013 | | if(!(word_count % 10000)) /* progress report once every 10000 words; parenthesized because unary ! binds tighter than % */ |
|---|
| 1014 | | printf("Getting words in index '%s': %3d words\r", indexf->line, word_count); |
|---|
| 1015 | | } |
|---|
| | 1019 | DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB); |
|---|
| | 1020 | word_count++; |
|---|
| | 1021 | if(!(word_count % 10000)) /* progress report once every 10000 words; parenthesized because unary ! binds tighter than % */ |
|---|
| | 1022 | printf("Getting words in index '%s': %3d words\r", indexf->line, word_count); |
|---|