Changeset 1487

Show
Ignore:
Timestamp:
07/25/04 23:29:20 (4 years ago)
Author:
karman
Message:

Added IDF word weighting in getrank() so that words that appear less often are weighted more heavily.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/swish-e/pod/CHANGES.pod

    r1484 r1487  
    1010 
    1111=over 4 
     12 
     13=item Added IDF word weighting in getrank() 
     14 
     15Added an Inverse Document Frequency (IDF) calculation to the getrank() routine. 
     16This allows the frequency of a word, relative to the other 
     17words in the query, to affect the ranking of documents. 
     18 
     19Example: if 'foo' is present twice as often as 'bar' in the collection as a whole, 
     20a search for 'foo bar' will weight documents with 'bar' more heavily (i.e., higher 
     21rank) than those with 'foo'.  
     22 
     23The impact is greatest when OR'ing words in a query rather than 
     24AND'ing them (AND'ing is the default). 
     25 
    1226 
    1327=item Updates to the example scripts 
  • trunk/swish-e/src/rank.c

    r1296 r1487  
    289289    int         metaID; 
    290290    int         freq; 
     291    int         total_files; 
     292    int         idf; 
     293    int         tfreq; 
     294    int         mystruct; 
     295     
    291296#ifdef DEBUG_RANK 
    292297    int        struct_tally[256]; 
     
    294299        struct_tally[i] = 0; 
    295300#endif 
     301 
     302 
     303     
    296304 
    297305    /* has rank already been calculated? */ 
     
    303311    sw      = indexf->sw; 
    304312    posdata = r->posdata; 
    305  
     313     
    306314 
    307315    /* Get bias for the current metaID - metaID is stored in the rank for ease here */ 
     
    328336    /*  this word.  If the word is not found in many files then it should be ranked higher */ 
    329337 
     338 
    330339    rank = 1; 
    331340    freq = r->frequency; 
     
    336345    { 
    337346        /* GET_STRUCTURE must return value in range! */ 
    338         rank += sw->structure_map[ GET_STRUCTURE(posdata[i]) ] + meta_bias; 
     347 
     348        rank += sw->structure_map[ GET_STRUCTURE(posdata[i]) ] + meta_bias ; 
     349         
    339350#ifdef DEBUG_RANK 
    340351        // fprintf(stderr, "Word entry %d at position %d has struct %d\n", i,  GET_POSITION(posdata[i]),  GET_STRUCTURE(posdata[i]) ); 
     
    355366    if ( rank < 1 ) 
    356367        rank = 1; 
     368 
     369 
     370    /* weight rank by word's idf */ 
     371 
     372    /*  
     373    IDF is the Inverse Document Frequency, or, the weight of the word in relationship to the  
     374    collection of documents as a whole. 
     375    Multiply the weight against the rank to give greater weight to words that appear less often 
     376    in the collection. 
     377     
     378    The biggest impact should be seen when OR'ing words together instead of AND'ing them. 
     379     
     380    karman - Sun Jul 25 22:18:15 CDT 2004 
     381    */ 
     382     
     383    total_files = indexf->header.totalfiles; 
     384    tfreq       = r->tfrequency; 
     385    idf         = (int) log( total_files / tfreq ); 
     386 
     387#ifdef DEBUG_RANK 
     388        fprintf(stderr, "Total files: %d   Total word freq: %d   IDF: %d\n", total_files, tfreq, idf ); 
     389        fprintf(stderr, "Rank before IDF weighting: %d\n", rank ); 
     390#endif 
     391        rank = rank * idf; 
     392         
     393#ifdef DEBUG_RANK 
     394        fprintf(stderr, "Rank after IDF weighting: %d\n", rank ); 
     395#endif 
    357396 
    358397 
     
    385424 
    386425 
    387  
    388426    /* Return if IgnoreTotalWordCountWhenRanking is true (the default) */ 
    389427    if ( indexf->header.ignoreTotalWordCountWhenRanking )