root/swish-e/branches/2.6/src/check.c

Revision 1736, 5.2 kB (checked in by karman, 4 years ago)

changed license header to refer to COPYING for exception details

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 /*
2 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
3 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
4 **
5 *
6
7     This file is part of Swish-e.
8
9     Swish-e is free software; you can redistribute it and/or modify
10     it under the terms of the GNU General Public License as published by
11     the Free Software Foundation; either version 2 of the License, or
12     (at your option) any later version.
13
14     Swish-e is distributed in the hope that it will be useful,
15     but WITHOUT ANY WARRANTY; without even the implied warranty of
16     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17     GNU General Public License for more details.
18
19     You should have received a copy of the GNU General Public License
20     along  with Swish-e; if not, write to the Free Software
21     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
22     
23     See the COPYING file that accompanies the Swish-e distribution for details
24     of the GNU GPL and the special exception available for linking against
25     the Swish-e library.
26     
27 ** Mon May  9 15:51:39 CDT 2005
28 ** added GPL
29 ** if it was re-written by rasc, is it still copyright HP?
30
31
32 **
33 ** fixed non-int subscripting pointed out by "gcc -Wall"
34 ** SRE 2/22/00
35 **
36 ** 2001-03-08 rasc   rewritten and enhanced suffix routines
37 **
38 */
39
40 #include "swish.h"
41 #include "check.h"
42 #include "hash.h"
43 #include "swstring.h"
44 #include "mem.h"
45
46 /* Check if a file with a particular suffix should be indexed
47 ** according to the settings in the configuration file.
48 */
49
50 /* Should a word be indexed? Consults the stopword hash list
51 ** and checks if the word is of a reasonable length...
52 ** If you have any good rules that can work with most languages,
53 ** please let me know...
54 */
55
56 int     isokword(sw, word, indexf)
57      SWISH  *sw;
58      char   *word;
59      IndexFILE *indexf;
60 {
61     int     i,
62             same,
63             hasnumber,
64             hasvowel,
65             hascons,
66             numberrow,
67             vowelrow,
68             consrow,
69             wordlen;
70     char    lastchar;
71
72     if (word[0] == '\0')
73         return 0;
74
75     if ( is_word_in_hash_table( indexf->header.hashstoplist, word ) )
76         return 0;
77
78     wordlen = strlen(word);
79     if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit))
80         return 0;
81
82     lastchar = '\0';
83     same = 0;
84     hasnumber = hasvowel = hascons = 0;
85     numberrow = vowelrow = consrow = 0;
86
87     for (i = 0; word[i] != '\0'; i++)
88     {
89         /* Max number of times a char can repeat in a word */
90         if (word[i] == lastchar)
91         {
92             same++;
93             if (same > IGNORESAME)
94                 return 0;
95         }
96         else
97             same = 0;
98
99         /* Max number of consecutive digits */
100         if (isdigit((int) ( (unsigned char) word[i])))
101         {
102             hasnumber = 1;
103             numberrow++;
104             if (numberrow > IGNOREROWN)
105                 return 0;
106             vowelrow = 0;
107             consrow = 0;
108         }
109
110         /* maximum number of consecutive vowels a word can have */
111         else if (isvowel(sw, word[i]))
112         {
113             hasvowel = 1;
114             vowelrow++;
115             if (vowelrow > IGNOREROWV)
116                 return 0;
117             numberrow = 0;
118             consrow = 0;
119         }
120
121         /* maximum number of consecutive consonants a word can have */
122         else if (!ispunct((int) ( (unsigned char) word[i])))
123         {
124             hascons = 1;
125             consrow++;
126             if (consrow > IGNOREROWC)
127                 return 0;
128             numberrow = 0;
129             vowelrow = 0;
130         }
131         lastchar = word[i];
132     }
133
134     /* If IGNOREALLV is 1, words containing all vowels won't be indexed. */
135     if (IGNOREALLV)
136         if (hasvowel && !hascons)
137             return 0;
138
139     /* If IGNOREALLC is 1, words containing all consonants won't be indexed */
140     if (IGNOREALLC)
141         if (hascons && !hasvowel)
142             return 0;
143
144     /* If IGNOREALLN is 1, words containing all digits won't be indexed */
145     if (IGNOREALLN)
146         if (hasnumber && !hasvowel && !hascons)
147             return 0;
148
149     return 1;
150 }
151
152
153 /*
154   -- Determine document type by checking the file extension
155   -- of the filename
156   -- Return: doctype
157   -- 2001-03-08 rasc   rewritten (optimize and match also
158   --                   e.g. ".htm", ".htm.de" or ".html.gz")
159 */
160
161 int     getdoctype(char *filename, struct IndexContents *indexcontents)
162 {
163     struct swline *swl;
164     char   *s,
165            *fe;
166
167
168     if (!indexcontents)
169         return NODOCTYPE;
170
171     /* basically do a right to left compare */
172     fe = (filename + strlen(filename));
173     while (indexcontents)
174     {
175         swl = indexcontents->patt;
176
177         while (swl)
178         {
179             s = fe - strlen(swl->line);
180             if (s >= filename)
181             {                   /* no negative overflow! */
182                 if (!strcasecmp(swl->line, s))
183                 {
184                     return indexcontents->DocType;;
185                 }
186             }
187             swl = swl->next;
188         }
189
190         indexcontents = indexcontents->next;
191     }
192
193     return NODOCTYPE;
194 }
195
196
197
198
199
200 struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd)
201 {
202     while (sd)
203     {
204         if (sd->DocType == doctype)
205             return sd;
206         sd = sd->next;
207     }
208     return NULL;
209 }
Note: See TracBrowser for help on using the browser.