1 |
<?php
|
2 |
/*
|
3 |
* $Id$
|
4 |
*
|
5 |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
6 |
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
7 |
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
8 |
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
9 |
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
10 |
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
11 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
12 |
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
13 |
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
14 |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
15 |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
16 |
*
|
17 |
* This software consists of voluntary contributions made by many individuals
|
18 |
* and is licensed under the LGPL. For more information, see
|
19 |
* <http://www.phpdoctrine.org>.
|
20 |
*/
|
21 |
|
22 |
/**
|
23 |
* Doctrine_Search_Analyzer_Standard
|
24 |
*
|
25 |
* @package Doctrine
|
26 |
* @subpackage Search
|
27 |
* @author Konsta Vesterinen <kvesteri@cc.hut.fi>
|
28 |
* @license http://www.opensource.org/licenses/lgpl-license.php LGPL
|
29 |
* @version $Revision$
|
30 |
* @link www.phpdoctrine.org
|
31 |
* @since 1.0
|
32 |
*/
|
33 |
class Doctrine_Search_Analyzer_Standard implements Doctrine_Search_Analyzer_Interface
{
    /**
     * Lower-case terms that analyze() drops from its result.
     *
     * Besides common English words, the list holds the numbers 0-10, single
     * letters and fragments such as 'amp', 'nbsp', 'quot', 'lt', 'gt', 'div'
     * and 'li'.
     *
     * @var array list of strings
     */
    protected static $_stopwords = array(
        '0',
        '1',
        '2',
        '3',
        '4',
        '5',
        '6',
        '7',
        '8',
        '9',
        '10',
        'a',
        'about',
        'after',
        'all',
        'almost',
        'along',
        'also',
        'although',
        'amp',
        'an',
        'and',
        'another',
        'any',
        'are',
        'area',
        'arent',
        'around',
        'as',
        'at',
        'available',
        'back',
        'be',
        'because',
        'been',
        'before',
        'being',
        'best',
        'better',
        'big',
        'bit',
        'both',
        'but',
        'by',
        'c',
        'came',
        'can',
        'capable',
        'control',
        'could',
        'course',
        'd',
        'dan',
        'day',
        'decided',
        'did',
        'didn',
        'different',
        'div',
        'do',
        'doesn',
        'don',
        'down',
        'drive',
        'e',
        'each',
        'easily',
        'easy',
        'edition',
        'either',
        'end',
        'enough',
        'even',
        'every',
        'example',
        'few',
        'find',
        'first',
        'for',
        'found',
        'from',
        'get',
        'go',
        'going',
        'good',
        'got',
        'gt',
        'had',
        'hard',
        'has',
        'have',
        'he',
        'her',
        'here',
        'how',
        'i',
        'if',
        'in',
        'into',
        'is',
        'isn',
        'it',
        'just',
        'know',
        'last',
        'left',
        'li',
        'like',
        'little',
        'll',
        'long',
        'look',
        'lot',
        'lt',
        'm',
        'made',
        'make',
        'many',
        'mb',
        'me',
        'menu',
        'might',
        'mm',
        'more',
        'most',
        'much',
        'my',
        'name',
        'nbsp',
        'need',
        'new',
        'no',
        'not',
        'now',
        'number',
        'of',
        'off',
        'old',
        'on',
        'one',
        'only',
        'or',
        'original',
        'other',
        'our',
        'out',
        'over',
        'part',
        'place',
        'point',
        'pretty',
        'probably',
        'problem',
        'put',
        'quite',
        'quot',
        'r',
        're',
        'really',
        'results',
        'right',
        's',
        'same',
        'saw',
        'see',
        'set',
        'several',
        'she',
        'sherree',
        'should',
        'since',
        'size',
        'small',
        'so',
        'some',
        'something',
        'special',
        'still',
        'stuff',
        'such',
        'sure',
        'system',
        't',
        'take',
        'than',
        'that',
        'the',
        'their',
        'them',
        'then',
        'there',
        'these',
        'they',
        'thing',
        'things',
        'think',
        'this',
        'those',
        'though',
        'through',
        'time',
        'to',
        'today',
        'together',
        'too',
        'took',
        'two',
        'up',
        'us',
        'use',
        'used',
        'using',
        've',
        'very',
        'want',
        'was',
        'way',
        'we',
        'well',
        'went',
        'were',
        'what',
        'when',
        'where',
        'which',
        'while',
        'white',
        'who',
        'will',
        'with',
        'would',
        'yet',
        'you',
        'your',
        'yours'
    );

    /**
     * Splits a text into lower-cased terms and drops the stop words.
     *
     * Quote-like characters are removed outright, so "don't" becomes the
     * single term "dont". Every other character outside A-Z, a-z and 0-9 is
     * treated as a separator.
     *
     * @param string $text the text to analyze
     * @return array lower-cased terms, keyed by the index each token had in
     *               the space-separated text; keys are therefore not
     *               contiguous where empty tokens or stop words were skipped
     */
    public function analyze($text)
    {
        // Strip quote characters first so contractions stay one token.
        $text = preg_replace('/[\'`´"]/', '', $text);
        // Anything that is not an ASCII letter or digit becomes a separator.
        $text = preg_replace('/[^A-Za-z0-9]/', ' ', $text);
        // NOTE(review): as written this swaps a single space for a single
        // space, i.e. it does nothing. It may have been meant to collapse
        // double spaces - confirm before changing, because that would shift
        // the keys of the returned array.
        $text = str_replace(' ', ' ', $text);

        $ret = array();

        foreach (explode(' ', $text) as $i => $term) {
            // Consecutive separators yield empty tokens; skip them but keep
            // counting, so the keys still reflect the token index.
            if ($term === '') {
                continue;
            }

            // Tokens consist solely of [A-Za-z0-9] here, so no trimming is
            // needed before lower-casing.
            $lower = strtolower($term);

            // Strict comparison is required: the list contains the numeric
            // strings '0'..'10', and a loose in_array() compares numeric
            // strings by value, which would wrongly discard terms such as
            // '007' (== '7') or '1e1' (== '10').
            if (in_array($lower, self::$_stopwords, true)) {
                continue;
            }

            $ret[$i] = $lower;
        }

        return $ret;
    }
}
|