1
0
mirror of synced 2025-01-10 02:57:10 +03:00
doctrine2/lib/Doctrine/Search/Analyzer/Standard.php

299 lines
11 KiB
PHP
Raw Normal View History

2007-07-11 15:30:27 +04:00
<?php
/*
* $Id$
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This software consists of voluntary contributions made by many individuals
* and is licensed under the LGPL. For more information, see
* <http://www.phpdoctrine.org>.
2007-07-11 15:30:27 +04:00
*/
/**
* Doctrine_Search_Analyzer_Standard
*
* @package Doctrine
* @subpackage Search
* @author Konsta Vesterinen <kvesteri@cc.hut.fi>
2007-07-11 15:30:27 +04:00
* @license http://www.opensource.org/licenses/lgpl-license.php LGPL
* @version $Revision$
* @link www.phpdoctrine.org
2007-07-11 15:30:27 +04:00
* @since 1.0
*/
class Doctrine_Search_Analyzer_Standard implements Doctrine_Search_Analyzer_Interface
{
protected static $_stopwords = array(
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
'10',
'a',
'about',
'after',
'all',
'almost',
'along',
'also',
2007-10-17 01:49:55 +04:00
'although',
'amp',
'an',
'and',
'another',
'any',
'are',
'area',
2007-10-17 02:10:49 +04:00
'arent',
'around',
'as',
'at',
'available',
'back',
'be',
'because',
'been',
2007-10-17 01:49:55 +04:00
'before',
'being',
'best',
'better',
'big',
'bit',
'both',
'but',
'by',
'c',
'came',
'can',
'capable',
'control',
'could',
'course',
'd',
'dan',
'day',
'decided',
'did',
'didn',
'different',
'div',
'do',
'doesn',
'don',
'down',
'drive',
'e',
'each',
'easily',
'easy',
'edition',
2007-10-17 01:49:55 +04:00
'either',
'end',
'enough',
'even',
'every',
'example',
'few',
'find',
'first',
'for',
'found',
'from',
'get',
'go',
'going',
'good',
'got',
'gt',
'had',
'hard',
'has',
'have',
'he',
'her',
'here',
'how',
'i',
'if',
'in',
'into',
'is',
'isn',
'it',
'just',
'know',
'last',
'left',
'li',
'like',
'little',
'll',
'long',
'look',
'lot',
'lt',
'm',
'made',
'make',
'many',
'mb',
'me',
'menu',
'might',
'mm',
'more',
'most',
'much',
'my',
'name',
'nbsp',
'need',
'new',
'no',
'not',
'now',
'number',
'of',
'off',
'old',
'on',
'one',
'only',
'or',
'original',
'other',
'our',
'out',
'over',
'part',
'place',
'point',
'pretty',
'probably',
'problem',
'put',
'quite',
'quot',
'r',
're',
'really',
'results',
'right',
's',
'same',
'saw',
'see',
'set',
'several',
'she',
'sherree',
'should',
'since',
'size',
'small',
'so',
'some',
'something',
'special',
'still',
'stuff',
'such',
'sure',
'system',
't',
'take',
'than',
'that',
'the',
'their',
'them',
'then',
'there',
'these',
'they',
'thing',
'things',
'think',
'this',
'those',
'though',
'through',
'time',
'to',
'today',
'together',
'too',
'took',
'two',
'up',
'us',
'use',
'used',
'using',
've',
'very',
'want',
'was',
'way',
'we',
'well',
'went',
'were',
'what',
'when',
'where',
'which',
'while',
'white',
'who',
'will',
'with',
'would',
2007-10-17 02:10:49 +04:00
'yet',
'you',
'your',
2007-10-17 02:10:49 +04:00
'yours'
);
2007-07-11 15:30:27 +04:00
public function analyze($text)
{
2007-10-17 02:10:49 +04:00
$text = preg_replace('/[\'`<60>"]/', '', $text);
2007-10-17 01:49:55 +04:00
$text = preg_replace('/[^A-Za-z0-9]/', ' ', $text);
$text = str_replace(' ', ' ', $text);
$terms = explode(' ', $text);
2007-07-11 17:20:39 +04:00
$ret = array();
if ( ! empty($terms)) {
foreach ($terms as $i => $term) {
if (empty($term)) {
continue;
}
$lower = strtolower(trim($term));
2007-07-11 17:20:39 +04:00
if (in_array($lower, self::$_stopwords)) {
continue;
}
2007-07-12 02:03:47 +04:00
$ret[$i] = $lower;
}
}
return $ret;
2007-07-11 15:30:27 +04:00
}
2007-10-17 01:49:55 +04:00
}