8398c9048d
code was modified slightly, so the code differs from the original downloadable 1.9.5 version
347 lines
15 KiB
PHP
347 lines
15 KiB
PHP
<?php
|
||
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage UnitTests
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id: AnalysisTest.php 17363 2009-08-03 07:40:18Z bkarwin $
 */
|
||
|
||
/**
 * Zend_Search_Lucene
 */
|
||
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
|
||
|
||
/**
 * PHPUnit test case
 */
|
||
require_once 'PHPUnit/Framework/TestCase.php';
|
||
|
||
/**
 * Tests for the Zend_Search_Lucene analyzers and token filters.
 *
 * Each test tokenizes a short sample string and checks the resulting tokens'
 * term text, start/end offsets and position increment.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage UnitTests
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @group      Zend_Search_Lucene
 */
class Zend_Search_Lucene_AnalysisTest extends PHPUnit_Framework_TestCase
{
    /**
     * Asserts that a token list matches the expected tokens, in order.
     *
     * The token count is checked first; then every token is compared field by
     * field. Failure messages name the token index and the field that differs.
     *
     * @param  mixed $tokenList      Result of an analyzer's tokenize() call; it must be
     *                               countable and indexable from 0
     * @param  array $expectedTokens List of array(termText, startOffset, endOffset, positionIncrement)
     * @return void
     */
    private function _assertTokens($tokenList, array $expectedTokens)
    {
        $this->assertEquals(count($expectedTokens), count($tokenList), 'Unexpected number of tokens');

        foreach ($expectedTokens as $index => $expected) {
            list($termText, $startOffset, $endOffset, $positionIncrement) = $expected;
            $token = $tokenList[$index];

            $this->assertEquals($termText,          $token->getTermText(),          "Token #{$index}: term text");
            $this->assertEquals($startOffset,       $token->getStartOffset(),       "Token #{$index}: start offset");
            $this->assertEquals($endOffset,         $token->getEndOffset(),         "Token #{$index}: end offset");
            $this->assertEquals($positionIncrement, $token->getPositionIncrement(), "Token #{$index}: position increment");
        }
    }

    /**
     * Marks the current test as skipped when PCRE cannot match Unicode properties.
     *
     * @return void
     */
    private function _skipUnlessPcreUnicodeSupported()
    {
        // "@" silences the warning preg_match() raises when the /u pattern
        // cannot be compiled; any result other than 1 means "no support".
        if (@preg_match('/\pL/u', 'a') !== 1) {
            $this->markTestSkipped('PCRE unicode support is turned off');
        }
    }

    /**
     * Marks the current test as skipped when mb_strtolower() is unavailable.
     *
     * @return void
     */
    private function _skipUnlessMbstringAvailable()
    {
        if (!function_exists('mb_strtolower')) {
            $this->markTestSkipped('mbstring extension is disabled');
        }
    }

    /**
     * The default analyzer can be read, replaced and read back.
     *
     * @return void
     */
    public function testAnalyzer()
    {
        $currentAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $this->assertTrue($currentAnalyzer instanceof Zend_Search_Lucene_Analysis_Analyzer);

        $newAnalyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($newAnalyzer);
        $installedAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

        // Restore the original default (used in other tests) BEFORE asserting,
        // so a failing assertion cannot leave the replacement installed.
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);

        $this->assertSame($newAnalyzer, $installedAnalyzer);
    }

    /**
     * Common_Text: the expected tokens contain letters only, original case kept.
     *
     * @return void
     */
    public function testText()
    {
        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens($tokenList, array(
            array('Word',        0,  4,  1),
            array('Word',        6,  10, 1),
            array('anotherWord', 12, 23, 1),
        ));
    }

    /**
     * Common_Text_CaseInsensitive: letters only, lower-cased.
     *
     * @return void
     */
    public function testTextCaseInsensitive()
    {
        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens($tokenList, array(
            array('word',        0,  4,  1),
            array('word',        6,  10, 1),
            array('anotherword', 12, 23, 1),
        ));
    }

    /**
     * Common_TextNum: letters and digits, original case kept.
     *
     * @return void
     */
    public function testTextNum()
    {
        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens($tokenList, array(
            array('Word1',       0,  5,  1),
            array('Word2',       6,  11, 1),
            array('anotherWord', 12, 23, 1),
        ));
    }

    /**
     * Common_TextNum_CaseInsensitive: letters and digits, lower-cased.
     *
     * @return void
     */
    public function testTextNumCaseInsensitive()
    {
        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens($tokenList, array(
            array('word1',       0,  5,  1),
            array('word2',       6,  11, 1),
            array('anotherword', 12, 23, 1),
        ));
    }

    /**
     * Common_Utf8: Cyrillic input, letters only, original case kept.
     *
     * @return void
     */
    public function testUtf8()
    {
        $this->_skipUnlessPcreUnicodeSupported();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens($tokenList, array(
            array('Слово',       0,  5,  1),
            array('Слово',       7,  12, 1),
            array('ДругоеСлово', 14, 25, 1),
        ));
    }

    /**
     * Common_Utf8Num: Cyrillic input, letters and digits, original case kept.
     *
     * @return void
     */
    public function testUtf8Num()
    {
        $this->_skipUnlessPcreUnicodeSupported();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens($tokenList, array(
            array('Слово1',      0,  6,  1),
            array('Слово2',      7,  13, 1),
            array('ДругоеСлово', 14, 25, 1),
        ));
    }

    /**
     * Common_Utf8_CaseInsensitive: Cyrillic input, letters only, lower-cased.
     *
     * @return void
     */
    public function testUtf8CaseInsensitive()
    {
        $this->_skipUnlessPcreUnicodeSupported();
        $this->_skipUnlessMbstringAvailable();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens($tokenList, array(
            array('слово',       0,  5,  1),
            array('слово',       7,  12, 1),
            array('другоеслово', 14, 25, 1),
        ));
    }

    /**
     * Common_Utf8Num_CaseInsensitive: Cyrillic input, letters and digits, lower-cased.
     *
     * @return void
     */
    public function testUtf8NumCaseInsensitive()
    {
        $this->_skipUnlessPcreUnicodeSupported();
        $this->_skipUnlessMbstringAvailable();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens($tokenList, array(
            array('слово1',      0,  6,  1),
            array('слово2',      7,  13, 1),
            array('другоеслово', 14, 25, 1),
        ));
    }

    /**
     * Common_Utf8 fed with non-UTF-8 input plus an explicit source encoding.
     *
     * The expected tokens and offsets are the same as in testUtf8().
     *
     * @return void
     */
    public function testEncoding()
    {
        if (PHP_OS === 'AIX') {
            $this->markTestSkipped('Test not available on AIX');
        }
        // Same guard as the other tests that build a Utf8 analyzer.
        $this->_skipUnlessPcreUnicodeSupported();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();

        // Cyrillic text converted to Windows-1251; the input encoding is
        // passed to tokenize() as its second argument.
        $tokenList = $analyzer->tokenize(iconv('UTF-8', 'Windows-1251', 'Слово1 Слово2 ДругоеСлово'), 'Windows-1251');

        $this->_assertTokens($tokenList, array(
            array('Слово',       0,  5,  1),
            array('Слово',       7,  12, 1),
            array('ДругоеСлово', 14, 25, 1),
        ));
    }

    /**
     * StopWords filter: only the token that is not in the stop list is expected.
     *
     * @return void
     */
    public function testStopWords()
    {
        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords(array('word', 'and', 'or'));

        $analyzer->addFilter($stopWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens($tokenList, array(
            array('anotherword', 12, 23, 1),
        ));
    }

    /**
     * ShortWords filter with minimal length 4: "and" is not expected in the result.
     *
     * @return void
     */
    public function testShortWords()
    {
        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $shortWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords(4 /* Minimal length */);

        $analyzer->addFilter($shortWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 and anotherWord');

        $this->_assertTokens($tokenList, array(
            array('word',        0,  4,  1),
            array('anotherword', 10, 21, 1),
        ));
    }
}
|