cacert-testmgr/external/ZendFramework-1.9.5/tests/Zend/Search/Lucene/AnalysisTest.php
Markus Warg 8398c9048d initially import ZendFramework-1.9.5 into repository
code was modified slightly, so the code differs from the original downloadable 1.9.5 version
2010-03-31 10:12:32 +02:00

347 lines
15 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage UnitTests
* @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: AnalysisTest.php 17363 2009-08-03 07:40:18Z bkarwin $
*/
/**
* Zend_Search_Lucene
*/
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* PHPUnit test case
*/
require_once 'PHPUnit/Framework/TestCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage UnitTests
* @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @group Zend_Search_Lucene
*/
class Zend_Search_Lucene_AnalysisTest extends PHPUnit_Framework_TestCase
{
    /**
     * Marks the running test as skipped unless PCRE understands Unicode
     * character properties (the "\pL" class together with the "u" modifier).
     *
     * @return void
     */
    protected function _requirePcreUnicode()
    {
        // "@" is deliberate: the probe itself may emit a warning when the
        // "u" modifier is not supported, and that is exactly what is detected.
        if (@preg_match('/\pL/u', 'a') !== 1) {
            $this->markTestSkipped('PCRE unicode support is turned off');
        }
    }

    /**
     * Marks the running test as skipped unless mb_strtolower() is available.
     *
     * @return void
     */
    protected function _requireMbstring()
    {
        if (!function_exists('mb_strtolower')) {
            $this->markTestSkipped('mbstring extension is disabled');
        }
    }

    /**
     * Asserts that a token list matches the expected tokens exactly.
     *
     * The token count is checked first; then, for every token, the term text,
     * start offset, end offset and a position increment of 1 are checked, in
     * that order.
     *
     * @param array $expected  List of array(termText, startOffset, endOffset)
     * @param mixed $tokenList Value returned by the analyzer's tokenize() call
     * @return void
     */
    protected function _assertTokens(array $expected, $tokenList)
    {
        $this->assertEquals(count($expected), count($tokenList), 'token count');

        foreach ($expected as $index => $spec) {
            list($termText, $startOffset, $endOffset) = $spec;
            $token = $tokenList[$index];

            $this->assertEquals($termText,    $token->getTermText(),          "token #$index term text");
            $this->assertEquals($startOffset, $token->getStartOffset(),       "token #$index start offset");
            $this->assertEquals($endOffset,   $token->getEndOffset(),         "token #$index end offset");
            $this->assertEquals(1,            $token->getPositionIncrement(), "token #$index position increment");
        }
    }

    /**
     * The default analyzer can be read, replaced and read back.
     *
     * The previous default is always restored, even when the assertion fails,
     * because other tests rely on it.
     *
     * @return void
     */
    public function testAnalyzer()
    {
        $currentAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $this->assertTrue($currentAnalyzer instanceof Zend_Search_Lucene_Analysis_Analyzer);

        $newAnalyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($newAnalyzer);

        try {
            $this->assertSame($newAnalyzer, Zend_Search_Lucene_Analysis_Analyzer::getDefault());
        } catch (Exception $e) {
            // No "finally" here: restore the global default by hand before
            // letting the failure propagate.
            Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);
            throw $e;
        }

        // Set analyzer to the default value (used in other tests)
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);
    }

    /**
     * Text analyzer: digits are not part of a term and letter case is kept.
     *
     * @return void
     */
    public function testText()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('Word',        0,  4),
                array('Word',        6,  10),
                array('anotherWord', 12, 23),
            ),
            $tokenList
        );
    }

    /**
     * Case-insensitive Text analyzer: digits are dropped, terms are lower-cased.
     *
     * @return void
     */
    public function testTextCaseInsensitive()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('word',        0,  4),
                array('word',        6,  10),
                array('anotherword', 12, 23),
            ),
            $tokenList
        );
    }

    /**
     * TextNum analyzer: digits belong to the term and letter case is kept.
     *
     * @return void
     */
    public function testTextNum()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('Word1',       0,  5),
                array('Word2',       6,  11),
                array('anotherWord', 12, 23),
            ),
            $tokenList
        );
    }

    /**
     * Case-insensitive TextNum analyzer: digits are kept, terms are lower-cased.
     *
     * @return void
     */
    public function testTextNumCaseInsensitive()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('word1',       0,  5),
                array('word2',       6,  11),
                array('anotherword', 12, 23),
            ),
            $tokenList
        );
    }

    /**
     * Utf8 analyzer on Cyrillic input: digits are dropped and case is kept.
     *
     * The expected offsets count characters, not bytes.
     *
     * @return void
     */
    public function testUtf8()
    {
        $this->_requirePcreUnicode();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();

        // UTF-8 text with Cyrillic letters
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('Слово',       0,  5),
                array('Слово',       7,  12),
                array('ДругоеСлово', 14, 25),
            ),
            $tokenList
        );
    }

    /**
     * Utf8Num analyzer on Cyrillic input: digits are kept and case is kept.
     *
     * The expected offsets count characters, not bytes.
     *
     * @return void
     */
    public function testUtf8Num()
    {
        $this->_requirePcreUnicode();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();

        // UTF-8 text with Cyrillic letters
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('Слово1',      0,  6),
                array('Слово2',      7,  13),
                array('ДругоеСлово', 14, 25),
            ),
            $tokenList
        );
    }

    /**
     * Case-insensitive Utf8 analyzer: digits are dropped and Cyrillic terms
     * are lower-cased.
     *
     * @return void
     */
    public function testUtf8CaseInsensitive()
    {
        $this->_requirePcreUnicode();
        $this->_requireMbstring();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive();

        // UTF-8 text with Cyrillic letters
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('слово',       0,  5),
                array('слово',       7,  12),
                array('другоеслово', 14, 25),
            ),
            $tokenList
        );
    }

    /**
     * Case-insensitive Utf8Num analyzer: digits are kept and Cyrillic terms
     * are lower-cased.
     *
     * @return void
     */
    public function testUtf8NumCaseInsensitive()
    {
        $this->_requirePcreUnicode();
        $this->_requireMbstring();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive();

        // UTF-8 text with Cyrillic letters
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('слово1',      0,  6),
                array('слово2',      7,  13),
                array('другоеслово', 14, 25),
            ),
            $tokenList
        );
    }

    /**
     * Utf8 analyzer fed Windows-1251 input: the resulting terms and offsets
     * are the same as for the equivalent UTF-8 input in testUtf8().
     *
     * @return void
     */
    public function testEncoding()
    {
        if (PHP_OS === 'AIX') {
            $this->markTestSkipped('Test not available on AIX');
        }
        // Same analyzer as testUtf8(), so the same prerequisite applies.
        $this->_requirePcreUnicode();
        if (!function_exists('iconv')) {
            $this->markTestSkipped('iconv extension is disabled');
        }

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();

        // Cyrillic text converted from UTF-8 to Windows-1251 before tokenizing
        $encodedText = iconv('UTF-8', 'Windows-1251', 'Слово1 Слово2 ДругоеСлово');
        $tokenList   = $analyzer->tokenize($encodedText, 'Windows-1251');

        $this->_assertTokens(
            array(
                array('Слово',       0,  5),
                array('Слово',       7,  12),
                array('ДругоеСлово', 14, 25),
            ),
            $tokenList
        );
    }

    /**
     * StopWords filter: both "word" tokens are removed; the surviving token
     * keeps its original offsets and a position increment of 1.
     *
     * @return void
     */
    public function testStopWords()
    {
        $analyzer        = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords(array('word', 'and', 'or'));
        $analyzer->addFilter($stopWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('anotherword', 12, 23),
            ),
            $tokenList
        );
    }

    /**
     * ShortWords filter with minimal length 4: the 3-letter "and" is removed
     * while the 4-letter "word" is kept.
     *
     * @return void
     */
    public function testShortWords()
    {
        $analyzer         = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $shortWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords(4 /* Minimal length */);
        $analyzer->addFilter($shortWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 and anotherWord');

        $this->_assertTokens(
            array(
                array('word',        0,  4),
                array('anotherword', 10, 21),
            ),
            $tokenList
        );
    }
}