You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
cacert-testmgr/external/ZendFramework-1.9.5/tests/Zend/Search/Lucene/AnalysisTest.php

348 lines
15 KiB
PHP

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage UnitTests
* @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: AnalysisTest.php 17363 2009-08-03 07:40:18Z bkarwin $
*/
/**
* Zend_Search_Lucene
*/
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* PHPUnit test case
*/
require_once 'PHPUnit/Framework/TestCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage UnitTests
* @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @group Zend_Search_Lucene
*/
class Zend_Search_Lucene_AnalysisTest extends PHPUnit_Framework_TestCase
{
    /**
     * Marks the running test as skipped when PCRE cannot match Unicode
     * letter properties with the /u modifier.
     *
     * @return void
     */
    private function _skipUnlessPcreUnicode()
    {
        // The "@" is deliberate: this is a capability probe, and only the
        // return value matters. Any diagnostic raised by an unsupported
        // pattern must not be turned into a test error.
        if (@preg_match('/\pL/u', 'a') != 1) {
            $this->markTestSkipped('PCRE unicode support is turned off');
        }
    }

    /**
     * Marks the running test as skipped when mb_strtolower() is unavailable.
     *
     * @return void
     */
    private function _skipUnlessMbstring()
    {
        if (!function_exists('mb_strtolower')) {
            $this->markTestSkipped('mbstring extension is disabled');
        }
    }

    /**
     * Asserts that a token list matches the expected tokens, in order.
     *
     * Each expected entry is array(termText, startOffset, endOffset,
     * positionIncrement) and is compared against the corresponding getters
     * of the token at the same index.
     *
     * @param  array $expectedTokens list of 4-element arrays as described above
     * @param  array $tokenList      tokens returned by an analyzer's tokenize()
     * @return void
     */
    private function _assertTokens(array $expectedTokens, $tokenList)
    {
        // Checked first so that the indexed access below never hits a missing
        // element: a count mismatch aborts the test right here.
        $this->assertEquals(count($expectedTokens), count($tokenList), 'Unexpected number of tokens');

        foreach ($expectedTokens as $index => $expected) {
            list($termText, $startOffset, $endOffset, $positionIncrement) = $expected;
            $token = $tokenList[$index];

            $this->assertEquals($termText, $token->getTermText(), "Term text of token #$index");
            $this->assertEquals($startOffset, $token->getStartOffset(), "Start offset of token #$index");
            $this->assertEquals($endOffset, $token->getEndOffset(), "End offset of token #$index");
            $this->assertEquals(
                $positionIncrement,
                $token->getPositionIncrement(),
                "Position increment of token #$index"
            );
        }
    }

    /**
     * The default analyzer can be read and replaced through the static
     * getDefault()/setDefault() accessors.
     */
    public function testAnalyzer()
    {
        $currentAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

        $newAnalyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($newAnalyzer);
        $installedAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

        // Put the original default back (it is used in other tests) BEFORE
        // asserting, so a failing assertion cannot leak the replacement
        // analyzer into the rest of the test run.
        Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);

        $this->assertTrue($currentAnalyzer instanceof Zend_Search_Lucene_Analysis_Analyzer);
        $this->assertTrue($installedAnalyzer === $newAnalyzer);
    }

    /**
     * The Text analyzer keeps letters only (digits are dropped) and
     * preserves case.
     */
    public function testText()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('Word', 0, 4, 1),
                array('Word', 6, 10, 1),
                array('anotherWord', 12, 23, 1),
            ),
            $tokenList
        );
    }

    /**
     * The case-insensitive Text analyzer keeps letters only and lower-cases
     * the terms.
     */
    public function testTextCaseInsensitive()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('word', 0, 4, 1),
                array('word', 6, 10, 1),
                array('anotherword', 12, 23, 1),
            ),
            $tokenList
        );
    }

    /**
     * The TextNum analyzer keeps letters and digits and preserves case.
     */
    public function testTextNum()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('Word1', 0, 5, 1),
                array('Word2', 6, 11, 1),
                array('anotherWord', 12, 23, 1),
            ),
            $tokenList
        );
    }

    /**
     * The case-insensitive TextNum analyzer keeps letters and digits and
     * lower-cases the terms.
     */
    public function testTextNumCaseInsensitive()
    {
        $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();
        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('word1', 0, 5, 1),
                array('word2', 6, 11, 1),
                array('anotherword', 12, 23, 1),
            ),
            $tokenList
        );
    }

    /**
     * The Utf8 analyzer keeps letters only; the expected offsets below count
     * characters, not bytes.
     */
    public function testUtf8()
    {
        $this->_skipUnlessPcreUnicode();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('Слово', 0, 5, 1),
                array('Слово', 7, 12, 1),
                array('ДругоеСлово', 14, 25, 1),
            ),
            $tokenList
        );
    }

    /**
     * The Utf8Num analyzer keeps letters and digits of UTF-8 input.
     */
    public function testUtf8Num()
    {
        $this->_skipUnlessPcreUnicode();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('Слово1', 0, 6, 1),
                array('Слово2', 7, 13, 1),
                array('ДругоеСлово', 14, 25, 1),
            ),
            $tokenList
        );
    }

    /**
     * The case-insensitive Utf8 analyzer keeps letters only and lower-cases
     * Cyrillic terms.
     */
    public function testUtf8CaseInsensitive()
    {
        $this->_skipUnlessPcreUnicode();
        $this->_skipUnlessMbstring();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('слово', 0, 5, 1),
                array('слово', 7, 12, 1),
                array('другоеслово', 14, 25, 1),
            ),
            $tokenList
        );
    }

    /**
     * The case-insensitive Utf8Num analyzer keeps letters and digits and
     * lower-cases Cyrillic terms.
     */
    public function testUtf8NumCaseInsensitive()
    {
        $this->_skipUnlessPcreUnicode();
        $this->_skipUnlessMbstring();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive();

        // UTF-8 text with Cyrillic symbols
        $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

        $this->_assertTokens(
            array(
                array('слово1', 0, 6, 1),
                array('слово2', 7, 13, 1),
                array('другоеслово', 14, 25, 1),
            ),
            $tokenList
        );
    }

    /**
     * The Utf8 analyzer accepts input in a non-UTF-8 encoding when that
     * encoding is passed to tokenize(), and yields the same terms and
     * offsets as for the UTF-8 form of the text.
     */
    public function testEncoding()
    {
        if (PHP_OS == 'AIX') {
            $this->markTestSkipped('Test not available on AIX');
        }

        // Same analyzer class as testUtf8(), so the same capability guard applies.
        $this->_skipUnlessPcreUnicode();

        $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();

        // Cyrillic text, converted from this file's UTF-8 literal to Windows-1251
        $encodedText = iconv('UTF-8', 'Windows-1251', 'Слово1 Слово2 ДругоеСлово');
        $tokenList   = $analyzer->tokenize($encodedText, 'Windows-1251');

        $this->_assertTokens(
            array(
                array('Слово', 0, 5, 1),
                array('Слово', 7, 12, 1),
                array('ДругоеСлово', 14, 25, 1),
            ),
            $tokenList
        );
    }

    /**
     * A StopWords filter removes the listed terms from the token stream;
     * here both "word" tokens are dropped and only "anotherword" is left.
     */
    public function testStopWords()
    {
        $analyzer        = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords(array('word', 'and', 'or'));
        $analyzer->addFilter($stopWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');

        $this->_assertTokens(
            array(
                array('anotherword', 12, 23, 1),
            ),
            $tokenList
        );
    }

    /**
     * A ShortWords filter with minimal length 4 drops "and" but keeps the
     * four-letter term "word".
     */
    public function testShortWords()
    {
        $analyzer         = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
        $shortWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords(4 /* Minimal length */);
        $analyzer->addFilter($shortWordsFilter);

        $tokenList = $analyzer->tokenize('Word1 and anotherWord');

        $this->_assertTokens(
            array(
                array('word', 0, 4, 1),
                array('anotherword', 10, 21, 1),
            ),
            $tokenList
        );
    }
}