Spade
Mini Shell
| Directory:~$ /home/lmsyaran/public_html/joomla5/libraries/vendor/wamania/php-stemmer/src/Stemmer/ |
| [Home] [System Details] [Kill Me] |
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/dutch/stemmer.html
* @author wamania
*
*/
class Dutch extends Stem
{
/**
* All dutch vowels
*/
protected static $vowels = array('a', 'e',
'i', 'o', 'u', 'y',
'è');
/**
* {@inheritdoc}
*/
public function stem($word)
{
// we do ALL in UTF-8
if (!UTF8::is_utf8($word)) {
throw new \Exception('Word must be in UTF-8');
}
$this->word = UTF8::strtolower($word);
// First, remove all umlaut and acute accents.
$this->word = UTF8::str_replace(
array('ä', 'ë', 'ï',
'ö', 'ü', 'á', 'é',
'í', 'ó', 'ú'),
array('a', 'e', 'i',
'o', 'u', 'a', 'e', 'i',
'o', 'u'),
$this->word);
$this->plainVowels = implode('', self::$vowels);
// Put initial y, y after a vowel, and i between vowels into upper
case.
$this->word = preg_replace('#^y#u', 'Y',
$this->word);
$this->word =
preg_replace('#(['.$this->plainVowels.'])y#u',
'$1Y', $this->word);
$this->word =
preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u',
'$1I$2', $this->word);
// R1 and R2 (see the note on R1 and R2) are then defined as in
German.
// R1 and R2 are first set up in the standard way
$this->r1();
$this->r2();
// but then R1 is adjusted so that the region before it contains at
least 3 letters.
if ($this->r1Index < 3) {
$this->r1Index = 3;
$this->r1 = UTF8::substr($this->word, 3);
}
// Do each of steps 1, 2 3 and 4.
$this->step1();
$removedE = $this->step2();
$this->step3a();
$this->step3b($removedE);
$this->step4();
$this->finish();
return $this->word;
}
/**
* Define a valid s-ending as a non-vowel other than j.
* @param string $ending
* @return boolean
*/
private function hasValidSEnding($word)
{
$lastLetter = UTF8::substr($word, -1, 1);
return !in_array($lastLetter, array_merge(self::$vowels,
array('j')));
}
/**
* Define a valid en-ending as a non-vowel, and not gem.
* @param string $ending
* @return boolean
*/
private function hasValidEnEnding($word)
{
$lastLetter = UTF8::substr($word, -1, 1);
if (in_array($lastLetter, self::$vowels)) {
return false;
}
$threeLastLetters = UTF8::substr($word, -3, 3);
if ($threeLastLetters == 'gem') {
return false;
}
return true;
}
/**
* Define undoubling the ending as removing the last letter if the
word ends kk, dd or tt.
*/
private function unDoubling()
{
if ($this->search(array('kk', 'dd',
'tt')) !== false) {
$this->word = UTF8::substr($this->word, 0, -1);
}
}
/**
* Step 1
* Search for the longest among the following suffixes, and perform the
action indicated
*/
private function step1()
{
// heden
// replace with heid if in R1
if ( ($position = $this->search(array('heden'))) !==
false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(heden)$#u',
'heid', $this->word);
}
return true;
}
// en ene
// delete if in R1 and preceded by a valid en-ending, and then
undouble the ending
if ( ($position = $this->search(array('ene',
'en'))) !== false) {
if ($this->inR1($position)) {
$word = UTF8::substr($this->word, 0, $position);
if ($this->hasValidEnEnding($word)) {
$this->word = $word;
$this->unDoubling();
}
}
return true;
}
// s se
// delete if in R1 and preceded by a valid s-ending
if ( ($position = $this->search(array('se',
's'))) !== false) {
if ($this->inR1($position)) {
$word = UTF8::substr($this->word, 0, $position);
if ($this->hasValidSEnding($word)) {
$this->word = $word;
}
}
return true;
}
return false;
}
/**
* Step 2
* Delete suffix e if in R1 and preceded by a non-vowel, and then
undouble the ending
*/
private function step2()
{
if ( ($position = $this->search(array('e'))) !==
false) {
if ($this->inR1($position)) {
$letter = UTF8::substr($this->word, -2, 1);
if (!in_array($letter, self::$vowels)) {
$this->word = UTF8::substr($this->word, 0,
$position);
$this->unDoubling();
return true;
}
}
}
return false;
}
/**
* Step 3a: heid
* delete heid if in R2 and not preceded by c, and treat a preceding en
as in step 1(b)
*/
private function step3a()
{
if ( ($position = $this->search(array('heid'))) !==
false) {
if ($this->inR2($position)) {
$letter = UTF8::substr($this->word, -5, 1);
if ($letter !== 'c') {
$this->word = UTF8::substr($this->word, 0,
$position);
if ( ($position =
$this->search(array('en'))) !== false) {
if ($this->inR1($position)) {
$word = UTF8::substr($this->word, 0,
$position);
if ($this->hasValidEnEnding($word)) {
$this->word = $word;
$this->unDoubling();
}
}
}
}
}
}
}
/**
* Step 3b: d-suffixe
* Search for the longest among the following suffixes, and perform the
action indicated.
*/
private function step3b($removedE)
{
// end ing
// delete if in R2
// if preceded by ig, delete if in R2 and not preceded by e,
otherwise undouble the ending
if ( ($position = $this->search(array('end',
'ing'))) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0,
$position);
if ( ($position2 =
$this->searchIfInR2(array('ig'))) !== false) {
$letter = UTF8::substr($this->word, -3, 1);
if ($letter !== 'e') {
$this->word = UTF8::substr($this->word, 0,
$position2);
}
} else {
$this->unDoubling();
}
}
return true;
}
// ig
// delete if in R2 and not preceded by e
if ( ($position = $this->search(array('ig'))) !==
false) {
if ($this->inR2($position)) {
$letter = UTF8::substr($this->word, -3, 1);
if ($letter !== 'e') {
$this->word = UTF8::substr($this->word, 0,
$position);
}
}
return true;
}
// lijk
// delete if in R2, and then repeat step 2
if ( ($position = $this->search(array('lijk'))) !==
false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0,
$position);
$this->step2();
}
return true;
}
// baar
// delete if in R2
if ( ($position = $this->search(array('baar'))) !==
false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0,
$position);
}
return true;
}
// bar
// delete if in R2 and if step 2 actually removed an e
if ( ($position = $this->search(array('bar'))) !==
false) {
if ($this->inR2($position) && $removedE) {
$this->word = UTF8::substr($this->word, 0,
$position);
}
return true;
}
return false;
}
/**
* Step 4: undouble vowel
* If the words ends CVD, where C is a non-vowel, D is a non-vowel
other than I, and V is double a, e, o or u,
* remove one of the vowels from V (for example, maan -> man, brood
-> brod).
*/
private function step4()
{
// D is a non-vowel other than I
$d = UTF8::substr($this->word, -1, 1);
if (in_array($d, array_merge(self::$vowels, array('I'))))
{
return false;
}
// V is double a, e, o or u
$v = UTF8::substr($this->word, -3, 2);
if (!in_array($v, array('aa', 'ee',
'oo', 'uu'))) {
return false;
}
$singleV = UTF8::substr($v, 0, 1);
// C is a non-vowel
$c = UTF8::substr($this->word, -4, 1);
if (in_array($c, self::$vowels)) {
return false;
}
$this->word = UTF8::substr($this->word, 0, -4);
$this->word .= $c . $singleV .$d;
}
/**
* Finally
* Turn I and Y back into lower case.
*/
private function finish()
{
$this->word = UTF8::str_replace(array('I',
'Y'), array('i', 'y'), $this->word);
}
}