[0.3.1] [UKR] Распознавание фамилии имя отчества

This commit is contained in:
Andriy Chaika 2011-07-02 13:31:26 +03:00
parent ec4aad97e5
commit 13f036b920
16 changed files with 43163 additions and 4 deletions

View File

@ -696,7 +696,11 @@ class NCLNameCaseUa extends NCLNameCaseCore implements NCLNameCaseInterface
}
if (in_array($FLastTwo, array('он', 'ов', 'ав', 'ам', 'ол', 'ан', 'рд', 'мп', 'ко', 'ло')))
{
$man+=0.3;
$man+=0.5;
}
if (in_array($FLastThree, array('бов', 'нка', 'яра', 'ила')))
{
$woman+=0.5;
}
if ($this->in($FLastSymbol, $this->consonant))
{
@ -718,10 +722,12 @@ class NCLNameCaseUa extends NCLNameCaseCore implements NCLNameCaseInterface
}
}
// $man*=1.2;
// $woman*=1.2;
if (isset($this->secondName) and $this->secondName)
{
if (in_array($SLastTwo, array('ов', 'ин', 'ев','ич', 'єв', 'ін', 'їн', 'ий', 'їв', 'ів', 'ій', 'ой','ей')))
if (in_array($SLastTwo, array('ов', 'ин', 'ев', 'єв', 'ін', 'їн', 'ий', 'їв', 'ів', 'ой', 'ей')))
{
$man+=0.4;
}
@ -757,8 +763,65 @@ class NCLNameCaseUa extends NCLNameCaseCore implements NCLNameCaseInterface
/**
 * Guesses which part of a full name a single word is.
 *
 * Three scores are accumulated from suffix heuristics and a list of known
 * given names: one for a given name, one for a surname and one for a
 * patronymic. The highest score wins; ties are resolved in the order
 * given name, surname, patronymic, so a word that matches no rule at all
 * is reported as a given name.
 *
 * The block comments of the form {endingsN} inside the suffix arrays are
 * placeholders substituted by the endings test tooling; they must stay
 * directly after the last array element.
 *
 * @param string $namepart a single word of the name, UTF-8 encoded
 * @return string 'N' (given name), 'S' (surname) or 'F' (patronymic)
 */
protected function detectNamePart($namepart)
{
    $LastTwo = mb_substr($namepart, -2, 2, 'utf-8');
    $LastThree = mb_substr($namepart, -3, 3, 'utf-8');
    $LastFour = mb_substr($namepart, -4, 4, 'utf-8');

    // Scores per category. (An unconditional early "return 'N';" used to sit
    // here and made every rule below unreachable.)
    $first = 0;
    $second = 0;
    $father = 0;

    // Looks like a patronymic: weighted far above the other suffix rules.
    if (in_array($LastThree, array('вна', 'чна', 'ліч'), true) or in_array($LastFour, array('ьмич', 'ович'), true))
    {
        $father+=3;
    }

    // Looks like a given name.
    if (in_array($LastThree, array('тин'), true) or in_array($LastFour, array('ьмич', 'юбов'), true))
    {
        $first+=0.5;
    }

    // Exceptions: known given names whose endings would otherwise look like surnames.
    if (in_array($namepart, array('Лев', 'Гаїна', 'Афіна', 'Антоніна', 'Ангеліна', 'Альвіна', 'Альбіна', 'Аліна', 'Павло', 'Олесь'), true))
    {
        $first+=10;
    }

    // Looks like a surname: two-, three- and four-letter endings each add to the score.
    if (in_array($LastTwo, array('ов', 'ін', 'ев', 'єв', 'ий', 'ин', 'ой', 'ко', 'ук', 'як', 'ца', 'их', 'ик', 'ун', 'ок', 'ша', 'ая', 'га', 'єк', 'аш', 'ив', 'юк', 'ус', 'це', 'ак', 'бр', 'яр', 'іл', 'ів', 'ич', 'сь', 'ей', 'нс', 'яс', 'ер', 'ай', 'ян', 'ах', 'ць', 'ющ', 'іс', 'ач', 'уб', 'ох', 'юх','ут','ча','ул','вк','зь' /*{endings2}*/), true))
    {
        $second+=0.4;
    }
    if (in_array($LastThree, array('ова', 'ева', 'єва', 'тих', 'рик', 'вач', 'аха', 'шен', 'мей', 'арь', 'вка', 'шир', 'бан', 'чий', 'іна', 'їна', 'ька', 'ань', 'ива', 'аль','ура','ран','ало','ола','кур','оба','оль','нта','зій','ґан','іло','шта', 'юпа', 'рна', 'бла', 'еїн', 'има', 'мар', 'кар', 'оха', 'чур', 'ниш', 'ета', 'тна', 'зур', 'нір', 'йма', 'орж', 'рба' /*{endings3}*/), true))
    {
        $second+=0.4;
    }
    if (in_array($LastFour, array('ьник', 'нчук', 'тник', 'кирь', 'ский', 'шена', 'шина', 'вина', 'нина' /*{endings4}*/), true))
    {
        $second+=0.4;
    }

    // Pick the best-scoring category; the order of the checks is the tie-break.
    $max = max(array($first, $second, $father));
    if ($first == $max)
    {
        return 'N';
    }
    elseif ($second == $max)
    {
        return 'S';
    }
    else
    {
        return 'F';
    }
}
}

2
Tests/AutoTest/Endings/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.txt
compiled/

View File

@ -0,0 +1,81 @@
<?php
/**
* Description of BatchTester
*
* @author seagull
*/
require 'EndingsCompiler.php';
/**
 * Runs the name-split PHPUnit suite against a compiled copy of the Ukrainian
 * NameCase library and reports the share of assertions that did not fail.
 *
 * The constructor copies the core library and a path-adjusted copy of the
 * test file into the compiled/ directory, compiles the library once without
 * an extra ending, and stores that baseline pass rate in $epsilon.
 */
class BatchTester
{
    /** @var EndingsCompiler writes the library copy the tests run against */
    private $compiler;
    /** @var string placeholder name handed to the compiler */
    private $mode;
    /** @var float pass rate measured with no extra ending injected */
    public $epsilon;

    /**
     * @param string $mode placeholder name passed on to EndingsCompiler
     * @throws RuntimeException when the test file cannot be read or the
     *                          PHPUnit output cannot be interpreted
     */
    public function __construct($mode)
    {
        // compiled/ is git-ignored, so it does not exist on a fresh checkout.
        if (!is_dir('compiled'))
        {
            mkdir('compiled', 0777, true);
        }
        copy('../../../Library/NCL.NameCase.core.php','compiled/NCL.NameCase.core.php');
        $testPHP = file_get_contents('../../Library/NameSplitUAname.php');
        if ($testPHP === false)
        {
            throw new RuntimeException('Cannot read ../../Library/NameSplitUAname.php');
        }
        // Point the test at the compiled library that sits next to it.
        $testPHP = str_replace("require_once dirname(__FILE__) . '/../../Library/NCL.NameCase.ua.php';", "require_once dirname(__FILE__) . '/NCL.NameCase.ua.php';", $testPHP);
        file_put_contents('compiled/Test.php', $testPHP);
        $this->compiler = new EndingsCompiler($mode);
        $this->mode = $mode;
        $this->compiler->compile();
        $this->epsilon = $this->runTest();
    }

    /**
     * Recompiles the library with one extra ending and re-runs the suite.
     *
     * @param string $ending ending to inject at the placeholder
     * @return float pass rate, see runTest()
     */
    public function testText($ending)
    {
        $this->compiler->compile($ending);
        return $this->runTest();
    }

    /**
     * Executes PHPUnit on compiled/Test.php and returns the pass rate.
     *
     * @return float 1 - failures / assertions
     * @throws RuntimeException when no PHPUnit summary is found in the output
     */
    public function runTest()
    {
        $output = array();
        exec("phpunit compiled/Test.php", $output);
        return self::parseSuccessRate($output);
    }

    /**
     * Extracts the pass rate from the lines printed by PHPUnit.
     *
     * Lines are scanned from the end. A "Tests: .., Assertions: .., Failures: .."
     * summary yields 1 - failures / assertions; an "OK (.. tests, .. assertions)"
     * summary yields 1.0 (previously this case fell through to a division of
     * zero by zero).
     *
     * @param array $output lines of PHPUnit output
     * @return float pass rate between 0 and 1
     * @throws RuntimeException when no summary line is recognised or the
     *                          summary reports zero assertions
     */
    public static function parseSuccessRate(array $output)
    {
        for ($i = count($output) - 1; $i >= 0; $i--)
        {
            $line = $output[$i];
            if (preg_match('#Tests: ([\d]+), Assertions: ([\d]+), Failures: ([\d]+).#is', $line, $found))
            {
                $assertions = (float)$found[2];
                $failures = (float)$found[3];
                if ($assertions == 0)
                {
                    throw new RuntimeException('PHPUnit reported zero assertions');
                }
                return (1-($failures/$assertions));
            }
            if (preg_match('#^OK \(\d+ tests?, \d+ assertions?\)#', $line))
            {
                return 1.0;
            }
        }
        throw new RuntimeException('No PHPUnit summary found in output: ' . implode("\n", $output));
    }
}
// Driver: measure the baseline pass rate, then re-run the suite once per
// candidate ending from endings3.txt. Endings whose pass rate equals the
// baseline are collected in $good, all others in $bad.
$good = array();
$bad = array();
$tester = new BatchTester('endings3');
$epsilon = $tester->epsilon;
$endings = file('endings3.txt');
if ($endings === false)
{
    fwrite(STDERR, "Cannot read endings3.txt\n");
    exit(1);
}
foreach ($endings as $line)
{
    $ending = trim($line);
    // A blank line would be compiled as "no ending" and always match the baseline.
    if ($ending === '')
    {
        continue;
    }
    $percent = $tester->testText($ending);
    echo $ending." - ".$percent."\n";
    // Entries are stored trimmed so the report has no stray line breaks.
    if($percent==$epsilon)
    {
        $good[] = $ending;
    }
    else
    {
        $bad[] = $ending;
    }
}
echo "\n\n GOOD";
print_r($good);
echo "\n\n BAD";
print_r($bad);
?>

View File

@ -0,0 +1,41 @@
<?php
/**
* Description of EndingsCompiler
*
* @author seagull
*/
/**
 * Produces a copy of the Ukrainian NameCase library in which one placeholder
 * comment of the form {mode} (wrapped in block-comment delimiters) is
 * replaced by an extra quoted array element, or removed when no text is given.
 */
class EndingsCompiler
{
    /** @var string contents of the source library file */
    private $template;
    /** @var string path the compiled copy is written to */
    private $destination = 'compiled/NCL.NameCase.ua.php';
    /** @var string path of the library used as the template */
    private $sourceFile = '../../../Library/NCL.NameCase.ua.php';
    /** @var string result of the last compile() call */
    private $compiled = '';
    /** @var string placeholder name, e.g. 'endings3' */
    private $mode = 'endings3';

    /**
     * @param string      $mode        placeholder name to substitute; an empty
     *                                 value keeps the default 'endings3'
     * @param string|null $sourceFile  optional override of the template path
     * @param string|null $destination optional override of the output path
     * @throws RuntimeException when the template cannot be read
     */
    public function __construct($mode, $sourceFile = null, $destination = null)
    {
        // The mode argument used to be ignored, so every instance silently
        // worked on the 'endings3' placeholder.
        if ($mode)
        {
            $this->mode = $mode;
        }
        if ($sourceFile !== null)
        {
            $this->sourceFile = $sourceFile;
        }
        if ($destination !== null)
        {
            $this->destination = $destination;
        }
        $template = file_get_contents($this->sourceFile);
        if ($template === false)
        {
            throw new RuntimeException('Cannot read template ' . $this->sourceFile);
        }
        $this->template = $template;
    }

    /**
     * Writes the compiled source to the destination, creating its directory
     * if it is missing.
     *
     * @throws RuntimeException when the file cannot be written
     */
    private function save()
    {
        $dir = dirname($this->destination);
        if (!is_dir($dir))
        {
            mkdir($dir, 0777, true);
        }
        if (file_put_contents($this->destination, $this->compiled) === false)
        {
            throw new RuntimeException('Cannot write ' . $this->destination);
        }
    }

    /**
     * Substitutes the placeholder and saves the result.
     *
     * @param string $text ending to append as a new single-quoted array
     *                     element; an empty value just removes the placeholder
     */
    public function compile($text = '')
    {
        if($text)
        {
            // Escape quotes and backslashes so the ending cannot break the
            // generated PHP string literal.
            $text = ", '".addcslashes($text, "'\\")."'";
        }
        $this->compiled = str_replace('/*{'.$this->mode.'}*/', $text , $this->template);
        $this->save();
    }
}
?>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,66 @@
<?php
require_once dirname(__FILE__) . '/../../Library/NCL.NameCase.ua.php';
/**
 * PHPUnit cases checking that splitFullName() labels each single-word input
 * below with 'F'.
 */
class NCLNameCaseUaTest extends PHPUnit_Framework_TestCase
{
    /**
     * Object under test.
     *
     * @var NCLNameCaseUa
     */
    protected $object;

    /**
     * Creates a fresh NCLNameCaseUa before every test.
     */
    protected function setUp()
    {
        $this->object = new NCLNameCaseUa;
    }

    /**
     * Nothing to clean up after a test.
     */
    protected function tearDown()
    {
    }

    /**
     * Asserts that the label returned by splitFullName($word), followed by
     * '-' and the word itself, equals $expected.
     *
     * @param string $expected expected "<label>-<word>" string
     * @param string $word     input passed to splitFullName()
     */
    private function assertSplit($expected, $word)
    {
        $actual = $this->object->splitFullName($word) . '-' . $word;
        $this->assertEquals($expected, $actual);
    }

    public function testSplDetect0()
    {
        $this->assertSplit('F-Олександрович', 'Олександрович');
    }

    public function testSplDetect1()
    {
        $this->assertSplit('F-Миколайович', 'Миколайович');
    }

    public function testSplDetect2()
    {
        $this->assertSplit('F-Кузьмич', 'Кузьмич');
    }

    public function testSplDetect3()
    {
        $this->assertSplit('F-Петрович', 'Петрович');
    }

    public function testSplDetect4()
    {
        $this->assertSplit('F-Ілліч', 'Ілліч');
    }

    public function testSplDetect5()
    {
        $this->assertSplit('F-Василівна', 'Василівна');
    }

    public function testSplDetect6()
    {
        $this->assertSplit('F-Антонівна', 'Антонівна');
    }

    public function testSplDetect7()
    {
        $this->assertSplit('F-Яківна', 'Яківна');
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,99 @@
<?php
/**
 * Generates the combined "surname + given name" gender-detection test file
 * (../Library/GenderDetectUABoth.php) from name lists stored in the local
 * MySQL database `names`.
 */
class TestGenerator
{
    /** @var string contents of the outer test-class template */
    public $maintemplate;
    /** @var string contents of the per-test template */
    public $testtemplate;
    public $resultArrB=array();
    public $resultArrG=array();
    /** @var int 1 while generating from the boy tables, 2 for the girl tables */
    private $gender;
    /** @var string concatenated test bodies */
    private $tests = '';
    /** @var int running id substituted for {% id %} */
    private $count=0;

    /**
     * Connects to the local `names` database.
     *
     * @throws RuntimeException when the connection or database selection fails
     */
    public function __construct()
    {
        if (!mysql_connect('localhost', 'root', ''))
        {
            throw new RuntimeException('Cannot connect to MySQL: ' . mysql_error());
        }
        if (!mysql_select_db('names'))
        {
            throw new RuntimeException('Cannot select database names: ' . mysql_error());
        }
        mysql_set_charset('utf8');
    }

    /**
     * Reads the `name` column of one table.
     *
     * @param string $table table name
     * @return array list of names
     * @throws RuntimeException when the query fails
     */
    private function fetchNames($table)
    {
        $res = mysql_query("SELECT * FROM {$table}");
        if ($res === false)
        {
            throw new RuntimeException("Query on {$table} failed: " . mysql_error());
        }
        $names = array();
        while($row = mysql_fetch_assoc($res))
        {
            $names[] = $row['name'];
        }
        return $names;
    }

    /**
     * Pairs surnames with given names for one gender and emits one test per
     * pair. The shorter list is cycled so every entry of the longer list is
     * used once.
     *
     * @param string $gender table prefix, 'boy' or 'girl'
     * @return array always empty
     */
    private function make($gender)
    {
        $secondArr = $this->fetchNames("{$gender}second");
        $firstArr = $this->fetchNames("{$gender}names");
        $countFirst = count($firstArr);
        $countSecond = count($secondArr);
        // An empty list would make the modulo below a division by zero.
        if ($countFirst === 0 || $countSecond === 0)
        {
            return array();
        }
        $max = max(array($countFirst, $countSecond));
        for($i=0; $i<$max; $i++)
        {
            $indexFirst = $i % $countFirst;
            $indexSecond = $i % $countSecond;
            $this->generateTest($secondArr[$indexSecond],$firstArr[$indexFirst]);
        }
        return array();
    }

    /**
     * Builds the test file for both genders and writes it to
     * ../Library/GenderDetectUABoth.php.
     *
     * @throws RuntimeException when a template cannot be read
     */
    public function generate()
    {
        $this->maintemplate = file_get_contents('Template/NCLNameCaseUaTest.main');
        $this->testtemplate = file_get_contents('Template/GenderGenBoth.test');
        if ($this->maintemplate === false || $this->testtemplate === false)
        {
            throw new RuntimeException('Cannot read test templates from Template/');
        }
        $this->count=0;
        $this->gender = 1;
        $this->make('boy');
        $this->gender = 2;
        $this->make('girl');
        $res=str_replace('{% tests %}', $this->tests, $this->maintemplate);
        file_put_contents('../Library/GenderDetectUABoth.php',$res);
    }

    /**
     * Appends one rendered test to the accumulated output.
     *
     * @param string $sname value for {% sname %}
     * @param string $fname value for {% name %}
     */
    private function generateTest($sname, $fname)
    {
        $tpl=$this->testtemplate;
        $tpl=str_replace('{% id %}', $this->count, $tpl);
        $tpl=str_replace('{% name %}', $fname, $tpl);
        $tpl=str_replace('{% sname %}', $sname, $tpl);
        $tpl=str_replace('{% gender %}', $this->gender, $tpl);
        $this->tests.=$tpl;
        $this->count++;
    }
}
// Entry point: build the combined gender test file. The argument is passed
// although generate() declares no parameters.
$p = new TestGenerator();
$p->generate('Both');
?>

View File

@ -0,0 +1,87 @@
<?php
/**
 * Generates a gender-detection test file for one kind of name part from the
 * name tables of the local MySQL database `names`.
 */
class TestGenerator
{
    /** @var string contents of the outer test-class template */
    public $maintemplate;
    /** @var string contents of the per-test template */
    public $testtemplate;
    public $resultArrB = array();
    public $resultArrG = array();
    /** @var int 1 for rows of the boy table, 2 for rows of the girl table */
    private $gender;
    /** @var string concatenated test bodies */
    private $tests;
    /** @var int running id substituted for {% id %} */
    private $count = 0;

    /**
     * Connects to the local `names` database.
     */
    public function __construct()
    {
        mysql_connect('localhost', 'root', '');
        mysql_select_db('names');
        mysql_set_charset('utf8');
    }

    /**
     * Renders one test per row of the boy and girl tables and writes the
     * result to ../Library/GenderDetectUA<fname>.php.
     *
     * @param string $fname 'sirname' selects the *second tables, anything
     *                      else the *names tables; also used in the template
     *                      and output file names
     */
    public function generate($fname)
    {
        $this->maintemplate = file_get_contents('Template/NCLNameCaseUaTest.main');
        $this->testtemplate = file_get_contents('Template/GenderGen' . $fname . '.test');
        $this->count = 0;

        $tableSuffix = ($fname == 'sirname') ? 'second' : 'names';

        // Same two passes as before: boys are gender 1, girls are gender 2.
        foreach (array(1 => 'boy', 2 => 'girl') as $genderCode => $tablePrefix)
        {
            $this->gender = $genderCode;
            $this->dbTable = $tablePrefix . $tableSuffix;
            $this->resultArr = mysql_query("SELECT * FROM {$this->dbTable}");
            while ($row = mysql_fetch_array($this->resultArr))
            {
                $this->generateTest(trim($row['name']));
            }
        }

        $withTests = str_replace('{% tests %}', $this->tests, $this->maintemplate);
        $rendered = str_replace('{% name %}', $fname, $withTests);
        file_put_contents('../Library/GenderDetectUA' . $fname . '.php', $rendered);
    }

    /**
     * Appends one rendered test to the accumulated output.
     *
     * @param string $name value for {% name %}
     */
    private function generateTest($name)
    {
        // Array-form str_replace applies the pairs one after another, in order.
        $this->tests .= str_replace(
            array('{% id %}', '{% name %}', '{% gender %}'),
            array($this->count, $name, $this->gender),
            $this->testtemplate
        );
        $this->count++;
    }
}
// Entry point: one fresh generator per kind of name part.
foreach (array('name', 'sirname') as $kind)
{
    $p = new TestGenerator;
    $p->generate($kind);
}
?>

View File

@ -0,0 +1,96 @@
<?php
/**
 * Generates a name-part detection test file (given name, surname or
 * patronymic) from the name tables of the local MySQL database `names`.
 */
class TestGenerator
{
    /** @var string contents of the outer test-class template */
    public $maintemplate;
    /** @var string contents of the per-test template */
    public $testtemplate;
    public $resultArrB = array();
    public $resultArrG = array();
    /** @var string expected label substituted for {% gender %}: 'N', 'S' or 'F' */
    private $gender;
    /** @var string concatenated test bodies */
    private $tests;
    /** @var int running id substituted for {% id %} */
    private $count = 0;

    /**
     * Connects to the local `names` database.
     */
    public function __construct()
    {
        mysql_connect('localhost', 'root', '');
        mysql_select_db('names');
        mysql_set_charset('utf8');
    }

    /**
     * Reads the `name` column of the table "<gender><type>".
     *
     * @param string $gender table prefix, 'boy' or 'girl'
     * @param string $type   table suffix, e.g. 'names'
     * @return array list of names
     */
    private function getList($gender, $type)
    {
        $query = mysql_query("SELECT * FROM {$gender}{$type}");
        $collected = array();
        for ($row = mysql_fetch_assoc($query); $row; $row = mysql_fetch_assoc($query))
        {
            $collected[] = $row['name'];
        }
        return $collected;
    }

    /**
     * Renders one test per name of the boy and girl tables and writes the
     * result to ../Library/NameSplitUA<fname>.php.
     *
     * @param string $fname 'name', 'sirname', or anything else for patronymics
     */
    public function generate($fname)
    {
        $this->maintemplate = file_get_contents('Template/NCLNameCaseUaTest.main');
        $this->testtemplate = file_get_contents('Template/NameSplit.test');
        $this->count = 0;

        // Map the requested kind to the expected label and the table suffix.
        switch ($fname)
        {
            case 'name':
                $this->gender = 'N';
                $tableName = 'names';
                break;
            case 'sirname':
                $this->gender = 'S';
                $tableName = 'second';
                break;
            default:
                $this->gender = 'F';
                $tableName = 'father';
        }

        $boys = $this->getList('boy', $tableName);
        $girls = $this->getList('girl', $tableName);
        foreach (array_merge($boys, $girls) as $entry)
        {
            $this->generateTest(trim($entry));
        }

        $withTests = str_replace('{% tests %}', $this->tests, $this->maintemplate);
        $rendered = str_replace('{% name %}', $fname, $withTests);
        file_put_contents('../Library/NameSplitUA' . $fname . '.php', $rendered);
    }

    /**
     * Appends one rendered test to the accumulated output.
     *
     * @param string $name value for {% name %}
     */
    private function generateTest($name)
    {
        // Array-form str_replace applies the pairs one after another, in order.
        $this->tests .= str_replace(
            array('{% id %}', '{% name %}', '{% gender %}'),
            array($this->count, $name, $this->gender),
            $this->testtemplate
        );
        $this->count++;
    }
}
// Entry point: one fresh generator per kind of name part.
foreach (array('name', 'sirname', 'father') as $kind)
{
    $p = new TestGenerator;
    $p->generate($kind);
}
?>

View File

@ -16,7 +16,6 @@ class TestGenerator
public function __construct()
{
//$this->resultArr[]=file('Names/boy_full_result.txt');
//$this->resultArr[]=file('Names/girl_full_result.txt');
}