Add tests for logic
Try to translate the title of a PDF from UTF-8 (then UTF-16BE, then WINDOWS-1252) to UTF-8
This commit is contained in:
parent
f80f16dfc8
commit
c01d953292
@ -90,8 +90,8 @@ class ContentProxy
|
|||||||
* @return string (maybe contains invalid UTF-8 character)
|
* @return string (maybe contains invalid UTF-8 character)
|
||||||
*/
|
*/
|
||||||
private function convertPdfEncodingToUTF8($title) {
|
private function convertPdfEncodingToUTF8($title) {
|
||||||
// first try UTF-16 (then UTF-8) because its easier to detect its present/absence
|
// first try UTF-8 because it's easier to detect its presence/absence
|
||||||
foreach (array('UTF-16BE', 'UTF-16LE', 'UTF-8', 'WINDOWS-1252') as $encoding) {
|
foreach (array('UTF-8', 'UTF-16BE', 'WINDOWS-1252') as $encoding) {
|
||||||
if (mb_check_encoding($title, $encoding)) {
|
if (mb_check_encoding($title, $encoding)) {
|
||||||
return mb_convert_encoding($title, 'UTF-8', $encoding);
|
return mb_convert_encoding($title, 'UTF-8', $encoding);
|
||||||
}
|
}
|
||||||
|
@ -531,6 +531,242 @@ class ContentProxyTest extends TestCase
|
|||||||
$this->assertSame('1.1.1.1', $entry->getDomainName());
|
$this->assertSame('1.1.1.1', $entry->getDomainName());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * An HTML page whose title is already valid UTF-8 must keep that title byte-for-byte.
 */
public function testWebsiteWithValidUTF8Title_doNothing()
{
    // Hex <=> UTF-8 text can be cross-checked with https://www.online-toolz.com/tools/text-hex-convertor.php
    // and the code points looked up on http://graphemica.com
    // Title bytes: '😻' (U+1F63B = F09F98BB), 'ℤ' (U+2124 = E284A4), 'z' (U+007A = 7A), all valid UTF-8
    $fetchedTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '7A');

    // The same three characters are expected back, unchanged
    $expectedHex = 'F09F98BB' . 'E284A4' . '7A';

    $taggerMock = $this->getTaggerMock();
    $taggerMock->expects($this->once())->method('tag');

    // Stubbed Graby response: an HTML document carrying the title under test
    $fetchResult = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'text/html',
        'language' => '',
    ];

    $grabyMock = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $grabyMock->expects($this->any())->method('fetchContent')->willReturn($fetchResult);

    $contentProxy = new ContentProxy(
        $grabyMock,
        $taggerMock,
        $this->getValidator(),
        $this->getLogger(),
        $this->fetchingErrorMessage
    );
    $savedEntry = new Entry(new User());
    $contentProxy->updateEntry($savedEntry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($savedEntry->getTitle()));
}
|
||||||
|
|
||||||
|
/**
 * An HTML page whose title contains a byte that is not valid UTF-8 must lose that byte
 * (it is dropped, not converted).
 */
public function testWebsiteWithInvalidUTF8Title_removeInvalidCharacter()
{
    // See http://graphemica.com for more info about the characters
    // Title bytes 61 80 62 spell 'a€b' in WINDOWS-1252, but a lone 0x80 is not valid UTF-8
    // (the real UTF-8 encoding of '€', U+20AC, is E282AC)
    $fetchedTitle = $this->hexToStr('61' . '80' . '62');

    // Only 'ab' (61 62) is expected to survive: the invalid 0x80 byte is removed
    $expectedHex = '61' . '62';

    $taggerMock = $this->getTaggerMock();
    $taggerMock->expects($this->once())->method('tag');

    // Stubbed Graby response: an HTML document carrying the title under test
    $fetchResult = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'text/html',
        'language' => '',
    ];

    $grabyMock = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $grabyMock->expects($this->any())->method('fetchContent')->willReturn($fetchResult);

    $contentProxy = new ContentProxy(
        $grabyMock,
        $taggerMock,
        $this->getValidator(),
        $this->getLogger(),
        $this->fetchingErrorMessage
    );
    $savedEntry = new Entry(new User());
    $contentProxy->updateEntry($savedEntry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($savedEntry->getTitle()));
}
|
||||||
|
|
||||||
|
/**
 * A PDF whose title is encoded as UTF-16BE must end up with the equivalent UTF-8 title.
 */
public function testPdfWithUTF16BETitle_convertToUTF8()
{
    // See http://graphemica.com for more info about the characters
    // Title bytes D83D DE3B are the UTF-16BE surrogate pair for '😻' (U+1F63B)
    $fetchedTitle = $this->hexToStr('D83DDE3B');

    // The same character encoded as UTF-8 is F09F98BB
    $expectedHex = 'F09F98BB';

    $taggerMock = $this->getTaggerMock();
    $taggerMock->expects($this->once())->method('tag');

    // Stubbed Graby response: a PDF document carrying the title under test
    $fetchResult = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $grabyMock = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $grabyMock->expects($this->any())->method('fetchContent')->willReturn($fetchResult);

    $contentProxy = new ContentProxy(
        $grabyMock,
        $taggerMock,
        $this->getValidator(),
        $this->getLogger(),
        $this->fetchingErrorMessage
    );
    $savedEntry = new Entry(new User());
    $contentProxy->updateEntry($savedEntry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($savedEntry->getTitle()));
}
|
||||||
|
|
||||||
|
/**
 * A PDF whose title is already valid UTF-8 must keep that title byte-for-byte.
 */
public function testPdfWithUTF8Title_doNothing()
{
    // See http://graphemica.com for more info about the characters
    // Title bytes F09F98BB are '😻' (U+1F63B) encoded as UTF-8
    $fetchedTitle = $this->hexToStr('F09F98BB');

    // The very same UTF-8 bytes are expected back
    $expectedHex = 'F09F98BB';

    $taggerMock = $this->getTaggerMock();
    $taggerMock->expects($this->once())->method('tag');

    // Stubbed Graby response: a PDF document carrying the title under test
    $fetchResult = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $grabyMock = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $grabyMock->expects($this->any())->method('fetchContent')->willReturn($fetchResult);

    $contentProxy = new ContentProxy(
        $grabyMock,
        $taggerMock,
        $this->getValidator(),
        $this->getLogger(),
        $this->fetchingErrorMessage
    );
    $savedEntry = new Entry(new User());
    $contentProxy->updateEntry($savedEntry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($savedEntry->getTitle()));
}
|
||||||
|
|
||||||
|
/**
 * A PDF whose title is encoded as WINDOWS-1252 must end up with the equivalent UTF-8 title.
 */
public function testPdfWithWINDOWS1252Title_convertToUTF8()
{
    // See http://graphemica.com for more info about the characters
    // Title byte 80 is '€' in WINDOWS-1252
    $fetchedTitle = $this->hexToStr('80');

    // '€' (U+20AC) encoded as UTF-8 is E282AC
    $expectedHex = 'E282AC';

    $taggerMock = $this->getTaggerMock();
    $taggerMock->expects($this->once())->method('tag');

    // Stubbed Graby response: a PDF document carrying the title under test
    $fetchResult = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $grabyMock = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $grabyMock->expects($this->any())->method('fetchContent')->willReturn($fetchResult);

    $contentProxy = new ContentProxy(
        $grabyMock,
        $taggerMock,
        $this->getValidator(),
        $this->getLogger(),
        $this->fetchingErrorMessage
    );
    $savedEntry = new Entry(new User());
    $contentProxy->updateEntry($savedEntry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($savedEntry->getTitle()));
}
|
||||||
|
|
||||||
|
/**
 * A PDF title that is valid UTF-8 except for one stray byte must come back with
 * only that stray byte removed.
 */
public function testPdfWithInvalidCharacterInTitle_removeInvalidCharacter()
{
    // See http://graphemica.com for more info about the characters
    // Title bytes: '😻' (U+1F63B = F09F98BB), 'ℤ' (U+2124 = E284A4), the stray byte 81, 'z' (U+007A = 7A)
    // 0x81 is a lone continuation byte in UTF-8 and is unassigned in WINDOWS-1252,
    // so the title as a whole matches none of the encodings tried for PDF titles
    $fetchedTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A');

    // Expected: '😻ℤz' in UTF-8, i.e. the input with the 0x81 byte dropped
    $expectedHex = 'F09F98BB' . 'E284A4' . '7A';

    $taggerMock = $this->getTaggerMock();
    $taggerMock->expects($this->once())->method('tag');

    // Stubbed Graby response: a PDF document carrying the title under test
    $fetchResult = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $grabyMock = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $grabyMock->expects($this->any())->method('fetchContent')->willReturn($fetchResult);

    $contentProxy = new ContentProxy(
        $grabyMock,
        $taggerMock,
        $this->getValidator(),
        $this->getLogger(),
        $this->fetchingErrorMessage
    );
    $savedEntry = new Entry(new User());
    $contentProxy->updateEntry($savedEntry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($savedEntry->getTitle()));
}
|
||||||
|
|
||||||
|
/**
 * Render a byte string as upper-case hexadecimal, two digits per byte.
 *
 * Adapted from https://stackoverflow.com/a/18506801
 *
 * @param string $string raw bytes to encode
 *
 * @return string upper-case hex digits (an empty string for empty input)
 */
function strToHex($string){
    // bin2hex() already yields two zero-padded digits per byte; only the letter case differs
    return strtoupper(bin2hex($string));
}
|
||||||
|
|
||||||
|
/**
 * Decode a string of hexadecimal digit pairs into the bytes they describe.
 *
 * Adapted from https://stackoverflow.com/a/18506801
 *
 * @param string $hex hex digits, two per byte; a trailing unpaired digit is ignored
 *
 * @return string the decoded bytes
 */
function hexToStr($hex){
    $decoded = '';

    // Index of the last position at which a complete two-digit pair can still start
    $lastPairStart = strlen($hex) - 2;

    for ($offset = 0; $offset <= $lastPairStart; $offset += 2) {
        $pair = $hex[$offset] . $hex[$offset + 1];
        $decoded .= chr(hexdec($pair));
    }

    return $decoded;
}
|
||||||
|
|
||||||
private function getTaggerMock()
|
private function getTaggerMock()
|
||||||
{
|
{
|
||||||
return $this->getMockBuilder(RuleBasedTagger::class)
|
return $this->getMockBuilder(RuleBasedTagger::class)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user