Merge pull request #3725 from Tobi823/master

Bugfix: Sanitize the title of a saved webpage from invalid UTF-8 characters.
This commit is contained in:
Kevin Decherf 2018-09-24 18:34:16 +02:00 committed by GitHub
commit 0f5c15d543
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 298 additions and 0 deletions

View file

@ -53,6 +53,7 @@ class ContentProxy
if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
$fetchedContent = $this->graby->fetchContent($url);
$fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);
// when content is imported, we have information in $content
// in case fetching content goes bad, we'll keep the imported information instead of overriding them
@ -176,6 +177,59 @@ class ContentProxy
$entry->setTitle($path);
}
/**
 * Clean up the title of fetched content so that it only contains valid UTF-8.
 *
 * Titles of PDF documents first go through an encoding conversion attempt; every title is then
 * stripped of whatever invalid UTF-8 bytes remain.
 *
 * @param string $title       title as returned with the fetched content
 * @param string $contentType content type reported for the fetched content
 *
 * @return string
 */
private function sanitizeContentTitle($title, $contentType)
{
    // only PDF titles get the encoding-conversion pass
    $candidate = 'application/pdf' === $contentType
        ? $this->convertPdfEncodingToUTF8($title)
        : $title;

    return $this->sanitizeUTF8Text($candidate);
}
/**
 * Best-effort conversion of a PDF title to UTF-8.
 *
 * The title is checked against a fixed list of encodings, in order; the first encoding the
 * title is valid for is used as the source encoding of the conversion. When none of them
 * matches, the title is handed back untouched.
 *
 * @param string $title
 *
 * @return string (may still contain invalid UTF-8 characters when no encoding matched)
 */
private function convertPdfEncodingToUTF8($title)
{
    // UTF-8 goes first because its presence/absence is the easiest to detect
    $knownEncodings = ['UTF-8', 'UTF-16BE', 'WINDOWS-1252'];

    foreach ($knownEncodings as $candidate) {
        if (!mb_check_encoding($title, $candidate)) {
            continue;
        }

        return mb_convert_encoding($title, 'UTF-8', $candidate);
    }

    // nothing matched: leave the bytes as they are
    return $title;
}
/**
 * Remove invalid UTF-8 byte sequences from the given string.
 *
 * Text that is already valid UTF-8 is returned unchanged. Otherwise the text is re-encoded
 * with mbstring while its substitute character is set to "none", which drops every invalid
 * sequence instead of replacing it.
 *
 * @param string $rawText
 *
 * @return string
 */
private function sanitizeUTF8Text($rawText)
{
    if (mb_check_encoding($rawText, 'UTF-8')) {
        return $rawText;
    }

    // iconv('UTF-8', 'UTF-8//IGNORE', ...) is avoided on purpose: how //IGNORE is honoured
    // depends on the system's iconv implementation, and on failure it raises a notice and
    // returns false instead of a string.
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character('none');

    try {
        return mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');
    } finally {
        // the substitute character is a process-wide mbstring setting: always restore it
        mb_substitute_character($previousSubstitute);
    }
}
/**
* Stock entry with fetched or imported content.
* Will fall back to OpenGraph data if available.

View file

@ -531,6 +531,250 @@ class ContentProxyTest extends TestCase
$this->assertSame('1.1.1.1', $entry->getDomainName());
}
/**
 * An HTML page whose title is already valid UTF-8 must keep that title byte for byte.
 */
public function testWebsiteWithValidUTF8Title_doNothing()
{
    // Fixtures are written as hex so the exact bytes are visible.
    // https://www.online-toolz.com/tools/text-hex-convertor.php converts UTF-8 text <=> hex,
    // http://graphemica.com has more info about the characters.
    // U+1F63B (F09F98BB), U+2124 (E284A4) and U+007A (7A), all encoded as UTF-8
    $titleHex = 'F09F98BB' . 'E284A4' . '7A';

    $fetched = [
        'html' => false,
        'title' => $this->hexToStr($titleHex),
        'url' => '',
        'content_type' => 'text/html',
        'language' => '',
    ];

    $graby = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $graby->expects($this->any())
        ->method('fetchContent')
        ->willReturn($fetched);

    $tagger = $this->getTaggerMock();
    $tagger->expects($this->once())
        ->method('tag');

    $entry = new Entry(new User());
    $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
    $proxy->updateEntry($entry, 'http://0.0.0.0');

    // the stored title must be the very same bytes that were fetched
    $this->assertSame($titleHex, $this->strToHex($entry->getTitle()));
}
/**
 * Bytes that are not valid UTF-8 must be stripped from the title of an HTML page.
 */
public function testWebsiteWithInvalidUTF8Title_removeInvalidCharacter()
{
    // See http://graphemica.com for more info about the characters.
    // 61 80 62 reads as 'a', euro sign, 'b' in WINDOWS-1252, but a lone 0x80 byte is not valid UTF-8
    // (the UTF-8 encoding of the euro sign, U+20AC, is E282AC).
    $fetchedTitle = $this->hexToStr('61' . '80' . '62');

    // only 'a' (61) and 'b' (62) survive once the invalid byte is removed
    $expectedHex = '61' . '62';

    $fetched = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'text/html',
        'language' => '',
    ];

    $graby = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $graby->expects($this->any())
        ->method('fetchContent')
        ->willReturn($fetched);

    $tagger = $this->getTaggerMock();
    $tagger->expects($this->once())
        ->method('tag');

    $entry = new Entry(new User());
    $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
    $proxy->updateEntry($entry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($entry->getTitle()));
}
/**
 * A PDF title encoded as UTF-16BE must end up stored as UTF-8.
 */
public function testPdfWithUTF16BETitle_convertToUTF8()
{
    // See http://graphemica.com for more info about the characters.
    // U+1F63B encoded as UTF-16BE is the surrogate pair D83D DE3B
    $fetchedTitle = $this->hexToStr('D83DDE3B');

    // the same character, U+1F63B, encoded as UTF-8
    $expectedHex = 'F09F98BB';

    $fetched = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $graby = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $graby->expects($this->any())
        ->method('fetchContent')
        ->willReturn($fetched);

    $tagger = $this->getTaggerMock();
    $tagger->expects($this->once())
        ->method('tag');

    $entry = new Entry(new User());
    $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
    $proxy->updateEntry($entry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($entry->getTitle()));
}
/**
 * A PDF title that is already valid UTF-8 must be stored unchanged.
 */
public function testPdfWithUTF8Title_doNothing()
{
    // See http://graphemica.com for more info about the characters.
    // U+1F63B encoded as UTF-8 is F09F98BB; input and expected output are the same bytes
    $titleHex = 'F09F98BB';

    $fetched = [
        'html' => false,
        'title' => $this->hexToStr($titleHex),
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $graby = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $graby->expects($this->any())
        ->method('fetchContent')
        ->willReturn($fetched);

    $tagger = $this->getTaggerMock();
    $tagger->expects($this->once())
        ->method('tag');

    $entry = new Entry(new User());
    $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
    $proxy->updateEntry($entry, 'http://0.0.0.0');

    $this->assertSame($titleHex, $this->strToHex($entry->getTitle()));
}
/**
 * A PDF title encoded as WINDOWS-1252 must end up stored as UTF-8.
 */
public function testPdfWithWINDOWS1252Title_convertToUTF8()
{
    // See http://graphemica.com for more info about the characters.
    // the euro sign is the single byte 80 in WINDOWS-1252
    $fetchedTitle = $this->hexToStr('80');

    // the euro sign, U+20AC, encoded as UTF-8
    $expectedHex = 'E282AC';

    $fetched = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $graby = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $graby->expects($this->any())
        ->method('fetchContent')
        ->willReturn($fetched);

    $tagger = $this->getTaggerMock();
    $tagger->expects($this->once())
        ->method('tag');

    $entry = new Entry(new User());
    $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
    $proxy->updateEntry($entry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($entry->getTitle()));
}
/**
 * A PDF title that fits none of the tried encodings must have its invalid byte removed.
 */
public function testPdfWithInvalidCharacterInTitle_removeInvalidCharacter()
{
    // See http://graphemica.com for more info about the characters.
    // U+1F63B (F09F98BB), U+2124 (E284A4), the stray byte 81, then U+007A (7A).
    // 0x81 is not a valid character for UTF16, UTF8 and WINDOWS-1252.
    $fetchedTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A');

    // same UTF-8 characters with the 0x81 byte dropped
    $expectedHex = 'F09F98BB' . 'E284A4' . '7A';

    $fetched = [
        'html' => false,
        'title' => $fetchedTitle,
        'url' => '',
        'content_type' => 'application/pdf',
        'language' => '',
    ];

    $graby = $this->getMockBuilder('Graby\Graby')
        ->setMethods(['fetchContent'])
        ->disableOriginalConstructor()
        ->getMock();
    $graby->expects($this->any())
        ->method('fetchContent')
        ->willReturn($fetched);

    $tagger = $this->getTaggerMock();
    $tagger->expects($this->once())
        ->method('tag');

    $entry = new Entry(new User());
    $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
    $proxy->updateEntry($entry, 'http://0.0.0.0');

    $this->assertSame($expectedHex, $this->strToHex($entry->getTitle()));
}
/**
 * Render every byte of a string as two uppercase hexadecimal digits.
 *
 * Inspired by https://stackoverflow.com/a/18506801.
 *
 * @param string $string raw bytes to dump
 *
 * @return string uppercase hex dump, two digits per input byte; empty for an empty string
 */
private function strToHex($string)
{
    // bin2hex() emits lowercase digits; the fixtures in these tests are written in uppercase
    return strtoupper(bin2hex($string));
}
/**
 * Decode a string of hexadecimal digit pairs into the raw bytes it describes.
 *
 * Inspired by https://stackoverflow.com/a/18506801.
 *
 * @param string $hex an even number of hexadecimal digits (upper or lower case), e.g. 'E282AC'
 *
 * @throws \InvalidArgumentException when $hex has an odd length or contains a non-hexadecimal character
 *
 * @return string the decoded bytes; empty for an empty string
 */
private function hexToStr($hex)
{
    // Reject malformed fixtures up front: a dangling digit or a typo must fail loudly
    // instead of quietly decoding to different bytes than the test author intended.
    if (1 !== preg_match('/\A(?:[0-9A-Fa-f]{2})*\z/', $hex)) {
        throw new \InvalidArgumentException(sprintf('Expected an even number of hexadecimal digits, got "%s".', $hex));
    }

    return hex2bin($hex);
}
private function getTaggerMock()
{
return $this->getMockBuilder(RuleBasedTagger::class)