mirror of
https://github.com/wallabag/wallabag.git
synced 2024-12-15 09:57:41 +01:00
Bugfix: Sanitize the title of a saved webpage from invalid UTF-8 characters
This commit is contained in:
parent
759c91940b
commit
d76a5a6d60
@ -53,6 +53,7 @@ class ContentProxy
|
|||||||
|
|
||||||
if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
|
if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
|
||||||
$fetchedContent = $this->graby->fetchContent($url);
|
$fetchedContent = $this->graby->fetchContent($url);
|
||||||
|
$fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']);
|
||||||
|
|
||||||
// when content is imported, we have information in $content
|
// when content is imported, we have information in $content
|
||||||
// in case fetching content goes bad, we'll keep the imported information instead of overriding them
|
// in case fetching content goes bad, we'll keep the imported information instead of overriding them
|
||||||
@ -68,6 +69,28 @@ class ContentProxy
|
|||||||
$this->stockEntry($entry, $content);
|
$this->stockEntry($entry, $content);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Make sure the given text is valid UTF-8.
 *
 * - if the text already is valid UTF-8 it is returned untouched
 * - otherwise its bytes are interpreted as ISO-8859-1 and converted to UTF-8
 *
 * ISO-8859-1 assigns a character to every possible byte value, so the
 * conversion cannot fail and its result is always valid UTF-8. No further
 * fallback (such as stripping invalid bytes) is therefore needed after it.
 *
 * Text that actually uses another single-byte encoding (Windows-1252, ...)
 * still comes out as valid UTF-8, but some characters may be wrong.
 *
 * @param string $rawText Text in an unknown encoding
 *
 * @return string Valid UTF-8 text
 */
private function sanitizeUTF8Text(string $rawText)
{
    if (mb_check_encoding($rawText, 'UTF-8')) {
        // already valid UTF-8, nothing to repair
        return $rawText;
    }

    // We assume that $rawText is encoded in ISO-8859-1 (and not the similar
    // Windows-1252 or any other encoding). mb_convert_encoding() is used
    // instead of utf8_encode(), which is deprecated as of PHP 8.2; both
    // map each byte 0x00-0xFF to the code point of the same value.
    return mb_convert_encoding($rawText, 'UTF-8', 'ISO-8859-1');
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Use a Symfony validator to ensure the language is well formatted.
|
* Use a Symfony validator to ensure the language is well formatted.
|
||||||
*
|
*
|
||||||
|
Loading…
Reference in New Issue
Block a user