[Vbox7IE] Sanitise ld+json containing unexpected characters

* based on PR #29680
* added hack to force invoking `transform_source`
* fixes #26218
This commit is contained in:
dirkf 2024-01-27 18:17:09 +00:00 committed by GitHub Actions
parent 7160f2f1f8
commit 665b2543ce

View File

@@ -5,6 +5,7 @@ import re
import time import time
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_kwargs
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
@@ -75,6 +76,27 @@ class Vbox7IE(InfoExtractor):
if mobj: if mobj:
return mobj.group('url') return mobj.group('url')
# transform_source=None, fatal=True
def _parse_json(self, json_string, video_id, *args, **kwargs):
    """Parse JSON, first repairing raw line breaks inside ld+json strings.

    If the text looks like ld+json (it contains "@context" within its
    first 30 characters) and the caller supplied no `transform_source`,
    a transform is installed that rewrites raw line breaks found inside
    double-quoted strings as escaped line feeds. The call is then
    delegated to the parent class's `_parse_json` with the (possibly
    amended) arguments.

    json_string -- the JSON text to parse
    video_id    -- passed through unchanged to the parent implementation
    *args       -- optional positional extras; the first one is treated
                   as `transform_source` (parent order is assumed to be
                   transform_source, fatal)
    **kwargs    -- optional keyword extras, e.g. `transform_source`

    Returns whatever the parent `_parse_json` returns.
    """
    if '"@context"' in json_string[:30]:
        # this is ld+json, or that's the way to bet
        transform_source = args[0] if args else kwargs.get('transform_source')
        if not transform_source:

            def fix_chars(src):
                # fix malformed ld+json: replace raw CRLFs with escaped LFs,
                # but only inside string literals.
                # The pattern consumes one complete JSON string: it accepts
                # empty strings and skips over backslash escapes (such as an
                # escaped quote), so quote pairing cannot drift and touch
                # line breaks that sit between tokens. `(?s)` lets an escape
                # backslash pair with a raw line break as well.
                return re.sub(
                    r'(?s)"[^"\\]*(?:\\.[^"\\]*)*"',
                    lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)

            if args:
                # caller passed transform_source positionally (as a falsy
                # value): substitute ours in the same position
                args = (fix_chars,) + args[1:]
            else:
                kwargs['transform_source'] = fix_chars
                # a key was added to kwargs: normalise the dict through the
                # compat helper before it is expanded with ** below
                kwargs = compat_kwargs(kwargs)

    return super(Vbox7IE, self)._parse_json(
        json_string, video_id, *args, **kwargs)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
url = 'https://vbox7.com/play:%s' % (video_id,) url = 'https://vbox7.com/play:%s' % (video_id,)