yt-dlp/yt_dlp/downloader/mhtml.py

import io
import quopri
import re
import uuid

from .fragment import FragmentFD
from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
from ..version import __version__ as YT_DLP_VERSION


class MhtmlFD(FragmentFD):
    # CSS embedded in the generated index page. It styles each
    # ``body > figure`` (one is emitted per slide by ``_gen_stub``) and turns on
    # vertical scroll snapping so that figures snap to the viewport centre.
    _STYLESHEET = """\
html, body {
    margin: 0;
    padding: 0;
    height: 100vh;
}

html {
    overflow-y: scroll;
    scroll-snap-type: y mandatory;
}

body {
    scroll-snap-type: y mandatory;
    display: flex;
    flex-flow: column;
}

body > figure {
    max-width: 100vw;
    max-height: 100vh;
    scroll-snap-align: center;
}

body > figure > figcaption {
    text-align: center;
    height: 2.5em;
}

body > figure > img {
    display: block;
    margin: auto;
    max-width: 100%;
    max-height: calc(100vh - 5em);
}
"""
    # Minify once, at class-definition time.
    # Pass 1: collapse every run of whitespace (newlines included) to one space.
    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
    # Pass 2: delete every remaining space unless it sits between two
    # identifier-like characters (word characters or '-', at least one of them
    # a word character). This keeps the spaces that are significant in values
    # such as 'y mandatory' and 'calc(100vh - 5em)' while removing those
    # around punctuation like '{', ':', ';', ',' and '>'.
    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)

    @staticmethod
    def _escape_mime(s):
        return '=?utf-8?Q?' + (b''.join(
            bytes((b,)) if b >= 0x20 else b'=%02X' % b
            for b in quopri.encodestring(s.encode(), header=True)
        )).decode('us-ascii') + '?='

    def _gen_cid(self, i, fragment, frag_boundary):
        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)

    def _gen_stub(self, *, fragments, frag_boundary, title):
        """Build the HTML index page that references every slide image.

        The page consists of a fixed head (generator meta tag, escaped
        *title*, the minified class stylesheet) followed by one ``<figure>``
        per entry of *fragments*. Each figure holds a caption and an ``<img>``
        whose ``cid:`` source is the value of ``_gen_cid`` for that fragment.

        Captions show the slide's start time, end time and duration for as
        long as a running total can be computed from ``frag['duration']``.
        Once a fragment's duration is missing or unusable, that slide and all
        later ones get a number-only caption, because the running total is
        reset to ``None``.

        Returns the page as a ``str``.
        """
        pieces = [(
            '<!DOCTYPE html>'
            '<html>'
            '<head>'
            ''  '<meta name="generator" content="yt-dlp {version}">'
            ''  '<title>{title}</title>'
            ''  '<style>{styles}</style>'
            '<body>'
        ).format(
            version=escapeHTML(YT_DLP_VERSION),
            styles=self._STYLESHEET,
            title=escapeHTML(title)
        )]

        elapsed = 0
        for index, frag in enumerate(fragments):
            slide_no = index + 1
            pieces.append('<figure>')
            try:
                slide_end = elapsed + frag['duration']
                caption = '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'.format(
                    num=slide_no,
                    t0=srt_subtitles_timecode(elapsed),
                    t1=srt_subtitles_timecode(slide_end),
                    duration=formatSeconds(frag['duration'], msec=True)
                )
            except (KeyError, ValueError, TypeError):
                # No usable timing: fall back to a bare caption and poison the
                # running total so later slides take this branch as well.
                slide_end = None
                caption = '<figcaption>Slide #{num}</figcaption>'.format(num=slide_no)
            pieces.append(caption)
            pieces.append('<img src="cid:{cid}">'.format(
                cid=self._gen_cid(index, frag, frag_boundary)))
            pieces.append('</figure>')
            elapsed = slide_end

        return ''.join(pieces)

    def real_download(self, filename, info_dict):
        """Download all image fragments and assemble them into one MHTML file.

        The output is a ``multipart/related`` MIME document. Its first part is
        the HTML index built by ``_gen_stub``; every following part is one
        downloaded fragment, carrying the Content-ID that the index refers to.

        *info_dict* must contain ``fragments``. The subject line uses
        ``title`` and falls back to ``format_id``; the origin header uses
        ``webpage_url`` and falls back to ``url``. A fragment without its own
        ``url`` is resolved from ``fragment_base_url`` plus its ``path``.
        When the ``test`` parameter is set, only the first fragment is used.

        Fragments whose download reports failure are skipped. Returns True
        after the closing boundary has been written.
        """
        fragment_base_url = info_dict.get('fragment_base_url')
        fragments = info_dict['fragments'][:1] if self.params.get(
            'test', False) else info_dict['fragments']

        # Look the fallbacks up lazily: the fallback key is only required when
        # the preferred key is missing (or None).
        title = info_dict.get('title')
        if title is None:
            title = info_dict['format_id']
        origin = info_dict.get('webpage_url')
        if origin is None:
            origin = info_dict['url']

        ctx = {
            'filename': filename,
            'total_frags': len(fragments),
        }

        self._prepare_and_start_frag_download(ctx, info_dict)

        # Reuse 'extra_state' if the preparation step already put one into ctx
        # (presumably restored from an earlier, interrupted run - confirm in
        # FragmentFD); otherwise start fresh with a new random boundary.
        extra_state = ctx.setdefault('extra_state', {
            'header_written': False,
            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
        })

        frag_boundary = extra_state['mime_boundary']

        if not extra_state['header_written']:
            stub = self._gen_stub(
                fragments=fragments,
                frag_boundary=frag_boundary,
                title=title
            )
            # The stub is written as UTF-8 and may contain non-ASCII text (the
            # caption dash, the title), so the length must count bytes.
            stub_bytes = stub.encode()

            header = (
                'MIME-Version: 1.0\r\n'
                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'Subject: {title}\r\n'
                'Content-type: multipart/related; '
                ''  'boundary="{boundary}"; '
                ''  'type="text/html"\r\n'
                'X.yt-dlp.Origin: {origin}\r\n'
                '\r\n'
                '--{boundary}\r\n'
                'Content-Type: text/html; charset=utf-8\r\n'
                'Content-Length: {length}\r\n'
                '\r\n'
            ).format(
                origin=origin,
                boundary=frag_boundary,
                length=len(stub_bytes),
                title=self._escape_mime(title)
            )
            ctx['dest_stream'].write(header.encode() + stub_bytes + b'\r\n')
            extra_state['header_written'] = True

        for i, fragment in enumerate(fragments):
            # Skip fragments that ctx already counts as done.
            if (i + 1) <= ctx['fragment_index']:
                continue

            fragment_url = fragment.get('url')
            if not fragment_url:
                assert fragment_base_url
                fragment_url = urljoin(fragment_base_url, fragment['path'])

            success = self._download_fragment(ctx, fragment_url, info_dict)
            if not success:
                continue
            frag_content = self._read_fragment(ctx)

            # Sniff the image type from its magic bytes; JPEG is the default.
            mime_type = b'image/jpeg'
            if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
                mime_type = b'image/png'
            if frag_content.startswith((b'GIF87a', b'GIF89a')):
                mime_type = b'image/gif'
            if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
                mime_type = b'image/webp'

            # The duration is optional (the index page tolerates its absence
            # too), so leave the header out instead of failing the download.
            try:
                duration_line = b'X.yt-dlp.Duration: %f\r\n' % fragment['duration']
            except (KeyError, TypeError):
                duration_line = b''

            frag_header = io.BytesIO()
            frag_header.write(
                b'--%b\r\n' % frag_boundary.encode('us-ascii'))
            frag_header.write(
                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
            frag_header.write(
                b'Content-type: %b\r\n' % mime_type)
            frag_header.write(
                b'Content-length: %u\r\n' % len(frag_content))
            frag_header.write(
                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
            frag_header.write(duration_line)
            frag_header.write(b'\r\n')
            self._append_fragment(
                ctx, frag_header.getvalue() + frag_content + b'\r\n')

        # Closing delimiter of the multipart body.
        ctx['dest_stream'].write(
            b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
        self._finish_frag_download(ctx, info_dict)
        return True
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								import io
 								import quopri
 								import re
 								import uuid
 								from .fragment import FragmentFD
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								from ..version import __version__ as YT_DLP_VERSION
 								class MhtmlFD(FragmentFD):
 								    _STYLESHEET = """\
 								html, body {
 								    margin: 0;
 								    padding: 0;
 								    height: 100vh;
 								}
 								html {
 								    overflow-y: scroll;
 								    scroll-snap-type: y mandatory;
 								}
 								body {
 								    scroll-snap-type: y mandatory;
 								    display: flex;
 								    flex-flow: column;
 								}
 								body > figure {
 								    max-width: 100vw;
 								    max-height: 100vh;
 								    scroll-snap-align: center;
 								}
 								body > figure > figcaption {
 								    text-align: center;
 								    height: 2.5em;
 								}
 								body > figure > img {
 								    display: block;
 								    margin: auto;
 								    max-width: 100%;
 								    max-height: calc(100vh - 5em);
 								}
 								"""
 								    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
 								    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)
 								    @staticmethod
 								    def _escape_mime(s):
 								        return '=?utf-8?Q?' + (b''.join(
 								            bytes((b,)) if b >= 0x20 else b'=%02X' % b
-												[cleanup] Minor fixes (See desc)

* [youtube] Fix `--youtube-skip-dash-manifest`
* [build] Use `$()` in `Makefile`. Closes #3684
* Fix bug in 385ffb467b2285e85a2a5495b90314ba1f8e0700
* Fix bug in 43d7f5a5d0c77556156a3f8caa6976d3908a1e38
* [cleanup] Remove unnecessary `utf-8` from `str.encode`/`bytes.decode`
* [utils] LazyList: Expose unnecessarily "protected" attributes
and other minor cleanup

											
										
										
											2022-05-09 13:54:28 +02:00
+								            for b in quopri.encodestring(s.encode(), header=True)
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								        )).decode('us-ascii') + '?='
 								    def _gen_cid(self, i, fragment, frag_boundary):
 								        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)
 								    def _gen_stub(self, *, fragments, frag_boundary, title):
 								        output = io.StringIO()
 								        output.write((
 								            '<!DOCTYPE html>'
 								            '<html>'
 								            '<head>'
 								            ''  '<meta name="generator" content="yt-dlp {version}">'
 								            ''  '<title>{title}</title>'
 								            ''  '<style>{styles}</style>'
 								            '<body>'
 								        ).format(
 								            version=escapeHTML(YT_DLP_VERSION),
 								            styles=self._STYLESHEET,
 								            title=escapeHTML(title)
 								        ))
 								        t0 = 0
 								        for i, frag in enumerate(fragments):
 								            output.write('<figure>')
 								            try:
 								                t1 = t0 + frag['duration']
 								                output.write((
 								                    '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
 								                ).format(
 								                    num=i + 1,
 								                    t0=srt_subtitles_timecode(t0),
 								                    t1=srt_subtitles_timecode(t1),
 								                    duration=formatSeconds(frag['duration'], msec=True)
 								                ))
 								            except (KeyError, ValueError, TypeError):
 								                t1 = None
 								                output.write((
 								                    '<figcaption>Slide #{num}</figcaption>'
 								                ).format(num=i + 1))
 								            output.write('<img src="cid:{cid}">'.format(
 								                cid=self._gen_cid(i, frag, frag_boundary)))
 								            output.write('</figure>')
 								            t0 = t1
 								        return output.getvalue()
 								    def real_download(self, filename, info_dict):
 								        fragment_base_url = info_dict.get('fragment_base_url')
 								        fragments = info_dict['fragments'][:1] if self.params.get(
 								            'test', False) else info_dict['fragments']
-												Fix `--check-formats` for `mhtml`
Closes #1709

											
										
										
											2021-11-20 03:57:47 +01:00
+								        title = info_dict.get('title', info_dict['format_id'])
 								        origin = info_dict.get('webpage_url', info_dict['url'])
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
 								        ctx = {
 								            'filename': filename,
 								            'total_frags': len(fragments),
 								        }
-												[downloader] Pass `info_dict` to `progress_hook`s

											
										
										
											2021-07-21 19:28:43 +02:00
+								        self._prepare_and_start_frag_download(ctx, info_dict)
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
 								        extra_state = ctx.setdefault('extra_state', {
 								            'header_written': False,
 								            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
 								        })
 								        frag_boundary = extra_state['mime_boundary']
 								        if not extra_state['header_written']:
 								            stub = self._gen_stub(
 								                fragments=fragments,
 								                frag_boundary=frag_boundary,
 								                title=title
 								            )
 								            ctx['dest_stream'].write((
 								                'MIME-Version: 1.0\r\n'
 								                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
 								                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
 								                'Subject: {title}\r\n'
 								                'Content-type: multipart/related; '
 								                ''  'boundary="{boundary}"; '
 								                ''  'type="text/html"\r\n'
 								                'X.yt-dlp.Origin: {origin}\r\n'
 								                '\r\n'
 								                '--{boundary}\r\n'
 								                'Content-Type: text/html; charset=utf-8\r\n'
 								                'Content-Length: {length}\r\n'
 								                '\r\n'
 								                '{stub}\r\n'
 								            ).format(
 								                origin=origin,
 								                boundary=frag_boundary,
 								                length=len(stub),
 								                title=self._escape_mime(title),
 								                stub=stub
-												[cleanup] Minor fixes (See desc)

* [youtube] Fix `--youtube-skip-dash-manifest`
* [build] Use `$()` in `Makefile`. Closes #3684
* Fix bug in 385ffb467b2285e85a2a5495b90314ba1f8e0700
* Fix bug in 43d7f5a5d0c77556156a3f8caa6976d3908a1e38
* [cleanup] Remove unnecessary `utf-8` from `str.encode`/`bytes.decode`
* [utils] LazyList: Expose unnecessarily "protected" attributes
and other minor cleanup

											
										
										
											2022-05-09 13:54:28 +02:00
+								            ).encode())
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								            extra_state['header_written'] = True
 								        for i, fragment in enumerate(fragments):
 								            if (i + 1) <= ctx['fragment_index']:
 								                continue
-												[downloader/mhtml] Fix fragments with absolute urls (#3044)

Authored-by: coletdjnz
											
										
										
											2022-03-13 23:03:40 +01:00
+								            fragment_url = fragment.get('url')
 								            if not fragment_url:
 								                assert fragment_base_url
 								                fragment_url = urljoin(fragment_base_url, fragment['path'])
-												[fragment] Read downloaded fragments only when needed (#3069)

Authored by: Lesmiscore
											
										
										
											2022-03-15 04:27:41 +01:00
+								            success = self._download_fragment(ctx, fragment_url, info_dict)
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								            if not success:
 								                continue
-												[fragment] Read downloaded fragments only when needed (#3069)

Authored by: Lesmiscore
											
										
										
											2022-03-15 04:27:41 +01:00
+								            frag_content = self._read_fragment(ctx)
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
 								            mime_type = b'image/jpeg'
 								            if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
 								                mime_type = b'image/png'
 								            if frag_content.startswith((b'GIF87a', b'GIF89a')):
 								                mime_type = b'image/gif'
-												[cleanup] Misc fixes

Closes #3565, https://github.com/yt-dlp/yt-dlp/issues/3514#issuecomment-1105944364

											
										
										
											2022-04-29 03:48:36 +02:00
+								            if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								                mime_type = b'image/webp'
 								            frag_header = io.BytesIO()
 								            frag_header.write(
 								                b'--%b\r\n' % frag_boundary.encode('us-ascii'))
 								            frag_header.write(
 								                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
 								            frag_header.write(
 								                b'Content-type: %b\r\n' % mime_type)
 								            frag_header.write(
 								                b'Content-length: %u\r\n' % len(frag_content))
 								            frag_header.write(
 								                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
 								            frag_header.write(
 								                b'X.yt-dlp.Duration: %f\r\n' % fragment['duration'])
 								            frag_header.write(b'\r\n')
 								            self._append_fragment(
 								                ctx, frag_header.getvalue() + frag_content + b'\r\n')
 								        ctx['dest_stream'].write(
 								            b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
-												[downloader] Pass `info_dict` to `progress_hook`s

											
										
										
											2021-07-21 19:28:43 +02:00
+								        self._finish_frag_download(ctx, info_dict)
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								        return True