[instagram] Extract metadata from JSON

This commit is contained in:
Sergey M․ 2016-06-12 06:06:04 +07:00
parent 329ca3bef6
commit 98960c911c
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
1 changed file with 61 additions and 11 deletions

View File

@@ -8,6 +8,7 @@ from ..utils import (
int_or_none, int_or_none,
limit_length, limit_length,
lowercase_escape, lowercase_escape,
try_get,
) )
@@ -19,10 +20,16 @@ class InstagramIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'aye83DjauH', 'id': 'aye83DjauH',
'ext': 'mp4', 'ext': 'mp4',
'uploader_id': 'naomipq',
'title': 'Video by naomipq', 'title': 'Video by naomipq',
'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
} 'thumbnail': 're:^https?://.*\.jpg',
'timestamp': 1371748545,
'upload_date': '20130620',
'uploader_id': 'naomipq',
'uploader': 'Naomi Leonor Phan-Quang',
'like_count': int,
'comment_count': int,
},
}, { }, {
# missing description # missing description
'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
@@ -31,6 +38,13 @@ class InstagramIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'uploader_id': 'britneyspears', 'uploader_id': 'britneyspears',
'title': 'Video by britneyspears', 'title': 'Video by britneyspears',
'thumbnail': 're:^https?://.*\.jpg',
'timestamp': 1453760977,
'upload_date': '20160125',
'uploader_id': 'britneyspears',
'uploader': 'Britney Spears',
'like_count': int,
'comment_count': int,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@@ -67,21 +81,57 @@ class InstagramIE(InfoExtractor):
url = mobj.group('url') url = mobj.group('url')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
(video_url, description, thumbnail, timestamp, uploader,
uploader_id, like_count, comment_count) = [None] * 8
shared_data = self._parse_json(
self._search_regex(
r'window\._sharedData\s*=\s*({.+?});',
webpage, 'shared data', default='{}'),
video_id, fatal=False)
if shared_data:
media = try_get(
shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
if media:
video_url = media.get('video_url')
description = media.get('caption')
thumbnail = media.get('display_src')
timestamp = int_or_none(media.get('date'))
uploader = media.get('owner', {}).get('full_name')
uploader_id = media.get('owner', {}).get('username')
like_count = int_or_none(media.get('likes', {}).get('count'))
comment_count = int_or_none(media.get('comments', {}).get('count'))
if not video_url:
video_url = self._og_search_video_url(webpage, secure=False)
if not uploader_id:
uploader_id = self._search_regex(
r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
webpage, 'uploader id', fatal=False) webpage, 'uploader id', fatal=False)
desc = self._search_regex(
r'"caption":"(.+?)"', webpage, 'description', default=None) if not description:
if desc is not None: description = self._search_regex(
desc = lowercase_escape(desc) r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
if description is not None:
description = lowercase_escape(description)
if not thumbnail:
thumbnail = self._og_search_thumbnail(webpage)
return { return {
'id': video_id, 'id': video_id,
'url': self._og_search_video_url(webpage, secure=False), 'url': video_url,
'ext': 'mp4', 'ext': 'mp4',
'title': 'Video by %s' % uploader_id, 'title': 'Video by %s' % uploader_id,
'thumbnail': self._og_search_thumbnail(webpage), 'description': description,
'thumbnail': thumbnail,
'timestamp': timestamp,
'uploader_id': uploader_id, 'uploader_id': uploader_id,
'description': desc, 'uploader': uploader,
'like_count': like_count,
'comment_count': comment_count,
} }