2018-04-10 20:51:57 +02:00
import hashlib
2022-07-15 18:44:43 +02:00
import itertools
2018-03-15 14:33:36 +01:00
import json
2013-07-01 21:08:54 +02:00
import re
2021-10-22 02:53:45 +02:00
import time
2022-07-15 18:44:43 +02:00
import urllib . error
2013-07-01 21:08:54 +02:00
from . common import InfoExtractor
2015-06-07 19:46:33 +02:00
from . . utils import (
2018-04-17 17:37:50 +02:00
ExtractorError ,
2022-07-15 18:44:43 +02:00
decode_base_n ,
encode_base_n ,
2021-04-01 10:28:33 +02:00
float_or_none ,
2022-07-15 18:44:43 +02:00
format_field ,
2016-03-24 09:29:33 +01:00
get_element_by_attribute ,
2015-06-07 19:46:33 +02:00
int_or_none ,
2016-03-24 09:30:01 +01:00
lowercase_escape ,
2022-01-24 18:04:34 +01:00
str_or_none ,
2021-12-23 23:13:10 +01:00
str_to_int ,
2021-11-24 13:52:42 +01:00
traverse_obj ,
2018-07-21 14:08:28 +02:00
url_or_none ,
2021-10-22 02:53:45 +02:00
urlencode_postdata ,
2015-06-07 19:46:33 +02:00
)
2013-07-01 21:08:54 +02:00
2022-07-15 18:44:43 +02:00
# Base-64-like alphabet Instagram uses for shortcodes (URL-safe variant)
_ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'


def _pk_to_id(pk):
    """Convert a numeric media PK into a shortcode.

    Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id
    """
    # API 'pk' fields may arrive as ints; normalize to str before splitting.
    # PKs may also carry a '_<user id>' suffix - only the first part encodes the shortcode.
    return encode_base_n(int(str(pk).split('_')[0]), table=_ENCODING_CHARS)


def _id_to_pk(shortcode):
    """Convert a shortcode to its numeric PK value."""
    # Only the first 11 characters of a shortcode encode the PK
    return decode_base_n(shortcode[:11], table=_ENCODING_CHARS)
2014-02-10 20:24:12 +01:00
2021-10-31 06:08:04 +01:00
class InstagramBaseIE(InfoExtractor):
    """Base class providing login handling and shared metadata helpers for all Instagram extractors."""

    _NETRC_MACHINE = 'instagram'
    # Class-level flag so login is attempted at most once per run
    _IS_LOGGED_IN = False

    _API_BASE_URL = 'https://i.instagram.com/api/v1'
    _LOGIN_URL = 'https://www.instagram.com/accounts/login'
    # Headers mimicking the official web client; the private API rejects requests without them
    _API_HEADERS = {
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': '0',
        'Origin': 'https://www.instagram.com',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    }

    def _perform_login(self, username, password):
        """Log in through the web AJAX endpoint; raises ExtractorError on failure."""
        if self._IS_LOGGED_IN:
            return

        login_webpage = self._download_webpage(
            self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage')

        # CSRF token and rollout hash are embedded in window._sharedData on the login page
        shared_data = self._parse_json(self._search_regex(
            r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None)

        login = self._download_json(
            f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={
                **self._API_HEADERS,
                'X-Requested-With': 'XMLHttpRequest',
                'X-CSRFToken': shared_data['config']['csrf_token'],
                'X-Instagram-AJAX': shared_data['rollout_hash'],
                'Referer': 'https://www.instagram.com/',
            }, data=urlencode_postdata({
                # Browser-style password envelope (version 0 = plaintext with timestamp)
                'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
                'username': username,
                'queryParams': '{}',
                'optIntoOneTap': 'false',
                'stopDeletionNonce': '',
                'trustedDeviceRecords': '{}',
            }))

        if not login.get('authenticated'):
            if login.get('message'):
                raise ExtractorError(f'Unable to login: {login["message"]}')
            elif login.get('user'):
                # 'user' truthy: the account exists but the password did not match
                raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True)
            elif login.get('user') is False:
                # 'user' explicitly False: the username does not exist
                raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True)
            raise ExtractorError('Unable to login')
        InstagramBaseIE._IS_LOGGED_IN = True

    def _get_count(self, media, kind, *keys):
        # Try the direct {kind: {count}} layout first, then each GraphQL
        # edge_media_<key> fallback; returns the first integer found
        return traverse_obj(
            media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys),
            expected_type=int_or_none)

    def _get_dimension(self, name, media, webpage=None):
        # Prefer API-provided dimensions; fall back to og:video:/video: meta tags
        return (
            traverse_obj(media, ('dimensions', name), expected_type=int_or_none)
            or int_or_none(self._html_search_meta(
                (f'og:video:{name}', f'video:{name}'), webpage or '', default=None)))

    def _extract_nodes(self, nodes, is_direct=False):
        """Yield info dicts (direct) or URL results (deferred) for the video nodes of a GraphQL sidecar."""
        for idx, node in enumerate(nodes, start=1):
            # Skip non-video nodes (images)
            if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
                continue

            video_id = node.get('shortcode')

            if is_direct:
                info = {
                    'id': video_id or node['id'],
                    'url': node.get('video_url'),
                    'width': self._get_dimension('width', node),
                    'height': self._get_dimension('height', node),
                    'http_headers': {
                        'Referer': 'https://www.instagram.com/',
                    }
                }
            elif not video_id:
                # Cannot build a post URL without a shortcode
                continue
            else:
                # Defer full extraction to InstagramIE via a URL result
                info = {
                    '_type': 'url',
                    'ie_key': 'Instagram',
                    'id': video_id,
                    'url': f'https://instagram.com/p/{video_id}',
                }

            yield {
                **info,
                'title': node.get('title') or (f'Video {idx}' if is_direct else None),
                'description': traverse_obj(
                    node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str),
                'thumbnail': traverse_obj(
                    node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none),
                'duration': float_or_none(node.get('video_duration')),
                'timestamp': int_or_none(node.get('taken_at_timestamp')),
                'view_count': int_or_none(node.get('video_view_count')),
                'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
                'like_count': self._get_count(node, 'likes', 'preview_like'),
            }

    def _extract_product_media(self, product_media):
        """Extract id/formats/thumbnails for a single private-API (v1) media item.

        Returns an empty dict when the item has no video data at all.
        """
        media_id = product_media.get('code') or _pk_to_id(product_media.get('pk'))
        vcodec = product_media.get('video_codec')
        dash_manifest_raw = product_media.get('video_dash_manifest')
        videos_list = product_media.get('video_versions')
        if not (dash_manifest_raw or videos_list):
            return {}

        formats = [{
            'format_id': format.get('id'),
            'url': format.get('url'),
            'width': format.get('width'),
            'height': format.get('height'),
            'vcodec': vcodec,
        } for format in videos_list or []]
        if dash_manifest_raw:
            # The DASH manifest is embedded inline as raw XML, not fetched by URL
            formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash'))
        self._sort_formats(formats)

        thumbnails = [{
            'url': thumbnail.get('url'),
            'width': thumbnail.get('width'),
            'height': thumbnail.get('height')
        } for thumbnail in traverse_obj(product_media, ('image_versions2', 'candidates')) or []]
        return {
            'id': media_id,
            'duration': float_or_none(product_media.get('video_duration')),
            'formats': formats,
            'thumbnails': thumbnails
        }

    def _extract_product(self, product_info):
        """Build a complete info dict from private-API product info.

        Carousel (multi-item) posts are returned as a playlist with one
        entry per carousel item.
        """
        if isinstance(product_info, list):
            product_info = product_info[0]

        comment_data = traverse_obj(product_info, ('edge_media_to_parent_comment', 'edges'))
        comments = [{
            'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
            'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
            'id': traverse_obj(comment_dict, ('node', 'id')),
            'text': traverse_obj(comment_dict, ('node', 'text')),
            'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
        } for comment_dict in comment_data] if comment_data else None

        user_info = product_info.get('user') or {}
        info_dict = {
            'id': product_info.get('code') or _pk_to_id(product_info.get('pk')),
            'title': product_info.get('title') or f'Video by {user_info.get("username")}',
            'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none),
            'timestamp': int_or_none(product_info.get('taken_at')),
            'channel': user_info.get('username'),
            'uploader': user_info.get('full_name'),
            'uploader_id': str_or_none(user_info.get('pk')),
            'view_count': int_or_none(product_info.get('view_count')),
            'like_count': int_or_none(product_info.get('like_count')),
            'comment_count': int_or_none(product_info.get('comment_count')),
            'comments': comments,
            'http_headers': {
                'Referer': 'https://www.instagram.com/',
            }
        }
        carousel_media = product_info.get('carousel_media')
        if carousel_media:
            # Multi-item post: each entry inherits the post-level metadata,
            # overridden by its own media-specific fields
            return {
                '_type': 'playlist',
                **info_dict,
                'title': f'Post by {user_info.get("username")}',
                'entries': [{
                    **info_dict,
                    **self._extract_product_media(product_media),
                } for product_media in carousel_media],
            }

        return {
            **info_dict,
            **self._extract_product_media(product_info)
        }
2021-10-31 06:08:04 +01:00
2021-11-05 22:31:34 +01:00
class InstagramIOSIE(InfoExtractor):
    """Handles iOS ``instagram://media?id=...`` deep links by delegating to InstagramIE."""

    IE_DESC = 'IOS instagram:// URL'
    _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
    _TESTS = [{
        'url': 'instagram://media?id=482584233761418119',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y F O R A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'add_ie': ['Instagram']
    }]

    def _real_extract(self, url):
        # Translate the numeric media PK from the deep link into a web
        # shortcode, then hand the resulting URL to the main extractor.
        media_pk = self._match_id(url)
        shortcode = _pk_to_id(media_pk)
        return self.url_result(f'http://instagram.com/tv/{shortcode}', InstagramIE, shortcode)
2021-11-05 22:31:34 +01:00
2021-10-31 06:08:04 +01:00
class InstagramIE(InstagramBaseIE):
    """Extractor for individual Instagram posts (/p/, /tv/, /reel/ URLs)."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1']
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': '2815873',
            'uploader': 'B E A U T Y F O R A S H E S',
            'channel': 'naomipq',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
    }, {
        # missing description
        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
        'info_dict': {
            'id': 'BA-pQFBG8HZ',
            'ext': 'mp4',
            'title': 'Video by britneyspears',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1453760977,
            'upload_date': '20160125',
            'uploader_id': '12246775',
            'uploader': 'Britney Spears',
            'channel': 'britneyspears',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # multi video post
        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
        'playlist': [{
            'info_dict': {
                'id': 'BQ0dSaohpPW',
                'ext': 'mp4',
                'title': 'Video 1',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dTpOhuHT',
                'ext': 'mp4',
                'title': 'Video 2',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dT7RBFeF',
                'ext': 'mp4',
                'title': 'Video 3',
            },
        }],
        'info_dict': {
            'id': 'BQ0eAlwhDrw',
            'title': 'Post by instagram',
            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
        },
    }, {
        # IGTV
        'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
        'info_dict': {
            'id': 'BkfuX9UB-eK',
            'ext': 'mp4',
            'title': 'Fingerboarding Tricks with @cass.fb',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 53.83,
            'timestamp': 1530032919,
            'upload_date': '20180626',
            'uploader_id': '25025320',
            'uploader': 'Instagram',
            'channel': 'instagram',
            'like_count': int,
            'comment_count': int,
            'comments': list,
            'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
        }
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/tv/aye83DjauH/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/marvelskies.fc/reel/CWqAgUZgCku/',
        'only_matching': True,
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Find Instagram embed URLs in a third-party webpage.

        Falls back to blockquote-style embeds (class="instagram-media")
        when no iframe embed matches.
        """
        res = tuple(super()._extract_embed_urls(url, webpage))
        if res:
            return res

        mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1',
                         get_element_by_attribute('class', 'instagram-media', webpage) or '')
        if mobj:
            return [mobj.group('link')]

    def _real_extract(self, url):
        # Extraction strategy, in order of preference:
        #   1. private API /media/<pk>/info/ (needs a logged-in session)
        #   2. GraphQL query by shortcode (needs a valid csrf token)
        #   3. window._sharedData from the post webpage
        #   4. window.__additionalDataLoaded from the /embed/ webpage
        video_id, url = self._match_valid_url(url).group('id', 'url')
        media, webpage = {}, ''

        # This request primes the session cookies (csrftoken in particular)
        api_check = self._download_json(
            f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}',
            video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {}
        csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken')

        if not csrf_token:
            self.report_warning('No csrf token set by Instagram API', video_id)
        elif api_check.get('status') != 'ok':
            self.report_warning('Instagram API is not granting access', video_id)
        else:
            # A sessionid cookie means we are logged in and can use the richer private API
            if self._get_cookies(url).get('sessionid'):
                media.update(traverse_obj(self._download_json(
                    f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id,
                    fatal=False, note='Downloading video info', headers={
                        **self._API_HEADERS,
                        'X-CSRFToken': csrf_token.value,
                    }), ('items', 0)) or {})
                if media:
                    return self._extract_product(media)

            variables = {
                'shortcode': video_id,
                'child_comment_count': 3,
                'fetch_comment_count': 40,
                'parent_comment_count': 24,
                'has_threaded_comments': True,
            }
            general_info = self._download_json(
                'https://www.instagram.com/graphql/query/', video_id, fatal=False,
                headers={
                    **self._API_HEADERS,
                    'X-CSRFToken': csrf_token.value,
                    'X-Requested-With': 'XMLHttpRequest',
                    'Referer': url,
                }, query={
                    'query_hash': '9f8827793ef34641b2fb195d4d41151c',
                    'variables': json.dumps(variables, separators=(',', ':')),
                })
            media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {})

        if not media:
            # API paths failed; scrape the webpage (and then the embed page) instead
            self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id)
            webpage, urlh = self._download_webpage_handle(url, video_id)
            shared_data = self._search_json(
                r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {}

            # A redirect to the login URL means the post is login-walled
            if shared_data and self._LOGIN_URL not in urlh.geturl():
                media.update(traverse_obj(
                    shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
                    ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {})
            else:
                self.report_warning('Main webpage is locked behind the login page. Retrying with embed webpage')
                webpage = self._download_webpage(
                    f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False)
                additional_data = self._search_json(
                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False)
                if not additional_data:
                    self.raise_login_required('Requested content is not available, rate-limit reached or login required')

                # Embed data may be in private-API shape ('items') or GraphQL shape
                product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict)
                if product_item:
                    media.update(product_item)
                    return self._extract_product(media)

                media.update(traverse_obj(
                    additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {})

        username = traverse_obj(media, ('owner', 'username')) or self._search_regex(
            r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False)

        description = (
            traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
            or media.get('caption'))
        if not description:
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                # Raw JSON string scraped from HTML still carries \uXXXX escapes
                description = lowercase_escape(description)

        video_url = media.get('video_url')
        if not video_url:
            # No direct video URL: either a sidecar (multi-video) post...
            nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or []
            if nodes:
                return self.playlist_result(
                    self._extract_nodes(nodes, True), video_id,
                    format_field(username, None, 'Post by %s'), description)

            # ...or fall back to the og:video meta tag
            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': self._get_dimension('width', media, webpage),
            'height': self._get_dimension('height', media, webpage),
        }]
        dash = traverse_obj(media, ('dash_info', 'video_dash_manifest'))
        if dash:
            # Inline DASH manifest (raw XML), same as in _extract_product_media
            formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
        self._sort_formats(formats)

        comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))
        comments = [{
            'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
            'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
            'id': traverse_obj(comment_dict, ('node', 'id')),
            'text': traverse_obj(comment_dict, ('node', 'text')),
            'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
        } for comment_dict in comment_data] if comment_data else None

        display_resources = (
            media.get('display_resources')
            or [{'src': media.get(key)} for key in ('display_src', 'display_url')]
            or [{'src': self._og_search_thumbnail(webpage)}])
        thumbnails = [{
            'url': thumbnail['src'],
            'width': thumbnail.get('config_width'),
            'height': thumbnail.get('config_height'),
        } for thumbnail in display_resources if thumbnail.get('src')]

        return {
            'id': video_id,
            'formats': formats,
            'title': media.get('title') or 'Video by %s' % username,
            'description': description,
            'duration': float_or_none(media.get('video_duration')),
            'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
            'uploader_id': traverse_obj(media, ('owner', 'id')),
            'uploader': traverse_obj(media, ('owner', 'full_name')),
            'channel': username,
            'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
                r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
            'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
            'comments': comments,
            'thumbnails': thumbnails,
            'http_headers': {
                'Referer': 'https://www.instagram.com/',
            }
        }
2014-03-23 16:06:03 +01:00
2021-10-31 06:08:04 +01:00
class InstagramPlaylistBaseIE(InstagramBaseIE):
    """Shared GraphQL pagination logic for user-profile and hashtag playlists.

    Subclasses must define _QUERY_HASH and the _parse_timeline_from /
    _query_vars_for static methods.
    """

    _gis_tmpl = None  # used to cache GIS request type

    def _parse_graphql(self, webpage, item_id):
        # Reads a webpage and returns its GraphQL data.
        return self._parse_json(
            self._search_regex(
                r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
            item_id)

    def _extract_graphql(self, data, url):
        # Parses GraphQL queries containing videos and generates a playlist.
        uploader_id = self._match_id(url)
        csrf_token = data['config']['csrf_token']
        # Fallback rhx_gis value used when the page no longer exposes one
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

        cursor = ''
        for page_num in itertools.count(1):
            variables = {
                'first': 12,
                'after': cursor,
            }
            variables.update(self._query_vars_for(data))
            variables = json.dumps(variables)

            if self._gis_tmpl:
                gis_tmpls = [self._gis_tmpl]
            else:
                # Candidate GIS signature inputs, tried in order until one is accepted
                gis_tmpls = [
                    '%s' % rhx_gis,
                    '',
                    '%s:%s' % (rhx_gis, csrf_token),
                    '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']),
                ]

            # try all of the ways to generate a GIS query, and not only use the
            # first one that works, but cache it for future requests
            for gis_tmpl in gis_tmpls:
                try:
                    json_data = self._download_json(
                        'https://www.instagram.com/graphql/query/', uploader_id,
                        'Downloading JSON page %d' % page_num, headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            # X-Instagram-GIS = md5 of "<template>:<variables>"
                            'X-Instagram-GIS': hashlib.md5(
                                ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
                        }, query={
                            'query_hash': self._QUERY_HASH,
                            'variables': variables,
                        })
                    media = self._parse_timeline_from(json_data)
                    self._gis_tmpl = gis_tmpl
                    break
                except ExtractorError as e:
                    # if it's an error caused by a bad query, and there are
                    # more GIS templates to try, ignore it and keep trying
                    if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403:
                        if gis_tmpl != gis_tmpls[-1]:
                            continue
                    raise

            nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or []
            if not nodes:
                break
            yield from self._extract_nodes(nodes)

            has_next_page = traverse_obj(media, ('page_info', 'has_next_page'))
            cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str)
            if not has_next_page or not cursor:
                break

    def _real_extract(self, url):
        user_or_tag = self._match_id(url)
        webpage = self._download_webpage(url, user_or_tag)
        data = self._parse_graphql(webpage, user_or_tag)

        self._set_cookie('instagram.com', 'ig_pr', '1')

        return self.playlist_result(
            self._extract_graphql(data, url), user_or_tag, user_or_tag)
2021-10-31 06:08:04 +01:00
class InstagramUserIE(InstagramPlaylistBaseIE):
    """Extracts all videos from an Instagram user profile as a playlist."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TESTS = [{
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }]

    # Fixed: a stray trailing comma previously turned this into a 1-tuple;
    # the GraphQL `query_hash` request parameter expects a plain string
    # (the old form only worked by accident through doseq urlencoding).
    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'

    @staticmethod
    def _parse_timeline_from(data):
        """Extract the media timeline data from a GraphQL result."""
        return data['data']['user']['edge_owner_to_timeline_media']

    @staticmethod
    def _query_vars_for(data):
        """Return extra timeline-query variables (the user id) taken from
        the GraphQL data of the original profile page."""
        return {
            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        }
2021-10-31 06:08:04 +01:00
class InstagramTagIE(InstagramPlaylistBaseIE):
    """Extracts the media timeline of an Instagram hashtag as a playlist."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
    IE_DESC = 'Instagram hashtag search URLs'
    IE_NAME = 'instagram:tag'
    _TESTS = [{
        'url': 'https://instagram.com/explore/tags/lolcats',
        'info_dict': {
            'id': 'lolcats',
            'title': 'lolcats',
        },
        'playlist_count': 50,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 50,
        }
    }]

    # Fixed: a stray trailing comma previously turned this into a 1-tuple;
    # the GraphQL `query_hash` request parameter expects a plain string
    # (the old form only worked by accident through doseq urlencoding).
    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'

    @staticmethod
    def _parse_timeline_from(data):
        """Extract the media timeline data from a GraphQL result."""
        return data['data']['hashtag']['edge_hashtag_to_media']

    @staticmethod
    def _query_vars_for(data):
        """Return extra timeline-query variables (the tag name) taken from
        the GraphQL data of the original tag page."""
        return {
            'tag_name':
                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
        }
2021-12-28 19:58:06 +01:00
class InstagramStoryIE(InstagramBaseIE):
    """Extracts Instagram stories and story highlights as a playlist."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/]+)/(?P<id>\d+)'
    IE_NAME = 'instagram:story'

    _TESTS = [{
        'url': 'https://www.instagram.com/stories/highlights/18090946048123978/',
        'info_dict': {
            'id': '18090946048123978',
            'title': 'Rare',
        },
        'playlist_mincount': 50
    }]

    def _real_extract(self, url):
        username, story_id = self._match_valid_url(url).groups()

        story_info = self._download_webpage(url, story_id)
        user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
        if not user_info:
            self.raise_login_required('This content is unreachable')
        user_id = user_info.get('id')

        # Highlights are queried by "highlight:<id>"; regular stories by user id
        story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
        videos = traverse_obj(self._download_json(
            f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
            story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
        if not videos:
            self.raise_login_required('You need to log in to access this content')

        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name'))
        story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
        if not story_title:
            story_title = f'Story by {username}'

        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
        info_data = []
        # Fixed: traverse_obj may return None when neither key holds items;
        # iterating None raised TypeError - guard with `or []`
        for highlight in highlights or []:
            highlight_data = self._extract_product(highlight)
            # Skip items _extract_product could not resolve to any video format
            if highlight_data.get('formats'):
                info_data.append({
                    **highlight_data,
                    'uploader': full_name,
                    'uploader_id': user_id,
                })
        return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)