diff --git a/lib/yt_dlp/extractor/_extractors.py b/lib/yt_dlp/extractor/_extractors.py index bf0c67542..dd670d59c 100644 --- a/lib/yt_dlp/extractor/_extractors.py +++ b/lib/yt_dlp/extractor/_extractors.py @@ -223,7 +223,11 @@ BiliBiliPlayerIE, BilibiliSpaceVideoIE, BilibiliSpaceAudioIE, - BilibiliSpacePlaylistIE, + BilibiliCollectionListIE, + BilibiliSeriesListIE, + BilibiliFavoritesListIE, + BilibiliWatchlaterIE, + BilibiliPlaylistIE, BiliIntlIE, BiliIntlSeriesIE, BiliLiveIE, @@ -1501,6 +1505,7 @@ from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE from .porn91 import Porn91IE +from .pornbox import PornboxIE from .porncom import PornComIE from .pornflip import PornFlipIE from .pornhd import PornHdIE @@ -1555,7 +1560,14 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE -from .radiofrance import FranceCultureIE, RadioFranceIE +from .radiofrance import ( + FranceCultureIE, + RadioFranceIE, + RadioFranceLiveIE, + RadioFrancePodcastIE, + RadioFranceProfileIE, + RadioFranceProgramScheduleIE, +) from .radiozet import RadioZetPodcastIE from .radiokapital import ( RadioKapitalIE, @@ -2360,7 +2372,8 @@ ) from .weibo import ( WeiboIE, - WeiboMobileIE + WeiboVideoIE, + WeiboUserIE, ) from .weiqitv import WeiqiTVIE from .weverse import ( diff --git a/lib/yt_dlp/extractor/bilibili.py b/lib/yt_dlp/extractor/bilibili.py index 290340078..5e7042dbb 100644 --- a/lib/yt_dlp/extractor/bilibili.py +++ b/lib/yt_dlp/extractor/bilibili.py @@ -15,6 +15,7 @@ GeoRestrictedError, InAdvancePagedList, OnDemandPagedList, + bool_or_none, filter_dict, float_or_none, format_field, @@ -35,6 +36,7 @@ unsmuggle_url, url_or_none, urlencode_postdata, + variadic, ) @@ -156,7 +158,7 @@ def _get_episodes_from_season(self, ss_id, url): class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -252,7 +254,7 @@ class BiliBiliIE(BilibiliBaseIE): 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', 'duration': 313.557, 'upload_date': '20220709', - 'uploader': '小夫Tech', + 'uploader': '小夫太渴', 'timestamp': 1657347907, 'uploader_id': '1326814124', 'comment_count': int, @@ -509,7 +511,7 @@ def _real_extract(self, url): class BiliBiliBangumiMediaIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { @@ -528,7 +530,7 @@ def _real_extract(self, url): class BiliBiliBangumiSeasonIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P\d+)' + _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'info_dict': { @@ -679,13 +681,35 @@ def get_entries(page_data): return self.playlist_result(paged_list, playlist_id) -class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE): - _VALID_URL = r'https?://space.bilibili\.com/(?P\d+)/channel/collectiondetail\?sid=(?P\d+)' +class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE): + def _get_entries(self, page_data, bvid_keys, ending_key='bvid'): + for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})): + yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid) + + def _get_uploader(self, uid, playlist_id): + webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False) + return self._search_regex(r'(?s)]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False) + + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + metadata, page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries) + metadata.pop('page_count', None) + metadata.pop('page_size', None) + return metadata, page_list + + +class BilibiliCollectionListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)/channel/collectiondetail/?\?sid=(?P\d+)' _TESTS = [{ 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445', 'info_dict': { 'id': '2142762_57445', - 'title': '《底特律 变人》' + 'title': '【完结】《底特律 变人》全结局流程解说', + 'description': '', + 'uploader': '老戴在此', + 'uploader_id': '2142762', + 'timestamp': int, + 'upload_date': str, + 'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg', }, 'playlist_mincount': 31, }] @@ -706,22 +730,251 @@ def get_metadata(page_data): return { 'page_count': math.ceil(entry_count / page_size), 'page_size': page_size, - 'title': traverse_obj(page_data, ('meta', 'name')) + 'uploader': self._get_uploader(mid, playlist_id), + **traverse_obj(page_data, { + 'title': ('meta', 'name', {str}), + 'description': ('meta', 'description', {str}), + 'uploader_id': ('meta', 'mid', {str_or_none}), + 'timestamp': ('meta', 'ptime', {int_or_none}), + 'thumbnail': ('meta', 'cover', {url_or_none}), + }) } def get_entries(page_data): - for entry in page_data.get('archives', []): - yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', - BiliBiliIE, entry['bvid']) + return self._get_entries(page_data, 'archives') metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) - return self.playlist_result(paged_list, playlist_id, metadata['title']) + return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliSeriesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)/channel/seriesdetail/?\?\bsid=(?P\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0', + 'info_dict': { + 'id': '1958703906_547718', + 'title': '直播回放', + 'description': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + 'modified_timestamp': int, + 'modified_date': str, + }, + 'playlist_mincount': 513, + }] + + def _real_extract(self, url): + mid, sid = self._match_valid_url(url).group('mid', 'sid') + playlist_id = f'{mid}_{sid}' + playlist_meta = traverse_obj(self._download_json( + f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False + ), { + 'title': ('data', 'meta', 'name', {str}), + 'description': ('data', 'meta', 'description', {str}), + 'uploader_id': ('data', 'meta', 'mid', {str_or_none}), + 'timestamp': ('data', 'meta', 'ctime', {int_or_none}), + 'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}), + }) + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/x/series/archives', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30})['data'] + + def get_metadata(page_data): + page_size = page_data['page']['size'] + entry_count = page_data['page']['total'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + 'uploader': self._get_uploader(mid, playlist_id), + **playlist_meta + } + + def get_entries(page_data): + return self._get_entries(page_data, 'archives') + + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create', + 'info_dict': { + 'id': '1103407912', + 'title': '【V2】(旧)', + 'description': '', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'modified_timestamp': int, + 'modified_date': str, + 'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg", + 'view_count': int, + 'like_count': int, + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/detail/ml1103407912', + 'only_matching': True, + }] + + def _real_extract(self, url): + fid = self._match_id(url) + + list_info = self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20', + fid, note='Downloading favlist metadata') + if list_info['code'] == -403: + self.raise_login_required(msg='This is a private favorites list. You need to log in as its owner') + + entries = self._get_entries(self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}', + fid, note='Download favlist entries'), 'data') + + return self.playlist_result(entries, fid, **traverse_obj(list_info, ('data', 'info', { + 'title': ('title', {str}), + 'description': ('intro', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}), + 'modified_timestamp': ('mtime', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + 'view_count': ('cnt_info', 'play', {int_or_none}), + 'like_count': ('cnt_info', 'thumb_up', {int_or_none}), + }))) + + +class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/watchlater/#/list', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }] + + def _real_extract(self, url): + list_id = getattr(self._get_cookies(url).get('DedeUserID'), 'value', 'watchlater') + watchlater_info = self._download_json( + 'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id) + if watchlater_info['code'] == -101: + self.raise_login_required(msg='You need to login to access your watchlater list') + entries = self._get_entries(watchlater_info, ('data', 'list')) + return self.playlist_result(entries, id=list_id, title='稍后再看') + + +class BilibiliPlaylistIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P\w+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/list/1958703906?sid=547718', + 'info_dict': { + 'id': '5_547718', + 'title': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + }, + 'playlist_mincount': 513, + }, { + 'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1', + 'info_dict': { + 'id': '5_547718', + }, + 'playlist_mincount': 513, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + 'title': '【V2】(旧)', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg", + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/play/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + }, + 'playlist_mincount': 22, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/watchlater', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }, { + 'url': 'https://www.bilibili.com/medialist/play/watchlater', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }] + + def _extract_medialist(self, query, list_id): + for page_num in itertools.count(1): + page_data = self._download_json( + 'https://api.bilibili.com/x/v2/medialist/resource/list', + list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}' + )['data'] + yield from self._get_entries(page_data, 'media_list', ending_key='bv_id') + query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id')) + if not page_data.get('has_more', False): + break + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id) + if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200: + error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none})) + error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none})) + if error_code == -400 and list_id == 'watchlater': + self.raise_login_required('You need to login to access your watchlater playlist') + elif error_code == -403: + self.raise_login_required('This is a private playlist. You need to login as its owner') + elif error_code == 11010: + raise ExtractorError('Playlist is no longer available', expected=True) + raise ExtractorError(f'Could not access playlist: {error_code} {error_message}') + + query = { + 'ps': 20, + 'with_current': False, + **traverse_obj(initial_state, { + 'type': ('playlist', 'type', {int_or_none}), + 'biz_id': ('playlist', 'id', {int_or_none}), + 'tid': ('tid', {int_or_none}), + 'sort_field': ('sortFiled', {int_or_none}), + 'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}), + }) + } + metadata = { + 'id': f'{query["type"]}_{query["biz_id"]}', + **traverse_obj(initial_state, ('mediaListInfo', { + 'title': ('title', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + })), + } + return self.playlist_result(self._extract_medialist(query, list_id), **metadata) class BilibiliCategoryIE(InfoExtractor): IE_NAME = 'Bilibili category extractor' _MAX_RESULTS = 1000000 - _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' _TESTS = [{ 'url': 'https://www.bilibili.com/v/kichiku/mad', 'info_dict': { @@ -1406,7 +1659,7 @@ def _real_extract(self, url): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P\d+)' + _VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', diff --git a/lib/yt_dlp/extractor/ccc.py b/lib/yt_dlp/extractor/ccc.py index 22e3a22ec..ca6b82c98 100644 --- a/lib/yt_dlp/extractor/ccc.py +++ b/lib/yt_dlp/extractor/ccc.py @@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor): 'id': '30c3', }, 'playlist_count': 135, + }, { + 'url': 'https://media.ccc.de/c/DS2023', + 'info_dict': { + 'title': 'Datenspuren 2023', + 'id': 'DS2023', + }, + 'playlist_count': 37 }] def _real_extract(self, url): - playlist_id = self._match_id(url).lower() + playlist_id = self._match_id(url) conf = self._download_json( 'https://media.ccc.de/public/conferences/' + playlist_id, diff --git a/lib/yt_dlp/extractor/n1.py b/lib/yt_dlp/extractor/n1.py index 55345f398..edc41443a 100644 --- a/lib/yt_dlp/extractor/n1.py +++ b/lib/yt_dlp/extractor/n1.py @@ -33,7 +33,7 @@ def _real_extract(self, url): class N1InfoIIE(InfoExtractor): IE_NAME = 'N1Info:article' - _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P[^/]+)' + _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P[^/?#]+)' _TESTS = [{ # Youtube embedded 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', @@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor): 'upload_date': '20211102', 'timestamp': 1635861677, }, + }, { + 'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/', + 'info_dict': { + 'id': '1332368', + 'ext': 'mp4', + 'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama', + 'upload_date': '20230620', + 'timestamp': 1687290536, + 'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg' + }, }, { 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'only_matching': True, @@ -105,19 +115,35 @@ def _real_extract(self, url): title = self._html_search_regex(r']+>(.+?)', webpage, 'title') timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) - - videos = re.findall(r'(?m)(]+>)', webpage) + plugin_data = self._html_search_meta('BridPlugin', webpage) entries = [] - for video in videos: - video_data = extract_attributes(video) - entries.append({ - '_type': 'url_transparent', - 'url': video_data.get('data-url'), - 'id': video_data.get('id'), - 'title': title, - 'thumbnail': video_data.get('data-thumbnail'), - 'timestamp': timestamp, - 'ie_key': 'N1InfoAsset'}) + if plugin_data: + site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id') + for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage): + video_id = self._parse_json(video_data, title)['video'] + entries.append({ + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'thumbnail': self._html_search_meta('thumbnailURL', webpage), + 'formats': self._extract_m3u8_formats( + f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8', + video_id, fatal=False), + }) + else: + # Old player still present in older articles + videos = re.findall(r'(?m)(]+>)', webpage) + for video in videos: + video_data = extract_attributes(video) + entries.append({ + '_type': 'url_transparent', + 'url': video_data.get('data-url'), + 'id': video_data.get('id'), + 'title': title, + 'thumbnail': video_data.get('data-thumbnail'), + 'timestamp': timestamp, + 'ie_key': 'N1InfoAsset', + }) embedded_videos = re.findall(r'(]+>)', webpage) for embedded_video in embedded_videos: diff --git a/lib/yt_dlp/extractor/pornbox.py b/lib/yt_dlp/extractor/pornbox.py new file mode 100644 index 000000000..c381382e9 --- /dev/null +++ b/lib/yt_dlp/extractor/pornbox.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..compat import functools +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + qualities, + str_or_none, + traverse_obj, + url_or_none, +) + + +class PornboxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://pornbox.com/application/watch-page/212108', + 'md5': '3ff6b6e206f263be4c5e987a3162ac6e', + 'info_dict': { + 'id': '212108', + 'ext': 'mp4', + 'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49', + 'uploader': 'Lily Strong', + 'timestamp': 1665871200, + 'upload_date': '20221015', + 'age_limit': 18, + 'availability': 'needs_auth', + 'duration': 1505, + 'cast': ['Lily Strong', 'John Strong'], + 'tags': 'count:11', + 'description': 'md5:589c7f33e183aa8aa939537300efb859', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$' + } + }, { + 'url': 'https://pornbox.com/application/watch-page/216045', + 'info_dict': { + 'id': '216045', + 'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2', + 'description': 'md5:3e631dcaac029f15ed434e402d1b06c7', + 'uploader': 'VK Studio', + 'timestamp': 1618264800, + 'upload_date': '20210412', + 'age_limit': 18, + 'availability': 'premium_only', + 'duration': 2710, + 'cast': 'count:3', + 'tags': 'count:29', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$', + 'subtitles': 'count:6' + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True + }, + 'expected_warnings': [ + 'You are either not logged in or do not have access to this scene', + 'No video formats found', 'Requested format is not available'] + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id) + + subtitles = {country_code: [{ + 'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}', + 'ext': 'srt' + }] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))} + + is_free_scene = traverse_obj( + public_data, ('price', 'is_available_for_free', {bool}), default=False) + + metadata = { + 'id': video_id, + **traverse_obj(public_data, { + 'title': ('scene_name', {str.strip}), + 'description': ('small_description', {str.strip}), + 'uploader': 'studio', + 'duration': ('runtime', {parse_duration}), + 'cast': (('models', 'male_models'), ..., 'model_name'), + 'thumbnail': ('player_poster', {url_or_none}), + 'tags': ('niches', ..., 'niche'), + }), + 'age_limit': 18, + 'timestamp': parse_iso8601(traverse_obj( + public_data, ('studios', 'release_date'), 'publish_date')), + 'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene), + 'subtitles': subtitles, + } + + if not public_data.get('is_purchased') or not is_free_scene: + self.raise_login_required( + 'You are either not logged in or do not have access to this scene', metadata_available=True) + return metadata + + media_id = traverse_obj(public_data, ( + 'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False) + if not media_id: + self.raise_no_formats('Could not find stream id', video_id=video_id) + + stream_data = self._download_json( + f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls') + + get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k']) + metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], { + 'url': 'src', + 'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + 'format_id': ('quality', {str_or_none}), + 'quality': ('quality', {get_quality}), + 'width': ('size', {lambda x: int(x[:-1])}), + })) + + return metadata diff --git a/lib/yt_dlp/extractor/radiofrance.py b/lib/yt_dlp/extractor/radiofrance.py index 92e51b7f4..35f4b91dd 100644 --- a/lib/yt_dlp/extractor/radiofrance.py +++ b/lib/yt_dlp/extractor/radiofrance.py @@ -1,7 +1,18 @@ +import itertools import re +import urllib.parse from .common import InfoExtractor -from ..utils import parse_duration, unified_strdate +from ..utils import ( + int_or_none, + join_nonempty, + js_to_json, + parse_duration, + strftime_or_none, + traverse_obj, + unified_strdate, + urljoin, +) class RadioFranceIE(InfoExtractor): @@ -56,8 +67,32 @@ def _real_extract(self, url): } -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P[^?#]+)-(?P\d+)($|[?#])' +class RadioFranceBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr' + + _STATIONS_RE = '|'.join(map(re.escape, ( + 'franceculture', + 'franceinfo', + 'franceinter', + 'francemusique', + 'fip', + 'mouv', + ))) + + def _extract_data_from_webpage(self, webpage, display_id, key): + return traverse_obj(self._search_json( + r'\bconst\s+data\s*=', webpage, key, display_id, + contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json), + (..., 'data', key, {dict}), get_all=False) or {} + + +class FranceCultureIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?:[^?#]+/)?(?P[^?#]+)-(?P\d{{6,}})(?:$|[?#]) + ''' + _TESTS = [ { 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', @@ -67,14 +102,30 @@ class FranceCultureIE(InfoExtractor): 'ext': 'mp3', 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?', 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?', - 'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'upload_date': '20220514', 'duration': 2750, }, }, + { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675', + 'info_dict': { + 'id': '2107675', + 'display_id': 'le-7-9-30-du-vendredi-10-mars-2023', + 'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot', + 'description': 'md5:36ee74351ede77a314fdebb94026b916', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20230310', + 'duration': 8977, + 'ext': 'mp3', + }, + }, { 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200', + 'only_matching': True, } ] @@ -89,7 +140,6 @@ def _real_extract(self, url): 'id': video_id, 'display_id': display_id, 'url': video_data['contentUrl'], - 'ext': video_data.get('encodingFormat'), 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, 'duration': parse_duration(video_data.get('duration')), 'title': self._html_search_regex(r'(?s)]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)', @@ -102,3 +152,322 @@ def _real_extract(self, url): 'upload_date': unified_strdate(self._search_regex( r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) } + + +class RadioFranceLiveIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + https?://(?:www\.)?radiofrance\.fr + /(?P{RadioFranceBaseIE._STATIONS_RE}) + /?(?Pradio-[\w-]+)?(?:[#?]|$) + ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/', + 'info_dict': { + 'id': 'franceinter', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/franceculture', + 'info_dict': { + 'id': 'franceculture', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family', + 'info_dict': { + 'id': 'mouv-radio-musique-kids-family', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul', + 'info_dict': { + 'id': 'mouv-radio-rnb-soul', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix', + 'info_dict': { + 'id': 'mouv-radio-musique-mix', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/fip/radio-rock', + 'info_dict': { + 'id': 'fip-radio-rock', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv', + 'only_matching': True, + }] + + def _real_extract(self, url): + station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id') + + if substation_id: + webpage = self._download_webpage(url, station_id) + api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData') + else: + api_response = self._download_json( + f'https://www.radiofrance.fr/{station_id}/api/live', station_id) + + formats, subtitles = [], {} + for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])): + if media_source.get('format') == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_source['url'], + 'abr': media_source.get('bitrate'), + }) + + return { + 'id': join_nonempty(station_id, substation_id), + 'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty( + ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + + +class RadioFrancePlaylistBase(RadioFranceBaseIE): + """Subclasses must set _METADATA_KEY""" + + def _call_api(self, content_id, cursor, page_num): + raise NotImplementedError('This method must be implemented by subclasses') + + def _generate_playlist_entries(self, content_id, content_response): + for page_num in itertools.count(2): + for entry in content_response['items']: + yield self.url_result( + f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, { + 'title': 'title', + 'description': 'standFirst', + 'timestamp': ('publishedDate', {int_or_none}), + 'thumbnail': ('visual', 'src'), + })) + + next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False) + if not next_cursor: + break + + content_response = self._call_api(content_id, next_cursor, page_num) + + def _real_extract(self, url): + display_id = self._match_id(url) + + metadata = self._download_json( + 'https://www.radiofrance.fr/api/v2.1/path', display_id, + query={'value': urllib.parse.urlparse(url).path})['content'] + + content_id = metadata['id'] + + return self.playlist_result( + self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id, + display_id=display_id, **{**traverse_obj(metadata, { + 'title': 'title', + 'description': 'standFirst', + 'thumbnail': ('visual', 'src'), + }), **traverse_obj(metadata, { + 'title': 'name', + 'description': 'role', + })}) + + +class RadioFrancePodcastIE(RadioFrancePlaylistBase): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?P[\w-]+)/?(?:[?#]|$) + ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert', + 'info_dict': { + 'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17', + 'display_id': 'le-billet-vert', + 'title': 'Le billet sciences', + 'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale', + 'info_dict': { + 'id': '566fd524-3074-4fbc-ac69-8696f2152a54', + 'display_id': 'jean-marie-le-pen-l-obsession-nationale', + 'title': 'Jean-Marie Le Pen, l\'obsession nationale', + 'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine', + 'info_dict': { + 'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d', + 'display_id': 'serie-thomas-grjebine', + 'title': 'Thomas Grjebine', + }, + 'playlist_count': 1, + }, { + 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip', + 'info_dict': { + 'id': '143dff38-e956-4a5d-8576-1c0b7242b99e', + 'display_id': 'certains-l-aiment-fip', + 'title': 'Certains l’aiment Fip', + 'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 321, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix', + 'only_matching': True, + }] + + _METADATA_KEY = 'expressions' + + def _call_api(self, podcast_id, cursor, page_num): + return self._download_json( + f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id, + note=f'Downloading page {page_num}', query={'pageCursor': cursor}) + + +class RadioFranceProfileIE(RadioFrancePlaylistBase): + _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3', + 'info_dict': { + 'id': '86c62790-e481-11e2-9f7b-782bcb6744eb', + 'display_id': 'thomas-pesquet', + 'title': 'Thomas Pesquet', + 'description': 'Astronaute à l\'agence spatiale européenne', + }, + 'playlist_mincount': 212, + }, { + 'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie', + 'info_dict': { + 'id': '9593050b-0183-4972-a0b5-d8f699079e02', + 'display_id': 'eugenie-bastie', + 'title': 'Eugénie Bastié', + 'description': 'Journaliste et essayiste', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 39, + }, { + 'url': 'https://www.radiofrance.fr/personnes/lea-salame', + 'only_matching': True, + }] + + _METADATA_KEY = 'documents' + + def _call_api(self, profile_id, cursor, page_num): + resp = self._download_json( + f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id, + note=f'Downloading page {page_num}', query={ + 'relation': 'personality', + 'cursor': cursor, + }) + + resp['next'] = traverse_obj(resp, ('pagination', 'next')) + return resp + + +class RadioFranceProgramScheduleIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?P{RadioFranceBaseIE._STATIONS_RE}) + /grille-programmes(?:\?date=(?P[\d-]+))? + ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023', + 'info_dict': { + 'id': 'franceinter-program-20230217', + 'upload_date': '20230217', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023', + 'info_dict': { + 'id': 'franceculture-program-20230201', + 'upload_date': '20230201', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023', + 'info_dict': { + 'id': 'mouv-program-20230319', + 'upload_date': '20230319', + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023', + 'info_dict': { + 'id': 'francemusique-program-20230318', + 'upload_date': '20230318', + }, + 'playlist_count': 15, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes', + 'only_matching': True, + }] + + def _generate_playlist_entries(self, webpage_url, api_response): + for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])): + yield self.url_result( + urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE, + url_transparent=True, **traverse_obj(entry, { + 'title': ('expression', 'title'), + 'thumbnail': ('expression', 'visual', 'src'), + 'timestamp': ('startTime', {int_or_none}), + 'series_id': ('concept', 'id'), + 'series': ('concept', 'title'), + })) + + def _real_extract(self, url): + station, date = self._match_valid_url(url).group('station', 'date') + webpage = self._download_webpage(url, station) + grid_data = self._extract_data_from_webpage(webpage, station, 'grid') + upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d') + + return self.playlist_result( + self._generate_playlist_entries(url, grid_data), + join_nonempty(station, 'program', upload_date), upload_date=upload_date) diff --git a/lib/yt_dlp/extractor/weibo.py b/lib/yt_dlp/extractor/weibo.py index bc9a71abe..b0c3052b6 100644 --- a/lib/yt_dlp/extractor/weibo.py +++ b/lib/yt_dlp/extractor/weibo.py @@ -1,134 +1,241 @@ -from .common import InfoExtractor - -import json import random -import re +import itertools +import urllib.parse -from ..compat import ( - compat_parse_qs, - compat_str, -) +from .common import InfoExtractor from ..utils import ( - js_to_json, + int_or_none, + make_archive_id, + mimetype2ext, + parse_resolution, + str_or_none, strip_jsonp, + traverse_obj, + url_or_none, urlencode_postdata, + urljoin, ) -class WeiboIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' - _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', +class WeiboBaseIE(InfoExtractor): + def _update_visitor_cookies(self, video_id): + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit guest request', + transform_source=strip_jsonp, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', + })) + + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback to get guest cookies', + query={ + 'a': 'incarnate', + 't': visitor_data['data']['tid'], + 'w': 2, + 'c': '%03d' % visitor_data['data']['confidence'], + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), + }) + + def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs): + webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs) + if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com': + self._update_visitor_cookies(video_id) + webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs) + return self._parse_json(webpage, video_id, fatal=fatal) + + def _extract_formats(self, video_info): + media_info = traverse_obj(video_info, ('page_info', 'media_info')) + formats = traverse_obj(media_info, ( + 'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', { + 'url': 'url', + 'format': ('quality_desc', {str}), + 'format_id': ('label', {str}), + 'ext': ('mime', {mimetype2ext}), + 'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}), + 'vcodec': ('video_codecs', {str}), + 'fps': ('fps', {int_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'acodec': ('audio_codecs', {str}), + 'asr': ('audio_sample_rate', {int_or_none}), + 'audio_channels': ('audio_channels', {int_or_none}), + })) + if not formats: # fallback, should be barely used + for url in set(traverse_obj(media_info, (..., {url_or_none}))): + if 'label=' in url: # filter out non-video urls + format_id, resolution = self._search_regex( + r'label=(\w+)&template=(\d+x\d+)', url, 'format info', + group=(1, 2), default=(None, None)) + formats.append({ + 'url': url, + 'format_id': format_id, + **parse_resolution(resolution), + **traverse_obj(media_info, ( + 'video_details', lambda _, v: v['label'].startswith(format_id), { + 'size': ('size', {int_or_none}), + 'tbr': ('bitrate', {int_or_none}), + } + ), get_all=False), + }) + return formats + + def _parse_video_info(self, video_info, video_id=None): + return { + 'id': video_id, + 'extractor_key': WeiboIE.ie_key(), + 'extractor': WeiboIE.IE_NAME, + 'formats': self._extract_formats(video_info), + 'http_headers': {'Referer': 'https://weibo.com/'}, + '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)], + **traverse_obj(video_info, { + 'id': (('id', 'id_str', 'mid'), {str_or_none}), + 'display_id': ('mblogid', {str_or_none}), + 'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}), + 'description': ('text_raw', {str}), + 'duration': ('page_info', 'media_info', 'duration', {int_or_none}), + 'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}), + 'thumbnail': ('page_info', 'page_pic', {url_or_none}), + 'uploader': ('user', 'screen_name', {str}), + 'uploader_id': ('user', ('id', 'id_str'), {str_or_none}), + 'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}), + 'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}), + 'like_count': ('attitudes_count', {int_or_none}), + 'repost_count': ('reposts_count', {int_or_none}), + }, get_all=False), + 'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None, + } + + +class WeiboIE(WeiboBaseIE): + _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://weibo.com/7827771738/N4xlMvjhI', 'info_dict': { - 'id': 'Fp6RGfbff', + 'id': '4910815147462302', + 'ext': 'mp4', + 'display_id': 'N4xlMvjhI', + 'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】', + 'description': 'md5:e2637a7673980d68694ea7c43cf12a5f', + 'duration': 918, + 'timestamp': 1686312819, + 'upload_date': '20230609', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '睡前视频基地', + 'uploader_id': '7827771738', + 'uploader_url': 'https://weibo.com/u/7827771738', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'], + }, + }, { + 'url': 'https://m.weibo.cn/status/4189191225395228', + 'info_dict': { + 'id': '4189191225395228', 'ext': 'mp4', - 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', + 'display_id': 'FBqgOmDxO', + 'title': '柴犬柴犬的秒拍视频', + 'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f', + 'duration': 53, + 'timestamp': 1514264429, + 'upload_date': '20171226', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '柴犬柴犬', + 'uploader_id': '5926682210', + 'uploader_url': 'https://weibo.com/u/5926682210', + 'view_count': int, + 'like_count': int, + 'repost_count': int, } - } + }, { + 'url': 'https://weibo.com/0/4224132150961381', + 'note': 'no playback_list example', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id) - - visitor_url = urlh.url - - if 'passport.weibo.com' in visitor_url: - # first visit - visitor_data = self._download_json( - 'https://passport.weibo.com/visitor/genvisitor', video_id, - note='Generating first-visit data', - transform_source=strip_jsonp, - headers={'Referer': visitor_url}, - data=urlencode_postdata({ - 'cb': 'gen_callback', - 'fp': json.dumps({ - 'os': '2', - 'browser': 'Gecko57,0,0,0', - 'fonts': 'undefined', - 'screenInfo': '1440*900*24', - 'plugins': '', - }), - })) - - tid = visitor_data['data']['tid'] - cnfd = '%03d' % visitor_data['data']['confidence'] - - self._download_webpage( - 'https://passport.weibo.com/visitor/visitor', video_id, - note='Running first-visit callback', - query={ - 'a': 'incarnate', - 't': tid, - 'w': 2, - 'c': cnfd, - 'cb': 'cross_domain', - 'from': 'weibo', - '_rand': random.random(), - }) - - webpage = self._download_webpage( - url, video_id, note='Revisiting webpage') - - title = self._html_extract_title(webpage) - - video_formats = compat_parse_qs(self._search_regex( - r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) - - formats = [] - supported_resolutions = (480, 720) - for res in supported_resolutions: - vid_urls = video_formats.get(compat_str(res)) - if not vid_urls or not isinstance(vid_urls, list): - continue - - vid_url = vid_urls[0] - formats.append({ - 'url': vid_url, - 'height': res, - }) - uploader = self._og_search_property( - 'nick-name', webpage, 'uploader', default=None) - - return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats - } + return self._parse_video_info(self._weibo_download_json( + f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id)) -class WeiboMobileIE(InfoExtractor): - _VALID_URL = r'https?://m\.weibo\.cn/status/(?P[0-9]+)(\?.+)?' - _TEST = { - 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', +class WeiboVideoIE(WeiboBaseIE): + _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P\d+:\d+)' + _TESTS = [{ + 'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow', 'info_dict': { - 'id': '4189191225395228', + 'id': '4797700463137878', 'ext': 'mp4', - 'title': '午睡当然是要甜甜蜜蜜的啦', - 'uploader': '柴犬柴犬' + 'display_id': 'LEZDodaiW', + 'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了', + 'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ​​​', + 'duration': 76, + 'timestamp': 1659344278, + 'upload_date': '20220801', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '君子爱财陈平安', + 'uploader_id': '3905382233', + 'uploader_url': 'https://weibo.com/u/3905382233', + 'view_count': int, + 'like_count': int, + 'repost_count': int, } - } + }] def _real_extract(self, url): video_id = self._match_id(url) - # to get Referer url for genvisitor - webpage = self._download_webpage(url, video_id, note='visit the page') - weibo_info = self._parse_json(self._search_regex( - r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', - webpage, 'js_code', flags=re.DOTALL), - video_id, transform_source=js_to_json) + post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode() + video_info = self._weibo_download_json( + f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}', + video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo'] + return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE) - status_data = weibo_info.get('status', {}) - page_info = status_data.get('page_info') - title = status_data['status_title'] - uploader = status_data.get('user', {}).get('screen_name') - return { - 'id': video_id, - 'title': title, +class WeiboUserIE(WeiboBaseIE): + _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P\d+)' + _TESTS = [{ + 'url': 'https://weibo.com/u/2066652961?tabtype=video', + 'info_dict': { + 'id': '2066652961', + 'title': '萧影殿下的视频', + 'description': '萧影殿下的全部视频', + 'uploader': '萧影殿下', + }, + 'playlist_mincount': 195, + }] + + def _fetch_page(self, uid, cursor=0, page=1): + return self._weibo_download_json( + 'https://weibo.com/ajax/profile/getWaterFallContent', + uid, note=f'Downloading videos page {page}', + query={'uid': uid, 'cursor': cursor})['data'] + + def _entries(self, uid, first_page): + cursor = 0 + for page in itertools.count(1): + response = first_page if page == 1 else self._fetch_page(uid, cursor, page) + for video_info in traverse_obj(response, ('list', ..., {dict})): + yield self._parse_video_info(video_info) + cursor = response.get('next_cursor') + if (int_or_none(cursor) or -1) < 0: + break + + def _real_extract(self, url): + uid = self._match_id(url) + first_page = self._fetch_page(uid) + uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False) + metainfo = { + 'title': f'{uploader}的视频', + 'description': f'{uploader}的全部视频', 'uploader': uploader, - 'url': page_info['media_info']['stream_url'] - } + } if uploader else {} + + return self.playlist_result(self._entries(uid, first_page), uid, **metainfo) diff --git a/lib/yt_dlp_version b/lib/yt_dlp_version index 9ccd49477..56ca8108d 100644 --- a/lib/yt_dlp_version +++ b/lib/yt_dlp_version @@ -1 +1 @@ -20fbbd9249a2f26c7ae579bde5ba5d69aa8fac69 \ No newline at end of file +cf11b40ac40e3d23a6352753296f3a732886efb9 \ No newline at end of file