From 3b31478dfd18c03a1305089e79c30dc929f53eeb Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:30:22 +0200 Subject: [PATCH 01/47] Fix support for NPO downloads --- youtube_dl/extractor/npo.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e525ad92843..eff9edb8b6d 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,8 @@ from __future__ import unicode_literals import re +import urllib.parse +from http.cookies import SimpleCookie from .common import InfoExtractor from ..compat import ( @@ -184,22 +186,28 @@ def _real_extract(self, url): return self._get_info(url, video_id) or self._get_old_info(video_id) def _get_info(self, url, video_id): - token = self._download_json( + _, xsrf_token_response = self._download_webpage_handle( 'https://www.npostart.nl/api/token', video_id, 'Downloading token', headers={ 'Referer': url, 'X-Requested-With': 'XMLHttpRequest', - })['token'] + }) + cookies = SimpleCookie() + cookies.load(xsrf_token_response.headers['Set-Cookie']) + cookies = {k: v.value for k, v in cookies.items()} + xsrf_token = cookies['XSRF-TOKEN'] player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, - 'Downloading player JSON', data=urlencode_postdata({ + 'Downloading player JSON', + headers={"x-xsrf-token": urllib.parse.unquote(xsrf_token)}, + data=urlencode_postdata({ 'autoplay': 0, 'share': 1, 'pageUrl': url, + 'isFavourite': "false", 'hasAdConsent': 0, - '_token': token, - })) + },)) player_token = player['token'] @@ -215,7 +223,7 @@ def _get_info(self, url, video_id): 'quality': 'npo', 'tokenId': player_token, 'streamType': 'broadcast', - }) + }, data=b"") if not streams: continue stream = streams.get('stream') From b4776f2e36e6235c6a3142973355be7e03eee919 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:39:11 +0200 Subject: [PATCH 02/47] Import from compat --- youtube_dl/extractor/npo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index eff9edb8b6d..dba42205801 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals import re -import urllib.parse -from http.cookies import SimpleCookie from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, + compat_urllib_parse_unquote_plus, + compat_cookies_SimpleCookie, ) from ..utils import ( determine_ext, @@ -192,7 +192,7 @@ def _get_info(self, url, video_id): 'Referer': url, 'X-Requested-With': 'XMLHttpRequest', }) - cookies = SimpleCookie() + cookies = compat_cookies_SimpleCookie() cookies.load(xsrf_token_response.headers['Set-Cookie']) cookies = {k: v.value for k, v in cookies.items()} xsrf_token = cookies['XSRF-TOKEN'] @@ -200,7 +200,7 @@ def _get_info(self, url, video_id): player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, 'Downloading player JSON', - headers={"x-xsrf-token": urllib.parse.unquote(xsrf_token)}, + headers={"x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token)}, data=urlencode_postdata({ 'autoplay': 0, 'share': 1, From fb2b4e2894171825c6c85d813a8120b679eadf52 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:46:05 +0200 Subject: [PATCH 03/47] Add line comment --- youtube_dl/extractor/npo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index dba42205801..646b0f43370 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -207,7 +207,7 @@ def _get_info(self, url, video_id): 'pageUrl': url, 'isFavourite': "false", 'hasAdConsent': 0, - },)) + })) player_token = player['token'] @@ -223,7 +223,8 @@ def _get_info(self, url, video_id): 'quality': 'npo', 'tokenId': player_token, 'streamType': 'broadcast', - }, data=b"") + }, + data=b"") # empty byte string to force a POST request instead of GET, without it HTTP 405 will happen if not streams: continue stream = streams.get('stream') From 9e1acb2527a9141710657a35d358dba54b4c8ddd Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:56:18 +0200 Subject: [PATCH 04/47] Fix flake8 --- youtube_dl/extractor/npo.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 646b0f43370..e8e596be198 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -200,7 +200,9 @@ def _get_info(self, url, video_id): player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, 'Downloading player JSON', - headers={"x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token)}, + headers={ + "x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token) + }, data=urlencode_postdata({ 'autoplay': 0, 'share': 1, @@ -224,7 +226,9 @@ def _get_info(self, url, video_id): 'tokenId': player_token, 'streamType': 'broadcast', }, - data=b"") # empty byte string to force a POST request instead of GET, without it HTTP 405 will happen + data=b"") + # Empty byte string in the call above to force a POST request + # Without it HTTP 405 will happen if not streams: continue stream = streams.get('stream') From 632897860b94c20bab65c9fd0ad81d6ae3ab30c1 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 3 Apr 2023 09:50:21 +0200 Subject: [PATCH 05/47] Accept suggestions on PR; comply with conventions Co-authored-by: dirkf --- youtube_dl/extractor/npo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e8e596be198..84bde9683dd 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -4,10 +4,10 @@ from .common import InfoExtractor from ..compat import ( + compat_cookies_SimpleCookie, compat_HTTPError, compat_str, compat_urllib_parse_unquote_plus, - compat_cookies_SimpleCookie, ) from ..utils import ( determine_ext, @@ -194,20 +194,20 @@ def _get_info(self, url, video_id): }) cookies = compat_cookies_SimpleCookie() cookies.load(xsrf_token_response.headers['Set-Cookie']) - cookies = {k: v.value for k, v in cookies.items()} + cookies = dict((k, v.value) for k, v in cookies.items()) xsrf_token = cookies['XSRF-TOKEN'] player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, 'Downloading player JSON', headers={ - "x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token) + 'x-xsrf-token': compat_urllib_parse_unquote_plus(xsrf_token) }, data=urlencode_postdata({ 'autoplay': 0, 'share': 1, 'pageUrl': url, - 'isFavourite': "false", + 'isFavourite': 'false', 'hasAdConsent': 0, })) @@ -226,7 +226,7 @@ def _get_info(self, url, video_id): 'tokenId': player_token, 'streamType': 'broadcast', }, - data=b"") + data=b'') # Empty byte string in the call above to force a POST request # Without it HTTP 405 will happen if not streams: From 0c7261db901e79aed3dfd20f0b3c99ccbd32d20a Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 6 Apr 2023 01:51:02 +0100 Subject: [PATCH 06/47] Update npo.py * simplify comment * force CI --- youtube_dl/extractor/npo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 84bde9683dd..d6379f1d35b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -226,9 +226,8 @@ def _get_info(self, url, video_id): 'tokenId': player_token, 'streamType': 'broadcast', }, + # empty data to force a POST request, avoiding HTTP 405 data=b'') - # Empty byte string in the call above to force a POST request - # Without it HTTP 405 will happen if not streams: continue stream = streams.get('stream') From da3d1f4321ec0b374b4201e092c085550003aec3 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 10:36:03 +0100 Subject: [PATCH 07/47] Add notes on new npo.nl site --- youtube_dl/extractor/npo.py | 96 ++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 472da54ac0c..aef007e6a2b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -36,6 +36,7 @@ def _get_token(self, video_id): class NPOIE(NPOBaseIE): IE_NAME = 'npo' + # TODO find out if all hosts still work: IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl' _VALID_URL = r'''(?x) (?: @@ -62,6 +63,10 @@ class NPOIE(NPOBaseIE): 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', 'upload_date': '20140622', }, + 'skip': 'Video gone', + }, { + 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', + # TODO other test attributes }, { 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', @@ -73,8 +78,9 @@ class NPOIE(NPOBaseIE): 'upload_date': '20090227', 'duration': 2400, }, + 'skip': 'Video gone', }, { - 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', + 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', 'info_dict': { 'id': 'VPWON_1169289', @@ -95,7 +101,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { # non asf in streams 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', @@ -106,7 +113,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', 'info_dict': { @@ -119,7 +127,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', 'info_dict': { @@ -132,7 +141,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { # audio 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', @@ -148,15 +158,19 @@ class NPOIE(NPOBaseIE): }, { 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', 'only_matching': True, + 'skip': 'Video gone', }, { 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', 'only_matching': True, + 'skip': 'Video gone', }, { 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', 'only_matching': True, + 'skip': 'Video gone', }, { 'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870', 'only_matching': True, + 'skip': 'Video gone', }, { # live stream 'url': 'npo:LI_NL1_4188102', @@ -704,7 +718,6 @@ class VPROIE(NPOPlaylistBaseIE): 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, - 'skip': 'Video gone', }, { 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', @@ -723,6 +736,7 @@ class VPROIE(NPOPlaylistBaseIE): 'title': 'education education', }, 'playlist_count': 2, + 'skip': 'Video gone', }, { 'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html', @@ -778,3 +792,73 @@ class AndereTijdenIE(NPOPlaylistBaseIE): }, 'playlist_count': 3, }] + +############################################################### +# Description of the new process of getting to the stream # +############################################################### + +# Valid URLs for new tests +# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/ +# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/afspelen + +# Step 1: Normalize the URL +# If the URL ends with /afspelen, strip that +# We need the slug in the next stepto find the productId + +# Step 2: Find the productId +# In the contents of the URL is a JSON blob: +# ')[0] + next_data = json.loads(next_data) + product_id, description, thumbnail, title = None, None, None, None + for query in next_data['props']['pageProps']['dehydratedState']['queries']: + if isinstance(query['state']['data'], list): + for entry in query['state']['data']: + print(entry) + try: + if entry['slug'] == slug: + product_id = entry['productId'] + title = entry['title'] + synopsis = entry['synopsis'] + description = synopsis.get('long', synopsis.get('short', synopsis.get('brief', ''))) + thumbnail = entry['images'][0]['url'] + break + except KeyError: continue - video_url = ref.get('href') - if not video_url or video_url in urls: + except IndexError: continue - urls.add(video_url) - formats.append({ - 'url': video_url, - 'ext': stream.get('formaat', 'asf'), - 'quality': stream.get('kwaliteit'), - 'preference': -10, - }) - else: - formats.append({ - 'url': stream_url, - 'quality': stream.get('kwaliteit'), - }) + if not product_id: + raise ExtractorError('No productId found for slug: %s' % slug) + + token = self._get_token(product_id) + + stream_link = self._download_json( + 'https://prod.npoplayer.nl/stream-link', video_id=slug, + data=json.dumps({ + "profileName": "dash", + "drmType": "widevine", + "referrerUrl": url, + }).encode('utf8'), + headers={ + "Authorization": token, + "Content-Type": "application/json", + } + ) - self._sort_formats(formats) + stream_url = stream_link['stream']['streamURL'] - subtitles = {} - if metadata.get('tt888') == 'ja': - subtitles['nl'] = [{ - 'ext': 'vtt', - 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id, - }] + # TODO other formats than dash / mpd + mpd = self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False) return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': metadata.get('info'), - 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], - 'upload_date': unified_strdate(metadata.get('gidsdatum')), - 'duration': parse_duration(metadata.get('tijdsduur')), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, + 'id': slug, + 'formats': mpd, + 'title': title or slug, + 'description': description, + 'thumbnail': thumbnail, + # TODO fill in other metadata that's available } - -############################################################### -# Description of the new process of getting to the stream # -############################################################### - -# Valid URLs for new tests -# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/ -# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/afspelen - -# Step 1: Normalize the URL -# If the URL ends with /afspelen, strip that -# We need the slug in the next stepto find the productId - -# Step 2: Find the productId -# In the contents of the URL is a JSON blob: -# ')[0] next_data = json.loads(next_data) - product_id, description, thumbnail, title = None, None, None, None + product_id, title, description, thumbnail = None, None, None, None for query in next_data['props']['pageProps']['dehydratedState']['queries']: if isinstance(query['state']['data'], list): for entry in query['state']['data']: - print(entry) - try: - if entry['slug'] == slug: - product_id = entry['productId'] - title = entry['title'] - synopsis = entry['synopsis'] - description = synopsis.get('long', synopsis.get('short', synopsis.get('brief', ''))) - thumbnail = entry['images'][0]['url'] - break - except KeyError: - continue - except IndexError: - continue + if entry['slug'] == slug: + product_id = entry.get('productId') + title = entry.get('title') + synopsis = entry.get('synopsis', {}) + description = ( + synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief') + ) + thumbnails = entry.get('images') + for thumbnail_entry in thumbnails: + if 'url' in thumbnail_entry: + thumbnail = thumbnail_entry.get('url') if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) @@ -97,19 +96,18 @@ def _real_extract(self, url): stream_link = self._download_json( 'https://prod.npoplayer.nl/stream-link', video_id=slug, data=json.dumps({ - "profileName": "dash", - "drmType": "widevine", - "referrerUrl": url, + 'profileName': 'dash', + 'drmType': 'widevine', + 'referrerUrl': url, }).encode('utf8'), headers={ - "Authorization": token, - "Content-Type": "application/json", + 'Authorization': token, + 'Content-Type': 'application/json', } ) - stream_url = stream_link['stream']['streamURL'] - # TODO other formats than dash / mpd + stream_url = stream_link.get('stream', {}).get('streamURL') mpd = self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False) return { From fb7b7179ff7ff08ad7e32539c0b0d440e0899903 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 15:08:10 +0100 Subject: [PATCH 12/47] Speculate about other ways of getting productId --- youtube_dl/extractor/npo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 502d276fff9..7f90aa827ed 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -70,6 +70,8 @@ def _real_extract(self, url): page = self._download_webpage(url, slug, 'Finding productId using slug: %s' % slug) # TODO find out what proper HTML parsing utilities are available in youtube-dl next_data = page.split('')[0] + # TODO The data in this script tag feels like GraphQL, so there might be an easier way + # to get the product id, maybe using a GraphQL endpoint next_data = json.loads(next_data) product_id, title, description, thumbnail = None, None, None, None for query in next_data['props']['pageProps']['dehydratedState']['queries']: From f9e59b0c49c8f0fc3951f8ca01705abb46ed51e4 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 15:28:14 +0100 Subject: [PATCH 13/47] Add the possibility to add 'hls' later --- youtube_dl/extractor/npo.py | 43 ++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 7f90aa827ed..3e543e35015 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -82,9 +82,9 @@ def _real_extract(self, url): title = entry.get('title') synopsis = entry.get('synopsis', {}) description = ( - synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief') + synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief') ) thumbnails = entry.get('images') for thumbnail_entry in thumbnails: @@ -95,26 +95,29 @@ def _real_extract(self, url): token = self._get_token(product_id) - stream_link = self._download_json( - 'https://prod.npoplayer.nl/stream-link', video_id=slug, - data=json.dumps({ - 'profileName': 'dash', - 'drmType': 'widevine', - 'referrerUrl': url, - }).encode('utf8'), - headers={ - 'Authorization': token, - 'Content-Type': 'application/json', - } - ) - - # TODO other formats than dash / mpd - stream_url = stream_link.get('stream', {}).get('streamURL') - mpd = self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False) + formats = [] + for profile in ( + 'dash', + # 'hls', # TODO test what needs to change for 'hls' support + ): + stream_link = self._download_json( + 'https://prod.npoplayer.nl/stream-link', video_id=slug, + data=json.dumps({ + 'profileName': profile, + 'drmType': 'widevine', + 'referrerUrl': url, + }).encode('utf8'), + headers={ + 'Authorization': token, + 'Content-Type': 'application/json', + } + ) + stream_url = stream_link.get('stream', {}).get('streamURL') + formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False)) return { 'id': slug, - 'formats': mpd, + 'formats': formats, 'title': title or slug, 'description': description, 'thumbnail': thumbnail, From 8b1a7d9a7c09d7c88fa03f885ebdc5347c007f69 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 16:23:19 +0100 Subject: [PATCH 14/47] Use provided util --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 3e543e35015..e7275e1b338 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -72,7 +72,7 @@ def _real_extract(self, url): next_data = page.split('')[0] # TODO The data in this script tag feels like GraphQL, so there might be an easier way # to get the product id, maybe using a GraphQL endpoint - next_data = json.loads(next_data) + next_data = self._parse_json(next_data, slug) product_id, title, description, thumbnail = None, None, None, None for query in next_data['props']['pageProps']['dehydratedState']['queries']: if isinstance(query['state']['data'], list): From 34b5b2010774fab2cb8984c720fcd7c62110669a Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 3 Mar 2024 17:47:15 +0100 Subject: [PATCH 15/47] Refactor into reusable method --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 60 ++++++++++++++++++++++++------ 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dabcd60cb75..696fd8e1e08 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ NownessSeriesIE, ) from .noz import NozIE -from .npo import NPOIE +from .npo import BNNVaraIE, NPOIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e7275e1b338..3896968611a 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -13,7 +13,6 @@ class NPOIE(InfoExtractor): IE_DESC = 'npo.nl' _VALID_URL = r'''(?x) (?: - npo:| https?:// (?:www\.)? (?: @@ -82,9 +81,9 @@ def _real_extract(self, url): title = entry.get('title') synopsis = entry.get('synopsis', {}) description = ( - synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief') + synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief') ) thumbnails = entry.get('images') for thumbnail_entry in thumbnails: @@ -93,8 +92,19 @@ def _real_extract(self, url): if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) - token = self._get_token(product_id) + formats = self._download_by_product_id(product_id, slug, url) + + return { + 'id': slug, + 'formats': formats, + 'title': title or slug, + 'description': description, + 'thumbnail': thumbnail, + # TODO fill in other metadata that's available + } + def _download_by_product_id(self, product_id, slug, url=None): + token = self._get_token(product_id) formats = [] for profile in ( 'dash', @@ -105,7 +115,7 @@ def _real_extract(self, url): data=json.dumps({ 'profileName': profile, 'drmType': 'widevine', - 'referrerUrl': url, + 'referrerUrl': url or '', }).encode('utf8'), headers={ 'Authorization': token, @@ -114,12 +124,40 @@ def _real_extract(self, url): ) stream_url = stream_link.get('stream', {}).get('streamURL') formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False)) + return formats + + +class BNNVaraIE(NPOIE): + IE_NAME = 'bnnvara' + IE_DESC = 'bnnvara.nl' + _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' + + def _real_extract(self, url): + url = url.rstrip('/') + video_id = url.split('/')[-1] + + media = self._download_json('https://api.bnnvara.nl/bff/graphql', + video_id, + data=json.dumps( + { + 'operationName': 'getMedia', + 'variables': { + 'id': video_id, + 'hasAdConsent': False, + 'atInternetId': 70 + }, + 'query': 'query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) {\n player(\n id: $id\n mediaUrl: $mediaUrl\n hasAdConsent: $hasAdConsent\n atInternetId: $atInternetId\n ) {\n ... on PlayerSucces {\n brand {\n name\n slug\n broadcastsEnabled\n __typename\n }\n title\n programTitle\n pomsProductId\n broadcasters {\n name\n __typename\n }\n duration\n classifications {\n title\n imageUrl\n type\n __typename\n }\n image {\n title\n url\n __typename\n }\n cta {\n title\n url\n __typename\n }\n genres {\n name\n __typename\n }\n subtitles {\n url\n language\n __typename\n }\n sources {\n name\n url\n ratio\n __typename\n }\n type\n token\n __typename\n }\n ... on PlayerError {\n error\n __typename\n }\n __typename\n }\n}' + }).encode('utf8'), + headers={ + 'Content-Type': 'application/json', + }) + product_id = media.get('data', {}).get('player', {}).get('pomsProductId') + + formats = self._download_by_product_id(product_id, video_id) return { - 'id': slug, + 'id': product_id, + 'title': media.get('data', {}).get('player', {}).get('title'), 'formats': formats, - 'title': title or slug, - 'description': description, - 'thumbnail': thumbnail, - # TODO fill in other metadata that's available + 'thumbnail': media.get('data', {}).get('player', {}).get('image').get('url'), } From 4fc423845e8b5f8855fb6e5a0a5087064401b12b Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 12:49:22 +0100 Subject: [PATCH 16/47] Fix lint --- youtube_dl/extractor/npo.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 3896968611a..53fd816f734 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -80,11 +80,9 @@ def _real_extract(self, url): product_id = entry.get('productId') title = entry.get('title') synopsis = entry.get('synopsis', {}) - description = ( - synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief') - ) + description = (synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief')) thumbnails = entry.get('images') for thumbnail_entry in thumbnails: if 'url' in thumbnail_entry: From 28ba01f1ccfc5560be7d027b1669822e44d4143f Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 13:43:56 +0100 Subject: [PATCH 17/47] Add Ongehoord Nederland and test URL for BNNVARA --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 696fd8e1e08..802e498f9dd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE +from .npo import BNNVaraIE, NPOIE, ONIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 53fd816f734..d8573d343e3 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -129,6 +130,9 @@ class BNNVaraIE(NPOIE): IE_NAME = 'bnnvara' IE_DESC = 'bnnvara.nl' _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' + _TESTS = [{ + 'url': 'https://www.bnnvara.nl/videos/27455', + }] def _real_extract(self, url): url = url.rstrip('/') @@ -159,3 +163,29 @@ def _real_extract(self, url): 'formats': formats, 'thumbnail': media.get('data', {}).get('player', {}).get('image').get('url'), } + + +class ONIE(NPOIE): + IE_NAME = 'on' + IE_DESC = 'ongehoordnederland.tv' + _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*' + _TESTS = [{ + 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + page, _ = self._download_webpage_handle(url, video_id) + results = re.findall("page: '(.+)'", page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } From eb6e396bfb66965487ef1e7c50edbf6e28130462 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 13:55:59 +0100 Subject: [PATCH 18/47] First version of a VPRO regex --- youtube_dl/extractor/npo.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index d8573d343e3..d48a4cda078 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -4,9 +4,7 @@ import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) +from ..utils import ExtractorError class NPOIE(InfoExtractor): @@ -189,3 +187,29 @@ def _real_extract(self, url): 'title': video_id, 'formats': formats, } + + +class VPROIE(NPOIE): + IE_NAME = 'vpro' + IE_DESC = 'vpro.nl' + _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' + _TESTS = [{ + 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + page, _ = self._download_webpage_handle(url, video_id) + results = re.findall('data-media-id="(.+_.+)"\s', page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } From d36d50fe5cf166899adfc85e7ca9b0f8f5272d19 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 14:04:03 +0100 Subject: [PATCH 19/47] Re-add Zapp --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 802e498f9dd..b3a9fdfbaf1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE, ONIE +from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index d48a4cda078..84b41443bc3 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -204,6 +204,7 @@ def _real_extract(self, url): formats = [] for result in results: formats.extend(self._download_by_product_id(result, video_id)) + break # TODO find a better solution, VPRO pages can have multiple videos embedded if not formats: raise ExtractorError('Could not find a POMS product id in the provided URL.') @@ -213,3 +214,24 @@ def _real_extract(self, url): 'title': video_id, 'formats': formats, } + + +class ZAPPIE(NPOIE): + IE_NAME = 'zapp' + IE_DESC = 'zapp.nl' + _VALID_URL = r'https?://(?:www\.)?zapp.nl/.*' + + _TESTS = [{ + 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973', + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + + formats = self._download_by_product_id(url, video_id) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } From d426a92a60ba9b6eb01256d3dcad4dcbfecd742c Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 14:11:49 +0100 Subject: [PATCH 20/47] Encoding suggestion from PR --- youtube_dl/extractor/npo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 84b41443bc3..01eb54fc02d 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import json From 3b3d73cbe6f64d6485e03cb658cc491d4fa62333 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 11:52:08 +0100 Subject: [PATCH 21/47] Use program-detail endpoint and remove a test --- youtube_dl/extractor/npo.py | 61 +++++++++++++++---------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 01eb54fc02d..239583b5bd1 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -25,18 +25,6 @@ class NPOIE(InfoExtractor): _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', # TODO fill in other test attributes - }, { - 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', - 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', - 'info_dict': { - 'id': 'VARA_101191800', - 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show: The best of.', - 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', - 'upload_date': '20090227', - 'duration': 2400, - }, - 'skip': 'Video gone', }, { 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', @@ -66,27 +54,21 @@ def _real_extract(self, url): url = url[:-10] url = url.rstrip('/') slug = url.split('/')[-1] - page = self._download_webpage(url, slug, 'Finding productId using slug: %s' % slug) - # TODO find out what proper HTML parsing utilities are available in youtube-dl - next_data = page.split('')[0] - # TODO The data in this script tag feels like GraphQL, so there might be an easier way - # to get the product id, maybe using a GraphQL endpoint - next_data = self._parse_json(next_data, slug) - product_id, title, description, thumbnail = None, None, None, None - for query in next_data['props']['pageProps']['dehydratedState']['queries']: - if isinstance(query['state']['data'], list): - for entry in query['state']['data']: - if entry['slug'] == slug: - product_id = entry.get('productId') - title = entry.get('title') - synopsis = entry.get('synopsis', {}) - description = (synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief')) - thumbnails = entry.get('images') - for thumbnail_entry in thumbnails: - if 'url' in thumbnail_entry: - thumbnail = thumbnail_entry.get('url') + + program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', + slug, + query={'slug': slug}) + product_id = program_metadata.get('productId') + images = program_metadata.get('images') + thumbnail = None + for image in images: + thumbnail = image.get('url') + break + title = program_metadata.get('title') + descriptions = program_metadata.get('description', {}) + description = descriptions.get('long') or descriptions.get('short') or descriptions.get('brief') + duration = program_metadata.get('durationInSeconds') + if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) @@ -96,17 +78,18 @@ def _real_extract(self, url): 'id': slug, 'formats': formats, 'title': title or slug, - 'description': description, + 'description': description or title or slug, 'thumbnail': thumbnail, - # TODO fill in other metadata that's available + 'duration': duration, } def _download_by_product_id(self, product_id, slug, url=None): token = self._get_token(product_id) formats = [] for profile in ( - 'dash', - # 'hls', # TODO test what needs to change for 'hls' support + 'dash', + # 'hls' is available too, but implementing it doesn't add much + # As far as I know 'dash' is always available ): stream_link = self._download_json( 'https://prod.npoplayer.nl/stream-link', video_id=slug, @@ -131,6 +114,7 @@ class BNNVaraIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' _TESTS = [{ 'url': 'https://www.bnnvara.nl/videos/27455', + # TODO fill in other test attributes }] def _real_extract(self, url): @@ -170,6 +154,7 @@ class ONIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*' _TESTS = [{ 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', + # TODO fill in other test attributes }] def _real_extract(self, url): @@ -196,6 +181,7 @@ class VPROIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' _TESTS = [{ 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', + # TODO fill in other test attributes }] def _real_extract(self, url): @@ -224,6 +210,7 @@ class ZAPPIE(NPOIE): _TESTS = [{ 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973', + # TODO fill in other test attributes }] def _real_extract(self, url): From 4b24e5f00da0b11f3e2989d5a568e862285d34ea Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:22:27 +0100 Subject: [PATCH 22/47] Re-add SchoolTV --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 42 +++++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b3a9fdfbaf1..5f2ac7ced47 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE +from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE, SchoolTVIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 239583b5bd1..a28915bd08f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -87,9 +87,9 @@ def _download_by_product_id(self, product_id, slug, url=None): token = self._get_token(product_id) formats = [] for profile in ( - 'dash', - # 'hls' is available too, but implementing it doesn't add much - # As far as I know 'dash' is always available + 'dash', + # 'hls' is available too, but implementing it doesn't add much + # As far as I know 'dash' is always available ): stream_link = self._download_json( 'https://prod.npoplayer.nl/stream-link', video_id=slug, @@ -223,3 +223,39 @@ def _real_extract(self, url): 'title': video_id, 'formats': formats, } + + +class SchoolTVIE(NPOIE): + IE_NAME = 'schooltv' + IE_DESC = 'schooltv.nl' + _VALID_URL = r'https?://(?:www\.)?schooltv.nl/item/.*' + + _TESTS = [{ + 'url': 'https://schooltv.nl/item/zapp-music-challenge-2015-zapp-music-challenge-2015', + # TODO fill in other test attributes + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + + build_id = 'b7eHUzAVO7wHXCopYxQhV' + + metadata_url = 'https://schooltv.nl/_next/data/' \ + + build_id \ + + '/item/' \ + + video_id + '.json' + + metadata = self._download_json(metadata_url, + video_id).get('pageProps', {}).get('data', {}) + + formats = self._download_by_product_id(metadata.get('poms_mid'), video_id) + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL.') + + return { + 'id': video_id, + 'title': metadata.get('title', '') + ' - ' + metadata.get('subtitle', ''), + 'description': metadata.get('description') or metadata.get('short_description'), + 'formats': formats, + } From 681b39032ae34709a74c5a4ab8f0d2275aab6880 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:32:34 +0100 Subject: [PATCH 23/47] Fix flake8 and better error reporting --- youtube_dl/extractor/npo.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a28915bd08f..c4e4097e34b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -166,7 +166,8 @@ def _real_extract(self, url): formats.extend(self._download_by_product_id(result, video_id)) if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL.') + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') return { 'id': video_id, @@ -187,14 +188,15 @@ class VPROIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] page, _ = self._download_webpage_handle(url, video_id) - results = re.findall('data-media-id="(.+_.+)"\s', page) + results = re.findall(r'data-media-id="(.+_.+)"\s', page) formats = [] for result in results: formats.extend(self._download_by_product_id(result, video_id)) break # TODO find a better solution, VPRO pages can have multiple videos embedded if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL.') + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') return { 'id': video_id, @@ -238,6 +240,8 @@ class SchoolTVIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] + # TODO Find out how we could obtain this automatically + # Otherwise this extractor might break each time SchoolTV deploys a new release build_id = 'b7eHUzAVO7wHXCopYxQhV' metadata_url = 'https://schooltv.nl/_next/data/' \ @@ -251,7 +255,8 @@ def _real_extract(self, url): formats = self._download_by_product_id(metadata.get('poms_mid'), video_id) if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL.') + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') return { 'id': video_id, From 159f825edd6326fda7f43fb27d13db6cd2bbc4ca Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:53:37 +0100 Subject: [PATCH 24/47] Add scaffolding for last few extractors and change order so the PR diff looks nice --- youtube_dl/extractor/npo.py | 91 +++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c4e4097e34b..196ab9d1b11 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -11,16 +11,7 @@ class NPOIE(InfoExtractor): IE_NAME = 'npo' IE_DESC = 'npo.nl' - _VALID_URL = r'''(?x) - (?: - https?:// - (?:www\.)? - (?: - npo\.nl/(?:[^/]+/)* - ) - ) - (?P[^/?#]+) - ''' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/.*' _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', @@ -176,35 +167,6 @@ def _real_extract(self, url): } -class VPROIE(NPOIE): - IE_NAME = 'vpro' - IE_DESC = 'vpro.nl' - _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' - _TESTS = [{ - 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', - # TODO fill in other test attributes - }] - - def _real_extract(self, url): - video_id = url.rstrip('/').split('/')[-1] - page, _ = self._download_webpage_handle(url, video_id) - results = re.findall(r'data-media-id="(.+_.+)"\s', page) - formats = [] - for result in results: - formats.extend(self._download_by_product_id(result, video_id)) - break # TODO find a better solution, VPRO pages can have multiple videos embedded - - if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL, ' - 'perhaps because all stream URLs are DRM protected.') - - return { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - class ZAPPIE(NPOIE): IE_NAME = 'zapp' IE_DESC = 'zapp.nl' @@ -264,3 +226,54 @@ def _real_extract(self, url): 'description': metadata.get('description') or metadata.get('short_description'), 'formats': formats, } + + +class HetKlokhuisIE(NPOIE): + ... + + def _real_extract(self, url): + ... + + +class VPROIE(NPOIE): + IE_NAME = 'vpro' + IE_DESC = 'vpro.nl' + _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' + _TESTS = [{ + 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', + # TODO fill in other test attributes + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + page, _ = self._download_webpage_handle(url, video_id) + results = re.findall(r'data-media-id="(.+_.+)"\s', page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + break # TODO find a better solution, VPRO pages can have multiple videos embedded + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class WNLIE(NPOIE): + ... + + def _real_extract(self, url): + ... + + +class AndereTijdenIE(NPOIE): + ... + + def _real_extract(self, url): + ... + From 0cbcd1aec656998d44dbffe59cbb0adac4b84b45 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:55:51 +0100 Subject: [PATCH 25/47] Make diff better --- youtube_dl/extractor/extractors.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5f2ac7ced47..b1093a1ac0e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,16 @@ NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE, SchoolTVIE +from .npo import ( + AndereTijdenIE, + BNNVaraIE, + NPOIE, + ONIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) from .npr import NprIE from .nrk import ( NRKIE, From 0ab79c37ae2c465678276bef0e9032efb30f464b Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 7 Mar 2024 16:23:09 +0100 Subject: [PATCH 26/47] Reusable code for two NTR sites --- youtube_dl/extractor/npo.py | 53 +++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 196ab9d1b11..77411da5215 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -228,11 +228,35 @@ def _real_extract(self, url): } -class HetKlokhuisIE(NPOIE): - ... - +class NTRSubsiteIE(NPOIE): def _real_extract(self, url): - ... + video_id = url.rstrip('/').split('/')[-1] + + page, _ = self._download_webpage_handle(url) + results = re.findall(r'data-mid="(.+_.+)"', page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + break + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class HetKlokhuisIE(NTRSubsiteIE): + IE_NAME = 'het-klokhuis' + IE_DESC = 'hetklokhuis.nl' + _VALID_URL = r'https?://(?:www\.)?het-klokhuis\.nl/.*' + _TESTS = [{ + 'url': 'https://hetklokhuis.nl/dossier/142/zoek-het-uit/tv-uitzending/2987/aliens' + }] class VPROIE(NPOIE): @@ -264,16 +288,11 @@ def _real_extract(self, url): } -class WNLIE(NPOIE): - ... - - def _real_extract(self, url): - ... - - -class AndereTijdenIE(NPOIE): - ... - - def _real_extract(self, url): - ... - +class AndereTijdenIE(NTRSubsiteIE): + IE_NAME = 'anderetijden' + IE_DESC = 'anderetijden.nl' + _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/.*' + _TESTS = [{ + 'url': 'https://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem' + # TODO fill in other test attributes + }] From c08f29f45b6b7f41127c8d9260617de7d69430f9 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 10 Mar 2024 16:27:40 +0100 Subject: [PATCH 27/47] Update unit tests --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/npo.py | 37 ++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1093a1ac0e..e5c9af8ba40 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -855,7 +855,6 @@ SchoolTVIE, HetKlokhuisIE, VPROIE, - WNLIE, ) from .npr import NprIE from .nrk import ( diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 77411da5215..f5f7485735f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -15,17 +15,24 @@ class NPOIE(InfoExtractor): _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', - # TODO fill in other test attributes + 'md5': 'f9ce9c43cc8bc3b8138df1562b99c379', + 'info_dict': { + 'description': 'Wie is de mol? (2)', + 'ext': 'm4v', + 'duration': 2439, + 'id': 'wie-is-de-mol-2', + 'thumbnail': 'https://assets-start.npo.nl/resources/2023/07/01/e723c3cf-3e42-418a-9ba5-f6dbb64b516a.jpg', + 'title': 'Wie is de mol? (2)' + } }, { 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', - 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'md5': 'c84d054219c4888ed53b4ee3d01b2d93', 'info_dict': { - 'id': 'VPWON_1169289', - 'ext': 'm4v', - 'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika', - 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', - 'upload_date': '20130225', - 'duration': 3000, + 'id': 'zwart-geld-de-toekomst-komt-uit-afrika', + 'title': 'Zwart geld: de toekomst komt uit Afrika', + 'description': 'Zwart geld: de toekomst komt uit Afrika', + 'thumbnail': 'https://assets-start.npo.nl/resources/2023/06/30/d9879593-1944-4249-990c-1561dac14d8e.jpg', + 'duration': 3000 }, }] @@ -105,7 +112,12 @@ class BNNVaraIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' _TESTS = [{ 'url': 'https://www.bnnvara.nl/videos/27455', - # TODO fill in other test attributes + 'md5': '392dd367877739e49b9e0a9a550b178a', + 'info_dict': { + 'id': 'VARA_101369808', + 'thumbnail': 'https://media.vara.nl/files/thumbnails/321291_custom_zembla__wie_is_de_mol_680x383.jpg', + 'title': 'Zembla - Wie is de mol?' + } }] def _real_extract(self, url): @@ -265,7 +277,12 @@ class VPROIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' _TESTS = [{ 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', - # TODO fill in other test attributes + 'md5': 'cf302e066b5313cfaf8d5adf50d64f13', + 'info_dict': { + 'id': 'offline-als-luxe.html', + 'title': 'offline-als-luxe.html', + 'ext': 'm4v', + } }] def _real_extract(self, url): From 28624cfe0930655b815f40d4b4820f76728de65e Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 10 Mar 2024 16:57:31 +0100 Subject: [PATCH 28/47] Work work --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/npo.py | 28 +++++++++++++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5c9af8ba40..1a1905d13d7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -855,6 +855,7 @@ SchoolTVIE, HetKlokhuisIE, VPROIE, + ZAPPIE, ) from .npr import NprIE from .nrk import ( diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index f5f7485735f..699eedf1227 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -208,7 +208,12 @@ class SchoolTVIE(NPOIE): _TESTS = [{ 'url': 'https://schooltv.nl/item/zapp-music-challenge-2015-zapp-music-challenge-2015', - # TODO fill in other test attributes + 'md5': 'e9ef151c4886994e2bea23593348cb14', + 'info_dict': { + 'id': 'zapp-music-challenge-2015-zapp-music-challenge-2015', + 'title': 'Zapp Music Challenge 2015 - Alain Clark & Yaell', + 'description': "Een nummer schrijven met de super bekende soulzanger en producer Alain Clark? Dat is de uitdaging voor de dertienjarige Yaell uit Delft. En als het dan echt goed is, mag hij het ook nog eens live gaan spelen op de speelplaats bij Giel Beelen! Muziek is heel erg belangrijk in het leven van Yaell. 'Als er geen muziek zou zijn, dan zou ik heel veel niet kunnen.' Hij is dan ook altijd aan het schrijven, vaak over zijn eigen leven. Maar soms is het best lastig om die teksten te verzinnen. Vindt hij de inspiratie om een hit te maken met Alain?" + }, }] def _real_extract(self, url): @@ -244,7 +249,7 @@ class NTRSubsiteIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - page, _ = self._download_webpage_handle(url) + page, _ = self._download_webpage_handle(url, video_id) results = re.findall(r'data-mid="(.+_.+)"', page) formats = [] for result in results: @@ -263,11 +268,16 @@ def _real_extract(self, url): class HetKlokhuisIE(NTRSubsiteIE): - IE_NAME = 'het-klokhuis' + IE_NAME = 'hetklokhuis' IE_DESC = 'hetklokhuis.nl' - _VALID_URL = r'https?://(?:www\.)?het-klokhuis\.nl/.*' + _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/.*' _TESTS = [{ - 'url': 'https://hetklokhuis.nl/dossier/142/zoek-het-uit/tv-uitzending/2987/aliens' + 'url': 'https://hetklokhuis.nl/dossier/142/zoek-het-uit/tv-uitzending/2987/aliens', + 'md5': '4664b54ed4e05183b1e4f2f4290d551e', + 'info_dict': { + 'id': 'aliens', + 'title': 'aliens' + } }] @@ -310,6 +320,10 @@ class AndereTijdenIE(NTRSubsiteIE): IE_DESC = 'anderetijden.nl' _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/.*' _TESTS = [{ - 'url': 'https://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem' - # TODO fill in other test attributes + 'url': 'https://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'md5': '3d607b16e00b459156b4ab6e163dccd7', + 'info_dict': { + 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem' + } }] From 1ca4e686a3f9001cb52c8b682b57c1fba65700db Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 10 Mar 2024 17:04:00 +0100 Subject: [PATCH 29/47] Add an MD5 --- youtube_dl/extractor/npo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 699eedf1227..f4cd137ff93 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -157,7 +157,10 @@ class ONIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*' _TESTS = [{ 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', - # TODO fill in other test attributes + 'md5': 'a85ebd50fa86fe5cbce654655f7dbb12', + 'info_dict': { + + } }] def _real_extract(self, url): From 4398f6832f76948ee79025f0e055117182d1dfb3 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 11 Mar 2024 13:40:23 +0100 Subject: [PATCH 30/47] Fix zapp extractor --- youtube_dl/extractor/npo.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index f4cd137ff93..a5413a1d748 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -99,7 +99,8 @@ def _download_by_product_id(self, product_id, slug, url=None): headers={ 'Authorization': token, 'Content-Type': 'application/json', - } + }, + fatal=False, ) stream_url = stream_link.get('stream', {}).get('streamURL') formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False)) @@ -188,14 +189,18 @@ class ZAPPIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?zapp.nl/.*' _TESTS = [{ - 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973', - # TODO fill in other test attributes + 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/POMS_AT_811523', + 'md5': '9eb2d8b6f88b72b6b986ea2c26a81588', + 'info_dict': { + 'id': 'POMS_AT_811523', + 'title': 'POMS_AT_811523', + }, }] def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - formats = self._download_by_product_id(url, video_id) + formats = self._download_by_product_id(video_id, video_id, url=url) return { 'id': video_id, @@ -279,8 +284,8 @@ class HetKlokhuisIE(NTRSubsiteIE): 'md5': '4664b54ed4e05183b1e4f2f4290d551e', 'info_dict': { 'id': 'aliens', - 'title': 'aliens' - } + 'title': 'aliens', + }, }] @@ -295,7 +300,7 @@ class VPROIE(NPOIE): 'id': 'offline-als-luxe.html', 'title': 'offline-als-luxe.html', 'ext': 'm4v', - } + }, }] def _real_extract(self, url): @@ -327,6 +332,6 @@ class AndereTijdenIE(NTRSubsiteIE): 'md5': '3d607b16e00b459156b4ab6e163dccd7', 'info_dict': { 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', - 'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem' - } + 'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + }, }] From 58d7a00e3f07744b65ad53d12fcee1ec0050de74 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 11 Mar 2024 14:14:38 +0100 Subject: [PATCH 31/47] Resolve some of the pull request feedback --- youtube_dl/extractor/npo.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a5413a1d748..ea1e0fd2bad 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -18,8 +18,8 @@ class NPOIE(InfoExtractor): 'md5': 'f9ce9c43cc8bc3b8138df1562b99c379', 'info_dict': { 'description': 'Wie is de mol? (2)', - 'ext': 'm4v', 'duration': 2439, + 'ext': 'm4v', 'id': 'wie-is-de-mol-2', 'thumbnail': 'https://assets-start.npo.nl/resources/2023/07/01/e723c3cf-3e42-418a-9ba5-f6dbb64b516a.jpg', 'title': 'Wie is de mol? (2)' @@ -30,6 +30,7 @@ class NPOIE(InfoExtractor): 'info_dict': { 'id': 'zwart-geld-de-toekomst-komt-uit-afrika', 'title': 'Zwart geld: de toekomst komt uit Afrika', + 'ext': 'mp4', 'description': 'Zwart geld: de toekomst komt uit Afrika', 'thumbnail': 'https://assets-start.npo.nl/resources/2023/06/30/d9879593-1944-4249-990c-1561dac14d8e.jpg', 'duration': 3000 @@ -70,7 +71,7 @@ def _real_extract(self, url): if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) - formats = self._download_by_product_id(product_id, slug, url) + formats = self._extract_formats_by_product_id(product_id, slug, url) return { 'id': slug, @@ -81,7 +82,7 @@ def _real_extract(self, url): 'duration': duration, } - def _download_by_product_id(self, product_id, slug, url=None): + def _extract_formats_by_product_id(self, product_id, slug, url=None): token = self._get_token(product_id) formats = [] for profile in ( @@ -93,7 +94,6 @@ def _download_by_product_id(self, product_id, slug, url=None): 'https://prod.npoplayer.nl/stream-link', video_id=slug, data=json.dumps({ 'profileName': profile, - 'drmType': 'widevine', 'referrerUrl': url or '', }).encode('utf8'), headers={ @@ -117,7 +117,8 @@ class BNNVaraIE(NPOIE): 'info_dict': { 'id': 'VARA_101369808', 'thumbnail': 'https://media.vara.nl/files/thumbnails/321291_custom_zembla__wie_is_de_mol_680x383.jpg', - 'title': 'Zembla - Wie is de mol?' + 'title': 'Zembla - Wie is de mol?', + 'ext': 'mp4', } }] @@ -142,7 +143,7 @@ def _real_extract(self, url): }) product_id = media.get('data', {}).get('player', {}).get('pomsProductId') - formats = self._download_by_product_id(product_id, video_id) + formats = self._extract_formats_by_product_id(product_id, video_id) return { 'id': product_id, @@ -170,7 +171,7 @@ def _real_extract(self, url): results = re.findall("page: '(.+)'", page) formats = [] for result in results: - formats.extend(self._download_by_product_id(result, video_id)) + formats.extend(self._extract_formats_by_product_id(result, video_id)) if not formats: raise ExtractorError('Could not find a POMS product id in the provided URL, ' @@ -200,7 +201,7 @@ class ZAPPIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - formats = self._download_by_product_id(video_id, video_id, url=url) + formats = self._extract_formats_by_product_id(video_id, video_id, url=url) return { 'id': video_id, @@ -239,7 +240,7 @@ def _real_extract(self, url): metadata = self._download_json(metadata_url, video_id).get('pageProps', {}).get('data', {}) - formats = self._download_by_product_id(metadata.get('poms_mid'), video_id) + formats = self._extract_formats_by_product_id(metadata.get('poms_mid'), video_id) if not formats: raise ExtractorError('Could not find a POMS product id in the provided URL, ' @@ -261,7 +262,7 @@ def _real_extract(self, url): results = re.findall(r'data-mid="(.+_.+)"', page) formats = [] for result in results: - formats.extend(self._download_by_product_id(result, video_id)) + formats.extend(self._extract_formats_by_product_id(result, video_id)) break if not formats: @@ -309,7 +310,7 @@ def _real_extract(self, url): results = re.findall(r'data-media-id="(.+_.+)"\s', page) formats = [] for result in results: - formats.extend(self._download_by_product_id(result, video_id)) + formats.extend(self._extract_formats_by_product_id(result, video_id)) break # TODO find a better solution, VPRO pages can have multiple videos embedded if not formats: From ad64f3751e74c5ee2bbe45a6d5110813dbdd77f3 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:34:33 +0100 Subject: [PATCH 32/47] Improve regex Co-authored-by: Roy --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index ea1e0fd2bad..27582ae9f18 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -307,7 +307,7 @@ class VPROIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] page, _ = self._download_webpage_handle(url, video_id) - results = re.findall(r'data-media-id="(.+_.+)"\s', page) + results = re.findall(r'data-media-id="([a-zA-Z0-9_]+)"\s', page) formats = [] for result in results: formats.extend(self._extract_formats_by_product_id(result, video_id)) From bc86c5f73b189a3ab5caa0f63d62ed8e3b70d741 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:37:41 +0100 Subject: [PATCH 33/47] Make regex more specific and remove redundant .* --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 27582ae9f18..4651e68685f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -11,7 +11,7 @@ class NPOIE(InfoExtractor): IE_NAME = 'npo' IE_DESC = 'npo.nl' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/.*' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/start/serie/' _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', From 4c90b2f5875593af17dff13f96b8b05791f64a21 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:39:59 +0100 Subject: [PATCH 34/47] Adhere to code style Co-authored-by: dirkf --- youtube_dl/extractor/npo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 4651e68685f..4a70e251b7f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -55,8 +55,7 @@ def _real_extract(self, url): slug = url.split('/')[-1] program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', - slug, - query={'slug': slug}) + slug, query={'slug': slug}) product_id = program_metadata.get('productId') images = program_metadata.get('images') thumbnail = None From 007bbeacd78e0d158f684b5a8833d6425a0312f9 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:41:01 +0100 Subject: [PATCH 35/47] Remove afspelen and trailing slashes with one regex Co-authored-by: dirkf --- youtube_dl/extractor/npo.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 4a70e251b7f..545e585099e 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -44,14 +44,8 @@ def _get_token(self, video_id): note='Downloading token')['token'] def _real_extract(self, url): - # You might want to use removesuffix here, - # but removesuffix is introduced in Python 3.9 - # and youtube-dl supports Python 3.2+ - if url.endswith('/afspelen'): - url = url[:-9] - elif url.endswith('/afspelen/'): - url = url[:-10] - url = url.rstrip('/') + # Remove /afspelen and/or any trailing `/`s + url = re.sub(r'/(?:afspelen)?/*$', '', url) slug = url.split('/')[-1] program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', From a60972e253dfe88c81601eaa2e2899afbc4c29fd Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 15 Mar 2024 13:02:56 +0100 Subject: [PATCH 36/47] Fix indent from suggestion --- youtube_dl/extractor/npo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 545e585099e..4dbab16ab47 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -44,8 +44,8 @@ def _get_token(self, video_id): note='Downloading token')['token'] def _real_extract(self, url): - # Remove /afspelen and/or any trailing `/`s - url = re.sub(r'/(?:afspelen)?/*$', '', url) + # Remove /afspelen and/or any trailing `/`s + url = re.sub(r'/(?:afspelen)?/*$', '', url) slug = url.split('/')[-1] program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', From ad6ee6fdd2548cc153d85c74675a941699437a25 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sat, 21 Sep 2024 21:58:53 +0200 Subject: [PATCH 37/47] Commit two suggestions from the PR Co-authored-by: dirkf --- youtube_dl/extractor/npo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 4dbab16ab47..bfa96e6a787 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -95,7 +95,7 @@ def _extract_formats_by_product_id(self, product_id, slug, url=None): }, fatal=False, ) - stream_url = stream_link.get('stream', {}).get('streamURL') + stream_url = traverse_obj(stream_link, ('stream', 'streamURL')) formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False)) return formats @@ -160,7 +160,7 @@ class ONIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - page, _ = self._download_webpage_handle(url, video_id) + page = self._download_webpage(url, video_id) results = re.findall("page: '(.+)'", page) formats = [] for result in results: From bf91db4846df82f474938fbe091055fc82eb8a0d Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sat, 21 Sep 2024 20:04:50 +0000 Subject: [PATCH 38/47] Use suggested util --- youtube_dl/extractor/npo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index bfa96e6a787..70e297e19d9 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ExtractorError, join_nonempty class NPOIE(InfoExtractor): @@ -241,7 +241,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': metadata.get('title', '') + ' - ' + metadata.get('subtitle', ''), + 'title': join_nonempty('title', 'subtitle', from_dict=metadata), 'description': metadata.get('description') or metadata.get('short_description'), 'formats': formats, } From 6de650f51fc9da6ae8a261b3f57e26f4ec78a2d1 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sat, 21 Sep 2024 20:08:53 +0000 Subject: [PATCH 39/47] Use traverse_obj in another place as well --- youtube_dl/extractor/npo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 70e297e19d9..e130f2dbdab 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor -from ..utils import ExtractorError, join_nonempty +from ..utils import ExtractorError, join_nonempty, traverse_obj class NPOIE(InfoExtractor): @@ -140,9 +140,9 @@ def _real_extract(self, url): return { 'id': product_id, - 'title': media.get('data', {}).get('player', {}).get('title'), + 'title': traverse_obj(media, ('data', 'player', 'title')), 'formats': formats, - 'thumbnail': media.get('data', {}).get('player', {}).get('image').get('url'), + 'thumbnail': traverse_obj(media, ('data', 'player', 'image', 'url')), } From c748eca829194de5aba9a66035549f2b9112aba2 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 11:49:09 +0200 Subject: [PATCH 40/47] Automatically obtain NextJS buildId and change item to video-item --- youtube_dl/extractor/npo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e130f2dbdab..8299bfb168b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -221,13 +221,14 @@ class SchoolTVIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - # TODO Find out how we could obtain this automatically - # Otherwise this extractor might break each time SchoolTV deploys a new release - build_id = 'b7eHUzAVO7wHXCopYxQhV' + build_id = self._search_nextjs_data( + self._download_webpage(url, video_id), + video_id, + )['buildId'] metadata_url = 'https://schooltv.nl/_next/data/' \ + build_id \ - + '/item/' \ + + '/video-item/' \ + video_id + '.json' metadata = self._download_json(metadata_url, @@ -304,7 +305,7 @@ def _real_extract(self, url): formats = [] for result in results: formats.extend(self._extract_formats_by_product_id(result, video_id)) - break # TODO find a better solution, VPRO pages can have multiple videos embedded + break if not formats: raise ExtractorError('Could not find a POMS product id in the provided URL, ' From 41157b2b49914e1d786d86972122e1d4ebbbb6b5 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 12:00:44 +0200 Subject: [PATCH 41/47] Move GraphQL query into separate variable --- youtube_dl/extractor/npo.py | 66 ++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 8299bfb168b..a53a3c3d92e 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -118,6 +118,70 @@ class BNNVaraIE(NPOIE): def _real_extract(self, url): url = url.rstrip('/') video_id = url.split('/')[-1] + graphql_query = """query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) { + player( + id: $id + mediaUrl: $mediaUrl + hasAdConsent: $hasAdConsent + atInternetId: $atInternetId + ) { + ... on PlayerSucces { + brand { + name + slug + broadcastsEnabled + __typename + } + title + programTitle + pomsProductId + broadcasters { + name + __typename + } + duration + classifications { + title + imageUrl + type + __typename + } + image { + title + url + __typename + } + cta { + title + url + __typename + } + genres { + name + __typename + } + subtitles { + url + language + __typename + } + sources { + name + url + ratio + __typename + } + type + token + __typename + } + ... on PlayerError { + error + __typename + } + __typename + } +}""" media = self._download_json('https://api.bnnvara.nl/bff/graphql', video_id, @@ -129,7 +193,7 @@ def _real_extract(self, url): 'hasAdConsent': False, 'atInternetId': 70 }, - 'query': 'query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) {\n player(\n id: $id\n mediaUrl: $mediaUrl\n hasAdConsent: $hasAdConsent\n atInternetId: $atInternetId\n ) {\n ... on PlayerSucces {\n brand {\n name\n slug\n broadcastsEnabled\n __typename\n }\n title\n programTitle\n pomsProductId\n broadcasters {\n name\n __typename\n }\n duration\n classifications {\n title\n imageUrl\n type\n __typename\n }\n image {\n title\n url\n __typename\n }\n cta {\n title\n url\n __typename\n }\n genres {\n name\n __typename\n }\n subtitles {\n url\n language\n __typename\n }\n sources {\n name\n url\n ratio\n __typename\n }\n type\n token\n __typename\n }\n ... on PlayerError {\n error\n __typename\n }\n __typename\n }\n}' + 'query': graphql_query }).encode('utf8'), headers={ 'Content-Type': 'application/json', From c3026dd70c4a0d74dc6079331cd037ed6fa7a479 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 12:08:50 +0200 Subject: [PATCH 42/47] Apply suggestion from PR --- youtube_dl/extractor/npo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a53a3c3d92e..4bb70ad5339 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -198,9 +198,10 @@ def _real_extract(self, url): headers={ 'Content-Type': 'application/json', }) - product_id = media.get('data', {}).get('player', {}).get('pomsProductId') - - formats = self._extract_formats_by_product_id(product_id, video_id) + + product_id = traverse_obj(media, ('data', 'player', 'pomsProductId')) + formats = self._download_by_product_id(product_id, video_id) if product_id else [] + self._sort_formats(formats) return { 'id': product_id, From 7f1c09bea173bd9bb6a81d793e85e49ddd0882c0 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 12:11:30 +0200 Subject: [PATCH 43/47] Use _sort_formats util --- youtube_dl/extractor/npo.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 4bb70ad5339..96e854ae6ba 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -198,7 +198,7 @@ def _real_extract(self, url): headers={ 'Content-Type': 'application/json', }) - + product_id = traverse_obj(media, ('data', 'player', 'pomsProductId')) formats = self._download_by_product_id(product_id, video_id) if product_id else [] self._sort_formats(formats) @@ -231,9 +231,7 @@ def _real_extract(self, url): for result in results: formats.extend(self._extract_formats_by_product_id(result, video_id)) - if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL, ' - 'perhaps because all stream URLs are DRM protected.') + self._sort_formats(formats) return { 'id': video_id, From 0e1a0cfa03dcc0f089525cb0a2bdc82364927cc1 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 12:28:10 +0200 Subject: [PATCH 44/47] Apply some more PR feedback --- youtube_dl/extractor/npo.py | 44 +++++++++++++------------------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 96e854ae6ba..98095ac6b19 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -50,30 +50,22 @@ def _real_extract(self, url): program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', slug, query={'slug': slug}) - product_id = program_metadata.get('productId') - images = program_metadata.get('images') - thumbnail = None - for image in images: - thumbnail = image.get('url') - break - title = program_metadata.get('title') - descriptions = program_metadata.get('description', {}) - description = descriptions.get('long') or descriptions.get('short') or descriptions.get('brief') - duration = program_metadata.get('durationInSeconds') - + product_id = traverse_obj(program_metadata, 'productId') if not product_id: - raise ExtractorError('No productId found for slug: %s' % slug) - + raise ExtractorError('No productId found for slug: %s' % (slug,)) formats = self._extract_formats_by_product_id(product_id, slug, url) - - return { + self._sort_formats(formats) + return merge_dicts(traverse_obj(program_metadata, { + 'title': 'title', + 'description': (('description', ('long', 'short', 'brief')), 'title'), + 'thumbnail': ('images', Ellipsis, 'url', T(url_or_none)), + 'duration': ('durationInSeconds', T(int_or_none)), + }, get_all=False), { 'id': slug, 'formats': formats, - 'title': title or slug, - 'description': description or title or slug, - 'thumbnail': thumbnail, - 'duration': duration, - } + 'title': slug, + 'description': slug, + }) def _extract_formats_by_product_id(self, product_id, slug, url=None): token = self._get_token(product_id) @@ -299,9 +291,7 @@ def _real_extract(self, url): formats = self._extract_formats_by_product_id(metadata.get('poms_mid'), video_id) - if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL, ' - 'perhaps because all stream URLs are DRM protected.') + self._sort_formats(formats) return { 'id': video_id, @@ -322,9 +312,7 @@ def _real_extract(self, url): formats.extend(self._extract_formats_by_product_id(result, video_id)) break - if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL, ' - 'perhaps because all stream URLs are DRM protected.') + self._sort_formats(formats) return { 'id': video_id, @@ -370,9 +358,7 @@ def _real_extract(self, url): formats.extend(self._extract_formats_by_product_id(result, video_id)) break - if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL, ' - 'perhaps because all stream URLs are DRM protected.') + self._sort_formats(formats) return { 'id': video_id, From 817e2e5938707ae1ff2bde374b03d4f3265c6cae Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 12:37:48 +0200 Subject: [PATCH 45/47] Fix some missing imports --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 98095ac6b19..8d8499e8411 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor -from ..utils import ExtractorError, join_nonempty, traverse_obj +from ..utils import ExtractorError, int_or_none, join_nonempty, merge_dicts, traverse_obj, url_or_none, T class NPOIE(InfoExtractor): From 75266ce4ed190192082cc86f8e17b16d984873d9 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 13:41:53 +0200 Subject: [PATCH 46/47] Fix old metadata reference --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 8d8499e8411..89403a9d897 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -192,7 +192,7 @@ def _real_extract(self, url): }) product_id = traverse_obj(media, ('data', 'player', 'pomsProductId')) - formats = self._download_by_product_id(product_id, video_id) if product_id else [] + formats = self._extract_formats_by_product_id(product_id, video_id) if product_id else [] self._sort_formats(formats) return { From 6f271423e8564fd503f0430fdbd627923503ca43 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 20 Oct 2024 14:38:03 +0200 Subject: [PATCH 47/47] Update tests --- youtube_dl/extractor/npo.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 89403a9d897..a6281f2a41c 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -17,23 +17,23 @@ class NPOIE(InfoExtractor): 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', 'md5': 'f9ce9c43cc8bc3b8138df1562b99c379', 'info_dict': { - 'description': 'Wie is de mol? (2)', + 'title': 'Wie is de mol? (2)', + 'thumbnail': 'https://assets-start.npo.nl/resources/2023/07/01/e723c3cf-3e42-418a-9ba5-f6dbb64b516a.jpg', 'duration': 2439, - 'ext': 'm4v', 'id': 'wie-is-de-mol-2', - 'thumbnail': 'https://assets-start.npo.nl/resources/2023/07/01/e723c3cf-3e42-418a-9ba5-f6dbb64b516a.jpg', - 'title': 'Wie is de mol? (2)' + 'description': 'wie-is-de-mol-2', + 'ext': 'mp4', } }, { 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', 'md5': 'c84d054219c4888ed53b4ee3d01b2d93', 'info_dict': { - 'id': 'zwart-geld-de-toekomst-komt-uit-afrika', 'title': 'Zwart geld: de toekomst komt uit Afrika', - 'ext': 'mp4', - 'description': 'Zwart geld: de toekomst komt uit Afrika', 'thumbnail': 'https://assets-start.npo.nl/resources/2023/06/30/d9879593-1944-4249-990c-1561dac14d8e.jpg', - 'duration': 3000 + 'duration': 3000, + 'id': 'zwart-geld-de-toekomst-komt-uit-afrika', + 'description': 'zwart-geld-de-toekomst-komt-uit-afrika', + 'ext': 'mp4', }, }] @@ -211,7 +211,9 @@ class ONIE(NPOIE): 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', 'md5': 'a85ebd50fa86fe5cbce654655f7dbb12', 'info_dict': { - + 'id': 'heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel', + 'title': 'heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel', + 'ext': 'mp4', } }] @@ -243,6 +245,7 @@ class ZAPPIE(NPOIE): 'info_dict': { 'id': 'POMS_AT_811523', 'title': 'POMS_AT_811523', + 'ext': 'mp4', }, }] @@ -268,8 +271,9 @@ class SchoolTVIE(NPOIE): 'md5': 'e9ef151c4886994e2bea23593348cb14', 'info_dict': { 'id': 'zapp-music-challenge-2015-zapp-music-challenge-2015', - 'title': 'Zapp Music Challenge 2015 - Alain Clark & Yaell', - 'description': "Een nummer schrijven met de super bekende soulzanger en producer Alain Clark? Dat is de uitdaging voor de dertienjarige Yaell uit Delft. En als het dan echt goed is, mag hij het ook nog eens live gaan spelen op de speelplaats bij Giel Beelen! Muziek is heel erg belangrijk in het leven van Yaell. 'Als er geen muziek zou zijn, dan zou ik heel veel niet kunnen.' Hij is dan ook altijd aan het schrijven, vaak over zijn eigen leven. Maar soms is het best lastig om die teksten te verzinnen. Vindt hij de inspiratie om een hit te maken met Alain?" + 'title': 'Zapp Music Challenge 2015-Alain Clark & Yaell', + 'description': "Een nummer schrijven met de super bekende soulzanger en producer Alain Clark? Dat is de uitdaging voor de dertienjarige Yaell uit Delft. En als het dan echt goed is, mag hij het ook nog eens live gaan spelen op de speelplaats bij Giel Beelen! Muziek is heel erg belangrijk in het leven van Yaell. 'Als er geen muziek zou zijn, dan zou ik heel veel niet kunnen.' Hij is dan ook altijd aan het schrijven, vaak over zijn eigen leven. Maar soms is het best lastig om die teksten te verzinnen. Vindt hij de inspiratie om een hit te maken met Alain?", + 'ext': 'mp4', }, }] @@ -331,6 +335,7 @@ class HetKlokhuisIE(NTRSubsiteIE): 'info_dict': { 'id': 'aliens', 'title': 'aliens', + 'ext': 'mp4', }, }] @@ -345,7 +350,7 @@ class VPROIE(NPOIE): 'info_dict': { 'id': 'offline-als-luxe.html', 'title': 'offline-als-luxe.html', - 'ext': 'm4v', + 'ext': 'mp4', }, }] @@ -377,5 +382,6 @@ class AndereTijdenIE(NTRSubsiteIE): 'info_dict': { 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', 'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'ext': 'mp4', }, }]