[YouTube] Always extract using MWEB API client

* temporary fix-up for 403 on download * MWEB parameters from yt-dlp 2024-12-06
ytdl-org · Dec 14, 2024 · 12bc377 · 12bc377
1 parent 82b5256
commit 12bc377
Showing 1 changed file with 77 additions and 7 deletions.
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
@@ -3,11 +3,13 @@
 from __future__ import unicode_literals
 
 import collections
+import hashlib
 import itertools
 import json
 import os.path
 import random
 import re
+import time
 import traceback
 
 from .common import InfoExtractor, SearchInfoExtractor
@@ -290,6 +292,33 @@ def _real_initialize(self):
     _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
     _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 
+    _SAPISID = None
+
+    def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
+        time_now = round(time.time())
+        if self._SAPISID is None:
+            yt_cookies = self._get_cookies('https://www.youtube.com')
+            # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
+            # See: https://github.com/yt-dlp/yt-dlp/issues/393
+            sapisid_cookie = dict_get(
+                yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
+            if sapisid_cookie and sapisid_cookie.value:
+                self._SAPISID = sapisid_cookie.value
+                self.write_debug('Extracted SAPISID cookie')
+                # SAPISID cookie is required if not already present
+                if not yt_cookies.get('SAPISID'):
+                    self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie')
+                    self._set_cookie(
+                        '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600)
+            else:
+                self._SAPISID = False
+        if not self._SAPISID:
+            return None
+        # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
+        sapisidhash = hashlib.sha1(
+            '{0} {1} {2}'.format(time_now, self._SAPISID, origin).encode('utf-8')).hexdigest()
+        return 'SAPISIDHASH {0}_{1}'.format(time_now, sapisidhash)
+
     def _call_api(self, ep, query, video_id, fatal=True, headers=None):
         data = self._DEFAULT_API_DATA.copy()
         data.update(query)
@@ -1914,9 +1943,50 @@ def _real_extract(self, url):
             player_response = self._extract_yt_initial_variable(
                 webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
                 video_id, 'initial player response')
-        if not player_response:
+        if False and not player_response:
             player_response = self._call_api(
                 'player', {'videoId': video_id}, video_id)
+        if True or not player_response:
+            origin = 'https://www.youtube.com'
+            pb_context = {'html5Preference': 'HTML5_PREF_WANTS'}
+
+            player_url = self._extract_player_url(webpage)
+            ytcfg = self._extract_ytcfg(video_id, webpage)
+            sts = self._extract_signature_timestamp(video_id, player_url, ytcfg)
+            if sts:
+                pb_context['signatureTimestamp'] = sts
+
+            query = {
+                'playbackContext': {
+                    'contentPlaybackContext': pb_context,
+                    'contentCheckOk': True,
+                    'racyCheckOk': True,
+                },
+                'context': {
+                    'client': {
+                        'clientName': 'MWEB',
+                        'clientVersion': '2.20241202.07.00',
+                        'hl': 'en',
+                        'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)',
+                        'timeZone': 'UTC',
+                        'utcOffsetMinutes': 0,
+                    },
+                },
+                'videoId': video_id,
+            }
+            headers = {
+                'X-YouTube-Client-Name': '2',
+                'X-YouTube-Client-Version': '2.20241202.07.00',
+                'Origin': origin,
+                'Sec-Fetch-Mode': 'navigate',
+                'User-Agent': query['context']['client']['userAgent'],
+            }
+            auth = self._generate_sapisidhash_header(origin)
+            if auth is not None:
+                headers['Authorization'] = auth
+                headers['X-Origin'] = origin
+
+            player_response = self._call_api('player', query, video_id, fatal=False, headers=headers)
 
         def is_agegated(playability):
             if not isinstance(playability, dict):
@@ -2223,12 +2293,12 @@ def process_manifest_format(f, proto, client_name, itag, all_formats=False):
                         formats.append(f)
 
         playable_formats = [f for f in formats if not f.get('has_drm')]
-        if formats and not playable_formats:
-            # If there are no formats that definitely don't have DRM, all have DRM
-            self.report_drm(video_id)
-        formats[:] = playable_formats
-
-        if not formats:
+        if formats:
+            if not playable_formats:
+                # If there are no formats that definitely don't have DRM, all have DRM
+                self.report_drm(video_id)
+            formats[:] = playable_formats
+        else:
             if streaming_data.get('licenseInfos'):
                 raise ExtractorError(
                     'This video is DRM protected.', expected=True)