Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Misc] Small extraction updates #32944

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
__pycache__/
*.pyc
*.pyo
*.class
*~
*.DS_Store
wine-py2exe/
py2exe.log
.pytest_cache/
*.kate-swp
build/
dist/
Expand Down
10 changes: 8 additions & 2 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1603,11 +1603,14 @@ def test_urshift(self):

def test_get_element_by_class(self):
html = '''
<span class="foo bar">nice</span>
<span class="foo bar baz-bam">nice</span>
'''

self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('bar', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
self.assertEqual(get_element_by_class('baz', html), None)
self.assertEqual(get_element_by_class('bam', html), None)

def test_get_element_by_attribute(self):
html = '''
Expand All @@ -1626,10 +1629,13 @@ def test_get_element_by_attribute(self):

def test_get_elements_by_class(self):
html = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
<span class="foo bar baz-bam">nice</span><span class="foo bar">also nice</span>
'''

self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('baz', html), [])
self.assertEqual(get_elements_by_class('bam', html), [])
self.assertEqual(get_elements_by_class('no-such-class', html), [])

def test_get_elements_by_attribute(self):
Expand Down
3 changes: 2 additions & 1 deletion youtube_dl/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3128,7 +3128,8 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
continue
urls.add(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
# prefer the extension found in the URL, falling back to the one implied by the MIME type
# (see https://github.com/yt-dlp/yt-dlp/pull/10956)
ext = determine_ext(source_url, default_ext=mimetype2ext(source_type))
if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
Expand Down
85 changes: 69 additions & 16 deletions youtube_dl/extractor/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,14 @@
parse_resolution,
sanitized_Request,
smuggle_url,
strip_or_none,
T,
traverse_obj,
unescapeHTML,
unified_timestamp,
unsmuggle_url,
UnsupportedError,
update_url_query,
url_or_none,
urljoin,
xpath_attr,
Expand Down Expand Up @@ -2237,6 +2241,7 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
'description': 'Kelis - 4th Of July',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
}, {
Expand All @@ -2246,7 +2251,7 @@ class GenericIE(InfoExtractor):
'id': '105',
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
'title': r're:Kelis - 4th Of July(?: / Embed Player)?$',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
Expand Down Expand Up @@ -2297,6 +2302,32 @@ class GenericIE(InfoExtractor):
'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
},
'skip': 'needs Referer ?',
}, {
# KVS Player v10
'url': 'https://www.cambro.tv/588174/marleny-1/',
'md5': '759d2050590986c6fc341da0592c4d8e',
'info_dict': {
'id': '588174',
'display_id': 'marleny-1',
'ext': 'mp4',
'title': 'marleny 1',
'description': 'la maestra de tic toc',
'thumbnail': r're:https?://www\.cambro\.tv/contents/videos_screenshots/588000/588174/preview\.jpg',
'age_limit': 18,
},
}, {
# KVS Player v10 embed, NSFW
'url': 'https://www.cambro.tv/embed/436185',
'md5': '24338dc8b182900a2c9eda075a0a46c0',
'info_dict': {
'id': '436185',
'display_id': 'jaeandbailey-chaturbate-webcam-porn-videos',
'ext': 'mp4',
'title': 'jaeandbailey Chaturbate webcam porn videos',
'thumbnail': r're:https?://www\.cambro\.tv/contents/videos_screenshots/436000/436185/preview\.jpg',
'age_limit': 18,
},
}, {
'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes',
'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4',
Expand All @@ -2309,14 +2340,16 @@ class GenericIE(InfoExtractor):
'height': 720,
'age_limit': 18,
},
# 'skip': 'Geo-blocked in some jurisdictions',
}, {
# KVS Player v2
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60',
'info_dict': {
'id': '284002',
'display_id': 'just-out-of-the-shower-joi',
'ext': 'mp4',
'title': 'Just Out Of The Shower JOI - Shooshtime',
'title': r're:Just Out Of The Shower JOI(?: - Shooshtime)?$',
'height': 720,
'age_limit': 18,
},
Expand Down Expand Up @@ -2482,9 +2515,12 @@ def spells(x, o):
return '/'.join(urlparts) + '?' + url_query

flashvars = self._search_regex(
r'(?s)<script\b[^>]*>.*?var\s+flashvars\s*=\s*(\{.+?\});.*?</script>',
webpage, 'flashvars')
flashvars = self._parse_json(flashvars, video_id, transform_source=js_to_json)
r'''(?<![=!+*-])=\s*kt_player\s*\(\s*'kt_player'\s*,\s*[^)]+,\s*([\w$]+)\s*\)''',
webpage, 'flashvars name', default='flashvars')
flashvars = self._search_json(
r'<script(?:\s[^>]*)?>[\s\S]*?var\s+%s\s*=' % (flashvars,),
webpage, 'flashvars', video_id, end_pattern=r';[\s\S]*?</script>',
transform_source=js_to_json)

# extract the part after the last / as the display_id from the
# canonical URL.
Expand All @@ -2493,22 +2529,21 @@ def spells(x, o):
r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
webpage, 'display_id', fatal=False
)
title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')

thumbnail = flashvars['preview_url']
if thumbnail.startswith('//'):
protocol, _, _ = url.partition('/')
thumbnail = protocol + thumbnail
title = flashvars.get('video_title') or self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')

url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
formats = []
for key in url_keys:
if '/get_file/' not in flashvars[key]:
continue
format_id = flashvars.get(key + '_text', key)
f_url = urljoin(url, getrealurl(flashvars[key], flashvars['license_code']))
rnd = flashvars.get('rnd', key)
if rnd:
f_url = update_url_query(f_url, {'rnd': rnd})
formats.append(merge_dicts(
parse_resolution(format_id) or parse_resolution(flashvars[key]), {
'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])),
'url': f_url,
'format_id': format_id,
'ext': 'mp4',
'http_headers': {'Referer': url},
Expand All @@ -2518,13 +2553,31 @@ def spells(x, o):

self._sort_formats(formats)

return {
csv2list = (T(lambda s: s.split(',')), Ellipsis, T(strip_or_none))
info = traverse_obj(flashvars, {
'tags': ('video_tags',) + csv2list,
'categories': ('video_categories',) + csv2list,
'thumbnails': (
T(dict.items), lambda _, k_v: k_v[0].startswith('preview_url'), {
'url': (1, T(lambda u: urljoin(url, u))),
'preference': (0, T(lambda k: 100 - len(k))),
}),
})
info = merge_dicts(info, {
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
}
})

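# check-porn test for embed pages: when no age limit was set, look for
# 'porn' as a whitespace-delimited word in the title, tags or categories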
if 'age_limit' not in info and traverse_obj(info, (
('title', (('tags', 'categories'), Ellipsis) or []),
T(lambda t: bool(re.search(r'(?i)(?:^|\s+)porn(?:$|\s+)', t)) or None)),
get_all=False):
info['age_limit'] = 18

return info

def _real_extract(self, url):
if url.startswith('//'):
Expand Down Expand Up @@ -3598,7 +3651,7 @@ def _real_extract(self, url):
), webpage, 'KVS player', group='ver', default=False)
if found:
self.report_extraction('%s: KVS Player' % (video_id, ))
if found.split('.')[0] not in ('4', '5', '6'):
if found.split('.')[0] not in ('2', '4', '5', '6', '10'):
self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, ))
return merge_dicts(
self._extract_kvs(url, webpage, video_id),
Expand Down
20 changes: 13 additions & 7 deletions youtube_dl/extractor/hentaistigma.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
merge_dicts,
traverse_obj,
)


class HentaiStigmaIE(InfoExtractor):
Expand All @@ -24,16 +29,17 @@ def _real_extract(self, url):
title = self._html_search_regex(
r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
webpage, 'title')
wrap_url = self._html_search_regex(

wrap_url = self._search_regex(
r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
wrap_webpage = self._download_webpage(wrap_url, video_id)

video_url = self._html_search_regex(
r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
vid_page = self._download_webpage(wrap_url, video_id)

entries = self._parse_html5_media_entries(wrap_url, vid_page, video_id)
self._sort_formats(traverse_obj(entries, (0, 'formats')) or [])

return {
return merge_dicts({
'id': video_id,
'url': video_url,
'title': title,
'age_limit': 18,
}
}, entries[0])
2 changes: 1 addition & 1 deletion youtube_dl/extractor/kaltura.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class KalturaIE(InfoExtractor):
(?:
kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
https?://
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
(?:
(?:
# flash player
Expand Down
2 changes: 1 addition & 1 deletion youtube_dl/extractor/mgoon.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

class MgoonIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)?
(?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)|
(?:(?:m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)|
video\.mgoon\.com)/(?P<id>[0-9]+)'''
_API_URL = 'http://mpos.mgoon.com/player/video?id={0:}'
_TESTS = [
Expand Down
6 changes: 5 additions & 1 deletion youtube_dl/extractor/orf.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class ORFRadioIE(ORFRadioBase):

_VALID_URL = (
r'https?://sound\.orf\.at/radio/(?P<station>{0})/sendung/(?P<id>\d+)(?:/(?P<show>\w+))?'.format(_STATION_RE),
r'https?://(?P<station>{0})\.orf\.at/player/(?P<date>\d{{8}})/(?P<id>\d+)'.format(_STATION_RE),
r'https?://(?P<station>{0})\.orf\.at/(?:player|programm)/(?P<date>\d{{8}})/(?P<id>\d+)'.format(_STATION_RE),
)

_TESTS = [{
Expand Down Expand Up @@ -150,6 +150,10 @@ class ORFRadioIE(ORFRadioBase):
'duration': 1500,
},
'skip': 'Shows from ORF Sound are only available for 30 days.'
}, {
# yt-dlp/yt-dlp#11014
'url': 'https://oe1.orf.at/programm/20240916/769302/Playgrounds',
'only_matching': True,
}]

def _real_extract(self, url):
Expand Down
Loading
Loading