yt-dlp · JerryZhouSirui · Apr 26, 2024 · Apr 26, 2024 · pukkandan · Apr 26, 2024
diff --git a/supportedsites.md b/supportedsites.md
@@ -503,6 +503,7 @@
  - **gem.cbc.ca**: [*cbcgem*](## "netrc machine")
  - **gem.cbc.ca:live**
  - **gem.cbc.ca:playlist**
+ - **generic**: Generic downloader that works on some sites
  - **Genius**
  - **GeniusLyrics**
  - **GetCourseRu**: [*getcourseru*](## "netrc machine")

diff --git a/yt_dlp/extractor/beatport.py b/yt_dlp/extractor/beatport.py
@@ -2,7 +2,7 @@
 
 from .common import InfoExtractor
 from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import ExtractorError, format_field, int_or_none, join_nonempty, traverse_obj
 
 
 class BeatportIE(InfoExtractor):
@@ -43,55 +43,47 @@ def _real_extract(self, url):
 
  webpage = self._download_webpage(url, display_id)
 
- playables = self._parse_json(
- self._search_regex(
- r'window\.Playables\s*=\s*({.+?});', webpage,
- 'playables info', flags=re.DOTALL),
- track_id)
+ try:
+ playables_json = self._search_regex(
+ r'window\.Playables\s*=\s*({.+?})\s*;', webpage,
+ 'playables info', default='{}', flags=re.DOTALL)
+ playables = self._parse_json(playables_json, track_id)
+ except re.error:
+ raise ExtractorError('Failed to extract playables information. The page structure may have changed.')
 
- track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
+ if not playables or 'tracks' not in playables:
+ raise ExtractorError('No playable tracks found in the extracted information.')
 
- title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
- if track['mix']:
+ track = next((t for t in playables['tracks'] if t['id'] == int(track_id)), None)
+ if not track:
+ raise ExtractorError(f'No track with ID {track_id} found.')
+
+ title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name']
+ if track.get('mix'):
  title += ' (' + track['mix'] + ')'
- if not playables or 'tracks' not in playables:
- raise ExtractorError('No playable tracks found in the extracted information.')
-
- title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
- if track['mix']:
- track = next((t for t in playables['tracks'] if t['id'] == int(track_id)), None)
- if not track:
- raise ExtractorError(f'No track with ID {track_id} found.')
-
- title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name']
- if track.get('mix'):
- title += ' (' + track['mix'] + ')'
+ track = traverse_obj(playables, ('tracks', lambda _, t: t['id'] == int(track_id), {dict}))
+ if not track:
+ raise ExtractorError(f'No track with ID {track_id} found')
+ title = join_nonempty(
+ ', '.join(traverse_obj(track, ('artists', ..., 'name'))),
+ track.get('name'), format_field(track, 'mix', '(%s)'))
- if not playables or 'tracks' not in playables:
- raise ExtractorError('No playable tracks found in the extracted information.')
-
- title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
- if track['mix']:
- track = next((t for t in playables['tracks'] if t['id'] == int(track_id)), None)
- if not track:
- raise ExtractorError(f'No track with ID {track_id} found.')
-
- title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name']
- if track.get('mix'):
- title += ' (' + track['mix'] + ')'
+ track = traverse_obj(playables, ('tracks', lambda _, t: t['id'] == int(track_id), {dict}))
+ if not track:
+ raise ExtractorError(f'No track with ID {track_id} found')
+ title = join_nonempty(
+ ', '.join(traverse_obj(track, ('artists', ..., 'name'))),
+ track.get('name'), format_field(track, 'mix', '(%s)'))
 
  formats = []
- for ext, info in track['preview'].items():
- if not info['url']:
- continue
- fmt = {
- 'url': info['url'],
- 'ext': ext,
- 'format_id': ext,
- 'vcodec': 'none',
- }
- if ext == 'mp3':
- fmt['acodec'] = 'mp3'
- fmt['abr'] = 96
- fmt['asr'] = 44100
- elif ext == 'mp4':
- fmt['acodec'] = 'aac'
- fmt['abr'] = 96
- fmt['asr'] = 44100
- formats.append(fmt)
+ for ext, info in track.get('preview', {}).items():
+ url = info.get('url')
+ if url:
+ fmt = {
+ 'url': url,
+ 'ext': ext,
+ 'format_id': ext,
+ 'vcodec': 'none',
+ 'acodec': 'mp3' if ext == 'mp3' else 'aac',
+ 'abr': 96,
+ 'asr': 44100
+ }
+ formats.append(fmt)
 
- images = []
- for name, info in track['images'].items():
- image_url = info.get('url')
- if name == 'dynamic' or not image_url:
- continue
- image = {
- 'id': name,
- 'url': image_url,
- 'height': int_or_none(info.get('height')),
- 'width': int_or_none(info.get('width')),
- }
- images.append(image)
+ images = [{'id': name, 'url': info['url'], 'height': int_or_none(info.get('height')), 'width': int_or_none(info.get('width'))}
+ for name, info in track.get('images', {}).items() if name != 'dynamic' and info.get('url')]
 
  return {
- 'id': compat_str(track.get('id')) or track_id,
- 'display_id': track.get('slug') or display_id,
+ 'id': compat_str(track.get('id', track_id)),
+ 'display_id': track.get('slug', display_id),
  'title': title,
  'formats': formats,
- 'thumbnails': images,
- }
+ 'thumbnails': images
+ }
diff --git a/yt_dlp/extractor/tele5.py b/yt_dlp/extractor/tele5.py
@@ -1,17 +1,68 @@
+import re
+
+import requests
+
 from .dplay import DPlayIE
 from ..compat import compat_urlparse
 from ..utils import (
  ExtractorError,
- extract_attributes,
 )
 
+def _generate_video_specific_cache_url(slug, parent_slug):
+    """
+    Build the loma-cms page url for a single video (the environment is fixed to "tele5").
+    :param slug: The url path segment that identifies the video by title.
+    :param parent_slug: The url path segment of the directory that contains the video.
+    :return: The page url with ``slug`` and ``parent_slug`` filled in.
+    """
+    url_template = 'https://de-api.loma-cms.com/feloma/page/{0}/?environment=tele5&parent_slug={1}&v=2'
+    return url_template.format(slug, parent_slug)
+def _do_cached_post(s: requests.Session,
+                    referer: str,
+                    url: str) -> dict:
+    """
+    POST ``url`` as the ``path`` payload to https://tele5.de/cached and return the decoded JSON.
+
+    It is likely connected to the new "loma-cms" API.
+    :param s: The ``requests.Session`` used to send the request.
+    :param referer: The referer url, sent as the ``Referer`` header.
+    :param url: The url to retrieve the cached data for.
+    :return: The json dict from the response.
+    :raises requests.HTTPError: If the endpoint answers with an error status.
+    """
+    headers = {
+        'Origin': 'https://tele5.de',
+        # Referer is a mandatory key
+        'Referer': referer,
+        # User-Agent is a mandatory key, it can be anything!
+        'User-Agent': 'Youtube-DL',
+    }
+    r = s.post(url='https://tele5.de/cached', headers=headers, json={'path': url})
+    r.raise_for_status()
+    return r.json()
 
 class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
  _WORKING = False
  _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  _GEO_COUNTRIES = ['DE']
  _TESTS = [{
+ 'url': 'https://tele5.de/mediathek/sorority-babes-in-the-slimeball-bowl-o-rama',
+ 'info_dict': {
+ 'id': '5582852',
+ 'title': 'Sorority Babes in the Slimeball Bowl-O-Rama',
+ 'ext': 'mp4',
+ 'series': 'Sorority Babes in the Slimeball Bowl-O-Rama',
+ 'duration': 4779.88,
+ 'description': 'md5:1d8d30ed3d221613861aaefa8d7e887e',
+ 'timestamp': 1697839800,
+ 'upload_date': '20231020',
+ 'creator': 'Tele5',
+ 'tags': [],
+ 'thumbnail': 'https://eu1-prod-images.disco-api.com/2023/10/02/501fa839-d3ac-3c04-aa61-57f98802c532.jpeg',
+ },
+ }, {
  'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416',
+ 'only_matching': True,
  'info_dict': {
  'id': '1549416',
  'ext': 'mp4',
@@ -26,6 +77,7 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
  }, {
  # jwplatform, nexx unavailable
  'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/',
+ 'only_matching': True,
  'info_dict': {
  'id': 'WJuiOlUp',
  'ext': 'mp4',
@@ -40,6 +92,7 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
  'skip': 'No longer available, redirects to Filme page',
  }, {
  'url': 'https://tele5.de/mediathek/angel-of-mine/',
+ 'only_matching': True,
  'info_dict': {
  'id': '1252360',
  'ext': 'mp4',
@@ -72,18 +125,45 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
  }]
 
  def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- player_element = self._search_regex(r'(<hyoga-player\b[^>]+?>)', webpage, 'video player')
- player_info = extract_attributes(player_element)
- asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', ))
- endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname
- source_type = player_info.get('sourcetype')
- if source_type:
- endpoint = '%s-%s' % (source_type, endpoint)
- try:
- return self._get_disco_api_info(url, asset_id, endpoint, realm, country)
- except ExtractorError as e:
- if getattr(e, 'message', '') == 'Missing deviceId in context':
- self.report_drm(video_id)
- raise
+        """Resolve a tele5.de page to its video id and delegate to ``_get_disco_api_info``.
+
+        Raises ExtractorError when the URL has no ``<parent_slug>/<slug>`` path or when the
+        cached API responses lack the expected keys.
+        """
+        m = re.search(
+            r'https?://(?:www\.)?(?P<environment>[^.]+)\.de/(?P<parent_slug>[^/]+)/(?P<slug>[^/?#&]+)', url)
+        if m is None:
+            # Fail loudly: falling through here would make the method return None
+            raise ExtractorError('Unable to determine the page slug from the URL', expected=True)
+        environment, parent_slug, slug = m.groups()
+
+        # The context manager closes the session's pooled connections once all calls are done
+        with requests.Session() as s:
+            # NOTE(review): presumably this GET primes the session before the cached calls - confirm
+            r = s.get(url=url, headers={
+                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0'})
+            r.raise_for_status()
+            cached_base = _do_cached_post(
+                s=s, referer=url,
+                url='https://de-api.loma-cms.com/feloma/configurations/?environment={0}'.format(environment))
+            cached_video_specific = _do_cached_post(
+                s=s, referer=url,
+                url=_generate_video_specific_cache_url(slug=slug, parent_slug=parent_slug))
+
+        try:
+            site_info = cached_base['data']['settings']['site']
+            player_info = site_info['player']
+            sonic_realm = player_info['sonicRealm']
+            sonic_endpoint = compat_urlparse.urlparse(player_info['sonicEndpoint']).hostname
+            country = site_info['info']['country']
+            # NOTE(review): assumes the video block is always the second entry - confirm against the API
+            video_id = cached_video_specific['data']['blocks'][1]['videoId']
+        except (KeyError, IndexError, TypeError) as e:
+            raise ExtractorError(f'Unexpected tele5 API response: {e!r}') from e
+
+        try:
+            return self._get_disco_api_info(
+                url=url, display_id=video_id, disco_host=sonic_endpoint,
+                realm=sonic_realm, country=country, api_version=3)
+        except ExtractorError as e:
+            if getattr(e, 'message', '') == 'Missing deviceId in context':
+                self.report_drm(video_id)
+            raise