Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[extractor/tele5] Modified tele5 extractor to fix Issue #8501 #9792

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,7 @@
- **gem.cbc.ca**: [*cbcgem*](## "netrc machine")
- **gem.cbc.ca:live**
- **gem.cbc.ca:playlist**
- **generic**: Generic downloader that works on some sites
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

- **Genius**
- **GeniusLyrics**
- **GetCourseRu**: [*getcourseru*](## "netrc machine")
Expand Down
78 changes: 35 additions & 43 deletions yt_dlp/extractor/beatport.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import int_or_none
from ..utils import int_or_none, ExtractorError


class BeatportIE(InfoExtractor):
Expand Down Expand Up @@ -43,55 +43,47 @@ def _real_extract(self, url):

webpage = self._download_webpage(url, display_id)

playables = self._parse_json(
self._search_regex(
r'window\.Playables\s*=\s*({.+?});', webpage,
'playables info', flags=re.DOTALL),
track_id)
try:
playables_json = self._search_regex(
r'window\.Playables\s*=\s*({.+?})\s*;', webpage,
'playables info', default='{}', flags=re.DOTALL)
playables = self._parse_json(playables_json, track_id)
except re.error:
raise ExtractorError('Failed to extract playables information. The page structure may have changed.')
Comment on lines +46 to +52
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might as well convert this to `self._search_json`, since we are editing it anyway.


track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
if not playables or 'tracks' not in playables:
raise ExtractorError('No playable tracks found in the extracted information.')

title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
if track['mix']:
track = next((t for t in playables['tracks'] if t['id'] == int(track_id)), None)
if not track:
raise ExtractorError(f'No track with ID {track_id} found.')

title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name']
if track.get('mix'):
title += ' (' + track['mix'] + ')'
Comment on lines +54 to 63
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if not playables or 'tracks' not in playables:
raise ExtractorError('No playable tracks found in the extracted information.')
title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
if track['mix']:
track = next((t for t in playables['tracks'] if t['id'] == int(track_id)), None)
if not track:
raise ExtractorError(f'No track with ID {track_id} found.')
title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name']
if track.get('mix'):
title += ' (' + track['mix'] + ')'
track = traverse_obj(playables, ('tracks', lambda _, t: t['id'] == int(track_id), {dict}))
if not track:
raise ExtractorError(f'No track with ID {track_id} found')
title = join_nonempty(
', '.join(traverse_obj(track, ('artists', ..., 'name'))),
track.get('name'), format_field(track, 'mix', '(%s)'))


formats = []
for ext, info in track['preview'].items():
if not info['url']:
continue
fmt = {
'url': info['url'],
'ext': ext,
'format_id': ext,
'vcodec': 'none',
}
if ext == 'mp3':
fmt['acodec'] = 'mp3'
fmt['abr'] = 96
fmt['asr'] = 44100
elif ext == 'mp4':
fmt['acodec'] = 'aac'
fmt['abr'] = 96
fmt['asr'] = 44100
formats.append(fmt)
for ext, info in track.get('preview', {}).items():
url = info.get('url')
if url:
fmt = {
'url': url,
'ext': ext,
'format_id': ext,
'vcodec': 'none',
'acodec': 'mp3' if ext == 'mp3' else 'aac',
'abr': 96,
'asr': 44100
}
formats.append(fmt)

images = []
for name, info in track['images'].items():
image_url = info.get('url')
if name == 'dynamic' or not image_url:
continue
image = {
'id': name,
'url': image_url,
'height': int_or_none(info.get('height')),
'width': int_or_none(info.get('width')),
}
images.append(image)
images = [{'id': name, 'url': info['url'], 'height': int_or_none(info.get('height')), 'width': int_or_none(info.get('width'))}
for name, info in track.get('images', {}).items() if name != 'dynamic' and info.get('url')]

return {
'id': compat_str(track.get('id')) or track_id,
'display_id': track.get('slug') or display_id,
'id': compat_str(track.get('id', track_id)),
'display_id': track.get('slug', display_id),
'title': title,
'formats': formats,
'thumbnails': images,
}
'thumbnails': images
}
Comment on lines 83 to +89
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

revert

110 changes: 94 additions & 16 deletions yt_dlp/extractor/tele5.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,68 @@
import re

import requests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do not use `requests`. All network access should go through the helper functions.


from .dplay import DPlayIE
from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
extract_attributes,
)

def _generate_video_specific_cache_url(slug, parent_slug, environment='tele5'):
    """
    Generate the loma-cms page url for one specific video.

    The result is the url that gets handed to ``_do_cached_post`` as the
    ``url`` argument in order to look up the video's page data.

    :param slug: The part of the url that identifies the video by title.
    :param parent_slug: The part of the url that identifies the PARENT directory.
    :param environment: Value of the ``environment`` query parameter.
        Defaults to ``'tele5'``, the value that used to be hard-coded, so
        existing two-argument calls produce exactly the same url.
    :return: The generated url.
    """
    # slug/parent_slug are inserted verbatim: they are taken from an already
    # valid page url, so re-quoting them here would double-encode them.
    return (
        'https://de-api.loma-cms.com/feloma/page/{0}/'
        '?environment={2}&parent_slug={1}&v=2'
    ).format(slug, parent_slug, environment)
def _do_cached_post(s: 'requests.Session',
                    referer: str,
                    url: str,
                    timeout: float = 30) -> dict:
    """
    Do the API call to CACHED json endpoint.
    It is likely connected to the new "loma-cms" API.

    :param s: The session we use. Only its ``post`` method is called.
    :param referer: The referer url, sent as the ``Referer`` header.
    :param url: The url to retrieve the cached data for. It is sent as the
        ``path`` field of the JSON request body, not used as the request url.
    :param timeout: Seconds to wait for the server before giving up.
        Without an explicit value the request could block indefinitely.
    :return: The json dict from the response.
    :raises: Whatever ``raise_for_status()`` raises for an error status
        (``requests.HTTPError`` for a real requests session).
    """
    r = s.post(
        url='https://tele5.de/cached',
        headers={
            'Origin': 'https://tele5.de',
            # Referer is a mandatory key
            'Referer': referer,
            # User-Agent is a mandatory key, it can be anything!
            'User-Agent': 'Youtube-DL',
        },
        json={'path': url},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json()

class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
_WORKING = False
_VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_GEO_COUNTRIES = ['DE']
_TESTS = [{
'url': 'https://tele5.de/mediathek/sorority-babes-in-the-slimeball-bowl-o-rama',
'info_dict': {
'id': '5582852',
'title': 'Sorority Babes in the Slimeball Bowl-O-Rama',
'ext': 'mp4',
'series': 'Sorority Babes in the Slimeball Bowl-O-Rama',
'duration': 4779.88,
'description': 'md5:1d8d30ed3d221613861aaefa8d7e887e',
'timestamp': 1697839800,
'upload_date': '20231020',
'creator': 'Tele5',
'tags': [],
'thumbnail': 'https://eu1-prod-images.disco-api.com/2023/10/02/501fa839-d3ac-3c04-aa61-57f98802c532.jpeg',
},
}, {
'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416',
'only_matching': True,
'info_dict': {
'id': '1549416',
'ext': 'mp4',
Expand All @@ -26,6 +77,7 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
}, {
# jwplatform, nexx unavailable
'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/',
'only_matching': True,
'info_dict': {
'id': 'WJuiOlUp',
'ext': 'mp4',
Expand All @@ -40,6 +92,7 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
'skip': 'No longer available, redirects to Filme page',
}, {
'url': 'https://tele5.de/mediathek/angel-of-mine/',
'only_matching': True,
'info_dict': {
'id': '1252360',
'ext': 'mp4',
Expand Down Expand Up @@ -72,18 +125,43 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE
}]

def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
player_element = self._search_regex(r'(<hyoga-player\b[^>]+?>)', webpage, 'video player')
player_info = extract_attributes(player_element)
asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', ))
endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname
source_type = player_info.get('sourcetype')
if source_type:
endpoint = '%s-%s' % (source_type, endpoint)
try:
return self._get_disco_api_info(url, asset_id, endpoint, realm, country)
except ExtractorError as e:
if getattr(e, 'message', '') == 'Missing deviceId in context':
self.report_drm(video_id)
raise
content_regex = re.compile(r'https?://(?:www\.)?(?P<environment>[^.]+)\.de/(?P<parent_slug>[^/]+)/(?P<slug>[^/?#&]+)')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self._search_regex

m = content_regex.search(url)
if m is not None:
environment, parent_slug, slug = m.groups()
s = requests.session()
headers_for_origin = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0'}
r = s.get(url=url,
headers=headers_for_origin)
r.raise_for_status()

cached_base = _do_cached_post(s=s,
referer=url,
url='https://de-api.loma-cms.com/feloma/configurations/?environment={0}'.format(environment))

site_info = cached_base.get('data').get('settings').get('site')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

traverse_obj

player_info = site_info.get('player')

sonic_realm = player_info['sonicRealm']
sonic_endpoint = compat_urlparse.urlparse(player_info['sonicEndpoint']).hostname
country = site_info['info']['country']

cached_video_specific = _do_cached_post(s=s, referer=url,
url=_generate_video_specific_cache_url(
slug=slug,
parent_slug=parent_slug))

video_id = cached_video_specific['data']['blocks'][1]['videoId']

try:
return self._get_disco_api_info(url=url,
display_id=video_id,
disco_host=sonic_endpoint,
realm=sonic_realm,
country=country,
api_version=3,
)
except ExtractorError as e:
if getattr(e, 'message', '') == 'Missing deviceId in context':
self.report_drm(video_id)
raise