Update to ytdl-commit-cf2dbec

Upstream commit: ytdl-org/youtube-dl@cf2dbec

Except: [kakao] improve info extraction and detect geo restriction (ytdl-org/youtube-dl@d808558)
yt-dlp · Feb 19, 2021 · commit bc2ca1b · 1 parent: 5e41dca
Show file tree

Hide file tree

Showing 19 changed files with 1,012 additions and 394 deletions.
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
@@ -12,6 +12,7 @@
 
 from youtube_dlc.extractor import (
  YoutubePlaylistIE,
+ YoutubeTabIE,
  YoutubeIE,
 )
 
@@ -57,14 +58,22 @@ def test_youtube_toptracks(self):
  entries = result['entries']
  self.assertEqual(len(entries), 100)
 
- def test_youtube_flat_playlist_titles(self):
+ def test_youtube_flat_playlist_extraction(self):
  dl = FakeYDL()
  dl.params['extract_flat'] = True
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv')
+ ie = YoutubeTabIE(dl)
+ result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc')
  self.assertIsPlaylist(result)
- for entry in result['entries']:
- self.assertTrue(entry.get('title'))
+ entries = list(result['entries'])
+ self.assertTrue(len(entries) == 1)
+ video = entries[0]
+ self.assertEqual(video['_type'], 'url_transparent')
+ self.assertEqual(video['ie_key'], 'Youtube')
+ self.assertEqual(video['id'], 'BaW_jenozKc')
+ self.assertEqual(video['url'], 'BaW_jenozKc')
+ self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐')
+ self.assertEqual(video['duration'], 10)
+ self.assertEqual(video['uploader'], 'Philipp Hagemeister')
 
 
 if __name__ == '__main__':

diff --git a/youtube_dlc/extractor/ard.py b/youtube_dlc/extractor/ard.py
@@ -324,20 +324,42 @@ def _real_extract(self, url):
 
  formats = []
  for a in video_node.findall('.//asset'):
+ file_name = xpath_text(a, './fileName', default=None)
+ if not file_name:
+ continue
+ format_type = a.attrib.get('type')
+ format_url = url_or_none(file_name)
+ if format_url:
+ ext = determine_ext(file_name)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, display_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_type or 'hls', fatal=False))
+ continue
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ update_url_query(format_url, {'hdcore': '3.7.0'}),
+ display_id, f4m_id=format_type or 'hds', fatal=False))
+ continue
  f = {
- 'format_id': a.attrib['type'],
- 'width': int_or_none(a.find('./frameWidth').text),
- 'height': int_or_none(a.find('./frameHeight').text),
- 'vbr': int_or_none(a.find('./bitrateVideo').text),
- 'abr': int_or_none(a.find('./bitrateAudio').text),
- 'vcodec': a.find('./codecVideo').text,
- 'tbr': int_or_none(a.find('./totalBitrate').text),
+ 'format_id': format_type,
+ 'width': int_or_none(xpath_text(a, './frameWidth')),
+ 'height': int_or_none(xpath_text(a, './frameHeight')),
+ 'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
+ 'abr': int_or_none(xpath_text(a, './bitrateAudio')),
+ 'vcodec': xpath_text(a, './codecVideo'),
+ 'tbr': int_or_none(xpath_text(a, './totalBitrate')),
  }
- if a.find('./serverPrefix').text:
- f['url'] = a.find('./serverPrefix').text
- f['playpath'] = a.find('./fileName').text
+ server_prefix = xpath_text(a, './serverPrefix', default=None)
+ if server_prefix:
+ f.update({
+ 'url': server_prefix,
+ 'playpath': file_name,
+ })
  else:
- f['url'] = a.find('./fileName').text
+ if not format_url:
+ continue
+ f['url'] = format_url
  formats.append(f)
  self._sort_formats(formats)
 

diff --git a/youtube_dlc/extractor/canvas.py b/youtube_dlc/extractor/canvas.py
@@ -7,19 +7,21 @@
 from .gigya import GigyaBaseIE
 from ..compat import compat_HTTPError
 from ..utils import (
- extract_attributes,
  ExtractorError,
- strip_or_none,
+ clean_html,
+ extract_attributes,
  float_or_none,
+ get_element_by_class,
  int_or_none,
  merge_dicts,
  str_or_none,
+ strip_or_none,
  url_or_none,
 )
 
 
 class CanvasIE(InfoExtractor):
- _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
  _TESTS = [{
  'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  'md5': '68993eda72ef62386a15ea2cf3c93107',
@@ -332,3 +334,51 @@ def _real_extract(self, url):
  'display_id': display_id,
  'season_number': int_or_none(page.get('episode_season')),
  })
+
+
+class DagelijkseKostIE(InfoExtractor):
+ IE_DESC = 'dagelijksekost.een.be'
+ _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
+ 'md5': '30bfffc323009a3e5f689bef6efa2365',
+ 'info_dict': {
+ 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
+ 'display_id': 'hachis-parmentier-met-witloof',
+ 'ext': 'mp4',
+ 'title': 'Hachis parmentier met witloof',
+ 'description': 'md5:9960478392d87f63567b5b117688cdc5',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 283.02,
+ },
+ 'expected_warnings': ['is not a supported codec'],
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ title = strip_or_none(get_element_by_class(
+ 'dish-metadata__title', webpage
+ ) or self._html_search_meta(
+ 'twitter:title', webpage))
+
+ description = clean_html(get_element_by_class(
+ 'dish-description', webpage)
+ ) or self._html_search_meta(
+ ('description', 'twitter:description', 'og:description'),
+ webpage)
+
+ video_id = self._html_search_regex(
+ r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
+ group='id')
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
+ 'ie_key': CanvasIE.ie_key(),
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ }
diff --git a/youtube_dlc/extractor/ccma.py b/youtube_dlc/extractor/ccma.py
@@ -1,12 +1,14 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import calendar
 import datetime
 import re
 
 from .common import InfoExtractor
 from ..utils import (
  clean_html,
+ extract_timezone,
  int_or_none,
  parse_duration,
  parse_resolution,
@@ -97,8 +99,9 @@ def _real_extract(self, url):
  timestamp = None
  data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
  try:
- timestamp = datetime.datetime.strptime(
- data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp()
+ timezone, data_utc = extract_timezone(data_utc)
+ timestamp = calendar.timegm((datetime.datetime.strptime(
+ data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
  except TypeError:
  pass