Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit de39d12

Browse files
authored
[extractor/ceskatelevize] Back-port extractor from yt-dlp, etc (#30713)
* back-port extractor, removing CeskaTelevizePoradyIE
* follow redirect URL
* support liveBroadcast and videobonusDetail in __NEXT__ data
* return single video for singleton playlist
* fix/add tests
1 parent 27ed77a commit de39d12

File tree

2 files changed

+92
-83
lines changed

2 files changed

+92
-83
lines changed

youtube_dl/extractor/ceskatelevize.py

Lines changed: 91 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -12,70 +12,136 @@
1212
ExtractorError,
1313
float_or_none,
1414
sanitized_Request,
15-
unescapeHTML,
16-
update_url_query,
15+
str_or_none,
16+
traverse_obj,
1717
urlencode_postdata,
1818
USER_AGENTS,
1919
)
2020

2121

2222
class CeskaTelevizeIE(InfoExtractor):
23-
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
23+
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
2424
_TESTS = [{
25-
'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
25+
'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
2626
'info_dict': {
27-
'id': '61924494877246241',
27+
'id': '61924494877028507',
2828
'ext': 'mp4',
29-
'title': 'Hyde Park Civilizace: Život v Grónsku',
30-
'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
29+
'title': 'Bonus 01 - En - Hyde Park Civilizace',
30+
'description': 'English Subtittles',
3131
'thumbnail': r're:^https?://.*\.jpg',
32-
'duration': 3350,
32+
'duration': 81.3,
3333
},
3434
'params': {
3535
# m3u8 download
3636
'skip_download': True,
3737
},
3838
}, {
39-
'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
39+
# live stream
40+
'url': 'http://www.ceskatelevize.cz/zive/ct1/',
4041
'info_dict': {
41-
'id': '61924494877028507',
42+
'id': '102',
4243
'ext': 'mp4',
43-
'title': 'Hyde Park Civilizace: Bonus 01 - En',
44-
'description': 'English Subtittles',
45-
'thumbnail': r're:^https?://.*\.jpg',
46-
'duration': 81.3,
44+
'title': r'ČT1 - živé vysílání online',
45+
'description': 'Sledujte živé vysílání kanálu ČT1 online. Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.',
46+
'is_live': True,
4747
},
4848
'params': {
4949
# m3u8 download
5050
'skip_download': True,
5151
},
5252
}, {
53-
# live stream
53+
# another
5454
'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
55+
'only_matching': True,
5556
'info_dict': {
5657
'id': 402,
5758
'ext': 'mp4',
5859
'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
5960
'is_live': True,
6061
},
62+
# 'skip': 'Georestricted to Czech Republic',
63+
}, {
64+
'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
65+
'only_matching': True,
66+
}, {
67+
# video with 18+ caution trailer
68+
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
69+
'info_dict': {
70+
'id': '215562210900007-bogotart',
71+
'title': 'Bogotart - Queer',
72+
'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti',
73+
},
74+
'playlist': [{
75+
'info_dict': {
76+
'id': '61924494877311053',
77+
'ext': 'mp4',
78+
'title': 'Bogotart - Queer (Varování 18+)',
79+
'duration': 11.9,
80+
},
81+
}, {
82+
'info_dict': {
83+
'id': '61924494877068022',
84+
'ext': 'mp4',
85+
'title': 'Bogotart - Queer (Queer)',
86+
'thumbnail': r're:^https?://.*\.jpg',
87+
'duration': 1558.3,
88+
},
89+
}],
6190
'params': {
6291
# m3u8 download
6392
'skip_download': True,
6493
},
65-
'skip': 'Georestricted to Czech Republic',
6694
}, {
67-
'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
95+
# iframe embed
96+
'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
6897
'only_matching': True,
6998
}]
7099

100+
def _search_nextjs_data(self, webpage, video_id, **kw):
101+
return self._parse_json(
102+
self._search_regex(
103+
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
104+
webpage, 'next.js data', **kw),
105+
video_id, **kw)
106+
71107
def _real_extract(self, url):
72108
playlist_id = self._match_id(url)
73-
74-
webpage = self._download_webpage(url, playlist_id)
109+
webpage, urlh = self._download_webpage_handle(url, playlist_id)
110+
parsed_url = compat_urllib_parse_urlparse(urlh.geturl())
111+
site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize')
112+
playlist_title = self._og_search_title(webpage, default=None)
113+
if site_name and playlist_title:
114+
playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0]
115+
playlist_description = self._og_search_description(webpage, default=None)
116+
if playlist_description:
117+
playlist_description = playlist_description.replace('\xa0', ' ')
118+
119+
type_ = 'IDEC'
120+
if re.search(r'(^/porady|/zive)/', parsed_url.path):
121+
next_data = self._search_nextjs_data(webpage, playlist_id)
122+
if '/zive/' in parsed_url.path:
123+
idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False)
124+
else:
125+
idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False)
126+
if not idec:
127+
idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False)
128+
if idec:
129+
type_ = 'bonus'
130+
if not idec:
131+
raise ExtractorError('Failed to find IDEC id')
132+
iframe_hash = self._download_webpage(
133+
'https://www.ceskatelevize.cz/v-api/iframe-hash/',
134+
playlist_id, note='Getting IFRAME hash')
135+
query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, }
136+
webpage = self._download_webpage(
137+
'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php',
138+
playlist_id, note='Downloading player', query=query)
75139

76140
NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
77141
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
78-
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
142+
self.raise_geo_restricted(NOT_AVAILABLE_STRING)
143+
if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )):
144+
raise ExtractorError('no video with IDEC available', video_id=idec, expected=True)
79145

80146
type_ = None
81147
episode_id = None
@@ -100,15 +166,15 @@ def _real_extract(self, url):
100166
data = {
101167
'playlist[0][type]': type_,
102168
'playlist[0][id]': episode_id,
103-
'requestUrl': compat_urllib_parse_urlparse(url).path,
169+
'requestUrl': parsed_url.path,
104170
'requestSource': 'iVysilani',
105171
}
106172

107173
entries = []
108174

109175
for user_agent in (None, USER_AGENTS['Safari']):
110176
req = sanitized_Request(
111-
'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
177+
'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
112178
data=urlencode_postdata(data))
113179

114180
req.add_header('Content-type', 'application/x-www-form-urlencoded')
@@ -130,9 +196,6 @@ def _real_extract(self, url):
130196
req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
131197
req.add_header('Referer', url)
132198

133-
playlist_title = self._og_search_title(webpage, default=None)
134-
playlist_description = self._og_search_description(webpage, default=None)
135-
136199
playlist = self._download_json(req, playlist_id, fatal=False)
137200
if not playlist:
138201
continue
@@ -167,7 +230,7 @@ def _real_extract(self, url):
167230
entries[num]['formats'].extend(formats)
168231
continue
169232

170-
item_id = item.get('id') or item['assetId']
233+
item_id = str_or_none(item.get('id') or item['assetId'])
171234
title = item['title']
172235

173236
duration = float_or_none(item.get('duration'))
@@ -181,8 +244,6 @@ def _real_extract(self, url):
181244

182245
if playlist_len == 1:
183246
final_title = playlist_title or title
184-
if is_live:
185-
final_title = self._live_title(final_title)
186247
else:
187248
final_title = '%s (%s)' % (playlist_title, title)
188249

@@ -200,6 +261,8 @@ def _real_extract(self, url):
200261
for e in entries:
201262
self._sort_formats(e['formats'])
202263

264+
if len(entries) == 1:
265+
return entries[0]
203266
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
204267

205268
def _get_subtitles(self, episode_id, subs):
@@ -236,54 +299,3 @@ def _fix_subtitle(subtitle):
236299
yield line
237300

238301
return '\r\n'.join(_fix_subtitle(subtitles))
239-
240-
241-
class CeskaTelevizePoradyIE(InfoExtractor):
242-
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
243-
_TESTS = [{
244-
# video with 18+ caution trailer
245-
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
246-
'info_dict': {
247-
'id': '215562210900007-bogotart',
248-
'title': 'Queer: Bogotart',
249-
'description': 'Alternativní průvodce současným queer světem',
250-
},
251-
'playlist': [{
252-
'info_dict': {
253-
'id': '61924494876844842',
254-
'ext': 'mp4',
255-
'title': 'Queer: Bogotart (Varování 18+)',
256-
'duration': 10.2,
257-
},
258-
}, {
259-
'info_dict': {
260-
'id': '61924494877068022',
261-
'ext': 'mp4',
262-
'title': 'Queer: Bogotart (Queer)',
263-
'thumbnail': r're:^https?://.*\.jpg',
264-
'duration': 1558.3,
265-
},
266-
}],
267-
'params': {
268-
# m3u8 download
269-
'skip_download': True,
270-
},
271-
}, {
272-
# iframe embed
273-
'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
274-
'only_matching': True,
275-
}]
276-
277-
def _real_extract(self, url):
278-
video_id = self._match_id(url)
279-
280-
webpage = self._download_webpage(url, video_id)
281-
282-
data_url = update_url_query(unescapeHTML(self._search_regex(
283-
(r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
284-
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'),
285-
webpage, 'iframe player url', group='url')), query={
286-
'autoStart': 'true',
287-
})
288-
289-
return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())

youtube_dl/extractor/extractors.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -208,10 +208,7 @@
208208
from .ccma import CCMAIE
209209
from .cctv import CCTVIE
210210
from .cda import CDAIE
211-
from .ceskatelevize import (
212-
CeskaTelevizeIE,
213-
CeskaTelevizePoradyIE,
214-
)
211+
from .ceskatelevize import CeskaTelevizeIE
215212
from .channel9 import Channel9IE
216213
from .charlierose import CharlieRoseIE
217214
from .chaturbate import ChaturbateIE

0 commit comments

Comments
 (0)