-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- update dependencies - bypass rgpd form post - new proxy api for thumbnails
- Loading branch information
Showing
10 changed files
with
442 additions
and
299 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from typing import Dict, Iterator, Type, TypeVar | ||
|
||
from jsonpath_ng import parse | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
def json_iter(path: str, payload: Dict, cls: Type[T] | None = None) -> Iterator[T]:
    """Lazily yield the values that the JSONPath expression matches in a payload.

    Args:
        path: expression handed to ``jsonpath_ng.parse``.
        payload: decoded JSON document to search.
        cls: optional type filter; when given, only values that are
            instances of ``cls`` are yielded.

    Yields:
        Each matched value, in the order reported by ``find``. ``None``
        values are always skipped.
    """
    expression = parse(path)
    for found in expression.find(payload):
        value = found.value
        if value is None:
            # null matches are never reported, with or without a type filter
            continue
        if cls is not None and not isinstance(value, cls):
            continue
        yield value
|
||
|
||
def json_first(path: str, payload: Dict, cls: Type[T] | None = None) -> T:
    """Return the first value that ``json_iter`` yields for the given arguments.

    Args:
        path: expression forwarded to ``json_iter``.
        payload: decoded JSON document to search.
        cls: optional type filter forwarded to ``json_iter``.

    Raises:
        StopIteration: if ``json_iter`` yields nothing (``next`` is called
            without a default).
    """
    matches = json_iter(path, payload, cls=cls)
    return next(matches)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from asyncio import sleep | ||
from dataclasses import dataclass | ||
from typing import AsyncIterator, List | ||
|
||
from .client import YoutubeWebApi | ||
from .metadata import VideoData, VideoDescription, YoutubeWebPage | ||
|
||
|
||
@dataclass
class VideoScrapper:
    """Iterate, page by page, over the videos listed on a YouTube channel tab."""

    # client used to download the channel HTML page and to call the browse API
    youtube_api: YoutubeWebApi

    async def iter_videos(
        self, channel_id: str, *, shorts: bool = False, delay: float = 0
    ) -> AsyncIterator[List[VideoDescription]]:
        """Yield the channel's videos one page at a time.

        The first page is parsed from the channel's HTML. Each following page
        is requested through ``youtube_api.api_browse`` using the continuation
        token and click-tracking params of the previous page.

        Args:
            channel_id: identifier inserted into the ``/channel/<id>/`` URL.
            shorts: when true, read the ``shorts`` tab and select
                ``reelItemRenderer`` entries; otherwise read the ``videos``
                tab and select ``videoRenderer`` entries.
            delay: seconds to sleep before each follow-up request; values
                that are not positive disable the pause.

        Yields:
            The non-empty list of videos found on each page. Iteration stops
            at the first page with no videos or with no continuation token.

        Raises:
            AssertionError: if the first page has no client data or no
                initial video data.
        """
        # Both choices are constant for the whole iteration, so compute once.
        tab = "shorts" if shorts else "videos"
        selector = "reelItemRenderer" if shorts else "videoRenderer"

        resp = await self.youtube_api.get_html(
            f"https://www.youtube.com/channel/{channel_id}/{tab}"
        )
        first_page = YoutubeWebPage(resp)

        # Explicit checks instead of ``assert (x := ...)``: assert statements
        # are removed under ``python -O``, which would also drop the bindings
        # and leave these names undefined. AssertionError is kept so existing
        # handlers keep matching.
        client_data = first_page.find_client_data()
        if client_data is None:
            raise AssertionError("client data not found in channel page")
        video_data = first_page.find_initial_video_data()
        if video_data is None:
            raise AssertionError("initial video data not found in channel page")

        while True:
            videos = list(video_data.iter_videos(selector=selector))
            if not videos:
                # could not find any video
                break
            # yield all videos from the page
            yield videos
            if video_data.continuation_token is None:
                # no continuation token, stop
                break
            # get next page using json api
            if delay > 0:
                await sleep(delay)
            video_data = VideoData(
                await self.youtube_api.api_browse(
                    {
                        "context": {
                            "clickTracking": {
                                "clickTrackingParams": video_data.click_tracking_params
                            },
                            "client": client_data,
                        },
                        "continuation": video_data.continuation_token,
                    }
                )
            )
Oops, something went wrong.