diff --git a/cyberdrop_dl/crawlers/_one_manager.py b/cyberdrop_dl/crawlers/_one_manager.py index 30f441332..22ae7f5b9 100644 --- a/cyberdrop_dl/crawlers/_one_manager.py +++ b/cyberdrop_dl/crawlers/_one_manager.py @@ -21,17 +21,13 @@ from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL, ScrapeItem -class Selectors: +class Selector: TABLE = "table#list-table" FILE_LINK = "a.download" FOLDER_LINK = "a[name='folderlist']" FILE = f"tr:has({FILE_LINK})" FOLDER = f"tr:has({FOLDER_LINK})" DATE = "td.updated_at" - README = "div#head.markdown-body" - - -_SELECTORS = Selectors() class OneManagerCrawler(Crawler, is_abc=True): @@ -41,51 +37,49 @@ class OneManagerCrawler(Crawler, is_abc=True): async def fetch(self, scrape_item: ScrapeItem) -> None: scrape_item.url = scrape_item.url.with_query(None) if self.PRIMARY_URL not in scrape_item.parent_threads: - self.init_item(scrape_item) - await self.process_path(scrape_item) + self._init_item(scrape_item) + await self._path(scrape_item) async def __async_post_init__(self) -> None: self.manager.client_manager.download_slots.update({self.DOMAIN: 2}) @error_handling_wrapper - async def process_path(self, scrape_item: ScrapeItem) -> None: + async def _path(self, scrape_item: ScrapeItem) -> None: try: soup = await self.request_soup(scrape_item.url) - except InvalidContentTypeError: # This is a file, not html + except InvalidContentTypeError: # This is a file, not HTML scrape_item.parent_title = scrape_item.parent_title.rsplit("/", 1)[0] link = scrape_item.url scrape_item.url = link.parent - return await self._process_file(scrape_item, link) - - # TODO: save readme as a sidecard - if soup.select_one(_SELECTORS.README): - pass + return await self._file(scrape_item, link) # href are not actual links, they only have the name of the new part - table = css.select(soup, _SELECTORS.TABLE) - for file in css.iselect(table, _SELECTORS.FILE): - await self.process_file(scrape_item, file) + table = css.select(soup, Selector.TABLE) + + for file in css.iselect(table, Selector.FILE): + await self.file(scrape_item, file) scrape_item.add_children() - for folder in css.iselect(table, _SELECTORS.FOLDER): - link = scrape_item.url / css.select(folder, _SELECTORS.FOLDER_LINK, "href") - new_scrape_item = scrape_item.create_child(link, new_title_part=link.name) - self.create_task(self.run(new_scrape_item)) + for folder in css.iselect(table, Selector.FOLDER): + link = scrape_item.url / css.select(folder, Selector.FOLDER_LINK, "href") + new_item = scrape_item.create_child(link) + new_item.add_to_parent_title(link.name) + self.create_task(self.run(new_item)) scrape_item.add_children() @error_handling_wrapper - async def process_file(self, scrape_item: ScrapeItem, file: Tag) -> None: - datetime = self.parse_date(css.select_text(file, _SELECTORS.DATE)) - link = scrape_item.url / css.select(file, _SELECTORS.FILE_LINK, "href") - await self._process_file(scrape_item, link, datetime) + async def file(self, scrape_item: ScrapeItem, file: Tag) -> None: + datetime = self.parse_iso_date(css.select_text(file, Selector.DATE)) + link = scrape_item.url / css.select(file, Selector.FILE_LINK, "href") + await self._file(scrape_item, link, datetime) - async def _process_file(self, scrape_item: ScrapeItem, link: AbsoluteHttpURL, datetime: int | None = None) -> None: + async def _file(self, scrape_item: ScrapeItem, link: AbsoluteHttpURL, uploaded_at: int | None = None) -> None: preview_url = link.with_query("preview") # The query param needs to be `?preview` exactly, with no value or `=` - 
new_scrape_item = scrape_item.create_child(preview_url, possible_datetime=datetime) - filename, ext = self.get_filename_and_ext(link.name) - await self.handle_file(link, new_scrape_item, filename, ext) + new_item = scrape_item.create_child(preview_url) + new_item.uploaded_at = uploaded_at + await self.direct_file(new_item, link) - def init_item(self, scrape_item: ScrapeItem) -> None: + def _init_item(self, scrape_item: ScrapeItem) -> None: scrape_item.setup_as_album(self.FOLDER_DOMAIN, album_id=self.DOMAIN) for part in scrape_item.url.parts[1:]: scrape_item.add_to_parent_title(part) diff --git a/cyberdrop_dl/crawlers/dirtyship.py b/cyberdrop_dl/crawlers/dirtyship.py index 4bddcb89f..5e7c5dac7 100644 --- a/cyberdrop_dl/crawlers/dirtyship.py +++ b/cyberdrop_dl/crawlers/dirtyship.py @@ -1,178 +1,88 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, ClassVar, NamedTuple +from typing import TYPE_CHECKING, ClassVar from cyberdrop_dl.crawlers.crawler import Crawler, SupportedPaths +from cyberdrop_dl.data_structures.mediaprops import Resolution from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL -from cyberdrop_dl.exceptions import ScrapeError from cyberdrop_dl.utils import css -from cyberdrop_dl.utils.utilities import error_handling_wrapper +from cyberdrop_dl.utils.utilities import error_handling_wrapper, parse_url if TYPE_CHECKING: + from collections.abc import Generator + from bs4 import BeautifulSoup from cyberdrop_dl.data_structures.url_objects import ScrapeItem -class Selectors: +class Selector: VIDEO = "video#fp-video-0 > source" - FLOWPLAYER_VIDEO = "div.freedomplayer" + FLOWPLAYER = ".freedomplayer" PLAYLIST_ITEM = "li.thumi > a" - GALLERY_TITLE = "div#album p[style='text-align: center;']" - GALLERY_ALTERNATIVE_TITLE = "h1.singletitle" - GALLERY_THUMBNAILS = "div.gallery_grid img.gallery-img" - GALLERY_ALTERNATIVE_THUMBNAILS = "div#gallery-1 img" - GALLERY_DECODING_ASYNC = "div#album img[decoding='async']" - SINGLE_PHOTO = "div.resolutions a" - - -_SELECTORS = Selectors() - - -class Format(NamedTuple): - resolution: int | None - url: AbsoluteHttpURL - - -PRIMARY_URL = AbsoluteHttpURL("https://dirtyship.com") class DirtyShipCrawler(Crawler): SUPPORTED_PATHS: ClassVar[SupportedPaths] = { - "Category": "/category/...", - "Tag": "/tag/...", - "Video": "/", - "Gallery": "/gallery/...", - "Photo": "/gallery/.../...", + "Category": "/category/", + "Tag": "/tag/", + "Video": "/", } - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://dirtyship.com") NEXT_PAGE_SELECTOR: ClassVar[str] = "a.page-next" DOMAIN: ClassVar[str] = "dirtyship" FOLDER_DOMAIN: ClassVar[str] = "DirtyShip" async def fetch(self, scrape_item: ScrapeItem) -> None: - if any(p in scrape_item.url.parts for p in ("tag", "category")): - return await self.playlist(scrape_item) - if "gallery" in scrape_item.url.parts: - if len(scrape_item.url.parts) >= 4: - return await self.photo(scrape_item) - else: - return await self.gallery(scrape_item) - return await self.video(scrape_item) - - @error_handling_wrapper - async def photo(self, scrape_item: ScrapeItem) -> None: - if await self.check_complete_from_referer(scrape_item): - return - if not scrape_item.url.suffix == ".jpg": - soup = await self.request_soup(scrape_item.url) - url = self.parse_url( - next(css.attr(a, "href") for a in soup.select(_SELECTORS.SINGLE_PHOTO) if "full" in a.get_text()) - ) - else: - url = scrape_item.url - filename, ext = 
self.get_filename_and_ext(url.name) - await self.handle_file(url, scrape_item, filename, ext) + match scrape_item.url.parts[1:]: + case ["tag" | "category" as type_, _]: + return await self.playlist(scrape_item, type_) + case [_]: + return await self.video(scrape_item) + case _: + raise ValueError @error_handling_wrapper - async def gallery(self, scrape_item: ScrapeItem) -> None: + async def playlist(self, scrape_item: ScrapeItem, type_: str) -> None: title: str = "" async for soup in self.web_pager(scrape_item.url): if not title: - title_tag = soup.select_one(_SELECTORS.GALLERY_TITLE) or soup.select_one( - _SELECTORS.GALLERY_ALTERNATIVE_TITLE - ) - assert title_tag - title: str = title_tag.get_text(strip=True) - title = self.create_title(title) + name = css.select_text(soup, "title").split("Archives", 1)[0] + title = self.create_title(f"{name} [{type_}]") scrape_item.setup_as_album(title) - thumbnails = ( - soup.select(_SELECTORS.GALLERY_THUMBNAILS) - or soup.select(_SELECTORS.GALLERY_ALTERNATIVE_THUMBNAILS) - or soup.select(_SELECTORS.GALLERY_DECODING_ASYNC) - ) - - for img in thumbnails: - url = ( - css.attr(img, "src") - if img.get("decoding") == "async" - else get_highest_resolution_picture(css.attr(img, "srcset")) - ) - if not url: - raise ScrapeError(404) - url = self.parse_url(url) - filename, ext = self.get_filename_and_ext(url.name) - await self.handle_file(url, scrape_item, filename, ext) - - @error_handling_wrapper - async def playlist(self, scrape_item: ScrapeItem) -> None: - title: str = "" - async for soup in self.web_pager(scrape_item.url): - if not title: - title: str = css.select_text(soup, "title") - title = title.split("Archives - DirtyShip")[0] - title = self.create_title(title) - scrape_item.setup_as_album(title) - - for _, new_scrape_item in self.iter_children(scrape_item, soup, _SELECTORS.PLAYLIST_ITEM): + for _, new_scrape_item in self.iter_children(scrape_item, soup, Selector.PLAYLIST_ITEM): self.create_task(self.run(new_scrape_item)) @error_handling_wrapper async def video(self, scrape_item: ScrapeItem) -> None: soup = await self.request_soup(scrape_item.url) - - title: str = css.select_text(soup, "title") - title = title.split(" - DirtyShip")[0] - videos = soup.select(_SELECTORS.VIDEO) - - def get_formats(): - for video in videos: - link_str: str = css.attr(video, "src") - if link_str.startswith("type="): - continue - res: str = css.attr(video, "title") - link = self.parse_url(link_str) - yield (Format(int(res), link)) - - formats = set(get_formats()) - if not formats: - formats = self.get_flowplayer_sources(soup) - if not formats: - raise ScrapeError(422, message="No video source found") - - res, link = sorted(formats)[-1] - filename, ext = self.get_filename_and_ext(link.name) - custom_filename = self.create_custom_filename(title, ext, resolution=res) - await self.handle_file(link, scrape_item, filename, ext, custom_filename=custom_filename) - - def get_flowplayer_sources(self, soup: BeautifulSoup) -> set[Format]: - flow_player = soup.select_one(_SELECTORS.FLOWPLAYER_VIDEO) - data_item: str | None = css.attr_or_none(flow_player, "data-item") if flow_player else None - if not data_item: - return set() - data_item = data_item.replace(r"\/", "/") - json_data = json.loads(data_item) - sources = json_data["sources"] - return {Format(None, self.parse_url(s["src"])) for s in sources} - - -"""~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" - - -def get_highest_resolution_picture(srcset: str) -> str | 
None: - """ - Parses a srcset string and returns the URL with the highest resolution (width). - """ - candidates = [] - for item in srcset.split(","): - parts = item.strip().split() - if len(parts) == 2: - url, width = parts - try: - width = int(width.rstrip("w")) - candidates.append((width, url)) - except ValueError: - continue - return max(candidates)[1] if candidates else None + props = css.json_ld(soup)["@graph"] + article: dict[str, str] = next(prop for prop in props if prop["@type"] == "Article") + title = css.unescape(article["headline"]) + _preview = next(prop["contentUrl"] for prop in props if prop["@type"] == "ImageObject") + scrape_item.uploaded_at = self.parse_iso_date(article["datePublished"]) + + try: + resolution, src = max(_parse_flowplayer_sources(soup)) + except css.SelectorError: + resolution, src = max(_parse_html5_formats(soup)) + + filename, ext = self.get_filename_and_ext(src.name) + custom_filename = self.create_custom_filename(title, ext, resolution=resolution) + await self.handle_file(src, scrape_item, filename, ext, custom_filename=custom_filename) + + +def _parse_html5_formats(soup: BeautifulSoup) -> Generator[tuple[Resolution, AbsoluteHttpURL]]: + for video in css.iselect(soup, Selector.VIDEO): + res = Resolution.parse(css.attr(video, "title")) + link = parse_url(css.attr(video, "src")) + yield res, link + + +def _parse_flowplayer_sources(soup: BeautifulSoup) -> Generator[tuple[Resolution, AbsoluteHttpURL]]: + flow_player = css.select(soup, Selector.FLOWPLAYER, "data-item").replace(r"\/", "/") + source: dict[str, str] + for source in json.loads(flow_player)["sources"]: + yield Resolution.unknown(), parse_url(source["src"]) diff --git a/cyberdrop_dl/crawlers/doodstream.py b/cyberdrop_dl/crawlers/doodstream.py index 7bf7d40da..b2a399af8 100644 --- a/cyberdrop_dl/crawlers/doodstream.py +++ b/cyberdrop_dl/crawlers/doodstream.py @@ -1,8 +1,9 @@ from __future__ import annotations +import dataclasses import random import string -from datetime import UTC, datetime +import time from typing import TYPE_CHECKING, ClassVar from cyberdrop_dl.crawlers.crawler import Crawler, SupportedDomains, SupportedPaths @@ -16,79 +17,94 @@ from cyberdrop_dl.data_structures.url_objects import ScrapeItem -class Selectors: +class Selector: VIDEO = "div#video_player video" MD5_JS = "script:-soup-contains('/pass_md5/')" FILE_ID_JS = "script:-soup-contains('file_id')" -_SELECTORS = Selectors() -API_MD5_ENTRYPOINT = AbsoluteHttpURL("https://doodstream.com/pass_md5/") -TOKEN_CHARS = string.ascii_letters + string.digits - -PRIMARY_URL = AbsoluteHttpURL("https://doodstream.com/") +@dataclasses.dataclass(slots=True) +class Video: + id: str + title: str + dl_link: AbsoluteHttpURL class DoodStreamCrawler(Crawler): - SUPPORTED_PATHS: ClassVar[SupportedPaths] = {"Video": "/e/"} + SUPPORTED_PATHS: ClassVar[SupportedPaths] = { + "Video": "/e/", + } SUPPORTED_DOMAINS: ClassVar[SupportedDomains] = ( - "vidply.com", + "all3do.com", + "d000d.com", + "do7go.com", "dood.re", - "doodstream", + "dood.yt", "doodcdn", "doodstream.co", - "dood.yt", - "do7go.com", - "all3do.com", + "doodstream", + "myvidplay.com", + "playmogo.com", + "vidply.com", ) - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://doodstream.com/") UPDATE_UNSUPPORTED: ClassVar[bool] = True DOMAIN: ClassVar[str] = "doodstream" FOLDER_DOMAIN: ClassVar[str] = "DoodStream" async def fetch(self, scrape_item: ScrapeItem) -> None: - if "e" in scrape_item.url.parts: - return await 
self.video(scrape_item) - raise ValueError + match scrape_item.url.parts[1:]: + case ["e", _, *_]: + return await self.video(scrape_item) + case _: + raise ValueError @error_handling_wrapper async def video(self, scrape_item: ScrapeItem) -> None: - canonical_url = scrape_item.url.with_host("doodstream.com") + canonical_url = scrape_item.url.with_host(self.PRIMARY_URL.host) if await self.check_complete_from_referer(canonical_url): return - async with self.request(scrape_item.url, impersonate=True) as resp: - actual_host = resp.url.host - soup = await resp.soup() - - title = css.page_title(soup, "DoodStream") - file_id = _get_file_id(soup) - debrid_link = await self.get_download_url(actual_host, soup) - filename, ext = self.get_filename_and_ext(f"{file_id}.mp4") - custom_filename = self.create_custom_filename(title, ext, file_id=file_id) + video = await self._get_video_info(scrape_item.url) + filename, ext = self.get_filename_and_ext(f"{video.id}.mp4") + custom_filename = self.create_custom_filename(video.title, ext, file_id=video.id) scrape_item.url = canonical_url + await self.handle_file( - scrape_item.url, scrape_item, filename, ext, debrid_link=debrid_link, custom_filename=custom_filename + scrape_item.url, + scrape_item, + filename, + ext, + debrid_link=video.dl_link, + custom_filename=custom_filename, ) - async def get_download_url(self, host: str, soup: BeautifulSoup) -> AbsoluteHttpURL: - md5_path = _get_md5_path(soup) - api_url = API_MD5_ENTRYPOINT / md5_path - token = api_url.name + async def _get_video_info(self, url: AbsoluteHttpURL) -> Video: + async with self.request(url, impersonate=True) as resp: + soup = await resp.soup() - text = await self.request_text(api_url.with_host(host), impersonate=True) - random_padding = "".join(random.choice(TOKEN_CHARS) for _ in range(10)) - expire = int(datetime.now(UTC).timestamp() * 1000) - download_url = self.parse_url(text + random_padding) - return download_url.with_query(token=token, expiry=expire) + api_url = resp.url.origin() / "pass_md5" / _md5_pass(soup) + download_url = await self.request_text(api_url, impersonate=True) + return Video( + id=_file_id(soup), + title=css.page_title(soup, "DoodStream"), + dl_link=self.parse_url(download_url + _random_padding()).with_query( + token=api_url.name, + expiry=int(time.time() * 1000), + ), + ) -def _get_md5_path(soup: BeautifulSoup) -> str: - js_text = css.select_text(soup, _SELECTORS.MD5_JS) +def _md5_pass(soup: BeautifulSoup) -> str: + js_text = css.select_text(soup, Selector.MD5_JS) return get_text_between(js_text, "/pass_md5/", "'") -def _get_file_id(soup: BeautifulSoup) -> str: - js_text = css.select_text(soup, _SELECTORS.FILE_ID_JS) +def _file_id(soup: BeautifulSoup) -> str: + js_text = css.select_text(soup, Selector.FILE_ID_JS) _, file_id, _ = js_text.split("'file_id'")[-1].split("'", 2) return file_id + + +def _random_padding() -> str: + return "".join(random.choices(string.ascii_letters + string.digits, k=10)) diff --git a/cyberdrop_dl/crawlers/flugel_anime.py b/cyberdrop_dl/crawlers/flugel_anime.py deleted file mode 100644 index 1186ab92a..000000000 --- a/cyberdrop_dl/crawlers/flugel_anime.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -from typing import ClassVar - -from cyberdrop_dl.crawlers._one_manager import OneManagerCrawler -from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL - - -class FlugelAnimeCrawler(OneManagerCrawler): - DOMAIN: ClassVar[str] = "flugel-anime" - PRIMARY_URL: ClassVar = AbsoluteHttpURL("https://flugelanime.com") - 
FOLDER_DOMAIN: ClassVar = "Flugel-Anime" diff --git a/cyberdrop_dl/crawlers/fourchan.py b/cyberdrop_dl/crawlers/fourchan.py index 1584cba3b..3c59fa4f0 100644 --- a/cyberdrop_dl/crawlers/fourchan.py +++ b/cyberdrop_dl/crawlers/fourchan.py @@ -1,10 +1,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING, ClassVar, NotRequired, TypedDict, cast +from typing import TYPE_CHECKING, Any, ClassVar, NotRequired, TypedDict from bs4 import BeautifulSoup -from cyberdrop_dl.crawlers.crawler import Crawler, SupportedPaths +from cyberdrop_dl.crawlers.crawler import Crawler, RateLimit, SupportedPaths from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL from cyberdrop_dl.exceptions import ScrapeError from cyberdrop_dl.utils.utilities import error_handling_wrapper @@ -12,51 +12,44 @@ if TYPE_CHECKING: from cyberdrop_dl.data_structures.url_objects import ScrapeItem -API_ENTRYPOINT = AbsoluteHttpURL("https://a.4cdn.org/") -FILES_BASE_URL = AbsoluteHttpURL("https://i.4cdn.org/") -PRIMARY_URL = AbsoluteHttpURL("https://boards.4chan.org") +_API_ENTRYPOINT = AbsoluteHttpURL("https://a.4cdn.org/") +_FILES_BASE_URL = AbsoluteHttpURL("https://i.4cdn.org/") -class Post(TypedDict): - sub: NotRequired[str] # Subject - com: NotRequired[str] # Comment - time: int # Unix timestamp - - -class ImagePost(Post): +class ImagePost(TypedDict): filename: str # File stem ext: str tim: int # Unix timestamp + microtime of uploaded image - - -class Thread(TypedDict): - no: int # Original post ID - - -class ThreadList(TypedDict): - page: int - threads: list[Thread] + sub: NotRequired[str] # Subject + com: NotRequired[str] # Comment + time: int # Unix timestamp class FourChanCrawler(Crawler): - SUPPORTED_PATHS: ClassVar[SupportedPaths] = {"Board": "/", "Thread": "/thread"} - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL + SUPPORTED_PATHS: ClassVar[SupportedPaths] = { + "Board": "/", + "Thread": "//thread/", + } + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://boards.4chan.org") DOMAIN: ClassVar[str] = "4chan" _DOWNLOAD_SLOTS: ClassVar[int | None] = 1 - _RATE_LIMIT = 3, 10 + _RATE_LIMIT: ClassVar[RateLimit] = 3, 10 async def fetch(self, scrape_item: ScrapeItem) -> None: - if "thread" in scrape_item.url.parts: - return await self.thread(scrape_item) - elif len(scrape_item.url.parts) == 2: - return await self.board(scrape_item) - raise ValueError + match scrape_item.url.parts[1:]: + case [board, "thread", thread_id, *_]: + return await self.thread(scrape_item, board, thread_id) + case [board]: + return await self.board(scrape_item, board) + case [board, _]: + return await self.board(scrape_item, board) + case _: + raise ValueError @error_handling_wrapper - async def thread(self, scrape_item: ScrapeItem) -> None: - board, _, thread_id = scrape_item.url.parts[1:4] - api_url = API_ENTRYPOINT / board / f"thread/{thread_id}.json" - response: dict[str, list[Post]] = await self.request_json(api_url, cache_disabled=True) + async def thread(self, scrape_item: ScrapeItem, board: str, thread_id: str) -> None: + api_url = _API_ENTRYPOINT / board / f"thread/{thread_id}.json" + response: dict[str, list[ImagePost]] = await self.request_json(api_url) if not response: raise ScrapeError(404) @@ -67,34 +60,43 @@ async def thread(self, scrape_item: ScrapeItem) -> None: title = BeautifulSoup(comment).get_text(strip=True) else: title = f"#{thread_id}" + title = self.create_title(f"{title} [thread]", thread_id) scrape_item.setup_as_album(title, album_id=thread_id) results = await 
self.get_album_results(thread_id) for post in response["posts"]: - if file_stem := post.get("filename"): - post = cast("ImagePost", post) - file_micro_timestamp, ext = post["tim"], post["ext"] - url = FILES_BASE_URL / board / f"{file_micro_timestamp}{ext}" - if self.check_album_results(url, results): - continue - - custom_filename = self.create_custom_filename(file_stem, ext) - filename, _ = self.get_filename_and_ext(url.name) - new_scrape_item = scrape_item.copy() - new_scrape_item.uploaded_at = post["time"] - await self.handle_file(url, new_scrape_item, filename, ext, custom_filename=custom_filename) - scrape_item.add_children() + file_stem = post.get("filename") + if not file_stem: + continue + + file_micro_timestamp, ext = post["tim"], post["ext"] + url = _FILES_BASE_URL / board / f"{file_micro_timestamp}{ext}" + if self.check_album_results(url, results): + continue + + custom_filename = self.create_custom_filename(file_stem, ext) + filename, _ = self.get_filename_and_ext(url.name) + new_scrape_item = scrape_item.copy() + new_scrape_item.uploaded_at = post["time"] + await self.handle_file( + url, + new_scrape_item, + filename, + ext, + custom_filename=custom_filename, + metadata=post, + ) + scrape_item.add_children() @error_handling_wrapper - async def board(self, scrape_item: ScrapeItem) -> None: - board: str = scrape_item.url.parts[-1] - api_url = API_ENTRYPOINT / board / "threads.json" - threads: list[ThreadList] = await self.request_json(api_url, cache_disabled=True) + async def board(self, scrape_item: ScrapeItem, board: str) -> None: + api_url: AbsoluteHttpURL = _API_ENTRYPOINT / board / "threads.json" + threads: list[dict[str, Any]] = await self.request_json(api_url) scrape_item.setup_as_forum("") for page in threads: for thread in page["threads"]: - url = PRIMARY_URL / board / f"thread/{thread['no']}" + url = self.PRIMARY_URL / board / f"thread/{thread['no']}" new_scrape_item = scrape_item.create_child(url) self.create_task(self.run(new_scrape_item)) scrape_item.add_children() diff --git a/cyberdrop_dl/crawlers/noodle_magazine.py b/cyberdrop_dl/crawlers/noodle_magazine.py index 351bce8a4..3449a7965 100644 --- a/cyberdrop_dl/crawlers/noodle_magazine.py +++ b/cyberdrop_dl/crawlers/noodle_magazine.py @@ -1,101 +1,131 @@ from __future__ import annotations +import dataclasses import itertools import json -from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple +from typing import TYPE_CHECKING, ClassVar -from cyberdrop_dl.crawlers.crawler import Crawler, SupportedPaths +from cyberdrop_dl.crawlers.crawler import Crawler, RateLimit, SupportedPaths from cyberdrop_dl.data_structures.mediaprops import Resolution from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL from cyberdrop_dl.exceptions import ScrapeError from cyberdrop_dl.utils import css -from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_text_between +from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_text_between, parse_url if TYPE_CHECKING: + from collections.abc import Generator + + from bs4 import BeautifulSoup + from cyberdrop_dl.data_structures.url_objects import ScrapeItem -PLAYLIST_SELECTOR = "script:-soup-contains('window.playlist')" -METADATA_SELECTOR = "script[type='application/ld+json']" -SEARCH_STRING_SELECTOR = "div.mh_line > h1.c_title" -VIDEOS_SELECTOR = "div#list_videos a.item_link" +class Selector: + PLAYLIST = "script:-soup-contains('window.playlist')" + VIDEOS = "div#list_videos a.item_link" -class Source(NamedTuple): - resolution: Resolution - file: str 
+@dataclasses.dataclass(slots=True) +class Video: + id: str + title: str + uploaded_at: str - @staticmethod - def new(source_dict: dict[str, Any]) -> Source: - resolution = Resolution.parse(source_dict["label"]) - return Source(resolution, source_dict["file"]) + resolution: Resolution + content_url: AbsoluteHttpURL + src: AbsoluteHttpURL -PRIMARY_URL = AbsoluteHttpURL("https://noodlemagazine.com") +_VIDEO_PER_PAGE = 24 class NoodleMagazineCrawler(Crawler): - SUPPORTED_PATHS: ClassVar[SupportedPaths] = {"Search": "/video/"} - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL + SUPPORTED_PATHS: ClassVar[SupportedPaths] = { + "Search": "/video/", + "Video": "/watch/", + } + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://noodlemagazine.com") DOMAIN: ClassVar[str] = "noodlemagazine" FOLDER_DOMAIN: ClassVar[str] = "NoodleMagazine" + _DOWNLOAD_SLOTS: ClassVar[int | None] = 2 - _RATE_LIMIT = 1, 3 + _RATE_LIMIT: ClassVar[RateLimit] = 1, 3 + _IMPERSONATE: ClassVar[str | bool | None] = True async def fetch(self, scrape_item: ScrapeItem) -> None: - if "video" in scrape_item.url.parts: - return await self.search(scrape_item) - elif "watch" in scrape_item.url.parts: - return await self.video(scrape_item) - raise ValueError + match scrape_item.url.parts[1:]: + case ["watch", _]: + return await self.video(scrape_item) + case ["video", query]: + return await self.search(scrape_item, query) + case _: + raise ValueError @error_handling_wrapper - async def search(self, scrape_item: ScrapeItem) -> None: - title: str = "" + async def search(self, scrape_item: ScrapeItem, query: str) -> None: + scrape_item.setup_as_album(self.create_title(f"{query} [search]")) init_page = int(scrape_item.url.query.get("p") or 1) seen_urls: set[AbsoluteHttpURL] = set() + for page in itertools.count(1, init_page): n_videos = 0 page_url = scrape_item.url.with_query(p=page) - soup = await self.request_soup(page_url, impersonate=True) - - if not title: - search_string: str = css.select_text(soup, SEARCH_STRING_SELECTOR) - title = search_string.rsplit(" videos", 1)[0] - title = self.create_title(f"{title} [search]") - scrape_item.setup_as_album(title) + soup = await self.request_soup(page_url) - for _, new_scrape_item in self.iter_children(scrape_item, soup, VIDEOS_SELECTOR): + for _, new_scrape_item in self.iter_children(scrape_item, soup, Selector.VIDEOS): if new_scrape_item.url not in seen_urls: seen_urls.add(new_scrape_item.url) n_videos += 1 self.create_task(self.run(new_scrape_item)) - if n_videos < 24: + if n_videos < _VIDEO_PER_PAGE: break @error_handling_wrapper async def video(self, scrape_item: ScrapeItem) -> None: if await self.check_complete_from_referer(scrape_item.url): return - soup = await self.request_soup(scrape_item.url, impersonate=True) - - metadata_text = css.select(soup, METADATA_SELECTOR).get_text() - metadata = json.loads(metadata_text.strip()) - playlist = soup.select_one(PLAYLIST_SELECTOR) - if not playlist: - raise ScrapeError(404) - - playlist_data = json.loads(get_text_between(playlist.text, "window.playlist = ", ";\nwindow.ads")) - best_source = max(Source.new(source) for source in playlist_data["sources"]) - title: str = css.select(soup, "title").get_text().split(" watch online")[0] - - scrape_item.uploaded_at = self.parse_iso_date(metadata["uploadDate"]) - content_url = self.parse_url(metadata["contentUrl"]) - filename, ext = self.get_filename_and_ext(content_url.name) - video_id = filename.removesuffix(ext) - custom_filename = self.create_custom_filename(title, ext, 
file_id=video_id, resolution=best_source.resolution) - src_url = self.parse_url(best_source.file) + + soup = await self.request_soup(scrape_item.url) + video = _parse_video(soup) + + scrape_item.uploaded_at = self.parse_iso_date(video.uploaded_at) + _, ext = self.get_filename_and_ext(filename=video.content_url.name) + filename = self.create_custom_filename(video.title, ext, file_id=video.id, resolution=video.resolution) await self.handle_file( - content_url, scrape_item, filename, ext, custom_filename=custom_filename, debrid_link=src_url + video.content_url, + scrape_item, + video.title, + ext, + custom_filename=filename, + debrid_link=video.src, ) + + +def _parse_video(soup: BeautifulSoup) -> Video: + json_ld = css.json_ld(soup) + + try: + playlist_js = css.select_text(soup, Selector.PLAYLIST) + except css.SelectorError: + raise ScrapeError(404) from None + + playlist = json.loads(get_text_between(playlist_js, "window.playlist = ", ";\nwindow.ads")) + + resolution, src = max(_parse_sources(playlist["sources"])) + content_url = parse_url(json_ld["contentUrl"]) + + return Video( + title=json_ld["name"], + uploaded_at=json_ld["uploadDate"], + content_url=content_url, + id=content_url.name.removesuffix(content_url.suffix), + resolution=resolution, + src=src, + ) + + +def _parse_sources(sources: list[dict[str, str]]) -> Generator[tuple[Resolution, AbsoluteHttpURL]]: + for source in sources: + resolution = Resolution.parse(source["label"]) + yield resolution, parse_url(source["file"]) diff --git a/cyberdrop_dl/crawlers/onedrive.py b/cyberdrop_dl/crawlers/onedrive.py index 9cddc1cbb..927bb5011 100644 --- a/cyberdrop_dl/crawlers/onedrive.py +++ b/cyberdrop_dl/crawlers/onedrive.py @@ -5,7 +5,7 @@ from __future__ import annotations from dataclasses import dataclass -from datetime import UTC, datetime, timedelta +from datetime import datetime from functools import partial from typing import TYPE_CHECKING, Any, ClassVar, Self @@ -102,19 +102,7 @@ class OneDriveCrawler(Crawler): FOLDER_DOMAIN: ClassVar[str] = "OneDrive" def __post_init__(self) -> None: - badger_token: str = "" - badger_token_expires: str = "" - self.auth_headers = {} - expired = True - if badger_token_expires: - if badger_token_expires.endswith("Z"): - badger_token_expires = badger_token_expires.replace("Z", "+00:00") - expire_datetime = datetime.fromisoformat(badger_token_expires) - t_delta = expire_datetime - datetime.now(UTC) - if t_delta > timedelta(hours=12): - expired = False - if badger_token and not expired: - self.auth_headers = {"Prefer": "autoredeem", "Authorization": f"Badger {badger_token}"} + self.auth_headers: dict[str, str] = {} async def __async_post_init__(self) -> None: if self.auth_headers: diff --git a/cyberdrop_dl/crawlers/safe_soul.py b/cyberdrop_dl/crawlers/safe_soul.py deleted file mode 100644 index bfcef92ed..000000000 --- a/cyberdrop_dl/crawlers/safe_soul.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import annotations - -import datetime -from typing import TYPE_CHECKING, ClassVar - -from cyberdrop_dl.crawlers._chibisafe import Album, ChibiSafeCrawler, File -from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL -from cyberdrop_dl.utils import css -from cyberdrop_dl.utils.utilities import error_handling_wrapper - -if TYPE_CHECKING: - import bs4 - - from cyberdrop_dl.data_structures.url_objects import ScrapeItem - - -class Selector: - ALBUM_TITLE = "#title" - FILE = "#table .image-container" - FILE_DATE = ".details .file-date" - FILE_NAME = ".details .name" - FILE_URL = "a.image" - - -class 
SafeSoulCrawler(ChibiSafeCrawler): - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://safe.soul.lol") - DOMAIN: ClassVar[str] = "safe.soul.lol" - FOLDER_DOMAIN: ClassVar[str] = "Safe.Soul" - - async def file(self, scrape_item: ScrapeItem, file_id: str) -> None: - # file endpoint is disabled - return await self.direct_file(scrape_item) - - @error_handling_wrapper - async def album(self, scrape_item: ScrapeItem, album_id: str) -> None: - # The album endpoint is enabled, but it returns incomplete info (ex: missing date) - # So we scrape the HTML - - soup = await self.request_soup(scrape_item.url) - album = Album( - id=album_id, - name=css.select_text(soup, Selector.ALBUM_TITLE), - files=[_parse_file(tag) for tag in soup.select(Selector.FILE)], - ) - return await self._handle_album(scrape_item, album) - - -def _parse_file(file_tag: bs4.Tag) -> File: - timestamp = int(css.select(file_tag, Selector.FILE_DATE, "data-value")) - - return File( - name=css.select_text(file_tag, Selector.FILE_NAME), - url=css.select(file_tag, Selector.FILE_URL, "href"), - createdAt=datetime.datetime.fromtimestamp(timestamp), - ) diff --git a/cyberdrop_dl/crawlers/tokyomotion.py b/cyberdrop_dl/crawlers/tokyomotion.py index 0e0c848f0..ff1e75283 100644 --- a/cyberdrop_dl/crawlers/tokyomotion.py +++ b/cyberdrop_dl/crawlers/tokyomotion.py @@ -2,33 +2,30 @@ from typing import TYPE_CHECKING, ClassVar -from cyberdrop_dl.crawlers.crawler import Crawler, SupportedPaths +from cyberdrop_dl import signature +from cyberdrop_dl.crawlers.crawler import Crawler, SupportedPaths, auto_task_id from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL from cyberdrop_dl.exceptions import ScrapeError -from cyberdrop_dl.utils import css -from cyberdrop_dl.utils.utilities import error_handling_wrapper, remove_parts +from cyberdrop_dl.utils import css, open_graph +from cyberdrop_dl.utils.utilities import error_handling_wrapper if TYPE_CHECKING: - from cyberdrop_dl.data_structures.url_objects import ScrapeItem + from collections.abc import AsyncIterator + + from bs4 import BeautifulSoup -PRIMARY_URL = AbsoluteHttpURL("https://www.tokyomotion.net") + from cyberdrop_dl.data_structures.url_objects import ScrapeItem -class Selectors: +class Selector: ALBUM = 'a[href^="/album/"]' - IMAGE = "img[class='img-responsive-mw']" - IMAGE_THUMB = "div[id*='_photo_'] img[id^='album_photo_']" - VIDEO_DIV = "div[id*='video_']" - VIDEO = 'a[href^="/video/"]' - SEARCH_DIV = "div[class^='well']" - VIDEO_DATE = "div.pull-right.big-views-xs.visible-xs > span.text-white" - ALBUM_TITLE = "div.panel.panel-default > div.panel-heading > div.pull-left" - VIDEO_SRC_SD = 'source[title="SD"]' - VIDEO_SRC_HD = 'source[title="HD"]' - VIDEO_SRC = f"{VIDEO_SRC_HD}, {VIDEO_SRC_SD}" + ALBUM_NAME = ".panel-heading > .pull-left" + IMAGE = "img.img-responsive-mw" + THUMBNAIL = "img[id^='album_photo_']" -_SELECTORS = Selectors() + VIDEO = "a[href^='/video/']" + VIDEO_SRC = "source[title='HD'], source[title='SD']" class TokioMotionCrawler(Crawler): @@ -46,173 +43,140 @@ class TokioMotionCrawler(Crawler): "Search Results": "/search?...", "Video": "/video/", } - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://www.tokyomotion.net") NEXT_PAGE_SELECTOR: ClassVar[str] = "a.prevnext" DOMAIN: ClassVar[str] = "tokyomotion" async def fetch(self, scrape_item: ScrapeItem) -> None: scrape_item.url = scrape_item.url.without_query_params("page") - if "video" in scrape_item.url.parts: - return await 
self.video(scrape_item) - if "videos" in scrape_item.url.parts: - return await self.playlist(scrape_item) - if "photo" in scrape_item.url.parts: - return await self.image(scrape_item) - if any(part in scrape_item.url.parts for part in ("album", "photos")): - return await self.album(scrape_item) - if "albums" in scrape_item.url.parts: - return await self.albums(scrape_item) - if "user" in scrape_item.url.parts: - return await self.profile(scrape_item) - if "search" in scrape_item.url.parts and scrape_item.url.query.get("search_type") != "users": - return await self.search(scrape_item) - raise ValueError + match scrape_item.url.parts[1:]: + case ["video", video_id, *_]: + return await self.video(scrape_item, video_id) + case ["photo", _]: + return await self.photo(scrape_item) + case ["album", album_id, *_]: + return await self.album(scrape_item, album_id) + case ["user", user, *_]: + title = self.create_title(f"{user} [user]") + scrape_item.setup_as_profile(title) + return await self.profile(scrape_item) + case ["search"] if ( + (query := scrape_item.url.query.get("search_query")) + and (query_type := scrape_item.url.query.get("search_type")) + and query_type != "users" + ): + return await self.search(scrape_item, query, query_type) + case _: + raise ValueError @error_handling_wrapper - async def video(self, scrape_item: ScrapeItem) -> None: - if await self.check_complete_from_referer(scrape_item): + async def video(self, scrape_item: ScrapeItem, video_id: str) -> None: + if await self.check_complete_from_referer(scrape_item.url): return - canonical_url = scrape_item.url.with_path("/".join(scrape_item.url.parts[1:3])) - scrape_item.url = canonical_url - if await self.check_complete_from_referer(canonical_url): + scrape_item.url = scrape_item.url.with_path(f"video/{video_id}") + if await self.check_complete_from_referer(scrape_item.url): return - video_id = scrape_item.url.parts[2] soup = await self.request_soup(scrape_item.url) - - src = soup.select_one(_SELECTORS.VIDEO_SRC) - if not src: - if "This is a private" in soup.text: - raise ScrapeError(401, "Private video") - raise ScrapeError(422, "Couldn't find video source") - - scrape_item.uploaded_at = self.parse_date(css.select_text(soup, _SELECTORS.VIDEO_DATE)) - link_str = css.attr(src, "src") - link = self.parse_url(link_str) - title = css.select_text(soup, "title").rsplit(" - TOKYO Motion")[0].strip() - filename, ext = f"{video_id}.mp4", ".mp4" - custom_filename = self.create_custom_filename(title, ext, file_id=video_id) - await self.handle_file(link, scrape_item, filename, ext, custom_filename=custom_filename) + _check_private(soup) + src = css.select(soup, Selector.VIDEO_SRC, "src") + link = self.parse_url(src) + title = open_graph.title(soup) + filename = self.create_custom_filename(title, ext := ".mp4", file_id=video_id) + await self.handle_file(link, scrape_item, video_id + ext, ext, custom_filename=filename) @error_handling_wrapper - async def image(self, scrape_item: ScrapeItem) -> None: + async def photo(self, scrape_item: ScrapeItem) -> None: if await self.check_complete_from_referer(scrape_item): return soup = await self.request_soup(scrape_item.url) - - img = soup.select_one(_SELECTORS.IMAGE) - if not img: - if "This is a private" in soup.text: - raise ScrapeError(401, "Private Photo") - raise ScrapeError(422, "Couldn't find image source") - - link_str: str = css.attr(img, "src") - link = self.parse_url(link_str) - filename, ext = self.get_filename_and_ext(link.name) - await self.handle_file(link, scrape_item, filename, ext) 
+ _check_private(soup) + src = css.select(soup, Selector.IMAGE, "src") + await self.direct_file(scrape_item, self.parse_url(src)) @error_handling_wrapper - async def album(self, scrape_item: ScrapeItem) -> None: - title = await self.get_album_title(scrape_item) - if "user" in scrape_item.url.parts: - self.add_user_title(scrape_item) - - else: - canonical_url = scrape_item.url.with_path("/".join(scrape_item.url.parts[1:3])) - scrape_item.url = canonical_url - album_id = scrape_item.url.parts[2] - scrape_item.album_id = album_id - title = self.create_title(title, album_id) - - scrape_item.part_of_album = True - - if title not in scrape_item.parent_title: - scrape_item.add_to_parent_title(title) - if title == "favorite": - scrape_item.add_to_parent_title("photos") - - async for soup in self.web_pager(scrape_item.url): - if "This is a private" in soup.text: - raise ScrapeError(401, "Private album") - for _, link in self.iter_tags(soup, _SELECTORS.IMAGE_THUMB, "src"): - link = remove_parts(link, "tmb") - filename, ext = self.get_filename_and_ext(link.name) - await self.handle_file(link, scrape_item, filename, ext) - - """------------------------------------------------------------------------------------------------------------------------""" - - @error_handling_wrapper - async def albums(self, scrape_item: ScrapeItem) -> None: - self.add_user_title(scrape_item) - async for soup in self.web_pager(scrape_item.url): - for _, new_scrape_item in self.iter_children(scrape_item, soup, _SELECTORS.ALBUM, new_title_part="albums"): - await self.album(new_scrape_item) + async def album(self, scrape_item: ScrapeItem, album_id: str) -> None: + soup = await self.request_soup(scrape_item.url) + _check_private(soup) + title = css.select_text(soup, Selector.ALBUM_NAME) + scrape_item.setup_as_album(self.create_title(title, album_id), album_id=album_id) + + while True: + self._iter_album_images(scrape_item, soup) + try: + next_page = css.select(soup, self.NEXT_PAGE_SELECTOR, "href") + except css.SelectorError: + break + soup = await self.request_soup(self.parse_url(next_page)) + + def _iter_album_images(self, scrape_item: ScrapeItem, soup: BeautifulSoup) -> None: + for link in css.iselect(soup, Selector.THUMBNAIL, "src"): + src = self.parse_url(link.replace("/tmb/", "/")) + self.create_task(self.direct_file(scrape_item, src)) @error_handling_wrapper async def profile(self, scrape_item: ScrapeItem) -> None: - self.add_user_title(scrape_item) - new_parts = ["albums", "favorite/photos", "videos", "favorite/videos"] - scrapers = [self.albums, self.album, self.playlist, self.playlist] - for part, scraper in zip(new_parts, scrapers, strict=False): - link = scrape_item.url / part - new_scrape_item = scrape_item.create_child(link) - await scraper(new_scrape_item) + match scrape_item.url.parts[3:]: + case ["favorite", "videos"]: + scrape_item.setup_as_album("favorite") + scrape_item.add_to_parent_title("videos") + return await self.crawl_children(scrape_item, Selector.VIDEO) + + case ["videos"]: + scrape_item.setup_as_album("videos") + return await self.crawl_children(scrape_item, Selector.VIDEO) + + case ["favorite", "photos"]: + scrape_item.setup_as_album("favorite") + scrape_item.add_to_parent_title("photos") + async for soup in self.web_pager(scrape_item.url): + self._iter_album_images(scrape_item, soup) + + case ["photos"]: + scrape_item.setup_as_album("photos") + async for soup in self.web_pager(scrape_item.url): + self._iter_album_images(scrape_item, soup) + + case ["albums"]: + scrape_item.setup_as_album("albums") 
+                return await self.crawl_children(scrape_item, Selector.ALBUM)
+
+            case []:
+                for path in ("albums", "favorite/photos", "videos", "favorite/videos"):
+                    new_item = scrape_item.create_child(scrape_item.url / path)
+                    self.create_task(self._profile_page(new_item))
+                scrape_item.add_children()
+            case _:
+                raise ScrapeError("Unknown URL path")
+
+    _profile_page = auto_task_id(profile)
 
     @error_handling_wrapper
-    async def search(self, scrape_item: ScrapeItem) -> None:
-        search_type = scrape_item.url.query.get("search_type")
-        search_query = scrape_item.url.query.get("search_query")
-        search_title = f"{search_query} [{search_type} search]"
-        is_album = search_type == "photos"
-        if not scrape_item.parent_title:
-            search_title = self.create_title(search_title)
-
-        scrape_item.setup_as_album(search_title)
-        selector = f"{_SELECTORS.SEARCH_DIV} "
-        selector += _SELECTORS.ALBUM if is_album else _SELECTORS.VIDEO
-        scraper = self.album if is_album else self.video
-
-        async for soup in self.web_pager(scrape_item.url):
-            for _, new_scrape_item in self.iter_children(scrape_item, soup, selector):
-                await scraper(new_scrape_item)
+    async def search(self, scrape_item: ScrapeItem, query: str, query_type: str) -> None:
+        title = f"{query} [{query_type} search]"
+        scrape_item.setup_as_album(self.create_title(title))
+        selector = Selector.ALBUM if query_type == "photos" else Selector.VIDEO
+        return await self.crawl_children(scrape_item, selector)
 
     @error_handling_wrapper
-    async def playlist(self, scrape_item: ScrapeItem) -> None:
-        self.add_user_title(scrape_item)
-        if "favorite" in scrape_item.url.parts:
-            scrape_item.add_to_parent_title("favorite")
-
-        scrape_item.setup_as_album("videos")
-        selector = f"{_SELECTORS.VIDEO_DIV} {_SELECTORS.VIDEO}"
-
+    async def crawl_children(self, scrape_item: ScrapeItem, selector: str) -> None:
         async for soup in self.web_pager(scrape_item.url):
-            if "This is a private" in soup.text:
-                raise ScrapeError(401, "Private playlist")
-            for _, new_scrape_item in self.iter_children(scrape_item, soup, selector):
-                await self.video(new_scrape_item)
-
-    """--------------------------------------------------------------------------------------------------------------------------"""
-
-    async def get_album_title(self, scrape_item: ScrapeItem) -> str:
-        if "favorite" in scrape_item.url.parts:
-            return "favorite"
-        if "album" in scrape_item.url.parts and len(scrape_item.url.parts) > 3:
-            return scrape_item.url.parts[3]
-        soup = await self.request_soup(scrape_item.url)
-        return css.select_text(soup, _SELECTORS.ALBUM_TITLE)
-
-    def add_user_title(self, scrape_item: ScrapeItem) -> None:
-        try:
-            user_index = scrape_item.url.parts.index("user")
-            user = scrape_item.url.parts[user_index + 1]
-        except ValueError:
-            return
-        user_title = f"{user} [user]"
-        full_user_title = self.create_title(user_title)
-        if not scrape_item.parent_title:
-            scrape_item.add_to_parent_title(full_user_title)
-        if user_title not in scrape_item.parent_title:
-            scrape_item.add_to_parent_title(user_title)
+            for _, new_item in self.iter_children(scrape_item, soup, selector):
+                self.create_task(self.run(new_item))
+
+    @signature.copy(Crawler.web_pager)
+    async def web_pager(self, url: AbsoluteHttpURL, *args, **kwargs) -> AsyncIterator[BeautifulSoup]:
+        # Check only the first page for the private-content banner, then stream the rest unchanged
+        is_first_page: bool = True
+        async for soup in super().web_pager(url, *args, **kwargs):
+            if is_first_page:
+                _check_private(soup)
+                is_first_page = False
+            yield soup
+
+
+def _check_private(soup: BeautifulSoup) -> None:
+    if "This is a private" in soup.get_text():
+        raise ScrapeError(401, "Private 
- Requires being friends with the owner") diff --git a/cyberdrop_dl/crawlers/transflix.py b/cyberdrop_dl/crawlers/transflix.py index a14841103..de80170e6 100644 --- a/cyberdrop_dl/crawlers/transflix.py +++ b/cyberdrop_dl/crawlers/transflix.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING, ClassVar -from cyberdrop_dl.crawlers.crawler import Crawler, SupportedPaths +from cyberdrop_dl.crawlers.crawler import Crawler, RateLimit, SupportedPaths from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL from cyberdrop_dl.utils import css, open_graph from cyberdrop_dl.utils.utilities import error_handling_wrapper @@ -11,45 +11,44 @@ if TYPE_CHECKING: from cyberdrop_dl.data_structures.url_objects import ScrapeItem +_UNIX_TIMESTAMP_LENGTH: int = 10 -class Selectors: + +class Selector: VIDEO = "video#player > source" - SEARCH_VIDEOS = "div.list-videos div.item > a" + SEARCH_RESULTS = "div.list-videos div.item > a" NEXT_PAGE = "li.next > a" -_SELECTORS = Selectors() -PRIMARY_URL = AbsoluteHttpURL("https://transflix.net") - - class TransflixCrawler(Crawler): SUPPORTED_PATHS: ClassVar[SupportedPaths] = { "Video": "/video/-", - "Search": "/search/?q=...", + "Search": "/search/?q=", } DOMAIN: ClassVar[str] = "transflix" FOLDER_DOMAIN: ClassVar[str] = "TransFlix" - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL - NEXT_PAGE_SELECTOR: ClassVar[str] = _SELECTORS.NEXT_PAGE - _RATE_LIMIT = 3, 10 + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://transflix.net") + NEXT_PAGE_SELECTOR: ClassVar[str] = Selector.NEXT_PAGE + _RATE_LIMIT: ClassVar[RateLimit] = 3, 2 async def fetch(self, scrape_item: ScrapeItem) -> None: - if "video" in scrape_item.url.parts: - return await self.video(scrape_item) - elif "search" in scrape_item.url.parts and (query := scrape_item.url.query.get("q")): - return await self.search(scrape_item, query) - raise ValueError + match scrape_item.url.parts[1:]: + case ["video", slug] if video_id := slug.rsplit("-", 1)[-1]: + return await self.video(scrape_item, video_id) + case ["search"] if query := scrape_item.url.query.get("q"): + return await self.search(scrape_item, query) + case _: + raise ValueError @error_handling_wrapper - async def video(self, scrape_item: ScrapeItem) -> None: - video_id: str = scrape_item.url.parts[-1].split("-")[-1] - if await self.check_complete_from_referer(scrape_item): + async def video(self, scrape_item: ScrapeItem, video_id: str) -> None: + if await self.check_complete_from_referer(scrape_item.url): return soup = await self.request_soup(scrape_item.url) title = open_graph.title(soup) - video = css.select(soup, _SELECTORS.VIDEO) - link = self.parse_url(css.attr(video, "src")) + video = css.select(soup, Selector.VIDEO, "src") + link = self.parse_url(video) filename, ext = self.get_filename_and_ext(link.name) scrape_item.uploaded_at = _timestamp_from_filename(link.name) custom_filename = self.create_custom_filename(title, ext, file_id=video_id) @@ -61,15 +60,16 @@ async def search(self, scrape_item: ScrapeItem, query: str) -> None: title = self.create_title(f"Search - {query}") scrape_item.setup_as_album(title) - async for soup in self.web_pager(scrape_item.url, _SELECTORS.NEXT_PAGE): - for _, new_scrape_item in self.iter_children(scrape_item, soup, _SELECTORS.SEARCH_VIDEOS): + async for soup in self.web_pager(scrape_item.url, Selector.NEXT_PAGE): + for _, new_scrape_item in self.iter_children(scrape_item, soup, Selector.SEARCH_RESULTS): self.create_task(self.run(new_scrape_item)) def 
_timestamp_from_filename(filename: str) -> int | None: - UNIX_TIMESTAMP_LENGTH: int = 10 stem = Path(filename).stem - if len(stem) >= UNIX_TIMESTAMP_LENGTH: - possible_timestamp = stem[-UNIX_TIMESTAMP_LENGTH:] - if possible_timestamp.isdecimal(): + if len(stem) >= _UNIX_TIMESTAMP_LENGTH: + possible_timestamp = stem[-_UNIX_TIMESTAMP_LENGTH:] + try: return int(possible_timestamp) + except ValueError: + return diff --git a/cyberdrop_dl/crawlers/vipr_dot_im.py b/cyberdrop_dl/crawlers/vipr_dot_im.py index 6192cd2ba..b06b2b181 100644 --- a/cyberdrop_dl/crawlers/vipr_dot_im.py +++ b/cyberdrop_dl/crawlers/vipr_dot_im.py @@ -12,22 +12,31 @@ from cyberdrop_dl.data_structures.url_objects import ScrapeItem -PRIMARY_URL = AbsoluteHttpURL("https://vipr.im") -IMG_SELECTOR = "div#body a > img" - - class ViprImCrawler(Crawler): - SUPPORTED_PATHS: ClassVar[SupportedPaths] = {"Image": "/...", "Thumbnail": "/th/..."} - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL + SUPPORTED_PATHS: ClassVar[SupportedPaths] = { + "Image": "/", + "Direct Image": "/i/.../", + "Thumbnail": "/th/.../", + } + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://vipr.im") DOMAIN: ClassVar[str] = "vipr.im" FOLDER_DOMAIN: ClassVar[str] = "Vipr.im" async def fetch(self, scrape_item: ScrapeItem) -> None: - if "th" in scrape_item.url.parts: - return await self.thumbnail(scrape_item) - if len(scrape_item.url.parts) == 2: - return await self.image(scrape_item) - raise ValueError + match scrape_item.url.parts[1:]: + case [_]: + return await self.image(scrape_item) + case _: + raise ValueError + + @classmethod + def transform_url(cls, url: AbsoluteHttpURL) -> AbsoluteHttpURL: + url = super().transform_url(url) + match url.parts[1:]: + case ["th" | "i", _, slug, *_]: + return cls.PRIMARY_URL / Path(slug).stem + case _: + return url @error_handling_wrapper async def image(self, scrape_item: ScrapeItem) -> None: @@ -35,19 +44,5 @@ async def image(self, scrape_item: ScrapeItem) -> None: return soup = await self.request_soup(scrape_item.url) - - link_str: str = css.select(soup, IMG_SELECTOR, "src") - link = self.parse_url(link_str) - filename, ext = self.get_filename_and_ext(link.name, assume_ext=".jpg") - await self.handle_file(link, scrape_item, filename, ext) - - async def thumbnail(self, scrape_item: ScrapeItem) -> None: - scrape_item.url = self.get_canonical_url(scrape_item.url) - self.create_task(self.run(scrape_item)) - - def get_canonical_url(self, url: AbsoluteHttpURL) -> AbsoluteHttpURL: - return PRIMARY_URL / get_image_id(url) - - -def get_image_id(url: AbsoluteHttpURL) -> str: - return Path(url.name).stem + link_str: str = css.select(soup, "div#body a > img", "src") + await self.direct_file(scrape_item, self.parse_url(link_str)) diff --git a/cyberdrop_dl/crawlers/xbunkr.py b/cyberdrop_dl/crawlers/xbunkr.py deleted file mode 100644 index c1c19fc7a..000000000 --- a/cyberdrop_dl/crawlers/xbunkr.py +++ /dev/null @@ -1,45 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, ClassVar - -from cyberdrop_dl.crawlers.crawler import Crawler, SupportedPaths -from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL -from cyberdrop_dl.utils import css -from cyberdrop_dl.utils.utilities import error_handling_wrapper - -if TYPE_CHECKING: - from cyberdrop_dl.data_structures.url_objects import ScrapeItem - - -IMAGE_SELECTOR = "a[class=image]" -TITLE_SELECTOR = "h1#title" -PRIMARY_URL = AbsoluteHttpURL("https://xbunkr.com") - - -class XBunkrCrawler(Crawler): - SUPPORTED_PATHS: 
ClassVar[SupportedPaths] = {"Albums": "/a/...", "Direct links": ""}
-    PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL
-    DOMAIN: ClassVar[str] = "xbunkr"
-    FOLDER_DOMAIN: ClassVar[str] = "XBunkr"
-
-    async def fetch(self, scrape_item: ScrapeItem) -> None:
-        if "media" in scrape_item.url.host:
-            await self.file(scrape_item)
-        return await self.album(scrape_item)
-
-    @error_handling_wrapper
-    async def album(self, scrape_item: ScrapeItem) -> None:
-        soup = await self.request_soup(scrape_item.url)
-
-        album_id = scrape_item.url.parts[2]
-        title = self.create_title(css.select_text(soup, TITLE_SELECTOR), album_id)
-        scrape_item.setup_as_album(title, album_id=album_id)
-
-        for _, link in self.iter_tags(soup, IMAGE_SELECTOR):
-            filename, ext = self.get_filename_and_ext(link.name, assume_ext=".jpg")
-            await self.handle_file(link, scrape_item, filename, ext)
-
-    @error_handling_wrapper
-    async def file(self, scrape_item: ScrapeItem) -> None:
-        filename, ext = self.get_filename_and_ext(scrape_item.url.name)
-        await self.handle_file(scrape_item.url, scrape_item, filename, ext)
diff --git a/cyberdrop_dl/crawlers/xvideos.py b/cyberdrop_dl/crawlers/xvideos.py
index 2998ed47a..b50d37dba 100644
--- a/cyberdrop_dl/crawlers/xvideos.py
+++ b/cyberdrop_dl/crawlers/xvideos.py
@@ -88,17 +88,19 @@ def __post_init__(self) -> None:
         self._seen_domains: set[str] = set()
 
     async def fetch(self, scrape_item: ScrapeItem) -> None:
-        if ".red" not in scrape_item.url.host:
-            match scrape_item.url.parts[1:]:
-                case [part, _] if part.startswith("video"):
-                    return await self.video(scrape_item)
-                case [_ as part, _] if part in _EXTENDED_ACCOUNTS:
-                    return await self.account(scrape_item)
-                case [_ as part, _, "photos" | "post", gallery_id, *_] if part in _EXTENDED_ACCOUNTS:
-                    return await self.gallery(scrape_item, gallery_id)
-                case [_ as part] if part not in _EXTENDED_ACCOUNTS:  # channel
-                    return await self.account(scrape_item)
-        raise ValueError
+        if scrape_item.url.host.endswith(".red"):
+            raise ValueError
+        match scrape_item.url.parts[1:]:
+            case [part, _] if part.startswith("video"):
+                return await self.video(scrape_item)
+            case [_ as part, _] if part in _EXTENDED_ACCOUNTS:
+                return await self.account(scrape_item)
+            case [_ as part, _, "photos" | "post", gallery_id, *_] if part in _EXTENDED_ACCOUNTS:
+                return await self.gallery(scrape_item, gallery_id)
+            case [_ as part] if part not in _EXTENDED_ACCOUNTS:  # channel
+                return await self.account(scrape_item)
+            case _:
+                raise ValueError
 
     @error_handling_wrapper
     async def video(self, scrape_item: ScrapeItem) -> None:
diff --git a/tests/crawlers/test_cases/4chan.py b/tests/crawlers/test_cases/4chan.py
new file mode 100644
index 000000000..7d02343d4
--- /dev/null
+++ b/tests/crawlers/test_cases/4chan.py
@@ -0,0 +1,199 @@
+DOMAIN = "4chan"
+TEST_CASES = [
+    (
+        "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+        [
+            {
+                "url": "https://i.4cdn.org/wg/1772229579282188.jpg",
+                "filename": "1721679422587858.jpg",
+                "debrid_link": None,
+                "original_filename": "1772229579282188.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772229579,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1772229618623720.jpg",
+                "filename": "1721679462267124.jpg",
+                "debrid_link": None,
+                "original_filename": "1772229618623720.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772229618,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1772229845208678.jpg",
+                "filename": "1679317543266345.jpg",
+                "debrid_link": None,
+                "original_filename": "1772229845208678.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772229845,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1772230012496094.jpg",
+                "filename": "1582653914932.jpg",
+                "debrid_link": None,
+                "original_filename": "1772230012496094.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772230012,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1772580813916586.jpg",
+                "filename": "pexels-jodaarba-3254036.jpg",
+                "debrid_link": None,
+                "original_filename": "1772580813916586.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772580813,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1772741190457608.jpg",
+                "filename": "2150044197.jpg",
+                "debrid_link": None,
+                "original_filename": "1772741190457608.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772741190,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1772839481404900.jpg",
+                "filename": "1653775158127.jpg",
+                "debrid_link": None,
+                "original_filename": "1772839481404900.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772839481,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1772839528758378.jpg",
+                "filename": "François-Antoine Bossuet - Granada.jpg",
+                "debrid_link": None,
+                "original_filename": "1772839528758378.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1772839528,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1773205209056503.jpg",
+                "filename": "1501864637261.jpg",
+                "debrid_link": None,
+                "original_filename": "1773205209056503.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1773205209,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1774044270225055.png",
+                "filename": "Screenshot_20250605-042403.png",
+                "debrid_link": None,
+                "original_filename": "1774044270225055.png",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1774044270,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1774994218477765.png",
+                "filename": "Screenshot_20250615-094019.png",
+                "debrid_link": None,
+                "original_filename": "1774994218477765.png",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1774994218,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775075417678932.jpg",
+                "filename": "dc821bc71370fdbc2737f79ba7620ed5.jpg",
+                "debrid_link": None,
+                "original_filename": "1775075417678932.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775075417,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775821908934617.jpg",
+                "filename": "La_Fuite_en_Egypte,_par_Ziani,_huile_sur_toile,_2015.jpg",
+                "debrid_link": None,
+                "original_filename": "1775821908934617.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775821908,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775821975421436.png",
+                "filename": "1664768119485462.png",
+                "debrid_link": None,
+                "original_filename": "1775821975421436.png",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775821975,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775822014155017.jpg",
+                "filename": "Nasir Al-Mulk Mosque, Iran.jpg",
+                "debrid_link": None,
+                "original_filename": "1775822014155017.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775822014,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775822046035196.jpg",
+                "filename": "somewhere in isfahan.jpg",
+                "debrid_link": None,
+                "original_filename": "1775822046035196.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775822046,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775822102289319.jpg",
+                "filename": "Welcome-to-Jerusalem.jpg",
+                "debrid_link": None,
+                "original_filename": "1775822102289319.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775822102,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775822165948596.jpg",
+                "filename": "akbar-nemati-aldc3QDhE28-unsplash.jpg",
+                "debrid_link": None,
+                "original_filename": "1775822165948596.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775822165,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+            {
+                "url": "https://i.4cdn.org/wg/1775822211664472.jpg",
+                "filename": "Nasir Al-Mulk Mosque, Iran(2).jpg",
+                "debrid_link": None,
+                "original_filename": "1775822211664472.jpg",
+                "referer": "https://boards.4chan.org/wg/thread/8131148/muslim-islamic-persian",
+                "album_id": "8131148",
+                "uploaded_at": 1775822211,
+                "download_folder": "re:Muslim- Islamic- Persian [thread] (4chan)",
+            },
+        ],
+        19,
+    )
+]
diff --git a/tests/crawlers/test_cases/dirtyship.py b/tests/crawlers/test_cases/dirtyship.py
new file mode 100644
index 000000000..7aab451af
--- /dev/null
+++ b/tests/crawlers/test_cases/dirtyship.py
@@ -0,0 +1,101 @@
+DOMAIN = "dirtyship"
+TEST_CASES = [
+    (
+        "https://dirtyship.com/rose-asmr-intimate-ear-licks-and-kisses-patreon-video-leaked",
+        [
+            {
+                "url": "https://cdn11.dirtyship.net/Roseasmrearlickis.mp4",
+                "filename": "Rose ASMR Intimate Ear licks And Kisses Patreon Video Leaked.mp4",
+                "debrid_link": None,
+                "original_filename": "Roseasmrearlickis.mp4",
+                "referer": "https://dirtyship.com/rose-asmr-intimate-ear-licks-and-kisses-patreon-video-leaked",
+                "album_id": None,
+                "uploaded_at": 1736980488,
+                "download_folder": "re:Loose Files (DirtyShip)",
+            }
+        ],
+        1,
+    ),
+    (
+        "https://dirtyship.com/aftynrose-cuddly-naughty-lingerie-try-on-haul-asmr-video",
+        [
+            {
+                "url": "https://cdn6.dirtyship.net/cdndirtyship/aftynroselingerietryonhaulv.mp4",
+                "filename": "AftynRose Cuddly & Naughty Lingerie Try On Haul ASMR Video [1080p].mp4",
+                "debrid_link": None,
+                "original_filename": "aftynroselingerietryonhaulv.mp4",
+                "referer": "https://dirtyship.com/aftynrose-cuddly-naughty-lingerie-try-on-haul-asmr-video",
+                "album_id": None,
+                "uploaded_at": 1612180561,
+                "download_folder": "re:Loose Files (DirtyShip)",
+            }
+        ],
+        1,
+    ),
+    (
+        "https://dirtyship.com/tag/aftynrose-2021",
+        [
+            {
+                "url": "https://cdn10.dirtyship.net/dirtyship/cdn2/aftynroseasmrrelaxtonighv.mp4",
+                "filename": "AftynRose ASMR Relax in My Lap Tonight Video [1080p].mp4",
+                "debrid_link": None,
+                "original_filename": "aftynroseasmrrelaxtonighv.mp4",
+                "referer": "https://dirtyship.com/aftynrose-asmr-relax-in-my-lap-tonight-video",
+                "album_id": None,
+                "uploaded_at": 1616896081,
+                "download_folder": "re:AftynRose 2021 [tag] (DirtyShip)",
+            },
+            {
+                "url": "https://cdn7.dirtyship.net/cdn2/aftyrosesexynightv.mp4",
+                "filename": "AftynRose ASMR Sleepy Sexy Night With Me Video [1080p].mp4",
+                "debrid_link": None,
+                "original_filename": "aftyrosesexynightv.mp4",
+                "referer": "https://dirtyship.com/aftynrose-asmr-sleepy-sexy-night-with-me-video-1",
+                "album_id": None,
+                "uploaded_at": 1613311594,
+                "download_folder": "re:AftynRose 2021 [tag] (DirtyShip)",
+            },
+            {
+                "url": "https://cdn7.dirtyship.net/cdn2/AftynRoseUndressingv.mp4",
+                "filename": "AftynRose ASMR Slowly Undressing For Bed Video [1080p].mp4",
+                "debrid_link": None,
+                "original_filename": "AftynRoseUndressingv.mp4",
+                "referer": "https://dirtyship.com/aftynrose-asmr-slowly-undressing-for-bed-video-a",
+                "album_id": None,
+                "uploaded_at": 1616124852,
+                "download_folder": "re:AftynRose 2021 [tag] (DirtyShip)",
+            },
+            {
+                "url": "https://cdn7.dirtyship.net/cdn2/aftynrosewetshirtv.mp4",
+                "filename": "AftynRose ASMR Wet T-Shirt Cheeky Spray Video [1080p].mp4",
+                "debrid_link": None,
+                "original_filename": "aftynrosewetshirtv.mp4",
+                "referer": "https://dirtyship.com/aftynrose-asmr-wet-t-shirt-cheeky-spray-video-1",
+                "album_id": None,
+                "uploaded_at": 1616636602,
+                "download_folder": "re:AftynRose 2021 [tag] (DirtyShip)",
+            },
+            {
+                "url": "https://cdn6.dirtyship.net/cdndirtyship/aftynrosefunwithtonguev.mp4",
+                "filename": "AftynRose ASMR Fun With The Tongue Video [1080p].mp4",
+                "debrid_link": None,
+                "original_filename": "aftynrosefunwithtonguev.mp4",
+                "referer": "https://dirtyship.com/aftynrose-asmr-fun-with-the-tongue-video",
+                "album_id": None,
+                "uploaded_at": 1615256752,
+                "download_folder": "re:AftynRose 2021 [tag] (DirtyShip)",
+            },
+            {
+                "url": "https://cdn7.dirtyship.net/cdn2/Aftynroseballsv.mp4",
+                "filename": "AftynRose ASMR Bulmas Quest For More Balls Video [1080p].mp4",
+                "debrid_link": None,
+                "original_filename": "Aftynroseballsv.mp4",
+                "referer": "https://dirtyship.com/aftynrose-asmr-bulmas-quest-for-more-balls-video-1",
+                "album_id": None,
+                "uploaded_at": 1612309588,
+                "download_folder": "re:AftynRose 2021 [tag] (DirtyShip)",
+            },
+        ],
+        6,
+    ),
+]
diff --git a/tests/crawlers/test_cases/doodstream.py b/tests/crawlers/test_cases/doodstream.py
new file mode 100644
index 000000000..66ef3b0d2
--- /dev/null
+++ b/tests/crawlers/test_cases/doodstream.py
@@ -0,0 +1,29 @@
+DOMAIN = "doodstream"
+TEST_CASES = [
+    (
+        "https://doodstream.com/e/l1ebnruggzly",
+        [
+            {
+                "url": "https://doodstream.com/e/l1ebnruggzly",
+                "filename": "The Darwin Incident S01e03 Heterosis 1080P pl [242416709].mp4",
+                "debrid_link": "NOT_NONE",
+                "original_filename": "242416709.mp4",
+                "referer": "https://doodstream.com/e/l1ebnruggzly",
+                "album_id": None,
+                "uploaded_at": None,
+                "download_folder": "re:Loose Files (DoodStream)",
+            }
+        ],
+    ),
+    (
+        "https://playmogo.com/e/l1ebnruggzly",
+        [
+            {
+                "url": "https://doodstream.com/e/l1ebnruggzly",
+                "filename": "The Darwin Incident S01e03 Heterosis 1080P pl [242416709].mp4",
+                "referer": "https://doodstream.com/e/l1ebnruggzly",
+                "download_folder": "re:Loose Files (DoodStream)",
+            }
+        ],
+    ),
+]
diff --git a/tests/crawlers/test_cases/noodlemagazine.py b/tests/crawlers/test_cases/noodlemagazine.py
index 6c7c9651f..c0bc85e4f 100644
--- a/tests/crawlers/test_cases/noodlemagazine.py
+++ b/tests/crawlers/test_cases/noodlemagazine.py
@@ -7,8 +7,24 @@
                 "url": "https://noodlemagazine.com/videofile/-161131426_456241895.mp4",
                 "filename": "Goddess brianna the christmas story [-161131426_456241895][720p].mp4",
                 "referer": "https://noodlemagazine.com/watch/-161131426_456241895",
-                "datetime": 1587524400,
+                "uploaded_at": 1587531600,
             },
         ],
     ),
+    (
+        "https://noodlemagazine.com/watch/-146082307_456239302",
+        [
+            {
+                "url": "https://noodlemagazine.com/videofile/-146082307_456239302.mp4",
+                "filename": "Sp azumi mizushima,asian red black nylon jav,japanese dance st [-146082307_456239302][480p].mp4",
+                "debrid_link": "NOT_NONE",
+                "original_filename": "Sp azumi mizushima,asian red black nylon jav,japanese dance strip on the six,гимнастка доводит до оргазма,solo dildo fetish",
+                "referer": "https://noodlemagazine.com/watch/-146082307_456239302",
+                "album_id": None,
+                "uploaded_at": 1574312400,
+                "download_folder": "re:Loose Files (NoodleMagazine)",
+            }
+        ],
+        1,
+    ),
 ]
diff --git a/tests/crawlers/test_cases/tokyomotion.py b/tests/crawlers/test_cases/tokyomotion.py
new file mode 100644
index 000000000..9f5d71c41
--- /dev/null
+++ b/tests/crawlers/test_cases/tokyomotion.py
@@ -0,0 +1,370 @@
+DOMAIN = "tokyomotion"
+TEST_CASES = [
+    (
+        "https://www.tokyomotion.net/album/68827",
+        [
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895409.jpg",
+                "filename": "895409.jpg",
+                "debrid_link": None,
+                "original_filename": "895409.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895410.jpg",
+                "filename": "895410.jpg",
+                "debrid_link": None,
+                "original_filename": "895410.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895411.jpg",
+                "filename": "895411.jpg",
+                "debrid_link": None,
+                "original_filename": "895411.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895412.jpg",
+                "filename": "895412.jpg",
+                "debrid_link": None,
+                "original_filename": "895412.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895413.jpg",
+                "filename": "895413.jpg",
+                "debrid_link": None,
+                "original_filename": "895413.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895414.jpg",
+                "filename": "895414.jpg",
+                "debrid_link": None,
+                "original_filename": "895414.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895415.jpg",
+                "filename": "895415.jpg",
+                "debrid_link": None,
+                "original_filename": "895415.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895416.jpg",
+                "filename": "895416.jpg",
+                "debrid_link": None,
+                "original_filename": "895416.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895417.jpg",
+                "filename": "895417.jpg",
+                "debrid_link": None,
+                "original_filename": "895417.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895418.jpg",
+                "filename": "895418.jpg",
+                "debrid_link": None,
+                "original_filename": "895418.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895419.jpg",
+                "filename": "895419.jpg",
+                "debrid_link": None,
+                "original_filename": "895419.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895420.jpg",
+                "filename": "895420.jpg",
+                "debrid_link": None,
+                "original_filename": "895420.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895421.jpg",
+                "filename": "895421.jpg",
+                "debrid_link": None,
+                "original_filename": "895421.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895422.jpg",
+                "filename": "895422.jpg",
+                "debrid_link": None,
+                "original_filename": "895422.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895423.jpg",
+                "filename": "895423.jpg",
+                "debrid_link": None,
+                "original_filename": "895423.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895424.jpg",
+                "filename": "895424.jpg",
+                "debrid_link": None,
+                "original_filename": "895424.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895425.jpg",
+                "filename": "895425.jpg",
+                "debrid_link": None,
+                "original_filename": "895425.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895426.jpg",
+                "filename": "895426.jpg",
+                "debrid_link": None,
+                "original_filename": "895426.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895427.jpg",
+                "filename": "895427.jpg",
+                "debrid_link": None,
+                "original_filename": "895427.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895428.jpg",
+                "filename": "895428.jpg",
+                "debrid_link": None,
+                "original_filename": "895428.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895429.jpg",
+                "filename": "895429.jpg",
+                "debrid_link": None,
+                "original_filename": "895429.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895430.jpg",
+                "filename": "895430.jpg",
+                "debrid_link": None,
+                "original_filename": "895430.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+            {
+                "url": "https://cdn.tokyo-motion.net/media/photos/895431.jpg",
+                "filename": "895431.jpg",
+                "debrid_link": None,
+                "original_filename": "895431.jpg",
+                "referer": "https://www.tokyomotion.net/album/68827",
+                "album_id": "68827",
+                "uploaded_at": None,
+                "download_folder": "re:mikansu1_4 (Tokyomotion)",
+            },
+        ],
+        23,
+    ),
+    (
+        "https://www.tokyomotion.net/video/6347379",
+        [
+            {
+                "url": "https://www.tokyomotion.net/vsrc/sd/73ed951a49b29f184a98",
+                "filename": "レイナ 13-3 [6347379].mp4",
+                "debrid_link": None,
+                "original_filename": "6347379.mp4",
+                "referer": "https://www.tokyomotion.net/video/6347379",
+                "album_id": None,
+                "uploaded_at": None,
+                "download_folder": "re:Loose Files (Tokyomotion)",
+            }
+        ],
+        1,
+    ),
+    (
+        "https://www.tokyomotion.net/user/cocomincho/videos",
+        [
+            {
+                "url": "https://www.tokyomotion.net/vsrc/sd/73ed951a49b29f184a98",
+                "filename": "レイナ 13-3 [6347379].mp4",
+                "debrid_link": None,
+                "original_filename": "6347379.mp4",
+                "referer": "https://www.tokyomotion.net/video/6347379",
+                "album_id": None,
+                "uploaded_at": None,
+                "download_folder": "re:cocomincho [user] (Tokyomotion)/videos",
+            },
+            {
+                "url": "https://www.tokyomotion.net/vsrc/sd/ae2e1d5a43a5b51e4748",
[6347377].mp4", + "debrid_link": None, + "original_filename": "6347377.mp4", + "referer": "https://www.tokyomotion.net/video/6347377", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/videos", + }, + ], + 2, + ), + ( + "https://www.tokyomotion.net/user/cocomincho", + [ + { + "url": "https://www.tokyomotion.net/vsrc/sd/73ed951a49b29f184a98", + "filename": "レイナ 13-3 [6347379].mp4", + "debrid_link": None, + "original_filename": "6347379.mp4", + "referer": "https://www.tokyomotion.net/video/6347379", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/videos", + }, + { + "url": "https://www.tokyomotion.net/vsrc/sd/ae2e1d5a43a5b51e4748", + "filename": "レイナ 13-1 [6347377].mp4", + "debrid_link": None, + "original_filename": "6347377.mp4", + "referer": "https://www.tokyomotion.net/video/6347377", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/videos", + }, + { + "url": "https://www.tokyomotion.net/vsrc/sd/be73a49ae8a96a385e5a", + "filename": "3P [6075129].mp4", # noqa: RUF001 + "debrid_link": None, + "original_filename": "6075129.mp4", + "referer": "https://www.tokyomotion.net/video/6075129", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/favorite/videos", + }, + { + "url": "https://www.tokyomotion.net/vsrc/sd/7e4e57f3ceaaff4737b4", + "filename": "FC2PPV 4828978 [6028400].mp4", + "debrid_link": None, + "original_filename": "6028400.mp4", + "referer": "https://www.tokyomotion.net/video/6028400", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/favorite/videos", + }, + { + "url": "https://www.tokyomotion.net/vsrc/sd/38fdcb71df78655169af", + "filename": "LC [6010613].mp4", + "debrid_link": None, + "original_filename": "6010613.mp4", + "referer": "https://www.tokyomotion.net/video/6010613", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/favorite/videos", + }, + { + "url": "https://www.tokyomotion.net/vsrc/sd/86007db51c27ccbe0d87", + "filename": "LC_SEX配信 [5932940].mp4", + "debrid_link": None, + "original_filename": "5932940.mp4", + "referer": "https://www.tokyomotion.net/video/5932940", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/favorite/videos", + }, + { + "url": "https://www.tokyomotion.net/vsrc/sd/e9815ae3efe16dffdcbc", + "filename": "FC2-PPV-4799828 【衝撃映像】坂道系Gカップ美女の寄り道極秘 [5841705].mp4", + "debrid_link": None, + "original_filename": "5841705.mp4", + "referer": "https://www.tokyomotion.net/video/5841705", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:cocomincho [user] (Tokyomotion)/favorite/videos", + }, + ], + 7, + ), + ( + "https://www.tokyomotion.net/photo/895409/", + [ + { + "url": "https://cdn.tokyo-motion.net/media/photos/895409.jpg", + "filename": "895409.jpg", + "referer": "https://www.tokyomotion.net/photo/895409/", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:Loose Files (Tokyomotion)", + }, + ], + ), +] diff --git a/tests/crawlers/test_cases/transflix.py b/tests/crawlers/test_cases/transflix.py index e7bf6e486..0832bbb05 100644 --- a/tests/crawlers/test_cases/transflix.py +++ b/tests/crawlers/test_cases/transflix.py @@ -7,14 +7,14 @@ "url": "https://cdn.transflix.net/video/2025-10-20/1760979682.mp4", "filename": "Avery Lust Chanel Chance [48259].mp4", "referer": 
"https://transflix.net/video/avery-lust-chanel-chance-48259", - "datetime": 1760979682, + "uploaded_at": 1760979682, } ], ), ( "https://transflix.net/search?q=ruby+wren", [], - 4, + 6, ), # Timestamp mixed with leading letters ( @@ -24,7 +24,7 @@ "url": "https://cdn.transflix.net/video/2025-03-28/qa2cxgri1743182001.mp4", "filename": "Hunnypaint - Take Care - Tranny Videos Xxx [43343].mp4", "referer": "https://transflix.net/video/hunnypaint-take-care-tranny-videos-xxx-43343", - "datetime": 1743182001, + "uploaded_at": 1743182001, } ], ), diff --git a/tests/crawlers/test_cases/vipr_im.py b/tests/crawlers/test_cases/vipr_im.py index facdfbf23..223614199 100644 --- a/tests/crawlers/test_cases/vipr_im.py +++ b/tests/crawlers/test_cases/vipr_im.py @@ -33,4 +33,36 @@ }, ], ), + ( + "https://vipr.im/kcd5jcuhgs3v", + [ + { + "url": "https://i7.vipr.im/i/00021/kcd5jcuhgs3v.jpg/sommer01035.jpg", + "filename": "sommer01035.jpg", + "debrid_link": None, + "original_filename": "sommer01035.jpg", + "referer": "https://vipr.im/kcd5jcuhgs3v", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:Loose Files (Vipr.im)", + } + ], + 1, + ), + ( + "https://vipr.im/kcd5jcuhgs3v.html", + [ + { + "url": "https://i7.vipr.im/i/00021/kcd5jcuhgs3v.jpg/sommer01035.jpg", + "filename": "sommer01035.jpg", + "debrid_link": None, + "original_filename": "sommer01035.jpg", + "referer": "https://vipr.im/kcd5jcuhgs3v.html", + "album_id": None, + "uploaded_at": None, + "download_folder": "re:Loose Files (Vipr.im)", + } + ], + 1, + ), ] diff --git a/tests/crawlers/test_crawlers.py b/tests/crawlers/test_crawlers.py index dae1c7051..2ee11527f 100644 --- a/tests/crawlers/test_crawlers.py +++ b/tests/crawlers/test_crawlers.py @@ -67,6 +67,8 @@ def _load_test_cases(path: Path) -> None: assert module_spec and module_spec.loader module = importlib.util.module_from_spec(module_spec) module_spec.loader.exec_module(module) + if module.DOMAIN in _TEST_DATA: + raise RuntimeError(f"Multiple tests files for {module.DOMAIN}") _TEST_DATA[module.DOMAIN] = list(_fix_test_cases(module.TEST_CASES)) diff --git a/tests/test_supported_sites.py b/tests/test_supported_sites.py index e7102d4b7..4e8b796f0 100644 --- a/tests/test_supported_sites.py +++ b/tests/test_supported_sites.py @@ -3,4 +3,4 @@ def test_rich_table() -> None: table = get_crawlers_info_as_rich_table() - assert len(table.rows) >= 169 + assert len(table.rows) >= 166