From 68389c55625493f9e5c6e84df039513cfeea355a Mon Sep 17 00:00:00 2001 From: NTFSvolume <172021377+NTFSvolume@users.noreply.github.com> Date: Fri, 10 Apr 2026 08:09:36 -0500 Subject: [PATCH 1/4] fix: download URL extraction (MixDrop) --- cyberdrop_dl/crawlers/mixdrop.py | 81 ++++++++++++-------------------- cyberdrop_dl/utils/jsunpacker.py | 78 ++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 50 deletions(-) create mode 100644 cyberdrop_dl/utils/jsunpacker.py diff --git a/cyberdrop_dl/crawlers/mixdrop.py b/cyberdrop_dl/crawlers/mixdrop.py index c03db60a4..27744e2e1 100644 --- a/cyberdrop_dl/crawlers/mixdrop.py +++ b/cyberdrop_dl/crawlers/mixdrop.py @@ -1,29 +1,18 @@ from __future__ import annotations -from datetime import datetime, timedelta from typing import TYPE_CHECKING, ClassVar from cyberdrop_dl.crawlers.crawler import Crawler, SupportedDomains, SupportedPaths from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL -from cyberdrop_dl.utils import css -from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_text_between +from cyberdrop_dl.utils import css, jsunpacker +from cyberdrop_dl.utils.utilities import error_handling_wrapper if TYPE_CHECKING: - from bs4 import BeautifulSoup + from collections.abc import Generator from cyberdrop_dl.data_structures.url_objects import ScrapeItem -class Selectors: - JS = "script:-soup-contains('MDCore.ref')" - FILE_NAME = "div.tbl-c.title b" - - -_SELECTOR = Selectors() - -PRIMARY_URL = AbsoluteHttpURL("https://mixdrop.sb") - - class MixDropCrawler(Crawler): SUPPORTED_PATHS: ClassVar[SupportedPaths] = { "File": ( @@ -31,49 +20,41 @@ class MixDropCrawler(Crawler): "/f/", ) } - SUPPORTED_DOMAINS: ClassVar[SupportedDomains] = "mxdrop", "mixdrop" - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL + SUPPORTED_DOMAINS: ClassVar[SupportedDomains] = "mxdrop", "mixdrop", "m1xdrop" + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://mixdrop.sb") DOMAIN: ClassVar[str] = 
"mixdrop" FOLDER_DOMAIN: ClassVar[str] = "MixDrop" async def fetch(self, scrape_item: ScrapeItem) -> None: - if any(p in scrape_item.url.parts for p in ("f", "e")): - return await self.file(scrape_item) - raise ValueError + match scrape_item.url.parts[1:]: + case ["f" | "e", file_id]: + return await self.file(scrape_item, file_id) + case _: + raise ValueError @error_handling_wrapper - async def file(self, scrape_item: ScrapeItem) -> None: - file_id = scrape_item.url.name - video_url = PRIMARY_URL / "f" / file_id - embed_url = self.get_embed_url(video_url) + async def file(self, scrape_item: ScrapeItem, file_id: str) -> None: + video_url = MixDropCrawler.PRIMARY_URL / "f" / file_id - if await self.check_complete_from_referer(embed_url): + if await self.check_complete(video_url, video_url): return - scrape_item.url = embed_url + scrape_item.url = video_url soup = await self.request_soup(video_url) - - title = css.select_text(soup, _SELECTOR.FILE_NAME) - - soup = await self.request_soup(embed_url) - - link = self.create_download_link(soup) - filename, ext = self.get_filename_and_ext(link.name) - custom_filename = self.create_custom_filename(title, ext) - await self.handle_file(video_url, scrape_item, filename, ext, custom_filename=custom_filename, debrid_link=link) - - @staticmethod - def create_download_link(soup: BeautifulSoup) -> AbsoluteHttpURL: - # Defined as a method to simplify subclasses calls - js_text = css.select_text(soup, _SELECTOR.JS) - file_id = get_text_between(js_text, "|v2||", "|") - parts = get_text_between(js_text, "MDCore||", "|thumbs").split("|") - secure_key = get_text_between(js_text, f"{file_id}|", "|") - timestamp = int((datetime.now() + timedelta(hours=1)).timestamp()) - host, ext, expires = ".".join(parts[:-3]), parts[-3], parts[-1] - url = AbsoluteHttpURL(f"https://s-{host}/v2/{file_id}.{ext}") - return url.with_query(s=secure_key, e=expires, t=timestamp) - - @staticmethod - def get_embed_url(url: AbsoluteHttpURL) -> AbsoluteHttpURL: - 
return PRIMARY_URL / "e" / url.name + title = css.select_text(soup, "div.tbl-c.title b") + link = await self._request_download(file_id) + filename, ext = self.get_filename_and_ext(title) + await self.handle_file(video_url, scrape_item, title, ext, custom_filename=filename, debrid_link=link) + + async def _request_download(self, file_id: str) -> AbsoluteHttpURL: + embed_url = MixDropCrawler.PRIMARY_URL / "e" / file_id + html = await self.request_text(embed_url) + info = dict(_extract_info(html)) + return self.parse_url(info["wurl"]) + + +def _extract_info(html: str) -> Generator[tuple[str, str]]: + content = jsunpacker.unpack(html) + for line in content.split(";MDCore."): + name, _, value = line.partition("=") + yield name.removeprefix("MDCore."), value.strip('"').strip() diff --git a/cyberdrop_dl/utils/jsunpacker.py b/cyberdrop_dl/utils/jsunpacker.py new file mode 100644 index 000000000..f546b879d --- /dev/null +++ b/cyberdrop_dl/utils/jsunpacker.py @@ -0,0 +1,78 @@ +"""Unpacker for Dean Edwards' p.a.c.k.e.r, adapted from javascript beautifier + +https://github.com/beautifier/js-beautify/blob/e89b8269e198492b6e6026d2cc5e8d750d59c42c/python/jsbeautifier/unpackers/packer.py + +Original License: MIT + +The MIT License (MIT) + +Copyright (c) 2007-2018 Einar Lielmanis, Liam Newman, and contributors. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +""" + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Callable + +_ALPHABET = { + 62: "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", + 95: (" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"), +} + + +def unpack(source: str) -> str: + content, base, count, words_list = _parse(source) + + if count != len(words_list): + raise RuntimeError + + decode = _make_decoder(base) + + def replace(match: re.Match[str]) -> str: + word = match.group(0) + return words_list[decode(word)] or word + + content = content.replace("\\\\", "\\").replace("\\'", "'") + source = re.sub(r"\b\w+\b", replace, content, flags=re.ASCII) + return source + + +def _parse(source: str) -> tuple[str, int, int, list[str]]: + if match := re.search((r"}\('(.*)', *(\d+|\[\]), *(\d+), *'(.*)'\.split\('\|'\)"), source, re.DOTALL): + content, base, count, words_list = match.groups() + if base == "[]": + base = 62 + + return content, int(base), int(count), words_list.split("|") + + raise RuntimeError + + +def _make_decoder(base: int) -> Callable[[str], int]: + if 2 <= base <= 36: + return lambda text: int(text, base) + + if 36 < base < 62: + if base not in _ALPHABET: + _ALPHABET[base] = _ALPHABET[62][:base] + + lookup = {char: idx for idx, char in enumerate(_ALPHABET[base])} + + def decode(text: str) -> int: + return sum((base**index) * lookup[char] for index, char in enumerate(reversed(text))) + + 
return decode From 11da9940fd3bca4270a8728628677d20f35d0bf7 Mon Sep 17 00:00:00 2001 From: NTFSvolume <172021377+NTFSvolume@users.noreply.github.com> Date: Fri, 10 Apr 2026 21:45:04 -0500 Subject: [PATCH 2/4] refactor: simplify --- cyberdrop_dl/crawlers/mixdrop.py | 40 ++++++++++++------- .../utils/{jsunpacker.py => js_unpacker.py} | 0 2 files changed, 25 insertions(+), 15 deletions(-) rename cyberdrop_dl/utils/{jsunpacker.py => js_unpacker.py} (100%) diff --git a/cyberdrop_dl/crawlers/mixdrop.py b/cyberdrop_dl/crawlers/mixdrop.py index 27744e2e1..bf9575a16 100644 --- a/cyberdrop_dl/crawlers/mixdrop.py +++ b/cyberdrop_dl/crawlers/mixdrop.py @@ -1,10 +1,11 @@ from __future__ import annotations +import asyncio from typing import TYPE_CHECKING, ClassVar from cyberdrop_dl.crawlers.crawler import Crawler, SupportedDomains, SupportedPaths from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL -from cyberdrop_dl.utils import css, jsunpacker +from cyberdrop_dl.utils import css, js_unpacker from cyberdrop_dl.utils.utilities import error_handling_wrapper if TYPE_CHECKING: @@ -34,27 +35,36 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: @error_handling_wrapper async def file(self, scrape_item: ScrapeItem, file_id: str) -> None: - video_url = MixDropCrawler.PRIMARY_URL / "f" / file_id - if await self.check_complete(video_url, video_url): + embed_url = self.PRIMARY_URL / "e" / file_id + + if await self.check_complete(embed_url, embed_url): return - scrape_item.url = video_url - soup = await self.request_soup(video_url) - title = css.select_text(soup, "div.tbl-c.title b") - link = await self._request_download(file_id) + scrape_item.url = embed_url + title, link = await self._request_file_info(file_id) filename, ext = self.get_filename_and_ext(title) - await self.handle_file(video_url, scrape_item, title, ext, custom_filename=filename, debrid_link=link) + await self.handle_file( + scrape_item.url, + scrape_item, + title, + ext, + custom_filename=filename, 
+ debrid_link=link, + ) - async def _request_download(self, file_id: str) -> AbsoluteHttpURL: - embed_url = MixDropCrawler.PRIMARY_URL / "e" / file_id - html = await self.request_text(embed_url) - info = dict(_extract_info(html)) - return self.parse_url(info["wurl"]) + async def _request_file_info(self, file_id: str) -> tuple[str, AbsoluteHttpURL]: + video_url = self.PRIMARY_URL / "f" / file_id + embed_url = self.PRIMARY_URL / "e" / file_id + + soup, embed_html = await asyncio.gather(self.request_soup(video_url), self.request_text(embed_url)) + title = css.select_text(soup, "div.tbl-c.title b") + md_props = dict(_extract_properties(embed_html)) + return title, self.parse_url(md_props["wurl"]) -def _extract_info(html: str) -> Generator[tuple[str, str]]: - content = jsunpacker.unpack(html) +def _extract_properties(html: str) -> Generator[tuple[str, str]]: + content = js_unpacker.unpack(html) for line in content.split(";MDCore."): name, _, value = line.partition("=") yield name.removeprefix("MDCore."), value.strip('"').strip() diff --git a/cyberdrop_dl/utils/jsunpacker.py b/cyberdrop_dl/utils/js_unpacker.py similarity index 100% rename from cyberdrop_dl/utils/jsunpacker.py rename to cyberdrop_dl/utils/js_unpacker.py From 7782e634623bd5da4cb5ed00be4b0a25565b436b Mon Sep 17 00:00:00 2001 From: NTFSvolume <172021377+NTFSvolume@users.noreply.github.com> Date: Fri, 10 Apr 2026 22:38:59 -0500 Subject: [PATCH 3/4] refactor: update archivebate Do not inherit from mixdrop --- cyberdrop_dl/crawlers/archivebate.py | 87 ++++++++---------------- cyberdrop_dl/crawlers/mixdrop.py | 5 +- tests/crawlers/test_cases/archivebate.py | 19 ++++++ 3 files changed, 49 insertions(+), 62 deletions(-) create mode 100644 tests/crawlers/test_cases/archivebate.py diff --git a/cyberdrop_dl/crawlers/archivebate.py b/cyberdrop_dl/crawlers/archivebate.py index 259a75c9d..5c29dd13b 100644 --- a/cyberdrop_dl/crawlers/archivebate.py +++ b/cyberdrop_dl/crawlers/archivebate.py @@ -2,89 +2,58 @@ from typing 
import TYPE_CHECKING, ClassVar -from cyberdrop_dl.crawlers.mixdrop import MixDropCrawler +from cyberdrop_dl.crawlers.crawler import Crawler, RateLimit from cyberdrop_dl.data_structures.url_objects import AbsoluteHttpURL from cyberdrop_dl.exceptions import ScrapeError from cyberdrop_dl.utils import css, open_graph from cyberdrop_dl.utils.utilities import error_handling_wrapper, get_text_between if TYPE_CHECKING: - from cyberdrop_dl.crawlers.crawler import SupportedDomains, SupportedPaths + from cyberdrop_dl.crawlers.crawler import SupportedPaths from cyberdrop_dl.data_structures.url_objects import ScrapeItem -class Selectors: - JS = "script:-soup-contains('MDCore.ref')" - VIDEO = "iframe[src*=mixdrop]" +class Selector: + VIDEO = "input[name=fid]" USER_NAME = "div.info a[href*='archivebate.store/profile/']" SITE_NAME = f"{USER_NAME} + p" - NEXT_PAGE = "a.page-link[rel='next']" - PROFILE_VIDEOS = "section.video_item a" -_SELECTORS = Selectors() -PRIMARY_URL = AbsoluteHttpURL("https://www.archivebate.store") - - -class ArchiveBateCrawler(MixDropCrawler): - SUPPORTED_PATHS: ClassVar[SupportedPaths] = {"Video": "/watch/"} - SUPPORTED_DOMAINS: ClassVar[SupportedDomains] = () +class ArchiveBateCrawler(Crawler): + SUPPORTED_PATHS: ClassVar[SupportedPaths] = { + "Video": "/watch/", + } DOMAIN: ClassVar[str] = "archivebate" FOLDER_DOMAIN: ClassVar[str] = "ArchiveBate" - PRIMARY_URL: ClassVar[AbsoluteHttpURL] = PRIMARY_URL - NEXT_PAGE_SELECTOR = _SELECTORS.NEXT_PAGE - _RATE_LIMIT = 4, 1 + PRIMARY_URL: ClassVar[AbsoluteHttpURL] = AbsoluteHttpURL("https://www.archivebate.store") + _RATE_LIMIT: ClassVar[RateLimit] = 4, 1 async def fetch(self, scrape_item: ScrapeItem) -> None: - if "watch" in scrape_item.url.parts: - return await self.video(scrape_item) - - if "profile" in scrape_item.url.parts: - return await self.profile(scrape_item) - - raise ValueError - - async def profile(self, scrape_item: ScrapeItem) -> None: - # Not supported, video entries are dynamically generated with 
javascript - # They have an API to request them but it also returns javascript - raise ValueError + match scrape_item.url.parts[1:]: + case ["watch", _]: + return await self.video(scrape_item) + case _: + raise ValueError @error_handling_wrapper async def video(self, scrape_item: ScrapeItem) -> None: if await self.check_complete_from_referer(scrape_item.url): return - url = scrape_item.url - # Can't use check_complete_by_referer. We need the mixdrop url for that - db_path = self.__db_path__(url) - check_complete = await self.manager.database.history.check_complete(self.DOMAIN, url, url, db_path) - if check_complete: - self.log.info(f"Skipping {scrape_item.url} as it has already been downloaded") - self.manager.progress_manager.download_progress.add_previously_completed() - return - soup = await self.request_soup(scrape_item.url) - if "This video has been deleted" in soup.text: + if "This video has been deleted" in soup.get_text(): raise ScrapeError(410) - description = open_graph.description(soup) - date_str = get_text_between(description, "show on", " - ").strip() - user_name = css.select_text(soup, _SELECTORS.USER_NAME) - site_name = css.select_text(soup, _SELECTORS.SITE_NAME) - video_src = css.select(soup, _SELECTORS.VIDEO, "src") - title = self.create_title(f"{user_name} [{site_name}]") - scrape_item.setup_as_profile(title) - scrape_item.uploaded_at = self.parse_date(date_str) - mixdrop_url = self.get_embed_url(self.parse_url(video_src)) # Override domain - - if await self.check_complete_from_referer(mixdrop_url): - return - - soup = await self.request_soup(mixdrop_url) - - link = self.create_download_link(soup) - filename, ext = self.get_filename_and_ext(link.name) - custom_filename = self.create_custom_filename(f"Show on {date_str}", ext) - scrape_item.url = mixdrop_url - await self.handle_file(url, scrape_item, filename, ext, custom_filename=custom_filename, debrid_link=link) + upload_date = get_text_between(open_graph.description(soup), "show on", " - 
").strip() + user_name = css.select_text(soup, Selector.USER_NAME) + site_name = css.select_text(soup, Selector.SITE_NAME) + scrape_item.setup_as_profile(self.create_title(f"{user_name} [{site_name}]")) + + scrape_item.uploaded_at = self.parse_iso_date(upload_date) + scrape_item.add_to_parent_title(f"Show on {upload_date}") + download_url = self.parse_url(css.select(soup, Selector.VIDEO, "value")) + self.handle_external_links( + scrape_item.create_child(download_url), + reset=False, + ) diff --git a/cyberdrop_dl/crawlers/mixdrop.py b/cyberdrop_dl/crawlers/mixdrop.py index bf9575a16..097118839 100644 --- a/cyberdrop_dl/crawlers/mixdrop.py +++ b/cyberdrop_dl/crawlers/mixdrop.py @@ -35,10 +35,8 @@ async def fetch(self, scrape_item: ScrapeItem) -> None: @error_handling_wrapper async def file(self, scrape_item: ScrapeItem, file_id: str) -> None: - embed_url = self.PRIMARY_URL / "e" / file_id - - if await self.check_complete(embed_url, embed_url): + if await self.check_complete(embed_url): return scrape_item.url = embed_url @@ -51,6 +49,7 @@ async def file(self, scrape_item: ScrapeItem, file_id: str) -> None: ext, custom_filename=filename, debrid_link=link, + referer=scrape_item.parent or scrape_item.url, ) async def _request_file_info(self, file_id: str) -> tuple[str, AbsoluteHttpURL]: diff --git a/tests/crawlers/test_cases/archivebate.py b/tests/crawlers/test_cases/archivebate.py new file mode 100644 index 000000000..dde64c18f --- /dev/null +++ b/tests/crawlers/test_cases/archivebate.py @@ -0,0 +1,19 @@ +DOMAIN = "archivebate" +TEST_CASES = [ + ( + "https://www.archivebate.store/watch/16321694", + [ + { + "url": "https://mixdrop.sb/e/67pwzwz6uxjggd", + "filename": "7e73779e-d4be-4e7d-982c-5244ad4f6b33.mp4", + "debrid_link": "NOT_NONE", + "original_filename": "7e73779e-d4be-4e7d-982c-5244ad4f6b33.mp4", + "referer": "https://www.archivebate.store/watch/16321694", + "album_id": None, + "uploaded_at": 1775887244, + "download_folder": "re:clairwittenmyer [Chaturbate] 
(ArchiveBate)/Show on 2026-04-11 01-00-44", + } + ], + 1, + ) +] From 3f45f1df4729d543c499bb5aab90e5bebd924668 Mon Sep 17 00:00:00 2001 From: NTFSvolume <172021377+NTFSvolume@users.noreply.github.com> Date: Fri, 10 Apr 2026 22:40:31 -0500 Subject: [PATCH 4/4] refactor: create handle embed method --- cyberdrop_dl/crawlers/archivebate.py | 7 ++----- cyberdrop_dl/crawlers/crawler.py | 4 ++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cyberdrop_dl/crawlers/archivebate.py b/cyberdrop_dl/crawlers/archivebate.py index 5c29dd13b..03e4adbf0 100644 --- a/cyberdrop_dl/crawlers/archivebate.py +++ b/cyberdrop_dl/crawlers/archivebate.py @@ -52,8 +52,5 @@ async def video(self, scrape_item: ScrapeItem) -> None: scrape_item.uploaded_at = self.parse_iso_date(upload_date) scrape_item.add_to_parent_title(f"Show on {upload_date}") - download_url = self.parse_url(css.select(soup, Selector.VIDEO, "value")) - self.handle_external_links( - scrape_item.create_child(download_url), - reset=False, - ) + embed_url = self.parse_url(css.select(soup, Selector.VIDEO, "value")) + self.handle_embed(scrape_item.create_child(embed_url)) diff --git a/cyberdrop_dl/crawlers/crawler.py b/cyberdrop_dl/crawlers/crawler.py index ac4688076..1a1c93ee8 100644 --- a/cyberdrop_dl/crawlers/crawler.py +++ b/cyberdrop_dl/crawlers/crawler.py @@ -555,6 +555,10 @@ def handle_external_links(self, scrape_item: ScrapeItem, reset: bool = True) -> scrape_item.reset() self.create_task(self.manager.scrape_mapper.send_to_crawler(scrape_item)) + @final + def handle_embed(self, scrape_item: ScrapeItem) -> None: + self.handle_external_links(scrape_item, reset=False) + @final def get_filename_and_ext( self,