diff --git a/LICENSE b/LICENSE index 5798ff9..22d8e6f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 饰乐 +Copyright (c) 2024 Les Freire Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d97efe --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ + +
+ +![:name](https://count.getloli.com/@astrbot_plugin_parser?name=astrbot_plugin_parser&theme=minecraft&padding=6&offset=0&align=top&scale=1&pixelated=1&darkmode=auto) + +# astrbot_plugin_parser + +_✨ 链接解析器 ✨_ + +[![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) +[![Python 3.10+](https://img.shields.io/badge/Python-3.10%2B-blue.svg)](https://www.python.org/) +[![AstrBot](https://img.shields.io/badge/AstrBot-3.4%2B-orange.svg)](https://github.com/Soulter/AstrBot) +[![GitHub](https://img.shields.io/badge/作者-Zhalslar-blue)](https://github.com/Zhalslar) + +
+ +## 📖 介绍 + +| 平台 | 触发的消息形态 | 视频 | 图集 | 音频 | +| ------- | --------------------------------- | ---- | ---- | ---- | +| B 站 | av 号/BV 号/链接/短链/卡片/小程序 | ✅​ | ✅​ | ✅​ | +| 抖音 | 链接(分享链接,兼容电脑端链接) | ✅​ | ✅​ | ❌️ | +| 微博 | 链接(博文,视频,show, 文章) | ✅​ | ✅​ | ❌️ | +| 小红书 | 链接(含短链)/卡片 | ✅​ | ✅​ | ❌️ | +| 快手 | 链接(包含标准链接和短链) | ✅​ | ✅​ | ❌️ | +| acfun | 链接 | ✅​ | ❌️ | ❌️ | +| youtube | 链接(含短链) | ✅​ | ❌️ | ✅​ | +| tiktok | 链接 | ✅​ | ❌️ | ❌️ | +| twitter | 链接 | ✅​ | ✅​ | ❌️ | + +## 🎨 效果图 + +插件默认启用 PIL 实现的通用媒体卡片渲染,效果图如下 + +
+ + + + + + + +
+ +## 💿 安装 + +直接在astrbot的插件市场搜索astrbot_plugin_parser,点击安装,等待完成即可 + +## ⚙️ 配置 + +请在astrbot的插件配置面板查看并修改 + +## 🎉 使用 + +| 指令 | 权限 | 说明 | +| :------: | :-------------------: | :---------------: | +| 开启解析 | ADMIN | 开启解析 | +| 关闭解析 | ADMIN | 关闭解析 | +| bm | - | 下载 B 站音频 | +| ym | - | 下载 youtube 音频 | +| blogin | ADMIN | 扫码获取 B 站凭证 | + +## 🧩 扩展 + +插件支持自定义解析器,通过继承 `BaseParser` 类并实现 `platform`, `handle` 即可。 + +示例解析器请看 [示例解析器](https://github.com/Zhalslar/astrbot_plugin_parser/blob/main/core/parsers/example.py) + +## 🎉 致谢 + +本项目核心代码来自[nonebot-plugin-parser](https://github.com/fllesser/nonebot-plugin-parser),请前往原仓库给作者点个Star! diff --git a/_conf_schema.json b/_conf_schema.json new file mode 100644 index 0000000..db148ea --- /dev/null +++ b/_conf_schema.json @@ -0,0 +1,160 @@ +{ + "disabled_sessions": { + "description": "关闭解析的会话", + "type": "list", + "hint": "在会话中使用命令 “开启解析” 和 “关闭解析” 来设置某会话的解析状态", + "default": [] + }, + "enable_platforms": { + "description": "启用解析的平台", + "type": "list", + "hint": "", + "options": [ + "A站", + "B站", + "微博", + "小红书", + "抖音", + "快手", + "NGA", + "TikTok", + "推特", + "油管" + ], + "default": [ + "A站", + "B站", + "微博", + "小红书", + "抖音", + "快手", + "NGA", + "TikTok", + "推特", + "油管" + ] + }, + "forward_contents": { + "description": "转发媒体内容", + "type": "bool", + "hint": "是否将解析到的图片/视频/音频作为合并转发消息发送", + "default": true + }, + "upload_audio": { + "description": "上传音频文件", + "type": "bool", + "hint": "是否将解析到的音频文件上传到群文件", + "default": false + }, + "max_size": { + "description": "资源最大大小", + "type": "int", + "hint": "允许下载的音视频最大体积,单位 MB", + "default": 90 + }, + "max_duration": { + "description": "资源最大时长", + "type": "int", + "hint": "允许下载的音视频最大时长,单位秒", + "default": 480 + }, + "download_timeout": { + "description": "下载请求超时时间", + "type": "int", + "hint": "下载视频、音频等较大文件时的请求超时时间,单位秒。 建议设置大一点,视频、音频较大时下载耗时较长", + "default": 280 + }, + "common_timeout": { + "description": "普通请求超时时间", + "type": "int", + "hint": "普通请求超时时间,单位秒。用于一些普通的请求 ", + "default": 15 + }, + "bili_ck": { + 
"description": "Bilibili Cookies", + "type": "text", + "hint": "用于B站解析的登录Cookies,留空则使用无登录状态", + "default": "" + }, + "bili_video_codecs": { + "description": "B站视频编码", + "type": "string", + "hint": "优先下载的编码类型,可选:AVC、AV1、HEV", + "options": [ + "AVC", + "AV1", + "HEV" + ], + "default": "AVC" + }, + "bili_video_quality": { + "description": "B站视频分辨率", + "type": "string", + "hint": "下载B站视频的分辨率", + "options": [ + "_360P", + "_480P", + "_720P", + "_1080P", + "_1080P_PLUS", + "_1080P_60", + "_4K", + "HDR", + "DOLBY", + "_8K", + "AI_REPAIR" + ], + "default": "_720P" + }, + "ytb_ck": { + "description": "YouTube Cookies", + "type": "text", + "hint": "用于YouTube解析的登录Cookies,留空则使用无登录状态", + "default": "" + }, + "proxy": { + "description": "代理地址", + "type": "string", + "hint": "如 http://127.0.0.1:7890,留空则直连。仅作用于 youtube, tiktok 解析", + "default": "" + }, + "emoji_cdn": { + "description": "Pilmoji 表情 CDN", + "type": "string", + "hint": "渲染表情使用的 CDN 地址,一般无需修改", + "default": "https://cdn.jsdelivr.net/npm/emoji-datasource-facebook@14.0.0/img/facebook/64/" + }, + "emoji_style": { + "description": "Pilmoji 表情样式", + "type": "string", + "hint": "可选:APPLE、FACEBOOK、GOOGLE、TWITTER", + "options": [ + "APPLE", + "FACEBOOK", + "GOOGLE", + "TWITTER" + ], + "default": "FACEBOOK" + }, + "clean_cron": { + "description": "自动清理缓存的触发周期", + "type": "string", + "hint": "使用 Cron 表达式(分 时 日 月 周)定义。例如:“30 2 * * *” 表示每天 2:30 。留空表示禁用自动清理", + "default": "30 2 * * *" + }, + "data_dir": { + "description": "数据目录", + "type": "string", + "invisible": true + }, + "cache_dir": { + "description": "缓存目录", + "type": "string", + "invisible": true + }, + "ytb_cookies_file": { + "description": "YouTube Cookies 文件", + "type": "string", + "invisible": true + } +} \ No newline at end of file diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/clean.py b/core/clean.py new file mode 100644 index 0000000..b44627b --- /dev/null +++ b/core/clean.py @@ -0,0 +1,62 @@ +import 
asyncio +import zoneinfo +from pathlib import Path + +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from apscheduler.triggers.cron import CronTrigger + +from astrbot.api import logger +from astrbot.core.config.astrbot_config import AstrBotConfig +from astrbot.core.star.context import Context + +from .utils import safe_unlink + + +class CacheCleaner: + """ + 每天固定时间自动清理插件缓存目录的调度器封装。 + """ + JOBNAME = "CacheCleaner" + def __init__(self, context: Context, config: AstrBotConfig): + self.clean_cron = config["clean_cron"] + self.cache_dir = Path(config["cache_dir"]) + + tz = context.get_config().get("timezone") + self.timezone = ( + zoneinfo.ZoneInfo(tz) if tz else zoneinfo.ZoneInfo("Asia/Shanghai") + ) + self.scheduler = AsyncIOScheduler(timezone=self.timezone) + self.scheduler.start() + + self.register_task() + + logger.info(f"{self.JOBNAME} 已启动,任务周期:{self.clean_cron}") + + def register_task(self): + try: + self.trigger = CronTrigger.from_crontab(self.clean_cron) + self.scheduler.add_job( + func=self._clean_plugin_cache, + trigger=self.trigger, + name=f"{self.JOBNAME}_scheduler", + max_instances=1, + ) + except Exception as e: + logger.error(f"[{self.JOBNAME}] Cron 格式错误:{e}") + + async def _clean_plugin_cache(self) -> None: + """真正的清理逻辑。""" + try: + files = [f for f in self.cache_dir.iterdir() if f.is_file()] + if not files: + logger.info("No cache files to clean.") + return + + await asyncio.gather(*(safe_unlink(f) for f in files)) + logger.info(f"Successfully cleaned {len(files)} cache files.") + except Exception: + logger.exception("Error while cleaning cache files.") + + async def stop(self): + self.scheduler.remove_all_jobs() + logger.info(f"[{self.JOBNAME}] 已停止") diff --git a/core/constants.py b/core/constants.py new file mode 100644 index 0000000..13574bb --- /dev/null +++ b/core/constants.py @@ -0,0 +1,39 @@ +from enum import Enum +from typing import Final + +COMMON_HEADER: Final[dict[str, str]] = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 
10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36" + ) +} + +IOS_HEADER: Final[dict[str, str]] = { + "User-Agent": ( + "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) " + "Version/16.6 Mobile/15E148 Safari/604.1 Edg/132.0.0.0" + ) +} + +ANDROID_HEADER: Final[dict[str, str]] = { + "User-Agent": ( + "Mozilla/5.0 (Linux; Android 15; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/132.0.0.0 Mobile Safari/537.36 Edg/132.0.0.0" + ) +} + + +class PlatformEnum(str, Enum): + ACFUN = "acfun" + BILIBILI = "bilibili" + DOUYIN = "douyin" + KUAISHOU = "kuaishou" + NGA = "nga" + TIKTOK = "tiktok" + TWITTER = "twitter" + WEIBO = "weibo" + XIAOHONGSHU = "xiaohongshu" + YOUTUBE = "youtube" + + def __str__(self) -> str: + return self.value diff --git a/core/download.py b/core/download.py new file mode 100644 index 0000000..269821d --- /dev/null +++ b/core/download.py @@ -0,0 +1,375 @@ +import asyncio +from asyncio import Task, create_task +from collections.abc import Callable, Coroutine +from functools import wraps +from pathlib import Path +from typing import Any, ParamSpec, TypeVar + +import aiofiles +import yt_dlp +from aiohttp import ClientError, ClientSession, ClientTimeout +from msgspec import Struct, convert +from tqdm.asyncio import tqdm + +from astrbot.api import logger +from astrbot.core.config.astrbot_config import AstrBotConfig + +from .constants import COMMON_HEADER +from .exception import ( + DownloadException, + DurationLimitException, + ParseException, + SizeLimitException, + ZeroSizeException, +) +from .utils import LimitedSizeDict, generate_file_name, merge_av, safe_unlink + +P = ParamSpec("P") +T = TypeVar("T") + + +def auto_task(func: Callable[P, Coroutine[Any, Any, T]]) -> Callable[P, Task[T]]: + """装饰器:自动将异步函数调用转换为 Task, 完整保留类型提示""" + + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> Task[T]: + coro = func(*args, **kwargs) + 
name = " | ".join(str(arg) for arg in args if isinstance(arg, str)) + return create_task(coro, name=func.__name__ + " | " + name) + + return wrapper + + +class VideoInfo(Struct): + title: str + """标题""" + channel: str + """频道名称""" + uploader: str + """上传者 id""" + duration: int + """时长""" + timestamp: int + """发布时间戳""" + thumbnail: str + """封面图片""" + description: str + """简介""" + channel_id: str + """频道 id""" + + @property + def author_name(self) -> str: + return f"{self.channel}@{self.uploader}" + + +class Downloader: + """下载器,支持youtube-dlp 和 httpx 流式下载""" + + def __init__(self, config: AstrBotConfig): + self.config = config + self.cache_dir = Path(config["cache_dir"]) + self.proxy: str | None = self.config["proxy"] or None + self.max_duration: int = config["max_duration"] + self.max_size = self.config["max_size"] + self.headers: dict[str, str] = COMMON_HEADER.copy() + # 视频信息缓存 + self.info_cache: LimitedSizeDict[str, VideoInfo] = LimitedSizeDict() + # 用于流式下载的客户端 + self.client = ClientSession( + timeout=ClientTimeout(total=config["download_timeout"]) + ) + @auto_task + async def streamd( + self, + url: str, + *, + file_name: str | None = None, + ext_headers: dict[str, str] | None = None, + ) -> Path: + """download file by url with stream + + Args: + url (str): url address + file_name (str | None): file name. Defaults to generate_file_name. + ext_headers (dict[str, str] | None): ext headers. Defaults to None. 
+ + Returns: + Path: file path + + Raises: + httpx.HTTPError: When download fails + """ + + if not file_name: + file_name = generate_file_name(url) + file_path = self.cache_dir / file_name + # 如果文件存在,则直接返回 + if file_path.exists(): + return file_path + + headers = {**self.headers, **(ext_headers or {})} + + try: + async with self.client.get( + url, headers=headers, allow_redirects=True + ) as response: + if response.status >= 400: + raise ClientError( + f"HTTP {response.status} {response.reason}" + ) + content_length = response.headers.get("Content-Length") + content_length = int(content_length) if content_length else 0 + + if content_length == 0: + logger.warning(f"媒体 url: {url}, 大小为 0, 取消下载") + raise ZeroSizeException + if (file_size := content_length / 1024 / 1024) > self.max_size: + logger.warning( + f"媒体 url: {url} 大小 {file_size:.2f} MB 超过 {self.max_size} MB, 取消下载" + ) + raise SizeLimitException + + with self.get_progress_bar(file_name, content_length) as bar: + async with aiofiles.open(file_path, "wb") as file: + async for chunk in response.content.iter_chunked(1024 * 1024): + await file.write(chunk) + bar.update(len(chunk)) + + except ClientError: + await safe_unlink(file_path) + logger.exception(f"下载失败 | url: {url}, file_path: {file_path}") + raise DownloadException("媒体下载失败") + return file_path + + @staticmethod + def get_progress_bar(desc: str, total: int | None = None) -> tqdm: + """获取进度条 bar + + Args: + desc (str): 描述 + total (int | None): 总大小. Defaults to None. 
+ + Returns: + tqdm: 进度条 + """ + return tqdm( + total=total, + unit="B", + unit_scale=True, + unit_divisor=1024, + dynamic_ncols=True, + colour="green", + desc=desc, + ) + + @auto_task + async def download_video( + self, + url: str, + *, + video_name: str | None = None, + ext_headers: dict[str, str] | None = None, + use_ytdlp: bool = False, + cookiefile: Path | None = None, + ) -> Path: + """download video file by url with stream + + Args: + url (str): url address + video_name (str | None): video name. Defaults to get name by parse url. + ext_headers (dict[str, str] | None): ext headers. Defaults to None. + use_ytdlp (bool): use ytdlp to download video. Defaults to False. + cookiefile (Path | None): cookie file path. Defaults to None. + + Returns: + Path: video file path + + Raises: + httpx.HTTPError: When download fails + """ + if use_ytdlp: + return await self._ytdlp_download_video(url, cookiefile) + + if video_name is None: + video_name = generate_file_name(url, ".mp4") + return await self.streamd(url, file_name=video_name, ext_headers=ext_headers) + + @auto_task + async def download_audio( + self, + url: str, + *, + audio_name: str | None = None, + ext_headers: dict[str, str] | None = None, + use_ytdlp: bool = False, + cookiefile: Path | None = None, + ) -> Path: + """download audio file by url with stream + + Args: + url (str): url address + audio_name (str | None ): audio name. Defaults to generate from url. + ext_headers (dict[str, str] | None): ext headers. Defaults to None. 
+ + Returns: + Path: audio file path + + Raises: + httpx.HTTPError: When download fails + """ + if use_ytdlp: + return await self._ytdlp_download_audio(url, cookiefile) + + if audio_name is None: + audio_name = generate_file_name(url, ".mp3") + return await self.streamd(url, file_name=audio_name, ext_headers=ext_headers) + + @auto_task + async def download_img( + self, + url: str, + *, + img_name: str | None = None, + ext_headers: dict[str, str] | None = None, + ) -> Path: + """download image file by url with stream + + Args: + url (str): url + img_name (str | None): image name. Defaults to generate from url. + ext_headers (dict[str, str] | None): ext headers. Defaults to None. + + Returns: + Path: image file path + + Raises: + httpx.HTTPError: When download fails + """ + if img_name is None: + img_name = generate_file_name(url, ".jpg") + return await self.streamd(url, file_name=img_name, ext_headers=ext_headers) + + async def download_imgs_without_raise( + self, + urls: list[str], + *, + ext_headers: dict[str, str] | None = None, + ) -> list[Path]: + """download images without raise + + Args: + urls (list[str]): urls + ext_headers (dict[str, str] | None): ext headers. Defaults to None. 
+ + Returns: + list[Path]: image file paths + """ + paths_or_errs = await asyncio.gather( + *[self.download_img(url, ext_headers=ext_headers) for url in urls], + return_exceptions=True, + ) + return [p for p in paths_or_errs if isinstance(p, Path)] + + @auto_task + async def download_av_and_merge( + self, + v_url: str, + a_url: str, + *, + output_path: Path, + ext_headers: dict[str, str] | None = None, + ) -> Path: + """download video and audio file by url with stream and merge""" + v_path, a_path = await asyncio.gather( + self.download_video(v_url, ext_headers=ext_headers), + self.download_audio(a_url, ext_headers=ext_headers), + ) + await merge_av(v_path=v_path, a_path=a_path, output_path=output_path) + return output_path + + # region -------------------- 私有:yt-dlp -------------------- + + async def ytdlp_extract_info( + self, url: str, cookiefile: Path | None = None + ) -> VideoInfo: + if (info := self.info_cache.get(url)) is not None: + return info + opts = { + "quiet": True, + "skip_download": True, + "force_generic_extractor": True, + "cookiefile": None, + } + if self.proxy: + opts["proxy"] = self.proxy + if cookiefile and cookiefile.is_file(): + opts["cookiefile"] = str(cookiefile) + with yt_dlp.YoutubeDL(opts) as ydl: + raw = await asyncio.to_thread(ydl.extract_info, url, download=False) + if not raw: + raise ParseException("获取视频信息失败") + info = convert(raw, VideoInfo) + self.info_cache[url] = info + return info + + async def _ytdlp_download_video( + self, url: str, cookiefile: Path | None = None + ) -> Path: + info = await self.ytdlp_extract_info(url, cookiefile) + if info.duration > self.max_duration: + raise DurationLimitException + + video_path = self.cache_dir / generate_file_name(url, ".mp4") + if video_path.exists(): + return video_path + + opts = { + "outtmpl": str(video_path), + "merge_output_format": "mp4", + # "format": f"bv[filesize<={info.duration // 10 + 10}M]+ba/b[filesize<={info.duration // 8 + 10}M]", + "format": 
"best[height<=720]/bestvideo[height<=720]+bestaudio/best", + "postprocessors": [ + {"key": "FFmpegVideoConvertor", "preferedformat": "mp4"} + ], + "cookiefile": None, + } + if self.proxy: + opts["proxy"] = self.proxy + if cookiefile and cookiefile.is_file(): + opts["cookiefile"] = str(cookiefile) + + with yt_dlp.YoutubeDL(opts) as ydl: + await asyncio.to_thread(ydl.download, [url]) + return video_path + + async def _ytdlp_download_audio(self, url: str, cookiefile: Path | None) -> Path: + file_name = generate_file_name(url) + audio_path = self.cache_dir / f"{file_name}.flac" + if audio_path.exists(): + return audio_path + + opts = { + "outtmpl": str(self.cache_dir / file_name) + ".%(ext)s", + "format": "bestaudio/best", + "postprocessors": [ + { + "key": "FFmpegExtractAudio", + "preferredcodec": "flac", + "preferredquality": "0", + } + ], + "cookiefile": None, + } + if self.proxy: + opts["proxy"] = self.proxy + if cookiefile and cookiefile.is_file(): + opts["cookiefile"] = str(cookiefile) + + with yt_dlp.YoutubeDL(opts) as ydl: + await asyncio.to_thread(ydl.download, [url]) + return audio_path + + async def close(self): + """关闭网络客户端""" + await self.client.close() diff --git a/core/exception.py b/core/exception.py new file mode 100644 index 0000000..31a9549 --- /dev/null +++ b/core/exception.py @@ -0,0 +1,46 @@ +class ParseException(Exception): + """异常基类""" + + def __init__(self, message: str): + super().__init__(message) + self.message = message + + +class TipException(ParseException): + """提示异常""" + + pass + + +class DownloadException(ParseException): + """下载异常""" + + def __init__(self, message: str | None = None): + super().__init__(message or "媒体下载失败") + + +class DownloadLimitException(DownloadException): + """下载超过限制异常""" + + pass + + +class SizeLimitException(DownloadLimitException): + """下载大小超过限制异常""" + + def __init__(self): + super().__init__("媒体大小超过配置限制,取消下载") + + +class DurationLimitException(DownloadLimitException): + """下载时长超过限制异常""" + + def 
__init__(self): + super().__init__("媒体时长超过配置限制,取消下载") + + +class ZeroSizeException(DownloadException): + """下载大小为 0 异常""" + + def __init__(self): + super().__init__("媒体大小为 0, 取消下载") diff --git a/core/parsers/__init__.py b/core/parsers/__init__.py new file mode 100644 index 0000000..cdb3ceb --- /dev/null +++ b/core/parsers/__init__.py @@ -0,0 +1,48 @@ + +from .acfun import AcfunParser +from .base import BaseParser, handle +from .bilibili import BilibiliParser +from .data import ( + AudioContent, + Author, + DynamicContent, + GraphicsContent, + ImageContent, + ParseResult, + Platform, + VideoContent, +) +from .douyin import DouyinParser +from .kuaishou import KuaiShouParser +from .nga import NGAParser +from .tiktok import TikTokParser +from .twitter import TwitterParser +from .weibo import WeiBoParser +from .xiaohongshu import XiaoHongShuParser +from .youtube import YouTubeParser + +__all__ = [ + # 数据模型 + "AudioContent", + "Author", + "DynamicContent", + "GraphicsContent", + "ImageContent", + "ParseResult", + "Platform", + "VideoContent", + # 基础组件 + "BaseParser", + "handle", + # 各平台 Parser + "AcfunParser", + "BilibiliParser", + "DouyinParser", + "KuaiShouParser", + "NGAParser", + "TikTokParser", + "TwitterParser", + "WeiBoParser", + "XiaoHongShuParser", + "YouTubeParser", +] diff --git a/core/parsers/acfun.py b/core/parsers/acfun.py new file mode 100644 index 0000000..fb0ff82 --- /dev/null +++ b/core/parsers/acfun.py @@ -0,0 +1,161 @@ +import asyncio +import json +import re +import time +from pathlib import Path +from typing import ClassVar + +import aiofiles +from aiohttp import ClientError + +from astrbot.api import logger +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from ..exception import DownloadException, ParseException +from ..utils import safe_unlink +from .base import BaseParser, Platform, PlatformEnum, handle + + +class AcfunParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = 
Platform(name=PlatformEnum.ACFUN, display_name="A站") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + self.headers["referer"] = "https://www.acfun.cn/" + self.cache_dir = Path(config["cache_dir"]) + self.max_size = self.config["max_size"] + + @handle("acfun.cn", r"(?:ac=|/ac)(?P\d+)") + async def _parse(self, searched: re.Match[str]): + acid = int(searched.group("acid")) + url = f"https://www.acfun.cn/v/ac{acid}" + + m3u8_url, title, description, author, upload_time = await self.parse_video_info(url) + author = self.create_author(author) if author else None + + # 2024-12-1 -> timestamp + try: + timestamp = int(time.mktime(time.strptime(upload_time, "%Y-%m-%d"))) + except ValueError: + timestamp = None + text = f"简介: {description}" + + # 下载视频 + video_task = asyncio.create_task(self.download_video(m3u8_url, acid)) + + return self.result( + title=title, + text=text, + author=author, + timestamp=timestamp, + contents=[self.create_video_content(video_task)], + ) + + async def parse_video_info(self, url: str) -> tuple[str, str, str, str, str]: + """解析acfun链接获取详细信息 + + Args: + url (str): 链接 + + Returns: + tuple: (m3u8_url, title, description, author, upload_time) + """ + + # 拼接查询参数 + url = f"{url}?quickViewId=videoInfo_new&ajaxpipe=1" + + async with self.client.get(url, headers=self.headers) as resp: + if resp.status >= 400: + raise ClientError(f"HTTP {resp.status}") + raw = await resp.text() + + matched = re.search(r"window\.videoInfo =(.*?)", raw) + if not matched: + raise ParseException("解析 acfun 视频信息失败") + json_str = str(matched.group(1)) + json_str = json_str.replace('\\\\"', '\\"').replace('\\"', '"') + video_info = json.loads(json_str) + + title = video_info.get("title", "") + description = video_info.get("description", "") + author = video_info.get("user", {}).get("name", "") + upload_time = video_info.get("createTime", "") + + ks_play_json = video_info["currentVideoInfo"]["ksPlayJson"] + ks_play = 
json.loads(ks_play_json) + representations = ks_play["adaptationSet"][0]["representation"] + # 这里[d['url'] for d in representations],从 4k ~ 360,此处默认720p + m3u8_url = [d["url"] for d in representations][3] + + return m3u8_url, title, description, author, upload_time + + async def download_video(self, m3u8s_url: str, acid: int) -> Path: + """下载acfun视频 + + Args: + m3u8s_url (str): m3u8链接 + acid (int): acid + + Returns: + Path: 下载的mp4文件 + """ + + m3u8_full_urls = await self._parse_m3u8(m3u8s_url) + video_file = self.cache_dir / f"acfun_{acid}.mp4" + if video_file.exists(): + return video_file + + max_size = self.max_size * 1024 * 1024 + + try: + async with aiofiles.open(video_file, "wb") as f: + with self.downloader.get_progress_bar(video_file.name) as bar: + total = 0 + for url in m3u8_full_urls: + async with self.client.get(url, headers=self.headers) as resp: + if resp.status >= 400: + raise ClientError(f"{resp.status} {resp.reason}") + async for chunk in resp.content.iter_chunked(1024 * 1024): + await f.write(chunk) + total += len(chunk) + bar.update(len(chunk)) + if total > max_size: # 大小截断 + break + if total > max_size: + break + + except ClientError: + await safe_unlink(video_file) + logger.exception("视频下载失败") + raise DownloadException("视频下载失败") + return video_file + + async def _parse_m3u8(self, m3u8_url: str): + """解析m3u8链接 + + Args: + m3u8_url (str): m3u8链接 + + Returns: + list[str]: 视频链接 + """ + async with self.client.get(m3u8_url, headers=self.headers) as resp: + if resp.status >= 400: + raise ClientError(f"{resp.status} {resp.reason}") + m3u8_file = await resp.text() + # 分离ts文件链接 + raw_pieces = re.split(r"\n#EXTINF:.{8},\n", m3u8_file) + # 过滤头部\ + m3u8_relative_links = raw_pieces[1:] + + # 修改尾部 去掉尾部多余的结束符 + patched_tail = m3u8_relative_links[-1].split("\n")[0] + m3u8_relative_links[-1] = patched_tail + + # 完整链接,直接加 m3u8Url 的通用前缀 + m3u8_prefix = "/".join(m3u8_url.split("/")[0:-1]) + m3u8_full_urls = [f"{m3u8_prefix}/{d}" for d in m3u8_relative_links] + + 
return m3u8_full_urls diff --git a/core/parsers/base.py b/core/parsers/base.py new file mode 100644 index 0000000..3a6d9a6 --- /dev/null +++ b/core/parsers/base.py @@ -0,0 +1,287 @@ +"""Parser 基类定义""" + +from abc import ABC +from asyncio import Task +from collections.abc import Callable, Coroutine +from pathlib import Path +from re import Match, Pattern, compile +from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, cast + +from aiohttp import ClientError, ClientSession, ClientTimeout, TCPConnector +from typing_extensions import Unpack + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..constants import ANDROID_HEADER, COMMON_HEADER, IOS_HEADER +from ..constants import PlatformEnum as PlatformEnum +from ..download import Downloader +from ..exception import DownloadException as DownloadException +from ..exception import DurationLimitException as DurationLimitException +from ..exception import ParseException as ParseException +from ..exception import SizeLimitException as SizeLimitException +from ..exception import TipException as TipException +from ..exception import ZeroSizeException as ZeroSizeException +from .data import ParseResult, ParseResultKwargs, Platform + +T = TypeVar("T", bound="BaseParser") +HandlerFunc = Callable[[T, Match[str]], Coroutine[Any, Any, ParseResult]] +KeyPatterns = list[tuple[str, Pattern[str]]] + +_KEY_PATTERNS = "_key_patterns" + + +# 注册处理器装饰器 +def handle(keyword: str, pattern: str): + """注册处理器装饰器""" + + def decorator(func: HandlerFunc[T]) -> HandlerFunc[T]: + if not hasattr(func, _KEY_PATTERNS): + setattr(func, _KEY_PATTERNS, []) + + key_patterns: KeyPatterns = getattr(func, _KEY_PATTERNS) + key_patterns.append((keyword, compile(pattern))) + + return func + + return decorator + + +class BaseParser: + """所有平台 Parser 的抽象基类 + + 子类必须实现: + - platform: 平台信息(包含名称和显示名称) + """ + + _registry: ClassVar[list[type["BaseParser"]]] = [] + """ 存储所有已注册的 Parser 类 """ + + platform: ClassVar[Platform] + """ 平台信息(包含名称和显示名称) """ + + 
_session: ClassVar[ClientSession | None] = None + """ 全局 ClientSession 对象 """ + + if TYPE_CHECKING: + _key_patterns: ClassVar[KeyPatterns] + _handlers: ClassVar[dict[str, HandlerFunc]] + + def __init__( + self, + config: AstrBotConfig, + downloader: Downloader, + ): + self.headers = COMMON_HEADER.copy() + self.ios_headers = IOS_HEADER.copy() + self.android_headers = ANDROID_HEADER.copy() + self.config = config + self.downloader = downloader + self.client = self.get_session(config["common_timeout"]) + + def __init_subclass__(cls, **kwargs): + """自动注册子类到 _registry""" + super().__init_subclass__(**kwargs) + if ABC not in cls.__bases__: # 跳过抽象类 + BaseParser._registry.append(cls) + + cls._handlers = {} + cls._key_patterns = [] + + # 获取所有被 handle 装饰的方法 + for attr_name in dir(cls): + attr = getattr(cls, attr_name) + if callable(attr) and hasattr(attr, _KEY_PATTERNS): + key_patterns: KeyPatterns = getattr(attr, _KEY_PATTERNS) + handler = cast(HandlerFunc, attr) + for keyword, pattern in key_patterns: + cls._handlers[keyword] = handler + cls._key_patterns.append((keyword, pattern)) + + # 按关键字长度降序排序 + cls._key_patterns.sort(key=lambda x: -len(x[0])) + + @classmethod + def get_all_subclass(cls) -> list[type["BaseParser"]]: + """获取所有已注册的 Parser 类""" + return cls._registry + + @classmethod + def get_session(cls, timeout: float = 30) -> ClientSession: + """取全局单例,首次调用时创建""" + if cls._session is None or cls._session.closed: + cls._session = ClientSession( + connector=TCPConnector(ssl=False), + timeout=ClientTimeout(total=timeout), + ) + return cls._session + + @classmethod + async def close_session(cls) -> None: + """关闭全局单例,插件卸载时调用一次即可""" + if cls._session and not cls._session.closed: + await cls._session.close() + cls._session = None + + async def parse(self, keyword: str, searched: Match[str]) -> ParseResult: + """解析 URL 提取信息 + + Args: + keyword: 关键词 + searched: 正则表达式匹配对象,由平台对应的模式匹配得到 + + Returns: + ParseResult: 解析结果 + + Raises: + ParseException: 解析失败时抛出 + """ + return await 
self._handlers[keyword](self, searched) + + async def parse_with_redirect( + self, + url: str, + headers: dict[str, str] | None = None, + ) -> ParseResult: + """先重定向再解析""" + redirect_url = await self.get_redirect_url(url, headers=headers or self.headers) + + if redirect_url == url: + raise ParseException(f"无法重定向 URL: {url}") + + keyword, searched = self.search_url(redirect_url) + return await self.parse(keyword, searched) + + @classmethod + def search_url(cls, url: str) -> tuple[str, Match[str]]: + """搜索 URL 匹配模式""" + for keyword, pattern in cls._key_patterns: + if keyword not in url: + continue + if searched := pattern.search(url): + return keyword, searched + raise ParseException(f"无法匹配 {url}") + + @classmethod + def result(cls, **kwargs: Unpack[ParseResultKwargs]) -> ParseResult: + """构建解析结果""" + return ParseResult(platform=cls.platform, **kwargs) + + + async def get_redirect_url( + self, + url: str, + headers: dict[str, str] | None = None, + ) -> str: + """获取重定向后的 URL, 单次重定向""" + + headers = headers or COMMON_HEADER.copy() + async with self.client.get(url, headers=headers, allow_redirects=False) as resp: + if resp.status >= 400: + raise ClientError(f"redirect check {resp.status} {resp.reason}") + return resp.headers.get("Location", url) + + async def get_final_url( + self, + url: str, + headers: dict[str, str] | None = None, + ) -> str: + """获取重定向后的 URL, 允许多次重定向""" + headers = headers or COMMON_HEADER.copy() + async with self.client.get( + url, headers=headers, allow_redirects=True + ) as resp: + if resp.status >= 400: + raise ClientError(f"final url check {resp.status} {resp.reason}") + return str(resp.url) + + def create_author( + self, + name: str, + avatar_url: str | None = None, + description: str | None = None, + ): + """创建作者对象""" + from .data import Author + + avatar_task = None + if avatar_url: + avatar_task = self.downloader.download_img( + avatar_url, ext_headers=self.headers + ) + return Author(name=name, avatar=avatar_task, description=description) 
+ + def create_video_content( + self, + url_or_task: str | Task[Path], + cover_url: str | None = None, + duration: float = 0.0, + ): + """创建视频内容""" + from .data import VideoContent + + cover_task = None + if cover_url: + cover_task = self.downloader.download_img( + cover_url, ext_headers=self.headers + ) + if isinstance(url_or_task, str): + url_or_task = self.downloader.download_video( + url_or_task, ext_headers=self.headers + ) + + return VideoContent(url_or_task, cover_task, duration) + + def create_image_contents( + self, + image_urls: list[str], + ): + """创建图片内容列表""" + from .data import ImageContent + + contents: list[ImageContent] = [] + for url in image_urls: + task = self.downloader.download_img(url, ext_headers=self.headers) + contents.append(ImageContent(task)) + return contents + + def create_dynamic_contents( + self, + dynamic_urls: list[str], + ): + """创建动态图片内容列表""" + from .data import DynamicContent + + contents: list[DynamicContent] = [] + for url in dynamic_urls: + task = self.downloader.download_video(url, ext_headers=self.headers) + contents.append(DynamicContent(task)) + return contents + + def create_audio_content( + self, + url_or_task: str | Task[Path], + duration: float = 0.0, + ): + """创建音频内容""" + from .data import AudioContent + + if isinstance(url_or_task, str): + url_or_task = self.downloader.download_audio( + url_or_task, ext_headers=self.headers + ) + + return AudioContent(url_or_task, duration) + + def create_graphics_content( + self, + image_url: str, + text: str | None = None, + alt: str | None = None, + ): + """创建图文内容 图片不能为空 文字可空 渲染时文字在前 图片在后""" + from .data import GraphicsContent + + image_task = self.downloader.download_img(image_url, ext_headers=self.headers) + return GraphicsContent(image_task, text, alt) + + diff --git a/core/parsers/bilibili/__init__.py b/core/parsers/bilibili/__init__.py new file mode 100644 index 0000000..44c55e0 --- /dev/null +++ b/core/parsers/bilibili/__init__.py @@ -0,0 +1,524 @@ +import asyncio +import 
json +from collections.abc import AsyncGenerator +from pathlib import Path +from re import Match +from typing import ClassVar + +from bilibili_api import HEADERS, Credential, request_settings, select_client +from bilibili_api.login_v2 import QrCodeLogin, QrCodeLoginEvents +from bilibili_api.opus import Opus +from bilibili_api.video import Video, VideoCodecs, VideoQuality +from msgspec import convert + +from astrbot.api import logger +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ...utils import ck2dict +from ..base import ( + BaseParser, + Downloader, + DownloadException, + DurationLimitException, + ParseException, + PlatformEnum, + handle, +) +from ..data import ImageContent, MediaContent, Platform + +# 选择客户端 +select_client("curl_cffi") +# 模拟浏览器,第二参数数值参考 curl_cffi 文档 +# https://curl-cffi.readthedocs.io/en/latest/impersonate.html +request_settings.set("impersonate", "chrome131") + + +class BilibiliParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = Platform(name=PlatformEnum.BILIBILI, display_name="B站") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + self.headers = HEADERS.copy() + self._credential: Credential | None = None + self.max_duration = config["max_duration"] + self.cache_dir = Path(config["cache_dir"]) + + self.video_quality = getattr( + VideoQuality, config["bili_video_quality"].upper(), VideoQuality._720P + ) + self.codecs = getattr( + VideoCodecs, config["bili_video_codecs"].upper(), VideoCodecs.AVC + ) + self.bili_ck = config["bili_ck"] + self._cookies_file = Path(config["data_dir"]) / "bilibili_cookies.json" + + @handle("b23.tv", r"b23\.tv/[A-Za-z\d\._?%&+\-=/#]+") + @handle("bili2233", r"bili2233\.cn/[A-Za-z\d\._?%&+\-=/#]+") + async def _parse_short_link(self, searched: Match[str]): + """解析短链""" + url = f"https://{searched.group(0)}" + return await self.parse_with_redirect(url) + + @handle("BV", r"^(?PBV[0-9a-zA-Z]{10})(?:\s)?(?P\d{1,3})?$") + 
@handle("/BV", r"bilibili\.com(?:/video)?/(?PBV[0-9a-zA-Z]{10})(?:\?p=(?P\d{1,3}))?") + async def _parse_bv(self, searched: Match[str]): + """解析视频信息""" + bvid = str(searched.group("bvid")) + page_num = int(searched.group("page_num") or 1) + + return await self.parse_video(bvid=bvid, page_num=page_num) + + @handle("av", r"^av(?P\d{6,})(?:\s)?(?P\d{1,3})?$") + @handle("/av", r"bilibili\.com(?:/video)?/av(?P\d{6,})(?:\?p=(?P\d{1,3}))?") + async def _parse_av(self, searched: Match[str]): + """解析视频信息""" + avid = int(searched.group("avid")) + page_num = int(searched.group("page_num") or 1) + + return await self.parse_video(avid=avid, page_num=page_num) + + @handle("/dynamic/", r"bilibili\.com/dynamic/(?P\d+)") + @handle("t.bili", r"t\.bilibili\.com/(?P\d+)") + async def _parse_dynamic(self, searched: Match[str]): + """解析动态信息""" + dynamic_id = int(searched.group("dynamic_id")) + return await self.parse_dynamic(dynamic_id) + + @handle("live.bili", r"live\.bilibili\.com/(?P\d+)") + async def _parse_live(self, searched: Match[str]): + """解析直播信息""" + room_id = int(searched.group("room_id")) + return await self.parse_live(room_id) + + @handle("/favlist", r"favlist\?fid=(?P\d+)") + async def _parse_favlist(self, searched: Match[str]): + """解析收藏夹信息""" + fav_id = int(searched.group("fav_id")) + return await self.parse_favlist(fav_id) + + @handle("/read/", r"bilibili\.com/read/cv(?P\d+)") + async def _parse_read(self, searched: Match[str]): + """解析专栏信息""" + read_id = int(searched.group("read_id")) + return await self.parse_read(read_id) + + @handle("/opus/", r"bilibili\.com/opus/(?P\d+)") + async def _parse_opus(self, searched: Match[str]): + """解析图文动态信息""" + opus_id = int(searched.group("opus_id")) + return await self.parse_opus(opus_id) + + async def parse_video( + self, + *, + bvid: str | None = None, + avid: int | None = None, + page_num: int = 1, + ): + """解析视频信息 + + Args: + bvid (str | None): bvid + avid (int | None): avid + page_num (int): 页码 + """ + + from .video import 
AIConclusion, VideoInfo

        video = await self._get_video(bvid=bvid, avid=avid)
        # Convert the raw info dict into a msgspec struct.
        video_info = convert(await video.get_info(), VideoInfo)
        # Description text (omitted when empty).
        text = f"简介: {video_info.desc}" if video_info.desc else None
        # Uploader.
        author = self.create_author(video_info.owner.name, video_info.owner.face)
        # Resolve multi-part (分P) videos to the requested page.
        page_info = video_info.extract_info_with_page(page_num)

        # AI summary is only available with a valid credential.
        if self._credential:
            cid = await video.get_cid(page_info.index)
            ai_conclusion = await video.get_ai_conclusion(cid)
            ai_conclusion = convert(ai_conclusion, AIConclusion)
            ai_summary = ai_conclusion.summary
        else:
            ai_summary: str = "哔哩哔哩 cookie 未配置或失效, 无法使用 AI 总结"

        url = f"https://bilibili.com/{video_info.bvid}"
        url += f"?p={page_info.index + 1}" if page_info.index > 0 else ""

        # Deferred download task: re-use the cache file when it exists.
        async def download_video():
            output_path = self.cache_dir / f"{video_info.bvid}-{page_num}.mp4"
            if output_path.exists():
                return output_path
            v_url, a_url = await self.extract_download_urls(video=video, page_index=page_info.index)
            if page_info.duration > self.max_duration:
                raise DurationLimitException
            if a_url is not None:
                # DASH ships audio and video separately; download both and merge.
                return await self.downloader.download_av_and_merge(
                    v_url, a_url, output_path=output_path, ext_headers=self.headers
                )
            else:
                return await self.downloader.streamd(
                    v_url, file_name=output_path.name, ext_headers=self.headers
                )

        video_task = asyncio.create_task(download_video())
        video_content = self.create_video_content(
            video_task,
            page_info.cover,
            page_info.duration,
        )

        return self.result(
            url=url,
            title=page_info.title,
            timestamp=page_info.timestamp,
            text=text,
            author=author,
            contents=[video_content],
            extra={"info": ai_summary},
        )

    async def parse_dynamic(self, dynamic_id: int):
        """Parse a dynamic (feed post).

        Args:
            dynamic_id (int): dynamic id
        """
        from bilibili_api.dynamic import Dynamic

        from .dynamic import DynamicItem

        dynamic = Dynamic(dynamic_id, await self.credential)

        # Convert to struct.
        dynamic_data = convert(await dynamic.get_info(), DynamicItem)
        dynamic_info = dynamic_data.item
        # Extract fields through the struct's convenience properties.
        author = self.create_author(dynamic_info.name, dynamic_info.avatar)

        # Schedule image downloads.
        contents: list[MediaContent] = []
        for image_url in dynamic_info.image_urls:
            img_task = self.downloader.download_img(image_url, ext_headers=self.headers)
            contents.append(ImageContent(img_task))

        return self.result(
            title=dynamic_info.title,
            text=dynamic_info.text,
            timestamp=dynamic_info.timestamp,
            author=author,
            contents=contents,
        )

    async def parse_opus(self, opus_id: int):
        """Parse an opus (image-text post).

        Args:
            opus_id (int): opus id
        """
        opus = Opus(opus_id, await self.credential)
        return await self._parse_opus_obj(opus)

    async def parse_read_old(self, read_id: int):
        """Parse an article via the opus pathway. Deprecated.

        Args:
            read_id (int): article id
        """
        from bilibili_api.article import Article

        article = Article(read_id)
        return await self._parse_opus_obj(await article.turn_to_opus())

    async def _parse_opus_obj(self, bili_opus: Opus):
        """Parse an Opus object into a ParseResult.

        Args:
            bili_opus (Opus): opus object to parse

        Returns:
            ParseResult: parse result
        """

        from .opus import ImageNode, OpusItem, TextNode

        opus_info = await bili_opus.get_info()
        if not isinstance(opus_info, dict):
            raise ParseException("获取图文动态信息失败")
        # Convert to struct.
        opus_data = convert(opus_info, OpusItem)
        logger.debug(f"opus_data: {opus_data}")
        author = self.create_author(*opus_data.name_avatar)

        # Walk text/image nodes in order (same approach as parse_read):
        # accumulated text is attached to the next image it precedes.
        contents: list[MediaContent] = []
        current_text = ""

        for node in opus_data.gen_text_img():
            if isinstance(node, ImageNode):
                contents.append(self.create_graphics_content(node.url, current_text.strip(), node.alt))
                current_text = ""
            elif isinstance(node, TextNode):
                current_text += node.text

        return self.result(
            title=opus_data.title,
            author=author,
            timestamp=opus_data.timestamp,
            contents=contents,
            # Trailing text after the last image.
            text=current_text.strip(),
        )

    async def parse_live(self, room_id: int):
        """Parse a live room.

        Args:
            room_id (int): live room id

        Returns:
            ParseResult: parse result
        """
        from bilibili_api.live import LiveRoom

        from .live import RoomData

        room = LiveRoom(room_display_id=room_id, credential=await self.credential)
        info_dict = await room.get_room_info()

        room_data = convert(info_dict, RoomData)
        contents: list[MediaContent] = []
        # Download the cover image.
        if cover := room_data.cover:
            cover_task = self.downloader.download_img(cover, ext_headers=self.headers)
            contents.append(ImageContent(cover_task))

        # Download the keyframe snapshot.
        if keyframe := room_data.keyframe:
            keyframe_task = self.downloader.download_img(
                keyframe, ext_headers=self.headers
            )
            contents.append(ImageContent(keyframe_task))

        author = self.create_author(room_data.name, room_data.avatar)

        url = f"https://www.bilibili.com/blackboard/live/live-activity-player.html?enterTheRoom=0&cid={room_id}"
        return self.result(
            url=url,
            title=room_data.title,
            text=room_data.detail,
            contents=contents,
            author=author,
        )

    async def parse_read(self, read_id: int):
        """Parse an article (专栏).

        Args:
            read_id (int): article id

        Returns:
            ParseResult: parse result
        """
        from bilibili_api.article import Article

        from .article import ArticleInfo, ImageNode, TextNode

        ar = Article(read_id)
        # Fetch the article content before reading it.
        await ar.fetch_content()
        data = ar.json()
        article_info = convert(data, ArticleInfo)
        logger.debug(f"article_info: {article_info}")

        # Pair accumulated text with the image that follows it.
        contents: list[MediaContent] = []
        current_text = ""
        for child in article_info.gen_text_img():
            if isinstance(child, ImageNode):
                contents.append(self.create_graphics_content(child.url, current_text.strip(), child.alt))
                current_text = ""
            elif isinstance(child, TextNode):
                current_text += child.text

        author = self.create_author(*article_info.author_info)

        return self.result(
            title=article_info.title,
            timestamp=article_info.timestamp,
            text=current_text.strip(),
            author=author,
            contents=contents,
        )

    async def parse_favlist(self, fav_id: int):
        """解析收藏夹信息

        Args:
            fav_id (int): 收藏夹 id

        Returns:
            list[GraphicsContent]: 图文内容列表
        """
        from bilibili_api.favorite_list import get_video_favorite_list_content

        from .favlist import FavData

        # Only the first page (20 entries) is fetched.
        fav_dict = await get_video_favorite_list_content(fav_id)

        if fav_dict["medias"] is None:
            raise ParseException("收藏夹内容为空, 或被风控")

        favdata = convert(fav_dict, FavData)

        return self.result(
            title=favdata.title,
            timestamp=favdata.timestamp,
            author=self.create_author(favdata.info.upper.name, favdata.info.upper.face),
            contents=[self.create_graphics_content(fav.cover, fav.desc) for fav in favdata.medias],
        )

    async def _get_video(self, *, bvid: str | None = None, avid: int | None = None) -> Video:
        """Build a Video object from either id form.

        Args:
            bvid (str | None): bvid
            avid (int | None): avid

        Raises:
            ParseException: when neither id is given
        """
        if avid:
            return Video(aid=avid, credential=await self.credential)
        elif bvid:
            return Video(bvid=bvid, credential=await self.credential)
        else:
            raise ParseException("avid 和 bvid 至少指定一项")

    async def extract_download_urls(
        self,
        video: Video | None = None,
        *,
        bvid: str | None = None,
        avid: int | None = None,
        page_index: int = 0,
    ) -> tuple[str, str | None]:
        """Pick the best video/audio stream URLs for a page.

        Args:
            video (Video | None): pre-built video object (ids ignored if given)
            bvid (str | None): bvid
            avid (int | None): avid
            page_index (int): page index = page number - 1

        Returns:
            tuple[str, str | None]: video URL, audio URL (None for
            non-DASH streams that carry audio inline)
        """

        from bilibili_api.video import (
            AudioStreamDownloadURL,
            VideoDownloadURLDataDetecter,
            VideoStreamDownloadURL,
        )

        if video is None:
            video = await self._get_video(bvid=bvid, avid=avid)

        # Fetch stream candidates and pick the best within configured limits.
        download_url_data = await video.get_download_url(page_index=page_index)
        detecter = VideoDownloadURLDataDetecter(download_url_data)
        streams = detecter.detect_best_streams(
            video_max_quality=self.video_quality,
            codecs=[self.codecs],
            no_dolby_video=True,
            no_hdr=True,
        )
        video_stream = streams[0]
        if not isinstance(video_stream, VideoStreamDownloadURL):
            raise DownloadException("未找到可下载的视频流")
        logger.debug(f"视频流质量: {video_stream.video_quality.name}, 编码: {video_stream.video_codecs}")

        audio_stream = streams[1]
        if not isinstance(audio_stream, AudioStreamDownloadURL):
            return video_stream.url, None
        logger.debug(f"音频流质量: {audio_stream.audio_quality.name}")
        return video_stream.url, audio_stream.url

    def _save_credential(self):
        """Persist the bilibili credential cookies to disk."""
        if self._credential is None:
            return

        self._cookies_file.write_text(json.dumps(self._credential.get_cookies()))

    def _load_credential(self):
        """Load the bilibili credential from the cookies file, if present."""
        if not self._cookies_file.exists():
            return

        self._credential = Credential.from_cookies(json.loads(self._cookies_file.read_text()))

    async def login_with_qrcode(self) -> bytes:
        """Start a QR-code login and return the QR image bytes."""
        self._qr_login = QrCodeLogin()
        await self._qr_login.generate_qrcode()

        qr_pic = self._qr_login.get_qrcode_picture()
        return qr_pic.content

    async def check_qr_state(self) -> AsyncGenerator[str, None]:
        """Poll QR-login state, yielding user-facing status messages.

        Polls up to 30 times with a 2 s interval (~60 s window).
        """
        scan_tip_pending = True

        for _ in range(30):
            state = await self._qr_login.check_state()
            match state:
                case QrCodeLoginEvents.DONE:
                    yield "登录成功"
                    self._credential = self._qr_login.get_credential()
                    self._save_credential()
                    break
                case QrCodeLoginEvents.CONF:
                    # Only announce the "scanned, confirm on phone" step once.
                    if scan_tip_pending:
                        yield "二维码已扫描, 请确认登录"
                        scan_tip_pending = False
                case QrCodeLoginEvents.TIMEOUT:
                    yield "二维码过期, 请重新生成"
                    break
            await asyncio.sleep(2)
        else:
            yield "二维码登录超时, 请重新生成"

    async def _init_credential(self):
        """Initialise the credential from config, else from the cookies file."""
        if not self.bili_ck:
            self._load_credential()
            return

        credential = Credential.from_cookies(ck2dict(self.bili_ck))
        if await credential.check_valid():
            logger.info(f"`parser_bili_ck` 有效, 保存到 {self._cookies_file}")
            self._credential = credential
            self._save_credential()
        else:
            logger.info(f"`parser_bili_ck` 已过期, 尝试从 {self._cookies_file} 加载")
            self._load_credential()

    @property
    async def credential(self) -> Credential | None:
        """The bilibili credential, lazily initialised and auto-refreshed.

        Returns None when the stored credential has expired.
        """

        if self._credential is None:
            await self._init_credential()
            return self._credential

        if not await self._credential.check_valid():
            logger.warning("哔哩哔哩凭证已过期, 请重新配置")
            return None

        if await self._credential.check_refresh():
            logger.info("哔哩哔哩凭证需要刷新")
            # Refresh requires both ac_time_value and bili_jct cookies.
            if self._credential.has_ac_time_value() and self._credential.has_bili_jct():
                await self._credential.refresh()
                logger.info(f"哔哩哔哩凭证刷新成功, 保存到 {self._cookies_file}")
                self._save_credential()
            else:
                logger.warning("哔哩哔哩凭证刷新需要包含 `SESSDATA`, `ac_time_value` 项")

        return self._credential
diff --git a/core/parsers/bilibili/article.py b/core/parsers/bilibili/article.py
new file mode 100644
index 0000000..3126763
--- /dev/null
+++ b/core/parsers/bilibili/article.py
@@ -0,0 +1,118 @@
"""Bilibili article (专栏) data models."""

from collections.abc import Generator
from typing import Any

from msgspec import Struct


class TextNode(Struct):
    """A plain-text node."""

    text: str


class ImageNode(Struct):
    """An image node."""

    url: str
    alt: str | None = None


class Author(Struct):
    """Article author info."""

    mid: int
    name: str
    face: str
    fans: int
    level: int


class Stats(Struct):
    """Article statistics."""

    view: int
    favorite: int
    like: int
    reply: int
    share: int
    coin: int


class Meta(Struct):
    """Article metadata."""

    id: int
    title: str
    summary: str
    publish_time: int
    author: Author
    stats: Stats
    tags: list[dict[str, Any]]
    words: int


class ArticleInfo(Struct):
    """Full article payload."""

    type: str
    meta: Meta
    children: list[dict[str, Any]]

    def gen_text_img(self) -> Generator[TextNode | ImageNode, None, None]:
        """Yield text and image nodes in document order."""
        for child in self.children:
            if child.get("type") == "ParagraphNode":
                # Paragraph: flatten all nested text content.
                text_content = self._extract_text_from_children(child.get("children", []))
                text_content = text_content.strip()
                if text_content:
                    yield TextNode(text="\n\n" + text_content)
            elif child.get("type") == "ImageNode":
                # Plain image node.
                yield ImageNode(url=child.get("url", ""), alt=child.get("alt"))
            elif child.get("type") == "VideoCardNode":
                # Video card: rendered as a text placeholder.
                yield TextNode(text=f"\n [视频卡片: {child.get('aid', 0)}]")

    def _extract_text_from_children(self, children: list[dict[str, Any]]) -> str:
        """Concatenate text from a child-node list, recursing into styled nodes."""
        text_content = ""
        for child in children:
            if child.get("type") == "TextNode":
                text_content += child.get("text", "")
            elif child.get("type") in ["BoldNode", "FontSizeNode", "ColorNode"]:
                # Styled wrappers carry nested children; recurse.
                text_content += self._extract_text_from_children(child.get("children", []))
        return text_content

    @property
    def author_info(self) -> tuple[str, str]:
        """Author (name, avatar URL)."""
        return self.meta.author.name, self.meta.author.face

    @property
    def title(self) -> str:
        """Article title."""
        return self.meta.title

    @property
    def timestamp(self) -> int:
        """Publish timestamp (seconds)."""
        return self.meta.publish_time

    @property
    def summary(self) -> str:
        """Article summary."""
        return self.meta.summary

    @property
    def stats(self) -> Stats:
        """Article statistics."""
        return self.meta.stats

    @property
    def tags(self) -> list[str]:
        """Tag names."""
        return [tag.get("name", "") for tag in self.meta.tags]
diff --git a/core/parsers/bilibili/common.py b/core/parsers/bilibili/common.py
new file mode 100644
index 0000000..3ab328b
--- /dev/null
+++ b/core/parsers/bilibili/common.py
@@ -0,0 +1,10 @@
from msgspec import Struct


class Upper(Struct):
    """Uploader (UP主) info shared across models."""

    mid: int
    """用户 ID"""
    name: str
    """作者"""
    face: str
    """头像"""
diff --git a/core/parsers/bilibili/dynamic.py b/core/parsers/bilibili/dynamic.py
new file mode 100644
index 0000000..40e2eef
--- /dev/null
+++ b/core/parsers/bilibili/dynamic.py
@@ -0,0 +1,197 @@
from typing import Any

from msgspec import Struct, convert


class AuthorInfo(Struct):
    """Dynamic author info."""

    name: str
    face: str
    mid: int
    pub_time: str
    pub_ts: int
    # jump_url: str
    # following: bool = False
    # official_verify: dict[str, Any] | None = None
    # vip: dict[str, Any] | None = None
    # pendant: dict[str, Any] | None = None


class VideoArchive(Struct):
    """Embedded video info."""

    aid: str
    bvid: str
    title: str
    desc: str
    cover: str
    # duration_text: str
    # jump_url: str
    # stat: dict[str, str]
    # badge: dict[str, Any] | None = None


class OpusImage(Struct):
    """Opus image info."""

    url: str
    # width: int
    # height: int
    # size: float
    # aigc: dict[str, Any] | None = None
    # live_url: str | None = None


class OpusSummary(Struct):
    """Opus summary text."""

    text: str
    # rich_text_nodes: list[dict[str, Any]]


class OpusContent(Struct):
    """Opus content block."""

    jump_url: str
    pics: list[OpusImage]
    summary: OpusSummary
    title: str | None = None
    # fold_action: list[str] | None = None


class DynamicMajor(Struct):
    """Main content of a dynamic: video archive or opus."""

    type: str
    archive: VideoArchive | None = None
    opus: OpusContent | None = None

    @property
    def title(self) -> str | None:
        """Title (video archives only)."""
        if self.type == "MAJOR_TYPE_ARCHIVE" and self.archive:
            return self.archive.title
        return None

    @property
    def text(self) -> str | None:
        """Body text: video description or opus summary."""
        if self.type == "MAJOR_TYPE_ARCHIVE" and self.archive:
            return self.archive.desc
        elif self.type == "MAJOR_TYPE_OPUS" and self.opus:
            return self.opus.summary.text
        return None

    @property
    def image_urls(self) -> list[str]:
        """Image URLs: opus pictures, or the video cover as a fallback."""
        if self.type == "MAJOR_TYPE_OPUS" and self.opus:
            return [pic.url for pic in self.opus.pics]
        elif self.type == "MAJOR_TYPE_ARCHIVE" and self.archive and self.archive.cover:
            return [self.archive.cover]
        return []

    @property
    def cover_url(self) -> str | None:
        """Cover URL (video archives only)."""
        if self.type == "MAJOR_TYPE_ARCHIVE" and self.archive:
            return self.archive.cover
        return None


class DynamicModule(Struct):
    """Module wrapper within a dynamic."""

    module_author: AuthorInfo
    module_dynamic: dict[str, Any] | None = None
    module_stat: dict[str, Any] | None = None

    @property
    def author_name(self) -> str:
        """Author name."""
        return self.module_author.name

    @property
    def author_face(self) -> str:
        """Author avatar URL."""
        return self.module_author.face

    @property
    def pub_ts(self) -> int:
        """Publish timestamp (seconds)."""
        return self.module_author.pub_ts

    @property
    def major_info(self) -> dict[str, Any] | None:
        """Raw `major` payload, if present."""
        if self.module_dynamic:
            return self.module_dynamic.get("major")
        return None


class DynamicInfo(Struct):
    """Dynamic info."""

    id_str: str
    type: str
    visible: bool
    modules: DynamicModule
    basic: dict[str, Any] | None = None

    @property
    def name(self) -> str:
        """Author name."""
        return self.modules.author_name

    @property
    def avatar(self) -> str:
        """Author avatar URL."""
        return self.modules.author_face

    @property
    def timestamp(self) -> int:
        """Publish timestamp (seconds)."""
        return self.modules.pub_ts

    @property
    def title(self) -> str | None:
        """Title, decoded lazily from the raw major payload."""
        major_info = self.modules.major_info
        if major_info:
            major = convert(major_info, DynamicMajor)
            return major.title
        return None

    @property
    def text(self) -> str | None:
        """Body text, decoded lazily from the raw major payload."""
        major_info = self.modules.major_info
        if major_info:
            major = convert(major_info, DynamicMajor)
            return major.text
        return None

    @property
    def image_urls(self) -> list[str]:
        """Image URLs, decoded lazily from the raw major payload."""
        major_info = self.modules.major_info
        if major_info:
            major = convert(major_info, DynamicMajor)
            return major.image_urls
        return []

    @property
    def cover_url(self) -> str | None:
        """Cover URL, decoded lazily from the raw major payload."""
        major_info = self.modules.major_info
        if major_info:
            major = convert(major_info, DynamicMajor)
            return major.cover_url
        return None


class DynamicItem(Struct):
    """Top-level dynamic wrapper."""

    item: DynamicInfo
diff --git a/core/parsers/bilibili/favlist.py b/core/parsers/bilibili/favlist.py
new file mode 100644
index 0000000..823deca
--- /dev/null
+++ b/core/parsers/bilibili/favlist.py
@@ -0,0 +1,66 @@
from msgspec import Struct

from .common import Upper


class FavItem(Struct):
    """One entry in a favourites list."""

    title: str
    cover: str
    intro: str
    link: str

    @property
    def url(self) -> str:
        """Web URL converted from the app-scheme link."""
        return self.link.replace("bilibili://video/", "https://bilibili.com/video/av")

    @property
    def desc(self) -> str:
        """Human-readable description."""
        return f"标题: {self.title}\n简介: {self.intro}\n链接: {self.url}"

    @property
    def avid(self) -> int:
        """avid, taken from the last path segment of the link."""
        return int(self.link.split("/")[-1])


class FavInfo(Struct):
    # id: int
    # fid: int
    # mid: int
    title: str
    """标题"""
    cover: str
    """封面"""
    upper: Upper
    """up 主信息"""
    ctime: int
    """创建时间戳"""
    mtime: int
    """修改时间戳"""
    media_count: int
    """媒体数量"""
    intro: str
    """简介"""


class FavData(Struct):
    """Favourites list payload: info plus entries."""

    info: FavInfo
    medias: list[FavItem]

    @property
    def title(self) -> str:
        """Display title."""
        return f"收藏夹 - {self.info.title}"

    @property
    def cover(self) -> str:
        """Cover URL."""
        return self.info.cover

    @property
    def desc(self) -> str:
        """Display description."""
        return f"简介: {self.info.intro}"

    @property
    def timestamp(self) -> int:
        """Creation timestamp (seconds)."""
        return self.info.ctime
diff --git a/core/parsers/bilibili/live.py b/core/parsers/bilibili/live.py
new file mode 100644
index 0000000..25bbd81
--- /dev/null
+++ b/core/parsers/bilibili/live.py
@@ -0,0 +1,72 @@
from msgspec import Struct


class RoomInfo(Struct):
    title: str
    """标题"""
    cover: str
    """封面"""
    keyframe: str
    """关键帧"""
    tags: str
    """标签"""
    area_name: str
    """分区名称"""
    parent_area_name: str
    """父分区名称"""


class BaseInfo(Struct):
    uname: str
    """用户名"""
    face: str
    """头像"""
    gender: str
    """性别"""


class LiveInfo(Struct):
    level: int
    """等级"""
    level_color: int
    """等级颜色"""
    score: int
    """分数"""


class AnchorInfo(Struct):
    base_info: BaseInfo
    """基础信息"""
    live_info: LiveInfo
    """直播信息"""


class RoomData(Struct):
    room_info: RoomInfo
    """房间信息"""
    anchor_info: AnchorInfo
    """主播信息"""

    @property
    def title(self) -> str:
        """Display title."""
        return f"直播 - {self.room_info.title}"

    @property
    def cover(self) -> str:
        """Cover URL."""
        return self.room_info.cover

    @property
    def detail(self) -> str:
        """Area and tag summary."""
        return f"分区: {self.room_info.area_name} | {self.room_info.parent_area_name}\n标签: {self.room_info.tags}"

    @property
    def keyframe(self) -> str:
        """Keyframe snapshot URL."""
        return self.room_info.keyframe

    @property
    def name(self) -> str:
        return self.anchor_info.base_info.uname

    @property
    def avatar(self) -> str:
        """Streamer avatar URL."""
        return self.anchor_info.base_info.face
diff --git a/core/parsers/bilibili/opus.py b/core/parsers/bilibili/opus.py
new file mode 100644
index 0000000..6ffc695
--- /dev/null
+++ b/core/parsers/bilibili/opus.py
@@ -0,0 +1,153 @@
from collections.abc import Generator
from typing import Any

from msgspec import Struct


class TextNode(Struct, tag="TextNode"):
    """Opus text node."""

    text: str
    """文本内容"""


class ImageNode(Struct, tag="ImageNode"):
    """Opus image node."""

    url: str
    """图片链接"""
    alt: str | None = None
    """图片描述"""


class Author(Struct):
    """Opus author info."""

    name: str
    face: str
    mid: int
    pub_time: str
    pub_ts: int


class Image(Struct):
    """Opus image info."""

    url: str
    # width: int
    # height: int
    # size: float


class Pic(Struct):
    """Opus picture group."""

    pics: list[Image]
    style: int


class Text(Struct):
    """Opus text payload (raw nodes)."""

    nodes: list[dict[str, Any]]


class Paragraph(Struct):
    """Opus paragraph: text and/or pictures."""

    para_type: int
    text: Text | None = None
    pic: Pic | None = None
    # align: int = 0
    # format: dict[str, Any] | None = None


class Content(Struct):
    """Opus content: ordered paragraphs."""

    paragraphs: list[Paragraph]


class Stat(Struct):
    """Opus statistics."""

    like: dict[str, Any] | None = None
    comment: dict[str, Any] | None = None
    forward: dict[str, Any] | None = None
    favorite: dict[str, Any] | None = None
    coin: dict[str, Any] | None = None


class Module(Struct):
    """Opus module wrapper."""

    module_type: str
    module_author: Author | None = None
    module_content: Content | None = None
    # module_stat: OpusStat | None = None


class Basic(Struct):
    """Opus basic info."""

    title: str


class Info(Struct):
    """Opus info."""

    id_str: str
    type: int
    modules: list[Module]
    basic: Basic | None = None


class OpusItem(Struct):
    """Top-level opus wrapper."""

    item: Info

    @property
    def title(self) -> str | None:
        """Title, when basic info is present."""
        return self.item.basic.title if self.item.basic else None

    @property
    def name_avatar(self) -> tuple[str, str]:
        """(author name, avatar URL) from the first author module.

        NOTE(review): raises StopIteration if no module carries an author —
        presumably the API always includes one; verify upstream.
        """
        author_module = next(module.module_author for module in self.item.modules if module.module_author)
        return author_module.name, author_module.face

    @property
    def timestamp(self) -> int | None:
        """Publish timestamp from the author module, if any."""
        for module in self.item.modules:
            if module.module_type == "MODULE_TYPE_AUTHOR" and module.module_author:
                return module.module_author.pub_ts
        return None

    def gen_text_img(self) -> Generator[TextNode | ImageNode, None, None]:
        """Yield text and image nodes in document order."""
        for module in self.item.modules:
            if module.module_type == "MODULE_TYPE_CONTENT" and module.module_content:
                for paragraph in module.module_content.paragraphs:
                    # Text paragraph.
                    if paragraph.text and paragraph.text.nodes:
                        text_content = self._extract_text_from_nodes(paragraph.text.nodes)
                        text_content = text_content.strip()
                        if text_content:
                            yield TextNode(text="\n\n" + text_content)

                    # Picture paragraph.
                    if paragraph.pic and paragraph.pic.pics:
                        for pic in paragraph.pic.pics:
                            yield ImageNode(url=pic.url)

    def _extract_text_from_nodes(self, nodes: list[dict[str, Any]]) -> str:
        """Concatenate the words from word/rich text nodes."""
        text_content = ""
        for node in nodes:
            if node.get("type") in [
                "TEXT_NODE_TYPE_WORD",
                "TEXT_NODE_TYPE_RICH",
            ] and node.get("word"):
                text_content += node["word"].get("words", "")
        return text_content
diff --git a/core/parsers/bilibili/video.py b/core/parsers/bilibili/video.py
new file mode 100644
index 0000000..3740a72
--- /dev/null
+++ b/core/parsers/bilibili/video.py
@@ -0,0 +1,140 @@
from dataclasses import dataclass

from msgspec import Struct

from .common import Upper


class Stats(Struct):
    view: int
    """播放量"""
    danmaku: int
    """弹幕数"""
    reply: int
    """回复数"""
    favorite: int
    """收藏数"""
    coin: int
    """硬币数"""
    share: int
    """分享数"""
    like: int
    """点赞数"""


class Page(Struct):
    part: str
    """分集标题"""
    ctime: int
    """创建时间戳"""
    duration: int
    """时长"""
    first_frame: str | None = None
    """封面图片"""


@dataclass(frozen=True, slots=True)
class PageInfo:
    """Resolved info for one page of a (possibly multi-part) video."""

    index: int
    title: str
    duration: int
    timestamp: int
    cover: str | None = None


class VideoInfo(Struct):
    bvid: str
    """bvid"""
    title: str
    """标题"""
    desc: str
    """简介"""
    duration: int
    """时长"""
    owner: Upper
    """作者信息"""
    stat: Stats
    """统计信息"""
    pubdate: int
    """公开时间戳"""
    ctime: int
    """创建时间戳"""
    pic: str | None = None
    """封面图片"""
    pages: list[Page] | None = None
    """分集信息"""

    @property
    def title_with_part(self) -> str:
        """Title, with the first part name appended for multi-part videos."""
        if self.pages and len(self.pages) > 1:
            return f"{self.title} - {self.pages[0].part}"
        return self.title

    @property
    def formatted_stats_info(self) -> str:
        """Stats as a single emoji-labelled line."""
        # Emoji label paired with its counter value.
        stats_mapping = [
            ("👍", self.stat.like),
            ("🪙", self.stat.coin),
            ("⭐", self.stat.favorite),
            ("↩️", self.stat.share),
            ("💬", self.stat.reply),
            ("👀", self.stat.view),
            ("💭", self.stat.danmaku),
        ]

        # Build the result string.
        result_parts = []
        for display_name, value in stats_mapping:
            # Values above 10000 are shown in units of 万 (10k).
            formatted_value = f"{value / 10000:.1f}万" if value > 10000 else str(value)
            result_parts.append(f"{display_name} {formatted_value}")

        return " ".join(result_parts)

    def extract_info_with_page(self, page_num: int = 1) -> PageInfo:
        """获取视频信息,包含页索引、标题、时长、封面
        Args:
            page_num (int): 页索引. Defaults to 1.

        Returns:
            tuple[int, str, int, str | None]: 页索引、标题、时长、封面
        """
        page_idx = page_num - 1
        title = self.title
        duration = self.duration
        cover = self.pic
        timestamp = self.pubdate

        # Multi-part video: override with the selected page's metadata.
        if self.pages and len(self.pages) > 1:
            # Wrap out-of-range page numbers instead of failing.
            page_idx = page_idx % len(self.pages)
            page = self.pages[page_idx]
            title += f" | 分集 - {page.part}"
            duration = page.duration
            cover = page.first_frame
            timestamp = page.ctime

        return PageInfo(
            index=page_idx,
            title=title,
            duration=duration,
            timestamp=timestamp,
            cover=cover,
        )


class ModelResult(Struct):
    """AI-conclusion model output."""

    summary: str


class AIConclusion(Struct):
    """AI-conclusion payload (model_result absent when unsupported)."""

    model_result: ModelResult | None = None

    @property
    def summary(self) -> str:
        """Formatted AI summary, or a fallback notice."""
        if self.model_result and self.model_result.summary:
            return f"AI总结: {self.model_result.summary}"
        return "该视频暂不支持AI总结"
diff --git a/core/parsers/data.py b/core/parsers/data.py
new file mode 100644
index 0000000..e56f6f8
--- /dev/null
+++ b/core/parsers/data.py
@@ -0,0 +1,241 @@
from asyncio import Task
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, TypedDict


def repr_path_task(path_task: Path | Task[Path]) -> str:
    """Compact repr for a path-or-pending-task value."""
    if isinstance(path_task, Path):
        return f"path={path_task.name}"
    else:
        return f"task={path_task.get_name()}, done={path_task.done()}"


@dataclass(repr=False, slots=True)
class MediaContent:
    """Base media content backed by a path or a pending download task."""

    path_task: Path | Task[Path]

    async def get_path(self) -> Path:
        """Await the download if needed and cache the resulting path."""
        if isinstance(self.path_task, Path):
            return self.path_task
        self.path_task = await self.path_task
        return self.path_task

    def __repr__(self) -> str:
        prefix = self.__class__.__name__
        return f"{prefix}({repr_path_task(self.path_task)})"


@dataclass(repr=False, slots=True)
class AudioContent(MediaContent):
    """音频内容"""

    duration: float = 0.0


@dataclass(repr=False, slots=True)
class VideoContent(MediaContent):
    """视频内容"""

    cover: Path | Task[Path] | None = None
    """视频封面"""
    duration: float = 0.0
    """时长 单位: 秒"""

    async def get_cover_path(self) -> Path | None:
        """Await the cover download if needed and cache the resulting path."""
        if self.cover is None:
            return None
        if isinstance(self.cover, Path):
            return self.cover
        self.cover = await self.cover
        return self.cover

    @property
    def display_duration(self) -> str:
        """Duration formatted as 'mm:ss' with a label."""
        minutes = int(self.duration) // 60
        seconds = int(self.duration) % 60
        return f"时长: {minutes}:{seconds:02d}"

    def __repr__(self) -> str:
        repr = f"VideoContent(path={repr_path_task(self.path_task)}"
        if self.cover is not None:
            repr += f", cover={repr_path_task(self.cover)}"
        return repr + ")"


@dataclass(repr=False, slots=True)
class ImageContent(MediaContent):
    """图片内容"""

    pass


@dataclass(repr=False, slots=True)
class DynamicContent(MediaContent):
    """动态内容 视频格式 后续转 gif"""

    gif_path: Path | None = None


@dataclass(repr=False, slots=True)
class GraphicsContent(MediaContent):
    """图文内容 渲染时文字在前 图片在后"""

    text: str | None = None
    """图片前的文本内容"""
    alt: str | None = None
    """图片描述 渲染时居中显示"""

    def __repr__(self) -> str:
        repr = f"GraphicsContent(path={repr_path_task(self.path_task)}"
        if self.text:
            repr += f", text={self.text}"
        if self.alt:
            repr += f", alt={self.alt}"
        return repr + ")"


@dataclass(slots=True)
class Platform:
    """平台信息"""

    name: str
    """ 平台名称 """
    display_name: str
    """ 平台显示名称 """


@dataclass(repr=False, slots=True)
class Author:
    """作者信息"""

    name: str
    """作者名称"""
    avatar: Path | Task[Path] | None = None
    """作者头像 URL 或本地路径"""
    description: str | None = None
    """作者个性签名等"""

    async def get_avatar_path(self) -> Path | None:
        """Await the avatar download if needed and cache the resulting path."""
        if self.avatar is None:
            return None
        if isinstance(self.avatar, Path):
            return self.avatar
        self.avatar = await self.avatar
        return self.avatar

    def __repr__(self) -> str:
        repr = f"Author(name={self.name}"
        if self.avatar:
            repr += f", avatar_{repr_path_task(self.avatar)}"
        if self.description:
            repr += f", description={self.description}"
        return repr + ")"


@dataclass(repr=False, slots=True)
class ParseResult:
    """完整的解析结果"""

    platform: Platform
    """平台信息"""
    author: Author | None = None
    """作者信息"""
    title: str | None = None
    """标题"""
    text: str | None = None
    """文本内容"""
    timestamp: int | None = None
    """发布时间戳, 秒"""
    url: str | None = None
    """来源链接"""
    contents: list[MediaContent] = field(default_factory=list)
    """媒体内容"""
    extra: dict[str, Any] = field(default_factory=dict)
    """额外信息"""
    repost: "ParseResult | None" = None
    """转发的内容"""
    render_image: Path | None = None
    """渲染图片"""

    @property
    def header(self) -> str | None:
        """Header line (platform @author | title); used by the default render only."""
        header = self.platform.display_name
        if self.author:
            header += f" @{self.author.name}"
        if self.title:
            header += f" | {self.title}"
        return header

    @property
    def display_url(self) -> str | None:
        """Source link line, or None when there is no URL."""
        return f"链接: {self.url}" if self.url else None

    @property
    def repost_display_url(self) -> str | None:
        """Original-post link line for reposts, or None."""
        return f"原帖: {self.repost.url}" if self.repost and self.repost.url else None

    @property
    def extra_info(self) -> str | None:
        """The 'info' entry of extra, if any."""
        return self.extra.get("info")

    @property
    def video_contents(self) -> list[VideoContent]:
        """All video contents."""
        return [cont for cont in self.contents if isinstance(cont, VideoContent)]

    @property
    def img_contents(self) -> list[ImageContent]:
        """All image contents."""
        return [cont for cont in self.contents if isinstance(cont, ImageContent)]

    @property
    def audio_contents(self) -> list[AudioContent]:
        """All audio contents."""
        return [cont for cont in self.contents if isinstance(cont, AudioContent)]

    @property
    def dynamic_contents(self) -> list[DynamicContent]:
        """All dynamic (video-to-gif) contents."""
        return [cont for cont in self.contents if isinstance(cont, DynamicContent)]

    @property
    def graphics_contents(self) -> list[GraphicsContent]:
        """All graphics (text + image) contents."""
        return [cont for cont in self.contents if isinstance(cont, GraphicsContent)]

    @property
    async def cover_path(self) -> Path | None:
        """Cover path of the first video content (awaitable property)."""
        for cont in self.contents:
            if isinstance(cont, VideoContent):
                return await cont.get_cover_path()
        return None

    @property
    def formatted_datetime(self, fmt: str = "%Y-%m-%d %H:%M:%S") -> str | None:
        """Publish timestamp formatted as local time, or None when absent.

        NOTE(review): as a property this is always accessed without
        arguments, so `fmt` can never be overridden by callers — consider
        making this a plain method.
        """
        return datetime.fromtimestamp(self.timestamp).strftime(fmt) if self.timestamp is not None else None

    def __repr__(self) -> str:
        return (
            f"platform: {self.platform.display_name}, "
            f"timestamp: {self.timestamp}, "
            f"title: {self.title}, "
            f"text: {self.text}, "
            f"url: {self.url}, "
            f"author: {self.author}, "
            f"contents: {self.contents}, "
            f"extra: {self.extra}, "
            f"repost: <<<<<<<{self.repost}>>>>>>, "
            f"render_image: {self.render_image.name if self.render_image else 'None'}"
        )


class ParseResultKwargs(TypedDict, total=False):
    """Keyword arguments accepted when building a ParseResult."""

    title: str | None
    text: str | None
    contents: list[MediaContent]
    timestamp: int | None
    url: str | None
    author: Author | None
    extra: dict[str, Any]
    repost: ParseResult | None
diff --git a/core/parsers/douyin/__init__.py b/core/parsers/douyin/__init__.py
new file mode 100644
index 0000000..2c05d90
--- /dev/null
+++ b/core/parsers/douyin/__init__.py
@@ -0,0 +1,148 @@
import re
from typing import ClassVar

import msgspec
from aiohttp import TCPConnector

from astrbot.api import logger
from astrbot.core.config.astrbot_config import AstrBotConfig

from ..base import (
    BaseParser,
    Downloader,
    ParseException,
    Platform,
    PlatformEnum,
    handle,
)


class DouyinParser(BaseParser):
    """Parser for douyin share links (video, note, slides)."""

    # Platform metadata
    platform: ClassVar[Platform] = Platform(name=PlatformEnum.DOUYIN, display_name="抖音")

    def __init__(self, config: AstrBotConfig, downloader: Downloader):
        super().__init__(config, downloader)

    # https://v.douyin.com/_2ljF4AmKL8
    @handle("v.douyin", r"v\.douyin\.com/[a-zA-Z0-9_\-]+")
    @handle("jx.douyin", r"jx\.douyin\.com/[a-zA-Z0-9_\-]+")
    async def _parse_short_link(self, searched: re.Match[str]):
        """Resolve a short share link by following its redirect."""
        url = f"https://{searched.group(0)}"
        return await self.parse_with_redirect(url)

    # https://www.douyin.com/video/7521023890996514083
    #
https://www.douyin.com/note/7469411074119322899 + @handle("douyin", r"douyin\.com/(?Pvideo|note)/(?P\d+)") + @handle("iesdouyin", r"iesdouyin\.com/share/(?Pslides|video|note)/(?P\d+)") + @handle("m.douyin", r"m\.douyin\.com/share/(?Pslides|video|note)/(?P\d+)") + # https://jingxuan.douyin.com/m/video/7574300896016862490?app=yumme&utm_source=copy_link + @handle( + "jingxuan.douyin", + r"jingxuan\.douyin.com/m/(?Pslides|video|note)/(?P\d+)", + ) + async def _parse_douyin(self, searched: re.Match[str]): + ty, vid = searched.group("ty"), searched.group("vid") + if ty == "slides": + return await self.parse_slides(vid) + + for url in ( + self._build_m_douyin_url(ty, vid), + self._build_iesdouyin_url(ty, vid), + ): + try: + return await self.parse_video(url) + except ParseException as e: + logger.warning(f"failed to parse {url}, error: {e}") + continue + raise ParseException("分享已删除或资源直链提取失败, 请稍后再试") + + @staticmethod + def _build_iesdouyin_url(ty: str, vid: str) -> str: + return f"https://www.iesdouyin.com/share/{ty}/{vid}" + + @staticmethod + def _build_m_douyin_url(ty: str, vid: str) -> str: + return f"https://m.douyin.com/share/{ty}/{vid}" + + async def parse_video(self, url: str): + async with self.client.get( + url, + headers=self.ios_headers, + allow_redirects=False, + connector=TCPConnector(ssl=False), + ) as resp: + if resp.status != 200: + raise ParseException(f"status: {resp.status}") + text = await resp.text() + + pattern = re.compile( + pattern=r"window\._ROUTER_DATA\s*=\s*(.*?)", + flags=re.DOTALL, + ) + matched = pattern.search(text) + + if not matched or not matched.group(1): + raise ParseException("can't find _ROUTER_DATA in html") + + from .video import RouterData + + video_data = msgspec.json.decode(matched.group(1).strip(), type=RouterData).video_data + # 使用新的简洁构建方式 + contents = [] + + # 添加图片内容 + if image_urls := video_data.image_urls: + contents.extend(self.create_image_contents(image_urls)) + + # 添加视频内容 + elif video_url := video_data.video_url: + 
cover_url = video_data.cover_url + duration = video_data.video.duration if video_data.video else 0 + contents.append(self.create_video_content(video_url, cover_url, duration)) + + # 构建作者 + author = self.create_author(video_data.author.nickname, video_data.avatar_url) + + return self.result( + title=video_data.desc, + author=author, + contents=contents, + timestamp=video_data.create_time, + ) + + async def parse_slides(self, video_id: str): + url = "https://www.iesdouyin.com/web/api/v2/aweme/slidesinfo/" + params = { + "aweme_ids": f"[{video_id}]", + "request_source": "200", + } + async with self.client.get( + url, + params=params, + headers=self.android_headers, + connector=TCPConnector(ssl=False), + ) as resp: + resp.raise_for_status() + + from .slides import SlidesInfo + + slides_data = msgspec.json.decode(await resp.read(), type=SlidesInfo).aweme_details[0] + contents = [] + + # 添加图片内容 + if image_urls := slides_data.image_urls: + contents.extend(self.create_image_contents(image_urls)) + + # 添加动态内容 + if dynamic_urls := slides_data.dynamic_urls: + contents.extend(self.create_dynamic_contents(dynamic_urls)) + + # 构建作者 + author = self.create_author(slides_data.name, slides_data.avatar_url) + + return self.result( + title=slides_data.desc, + author=author, + contents=contents, + timestamp=slides_data.create_time, + ) diff --git a/core/parsers/douyin/slides.py b/core/parsers/douyin/slides.py new file mode 100644 index 0000000..b69fdf6 --- /dev/null +++ b/core/parsers/douyin/slides.py @@ -0,0 +1,59 @@ +from random import choice + +from msgspec import Struct, field + + +class PlayAddr(Struct): + url_list: list[str] + + +class Cover(Struct): + url_list: list[str] + + +class Video(Struct): + play_addr: PlayAddr + cover: Cover + duration: int + + +class Image(Struct): + video: Video | None = None + url_list: list[str] = field(default_factory=list) + + +class Avatar(Struct): + url_list: list[str] + + +class Author(Struct): + nickname: str + # avatar_larger: Avatar + 
avatar_thumb: Avatar + + +class SlidesData(Struct): + author: Author + desc: str + create_time: int + images: list[Image] + + @property + def name(self) -> str: + return self.author.nickname + + @property + def avatar_url(self) -> str: + return choice(self.author.avatar_thumb.url_list) + + @property + def image_urls(self) -> list[str]: + return [choice(image.url_list) for image in self.images] + + @property + def dynamic_urls(self) -> list[str]: + return [choice(image.video.play_addr.url_list) for image in self.images if image.video] + + +class SlidesInfo(Struct): + aweme_details: list[SlidesData] = field(default_factory=list) diff --git a/core/parsers/douyin/video.py b/core/parsers/douyin/video.py new file mode 100644 index 0000000..8cbe0a1 --- /dev/null +++ b/core/parsers/douyin/video.py @@ -0,0 +1,95 @@ +from random import choice +from typing import Any + +from msgspec import Struct, field + +from ..base import ParseException + + +class Avatar(Struct): + url_list: list[str] + + +class Author(Struct): + nickname: str + avatar_thumb: Avatar | None = None + avatar_medium: Avatar | None = None + + +class PlayAddr(Struct): + url_list: list[str] + + +class Cover(Struct): + url_list: list[str] + + +class Video(Struct): + play_addr: PlayAddr + cover: Cover + duration: int + + +class Image(Struct): + video: Video | None = None + url_list: list[str] = field(default_factory=list) + + +class VideoData(Struct): + create_time: int + author: Author + desc: str + images: list[Image] | None = None + video: Video | None = None + + @property + def image_urls(self) -> list[str]: + return [choice(image.url_list) for image in self.images] if self.images else [] + + @property + def video_url(self) -> str | None: + return choice(self.video.play_addr.url_list).replace("playwm", "play") if self.video else None + + @property + def cover_url(self) -> str | None: + return choice(self.video.cover.url_list) if self.video else None + + @property + def avatar_url(self) -> str | None: + if 
avatar := self.author.avatar_thumb: + return choice(avatar.url_list) + elif avatar := self.author.avatar_medium: + return choice(avatar.url_list) + return None + + +class VideoInfoRes(Struct): + item_list: list[VideoData] = field(default_factory=list) + + @property + def video_data(self) -> VideoData: + if len(self.item_list) == 0: + raise ParseException("can't find data in videoInfoRes") + return choice(self.item_list) + + +class VideoOrNotePage(Struct): + video_info_res: VideoInfoRes = field(name="videoInfoRes", default_factory=VideoInfoRes) + + +class LoaderData(Struct): + video_page: VideoOrNotePage | None = field(name="video_(id)/page", default=None) + note_page: VideoOrNotePage | None = field(name="note_(id)/page", default=None) + + +class RouterData(Struct): + loader_data: LoaderData = field(name="loaderData", default_factory=LoaderData) + errors: dict[str, Any] | None = None + + @property + def video_data(self) -> VideoData: + if page := self.loader_data.video_page: + return page.video_info_res.video_data + elif page := self.loader_data.note_page: + return page.video_info_res.video_data + raise ParseException("can't find video_(id)/page or note_(id)/page in router data") diff --git a/core/parsers/example.py b/core/parsers/example.py new file mode 100644 index 0000000..c869f8b --- /dev/null +++ b/core/parsers/example.py @@ -0,0 +1,134 @@ +from re import Match +from typing import ClassVar + +from aiohttp import ClientError + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from .base import BaseParser, handle +from .data import Platform + +""" +这是一个示例解析器,请感兴趣的开发者自行实现解析器,并提交PR。 + +""" + +class ExampleParser(BaseParser): + """示例视频网站解析器""" + + platform: ClassVar[Platform] = Platform(name="example", display_name="示例网站") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + + @handle("ex.short", r"ex\.short/\w+)") + async def _parse_short_link(self, 
searched: Match[str]): + """解析短链""" + url = f"https://{searched.group(0)}" + # 重定向再解析,请确保重定向链接的 handle 存在 + # 比如 url 重定向到 example.com/... 就会调用 _parse 解析 + return await self.parse_with_redirect(url) + + @handle("example.com", r"example\.com/video/(?P\w+)") + @handle("exam.ple", r"exam\.ple/(?P\w+)") + async def _parse(self, searched: Match[str]): + # 1. 提取视频 ID + video_id = searched.group("video_id") + url = f"https://api.example.com/video/{video_id}" + # 2. 请求 API 获取视频信息 + async with self.client.get(url, headers=self.headers) as resp: + if resp.status >= 400: + raise ClientError(f"HTTP {resp.status} {resp.reason}") + data = await resp.json() + + # 3. 提取数据 + title = data["title"] + author_name = data["author"]["name"] + avatar_url = data["author"]["avatar"] + video_url = data["video_url"] + cover_url = data["cover_url"] + duration = data["duration"] + timestamp = data["publish_time"] + description = data.get("description", "") + + # 4. 视频内容 + author = self.create_author(author_name, avatar_url) + video = self.create_video_content(video_url, cover_url, duration) + + # 5. 图集内容 + image_urls = data.get("images") + images = self.create_image_contents(image_urls) + + # 6. 
返回解析结果 + return self.result( + title=title, + text=description, + author=author, + contents=[video, *images], + timestamp=timestamp, + url=f"https://example.com/video/{video_id}", + ) + + +""" + +# 构建作者信息 + +author = self.create_author( + name="作者名", + avatar_url="https://example.com/avatar.jpg", # 可选,会自动下载 + description="个性签名" # 可选 +) + + +# 构建视频内容 + +## 方式1:传入 URL,自动下载 +video = self.create_video_content( + url_or_task="https://example.com/video.mp4", + cover_url="https://example.com/cover.jpg", # 可选 + duration=120.5 # 可选,单位:秒 +) + +## 方式2:传入已创建的下载任务 +video_task = self.download.download_video(url, ext_headers=self.headers) +video = self.create_video_content( + url_or_task=video_task, + cover_url=cover_url, + duration=duration +) + + +# 并发下载图集内容 +images = self.create_image_contents([ + "https://example.com/img1.jpg", + "https://example.com/img2.jpg", +]) + + +# 构建图文内容(适用于类似 Bilibili 动态图文混排) + +graphics = self.create_graphics_content( + image_url="https://example.com/image.jpg", + text="图片前的文字说明", # 可选 + alt="图片描述" # 可选,居中显示 +) + + +# 创建动图GIF内容,平台一般只提供视频, 后续插件会做自动转为 gif 的处理 + +dynamics = self.create_dynamic_contents([ + "https://example.com/dynamic1.mp4", + "https://example.com/dynamic2.mp4", +]) + + +# 重定向 url + +real_url = await self.get_redirect_url( + url="https://short.url/abc", + headers=self.headers # 可选 +) + +""" diff --git a/core/parsers/kuaishou.py b/core/parsers/kuaishou.py new file mode 100644 index 0000000..f6cf99e --- /dev/null +++ b/core/parsers/kuaishou.py @@ -0,0 +1,142 @@ +import re +from random import choice +from typing import ClassVar, TypeAlias + +import msgspec +from msgspec import Struct, field + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from .base import BaseParser, ParseException, PlatformEnum, handle +from .data import Platform + + +class KuaiShouParser(BaseParser): + """快手解析器""" + + # 平台信息 + platform: ClassVar[Platform] = Platform(name=PlatformEnum.KUAISHOU, display_name="快手") + + 
def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + self.ios_headers["Referer"] = "https://v.kuaishou.com/" + + # https://v.kuaishou.com/2yAnzeZ + @handle("v.kuaishou", r"v\.kuaishou\.com/[A-Za-z\d._?%&+\-=/#]+") + # https://www.kuaishou.com/short-video/3xhjgcmir24m4nm + @handle("kuaishou", r"(?:www\.)?kuaishou\.com/[A-Za-z\d._?%&+\-=/#]+") + # https://v.m.chenzhongtech.com/fw/photo/3xburnkmj3auazc + @handle("chenzhongtech", r"(?:v\.m\.)?chenzhongtech\.com/fw/[A-Za-z\d._?%&+\-=/#]+") + async def _parse_v_kuaishou(self, searched: re.Match[str]): + # 从匹配对象中获取原始URL + url = f"https://{searched.group(0)}" + real_url = await self.get_redirect_url(url, headers=self.ios_headers) + + if len(real_url) <= 0: + raise ParseException("failed to get location url from url") + + # /fw/long-video/ 返回结果不一样, 统一替换为 /fw/photo/ 请求 + real_url = real_url.replace("/fw/long-video/", "/fw/photo/") + + async with self.client.get(real_url, headers=self.ios_headers) as resp: + + if resp.status >= 400: + raise ParseException(f"获取页面失败 {resp.status}") + response_text = await resp.text() + + pattern = r"window\.INIT_STATE\s*=\s*(.*?)" + matched = re.search(pattern, response_text) + + if not matched: + raise ParseException("failed to parse video JSON info from HTML") + + json_str = matched.group(1).strip() + init_state = msgspec.json.decode(json_str, type=KuaishouInitState) + photo = next((d.photo for d in init_state.values() if d.photo is not None), None) + if photo is None: + raise ParseException("window.init_state don't contains videos or pics") + + # 简洁的构建方式 + contents = [] + + # 添加视频内容 + if video_url := photo.video_url: + contents.append(self.create_video_content(video_url, photo.cover_url, photo.duration)) + + # 添加图片内容 + if img_urls := photo.img_urls: + contents.extend(self.create_image_contents(img_urls)) + + # 构建作者 + author = self.create_author(photo.name, photo.head_url) + + return self.result( + title=photo.caption, + author=author, + 
contents=contents, + timestamp=photo.timestamp // 1000, + ) + + + + + +class CdnUrl(Struct): + cdn: str + url: str | None = None + + +class Atlas(Struct): + music_cdn_list: list[CdnUrl] = field(name="musicCdnList", default_factory=list) + cdn_list: list[CdnUrl] = field(name="cdnList", default_factory=list) + size: list[dict] = field(name="size", default_factory=list) + img_route_list: list[str] = field(name="list", default_factory=list) + + @property + def img_urls(self): + if len(self.cdn_list) == 0 or len(self.img_route_list) == 0: + return [] + cdn = choice(self.cdn_list).cdn + return [f"https://{cdn}/{url}" for url in self.img_route_list] + + +class ExtParams(Struct): + atlas: Atlas = field(default_factory=Atlas) + + +class Photo(Struct): + # 标题 + caption: str + timestamp: int + duration: int = 0 + user_name: str = field(default="未知用户", name="userName") + head_url: str | None = field(default=None, name="headUrl") + cover_urls: list[CdnUrl] = field(name="coverUrls", default_factory=list) + main_mv_urls: list[CdnUrl] = field(name="mainMvUrls", default_factory=list) + ext_params: ExtParams = field(name="ext_params", default_factory=ExtParams) + + @property + def name(self) -> str: + return self.user_name.replace("\u3164", "").strip() + + @property + def cover_url(self): + return choice(self.cover_urls).url if len(self.cover_urls) != 0 else None + + @property + def video_url(self): + return choice(self.main_mv_urls).url if len(self.main_mv_urls) != 0 else None + + @property + def img_urls(self): + return self.ext_params.atlas.img_urls + + +class TusjohData(Struct): + result: int + photo: Photo | None = None + + + +KuaishouInitState: TypeAlias = dict[str, TusjohData] diff --git a/core/parsers/nga.py b/core/parsers/nga.py new file mode 100644 index 0000000..a10fe6d --- /dev/null +++ b/core/parsers/nga.py @@ -0,0 +1,190 @@ +import asyncio +import json +import random +import re +import time +from typing import ClassVar + +from aiohttp import ClientError +from bs4 
import BeautifulSoup, Tag + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from ..exception import ParseException +from .base import BaseParser, Platform, PlatformEnum, handle + + +class NGAParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = Platform(name=PlatformEnum.NGA, display_name="NGA") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + extra_headers = { + "Referer": "https://nga.178.com/", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + self.headers.update(extra_headers) + self.base_img_url = "https://img.nga.178.com/attachments" + + @staticmethod + def nga_url(tid: str | int) -> str: + return f"https://nga.178.com/read.php?tid={tid}" + + # ("ngabbs.com", r"https?://ngabbs\.com/read\.php\?tid=(?P\d+)(?:[&#A-Za-z\d=_-]+)?"), + # ("nga.178.com", r"https?://nga\.178\.com/read\.php\?tid=(?P\d+)(?:[&#A-Za-z\d=_-]+)?"), + # ("bbs.nga.cn", r"https?://bbs\.nga\.cn/read\.php\?tid=(?P\d+)(?:[&#A-Za-z\d=_-]+)?"), + @handle("ngabbs.com", r"tid=(?P\d+)") + @handle("nga.178.com", r"tid=(?P\d+)") + @handle("bbs.nga.cn", r"tid=(?P\d+)") + async def _parse(self, searched: re.Match[str]): + # 从匹配对象中获取原始URL + tid = searched.group("tid") + url = self.nga_url(tid) + html = None + async with self.client.get(url, headers=self.headers, allow_redirects=True) as resp: + try: + # 第一次请求可能返回403,但包含设置cookie的JavaScript + html = await resp.text() + + # 如果返回403且包含guestJs cookie设置,提取cookie并重试 + if resp.status == 403 and "guestJs" in html: + # 从JavaScript中提取guestJs cookie值 + cookie_match = re.search( + r"document\.cookie\s*=\s*['\"]guestJs=([^;'\"]+)", + html, + ) + if cookie_match: + guest_js = cookie_match.group(1) + # 等待一小段时间(模拟JavaScript的setTimeout) + await 
asyncio.sleep(0.3) + + # 添加随机参数避免缓存(模拟JavaScript的行为) + rand_param = random.randint(0, 999) + separator = "&" if "?" in url else "?" + retry_url = f"{url}{separator}rand={rand_param}" + clean_headers = self.headers.copy() + clean_headers["Cookie"] = f"guestJs={guest_js}" + async with self.client.get( + retry_url, + headers=clean_headers, + allow_redirects=True, + ) as retry_resp: + + html = await retry_resp.text() + # 用 retry_resp 的状态继续后面的检查 + resp = retry_resp + if resp.status != 200: + raise ParseException( + f"仍无法获取页面, HTTP {resp.status}" + ) + + except ClientError as e: + raise ParseException(f"请求失败: {e}") + + if resp.status != 200: + raise ParseException(f"无法获取页面, HTTP {resp.status}") + + # 简单识别是否需要登录或被拦截 + if "需要" in html and ("登录" in html or "请登录" in html): + raise ParseException("页面可能需要登录后访问") + + # 使用 BeautifulSoup 解析 HTML + soup = BeautifulSoup(html, "html.parser") + + # 提取 title - 从 postsubject0 + title = None + title_tag = soup.find(id="postsubject0") + if title_tag and isinstance(title_tag, Tag): + title = title_tag.get_text(strip=True) + + # 提取作者 - 先从 postauthor0 标签提取 uid,再从 JavaScript 中查找用户名 + author = None + author_tag = soup.find(id="postauthor0") + if author_tag and isinstance(author_tag, Tag): + # 从 href 属性中提取 uid: href="nuke.php?func=ucp&uid=24278093" + href = author_tag.get("href", "") + uid_match = re.search(r"[?&]uid=(\d+)", str(href)) + if uid_match: + uid = uid_match.group(1) + # 从 JavaScript 的 commonui.userInfo.setAll() 中查找对应用户名 + script_pattern = r"commonui\.userInfo\.setAll\s*\(\s*(\{.*?\})\s*\)" + script_match = re.search(script_pattern, html, re.DOTALL) + if script_match: + try: + user_info_json = script_match.group(1) + user_info = json.loads(user_info_json) + # 使用提取的 uid 查找用户名 + if uid in user_info: + author = user_info[uid].get("username") + except (json.JSONDecodeError, KeyError): + # JSON 解析失败或数据结构不符合预期,保持 author 为 None + pass + author = self.create_author(author) if author else None + # 提取时间 - 从第一个帖子的 postdate0 + timestamp = None 
+ time_tag = soup.find(id="postdate0") + if time_tag and isinstance(time_tag, Tag): + timestr = time_tag.get_text(strip=True) + timestamp = int(time.mktime(time.strptime(timestr, "%Y-%m-%d %H:%M"))) + + # 提取文本 - postcontent0 + text = None + content_tag = soup.find(id="postcontent0") + contents = [] + if content_tag and isinstance(content_tag, Tag): + text = content_tag.get_text("\n", strip=True) + # 清理 BBCode 标签并限制长度 + img_urls: list[str] = re.findall(r"\[img\](.*?)\[/img\]", text) + img_urls = [self.base_img_url + url[1:] for url in img_urls] + contents.extend(self.create_image_contents(img_urls)) + text = self.clean_nga_text(text) + + return self.result( + title=title, + text=text, + url=url, + author=author, + contents=contents, + timestamp=timestamp, + ) + + @staticmethod + def clean_nga_text(text: str, max_length: int = 500) -> str: + rules: list[tuple[str, str, int]] = [ + # 移除图片标签(完整和不完整的) + (r"\[img\][^\[\]]*\[/img\]", "", 0), + (r"\[img\][^\[\]]*", "", 0), + # 处理URL标签,保留链接文本 + (r"\[url=[^\]]*\]([^\[]*?)\[/url\]", r"\1", 0), + (r"\[url\]([^\[]*?)\[/url\]", r"\1", 0), + # 移除引用标签 + (r"\[quote\].*?\[/quote\]", "", re.DOTALL), + # 处理格式标签,保留文本内容(b, i, u) + (r"\[(b|i|u)\](.*?)\[/\1\]", r"\2", re.DOTALL), + # 处理带属性的格式标签(color, size) + (r"\[(color|size)=[^\]]*\](.*?)\[/\1\]", r"\2", re.DOTALL), + # 移除其他未配对的标签 + (r"\[[^]]+\]", "", 0), + # 清理空白字符 + (r"\n{3,}", "\n\n", 0), # 多个换行符压缩为两个 + (r"[ \t]+", " ", 0), # 多个空格/制表符压缩为一个空格 + (r"\n\s+\n", "\n\n", 0), # 清理空行中的空白字符 + ] + + for rule in rules: + pattern, replacement, flags = rule[0], rule[1], rule[2] + text = re.sub(pattern, replacement, text, flags=flags) + + text = text.strip() + + # 限制文本长度 + if len(text) > max_length: + text = text[:max_length] + "..." 
+ + return text diff --git a/core/parsers/tiktok.py b/core/parsers/tiktok.py new file mode 100644 index 0000000..3e01257 --- /dev/null +++ b/core/parsers/tiktok.py @@ -0,0 +1,38 @@ +import re +from typing import ClassVar + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from .base import BaseParser, PlatformEnum, handle +from .data import Author, Platform, VideoContent + + +class TikTokParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = Platform(name=PlatformEnum.TIKTOK, display_name="TikTok") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + + @handle("tiktok.com", r"(?:https?://)?(www|vt|vm)\.tiktok\.com/[A-Za-z0-9._?%&+\-=/#@]*") + async def _parse(self, searched: re.Match[str]): + # 从匹配对象中获取原始URL + url, prefix = searched.group(0), searched.group(1) + + if prefix in ("vt", "vm"): + url = await self.get_redirect_url(url) + + # 获取视频信息 + video_info = await self.downloader.ytdlp_extract_info(url) + + # 下载封面和视频 + cover = self.downloader.download_img(video_info.thumbnail) + video = self.downloader.download_video(url, use_ytdlp=True) + + return self.result( + title=video_info.title, + author=Author(name=video_info.channel), + contents=[VideoContent(video, cover, duration=video_info.duration)], + timestamp=video_info.timestamp, + ) diff --git a/core/parsers/twitter.py b/core/parsers/twitter.py new file mode 100644 index 0000000..a66775a --- /dev/null +++ b/core/parsers/twitter.py @@ -0,0 +1,133 @@ +import re +from itertools import chain +from typing import Any, ClassVar + +from aiohttp import ClientError +from bs4 import BeautifulSoup, Tag + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from ..exception import ParseException +from .base import BaseParser, PlatformEnum, handle +from .data import ParseResult, Platform + + +class TwitterParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = 
Platform(name=PlatformEnum.TWITTER, display_name="推特") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + + async def _req_xdown_api(self, url: str) -> dict[str, Any]: + headers = { + "Accept": "application/json, text/plain, */*", + "Content-Type": "application/x-www-form-urlencoded", + "Origin": "https://xdown.app", + "Referer": "https://xdown.app/", + **self.headers, + } + data = {"q": url, "lang": "zh-cn"} + + async with self.client.post( + "https://xdown.app/api/ajaxSearch", + data=data, + headers=headers, + ) as resp: + if resp.status >= 400: + raise ClientError(f"xdown API {resp.status} {resp.reason}") + return await resp.json() + + @handle("x.com", r"https?://x.com/[0-9-a-zA-Z_]{1,20}/status/([0-9]+)") + async def _parse(self, searched: re.Match[str]) -> ParseResult: + # 从匹配对象中获取原始URL + url = searched.group(0) + resp = await self._req_xdown_api(url) + if resp.get("status") != "ok": + raise ParseException("解析失败") + + html_content = resp.get("data") + + if html_content is None: + raise ParseException("解析失败, 数据为空") + + return self.parse_twitter_html(html_content) + + def parse_twitter_html(self, html_content: str) -> ParseResult: + """解析 Twitter HTML 内容 + + Args: + html_content (str): Twitter HTML 内容 + + Returns: + ParseResult: 解析结果 + """ + soup = BeautifulSoup(html_content, "html.parser") + + # 初始化数据 + title = None + cover_url = None + video_url = None + images_urls = [] + dynamic_urls = [] + + # 1. 提取缩略图链接 + thumb_tag = soup.find("img") + if isinstance(thumb_tag, Tag): + if cover := thumb_tag.get("src"): + cover_url = str(cover) + + # 2. 
提取下载链接 + tw_button_tags = soup.find_all("a", class_="tw-button-dl") + abutton_tags = soup.find_all("a", class_="abutton") + for tag in chain(tw_button_tags, abutton_tags): + if not isinstance(tag, Tag): + continue + href = tag.get("href") + if href is None: + continue + + href = str(href) + text = tag.get_text(strip=True) + if "下载 MP4" in text: + video_url = href + break + elif "下载图片" in text: + images_urls.append(href) + elif "下载 gif" in text: + dynamic_urls.append(href) + + # 3. 提取标题 + title_tag = soup.find("h3") + if title_tag: + title = title_tag.get_text(strip=True) + + # 简洁的构建方式 + contents = [] + + # 添加视频内容 + if video_url: + contents.append(self.create_video_content(video_url, cover_url)) + + # 添加图片内容 + if images_urls: + contents.extend(self.create_image_contents(images_urls)) + + # 添加动态内容 + if dynamic_urls: + contents.extend(self.create_dynamic_contents(dynamic_urls)) + + return self.result( + title=title, + author=self.create_author("无用户名"), + contents=contents, + ) + # # 4. 提取Twitter ID + # twitter_id_input = soup.find("input", {"id": "TwitterId"}) + # if ( + # twitter_id_input + # and isinstance(twitter_id_input, Tag) + # and (value := twitter_id_input.get("value")) + # and isinstance(value, str) + # ): diff --git a/core/parsers/weibo.py b/core/parsers/weibo.py new file mode 100644 index 0000000..79847ed --- /dev/null +++ b/core/parsers/weibo.py @@ -0,0 +1,420 @@ +from re import Match, sub +from time import time +from typing import ClassVar +from uuid import uuid4 + +import msgspec +from aiohttp import ClientError +from bs4 import BeautifulSoup, Tag +from msgspec import Struct + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from .base import BaseParser, ParseException, Platform, PlatformEnum, handle +from .data import MediaContent + + +class WeiBoParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = Platform(name=PlatformEnum.WEIBO, display_name="微博") + + def __init__(self, config: 
AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + extra_headers = { + "accept": ( + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif," + "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" + ), + "referer": "https://weibo.com/", + } + self.headers.update(extra_headers) + + # https://weibo.com/tv/show/1034:5007449447661594?mid=5007452630158934 + @handle("weibo.com/tv", r"weibo\.com/tv/show/\d{4}:\d+\?mid=(?P\d+)") + async def _parse_weibo_tv(self, searched: Match[str]): + mid = str(searched.group("mid")) + weibo_id = self._mid2id(mid) + return await self.parse_weibo_id(weibo_id) + + # https://video.weibo.com/show?fid=1034:5145615399845897 + @handle("video.weibo", r"video\.weibo\.com/show\?fid=(?P\d+:\d+)") + async def _parse_video_weibo(self, searched: Match[str]): + fid = str(searched.group("fid")) + return await self.parse_fid(fid) + + # https://m.weibo.cn/status/5234367615996775 + # https://m.weibo.cn/detail/4976424138313924 + @handle("m.weibo.cn", r"m\.weibo\.cn/(?:status|detail)/(?P\d+)") + # https://weibo.com/7207262816/P5kWdcfDe + @handle("weibo.com", r"weibo\.com/\d+/(?P[0-9a-zA-Z]+)") + async def _parse_m_weibo_cn(self, searched: Match[str]): + wid = str(searched.group("wid")) + return await self.parse_weibo_id(wid) + + # https://mapp.api.weibo.cn/fx/233911ddcc6bffea835a55e725fb0ebc.html + @handle("mapp.api.weibo", r"mapp\.api\.weibo\.cn/fx/[A-Za-z\d]+\.html") + async def _parse_mapp_api_weibo(self, searched: Match[str]): + url = f"https://{searched.group(0)}" + return await self.parse_with_redirect(url) + + # https://weibo.com/ttarticle/p/show?id=2309404962180771742222 + # https://weibo.com/ttarticle/x/m/show#/id=2309404962180771742222 + @handle("weibo.com/ttarticle", r"id=(?P\d+)") + # https://card.weibo.com/article/m/show/id/2309404962180771742222 + @handle("weibo.com/article", r"/id/(?P\d+)") + async def _parse_article(self, searched: Match[str]): + _id = searched.group("id") + return 
await self.parse_article(_id) + + async def parse_article(self, _id: str): + class UserInfo(Struct): + screen_name: str + profile_image_url: str + + class Data(Struct): + url: str + title: str + content: str + userinfo: UserInfo + create_at_unix: int + + class Detail(Struct): + code: str + msg: str + data: Data + + url = "https://card.weibo.com/article/m/aj/detail" + params = { + "_rid": str(uuid4()), + "id": _id, + "_t": int(time() * 1000), + } + + + async with self.client.post( + url=url, + data=params, + headers=self.headers, + ) as resp: + if resp.status >= 400: + raise ClientError(f"article API {resp.status} {resp.reason}") + detail = msgspec.json.decode(await resp.read(), type=Detail) + + if detail.msg != "success": + raise ParseException("请求失败") + + data = detail.data + + soup = BeautifulSoup(data.content, "html.parser") + contents: list[MediaContent] = [] + text_buffer: list[str] = [] + + for element in soup.find_all(["p", "img"]): + if not isinstance(element, Tag): + continue + + if element.name == "p": + text = element.get_text(strip=True) + # 去除零宽空格 + text = text.replace("\u200b", "") + if text: + text_buffer.append(text) + elif element.name == "img": + src = element.get("src") + if isinstance(src, str): + text = "\n\n".join(text_buffer) + contents.append(self.create_graphics_content(src, text=text)) + text_buffer.clear() + + author = self.create_author( + data.userinfo.screen_name, + data.userinfo.profile_image_url, + ) + + end_text = "\n\n".join(text_buffer) if text_buffer else None + + return self.result( + url=data.url, + title=data.title, + author=author, + timestamp=data.create_at_unix, + text=end_text, + contents=contents, + ) + + async def parse_fid(self, fid: str): + """ + 解析带 fid 的微博视频 + """ + + req_url = f"https://h5.video.weibo.com/api/component?page=/show/{fid}" + headers = { + "Referer": f"https://h5.video.weibo.com/show/{fid}", + "Content-Type": "application/x-www-form-urlencoded", + **self.headers, + } + post_content = 
'data={"Component_Play_Playinfo":{"oid":"' + fid + '"}}' + + async with self.client.post( + req_url, + data=post_content, + headers=headers, + ) as resp: + if resp.status >= 400: + raise ClientError(f"video API {resp.status} {resp.reason}") + json_data = await resp.json() + + data = json_data.get("data", {}).get("Component_Play_Playinfo", {}) + if not data: + raise ParseException("Component_Play_Playinfo 数据为空") + # 提取作者 + user = data.get("reward", {}).get("user", {}) + author_name, avatar, description = ( + user.get("name", "未知"), + user.get("profile_image_url"), + user.get("description"), + ) + author = self.create_author(author_name, avatar, description) + + # 提取标题和文本 + title, text = data.get("title", ""), data.get("text", "") + if text: + text = sub(r"<[^>]*>", "", text) + text = text.replace("\n\n", "").strip() + + # 获取封面 + cover_url = data.get("cover_image") + if cover_url: + cover_url = "https:" + cover_url + + # 获取视频下载链接 + contents = [] + video_url_dict = data.get("urls") + if video_url_dict and isinstance(video_url_dict, dict): + # stream_url码率最低,urls中第一条码率最高 + first_mp4_url: str = next(iter(video_url_dict.values())) + video_url = "https:" + first_mp4_url + else: + video_url = data.get("stream_url") + + if video_url: + contents.append(self.create_video_content(video_url, cover_url)) + + # 时间戳 + timestamp = data.get("real_date") + + return self.result( + title=title, + text=text, + author=author, + contents=contents, + timestamp=timestamp, + ) + + async def parse_weibo_id(self, weibo_id: str): + """解析微博 id (无 Cookie + 伪装 XHR + 不跟随重定向)""" + headers = { + "accept": "application/json, text/plain, */*", + "referer": f"https://m.weibo.cn/detail/{weibo_id}", + "origin": "https://m.weibo.cn", + "x-requested-with": "XMLHttpRequest", + "mweibo-pwa": "1", + "sec-fetch-site": "same-origin", + "sec-fetch-mode": "cors", + "sec-fetch-dest": "empty", + **self.headers, + } + + # 加时间戳参数,减少被缓存/规则命中的概率 + ts = int(time() * 1000) + url = 
f"https://m.weibo.cn/statuses/show?id={weibo_id}&_={ts}" + + # 关键:不带 cookie、不跟随重定向(避免二跳携 cookie) + async with self.client.get( + url=url, + headers=headers, + allow_redirects=False, + ) as resp: + if resp.status != 200: + if resp.status in (403, 418): + raise ParseException(f"被风控拦截({resp.status}),可尝试更换 UA/Referer 或稍后重试") + raise ParseException(f"获取数据失败 {resp.status} {resp.reason}") + + ctype = resp.headers.get("content-type", "") + if "application/json" not in ctype: + raise ParseException(f"获取数据失败 content-type is not application/json (got: {ctype})") + + # 用 bytes 更稳,避免编码歧义 + weibo_data = msgspec.json.decode(await resp.read(), type=WeiboResponse).data + + return self.build_weibo_data(weibo_data) + + def build_weibo_data(self, data: "WeiboData"): + contents = [] + + # 添加视频内容 + if video_url := data.video_url: + cover_url = data.cover_url + contents.append(self.create_video_content(video_url, cover_url)) + + # 添加图片内容 + if image_urls := data.image_urls: + contents.extend(self.create_image_contents(image_urls)) + + # 构建作者 + author = self.create_author(data.display_name, data.user.profile_image_url) + repost = None + if data.retweeted_status: + repost = self.build_weibo_data(data.retweeted_status) + + return self.result( + title=data.title, + text=data.text_content, + author=author, + contents=contents, + timestamp=data.timestamp, + url=data.url, + repost=repost, + ) + + def _base62_encode(self, number: int) -> str: + """将数字转换为 base62 编码""" + alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + if number == 0: + return "0" + + result = "" + while number > 0: + result = alphabet[number % 62] + result + number //= 62 + + return result + + def _mid2id(self, mid: str) -> str: + """将微博 mid 转换为 id""" + from math import ceil + + mid = str(mid)[::-1] # 反转输入字符串 + size = ceil(len(mid) / 7) # 计算每个块的大小 + result = [] + + for i in range(size): + # 对每个块进行处理并反转 + s = mid[i * 7 : (i + 1) * 7][::-1] + # 将字符串转为整数后进行 base62 编码 + s = self._base62_encode(int(s)) + # 
class LargeInPic(Struct):
    # Large-size variant of a weibo picture.
    url: str


class Pic(Struct):
    url: str
    large: LargeInPic


class Urls(Struct):
    # Candidate mp4 renditions, best quality first.
    mp4_720p_mp4: str | None = None
    mp4_hd_mp4: str | None = None
    mp4_ld_mp4: str | None = None

    def get_video_url(self) -> str | None:
        """Return the best non-empty rendition URL, or None if all are missing."""
        for candidate in (self.mp4_720p_mp4, self.mp4_hd_mp4, self.mp4_ld_mp4):
            if candidate:
                return candidate
        return None


class PagePic(Struct):
    url: str


class PageInfo(Struct):
    title: str | None = None
    urls: Urls | None = None
    page_pic: PagePic | None = None


class User(Struct):
    id: int
    screen_name: str  # user nickname
    profile_image_url: str  # avatar URL
转换为 \n + text = self.text.replace("
", "\n") + # 去除 html 标签 + text = sub(r"<[^>]*>", "", text) + return text + + @property + def cover_url(self) -> str | None: + if self.page_info is None: + return None + if self.page_info.page_pic: + return self.page_info.page_pic.url + return None + + @property + def video_url(self) -> str | None: + if self.page_info and self.page_info.urls: + return self.page_info.urls.get_video_url() + return None + + @property + def image_urls(self) -> list[str]: + if self.pics: + return [x.large.url for x in self.pics] + return [] + + @property + def url(self) -> str: + return f"https://weibo.com/{self.user.id}/{self.bid}" + + @property + def timestamp(self) -> int: + from time import mktime, strptime + + create_at = strptime(self.created_at, "%a %b %d %H:%M:%S %z %Y") + return int(mktime(create_at)) + + +class WeiboResponse(Struct): + ok: int + data: WeiboData diff --git a/core/parsers/xiaohongshu.py b/core/parsers/xiaohongshu.py new file mode 100644 index 0000000..3aa8551 --- /dev/null +++ b/core/parsers/xiaohongshu.py @@ -0,0 +1,247 @@ +import json +import re +from typing import Any, ClassVar + +from msgspec import Struct, convert, field + +from astrbot.api import logger +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from .base import BaseParser, ParseException, Platform, PlatformEnum, handle + + +class XiaoHongShuParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = Platform(name=PlatformEnum.XIAOHONGSHU, display_name="小红书") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + explore_headers = { + "accept": ( + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif," + "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" + ) + } + self.headers.update(explore_headers) + discovery_headers = { + "origin": "https://www.xiaohongshu.com", + "x-requested-with": "XMLHttpRequest", + "sec-fetch-site": "same-origin", + 
"sec-fetch-mode": "cors", + "sec-fetch-dest": "empty", + } + self.ios_headers.update(discovery_headers) + + @handle("xhslink.com", r"xhslink\.com/[A-Za-z0-9._?%&+=/#@-]*") + async def _parse_short_link(self, searched: re.Match[str]): + url = f"https://{searched.group(0)}" + return await self.parse_with_redirect(url, self.ios_headers) + + # https://www.xiaohongshu.com/explore/68feefe40000000007030c4a?xsec_token=ABjAKjfMHJ7ck4UjPlugzVqMb35utHMRe_vrgGJ2AwJnc=&xsec_source=pc_feed + @handle( + "hongshu.com/explore", + r"explore/(?P[0-9a-zA-Z]+)\?[A-Za-z0-9._%&+=/#@-]*", + ) + async def _parse_explore(self, searched: re.Match[str]): + url = f"https://www.xiaohongshu.com/{searched.group(0)}" + xhs_id = searched.group("xhs_id") + return await self.parse_explore(url, xhs_id) + + # https://www.xiaohongshu.com/discovery/item/68e8e3fa00000000030342ec?app_platform=android&ignoreEngage=true&app_version=9.6.0&share_from_user_hidden=true&xsec_source=app_share&type=normal&xsec_token=CBW9rwIV2qhcCD-JsQAOSHd2tTW9jXAtzqlgVXp6c52Sw%3D&author_share=1&xhsshare=QQ&shareRedId=ODs3RUk5ND42NzUyOTgwNjY3OTo8S0tK&apptime=1761372823&share_id=3b61945239ac403db86bea84a4f15124&share_channel=qq + @handle( + "hongshu.com/discovery/item/", + r"discovery/item/(?P[0-9a-zA-Z]+)\?[A-Za-z0-9._%&+=/#@-]*", + ) + async def _parse_discovery(self, searched: re.Match[str]): + route = searched.group(0) + explore_route = route.replace("discovery/item", "explore", 1) + xhs_id = searched.group("xhs_id") + + try: + return await self.parse_explore(f"https://www.xiaohongshu.com/{explore_route}", xhs_id) + except ParseException: + logger.debug("parse_explore failed, fallback to parse_discovery") + return await self.parse_discovery(f"https://www.xiaohongshu.com/{route}") + + async def parse_explore(self, url: str, xhs_id: str): + async with self.client.get(url, headers=self.headers) as resp: + html = await resp.text() + logger.debug(f"url: {resp.url} | status: {resp.status}") + + json_obj = 
self._extract_initial_state_json(html) + + # ["note"]["noteDetailMap"][xhs_id]["note"] + note_data = json_obj.get("note", {}).get("noteDetailMap", {}).get(xhs_id, {}).get("note", {}) + if not note_data: + raise ParseException("can't find note detail in json_obj") + + class Image(Struct): + urlDefault: str + + class User(Struct): + nickname: str + avatar: str + + class NoteDetail(Struct): + type: str + title: str + desc: str + user: User + imageList: list[Image] = field(default_factory=list) + video: Video | None = None + + @property + def nickname(self) -> str: + return self.user.nickname + + @property + def avatar_url(self) -> str: + return self.user.avatar + + @property + def image_urls(self) -> list[str]: + return [item.urlDefault for item in self.imageList] + + @property + def video_url(self) -> str | None: + if self.type != "video" or not self.video: + return None + return self.video.video_url + + note_detail = convert(note_data, type=NoteDetail) + + contents = [] + # 添加视频内容 + if video_url := note_detail.video_url: + # 使用第一张图片作为封面 + cover_url = note_detail.image_urls[0] if note_detail.image_urls else None + contents.append(self.create_video_content(video_url, cover_url)) + + # 添加图片内容 + elif image_urls := note_detail.image_urls: + contents.extend(self.create_image_contents(image_urls)) + + # 构建作者 + author = self.create_author(note_detail.nickname, note_detail.avatar_url) + + return self.result( + title=note_detail.title, + text=note_detail.desc, + author=author, + contents=contents, + ) + + async def parse_discovery(self, url: str): + async with self.client.get( + url, + headers=self.ios_headers, + allow_redirects=True, + ) as resp: + html = await resp.text() + + json_obj = self._extract_initial_state_json(html) + note_data = json_obj.get("noteData") + if not note_data: + raise ParseException("can't find noteData in json_obj") + preload_data = note_data.get("normalNotePreloadData", {}) + note_data = note_data.get("data", {}).get("noteData", {}) + if not 
note_data: + raise ParseException("can't find noteData in noteData.data") + + class Image(Struct): + url: str + urlSizeLarge: str | None = None + + class User(Struct): + nickName: str + avatar: str + + class NoteData(Struct): + type: str + title: str + desc: str + user: User + time: int + lastUpdateTime: int + imageList: list[Image] = [] # 有水印 + video: Video | None = None + + @property + def image_urls(self) -> list[str]: + return [item.url for item in self.imageList] + + @property + def video_url(self) -> str | None: + if self.type != "video" or not self.video: + return None + return self.video.video_url + + class NormalNotePreloadData(Struct): + title: str + desc: str + imagesList: list[Image] = [] # 无水印, 但只有一只,用于视频封面 + + @property + def image_urls(self) -> list[str]: + return [item.urlSizeLarge or item.url for item in self.imagesList] + + note_data = convert(note_data, type=NoteData) + + contents = [] + if video_url := note_data.video_url: + if preload_data: + preload_data = convert(preload_data, type=NormalNotePreloadData) + img_urls = preload_data.image_urls + else: + img_urls = note_data.image_urls + contents.append(self.create_video_content(video_url, img_urls[0])) + elif img_urls := note_data.image_urls: + contents.extend(self.create_image_contents(img_urls)) + + return self.result( + title=note_data.title, + author=self.create_author(note_data.user.nickName, note_data.user.avatar), + contents=contents, + text=note_data.desc, + timestamp=note_data.time // 1000, + ) + + def _extract_initial_state_json(self, html: str) -> dict[str, Any]: + pattern = r"window\.__INITIAL_STATE__=(.*?)" + matched = re.search(pattern, html) + if not matched: + raise ParseException("小红书分享链接失效或内容已删除") + + json_str = matched.group(1).replace("undefined", "null") + return json.loads(json_str) + + +class Stream(Struct): + h264: list[dict[str, Any]] | None = None + h265: list[dict[str, Any]] | None = None + av1: list[dict[str, Any]] | None = None + h266: list[dict[str, Any]] | None = 
None + + +class Media(Struct): + stream: Stream + + +class Video(Struct): + media: Media + + @property + def video_url(self) -> str | None: + stream = self.media.stream + + # h264 有水印,h265 无水印 + if stream.h265: + return stream.h265[0]["masterUrl"] + elif stream.h264: + return stream.h264[0]["masterUrl"] + elif stream.av1: + return stream.av1[0]["masterUrl"] + elif stream.h266: + return stream.h266[0]["masterUrl"] + return None diff --git a/core/parsers/youtube.py b/core/parsers/youtube.py new file mode 100644 index 0000000..f2fc438 --- /dev/null +++ b/core/parsers/youtube.py @@ -0,0 +1,166 @@ +import re +from pathlib import Path +from typing import ClassVar + +import msgspec +from aiohttp import ClientError +from msgspec import Struct + +from astrbot.core.config.astrbot_config import AstrBotConfig + +from ..download import Downloader +from .base import BaseParser, Platform, PlatformEnum, handle + + +class YouTubeParser(BaseParser): + # 平台信息 + platform: ClassVar[Platform] = Platform(name=PlatformEnum.YOUTUBE, display_name="油管") + + def __init__(self, config: AstrBotConfig, downloader: Downloader): + super().__init__(config, downloader) + self.ytb_cookies_file = Path(self.config["ytb_cookies_file"]) or None + self.max_duration = config["max_duration"] + + @handle("youtu.be", r"https?://(?:www\.)?youtu\.be/[A-Za-z\d\._\?%&\+\-=/#]+") + @handle( + "youtube.com", + r"https?://(?:www\.)?youtube\.com/(?:watch|shorts)(?:/[A-Za-z\d_\-]+|\?v=[A-Za-z\d_\-]+)", + ) + async def _parse_video(self, searched: re.Match[str]): + return await self.parse_video(searched) + + async def parse_video(self, searched: re.Match[str]): + # 从匹配对象中获取原始URL + url = searched.group(0) + + video_info = await self.downloader.ytdlp_extract_info( + url, cookiefile=self.ytb_cookies_file + ) + author = await self._fetch_author_info(video_info.channel_id) + + contents = [] + if video_info.duration <= self.max_duration: + video = self.downloader.download_video( + url, use_ytdlp=True, 
cookiefile=self.ytb_cookies_file + ) + contents.append( + self.create_video_content( + video, + video_info.thumbnail, + video_info.duration, + ) + ) + else: + contents.extend(self.create_image_contents([video_info.thumbnail])) + + return self.result( + title=video_info.title, + author=author, + contents=contents, + timestamp=video_info.timestamp, + ) + + async def parse_audio(self, url: str): + """解析 YouTube URL 并标记为音频下载 + + Args: + url: YouTube 链接 + + Returns: + ParseResult: 解析结果(音频内容) + + """ + video_info = await self.downloader.ytdlp_extract_info(url, self.ytb_cookies_file) + author = await self._fetch_author_info(video_info.channel_id) + + contents = [] + contents.extend(self.create_image_contents([video_info.thumbnail])) + + if video_info.duration <= self.max_duration: + audio_task = self.downloader.download_audio( + url, use_ytdlp=True, cookiefile=self.ytb_cookies_file + ) + contents.append(self.create_audio_content(audio_task, duration=video_info.duration)) + + return self.result( + title=video_info.title, + author=author, + contents=contents, + timestamp=video_info.timestamp, + ) + + async def _fetch_author_info(self, channel_id: str): + url = "https://www.youtube.com/youtubei/v1/browse?prettyPrint=false" + payload = { + "context": { + "client": { + "hl": "zh-HK", + "gl": "US", + "deviceMake": "Apple", + "deviceModel": "", + "clientName": "WEB", + "clientVersion": "2.20251002.00.00", + "osName": "Macintosh", + "osVersion": "10_15_7", + }, + "user": {"lockedSafetyMode": False}, + "request": { + "useSsl": True, + "internalExperimentFlags": [], + "consistencyTokenJars": [], + }, + }, + "browseId": channel_id, + } + async with self.client.post( + url, + json=payload, + headers=self.headers, + ) as resp: + if resp.status >= 400: + raise ClientError(f"YouTube browse API {resp.status} {resp.reason}") + browse = msgspec.json.decode(await resp.read(), type=BrowseResponse) + + return self.create_author(browse.name, browse.avatar_url, browse.description) + + + + + 
+class Thumbnail(Struct): + url: str + + +class AvatarInfo(Struct): + thumbnails: list[Thumbnail] + + +class ChannelMetadataRenderer(Struct): + title: str + description: str + avatar: AvatarInfo + + +class Metadata(Struct): + channelMetadataRenderer: ChannelMetadataRenderer + + +class Avatar(Struct): + thumbnails: list[Thumbnail] + + +class BrowseResponse(Struct): + metadata: Metadata + + @property + def name(self) -> str: + return self.metadata.channelMetadataRenderer.title + + @property + def avatar_url(self) -> str | None: + thumbnails = self.metadata.channelMetadataRenderer.avatar.thumbnails + return thumbnails[0].url if thumbnails else None + + @property + def description(self) -> str: + return self.metadata.channelMetadataRenderer.description diff --git a/core/render.py b/core/render.py new file mode 100644 index 0000000..8f06cc1 --- /dev/null +++ b/core/render.py @@ -0,0 +1,1455 @@ +import uuid +from collections.abc import AsyncGenerator, Awaitable, Callable +from dataclasses import dataclass +from functools import lru_cache, wraps +from io import BytesIO +from itertools import chain +from pathlib import Path +from typing import ClassVar, ParamSpec, TypeVar + +import aiofiles +from apilmoji import Apilmoji, EmojiCDNSource +from apilmoji.core import get_font_height +from PIL import Image, ImageDraw, ImageFont + +from astrbot.api import logger +from astrbot.core.config.astrbot_config import AstrBotConfig +from astrbot.core.message.components import BaseMessageComponent, Plain, Record, Video +from astrbot.core.message.components import Image as AstrImage + +from .exception import DownloadException, DownloadLimitException, ZeroSizeException +from .parsers import ( + AudioContent, + DynamicContent, + GraphicsContent, + ImageContent, + ParseResult, + VideoContent, +) +from .utils import construct_forward_message + +# 定义类型变量 +P = ParamSpec("P") +T = TypeVar("T") + +Color = tuple[int, int, int] +PILImage = Image.Image + + +def suppress_exception( + func: Callable[P, 
T], +) -> Callable[P, T | None]: + """装饰器:捕获所有异常并返回 None""" + + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> T | None: + try: + return func(*args, **kwargs) + except Exception as e: + logger.debug(f"函数 {func.__name__} 执行失败: {e}") + return None + + return wrapper + + +def suppress_exception_async( + func: Callable[P, Awaitable[T]], +) -> Callable[P, Awaitable[T | None]]: + """装饰器:捕获所有异常并返回 None""" + + @wraps(func) + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T | None: + try: + return await func(*args, **kwargs) + except Exception as e: + logger.debug(f"函数 {func.__name__} 执行失败: {e}") + return None + + return wrapper + + +@dataclass(eq=False, frozen=True, slots=True) +class FontInfo: + """字体信息数据类""" + + font: ImageFont.FreeTypeFont + line_height: int + cjk_width: int + + def __hash__(self) -> int: + """实现哈希方法以支持 @lru_cache""" + return hash((id(self.font), self.line_height, self.cjk_width)) + + @lru_cache(maxsize=400) + def get_char_width(self, char: str) -> int: + """获取字符宽度,使用缓存优化""" + # bbox = self.font.getbbox(char) + # width = int(bbox[2] - bbox[0]) + # return width + return int(self.font.getlength(char)) + + def get_char_width_fast(self, char: str) -> int: + """快速获取单个字符宽度""" + if "\u4e00" <= char <= "\u9fff": + return self.cjk_width + else: + return self.get_char_width(char) + + def get_text_width(self, text: str) -> int: + """计算文本宽度,使用预计算的字符宽度优化性能 + + Args: + text: 要计算宽度的文本 + + Returns: + 文本宽度(像素) + """ + if not text: + return 0 + + total_width = 0 + for char in text: + total_width += self.get_char_width_fast(char) + return total_width + + +@dataclass(eq=False, frozen=True, slots=True) +class FontSet: + """字体集数据类""" + + _FONT_SIZES = ( + ("name", 28), + ("title", 30), + ("text", 24), + ("extra", 24), + ("indicator", 60), + ) + """字体大小""" + + name_font: FontInfo + title_font: FontInfo + text_font: FontInfo + extra_font: FontInfo + indicator_font: FontInfo + + @classmethod + def new(cls, font_path: Path): + font_infos: dict[str, 
FontInfo] = {} + for name, size in cls._FONT_SIZES: + font = ImageFont.truetype(font_path, size) + font_infos[f"{name}_font"] = FontInfo( + font=font, + line_height=get_font_height(font), + cjk_width=size, + ) + return FontSet(**font_infos) + + +@dataclass(eq=False, frozen=True, slots=True) +class SectionData: + """基础部分数据类""" + + height: int + + +@dataclass(eq=False, frozen=True, slots=True) +class HeaderSectionData(SectionData): + """Header 部分数据""" + + avatar: PILImage | None + name_lines: list[str] + time_lines: list[str] + text_height: int + + +@dataclass(eq=False, frozen=True, slots=True) +class TitleSectionData(SectionData): + """标题部分数据""" + + lines: list[str] + + +@dataclass(eq=False, frozen=True, slots=True) +class CoverSectionData(SectionData): + """封面部分数据""" + + cover_img: PILImage + + +@dataclass(eq=False, frozen=True, slots=True) +class TextSectionData(SectionData): + """文本部分数据""" + + lines: list[str] + + +@dataclass(eq=False, frozen=True, slots=True) +class ExtraSectionData(SectionData): + """额外信息部分数据""" + + lines: list[str] + + +@dataclass(eq=False, frozen=True, slots=True) +class RepostSectionData(SectionData): + """转发部分数据""" + + scaled_image: PILImage + + +@dataclass(eq=False, frozen=True, slots=True) +class ImageGridSectionData(SectionData): + """图片网格部分数据""" + + images: list[PILImage] + cols: int + rows: int + has_more: bool + remaining_count: int + + +@dataclass(eq=False, frozen=True, slots=True) +class GraphicsSectionData(SectionData): + """图文内容部分数据""" + + text_lines: list[str] + image: PILImage + alt_text: str | None = None + + +@dataclass +class RenderContext: + """渲染上下文,存储渲染过程中的状态信息""" + + result: ParseResult + """解析结果""" + card_width: int + """卡片宽度""" + content_width: int + """内容宽度""" + image: PILImage + """当前图像""" + draw: ImageDraw.ImageDraw + """绘图对象""" + not_repost: bool = True + """是否为非转发内容""" + y_pos: int = 0 + """当前绘制位置(绘制阶段使用)""" + + +class CommonRenderer: + """统一的渲染器,将解析结果转换为消息""" + + # 卡片配置常量 + PADDING = 25 + """内边距""" + AVATAR_SIZE = 
80 + """头像大小""" + AVATAR_TEXT_GAP = 15 + """头像和文字之间的间距""" + MAX_COVER_WIDTH = 1000 + """封面最大宽度""" + MAX_COVER_HEIGHT = 800 + """封面最大高度""" + DEFAULT_CARD_WIDTH = 800 + """默认卡片宽度""" + MIN_CARD_WIDTH = 400 + """最小卡片宽度""" + SECTION_SPACING = 15 + """部分间距""" + NAME_TIME_GAP = 5 + """名称和时间之间的间距""" + AVATAR_UPSCALE_FACTOR = 2 + """头像圆形框超采样倍数""" + + # 图片处理配置 + MIN_COVER_WIDTH = 300 + """最小封面宽度""" + MIN_COVER_HEIGHT = 200 + """最小封面高度""" + MAX_IMAGE_HEIGHT = 800 + """图片最大高度限制""" + IMAGE_3_GRID_SIZE = 300 + """图片3列网格最大尺寸""" + IMAGE_2_GRID_SIZE = 400 + """图片2列网格最大尺寸""" + IMAGE_GRID_SPACING = 4 + """图片网格间距""" + MAX_IMAGES_DISPLAY = 9 + """最大显示图片数量""" + IMAGE_GRID_COLS = 3 + """图片网格列数""" + + # 转发内容配置 + REPOST_PADDING = 12 + """转发内容内边距""" + REPOST_SCALE = 0.88 + """转发缩放比例""" + + # 资源名称 + _EMOJIS = "emojis" + _RESOURCES = "resources" + _BUTTON_FILENAME = "media_button.png" + _FONT_FILENAME = "HYSongYunLangHeiW-1.ttf" + + # 颜色配置 + BG_COLOR: ClassVar[Color] = (255, 255, 255) + """背景色""" + TEXT_COLOR: ClassVar[Color] = (51, 51, 51) + """文本色""" + HEADER_COLOR: ClassVar[Color] = (0, 122, 255) + """标题色""" + EXTRA_COLOR: ClassVar[Color] = (136, 136, 136) + """额外信息色""" + REPOST_BG_COLOR: ClassVar[Color] = (247, 247, 247) + """转发背景色""" + REPOST_BORDER_COLOR: ClassVar[Color] = (230, 230, 230) + """转发边框色""" + + # 路径配置 + RESOURCES_DIR: ClassVar[Path] = Path(__file__).parent / _RESOURCES + """资源目录""" + DEFAULT_FONT_PATH: ClassVar[Path] = RESOURCES_DIR / _FONT_FILENAME + """默认字体路径""" + DEFAULT_VIDEO_BUTTON_PATH: ClassVar[Path] = RESOURCES_DIR / _BUTTON_FILENAME + """默认视频按钮路径""" + + def __init__(self, config: AstrBotConfig): + self.config = config + self.cache_dir = Path(config["cache_dir"]) + self.EMOJI_SOURCE = EmojiCDNSource( + base_url=config["emoji_cdn"], + style=config["emoji_style"], + cache_dir=self.cache_dir / self._EMOJIS, + enable_tqdm=True, + ) + """Emoji Source""" + @classmethod + def load_resources(cls): + """加载资源""" + cls._load_fonts() + cls._load_video_button() + 
cls._load_platform_logos() + + @classmethod + def _load_fonts(cls): + """预加载自定义字体""" + + font_path = cls.DEFAULT_FONT_PATH + # 创建 FontSet 对象 + cls.fontset = FontSet.new(font_path) + logger.debug(f"加载字体「{font_path.name}」成功") + + @classmethod + def _load_video_button(cls): + """预加载视频按钮""" + with Image.open(cls.DEFAULT_VIDEO_BUTTON_PATH) as img: + cls.video_button_image: PILImage = img.convert("RGBA") + + # 设置透明度为 30% + alpha = cls.video_button_image.split()[-1] # 获取 alpha 通道 + alpha = alpha.point(lambda x: int(x * 0.3)) # 将透明度设置为 30% + cls.video_button_image.putalpha(alpha) + + @classmethod + def _load_platform_logos(cls): + """预加载平台 logo""" + from .constants import PlatformEnum + + cls.platform_logos: dict[str, PILImage] = {} + for platform_name in PlatformEnum: + logo_path = cls.RESOURCES_DIR / f"{platform_name}.png" + if logo_path.exists(): + with Image.open(logo_path) as img: + cls.platform_logos[str(platform_name)] = img.convert("RGBA") + + + async def text( + self, + ctx: RenderContext, + xy: tuple[int, int], + lines: list[str], + font: FontInfo, + fill: Color, + ) -> int: + """绘制文本""" + await Apilmoji.text( + ctx.image, + xy, + lines, + font.font, + fill=fill, + line_height=font.line_height, + source=self.EMOJI_SOURCE, + ) + return font.line_height * len(lines) + + + async def render_messages(self, result: ParseResult): + """渲染消息 + + Args: + result (ParseResult): 解析结果 + """ + yield [await self.cache_or_render_image(result)] + + # 媒体内容 + async for message in self.render_contents(result): + yield message + + async def cache_or_render_image(self, result: ParseResult) -> AstrImage: + """获取缓存图片 + + Args: + result (ParseResult): 解析结果 + + Returns: + Image: 图片组件 + """ + if result.render_image is None: + image_raw = await self.render_image(result) + image_path = await self.save_img(image_raw) + result.render_image = image_path + + return AstrImage(str(result.render_image)) + + + async def save_img(self, raw: bytes) -> Path: + """保存图片 + + Args: + raw (bytes): 图片字节 + + 
Returns: + Path: 图片路径 + """ + file_name = f"{uuid.uuid4().hex}.png" + image_path = self.cache_dir / file_name + async with aiofiles.open(image_path, "wb+") as f: + await f.write(raw) + return image_path + + async def render_contents( + self, result: ParseResult + ) -> AsyncGenerator[list[BaseMessageComponent], None]: + """渲染媒体内容消息 + + Args: + result (ParseResult): 解析结果 + + Returns: + AsyncGenerator[UniMessage[Any], None]: 消息生成器 + """ + failed_count = 0 + forward_segs: list[BaseMessageComponent] = [] + dynamic_segs: list[BaseMessageComponent] = [] + + for cont in chain( + result.contents, result.repost.contents if result.repost else () + ): + try: + path = await cont.get_path() + # 继续渲染其他内容, 类似之前 gather (return_exceptions=True) 的处理 + except (DownloadLimitException, ZeroSizeException): + # 预期异常,不抛出 + # yield UniMessage(e.message) + continue + except DownloadException: + failed_count += 1 + continue + + match cont: + case VideoContent(): + yield [Video(str(path))] + case AudioContent(): + yield [Record(str(path))] + case ImageContent(): + forward_segs.append(AstrImage(str(path))) + case DynamicContent(): + dynamic_segs.append(Video(str(path))) + case GraphicsContent() as graphics: + forward_segs.append(AstrImage(str(path))) + if graphics.text: + forward_segs.append(Plain(graphics.text)) + if graphics.alt: + forward_segs.append(Plain(graphics.alt)) + + if forward_segs: + if result.text: + forward_segs.append(Plain(result.text)) + + if self.config["forward_contents"] or len(forward_segs) > 4: + forward_msg = construct_forward_message( + forward_segs + dynamic_segs + ) + yield [forward_msg] + else: + yield forward_segs + + if dynamic_segs: + yield [construct_forward_message(dynamic_segs)] + + if failed_count > 0: + message = f"{failed_count} 项媒体下载失败" + yield [Plain(message)] + raise DownloadException(message) + + async def render_image(self, result: ParseResult) -> bytes: + """使用 PIL 绘制通用社交媒体帖子卡片 + + Args: + result: 解析结果 + + Returns: + PNG 图片的字节数据,如果没有足够的内容则返回 None + """ 
+ # 调用内部方法生成图片 + image = await self._create_card_image(result) + + # 将图片转换为字节 + output = BytesIO() + image.save(output, format="PNG") + return output.getvalue() + + async def _create_card_image( + self, + result: ParseResult, + not_repost: bool = True, + ) -> PILImage: + """创建卡片图片(内部方法,用于递归调用) + + Args: + result: 解析结果 + not_repost: 是否为非转发内容,转发内容为 False + + Returns: + PIL Image 对象 + """ + # 计算必要参数 + card_width = self.DEFAULT_CARD_WIDTH + content_width = card_width - 2 * self.PADDING + + # 计算各部分内容的高度 + sections = await self._calculate_sections(result, content_width) + + # 计算总高度 + card_height = sum(section.height for section in sections) + card_height += self.PADDING * 2 + self.SECTION_SPACING * (len(sections) - 1) + + # 创建画布 + bg_color = self.BG_COLOR if not_repost else self.REPOST_BG_COLOR + image = Image.new( + "RGB", + (card_width, card_height), + bg_color, + ) + + # 创建完整的渲染上下文 + ctx = RenderContext( + result=result, + card_width=card_width, + content_width=content_width, + image=image, + draw=ImageDraw.Draw(image), + not_repost=not_repost, + y_pos=self.PADDING, # 以 padding 作为起始 + ) + # 绘制各部分内容 + await self._draw_sections(ctx, sections) + return image + + @suppress_exception + def _load_and_resize_cover( + self, + cover_path: Path | None, + content_width: int, + ) -> PILImage | None: + """加载并调整封面尺寸 + + Args: + cover_path: 封面路径 + content_width: 内容区域宽度, 封面会缩放到此宽度以确保左右padding一致 + """ + if not cover_path or not cover_path.exists(): + return None + + with Image.open(cover_path) as original_img: + # 转换为 RGB 模式以确保兼容性 + if original_img.mode not in ("RGB", "RGBA"): + cover_img = original_img.convert("RGB") + else: + cover_img = original_img + + # 封面宽度应该等于内容区域宽度,以确保左右padding一致 + target_width = content_width + + # 计算缩放比例(保持宽高比) + if cover_img.width != target_width: + scale_ratio = target_width / cover_img.width + new_width = target_width + new_height = int(cover_img.height * scale_ratio) + + # 检查高度是否超过最大限制 + if new_height > self.MAX_COVER_HEIGHT: + # 如果高度超限,按高度重新计算 + 
scale_ratio = self.MAX_COVER_HEIGHT / new_height + new_height = self.MAX_COVER_HEIGHT + new_width = int(new_width * scale_ratio) + + cover_img = cover_img.resize( + (new_width, new_height), + Image.Resampling.LANCZOS, + ) + elif cover_img is original_img: + # 如果没有做任何转换,需要 copy 一份,因为原图会在 with 结束时关闭 + cover_img = cover_img.copy() + + return cover_img + + @suppress_exception + def _load_and_process_avatar(self, avatar: Path | None) -> PILImage | None: + """加载并处理头像(圆形裁剪,带抗锯齿)""" + if not avatar or not avatar.exists(): + return None + + with Image.open(avatar) as original_img: + # 转换为 RGBA 模式(用于更好的抗锯齿效果) + if original_img.mode != "RGBA": + avatar_img = original_img.convert("RGBA") + else: + avatar_img = original_img + + # 使用超采样技术提高质量:先放大到指定倍数 + scale = self.AVATAR_UPSCALE_FACTOR + temp_size = self.AVATAR_SIZE * scale + avatar_img = avatar_img.resize( + (temp_size, temp_size), + Image.Resampling.LANCZOS, + ) + + # 创建高分辨率圆形遮罩(带抗锯齿) + mask = Image.new("L", (temp_size, temp_size), 0) + mask_draw = ImageDraw.Draw(mask) + mask_draw.ellipse((0, 0, temp_size - 1, temp_size - 1), fill=255) + + # 应用遮罩 + output_avatar = Image.new( + "RGBA", + (temp_size, temp_size), + (0, 0, 0, 0), + ) + output_avatar.paste(avatar_img, (0, 0)) + output_avatar.putalpha(mask) + + # 缩小到目标尺寸(抗锯齿缩放) + output_avatar = output_avatar.resize( + (self.AVATAR_SIZE, self.AVATAR_SIZE), + Image.Resampling.LANCZOS, + ) + + return output_avatar + + async def _calculate_sections(self, result: ParseResult, content_width: int) -> list[SectionData]: + """计算各部分内容的高度和数据""" + sections: list[SectionData] = [] + + # 1. Header 部分 + header_section = await self._calculate_header_section(result, content_width) + if header_section is not None: + sections.append(header_section) + + # 2. 
标题部分 + if result.title: + title_lines = self._wrap_text( + result.title, + content_width, + self.fontset.title_font, + ) + title_height = len(title_lines) * self.fontset.title_font.line_height + sections.append(TitleSectionData(height=title_height, lines=title_lines)) + + # 3. 封面,图集,图文内容 + if cover_img := self._load_and_resize_cover( + await result.cover_path, + content_width=content_width, + ): + sections.append(CoverSectionData(height=cover_img.height, cover_img=cover_img)) + elif result.img_contents: + # 如果没有封面但有图片,处理图片列表 + img_grid_section = await self._calculate_image_grid_section( + result, + content_width, + ) + if img_grid_section: + sections.append(img_grid_section) + elif result.graphics_contents: + for graphics_content in result.graphics_contents: + graphics_section = await self._calculate_graphics_section( + graphics_content, + content_width, + ) + if graphics_section: + sections.append(graphics_section) + + # 5. 文本内容 + if result.text: + text_lines = self._wrap_text( + result.text, + content_width, + self.fontset.text_font, + ) + text_height = len(text_lines) * self.fontset.text_font.line_height + sections.append(TextSectionData(height=text_height, lines=text_lines)) + + # 6. 额外信息 + if result.extra_info: + extra_lines = self._wrap_text( + result.extra_info, + content_width, + self.fontset.extra_font, + ) + extra_height = len(extra_lines) * self.fontset.extra_font.line_height + sections.append(ExtraSectionData(height=extra_height, lines=extra_lines)) + + # 7. 
转发内容 + if result.repost: + repost_section = await self._calculate_repost_section(result.repost) + sections.append(repost_section) + + return sections + + @suppress_exception_async + async def _calculate_graphics_section( + self, graphics_content: GraphicsContent, content_width: int + ) -> GraphicsSectionData | None: + """计算图文内容部分的高度和内容""" + # 加载图片 + img_path = await graphics_content.get_path() + with Image.open(img_path) as original_img: + # 调整图片尺寸以适应内容宽度 + if original_img.width > content_width: + ratio = content_width / original_img.width + new_height = int(original_img.height * ratio) + image = original_img.resize( + (content_width, new_height), + Image.Resampling.LANCZOS, + ) + else: + # 如果不需要缩放,copy 一份 + image = original_img.copy() + + # 处理文本内容 + text_lines = [] + if graphics_content.text: + text_lines = self._wrap_text( + graphics_content.text, + content_width, + self.fontset.text_font, + ) + + # 计算总高度:文本高度 + 图片高度 + alt文本高度 + 间距 + text_height = len(text_lines) * self.fontset.text_font.line_height if text_lines else 0 + alt_height = self.fontset.extra_font.line_height if graphics_content.alt else 0 + total_height = text_height + image.height + alt_height + if text_lines: + total_height += self.SECTION_SPACING # 文本和图片之间的间距 + if graphics_content.alt: + total_height += self.SECTION_SPACING # 图片和alt文本之间的间距 + + return GraphicsSectionData( + height=total_height, + text_lines=text_lines, + image=image, + alt_text=graphics_content.alt, + ) + + async def _calculate_header_section( + self, + result: ParseResult, + content_width: int, + ) -> HeaderSectionData | None: + """计算 header 部分的高度和内容""" + if result.author is None: + return None + + # 加载头像 + avatar_img = self._load_and_process_avatar(await result.author.get_avatar_path()) + + # 计算文字区域宽度(始终预留头像空间) + text_area_width = content_width - (self.AVATAR_SIZE + self.AVATAR_TEXT_GAP) + + # 发布者名称 + name_lines = self._wrap_text( + result.author.name, + text_area_width, + self.fontset.name_font, + ) + + # 时间 + time_text = 
result.formartted_datetime + time_lines = self._wrap_text( + time_text, + text_area_width, + self.fontset.extra_font, + ) + + # 计算 header 高度(取头像和文字中较大者) + text_height = len(name_lines) * self.fontset.name_font.line_height + if time_lines: + text_height += self.NAME_TIME_GAP + len(time_lines) * self.fontset.extra_font.line_height + header_height = max(self.AVATAR_SIZE, text_height) + + return HeaderSectionData( + height=header_height, + avatar=avatar_img, + name_lines=name_lines, + time_lines=time_lines, + text_height=text_height, + ) + + async def _calculate_repost_section(self, repost: ParseResult) -> RepostSectionData: + """计算转发内容的高度和内容(递归调用绘制方法)""" + repost_image = await self._create_card_image(repost, False) + # 缩放图片 + scaled_width = int(repost_image.width * self.REPOST_SCALE) + scaled_height = int(repost_image.height * self.REPOST_SCALE) + repost_image_scaled = repost_image.resize( + (scaled_width, scaled_height), + Image.Resampling.LANCZOS, + ) + + return RepostSectionData( + height=scaled_height + self.REPOST_PADDING * 2, # 加上转发容器的内边距 + scaled_image=repost_image_scaled, + ) + + async def _calculate_image_grid_section( + self, result: ParseResult, content_width: int + ) -> ImageGridSectionData | None: + """计算图片网格部分的高度和内容""" + if not result.img_contents: + return None + + # 检查是否有超过最大显示数量的图片 + total_images = len(result.img_contents) + has_more = total_images > self.MAX_IMAGES_DISPLAY + + # 如果超过最大显示数量,处理前N张,最后一张显示+N效果 + if has_more: + img_contents = result.img_contents[: self.MAX_IMAGES_DISPLAY] + remaining_count = total_images - self.MAX_IMAGES_DISPLAY + else: + img_contents = result.img_contents[: self.MAX_IMAGES_DISPLAY] + remaining_count = 0 + + processed_images = [] + img_count = len(img_contents) + + for img_content in img_contents: + img_path = await img_content.get_path() + # 使用装饰器保护的方法,失败会返回 None + img = await self._load_and_process_grid_image(img_path, content_width, img_count) + if img is not None: + processed_images.append(img) + + if not 
processed_images: + return None + + # 计算网格布局 + image_count = len(processed_images) + + if image_count == 1: + # 单张图片 + cols, rows = 1, 1 + elif image_count in (2, 4): + # 2张或4张图片,使用2列布局 + cols, rows = 2, (image_count + 1) // 2 + else: + # 多张图片,使用3列布局(九宫格) + cols = self.IMAGE_GRID_COLS + rows = (image_count + cols - 1) // cols + + # 计算高度 + max_img_height = max(img.height for img in processed_images) + if len(processed_images) == 1: + # 单张图片 + grid_height = max_img_height + else: + # 多张图片:上间距 + (图片 + 间距) * 行数 + grid_height = self.IMAGE_GRID_SPACING + rows * (max_img_height + self.IMAGE_GRID_SPACING) + + return ImageGridSectionData( + height=grid_height, + images=processed_images, + cols=cols, + rows=rows, + has_more=has_more, + remaining_count=remaining_count, + ) + + @suppress_exception_async + async def _load_and_process_grid_image( + self, + img_path: Path, + content_width: int, + img_count: int, + ) -> PILImage | None: + """加载并处理网格图片 + + Args: + img_path: 图片路径 + content_width: 内容宽度 + img_count: 图片总数(用于决定处理方式) + + Returns: + 处理后的图片对象,失败返回 None + """ + if not img_path.exists(): + return None + + with Image.open(img_path) as original_img: + img = original_img + + # 根据图片数量决定处理方式 + if img_count >= 2: + # 2张及以上图片,统一为方形 + img = self._crop_to_square(img) + + # 计算图片尺寸 + if img_count == 1: + # 单张图片,根据卡片宽度调整,与视频封面保持一致 + max_width = content_width + max_height = min(self.MAX_IMAGE_HEIGHT, content_width) # 限制最大高度 + if img.width > max_width or img.height > max_height: + ratio = min(max_width / img.width, max_height / img.height) + new_size = (int(img.width * ratio), int(img.height * ratio)) + img = img.resize(new_size, Image.Resampling.LANCZOS) + elif img is original_img: + # 如果没有做任何转换,需要 copy 一份 + img = img.copy() + else: + # 多张图片,计算最大尺寸 + if img_count in (2, 4): + # 2张或4张图片,使用2列布局 + num_gaps = 3 # 2列有3个间距 + max_size = (content_width - self.IMAGE_GRID_SPACING * num_gaps) // 2 + max_size = min(max_size, self.IMAGE_2_GRID_SIZE) + else: + # 多张图片,使用3列布局 + num_gaps = 
self.IMAGE_GRID_COLS + 1 + max_size = (content_width - self.IMAGE_GRID_SPACING * num_gaps) // self.IMAGE_GRID_COLS + max_size = min(max_size, self.IMAGE_3_GRID_SIZE) + + # 调整多张图片的尺寸 + if img.width > max_size or img.height > max_size: + ratio = min(max_size / img.width, max_size / img.height) + new_size = (int(img.width * ratio), int(img.height * ratio)) + img = img.resize(new_size, Image.Resampling.LANCZOS) + elif img is original_img: + # 如果没有做任何转换,需要 copy 一份 + img = img.copy() + + return img + + def _crop_to_square(self, img: PILImage) -> PILImage: + """将图片裁剪为方形(上下切割)""" + width, height = img.size + + if width == height: + return img + + if width > height: + # 宽图片,左右切割 + left = (width - height) // 2 + right = left + height + return img.crop((left, 0, right, height)) + else: + # 高图片,上下切割 + top = (height - width) // 2 + bottom = top + width + return img.crop((0, top, width, bottom)) + + async def _draw_sections(self, ctx: RenderContext, sections: list[SectionData]) -> None: + """绘制所有内容到画布上""" + for section in sections: + match section: + case HeaderSectionData() as header: + await self._draw_header(ctx, header) + case TitleSectionData() as title: + await self._draw_title(ctx, title.lines) + case CoverSectionData() as cover: + self._draw_cover(ctx, cover.cover_img) + case TextSectionData() as text: + await self._draw_text(ctx, text.lines) + case GraphicsSectionData() as graphics: + await self._draw_graphics(ctx, graphics) + case ExtraSectionData() as extra: + await self._draw_extra(ctx, extra.lines) + case RepostSectionData() as repost: + self._draw_repost(ctx, repost) + case ImageGridSectionData() as image_grid: + self._draw_image_grid(ctx, image_grid) + + def _create_avatar_placeholder(self) -> PILImage: + """创建默认头像占位符""" + # 头像占位符配置常量 + placeholder_bg_color = (230, 230, 230, 255) + placeholder_fg_color = (200, 200, 200, 255) + head_ratio = 0.35 # 头部位置比例 + head_radius_ratio = 1 / 6 # 头部半径比例 + shoulder_y_ratio = 0.55 # 肩部 Y 位置比例 + shoulder_width_ratio = 0.55 # 
肩部宽度比例 + shoulder_height_ratio = 0.6 # 肩部高度比例 + + placeholder = Image.new( + "RGBA", + (self.AVATAR_SIZE, self.AVATAR_SIZE), + (0, 0, 0, 0), + ) + draw = ImageDraw.Draw(placeholder) + + # 绘制圆形背景 + draw.ellipse( + (0, 0, self.AVATAR_SIZE - 1, self.AVATAR_SIZE - 1), + fill=placeholder_bg_color, + ) + + # 绘制简单的用户图标(圆形头部 + 肩部) + center_x = self.AVATAR_SIZE // 2 + + # 头部圆形 + head_radius = int(self.AVATAR_SIZE * head_radius_ratio) + head_y = int(self.AVATAR_SIZE * head_ratio) + draw.ellipse( + ( + center_x - head_radius, + head_y - head_radius, + center_x + head_radius, + head_y + head_radius, + ), + fill=placeholder_fg_color, + ) + + # 肩部 + shoulder_y = int(self.AVATAR_SIZE * shoulder_y_ratio) + shoulder_width = int(self.AVATAR_SIZE * shoulder_width_ratio) + shoulder_height = int(self.AVATAR_SIZE * shoulder_height_ratio) + draw.ellipse( + ( + center_x - shoulder_width // 2, + shoulder_y, + center_x + shoulder_width // 2, + shoulder_y + shoulder_height, + ), + fill=placeholder_fg_color, + ) + + # 创建圆形遮罩确保不超出边界 + mask = Image.new("L", (self.AVATAR_SIZE, self.AVATAR_SIZE), 0) + mask_draw = ImageDraw.Draw(mask) + mask_draw.ellipse((0, 0, self.AVATAR_SIZE - 1, self.AVATAR_SIZE - 1), fill=255) + + # 应用遮罩 + placeholder.putalpha(mask) + return placeholder + + async def _draw_header(self, ctx: RenderContext, section: HeaderSectionData) -> None: + """绘制 header 部分""" + x_pos = self.PADDING + + # 绘制头像或占位符 + avatar = section.avatar if section.avatar else self._create_avatar_placeholder() + ctx.image.paste(avatar, (x_pos, ctx.y_pos), avatar) + + # 文字始终从头像位置后面开始 + text_x = self.PADDING + self.AVATAR_SIZE + self.AVATAR_TEXT_GAP + + # 计算文字垂直居中位置(对齐头像中轴) + avatar_center = ctx.y_pos + self.AVATAR_SIZE // 2 + text_start_y = avatar_center - section.text_height // 2 + text_y = text_start_y + + # 发布者名称(蓝色) + text_y += await self.text( + ctx, + (text_x, text_y), + section.name_lines, + self.fontset.name_font, + fill=self.HEADER_COLOR, + ) + + # 时间(灰色) + if section.time_lines: + text_y += 
self.NAME_TIME_GAP + text_y += await self.text( + ctx, + (text_x, text_y), + section.time_lines, + self.fontset.extra_font, + fill=self.EXTRA_COLOR, + ) + + # 在右侧绘制平台 logo(仅在非转发内容时绘制) + if ctx.not_repost: + platform_name = ctx.result.platform.name + if platform_name in self.platform_logos: + logo_img = self.platform_logos[platform_name] + # 计算 logo 位置(右侧对齐) + logo_x = ctx.image.width - self.PADDING - logo_img.width + # 垂直居中对齐头像 + logo_y = ctx.y_pos + (self.AVATAR_SIZE - logo_img.height) // 2 + ctx.image.paste(logo_img, (logo_x, logo_y), logo_img) + + ctx.y_pos += section.height + self.SECTION_SPACING + + async def _draw_title(self, ctx: RenderContext, lines: list[str]) -> None: + """绘制标题""" + ctx.y_pos += await self.text( + ctx, + (self.PADDING, ctx.y_pos), + lines, + self.fontset.title_font, + self.TEXT_COLOR, + ) + + ctx.y_pos += self.SECTION_SPACING + + def _draw_cover(self, ctx: RenderContext, cover_img: PILImage) -> None: + """绘制封面""" + # 封面从左边padding开始,和文字、头像对齐 + x_pos = self.PADDING + ctx.image.paste(cover_img, (x_pos, ctx.y_pos)) + + # 添加视频播放按钮(居中) + button_size = 128 # 固定使用 128x128 尺寸 + button_x = x_pos + (cover_img.width - button_size) // 2 + button_y = ctx.y_pos + (cover_img.height - button_size) // 2 + ctx.image.paste( + self.video_button_image, + (button_x, button_y), + self.video_button_image, + ) + + ctx.y_pos += cover_img.height + self.SECTION_SPACING + + async def _draw_text(self, ctx: RenderContext, lines: list[str]) -> None: + """绘制文本内容""" + ctx.y_pos += await self.text( + ctx, + (self.PADDING, ctx.y_pos), + lines, + self.fontset.text_font, + fill=self.TEXT_COLOR, + ) + ctx.y_pos += self.SECTION_SPACING + + async def _draw_graphics(self, ctx: RenderContext, section: GraphicsSectionData) -> None: + """绘制图文内容""" + # 绘制文本内容(如果有) + if section.text_lines: + ctx.y_pos += await self.text( + ctx, + (self.PADDING, ctx.y_pos), + section.text_lines, + self.fontset.text_font, + fill=self.TEXT_COLOR, + ) + ctx.y_pos += self.SECTION_SPACING # 文本和图片之间的间距 + + # 
    def _draw_image_grid(self, ctx: RenderContext, section: ImageGridSectionData) -> None:
        """Draw the measured image grid at ctx.y_pos and advance the cursor."""
        images = section.images
        cols = section.cols
        rows = section.rows
        has_more = section.has_more
        remaining_count = section.remaining_count

        if not images:
            return

        # Tile size and gap for this grid.
        available_width = ctx.content_width  # usable width inside the padding
        img_spacing = self.IMAGE_GRID_SPACING

        # Tile size depends on how many images there are.
        if len(images) == 1:
            # Single image spans the full content width (like a video cover).
            max_img_size = available_width
        else:
            # Uniform gaps across the row: n columns have n+1 gaps.
            num_gaps = cols + 1  # 2 columns -> 3 gaps, 3 columns -> 4 gaps
            calculated_size = (available_width - img_spacing * num_gaps) // cols
            max_img_size = self.IMAGE_2_GRID_SIZE if cols == 2 else self.IMAGE_3_GRID_SIZE
            max_img_size = min(calculated_size, max_img_size)

        current_y = ctx.y_pos

        for row in range(rows):
            row_start = row * cols
            row_end = min(row_start + cols, len(images))
            row_images = images[row_start:row_end]

            # The tallest image in this row sets the row height.
            max_height = max(img.height for img in row_images)

            # Draw the row's tiles left to right.
            for i, img in enumerate(row_images):
                # Uniform spacing: every tile is offset by one leading gap.
                img_x = self.PADDING + img_spacing + i * (max_img_size + img_spacing)

                img_y = current_y + img_spacing  # top gap above every row

                # Vertically center shorter images within the row.
                y_offset = (max_height - img.height) // 2
                ctx.image.paste(img, (img_x, img_y + y_offset))

                # Overlay "+N" on the final tile when images were truncated.
                if has_more and row == rows - 1 and i == len(row_images) - 1 and len(images) == self.MAX_IMAGES_DISPLAY:
                    self._draw_more_indicator(
                        ctx.image,
                        img_x,
                        img_y,
                        max_img_size,
                        max_height,
                        remaining_count,
                    )

            current_y += img_spacing + max_height

        # NOTE(review): for a single image the measured section height is just
        # the image height, yet drawing adds img_spacing above and below —
        # looks like a small layout mismatch; confirm against
        # _calculate_image_grid_section before changing.
        ctx.y_pos = current_y + img_spacing + self.SECTION_SPACING
self.fontset.indicator_font + # 计算文字位置(居中) + text_width = font_info.get_text_width(text) + text_x = img_x + (img_width - text_width) // 2 + text_y = img_y + (img_height - font_info.line_height) // 2 + + # 绘制50%透明白色文字 + draw.text((text_x, text_y), text, fill=(255, 255, 255), font=font_info.font) + + def _draw_rounded_rectangle( + self, + image: PILImage, + bbox: tuple[int, int, int, int], + fill_color: Color, + radius: int = 8, + ): + """绘制圆角矩形""" + x1, y1, x2, y2 = bbox + draw = ImageDraw.Draw(image) + + # 绘制主体矩形 + draw.rectangle((x1 + radius, y1, x2 - radius, y2), fill=fill_color) + draw.rectangle((x1, y1 + radius, x2, y2 - radius), fill=fill_color) + + # 绘制四个圆角 + draw.pieslice((x1, y1, x1 + 2 * radius, y1 + 2 * radius), 180, 270, fill=fill_color) + draw.pieslice((x2 - 2 * radius, y1, x2, y1 + 2 * radius), 270, 360, fill=fill_color) + draw.pieslice((x1, y2 - 2 * radius, x1 + 2 * radius, y2), 90, 180, fill=fill_color) + draw.pieslice((x2 - 2 * radius, y2 - 2 * radius, x2, y2), 0, 90, fill=fill_color) + + def _draw_rounded_rectangle_border( + self, + draw: ImageDraw.ImageDraw, + bbox: tuple[int, int, int, int], + border_color: Color, + radius: int = 8, + width: int = 1, + ): + """绘制圆角矩形边框""" + x1, y1, x2, y2 = bbox + + # 绘制主体边框 + draw.rectangle((x1 + radius, y1, x2 - radius, y1 + width), fill=border_color) # 上 + draw.rectangle((x1 + radius, y2 - width, x2 - radius, y2), fill=border_color) # 下 + draw.rectangle((x1, y1 + radius, x1 + width, y2 - radius), fill=border_color) # 左 + draw.rectangle((x2 - width, y1 + radius, x2, y2 - radius), fill=border_color) # 右 + + # 绘制四个圆角边框 + draw.arc( + (x1, y1, x1 + 2 * radius, y1 + 2 * radius), + 180, + 270, + fill=border_color, + width=width, + ) + draw.arc( + (x2 - 2 * radius, y1, x2, y1 + 2 * radius), + 270, + 360, + fill=border_color, + width=width, + ) + draw.arc( + (x1, y2 - 2 * radius, x1 + 2 * radius, y2), + 90, + 180, + fill=border_color, + width=width, + ) + draw.arc( + (x2 - 2 * radius, y2 - 2 * radius, x2, y2), + 0, + 
90, + fill=border_color, + width=width, + ) + + def _wrap_text(self, text: str | None, max_width: int, font_info: FontInfo) -> list[str]: + """优化的文本自动换行算法,考虑中英文字符宽度相同 + + Args: + text: 要处理的文本 + max_width: 最大宽度(像素) + font_info: 字体信息对象 + + Returns: + 换行后的文本列表 + """ + if not text: + return [] + + lines: list[str] = [] + paragraphs = text.splitlines() + + def is_punctuation(char: str) -> bool: + """判断是否为不能为行首的标点符号""" + return char in ",。!?;:、)】》〉」』〕〗〙〛…—·" or char in ",.;:!?)]}" + + for paragraph in paragraphs: + if not paragraph: + lines.append("") + continue + + current_line = "" + current_line_width = 0 + remaining_text = paragraph + + while remaining_text: + next_char = remaining_text[0] + char_width = font_info.get_char_width_fast(next_char) + # 如果当前行为空,直接添加字符 + if not current_line: + current_line = next_char + current_line_width = char_width + remaining_text = remaining_text[1:] + continue + + # 如果是标点符号,直接添加到当前行(标点符号不应该单独成行) + if is_punctuation(next_char): + current_line += next_char + current_line_width += char_width + remaining_text = remaining_text[1:] + continue + + # 测试添加下一个字符后的宽度 + test_width = current_line_width + char_width + + if test_width <= max_width: + # 宽度合适,继续添加 + current_line += next_char + current_line_width = test_width + remaining_text = remaining_text[1:] + else: + # 宽度超限,需要断行 + lines.append(current_line) + current_line = next_char + current_line_width = char_width + remaining_text = remaining_text[1:] + + # 保存最后一行 + if current_line: + lines.append(current_line) + + return lines diff --git a/core/resources/HYSongYunLangHeiW-1.ttf b/core/resources/HYSongYunLangHeiW-1.ttf new file mode 100644 index 0000000..79ed7df Binary files /dev/null and b/core/resources/HYSongYunLangHeiW-1.ttf differ diff --git a/core/resources/bilibili.png b/core/resources/bilibili.png new file mode 100644 index 0000000..00f03f4 Binary files /dev/null and b/core/resources/bilibili.png differ diff --git a/core/resources/douyin.png b/core/resources/douyin.png new file mode 
100644 index 0000000..357b2d6 Binary files /dev/null and b/core/resources/douyin.png differ diff --git a/core/resources/kuaishou.png b/core/resources/kuaishou.png new file mode 100644 index 0000000..f1689b0 Binary files /dev/null and b/core/resources/kuaishou.png differ diff --git a/core/resources/media_button.png b/core/resources/media_button.png new file mode 100644 index 0000000..b57717f Binary files /dev/null and b/core/resources/media_button.png differ diff --git a/core/resources/tiktok.png b/core/resources/tiktok.png new file mode 100644 index 0000000..138f500 Binary files /dev/null and b/core/resources/tiktok.png differ diff --git a/core/resources/twitter.png b/core/resources/twitter.png new file mode 100644 index 0000000..d37c609 Binary files /dev/null and b/core/resources/twitter.png differ diff --git a/core/resources/weibo.png b/core/resources/weibo.png new file mode 100644 index 0000000..bd2f545 Binary files /dev/null and b/core/resources/weibo.png differ diff --git a/core/resources/xiaohongshu.png b/core/resources/xiaohongshu.png new file mode 100644 index 0000000..621987c Binary files /dev/null and b/core/resources/xiaohongshu.png differ diff --git a/core/resources/youtube.png b/core/resources/youtube.png new file mode 100644 index 0000000..1997bb2 Binary files /dev/null and b/core/resources/youtube.png differ diff --git a/core/utils.py b/core/utils.py new file mode 100644 index 0000000..997ebd6 --- /dev/null +++ b/core/utils.py @@ -0,0 +1,285 @@ +import asyncio +import hashlib +import re +from collections import OrderedDict +from http import cookiejar +from pathlib import Path +from typing import TypeVar +from urllib.parse import urlparse + +from astrbot.api import logger +from astrbot.core.message.components import BaseMessageComponent, Node, Nodes + +K = TypeVar("K") +V = TypeVar("V") + + +class LimitedSizeDict(OrderedDict[K, V]): + """ + 定长字典 + """ + + def __init__(self, *args, max_size=20, **kwargs): + self.max_size = max_size + 
def keep_zh_en_num(text: str) -> str:
    """Sanitize *text* for safe use in file names.

    Spaces are turned into underscores; every character that is not a CJK
    ideograph, ASCII letter, digit, hyphen, or underscore is dropped.
    """
    underscored = text.replace(" ", "_")
    return re.sub(r"[^\u4e00-\u9fa5a-zA-Z0-9\-_]", "", underscored)
def fmt_size(file_path: Path) -> str:
    """Format a file's on-disk size in MiB for logging.

    Args:
        file_path (Path): Path of the file to measure. (The old docstring
            documented a nonexistent ``video_path`` parameter.)

    Returns:
        str: Human-readable size, e.g. ``"大小: 12.34 MB"``.
    """
    size_mb = file_path.stat().st_size / 1024 / 1024
    return f"大小: {size_mb:.2f} MB"
def ck2dict(cookies_str: str) -> dict[str, str]:
    """Parse a ``name=value; name2=value2`` cookie string into a dict.

    Fragments without an ``=`` — including the empty fragment produced by a
    trailing semicolon — are skipped instead of raising ValueError, which the
    previous unpacking-based implementation did.

    Args:
        cookies_str: Raw cookie string with ``;``-separated items.

    Returns:
        dict[str, str]: Cookie names mapped to their values.
    """
    res: dict[str, str] = {}
    for fragment in cookies_str.split(";"):
        fragment = fragment.strip()
        if "=" not in fragment:
            # Tolerate trailing ';' and malformed fragments.
            continue
        name, value = fragment.split("=", 1)
        res[name] = value
    return res
import CacheCleaner
from .core.download import Downloader
from .core.parsers import BaseParser, BilibiliParser, ParseResult, YouTubeParser
from .core.render import CommonRenderer
from .core.utils import save_cookies_with_netscape


@register("astrbot_plugin_parser", "Zhalslar", "...", "...")
class ParserPlugin(Star):
    """Link-parser plugin: matches platform links in incoming messages,
    parses them with the registered platform parser and replies with the
    rendered media content."""

    def __init__(self, context: Context, config: AstrBotConfig):
        super().__init__(context)
        self.context = context
        self.config = config

        # Plugin data directory (also exposed to sub-components via config)
        self.data_dir: Path = StarTools.get_data_dir("astrbot_plugin_parser")
        config["data_dir"] = str(self.data_dir)

        # Cache directory for downloaded media
        self.cache_dir: Path = self.data_dir / "cache_dir"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        config["cache_dir"] = str(self.cache_dir)

        # YouTube cookies: persist the configured cookie string in Netscape
        # format so downstream tooling (presumably yt-dlp — confirm) can use it
        if self.config["ytb_ck"]:
            ytb_cookies_file = self.data_dir / "ytb_cookies.txt"
            ytb_cookies_file.parent.mkdir(parents=True, exist_ok=True)
            save_cookies_with_netscape(
                self.config["ytb_ck"],
                ytb_cookies_file,
                "youtube.com",
            )
            config["ytb_cookies_file"] = str(ytb_cookies_file)

        config.save_config()

        # keyword -> Parser instance mapping
        self.parser_map: dict[str, BaseParser] = {}

        # (keyword, compiled regex) pairs, longest keyword first
        self.key_pattern_list: list[tuple[str, re.Pattern[str]]] = []

        # Renderer for parsed results
        self.renderer = CommonRenderer(config)

        # Shared downloader
        self.downloader = Downloader(config)

        # Periodic cache cleaner
        self.cleaner = CacheCleaner(self.context, self.config)

    async def initialize(self):
        """Called when the plugin is loaded or reloaded."""
        self.register_parser()
        CommonRenderer.load_resources()

    def register_parser(self):
        """Build the keyword->parser map and the (keyword, regex) match list
        for every platform enabled in the config."""
        # All known parser subclasses
        all_subclass = BaseParser.get_all_subclass()
        # Keep only platforms enabled in the config
        enabled_classes = [
            _cls
            for _cls in all_subclass
            if _cls.platform.display_name in self.config["enable_platforms"]
        ]
        # Instantiate each enabled parser and register its keywords.
        # NOTE(review): assumes _cls._key_patterns is a list of
        # (keyword, pattern) pairs where pattern may be a str or a compiled
        # regex — confirm against BaseParser.
        platform_names = []
        for _cls in enabled_classes:
            parser = _cls(self.config, self.downloader)
            platform_names.append(parser.platform.display_name)
            for keyword, _ in _cls._key_patterns:
                self.parser_map[keyword] = parser
        logger.info(f"启用平台: {'、'.join(platform_names)}")

        # Build the (keyword, regex) pairs once, compiling string patterns
        patterns: list[tuple[str, re.Pattern[str]]] = [
            (kw, re.compile(pt) if isinstance(pt, str) else pt)
            for cls in enabled_classes
            for kw, pt in cls._key_patterns
        ]
        # Longer keywords take priority
        patterns.sort(key=lambda x: -len(x[0]))
        logger.debug(f"关键词-正则对已生成:{patterns}")
        self.key_pattern_list = patterns

    def get_parser_by_type(self, parser_type):
        """Return the first registered parser that is an instance of
        *parser_type*; raise ValueError if none is registered."""
        for parser in self.parser_map.values():
            if isinstance(parser, parser_type):
                return parser
        raise ValueError(f"未找到类型为 {parser_type} 的 parser 实例")

    @filter.event_message_type(filter.EventMessageType.ALL)
    async def prob_read_feed(self, event: AstrMessageEvent):
        """Unified entry point for every incoming message."""
        umo = event.unified_msg_origin

        # Skip sessions where parsing has been disabled
        if umo in self.config["disabled_sessions"]:
            return

        text = event.message_str

        # Ignore empty messages
        if not text:
            return

        # Match: cheap keyword pre-filter, then regex confirmation
        keyword: str = ""
        searched: re.Match[str] | None = None
        for kw, pat in self.key_pattern_list:
            if kw not in text:
                continue
            if m := pat.search(text):
                keyword, searched = kw, m
                break
        if searched is None:
            return

        logger.debug(f"匹配结果: {keyword}, {searched}")

        # Pick the parser registered for the matched keyword
        parser = self.parser_map[keyword]
        # Parse the matched link/id
        parse_res: ParseResult = await parser.parse(keyword, searched)

        # Render the parsed content and send each message chain
        async for chain in self.renderer.render_messages(parse_res):
            yield event.chain_result(chain)  # type: ignore

    @filter.permission_type(filter.PermissionType.ADMIN)
    @filter.command("bm")
    async def bm(self, event: AstrMessageEvent):
        """Download the audio track of a Bilibili video (by BV id)."""
        text = event.message_str
        # Optional second group is a page (part) number, whitespace included
        matched = re.search(r"(BV[A-Za-z0-9]{10})(\s\d{1,3})?", text)
        if not matched:
            yield event.plain_result("请发送正确的 BV 号")
            return

        bvid, page_num = matched.group(1), matched.group(2)
        # int() tolerates the leading whitespace captured by the group
        page_idx = int(page_num) if page_num else 0

        parser: BilibiliParser = self.get_parser_by_type(BilibiliParser)  # type: ignore

        _, audio_url = await parser.extract_download_urls(
            bvid=bvid, page_index=page_idx
        )
        if not audio_url:
            yield event.plain_result("未找到可下载的音频")
            return

        audio_path = await self.downloader.download_audio(
            audio_url, audio_name=f"{bvid}-{page_idx}.mp3", ext_headers=parser.headers
        )
        yield event.chain_result([Record(audio_path)])  # type: ignore

        # TODO: uploading the audio to the group files is not implemented yet
        if self.config["upload_audio"]:
            pass

    @filter.permission_type(filter.PermissionType.ADMIN)
    @filter.command("ym")
    async def ym(self, event: AstrMessageEvent):
        """Download audio from a YouTube link."""
        text = event.message_str
        parser = self.get_parser_by_type(YouTubeParser)
        _, matched = parser.search_url(text)
        if not matched:
            yield event.plain_result("请发送正确的油管链接")
            return

        url = matched.group(0)

        audio_path = await self.downloader.download_audio(url, use_ytdlp=True)
        yield event.chain_result([Record(audio_path)])  # type: ignore

        # TODO: uploading the audio to the group files is not implemented yet
        if self.config["upload_audio"]:
            pass

    @filter.permission_type(filter.PermissionType.ADMIN)
    @filter.command("登录B站", alias={"blogin", "登录b站"})
    async def login_bilibili(self, event: AstrMessageEvent):
        """Log in to Bilibili by scanning a QR code; streams state updates."""
        parser: BilibiliParser = self.get_parser_by_type(BilibiliParser)  # type: ignore
        qrcode = await parser.login_with_qrcode()
        yield event.chain_result([Image.fromBytes(qrcode)])
        async for msg in parser.check_qr_state():
            yield event.plain_result(msg)

    @filter.command("开启解析")
    async def open_parser(self, event: AstrMessageEvent):
        """Enable parsing for the current session."""
        umo = event.unified_msg_origin
        if umo in self.config["disabled_sessions"]:
            self.config["disabled_sessions"].remove(umo)
            self.config.save_config()
            yield event.plain_result("解析已开启")
        else:
            yield event.plain_result("解析已开启,无需重复开启")

    @filter.command("关闭解析")
    async def close_parser(self, event: AstrMessageEvent):
        """Disable parsing for the current session."""
        umo = event.unified_msg_origin
        if umo not in self.config["disabled_sessions"]:
            self.config["disabled_sessions"].append(umo)
            self.config.save_config()
            yield event.plain_result("解析已关闭")
        else:
            yield event.plain_result("解析已关闭,无需重复关闭")

    async def terminate(self):
        """Called when the plugin is unloaded."""
        # Close the downloader's session
        await self.downloader.close()
        # Close the session shared by all parsers
        await BaseParser.close_session()
        # Stop the cache cleaner
        await self.cleaner.stop()
diff --git a/metadata.yaml b/metadata.yaml
new file mode 100644
index 0000000..d923e29
--- /dev/null
+++ b/metadata.yaml
+name: astrbot_plugin_parser # Unique identifier of this plugin.
+display_name: 链接解析器
+desc: 高性能低耦合的万能链接解析器。支持的类型:视频、图集、音频。 支持的平台:A站、B站、抖音、tiktok、微博、小红书、快手、油管、推特...
+help: 略 # Help text for the plugin
+version: v1.0.0 # Plugin version. Format: v1.1.1 or v1.1
+author: Zhalslar # Author
+repo: https://github.com/Zhalslar/astrbot_plugin_parser # Repository URL of the plugin
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1581925
--- /dev/null
+++ b/requirements.txt
+# Dependencies already provided by AstrBot are not listed here
+
+tqdm>=4.67.1,<5.0.0
+curl_cffi>=0.13.0,<1.0.0
+msgspec>=0.20.0,<1.0.0
+apilmoji[tqdm]>=0.2.3,<1.0.0
+bilibili-api-python>=17.4.0,<18.0.0
+yt-dlp[default]>=2025.11.12