Skip to content

Commit 9d77952

Browse files
authored
fix(listener): 拦贴纸/GIF聚合站 + 裸媒体文件,避免 Discord 表情包误入分享库 (#2)
* fix(listener): 拦贴纸/GIF聚合站 + 裸媒体文件,避免 Discord 表情包误入分享库 事故:用户 yhn 在分享频道发了一个 Discord 贴纸(klipy GIF),message.content 里就是裸 https://klipy.com/gifs/... URL,listener 当成正常分享走完 OG fetch + 分类,被打成 APPROVED 上架成 #18。 原 _SKIP_HOSTS 只拦了 discord.com / cdn.discordapp.com 等 Discord 自家域,没考虑贴纸面板默认走 tenor / klipy / giphy。同类问题:mmbiz.qpic.cn 这类纯图片直链(#5)也不该入库。 改法两层:(1) _SKIP_HOSTS 加入 tenor / klipy / giphy 全套;(2) 兜底在 path 上做媒体扩展名(.gif/.png/.jpg/.mp4/...)匹配,host 永远穷举不完。匹配只看 path,query 里出现 .jpg 不算(避免误伤带 ?file=foo.jpg 的正常 API 链接)。+19 个测试 case 覆盖。 * fix(replies): 反馈链接从 /share 改成 /feed,'已上架' 应跳列表页而非提交表单 /share 是单页提交入口(带 ?url=... 预填,给 bookmarklet 用),/feed 才是已审核通过的展示墙。Bot 在 listener.py(首条 reply + APPROVED 终态 reply)和 commands.py(/share 斜杠命令成功回执)三处都把 '点此查看 / 已收录到内卷地狱分享库' 链接指向 /share——结果用户点过去看到的是空提交表单,不是自己刚分享的内容。 * fix(listener): 自家 GitHub 仓库 PR/issue/commit 等 dev 子路径不入分享库 用户在分享频道贴自己 PR (#2) 通告,bot 把它当 '社区分享' 收成 #19。同类还会有 issue/commit/compare/actions/releases/discussions/blob/tree 等 dev 子路径。 策略:path 至少 3 段(/<org>/<repo>/<sub>)且 org=involutionhell 时 skip,仓库主页和第三方仓库全放行。这是 dev 自循环噪声专杀,不影响合法分享。+11 测试 case。
1 parent daba7da commit 9d77952

3 files changed

Lines changed: 167 additions & 12 deletions

File tree

src/chat_bot/cogs/commands.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ async def share(
8686
# 小字 caption 用 `-# ...` 语法(Discord 的 subtext 行,显示为灰色细字)
8787
content = (
8888
f"{url}\n"
89-
f"-# ✅ 已收录到 [内卷地狱分享库](https://involutionhell.com/share) "
89+
f"-# ✅ 已收录到 [内卷地狱分享库](https://involutionhell.com/feed) "
9090
f"· `#{result.link_id}` · by {interaction.user.display_name}"
9191
)
9292
if recommendation:

src/chat_bot/cogs/listener.py

Lines changed: 85 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,31 +28,105 @@
2828

2929
_URL_RE = re.compile(r"https?://[^\s<>\"'\]\)]+", re.IGNORECASE)
3030

31-
# 跳过 Discord 自身的各种链接:用户经常复制错(比如右键"复制消息链接"会粘
32-
# discord.com/channels/.../... 出来,这不该被当作"分享"入库)。静默忽略,不
33-
# 回复也不提交,像 bot 没看到一样。
31+
# Link sources to skip. Two tiers:
32+
# 1. Discord itself (message links / attachment CDN) -- users often paste these by mistake when copying a message link
33+
# 2. Sticker / GIF / meme aggregators -- Discord's built-in sticker panel posts tenor/klipy/giphy
34+
# links, so message.content is just a bare URL. These are not "shared resources" and must not be stored.
35+
# Ignored silently: no reply, no submission, as if the bot never saw the message.
3436
_SKIP_HOSTS = frozenset({
35-
# Main site
37+
# Discord main site
3638
"discord.com",
3739
"www.discord.com",
3840
"canary.discord.com",
3941
"ptb.discord.com",
40-
# Invite short links
42+
# Discord invite short links
4143
"discord.gg",
42-
# Attachments / CDN
44+
# Discord attachments / CDN
4345
"discordapp.com",
4446
"cdn.discordapp.com",
4547
"media.discordapp.net",
48+
# Sticker / GIF aggregators (the Discord sticker panel uses these by default)
49+
"tenor.com",
50+
"media.tenor.com",
51+
"c.tenor.com",
52+
"giphy.com",
53+
"media.giphy.com",
54+
"media0.giphy.com",
55+
"media1.giphy.com",
56+
"media2.giphy.com",
57+
"media3.giphy.com",
58+
"media4.giphy.com",
59+
"klipy.com",
60+
"media.klipy.com",
4661
})
4762

63+
# Fallback: any URL that points only at a static media file (path ends with one of these
64+
# extensions) is skipped -- typically bare image links from WeChat or image hosts, not shared
65+
# resources. The extension is matched against the path only, so a ".jpg" inside the query string does not count.
66+
_MEDIA_EXTENSIONS = (
67+
".gif",
68+
".png",
69+
".jpg",
70+
".jpeg",
71+
".webp",
72+
".bmp",
73+
".svg",
74+
".ico",
75+
".mp4",
76+
".webm",
77+
".mov",
78+
".m4v",
79+
".mp3",
80+
".wav",
81+
".ogg",
82+
".flac",
83+
)
84+
85+
86+
# GitHub paths are case-insensitive, so the org name is always compared in lowercase.
_INTERNAL_GITHUB_ORG = "involutionhell"


def _is_self_org_github_chatter(parsed) -> bool:
    """Return True for dev sub-paths of the InvolutionHell GitHub organisation.

    ``github.com/InvolutionHell/<repo>/<sub-path>`` is treated as internal dev
    chatter and must not be stored in the share library.

    Allowed (returns False):
      - github.com/InvolutionHell/<repo>     repo home page (a legitimate
                                             "check out our tool" share)
      - github.com/InvolutionHell/<repo>/    same, with a trailing slash
      - github.com/InvolutionHell            org home page (rare, but allowed)
      - github.com/<other org>/...           any path of a third-party repo

    Blocked (returns True), e.g.:
      - github.com/InvolutionHell/<repo>/pull/N
      - github.com/InvolutionHell/<repo>/issues/N
      - github.com/InvolutionHell/<repo>/commit/<sha>
      - github.com/InvolutionHell/<repo>/blob/... and /tree/...
      - github.com/InvolutionHell/<repo>/actions/... and /discussions/...
      - github.com/InvolutionHell/<repo>/releases/tag/...
    These come from PR/issue notifications or dev debugging, not from
    resources meant to be published to the community.

    Args:
        parsed: the result of ``urllib.parse.urlparse`` for the candidate URL.

    Returns:
        True when the URL is a github.com path with at least three segments
        whose first segment is the internal org (case-insensitive).
    """
    # ``hostname`` is lower-cased and has userinfo and port stripped, so
    # ``user@github.com`` and ``github.com:443`` are both recognised. Slicing
    # ``netloc`` at the first ":" mishandled the userinfo form. A trailing dot
    # (fully-qualified form) names the same host and is dropped as well.
    host = (parsed.hostname or "").rstrip(".")
    if host not in {"github.com", "www.github.com"}:
        return False
    # Empty segments (leading, trailing or doubled slashes) are ignored.
    segs = [s for s in parsed.path.split("/") if s]
    # /<org>/<repo>/<sub-path...>: only >= 3 segments counts as a dev sub-path.
    return len(segs) >= 3 and segs[0].lower() == _INTERNAL_GITHUB_ORG
115+
48116

49117
def _should_skip(url: str) -> bool:
    """Return True when *url* must be silently ignored instead of submitted.

    A URL is skipped when any of the following holds:
      - its host is listed in ``_SKIP_HOSTS`` (Discord domains and
        sticker/GIF aggregators);
      - it is a dev sub-path of the internal GitHub org
        (see ``_is_self_org_github_chatter``);
      - its path ends with one of ``_MEDIA_EXTENSIONS`` (bare media file).

    Input that cannot be parsed is never skipped (returns False).
    """
    try:
        parsed = urlparse(url)
        # ``hostname`` is lower-cased with userinfo and port stripped, so
        # ``https://user:pw@discord.com/...`` still resolves to "discord.com".
        # Slicing ``netloc`` at the first ":" returned "user" for such URLs.
        # A trailing dot (fully-qualified form) names the same host.
        host = (parsed.hostname or "").rstrip(".")
    except Exception:
        # Deliberately best-effort: a malformed URL must not crash the
        # listener, it is simply treated as "do not skip".
        return False
    if host in _SKIP_HOSTS or _is_self_org_github_chatter(parsed):
        return True
    # Match on the lower-cased path only, decoupled from the query string:
    # "?foo=bar.jpg" must not trigger the media-file rule.
    return parsed.path.lower().endswith(_MEDIA_EXTENSIONS)
56130

57131
# 轮询最终状态的参数:每 2s 查一次,最多 30s
58132
_POLL_INTERVAL_SEC = 2.0
@@ -150,7 +224,7 @@ async def _handle_one_url(self, message: discord.Message, url: str) -> None:
150224
await self._safe_reply(
151225
message,
152226
f"感谢 {message.author.mention} 大佬分享!正在过审核,"
153-
f"通过后会上架 [内卷地狱分享库](<https://involutionhell.com/share>) #{result.link_id}",
227+
f"通过后会上架 [内卷地狱分享库](<https://involutionhell.com/feed>) #{result.link_id}",
154228
)
155229

156230
# 后台轮询拿最终状态,拿到了再发第二条
@@ -197,7 +271,7 @@ async def _send_status_update(
197271
await self._safe_reply(
198272
message,
199273
f"🎉 {user} 已上架 · #{link_id} "
200-
f"[点此查看](<https://involutionhell.com/share>)",
274+
f"[点此查看](<https://involutionhell.com/feed>)",
201275
)
202276
return
203277

tests/test_listener_skip.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,70 @@ def test_should_skip_discord_urls(url: str) -> None:
3131
assert _should_skip(url) is True
3232

3333

34+
# Sticker / GIF aggregator links must be skipped.
@pytest.mark.parametrize(
35+
"url",
36+
[
37+
# The klipy GIF from the actual incident
38+
"https://klipy.com/gifs/hello-8126--k01KQ1SBY07FP9N8QRABJGVNGQC",
39+
# Tenor (default for the Discord sticker panel)
40+
"https://tenor.com/view/cat-cute-gif-1234567",
41+
"https://media.tenor.com/AbCdEfGhIj/cat.gif",
42+
# Giphy (also common)
43+
"https://giphy.com/gifs/cat-cute-AbCdEfGhIj",
44+
"https://media2.giphy.com/media/AbCdEfGhIj/giphy.gif",
45+
# Klipy CDN
46+
"https://media.klipy.com/some.gif",
47+
],
48+
)
49+
def test_should_skip_sticker_gif_aggregators(url: str) -> None:
50+
assert _should_skip(url) is True
51+
52+
53+
# URLs whose path ends in a media extension must be skipped regardless of host.
@pytest.mark.parametrize(
54+
"url",
55+
[
56+
# Bare images (WeChat image host, or a direct image link on any host)
57+
"https://mmbiz.qpic.cn/mmbiz_jpg/abc/640.jpg",
58+
"https://example.com/path/photo.PNG",
59+
"https://i.example.com/cat.gif",
60+
"https://example.com/foo.webp",
61+
# Direct video / audio links
62+
"https://example.com/clip.mp4",
63+
"https://example.com/audio.mp3",
64+
# SVG (blocked even when the host is not on the skip list; complements the server-side SVG upload blocklist)
65+
"https://example.com/icon.svg",
66+
],
67+
)
68+
def test_should_skip_bare_media_files(url: str) -> None:
69+
assert _should_skip(url) is True
70+
71+
72+
# Ordinary article links must not be caught by the media-extension rule.
@pytest.mark.parametrize(
73+
"url",
74+
[
75+
# Path has no media extension but ".jpg" appears in the query -- must not match
76+
"https://example.com/api?file=foo.jpg",
77+
# WeChat official-account article URL (a typical share)
78+
"https://mp.weixin.qq.com/s/abc",
79+
# Xiaohongshu post (path has no extension)
80+
"https://www.xiaohongshu.com/explore/abc123",
81+
],
82+
)
83+
def test_should_not_skip_normal_articles_with_media_query(url: str) -> None:
84+
assert _should_skip(url) is False
85+
86+
3487
@pytest.mark.parametrize(
3588
"url",
3689
[
3790
"https://arxiv.org/abs/2501.00001",
3891
"https://mp.weixin.qq.com/s/abc",
92+
# 自家仓库主页是合法分享("看看我们的新工具"),允许
3993
"https://github.com/InvolutionHell/ChatBot",
94+
"https://github.com/InvolutionHell/ChatBot/",
95+
# 第三方仓库的任何路径都允许
96+
"https://github.com/torvalds/linux/commit/abc123",
97+
"https://github.com/openai/openai-python/pull/42",
4098
"https://scholar.google.com/scholar?q=rag",
4199
# 只有 host 相似但不完全匹配就不该 skip(防范未来新域名放行策略)
42100
"https://not-discord.com/x",
@@ -47,6 +105,29 @@ def test_should_not_skip_other_urls(url: str) -> None:
47105
assert _should_skip(url) is False
48106

49107

108+
# Dev sub-paths under the InvolutionHell GitHub org must be skipped.
@pytest.mark.parametrize(
109+
"url",
110+
[
111+
# Actual incident: the bot's own PR announcement was captured by the bot itself as #19
112+
"https://github.com/InvolutionHell/ChatBot/pull/2",
113+
# Every kind of dev sub-path should be skipped
114+
"https://github.com/InvolutionHell/ChatBot/issues/5",
115+
"https://github.com/InvolutionHell/ChatBot/commit/abc123",
116+
"https://github.com/InvolutionHell/ChatBot/compare/main...feature",
117+
"https://github.com/InvolutionHell/ChatBot/actions/runs/123",
118+
"https://github.com/InvolutionHell/ChatBot/releases/tag/v1.0",
119+
"https://github.com/InvolutionHell/ChatBot/discussions/10",
120+
"https://github.com/InvolutionHell/ChatBot/blob/main/README.md",
121+
"https://github.com/InvolutionHell/ChatBot/tree/main/src",
122+
# Case variations must be blocked too
123+
"https://github.com/INVOLUTIONHELL/ChatBot/pull/2",
124+
"https://www.github.com/InvolutionHell/involutionhell/pull/320",
125+
],
126+
)
127+
def test_should_skip_self_org_github_dev_chatter(url: str) -> None:
128+
assert _should_skip(url) is True
129+
130+
50131
def test_should_skip_handles_bad_url_gracefully() -> None:
51132
# 坏 URL 不应抛异常;当前 urlparse 对大多数输入都不抛,兜底返回 False
52133
assert _should_skip("not-a-url") is False

0 commit comments

Comments
 (0)