powerscripts/chapter_group.py at master · Hemilt0n/powerscripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#!/usr/bin/env python3
"""
智能识别章节并分组文件

用法：
    python chapter_group.py <folder1> [folder2] ...

算法：
    1. 解析文件名，按分隔符分割成多个部分
    2. 分析每个位置是否为递增序列
    3. 章节号通常在递增序列的前一个位置
    4. 同一目录下文件命名规则一致
"""

import math
import os
import re
import shutil
import sys
from collections import defaultdict
from pathlib import Path


def parse_filename(filename: str) -> list:
    """
    Split a file name into its non-empty parts.

    The last extension is dropped first (``Path(filename).stem``); the
    remainder is split on runs of underscores, whitespace, hyphens and dots.

    Args:
        filename: file name (or path) to tokenise.

    Returns:
        list: the parts in their original order, empty strings removed.
    """
    stem = Path(filename).stem
    # A run of adjacent separators counts as one; leading/trailing separators
    # produce empty pieces, which are discarded.
    return [piece for piece in re.split(r'[_\s\-\.]+', stem) if piece]


def is_number(s: str) -> bool:
    """
    Check whether a string is the text of a finite number (integer or decimal).

    ``float()`` also parses "nan", "inf"/"infinity" and literals that overflow
    to infinity (e.g. "1e999"). Those are not usable as chapter or page
    numbers, and converting them with ``int()`` raises, so they are rejected.

    Args:
        s: the text to test.

    Returns:
        bool: True if ``float(s)`` succeeds and the value is finite.
    """
    try:
        value = float(s)
    except ValueError:
        return False
    return math.isfinite(value)


def detect_chapter_position(files: list) -> int:
    """
    Find which part of the file names holds the chapter number.

    Each name is split with ``parse_filename``. Names with fewer than two
    parts are ignored, and only names with the most common part count are
    analysed. For every part position that is numeric in at least 90% of the
    analysed names, consecutive values (in the order given by ``files``) are
    compared to obtain:

    - change rate: fraction of consecutive pairs whose value differs;
    - jump rate: fraction of those changes whose absolute difference is > 1.

    A position is a strong chapter candidate when it has more than one
    distinct value, a change rate below 0.3 and a jump rate above 0.5. If no
    position qualifies, every numeric position with more than one distinct
    value is considered. Among the considered positions the one with the
    lowest change rate wins (earliest position on ties).

    Args:
        files: file names; their order determines the computed rates.

    Returns:
        int: zero-based index of the chapter part, or -1 if none is found.
    """
    from collections import Counter

    if not files:
        return -1

    # Tokenise each name and discard names that are too short to analyse.
    token_rows = [row for row in map(parse_filename, files) if len(row) >= 2]
    if not token_rows:
        return -1

    # Keep only the rows that have the dominant number of parts.
    width = Counter(len(row) for row in token_rows).most_common(1)[0][0]
    token_rows = [row for row in token_rows if len(row) == width]
    row_count = len(token_rows)

    # One (position, change_rate, jump_rate) entry per numeric position that
    # takes more than one distinct value.
    profiles = []
    for pos in range(width):
        numbers = [float(row[pos]) for row in token_rows if is_number(row[pos])]

        # Skip positions that are not (almost) entirely numeric.
        if not numbers or len(numbers) < row_count * 0.9:
            continue
        # A constant position can never be selected as the chapter.
        if len(set(numbers)) <= 1:
            continue

        steps = [later - earlier for earlier, later in zip(numbers, numbers[1:])]
        changes = sum(1 for step in steps if step != 0)
        jumps = sum(1 for step in steps if step != 0 and abs(step) > 1)

        change_rate = changes / (len(numbers) - 1) if len(numbers) > 1 else 0
        jump_rate = jumps / changes if changes > 0 else 0
        profiles.append((pos, change_rate, jump_rate))

    # Prefer rarely-changing positions whose changes are mostly jumps;
    # otherwise fall back to every varying numeric position.
    strong = [entry for entry in profiles if entry[1] < 0.3 and entry[2] > 0.5]
    pool = strong or profiles
    if not pool:
        return -1

    # min() returns the first entry with the lowest change rate, so ties go
    # to the earliest position.
    return min(pool, key=lambda entry: entry[1])[0]


def _normalize_chapter(token: str) -> str:
    """Return the grouping key for one file-name part.

    Numeric parts become their integer value zero-padded to three digits, so
    "1", "01" and "001" all map to "001". Any other text is returned as is.
    """
    try:
        # Exact for plain digit strings of any length (no float rounding).
        number = int(token)
    except ValueError:
        try:
            number = int(float(token))
        except (ValueError, OverflowError):
            # Not a number, or a float literal such as "nan"/"inf" that has
            # no integer value: keep the raw text as the key.
            return token
    return str(number).zfill(3)


def analyze_directory(directory: Path) -> tuple:
    """
    Analyse a directory and group its files by detected chapter.

    Only regular files directly inside ``directory`` are considered. File
    names are sorted before analysis because ``Path.iterdir()`` yields
    entries in arbitrary order, while chapter detection compares consecutive
    file names.

    Args:
        directory: the directory to scan.

    Returns:
        tuple: (groups, chapter_pos, ungrouped)
        groups: dict[str, list] chapter key -> list of file paths
        chapter_pos: int index of the chapter part, -1 if not detected
        ungrouped: list of file names that could not be assigned
    """
    files = sorted(f.name for f in directory.iterdir() if f.is_file())

    if not files:
        return {}, -1, []

    # Detect which part of the names holds the chapter.
    chapter_pos = detect_chapter_position(files)

    if chapter_pos == -1:
        return {}, -1, files

    # Group the files by the chapter part.
    groups = defaultdict(list)
    ungrouped = []

    for filename in files:
        parts = parse_filename(filename)

        if len(parts) > chapter_pos:
            chapter = _normalize_chapter(parts[chapter_pos])
            groups[chapter].append(directory / filename)
        else:
            # The name has too few parts to contain the chapter position.
            ungrouped.append(filename)

    return groups, chapter_pos, ungrouped


def show_plan(groups: dict, chapter_pos: int, ungrouped: list, directory: Path) -> bool:
    """
    Print a preview of the planned grouping.

    Args:
        groups: chapter key -> list of file paths (each needs a ``.name``).
        chapter_pos: zero-based index of the file-name part used as chapter.
        ungrouped: file names that will be skipped.
        directory: the directory being processed (only printed).

    Returns:
        bool: False if ``groups`` is empty, True otherwise.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print(heavy_rule, "  分组计划预览", heavy_rule, "", sep="\n")
    print(f"目标目录: {directory}", "", sep="\n")

    if not groups:
        print("没有可分组的目标文件")
        return False

    # Illustrate the detected rule with the first file of the first group.
    first_bucket = next(iter(groups.values()))
    sample_file = first_bucket[0].name
    chapter_token = parse_filename(sample_file)[chapter_pos]
    print(f"识别规则: 文件名第 {chapter_pos + 1} 部分 ({chapter_token})")
    print(f"示例: '{sample_file}' -> 章节 '{chapter_token}'")
    print()

    print(light_rule)
    print(f"\n将创建 {len(groups)} 个章节文件夹:\n")

    for chapter in sorted(groups):
        members = groups[chapter]
        # zfill never shortens, so keys of three or more characters are unchanged.
        folder_name = chapter.zfill(3)
        print(f"  {folder_name}/  ({len(members)} 个文件)")

        # List at most three file names per folder as examples.
        for member in members[:3]:
            print(f"      - {member.name}")
        hidden = len(members) - 3
        if hidden > 0:
            print(f"      ... 还有 {hidden} 个文件")
        print()

    if ungrouped:
        print(light_rule)
        print(f"\n以下 {len(ungrouped)} 个文件无法识别（将被跳过):\n")
        for skipped in ungrouped[:5]:
            print(f"      - {skipped}")
        remaining = len(ungrouped) - 5
        if remaining > 0:
            print(f"      ... 还有 {remaining} 个文件")

    print(light_rule)
    return True


def execute_grouping(groups: dict, directory: Path) -> int:
    """
    Move the grouped files into per-chapter folders inside ``directory``.

    A folder named after each chapter key (zero-padded to three characters)
    is created when missing. If a file of the same name already exists in the
    target folder, the moved file gets a ``_01``, ``_02``, ... suffix. A file
    that cannot be moved is reported and skipped; a folder that cannot be
    created is reported and its whole chapter is skipped.

    Args:
        groups: chapter key -> list of file paths to move.
        directory: the directory in which the chapter folders are created.

    Returns:
        int: the number of files actually moved.
    """
    print("\n" + "=" * 60)
    print("  开始执行分组...")
    print("=" * 60)
    print()

    total_moved = 0
    folders_ready = 0

    for chapter in sorted(groups):
        # zfill never shortens, so keys of three or more characters are unchanged.
        folder_name = chapter.zfill(3)
        folder_path = directory / folder_name

        # Create the folder. mkdir fails if the name is taken by a regular
        # file; report that instead of aborting the whole run.
        if not folder_path.is_dir():
            try:
                folder_path.mkdir()
            except OSError as e:
                print(f"  创建文件夹失败: {folder_name}/ - {e}")
                continue
            print(f"创建文件夹: {folder_name}/")
        folders_ready += 1

        moved_here = 0
        for file_path in groups[chapter]:
            dest = folder_path / file_path.name

            # Avoid overwriting: append a counter until the name is free.
            counter = 1
            while dest.exists():
                dest = folder_path / f"{file_path.stem}_{counter:02d}{file_path.suffix}"
                counter += 1

            try:
                shutil.move(str(file_path), str(dest))
            except OSError as e:
                # shutil.Error is a subclass of OSError, so it is covered too.
                print(f"  移动失败: {file_path.name} - {e}")
            else:
                moved_here += 1

        total_moved += moved_here
        # Report what was really moved, not what was attempted.
        print(f"  已移动 {moved_here} 个文件到 {folder_name}/")

    print()
    print(f"分组完成！共移动 {total_moved} 个文件到 {folders_ready} 个文件夹")
    return total_moved


def process_folder(folder_path: Path) -> bool:
    """
    Process one folder: analyse it, show the plan, and group after confirmation.

    Args:
        folder_path: the folder to process.

    Returns:
        bool: True if the grouping was executed. False if the path is not a
        directory, there is nothing to group, or the user did not answer "y".
    """
    if not folder_path.is_dir():
        print(f"跳过: {folder_path} 不是文件夹")
        return False

    print(f"\n正在扫描: {folder_path}")

    groups, position, leftovers = analyze_directory(folder_path)
    if not show_plan(groups, position, leftovers, folder_path):
        return False

    # Ask for confirmation; anything other than "y" (case-insensitive) cancels.
    print()
    answer = input("确认执行分组? [y/N]: ").strip().lower()
    if answer != 'y':
        print("已取消操作")
        return False

    execute_grouping(groups, folder_path)
    return True


def main():
    """
    Command-line entry point: process every folder given as an argument.

    With no arguments, prints usage, waits for Enter and exits with status 1.
    Otherwise runs ``process_folder`` on each argument in order and prints
    how many succeeded and failed.
    """
    targets = sys.argv[1:]
    if not targets:
        print("用法: python chapter_group.py <folder1> [folder2] ...")
        print("请拖放文件夹到此脚本上，或通过右键菜单调用")
        input("按回车键退出...")
        sys.exit(1)

    outcomes = [bool(process_folder(Path(target))) for target in targets]
    success_count = outcomes.count(True)
    fail_count = outcomes.count(False)

    print(f"\n处理完成: 成功 {success_count}, 失败 {fail_count}")

    # Pause before exiting only when the PROMPT environment variable is set
    # to a non-empty value.
    if os.environ.get('PROMPT'):
        input("\n按回车键退出...")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()