#!/usr/bin/env python
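# archive-cleaner (forked from archlinux/archivetools)
#
# Scans the packages currently in the whitelisted repos under --repo, reads
# each package's .BUILDINFO to find every archived package a rebuild would
# need, then reports archived package files, their .sig files and source
# tarballs that are older than --keep-years and no longer referenced.
# Dry run by default; pass --delete to actually remove files.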
from argparse import ArgumentParser
from datetime import datetime
from glob import glob
from multiprocessing import Pool
from multiprocessing import cpu_count
from os import unlink
from os.path import basename
from os.path import isfile
from os.path import join
from os.path import getsize
from re import finditer
from re import search
from sys import stderr
from libarchive import file_reader

ARCHS = ['x86_64']
PKG_EXTENSIONS = ['.tar.xz', '.tar.gz', '.tar.bz2', '.tar.zst', '.tar.lrz', '.tar.lzo', '.tar.Z', '.tar.lz4', '.tar.lz']
SIG_EXT = '.sig'
DEFAULT_ARCHIVE_DIR = '/srv/archive'
DEFAULT_SOURCE_DIR = '/srv/ftp'
SOURCE_WHITELIST = ['core', 'extra', 'staging', 'testing', 'community', 'community-staging', 'community-testing',
                    'multilib', 'multilib-staging', 'multilib-testing', 'gnome-unstable', 'kde-unstable']
DEFAULT_KEEP_YEARS = 3
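

# Archive layout assumed below (inferred from the glob patterns and the
# return path of get_sourceball; note that get_sourceball hard-codes
# /srv/archive instead of honouring --archive):
#   <archive>/repos/YYYY/MM/DD/<repo>/os/<arch>/*.pkg.tar.*
#   <archive>/repos/YYYY/MM/DD/pool/*/*.pkg.tar.*
#   <archive>/repos/YYYY/MM/DD/sources/packages/<name>-<ver>-<rel>.src.tar.*
#   <archive>/packages/<first letter>/<name>/<name>-<ver>-<rel>-<arch>.pkg.*
#
# For illustration (hypothetical package):
#   get_sourceball('/srv/archive/repos/2020/01/15/core/os/x86_64/zlib-1.2.11-3-x86_64.pkg.tar.zst')
#   -> '/srv/archive/repos/2020/01/15/sources/packages/zlib-1.2.11-3.src.tar.zst'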
def get_sourceball(path):
    m = search(r'\d{4}/\d{2}/\d{2}', path)
    date = m.group(0)
    pkgname = basename(path)
    pkg_exts = list(filter(lambda ext: pkgname.endswith(ext), PKG_EXTENSIONS))
    if not pkg_exts:
        raise ValueError(f'Expecting path to end with one of these extensions {" ".join(PKG_EXTENSIONS)}. {path} given')
    source_ext = f'.src{pkg_exts[0]}'
    pkgver, pkgrel, _ = pkgname.split('-')[-3:]
    pkgname, _ = pkgname.split(f'-{pkgver}-{pkgrel}')
    return f'/srv/archive/repos/{date}/sources/packages/{pkgname}-{pkgver}-{pkgrel}{source_ext}'


def human_filesize(size):
    # e.g. human_filesize(5 * 1024 ** 2) -> '5.00 MB'
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size < 1024:
            break
        size /= 1024
    return f'{size:.2f} {unit}'


def read_file(archive, filename, arch):
    # Collect the archived package files that this package's .BUILDINFO says
    # were installed at build time; fall back to keeping the package itself
    # when no .BUILDINFO is present (very old packages).
    packages = []
    with file_reader(filename) as pkg:
        for entry in pkg:
            if entry.name == '.BUILDINFO':
                packages.extend(get_packages_from_buildinfo(archive, filename, entry, arch))
                break
    if not packages:
        print(f'WARNING: .BUILDINFO not found in package {filename}', file=stderr)
        packages.append(basename(filename))
    return packages
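

# A .BUILDINFO is a key-value text file inside the package; the fields used
# below look roughly like this (hypothetical values; since format 1 each
# installed entry carries the architecture, while format 0 entries, as
# handled in the fallback branch, do not):
#   format = 1
#   installed = zlib-1.2.11-3-x86_64
#   installed = tzdata-2019c-2-any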
def get_packages_from_buildinfo(archive, filename, archive_entry, arch):
    buildinfo = b''.join(archive_entry.get_blocks()).decode()
    m = search(r'format = (\d)', buildinfo)
    buildinfo_format = 0 if not m else int(m.group(1))
    installed = [m.group(1) for m in finditer(r'installed = (.+)', buildinfo)]
    print(f'INFO: found .BUILDINFO format {buildinfo_format} with {len(installed)} installed packages for {filename}', file=stderr)
    packages = []
    if buildinfo_format > 0:
        for package in installed:
            pkgfilenames = [f'{package}.pkg{ext}' for ext in PKG_EXTENSIONS]
            pkgname = '-'.join(package.split('-')[:-3])
            if not pkgname:
                print(f'ERROR: failed to parse {package} with format {buildinfo_format} needed for {filename}', file=stderr)
                continue
            directory = join(archive, 'packages', pkgname[0], pkgname)
            paths = [join(directory, pkgfilename) for pkgfilename in pkgfilenames]
            existing_paths = list(filter(lambda path: isfile(path), paths))
            if not existing_paths:
                print(f'ERROR: {package} needed for {filename} not in archive {" ".join(paths)}', file=stderr)
                continue
            # more than one matching extension shouldn't exist, so take the first hit
            packages.append(basename(existing_paths[0]))
        return packages
    for package in installed:
        filenames_any = [f'{package}-any.pkg{ext}' for ext in PKG_EXTENSIONS]
        filenames_arch = [f'{package}-{arch}.pkg{ext}' for ext in PKG_EXTENSIONS]
        pkgname = '-'.join(package.split('-')[:-2])
        directory = join(archive, 'packages', pkgname[0], pkgname)
        paths_any = [join(directory, filename_any) for filename_any in filenames_any]
        paths_arch = [join(directory, filename_arch) for filename_arch in filenames_arch]
        existing_paths_any = list(filter(lambda path: isfile(path), paths_any))
        existing_paths_arch = list(filter(lambda path: isfile(path), paths_arch))
        if existing_paths_any:
            # more than one matching extension shouldn't exist, so take the first hit
            packages.append(basename(existing_paths_any[0]))
        elif existing_paths_arch:
            packages.append(basename(existing_paths_arch[0]))
        else:
            print(f'ERROR: {package} needed for {filename} not in archive {" ".join(paths_any + paths_arch)}', file=stderr)
    return packages


def main(repo=DEFAULT_SOURCE_DIR, archive=DEFAULT_ARCHIVE_DIR, keep_years=DEFAULT_KEEP_YEARS, processes=cpu_count() * 2, delete=False):
    # Build the set of archived package files still referenced by the
    # .BUILDINFO of any package currently in the whitelisted repos.
    tasks = []
    needed = set()
    for subdir in SOURCE_WHITELIST:
        for arch in ARCHS:
            directory = join(repo, subdir, 'os', arch)
            for filename in filter(lambda filename: not filename.endswith(SIG_EXT), glob(join(directory, '*.pkg.*'))):
                tasks.append((archive, filename, arch))
    with Pool(processes=processes) as pool:
        results = pool.starmap(read_file, tasks)
    for result in results:
        needed = needed.union(result)
    for need in needed:
        print(f'INFO: needed package: {need}')
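    # Purge candidates: every snapshot under <archive>/repos/<year>/ whose
    # year is at most now().year - keep_years; the '2*' glob only matches
    # years 2000-2999, hence the comment below.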
    keep_until = datetime.now().year - keep_years
    # fix before year 3000
    years = list(filter(lambda year: year <= keep_until,
                        map(lambda path: int(basename(path)), glob(join(archive, 'repos', '2*')))))
    purge_paths = [path for paths in [glob(join(archive, 'repos', f'{year}', '*', '*')) for year in years] for path in paths]
    purge_paths = [join(path, '*', 'os', '*', '*.pkg.tar.*') for path in purge_paths] + \
                  [join(path, 'pool', '*', '*.pkg.tar.*') for path in purge_paths]
    with Pool(processes=processes) as pool:
        results = pool.map(glob, purge_paths)
    all_files = [path for paths in results for path in paths if not path.endswith(SIG_EXT)]
    purge = list(filter(lambda path: basename(path) not in needed, all_files))
    keep = list(filter(lambda path: basename(path) in needed, all_files))
    for path in keep:
        print(f'INFO: keeping file {path}')
    cleanup_size = 0
    removed = set()
    for path in purge:
        print(f'INFO: deleting file {path}')
        sourceball = get_sourceball(path)
        print(f'INFO: deleting sourceball {sourceball}')
        if delete:
            try:
                unlink(path)
            except FileNotFoundError:
                print(f'ERROR: file not found for deletion: {path}', file=stderr)
            try:
                unlink(f'{path}{SIG_EXT}')
            except FileNotFoundError:
                print(f'ERROR: file not found for deletion: {path}{SIG_EXT}', file=stderr)
            try:
                unlink(sourceball)
            except FileNotFoundError:
                print(f'ERROR: file not found for deletion: {sourceball}', file=stderr)
        else:
            # dry run: only sum up how much space would be freed, counting
            # each package file once
            pkg = basename(path)
            if pkg in removed:
                continue
            try:
                cleanup_size += getsize(path)
                removed.add(pkg)
            except FileNotFoundError:
                print(f'ERROR: file not found for size calculation: {path}', file=stderr)
            try:
                cleanup_size += getsize(f'{path}{SIG_EXT}')
            except FileNotFoundError:
                print(f'ERROR: file not found for size calculation: {path}{SIG_EXT}', file=stderr)
            try:
                cleanup_size += getsize(sourceball)
            except FileNotFoundError:
                print(f'ERROR: file not found for size calculation: {sourceball}', file=stderr)
    cleanup_size = human_filesize(cleanup_size)
    print(f'total estimated cleanup size {cleanup_size}')
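

# Example invocations (a sketch; flags as defined below):
#   ./archive-cleaner                   # dry run with the default directories
#   ./archive-cleaner --keep-years 5    # keep five years instead of three
#   ./archive-cleaner --delete          # really delete unreferenced files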
if __name__ == '__main__':
    parser = ArgumentParser(description='Archive cleaner')
    parser.add_argument('--archive', default=DEFAULT_ARCHIVE_DIR, help=f'root directory of the archive (default: {DEFAULT_ARCHIVE_DIR})')
    parser.add_argument('--repo', default=DEFAULT_SOURCE_DIR, help=f'root directory of the repo (default: {DEFAULT_SOURCE_DIR})')
    parser.add_argument('--keep-years', type=int, default=DEFAULT_KEEP_YEARS, help=f'number of years to keep (default: {DEFAULT_KEEP_YEARS})')
    parser.add_argument('--processes', type=int, default=cpu_count() * 2, help=f'number of parallel processes (default: {cpu_count() * 2})')
    parser.add_argument('--delete', default=False, action='store_true', help='really delete unwanted files')
    args = parser.parse_args()
    main(args.repo, args.archive, args.keep_years, args.processes, args.delete)