-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdiff.py
More file actions
199 lines (166 loc) · 6.6 KB
/
diff.py
File metadata and controls
199 lines (166 loc) · 6.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python3
"""
Test if there is a significant difference between two PDFs using ImageMagick
and pdftocairo.
File edited by Maximilian Eitelwein to return the most different pages
"""
# Significance value returned by imgdiff() when compare reports "0" or
# "1.#INF" on its "all:" line; pdfdiff() leaves pages carrying this value
# out of its list of most different pages.
INFINITY = float("inf")
import os.path, pathlib, subprocess, sys, tempfile, time
from concurrent.futures import ThreadPoolExecutor
from constants import DEFAULT_THRESHOLD, DEFAULT_VERBOSITY, DEFAULT_DPI
from constants import VERB_PRINT_REASON, VERB_PRINT_TMPDIR
from constants import VERB_PERPAGE, VERB_PRINT_CMD, VERB_ROUGH_PROGRESS
from constants import DEFAULT_NUM_THREADS, MAX_REPORT_PAGENOS
import external_programs
from external_programs import verbose_run
def pdftopng(sourcepath, destdir, basename, verbosity, dpi):
    """
    Render every page of a PDF to its own PNG file by running pdftocairo.

    The images are written into ``destdir`` using ``basename`` as the file
    name prefix. ``destdir`` must not already contain any entry whose name
    starts with ``basename``; otherwise ValueError is raised before
    pdftocairo is started.

    Return the page-number strings cut out of the generated file names, in
    sorted file-name order.
    """
    leftovers = list(destdir.glob(basename + "*"))
    if leftovers:
        raise ValueError("destdir not clean: " + repr(destdir))

    print_cmd = verbosity > VERB_PRINT_CMD
    command = [
        "pdftocairo",
        "-png",
        "-r",
        str(dpi),
        str(sourcepath),
        str(destdir / basename),
    ]
    verbose_run(
        print_cmd,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )

    # Every generated name is the prefix, one separator character, the page
    # number and ".png"; keep only the page number part.
    skip_front = len(basename) + 1
    skip_back = len(".png")
    filenames = [entry.name for entry in destdir.glob(basename + "*" + ".png")]
    filenames.sort()
    return [name[skip_front:-skip_back] for name in filenames]
# returns a float, which can be inf
def imgdiff(a, b, diff, log, print_cmds):
    """
    Compare two page images with ImageMagick's ``compare`` (PSNR metric).

    a, b: paths of the two existing image files to compare.
    diff: path handed to compare for the difference image; must not exist.
    log: path that receives compare's combined stdout/stderr; must not exist.
    print_cmds: forwarded to ``external_programs.compare_cmd`` and
        ``verbose_run``.

    Return the number found on the "all:" line of the log as a float. The
    values "0" and "1.#INF" are mapped to INFINITY.

    Raise ValueError if the log reports that the image dimensions differ, or
    if compare exits with a status greater than 1.
    """
    assert a.is_file()
    assert b.is_file()
    assert not diff.exists()
    assert not log.exists()

    command = external_programs.compare_cmd(print_cmds) + [
        "-verbose",
        "-metric",
        "PSNR",
        str(a),
        str(b),
        str(diff),
    ]
    with log.open("w+b") as log_file:
        cmdresult = verbose_run(
            print_cmds, command, stdout=log_file, stderr=subprocess.STDOUT,
        )

    # errors="replace": a failing compare may leave undecodable bytes in the
    # log; that must not hide the ValueError raised below.
    with log.open("r", errors="replace") as log_file:
        lines = log_file.readlines()

    # Look for the size mismatch *before* testing the exit status. The
    # mismatch is reported by compare as an error, so (per ImageMagick's
    # documented error exit status -- TODO confirm for the installed version)
    # testing the status first would report the generic "compare crashed"
    # instead of the precise reason.
    if any("image widths or heights differ" in line for line in lines):
        raise ValueError("image widths or heights differ")
    if cmdresult.returncode > 1:
        raise ValueError("compare crashed, status=" + str(cmdresult.returncode))

    # The "all:" line is matched regardless of how deeply compare indents it.
    label = "all:"
    stripped = [line.strip() for line in lines]
    all_values = [text[len(label):].strip() for text in stripped if text.startswith(label)]
    assert len(all_values) == 1, "expected exactly one 'all:' line in {}, found {}".format(
        log, len(all_values)
    )

    all_str = all_values[0]
    if all_str in ("0", "1.#INF"):
        return INFINITY
    return float(all_str)
def _describe_most_different_pages(ranked_pages, limit):
    """Format the first ``limit`` (significance, page number) pairs; "" if there are none."""
    if not ranked_pages:
        return ""
    listing = ", ".join(
        "page {} (sgf. {})".format(pageno, sgf) for (sgf, pageno) in ranked_pages[:limit]
    )
    return " The most different pages are: {}.".format(listing)


def pdfdiff(
    a,
    b,
    threshold=DEFAULT_THRESHOLD,
    verbosity=DEFAULT_VERBOSITY,
    dpi=DEFAULT_DPI,
    time_to_inspect=0,
    num_threads=DEFAULT_NUM_THREADS,
    max_report_pagenos=MAX_REPORT_PAGENOS,
):
    """
    Compare two PDFs page by page and report whether they are similar.

    Both files are converted to one PNG per page in a temporary directory,
    then each pair of pages is compared with imgdiff().

    Return a tuple ``(out, similar)``:

    * ``out`` is a message string. If the page counts differ it is
      "Different number of pages: X vs Y". Otherwise it lists up to
      ``max_report_pagenos`` pages with the lowest finite significance, or
      is "" when no page has a finite significance. It is produced at every
      verbosity level; verbosity only controls what is printed.
    * ``similar`` is True if the PDFs are sufficiently similar: same page
      numbers and no page whose significance is ``<= threshold``.

    The name of this function is slightly confusing: the boolean tells
    whether the PDFs are *not* different.

    If ``time_to_inspect`` is positive, sleep that many seconds before the
    temporary directory is removed (not done on the page-count mismatch
    path).
    """
    assert os.path.isfile(a), "file {} must exist".format(a)
    assert os.path.isfile(b), "file {} must exist".format(b)

    with tempfile.TemporaryDirectory(prefix="diffpdf") as d:
        p = pathlib.Path(d)
        if verbosity >= VERB_PRINT_TMPDIR:
            print(" Temporary directory: {}".format(p))
        if verbosity >= VERB_ROUGH_PROGRESS:
            print(" Converting each page of the PDFs to an image...")

        # expand pdfs to pngs
        with ThreadPoolExecutor(max_workers=num_threads) as pool:
            a_i_ = pool.submit(pdftopng, a, p, "a", verbosity=verbosity, dpi=dpi)
            b_i_ = pool.submit(pdftopng, b, p, "b", verbosity=verbosity, dpi=dpi)
            # Wait for results
            a_i = a_i_.result()
            b_i = b_i_.result()

        if a_i != b_i:
            assert len(a_i) != len(
                b_i
            ), "mishap with weird page numbers: {} vs {}".format(a_i, b_i)
            # Build the message unconditionally: it is part of the return
            # value and must exist even when nothing is printed.
            out = "Different number of pages: {} vs {}".format(len(a_i), len(b_i))
            if verbosity >= VERB_PRINT_REASON:
                print(out)
            return out, False

        assert len(a_i) > 0
        if verbosity >= VERB_ROUGH_PROGRESS:
            print(
                " PDFs have same number of pages. Checking each pair of converted images..."
            )

        significances = []
        for pageno in a_i:
            # remember pageno is a string
            pageapath = p / "a-{}.png".format(pageno)
            pagebpath = p / "b-{}.png".format(pageno)
            diffpath = p / "diff-{}.png".format(pageno)
            logpath = p / "log-{}.txt".format(pageno)
            s = imgdiff(
                pageapath, pagebpath, diffpath, logpath, (verbosity > VERB_PRINT_CMD)
            )
            if verbosity >= VERB_PERPAGE:
                print("- Page {}: significance={}".format(pageno, s))
            significances.append(s)

        min_significance = min(significances, default=INFINITY)
        significant = min_significance <= threshold

        # Ascending by significance, so the first entries are the pages with
        # the lowest values; pages at INFINITY are left out.
        largest_significances = sorted(
            (sgf, pageno_minus_one + 1)
            for (pageno_minus_one, sgf) in enumerate(significances)
            if sgf < INFINITY
        )

        # Computed outside the verbosity check so the final return never hits
        # an unbound name when reporting is switched off.
        out = _describe_most_different_pages(largest_significances, max_report_pagenos)

        if verbosity >= VERB_PRINT_REASON:
            freetext = "different" if significant else "the same"
            print(
                "Min sig = {}, significant={}. The PDFs are {}.{}".format(
                    min_significance, significant, freetext, out,
                )
            )

        if time_to_inspect > 0:
            print(
                "Waiting for {} seconds before removing temporary directory...".format(
                    time_to_inspect
                ),
                end="",
                flush=True,
            )
            time.sleep(time_to_inspect)
            print(" done.")

        return out, not significant