Skip to content

Commit e7900fc

Browse files
committed
Decompress object streams asynchronously when possible
Most of the time, object streams are compressed with FlateDecode (and, in the future, with BrotliDecode). To improve performance, we can decompress those streams with a built-in decompressor, but this has to be done asynchronously. Since it cannot be done while fetching, which is synchronous, we need to do it as part of the PDF parsing process. The drawback is that this requires more memory, since both the compressed and uncompressed versions of the object streams must be kept in memory until parsing is done.
1 parent 1c12b07 commit e7900fc

4 files changed

Lines changed: 90 additions & 6 deletions

File tree

src/core/core_utils.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,11 @@ function getLookupTableFactory(initializer) {
6262
}
6363

6464
class MissingDataException extends BaseException {
65-
constructor(begin, end) {
65+
constructor(begin, end, objStreamRefNum = 0) {
6666
super(`Missing data [${begin}, ${end})`, "MissingDataException");
6767
this.begin = begin;
6868
this.end = end;
69+
this.objStreamRefNum = objStreamRefNum;
6970
}
7071
}
7172

src/core/document.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,8 +1038,8 @@ class PDFDocument {
10381038
};
10391039
}
10401040

1041-
parse(recoveryMode) {
1042-
this.xref.parse(recoveryMode);
1041+
async parse(recoveryMode) {
1042+
await this.xref.parse(recoveryMode);
10431043
this.catalog = new Catalog(this.pdfManager, this.xref);
10441044
}
10451045

src/core/pdf_manager.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,14 +191,19 @@ class NetworkPdfManager extends BasePdfManager {
191191
try {
192192
const value = obj[prop];
193193
if (typeof value === "function") {
194-
return value.apply(obj, args);
194+
return await value.apply(obj, args);
195195
}
196196
return value;
197197
} catch (ex) {
198198
if (!(ex instanceof MissingDataException)) {
199199
throw ex;
200200
}
201201
await this.requestRange(ex.begin, ex.end);
202+
if (ex.objectStreamOffset) {
203+
await this.pdfDocument.xref.decompressObjectStreams(
204+
ex.objectStreamOffset
205+
);
206+
}
202207
return this.ensure(obj, prop, args);
203208
}
204209
}

src/core/xref.js

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import {
3131
} from "./core_utils.js";
3232
import { BaseStream } from "./base_stream.js";
3333
import { CipherTransformFactory } from "./crypto.js";
34+
import { Stream } from "./stream.js";
3435

3536
class XRef {
3637
constructor(stream, pdfManager) {
@@ -43,6 +44,7 @@ class XRef {
4344
this._newPersistentRefNum = null;
4445
this._newTemporaryRefNum = null;
4546
this._persistentRefsCache = null;
47+
this._objectStreams = new Map();
4648
}
4749

4850
getNewPersistentRef(obj) {
@@ -96,7 +98,7 @@ class XRef {
9698
this.startXRefQueue = [startXRef];
9799
}
98100

99-
parse(recoveryMode = false) {
101+
async parse(recoveryMode = false) {
100102
let trailerDict;
101103
if (!recoveryMode) {
102104
trailerDict = this.readXRef();
@@ -107,6 +109,8 @@ class XRef {
107109
trailerDict.assignXref(this);
108110
this.trailer = trailerDict;
109111

112+
await this.decompressObjectStreams();
113+
110114
let encrypt;
111115
try {
112116
encrypt = trailerDict.get("Encrypt");
@@ -925,7 +929,29 @@ class XRef {
925929

926930
fetchCompressed(ref, xrefEntry, suppressEncryption = false) {
927931
const tableOffset = xrefEntry.offset;
928-
const stream = this.fetch(Ref.get(tableOffset, 0));
932+
const objectStream = this._objectStreams.get(tableOffset);
933+
let stream;
934+
if (objectStream) {
935+
// The object stream has already been parsed.
936+
stream = objectStream;
937+
this._objectStreams.delete(tableOffset);
938+
} else {
939+
try {
940+
stream = this.fetch(Ref.get(tableOffset, 0));
941+
} catch (ex) {
942+
if (ex instanceof MissingDataException) {
943+
const objStream = this.entries[tableOffset];
944+
const start = this.stream.start + objStream.offset;
945+
const end = this.stream.start + this.entries[tableOffset + 1].offset;
946+
throw new MissingDataException(
947+
start,
948+
end,
949+
/* objStreamRefNum = */ ref.num
950+
);
951+
}
952+
throw new FormatError("bad ObjStm stream");
953+
}
954+
}
929955
if (!(stream instanceof BaseStream)) {
930956
throw new FormatError("bad ObjStm stream");
931957
}
@@ -1030,6 +1056,58 @@ class XRef {
10301056
getCatalogObj() {
10311057
return this.root;
10321058
}
1059+
1060+
async decompressObjectStreams(entryOffset = null) {
1061+
const done = new Set([0]);
1062+
const promises = [];
1063+
let entries = this.entries;
1064+
if (entryOffset !== null) {
1065+
entries = { [entryOffset]: this.entries[entryOffset] };
1066+
}
1067+
for (const num in entries) {
1068+
if (!Object.hasOwn(entries, num)) {
1069+
continue;
1070+
}
1071+
const entry = entries[num];
1072+
if (entry.uncompressed) {
1073+
continue;
1074+
}
1075+
const tableOffset = entry.offset;
1076+
if (done.has(tableOffset)) {
1077+
continue;
1078+
}
1079+
done.add(tableOffset);
1080+
let stream;
1081+
try {
1082+
stream = this.fetch(Ref.get(tableOffset, 0));
1083+
} catch {}
1084+
1085+
if (
1086+
!(stream instanceof BaseStream) ||
1087+
!stream.isAsync ||
1088+
!stream.isDataLoaded
1089+
) {
1090+
continue;
1091+
}
1092+
1093+
promises.push(
1094+
stream
1095+
.asyncGetBytes()
1096+
.then(bytes => {
1097+
if (bytes) {
1098+
this._objectStreams.set(
1099+
tableOffset,
1100+
new Stream(bytes, 0, bytes.length, stream.dict)
1101+
);
1102+
}
1103+
})
1104+
.catch(() => {
1105+
/* no-op */
1106+
})
1107+
);
1108+
}
1109+
await Promise.all(promises);
1110+
}
10331111
}
10341112

10351113
export { XRef };

0 commit comments

Comments
 (0)