-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathncdufile.cr
More file actions
647 lines (563 loc) · 19.4 KB
/
ncdufile.cr
File metadata and controls
647 lines (563 loc) · 19.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
# SPDX-FileCopyrightText: Yorhel <projects@yorhel.nl>
# SPDX-License-Identifier: AGPL-3.0-only
require "bit_array"
module NcduFile
# Minimal libzstd FFI bindings: just enough for single-shot decompression of
# data blocks.
@[Link("libzstd")]
lib ZSTD
# ZSTD_decompress(): one-shot decompression into a pre-sized buffer; returns
# the number of decompressed bytes (callers verify it equals the expected
# size, which also catches zstd error-code returns).
fun decompress = ZSTD_decompress(dst: UInt8*, dstCapacity: LibC::SizeT, src: UInt8*, compressedSize: LibC::SizeT) : LibC::SizeT
# ZSTD_getFrameContentSize(): decompressed size recorded in the frame header,
# or an error sentinel (ZSTD_CONTENTSIZE_UNKNOWN/ERROR, i.e. -1/-2 cast to
# unsigned) on failure.
fun getFrameContentSize = ZSTD_getFrameContentSize(src: UInt8*, srcSize: LibC::SizeT) : LibC::ULongLong
end
# Block header/footer: a big-endian u32 packing a 4-bit block type in the
# top bits and a 28-bit block length (bytes) in the low bits. Every block
# stores an identical copy of this value at its start and end.
struct Hdr
  TYPE_SHIFT = 28
  LEN_MASK = (1<<TYPE_SHIFT) - 1

  @raw : UInt32

  def initialize(@raw)
  end

  # Construct from a big-endian u32 read off the given IO.
  def self.from_io(io, format)
    new(io.read_bytes(UInt32, IO::ByteFormat::BigEndian))
  end

  # Block length in bytes (low 28 bits).
  def len
    (@raw & LEN_MASK).to_i32
  end

  # Block type tag (top 4 bits).
  def type
    @raw >> TYPE_SHIFT
  end

  # True for a data block (type 0).
  def data?
    type.zero?
  end

  # True for an index block (type 1).
  def index?
    type == 1
  end

  # Hex representation of the raw value.
  def to_s(io)
    @raw.to_s io, 16
  end
end
# Index pointer: locates one compressed data block in the file, packing the
# block's file offset (high bits) and its on-disk length (low 24 bits) into
# a single big-endian u64. An all-zero value means "no block here".
struct Ptr
  OFFSET_SHIFT = 24
  LEN_MASK = (1<<OFFSET_SHIFT) - 1

  @raw : UInt64

  def initialize(@raw)
  end

  # Pack an (offset, len) pair.
  def initialize(offset, len)
    @raw = (offset.to_u64 << OFFSET_SHIFT) + len
  end

  # Construct from a big-endian u64 read off the given IO.
  def self.from_io(io, format)
    new(io.read_bytes(UInt64, IO::ByteFormat::BigEndian))
  end

  # On-disk (compressed) block length in bytes (low 24 bits).
  def len
    (@raw & LEN_MASK).to_i32
  end

  # File offset of the block (remaining high bits).
  def offset
    @raw >> OFFSET_SHIFT
  end

  # True when this index slot holds no block.
  def empty?
    @raw.zero?
  end

  # Hex representation of the raw value.
  def to_s(io)
    @raw.to_s io, 16
  end
end
# Absolute Itemref: identifies an item by (data block number, byte offset
# within the decompressed block), packed into a u64 with the block number
# in the high bits and the offset in the low 24 bits.
struct Ref
  BLOCK_SHIFT = 24
  OFFSET_MASK = (1<<BLOCK_SHIFT) - 1

  property raw : UInt64

  def initialize(@raw)
  end

  # Pack a (block, offset) pair.
  def initialize(block, offset)
    @raw = (block.to_u64 << BLOCK_SHIFT) + offset
  end

  # Construct from a big-endian u64 read off the given IO.
  def self.from_io(io, format)
    new(io.read_bytes(UInt64, IO::ByteFormat::BigEndian))
  end

  # Byte offset within the decompressed block (low 24 bits).
  def offset
    (raw & OFFSET_MASK).to_i32
  end

  # Data block number (remaining high bits).
  def block
    raw >> BLOCK_SHIFT
  end

  # Formats as "i#" followed by the raw value as zero-padded hex.
  def to_s(io)
    io << "i#"
    raw.to_s io, 16, precision: 10
  end
end
# Custom CBOR parser. I know, I know, there exists a crystal-cbor shard, but
# that one doesn't make it easy to validate the various constraints I
# documented for the Item encoding.
# This is an ugly parser, but ought to be conforming to the spec. I've
# only tested this against ncdu's output so far, though, so it's pretty
# much guaranteed to have bugs. :(
class CborReader
getter offset : Int32
# CBOR major types, in wire order (the high 3 bits of an item's initial byte).
enum Major : UInt8
PosInt; NegInt; Bytes; Text; Array; Map; Tag; Simple
end
# Read directly from a slice rather than an IO. Even specializing for
# IO::Memory turns out to have pretty measurable overhead.
def initialize(@input : Bytes, @offset : Int32 = 0)
end
# Consume and return the next input byte, advancing the offset.
@[AlwaysInline]
def con : UInt8
@input[@offset] # relies on bounds checking to throw an error on unexpected EOF
ensure
@offset += 1
end
# Walk the CBOR stream from the current offset, yielding
# (depth, major, arg, data) for each encountered data item, where:
# - depth: nesting level at the time of the yield (1 = top-level item)
# - arg:   the decoded additional-information value; nil for
#          indefinite-length items and for the 'break' byte
# - data:  payload slice, set only for definite-length text/byte strings
# The stack tracks open containers: a UInt16 entry is the number of
# remaining children of a definite-length array/map (map entries count
# twice, key + value), nil marks an indefinite-length array/map, and a
# Major marks an indefinite-length string whose chunks must all be
# definite-length strings of that same major type.
def parse(&)
stack = [1] of (UInt16 | Major | Nil)
while !stack.empty?
last = stack.last
first = con
major = Major.new first >> 5
minor = first & 0x1f
arg : UInt64 | Nil = nil
# Parse header byte + value
case minor
when 0x00..0x17 then arg = minor.to_u64
when 0x18 then arg = con.to_u64
when 0x19 then arg = (con.to_u64 << 8) + con.to_u64
when 0x1a then arg = (con.to_u64 << 24) + (con.to_u64 << 16) + (con.to_u64 << 8) + con.to_u64
when 0x1b then arg = (con.to_u64 << 56) + (con.to_u64 << 48) + (con.to_u64 << 40) + (con.to_u64 << 32) + (con.to_u64 << 24) + (con.to_u64 << 16) + (con.to_u64 << 8) + con.to_u64
when 0x1f
# Indefinite-length marker (or, with major Simple, the 'break' byte);
# never valid for integers and tags.
case major
when .pos_int?, .neg_int?, .tag?
raise "Invalid CBOR"
when .simple?
# A 'break' must not appear inside a definite-length container.
raise "Invalid CBOR" if last.is_a? UInt16
end
else
raise "Invalid CBOR"
end
# Chunks of an indefinite-length string must themselves be
# definite-length strings of the same major type.
raise "Invalid CBOR" if last.is_a? Major && (arg.nil? || last != major)
next if major.tag?
# Read text/bytes argument
data = nil
if !arg.nil? && (major.text? || major.bytes?)
data = @input[@offset ... @offset + arg]
@offset += arg
raise "Invalid CBOR" if major.text? && !Unicode.valid? data
end
yield stack.size, major, arg, data
# Update parsing state
if arg.nil? && (major.text? || major.bytes?)
stack.push major
elsif (major.array? || major.map?) && (arg.nil? || arg > 0)
if arg.nil?
stack.push nil
else
# Cap container sizes so the doubled map count fits the UInt16 counter.
raise "Invalid CBOR" if arg >= (1<<15)
stack.push (major.map? ? arg * 2 : arg).to_u16
end
else
# A 'break' byte (0xff) closes the innermost indefinite-length container.
stack.pop if first == 0xff
# Decrement definite-length child counters, popping every container
# that has now received all of its children.
loop do
last = stack.last?
if last.is_a? UInt16
stack[-1] = last - 1
break if last > 1
stack.pop
else
break
end
end
end
end
end
end
# A single item record (file/directory entry) decoded from its CBOR map
# representation. Map keys 0..18 correspond to the fields below; optional
# fields are nilable, counters default to 0.
class Item
property ref : Ref # Ref to current item
property type : Int32 = Int32::MIN # Int32::MIN means the 'type' field was not seen yet
# Beware: The 'name' points into the read buffer, and said buffer will not
# be GC'ed as long as the item is alive. For longer-lived items, make sure
# to call detach() to create a separate allocation.
property name : Bytes = Bytes.empty # Might not validate as UTF-8
property prev : Ref?
property asize : UInt64 = 0
property dsize : UInt64 = 0
property dev : UInt64?
property rderr : Bool?
property cumasize : UInt64 = 0
property cumdsize : UInt64 = 0
property shrasize : UInt64 = 0
property shrdsize : UInt64 = 0
property items : UInt64 = 0
property sub : Ref?
property ino : UInt64 = 0
property nlink : UInt32 = 0
property uid : UInt32?
property gid : UInt32?
property mode : UInt16?
property mtime : UInt64?
@attached = true
def initialize(@ref)
end
# Wonderfully unhygenic macros to help with field parsing; both expect
# 'item', 'ref', 'major' and 'arg' to be in scope at the expansion site.
# Parse an unsigned integer field of the given bit width.
macro m_uint(f, bits=64)
if major.pos_int? && arg.not_nil! <= UInt{{bits}}::MAX
item.{{f}} = arg.not_nil!.to_u{{bits}}
else
raise "Invalid '#{ {{ f.stringify }} }' field for item #{ref}: #{major} #{arg}"
end
end
# Parse an itemref field: a positive int is an absolute Ref, a negative int
# is a backwards offset relative to the current item within the same block.
macro m_ref(f)
if major.pos_int?
item.{{f}} = Ref.new arg.not_nil!
elsif major.neg_int? && arg.not_nil! < ref.offset
item.{{f}} = Ref.new ref.block, ref.offset - arg.not_nil! - 1
else
raise "Invalid '#{ {{ f.stringify }} }' itemref for item #{ref}: #{major} #{arg}"
end
end
# Decode one item map from 'parser' at its current position. Unknown fields
# are skipped (with a warning when warn_unknown_fields is set); malformed or
# missing mandatory fields raise.
def self.read(parser, ref, warn_unknown_fields = false) : Item
item = new ref
key = nil
parser.parse do |depth, major, arg, data|
if depth != 2
raise "Invalid item at #{ref}: #{major}" if depth == 1 && !major.map?
next # We don't have any fields that support nesting, so can safely skip
end
case key
when .nil?
# Expecting a map key: a small positive int selects a known field.
if major.pos_int? && arg.not_nil! <= 18
key = arg.not_nil!.to_i32
elsif !major.simple? || !arg.nil?
STDERR.puts "WARNING: Unknown field in item #{ref}: #{major} #{arg} #{data}" if warn_unknown_fields
key = -1
end
next
when -1
# skip unknown field
when 0
case major
when .pos_int? then item.type = arg.not_nil! > 3 ? 2 : arg.not_nil!.to_i32
when .neg_int? then item.type = arg.not_nil! > 3 ? -2 : -(arg.not_nil!.to_i32) - 1
else raise "Invalid 'type' field for item #{ref}: #{major} #{arg} #{data}"
end
when 1
# 'data' is only defined for definite-length text or byte strings, so testing for that is sufficient
item.name = data || raise "Invalid 'name' field for item #{ref}: #{major} #{arg}"
when 2 then m_ref prev
when 3 then m_uint asize
when 4 then m_uint dsize
when 5 then m_uint dev
when 6
if major.simple? && (arg == 20 || arg == 21)
item.rderr = arg == 21 # CBOR simple values: 20 = false, 21 = true
else
raise "Invalid 'rderr' field for item #{ref}: #{major} #{arg}"
end
when 7 then m_uint cumasize
when 8 then m_uint cumdsize
when 9 then m_uint shrasize
when 10 then m_uint shrdsize
when 11 then m_uint items
when 12 then m_ref sub
when 13 then m_uint ino
when 14 then m_uint nlink, 32
when 15 then m_uint uid, 32
when 16 then m_uint gid, 32
when 17 then m_uint mode, 16
when 18 then m_uint mtime
else abort "Unreachable"
end
key = nil
end
# BUGFIX: 'name' is Bytes, so the old comparison against the String ""
# could never be true and this check never fired; use empty? instead.
raise "Missing or empty 'name' for item #{ref}" if item.name.empty?
raise "Missing 'type' for item #{ref}" if item.type == Int32::MIN
# TODO: Validate the presence of optional fields for the wrong type? Ncdu doesn't really care...
item
end
# Debug representation; only fields that are set / non-zero are printed.
def to_s(io)
io << ref
io << "{type=" << type
io << " prev=" << prev if prev
io << " asize=" << asize if asize != 0
io << " dsize=" << dsize if dsize != 0
io << " dev=" << dev if dev
io << " rderr=" << rderr if rderr # BUGFIX: label was misspelled "rderrr"
io << " cumasize=" << cumasize if cumasize != 0
io << " cumdsize=" << cumdsize if cumdsize != 0
io << " shrasize=" << shrasize if shrasize != 0
io << " shrdsize=" << shrdsize if shrdsize != 0
io << " items=" << items if items != 0
io << " sub=" << sub if sub
io << " ino=" << ino if ino != 0
io << " nlink=" << nlink if nlink != 0
io << " uid=" << uid if uid
io << " gid=" << gid if gid
io << " mode=" << mode if mode
io << " mtime=" << mtime if mtime
io << " name="
io.write name
io << "}"
end
# Copy 'name' out of the shared block buffer so this item no longer pins it.
def detach
@name = @name.clone if @attached
@attached = false
self
end
end
# Low-ish level binary export file reader.
# Opens the file, validates the magic, and loads the index block stored at
# the end of the file; data blocks are then fetched and zstd-decompressed on
# demand through a small LRU cache.
class Reader
@file : File
@path : String
# One Ptr per data block, loaded from the index block.
@index : Slice(Ptr)
# Fixed-size LRU cache of decompressed data blocks.
@blocks : Slice(Block)
# Monotonic counter used as the LRU timestamp.
@block_counter : UInt64 = 0
getter size : Int64
getter root : Ref
# One LRU cache slot; idx == UInt64::MAX marks a slot that is not in use yet.
class Block
property idx : UInt64 = UInt64::MAX
property last : UInt64 = 0
property data : Slice(UInt8) = "".to_slice
end
def initialize(@path)
@file = File.new @path
@file.read_buffering = false
header = @file.read_string 8
raise "Unrecognized file type" if header != "\xbfncduEX1"
# Find the start of the index block: the last 4 bytes of the file hold the
# index block's footer (a copy of its header), which gives us its length.
@file.seek -4, IO::Seek::End
index_hdr = @file.read_bytes Hdr
@size = @file.tell
raise "Invalid index block" if !index_hdr.index? || index_hdr.len < 24 || index_hdr.len & 7 != 0
@file.seek 0 - index_hdr.len, IO::Seek::End
index_start = @file.tell
# Read the index block
@file.read_buffering = true
raise "Invalid index block" if index_hdr != @file.read_bytes Hdr
# 16 bytes of the block are overhead (4-byte header, 8-byte root Ref,
# 4-byte footer); the rest is one 8-byte Ptr per data block.
@index = Slice(Ptr).new(((index_hdr.len - 16)>>3).to_i32, read_only: true) do |i|
ptr = @file.read_bytes Ptr
raise "Invalid pointer at index #{i} (#{ptr})" if !ptr.empty? && (ptr.len <= 12 || ptr.offset >= index_start - 12)
ptr
end
@root = @file.read_bytes Ref
@file.read_buffering = false
@blocks = Slice(Block).new 8 { Block.new }
end
# Open a second File handle on the same path.
# NOTE(review): this shadows Object#dup and returns a File, not a Reader —
# presumably intended for concurrent readers; confirm against callers.
def dup
# Crystal std doesn't seem to expose dup(). It does have dup2(), but unsure how to use that.
File.new @path
end
def close
@file.close
end
# Read and zstd-decompress data block 'idx' straight from the file,
# bypassing the cache.
def read_block(idx)
raise "Invalid block index #{idx}" if idx >= @index.size || @index[idx].empty?
# Skip the 4-byte block header and the 4-byte block number.
@file.seek @index[idx].offset + 8, IO::Seek::Set
# Block length minus header, block number and footer (4 bytes each).
compressed = @file.read_string @index[idx].len - 12
len = ZSTD.getFrameContentSize compressed.to_unsafe, compressed.bytesize
# NOTE(review): 'len' is unsigned, so 'len <= 0' only rejects 0; zstd's
# CONTENTSIZE_UNKNOWN/ERROR sentinels (-1/-2 cast to unsigned) are huge
# and get rejected by the upper bound instead.
raise "Invalid decompressed size for block #{idx} (#{len})" if len <= 0 || len >= (1<<24)
data = Pointer(UInt8).malloc len
ret = ZSTD.decompress data, len, compressed.to_unsafe, compressed.bytesize
raise "Error decompressing block #{idx}: #{ret}" if ret != len
Slice(UInt8).new data, len, read_only: true
end
# read_block() but with an LRU cache, works the same as ncdu's bin_reader
# because I lack creativity.
def get_block(idx)
# 'block' ends up as the least-recently-used slot if 'idx' isn't cached.
block = @blocks[0]
@block_counter += 1
@blocks.each_with_index do |b, i|
if b.idx == idx
b.last = @block_counter
return b.data
elsif block.last > b.last
block = b
end
end
block.idx = idx
block.last = @block_counter
block.data = read_block idx
block.data
end
# Decode and return the single item that 'ref' points at.
def get_item(ref : Ref)
data = get_block ref.block
p = CborReader.new data, ref.offset
Item.read p, ref
end
# Options for validate(); all warnings go to STDERR, listings to STDOUT.
class ValidateOpts
property list_blocks : Bool = false
property list_items : Bool = false
property warn_unknown_blocks : Bool = true
property warn_unknown_fields : Bool = true
property warn_multiple_refs : Bool = true
property warn_unref : Bool = true
# TODO: warn_multiple_refs and warn_unref are not actually sufficient to
# detect cycles or items that are unreachable from the root: it's
# possible to construct a loop that is unconnected to the main tree. On
# the upside, such a loop will never be reached by ncdu and (most likely)
# other tools, so that seems harmless. On the other hand, there are
# legitimate use cases for having both unreferenced items and multiple
# references: COW-based tree mutation. It might be nice to have a loop
# detector that works for such files as well.
end
# Parse every item in the decompressed block 'data' (block number 'idx'),
# recording in 'items_seen' which refs exist and which are referenced
# (see the bit meanings in validate()).
def validate_items(idx, data, items_seen, opts)
p = CborReader.new data
while p.offset < data.size
item = Item.read p, Ref.new(idx, p.offset), opts.warn_unknown_fields
puts " #{item}" if opts.list_items
items_seen.update(item.ref) { |v| v + 1 }
item.prev.try do |r|
items_seen.update(r) do |v|
STDERR.puts "WARNING: Item #{r} is referenced from multiple places (second = #{item.ref}.prev)" if opts.warn_multiple_refs && v > 1
v | 2
end
end
item.sub.try do |r|
items_seen.update(r) do |v|
STDERR.puts "WARNING: Item #{r} is referenced from multiple places (second = #{item.ref}.sub)" if opts.warn_multiple_refs && v > 1
v | 2
end
end
end
end
# Full-file validation: walk all blocks sequentially, checking block
# headers/footers against the index, decoding every item, and finally
# cross-checking that every indexed block was seen and that item refs and
# item occurrences match up.
def validate(opts)
blocks_seen = BitArray.new @index.size
items_seen = Hash(Ref, UInt8).new 0, 4096 # bits: 1 = item seen, 2 = ref to item seen
items_seen[root] = 2
@file.seek 8, IO::Seek::Set
loop do
off = @file.tell
hdr = @file.read_bytes Hdr
if hdr.index?
puts "#{off}: Index block, length = #{hdr.len}" if opts.list_blocks
break # already validated in initialize()
end
if !hdr.data?
puts "#{off}: Unknown block, type = #{hdr.type}, length = #{hdr.len}" if opts.list_blocks
STDERR.puts "WARNING: Unknown block type #{hdr.type} at offset #{off} (length = #{hdr.len})" if opts.warn_unknown_blocks
raise "Invalid data block length at offset #{off} (#{hdr.len})" if hdr.len < 8
@file.seek hdr.len - 8, IO::Seek::Current
foot = @file.read_bytes Hdr
raise "Block at offset #{off} has mismatched header/footer (#{hdr} vs. #{foot})" if hdr != foot
next
end
# data block
raise "Invalid data block length at offset #{off} (#{hdr.len})" if hdr.len <= 12
idx = @file.read_bytes UInt32, IO::ByteFormat::BigEndian
raise "Data block #{idx} at offset #{off} is not listed in the index" if idx >= @index.size
idxptr = Ptr.new off, hdr.len
raise "Data block #{idx} at offset #{off} has invalid index pointer (expected #{idxptr} got #{@index[idx]})" if idxptr != @index[idx]
data = read_block idx
puts "#{off}: Data block #{idx}, length = #{hdr.len}, uncompressed size = #{data.size}" if opts.list_blocks
# Assumption: file position is at the end of the data after read_block()
foot = @file.read_bytes Hdr
raise "Block at offset #{off} has mismatched header/footer (#{hdr} vs. #{foot})" if hdr != foot
blocks_seen[idx] = true
validate_items idx, data, items_seen, opts
end
@index.each_with_index do |ptr, i|
raise "Data block #{i} not found in the file, but the index lists it at #{ptr}" if !ptr.empty? && !blocks_seen[i]
end
items_seen.each do |ref, v|
STDERR.puts "WARNING: No reference found to item #{ref}" if opts.warn_unref && v == 1
raise "Invalid reference found to #{ref}" if v == 2
end
end
end
# Iterator over a chain of sibling items, following each item's 'prev'
# reference until the chain runs out.
class Listing
  include Iterator(Item)

  def initialize(@reader : Reader, @next : Ref?)
  end

  # Fetch the item currently pointed at and advance to its predecessor;
  # returns Iterator::Stop once the chain is exhausted.
  def next
    cur = @next
    return stop if cur.nil?
    item = @reader.get_item cur
    @next = item.prev
    item
  end
end
# Higher-level virtual filesystem API.
# TODO: Add filtering and sorting options
class Browser
getter reader : Reader
getter root : Item
# Responsibility of the caller to use this correctly.
property lock : Mutex
def initialize(path : String)
@reader = Reader.new path
@lock = Mutex.new
# detach() so the long-lived root item doesn't pin its block buffer.
@root = @reader.get_item(@reader.root).detach
end
# Iterate the sibling chain of items starting at 'ref'.
def list(ref : Ref)
Listing.new @reader, ref
end
# Stack of ancestor directories of the item currently being visited.
class Parents < Array(Item)
# Pushed items are detached; an item without a 'dev' inherits its parent's.
def push(item)
item.dev = last.dev if !empty? && !item.dev
super item.detach
end
# Write the path, joining names with '/' (byte 47); a root item named "/"
# is skipped so the separator isn't doubled.
def to_s(io)
return if empty?
io.write self[0].name unless self[0].name == "/".to_slice && size > 1
self[1..].each do |i|
io.write_byte 47
io.write i.name
end
end
# Like to_s(io), but with 'subname' appended as a final path component.
def to_s(io, subname)
io << self
io.write_byte 47 unless empty? || (size == 1 && self[0].name == "/".to_slice)
io.write subname
end
end
# yields Parents, Item
# Parents is modified in-place as directories are traversed.
# Depth-first traversal of the whole tree, starting at the root.
def walk(&)
parents = Parents.new
yield parents, root
stack = [] of Listing
stack.push list root.sub.not_nil! if root.sub
parents.push root
while !stack.empty?
item = stack[-1].next
if item.is_a?(Iterator::Stop)
stack.pop
parents.pop
else
yield parents, item
if !item.sub.nil?
parents.push item
stack.push list item.sub.not_nil!
end
end
end
end
# TODO: Memoize/cache
# Resolve a '/'-separated path (surrounding slashes are stripped) to the
# chain of items leading to it; returns nil when no match is found.
def resolve(path)
parents = Parents.new
parents.push root
path = path.strip('/').to_slice
return parents if path.empty?
while true
found = false
sub = parents.last.sub || return nil
list(sub).each do |item|
# Prefix match, while checking that the boundary of the match is
# either the entire path or a separator.
# item.name may contain a path rather than just a component, but this
# algorithm assumes that the export does not contain ambiguous names,
# e.g. if our path is 'a/b/c' and the directory listing has both 'a'
# and 'a/b', the first that happens to match is used, even if it
# doesn't have a matching sub-path.
next if item.name.size > path.size
next if item.name.size < path.size && path[item.name.size] != 47
next if item.name != path[0...item.name.size]
parents.push item
if item.name.size == path.size
return parents
else
path = path[item.name.size+1..]
found = true
break
end
end
return nil unless found
end
end
end
end