diff --git a/.gitignore b/.gitignore index 0665207..28f78ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ Manifest*.toml +data/ diff --git a/Project.toml b/Project.toml index a6286f4..3ca4184 100644 --- a/Project.toml +++ b/Project.toml @@ -23,7 +23,7 @@ YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" # TODO: Remove after https://github.com/JuliaIO/FileIO.jl/pull/427 [sources] -FileIO = {url = "https://github.com/icweaver/FileIO.jl", rev = "asdf"} +FileIO = {url = "https://github.com/JuliaIO/FileIO.jl", rev = "master"} [compat] AbstractTrees = "0.4.5" diff --git a/README.md b/README.md index 04468ff..d470d54 100644 --- a/README.md +++ b/README.md @@ -10,4 +10,27 @@ A new [Advanced Scientific Data Format (ASDF)](https://asdf-standard.readthedocs.io/en/latest/index.html) package, written in Julia. +## Quickstart + +```julia +using ASDF +``` + +```julia +# Load +af = ASDF.load_file("") +``` + +```julia +# Acess and modify +af.metadata[""] = +``` + +```julia +# Write +doc = Dict() +ASDF.write_file("", doc) +``` + +======= See [`ASDF.jl v1`](https://github.com/JuliaAstro/ASDF.jl/tree/v1) for the older one which is based on the Python [`asdf`](https://github.com/spacetelescope/asdf) Python package. diff --git a/docs/Project.toml b/docs/Project.toml index 2df60e2..4044da6 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,6 +1,11 @@ [deps] ASDF = "686f71d1-807d-59a4-a860-28280ea06d7b" +AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" +AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" [compat] Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index 1f6ae0a..32c468e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -8,9 +8,17 @@ makedocs(; repo = GitHub("JuliaAstro/ASDF.jl"), sitename = "ASDF.jl", format = Documenter.HTML( - prettyurls = true, canonical = "https://juliaastro.org/ASDF/stable/", ), + pages = [ + "Home" => "index.md", + "Introduction" => "intro.md", + "Examples" => [ + "JWST" => "examples/jwst.md", + "Roman" => "examples/roman.md", + ], + "API" => "api.md", + ], doctest = false, ) diff --git a/docs/src/api.md b/docs/src/api.md new file mode 100644 index 0000000..93b75ca --- /dev/null +++ b/docs/src/api.md @@ -0,0 +1,14 @@ +# API + +## Public + +```@autodocs +Modules = [ASDF] +Private = false +``` + +## Private +```@autodocs +Modules = [ASDF] +Public = false +``` diff --git a/docs/src/examples/jwst.md b/docs/src/examples/jwst.md new file mode 100644 index 0000000..7c6d884 --- /dev/null +++ b/docs/src/examples/jwst.md @@ -0,0 +1,54 @@ +# JWST + +*Adapted from [ADASS 2024 workshop](https://github.com/asdf-format/asdf-adass2024/blob/main/02_working_with_asdf/Working_with_ASDF.ipynb).* + +In this example, we show how to use ASDF.jl to load and view some astronomical data taken from the James Webb Space Telescope ([JWST](https://science.nasa.gov/mission/webb/)). + +!!! note "Data availability" + The sample data for this example can be [downloaded here](https://data.science.stsci.edu/redirect/Roman/Roman_Data_Workshop/ADASS2024/jwst.asdf) from the data repository of the Space Telescope Science Institute ([STScI](https://www.stsci.edu/)). Note: it is a moderately large file (~100 MB). + +## Load + +```@repl jwst + +fpath = let + data_dir = joinpath("..", "..", "data") + mkpath(data_dir) + joinpath(data_dir, "jwst.asdf") +end; + +if !isfile(fpath) + using Downloads: download + + download("https://data.science.stsci.edu/redirect/Roman/Roman_Data_Workshop/ADASS2024/jwst.asdf", fpath) +end +``` + +```@repl jwst +using ASDF + +af = load(fpath; extensions = true) +``` + +## Plot + +```@example jwst +using CairoMakie + +img_sci = let + img = af["data"][] + img[img .< 0] .= 1 + img +end + +fig, ax, hm = heatmap(img_sci; + colorrange = (1, 1e3), + colorscale = log10, + colormap = :cividis, + nan_color = :limegreen, # NaNs are handled automatically +) + +Colorbar(fig[1, 2], hm) + +fig +``` diff --git a/docs/src/examples/roman.md b/docs/src/examples/roman.md new file mode 100644 index 0000000..3382cd5 --- /dev/null +++ b/docs/src/examples/roman.md @@ -0,0 +1,55 @@ +# Roman + +*Adapted from [STScI Roman Notebooks](https://spacetelescope.github.io/roman_notebooks/notebooks/working_with_asdf/working_with_asdf.html).* + +In this example, we show how to use ASDF.jl to load and view some simulated astronomical data created in preparation for the future ([Nancy Grace Roman Space Telescope](https://science.nasa.gov/mission/roman-space-telescope/)) mission. + +!!! note "Data availability" + Simulated data products are currently provided by STScI via [AWS S3 buckets](https://spacetelescope.github.io/roman_notebooks/notebooks/data_discovery_and_access/data_discovery_and_access.html). Note: The data product used for this example is a moderately large file (~300 MB). + +## Load + +```@repl roman +fpath = let + data_dir = joinpath("..", "..", "data") + mkpath(data_dir) + joinpath(data_dir, "roman.asdf") +end; + +if !isfile(fpath) + using AWSS3, AWS + aws_config = AWS.AWSConfig(; creds = nothing, region = "us-east-1") + AWSConfig(nothing, "us-east-1", "json", 3) + + # This is a large file, will take some time to download + s3_get_file( + aws_config, + "stpubdata", + "roman/nexus/soc_simulations/tutorial_data/r0003201001001001004_0001_wfi01_f106_cal.asdf", + fpath + ) +end +``` + +```@repl roman +using ASDF + +af = load(fpath; extensions = true, validate_checksum = false) +``` + +## Plot + +```@example roman +using CairoMakie + +img = af["roman"]["data"][] + +fig, ax, hm = heatmap(img[begin:1000, begin:1000]; colorscale = asinh, colorrange = (0.5, 4)) + +Colorbar(fig[1, 2], hm) + +fig +``` + +!!! note + Some ASDF files produced by the Python implementation of ASDF may save a checksum in its header block computed from the original decompressed file. This will cause ASDF.jl to fail because in constrast, it computes the checksum based on the compressed (i.e., "used data"), as per the [current specification for ASDF](https://www.asdf-format.org/projects/asdf-standard/en/1.0.1/file_layout.html#block-header). To handle this potenial failure mode, we pass `validate_checksum = false` to avoid running the default checksum. diff --git a/docs/src/index.md b/docs/src/index.md index fd82086..dad8695 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,5 +1,34 @@ # ASDF.jl -```@autodocs -Modules = [ASDF] +A new [Advanced Scientific Data Format (ASDF)](https://asdf-standard.readthedocs.io/en/latest/index.html) package, written in Julia. + + +## Quickstart + + +### Installation + +```julia-repl +pkg> add ASDF, OrderedCollections +``` + +### Usage + +```@repl +using ASDF, OrderedCollections + +doc = OrderedDict( + "field_1" => [5, 6, 7, 8], + "field_2" => ["up", "down", "left", "right"], + "field_3" => OrderedDict( + "field_3a" => ["apple", "orange", "pear"], + "field_3b" => [1.0, 2.0, 3.0], + ) +); + +save("example.asdf", doc) + +af = load("example.asdf") + +ASDF.info(af; max_rows = 3) ``` diff --git a/docs/src/intro.md b/docs/src/intro.md new file mode 100644 index 0000000..8d799b7 --- /dev/null +++ b/docs/src/intro.md @@ -0,0 +1,231 @@ +# Introduction + +*Adapted from [ADASS 2024 workshop](https://github.com/asdf-format/asdf-adass2024/blob/main/03_creating_asdf_files/Creating_ASDF_Files.ipynb).* + +The ASDF file format is based on the human-readable [YAML](http://yaml.org/) standard, extended with efficient binary blocks to store array data. Basic arithmetic types (`Bool`, `Int`, `Float`, `Complex`) and `String` types are supported out of the box. Other types (structures) need to be declared to be supported. + +ASDF supports arbitrary array strides, both C (Python) and Fortran (Julia) memory layouts, as well as compression. The YAML metadata can contain arbitrary information corresponding to scalars, arrays, or dictionaries. + +The ASDF file format targets a similar audience as the [HDF5](https://www.hdfgroup.org/solutions/hdf5/) format. + +## Getting started + +ASDF files are initially created as a dictionary with arbitrarily nested data: + +```jldoctest intro +julia> using OrderedCollections + +julia> af_payload = OrderedDict("field_1" => [5, 6, 7, 8], "field_2" => ["up", "down", "left", "right"], "field_3" => OrderedDict("field_3a" => ["apple", "orange", "pear"], "field_3b" => [1.0, 2.0, 3.0])); +``` + +!!! note + We use an `OrderedDict` from [OrderedCollections.jl](https://github.com/JuliaCollections/OrderedCollections.jl) to preserve the order of data on write and load, and to maximize compatibility with YAML.jl. For a potential alternative using [Dictionaries.jl](https://github.com/andyferris/Dictionaries.jl), see this [experimental branch](https://github.com/JuliaAstro/ASDF.jl/tree/dictionaries). + +ASDF.jl is registered with [FileIO.jl](https://juliaio.github.io/FileIO.jl/stable/), so this data can be written to the ASDF file format with the generic [`save`](@ref) function: + +```jldoctest intro +julia> using ASDF + +julia> save("intro.asdf", af_payload) +``` + +The saved file contains the following human-readable contents: + +!!! details "View file" + ```jldoctest intro + julia> read("intro.asdf", String) |> print + #ASDF 1.0.0 + #ASDF_STANDARD 1.2.0 + # This is an ASDF file + %YAML 1.1 + %TAG ! tag:stsci.edu:asdf/ + --- + !core/asdf-1.1.0 + field_1: + - 5 + - 6 + - 7 + - 8 + field_2: + - "up" + - "down" + - "left" + - "right" + field_3: + field_3a: + - "apple" + - "orange" + - "pear" + field_3b: + - 1.0 + - 2.0 + - 3.0 + asdf/library: !core/software-1.0.0 + name: "ASDF.jl" + author: "Erik Schnetter " + homepage: "https://github.com/JuliaAstro/ASDF.jl" + version: "2.0.0" + ... + #ASDF BLOCK INDEX + %YAML 1.1 + --- + [] + ... + ``` + +which can be loaded back with FileIO.jl's generic [`load`](@ref) function: + +```jldoctest intro +julia> af = load("intro.asdf") +intro.asdf +├─ field_1::Vector{Int64} | shape = (4,) +├─ field_2::Vector{String} | shape = (4,) +├─ field_3::String +│ ├─ field_3a::Vector{String} | shape = (3,) +│ └─ field_3b::Vector{Float64} | shape = (3,) +└─ asdf/library::String + ├─ author::String | Erik Schnetter + ├─ homepage::String | https://github.com/JuliaAstro/ASDF.jl + ├─ name::String | ASDF.jl + └─ version::String | 2.0.0 +``` + +This is stored as an [`ASDF.ASDFFile`](@ref). To change the number of rows shown, pass this object to [`ASDF.info`](@ref): + +```jldoctest intro +julia> ASDF.info(af; max_rows = 3) +intro.asdf +├─ field_1::Vector{Int64} | shape = (4,) +├─ field_2::Vector{String} | shape = (4,) + ⋮ (8) more rows +``` + +It contains a `metadata` field, which is a new dictionary that merges information about this library (stored under the `asdf/library` key) with the original user-defined `af_payload` dictionary. For convenience, `af.metadata[]` can be accessed directly as `af[key]`. Since the underlying data is a dictionary, it can be modified in the standard way: + +```jldoctest intro +julia> af["field_1"] = [50, 60, 70, 80]; + +julia> af["field_1"] +4-element Vector{Int64}: + 50 + 60 + 70 + 80 +``` + +The convenience syntax can also be used to save the modified `ASDF.ASDFFile` object directly: + +```jldoctest intro +julia> save("intro_modified.asdf", af) +``` + +!!! details "View file" + ```jldoctest intro + julia> read("intro_modified.asdf", String) |> print + #ASDF 1.0.0 + #ASDF_STANDARD 1.2.0 + # This is an ASDF file + %YAML 1.1 + %TAG ! tag:stsci.edu:asdf/ + --- + !core/asdf-1.1.0 + field_1: + - 50 + - 60 + - 70 + - 80 + field_2: + - "up" + - "down" + - "left" + - "right" + field_3: + field_3a: + - "apple" + - "orange" + - "pear" + field_3b: + - 1.0 + - 2.0 + - 3.0 + asdf/library: !core/software-1.0.0 + name: "ASDF.jl" + author: "Erik Schnetter " + homepage: "https://github.com/JuliaAstro/ASDF.jl" + version: "2.0.0" + ... + #ASDF BLOCK INDEX + %YAML 1.1 + --- + [] + ... + ``` + +## Array storage + +By default, array data is written inline as a literal to the ASDF file. This can be stored and later accessed more efficiently by wrapping your data in an [`ASDF.NDArrayWrapper`](@ref). This allows for your data to be stored as a binary via the `inline = false` keyword (default), which can be further optimized by specifying a supported [compression algorithm](@ref ASDF.Compression) to use via the `compression` keyword: + +```jldoctest intro +julia> af_payload = OrderedDict("meta" => OrderedDict("my" => OrderedDict("nested" => "metadata")), "data" => ASDF.NDArrayWrapper([1, 2, 3, 4]; compression = ASDF.C_Bzip2)); + +julia> save("intro_compressed.asdf", af_payload) + +julia> af = load("intro_compressed.asdf") +intro_compressed.asdf +├─ meta::String +│ └─ my::String +│ └─ nested::String | metadata +├─ data::ASDF.NDArray | shape = [4] +└─ asdf/library::String + ├─ author::String | Erik Schnetter + ├─ homepage::String | https://github.com/JuliaAstro/ASDF.jl + ├─ name::String | ASDF.jl + └─ version::String | 2.0.0 +``` + +!!! details "View file" + ```julia-repl + julia> read("intro_compressed.asdf", String) |> print + #ASDF 1.0.0 + #ASDF_STANDARD 1.2.0 + # This is an ASDF file + %YAML 1.1 + %TAG ! tag:stsci.edu:asdf/ + --- + !core/asdf-1.1.0 + meta: + my: + nested: "metadata" + data: !core/ndarray-1.0.0 + source: 0 + shape: + - 4 + datatype: "int64" + byteorder: "little" + asdf/library: !core/software-1.0.0 + name: "ASDF.jl" + author: "Erik Schnetter " + homepage: "https://github.com/JuliaAstro/ASDF.jl" + version: "2.0.0" + ... + �BLK0 f�0xj�sq���r#ASDF BLOCK INDEX + %YAML 1.1 + --- + [463,] + ... + ``` + +Using `NDArrayWrapper` allows for the wrapped data to be lazily accessed as a strided view. To access the underlying data, use the `[]` (dereference) syntax: + +```jldoctest intro +julia> af["data"][] +4-element reshape(reinterpret(Int64, ::StridedViews.StridedView{UInt8, 2, Memory{UInt8}, typeof(identity)}), 4) with eltype Int64: + 1 + 2 + 3 + 4 +``` + +## Tagged objects + +Come back soon to see how custom Julia objects can be handled in ASDF.jl. diff --git a/src/ASDF.jl b/src/ASDF.jl index e9f40e5..93375ec 100644 --- a/src/ASDF.jl +++ b/src/ASDF.jl @@ -25,6 +25,21 @@ const software_version = string(PkgVersion.@Version) ################################################################################ +""" +Identifies the compression algorithm used for a data block. Available variants: + +| Scheme | 4-byte key | Backend | Description | +| :--------------- | :------------- | :--------------------- | :------------------------------------------------------------ | +| `C_None` | `\\0\\0\\0\\0` | -- | Fast I/O speed, no CPU overhead | +| `C_Blosc` | `blsc` | ChunkCodecLibBlosc.jl | Multi-threaded, shuffle-aware, best with typed numeric arrays | +| `C_Blosc2` | `bls2` | See [Issue #49](https://github.com/JuliaIO/ChunkCodecs.jl/issues/49) | Like Blosc but supports more than 2 GB of data | +| `C_Bzip2` | `bzp2` | ChunkCodecLibBzip2.jl | Good ratio, moderate speed (default) | +| `C_Lz4` (:block) | `lz4\\0` | ChunkCodecLibLz4.jl | Fastest decompression, Python-compatible | +| `C_Lz4` (:frame) | `lz4\\0` | ChunkCodecLibLz4.jl | LZ4 frame format for non-Python consumers | +| `C_Xz` | `xz\\0\\0` | CodecXz.jl | Highest compression ratio, slowest | +| `C_Zlib` | `zlib` | ChunkCodecLibZlib.jl | Broad compatibility | +| `C_Zstd` | `zstd` | ChunkCodecLibZstd.jl | Best ratio/speed trade-off | +""" @enum Compression C_None C_Blosc C_Blosc2 C_Bzip2 C_Lz4 C_Xz C_Zlib C_Zstd const compression_keys = Dict{Compression,Vector{UInt8}}( @@ -45,11 +60,47 @@ const compression_enums = Dict{Vector{UInt8},Compression}( ) ################################################################################ +""" + BlockHeader + +Parsed representation of a single ASDF binary block header. + +Every binary block in an ASDF file begins with a fixed-layout header that describes the block's compression, size, and integrity checksum. `BlockHeader` captures all decoded fields from that header, together with the `IO` handle and file position needed to subsequently read the block's data payload. + +# Fields + +| Field | Description | +| :------------------ | :---------- | +| `io` | The open file handle from which the block's data can be read. | +| `position` | Absolute byte offset of the block magic token within `io`. | +| `token` | 4-byte magic token. Always equal to [`block_magic_token`](@ref) (`\\323BLK`). | +| `header_size` | Size of the extended header region in bytes (excludes the 6-byte prefix). | +| `flags` | Block flags. Bit 0 (`0x1`) indicates a streamed block (not currently supported). | +| `compression` | 4-byte compression key (e.g. `"zstd"`, `"bzp2"`). | +| `allocated_size` | Number of bytes allocated in the file for this block's data (≥ `used_size`). | +| `used_size` | Number of bytes of (compressed) data actually written. | +| `data_size` | Number of bytes of the *uncompressed* data. | +| `checksum` | 16-byte MD5 digest of the compressed data, or all zeros if omitted. | +| `validate_checksum` | When `true`, [`ASDF.read_block`](@ref) verifies the MD5 digest before returning data. | +# File layout + +The block occupies the following byte range within `io`, starting at `position`: + +| | | | +| :-- | :-- | :-- | +| `[position]` | 4 bytes | magic token | +| `[position + 4]` | 2 bytes | header_size (big-endian UInt16) | +| `[position + 6]` | header_size bytes | extended header fields | +| `[position + 6 + header_size]` | allocated_size bytes | data payload | + +The extended header (always 48 bytes in the current implementation) contains, in order: `flags` (4 B), `compression` (4 B), `allocated_size` (8 B), `used_size` (8 B), `data_size` (8 B), and `checksum` (16 B). + +All multi-byte integers in the header are stored in **big-endian** byte order. +""" struct BlockHeader io::IO position::Int64 - token::AbstractVector{UInt8} # length 4 header_size::UInt16 flags::UInt32 @@ -61,11 +112,37 @@ struct BlockHeader validate_checksum::Bool end +""" + LazyBlockHeaders + +A mutable container holding the complete list of [`ASDF.BlockHeader`](@ref) values scanned from an ASDF file, shared by reference with every [`ASDF.NDArray`](@ref) +and [`ASDF.ChunkedNDArray`](@ref) constructed during parsing. + +# Reference sharing + +Every [`ASDF.NDArray`](@ref) created during parsing of a given file holds a reference to the same `LazyBlockHeaders` instance. When `ndarray[]` is called, it indexes into `lazy_block_headers.block_headers` using the array's zero-based `source` field: + +```julia +header = ndarray.lazy_block_headers.block_headers[ndarray.source + 1] +data = ASDF.read_block(header) +``` + +Because `block_headers` is populated after all `NDArray` objects are constructed, no array needs to be updated individually when block scanning completes. The shared mutable reference propagates the result automatically. + +# Mutability + +`LazyBlockHeaders` is a `mutable struct` solely to allow `block_headers` to be assigned after construction. The field is written exactly once per file load, immediately after `YAML.load` within [`ASDF.load_file`](@ref) returns. It is never modified again during normal use. Treat it as effectively immutable after [`load_file`](@ref) returns. +""" mutable struct LazyBlockHeaders block_headers::Vector{BlockHeader} LazyBlockHeaders() = new(BlockHeader[]) end +""" + block_magic_token + +The 4-byte sentinel `UInt8[0xd3, 0x42, 0x4c, 0x4b]`. +""" const block_magic_token = UInt8[0xd3, 0x42, 0x4c, 0x4b] # "\323BLK" find_first_block(io::IO) = find_next_block(io, Int64(0)) @@ -119,6 +196,11 @@ native2big_U16(val::Integer) = native2big_U16(UInt16(val)) native2big_U32(val::Integer) = native2big_U32(UInt32(val)) native2big_U64(val::Integer) = native2big_U64(UInt64(val)) +""" + read_block_header(io::IO, position::Int64; validate_checksum::Bool) + +Constructs an [`ASDF.BlockHeader`](@ref) by parsing the file at a given offset. +""" function read_block_header(io::IO, position::Int64; validate_checksum::Bool) # Read block header max_header_size = 6 + 48 @@ -156,6 +238,11 @@ function read_block_header(io::IO, position::Int64; validate_checksum::Bool) return BlockHeader(io, position, token, header_size, flags, compression, allocated_size, used_size, data_size, checksum, validate_checksum) end +""" + find_all_blocks(io::IO, pos::Int64=Int64(0); validate_checksum::Bool) + +Scans an `IO` stream and returns all [`ASDF.BlockHeader`](@ref) values found. +""" function find_all_blocks(io::IO, pos::Int64=Int64(0); validate_checksum::Bool) headers = BlockHeader[] pos = find_next_block(io, pos) @@ -168,6 +255,11 @@ function find_all_blocks(io::IO, pos::Int64=Int64(0); validate_checksum::Bool) return headers end +""" + read_block(header::BlockHeader) + +Uses an [`ASDF.BlockHeader`](@ref) to read, verify, and decompress the data payload. +""" function read_block(header::BlockHeader) block_data_start = header.position + 6 + header.header_size seek(header.io, block_data_start) @@ -260,7 +352,12 @@ end ################################################################################ """ - @enum ASDF.Byteorder Byteorder_little Byteorder_big + ASDF.Byteorder + +Represents the byte order of array data stored in a block. Available variants: + +- `Byteorder_little` : Little-endian +- `Byteorder_big`: Big-endian """ @enum Byteorder Byteorder_little Byteorder_big const byteorder_string_dict = Dict{Byteorder,String}(Byteorder_little => "little", Byteorder_big => "big") @@ -268,21 +365,54 @@ const string_byteorder_dict = Dict{String,Byteorder}(val => key for (key, val) i """ ASDF.Byteorder(str::AbstractString)::Byteorder + +Convenience conversion for [`ASDF.Byteorder`](@ref). Inverse of [`Base.string(byteorder::Byteorder)`](@ref). + +# Examples + +```jldoctest +julia> ASDF.Byteorder("little") +Byteorder_little::Byteorder = 0 +``` """ Byteorder(str::AbstractString) = string_byteorder_dict[str] """ string(byteorder::Byteorder)::AbstractString + +Convenience conversion for [`ASDF.Byteorder`](@ref). Inverse of [`ASDF.Byteorder(str::AbstractString)`](@ref). + +# Examples + +```jldoctest +julia> string(ASDF.Byteorder_little) +"little" +``` """ Base.string(byteorder::Byteorder) = byteorder_string_dict[byteorder] Base.show(io::IO, byteorder::Byteorder) = show(io, string(byteorder)) +""" + host_byteorder + +Native byte order of the running machine, detected at load time. Defined by [`ASDF.Byteorder`](@ref). +""" const host_byteorder = reinterpret(UInt8, UInt16[1])[1] == 1 ? Byteorder_little : Byteorder_big ################################################################################ """ -Careful, there is also `Base.DataType`, which is a different type. +Maps ASDF datatype strings to Julia types. Note this is unrelated to `Base.DataType`. Defined mappings: + +| ASDF string | Julia type | +| :---------------------- | :---------------------- | +| `bool8` | `Bool` | +| `int8` ... `int128` | `Int8` ... `Int128` | +| `uint8` ... `uint128` | `UInt8` ... `UInt128` | +| `float16` ... `float64` | `Float16` ... `Float64` | +| `complex32` | `Complex{Float16}` | +| `complex64` | `Complex{Float32}` | +| `complex128` | `Complex{Float64}` | """ @enum Datatype begin Datatype_bool8 @@ -353,9 +483,28 @@ Datatype(type::Type) = type_datatype_dict[type] ################################################################################ +""" + NDArray + +A lazily-materialized N-dimensional array stored in an ASDF file, either as a binary block or inline within the ASDF file. + +`NDArray` is the in-memory representation of an `!core/ndarray-1.0.0` YAML node. It holds the array's shape, type, and layout metadata, but defers reading and decompressing block data until the array is explicitly materialized by calling [`Base.getindex(ndarray::NDArray)`](@ref). + +# Fields + +| Field | Description | +| :-------------------- | :---------- | +| `lazy_block_headers` | Reference to the file's block header list. Used to resolve `source` indices at materialization time. | +| `source` | Zero-based index of the backing binary block, or `nothing` for inline arrays. | +| `data` | In-memory array for inline data, or `nothing` for block-backed arrays. | +| `shape` | Array dimensions in **Python/C (row-major) order** — outermost dimension first. The equivalent Julia shape is `reverse(shape)`. | +| `datatype` | Element type, as an [`ASDF.Datatype`](@ref) enum value. Convert to a Julia type with `Type(datatype)`. | +| `byteorder` | Byte order of the stored data (`Byteorder_little` or `Byteorder_big`). | +| `offset` | Byte offset from the start of the block to the first array element. Non-negative. | +| `strides` | Byte strides in **Python/C (row-major) order**. Must all be positive and `length(strides) == length(shape)`. | +""" struct NDArray lazy_block_headers::LazyBlockHeaders - source::Union{Nothing,Int64,AbstractString} data::Union{Nothing,AbstractArray} shape::Vector{Int64} @@ -461,6 +610,17 @@ function make_construct_yaml_ndarray(block_headers::LazyBlockHeaders) return construct_yaml_ndarray end +""" + Base.getindex(ndarray::NDArray) + +Returns the fully materialized array. See [`ASDF.NDArray`](@ref) for definitions. When block-backed (`source` !== `nothing`), this reads and decompresses the block, applies offset and strides via a [`StridedViews.StridedView`](https://github.com/QuantumKitHub/StridedViews.jl), reinterprets bytes to `Type(datatype)`, and byte-swaps if `byteorder != host_byteorder`. The returned array satisfies: + +```julia +size(result) == Tuple(reverse(ndarray.shape)) +eltype(result) == Type(ndarray.datatype) +sizeof(eltype) .* strides(result) == Tuple(reverse(ndarray.strides)) +``` +""" function Base.getindex(ndarray::NDArray) if ndarray.data !== nothing data = ndarray.data @@ -506,6 +666,18 @@ end ################################################################################ +""" + NDArrayChunk + +A positioned tile within an [`ASDF.ChunkedNDArray`](@ref), pairing an [`ASDF.NDArray`](@ref) with a zero-based origin that locates the tile in the parent array's coordinate space. + +# Fields + +| Field | Description | +| :-------- | :---------- | +| `start` | Zero-based origin of the tile in **Python/C (row-major) order**, outermost dimension first. All elements must be non-negative. `length(start)` must equal `length(ndarray.strides)`. | +| `ndarray` | The tile data, including its own shape, datatype, byte order, and backing block reference. | +""" struct NDArrayChunk start::Vector{Int64} ndarray::NDArray @@ -535,6 +707,21 @@ function make_construct_yaml_ndarray_chunk(block_headers::LazyBlockHeaders) return construct_yaml_ndarray_chunk end +""" + ChunkedNDArray + +A logical N-dimensional array assembled from a collection of arbitrarily-positioned [`ASDF.NDArrayChunk`](@ref) tiles, each backed by its own binary block or inline data. + +`ChunkedNDArray` is the in-memory representation of a `!core/chunked-ndarray-1.X.Y` YAML node. It defines the shape and element type of the full logical array, but defers all data access to the individual chunks. The full array is only allocated and populated when [`Base.getindex(ndarray::NDArray)`](@ref) is called. + +# Fields + +| Field | Description | +| :--------- | :---------- | +| `shape` | Dimensions of the full logical array in **Python/C (row-major) order**, outermost dimension first. All elements must be non-negative. The equivalent Julia shape is `reverse(shape)`. | +| `datatype` | Element type shared by all chunks. Convert to a Julia type with `Type(datatype)`. | +| `chunks` | Ordered collection of tiles that together populate the logical array. Tiles are written in iteration order when materializing. Later tiles overwrite earlier ones in any overlapping region. | +""" struct ChunkedNDArray shape::Vector{Int64} datatype::Datatype @@ -583,6 +770,11 @@ function make_construct_yaml_chunked_ndarray(block_headers::LazyBlockHeaders) return construct_yaml_chunked_ndarray end +""" + Base.getindex(chunked_ndarray::ChunkedNDArray) + +Allocates a dense array of shape `reverse(shape)` and fills it by calling `chunk.ndarray[]` for each chunk, placing the result at the correct offset. +""" function Base.getindex(chunked_ndarray::ChunkedNDArray) shape = chunked_ndarray.shape datatype = Type(chunked_ndarray.datatype) @@ -597,6 +789,19 @@ end ################################################################################ +""" + ASDFFile + +The in-memory representation of a loaded ASDF file, combining the parsed YAML metadata tree with the binary block infrastructure needed to lazily materialize any array data it references. + +# Fields + +| Field | Description | +| :-------------------- | :---------- | +| `filename` | Path of the file on disk, as passed to [`ASDF.load_file`](@ref). Used for display and diagnostics only. The file is not kept open after loading. | +| `metadata` | The fully parsed YAML tree. Keys are strings matching the top-level YAML keys. Values may be any Julia type produced by the YAML constructors, including [`ASDF.NDArray`](@ref), [`ASDF.ChunkedNDArray`](@ref), nested `Dict`s, `Vector`s, and scalars. | +| `lazy_block_headers` | All binary block headers found in the file, scanned once at load time. Shared by reference with every [`ASDF.NDArray`](@ref) in `metadata`, allowing them to locate and read their backing blocks on demand. | +""" struct ASDFFile filename::AbstractString metadata::OrderedDict{Any,Any} @@ -608,6 +813,7 @@ function YAML.write(file::ASDFFile) end Base.getindex(af::ASDFFile, key) = af.metadata[key] +Base.setindex!(af::ASDFFile, value, key) = (af.metadata[key] = value) struct ASDFTreeNode key::Any @@ -635,7 +841,7 @@ Display up to `max_rows` lines of `af` tree. `Base.show` calls this function int ```jldoctest julia> using OrderedCollections: OrderedDict -julia> doc = OrderedDict("field_\$(i)" => rand(10) for i in 1:25); +julia> doc = OrderedDict(string("field_", i) => rand(10) for i in 1:25); julia> save("long.asdf", doc) @@ -725,14 +931,30 @@ Base.show(io::IO, ::MIME"text/plain", af::ASDFFile) = info(io, af) # Display up ################################################################################ -ordered_map_constructor = (constructor, node) -> YAML.construct_mapping(OrderedDict{Any,Any}, constructor, node) -asdf_constructors = copy(YAML.default_yaml_constructors) -delete!(asdf_constructors, "tag:yaml.org,2002:map") # Let dicttype= handle plain maps -asdf_constructors["tag:stsci.edu:asdf/core/asdf-1.1.0"] = ordered_map_constructor -asdf_constructors["tag:stsci.edu:asdf/core/software-1.0.0"] = ordered_map_constructor -asdf_constructors["tag:stsci.edu:asdf/core/extension_metadata-1.0.0"] = ordered_map_constructor +""" + load_file(filename::AbstractString; extensions = false, validate_checksum = true) + +Reads an ASDF file from disk. + +| Parameter | Description | +| :------------------ | :--------------------------------------------------------------------------------------------------------------- | +| `filename` | Path to the `.asdf` file | +| `extensions` | When `true`, unknown YAML tags are parsed leniently (as maps, sequences, or scalars) instead of raising an error | +| `validate_checksum` | When `true`, each block's MD5 checksum is verified against the stored value | +Block data is located lazily. Block headers are scanned after the YAML is parsed, and array data (`ndarray`) is read only when [`Base.getindex(ndarray::NDArray)`](@ref) is called, i.e., `ndarray[]`. + +!!! note "File handle lifetime" + The file handle opened by `load_file` is retained for the lifetime of the returned [`ASDF.ASDFFile`](@ref) so that block data can be read on demand. Do not move, truncate, or overwrite the source file while any [`ASDF.NDArray`](@ref) from it may still be accessed. +""" function load_file(filename::AbstractString; extensions = false, validate_checksum = true) + ordered_map_constructor = (constructor, node) -> YAML.construct_mapping(OrderedDict{Any,Any}, constructor, node) + asdf_constructors = copy(YAML.default_yaml_constructors) + delete!(asdf_constructors, "tag:yaml.org,2002:map") # Let dicttype= handle plain maps + asdf_constructors["tag:stsci.edu:asdf/core/asdf-1.1.0"] = ordered_map_constructor + asdf_constructors["tag:stsci.edu:asdf/core/software-1.0.0"] = ordered_map_constructor + asdf_constructors["tag:stsci.edu:asdf/core/extension_metadata-1.0.0"] = ordered_map_constructor + if extensions # Use fallbacks for now asdf_constructors[nothing] = (constructor, node) -> begin @@ -774,7 +996,7 @@ Load an asdf file at filepath `f`. ```jldoctest julia> using OrderedCollections: OrderedDict -julia> doc = OrderedDict("field_\$(i)" => rand(10) for i in 1:5); # Create some sample data +julia> doc = OrderedDict(string("field_", i) => rand(10) for i in 1:5); # Create some sample data julia> save("myfile.asdf", doc) @@ -792,8 +1014,8 @@ myfile.asdf └─ version::String | 2.0.0 ``` """ -function fileio_load(f::File{format"ASDF"}) - return load_file(f.filename) +function fileio_load(f::File{format"ASDF"}; kwargs...) + return load_file(f.filename; kwargs...) end @doc (@doc fileio_load) load @@ -802,6 +1024,11 @@ end ################################################################################ ################################################################################ +""" + ASDFLibrary + +Software provenance metadata, serialized as a `!core/software-1.0.0 YAML` tag. [`ASDF.write_file`] inserts an entry automatically under the key `"asdf/library"` if one is not already present, using the package's own name, author, homepage, and version. +""" struct ASDFLibrary name::AbstractString author::AbstractString @@ -814,6 +1041,20 @@ function YAML._print(io::IO, val::ASDFLibrary, level::Int=0, ignore_level::Bool= YAML._print(io, library, level, ignore_level) end +""" + NDArrayWrapper + +A write-side wrapper around a Julia array that carries compression and layout options. Used as the value type when building a document dict for [`ASDF.write_file`](@ref). + +Parameter | Default | Description | +| :---------- | :-------- | :------------------------------------------------------------------------ | +`compression` | `C_Bzip2` | Applied compression scheme | +`inline` | `false` | Embed data in YAML instead of a binary block | +`lz4_layout` | `:block` | `:block` for Python-compatible chunked LZ4, `:frame` for LZ4 frame format | + +!!! note + If the compressed output is larger than the raw input, the block is stored uncompressed regardless of the chosen compression. +""" struct NDArrayWrapper array::AbstractArray compression::Compression @@ -825,6 +1066,24 @@ function NDArrayWrapper(array::AbstractArray; compression::Compression=C_Bzip2, end Base.getindex(val::NDArrayWrapper) = val.array +""" + Blocks + +Module-level accumulator that collects [`ASDF.NDArrayWrapper`](@ref) values and their corresponding file positions during a single call to [`ASDF.write_file`](@ref). + +`Blocks` acts as a two-phase write buffer. In the first phase, as the YAML tree is serialized, each non-inline [`ASDF.NDArrayWrapper`](@ref) appends itself to `arrays` and reserves a sequential source index. In the second phase, [`ASDF.write_file`](@ref) iterates over `arrays`, compresses and writes each block to disk, and records the resulting file offsets in `positions`. The finalized `positions` vector is then written as the ASDF block index at the end of the file. + +!!! warning "Not thread-safe" + A single instance of `Blocks` is held in the module-level constant `ASDF.blocks`. Because this global state is mutated by [`ASDF.write_file`](@ref), concurrent calls to `write_file` from multiple threads will corrupt each other's block lists. Do not call `write_file` concurrently. + +# Fields + +| Field | Description | +| :---------- | :---------- | +| `arrays` | Ordered list of arrays to be written as binary blocks, accumulated during YAML serialisation. The position of each wrapper in this vector is its zero-based block source index. | +| `positions` | Absolute byte offsets of each written block within the output file, populated during the block-writing phase of [`ASDF.write_file`](@ref). `positions[i]` corresponds to `arrays[i]`. | + +""" struct Blocks arrays::Vector{NDArrayWrapper} positions::Vector{Int64} @@ -905,6 +1164,18 @@ function encode_Lz4_block(input::AbstractVector{UInt8}; chunk_size::Int = 1024 * return out end +""" + write_file(filename::AbstractString, document::AbstractDict) + +Writes an ASDF file to disk. `document` is a plain `Dict` whose values may include [`NDArrayWrapper`](@ref) instances. These are serialized as binary blocks with appropriate compression. + +Layout of the output file: + +1. ASDF/YAML header (`#ASDF 1.0.0, #ASDF_STANDARD 1.2.0, %YAML 1.1`) +1. YAML tree (`!core/asdf-1.1.0`) +1. Binary blocks — one per [`NDArrayWrapper`](@ref) that has `inline == false` +1. Block index (`#ASDF BLOCK INDEX`) +""" function write_file(filename::AbstractString, document::AbstractDict) # Set up block descriptors global blocks @@ -1065,7 +1336,7 @@ Save `data` to an asdf file at filepath `f`. ```jldoctest julia> using OrderedCollections: OrderedDict -julia> data = OrderedDict("field_\$(i)" => rand(10) for i in 1:5); # Create some sample data +julia> data = OrderedDict(string("field_", i) => rand(10) for i in 1:5); # Create some sample data julia> save("myfile.asdf", data) ``` @@ -1074,6 +1345,8 @@ function fileio_save(f::File{format"ASDF"}, data) return write_file(f.filename, data) end +fileio_save(f::File{format"ASDF"}, af::ASDFFile) = save(f, af.metadata) + @doc (@doc fileio_save) save end