Skip to content

Commit de9d02f

Browse files
authored
Merge pull request #17 from ccdavis/check-server-status
Add dataversion utility for extracting version info from data files
2 parents 09ca47e + 338efc9 commit de9d02f

7 files changed

Lines changed: 779 additions & 66 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ sql-builder="3.1"
1212
interner="*"
1313
compressed_string = "*"
1414
csv = "1.1"
15+
flate2 = "1.0"
1516
extended = "*"
1617
ascii = "*"
1718
bstr = "1.7.0"
@@ -33,6 +34,10 @@ path = "src/lib.rs"
3334
name = "abacus"
3435
path = "src/bin/abacus.rs"
3536

37+
[[bin]]
38+
name = "dataversion"
39+
path = "src/bin/dataversion.rs"
40+
3641
[[bench]]
3742
name = "tabulate_simple_request_benchmark"
3843
harness = false

src/bin/dataversion.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
//! A command-line utility to extract version information from IPUMS data files.
2+
//!
3+
//! This tool reads version metadata from both Parquet and fixed-width IPUMS data files
4+
//! and outputs it in either JSON or human-readable text format.
5+
//!
6+
//! # Usage
7+
//!
8+
//! ```bash
9+
//! # For parquet data (directory containing .parquet files)
10+
//! dataversion /pkg/ipums/usa/output_data/current/parquet/us2015b
11+
//!
12+
//! # For fixed-width data (.dat.gz file)
13+
//! dataversion /pkg/ipums/usa/output_data/current/us2015b_usa.dat.gz
14+
//!
15+
//! # Output as JSON (default is text)
16+
//! dataversion --format json /path/to/data
17+
//! ```
18+
19+
use cimdea::data_version::{extract_version, DataVersion};
20+
use clap::{Parser, ValueEnum};
21+
use std::process;
22+
23+
#[derive(Parser, Debug)]
24+
#[command(
25+
name = "dataversion",
26+
version,
27+
about = "Extract version information from IPUMS data files",
28+
long_about = "Extract version information from IPUMS data files.\n\n\
29+
Supports both Parquet and fixed-width (.dat.gz) formats.\n\
30+
Version information includes release numbers, commit hashes,\n\
31+
branch names, and other build metadata."
32+
)]
33+
struct Args {
34+
/// Path to the data file or directory.
35+
///
36+
/// For Parquet: path to a directory containing .parquet files
37+
/// (e.g., /pkg/ipums/usa/output_data/current/parquet/us2015b)
38+
///
39+
/// For fixed-width: path to a .dat.gz file
40+
/// (e.g., /pkg/ipums/usa/output_data/current/us2015b_usa.dat.gz)
41+
#[arg(value_name = "PATH")]
42+
path: String,
43+
44+
/// Output format
45+
#[arg(short, long, value_enum, default_value = "text")]
46+
format: OutputFormat,
47+
}
48+
49+
#[derive(Debug, Clone, Copy, ValueEnum)]
50+
enum OutputFormat {
51+
/// Human-readable text output
52+
Text,
53+
/// Machine-readable JSON output
54+
Json,
55+
}
56+
57+
fn main() {
58+
let args = Args::parse();
59+
60+
match extract_version(&args.path) {
61+
Ok(version) => {
62+
output_version(&version, args.format);
63+
}
64+
Err(e) => {
65+
eprintln!("Error: {}", e);
66+
process::exit(1);
67+
}
68+
}
69+
}
70+
71+
fn output_version(version: &DataVersion, format: OutputFormat) {
72+
match format {
73+
OutputFormat::Text => {
74+
println!("{}", version.to_text());
75+
}
76+
OutputFormat::Json => match version.to_json() {
77+
Ok(json) => println!("{}", json),
78+
Err(e) => {
79+
eprintln!("Error serializing to JSON: {}", e);
80+
process::exit(1);
81+
}
82+
},
83+
}
84+
}

src/conventions.rs

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,8 @@ impl MicroDataCollection {
155155
})?;
156156

157157
// Collect record types and filenames first to avoid borrow issues
158-
let record_types: Vec<(String, String)> = self.record_types
158+
let record_types: Vec<(String, String)> = self
159+
.record_types
159160
.keys()
160161
.map(|k| {
161162
let base_filename = self.base_filename_for_dataset_and_rectype(dataset_name, k);
@@ -254,10 +255,7 @@ impl MicroDataCollection {
254255
/// Takes a path like ../output_data/current/parquet/, which could be derived
255256
/// automatically from defaults based on data root or product root. Scans all
256257
/// parquet schema information and embedded metadata.
257-
pub fn load_metadata_from_all_parquet(
258-
&mut self,
259-
parquet_path: &Path,
260-
) -> Result<(), MdError> {
258+
pub fn load_metadata_from_all_parquet(&mut self, parquet_path: &Path) -> Result<(), MdError> {
261259
if !parquet_path.exists() {
262260
return Err(metadata_error!(
263261
"Parquet path does not exist: {}",
@@ -278,9 +276,8 @@ impl MicroDataCollection {
278276
let mut errors = Vec::new();
279277

280278
for entry in entries {
281-
let entry = entry.map_err(|e| {
282-
metadata_error!("Failed to read directory entry: {}", e)
283-
})?;
279+
let entry =
280+
entry.map_err(|e| metadata_error!("Failed to read directory entry: {}", e))?;
284281

285282
let path = entry.path();
286283
if path.is_dir() {

0 commit comments

Comments
 (0)