Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,284 changes: 0 additions & 1,284 deletions src/enrich/maintainer.rs

This file was deleted.

19 changes: 19 additions & 0 deletions src/enrich/maintainer/codeberg.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use super::types::{LookupError, MaintainerInfo};

/// Placeholder lookup for repositories hosted on Codeberg (Forgejo/Gitea v1).
///
/// URL parsing and host dispatch already route Codeberg repos here, but the
/// per-author first-commit query is intentionally not implemented: Gitea's
/// commits endpoint only gained reliable `?author=` filtering in v1.20, and
/// Codeberg's exact API version and behavior still need verification. Until
/// then every argument is ignored and the function reports "no finding", so
/// the enricher stays clean rather than guessing.
///
/// TODO: implement lookup once Forgejo v1.20+ per-author commit filter is
/// confirmed. API base would be https://codeberg.org/api/v1.
pub(super) fn lookup_codeberg_repo(
    _agent: &ureq::Agent,
    _owner: &str,
    _repo: &str,
    _token: Option<&str>,
    _now_secs: i64,
) -> std::result::Result<MaintainerInfo, LookupError> {
    // Always succeeds with an empty result; no network traffic is issued.
    let no_finding = MaintainerInfo { finding: None };
    Ok(no_finding)
}
118 changes: 118 additions & 0 deletions src/enrich/maintainer/github.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
use anyhow::Context;

use super::parsing::{parse_first_commit_date, parse_link_last_page, parse_top_contributor_login};
use super::types::{
GithubResponse, LookupError, MAX_CONTRIBUTORS_FOR_SIGNAL, MaintainerInfo, USER_AGENT,
};
use crate::enrich::maintainer::parsing::iso8601_to_unix_seconds;

/// Look up the maintainer-age signal for one GitHub `owner/repo`.
///
/// Issues up to four GET requests against `base_url`, in this order:
/// 1. the top contributor (`contributors?per_page=1`),
/// 2. a contributor-count estimate read from the `Link` header of
///    `contributors?per_page=1&anon=true`,
/// 3. the newest commit by the top contributor (`commits?author=..&per_page=1`),
/// 4. only when the `Link` header reports more than one page, the last page
///    of that listing, which holds the oldest commit.
///
/// Returns `finding: Some((login, commit_date, days))`, where `days` is
/// `(now_secs - commit_time) / 86_400`. Returns `finding: None` when the repo
/// is deliberately skipped: no top contributor in the response, more than
/// `MAX_CONTRIBUTORS_FOR_SIGNAL` contributors, no commit date in the
/// response, or a date that `iso8601_to_unix_seconds` rejects.
///
/// # Errors
///
/// Propagates any `LookupError` from `github_get`, and wraps response-parsing
/// failures in `LookupError::Other`.
pub(super) fn lookup_github_repo(
    agent: &ureq::Agent,
    base_url: &str,
    owner: &str,
    repo: &str,
    token: Option<&str>,
    now_secs: i64,
) -> std::result::Result<MaintainerInfo, LookupError> {
    const SECS_PER_DAY: i64 = 86_400;

    // Every endpoint used below hangs off the same repository prefix.
    let repo_api = format!("{base_url}/repos/{owner}/{repo}");

    // Step 1: top contributor (per_page=1 returns the highest-commit-count author).
    let top_resp = github_get(agent, &format!("{repo_api}/contributors?per_page=1"), token)?;
    let maybe_login = parse_top_contributor_login(&top_resp.body)
        .context("parsing top-contributor response from GitHub")
        .map_err(LookupError::Other)?;
    let Some(top_login) = maybe_login else {
        return Ok(MaintainerInfo { finding: None });
    };

    // Step 2: estimate the contributor count from the last-page number in the
    // Link header. With per_page=1 the page count equals the item count, so a
    // single request suffices. A missing header is treated as one contributor.
    let count_resp = github_get(
        agent,
        &format!("{repo_api}/contributors?per_page=1&anon=true"),
        token,
    )?;
    let contributor_count = parse_link_last_page(count_resp.link_header.as_deref()).unwrap_or(1);
    if contributor_count > MAX_CONTRIBUTORS_FOR_SIGNAL {
        return Ok(MaintainerInfo { finding: None });
    }

    // Step 3: oldest commit by that author. Commits come back newest-first,
    // so the LAST page of a per_page=1 listing holds the oldest commit.
    let author_commits_url = format!("{repo_api}/commits?author={top_login}&per_page=1");
    let newest = github_get(agent, &author_commits_url, token)?;
    let last_page = parse_link_last_page(newest.link_header.as_deref());
    let oldest_body = match last_page {
        Some(page) if page > 1 => {
            github_get(agent, &format!("{author_commits_url}&page={page}"), token)?.body
        }
        // No pagination, or a single page: the first response IS the last page.
        _ => newest.body,
    };

    let maybe_date = parse_first_commit_date(&oldest_body).map_err(LookupError::Other)?;
    let Some(date_str) = maybe_date else {
        return Ok(MaintainerInfo { finding: None });
    };
    let Some(commit_secs) = iso8601_to_unix_seconds(&date_str) else {
        return Ok(MaintainerInfo { finding: None });
    };

    let age_days = (now_secs - commit_secs) / SECS_PER_DAY;
    Ok(MaintainerInfo {
        finding: Some((top_login, date_str, age_days)),
    })
}

fn github_get(
agent: &ureq::Agent,
url: &str,
token: Option<&str>,
) -> std::result::Result<GithubResponse, LookupError> {
let mut req = agent
.get(url)
.set("user-agent", USER_AGENT)
.set("accept", "application/vnd.github+json")
.set("x-github-api-version", "2022-11-28");
if let Some(t) = token {
req = req.set("authorization", &format!("Bearer {t}"));
}
match req.call() {
Ok(resp) => {
let link_header = resp.header("link").map(str::to_string);
let body = resp
.into_string()
.context("reading GitHub response body")
.map_err(LookupError::Other)?;
Ok(GithubResponse { body, link_header })
}
Err(ureq::Error::Status(403, resp)) => {
if resp.header("x-ratelimit-remaining") == Some("0") {
Err(LookupError::RateLimited)
} else {
Err(LookupError::Other(anyhow::anyhow!(
"GitHub returned 403 for {url}"
)))
}
}
Err(ureq::Error::Status(404, _)) => {
// Not-found is a deliberate skip, not an error: the repo may have
// moved or been deleted. Surface as an empty body the callers parse
// as "no data".
Ok(GithubResponse {
body: "[]".to_string(),
link_header: None,
})
}
Err(e) => Err(LookupError::Other(
anyhow::Error::new(e).context(format!("GET {url} failed")),
)),
}
}
129 changes: 129 additions & 0 deletions src/enrich/maintainer/gitlab.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
use anyhow::Context;

use super::parsing::{
iso8601_to_unix_seconds, normalize_iso8601, parse_gitlab_first_commit_date,
parse_gitlab_top_contributor_name, parse_link_last_page, percent_encode,
};
use super::types::{
GITLAB_API_BASE, GitlabResponse, LookupError, MAX_CONTRIBUTORS_FOR_SIGNAL, MaintainerInfo,
USER_AGENT,
};

/// Look up the maintainer-age signal for one GitLab `owner/repo` via the v4
/// REST API rooted at `GITLAB_API_BASE`.
///
/// The contributor count comes from the `X-Total` header of the
/// top-contributor request, so no separate counting request is made. The
/// oldest commit is located through the `Link` header of the commits
/// listing. GitLab contributors are identified by commit author name/email
/// rather than a username, so the finding carries the author *name*.
///
/// Returns `finding: Some((author_name, normalized_date, days))`, where `days`
/// is `(now_secs - commit_time) / 86_400`. Returns `finding: None` when the
/// repo is skipped: `X-Total` missing or above `MAX_CONTRIBUTORS_FOR_SIGNAL`,
/// no contributor in the response, no commit date, or a date that cannot be
/// normalized or parsed.
///
/// # Errors
///
/// Propagates any `LookupError` from `gitlab_get`, and wraps response-parsing
/// failures in `LookupError::Other`.
pub(super) fn lookup_gitlab_repo(
    agent: &ureq::Agent,
    owner: &str,
    repo: &str,
    token: Option<&str>,
    now_secs: i64,
) -> std::result::Result<MaintainerInfo, LookupError> {
    const SECS_PER_DAY: i64 = 86_400;

    // GitLab addresses a project by its percent-encoded `owner/repo` path.
    let project_id = percent_encode(&format!("{owner}/{repo}"));
    let repository_api = format!("{GITLAB_API_BASE}/projects/{project_id}/repository");

    // Steps 1+2 combined: per_page=1 returns the top contributor by commit
    // count, and GitLab includes X-Total (total contributor count) on any
    // paginated response regardless of per_page.
    let contributors_url =
        format!("{repository_api}/contributors?order_by=commits&sort=desc&per_page=1");
    let top_resp = gitlab_get(agent, &contributors_url, token)?;

    // A missing X-Total is treated as "too many", so the repo is skipped.
    let contributor_count = top_resp.x_total.unwrap_or(u64::MAX);
    if contributor_count > MAX_CONTRIBUTORS_FOR_SIGNAL {
        return Ok(MaintainerInfo { finding: None });
    }

    let maybe_name = parse_gitlab_top_contributor_name(&top_resp.body)
        .context("parsing GitLab top-contributor response")
        .map_err(LookupError::Other)?;
    let Some(top_name) = maybe_name else {
        return Ok(MaintainerInfo { finding: None });
    };

    // Step 3: oldest commit by that author. The commits endpoint filters by
    // author name via ?author=<name> and lists newest-first, so the last page
    // of a per_page=1 listing holds the oldest commit.
    let author_enc = percent_encode(&top_name);
    let author_commits_url = format!("{repository_api}/commits?author={author_enc}&per_page=1");
    let newest = gitlab_get(agent, &author_commits_url, token)?;
    let last_page = parse_link_last_page(newest.link_header.as_deref());
    let oldest_body = match last_page {
        Some(page) if page > 1 => {
            gitlab_get(agent, &format!("{author_commits_url}&page={page}"), token)?.body
        }
        // No pagination, or a single page: the first response is the last page.
        _ => newest.body,
    };

    let maybe_date = parse_gitlab_first_commit_date(&oldest_body).map_err(LookupError::Other)?;
    let Some(date_str) = maybe_date else {
        return Ok(MaintainerInfo { finding: None });
    };

    // GitLab timestamps vary: "2024-04-15T12:34:56.000+00:00", "...Z", etc.
    // Normalize to YYYY-MM-DDTHH:MM:SSZ for our parser. Day-granularity
    // calculations absorb the small UTC-offset error.
    let Some(normalized) = normalize_iso8601(&date_str) else {
        return Ok(MaintainerInfo { finding: None });
    };
    let Some(commit_secs) = iso8601_to_unix_seconds(&normalized) else {
        return Ok(MaintainerInfo { finding: None });
    };

    let age_days = (now_secs - commit_secs) / SECS_PER_DAY;
    Ok(MaintainerInfo {
        finding: Some((top_name, normalized, age_days)),
    })
}

fn gitlab_get(
agent: &ureq::Agent,
url: &str,
token: Option<&str>,
) -> std::result::Result<GitlabResponse, LookupError> {
let mut req = agent.get(url).set("user-agent", USER_AGENT);
if let Some(t) = token {
req = req.set("PRIVATE-TOKEN", t);
}
match req.call() {
Ok(resp) => {
let link_header = resp.header("link").map(str::to_string);
let x_total = resp.header("x-total").and_then(|v| v.parse::<u64>().ok());
let body = resp
.into_string()
.context("reading GitLab response body")
.map_err(LookupError::Other)?;
Ok(GitlabResponse {
body,
link_header,
x_total,
})
}
Err(ureq::Error::Status(429, _)) => Err(LookupError::RateLimited),
Err(ureq::Error::Status(401 | 403 | 404, _)) => {
// 401/403: repo is private or token missing; skip silently.
// 404: repo gone or moved; skip.
Ok(GitlabResponse {
body: "[]".to_string(),
link_header: None,
x_total: Some(0),
})
}
Err(e) => Err(LookupError::Other(
anyhow::Error::new(e).context(format!("GET {url} failed")),
)),
}
}
79 changes: 79 additions & 0 deletions src/enrich/maintainer/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//! Maintainer-age enrichment: flag newly added dependencies hosted on GitHub,
//! GitLab, or Codeberg whose top contributor's first commit is suspiciously
//! recent.
//!
//! ## The signal
//!
//! The xz/`liblzma` backdoor of 2024 (CVE-2024-3094) was authored by a GitHub
//! identity ("Jia Tan") that started contributing two years before introducing
//! the malicious payload. The pattern -- a brand-new account becoming the de
//! facto sole maintainer of a low-traffic but widely-depended-upon package --
//! is a leading indicator of long-game supply-chain takeovers. We can't catch
//! Jia Tan in retrospect, but we can flag the next one earlier in their arc by
//! surfacing "this package's top contributor made their first commit less than
//! 90 days ago" at the moment a new dep is added.
//!
//! ## Threshold
//!
//! 90 days is intentionally aggressive. Most legitimate new packages will trip
//! this on initial introduction; that's fine -- a human reviewer can dismiss
//! "the package is brand-new and the author is its only maintainer" trivially.
//! The expensive miss is the **silent takeover** of an existing package by a
//! recently arrived contributor, which is what the 90-day window captures.
//! Tune later if the false-positive rate is unworkable in practice.
//!
//! ## Why no octocrab / no chrono
//!
//! `octocrab` pulls in `tokio` and ~70 transitive crates for what amounts to
//! three GET requests. `chrono` similarly bloats the dep tree for parsing one
//! ISO-8601 timestamp shape (GitHub always emits the canonical
//! `YYYY-MM-DDTHH:MM:SSZ`). Hand-rolled `ureq` calls and a 25-line ISO-8601
//! parser keep the binary under our 5 MB target. The same constraint applies
//! to GitLab and Codeberg; no new heavyweight dependencies are added.
//!
//! ## Network behavior
//!
//! Best-effort, mirrors the OSV enricher: per-request timeout 15 seconds,
//! errors surface as warnings on stderr, the diff still renders. Token env
//! vars raise rate limits: `GITHUB_TOKEN` (Bearer, GitHub REST), `GITLAB_TOKEN`
//! (PRIVATE-TOKEN header, GitLab v4), `CODEBERG_TOKEN` (Authorization: token,
//! Gitea v1). All three are optional; absent means unauthenticated requests,
//! fine for low volume.
//!
//! ## Skipped cases
//!
//! - Components without a `source_url` (CycloneDX `externalReferences[type=vcs]`
//! absent, etc.) -- silently skipped.
//! - Source URLs not matching github.com, gitlab.com, or codeberg.org --
//! silently skipped.
//! - Repositories with > 50 contributors -- skipped because the "top
//! contributor's first commit" loses meaning on monorepos and multi-vendor
//! projects (Linux, Kubernetes, React).
//! - Per-repo results are cached within a single bomdrift run so repeated
//! `cs.added` entries from the same project don't multiply HTTP requests.
//!
//! Always informational severity -- never trips fail-on.

mod codeberg;
mod github;
mod gitlab;
mod parsing;
mod pipeline;
mod types;

#[cfg(test)]
mod tests;

pub use pipeline::{enrich, enrich_with, enrich_with_hosts};
pub use types::{Host, MaintainerAgeFinding, YOUNG_MAINTAINER_DAYS};

// Re-export crate-visible helpers that callers outside this module historically
// accessed via `crate::enrich::maintainer::...`. The pre-split file made
// `parse_github_repo`, `parse_gitlab_repo`, `parse_codeberg_repo`,
// `parse_link_last_page`, and `iso8601_to_unix_seconds` `pub(crate)`; keep
// those paths working even if no in-tree caller currently uses them.
#[allow(unused_imports)]
pub(crate) use parsing::{
iso8601_to_unix_seconds, parse_codeberg_repo, parse_github_repo, parse_gitlab_repo,
parse_link_last_page,
};
Loading
Loading