14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,20 @@ All notable changes to the RAJA project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.3.0] - 2026-03-19

### Added

- **Lake Formation–native Iceberg catalog flow**: DataZone can now import Glue tables registered under Lake Formation and expose them as assets.
- **`seed_glue_tables.py` script**: New seeding script that provisions Glue databases, registers S3 locations with Lake Formation, creates Iceberg tables, and drives the DataZone Glue import flow end to end.
- **LF-native PoC scripts** (`scripts/lf_native_poc/`): Exploratory scripts for the Lake Formation–native Iceberg catalog path, including a throwaway-subscriber creation helper and a package-tag import proof-of-concept.

### Changed

- **`DATAZONE_PROJECTS` declared in Terraform and fed back via `tf-outputs.json`**: The `DATAZONE_PROJECTS` env var is now sourced from Terraform outputs, eliminating config drift after domain recreation.
- **Lake Formation access granted to DataZone Glue import role**: The IAM role used by DataZone to import Glue assets now has the necessary Lake Formation permissions (`lakeformation:GetDataAccess`, `lakeformation:GrantPermissions`, etc.).
- **Seed scripts no longer use hardcoded project names**: `seed_glue_tables.py`, `seed_users.py`, and `seed_packages.py` derive project names from `seed-config.yaml` rather than hard-coded strings.

## [1.2.0] - 2026-03-18

### Added
357 changes: 330 additions & 27 deletions infra/terraform/main.tf

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions infra/terraform/outputs.tf
@@ -53,6 +53,11 @@ output "datazone_package_asset_type_revision" {
value = aws_datazone_asset_type.quilt_package.revision
}

output "iceberg_lf_database_name" {
description = "Lake Formation-native Glue database mirroring the Quilt Iceberg tables."
value = local.iceberg_enabled ? aws_glue_catalog_database.iceberg_lf[0].name : ""
}

output "control_plane_lambda_arn" {
description = "Control plane Lambda ARN."
value = aws_lambda_function.control_plane.arn
1 change: 1 addition & 0 deletions infra/terraform/terraform.tfvars.example
@@ -14,3 +14,4 @@ certificate_arn = ""
ecs_cpu_architecture = "ARM64"
rajee_task_cpu = 256
rajee_task_memory = 512
iceberg_s3_bucket = ""
12 changes: 12 additions & 0 deletions infra/terraform/variables.tf
@@ -121,6 +121,12 @@ variable "registry_accessor_arns" {
default = []
}

variable "iceberg_s3_bucket" {
description = "S3 bucket containing the Quilt Iceberg tables (without s3:// prefix)."
type = string
default = ""
}

variable "datazone_domain_name" {
description = "Amazon DataZone domain name for the RAJA package-grant POC."
type = string
@@ -150,3 +156,9 @@ variable "datazone_package_asset_type" {
type = string
default = "QuiltPackage"
}

variable "datazone_projects" {
description = "JSON blob mapping project keys to DataZone project_id/environment_id/project_label. Populated by sagemaker_gaps.py after environments are created and fed back in via TF_VAR_datazone_projects on subsequent runs."
type = string
default = ""
}
8 changes: 5 additions & 3 deletions pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "raja"
version = "1.2.0"
version = "1.3.0"
description = "Add your description here"
readme = "README.md"
authors = [
@@ -117,11 +117,12 @@ demo-package = { cmd = "pytest tests/integration/test_rajee_package_grant.py -v 
demo-translation = { cmd = "pytest tests/integration/test_rajee_translation_grant.py -v -s", help = "Run translation grant (TAJ-package) demonstrations" }

# AWS deployment
deploy = { sequence = ["_terraform-apply", "_wait-rajee-stable", "_sagemaker-gaps", "_seed-users", "_seed-packages", "_show-outputs"], help = "Deploy standalone RAJA stack, wait for RAJEE, fill V2 gaps, and seed integration fixtures" }
deploy = { sequence = ["_terraform-apply", "_wait-rajee-stable", "_sagemaker-gaps", "_seed-users", "_seed-packages", "_seed-glue-tables", "_show-outputs"], help = "Deploy standalone RAJA stack, wait for RAJEE, fill V2 gaps, and seed integration fixtures" }
deploy-fast = { sequence = ["deploy"], help = "Alias for Terraform deploy" }
destroy = { sequence = ["_terraform-destroy"], help = "Destroy Terraform stack" }
seed-users = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python -m scripts.seed_users", help = "Seed integration test principals into DataZone" }
seed-packages = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python -m scripts.seed_packages", help = "Seed quilt3 packages into raja-poc-registry from raja-poc-test and publish the DataZone listing" }
seed-glue-tables = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python -m scripts.seed_glue_tables", help = "Seed LF-native Glue table assets into DataZone and auto-approve subscriber project grants" }
sagemaker-gaps = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python scripts/sagemaker_gaps.py", help = "Fill current SageMaker Unified Studio V2 / DataZone Terraform gaps" }

# Docker image building
@@ -146,11 +147,12 @@ _format = { cmd = "uv run --extra dev ruff format src tests infra lambda_handler
_lint-fix = { cmd = "uv run --extra dev ruff check --fix src tests infra lambda_handlers", help = "Internal: fix lint issues" }
_typecheck = { cmd = "uv run --extra dev mypy src", help = "Internal: run type checker" }
_show-outputs = { cmd = "python scripts/show_outputs.py", help = "Internal: print deployment summary" }
_terraform-apply = { shell = "set -a; [ -f .env ] && . ./.env; set +a; if [ -z \"${RAJA_ADMIN_KEY:-}\" ]; then echo \"Missing RAJA_ADMIN_KEY\" >&2; exit 1; fi; export TF_VAR_raja_admin_key=${RAJA_ADMIN_KEY}; export TF_VAR_raja_default_principal_username=$(python3 -c 'import os; users=[u.strip() for u in os.environ.get(\"RAJA_USERS\", \"\").split(\",\") if u.strip()]; print(users[0] if users else \"\")'); cd infra/terraform && terraform init -input=false && terraform apply -auto-approve -input=false && terraform output -json | python3 -c \"import json,sys; print(json.dumps({k:v['value'] for k,v in json.load(sys.stdin).items()}))\" > ../tf-outputs.json", help = "Internal: deploy Terraform stack and persist outputs" }
_terraform-apply = { shell = "set -a; [ -f .env ] && . ./.env; set +a; if [ -z \"${RAJA_ADMIN_KEY:-}\" ]; then echo \"Missing RAJA_ADMIN_KEY\" >&2; exit 1; fi; export TF_VAR_raja_admin_key=${RAJA_ADMIN_KEY}; export TF_VAR_raja_default_principal_username=$(python3 -c 'import os; users=[u.strip() for u in os.environ.get(\"RAJA_USERS\", \"\").split(\",\") if u.strip()]; print(users[0] if users else \"\")'); export TF_VAR_datazone_projects=$(python3 -c 'import json,os; d=json.load(open(\"infra/tf-outputs.json\")) if os.path.exists(\"infra/tf-outputs.json\") else {}; print(d.get(\"datazone_projects\", \"\"))'); cd infra/terraform && terraform init -input=false && terraform apply -auto-approve -input=false && terraform output -json | python3 -c \"import json,sys; print(json.dumps({k:v['value'] for k,v in json.load(sys.stdin).items()}))\" > ../tf-outputs.json", help = "Internal: deploy Terraform stack and persist outputs" }
_wait-rajee-stable = { shell = "set -a; [ -f .env ] && . ./.env; set +a; aws ecs wait services-stable --cluster raja-standalone-rajee-cluster --services raja-standalone-rajee-service", help = "Internal: wait for the RAJEE ECS service to reach a stable state" }
_sagemaker-gaps = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python scripts/sagemaker_gaps.py", help = "Internal: fill current SageMaker Unified Studio V2 / DataZone Terraform gaps" }
_seed-users = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python -m scripts.seed_users", help = "Internal: seed integration test principals into DataZone" }
_seed-packages = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python -m scripts.seed_packages", help = "Internal: seed quilt packages and DataZone listings" }
_seed-glue-tables = { shell = "set -a; [ -f .env ] && . ./.env; set +a; uv run --extra aws python -m scripts.seed_glue_tables", help = "Internal: seed Glue table assets and DataZone subscriptions" }
_terraform-destroy = { shell = "set -a; [ -f .env ] && . ./.env; set +a; if [ -z \"${RAJA_ADMIN_KEY:-}\" ]; then echo \"Missing RAJA_ADMIN_KEY\" >&2; exit 1; fi; export TF_VAR_raja_admin_key=${RAJA_ADMIN_KEY}; export TF_VAR_raja_default_principal_username=$(python3 -c 'import os; users=[u.strip() for u in os.environ.get(\"RAJA_USERS\", \"\").split(\",\") if u.strip()]; print(users[0] if users else \"\")'); cd infra/terraform && terraform init -input=false && terraform destroy -auto-approve -input=false && rm -f ../tf-outputs.json", help = "Internal: destroy Terraform stack" }

[tool.mypy]
69 changes: 69 additions & 0 deletions scripts/lf_native_poc/README.md
@@ -0,0 +1,69 @@
# LF-Native Import POC

This folder contains the isolated probes that turned the LF-native
Iceberg/DataZone blocker from guesswork into a reproducible path.

Entrypoints:

- `python -m scripts.lf_native_poc.package_tag_import_poc`
- `python -m scripts.lf_native_poc.create_throwaway_subscriber`

## What Actually Worked

- Do not use manual `create_asset` Glue-table assets for LF-native tables.
They can reach `ACCEPTED` subscriptions without producing managed LF grants.
- Use a real DataZone-managed Glue import and subscribe against the imported
listings, not the manual ones.
- Grant the owner project's Glue data source role Lake Formation access to the
  database, every table, and every table location. This was the missing step
  that let the data source import all four tables instead of only `package_tag`.
- Re-run the Glue data source import whenever DataZone has imported fewer than
the expected four tables.
- Keep the default Lakehouse blueprint healthy. Fresh subscriber projects only
worked after fixing the live Tooling and Lakehouse blueprint configs.
- The successful LF fulfillment shape is not a direct table grant to the
subscriber environment role ARN. DataZone writes conditional `SELECT` grants
on `712023778557:IAMPrincipals` scoped by `context.datazone.projectId`.

## Final Working Path

1. Mirror the four Iceberg tables into `raja-standalone-iceberg-lf`.
2. Ensure the owner project has a DataZone Glue data source for that database.
3. Grant the data source role LF `ALL` on the database and tables plus
`DATA_LOCATION_ACCESS` on all table locations.
4. Start or restart the DataZone Glue import.
5. Wait for imported listings for:
- `package_entry`
- `package_manifest`
- `package_revision`
- `package_tag`
6. Subscribe `bio` and `compute` against those imported listings.
7. Verify LF conditional `SELECT` grants exist for both subscriber project IDs.
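Steps 3–4 above can be sketched with boto3. This is a sketch, not the seed script itself: the role ARN, domain ID, and data source ID below are hypothetical placeholders, and the live call only runs when executed as a script.

```python
"""Sketch of steps 3-4: grant the data source role LF access to the mirrored
database/tables, then restart the DataZone Glue import. The role ARN, domain
ID, and data source ID are hypothetical placeholders."""

DATABASE = "raja-standalone-iceberg-lf"
TABLES = ["package_entry", "package_manifest", "package_revision", "package_tag"]
DATA_SOURCE_ROLE = "arn:aws:iam::712023778557:role/example-glue-datasource"  # hypothetical name


def table_grant(table: str) -> dict:
    """Build one lakeformation.grant_permissions payload (LF ALL on a table)."""
    return {
        "Principal": {"DataLakePrincipalIdentifier": DATA_SOURCE_ROLE},
        "Resource": {"Table": {"DatabaseName": DATABASE, "Name": table}},
        "Permissions": ["ALL"],
    }


def location_grant(s3_arn: str) -> dict:
    """Build a DATA_LOCATION_ACCESS payload for one registered table location."""
    return {
        "Principal": {"DataLakePrincipalIdentifier": DATA_SOURCE_ROLE},
        "Resource": {"DataLocation": {"ResourceArn": s3_arn}},
        "Permissions": ["DATA_LOCATION_ACCESS"],
    }


def main() -> None:
    import boto3

    lf = boto3.client("lakeformation")
    dz = boto3.client("datazone")
    # Step 3: database grant, then per-table grants.
    lf.grant_permissions(
        Principal={"DataLakePrincipalIdentifier": DATA_SOURCE_ROLE},
        Resource={"Database": {"Name": DATABASE}},
        Permissions=["ALL"],
    )
    for table in TABLES:
        lf.grant_permissions(**table_grant(table))
    # Step 4: kick the import so all four tables are picked up.
    dz.start_data_source_run(
        domainIdentifier="dzd_example",        # hypothetical
        dataSourceIdentifier="ds_example",     # hypothetical
    )


if __name__ == "__main__":
    main()
```

The payload builders are separated from the live calls so the grant shapes can be inspected (or logged) before anything is applied.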

## Working Evidence

- Imported listings now used by the main seed flow:
- `package_entry` -> `cll99ezfwkw8pz`
- `package_manifest` -> `6q5wgwn4bjha5j`
- `package_revision` -> `apj78613rljtpj`
- `package_tag` -> `5za88zhymk4qzr`
- Completed DataZone grant objects now exist for:
- `package_entry`
- `package_manifest`
- `package_revision`
- `package_tag`
- Final LF state now includes conditional `SELECT` grants for both subscriber
project IDs:
- `bm7eqh5dc6olrb`
- `b3byg401pnpjjb`
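The final-state check (conditional `SELECT` grants per subscriber project) can be sketched by filtering `lakeformation.list_permissions` output. The permission-entry shape assumed here (a `Condition` expression mentioning the project ID, a principal ending in `:IAMPrincipals`) is taken from this POC's observations and should be re-checked against the live response:

```python
"""Sketch of verifying DataZone-managed conditional SELECT grants in LF.
The permission-entry shape is an assumption from this POC's observations."""

SUBSCRIBER_PROJECT_IDS = ("bm7eqh5dc6olrb", "b3byg401pnpjjb")


def is_managed_select_grant(entry: dict, project_id: str) -> bool:
    """True when an LF permission entry looks like a DataZone-managed grant:
    SELECT, granted to <account>:IAMPrincipals, condition scoped to project_id."""
    principal = entry.get("Principal", {}).get("DataLakePrincipalIdentifier", "")
    expression = entry.get("Condition", {}).get("Expression", "")
    return (
        "SELECT" in entry.get("Permissions", [])
        and principal.endswith(":IAMPrincipals")
        and project_id in expression
    )


def main() -> None:
    import boto3

    lf = boto3.client("lakeformation")
    entries: list[dict] = []
    for page in lf.get_paginator("list_permissions").paginate():
        entries.extend(page["PrincipalResourcePermissions"])
    for project_id in SUBSCRIBER_PROJECT_IDS:
        hits = [e for e in entries if is_managed_select_grant(e, project_id)]
        print(project_id, "->", len(hits), "conditional SELECT grants")


if __name__ == "__main__":
    main()
```

A count of zero for either project ID means the subscription was accepted without a managed grant, which is exactly the manual-asset failure mode described above.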

## Cleanup

- Delete the throwaway subscriber project `60st0m21xz0a3r` if it is no longer
needed.
- Old failed `package_tag` grant records from earlier experiments still exist in
DataZone history. They do not block the working path, but they are noise when
inspecting grant history.
- Terraform still does not fully own the live DataZone domain role selection if
`ignore_changes` remains on `domain_execution_role` / `service_role`. That is
worth reconciling separately so the production-shaped role choice is durable.
1 change: 1 addition & 0 deletions scripts/lf_native_poc/__init__.py
@@ -0,0 +1 @@
"""Standalone LF-native/DataZone import POCs."""