diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java index bd301c729..b9feb9dfb 100644 --- a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java +++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java @@ -14,7 +14,6 @@ import com.linkedin.openhouse.jobs.util.SparkJobUtil; import com.linkedin.openhouse.jobs.util.TableStatsCollector; import java.io.IOException; -import java.nio.file.Paths; import java.time.ZonedDateTime; import java.time.temporal.ChronoUnit; import java.util.Collections; @@ -356,7 +355,7 @@ private Map> prepareBackupDataManifests( Collectors.groupingBy( task -> { String path = task.file().path().toString(); - return Paths.get(path).getParent().toString(); + return dataFileParent(path); }, Collectors.mapping(task -> task.file().path().toString(), Collectors.toList()))); } catch (IOException e) { @@ -403,7 +402,20 @@ private void exposeBackupLocation(Table table, String backupDir) { table.name(), AppConstants.BACKUP_DIR_KEY, fullyQualifiedBackupDir)); } - private Path getTrashPath(String path, String filePath, String trashDir) { + /** + * Return the parent directory of a data file path. Uses Hadoop {@link Path} (not {@link + * java.nio.file.Paths}, which is not URI-aware) so that the scheme and authority of a + * fully-qualified path such as {@code hdfs://cluster/a/b/f.orc} are preserved. Dropping the + * authority here would later resolve the backup manifest against the default filesystem root and + * fail with a permission error. + */ + @VisibleForTesting + static String dataFileParent(String dataFilePath) { + return new Path(dataFilePath).getParent().toString(); + } + + @VisibleForTesting + static Path getTrashPath(String path, String filePath, String trashDir) { return new Path(filePath.replace(path, new Path(path, trashDir).toString())); } diff --git a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/OperationsBackupPathTest.java b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/OperationsBackupPathTest.java new file mode 100644 index 000000000..0fd3a83c2 --- /dev/null +++ b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/OperationsBackupPathTest.java @@ -0,0 +1,72 @@ +package com.linkedin.openhouse.jobs.spark; + +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Pure-Java unit tests for the backup-manifest path derivation in {@link Operations}. No Spark + * session, no filesystem — exercises the scheme/authority handling that broke retention backups + * when data files are stored as fully-qualified {@code hdfs://authority/...} URIs. + * + *

These guard against a regression where {@code java.nio.file.Paths} was used to derive parent + * paths: it is not URI-aware and collapses {@code hdfs://authority} into {@code hdfs:/authority}, + * dropping the authority. The mangled path later resolves against the default filesystem root and + * fails with {@code Permission denied ... inode="/"}. + */ +public class OperationsBackupPathTest { + + @Test + public void dataFileParentPreservesSchemeAndAuthority() { + String dataFile = + "hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-26-00/f.orc"; + + String parent = Operations.dataFileParent(dataFile); + + // Authority (//ltx1-holdem) must be retained; the old Paths.get path produced + // "hdfs:/ltx1-holdem/..." (single slash, authority dropped). + Assertions.assertEquals( + "hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-26-00", parent); + } + + @Test + public void dataFileParentPreservesBarePath() { + String dataFile = "/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-06-00/f.orc"; + + Assertions.assertEquals( + "/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-06-00", + Operations.dataFileParent(dataFile)); + } + + @Test + public void manifestDestPathForQualifiedDataFileKeepsAuthorityAndInsertsBackupDir() { + // Mirrors the production chain in prepareBackupDataManifests + writeBackupDataManifests. + // Iceberg stores table.location() bare (no scheme), but newer data files are fully qualified. + String tableLocation = "/data/openhouse/db/tbl-uuid"; + String dataFile = + "hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-26-00/f.orc"; + + String partitionPath = Operations.dataFileParent(dataFile); + String manifestSource = new Path(partitionPath, "data_manifest_1.json").toString(); + Path dest = Operations.getTrashPath(tableLocation, manifestSource, ".backup"); + + Assertions.assertEquals( + "hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/.backup/data/" + + "datepartition=2026-03-26-00/data_manifest_1.json", + dest.toString()); + } + + @Test + public void manifestDestPathForBareDataFileInsertsBackupDir() { + String tableLocation = "/data/openhouse/db/tbl-uuid"; + String dataFile = "/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-06-00/f.orc"; + + String partitionPath = Operations.dataFileParent(dataFile); + String manifestSource = new Path(partitionPath, "data_manifest_1.json").toString(); + Path dest = Operations.getTrashPath(tableLocation, manifestSource, ".backup"); + + Assertions.assertEquals( + "/data/openhouse/db/tbl-uuid/.backup/data/datepartition=2026-03-06-00/data_manifest_1.json", + dest.toString()); + } +}