Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import com.linkedin.openhouse.jobs.util.SparkJobUtil;
import com.linkedin.openhouse.jobs.util.TableStatsCollector;
import java.io.IOException;
import java.nio.file.Paths;
import java.time.ZonedDateTime;
import java.time.temporal.ChronoUnit;
import java.util.Collections;
Expand Down Expand Up @@ -356,7 +355,7 @@ private Map<String, List<String>> prepareBackupDataManifests(
Collectors.groupingBy(
task -> {
String path = task.file().path().toString();
return Paths.get(path).getParent().toString();
return dataFileParent(path);
},
Collectors.mapping(task -> task.file().path().toString(), Collectors.toList())));
} catch (IOException e) {
Expand Down Expand Up @@ -403,7 +402,20 @@ private void exposeBackupLocation(Table table, String backupDir) {
table.name(), AppConstants.BACKUP_DIR_KEY, fullyQualifiedBackupDir));
}

private Path getTrashPath(String path, String filePath, String trashDir) {
/**
* Return the parent directory of a data file path. Uses Hadoop {@link Path} (not {@link
* java.nio.file.Paths}, which is not URI-aware) so that the scheme and authority of a
* fully-qualified path such as {@code hdfs://cluster/a/b/f.orc} are preserved. Dropping the
* authority here would later resolve the backup manifest against the default filesystem root and
* fail with a permission error.
*/
@VisibleForTesting
static String dataFileParent(String dataFilePath) {
return new Path(dataFilePath).getParent().toString();
}

@VisibleForTesting
static Path getTrashPath(String path, String filePath, String trashDir) {
return new Path(filePath.replace(path, new Path(path, trashDir).toString()));
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package com.linkedin.openhouse.jobs.spark;

import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
* Pure-Java unit tests for the backup-manifest path derivation in {@link Operations}. No Spark
* session, no filesystem — exercises the scheme/authority handling that broke retention backups
* when data files are stored as fully-qualified {@code hdfs://authority/...} URIs.
*
* <p>These guard against a regression where {@code java.nio.file.Paths} was used to derive parent
* paths: it is not URI-aware and collapses {@code hdfs://authority} into {@code hdfs:/authority},
* dropping the authority. The mangled path later resolves against the default filesystem root and
* fails with {@code Permission denied ... inode="/"}.
*/
public class OperationsBackupPathTest {

@Test
public void dataFileParentPreservesSchemeAndAuthority() {
String dataFile =
"hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-26-00/f.orc";

String parent = Operations.dataFileParent(dataFile);

// Authority (//ltx1-holdem) must be retained; the old Paths.get path produced
// "hdfs:/ltx1-holdem/..." (single slash, authority dropped).
Assertions.assertEquals(
"hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-26-00", parent);
}

@Test
public void dataFileParentPreservesBarePath() {
String dataFile = "/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-06-00/f.orc";

Assertions.assertEquals(
"/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-06-00",
Operations.dataFileParent(dataFile));
}

@Test
public void manifestDestPathForQualifiedDataFileKeepsAuthorityAndInsertsBackupDir() {
// Mirrors the production chain in prepareBackupDataManifests + writeBackupDataManifests.
// Iceberg stores table.location() bare (no scheme), but newer data files are fully qualified.
String tableLocation = "/data/openhouse/db/tbl-uuid";
String dataFile =
"hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-26-00/f.orc";

String partitionPath = Operations.dataFileParent(dataFile);
String manifestSource = new Path(partitionPath, "data_manifest_1.json").toString();
Path dest = Operations.getTrashPath(tableLocation, manifestSource, ".backup");

Assertions.assertEquals(
"hdfs://ltx1-holdem/data/openhouse/db/tbl-uuid/.backup/data/"
+ "datepartition=2026-03-26-00/data_manifest_1.json",
dest.toString());
}

@Test
public void manifestDestPathForBareDataFileInsertsBackupDir() {
String tableLocation = "/data/openhouse/db/tbl-uuid";
String dataFile = "/data/openhouse/db/tbl-uuid/data/datepartition=2026-03-06-00/f.orc";

String partitionPath = Operations.dataFileParent(dataFile);
String manifestSource = new Path(partitionPath, "data_manifest_1.json").toString();
Path dest = Operations.getTrashPath(tableLocation, manifestSource, ".backup");

Assertions.assertEquals(
"/data/openhouse/db/tbl-uuid/.backup/data/datepartition=2026-03-06-00/data_manifest_1.json",
dest.toString());
}
}
Loading