From 3ce3e3dac3e3413728829caec640f8218d44cf51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=87a=C4=9Flar=20Eker?= Date: Tue, 3 Mar 2026 17:32:25 +0300 Subject: [PATCH 1/2] Filter out metadata directories from HUDI partitions --- .../xtable/hudi/BaseFileUpdatesExtractor.java | 5 +++-- .../xtable/hudi/HudiDataFileExtractor.java | 3 ++- .../org/apache/xtable/hudi/HudiPathUtils.java | 17 +++++++++++++++++ .../catalog/HudiCatalogPartitionSyncTool.java | 4 +++- .../TestHudiCatalogPartitionSyncTool.java | 15 +++++++++++++++ 5 files changed, 40 insertions(+), 4 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java b/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java index 325fc8a53..5a6b6a9a4 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java @@ -90,8 +90,9 @@ ReplaceMetadata extractSnapshotChanges( // can be dropped Set partitionPathsToDrop = new HashSet<>( - FSUtils.getAllPartitionPaths( - engineContext, metadataConfig, metaClient.getBasePathV2().toString())); + HudiPathUtils.filterMetadataPaths( + FSUtils.getAllPartitionPaths( + engineContext, metadataConfig, metaClient.getBasePathV2().toString()))); ReplaceMetadata replaceMetadata = partitionedDataFiles.stream() .map( diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java index 5e17b389f..f164d9217 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java @@ -115,7 +115,8 @@ public List getFilesCurrentState(InternalTable table) { tableMetadata != null ? tableMetadata.getAllPartitionPaths() : FSUtils.getAllPartitionPaths(engineContext, metadataConfig, basePath.toString()); - return getInternalDataFilesForPartitions(allPartitionPaths, table); + return getInternalDataFilesForPartitions( + HudiPathUtils.filterMetadataPaths(allPartitionPaths), table); } catch (IOException ex) { throw new ReadException( "Unable to read partitions for table " + metaClient.getTableConfig().getTableName(), ex); diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java index 545f32150..aae55bae1 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java @@ -18,6 +18,9 @@ package org.apache.xtable.hudi; +import java.util.List; +import java.util.stream.Collectors; + import org.apache.hadoop.fs.Path; public class HudiPathUtils { @@ -28,4 +31,18 @@ public static String getPartitionPath(Path tableBasePath, Path filePath) { int endIndex = pathStr.length() - fileName.length() - 1; return endIndex <= startIndex ? "" : pathStr.substring(startIndex, endIndex); } + + /** Filters out metadata/hidden directory paths like _delta_log and .hoodie. */ + public static List filterMetadataPaths(List partitionPaths) { + return partitionPaths.stream() + .filter( + p -> { + if (p.isEmpty()) { + return true; + } + String name = new Path(p).getName(); + return !name.startsWith("_") && !name.startsWith("."); + }) + .collect(Collectors.toList()); + } } diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java b/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java index 792c70635..8c5003a2c 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java @@ -49,6 +49,7 @@ import org.apache.xtable.catalog.CatalogPartitionSyncOperations; import org.apache.xtable.catalog.CatalogPartitionSyncTool; import org.apache.xtable.exception.CatalogSyncException; +import org.apache.xtable.hudi.HudiPathUtils; import org.apache.xtable.hudi.HudiTableManager; import org.apache.xtable.model.InternalTable; import org.apache.xtable.model.catalog.CatalogTableIdentifier; @@ -213,7 +214,8 @@ private void updateLastCommitTimeSynced( public List getAllPartitionPathsOnStorage(String basePath) { HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(configuration); // ToDo - if we need to config to validate assumeDatePartitioning - return FSUtils.getAllPartitionPaths(engineContext, basePath, true, false); + return HudiPathUtils.filterMetadataPaths( + FSUtils.getAllPartitionPaths(engineContext, basePath, true, false)); } public List getWrittenPartitionsSince( diff --git a/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java b/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java index 0c33013a5..aae876535 100644 --- a/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java +++ b/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java @@ -120,6 +120,21 @@ private void setupCommonMocks() { mockHudiCatalogPartitionSyncTool = createMockHudiPartitionSyncTool(); } + @Test + void testGetAllPartitionPathsOnStorageFiltersMetadataDirs() { + setupCommonMocks(); + try (MockedStatic mockFSUtils = mockStatic(FSUtils.class)) { + mockFSUtils + .when(() -> FSUtils.getAllPartitionPaths(any(), eq(TEST_BASE_PATH), eq(true), eq(false))) + .thenReturn(Arrays.asList("key1", "_delta_log", ".hoodie", "key2")); + + List result = + mockHudiCatalogPartitionSyncTool.getAllPartitionPathsOnStorage(TEST_BASE_PATH); + + assertEquals(Arrays.asList("key1", "key2"), result); + } + } + @SneakyThrows @Test void testSyncAllPartitions() { From d3a8116aff0337a041e72ec6e4bb8065dc7442f4 Mon Sep 17 00:00:00 2001 From: caglareker Date: Sat, 7 Mar 2026 00:02:26 +0300 Subject: [PATCH 2/2] Tighten metadata filter to known dirs and add unit tests --- .../org/apache/xtable/hudi/HudiPathUtils.java | 13 +++- .../apache/xtable/hudi/TestHudiPathUtils.java | 65 +++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 xtable-core/src/test/java/org/apache/xtable/hudi/TestHudiPathUtils.java diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java index aae55bae1..62db51832 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java @@ -18,12 +18,19 @@ package org.apache.xtable.hudi; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import org.apache.hadoop.fs.Path; public class HudiPathUtils { + + private static final Set METADATA_DIR_NAMES = + new HashSet<>(Arrays.asList(".hoodie", "_delta_log")); + public static String getPartitionPath(Path tableBasePath, Path filePath) { String fileName = filePath.getName(); String pathStr = filePath.toUri().getPath(); @@ -32,7 +39,7 @@ public static String getPartitionPath(Path tableBasePath, Path filePath) { return endIndex <= startIndex ? "" : pathStr.substring(startIndex, endIndex); } - /** Filters out metadata/hidden directory paths like _delta_log and .hoodie. */ + /** Filters out known metadata directory paths like _delta_log and .hoodie. */ public static List filterMetadataPaths(List partitionPaths) { return partitionPaths.stream() .filter( @@ -40,8 +47,8 @@ public static List filterMetadataPaths(List partitionPaths) { if (p.isEmpty()) { return true; } - String name = new Path(p).getName(); - return !name.startsWith("_") && !name.startsWith("."); + String name = p.substring(p.lastIndexOf('/') + 1); + return !METADATA_DIR_NAMES.contains(name); }) .collect(Collectors.toList()); } diff --git a/xtable-core/src/test/java/org/apache/xtable/hudi/TestHudiPathUtils.java b/xtable-core/src/test/java/org/apache/xtable/hudi/TestHudiPathUtils.java new file mode 100644 index 000000000..376a8dccd --- /dev/null +++ b/xtable-core/src/test/java/org/apache/xtable/hudi/TestHudiPathUtils.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.hudi; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.Test; + +public class TestHudiPathUtils { + + @Test + void filterMetadataPaths_removesKnownMetadataDirs() { + List input = Arrays.asList("region=US", "_delta_log", ".hoodie", "year=2024"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList("region=US", "year=2024"), result); + } + + @Test + void filterMetadataPaths_nestedMetadataDirAsLastSegment() { + List input = Arrays.asList("year=2024/_delta_log", "region=US/.hoodie"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Collections.emptyList(), result); + } + + @Test + void filterMetadataPaths_keepsEmptyString() { + List input = Arrays.asList("", "region=US"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList("", "region=US"), result); + } + + @Test + void filterMetadataPaths_keepsPartitionStartingWithUnderscore() { + List input = Arrays.asList("_status=active", "_year=2024", "region=US"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList("_status=active", "_year=2024", "region=US"), result); + } + + @Test + void filterMetadataPaths_keepsPartitionStartingWithDot() { + List input = Arrays.asList(".version=1", "region=US"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList(".version=1", "region=US"), result); + } +} \ No newline at end of file