Skip to content

Commit

Permalink
Fix Vacuum deleting files with whitespace in paths
Browse files Browse the repository at this point in the history
When a Delta Lake file path contains whitespace, the VACUUM procedure
unexpectedly removes it. This commit fixes the issue by wrapping the
file path using RFC 2396 URI encoding, ensuring consistency with how
file paths are handled when writing add or remove entries.
  • Loading branch information
chenjian2664 committed Feb 3, 2025
1 parent 4971ef1 commit 0baf55b
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3282,7 +3282,7 @@ private void setRollback(Runnable action)
checkState(rollbackAction.compareAndSet(null, action), "rollback action is already set");
}

private static String toUriFormat(String path)
public static String toUriFormat(String path)
{
verify(!path.startsWith("/") && !path.contains(":/"), "unexpected path: %s", path);
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import static io.trino.plugin.deltalake.DeltaLakeMetadata.MAX_WRITER_VERSION;
import static io.trino.plugin.deltalake.DeltaLakeMetadata.checkUnsupportedUniversalFormat;
import static io.trino.plugin.deltalake.DeltaLakeMetadata.checkValidTableHandle;
import static io.trino.plugin.deltalake.DeltaLakeMetadata.toUriFormat;
import static io.trino.plugin.deltalake.DeltaLakeSessionProperties.getVacuumMinRetention;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeTableFeatures.DELETION_VECTORS_FEATURE_NAME;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeTableFeatures.unsupportedWriterFeatures;
Expand Down Expand Up @@ -269,7 +270,9 @@ private void doVacuum(
"Unexpected path [%s] returned when listing files under [%s]",
location,
tableLocation);
String relativePath = location.substring(commonPathPrefix.length());

// Paths are RFC 2396 URI encoded https://github.com/delta-io/delta/blob/master/PROTOCOL.md#add-file-and-remove-file
String relativePath = toUriFormat(location.substring(commonPathPrefix.length()));
if (relativePath.isEmpty()) {
// A file returned for "tableLocation/", might be possible on S3.
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1798,6 +1798,52 @@ public void testVacuumWithTrailingSlash()
}
}

@Test
public void testVacuumWithWhiteSpace()
throws Exception
{
String catalog = getSession().getCatalog().orElseThrow();
String tableName = "test_vacuum_white_space_" + randomNameSuffix();
String tableLocation = getLocationForTable(bucketName, tableName) + "/";
Session sessionWithShortRetentionUnlocked = Session.builder(getSession())
.setCatalogSessionProperty(catalog, "vacuum_min_retention", "0s")
.build();

assertUpdate(format("CREATE TABLE %s (val int, col_white_space timestamp(6)) WITH (location = '%s', partitioned_by = ARRAY['col_white_space'])", tableName, tableLocation));

assertUpdate("INSERT INTO " + tableName + " VALUES (1, TIMESTAMP '2024-12-13 11:00:00.000000'), (2, TIMESTAMP '2024-12-13 12:00:00.000000')", 2);

try {
Set<String> initialFiles = getActiveFiles(tableName);
assertThat(initialFiles).hasSize(2);

computeActual("UPDATE " + tableName + " SET val = val + 100");
Stopwatch timeSinceUpdate = Stopwatch.createStarted();
Set<String> updatedFiles = getActiveFiles(tableName);
assertThat(updatedFiles).hasSize(2).doesNotContainAnyElementsOf(initialFiles);
assertThat(getAllDataFilesFromTableDirectory(tableName)).isEqualTo(union(initialFiles, updatedFiles));

// vacuum with high retention period, nothing should change
assertUpdate(sessionWithShortRetentionUnlocked, "CALL system.vacuum(CURRENT_SCHEMA, '" + tableName + "', '10m')");
assertQuery("SELECT * FROM " + tableName, "VALUES (101, TIMESTAMP '2024-12-13 11:00:00.000000'), (102, TIMESTAMP '2024-12-13 12:00:00.000000')");
assertThat(getActiveFiles(tableName)).isEqualTo(updatedFiles);
assertThat(getAllDataFilesFromTableDirectory(tableName)).isEqualTo(union(initialFiles, updatedFiles));

// vacuum with low retention period
MILLISECONDS.sleep(1_000 - timeSinceUpdate.elapsed(MILLISECONDS) + 1);
assertUpdate(sessionWithShortRetentionUnlocked, "CALL system.vacuum(CURRENT_SCHEMA, '" + tableName + "', '1s')");
// table data shouldn't change
assertQuery("SELECT * FROM " + tableName, "VALUES (101, TIMESTAMP '2024-12-13 11:00:00.000000'), (102, TIMESTAMP '2024-12-13 12:00:00.000000')");
// active files shouldn't change
assertThat(getActiveFiles(tableName)).isEqualTo(updatedFiles);
// old files should be cleaned up
assertThat(getAllDataFilesFromTableDirectory(tableName)).isEqualTo(updatedFiles);
}
finally {
assertUpdate("DROP TABLE " + tableName);
}
}

@Test
public void testVacuumParameterValidation()
{
Expand Down

0 comments on commit 0baf55b

Please sign in to comment.