Skip to content

Commit

Permalink
[Spark] Fall back to zordering when clustering on a single column (#3109
Browse files Browse the repository at this point in the history
)

## Description
Fall back to Z-order clustering when clustering on a single column, because
Hilbert clustering does not support clustering on only one column.

Resolves #3087 

## How was this patch tested?
New unit test.
  • Loading branch information
zedtang authored May 17, 2024
1 parent 8a8e757 commit e15132b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ object MultiDimClustering {
curve: String): DataFrame = {
assert(colNames.nonEmpty, "Cannot cluster by zero columns!")
val clusteringImpl = curve match {
case "hilbert" if colNames.size == 1 => ZOrderClustering
case "hilbert" => HilbertClustering
case "zorder" => ZOrderClustering
case unknownCurve =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,28 @@ class ClusteredTableClusteringSuite extends SparkFunSuite
}
}
}

test("cluster by 1 column") {
  // Exercises OPTIMIZE on a table whose only clustering column is `col1`.
  // MAX_RECORDS_PER_FILE is pinned to 2 so the expected output file count is deterministic.
  withSQLConf(SQLConf.MAX_RECORDS_PER_FILE.key -> "2") {
    withClusteredTable(table = table, schema = "col1 int, col2 int", clusterBy = "col1") {
      // Start from four files that have not been clustered yet.
      addFiles(table, numFiles = 4)
      val filesBeforeOptimize = getFiles(table)
      assert(filesBeforeOptimize.size === 4)
      assertNotClustered(filesBeforeOptimize)

      // Optimize should cluster the data into 2 files since MAX_RECORDS_PER_FILE is 2.
      runOptimize(table) { optimizeMetrics =>
        assert(optimizeMetrics.numFilesRemoved == 4)
        assert(optimizeMetrics.numFilesAdded == 2)
      }

      // The table should now consist solely of the two clustered files.
      val filesAfterOptimize = getFiles(table)
      assert(filesAfterOptimize.size == 2)
      assertClustered(filesAfterOptimize)
    }
  }
}
}

0 comments on commit e15132b

Please sign in to comment.