diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index f1b6aeaf..d44968eb 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -27,15 +27,17 @@ Please verify that your change does not introduce performance regressions.
-->
- [ ] **No Impact**: This change does not affect the critical path (e.g., build system, doc, error handling).
- [ ] **Positive Impact**: I have run benchmarks.
-
- Click to view Benchmark Results
+
+Click to view Benchmark Results
+
+```text
+Paste your google-benchmark or TPC-H results here.
+Before: 10.5s
+After: 8.2s (+20%)
+```
+
+
- ```text
- Paste your google-benchmark or TPC-H results here.
- Before: 10.5s
- After: 8.2s (+20%)
- ```
-
- [ ] **Negative Impact**: Explained below (e.g., trade-off for correctness).
### Release Note
@@ -70,13 +72,12 @@ If yes, please describe how users should migrate.
- [ ] No
- [ ] Yes (Description: ...)
-
- Click to view Breaking Changes
-
- ```text
- Breaking Changes:
- - Description of the breaking change.
- - Possible solutions or workarounds.
- - Any other relevant information.
- ```
-
+
+Click to view Breaking Changes
+```text
+Breaking Changes:
+- Description of the breaking change.
+- Possible solutions or workarounds.
+- Any other relevant information.
+```
+
diff --git a/Makefile b/Makefile
index 0f8592a0..cd8a4f69 100644
--- a/Makefile
+++ b/Makefile
@@ -348,6 +348,17 @@ unittest_coverage: debug_with_test_cov #: Build with debugging and run unit tes
lcov --remove coverage.info '/usr/*' '*/.conan/data/*' '*/_build/*' '*/tests/*' '*/test/*' --output-file coverage_striped.info && \
genhtml --ignore-errors source coverage_striped.info --output-directory coverage
+unittest_single: #: Build a single test target and run it via ctest (TARGET=TargetName [TEST=TestFilter])
+ifndef TARGET
+ $(error TARGET is undefined. Usage: make unittest_single TARGET=TargetName [TEST=TestFilter])
+endif
+ cmake --build $(BUILD_BASE_DIR)/Release --target $(TARGET) -j $(NUM_THREADS)
+ifneq ($(TEST),)
+ export GTEST_FILTER="$(TEST)" && ctest --test-dir $(BUILD_BASE_DIR)/Release -R "$(TARGET)" --output-on-failure --timeout 7200
+else
+ ctest --test-dir $(BUILD_BASE_DIR)/Release -R "$(TARGET)" --output-on-failure --timeout 7200
+endif
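+# Example (placeholder target name): make unittest_single TARGET=<test_target_name> TEST='HashTableTest.*'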
+
hdfstest: hdfs-debug-build #: Build with debugging, hdfs enabled and run hdfs tests
ctest --test-dir $(BUILD_BASE_DIR)/Debug -j ${NUM_THREADS} --output-on-failure -R bolt_hdfs_file_test
diff --git a/bolt/connectors/Connector.h b/bolt/connectors/Connector.h
index 510628df..b1f91e98 100644
--- a/bolt/connectors/Connector.h
+++ b/bolt/connectors/Connector.h
@@ -412,7 +412,7 @@ class AsyncThreadCtx {
void disallowPreload() {
if (adaptive_ && allowPreload_.load()) {
allowPreload_ = false;
- LOG(WARNING) << "Disallow scan preload due to limited memory";
+ LOG(INFO) << "Disallow scan preload due to limited memory";
}
}
diff --git a/bolt/core/QueryConfig.h b/bolt/core/QueryConfig.h
index af9be731..39c4ef11 100644
--- a/bolt/core/QueryConfig.h
+++ b/bolt/core/QueryConfig.h
@@ -652,6 +652,21 @@ class QueryConfig {
static constexpr const char* kSkewRowCountRatioThreshold =
"hash_join_skewed_row_count_ratio";
+ static constexpr const char* kEnableHashJoinArrayRecluster =
+ "enable_hash_join_array_recluster";
+
+ static constexpr const char* kHashJoinArrayReclusterMode =
+ "hash_join_array_recluster_mode";
+
+ static constexpr const char* kHashJoinArrayReclusterDuplicateRatioThreshold =
+ "hash_join_array_recluster_duplicate_ratio_threshold";
+
+ static constexpr const char* kHashJoinArrayReclusterMinProbeRowNumber =
+ "hash_join_array_recluster_min_probe_row_number";
+
+ static constexpr const char* kHashJoinArrayReclusterMinDistinctRowNumber =
+ "hash_join_array_recluster_min_distinct_row_number";
+
  // -1 means print all exceptions, usually used for debugging
// 0 means disable all exceptions,
// 1 means print exceptions whose prefix is in the white list(default)
@@ -1490,6 +1505,26 @@ class QueryConfig {
return get(kHashJoinSkewedPartitionEnabled, true);
}
+ bool hashJoinArrayReclusterEnabled() const {
+ return get(kEnableHashJoinArrayRecluster, true);
+ }
+
+ std::string hashJoinArrayReclusterMode() const {
+ return get(kHashJoinArrayReclusterMode, "hash");
+ }
+
+ int64_t hashJoinArrayReclusterDuplicateRatioThreshold() const {
+ return get(kHashJoinArrayReclusterDuplicateRatioThreshold, 128);
+ }
+
+ int64_t hashJoinArrayReclusterMinProbeRowNumber() const {
+ return get(kHashJoinArrayReclusterMinProbeRowNumber, 500000);
+ }
+
+ int64_t hashJoinArrayReclusterMinDistinctRowNumber() const {
+ return get(kHashJoinArrayReclusterMinDistinctRowNumber, 32);
+ }
+
int32_t skewFileSizeRatioThreshold() const {
return get(kSkewFileSizeRatioThreshold, 10);
}
diff --git a/bolt/exec/HashBuild.cpp b/bolt/exec/HashBuild.cpp
index 2ad5df41..b34ff24e 100644
--- a/bolt/exec/HashBuild.cpp
+++ b/bolt/exec/HashBuild.cpp
@@ -183,6 +183,7 @@ void HashBuild::initialize() {
if (isAntiJoin(joinType_) && joinNode_->filter()) {
setupFilterForAntiJoins(keyChannelMap_);
}
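+  // Forward the probe-side row-count estimate recorded on the join bridge to
+  // the hash table so its recluster heuristic can use it.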
+ table_->setNumEstimatedProbeRows(joinBridge_->numEstimatedProbeRows());
}
void HashBuild::setupTable() {
@@ -203,6 +204,12 @@ void HashBuild::setupTable() {
dependentTypes.emplace_back(tableType_->childAt(i));
}
auto& queryConfig = operatorCtx_->driverCtx()->queryConfig();
+ HashTableReclusterConfig hashTableReclusterConfig(
+ queryConfig.hashJoinArrayReclusterDuplicateRatioThreshold(),
+ queryConfig.hashJoinArrayReclusterMinProbeRowNumber(),
+ queryConfig.hashJoinArrayReclusterMinDistinctRowNumber(),
+ queryConfig.hashJoinArrayReclusterMode(),
+ queryConfig.hashJoinArrayReclusterEnabled());
if (joinNode_->isRightJoin() || joinNode_->isFullJoin() ||
joinNode_->isRightSemiProjectJoin()) {
// Do not ignore null keys.
@@ -215,7 +222,8 @@ void HashBuild::setupTable() {
: BaseHashTable::HashMode::kArray,
queryConfig.minTableRowsForParallelJoinBuild(),
pool(),
- queryConfig.enableJitRowEqVectors());
+ queryConfig.enableJitRowEqVectors(),
+ hashTableReclusterConfig);
} else {
// Right semi join needs to tag build rows that were probed.
const bool needProbedFlag = joinNode_->isRightSemiFilterJoin();
@@ -231,7 +239,8 @@ void HashBuild::setupTable() {
: BaseHashTable::HashMode::kArray,
queryConfig.minTableRowsForParallelJoinBuild(),
pool(),
- queryConfig.enableJitRowEqVectors());
+ queryConfig.enableJitRowEqVectors(),
+ hashTableReclusterConfig);
} else {
// Ignore null keys
table_ = HashTable::createForJoin(
@@ -243,7 +252,8 @@ void HashBuild::setupTable() {
: BaseHashTable::HashMode::kArray,
queryConfig.minTableRowsForParallelJoinBuild(),
pool(),
- queryConfig.enableJitRowEqVectors());
+ queryConfig.enableJitRowEqVectors(),
+ hashTableReclusterConfig);
}
}
lookup_ = std::make_unique(
diff --git a/bolt/exec/HashJoinBridge.h b/bolt/exec/HashJoinBridge.h
index cf2f8b71..c057fd6f 100644
--- a/bolt/exec/HashJoinBridge.h
+++ b/bolt/exec/HashJoinBridge.h
@@ -134,6 +134,14 @@ class HashJoinBridge : public JoinBridge {
}
}
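+  // Estimated number of probe-side input rows; set by the probe operator and
+  // read by the build side when setting up the join hash table.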
+ uint64_t numEstimatedProbeRows() const {
+ return numEstimatedProbeRows_;
+ }
+
+ void setNumEstimatedProbeRows(uint64_t numEstimatedProbeRows) {
+ numEstimatedProbeRows_ = numEstimatedProbeRows;
+ }
+
private:
uint32_t numBuilders_{0};
@@ -158,6 +166,8 @@ class HashJoinBridge : public JoinBridge {
// This set can grow if HashBuild operator cannot load full partition in
// memory and engages in recursive spilling.
SpillPartitionSet spillPartitionSets_;
+
+ uint64_t numEstimatedProbeRows_{0};
};
// Indicates if 'joinNode' is null-aware anti or left semi project join type and
diff --git a/bolt/exec/HashProbe.cpp b/bolt/exec/HashProbe.cpp
index 197afb87..ff0a798c 100644
--- a/bolt/exec/HashProbe.cpp
+++ b/bolt/exec/HashProbe.cpp
@@ -158,6 +158,9 @@ HashProbe::HashProbe(
void HashProbe::initialize() {
Operator::initialize();
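+  // Estimate this probe's total input rows from upstream operators and
+  // publish the estimate on the join bridge for the build side.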
+ uint64_t totalRowCnt{0};
+ operatorCtx_->traverseOpToGetRowCount(totalRowCnt);
+ joinBridge_->setNumEstimatedProbeRows(totalRowCnt);
auto jitRowEqVectors =
operatorCtx_->driverCtx()->queryConfig().enableJitRowEqVectors();
BOLT_CHECK(hashers_.empty());
diff --git a/bolt/exec/HashTable.cpp b/bolt/exec/HashTable.cpp
index 10a17207..7849f57b 100644
--- a/bolt/exec/HashTable.cpp
+++ b/bolt/exec/HashTable.cpp
@@ -820,7 +820,12 @@ void HashTable::allocateTables(uint64_t size) {
BOLT_CHECK_GT(size, 0);
capacity_ = size;
const uint64_t byteSize = capacity_ * tableSlotSize();
- BOLT_CHECK_EQ(byteSize % kBucketSize, 0);
+  BOLT_CHECK_EQ(
+      byteSize % kBucketSize,
+      0,
+      "byteSize: {}, kBucketSize: {}",
+      byteSize,
+      kBucketSize);
numTombstones_ = 0;
sizeMask_ = byteSize - 1;
numBuckets_ = byteSize / kBucketSize;
@@ -1550,6 +1555,35 @@ void HashTable::clearUseRange(std::vector& useRange) {
}
}
+template <bool ignoreNullKeys>
+void HashTable<ignoreNullKeys>::tryRecluster() {
+ if (!reclusterConfig_.enableArrayRecluster) {
+ return;
+ }
+
+  // Note that maxDistinctNumber is the largest number of distinct values
+  // across all hashers, while numDistinct_, despite its name, counts every
+  // row in the join build's row container, including rows with duplicate keys.
+
+ if (numDistinct_ >= numEstimatedProbeRows_ ||
+ numEstimatedProbeRows_ < reclusterConfig_.minProbeRowNumber) {
+ return;
+ }
+ size_t maxDistinctNumber = 0;
+
+ for (auto& hasher : hashers_) {
+ maxDistinctNumber = std::max(maxDistinctNumber, hasher->numUniqueValues());
+ }
+ int64_t duplicateRatio =
+ maxDistinctNumber > 0 ? numDistinct_ / maxDistinctNumber : 0;
+ if (duplicateRatio < reclusterConfig_.duplicateRatioThreshold ||
+ maxDistinctNumber < reclusterConfig_.minDistinctRowNumber) {
+ return;
+ }
+
+ reclusterDataByKey();
+}
+
template
void HashTable::decideHashMode(
int32_t numNew,
@@ -1775,6 +1809,107 @@ bool mayUseValueIds(const BaseHashTable& table) {
}
} // namespace
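+// Reorders the build-side row container so that rows with equal join keys are
+// stored contiguously (by sorting on the keys or by grouping rows on their key
+// hashes), then rebuilds the array hash table over the reordered rows.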
+template <bool ignoreNullKeys>
+void HashTable<ignoreNullKeys>::reclusterDataByKey() {
+ if (!isJoinBuild_) {
+ LOG(INFO) << "reclusterDataByKey: joinBuild_ is false";
+ return;
+ }
+ if (rows_->numRows() == 0) {
+ LOG(INFO) << "reclusterDataByKey: numRows is 0";
+ return;
+ }
+ if (rows_->keyTypes().empty()) {
+ LOG(INFO) << "reclusterDataByKey: keyTypes is empty";
+ return;
+ }
+ if (rows_->accumulators().size() > 0) {
+ LOG(INFO) << "reclusterDataByKey: accumulators is not empty";
+ return;
+ }
+ if (sorted_) {
+ LOG(INFO) << "reclusterDataByKey: sorted_ is true";
+ return;
+ }
+
+ if (hashMode_ != HashMode::kArray) {
+ LOG(INFO) << "reclusterDataByKey: hashMode_ is not kArray";
+ return;
+ }
+
+ auto numRows = rows_->numRows();
+  std::vector<char*> sortedRows(numRows);
+
+ RowContainerIterator iter;
+ rows_->listRows(&iter, numRows, sortedRows.data());
+
+ if (reclusterConfig_.reclusterMode ==
+ HashTableReclusterConfig::ReclusterMode::kSort) {
+ HybridSorter sorter{SortAlgo::kAuto};
+ sorter.template sort(
+ sortedRows.begin(),
+ sortedRows.end(),
+ [this](const char* leftRow, const char* rightRow) {
+ return rows_->compareRows(leftRow, rightRow) < 0;
+ });
+ } else if (
+ reclusterConfig_.reclusterMode ==
+ HashTableReclusterConfig::ReclusterMode::kHash) {
+    std::vector<uint64_t> rowHashes(numRows);
+    folly::Range<char**> rowRange(sortedRows.data(), numRows);
+
+ for (size_t i = 0; i < rows_->keyTypes().size(); ++i) {
+ bool mix = (i > 0);
+ rows_->hash(i, rowRange, mix, rowHashes.data());
+ }
+
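+    // Counting-sort style grouping: count rows per key hash, give each hash a
+    // contiguous output range, then scatter rows into their ranges so rows
+    // with equal hashes become adjacent.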
+    folly::F14FastMap<uint64_t, size_t> counts;
+ counts.reserve(4096);
+ for (uint64_t h : rowHashes) {
+ counts[h]++;
+ }
+
+    folly::F14FastMap<uint64_t, size_t> offsets;
+ offsets.reserve(counts.size());
+
+ size_t currentOffset = 0;
+ for (auto& kv : counts) {
+ offsets[kv.first] = currentOffset;
+ currentOffset += kv.second;
+ }
+    std::vector<char*> result(numRows);
+
+ for (int32_t i = 0; i < numRows; ++i) {
+ uint64_t h = rowHashes[i];
+
+ size_t pos = offsets[h]++;
+
+ result[pos] = sortedRows[i];
+ }
+ sortedRows = std::move(result);
+ } else {
+ LOG(ERROR) << "reclusterDataByKey: unknown reclusterMode: "
+ << static_cast(reclusterConfig_.reclusterMode);
+ return;
+ }
+
+ rows_ = std::move(rows_->cloneByOrder(sortedRows));
+ if (table_ != nullptr) {
+ rows_->pool()->freeContiguous(tableAllocation_);
+ table_ = nullptr;
+ }
+ numTombstones_ = 0;
+
+ for (size_t i = 0; i < otherTables_.size(); ++i) {
+ otherTables_[i]->reclusterDataByKey();
+ }
+  capacity_ = bits::nextPowerOfTwo(
+      std::max(static_cast<uint64_t>(numRows), kBucketSize));
+ allocateTables(capacity_);
+ rehash(true);
+ sorted_ = true;
+}
+
template
void HashTable::prepareJoinTable(
std::vector> tables,
@@ -1852,6 +1987,9 @@ void HashTable::prepareJoinTable(
}
} else {
decideHashMode(0);
+ if (hashMode_ == HashMode::kArray) {
+ tryRecluster();
+ }
}
checkHashBitsOverlap(spillInputStartPartitionBit);
LOG(INFO) << __FUNCTION__ << ": capacity_ = " << capacity_
diff --git a/bolt/exec/HashTable.h b/bolt/exec/HashTable.h
index 17ab5407..41e22455 100644
--- a/bolt/exec/HashTable.h
+++ b/bolt/exec/HashTable.h
@@ -431,6 +431,14 @@ class BaseHashTable {
return offThreadBuildTiming_;
}
+ uint64_t numEstimatedProbeRows() const {
+ return numEstimatedProbeRows_;
+ }
+
+ void setNumEstimatedProbeRows(uint64_t numEstimatedProbeRows) {
+ numEstimatedProbeRows_ = numEstimatedProbeRows;
+ }
+
protected:
static FOLLY_ALWAYS_INLINE size_t tableSlotSize() {
// Each slot is 8 bytes.
@@ -458,6 +466,8 @@ class BaseHashTable {
// Time spent in build outside of the calling thread.
CpuWallTiming offThreadBuildTiming_;
+
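+  // Estimated number of probe-side input rows; used by the hash join array
+  // recluster heuristic.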
+ uint64_t numEstimatedProbeRows_ = 0;
};
FOLLY_ALWAYS_INLINE std::ostream& operator<<(
@@ -521,6 +531,62 @@ class ProbeState {
uint8_t indexInTags_ = kNotSet;
};
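+// Controls if and how an array-mode join hash table reclusters its build rows
+// so that rows sharing a key are stored contiguously before probing.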
+struct HashTableReclusterConfig {
+ enum class ReclusterMode { kHash = 0, kSort = 1 };
+
+ int64_t duplicateRatioThreshold = 128;
+ int64_t minProbeRowNumber = 500000;
+ int64_t minDistinctRowNumber = 32;
+ ReclusterMode reclusterMode = ReclusterMode::kHash;
+ bool enableArrayRecluster = false;
+
+ HashTableReclusterConfig() = default;
+ HashTableReclusterConfig(
+ int64_t duplicateRatioThreshold,
+ int64_t minProbeRowNumber,
+ int64_t minDistinctRowNumber,
+ ReclusterMode reclusterMode,
+ bool enableArrayRecluster)
+ : duplicateRatioThreshold(duplicateRatioThreshold),
+ minProbeRowNumber(minProbeRowNumber),
+ minDistinctRowNumber(minDistinctRowNumber),
+ reclusterMode(reclusterMode),
+ enableArrayRecluster(enableArrayRecluster) {}
+ HashTableReclusterConfig(
+ int64_t duplicateRatioThreshold,
+ int64_t minProbeRowNumber,
+ int64_t minDistinctRowNumber,
+ const std::string& reclusterMod,
+ bool enableArrayRecluster)
+ : duplicateRatioThreshold(duplicateRatioThreshold),
+ minProbeRowNumber(minProbeRowNumber),
+ minDistinctRowNumber(minDistinctRowNumber),
+        reclusterMode(parseReclusterMode(reclusterMode)),
+ enableArrayRecluster(enableArrayRecluster) {}
+
+ static ReclusterMode parseReclusterMode(const std::string& mode) {
+ if (mode == "hash") {
+ return ReclusterMode::kHash;
+ } else if (mode == "sort") {
+ return ReclusterMode::kSort;
+ } else {
+ BOLT_FAIL("Unknown hash join array recluster mode: {}", mode);
+ }
+ }
+  static std::string modeString(ReclusterMode mode) {
+    switch (mode) {
+      case ReclusterMode::kHash:
+        return "hash";
+      case ReclusterMode::kSort:
+        return "sort";
+      default:
+        BOLT_FAIL(
+            "Unknown hash join array recluster mode: {}",
+            static_cast<int>(mode));
+    }
+  }
+};
+
template
class HashTable : public BaseHashTable {
public:
@@ -586,8 +652,9 @@ class HashTable : public BaseHashTable {
HashMode mode,
uint32_t minTableSizeForParallelJoinBuild,
memory::MemoryPool* pool,
- bool jitRowEqVectors) {
- return std::make_unique(
+ bool jitRowEqVectors,
+ const HashTableReclusterConfig& reclusterConfig = {}) {
+    auto hashTable = std::make_unique<HashTable>(
std::move(hashers),
std::vector{},
dependentTypes,
@@ -599,6 +666,8 @@ class HashTable : public BaseHashTable {
pool,
nullptr,
jitRowEqVectors);
+ hashTable->reclusterConfig_ = reclusterConfig;
+ return hashTable;
}
void groupProbe(HashLookup& lookup) override;
@@ -690,6 +759,10 @@ class HashTable : public BaseHashTable {
return hashMode_;
}
+ void reclusterDataByKey();
+
+ void tryRecluster();
+
void decideHashMode(int32_t numNew, bool disableRangeArrayHash = false)
override;
@@ -1157,8 +1230,9 @@ class HashTable : public BaseHashTable {
#ifdef ENABLE_BOLT_JIT
bolt::jit::CompiledModuleSP jitModule_;
bolt::jit::CompiledModuleSP jitModuleRow_;
-
#endif
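+  // Set to true once reclusterDataByKey() has reordered the rows so the table
+  // is not reclustered again.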
+ bool sorted_ = false;
+ HashTableReclusterConfig reclusterConfig_{};
};
} // namespace exec
diff --git a/bolt/exec/Operator.cpp b/bolt/exec/Operator.cpp
index 834f85e9..8c781960 100644
--- a/bolt/exec/Operator.cpp
+++ b/bolt/exec/Operator.cpp
@@ -346,6 +346,41 @@ void OperatorCtx::traverseOpToGetRowCount(
}
}
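+// Estimates the probe-side input row count for a single-driver pipeline by
+// walking upstream operators in this driver and taking the total row count
+// from the first one whose runtime metrics mark it as usable for sizing the
+// hash build.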
+void OperatorCtx::traverseOpToGetRowCount(uint64_t& totalRowCount) const {
+ auto numDrivers = task()->numDrivers(driverCtx());
+
+ if (numDrivers == 1) {
+ const auto& operators = driver()->operators();
+
+ VLOG(5) << "operators.size()=" << operators.size()
+ << ", operatorId=" << operatorId();
+
+ for (auto i = operatorId() - 1; i >= 0; --i) {
+ auto metricValueStr = operators[i]->getRuntimeMetric(
+ OperatorMetricKey::kCanUsedToEstimateHashBuildPartitionNum, "false");
+      auto metricValue = folly::to<bool>(metricValueStr);
+
+ VLOG(5) << "OperatorIndex=" << i << ", operator is "
+ << operators[i]->toString()
+ << ", kCanUsedToEstimateHashBuildPartitionNum="
+ << (metricValue ? "true" : "false");
+
+      if (metricValue) {
+        auto totalRowCountStr =
+            operators[i]->getRuntimeMetric(OperatorMetricKey::kTotalRowCount);
+
+        BOLT_CHECK_NE(totalRowCountStr, "", "totalRowCountStr can't be empty");
+
+        totalRowCount = folly::to<uint64_t>(totalRowCountStr);
+
+ LOG(INFO) << toString() << " totalRowCountStr = " << totalRowCountStr
+ << ", numDrivers = " << numDrivers;
+ break;
+ }
+ }
+ }
+}
+
void OperatorCtx::adjustSpillCompressionKind(
common::SpillConfig*& spillConfig) {
if (!isFirstSpill_) {
diff --git a/bolt/exec/Operator.h b/bolt/exec/Operator.h
index f13a5562..a07f14b5 100644
--- a/bolt/exec/Operator.h
+++ b/bolt/exec/Operator.h
@@ -121,6 +121,8 @@ class OperatorCtx {
uint64_t& totalRowCount,
uint64_t& processedRowCount);
+ void traverseOpToGetRowCount(uint64_t& totalRowCount) const;
+
/// adjust SpillCompressionKind if estimatedSpillSize too large
void adjustSpillCompressionKind(common::SpillConfig*& spillConfig);
diff --git a/bolt/exec/RowContainer.cpp b/bolt/exec/RowContainer.cpp
index 8ee26890..292c0d88 100644
--- a/bolt/exec/RowContainer.cpp
+++ b/bolt/exec/RowContainer.cpp
@@ -338,6 +338,110 @@ RowContainer::~RowContainer() {
clear();
}
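+// Creates a new RowContainer with the same layout and copies the given rows
+// into it in order. Fixed-width columns are copied byte for byte; out-of-line
+// strings and complex values are re-copied into the new container's string
+// allocator, and serializable accumulators are round-tripped through their
+// serialized form.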
+std::unique_ptr<RowContainer> RowContainer::cloneByOrder(
+    const std::vector<char*>& sortedRows,
+    memory::MemoryPool* pool,
+    std::shared_ptr<HashStringAllocator> stringAllocator) {
+  std::vector<TypePtr> dependentTypes;
+ if (types_.size() > keyTypes_.size()) {
+ dependentTypes.reserve(types_.size() - keyTypes_.size());
+ for (size_t i = keyTypes_.size(); i < types_.size(); ++i) {
+ dependentTypes.push_back(types_[i]);
+ }
+ }
+
+  auto newContainer = std::make_unique<RowContainer>(
+ keyTypes_,
+ nullableKeys_,
+ accumulators_,
+ dependentTypes,
+ nextOffset_ != 0,
+ isJoinBuild_,
+ probedFlagOffset_ != 0,
+ hasNormalizedKeys_,
+ pool == nullptr ? rows_.pool() : pool,
+      stringAllocator == nullptr
+          ? (pool == nullptr
+                 ? std::make_shared<HashStringAllocator>(rows_.pool())
+                 : std::make_shared<HashStringAllocator>(pool))
+          : stringAllocator);
+ if (hasVariableAccumulator_) {
+ BOLT_CHECK(
+ !usesExternalMemory_,
+ "Direct copy with external memory accumulators is not fully supported in this optimized path.");
+ }
+ auto& targetStringAllocator = newContainer->stringAllocator();
+
+ for (char* sourceRow : sortedRows) {
+ BOLT_CHECK_NOT_NULL(sourceRow, "Source row cannot be null");
+ char* targetRow = newContainer->newRow();
+
+ ::memcpy(targetRow, sourceRow, fixedRowSize_);
+
+ if (normalizedKeySize_ > 0) {
+ RowContainer::normalizedKey(targetRow) =
+ RowContainer::normalizedKey(sourceRow);
+ }
+
+ bits::clearBit(targetRow, newContainer->freeFlagOffset_);
+
+    if (nextOffset_ != 0) {
+      *reinterpret_cast<char**>(targetRow + nextOffset_) = nullptr;
+    }
+
+    if (rowSizeOffset_ != 0) {
+      *reinterpret_cast<uint32_t*>(targetRow + rowSizeOffset_) = 0;
+    }
+
+ for (int i = 0; i < types_.size(); ++i) {
+ if (types_[i]->isFixedWidth()) {
+ continue;
+ }
+
+ auto col = rowColumns_[i];
+ if (isNullAt(sourceRow, col)) {
+ continue;
+ }
+
+ auto typeKind = types_[i]->kind();
+
+ if (typeKind == TypeKind::ROW || typeKind == TypeKind::ARRAY ||
+ typeKind == TypeKind::MAP) {
+        auto sourceView = valueAt<StringView>(sourceRow, col.offset());
+ if (!sourceView.empty()) {
+ RowSizeTracker tracker(
+ targetRow[rowSizeOffset_], targetStringAllocator);
+ targetStringAllocator.copyMultipart(
+ StringView(sourceView.data(), sourceView.size()),
+ targetRow,
+ col.offset());
+ }
+ } else if (
+ typeKind == TypeKind::VARCHAR || typeKind == TypeKind::VARBINARY) {
+        StringView sourceView = valueAt<StringView>(sourceRow, col.offset());
+ if (!sourceView.isInline()) {
+ RowSizeTracker tracker(
+ targetRow[rowSizeOffset_], targetStringAllocator);
+ targetStringAllocator.copyMultipart(
+ sourceView, targetRow, col.offset());
+ }
+ }
+ }
+
+ for (const auto& accumulator : accumulators_) {
+ if (accumulator.serializable()) {
+ uint32_t serializeSize = accumulator.getSerializeSize(sourceRow);
+ if (serializeSize > 0) {
+          std::vector<char> buffer(serializeSize);
+ accumulator.serializeAccumulator(sourceRow, buffer.data());
+ accumulator.deserializeAccumulator(targetRow, buffer.data());
+ }
+ }
+ }
+ }
+
+ return newContainer;
+}
+
char* RowContainer::newRow() {
BOLT_DCHECK(mutable_, "Can't add row into an immutable row container");
++numRows_;
diff --git a/bolt/exec/RowContainer.h b/bolt/exec/RowContainer.h
index fa097dfa..ce7cd80b 100644
--- a/bolt/exec/RowContainer.h
+++ b/bolt/exec/RowContainer.h
@@ -235,6 +235,11 @@ class RowContainer {
~RowContainer();
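+  /// Creates a new RowContainer with the same layout holding copies of 'rows'
+  /// in the given order. Uses this container's memory pool and a fresh string
+  /// allocator unless 'pool' and 'stringAllocator' are provided.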
+  std::unique_ptr<RowContainer> cloneByOrder(
+      const std::vector<char*>& rows,
+      memory::MemoryPool* pool = nullptr,
+      std::shared_ptr<HashStringAllocator> stringAllocator = nullptr);
+
static int32_t combineAlignments(int32_t a, int32_t b);
/// 'keyTypes' gives the type of the key of each row. For a group by,
diff --git a/bolt/exec/tests/HashTableTest.cpp b/bolt/exec/tests/HashTableTest.cpp
index fb4a3fc1..633f845b 100644
--- a/bolt/exec/tests/HashTableTest.cpp
+++ b/bolt/exec/tests/HashTableTest.cpp
@@ -1230,4 +1230,73 @@ TEST(HashTableTest, tableInsertPartitionInfo) {
ASSERT_EQ(overflows[i], info.overflows[i]);
}
}
+
+TEST_P(HashTableTest, reclusterDataByKey) {
+  std::vector<TypePtr> dependentTypes = {BIGINT()};
+  std::vector<std::unique_ptr<VectorHasher>> hashers;
+  hashers.emplace_back(std::make_unique<VectorHasher>(BIGINT(), 0));
+
+ HashTableReclusterConfig config;
+ // Reusing 'enableRunParallel' to test different recluster modes (kSort vs
+ // kHash).
+ config.reclusterMode = GetParam().enableRunParallel
+ ? HashTableReclusterConfig::ReclusterMode::kSort
+ : HashTableReclusterConfig::ReclusterMode::kHash;
+ config.enableArrayRecluster = true;
+ config.duplicateRatioThreshold = 0;
+ config.minDistinctRowNumber = 0;
+
+  auto hashTable = HashTable<true>::createForJoin(
+ std::move(hashers),
+ dependentTypes,
+ true /*allowDuplicates*/,
+ false /*hasProbedFlag*/,
+ BaseHashTable::HashMode::kArray,
+ 1 /*minTableSizeForParallelJoinBuild*/,
+ pool(),
+ GetParam().jitRowEqVectors,
+ config);
+ const int32_t numRows = 4096;
+ const int32_t numDistinctKeys = 16;
+  auto keyVector = makeFlatVector<int64_t>(
+      numRows, [](auto row) { return (row % numDistinctKeys) + 1; });
+  auto payloadVector =
+      makeFlatVector<int64_t>(numRows, [](auto row) { return row * 10; });
+
+ auto batch = makeRowVector({keyVector, payloadVector});
+ copyVectorsToTable({batch}, 0, hashTable.get());
+ hashTable->prepareJoinTable({}, executor_.get());
+ auto& rows = *hashTable->rows();
+ EXPECT_EQ(rows.numRows(), numRows);
+
+ hashTable->reclusterDataByKey();
+
+ auto& newRows = *hashTable->rows();
+ EXPECT_EQ(newRows.numRows(), numRows);
+
+ int32_t keyOffset = newRows.columnAt(0).offset();
+ RowContainerIterator iter;
+  std::vector<int64_t> resultKeys;
+ char* rowPtrs[1];
+
+ while (newRows.listRows(&iter, 1, rowPtrs) > 0) {
+ char* row = rowPtrs[0];
+    resultKeys.push_back(newRows.valueAt<int64_t>(row, keyOffset));
+ }
+
+ ASSERT_EQ(resultKeys.size(), numRows);
+  std::unordered_set<int64_t> seenKeys(resultKeys.begin(), resultKeys.end());
+ EXPECT_EQ(seenKeys.size(), numDistinctKeys);
+ int changeCount = 0;
+ for (size_t i = 1; i < resultKeys.size(); ++i) {
+ if (resultKeys[i] != resultKeys[i - 1]) {
+ changeCount++;
+ }
+ }
+ EXPECT_EQ(changeCount, numDistinctKeys - 1)
+ << "Data should be clustered into " << numDistinctKeys
+ << " groups, so there should be exactly " << numDistinctKeys - 1
+ << " transitions.";
+}
+
} // namespace bytedance::bolt::exec::test
diff --git a/bolt/exec/tests/RowContainerTest.cpp b/bolt/exec/tests/RowContainerTest.cpp
index b8d6f55f..71630626 100644
--- a/bolt/exec/tests/RowContainerTest.cpp
+++ b/bolt/exec/tests/RowContainerTest.cpp
@@ -1747,6 +1747,68 @@ DEBUG_ONLY_TEST_F(RowContainerTest, eraseAfterOomStoringString) {
rowContainer->eraseRows(folly::Range(rows.data(), numRows));
}
+TEST_F(RowContainerTest, cloneByOrder) {
+  std::vector<TypePtr> keyTypes = {BIGINT()};
+  std::vector<TypePtr> dependentTypes = {VARCHAR()};
+
+ auto data = makeRowContainer(keyTypes, dependentTypes);
+ auto keyOffset = data->columnAt(0).offset();
+ auto storeString = [&](char* row, const std::string& s) {
+    auto vector = makeFlatVector<StringView>({StringView(s)});
+ DecodedVector decoded(*vector);
+ data->store(decoded, 0, row, 1);
+ };
+ // Insert 3 rows: (1, "a"), (2, "b"), (3, "c")
+ // Row 1
+ auto* row1 = data->newRow();
+  data->valueAt<int64_t>(row1, keyOffset) = 1;
+ std::string str1 = "value_a_long_string";
+ storeString(row1, str1);
+
+ // Row 2
+ auto* row2 = data->newRow();
+  data->valueAt<int64_t>(row2, keyOffset) = 2;
+ std::string str2 = "value_b";
+ storeString(row2, str2);
+
+ // Row 3
+ auto* row3 = data->newRow();
+  data->valueAt<int64_t>(row3, keyOffset) = 3;
+ std::string str3 = "value_c";
+ storeString(row3, str3);
+
+ EXPECT_EQ(data->numRows(), 3);
+
+ // Create a vector of rows in reverse order: 3, 2, 1
+ std::vector sortedRows = {row3, row2, row1};
+
+ auto newData = data->cloneByOrder(sortedRows);
+
+ EXPECT_EQ(newData->numRows(), 3);
+ EXPECT_NE(newData.get(), data.get());
+
+ int64_t expectedKeys[] = {3, 2, 1};
+ std::string expectedVals[] = {str3, str2, str1};
+
+  std::vector<char*> clonedRows;
+ clonedRows.resize(3);
+ RowContainerIterator iter;
+ auto numRows = newData->listRows(&iter, 3, clonedRows.data());
+ EXPECT_EQ(numRows, 3);
+ int counter = 0;
+ auto newKeyOffset = newData->columnAt(0).offset();
+ auto newStringOffset = newData->columnAt(1).offset();
+
+ for (auto row : clonedRows) {
+    int64_t key = newData->valueAt<int64_t>(row, newKeyOffset);
+ EXPECT_EQ(key, expectedKeys[counter]);
+
+    auto val = newData->valueAt<StringView>(row, newStringOffset);
+ EXPECT_EQ(std::string(val), expectedVals[counter]);
+ ++counter;
+ }
+}
+
TEST_F(RowContainerTest, DISABLED_ConvertBenchmark) {
VectorFuzzer fuzzer(
{.vectorSize = 100000, .nullRatio = 0.1, .containerLength = 10}, pool());