Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions src/paimon/core/catalog/file_system_catalog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "paimon/common/utils/arrow/status_utils.h"
#include "paimon/common/utils/path_util.h"
#include "paimon/common/utils/string_utils.h"
#include "paimon/core/core_options.h"
#include "paimon/core/snapshot.h"
#include "paimon/core/table/system/system_table.h"
#include "paimon/core/table/system/system_table_schema.h"
Expand Down Expand Up @@ -338,16 +339,18 @@ Result<std::vector<std::string>> FileSystemCatalog::GetSchemaExternalPaths(
std::set<std::string> external_paths_set;
for (const auto& schema : schemas) {
const auto& options = schema->Options();
auto iter = options.find(Options::DATA_FILE_EXTERNAL_PATHS);
if (iter != options.end() && !iter->second.empty()) {
auto paths = StringUtils::Split(iter->second, ",", /*ignore_empty=*/true);
for (const auto& path : paths) {
std::string trimmed_path = path;
StringUtils::Trim(&trimmed_path);
if (!trimmed_path.empty()) {
external_paths_set.insert(trimmed_path);
}
}
PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CoreOptions::FromMap(options));
// collect external data file path
PAIMON_ASSIGN_OR_RAISE(std::vector<std::string> data_external_paths,
core_options.CreateExternalPaths());
for (const auto& path : data_external_paths) {
external_paths_set.insert(path);
}
// collect external global index path
PAIMON_ASSIGN_OR_RAISE(std::optional<std::string> index_external_path,
core_options.CreateGlobalIndexExternalPath());
if (index_external_path != std::nullopt) {
external_paths_set.insert(index_external_path.value());
}
}
return std::vector<std::string>(external_paths_set.begin(), external_paths_set.end());
Expand Down
134 changes: 134 additions & 0 deletions src/paimon/core/catalog/file_system_catalog_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,7 @@ TEST(FileSystemCatalogTest, TestDropTableWithExternalPath) {
auto external_dir = UniqueTestDirectory::Create();
ASSERT_TRUE(external_dir);
std::string external_path = external_dir->Str();
external_path = "FILE://" + external_path;

// Create a file in external path to simulate external data
ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {}));
Expand All @@ -665,6 +666,7 @@ TEST(FileSystemCatalogTest, TestDropTableWithExternalPath) {

std::map<std::string, std::string> table_options = options;
table_options[Options::DATA_FILE_EXTERNAL_PATHS] = external_path;
table_options[Options::DATA_FILE_EXTERNAL_PATHS_STRATEGY] = "round-robin";

ASSERT_OK(catalog.CreateTable(Identifier("test_db", "tbl_with_external"), &schema, {}, {},
table_options, false));
Expand Down Expand Up @@ -707,7 +709,9 @@ TEST(FileSystemCatalogTest, TestDropTableWithMultipleExternalPaths) {
ASSERT_TRUE(external_dir2);

std::string external_path1 = external_dir1->Str();
external_path1 = "FILE://" + external_path1;
std::string external_path2 = external_dir2->Str();
external_path2 = "FILE://" + external_path2;

// Create files in external paths
ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {}));
Expand All @@ -727,6 +731,7 @@ TEST(FileSystemCatalogTest, TestDropTableWithMultipleExternalPaths) {

std::map<std::string, std::string> table_options = options;
table_options[Options::DATA_FILE_EXTERNAL_PATHS] = external_path1 + "," + external_path2;
table_options[Options::DATA_FILE_EXTERNAL_PATHS_STRATEGY] = "round-robin";

ASSERT_OK(catalog.CreateTable(Identifier("test_db", "tbl_multi_external"), &schema, {}, {},
table_options, false));
Expand All @@ -744,6 +749,133 @@ TEST(FileSystemCatalogTest, TestDropTableWithMultipleExternalPaths) {
ArrowSchemaRelease(&schema);
}

TEST(FileSystemCatalogTest, TestDropTableWithGlobalIndexExternalPathOnMainBranch) {
std::map<std::string, std::string> options;
options[Options::FILE_SYSTEM] = "local";
options[Options::FILE_FORMAT] = "orc";
ASSERT_OK_AND_ASSIGN(auto core_options, CoreOptions::FromMap(options));
auto dir = UniqueTestDirectory::Create();
ASSERT_TRUE(dir);
FileSystemCatalog catalog(core_options.GetFileSystem(), dir->Str());
ASSERT_OK(catalog.CreateDatabase("test_db", options, /*ignore_if_exists=*/false));

// Create external path directory for global index
auto global_index_dir = UniqueTestDirectory::Create();
ASSERT_TRUE(global_index_dir);
std::string global_index_path = global_index_dir->Str();
global_index_path = "FILE://" + global_index_path;

// Create a file in external path to simulate global index data
ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {}));
std::string index_file = PathUtil::JoinPath(global_index_path, "index-file.bin");
ASSERT_OK(fs->WriteFile(index_file, "index data", /*overwrite=*/true));

// Verify index file exists
ASSERT_OK_AND_ASSIGN(bool index_file_exists, fs->Exists(index_file));
ASSERT_TRUE(index_file_exists);

// Create table with global index external path on main branch
arrow::FieldVector fields = {
arrow::field("f0", arrow::int32()),
arrow::field("f1", arrow::utf8()),
};
arrow::Schema typed_schema(fields);
::ArrowSchema schema;
ASSERT_TRUE(arrow::ExportSchema(typed_schema, &schema).ok());

std::map<std::string, std::string> table_options = options;
table_options[Options::GLOBAL_INDEX_EXTERNAL_PATH] = global_index_path;

ASSERT_OK(catalog.CreateTable(Identifier("test_db", "tbl_global_index"), &schema, {}, {},
table_options, false));

// Verify table exists
ASSERT_OK_AND_ASSIGN(bool table_exists,
catalog.TableExists(Identifier("test_db", "tbl_global_index")));
ASSERT_TRUE(table_exists);

// Drop the table
ASSERT_OK(catalog.DropTable(Identifier("test_db", "tbl_global_index"),
/*ignore_if_not_exists=*/false));

// Verify table is dropped
ASSERT_OK_AND_ASSIGN(table_exists,
catalog.TableExists(Identifier("test_db", "tbl_global_index")));
ASSERT_FALSE(table_exists);

// Verify global index external path is also cleaned up
ASSERT_OK_AND_ASSIGN(bool global_index_exists, fs->Exists(global_index_path));
ASSERT_FALSE(global_index_exists);

ArrowSchemaRelease(&schema);
}

TEST(FileSystemCatalogTest, TestDropTableWithGlobalIndexExternalPathOnBranch) {
// Copy the real append_table_with_rt_branch.db test data, then patch the
// branch-rt schema-1 to include a GLOBAL_INDEX_EXTERNAL_PATH option.
// DropTable should discover and clean up that external path.
std::map<std::string, std::string> options;
options[Options::FILE_SYSTEM] = "local";
options[Options::FILE_FORMAT] = "orc";
ASSERT_OK_AND_ASSIGN(auto core_options, CoreOptions::FromMap(options));
auto dir = UniqueTestDirectory::Create();
ASSERT_TRUE(dir);

// Copy the real test_data db into a temp directory
std::string test_data_path = GetDataDir() + "/orc/append_table_with_rt_branch.db";
std::string db_path = dir->Str() + "/test_db.db";
ASSERT_TRUE(TestUtil::CopyDirectory(test_data_path, db_path));

// Create a temporary external directory that represents branch global index data
auto branch_external_dir = UniqueTestDirectory::Create();
ASSERT_TRUE(branch_external_dir);
std::string branch_external_path = branch_external_dir->Str();
branch_external_path = "FILE://" + branch_external_path;

ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {}));
ASSERT_OK(fs->WriteFile(PathUtil::JoinPath(branch_external_path, "index.bin"),
"branch global index data", /*overwrite=*/true));

// Patch branch-rt/schema/schema-1: add GLOBAL_INDEX_EXTERNAL_PATH to options
std::string branch_schema_path =
PathUtil::JoinPath(db_path, "append_table_with_rt_branch/branch/branch-rt/schema/schema-1");
std::string schema_content;
ASSERT_OK(fs->ReadFile(branch_schema_path, &schema_content));

// Insert the global index external path option into the JSON options block.
// Original: "file.format" : "orc"
// Patched: "file.format" : "orc",
// "global-index.external-path" : "<branch_external_path>"
std::string search_str = R"("file.format" : "orc")";
std::string replace_str = R"("file.format" : "orc",
"global-index.external-path" : ")" +
branch_external_path + R"(")";
auto pos = schema_content.rfind(search_str);
ASSERT_NE(pos, std::string::npos);
schema_content.replace(pos, search_str.length(), replace_str);
ASSERT_OK(fs->WriteFile(branch_schema_path, schema_content, /*overwrite=*/true));

// Verify external path exists before drop
ASSERT_OK_AND_ASSIGN(bool external_exists, fs->Exists(branch_external_path));
ASSERT_TRUE(external_exists);

// Drop the table via catalog
FileSystemCatalog catalog(core_options.GetFileSystem(), dir->Str());
Identifier identifier("test_db", "append_table_with_rt_branch");
ASSERT_OK_AND_ASSIGN(bool table_exists, catalog.TableExists(identifier));
ASSERT_TRUE(table_exists);

ASSERT_OK(catalog.DropTable(identifier, /*ignore_if_not_exists=*/false));

// Verify table is dropped
ASSERT_OK_AND_ASSIGN(table_exists, catalog.TableExists(identifier));
ASSERT_FALSE(table_exists);

// Verify the branch global index external path is cleaned up by DropTable
ASSERT_OK_AND_ASSIGN(external_exists, fs->Exists(branch_external_path));
ASSERT_FALSE(external_exists);
}

TEST(FileSystemCatalogTest, TestListSnapshots) {
std::map<std::string, std::string> options;
options[Options::FILE_SYSTEM] = "local";
Expand Down Expand Up @@ -817,6 +949,7 @@ TEST(FileSystemCatalogTest, TestDropTableWithBranchExternalPaths) {
auto branch_external_dir = UniqueTestDirectory::Create();
ASSERT_TRUE(branch_external_dir);
std::string branch_external_path = branch_external_dir->Str();
branch_external_path = "FILE://" + branch_external_path;

ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {}));
ASSERT_OK(fs->WriteFile(PathUtil::JoinPath(branch_external_path, "data.orc"),
Expand All @@ -834,6 +967,7 @@ TEST(FileSystemCatalogTest, TestDropTableWithBranchExternalPaths) {
// "data-file.external-paths" : "<branch_external_path>"
std::string search_str = R"("file.format" : "orc")";
std::string replace_str = R"("file.format" : "orc",
"data-file.external-paths.strategy" : "round-robin",
"data-file.external-paths" : ")" +
branch_external_path + R"(")";
auto pos = schema_content.rfind(search_str);
Expand Down
Loading