Skip to content

Commit 656ce28

Browse files
authored
Merge branch 'main' into full-text-seach
2 parents 5457513 + 9f513f1 commit 656ce28

62 files changed

Lines changed: 1852 additions & 333 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

build_and_package.sh

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ set -euo pipefail
1818

1919
SOURCE_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
2020
OUTPUT_DIR="$SOURCE_ROOT/output"
21-
BUILD_TYPE="release"
21+
BUILD_TYPE="Release"
22+
BUILD_DIR="$SOURCE_ROOT/build-release"
2223
BUILD_NAME="paimon-cpp"
2324
MAKE_CLEAN=false
2425
PACKAGE=false
25-
CMAKE_OPTIONS=""
26+
CMAKE_OPTIONS=()
27+
JOBS=""
2628

2729
show_help() {
2830
cat << EOF
@@ -33,15 +35,16 @@ Options:
3335
-d, --debug Build debug version
3436
-c, --clean Clean build directory before building
3537
-p, --package Package creation
38+
-j, --jobs <num> Number of parallel jobs for building (default: auto-detect)
3639
-h, --help Show this help message
3740
3841
CMake Options:
3942
Any unrecognized options will be passed directly to CMake.
4043
You can specify multiple CMake options.
4144
4245
Examples:
43-
$0 -r -p -DPAIMON_BUILD_SHARED=ON -DPAIMON_BUILD_STATIC=OFF
44-
$0 --debug --clean --package
46+
$0 -r -p -j 8 -DPAIMON_BUILD_SHARED=ON -DPAIMON_BUILD_STATIC=OFF
47+
$0 --debug --clean --package --jobs 4
4548
4649
EOF
4750
}
@@ -50,11 +53,14 @@ while [[ $# -gt 0 ]]; do
5053
case "$1" in
5154
-r|--release)
5255
BUILD_TYPE="Release"
56+
BUILD_DIR="$SOURCE_ROOT/build-release"
57+
BUILD_NAME="paimon-cpp"
5358
shift
5459
;;
5560
-d|--debug)
5661
BUILD_TYPE="Debug"
57-
BUILD_NAME=$BUILD_NAME"-debug"
62+
BUILD_DIR="$SOURCE_ROOT/build-debug"
63+
BUILD_NAME="paimon-cpp-debug"
5864
shift
5965
;;
6066
-c|--clean)
@@ -65,27 +71,40 @@ while [[ $# -gt 0 ]]; do
6571
PACKAGE=true
6672
shift
6773
;;
74+
-j|--jobs)
75+
shift
76+
if [[ $# -gt 0 && $1 =~ ^[0-9]+$ ]]; then
77+
JOBS="$1"
78+
shift
79+
else
80+
echo "Error: -j/--jobs requires a numeric argument" >&2
81+
exit 1
82+
fi
83+
;;
6884
-h|--help)
6985
show_help
7086
exit 0
7187
;;
7288
*)
7389
# All remaining parameters are CMake options
74-
CMAKE_OPTIONS="$CMAKE_OPTIONS $1"
90+
CMAKE_OPTIONS+=("$1")
7591
shift
7692
;;
7793
esac
7894
done
7995

80-
CMAKE_OPTIONS=$(echo "$CMAKE_OPTIONS" | xargs)
81-
8296
echo "========== Build Configuration =========="
8397
echo "Build Type: $BUILD_TYPE"
8498
echo "Package Name: $BUILD_NAME"
8599
echo "Clean Build: $MAKE_CLEAN"
86100
echo "Package: $PACKAGE"
87-
if [ -n "$CMAKE_OPTIONS" ]; then
88-
echo "CMake Options: $CMAKE_OPTIONS"
101+
if [ -n "$JOBS" ]; then
102+
echo "Parallel Jobs: $JOBS"
103+
else
104+
echo "Parallel Jobs: auto-detect"
105+
fi
106+
if [ ${#CMAKE_OPTIONS[@]} -gt 0 ]; then
107+
echo "CMake Options: ${CMAKE_OPTIONS[*]}"
89108
else
90109
echo "CMake Options: None"
91110
fi
@@ -95,33 +114,39 @@ echo "Step 1: Downloading dependencies..."
95114
"$SOURCE_ROOT"/third_party/download_dependencies.sh
96115

97116
echo "Step 2: Building Paimon..."
98-
BUILD_DIR="$SOURCE_ROOT/build-$BUILD_TYPE"
99117
PACKAGE_DIR="$OUTPUT_DIR/$BUILD_NAME"
100118

101119
if [ "$MAKE_CLEAN" = true ]; then
120+
echo "Cleaning build directory: $BUILD_DIR"
102121
rm -rf "$BUILD_DIR"
103122
fi
104123
mkdir -p "$BUILD_DIR"
105124
cd "$BUILD_DIR"
125+
106126
CMAKE_ARGS=(
127+
-G "Ninja"
107128
-DCMAKE_BUILD_TYPE="$BUILD_TYPE"
108129
-DCMAKE_INSTALL_PREFIX="$PACKAGE_DIR"
109130
)
110131

111-
if [ -n "$CMAKE_OPTIONS" ]; then
112-
CMAKE_ARGS+=("$CMAKE_OPTIONS")
132+
if [ ${#CMAKE_OPTIONS[@]} -gt 0 ]; then
133+
CMAKE_ARGS+=("${CMAKE_OPTIONS[@]}")
113134
fi
114135

115136
cmake "${CMAKE_ARGS[@]}" ..
116137

117-
JOBS=$(nproc 2>/dev/null || echo 4)
118-
make -j"$JOBS"
138+
# Set default JOBS if not specified
139+
if [ -z "$JOBS" ]; then
140+
JOBS=$(nproc 2>/dev/null || echo 4)
141+
fi
142+
143+
ninja -j"$JOBS"
119144

120145
if [ "$PACKAGE" = true ]; then
121146
echo "Step 3: Packaging..."
122147
mkdir -p "$OUTPUT_DIR"
123148
cd "$BUILD_DIR"
124-
make install
149+
ninja install
125150
tar -czvf "$OUTPUT_DIR/$BUILD_NAME.tar.gz" -C "$OUTPUT_DIR" "$BUILD_NAME"
126151
echo "Package created: $OUTPUT_DIR/$BUILD_NAME.tar.gz"
127152
else

cmake_modules/orc.diff

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,20 @@ index a914e5f26..efe1d4933 100644
3030
};
3131

3232
// Specializations for char
33+
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
34+
index b015b6491..585e50ec5 100644
35+
--- a/c++/include/orc/Reader.hh
36+
+++ b/c++/include/orc/Reader.hh
37+
@@ -659,6 +659,9 @@ namespace orc {
38+
virtual void preBuffer(const std::vector<uint32_t>& stripes,
39+
const std::list<uint64_t>& includeTypes) = 0;
40+
41+
+ virtual std::vector<std::pair<uint64_t, uint64_t>> preBufferRange(
42+
+ const std::vector<uint32_t>& stripes, const std::list<uint64_t>& includeTypes) = 0;
43+
+
44+
/**
45+
* Release cached entries whose right boundary is less than or equal to the given boundary.
46+
* @param boundary the boundary value to release cache entries
3347
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
3448
index af434c37c..08393259c 100644
3549
--- a/c++/src/ColumnReader.cc
@@ -328,6 +342,86 @@ index ed7fee737..a8ee8a67c 100644
328342
reserve(newSize);
329343
if (newSize > currentSize_) {
330344
memset(buf_ + currentSize_, 0, newSize - currentSize_);
345+
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
346+
index c93c62f6c..2a821b622 100644
347+
--- a/c++/src/Reader.cc
348+
+++ b/c++/src/Reader.cc
349+
@@ -1531,8 +1531,8 @@ namespace orc {
350+
}
351+
}
352+
353+
- void ReaderImpl::preBuffer(const std::vector<uint32_t>& stripes,
354+
- const std::list<uint64_t>& includeTypes) {
355+
+ std::vector<std::pair<uint64_t, uint64_t>> ReaderImpl::preBufferRange(
356+
+ const std::vector<uint32_t>& stripes, const std::list<uint64_t>& includeTypes) {
357+
std::vector<uint32_t> newStripes;
358+
for (auto stripe : stripes) {
359+
if (stripe < static_cast<uint32_t>(footer_->stripes_size())) newStripes.push_back(stripe);
360+
@@ -1544,7 +1544,7 @@ namespace orc {
361+
}
362+
363+
if (newStripes.empty() || newIncludeTypes.empty()) {
364+
- return;
365+
+ return {};
366+
}
367+
368+
orc::RowReaderOptions rowReaderOptions;
369+
@@ -1553,7 +1553,7 @@ namespace orc {
370+
std::vector<bool> selectedColumns;
371+
columnSelector.updateSelected(selectedColumns, rowReaderOptions);
372+
373+
- std::vector<ReadRange> ranges;
374+
+ std::vector<std::pair<uint64_t, uint64_t>> ranges;
375+
ranges.reserve(newIncludeTypes.size());
376+
for (auto stripe : newStripes) {
377+
// get stripe information
378+
@@ -1598,17 +1598,23 @@ namespace orc {
379+
380+
offset += stream.length();
381+
}
382+
+ }
383+
+ return ranges;
384+
+ }
385+
386+
- {
387+
- std::lock_guard<std::mutex> lock(contents_->readCacheMutex);
388+
-
389+
- if (!contents_->readCache) {
390+
- contents_->readCache = std::make_shared<ReadRangeCache>(
391+
- getStream(), options_.getCacheOptions(), contents_->pool, contents_->readerMetrics);
392+
- }
393+
- contents_->readCache->cache(std::move(ranges));
394+
- }
395+
+ void ReaderImpl::preBuffer(const std::vector<uint32_t>& stripes,
396+
+ const std::list<uint64_t>& includeTypes) {
397+
+ auto ranges = preBufferRange(stripes, includeTypes);
398+
+ std::vector<ReadRange> read_ranges;
399+
+ for (const auto& range : ranges) {
400+
+ read_ranges.emplace_back(range.first, range.second);
401+
+ }
402+
+ std::lock_guard<std::mutex> lock(contents_->readCacheMutex);
403+
+ if (!contents_->readCache) {
404+
+ contents_->readCache = std::make_shared<ReadRangeCache>(
405+
+ getStream(), options_.getCacheOptions(), contents_->pool, contents_->readerMetrics);
406+
}
407+
+ contents_->readCache->cache(std::move(read_ranges));
408+
}
409+
410+
RowReader::~RowReader() {
411+
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
412+
index 39ca73967..13da45a49 100644
413+
--- a/c++/src/Reader.hh
414+
+++ b/c++/src/Reader.hh
415+
@@ -387,6 +387,9 @@ namespace orc {
416+
std::map<uint32_t, BloomFilterIndex> getBloomFilters(
417+
uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
418+
419+
+ std::vector<std::pair<uint64_t, uint64_t>> preBufferRange(
420+
+ const std::vector<uint32_t>& stripes, const std::list<uint64_t>& includeTypes) override;
421+
+
422+
void preBuffer(const std::vector<uint32_t>& stripes,
423+
const std::list<uint64_t>& includeTypes) override;
424+
void releaseBuffer(uint64_t boundary) override;
331425
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
332426
index 9b2c829c7..434841224 100644
333427
--- a/cmake_modules/ThirdpartyToolchain.cmake

include/paimon/catalog/catalog.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,13 @@ class PAIMON_EXPORT Catalog {
4646
///
4747
/// @param root_path Path to the root directory where the catalog is located.
4848
/// @param options Configuration options for catalog initialization.
49+
/// @param file_system Specifies the file system for file operations.
50+
/// If not set, use default file system (configured in
51+
/// `Options::FILE_SYSTEM`)
4952
/// @return A result containing a unique pointer to a `Catalog` instance, or an error status.
5053
static Result<std::unique_ptr<Catalog>> Create(
51-
const std::string& root_path, const std::map<std::string, std::string>& options);
54+
const std::string& root_path, const std::map<std::string, std::string>& options,
55+
const std::shared_ptr<FileSystem>& file_system = nullptr);
5256

5357
virtual ~Catalog() = default;
5458

@@ -124,6 +128,16 @@ class PAIMON_EXPORT Catalog {
124128
/// @return A string representing the expected location of the table.
125129
virtual std::string GetTableLocation(const Identifier& identifier) const = 0;
126130

131+
/// Returns the root path of the catalog.
132+
///
133+
/// @return A string representing the root path of the catalog.
134+
virtual std::string GetRootPath() const = 0;
135+
136+
/// Returns the file system used by the catalog.
137+
///
138+
/// @return A shared pointer to the file system instance.
139+
virtual std::shared_ptr<FileSystem> GetFileSystem() const = 0;
140+
127141
/// Loads the latest schema of a specified table.
128142
///
129143
/// @note System tables will not be supported.

include/paimon/commit_context.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class PAIMON_EXPORT CommitContext {
3939
bool ignore_empty_commit, bool use_rest_catalog_commit,
4040
const std::shared_ptr<MemoryPool>& memory_pool,
4141
const std::shared_ptr<Executor>& executor,
42+
const std::shared_ptr<FileSystem>& specific_file_system,
4243
const std::map<std::string, std::string>& options);
4344
~CommitContext();
4445

@@ -66,6 +67,10 @@ class PAIMON_EXPORT CommitContext {
6667
return executor_;
6768
}
6869

70+
std::shared_ptr<FileSystem> GetSpecificFileSystem() const {
71+
return specific_file_system_;
72+
}
73+
6974
const std::map<std::string, std::string>& GetOptions() const {
7075
return options_;
7176
}
@@ -77,6 +82,7 @@ class PAIMON_EXPORT CommitContext {
7782
bool use_rest_catalog_commit_;
7883
std::shared_ptr<MemoryPool> memory_pool_;
7984
std::shared_ptr<Executor> executor_;
85+
std::shared_ptr<FileSystem> specific_file_system_;
8086
std::map<std::string, std::string> options_;
8187
};
8288

@@ -129,6 +135,15 @@ class PAIMON_EXPORT CommitContextBuilder {
129135
/// @return Reference to this builder for method chaining.
130136
CommitContextBuilder& WithExecutor(const std::shared_ptr<Executor>& executor);
131137

138+
/// Sets a custom file system instance to be used for all file operations in this commit
139+
/// context.
140+
/// This bypasses the global file system registry and uses the provided implementation directly.
141+
///
142+
/// @param file_system The file system to use.
143+
/// @return Reference to this builder for method chaining.
144+
/// @note If not set, use default file system (configured in `Options::FILE_SYSTEM`)
145+
CommitContextBuilder& WithFileSystem(const std::shared_ptr<FileSystem>& file_system);
146+
132147
/// Build and return a `CommitContext` instance with input validation.
133148
/// @return Result containing the constructed `CommitContext` or an error status.
134149
Result<std::unique_ptr<CommitContext>> Finish();

include/paimon/global_index/global_index_write_task.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,16 @@ class PAIMON_EXPORT GlobalIndexWriteTask {
4444
/// @param options Index-specific configuration (e.g., false positive rate for bloom
4545
/// filters).
4646
/// @param pool Memory pool for temporary allocations during index construction.
47-
// If `nullptr`, the system's default memory pool will be used.
47+
/// If `nullptr`, the system's default memory pool will be used.
48+
/// @param file_system Specifies the file system for file operations.
49+
/// If `nullptr`, use default file system.
4850
/// @return A `Result` containing a shared pointer to the `CommitMessage` with index metadata,
4951
/// or an error if indexing fails (e.g., unsupported type, I/O error).
5052
static Result<std::shared_ptr<CommitMessage>> WriteIndex(
5153
const std::string& table_path, const std::string& field_name, const std::string& index_type,
5254
const std::shared_ptr<IndexedSplit>& indexed_split,
53-
const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool);
55+
const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool,
56+
const std::shared_ptr<FileSystem>& file_system = nullptr);
5457
};
5558

5659
} // namespace paimon

include/paimon/migrate/file_meta_utils.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,15 @@ class PAIMON_EXPORT FileMetaUtils {
5656
/// tables. Use empty map for non-partitioned tables.
5757
/// @param options Set a configuration options map to set some option entries which are not
5858
/// defined in the table schema or whose values you want to overwrite.
59+
/// @param file_system Specifies the file system for file operations.
60+
/// If `nullptr`, use default file system.
5961
/// @return Result containing a unique pointer to the generated `CommitMessage`,
6062
/// or an error status if the migration cannot be performed.
6163
static Result<std::unique_ptr<CommitMessage>> GenerateCommitMessage(
6264
const std::vector<std::string>& src_data_files, const std::string& dst_table_path,
6365
const std::map<std::string, std::string>& partition_values,
64-
const std::map<std::string, std::string>& options);
66+
const std::map<std::string, std::string>& options,
67+
const std::shared_ptr<FileSystem>& file_system = nullptr);
6568
};
6669

6770
} // namespace paimon

0 commit comments

Comments
 (0)