Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,8 @@ struct IsDeque_t<std::deque<T>> : std::true_type {};

void CheckForDuplicateSnapshotColumns(const ColumnNames_t &cols);

void CheckSnapshotOptionsFormatCompatibility(const ROOT::RDF::RSnapshotOptions &opts);

template <typename T>
struct InnerValueType {
using type = T; // fallback for when T is not a nested RVec
Expand Down
2 changes: 2 additions & 0 deletions tree/dataframe/inc/ROOT/RDF/RInterface.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -1376,6 +1376,8 @@ public:
}
};

RDFInternal::CheckSnapshotOptionsFormatCompatibility(options);

if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple) {
// The data source of the RNTuple resulting from the Snapshot action does not exist yet here, so we create one
// without a data source for now, and set it once the actual data source can be created (i.e., after
Expand Down
8 changes: 6 additions & 2 deletions tree/dataframe/inc/ROOT/RSnapshotOptions.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
#ifndef ROOT_RSNAPSHOTOPTIONS
#define ROOT_RSNAPSHOTOPTIONS

#include "ROOT/RNTupleWriteOptions.hxx"
#include <Compression.h>
#include <optional>

#include <string_view>
#include <string>

Expand All @@ -31,6 +32,7 @@ struct RSnapshotOptions {
RSnapshotOptions() = default;
RSnapshotOptions(std::string_view mode, ECAlgo comprAlgo, int comprLevel, int autoFlush, int splitLevel, bool lazy,
bool overwriteIfExists = false, bool vector2RVec = true, int basketSize = -1,
ROOT::RNTupleWriteOptions ntupleWriteOpts = ROOT::RNTupleWriteOptions(),
ESnapshotOutputFormat outputFormat = ESnapshotOutputFormat::kDefault)
: fMode(mode),
fCompressionAlgorithm(comprAlgo),
Expand All @@ -41,6 +43,7 @@ struct RSnapshotOptions {
fOverwriteIfExists(overwriteIfExists),
fVector2RVec(vector2RVec),
fBasketSize(basketSize),
fNTupleWriteOpts(ntupleWriteOpts),
fOutputFormat(outputFormat)
{
}
Expand All @@ -55,7 +58,8 @@ struct RSnapshotOptions {
bool fVector2RVec = true; ///< If set to true will convert std::vector columns to RVec when saving to disk
int fBasketSize = -1; ///< Set a custom basket size option. For more details, see
///< https://root.cern/manual/trees/#baskets-clusters-and-the-tree-header
ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to
ROOT::RNTupleWriteOptions fNTupleWriteOpts = ROOT::RNTupleWriteOptions(); ///< RNTuple-specific write options
ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to
};
} // namespace RDF
} // namespace ROOT
Expand Down
33 changes: 33 additions & 0 deletions tree/dataframe/src/RDFInterfaceUtils.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,39 @@ void CheckForDuplicateSnapshotColumns(const ColumnNames_t &cols)
}
}

/// Emit a warning for every option in \p opts that has no effect for the selected snapshot output format.
///
/// - Output format kTTree or kDefault (handled like kTTree here): warns if `fNTupleWriteOpts` differs from the
///   `fNTupleWriteOpts` of a default-constructed RSnapshotOptions.
/// - Output format kRNTuple: warns once for *each* of `fAutoFlush`, `fSplitLevel` and `fBasketSize` whose value
///   differs from that of a default-constructed RSnapshotOptions, so that a user who set several ignored options
///   learns about all of them in a single run.
///
/// \param[in] opts The snapshot options to inspect. They are only read, never modified.
void CheckSnapshotOptionsFormatCompatibility(const ROOT::RDF::RSnapshotOptions &opts)
{
   // Reference values: an option counts as "set" when it differs from a default-constructed RSnapshotOptions.
   const ROOT::RDF::RSnapshotOptions defaultSnapshotOpts;

   if ((opts.fOutputFormat == ROOT::RDF::ESnapshotOutputFormat::kTTree ||
        opts.fOutputFormat == ROOT::RDF::ESnapshotOutputFormat::kDefault) &&
       opts.fNTupleWriteOpts != defaultSnapshotOpts.fNTupleWriteOpts) {
      // NOTE(review): the message names "fNTupleWriteOptions" while the data member is called fNTupleWriteOpts.
      // The text is kept verbatim because the unit tests match this warning string exactly.
      Warning(
         "Snapshot",
         "The RNTuple-specific fNTupleWriteOptions option in RSnapshotOptions has been set, but the output format is "
         "set to TTree, so this option won't have any effect. Use the other options available in RSnapshotOptions to "
         "configure the output TTree. Alternatively, change fOutputFormat to snapshot to RNTuple instead.");
   } else if (opts.fOutputFormat == ROOT::RDF::ESnapshotOutputFormat::kRNTuple) {
      // Name of a TTree-specific option paired with whether the user changed it from its default.
      struct TreeOnlyOption {
         const char *fName;
         bool fIsSet;
      };

      const TreeOnlyOption treeOnlyOptions[] = {
         {"fAutoFlush", opts.fAutoFlush != defaultSnapshotOpts.fAutoFlush},
         {"fSplitLevel", opts.fSplitLevel != defaultSnapshotOpts.fSplitLevel},
         {"fBasketSize", opts.fBasketSize != defaultSnapshotOpts.fBasketSize},
      };

      // Report every ignored option, not just the first one encountered.
      for (const auto &option : treeOnlyOptions) {
         if (!option.fIsSet)
            continue;

         Warning(
            "Snapshot",
            "The TTree-specific %s option in RSnapshotOptions has been set, but the output format is set to RNTuple, "
            "so this option won't have any effect. Use the fNTupleWriteOptions option available in RSnapshotOptions to "
            "configure the output RNTuple. Alternatively, change fOutputFormat to snapshot to TTree instead.",
            option.fName);
      }
   }
}

/// Return copies of colsWithoutAliases and colsWithAliases with size branches for variable-sized array branches added
/// in the right positions (i.e. before the array branches that need them).
std::pair<std::vector<std::string>, std::vector<std::string>>
Expand Down
12 changes: 9 additions & 3 deletions tree/dataframe/src/RDFSnapshotHelpers.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -852,8 +852,14 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize()
}
model->Freeze();

ROOT::RNTupleWriteOptions writeOptions;
writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
// Propagate fOptions.fCompressionAlgorithm and fOptions.fCompressionLevel to fOptions.fNTupleWriteOpts *only* if the
// compression settings in fNTupleWriteOpts have not been changed, and the compression algorithm or level in fOptions
// have.
if (fOptions.fNTupleWriteOpts.GetCompression() == RCompressionSetting::EDefaults::kUseGeneralPurpose &&
(fOptions.fCompressionAlgorithm != RCompressionSetting::EAlgorithm::kZLIB ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that leaves open the possibility that a user explicitly sets fCompression... to what happens to be the current default, and is then surprised because RNTuple doesn't pick it up and continues to use its zstd default. I don't have a good solution though; maybe that's ok.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, good point. Perhaps to avoid situations like these it might be better to also warn users for fCompression..., and require them to be set through fNTupleWriteOpts only.

fOptions.fCompressionLevel != RCompressionSetting::ELevel::kDefaultZLIB)) {
fOptions.fNTupleWriteOpts.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
}

fOutputFile.reset(TFile::Open(fFileName.c_str(), fOptions.fMode.c_str()));
if (!fOutputFile)
Expand All @@ -871,7 +877,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize()

// The RNTupleParallelWriter has exclusive access to the underlying TFile, no further synchronization is needed for
// calls to Fill() (in Exec) and FlushCluster() (in FinalizeTask).
fWriter = ROOT::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions);
fWriter = ROOT::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, fOptions.fNTupleWriteOpts);
}

void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::InitTask(TTreeReader *, unsigned int slot)
Expand Down
12 changes: 12 additions & 0 deletions tree/dataframe/test/dataframe_snapshot.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,18 @@ TEST_F(RDFSnapshot, Snapshot_action_with_options)
test_snapshot_options(tdf);
}

// Changing an RNTuple-specific write option while leaving fOutputFormat untouched is expected to produce a
// "Snapshot" warning stating that the option has no effect for TTree output.
TEST_F(RDFSnapshot, Snapshot_action_warn_on_rntuple_specific_opts)
{
RSnapshotOptions opts;
// Modify a single RNTuple write option; every other snapshot option keeps its default-constructed value.
opts.fNTupleWriteOpts.SetApproxZippedClusterSize(64);

// The expected text must match the emitted warning exactly.
ROOT_EXPECT_WARNING(
tdf.Snapshot("ntuple", "snapshot_test_warn_on_rntuple_specific_opts.root", "", opts), "Snapshot",
"The RNTuple-specific fNTupleWriteOptions option in RSnapshotOptions has been set, but the output format is set "
"to TTree, so this option won't have any effect. Use the other options available in RSnapshotOptions to "
"configure the output TTree. Alternatively, change fOutputFormat to snapshot to RNTuple instead.");
}

void checkSnapshotArrayFile(RResultPtr<RInterface<RLoopManager>> &df, unsigned int kNEvents)
{
// fixedSizeArr and varSizeArr are RResultPtr<vector<vector<T>>>
Expand Down
123 changes: 114 additions & 9 deletions tree/dataframe/test/dataframe_snapshot_ntuple.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -101,25 +101,130 @@ TEST(RDFSnapshotRNTuple, LazyNotTriggered)
EXPECT_TRUE(gSystem->AccessPathName(fileGuard.GetPath().c_str()));
}

// Checks that RNTuple write options passed through RSnapshotOptions::fNTupleWriteOpts reach the written RNTuple,
// and that setting TTree-specific snapshot options with RNTuple output triggers a "Snapshot" warning.
TEST(RDFSnapshotRNTuple, WriteOpts)
{
FileRAII fileGuard{"RDFSnapshotRNTuple_write_opts.root"};
const std::vector<std::string> columns = {"x"};

// 25 entries with a single column "x" defined as the constant 10.
auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });

// An RNTuple write option set via fNTupleWriteOpts must be visible in the output: page checksums are disabled here
// and the written page is then inspected.
{
ROOT::RNTupleWriteOptions writeOpts;
writeOpts.SetEnablePageChecksums(false);

RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fNTupleWriteOpts = writeOpts;

auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts);

EXPECT_EQ(columns, sdf->GetColumnNames());

// Read the snapshot back and check the first page info of page range 0 in cluster 0 carries no checksum.
auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
EXPECT_FALSE(reader->GetDescriptor().GetClusterDescriptor(0).GetPageRange(0).GetPageInfos()[0].HasChecksum());
}

// Setting TTree-specific options while the output format is set to RNTuple should result in a warning
// Case 1: fAutoFlush changed.
{
RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fAutoFlush = 1;

ROOT_EXPECT_WARNING(df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts), "Snapshot",
"The TTree-specific fAutoFlush option in RSnapshotOptions has been set, but the output "
"format is set to RNTuple, so this option won't have any effect. Use the fNTupleWriteOptions "
"option available in RSnapshotOptions to configure the output RNTuple. Alternatively, change "
"fOutputFormat to snapshot to TTree instead.");
}
// Case 2: fSplitLevel changed.
{
RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fSplitLevel = 1;

ROOT_EXPECT_WARNING(df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts), "Snapshot",
"The TTree-specific fSplitLevel option in RSnapshotOptions has been set, but the output "
"format is set to RNTuple, so this option won't have any effect. Use the fNTupleWriteOptions "
"option available in RSnapshotOptions to configure the output RNTuple. Alternatively, change "
"fOutputFormat to snapshot to TTree instead.");
}
// Case 3: fBasketSize changed.
{
RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fBasketSize = 64000;

ROOT_EXPECT_WARNING(df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts), "Snapshot",
"The TTree-specific fBasketSize option in RSnapshotOptions has been set, but the output "
"format is set to RNTuple, so this option won't have any effect. Use the fNTupleWriteOptions "
"option available in RSnapshotOptions to configure the output RNTuple. Alternatively, change "
"fOutputFormat to snapshot to TTree instead.");
}
}

TEST(RDFSnapshotRNTuple, Compression)
{
FileRAII fileGuard{"RDFSnapshotRNTuple_compression.root"};
const std::vector<std::string> columns = {"x"};

auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });

RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fCompressionAlgorithm = ROOT::RCompressionSetting::EAlgorithm::kLZ4;
opts.fCompressionLevel = 4;
// Default should be taken from RNTupleWriteOptions provided by RSnapshotOptions (zstd)
{
RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;

auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts);
auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts);

EXPECT_EQ(columns, sdf->GetColumnNames());
EXPECT_EQ(columns, sdf->GetColumnNames());

auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings();
EXPECT_EQ(404, compSettings);
auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings();
EXPECT_EQ(505, compSettings);
}
// Directly through RSnapshotOptions
{
RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fCompressionAlgorithm = ROOT::RCompressionSetting::EAlgorithm::kLZ4;
opts.fCompressionLevel = 4;

auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts);

EXPECT_EQ(columns, sdf->GetColumnNames());

auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings();
EXPECT_EQ(404, compSettings);
}
// Through RNTupleWriteOptions provided by RSnapshotOptions
{
RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fNTupleWriteOpts.SetCompression(404);

auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts);

EXPECT_EQ(columns, sdf->GetColumnNames());

auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings();
EXPECT_EQ(404, compSettings);
}
// When both are set, preference is given to RNTupleWriteOptions
{
RSnapshotOptions opts;
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
opts.fCompressionAlgorithm = ROOT::RCompressionSetting::EAlgorithm::kLZ4;
opts.fCompressionLevel = 4;
opts.fNTupleWriteOpts.SetCompression(207);

auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), "x", opts);

EXPECT_EQ(columns, sdf->GetColumnNames());

auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings();
EXPECT_EQ(207, compSettings);
}
}

class RDFSnapshotRNTupleTest : public ::testing::Test {
Expand Down
14 changes: 14 additions & 0 deletions tree/ntuple/inc/ROOT/RNTupleWriteOptions.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,20 @@ public:
void SetEnableSamePageMerging(bool val);

std::uint64_t GetMaxKeySize() const { return fMaxKeySize; }

/// Field-by-field equality of two sets of write options.
/// Returns true only when every field compared below holds the same value in `lhs` and `rhs`.
friend bool operator==(const RNTupleWriteOptions &lhs, const RNTupleWriteOptions &rhs)
{
   // Compression setting plus cluster, page and buffer sizing.
   const bool sameSizing = lhs.fCompression == rhs.fCompression &&
                           lhs.fApproxZippedClusterSize == rhs.fApproxZippedClusterSize &&
                           lhs.fMaxUnzippedClusterSize == rhs.fMaxUnzippedClusterSize &&
                           lhs.fInitialUnzippedPageSize == rhs.fInitialUnzippedPageSize &&
                           lhs.fMaxUnzippedPageSize == rhs.fMaxUnzippedPageSize &&
                           lhs.fPageBufferBudget == rhs.fPageBufferBudget;

   // Write-path switches.
   const bool sameWritePath = lhs.fUseBufferedWrite == rhs.fUseBufferedWrite &&
                              lhs.fUseDirectIO == rhs.fUseDirectIO &&
                              lhs.fWriteBufferSize == rhs.fWriteBufferSize &&
                              lhs.fUseImplicitMT == rhs.fUseImplicitMT;

   // Page-level features and the maximum key size.
   const bool sameFeatures = lhs.fEnablePageChecksums == rhs.fEnablePageChecksums &&
                             lhs.fEnableSamePageMerging == rhs.fEnableSamePageMerging &&
                             lhs.fMaxKeySize == rhs.fMaxKeySize;

   return sameSizing && sameWritePath && sameFeatures;
}

/// Inequality of two sets of write options: the logical negation of `lhs == rhs`.
friend bool operator!=(const RNTupleWriteOptions &lhs, const RNTupleWriteOptions &rhs)
{
   const bool areEqual = (lhs == rhs);
   return !areEqual;
}
};

namespace Internal {
Expand Down
Loading