From 19167f701a0af60cbd6ce65fa7550b52e34db534 Mon Sep 17 00:00:00 2001 From: Martin Foll Date: Thu, 29 Jan 2026 15:50:44 +0100 Subject: [PATCH 1/3] Introduce eager loading from dataframe(s) in RBatchGenerator This commit introduces the RDatasetLoader class which takes as input a vector of dataframes and loads each of them in memory and further splits them into training and validation datasets that are added to a vector for the datasets from each dataframe. The RSampler class is introduced to concatenate the training and validation datasets from the vector of datasets from RDatasetLoader and further shuffle them before the dataset is passed to RBatchLoader. Some changes are done to the existing classes to help with integrating the eager loading along side the existing chunk loading: - Remove numEntries and rdf_entries as input parameters to the RChunkLoader class - Replace numColumns with cols and vecSizes as input parameters to the RBatchLoader class - Add slice and concatenate methods for Flat2DMatrix in Flat2DMatrixOperator In the RBatchGenerator class the changes mentioned above are integrated to enable eager loading from dataframe(s). 
--- tmva/tmva/CMakeLists.txt | 2 + .../TMVA/BatchGenerator/RBatchGenerator.hxx | 202 +++++++++------- .../inc/TMVA/BatchGenerator/RBatchLoader.hxx | 13 +- .../inc/TMVA/BatchGenerator/RChunkLoader.hxx | 19 +- .../TMVA/BatchGenerator/RDatasetLoader.hxx | 220 ++++++++++++++++++ .../BatchGenerator/RFlat2DMatrixOperators.hxx | 42 ++++ .../tmva/inc/TMVA/BatchGenerator/RSampler.hxx | 76 ++++++ 7 files changed, 485 insertions(+), 89 deletions(-) create mode 100644 tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx create mode 100644 tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt index b4217ac7e4d1b..52be7ff086aa4 100644 --- a/tmva/tmva/CMakeLists.txt +++ b/tmva/tmva/CMakeLists.txt @@ -449,6 +449,8 @@ ROOT_STANDARD_LIBRARY_PACKAGE(TMVAUtils TMVA/BatchGenerator/RChunkConstructor.hxx TMVA/BatchGenerator/RFlat2DMatrix.hxx TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx + TMVA/BatchGenerator/RDatasetLoader.hxx + TMVA/BatchGenerator/RSampler.hxx SOURCES diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx index 623516165c6f3..fea9f4966b770 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx @@ -2,10 +2,10 @@ // Author: Kristupas Pranckietis, Vilnius University 05/2024 // Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024 // Author: Vincenzo Eduardo Padulano, CERN 10/2024 -// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025 +// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 /************************************************************************* - * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. * + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * * All rights reserved. * * * * For the licensing terms see $ROOTSYS/LICENSE. 
* @@ -16,7 +16,10 @@ #define TMVA_RBATCHGENERATOR #include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" +#include "TMVA/BatchGenerator/RSampler.hxx" #include "ROOT/RDF/RDatasetSpec.hxx" + +#include "TMVA/BatchGenerator/RDatasetLoader.hxx" #include "TMVA/BatchGenerator/RChunkLoader.hxx" #include "TMVA/BatchGenerator/RBatchLoader.hxx" #include "TROOT.h" @@ -46,38 +49,38 @@ template class RBatchGenerator { private: std::vector fCols; + std::vector fVecSizes; // clang-format on std::size_t fChunkSize; std::size_t fMaxChunks; std::size_t fBatchSize; std::size_t fBlockSize; - std::size_t fNumColumns; - std::size_t fNumChunkCols; - std::size_t fNumEntries; std::size_t fSetSeed; - std::size_t fSumVecSizes; - ROOT::RDF::RResultPtr> fEntries; float fValidationSplit; + std::unique_ptr> fDatasetLoader; std::unique_ptr> fChunkLoader; - std::unique_ptr fBatchLoader; std::unique_ptr fTrainingBatchLoader; std::unique_ptr fValidationBatchLoader; + std::unique_ptr fTrainingSampler; + std::unique_ptr fValidationSampler; + std::vector f_rdfs; + std::unique_ptr fLoadingThread; std::size_t fTrainingChunkNum; std::size_t fValidationChunkNum; - ROOT::RDF::RNode &f_rdf; - std::mutex fIsActiveMutex; bool fDropRemainder; bool fShuffle; + bool fLoadEager; + std::string fSampleType; + bool fIsActive{false}; // Whether the loading thread is active - bool fNotFiltered; bool fUseWholeFile; bool fEpochActive{false}; @@ -91,6 +94,12 @@ private: std::size_t fNumValidationChunks; // flattened buffers for chunks and temporary tensors (rows * cols) + std::vector fTrainingDatasets; + std::vector fValidationDatasets; + + RFlat2DMatrix fSampledTrainingDataset; + RFlat2DMatrix fSampledValidationDataset; + RFlat2DMatrix fTrainTensor; RFlat2DMatrix fTrainChunkTensor; @@ -98,14 +107,16 @@ private: RFlat2DMatrix fValidationChunkTensor; public: - RBatchGenerator(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize, + RBatchGenerator(const std::vector &rdfs, const std::size_t chunkSize, 
const std::size_t blockSize, const std::size_t batchSize, const std::vector &cols, const std::vector &vecSizes = {}, const float vecPadding = 0.0, const float validationSplit = 0.0, const std::size_t maxChunks = 0, bool shuffle = true, - bool dropRemainder = true, const std::size_t setSeed = 0) + bool dropRemainder = true, const std::size_t setSeed = 0, bool loadEager = false, + std::string sampleType = "random") - : f_rdf(rdf), + : f_rdfs(rdfs), fCols(cols), + fVecSizes(vecSizes), fChunkSize(chunkSize), fBlockSize(blockSize), fBatchSize(batchSize), @@ -114,38 +125,49 @@ public: fDropRemainder(dropRemainder), fSetSeed(setSeed), fShuffle(shuffle), - fNotFiltered(f_rdf.GetFilterNames().empty()), - fUseWholeFile(maxChunks == 0), - fNumColumns(cols.size()) + fLoadEager(loadEager), + fSampleType(sampleType), + fUseWholeFile(maxChunks == 0) { + if (fLoadEager) { + fDatasetLoader = std::make_unique>(f_rdfs, fValidationSplit, fCols, fVecSizes, + vecPadding, fShuffle, fSetSeed); + // split the datasets and extract the training and validation datasets + fDatasetLoader->SplitDatasets(); + fTrainingDatasets = fDatasetLoader->GetTrainingDatasets(); + fValidationDatasets = fDatasetLoader->GetValidationDatasets(); + + fTrainingSampler = std::make_unique(fTrainingDatasets, fSampleType, fShuffle, fSetSeed); + fValidationSampler = std::make_unique(fValidationDatasets, fSampleType, fShuffle, fSetSeed); + + // sample the training and validation dataset from the datasets + fTrainingSampler->Sampler(fSampledTrainingDataset); + fValidationSampler->Sampler(fSampledValidationDataset); + + fNumTrainingEntries = fTrainingSampler->GetNumEntries(); + fNumValidationEntries = fValidationSampler->GetNumEntries(); + } - fNumEntries = f_rdf.Count().GetValue(); - fEntries = f_rdf.Take("rdfentry_"); - - fSumVecSizes = std::accumulate(vecSizes.begin(), vecSizes.end(), 0); - fNumChunkCols = fNumColumns + fSumVecSizes - vecSizes.size(); - - // add the last element in entries to not go out of range when 
filling chunks - fEntries->push_back((*fEntries)[fNumEntries - 1] + 1); + else { + fChunkLoader = + std::make_unique>(f_rdfs[0], fChunkSize, fBlockSize, fValidationSplit, + fCols, fVecSizes, vecPadding, fShuffle, fSetSeed); - fChunkLoader = - std::make_unique>(f_rdf, fNumEntries, fEntries, fChunkSize, fBlockSize, fValidationSplit, - fCols, vecSizes, vecPadding, fShuffle, fSetSeed); - fBatchLoader = std::make_unique(fBatchSize, fNumChunkCols, fNumEntries, fDropRemainder); + // split the dataset into training and validation sets + fChunkLoader->SplitDataset(); - // split the dataset into training and validation sets - fChunkLoader->SplitDataset(); + fNumTrainingEntries = fChunkLoader->GetNumTrainingEntries(); + fNumValidationEntries = fChunkLoader->GetNumValidationEntries(); - // number of training and validation entries after the split - fNumValidationEntries = static_cast(fValidationSplit * fNumEntries); - fNumTrainingEntries = fNumEntries - fNumValidationEntries; + // number of training and validation chunks, calculated in RChunkConstructor + fNumTrainingChunks = fChunkLoader->GetNumTrainingChunks(); + fNumValidationChunks = fChunkLoader->GetNumValidationChunks(); + } - fTrainingBatchLoader = std::make_unique(fBatchSize, fNumChunkCols, fNumTrainingEntries, fDropRemainder); - fValidationBatchLoader = std::make_unique(fBatchSize, fNumChunkCols, fNumValidationEntries, fDropRemainder); - - // number of training and validation chunks, calculated in RChunkConstructor - fNumTrainingChunks = fChunkLoader->GetNumTrainingChunks(); - fNumValidationChunks = fChunkLoader->GetNumValidationChunks(); + fTrainingBatchLoader = std::make_unique(fBatchSize, fCols, fVecSizes, + fNumTrainingEntries, fDropRemainder); + fValidationBatchLoader = std::make_unique(fBatchSize, fCols, fVecSizes, + fNumValidationEntries, fDropRemainder); } ~RBatchGenerator() { DeActivate(); } @@ -157,7 +179,8 @@ public: fIsActive = false; } - fBatchLoader->DeActivate(); + fTrainingBatchLoader->DeActivate(); + 
fValidationBatchLoader->DeActivate(); if (fLoadingThread) { if (fLoadingThread->joinable()) { @@ -178,7 +201,8 @@ public: fIsActive = true; } - fBatchLoader->Activate(); + fTrainingBatchLoader->Activate(); + fValidationBatchLoader->Activate(); // fLoadingThread = std::make_unique(&RBatchGenerator::LoadChunks, this); } @@ -197,63 +221,77 @@ public: /// \brief Create training batches by first loading a chunk (see RChunkLoader) and split it into batches (see RBatchLoader) void CreateTrainBatches() { - fChunkLoader->CreateTrainingChunksIntervals(); - fTrainingEpochActive = true; - fTrainingChunkNum = 0; - fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); - fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, fNumTrainingChunks); - fTrainingChunkNum++; + fTrainingEpochActive = true; + if (fLoadEager) { + fTrainingBatchLoader->CreateBatches(fSampledTrainingDataset, 1); + } + + else { + fChunkLoader->CreateTrainingChunksIntervals(); + fTrainingChunkNum = 0; + fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); + fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, fNumTrainingChunks); + fTrainingChunkNum++; + } } /// \brief Creates validation batches by first loading a chunk (see RChunkLoader), and then split it into batches (see RBatchLoader) void CreateValidationBatches() { - fChunkLoader->CreateValidationChunksIntervals(); - fValidationEpochActive = true; - fValidationChunkNum = 0; - fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); - fValidationBatchLoader->CreateBatches(fValidationChunkTensor, fNumValidationChunks); - fValidationChunkNum++; - } - - /// \brief Loads a training batch from the queue - RFlat2DMatrix GetTrainBatch() - { - auto batchQueue = fTrainingBatchLoader->GetNumBatchQueue(); - - // load the next chunk if the queue is empty - if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) { - fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); - std::size_t 
lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum; - fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, lastTrainingBatch); - fTrainingChunkNum++; + fValidationEpochActive = true; + if (fLoadEager) { + fValidationBatchLoader->CreateBatches(fSampledValidationDataset, 1); } else { - ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); + fChunkLoader->CreateValidationChunksIntervals(); + fValidationChunkNum = 0; + fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); + fValidationBatchLoader->CreateBatches(fValidationChunkTensor, fNumValidationChunks); + fValidationChunkNum++; } + } - // Get next batch if available - return fTrainingBatchLoader->GetBatch(); + /// \brief Loads a training batch from the queue + RFlat2DMatrix GetTrainBatch() + { + if (!fLoadEager) { + auto batchQueue = fTrainingBatchLoader->GetNumBatchQueue(); + + // load the next chunk if the queue is empty + if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) { + fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); + std::size_t lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum; + fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, lastTrainingBatch); + fTrainingChunkNum++; + } + + else { + fChunkLoader->ResetDataframe(); + } + } + // Get next batch if available + return fTrainingBatchLoader->GetBatch(); } /// \brief Loads a validation batch from the queue RFlat2DMatrix GetValidationBatch() { - auto batchQueue = fValidationBatchLoader->GetNumBatchQueue(); - - // load the next chunk if the queue is empty - if (batchQueue < 1 && fValidationChunkNum < fNumValidationChunks) { - fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); - std::size_t lastValidationBatch = fNumValidationChunks - fValidationChunkNum; - fValidationBatchLoader->CreateBatches(fValidationChunkTensor, lastValidationBatch); - fValidationChunkNum++; - } + if (!fLoadEager) { + auto batchQueue = 
fValidationBatchLoader->GetNumBatchQueue(); + + // load the next chunk if the queue is empty + if (batchQueue < 1 && fValidationChunkNum < fNumValidationChunks) { + fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); + std::size_t lastValidationBatch = fNumValidationChunks - fValidationChunkNum; + fValidationBatchLoader->CreateBatches(fValidationChunkTensor, lastValidationBatch); + fValidationChunkNum++; + } - else { - ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); + else { + fChunkLoader->ResetDataframe(); + } } - // Get next batch if available return fValidationBatchLoader->GetBatch(); } diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx index d92f18a458abb..8cd69afb99410 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx @@ -41,6 +41,10 @@ which are loaded into a queue. This is done for both the training and validation class RBatchLoader { private: std::size_t fBatchSize; + // needed for calculating the total number of batch columns when vectors columns are present + std::vector fCols; + std::vector fVecSizes; + std::size_t fSumVecSizes; std::size_t fNumColumns; std::size_t fNumEntries; bool fDropRemainder; @@ -66,13 +70,18 @@ private: std::unique_ptr fSecondaryLeftoverBatch; public: - RBatchLoader(std::size_t batchSize, std::size_t numColumns, std::size_t numEntries, bool dropRemainder) + RBatchLoader(std::size_t batchSize, const std::vector &cols, const std::vector &vecSizes = {}, + std::size_t numEntries = 0, bool dropRemainder = false) : fBatchSize(batchSize), - fNumColumns(numColumns), + fCols(cols), + fVecSizes(vecSizes), fNumEntries(numEntries), fDropRemainder(dropRemainder) { + fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); + fNumColumns = fCols.size() + fSumVecSizes - fVecSizes.size(); + if (fBatchSize == 0) { fBatchSize = fNumEntries; } diff 
--git a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx index f2396dfaecc8c..60b73559af1cb 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx @@ -136,14 +136,11 @@ private: std::unique_ptr fValidation; public: - RChunkLoader(ROOT::RDF::RNode &rdf, std::size_t numEntries, - ROOT::RDF::RResultPtr> rdf_entries, const std::size_t chunkSize, - const std::size_t blockSize, const float validationSplit, const std::vector &cols, + RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize, + const float validationSplit, const std::vector &cols, const std::vector &vecSizes = {}, const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0) : f_rdf(rdf), - fNumEntries(numEntries), - fEntries(rdf_entries), fCols(cols), fVecSizes(vecSizes), fVecPadding(vecPadding), @@ -155,6 +152,13 @@ public: fSetSeed(setSeed) { fTensorOperators = std::make_unique(fShuffle, fSetSeed); + + fNumEntries = f_rdf.Count().GetValue(); + fEntries = f_rdf.Take("rdfentry_"); + + // add the last element in entries to not go out of range when filling chunks + fEntries->push_back((*fEntries)[fNumEntries - 1] + 1); + fNumCols = fCols.size(); fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); @@ -418,6 +422,11 @@ public: } } + void ResetDataframe() + { + ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); + } + std::vector GetTrainingChunkSizes() { return fTraining->ChunksSizes; } std::vector GetValidationChunkSizes() { return fValidation->ChunksSizes; } diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx new file mode 100644 index 0000000000000..17f627640cb7a --- /dev/null +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx @@ -0,0 +1,220 @@ +// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 + 
+/************************************************************************* + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef TMVA_RDATASETLOADER +#define TMVA_RDATASETLOADER + +#include +#include + +#include "TMVA/RTensor.hxx" +#include "ROOT/RDataFrame.hxx" +#include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" +#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" +#include "ROOT/RDF/Utils.hxx" +#include "ROOT/RVec.hxx" + +#include "ROOT/RLogger.hxx" + +namespace TMVA { +namespace Experimental { +namespace Internal { + +// clang-format off +/** +\class ROOT::TMVA::Experimental::Internal::RDatasetLoaderFunctor +\ingroup tmva +\brief Loading chunks made in RDatasetLoader into tensors from data from RDataFrame. +*/ + +template +class RDatasetLoaderFunctor { + // clang-format on + std::size_t fOffset{}; + std::size_t fVecSizeIdx{}; + float fVecPadding{}; + std::vector fMaxVecSizes{}; + RFlat2DMatrix &fDatasetTensor; + + std::size_t fNumDatasetCols; + + int fI; + int fNumColumns; + + ////////////////////////////////////////////////////////////////////////// + /// \brief Copy the content of a column into RTensor when the column consists of vectors + template ::value, int> = 0> + void AssignToTensor(const T &vec, int i, int numColumns) + { + std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++]; + std::size_t vec_size = vec.size(); + if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding + { + std::copy(vec.begin(), vec.end(), &fDatasetTensor.GetData()[fOffset + numColumns * i]); + std::fill(&fDatasetTensor.GetData()[fOffset + numColumns * i + vec_size], + &fDatasetTensor.GetData()[fOffset + numColumns * i + max_vec_size], fVecPadding); + } else // Copy only max_vec_size length from vector 
column + { + std::copy(vec.begin(), vec.begin() + max_vec_size, &fDatasetTensor.GetData()[fOffset + numColumns * i]); + } + fOffset += max_vec_size; + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Copy the content of a column into RTensor when the column consists of single values + template ::value, int> = 0> + void AssignToTensor(const T &val, int i, int numColumns) + { + fDatasetTensor.GetData()[fOffset + numColumns * i] = val; + fOffset++; + } + +public: + RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns, + const std::vector &maxVecSizes, float vecPadding, int i) + : fDatasetTensor(datasetTensor), fMaxVecSizes(maxVecSizes), fVecPadding(vecPadding), fI(i), fNumColumns(numColumns) + { + } + + void operator()(const ColTypes &...cols) + { + fVecSizeIdx = 0; + (AssignToTensor(cols, fI, fNumColumns), ...); + } +}; + +// clang-format off +/** +\class ROOT::TMVA::Experimental::Internal::RDatasetLoader +\ingroup tmva +\brief Load the whole dataset into memory. + +In this class the whole dataset is loaded into memory. The dataset is further shuffled and split into training and validation sets with the user-defined validation split fraction. 
+*/ + +template +class RDatasetLoader { +private: + // clang-format on + std::size_t fNumEntries; + float fValidationSplit; + + std::vector fVecSizes; + std::size_t fSumVecSizes; + std::size_t fVecPadding; + std::size_t fNumDatasetCols; + + std::vector fTrainingDatasets; + std::vector fValidationDatasets; + + std::size_t fNumTrainingEntries; + std::size_t fNumValidationEntries; + std::unique_ptr fTensorOperators; + + std::vector f_rdfs; + std::vector fCols; + std::size_t fNumCols; + std::size_t fSetSeed; + + bool fNotFiltered; + bool fShuffle; + + ROOT::RDF::RResultPtr> fEntries; + +public: + RDatasetLoader(const std::vector &rdfs, const float validationSplit, + const std::vector &cols, const std::vector &vecSizes = {}, + const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0) + : f_rdfs(rdfs), + fCols(cols), + fVecSizes(vecSizes), + fVecPadding(vecPadding), + fValidationSplit(validationSplit), + fShuffle(shuffle), + fSetSeed(setSeed) + { + fTensorOperators = std::make_unique(fShuffle, fSetSeed); + fNumCols = fCols.size(); + fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); + + fNumDatasetCols = fNumCols + fSumVecSizes - fVecSizes.size(); + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Split an individual dataframe into a training and validation dataset + /// \param[in] rdf Dataframe that will be split into training and validation + /// \param[in] TrainingDataset Tensor for the training dataset + /// \param[in] ValidationDataset Tensor for the validation dataset + void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset) + { + std::size_t NumEntries = rdf.Count().GetValue(); + ROOT::RDF::RResultPtr> Entries = rdf.Take("rdfentry_"); + + // add the last element in entries to not go out of range when filling chunks + Entries->push_back((*Entries)[NumEntries - 1] + 1); + + // number of training and validation entries after 
the split + std::size_t NumValidationEntries = static_cast(fValidationSplit * NumEntries); + std::size_t NumTrainingEntries = NumEntries - NumValidationEntries; + + RFlat2DMatrix Dataset({NumEntries, fNumDatasetCols}); + + bool NotFiltered = rdf.GetFilterNames().empty(); + if (NotFiltered) { + RDatasetLoaderFunctor func(Dataset, fNumDatasetCols, fVecSizes, fVecPadding, 0); + rdf.Foreach(func, fCols); + } + + else { + std::size_t datasetEntry = 0; + for (std::size_t j = 0; j < NumEntries; j++) { + RDatasetLoaderFunctor func(Dataset, fNumDatasetCols, fVecSizes, fVecPadding, datasetEntry); + ROOT::Internal::RDF::ChangeBeginAndEndEntries(rdf, (*Entries)[j], (*Entries)[j + 1]); + rdf.Foreach(func, fCols); + datasetEntry++; + } + } + RFlat2DMatrix ShuffledDataset({NumEntries, fNumDatasetCols}); + fTensorOperators->ShuffleTensor(ShuffledDataset, Dataset); + fTensorOperators->SliceTensor(TrainingDataset, ShuffledDataset, {{0, NumTrainingEntries}, {0, fNumDatasetCols}}); + fTensorOperators->SliceTensor(ValidationDataset, ShuffledDataset, {{NumTrainingEntries, NumEntries}, {0, fNumDatasetCols}}); + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Split the dataframes in a training and validation dataset + void SplitDatasets() + { + fNumEntries = 0; + fNumTrainingEntries = 0; + fNumValidationEntries = 0; + + for (auto& rdf : f_rdfs) { + RFlat2DMatrix TrainingDataset; + RFlat2DMatrix ValidationDataset; + + SplitDataframe(rdf, TrainingDataset, ValidationDataset); + fTrainingDatasets.push_back(TrainingDataset); + fValidationDatasets.push_back(ValidationDataset); + + fNumTrainingEntries += TrainingDataset.GetRows(); + fNumValidationEntries += ValidationDataset.GetRows(); + fNumEntries += TrainingDataset.GetRows() + ValidationDataset.GetRows(); + } + } + + std::vector GetTrainingDatasets() {return fTrainingDatasets;} + std::vector GetValidationDatasets() {return fValidationDatasets;} + +}; + +} // namespace Internal +} // namespace 
Experimental +} // namespace TMVA +#endif // TMVA_RDATASETLOADER diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx index efaabc9bc0be6..0c3ebb044822d 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx @@ -67,6 +67,48 @@ public: ShuffledTensor.GetData() + i * cols); } } + + void SliceTensor(RFlat2DMatrix& SlicedTensor, RFlat2DMatrix& Tensor, + const std::vector>& slice) + { + const auto& rowSlice = slice[0]; + const auto& colSlice = slice[1]; + + std::size_t rowStart = rowSlice[0]; + std::size_t rowEnd = rowSlice[1]; + std::size_t colStart = colSlice[0]; + std::size_t colEnd = colSlice[1]; + + std::size_t rows = rowEnd - rowStart; + std::size_t cols = colEnd - colStart; + + SlicedTensor.Resize(rows, cols); + std::copy(Tensor.GetData() + rowStart * cols, + Tensor.GetData() + rowStart * cols + rows * cols, + SlicedTensor.GetData()); + } + + void ConcatenateTensors(RFlat2DMatrix &ConcatTensor, std::vector &Tensors) + { + std::size_t cols = Tensors[0].GetCols(); + std::size_t rows = 0; + + for (const auto& t : Tensors) { + rows += t.GetRows(); + } + + ConcatTensor.Resize(rows, cols); + + std::size_t index = 0; + for (std::size_t i = 0; i < Tensors.size(); i++) { + std::size_t tensorRows = Tensors[i].GetRows(); + std::copy(Tensors[i].GetData(), + Tensors[i].GetData() + tensorRows * cols, + ConcatTensor.GetData() + index * cols); + index += tensorRows; + } + } + }; diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx new file mode 100644 index 0000000000000..5e5fe5cc5b879 --- /dev/null +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx @@ -0,0 +1,76 @@ +// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 + +/************************************************************************* + * Copyright (C) 1995-2026, Rene Brun and Fons 
Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef TMVA_RSAMPLER +#define TMVA_RSAMPLER + +#include +#include +#include + +#include "ROOT/RDataFrame.hxx" +#include "ROOT/RDF/Utils.hxx" +#include "ROOT/RVec.hxx" +#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" +#include "ROOT/RLogger.hxx" + +namespace TMVA::Experimental::Internal { +// clang-format off +/** +\class ROOT::TMVA::Experimental::Internal::RSampler +\ingroup tmva +\brief Implementation of different sampling strategies. +*/ + +class RSampler { +private: + // clang-format on + std::vector &fDatasets; + std::string fSampleType; + bool fShuffle; + std::size_t fSetSeed; + std::size_t fNumEntries; + + std::unique_ptr fTensorOperators; +public: + RSampler(std::vector &datasets, const std::string &sampleType, bool shuffle = true, const std::size_t setSeed = 0) + : fDatasets(datasets), + fSampleType(sampleType), + fShuffle(shuffle), + fSetSeed(setSeed) + { + fTensorOperators = std::make_unique(fShuffle, fSetSeed); + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Collection of sampling types + /// \param[in] SampledTensor Tensor with all the sampled entries + void Sampler(RFlat2DMatrix &SampledTensor) + { + if (fSampleType == "random") { + RandomSampler(SampledTensor); + } + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Sample all entries randomly from the datasets + /// \param[in] SampledTensor Tensor with all the sampled entries + void RandomSampler(RFlat2DMatrix &SampledTensor) { + RFlat2DMatrix ConcatTensor; + fTensorOperators->ConcatenateTensors(ConcatTensor, fDatasets); + fTensorOperators->ShuffleTensor(SampledTensor, ConcatTensor); + fNumEntries = SampledTensor.GetRows(); + } + + std::size_t GetNumEntries() { 
return fNumEntries;} +}; + +} // namespace TMVA::Experimental::Internal +#endif // TMVA_RSAMPLER From 3bb292705a0c6c81b6ba4779ab780c7791384f24 Mon Sep 17 00:00:00 2001 From: Martin Foll Date: Mon, 26 Jan 2026 19:38:42 +0100 Subject: [PATCH 2/3] Adjust the python bindings for RBatchGenerator to enable eager loading from dataframe(s) This commit adjusts the python bindings of RBatchGenerator such that eager loading is enabled in the batch loading from Numpy, PyTorch and TensorFlow. The load_eager (bool) parameter is added to choose between eager loading (True) or chunk loading (False). The sampling_type (str) parameter is added to distinguish between which sampling strategy is chosen for eager loading. Further, the rdataframes input parameter is changed such that it now can either be a single dataframe or a list of dataframes. --- .../_pythonization/_tmva/_batchgenerator.py | 137 ++++++++++++++---- 1 file changed, 107 insertions(+), 30 deletions(-) diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py index bb6c0047e96d4..8388bdbd90620 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py @@ -2,10 +2,10 @@ # Author: Kristupas Pranckietis, Vilnius University 05/2024 # Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024 # Author: Vincenzo Eduardo Padulano, CERN 10/2024 -# Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025 +# Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 ################################################################################ -# Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. # +# Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. # # All rights reserved. 
# # # # For the licensing terms see $ROOTSYS/LICENSE. # @@ -82,10 +82,10 @@ def get_template( def __init__( self, - rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -96,6 +96,8 @@ def __init__( shuffle: bool = True, drop_remainder: bool = True, set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ): """Wrapper around the Cpp RBatchGenerator @@ -105,6 +107,10 @@ def __init__( chunk_size (int): The size of the chunks loaded from the ROOT file. Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is built up from multiple blocks. Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (dict[std, int], optional): @@ -134,6 +140,13 @@ def __init__( For reproducibility: Set the seed for the random number generator used to split the dataset into training and validation and shuffling of the chunks Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. 
""" import ROOT @@ -148,7 +161,7 @@ def __init__( using RBatchGenerator" ) - if chunk_size < batch_size: + if load_eager == False and chunk_size < batch_size: raise ValueError( f"chunk_size cannot be smaller than batch_size: chunk_size: \ {chunk_size}, batch_size: {batch_size}" @@ -160,7 +173,9 @@ def __init__( given value is {validation_split}" ) - self.noded_rdf = RDF.AsRNode(rdataframe) + if not isinstance(rdataframes, list): + rdataframes = [rdataframes] + self.noded_rdfs = [RDF.AsRNode(rdf) for rdf in rdataframes] if isinstance(target, str): target = [target] @@ -169,7 +184,7 @@ def __init__( self.weights_column = weights template, max_vec_sizes_list = self.get_template( - rdataframe, columns, max_vec_sizes + rdataframes[0], columns, max_vec_sizes ) self.num_columns = len(self.all_columns) @@ -222,7 +237,7 @@ def __init__( EnableThreadSafety() self.generator = TMVA.Experimental.Internal.RBatchGenerator(template)( - self.noded_rdf, + self.noded_rdfs, chunk_size, block_size, batch_size, @@ -234,6 +249,8 @@ def __init__( shuffle, drop_remainder, set_seed, + load_eager, + sampling_type, ) atexit.register(self.DeActivate) @@ -652,10 +669,10 @@ def __call__(self) -> Any: return None def CreateNumPyGenerators( - rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -666,6 +683,8 @@ def CreateNumPyGenerators( shuffle: bool = True, drop_remainder=True, set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ) -> Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: """ Return two batch generators based on the given ROOT file and tree or RDataFrame @@ -678,6 +697,10 @@ def CreateNumPyGenerators( chunk_size (int): The size of the chunks loaded from the ROOT file. 
Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is built up from multiple blocks. Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (list[int], optional): @@ -706,6 +729,20 @@ [4, 5, 6, 7] will be returned. If drop_remainder = False, then three batches [0, 1, 2, 3], [4, 5, 6, 7] and [8, 9] will be returned. + set_seed (int): + For reproducibility: Set the seed for the random number generator used + to split the dataset into training and validation and shuffling of the chunks + Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. 
+ + + Returns: TrainRBatchGenerator or @@ -721,7 +758,7 @@ def CreateNumPyGenerators( import numpy as np base_generator = BaseGenerator( - rdataframe, + rdataframes, batch_size, chunk_size, block_size, @@ -734,7 +771,9 @@ def CreateNumPyGenerators( max_chunks, shuffle, drop_remainder, - set_seed, + set_seed, + load_eager, + sampling_type, ) train_generator = TrainRBatchGenerator( @@ -752,10 +791,10 @@ def CreateNumPyGenerators( def CreateTFDatasets( - rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -765,7 +804,9 @@ def CreateTFDatasets( max_chunks: int = 0, shuffle: bool = True, drop_remainder=True, - set_seed: int = 0, + set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ) -> Tuple[tf.data.Dataset, tf.data.Dataset]: """ Return two Tensorflow Datasets based on the given ROOT file and tree or RDataFrame @@ -778,6 +819,10 @@ def CreateTFDatasets( chunk_size (int): The size of the chunks loaded from the ROOT file. Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is build up from multiple blocks. Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (list[int], optional): @@ -806,6 +851,17 @@ def CreateTFDatasets( [4, 5, 6, 7] will be returned. If drop_remainder = False, then three batches [0, 1, 2, 3], [4, 5, 6, 7] and [8, 9] will be returned. 
+ set_seed (int): + For reproducibility: Set the seed for the random number generator used + to split the dataset into training and validation and shuffling of the chunks + Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. Returns: TrainRBatchGenerator or @@ -820,7 +876,7 @@ def CreateTFDatasets( import tensorflow as tf base_generator = BaseGenerator( - rdataframe, + rdataframes, batch_size, chunk_size, block_size, @@ -833,7 +889,9 @@ max_chunks, shuffle, drop_remainder, - set_seed, + set_seed, + load_eager, + sampling_type, ) train_generator = TrainRBatchGenerator( @@ -901,10 +959,10 @@ def CreateTFDatasets( def CreatePyTorchGenerators( - rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -914,7 +972,9 @@ def CreatePyTorchGenerators( max_chunks: int = 0, shuffle: bool = True, drop_remainder=True, - set_seed: int = 0, + set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ) -> Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: """ Return two Tensorflow Datasets based on the given ROOT file and tree or RDataFrame @@ -927,6 +987,10 @@ def CreatePyTorchGenerators( chunk_size (int): The size of the chunks loaded from the ROOT file. Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is built up from multiple blocks. 
Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (list[int], optional): @@ -955,6 +1019,17 @@ [4, 5, 6, 7] will be returned. If drop_remainder = False, then three batches [0, 1, 2, 3], [4, 5, 6, 7] and [8, 9] will be returned. + set_seed (int): + For reproducibility: Set the seed for the random number generator used + to split the dataset into training and validation and shuffling of the chunks + Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. Returns: TrainRBatchGenerator or @@ -967,7 +1042,7 @@ validation generator will return no batches. 
""" base_generator = BaseGenerator( - rdataframe, + rdataframes, batch_size, chunk_size, block_size, @@ -980,7 +1055,9 @@ def CreatePyTorchGenerators( max_chunks, shuffle, drop_remainder, - set_seed, + set_seed, + load_eager, + sampling_type, ) train_generator = TrainRBatchGenerator( From 2d3b47a3b8a2d0ee9dea8bd50ba5eacad083f59d Mon Sep 17 00:00:00 2001 From: Martin Foll Date: Mon, 26 Jan 2026 19:40:05 +0100 Subject: [PATCH 3/3] Add tests for eager loading from dataframe(s) in RBatchGenerator --- .../test/rbatchgenerator_completeness.py | 2357 +++++++++++++++++ 1 file changed, 2357 insertions(+) diff --git a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py b/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py index 43453702c553c..4aac5235dc2a5 100644 --- a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py +++ b/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py @@ -1170,6 +1170,2363 @@ def test16_vector_padding(self): self.teardown_file(self.file_name3) raise +class RBatchGeneratorEagerLoading(unittest.TestCase): + + file_name1 = "first_half.root" + file_name2 = "second_half.root" + file_name3 = "vector_columns.root" + tree_name = "mytree" + + # default constants + n_train_batch = 2 + n_val_batch = 1 + val_remainder = 1 + + # Helpers + def define_rdf(self, num_of_entries=10): + df = ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) b1*b1") + + return df + + def create_file(self, num_of_entries=10): + self.define_rdf(num_of_entries).Snapshot( + self.tree_name, self.file_name1) + + def create_5_entries_file(self): + df1 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 10")\ + .Define("b2", "(double) b1 * b1")\ + .Snapshot(self.tree_name, self.file_name2) + + def create_vector_file(self, num_of_entries=10): + df3 = ROOT.RDataFrame(10)\ + .Define("b1", "(int) rdfentry_")\ + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ + .Define("v2", 
"ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ + .Snapshot(self.tree_name, self.file_name3) + + def teardown_file(self, file): + os.remove(file) + + def test01_each_element_is_generated_unshuffled(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True, + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0] + results_y_val = [36.0, 49.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + 
self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test02_each_element_is_generated_shuffled(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test04_dropping_remainder(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( 
+ df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=True, + load_eager=True + ) + + collected_x = [] + collected_y = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + for x, y in gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + self.assertEqual(len(collected_x), 3) + self.assertEqual(len(collected_y), 3) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + + def test05_more_than_one_file(self): + self.create_file() + self.create_5_entries_file() + + try: + df = ROOT.RDataFrame( + self.tree_name, [self.file_name1, self.file_name2]) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] + results_x_val = [9.0, 10.0, 11.0, 12.0, 13.0, 14.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, + 25.0, 36.0, 49.0, 64.0] + results_y_val = [81.0, 100.0, 121.0, 144.0, 169.0, 196.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for x, y in gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in 
collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test06_multiple_target_columns(self): + file_name = "multiple_target_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name) + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] + results_z_val = [60.0, 70.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 
2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test07_multiple_input_columns(self): + file_name = "multiple_input_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Snapshot("myTree", file_name) + + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, + 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0] + results_x_val = [6.0, 60.0, 7.0, 70.0, 8.0, 80.0, 9.0, 90.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 
25.0] + results_y_val = [36.0, 49.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 2)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 2)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 2)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test08_filtered(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + dff = df.Filter("b1 % 2 == 0", "name") + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + dff, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 2.0, 4.0] + results_x_val = [6.0, 8.0] + results_y_train = [0.0, 4.0, 16.0] + results_y_val = [36.0, 64.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] 
+ collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (2, 1)) + self.assertTrue(y.shape == (2, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test09_filtered_last_chunk(self): + file_name = "filtered_last_chunk.root" + tree_name = "myTree" + + ROOT.RDataFrame(20)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Snapshot(tree_name, file_name) + + try: + df = ROOT.RDataFrame(tree_name, file_name) + + dff = df.Filter("b1 % 2 == 0", "name") + + gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( + dff, + batch_size=3, + target="b2", + validation_split=0, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 2.0, 4.0, 6.0, + 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] + results_y_train = [0.0, 4.0, 16.0, 36.0, + 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] + + collected_x_train = [] + collected_y_train = [] + + train_iter = iter(gen_train) + + for _ in range(3): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + 
collected_y_train.append(y.tolist()) + + x, y = next(train_iter) + self.assertTrue(x.shape == (1, 1)) + self.assertTrue(y.shape == (1, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_y_train, flat_y_train) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test10_two_epochs_shuffled(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + both_epochs_collected_x_val = [] + both_epochs_collected_y_val = [] + + for _ in range(2): + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in 
yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_epochs_collected_x_val.append(collected_x_val) + both_epochs_collected_y_val.append(collected_y_val) + + self.assertEqual( + both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual( + both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + + def test11_number_of_training_and_validation_batches_remainder(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + number_of_training_batches = 0 + number_of_validation_batches = 0 + + for _ in gen_train: + number_of_training_batches += 1 + + for _ in gen_validation: + number_of_validation_batches += 1 + + self.assertEqual(gen_train.number_of_batches, + number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, + number_of_validation_batches) + self.assertEqual(gen_train.last_batch_no_of_rows, 0) + self.assertEqual(gen_validation.last_batch_no_of_rows, 1) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test12_PyTorch(self): + import torch + + file_name = "multiple_target_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name) + + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + df, + batch_size=3, + target=["b2", 
"b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] + results_z_val = [60.0, 70.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for 
z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test13_TensorFlow(self): + import tensorflow as tf + + file_name = "multiple_target_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name) + + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( + df, + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0, 0.0, 0.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0, 0.0, 0.0, 0.0, 0.0] + results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] + results_z_val = [60.0, 70.0, 80.0, 90.0, 0.0, 0.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.numpy().tolist()) + collected_y_train.append(y.numpy().tolist()) + collected_z_train.append(z.numpy().tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + 
self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.numpy().tolist()) + collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.numpy().tolist()) + collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test14_big_data(self): + file_name = "big_data.root" + tree_name = "myTree" + + entries_in_rdf = randrange(10000, 30000) + chunk_size = randrange(1000, 3001) + batch_size = randrange(100, 501) + + error_message = f"\n Batch size: {batch_size} Chunk size: {chunk_size}\ + Number of entries: {entries_in_rdf}" + + def define_rdf(num_of_entries): + ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) rdfentry_ * 2")\ + .Define("b3", "(int) rdfentry_ + 10192")\ + .Define("b4", "(int) -rdfentry_")\ + .Define("b5", "(double) -rdfentry_ - 10192")\ + .Snapshot(tree_name, file_name) + + def test(size_of_batch, 
size_of_chunk, num_of_entries): + define_rdf(num_of_entries) + + try: + df = ROOT.RDataFrame(tree_name, file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=size_of_batch, + target=["b3", "b5"], + weights="b2", + validation_split=0.3, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + collect_x = [] + + train_remainder = gen_train.last_batch_no_of_rows + val_remainder = gen_validation.last_batch_no_of_rows + + n_train_batches = gen_train.number_of_batches - \ + 1 if train_remainder else gen_train.number_of_batches + n_val_batches = gen_validation.number_of_batches - \ + 1 if val_remainder else gen_validation.number_of_batches + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for i in range(n_train_batches): + x, y, z = next(iter_train) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message + f" row: {i}") + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message + f" row: {i}") + # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message + f" row: {i}") + + collect_x.extend(list(x[:, 0])) + + if train_remainder: + x, y, z = next(iter_train) + self.assertTrue(x.shape == ( + train_remainder, 2), error_message) + self.assertTrue(y.shape == ( + train_remainder, 2), error_message) + self.assertTrue(z.shape == ( + train_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + for _ in range(n_val_batches): + x, y, z = next(iter_val) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == 
(size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message) + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message) + # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message) + + collect_x.extend(list(x[:, 0])) + + if val_remainder: + x, y, z = next(iter_val) + self.assertTrue(x.shape == ( + val_remainder, 2), error_message) + self.assertTrue(y.shape == ( + val_remainder, 2), error_message) + self.assertTrue(z.shape == ( + val_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + self.assertTrue(set(collect_x) == set(i for i in range(num_of_entries)), f"collected length: {len(set(collect_x))}\ + generated length {len(set(i for i in range(num_of_entries)))}") + + except: + self.teardown_file(file_name) + raise + + test(batch_size, chunk_size, entries_in_rdf) + + + def test15_two_runs_set_seed(self): + self.create_file() + + try: + + both_runs_collected_x_val = [] + both_runs_collected_y_val = [] + + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + for _ in range(2): + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + set_seed = 42, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape 
== (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_runs_collected_x_val.append(collected_x_val) + both_runs_collected_y_val.append(collected_y_val) + self.assertEqual( + both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual( + both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + + def test16_vector_padding(self): + self.create_vector_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name3) + max_vec_sizes = {"v1": 3, "v2": 2} + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b1", + validation_split=0.4, + max_vec_sizes=max_vec_sizes, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + + results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, + 1.0, 10.0, 0, 100.0, 1000.0, + 2.0, 20.0, 0, 200.0, 2000.0, + 3.0, 30.0, 0, 300.0, 3000.0, + 4.0, 40.0, 0, 400.0, 4000.0, + 5.0, 50.0, 0, 500.0, 5000.0] + results_y_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 60.0, 0.0, 600.0, 6000.0, + 7.0, 70.0, 0.0, 700.0, 7000.0, + 8.0, 80.0, 0.0, 800.0, 8000.0, + 9.0, 90.0, 0.0, 900.0, 9000.0] + results_y_val = [6.0, 7.0, 8.0, 9.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + 
collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 5)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name3) + + except: + self.teardown_file(self.file_name3) + raise + +class RBatchGeneratorEagerLoadingMultipleDataframes(unittest.TestCase): + + file_name1 = "first_half.root" + file_name2 = "second_half.root" + file_name3 = "second_file.root" + file_name4 = "vector_columns_1.root" + file_name5 = "vector_columns_2.root" + tree_name = "mytree" + + # default constants + n_train_batch = 2 + n_val_batch = 1 + val_remainder = 1 + + # Helpers + def define_rdf1(self, num_of_entries=5): + df = ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) b1*b1") + + return df + + def define_rdf2(self, num_of_entries=5): + df = ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", 
"(double) b1*b1") + + return df + + def create_file1(self, num_of_entries=5): + self.define_rdf1(num_of_entries).Snapshot( + self.tree_name, self.file_name1) + + def create_file2(self, num_of_entries=5): + self.define_rdf2(num_of_entries).Snapshot( + self.tree_name, self.file_name2) + + def create_5_entries_file(self): + df1 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 10")\ + .Define("b2", "(double) b1 * b1")\ + .Snapshot(self.tree_name, self.file_name3) + + def create_vector_file1(self, num_of_entries=5): + df3 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ + .Snapshot(self.tree_name, self.file_name4) + + def create_vector_file2(self, num_of_entries=5): + df3 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ + .Snapshot(self.tree_name, self.file_name5) + + def teardown_file(self, file): + os.remove(file) + + + def test01_each_element_is_generated_unshuffled(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True, + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0] + results_y_train = [0.0, 1.0, 4.0, 25.0, 36.0, 49.0] + results_y_val = [9.0, 16.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape 
== (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test02_each_element_is_generated_shuffled(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for 
_ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test04_dropping_remainder(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=True, + load_eager=True + ) + + collected_x = [] + collected_y = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + for x, y in gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + self.assertEqual(len(collected_x), 3) + self.assertEqual(len(collected_y), 3) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + 
self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + + def test05_more_than_one_file(self): + self.create_file1() + self.create_file2() + self.create_5_entries_file() + + try: + df1 = ROOT.RDataFrame( + self.tree_name, [self.file_name1, self.file_name2]) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name3) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 11.0, 12.0] + results_x_val = [6.0, 7.0, 8.0, 9.0, 13.0, 14.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, + 25.0, 100.0, 121.0, 144.0] + results_y_val = [36.0, 49.0, 64.0, 81.0, 169.0, 196.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for x, y in gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + self.teardown_file(self.file_name3) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + self.teardown_file(self.file_name3) 
+ raise + + def test06_multiple_target_columns(self): + file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name1) + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name2) + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] + results_z_val = [30.0, 40.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + 
collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test07_multiple_input_columns(self): + file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Snapshot("myTree", file_name1) + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Snapshot("myTree", file_name2) + + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + 
shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, + 20.0, 5.0, 50.0, 6.0, 60.0, 7.0, 70.0] + results_x_val = [3.0, 30.0, 4.0, 40.0, 8.0, 80.0, 9.0, 90.0] + results_y_train = [0.0, 1.0, 4.0, 25.0, 36.0, 49.0] + results_y_val = [9.0, 16.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 2)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 2)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 2)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test08_filtered(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + dff1 = df1.Filter("b1 % 2 == 0", "name") + 
dff2 = df2.Filter("b1 % 2 != 0", "name") + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [dff1, dff2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 2.0, 5.0] + results_x_val = [4.0, 9.0] + results_y_train = [0.0, 4.0, 25.0] + results_y_val = [16.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (2, 1)) + self.assertTrue(y.shape == (2, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test09_filtered_last_chunk(self): + file_name1 = "filtered_last_chunk_1.root" + file_name2 = "filtered_last_chunk_2.root" + tree_name = "myTree" + + ROOT.RDataFrame(10)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Snapshot(tree_name, file_name1) + + ROOT.RDataFrame(10)\ + .Define("b1", "(int) rdfentry_ + 10")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Snapshot(tree_name, file_name2) 
+ + try: + df1 = ROOT.RDataFrame(tree_name, file_name1) + df2 = ROOT.RDataFrame(tree_name, file_name2) + + dff1 = df1.Filter("b1 % 2 == 0", "name") + dff2 = df2.Filter("b1 % 2 == 0", "name") + + gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [dff1, dff2], + batch_size=3, + target="b2", + validation_split=0, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 2.0, 4.0, 6.0, + 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] + results_y_train = [0.0, 4.0, 16.0, 36.0, + 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] + + collected_x_train = [] + collected_y_train = [] + + train_iter = iter(gen_train) + + for _ in range(3): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(train_iter) + self.assertTrue(x.shape == (1, 1)) + self.assertTrue(y.shape == (1, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_y_train, flat_y_train) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test10_two_epochs_shuffled(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + both_epochs_collected_x_val = [] + both_epochs_collected_y_val = [] + + for _ in range(2): + collected_x_train = [] + collected_x_val = [] + 
collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_epochs_collected_x_val.append(collected_x_val) + both_epochs_collected_y_val.append(collected_y_val) + + self.assertEqual( + both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual( + both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + def test11_number_of_training_and_validation_batches_remainder(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + 
drop_remainder=False, + load_eager=True + ) + + number_of_training_batches = 0 + number_of_validation_batches = 0 + + for _ in gen_train: + number_of_training_batches += 1 + + for _ in gen_validation: + number_of_validation_batches += 1 + + self.assertEqual(gen_train.number_of_batches, + number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, + number_of_validation_batches) + self.assertEqual(gen_train.last_batch_no_of_rows, 0) + self.assertEqual(gen_validation.last_batch_no_of_rows, 1) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test12_PyTorch(self): + import torch + + file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name1) + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name2) + + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + [df1, df2], + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] + results_z_val = [30.0, 40.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + 
collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test13_TensorFlow(self): + import tensorflow as tf + + 
file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name1) + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name2) + + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( + [df1, df2], + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0, 0.0, 0.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0, 0.0, 0.0, 0.0, 0.0] + results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] + results_z_val = [30.0, 40.0, 80.0, 90.0, 0.0, 0.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.numpy().tolist()) + collected_y_train.append(y.numpy().tolist()) + collected_z_train.append(z.numpy().tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.numpy().tolist()) + 
collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.numpy().tolist()) + collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test14_big_data(self): + file_name1 = "big_data_1.root" + file_name2 = "big_data_2.root" + tree_name = "myTree" + + entries_in_rdf = randrange(10000, 30000) + chunk_size = randrange(1000, 3001) + batch_size = randrange(100, 501) + + error_message = f"\n Batch size: {batch_size} Chunk size: {chunk_size}\ + Number of entries: {entries_in_rdf}" + + def define_rdf(num_of_entries, file_name): + ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) rdfentry_ * 2")\ + .Define("b3", "(int) rdfentry_ + 10192")\ + .Define("b4", "(int) -rdfentry_")\ + .Define("b5", "(double) -rdfentry_ - 10192")\ + .Snapshot(tree_name, file_name) + + def test(size_of_batch, size_of_chunk, num_of_entries): + 
define_rdf(num_of_entries, file_name1) + define_rdf(num_of_entries, file_name2) + + try: + df1 = ROOT.RDataFrame(tree_name, file_name1) + df2 = ROOT.RDataFrame(tree_name, file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=size_of_batch, + target=["b3", "b5"], + weights="b2", + validation_split=0.3, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + collect_x = [] + + train_remainder = gen_train.last_batch_no_of_rows + val_remainder = gen_validation.last_batch_no_of_rows + + n_train_batches = gen_train.number_of_batches - \ + 1 if train_remainder else gen_train.number_of_batches + n_val_batches = gen_validation.number_of_batches - \ + 1 if val_remainder else gen_validation.number_of_batches + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for i in range(n_train_batches): + x, y, z = next(iter_train) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message + f" row: {i}") + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message + f" row: {i}") + # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message + f" row: {i}") + + collect_x.extend(list(x[:, 0])) + + if train_remainder: + x, y, z = next(iter_train) + self.assertTrue(x.shape == ( + train_remainder, 2), error_message) + self.assertTrue(y.shape == ( + train_remainder, 2), error_message) + self.assertTrue(z.shape == ( + train_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + for _ in range(n_val_batches): + x, y, z = next(iter_val) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + 
f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message) + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message) + # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message) + + collect_x.extend(list(x[:, 0])) + + if val_remainder: + x, y, z = next(iter_val) + self.assertTrue(x.shape == ( + val_remainder, 2), error_message) + self.assertTrue(y.shape == ( + val_remainder, 2), error_message) + self.assertTrue(z.shape == ( + val_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + self.assertTrue(set(collect_x) == set(i for i in range(num_of_entries)), f"collected length: {len(set(collect_x))}\ + generated length {len(set(i for i in range(num_of_entries)))}") + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + test(batch_size, chunk_size, entries_in_rdf) + + + def test15_two_runs_set_seed(self): + self.create_file1() + self.create_file2() + + try: + both_runs_collected_x_val = [] + both_runs_collected_y_val = [] + + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + for _ in range(2): + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + set_seed = 42, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + 
collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_runs_collected_x_val.append(collected_x_val) + both_runs_collected_y_val.append(collected_y_val) + self.assertEqual( + both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual( + both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + + def test16_vector_padding(self): + self.create_vector_file1() + self.create_vector_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name4) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name5) + max_vec_sizes = {"v1": 3, "v2": 2} + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b1", + validation_split=0.4, + max_vec_sizes=max_vec_sizes, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + + results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, + 1.0, 10.0, 0, 100.0, 1000.0, + 2.0, 20.0, 0, 200.0, 2000.0, + 5.0, 50.0, 0, 500.0, 5000.0, + 6.0, 60.0, 0.0, 600.0, 6000.0, + 7.0, 
70.0, 0.0, 700.0, 7000.0] + results_y_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 30.0, 0.0, 300.0, 3000.0, + 4.0, 40.0, 0.0, 400.0, 4000.0, + 8.0, 80.0, 0.0, 800.0, 8000.0, + 9.0, 90.0, 0.0, 900.0, 9000.0] + results_y_val = [3.0, 4.0, 8.0, 9.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 5)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name4) + self.teardown_file(self.file_name5) + + except: + self.teardown_file(self.file_name4) + self.teardown_file(self.file_name5) + raise if __name__ == '__main__': unittest.main()