diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py index bb6c0047e96d4..8388bdbd90620 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py @@ -2,10 +2,10 @@ # Author: Kristupas Pranckietis, Vilnius University 05/2024 # Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024 # Author: Vincenzo Eduardo Padulano, CERN 10/2024 -# Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025 +# Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 ################################################################################ -# Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. # +# Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. # # All rights reserved. # # # # For the licensing terms see $ROOTSYS/LICENSE. # @@ -82,10 +82,10 @@ def get_template( def __init__( self, - rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -96,6 +96,8 @@ def __init__( shuffle: bool = True, drop_remainder: bool = True, set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ): """Wrapper around the Cpp RBatchGenerator @@ -105,6 +107,10 @@ def __init__( chunk_size (int): The size of the chunks loaded from the ROOT file. Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is build up from multiple blocks. 
Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (dict[std, int], optional): @@ -134,6 +140,13 @@ def __init__( For reproducibility: Set the seed for the random number generator used to split the dataset into training and validation and shuffling of the chunks Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defuaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. """ import ROOT @@ -148,7 +161,7 @@ def __init__( using RBatchGenerator" ) - if chunk_size < batch_size: + if load_eager == False and chunk_size < batch_size: raise ValueError( f"chunk_size cannot be smaller than batch_size: chunk_size: \ {chunk_size}, batch_size: {batch_size}" @@ -160,7 +173,9 @@ def __init__( given value is {validation_split}" ) - self.noded_rdf = RDF.AsRNode(rdataframe) + if not isinstance(rdataframes, list): + rdataframes = [rdataframes] + self.noded_rdfs = [RDF.AsRNode(rdf) for rdf in rdataframes] if isinstance(target, str): target = [target] @@ -169,7 +184,7 @@ def __init__( self.weights_column = weights template, max_vec_sizes_list = self.get_template( - rdataframe, columns, max_vec_sizes + rdataframes[0], columns, max_vec_sizes ) self.num_columns = len(self.all_columns) @@ -222,7 +237,7 @@ def __init__( EnableThreadSafety() self.generator = TMVA.Experimental.Internal.RBatchGenerator(template)( - self.noded_rdf, + self.noded_rdfs, chunk_size, block_size, batch_size, @@ -234,6 +249,8 @@ def __init__( shuffle, drop_remainder, set_seed, + load_eager, + sampling_type, ) atexit.register(self.DeActivate) @@ -652,10 +669,10 @@ def __call__(self) -> Any: return None def CreateNumPyGenerators( 
- rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -666,6 +683,8 @@ def CreateNumPyGenerators( shuffle: bool = True, drop_remainder=True, set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ) -> Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: """ Return two batch generators based on the given ROOT file and tree or RDataFrame @@ -678,6 +697,10 @@ def CreateNumPyGenerators( chunk_size (int): The size of the chunks loaded from the ROOT file. Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is build up from multiple blocks. Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (list[int], optional): @@ -706,6 +729,20 @@ def CreateNumPyGenerators( [4, 5, 6, 7] will be returned. If drop_remainder = False, then three batches [0, 1, 2, 3], [4, 5, 6, 7] and [8, 9] will be returned. + set_seed (int): + For reproducibility: Set the seed for the random number generator used + to split the dataset into training and validation and shuffling of the chunks + Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defuaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. 
+ + + Returns: TrainRBatchGenerator or @@ -721,7 +758,7 @@ def CreateNumPyGenerators( import numpy as np base_generator = BaseGenerator( - rdataframe, + rdataframes, batch_size, chunk_size, block_size, @@ -734,7 +771,9 @@ def CreateNumPyGenerators( max_chunks, shuffle, drop_remainder, - set_seed, + set_seed, + load_eager, + sampling_type, ) train_generator = TrainRBatchGenerator( @@ -752,10 +791,10 @@ def CreateNumPyGenerators( def CreateTFDatasets( - rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -765,7 +804,9 @@ def CreateTFDatasets( max_chunks: int = 0, shuffle: bool = True, drop_remainder=True, - set_seed: int = 0, + set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ) -> Tuple[tf.data.Dataset, tf.data.Dataset]: """ Return two Tensorflow Datasets based on the given ROOT file and tree or RDataFrame @@ -778,6 +819,10 @@ def CreateTFDatasets( chunk_size (int): The size of the chunks loaded from the ROOT file. Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is build up from multiple blocks. Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (list[int], optional): @@ -806,6 +851,17 @@ def CreateTFDatasets( [4, 5, 6, 7] will be returned. If drop_remainder = False, then three batches [0, 1, 2, 3], [4, 5, 6, 7] and [8, 9] will be returned. 
+ set_seed (int): + For reproducibility: Set the seed for the random number generator used + to split the dataset into training and validation and shuffling of the chunks + Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defuaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. Returns: TrainRBatchGenerator or @@ -820,7 +876,7 @@ def CreateTFDatasets( import tensorflow as tf base_generator = BaseGenerator( - rdataframe, + rdataframes, batch_size, chunk_size, block_size, @@ -833,7 +889,9 @@ def CreateTFDatasets( max_chunks, shuffle, drop_remainder, - set_seed, + set_seed, + load_eager, + sampling_type, ) train_generator = TrainRBatchGenerator( @@ -901,10 +959,10 @@ def CreateTFDatasets( def CreatePyTorchGenerators( - rdataframe: ROOT.RDF.RNode, - batch_size: int, - chunk_size: int, - block_size: int, + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + batch_size: int = 0, + chunk_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -914,7 +972,9 @@ def CreatePyTorchGenerators( max_chunks: int = 0, shuffle: bool = True, drop_remainder=True, - set_seed: int = 0, + set_seed: int = 0, + load_eager: bool = False, + sampling_type: str = "random", ) -> Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: """ Return two Tensorflow Datasets based on the given ROOT file and tree or RDataFrame @@ -927,6 +987,10 @@ def CreatePyTorchGenerators( chunk_size (int): The size of the chunks loaded from the ROOT file. Higher chunk size results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is build up from multiple blocks. 
Lower block size results in + a better randomization, but also higher memory usage. columns (list[str], optional): Columns to be returned. If not given, all columns are used. max_vec_sizes (list[int], optional): @@ -955,6 +1019,17 @@ def CreatePyTorchGenerators( [4, 5, 6, 7] will be returned. If drop_remainder = False, then three batches [0, 1, 2, 3], [4, 5, 6, 7] and [8, 9] will be returned. + set_seed (int): + For reproducibility: Set the seed for the random number generator used + to split the dataset into training and validation and shuffling of the chunks + Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defuaults to False. + sampling_type (str): + Describes the mode of sampling from the dataframe(s). Options: 'random'. + Defaults to 'random' and requires load_eager = True. Returns: TrainRBatchGenerator or @@ -967,7 +1042,7 @@ def CreatePyTorchGenerators( validation generator will return no batches. 
""" base_generator = BaseGenerator( - rdataframe, + rdataframes, batch_size, chunk_size, block_size, @@ -980,7 +1055,9 @@ def CreatePyTorchGenerators( max_chunks, shuffle, drop_remainder, - set_seed, + set_seed, + load_eager, + sampling_type, ) train_generator = TrainRBatchGenerator( diff --git a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py b/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py index 43453702c553c..4aac5235dc2a5 100644 --- a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py +++ b/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py @@ -1170,6 +1170,2363 @@ def test16_vector_padding(self): self.teardown_file(self.file_name3) raise +class RBatchGeneratorEagerLoading(unittest.TestCase): + + file_name1 = "first_half.root" + file_name2 = "second_half.root" + file_name3 = "vector_columns.root" + tree_name = "mytree" + + # default constants + n_train_batch = 2 + n_val_batch = 1 + val_remainder = 1 + + # Helpers + def define_rdf(self, num_of_entries=10): + df = ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) b1*b1") + + return df + + def create_file(self, num_of_entries=10): + self.define_rdf(num_of_entries).Snapshot( + self.tree_name, self.file_name1) + + def create_5_entries_file(self): + df1 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 10")\ + .Define("b2", "(double) b1 * b1")\ + .Snapshot(self.tree_name, self.file_name2) + + def create_vector_file(self, num_of_entries=10): + df3 = ROOT.RDataFrame(10)\ + .Define("b1", "(int) rdfentry_")\ + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ + .Snapshot(self.tree_name, self.file_name3) + + def teardown_file(self, file): + os.remove(file) + + def test01_each_element_is_generated_unshuffled(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = 
ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True, + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0] + results_y_val = [36.0, 49.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test02_each_element_is_generated_shuffled(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + 
target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test04_dropping_remainder(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=True, + load_eager=True + ) + + collected_x = [] + collected_y = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + for x, y in 
gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + self.assertEqual(len(collected_x), 3) + self.assertEqual(len(collected_y), 3) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + + def test05_more_than_one_file(self): + self.create_file() + self.create_5_entries_file() + + try: + df = ROOT.RDataFrame( + self.tree_name, [self.file_name1, self.file_name2]) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] + results_x_val = [9.0, 10.0, 11.0, 12.0, 13.0, 14.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, + 25.0, 36.0, 49.0, 64.0] + results_y_val = [81.0, 100.0, 121.0, 144.0, 169.0, 196.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for x, y in gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + 
self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test06_multiple_target_columns(self): + file_name = "multiple_target_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name) + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] + results_z_val = [60.0, 70.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == 
(self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test07_multiple_input_columns(self): + file_name = "multiple_input_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Snapshot("myTree", file_name) + + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, + 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0] + results_x_val = [6.0, 60.0, 7.0, 70.0, 8.0, 80.0, 9.0, 90.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0] + results_y_val = [36.0, 49.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 2)) + 
self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 2)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 2)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test08_filtered(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + dff = df.Filter("b1 % 2 == 0", "name") + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + dff, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 2.0, 4.0] + results_x_val = [6.0, 8.0] + results_y_train = [0.0, 4.0, 16.0] + results_y_val = [36.0, 64.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + 
self.assertTrue(x.shape == (2, 1)) + self.assertTrue(y.shape == (2, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test09_filtered_last_chunk(self): + file_name = "filtered_last_chunk.root" + tree_name = "myTree" + + ROOT.RDataFrame(20)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Snapshot(tree_name, file_name) + + try: + df = ROOT.RDataFrame(tree_name, file_name) + + dff = df.Filter("b1 % 2 == 0", "name") + + gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( + dff, + batch_size=3, + target="b2", + validation_split=0, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 2.0, 4.0, 6.0, + 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] + results_y_train = [0.0, 4.0, 16.0, 36.0, + 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] + + collected_x_train = [] + collected_y_train = [] + + train_iter = iter(gen_train) + + for _ in range(3): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(train_iter) + self.assertTrue(x.shape == (1, 1)) + self.assertTrue(y.shape == (1, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_y_train = [ + y 
for yl in collected_y_train for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_y_train, flat_y_train) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test10_two_epochs_shuffled(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + both_epochs_collected_x_val = [] + both_epochs_collected_y_val = [] + + for _ in range(2): + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_epochs_collected_x_val.append(collected_x_val) + 
both_epochs_collected_y_val.append(collected_y_val) + + self.assertEqual( + both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual( + both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + + def test11_number_of_training_and_validation_batches_remainder(self): + self.create_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + number_of_training_batches = 0 + number_of_validation_batches = 0 + + for _ in gen_train: + number_of_training_batches += 1 + + for _ in gen_validation: + number_of_validation_batches += 1 + + self.assertEqual(gen_train.number_of_batches, + number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, + number_of_validation_batches) + self.assertEqual(gen_train.last_batch_no_of_rows, 0) + self.assertEqual(gen_validation.last_batch_no_of_rows, 1) + + self.teardown_file(self.file_name1) + + except: + self.teardown_file(self.file_name1) + raise + + def test12_PyTorch(self): + import torch + + file_name = "multiple_target_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name) + + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + df, + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + 
results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] + results_z_val = [60.0, 70.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) 
+ + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test13_TensorFlow(self): + import tensorflow as tf + + file_name = "multiple_target_columns.root" + + ROOT.RDataFrame(10)\ + .Define("b1", "(Short_t) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Define("b3", "(double) rdfentry_ * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name) + + try: + df = ROOT.RDataFrame("myTree", file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( + df, + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0, 0.0, 0.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0, 0.0, 0.0, 0.0, 0.0] + results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] + results_z_val = [60.0, 70.0, 80.0, 90.0, 0.0, 0.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.numpy().tolist()) + collected_y_train.append(y.numpy().tolist()) + collected_z_train.append(z.numpy().tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.numpy().tolist()) + collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + 
self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.numpy().tolist()) + collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name) + + except: + self.teardown_file(file_name) + raise + + def test14_big_data(self): + file_name = "big_data.root" + tree_name = "myTree" + + entries_in_rdf = randrange(10000, 30000) + chunk_size = randrange(1000, 3001) + batch_size = randrange(100, 501) + + error_message = f"\n Batch size: {batch_size} Chunk size: {chunk_size}\ + Number of entries: {entries_in_rdf}" + + def define_rdf(num_of_entries): + ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) rdfentry_ * 2")\ + .Define("b3", "(int) rdfentry_ + 10192")\ + .Define("b4", "(int) -rdfentry_")\ + .Define("b5", "(double) -rdfentry_ - 10192")\ + .Snapshot(tree_name, file_name) + + def test(size_of_batch, size_of_chunk, num_of_entries): + define_rdf(num_of_entries) + + try: + df = ROOT.RDataFrame(tree_name, file_name) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=size_of_batch, + target=["b3", "b5"], + weights="b2", + validation_split=0.3, + shuffle=False, + 
drop_remainder=False, + load_eager=True + ) + + collect_x = [] + + train_remainder = gen_train.last_batch_no_of_rows + val_remainder = gen_validation.last_batch_no_of_rows + + n_train_batches = gen_train.number_of_batches - \ + 1 if train_remainder else gen_train.number_of_batches + n_val_batches = gen_validation.number_of_batches - \ + 1 if val_remainder else gen_validation.number_of_batches + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for i in range(n_train_batches): + x, y, z = next(iter_train) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message + f" row: {i}") + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message + f" row: {i}") + # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message + f" row: {i}") + + collect_x.extend(list(x[:, 0])) + + if train_remainder: + x, y, z = next(iter_train) + self.assertTrue(x.shape == ( + train_remainder, 2), error_message) + self.assertTrue(y.shape == ( + train_remainder, 2), error_message) + self.assertTrue(z.shape == ( + train_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + for _ in range(n_val_batches): + x, y, z = next(iter_val) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message) + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message) + # 
self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message) + + collect_x.extend(list(x[:, 0])) + + if val_remainder: + x, y, z = next(iter_val) + self.assertTrue(x.shape == ( + val_remainder, 2), error_message) + self.assertTrue(y.shape == ( + val_remainder, 2), error_message) + self.assertTrue(z.shape == ( + val_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + self.assertTrue(set(collect_x) == set(i for i in range(num_of_entries)), f"collected length: {len(set(collect_x))}\ + generated length {len(set(i for i in range(num_of_entries)))}") + + except: + self.teardown_file(file_name) + raise + + test(batch_size, chunk_size, entries_in_rdf) + + + def test15_two_runs_set_seed(self): + self.create_file() + + try: + + both_runs_collected_x_val = [] + both_runs_collected_y_val = [] + + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + for _ in range(2): + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + set_seed = 42, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + 
x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_runs_collected_x_val.append(collected_x_val) + both_runs_collected_y_val.append(collected_y_val) + self.assertEqual( + both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual( + both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + + def test16_vector_padding(self): + self.create_vector_file() + + try: + df = ROOT.RDataFrame(self.tree_name, self.file_name3) + max_vec_sizes = {"v1": 3, "v2": 2} + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + df, + batch_size=3, + target="b1", + validation_split=0.4, + max_vec_sizes=max_vec_sizes, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + + results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, + 1.0, 10.0, 0, 100.0, 1000.0, + 2.0, 20.0, 0, 200.0, 2000.0, + 3.0, 30.0, 0, 300.0, 3000.0, + 4.0, 40.0, 0, 400.0, 4000.0, + 5.0, 50.0, 0, 500.0, 5000.0] + results_y_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 60.0, 0.0, 600.0, 6000.0, + 7.0, 70.0, 0.0, 700.0, 7000.0, + 8.0, 80.0, 0.0, 800.0, 8000.0, + 9.0, 90.0, 0.0, 900.0, 9000.0] + results_y_val = [6.0, 7.0, 8.0, 9.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in 
range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 5)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name3) + + except: + self.teardown_file(self.file_name3) + raise + +class RBatchGeneratorEagerLoadingMultipleDataframes(unittest.TestCase): + + file_name1 = "first_half.root" + file_name2 = "second_half.root" + file_name3 = "second_file.root" + file_name4 = "vector_columns_1.root" + file_name5 = "vector_columns_2.root" + tree_name = "mytree" + + # default constants + n_train_batch = 2 + n_val_batch = 1 + val_remainder = 1 + + # Helpers + def define_rdf1(self, num_of_entries=5): + df = ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) b1*b1") + + return df + + def define_rdf2(self, num_of_entries=5): + df = ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(double) b1*b1") + + return df + + def create_file1(self, num_of_entries=5): + self.define_rdf1(num_of_entries).Snapshot( + self.tree_name, self.file_name1) + + def create_file2(self, num_of_entries=5): + self.define_rdf2(num_of_entries).Snapshot( + self.tree_name, self.file_name2) + + def 
create_5_entries_file(self): + df1 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 10")\ + .Define("b2", "(double) b1 * b1")\ + .Snapshot(self.tree_name, self.file_name3) + + def create_vector_file1(self, num_of_entries=5): + df3 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ + .Snapshot(self.tree_name, self.file_name4) + + def create_vector_file2(self, num_of_entries=5): + df3 = ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ + .Snapshot(self.tree_name, self.file_name5) + + def teardown_file(self, file): + os.remove(file) + + + def test01_each_element_is_generated_unshuffled(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True, + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0] + results_y_train = [0.0, 1.0, 4.0, 25.0, 36.0, 49.0] + results_y_val = [9.0, 16.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + 
collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test02_each_element_is_generated_shuffled(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == 
(self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test04_dropping_remainder(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=True, + load_eager=True + ) + + collected_x = [] + collected_y = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + for x, y in gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x.append(x) + collected_y.append(y) + + self.assertEqual(len(collected_x), 3) + self.assertEqual(len(collected_y), 3) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + + def test05_more_than_one_file(self): + self.create_file1() + self.create_file2() + self.create_5_entries_file() + + try: + df1 = ROOT.RDataFrame( + self.tree_name, [self.file_name1, 
self.file_name2]) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name3) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 11.0, 12.0] + results_x_val = [6.0, 7.0, 8.0, 9.0, 13.0, 14.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, + 25.0, 100.0, 121.0, 144.0] + results_y_val = [36.0, 49.0, 64.0, 81.0, 169.0, 196.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + for x, y in gen_train: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for x, y in gen_validation: + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + self.teardown_file(self.file_name3) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + self.teardown_file(self.file_name3) + raise + + def test06_multiple_target_columns(self): + file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) 
b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name1) + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name2) + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] + results_z_val = [30.0, 40.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == (self.val_remainder, 1)) + 
collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test07_multiple_input_columns(self): + file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Snapshot("myTree", file_name1) + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Snapshot("myTree", file_name2) + + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, + 20.0, 5.0, 50.0, 6.0, 60.0, 7.0, 70.0] + results_x_val = [3.0, 30.0, 4.0, 40.0, 8.0, 80.0, 9.0, 90.0] + results_y_train = [0.0, 1.0, 4.0, 25.0, 36.0, 49.0] + 
results_y_val = [9.0, 16.0, 64.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 2)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 2)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 2)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test08_filtered(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + dff1 = df1.Filter("b1 % 2 == 0", "name") + dff2 = df2.Filter("b1 % 2 != 0", "name") + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [dff1, dff2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + 
results_x_train = [0.0, 2.0, 5.0] + results_x_val = [4.0, 9.0] + results_y_train = [0.0, 4.0, 25.0] + results_y_val = [16.0, 81.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (2, 1)) + self.assertTrue(y.shape == (2, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test09_filtered_last_chunk(self): + file_name1 = "filtered_last_chunk_1.root" + file_name2 = "filtered_last_chunk_2.root" + tree_name = "myTree" + + ROOT.RDataFrame(10)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Snapshot(tree_name, file_name1) + + ROOT.RDataFrame(10)\ + .Define("b1", "(int) rdfentry_ + 10")\ + .Define("b2", "(UShort_t) b1 * b1")\ + .Snapshot(tree_name, file_name2) + + try: + df1 = ROOT.RDataFrame(tree_name, file_name1) + df2 = ROOT.RDataFrame(tree_name, file_name2) + + dff1 = df1.Filter("b1 % 2 == 0", "name") + dff2 = df2.Filter("b1 % 2 == 0", "name") + + gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( + 
[dff1, dff2], + batch_size=3, + target="b2", + validation_split=0, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 2.0, 4.0, 6.0, + 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] + results_y_train = [0.0, 4.0, 16.0, 36.0, + 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] + + collected_x_train = [] + collected_y_train = [] + + train_iter = iter(gen_train) + + for _ in range(3): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(train_iter) + self.assertTrue(x.shape == (1, 1)) + self.assertTrue(y.shape == (1, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_y_train, flat_y_train) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test10_two_epochs_shuffled(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + both_epochs_collected_x_val = [] + both_epochs_collected_y_val = [] + + for _ in range(2): + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + 
collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_epochs_collected_x_val.append(collected_x_val) + both_epochs_collected_y_val.append(collected_y_val) + + self.assertEqual( + both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual( + both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + def test11_number_of_training_and_validation_batches_remainder(self): + self.create_file1() + self.create_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + number_of_training_batches = 0 + number_of_validation_batches = 0 + + for _ in gen_train: + number_of_training_batches += 1 + + for _ in gen_validation: + number_of_validation_batches += 1 + + 
self.assertEqual(gen_train.number_of_batches, + number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, + number_of_validation_batches) + self.assertEqual(gen_train.last_batch_no_of_rows, 0) + self.assertEqual(gen_validation.last_batch_no_of_rows, 1) + + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + except: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + raise + + def test12_PyTorch(self): + import torch + + file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name1) + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name2) + + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + [df1, df2], + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0] + results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] + results_z_val = [30.0, 40.0, 80.0, 90.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + 
self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + collected_z_train.append(z.tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 2)) + self.assertTrue(z.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + collected_z_val.append(z.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test13_TensorFlow(self): + import tensorflow as tf + + file_name1 = "multiple_target_columns_1.root" + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + 
.Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name1) + ROOT.RDataFrame(5)\ + .Define("b1", "(int) rdfentry_ + 5")\ + .Define("b2", "(int) b1 * b1")\ + .Define("b3", "(double) b1 * 10")\ + .Define("b4", "(double) b3 * 10")\ + .Snapshot("myTree", file_name2) + + try: + df1 = ROOT.RDataFrame("myTree", file_name1) + df2 = ROOT.RDataFrame("myTree", file_name2) + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( + [df1, df2], + batch_size=3, + target=["b2", "b4"], + weights="b3", + validation_split=0.4, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0, 0.0, 0.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, + 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0, 0.0, 0.0, 0.0, 0.0] + results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] + results_z_val = [30.0, 40.0, 80.0, 90.0, 0.0, 0.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + collected_z_train = [] + collected_z_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y, z = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_train.append(x.numpy().tolist()) + collected_y_train.append(y.numpy().tolist()) + collected_z_train.append(z.numpy().tolist()) + + for _ in range(self.n_val_batch): + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + collected_x_val.append(x.numpy().tolist()) + collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + x, y, z = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 2)) + self.assertTrue(z.shape == (3, 1)) + 
collected_x_val.append(x.numpy().tolist()) + collected_y_val.append(y.numpy().tolist()) + collected_z_val.append(z.numpy().tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + flat_z_train = [ + z for zl in collected_z_train for zs in zl for z in zs] + flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + self.assertEqual(results_z_train, flat_z_train) + self.assertEqual(results_z_val, flat_z_val) + + self.teardown_file(file_name1) + self.teardown_file(file_name2) + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + def test14_big_data(self): + file_name1 = "big_data_1.root" + file_name2 = "big_data_2.root" + tree_name = "myTree" + + entries_in_rdf = randrange(10000, 30000) + chunk_size = randrange(1000, 3001) + batch_size = randrange(100, 501) + + error_message = f"\n Batch size: {batch_size} Chunk size: {chunk_size}\ + Number of entries: {entries_in_rdf}" + + def define_rdf(num_of_entries, file_name): + ROOT.RDataFrame(num_of_entries)\ + .Define("b1", "(int) rdfentry_")\ + .Define("b2", "(double) rdfentry_ * 2")\ + .Define("b3", "(int) rdfentry_ + 10192")\ + .Define("b4", "(int) -rdfentry_")\ + .Define("b5", "(double) -rdfentry_ - 10192")\ + .Snapshot(tree_name, file_name) + + def test(size_of_batch, size_of_chunk, num_of_entries): + define_rdf(num_of_entries, file_name1) + define_rdf(num_of_entries, file_name2) + + try: + df1 = ROOT.RDataFrame(tree_name, file_name1) + df2 = ROOT.RDataFrame(tree_name, file_name2) + + gen_train, gen_validation = 
ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=size_of_batch, + target=["b3", "b5"], + weights="b2", + validation_split=0.3, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + collect_x = [] + + train_remainder = gen_train.last_batch_no_of_rows + val_remainder = gen_validation.last_batch_no_of_rows + + n_train_batches = gen_train.number_of_batches - \ + 1 if train_remainder else gen_train.number_of_batches + n_val_batches = gen_validation.number_of_batches - \ + 1 if val_remainder else gen_validation.number_of_batches + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for i in range(n_train_batches): + x, y, z = next(iter_train) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message + f" row: {i}") + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message + f" row: {i}") + # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message + f" row: {i}") + + collect_x.extend(list(x[:, 0])) + + if train_remainder: + x, y, z = next(iter_train) + self.assertTrue(x.shape == ( + train_remainder, 2), error_message) + self.assertTrue(y.shape == ( + train_remainder, 2), error_message) + self.assertTrue(z.shape == ( + train_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + for _ in range(n_val_batches): + x, y, z = next(iter_val) + + self.assertTrue(x.shape == (size_of_batch, 2), + error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), + error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), + error_message + f" row: {i} z 
shape: {z.shape}") + + self.assertTrue( + np.all(x[:, 0]*(-1) == x[:, 1]), error_message) + self.assertTrue( + np.all(x[:, 0]+10192 == y[:, 0]), error_message) + # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) + self.assertTrue( + np.all(x[:, 0]*2 == z[:, 0]), error_message) + + collect_x.extend(list(x[:, 0])) + + if val_remainder: + x, y, z = next(iter_val) + self.assertTrue(x.shape == ( + val_remainder, 2), error_message) + self.assertTrue(y.shape == ( + val_remainder, 2), error_message) + self.assertTrue(z.shape == ( + val_remainder, 1), error_message) + collect_x.extend(list(x[:, 0])) + + self.assertTrue(set(collect_x) == set(i for i in range(num_of_entries)), f"collected length: {len(set(collect_x))}\ + generated length {len(set(i for i in range(num_of_entries)))}") + + except: + self.teardown_file(file_name1) + self.teardown_file(file_name2) + raise + + test(batch_size, chunk_size, entries_in_rdf) + + + def test15_two_runs_set_seed(self): + self.create_file1() + self.create_file2() + + try: + both_runs_collected_x_val = [] + both_runs_collected_y_val = [] + + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) + + for _ in range(2): + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b2", + validation_split=0.4, + shuffle=True, + drop_remainder=False, + set_seed = 42, + load_eager=True + ) + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + iter_train = iter(gen_train) + iter_val = iter(gen_validation) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + for _ in range(self.n_val_batch): + x, y = next(iter_val) + self.assertTrue(x.shape == (3, 1)) + self.assertTrue(y.shape == (3, 1)) + 
collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + x, y = next(iter_val) + self.assertTrue(x.shape == (self.val_remainder, 1)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = { + x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = { + x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = { + y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = { + y for yl in collected_y_val for ys in yl for y in ys} + + self.assertEqual(len(flat_x_train), 6) + self.assertEqual(len(flat_x_val), 4) + self.assertEqual(len(flat_y_train), 6) + self.assertEqual(len(flat_y_val), 4) + + both_runs_collected_x_val.append(collected_x_val) + both_runs_collected_y_val.append(collected_y_val) + self.assertEqual( + both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual( + both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + finally: + self.teardown_file(self.file_name1) + self.teardown_file(self.file_name2) + + + def test16_vector_padding(self): + self.create_vector_file1() + self.create_vector_file2() + + try: + df1 = ROOT.RDataFrame(self.tree_name, self.file_name4) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name5) + max_vec_sizes = {"v1": 3, "v2": 2} + + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + [df1, df2], + batch_size=3, + target="b1", + validation_split=0.4, + max_vec_sizes=max_vec_sizes, + shuffle=False, + drop_remainder=False, + load_eager=True + ) + + + results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, + 1.0, 10.0, 0, 100.0, 1000.0, + 2.0, 20.0, 0, 200.0, 2000.0, + 5.0, 50.0, 0, 500.0, 5000.0, + 6.0, 60.0, 0.0, 600.0, 6000.0, + 7.0, 70.0, 0.0, 700.0, 7000.0] + results_y_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 30.0, 0.0, 300.0, 3000.0, + 4.0, 40.0, 0.0, 400.0, 4000.0, + 8.0, 80.0, 0.0, 800.0, 8000.0, + 9.0, 90.0, 0.0, 900.0, 
9000.0] + results_y_val = [3.0, 4.0, 8.0, 9.0] + + collected_x_train = [] + collected_x_val = [] + collected_y_train = [] + collected_y_val = [] + + train_iter = iter(gen_train) + val_iter = iter(gen_validation) + + for _ in range(self.n_val_batch): + x, y = next(val_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + for _ in range(self.n_train_batch): + x, y = next(train_iter) + self.assertTrue(x.shape == (3, 5)) + self.assertTrue(y.shape == (3, 1)) + collected_x_train.append(x.tolist()) + collected_y_train.append(y.tolist()) + + x, y = next(val_iter) + self.assertTrue(x.shape == (self.val_remainder, 5)) + self.assertTrue(y.shape == (self.val_remainder, 1)) + collected_x_val.append(x.tolist()) + collected_y_val.append(y.tolist()) + + flat_x_train = [ + x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [ + y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] + + self.assertEqual(results_x_train, flat_x_train) + self.assertEqual(results_x_val, flat_x_val) + self.assertEqual(results_y_train, flat_y_train) + self.assertEqual(results_y_val, flat_y_val) + + self.teardown_file(self.file_name4) + self.teardown_file(self.file_name5) + + except: + self.teardown_file(self.file_name4) + self.teardown_file(self.file_name5) + raise if __name__ == '__main__': unittest.main() diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt index b4217ac7e4d1b..52be7ff086aa4 100644 --- a/tmva/tmva/CMakeLists.txt +++ b/tmva/tmva/CMakeLists.txt @@ -449,6 +449,8 @@ ROOT_STANDARD_LIBRARY_PACKAGE(TMVAUtils TMVA/BatchGenerator/RChunkConstructor.hxx TMVA/BatchGenerator/RFlat2DMatrix.hxx TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx + TMVA/BatchGenerator/RDatasetLoader.hxx + TMVA/BatchGenerator/RSampler.hxx SOURCES diff 
--git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx index 623516165c6f3..fea9f4966b770 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx @@ -2,10 +2,10 @@ // Author: Kristupas Pranckietis, Vilnius University 05/2024 // Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024 // Author: Vincenzo Eduardo Padulano, CERN 10/2024 -// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025 +// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 /************************************************************************* - * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. * + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * * All rights reserved. * * * * For the licensing terms see $ROOTSYS/LICENSE. * @@ -16,7 +16,10 @@ #define TMVA_RBATCHGENERATOR #include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" +#include "TMVA/BatchGenerator/RSampler.hxx" #include "ROOT/RDF/RDatasetSpec.hxx" + +#include "TMVA/BatchGenerator/RDatasetLoader.hxx" #include "TMVA/BatchGenerator/RChunkLoader.hxx" #include "TMVA/BatchGenerator/RBatchLoader.hxx" #include "TROOT.h" @@ -46,38 +49,38 @@ template class RBatchGenerator { private: std::vector fCols; + std::vector fVecSizes; // clang-format on std::size_t fChunkSize; std::size_t fMaxChunks; std::size_t fBatchSize; std::size_t fBlockSize; - std::size_t fNumColumns; - std::size_t fNumChunkCols; - std::size_t fNumEntries; std::size_t fSetSeed; - std::size_t fSumVecSizes; - ROOT::RDF::RResultPtr> fEntries; float fValidationSplit; + std::unique_ptr> fDatasetLoader; std::unique_ptr> fChunkLoader; - std::unique_ptr fBatchLoader; std::unique_ptr fTrainingBatchLoader; std::unique_ptr fValidationBatchLoader; + std::unique_ptr fTrainingSampler; + std::unique_ptr fValidationSampler; + std::vector f_rdfs; + std::unique_ptr fLoadingThread; std::size_t 
fTrainingChunkNum; std::size_t fValidationChunkNum; - ROOT::RDF::RNode &f_rdf; - std::mutex fIsActiveMutex; bool fDropRemainder; bool fShuffle; + bool fLoadEager; + std::string fSampleType; + bool fIsActive{false}; // Whether the loading thread is active - bool fNotFiltered; bool fUseWholeFile; bool fEpochActive{false}; @@ -91,6 +94,12 @@ private: std::size_t fNumValidationChunks; // flattened buffers for chunks and temporary tensors (rows * cols) + std::vector fTrainingDatasets; + std::vector fValidationDatasets; + + RFlat2DMatrix fSampledTrainingDataset; + RFlat2DMatrix fSampledValidationDataset; + RFlat2DMatrix fTrainTensor; RFlat2DMatrix fTrainChunkTensor; @@ -98,14 +107,16 @@ private: RFlat2DMatrix fValidationChunkTensor; public: - RBatchGenerator(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize, + RBatchGenerator(const std::vector &rdfs, const std::size_t chunkSize, const std::size_t blockSize, const std::size_t batchSize, const std::vector &cols, const std::vector &vecSizes = {}, const float vecPadding = 0.0, const float validationSplit = 0.0, const std::size_t maxChunks = 0, bool shuffle = true, - bool dropRemainder = true, const std::size_t setSeed = 0) + bool dropRemainder = true, const std::size_t setSeed = 0, bool loadEager = false, + std::string sampleType = "random") - : f_rdf(rdf), + : f_rdfs(rdfs), fCols(cols), + fVecSizes(vecSizes), fChunkSize(chunkSize), fBlockSize(blockSize), fBatchSize(batchSize), @@ -114,38 +125,49 @@ public: fDropRemainder(dropRemainder), fSetSeed(setSeed), fShuffle(shuffle), - fNotFiltered(f_rdf.GetFilterNames().empty()), - fUseWholeFile(maxChunks == 0), - fNumColumns(cols.size()) + fLoadEager(loadEager), + fSampleType(sampleType), + fUseWholeFile(maxChunks == 0) { + if (fLoadEager) { + fDatasetLoader = std::make_unique>(f_rdfs, fValidationSplit, fCols, fVecSizes, + vecPadding, fShuffle, fSetSeed); + // split the datasets and extract the training and validation datasets + 
fDatasetLoader->SplitDatasets(); + fTrainingDatasets = fDatasetLoader->GetTrainingDatasets(); + fValidationDatasets = fDatasetLoader->GetValidationDatasets(); + + fTrainingSampler = std::make_unique(fTrainingDatasets, fSampleType, fShuffle, fSetSeed); + fValidationSampler = std::make_unique(fValidationDatasets, fSampleType, fShuffle, fSetSeed); + + // sample the training and validation dataset from the datasets + fTrainingSampler->Sampler(fSampledTrainingDataset); + fValidationSampler->Sampler(fSampledValidationDataset); + + fNumTrainingEntries = fTrainingSampler->GetNumEntries(); + fNumValidationEntries = fValidationSampler->GetNumEntries(); + } - fNumEntries = f_rdf.Count().GetValue(); - fEntries = f_rdf.Take("rdfentry_"); - - fSumVecSizes = std::accumulate(vecSizes.begin(), vecSizes.end(), 0); - fNumChunkCols = fNumColumns + fSumVecSizes - vecSizes.size(); - - // add the last element in entries to not go out of range when filling chunks - fEntries->push_back((*fEntries)[fNumEntries - 1] + 1); + else { + fChunkLoader = + std::make_unique>(f_rdfs[0], fChunkSize, fBlockSize, fValidationSplit, + fCols, fVecSizes, vecPadding, fShuffle, fSetSeed); - fChunkLoader = - std::make_unique>(f_rdf, fNumEntries, fEntries, fChunkSize, fBlockSize, fValidationSplit, - fCols, vecSizes, vecPadding, fShuffle, fSetSeed); - fBatchLoader = std::make_unique(fBatchSize, fNumChunkCols, fNumEntries, fDropRemainder); + // split the dataset into training and validation sets + fChunkLoader->SplitDataset(); - // split the dataset into training and validation sets - fChunkLoader->SplitDataset(); + fNumTrainingEntries = fChunkLoader->GetNumTrainingEntries(); + fNumValidationEntries = fChunkLoader->GetNumValidationEntries(); - // number of training and validation entries after the split - fNumValidationEntries = static_cast(fValidationSplit * fNumEntries); - fNumTrainingEntries = fNumEntries - fNumValidationEntries; + // number of training and validation chunks, calculated in RChunkConstructor + 
fNumTrainingChunks = fChunkLoader->GetNumTrainingChunks(); + fNumValidationChunks = fChunkLoader->GetNumValidationChunks(); + } - fTrainingBatchLoader = std::make_unique(fBatchSize, fNumChunkCols, fNumTrainingEntries, fDropRemainder); - fValidationBatchLoader = std::make_unique(fBatchSize, fNumChunkCols, fNumValidationEntries, fDropRemainder); - - // number of training and validation chunks, calculated in RChunkConstructor - fNumTrainingChunks = fChunkLoader->GetNumTrainingChunks(); - fNumValidationChunks = fChunkLoader->GetNumValidationChunks(); + fTrainingBatchLoader = std::make_unique(fBatchSize, fCols, fVecSizes, + fNumTrainingEntries, fDropRemainder); + fValidationBatchLoader = std::make_unique(fBatchSize, fCols, fVecSizes, + fNumValidationEntries, fDropRemainder); } ~RBatchGenerator() { DeActivate(); } @@ -157,7 +179,8 @@ public: fIsActive = false; } - fBatchLoader->DeActivate(); + fTrainingBatchLoader->DeActivate(); + fValidationBatchLoader->DeActivate(); if (fLoadingThread) { if (fLoadingThread->joinable()) { @@ -178,7 +201,8 @@ public: fIsActive = true; } - fBatchLoader->Activate(); + fTrainingBatchLoader->Activate(); + fValidationBatchLoader->Activate(); // fLoadingThread = std::make_unique(&RBatchGenerator::LoadChunks, this); } @@ -197,63 +221,77 @@ public: /// \brief Create training batches by first loading a chunk (see RChunkLoader) and split it into batches (see RBatchLoader) void CreateTrainBatches() { - fChunkLoader->CreateTrainingChunksIntervals(); - fTrainingEpochActive = true; - fTrainingChunkNum = 0; - fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); - fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, fNumTrainingChunks); - fTrainingChunkNum++; + fTrainingEpochActive = true; + if (fLoadEager) { + fTrainingBatchLoader->CreateBatches(fSampledTrainingDataset, 1); + } + + else { + fChunkLoader->CreateTrainingChunksIntervals(); + fTrainingChunkNum = 0; + fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); 
+ fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, fNumTrainingChunks); + fTrainingChunkNum++; + } } /// \brief Creates validation batches by first loading a chunk (see RChunkLoader), and then split it into batches (see RBatchLoader) void CreateValidationBatches() { - fChunkLoader->CreateValidationChunksIntervals(); - fValidationEpochActive = true; - fValidationChunkNum = 0; - fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); - fValidationBatchLoader->CreateBatches(fValidationChunkTensor, fNumValidationChunks); - fValidationChunkNum++; - } - - /// \brief Loads a training batch from the queue - RFlat2DMatrix GetTrainBatch() - { - auto batchQueue = fTrainingBatchLoader->GetNumBatchQueue(); - - // load the next chunk if the queue is empty - if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) { - fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); - std::size_t lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum; - fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, lastTrainingBatch); - fTrainingChunkNum++; + fValidationEpochActive = true; + if (fLoadEager) { + fValidationBatchLoader->CreateBatches(fSampledValidationDataset, 1); } else { - ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); + fChunkLoader->CreateValidationChunksIntervals(); + fValidationChunkNum = 0; + fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); + fValidationBatchLoader->CreateBatches(fValidationChunkTensor, fNumValidationChunks); + fValidationChunkNum++; } + } - // Get next batch if available - return fTrainingBatchLoader->GetBatch(); + /// \brief Loads a training batch from the queue + RFlat2DMatrix GetTrainBatch() + { + if (!fLoadEager) { + auto batchQueue = fTrainingBatchLoader->GetNumBatchQueue(); + + // load the next chunk if the queue is empty + if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) { + fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, 
fTrainingChunkNum); + std::size_t lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum; + fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, lastTrainingBatch); + fTrainingChunkNum++; + } + + else { + fChunkLoader->ResetDataframe(); + } + } + // Get next batch if available + return fTrainingBatchLoader->GetBatch(); } /// \brief Loads a validation batch from the queue RFlat2DMatrix GetValidationBatch() { - auto batchQueue = fValidationBatchLoader->GetNumBatchQueue(); - - // load the next chunk if the queue is empty - if (batchQueue < 1 && fValidationChunkNum < fNumValidationChunks) { - fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); - std::size_t lastValidationBatch = fNumValidationChunks - fValidationChunkNum; - fValidationBatchLoader->CreateBatches(fValidationChunkTensor, lastValidationBatch); - fValidationChunkNum++; - } + if (!fLoadEager) { + auto batchQueue = fValidationBatchLoader->GetNumBatchQueue(); + + // load the next chunk if the queue is empty + if (batchQueue < 1 && fValidationChunkNum < fNumValidationChunks) { + fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); + std::size_t lastValidationBatch = fNumValidationChunks - fValidationChunkNum; + fValidationBatchLoader->CreateBatches(fValidationChunkTensor, lastValidationBatch); + fValidationChunkNum++; + } - else { - ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); + else { + fChunkLoader->ResetDataframe(); + } } - // Get next batch if available return fValidationBatchLoader->GetBatch(); } diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx index d92f18a458abb..8cd69afb99410 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx @@ -41,6 +41,10 @@ which are loaded into a queue. 
This is done for both the training and validation class RBatchLoader { private: std::size_t fBatchSize; + // needed for calculating the total number of batch columns when vectors columns are present + std::vector fCols; + std::vector fVecSizes; + std::size_t fSumVecSizes; std::size_t fNumColumns; std::size_t fNumEntries; bool fDropRemainder; @@ -66,13 +70,18 @@ private: std::unique_ptr fSecondaryLeftoverBatch; public: - RBatchLoader(std::size_t batchSize, std::size_t numColumns, std::size_t numEntries, bool dropRemainder) + RBatchLoader(std::size_t batchSize, const std::vector &cols, const std::vector &vecSizes = {}, + std::size_t numEntries = 0, bool dropRemainder = false) : fBatchSize(batchSize), - fNumColumns(numColumns), + fCols(cols), + fVecSizes(vecSizes), fNumEntries(numEntries), fDropRemainder(dropRemainder) { + fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); + fNumColumns = fCols.size() + fSumVecSizes - fVecSizes.size(); + if (fBatchSize == 0) { fBatchSize = fNumEntries; } diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx index f2396dfaecc8c..60b73559af1cb 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx @@ -136,14 +136,11 @@ private: std::unique_ptr fValidation; public: - RChunkLoader(ROOT::RDF::RNode &rdf, std::size_t numEntries, - ROOT::RDF::RResultPtr> rdf_entries, const std::size_t chunkSize, - const std::size_t blockSize, const float validationSplit, const std::vector &cols, + RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize, + const float validationSplit, const std::vector &cols, const std::vector &vecSizes = {}, const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0) : f_rdf(rdf), - fNumEntries(numEntries), - fEntries(rdf_entries), fCols(cols), fVecSizes(vecSizes), fVecPadding(vecPadding), @@ -155,6 +152,13 @@ public: 
fSetSeed(setSeed) { fTensorOperators = std::make_unique(fShuffle, fSetSeed); + + fNumEntries = f_rdf.Count().GetValue(); + fEntries = f_rdf.Take("rdfentry_"); + + // add the last element in entries to not go out of range when filling chunks + fEntries->push_back((*fEntries)[fNumEntries - 1] + 1); + fNumCols = fCols.size(); fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); @@ -418,6 +422,11 @@ public: } } + void ResetDataframe() + { + ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); + } + std::vector GetTrainingChunkSizes() { return fTraining->ChunksSizes; } std::vector GetValidationChunkSizes() { return fValidation->ChunksSizes; } diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx new file mode 100644 index 0000000000000..17f627640cb7a --- /dev/null +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx @@ -0,0 +1,220 @@ +// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 + +/************************************************************************* + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef TMVA_RDATASETLOADER +#define TMVA_RDATASETLOADER + +#include +#include + +#include "TMVA/RTensor.hxx" +#include "ROOT/RDataFrame.hxx" +#include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" +#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" +#include "ROOT/RDF/Utils.hxx" +#include "ROOT/RVec.hxx" + +#include "ROOT/RLogger.hxx" + +namespace TMVA { +namespace Experimental { +namespace Internal { + +// clang-format off +/** +\class ROOT::TMVA::Experimental::Internal::RDatasetLoaderFunctor +\ingroup tmva +\brief Loading chunks made in RDatasetLoader into tensors from data from RDataFrame. 
+*/ + +template +class RDatasetLoaderFunctor { + // clang-format on + std::size_t fOffset{}; + std::size_t fVecSizeIdx{}; + float fVecPadding{}; + std::vector fMaxVecSizes{}; + RFlat2DMatrix &fDatasetTensor; + + std::size_t fNumDatasetCols; + + int fI; + int fNumColumns; + + ////////////////////////////////////////////////////////////////////////// + /// \brief Copy the content of a column into RTensor when the column consists of vectors + template ::value, int> = 0> + void AssignToTensor(const T &vec, int i, int numColumns) + { + std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++]; + std::size_t vec_size = vec.size(); + if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding + { + std::copy(vec.begin(), vec.end(), &fDatasetTensor.GetData()[fOffset + numColumns * i]); + std::fill(&fDatasetTensor.GetData()[fOffset + numColumns * i + vec_size], + &fDatasetTensor.GetData()[fOffset + numColumns * i + max_vec_size], fVecPadding); + } else // Copy only max_vec_size length from vector column + { + std::copy(vec.begin(), vec.begin() + max_vec_size, &fDatasetTensor.GetData()[fOffset + numColumns * i]); + } + fOffset += max_vec_size; + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Copy the content of a column into RTensor when the column consists of single values + template ::value, int> = 0> + void AssignToTensor(const T &val, int i, int numColumns) + { + fDatasetTensor.GetData()[fOffset + numColumns * i] = val; + fOffset++; + } + +public: + RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns, + const std::vector &maxVecSizes, float vecPadding, int i) + : fDatasetTensor(datasetTensor), fMaxVecSizes(maxVecSizes), fVecPadding(vecPadding), fI(i), fNumColumns(numColumns) + { + } + + void operator()(const ColTypes &...cols) + { + fVecSizeIdx = 0; + (AssignToTensor(cols, fI, fNumColumns), ...); + } +}; + +// clang-format off +/** +\class 
ROOT::TMVA::Experimental::Internal::RDatasetLoader +\ingroup tmva +\brief Load the whole dataset into memory. + +In this class the whole dataset is loaded into memory. The dataset is further shuffled and split into training and validation sets with the user-defined validation split fraction. +*/ + +template +class RDatasetLoader {
private: + // clang-format on + std::size_t fNumEntries; + float fValidationSplit; + + std::vector fVecSizes; + std::size_t fSumVecSizes; + std::size_t fVecPadding; + std::size_t fNumDatasetCols; + + std::vector fTrainingDatasets; + std::vector fValidationDatasets; + + std::size_t fNumTrainingEntries; + std::size_t fNumValidationEntries; + std::unique_ptr fTensorOperators; + + std::vector f_rdfs; + std::vector fCols; + std::size_t fNumCols; + std::size_t fSetSeed; + + bool fNotFiltered; + bool fShuffle; + + ROOT::RDF::RResultPtr> fEntries; + +public: + RDatasetLoader(const std::vector &rdfs, const float validationSplit, + const std::vector &cols, const std::vector &vecSizes = {}, + const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0) + : f_rdfs(rdfs), + fCols(cols), + fVecSizes(vecSizes), + fVecPadding(vecPadding), + fValidationSplit(validationSplit), + fShuffle(shuffle), + fSetSeed(setSeed) + { + fTensorOperators = std::make_unique(fShuffle, fSetSeed); + fNumCols = fCols.size(); + fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); + + fNumDatasetCols = fNumCols + fSumVecSizes - fVecSizes.size(); + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Split an individual dataframe into a training and validation dataset + /// \param[in] rdf Dataframe that will be split into training and validation + /// \param[in] TrainingDataset Tensor for the training dataset + /// \param[in] ValidationDataset Tensor for the validation dataset + void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset) + { + 
std::size_t NumEntries = rdf.Count().GetValue(); + ROOT::RDF::RResultPtr> Entries = rdf.Take("rdfentry_"); + + // add the last element in entries to not go out of range when filling chunks + Entries->push_back((*Entries)[NumEntries - 1] + 1); + + // number of training and validation entries after the split + std::size_t NumValidationEntries = static_cast(fValidationSplit * NumEntries); + std::size_t NumTrainingEntries = NumEntries - NumValidationEntries; + + RFlat2DMatrix Dataset({NumEntries, fNumDatasetCols}); + + bool NotFiltered = rdf.GetFilterNames().empty(); + if (NotFiltered) { + RDatasetLoaderFunctor func(Dataset, fNumDatasetCols, fVecSizes, fVecPadding, 0); + rdf.Foreach(func, fCols); + } + + else { + std::size_t datasetEntry = 0; + for (std::size_t j = 0; j < NumEntries; j++) { + RDatasetLoaderFunctor func(Dataset, fNumDatasetCols, fVecSizes, fVecPadding, datasetEntry); + ROOT::Internal::RDF::ChangeBeginAndEndEntries(rdf, (*Entries)[j], (*Entries)[j + 1]); + rdf.Foreach(func, fCols); + datasetEntry++; + } + } + RFlat2DMatrix ShuffledDataset({NumEntries, fNumDatasetCols}); + fTensorOperators->ShuffleTensor(ShuffledDataset, Dataset); + fTensorOperators->SliceTensor(TrainingDataset, ShuffledDataset, {{0, NumTrainingEntries}, {0, fNumDatasetCols}}); + fTensorOperators->SliceTensor(ValidationDataset, ShuffledDataset, {{NumTrainingEntries, NumEntries}, {0, fNumDatasetCols}}); + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Split the dataframes in a training and validation dataset + void SplitDatasets() + { + fNumEntries = 0; + fNumTrainingEntries = 0; + fNumValidationEntries = 0; + + for (auto& rdf : f_rdfs) { + RFlat2DMatrix TrainingDataset; + RFlat2DMatrix ValidationDataset; + + SplitDataframe(rdf, TrainingDataset, ValidationDataset); + fTrainingDatasets.push_back(TrainingDataset); + fValidationDatasets.push_back(ValidationDataset); + + fNumTrainingEntries += TrainingDataset.GetRows(); + fNumValidationEntries += 
ValidationDataset.GetRows(); + fNumEntries += TrainingDataset.GetRows() + ValidationDataset.GetRows(); + } + } + + std::vector GetTrainingDatasets() {return fTrainingDatasets;} + std::vector GetValidationDatasets() {return fValidationDatasets;} + +}; + +} // namespace Internal +} // namespace Experimental +} // namespace TMVA +#endif // TMVA_RDATASETLOADER diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx index efaabc9bc0be6..0c3ebb044822d 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx @@ -67,6 +67,48 @@ public: ShuffledTensor.GetData() + i * cols); } } + + void SliceTensor(RFlat2DMatrix& SlicedTensor, RFlat2DMatrix& Tensor, + const std::vector>& slice) + { + const auto& rowSlice = slice[0]; + const auto& colSlice = slice[1]; + + std::size_t rowStart = rowSlice[0]; + std::size_t rowEnd = rowSlice[1]; + std::size_t colStart = colSlice[0]; + std::size_t colEnd = colSlice[1]; + + std::size_t rows = rowEnd - rowStart; + std::size_t cols = colEnd - colStart; + + SlicedTensor.Resize(rows, cols); + std::copy(Tensor.GetData() + rowStart * cols, + Tensor.GetData() + rowStart * cols + rows * cols, + SlicedTensor.GetData()); + } + + void ConcatenateTensors(RFlat2DMatrix &ConcatTensor, std::vector &Tensors) + { + std::size_t cols = Tensors[0].GetCols(); + std::size_t rows = 0; + + for (const auto& t : Tensors) { + rows += t.GetRows(); + } + + ConcatTensor.Resize(rows, cols); + + std::size_t index = 0; + for (std::size_t i = 0; i < Tensors.size(); i++) { + std::size_t tensorRows = Tensors[i].GetRows(); + std::copy(Tensors[i].GetData(), + Tensors[i].GetData() + tensorRows * cols, + ConcatTensor.GetData() + index * cols); + index += tensorRows; + } + } + }; diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx new file mode 100644 index 
0000000000000..5e5fe5cc5b879 --- /dev/null +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx @@ -0,0 +1,76 @@ +// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 + +/************************************************************************* + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef TMVA_RSAMPLER +#define TMVA_RSAMPLER + +#include +#include +#include + +#include "ROOT/RDataFrame.hxx" +#include "ROOT/RDF/Utils.hxx" +#include "ROOT/RVec.hxx" +#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" +#include "ROOT/RLogger.hxx" + +namespace TMVA::Experimental::Internal { +// clang-format off +/** +\class ROOT::TMVA::Experimental::Internal::RSampler +\ingroup tmva +\brief Implementation of different sampling strategies. +*/ + +class RSampler { +private: + // clang-format on + std::vector &fDatasets; + std::string fSampleType; + bool fShuffle; + std::size_t fSetSeed; + std::size_t fNumEntries; + + std::unique_ptr fTensorOperators; +public: + RSampler(std::vector &datasets, const std::string &sampleType, bool shuffle = true, const std::size_t setSeed = 0) + : fDatasets(datasets), + fSampleType(sampleType), + fShuffle(shuffle), + fSetSeed(setSeed) + { + fTensorOperators = std::make_unique(fShuffle, fSetSeed); + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Collection of sampling types + /// \param[in] SampledTensor Tensor with all the sampled entries + void Sampler(RFlat2DMatrix &SampledTensor) + { + if (fSampleType == "random") { + RandomSampler(SampledTensor); + } + } + + ////////////////////////////////////////////////////////////////////////// + /// \brief Sample all entries randomly from the datasets + /// \param[in] SampledTensor Tensor with all the 
sampled entries + void RandomSampler(RFlat2DMatrix &SampledTensor) { + RFlat2DMatrix ConcatTensor; + fTensorOperators->ConcatenateTensors(ConcatTensor, fDatasets); + fTensorOperators->ShuffleTensor(SampledTensor, ConcatTensor); + fNumEntries = SampledTensor.GetRows(); + } + + std::size_t GetNumEntries() { return fNumEntries;} +}; + +} // namespace TMVA::Experimental::Internal +#endif // TMVA_RSAMPLER