## \file
## \ingroup tutorial_tmva
## \notebook -nodraw
## This tutorial illustrates how to prepare ROOT datasets to be nicely readable
## by most machine learning methods. This requires filtering the inital complex
## datasets and writing the data in a flat format.
##
## \macro_code
## \macro_output
##
## \date August 2019
## \author Stefan Wunsch

import ROOT


def filter_events(df):
    """
    Reduce initial dataset to only events which shall be used for training
    """
    return df.Filter("nElectron>=2 && nMuon>=2", "At least two electrons and two muons")


def define_variables(df):
    """
    Define the variables which shall be used for training
    """
    return df.Define("Muon_pt_1", "Muon_pt[0]")\
             .Define("Muon_pt_2", "Muon_pt[1]")\
             .Define("Electron_pt_1", "Electron_pt[0]")\
             .Define("Electron_pt_2", "Electron_pt[1]")


variables = ["Muon_pt_1", "Muon_pt_2", "Electron_pt_1", "Electron_pt_2"]


if __name__ == "__main__":
    for filename, label in [["SMHiggsToZZTo4L.root", "signal"], ["ZZTo2e2mu.root", "background"]]:
        print(">>> Extract the training and testing events for {} from the {} dataset.".format(
            label, filename))

        # Load dataset, filter the required events and define the training variables
        filepath = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/" + filename
        df = ROOT.RDataFrame("Events", filepath)
        df = filter_events(df)
        df = define_variables(df)

        # Book cutflow report
        report = df.Report()

        # Split dataset by event number for training and testing
        columns = ROOT.std.vector["string"](variables)
        df.Filter("event % 2 == 0", "Select events with even event number for training")\
          .Snapshot("Events", "train_" + label + ".root", columns)
        df.Filter("event % 2 == 1", "Select events with odd event number for training")\
          .Snapshot("Events", "test_" + label + ".root", columns)

        # Print cutflow report
        report.Print()