// @(#)root/tmva/pymva $Id$ // Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015 /********************************************************************************** * Project: TMVA - a Root-integrated toolkit for multivariate data analysis * * Package: TMVA * * Class : MethodPyRandomForest * * Web : http://oproject.org * * * * Description: * * scikit-learn Package RandomForestClassifier method based on python * * * **********************************************************************************/ #ifndef ROOT_TMVA_MethodPyRandomForest #define ROOT_TMVA_MethodPyRandomForest ////////////////////////////////////////////////////////////////////////// // // // MethodPyRandomForest // // // ////////////////////////////////////////////////////////////////////////// #include "TMVA/PyMethodBase.h" namespace TMVA { class Factory; // DSMTEST class Reader; // DSMTEST class DataSetManager; // DSMTEST class Types; class MethodPyRandomForest : public PyMethodBase { public : // constructors MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption = ""); MethodPyRandomForest(DataSetInfo &dsi, const TString &theWeightFile); ~MethodPyRandomForest(void); void Train(); // options treatment void Init(); void DeclareOptions(); void ProcessOptions(); // create ranking const Ranking *CreateRanking(); Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets); // performs classifier testing virtual void TestClassification(); // Get class probabilities of given event Double_t GetMvaValue(Double_t *errLower = 0, Double_t *errUpper = 0); std::vector GetMvaValues(Long64_t firstEvt = 0, Long64_t lastEvt = -1, Bool_t logProgress = false); std::vector& GetMulticlassValues(); using MethodBase::ReadWeightsFromStream; // the actual "weights" virtual void AddWeightsXMLTo(void * /* parent */) const {} // = 0; virtual void ReadWeightsFromXML(void * /* wghtnode */) {} // = 0; virtual void ReadWeightsFromStream(std::istream &) {} //= 0; // backward compatibility void ReadModelFromFile(); private : DataSetManager *fDataSetManager; // DSMTEST friend class Factory; // DSMTEST friend class Reader; // DSMTEST protected: std::vector mvaValues; std::vector classValues; UInt_t fNvars; // number of variables UInt_t fNoutputs; // number of outputs TString fFilenameClassifier; // Path to serialized classifier (default in `weights` folder) // RandomForest options PyObject* pNestimators; Int_t fNestimators; //integer, optional (default=10) //The number of trees in the forest. PyObject* pCriterion; TString fCriterion; //string, optional (default="gini") //The function to measure the quality of a split. Supported criteria are //"gini" for the Gini impurity and "entropy" for the information gain. //Note: this parameter is tree-specific. PyObject* pMaxDepth; TString fMaxDepth; //integer or None, optional (default=None) //The maximum depth of the tree. If None, then nodes are expanded until //all leaves are pure or until all leaves contain less than `fMinSamplesSplit`. PyObject* pMinSamplesSplit; Int_t fMinSamplesSplit; //integer, optional (default=2) //The minimum number of samples required to split an internal node. PyObject* pMinSamplesLeaf; Int_t fMinSamplesLeaf; //integer, optional (default=1) //The minimum number of samples in newly created leaves. A split is //discarded if after the split, one of the leaves would contain less then //``min_samples_leaf`` samples. //Note: this parameter is tree-specific. PyObject* pMinWeightFractionLeaf; Double_t fMinWeightFractionLeaf; //float, optional (default=0.) //The minimum weighted fraction of the input samples required to be at a //leaf node. //Note: this parameter is tree-specific. PyObject* pMaxFeatures; TString fMaxFeatures; //int, float, string or None, optional (default="auto") //The number of features to consider when looking for the best split: //- If int, then consider `max_features` features at each split. //- If float, then `max_features` is a percentage and //`int(max_features * n_features)` features are considered at each split. //- If "auto", then `max_features=sqrt(n_features)`. //- If "sqrt", then `max_features=sqrt(n_features)`. //- If "log2", then `max_features=log2(n_features)`. //- If None, then `max_features=n_features`. // Note: the search for a split does not stop until at least one // valid partition of the node samples is found, even if it requires to // effectively inspect more than ``max_features`` features. // Note: this parameter is tree-specific. PyObject* pMaxLeafNodes; TString fMaxLeafNodes; //int or None, optional (default=None) //Grow trees with ``max_leaf_nodes`` in best-first fashion. //Best nodes are defined as relative reduction in impurity. //If None then unlimited number of leaf nodes. //If not None then ``max_depth`` will be ignored. PyObject* pBootstrap; Bool_t fBootstrap; //boolean, optional (default=True) //Whether bootstrap samples are used when building trees. PyObject* pOobScore; Bool_t fOobScore; //Whether to use out-of-bag samples to estimate //the generalization error. PyObject* pNjobs; Int_t fNjobs; // integer, optional (default=1) //The number of jobs to run in parallel for both `fit` and `predict`. //If -1, then the number of jobs is set to the number of cores. PyObject* pRandomState; TString fRandomState; //int, RandomState instance or None, optional (default=None) //If int, random_state is the seed used by the random number generator; //If RandomState instance, random_state is the random number generator; //If None, the random number generator is the RandomState instance used //by `np.random`. PyObject* pVerbose; Int_t fVerbose; //Controls the verbosity of the tree building process. PyObject* pWarmStart; Bool_t fWarmStart; //bool, optional (default=False) //When set to ``True``, reuse the solution of the previous call to fit //and add more estimators to the ensemble, otherwise, just fit a whole //new forest. PyObject* pClassWeight; TString fClassWeight; //dict, list of dicts, "auto", "subsample" or None, optional //Weights associated with classes in the form ``{class_label: weight}``. //If not given, all classes are supposed to have weight one. For //multi-output problems, a list of dicts can be provided in the same //order as the columns of y. //The "auto" mode uses the values of y to automatically adjust //weights inversely proportional to class frequencies in the input data. //The "subsample" mode is the same as "auto" except that weights are //computed based on the bootstrap sample for every tree grown. //For multi-output, the weights of each column of y will be multiplied. //Note that these weights will be multiplied with sample_weight (passed //through the fit method) if sample_weight is specified. // get help message text void GetHelpMessage() const; ClassDef(MethodPyRandomForest, 0) }; } // namespace TMVA #endif // ROOT_TMVA_MethodPyRandomForest