// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/*****************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
 * Package: TMVA                                                             *
 * Class  : TAdadelta                                                        *
 * Web    : http://tmva.sourceforge.net                                      *
 *                                                                           *
 * Description:                                                              *
 *      Adadelta Optimizer Class                                             *
 *                                                                           *
 * Authors (alphabetical):                                                   *
 *      Ravi Kiran S - CERN, Switzerland                                     *
 *                                                                           *
 * Copyright (c) 2005-2018:                                                  *
 *      CERN, Switzerland                                                    *
 *      U. of Victoria, Canada                                               *
 *      MPI-K Heidelberg, Germany                                            *
 *      U. of Bonn, Germany                                                  *
 *                                                                           *
 * Redistribution and use in source and binary forms, with or without        *
 * modification, are permitted according to the terms listed in LICENSE      *
 * (http://tmva.sourceforge.net/LICENSE)                                     *
 *****************************************************************************/

#ifndef TMVA_DNN_ADADELTA
#define TMVA_DNN_ADADELTA

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TAdadelta
 *  Adadelta Optimizer class
 *
 *  This class represents the Adadelta Optimizer.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdadelta : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fRho;     ///< The Rho constant used by the optimizer.
   Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< The accumulation of the square of the past
                                                                   ///< weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< The accumulation of the square of the past bias
                                                                   ///< gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates;   ///< The accumulation of the square of the past
                                                                   ///< weight updates associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates;     ///< The accumulation of the square of the past bias
                                                                   ///< updates associated with the deep net.

   std::vector<std::vector<Matrix_t>> fWorkWeightTensor1; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor1;   ///< working tensor used to keep a temporary copy of bias or bias gradients
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor2; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor2;   ///< working tensor used to keep a temporary copy of bias or bias gradients

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdadelta() = default;
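   /*! Illustrative usage (a sketch only, not part of the original interface
    *  documentation): assuming a concrete architecture such as TCpu<Double_t>
    *  and a deep net already built for that architecture, the optimizer is
    *  constructed with the default Adadelta hyper-parameters and then driven
    *  by the training loop through the VOptimizer base-class interface:
    *  \code
    *  TAdadelta<TCpu<Double_t>> optimizer(deepNet, 1.0, 0.95, 1e-8);
    *  \endcode
    */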
   /*! Getters */
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() { return fPastSquaredWeightUpdates; }
   std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(size_t i) { return fPastSquaredWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() { return fPastSquaredBiasUpdates; }
   std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(size_t i) { return fPastSquaredBiasUpdates[i]; }
};

//
//
//  The Adadelta Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdadelta<Architecture_t, Layer_t, DeepNet_t>::TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t rho,
                                                         Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();

   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fPastSquaredWeightUpdates.resize(layersNSlices);
   fPastSquaredBiasUpdates.resize(layersNSlices);
   fWorkWeightTensor1.resize(layersNSlices);
   fWorkBiasTensor1.resize(layersNSlices);
   fWorkWeightTensor2.resize(layersNSlices);
   fWorkBiasTensor2.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fPastSquaredWeightUpdates[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fPastSquaredBiasUpdates[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
   }
}
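// Summary of the Adadelta step performed by UpdateWeights/UpdateBiases below,
// written per element in the notation of the in-line comments (Vt = running
// average of squared gradients, Wt = running average of squared updates,
// rho = fRho, eps = fEpsilon, g = current gradient):
//
//    Vt    = rho * Vt-1 + (1 - rho) * g^2
//    d     = sqrt(Wt-1 + eps) / sqrt(Vt + eps) * g
//    theta = theta - learningRate * d
//    Wt    = rho * Wt-1 + (1 - rho) * d^2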
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                  const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {
      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkWeightTensor1[layerIndex][i];
      auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][i];

      // Vt = rho * Vt-1 + (1-rho) * currentSquaredWeightGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[i], accumulation);

      // updating the weights.
      // currentWeightUpdates = sqrt(Wt + epsilon) * currentGradients / sqrt(Vt + epsilon)

      // dummy1 = sqrt(Wt + epsilon)
      auto &dummy1 = fWorkWeightTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse the work tensor for the weight updates now
      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::Hadamard(currentWeightUpdates, dummy1);

      // theta = theta - learningRate * currentWeightUpdates
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1-rho) * currentSquaredWeightUpdates
      // re-use accumulation matrix used for temporary storing of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
      Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
   }
}
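// Worked first step of the formulas above (illustrative numbers only):
// with rho = 0.95, eps = 1e-8, learningRate = 1, a single element with
// gradient g = 0.1 and both accumulators starting at zero,
//    Vt = 0.05 * 0.1^2                           = 5.0e-4
//    d  = sqrt(1e-8) / sqrt(5.0e-4 + 1e-8) * 0.1 ~= 4.47e-4
//    the weight decreases by 1.0 * d, and Wt = 0.05 * d^2 ~= 1.0e-8.
// UpdateBiases below applies the same sequence of element-wise operations to
// the bias matrices, using fWorkBiasTensor1/2 as scratch space.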
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                 const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);

   for (size_t i = 0; i < biasesNSlices; i++) {
      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkBiasTensor1[layerIndex][i];

      // Vt = rho * Vt-1 + (1-rho) * currentSquaredBiasGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[i], accumulation);

      // updating the biases.
      // currentBiasUpdates = sqrt(Wt + epsilon) * currentGradients / sqrt(Vt + epsilon)

      // dummy1 = sqrt(Wt + epsilon)
      auto &dummy1 = fWorkBiasTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentBiasUpdates = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::Hadamard(currentBiasUpdates, dummy1);

      // theta = theta - learningRate * currentBiasUpdates
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1-rho) * currentSquaredBiasUpdates
      // re-use accumulation matrix used for temporary storing of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredBiasUpdates = fWorkBiasTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
      Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
   }
}

} // namespace DNN
} // namespace TMVA

#endif