""" **sPlot** is reweighting technique frequently used in HEP to reconstruct the distributions of features in mixture. Initial information used is the probabilities obtained after fitting. **hep_ml.splot** contains standalone python implementation of this technique. This implementation is brilliantly simple and clear - just as it should be. Example ------- >>> from hep_ml.splot import compute_sweights >>> p = pandas.DataFrame({'signal': p_signal, 'bkg', b_bkg}) >>> sWeights = compute_sweights(p) >>> # plotting reconstructed distribution of some other variable >>> plt.hist(other_var, weights=sWeights.signal) >>> plt.hist(other_var, weights=sWeights.bkg) For more examples and explanations, see notebooks/Splot in repository. """ from __future__ import division, print_function, absolute_import import pandas import numpy from .commonutils import check_sample_weight __author__ = 'Alex Rogozhnikov' def compute_sweights(probabilities, sample_weight=None): """Computes sWeights based on probabilities obtained from distribution fit. :param probabilities: pandas.DataFrame with probabilities of shape [n_samples, n_classes]. These probabilities are obtained after fit (typically, mass fit). Pay attention, that for each sample sum of probabilities should be equal to 1. :param sample_weight: optionally you can pass weights of events, numpy.array of shape [n_samples] :return: pandas.DataFrame with sWeights of shape [n_samples, n_classes] """ # converting to pandas.DataFrame probabilities = pandas.DataFrame(probabilities) # checking sample_weight sample_weight = check_sample_weight(probabilities, sample_weight=sample_weight) # checking that all weights are positive assert numpy.all(sample_weight >= 0), 'sample weight are expected to be non-negative' p = numpy.array(probabilities) # checking that probabilities sum up to 1. assert numpy.allclose(p.sum(axis=1), 1, atol=1e-3), 'sum of probabilities is not equal to 1.' # computations initial_stats = (p * sample_weight[:, numpy.newaxis]).sum(axis=0) V_inv = p.T.dot(p * sample_weight[:, numpy.newaxis]) V = numpy.linalg.inv(V_inv) * initial_stats[numpy.newaxis, :] # Final formula sweights = p.dot(V) * sample_weight[:, numpy.newaxis] return pandas.DataFrame(sweights, columns=probabilities.keys())