#Statistics.py """ Authors: Rasa Muller | Barbara Caiffi | Matteo Sanguineti """ import os, sys, ROOT, uproot from array import array from math import * import numpy as np # Abbreviations: # sd = sin declination # ty = number of datataking years # LLR = Likelihood ratio # TS = test statistic # # gr_<...> = TGraph # l_<...> = list # Setup: # StatisticalAnalysis() is a composite class # LLR_TS_distribution() is a component class # A StatisticalAnalysis will have multiple LLR_TS_distributions # (at least one H0 LLR_TS_distribution, and one H1 LLR_TS_distribution are needed) # class StatisticalAnalysis(): # composite class def __init__(self): # self.computingsystem = "lyon" self.run_identifier = "RunID" self.inputpath = "./input/path/" #TODO: Buildin safety: "/" at the end self.outputpath = self.inputpath self.show_graphical_output = True if self.show_graphical_output == True: ROOT.gROOT.SetBatch(False) elif self.show_graphical_output == False: ROOT.gROOT.SetBatch(True) self.save_plots = True self.datatakingyears = ["ty1.0"] #[yr^-1] self.sindecs = ["sd-0.9"] self.dict_of_LLR_TS_distributions = {} def set_LLR_TS_distributions() for YR in self.datatakingyears: for SD in self.sindecs: for Nsig in self. dict_of_LLR_TS_distributions[SD+"_"+YR+"_"+Nsig] = LLR_TS_distribution(SD, YR, Nsig): # component class def set_different_outputpath(self, outputpath_to_set): '''function to set outputpath do a different path than default (=inputpath)''' self.outputpath = outputpath_to_set if not os.path.exists(outputpath_to_set): # create outputdir if it doesn't exist yet print("creating: ", outputpath_to_set) os.mkdir(outputpath_to_set) return def get_nsigindset(self, SD, YR, multiply_by_years = False): '''get all analysed signal events from existing files in input directory TODO? get info from #entries => will take more time because all files then should be opened ''' x_NsigInDset = [] float_YR = float(YR.split('ty')[-1]) float_SD = float(SD.split('sd')[-1]) for FILE in os.listdir(self.inputpath): if self.run_identifier in FILE and SD in FILE and YR in FILE and ".root" in FILE and "RESULTS" not in FILE: #get info from filename n_sig = str(FILE).split('_c')[1].split('_')[0] if multiply_by_years == True: NsigInDset = round(float(n_sig) * float_YR, 3) #CHECK round by 3 digits always ok? else: NsigInDset = round(float(n_sig), 3) #CHECK round by 3 digits always ok? x_NsigInDset.append( "c"+str(NsigInDset) ) return x_NsigInDset # def determine_CL_value_from_array(a_h1, teststatistic_data): # '''Funtion to determine the CL-value of the data with given test statistic: # determine P(teststatistic < teststatistic_data | H1) by integration # ''' # a_h1 = np.sort(a_h1) #(just in case it's not done yet) # # integral = "number of elements above median" / normfactor # # so: count number of elements higher than test statistic, and norm by total number of entries # CL_value = np.sum(a_h1 >= teststatistic_data) / len(a_h1) # return CL_value def plot_flux_sindecl(l_years, ): ####################################################################### # NOT SURE IF THIS IS NEEDED HERE IF IT'S ALREADY DEFINED IN THE INIT if self.show_graphical_output == False: ROOT.gROOT.SetBatch(True) elif self.show_graphical_output == True: ROOT.gROOT.SetBatch(False) ####################################################################### c_plot_flux_sindecl = ROOT.TCanvas('c_plot_flux_sindecl', 'c_plot_flux_sindecl', 600, 600) for i, YR in enumerate(l_years): x_sd, y_fluxlimit = flux_sindecl(YR, p3s)#*[p3s/p5s/90CL/95CL]=>integral&H0/H1?*) n = len(x_sd[YR]) x = array('d', x_sd[YR]) y = array('d', y_fluxlimit[YR]) gr_p[YR] = ROOT.TGraph(n, x, y) gr_p[YR].SetMarkerStyle(4) gr_p[YR].SetMarkerColor(1) gr_p[YR].SetLineColor(1) gr_p[YR].SetLineStyle(i*2+1) gr_p[YR].Draw("C*") leg.AddEntry(gr_p[YR], str(float(YR.split("ty")[-1])/2.)+"yr datataking ARCA (2BB)", "l") leg.Draw("same"); c_plot_flux_sindecl.SetLogy() c_plot_flux_sindecl.Draw() c_plot_flux_sindecl.Update() if self.save_plots == True: c_plot_flux_sindecl.SaveAs(self.outputpath+"plot_flux_sindecl.png") return c_plot_flux_sindecl class LLR_TS_distribution(): # component class def __init__(self, SD, YR, Nsig): # TODO SD/YR/NSIG are string inputs => change to float self.SD = SD self.YR = YR self.Nsig = Nsig if Nsig == 'c0.0': self.H = 0 # H0 else: self.H = 1 # H1 self.name = 'H'+str(self.H)+"_"+YR+"_"+SD print(self.name) self.arrayform = self.get_LLR_TS_distrib_from_files(SD, YR, Nsig) #array_with_LLR_TS_values self.histform = self.array_to_hist(array_with_LLR_TS_values, True, self.name) if self.H == 0: self.fittotail = self.exp_fit_right_tail_of_hist( self.histform ) self.median = self.get_median_of_array(array_with_LLR_TS_values) def get_LLR_TS_distrib_from_files(self, SD, YR, N): ''' function to get H0 or H1 LLR distribution (teststatistic distribution) if N == 0 we're dealing with the H0 distribution, if N>0 it's a H1 distributions input: sin(decl): SD, years: YR, N signal events in dataset returns: H0 or H1 LLR distribution (depending on N) ''' count = 0 for FILE in os.listdir(self.inputpath): ROOT.gROOT.cd() if self.run_identifier in FILE and SD in FILE and YR in FILE and N+"_" in FILE and ".root" in FILE and "RESULTS" not in FILE: count+=1 print( "%i -> %s" %(count, FILE) ) if N == "c0.0": tree_Hmodel = uproot.open(self.inputpath+FILE+":bkg_model") else: tree_Hmodel = uproot.open(self.inputpath+FILE+":sigbkg_model") # Get the LLR values # --H0 model-- np_array_model_flik_H0 = tree_Hmodel["fit_H0.fitted_likelihood"].array(library="np") np_array_model_flik_H1 = tree_Hmodel["fit_H1.fitted_likelihood"].array(library="np") LLR_TS_array = np_array_model_flik_H1 - np_array_model_flik_H0 # if (entry.fit_H1.fitted_likelihood-entry.fit_H0.fitted_likelihood) < -1e-3: # print("!!! NEGATIVE LLR !!! \ncheck what's going on") print("-------->Total number of Entries in TS distribution: ", len(LLR_TS_array)) return np.sort(LLR_TS_array) def get_median_of_array(self, inputarray): '''function to calculate the median of an array ''' inputarray = np.sort(inputarray) #just in case return np.median(inputarray) def array_to_hist(self, inputarray, normalise = True, title= "array_to_hist"): '''function to turn an array into a root histogram ''' nbins = int(( max(inputarray) - min(inputarray) ) * 2 ) # PRIO TODO! flexible/fixed binning? should be fixed for full analysis and not dependent on values of array? hist = ROOT.TH1D(title, title, nbins, min(inputarray), max(inputarray)) for i in range(len(inputarray)): hist.Fill(inputarray[i]) if normalise == True: hist.Scale( 1 / hist.Integral(0, hist.GetNbinsX()+1, "width") ) return # def plot_Hypothesis_test(LLR_teststatistic_H0, LLR_teststatistic_H1, f_fitH0tail_bool=True): # ''' TODO plot_Hypothesis_test_2 / plot_Hypothesis_test_1 / plot_Hypothesis_test''' # OPTION TO DO IT STACKED FOR GIVEN ARRAY OF H1! # title="Hypothesis Testing" # savingname="./Hypothesis_test.png" def exp_fit_right_tail_of_hist(hist_to_fit): '''function that fits an exponential to the right tail of given histogram ''' ffit = ROOT.TF1("ffit","expo", hist_to_fit.GetMean()+hist_to_fit.GetStdDev(), hist_to_fit.GetXaxis().GetXmax() ) # range min, max hist_to_fit.Fit(ffit,"R") # 'exp([Constant]+[Slope]*x)' # Get fit-parameter info f_fit_h0tail = hist_to_fit.GetFunction("ffit") f_fit_h0tail.SetLineColor(2) f_fit_h0tail.SetLineStyle(4) p0 = f_fit_h0tail.GetParameter(0) p0_err = f_fit_h0tail.GetParError(0) p1 = f_fit_h0tail.GetParameter(1) p1_err = f_fit_h0tail.GetParError(1) print('\nexp_fit_right_tail_of_hist FITTED FUNCTION\nFITTED FUNCTION = exp([Constant]+[Slope]*x)\n where: [Constant] = {0} +- {1}, \n and: [Slope] = {2} +- {3}, \nso: exp({4}+{5}*x)\n'.format(p0, p0_err, p1, p1_err, round(p0, 2), round(p1, 2))) return f_fit_h0tail def GetThreshold(func, prob): '''Create a graph with the fit values to read 3sigma & 5sigma thresholds ''' found = False x = 0.0 #Initial value --> TBD print("In GetThreshold : integral=",func.Integral(x,1000) ) while (x < 80 and found == False): #y = func.Eval(x) y = func.Integral(x,1000) if(y <= prob ): found = True print("In GetThreshold : found y=",func.Integral(x,1000) ) x_c = x x = x+0.001 return (x_c + x)/2.