#!/usr/bin/env python # -*- coding: utf-8 -*- # Python imports from __future__ import division import ConfigParser import cPickle import cProfile import copy import datetime import math import operator import os import shutil import sys import time import traceback import numpy import alixe_library as al import SystemUtility # Instantiate the system utility module and monitor #NOTE: This line has been commented by Massimo because is not used #and also because is giving an error when running the binary in Nuno ###sym = SystemUtility.SystemUtility() # Get a copy of the space group dictionary dictio_space_groups = al.get_spacegroup_dictionary() # Functions to help in optimization def timing(f): """ :param f: :type f: :return: :rtype: """ def wrap(*args, **kwds): time1 = time.time() ret = f(*args, **kwds) time2 = time.time() print ('%s function took %0.3f s' % (f.__name__, (time2 - time1))) return ret return wrap def profileit(func): """ :param func: :type func: :return: :rtype: """ def wrapper(*args, **kwargs): datafn = func.__name__ + ".profile" # Name the data file sensibly prof = cProfile.Profile() retval = prof.runcall(func, *args, **kwargs) prof.dump_stats(datafn) return retval return wrapper def fishing_round_by_prio_list(dict_first_round_by_rotclu, reference_hkl, sg_symbol, f_fom, phstat_version, path_phstat, ncores, clust_fold, path_ins, cell, resolution, cycles, tolerance): """ :param dict_first_round_by_rotclu: :type dict_first_round_by_rotclu: :param reference_hkl: :type reference_hkl: :param sg_symbol: :type sg_symbol: :param f_fom: :type f_fom: :param phstat_version: :type phstat_version: :param path_phstat: :type path_phstat: :param ncores: :type ncores: :param clust_fold: :type clust_fold: :param path_ins: :type path_ins: :param cell: :type cell: :param resolution: :type resolution: :param cycles: :type cycles: :param tolerance: :type tolerance: :return: :rtype: """ list_prio = [] list_to_remove = [] sg_number = al.get_space_group_number_from_symbol(sg_symbol) seed = 0 # We give the input sorted so in the fortran phstat we want it to use the first one as ref for rotclu in dict_first_round_by_rotclu.keys(): topllg = 0 name_clust_topllg = '' topzscore = 0 name_clust_topzscore = '' for cluster in dict_first_round_by_rotclu[rotclu].keys(): if dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['llg'] > topllg: topllg = dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['llg'] name_clust_topllg = cluster if dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['zscore'] > topzscore: topzscore = dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['zscore'] name_clust_topzscore = cluster if name_clust_topllg not in list_prio: list_prio.append(name_clust_topllg) if name_clust_topzscore not in list_prio: list_prio.append(name_clust_topzscore) list_rest = [] for rotclu in dict_first_round_by_rotclu.keys(): for cluster in dict_first_round_by_rotclu[rotclu]: if cluster not in list_prio: list_rest.append((cluster, dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['llg'])) sorted_list_rest = sorted(list_rest, key=lambda x: x[1]) for i in range(len(sorted_list_rest)): sorted_list_rest[i] = sorted_list_rest[i][0] list_prio.extend(sorted_list_rest) number_of_trials = int(ncores) max_number_of_trials = len(list_prio) dict_result_by_trial = {} raw_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_raw"), 'w') del raw_clust_second_round table_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_table"), 'w') table_clust_second_round.write('%-40s %-5s %-10s %-10s\n' % ('Cluster', 'n_phs', 'wmpd_max', 'wmpd_min')) del table_clust_second_round for i in xrange(number_of_trials): # if i==1: # print "list_to_remove",list_to_remove if i >= max_number_of_trials: break # We can't use more references, we finished the list! reference = list_prio[i] if (os.path.split(reference))[1] in list_to_remove: print "This reference, ", reference, " was already fished with another reference. Skipping this cycle" continue # We go to the next iteration, because this reference has been fished already name_ref = os.path.split(reference)[1] name_phi = reference[:-4] + "_ref.phi" if phstat_version == 'python': dict_phs = {} dict_phs[reference] = True for j in xrange(len(list_prio)): name_file = (os.path.split(list_prio[j]))[1] if (list_prio[j] not in dict_phs.keys()) and (name_file not in list_to_remove): dict_phs[list_prio[j]] = False dict_result_by_trial[list_prio[i]] = startALIXEasPHSTAT(clust_fold, reference_hkl, dict_phs, cell, sg_number, tolerance=tolerance, resolution=resolution, cycles=cycles, f_fom=f_fom) elif phstat_version == 'fortran': path = os.path.normpath(clust_fold) # relative path because fortran does not accept such long paths list_path = path.split(os.sep) relative_path_clust_fold = './' + list_path[-1] # 1) Write an ls file with the list of phs, putting first the reference path_ls = os.path.join(clust_fold, reference[:-4] + "_ref.ls") relative_ref = os.path.join(relative_path_clust_fold, name_ref) lsrotfile = open(path_ls, 'w') lsrotfile.write(relative_ref + '\n') for j in xrange(len(list_prio)): phs_namefile = (os.path.split(list_prio[j]))[1] phs_relative_path = os.path.join(relative_path_clust_fold, phs_namefile) if phs_relative_path != relative_ref and (phs_namefile not in list_to_remove): lsrotfile.write(phs_relative_path + '\n') lsrotfile.close() # 2) Link the ins file os.link(path_ins, os.path.join(clust_fold, path_ls[:-3] + ".ins")) # 3) Launch the function in alixe_library complete_output = al.call_phstat_print_for_clustering(path_ls[:-3], relative_path_clust_fold, path_phstat, resolution, seed, tolerance, cycles) print complete_output ls_file = open(path_ls, 'r') ls_content = ls_file.read() del ls_file dict_result_by_trial[reference] = {name_phi: None} dict_result_by_trial[reference][name_phi] = al.read_phstat_print_clusterization_output(ls_content, complete_output, cycles) # NOTE: Because of the relative paths, in the fortran case I need to modify the dictionary now to contain the full paths raw_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_raw"), 'a') table_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_table"), 'a') raw_clust_second_round.write( "*********************************************************************************************************************************************\n") n_phs = len(dict_result_by_trial[list_prio[i]][name_phi].keys()) raw_clust_second_round.write("CLUSTER fishing with reference " + name_ref + ' , found in file ' + name_ref[ :-4] + '_ref.phi, containing ' + str( n_phs) + ' phase files' + '\n') for key1 in dict_result_by_trial[list_prio[i]].keys(): wmpe_max = 0.0 wmpe_min = 90.0 for key2 in dict_result_by_trial[list_prio[i]][key1].keys(): if phstat_version == 'fortran': new_key2 = os.path.join(clust_fold, os.path.split(key2)[1]) dict_result_by_trial[list_prio[i]][key1][new_key2] = copy.deepcopy( dict_result_by_trial[list_prio[i]][key1][key2]) del dict_result_by_trial[list_prio[i]][key1][key2] raw_clust_second_round.write( new_key2 + "\t" + str(dict_result_by_trial[list_prio[i]][key1][new_key2]) + '\n') wmpe = dict_result_by_trial[list_prio[i]][key1][new_key2]['wMPE'] else: raw_clust_second_round.write( key2 + "\t" + str(dict_result_by_trial[list_prio[i]][key1][key2]) + '\n') wmpe = dict_result_by_trial[list_prio[i]][key1][key2]['wMPE'] list_to_remove.append(os.path.split(key2)[1]) if wmpe > wmpe_max: wmpe_max = wmpe if wmpe < wmpe_min and wmpe != 0.0: # Because 0.0 is to itself, the reference wmpe_min = wmpe name_cluster = (os.path.split(key1))[1] if wmpe_max == 0.0 and wmpe_min == 90.0: # Then we have not clustered them wmpe_max = 0.0 wmpe_min = 0.0 table_clust_second_round.write('%-40s %-5i %-10f %-10f \n' % (name_cluster, n_phs, wmpe_max, wmpe_min)) del raw_clust_second_round del table_clust_second_round return dict_result_by_trial def startALIXEasPHSTAT(clust_fold, reference_hkl, dict_phs, cell, sg_number, tolerance=75.0, resolution=2.0, cycles=3, f_fom=True): """ Minimal version of the ALIXE program. :param clust_fold: directory to perform the clustering :type clust_fold: str :param reference_hkl: the path for the hkl given for shelxe, to filter by evalues the phs in case there is extrapolated data :type reference_hkl: str :param dict_phs: phs dictionary with values that are the phs and keys set to True if they have to be tested as references and False otherwhise :type dict_phs: dict :param cell: the unit cell parameters, in the form of a list of floats :type cell: list :param sg_number: space group number :type sg_number: int :param tolerance: tolerance for the MPE value between the phase sets :type tolerance: float :param resolution: resolution to filter the phase sets for combination :type resolution: float :param cycles: number of macrocycles of combination :type cycles: int :param f_fom: If true, mpe are weighted by Fvalues, otherwhise, by E values :type f_fom: bool :return: dictio_clusters :rtype: dict """ print '\n ALIXE started at: ', time.strftime("%c") start = datetime.datetime.now() dictio_clusters = {} # Dictionary to return the results # Get the symmetry information thanks to the sg_number symops = al.get_symops_from_sg_dictionary(sg_number) polar, origins = al.get_origins_from_sg_dictionary(sg_number) # Compute the coefficients needed for other calculations unit = cell[0] * cell[1] * cell[2] # Sides of the cell (abc) #print 'SHERLOCK cell ',cell coef1 = [] coef2 = [] coef3 = [] list_cos = [] list_sin = [] for n in range(0, 3): angle_rad = 1.74533 * math.pow(10, -2) * (cell[n + 3]) # Convert to radian units the angles #print 'SHERLOCK n, angle_rad',n,angle_rad cos_angle = numpy.cos(angle_rad) #print 'SHERLOCK cos_angle',cos_angle list_cos.append(cos_angle) sin_angle = numpy.sin(angle_rad) #print 'SHERLOCK sin_angle',sin_angle list_sin.append(sin_angle) exp1 = 2.0 * unit * (cos_angle / cell[n]) # 2.0 * abc * (cos_angle/side) #print 'SHERLOCK exp1 --> 2.0 * abc * (cos_angle/side)',exp1 coef1.append(exp1) volume = unit * math.sqrt(1 - (list_cos[0]) ** 2 - (list_cos[1]) ** 2 - (list_cos[2]) ** 2 + ( 2 * list_cos[0] * list_cos[1] * list_cos[2])) # V = abc * (1-cos²α-cos²β-cos²γ+2cosαcosβcosγ)^(-1/2) #print 'SHERLOCK volume ',volume unit = unit / volume # abc / volume #print 'SHERLOCK unit abc / volume ',unit for n in range(0, 3): exp2 = 0.25 * unit * ((list_sin[n] / cell[n]) ** 2) #print 'SHERLOCK, n, exp2',n,exp2 coef2.append(exp2) unit = unit / volume # (abc / V) / V , so abc/V² #print 'SHERLOCK unit abc/V²', unit coef3.append(unit * cell[0] * (list_cos[1] * list_cos[2] - list_cos[0])) # (abc/V²) * a *(cosβ*cosγ-cosα) coef3.append(unit * cell[1] * (list_cos[0] * list_cos[2] - list_cos[1])) # (abc/V²) * b *(cosα*cosγ-cosβ) coef3.append(unit * cell[2] * (list_cos[0] * list_cos[1] - list_cos[2])) # (abc/V²) * c *(cosα*cosβ-cosγ) #print 'SHERLOCK coef3 list',coef3 #print 'SHERLOCK coef2 list', coef2 # TODO: Add a filtering for the extrapolated phases in the phs if they exists # Generate a list with the phs from the dictionary that are references list_references = [phs for _, phs in enumerate(dict_phs.keys()) if dict_phs[phs]] # Use the first phs file to compute the evalues and epsilon factors ref_for_evalues = list_references[0] # Process the array array_ref = al.read_phs_file(ref_for_evalues) print 'Resolution check set to ',resolution array_ref, max_resolution = al.check_resolution_limit(array_ref, resolution, coef2, coef3) # We will use the coefficients saved in coef2 and coef3 # if max_resolution!=resolution: # print 'Warning: the resolution cut set for phase clustering is of ',resolution # print 'while your data has a maximum resolution of ',max_resolution # print 'Automatically adjusting resolution to ',max_resolution # resolution=max_resolution array_ref = al.change_to_standard_equivalent_reflections(array_ref, sg_number) array_ref = al.sort_reflections_phs(array_ref) # Get the number of reflections that we expect in all the phs files nreflections = len(array_ref) # Find epsilon factor and 1/dsquared epsilon, onedsquared, res_max = al.get_epsilon_1overdsquared_and_resmax(array_ref, symops, coef2, coef3) # Estimate E-values (whats changes in phs is phases, not structure factors) array_evalues, array_aux = al.get_evalues_and_array_aux(epsilon, onedsquared, res_max, array_ref) # e-values only in the array_evalues #print 'SHERLOCK check finished processing array_ref' #sys.exit(0) # Save the information that is common to all phs # # # Before # array_miller_indices = [] # array_f_sigf = [] # for i, _ in enumerate(array_ref): # array_miller_indices.append([array_ref[i][0], array_ref[i][1], array_ref[i][2]]) # array_f_sigf.append([array_ref[i][3], array_ref[i][6]]) # # # After array_miller_indices = [ [array_ref[i][0], array_ref[i][1], array_ref[i][2]] for i, _ in enumerate(array_ref)] array_f_sigf = [ [array_ref[i][3], array_ref[i][6]] for i, _ in enumerate(array_ref) ] array_ref = None n_phs = len(dict_phs.keys()) print "\n Processing ", str(n_phs), " phase files" # Prepare the arrays to save list_arrays_va_to_modify = [None for x in xrange(n_phs)] list_arrays_vb_to_modify = [None for x in xrange(n_phs)] list_arrays_to_modify = [None for x in xrange(n_phs)] list_names_phs = [None for x in xrange(n_phs)] for iphs, phs in enumerate(dict_phs.keys()): if phs[-4:] == ".phi": # NOTE: My phi files are the same as phs files NOW array_phs = al.read_phs_file(phs) elif phs[-4:] == ".phs": array_phs = al.read_phs_file(phs) else: print "ALIXE only supports clustering of SHELXE phi or phs files" sys.exit(0) # Save the name so that we have the correct order list_names_phs[iphs] = phs # phs is the full path of the phs # Check resolution limit filter_res_array, max_resolution = al.check_resolution_limit(array_phs, resolution, coef2,coef3) # Find equivalent reflections with standard indices and transform phases array_merged = al.change_to_standard_equivalent_reflections(filter_res_array, sg_number) # Sort reflections in order of higher h,k and l indexes sorted_array = al.sort_reflections_phs(array_merged) for r in range(len(sorted_array)): if not ((sorted_array[r][0]==array_miller_indices[r][0]) and (sorted_array[r][1]==array_miller_indices[r][1]) and (sorted_array[r][2] == array_miller_indices[r][2])): print 'SHERLOCK there are non equivalent indices between the arrays' print 'sorted_array[r]0,1,2 ',sorted_array[r][0],sorted_array[r][1],sorted_array[r][2] print 'array_miller_indices[r]0,1,2',array_miller_indices[r][0],array_miller_indices[r][1],array_miller_indices[r][2] sys.exit(0) # Reduce the array to the phi and FOM values reduced_array = al.reduce_array_phs_to_PHI_and_FOM(sorted_array) # [FOM,PHI] list_arrays_to_modify[iphs] = reduced_array # For all, including the reference, obtain VA and VB arrva, arrvb = al.get_VA_and_VB(reduced_array) list_arrays_va_to_modify[iphs] = arrva list_arrays_vb_to_modify[iphs] = arrvb print '\n Finished reading the phs files' # Save the arrays as pickle objects save_double_array = open(os.path.join(clust_fold, 'double_array.pkl'), 'wb') cPickle.dump(list_arrays_to_modify, save_double_array) save_double_array.close() save_arrva = open(os.path.join(clust_fold, 'arrva.pkl'), 'wb') cPickle.dump(list_arrays_va_to_modify, save_arrva) save_arrva.close() save_arrvb = open(os.path.join(clust_fold, 'arrvb.pkl'), 'wb') cPickle.dump(list_arrays_vb_to_modify, save_arrvb) save_arrvb.close() # Now test all references in the list for _, reference in enumerate(list_references): print "\n Testing reference", reference name_phi = (os.path.split(reference)[1])[:-4] + '_ref.phi' path_phi = os.path.join(clust_fold, name_phi) position = list_names_phs.index(reference) dictio_clusters[path_phi] = {} # We load the arrays again for each reference to make sure we have the unmodified original ones back_double_array = open(os.path.join(clust_fold, 'double_array.pkl'), 'rb') list_arrays_to_modify = cPickle.load(back_double_array) back_double_array.close() array_ref = copy.deepcopy(list_arrays_to_modify[position]) back_arrva = open(os.path.join(clust_fold, 'arrva.pkl'), 'rb') list_arrays_va_to_modify = cPickle.load(back_arrva) back_arrva.close() back_arrvb = open(os.path.join(clust_fold, 'arrvb.pkl'), 'rb') list_arrays_vb_to_modify = cPickle.load(back_arrvb) back_arrvb.close() # Phase combination macrocycles for c in range(cycles): print '\n Cluster analysis cycle ', str(c + 1) print '\n N' + '\t' + 'wMPE' + '\t' + 'Dif' + '\t' + 'MapCC' + '\t' + 'Phase file' dictio_clusters[path_phi][c + 1] = {} # We will save the results cycle by cycle # For every phs, find and apply origin shifts relative to reference phases dict_wmpes_sel = {} list_wmpes_sel = [] for pos, _ in enumerate(list_arrays_to_modify): name_current_phs = list_names_phs[pos] dict_wmpes_sel[name_current_phs] = {"wMPE": 0.0, "diff_wMPE": 0.0, "mapcc": 0.0} s = 0.0 t = 0.0 # Check that the number of reflections is the expected one (all should have the same) nreflections_current = len(list_arrays_to_modify[pos]) if nreflections_current != nreflections: print 'The number of reflections of this phase file is not compatible with the reference file' print 'Please check, some error in the files is expected' print 'SHERLOCK nreflections',nreflections print 'SHERLOCK nreflections_current',nreflections_current sys.exit(0) for r, _ in enumerate(list_arrays_to_modify[pos]): list_arrays_to_modify[pos][r][1] = list_arrays_vb_to_modify[pos][r] # Phases setting # Check the FOM to be set if f_fom == True: list_arrays_to_modify[pos][r][0] = array_f_sigf[r][0] * list_arrays_va_to_modify[pos][ r] # Structure Factor current reflection * FOM current reflection elif f_fom == False: list_arrays_to_modify[pos][r][0] = array_evalues[r] * list_arrays_va_to_modify[pos][ r] # E-value current reflection * FOM current reflection s = s + list_arrays_to_modify[pos][r][0] # Summatory of the FOMs t = t + list_arrays_to_modify[pos][r][0] * abs( ((900.0 + list_arrays_to_modify[pos][r][1] - list_arrays_vb_to_modify[pos][r]) % 360.0) - 180.0) # Here is where when we have to apply the origin shifts and calculate the wMPE and the mapCC current_weights = list_arrays_va_to_modify[pos] # array with the FOMs of the current phs sorted_list_wmpe, sorted_list_mapcc = al.apply_origin_shift_and_compute_wMPE_and_CC(n_reflections=nreflections, symops=symops, array_ref=array_ref, array_miller_indices= array_miller_indices, array_f_sigf=array_f_sigf, current_array= list_arrays_to_modify[pos], current_weights= current_weights, array_evalues= array_evalues, array_aux=array_aux, sg_number=sg_number, f_fom=f_fom) shift_to_apply = sorted_list_wmpe[0][1] wmpe_shift = sorted_list_wmpe[0][0] for i in range(len(sorted_list_mapcc)): # Get the mapCC that corresponds to that shift if sorted_list_mapcc[i][1] == shift_to_apply: map_CC = sorted_list_mapcc[i][0] # Get difference top best/second best for the wMPE if we are not in a polar sg if polar == False: diff_mpe = sorted_list_wmpe[1][0] - sorted_list_wmpe[0][0] elif polar == True: diff_mpe = 0 dict_wmpes_sel[name_current_phs] = {"wMPE": wmpe_shift, "diff_wMPE": diff_mpe, "mapcc": map_CC, 'shift': shift_to_apply} list_wmpes_sel.append((name_current_phs, wmpe_shift)) for r, _ in enumerate(list_arrays_vb_to_modify[pos]): # Modify the phases saved list_arrays_vb_to_modify[pos][r] = ((720.0 + (360.0 * ((( shift_to_apply[0] * array_miller_indices[r][ 0]) + ( shift_to_apply[1] * array_miller_indices[r][ 1]) + ( shift_to_apply[2] * array_miller_indices[r][ 2])) % 1.0)) + list_arrays_vb_to_modify[pos][r]) % 360.0) # Sort the phase sets according to their mean phase error sorted_by_mpe = sorted(list_wmpes_sel, key=lambda x: x[1], reverse=False) # Check if anything will cluster under the tolerance we set if len(sorted_by_mpe) > 1: if sorted_by_mpe[1][1] > tolerance: print 'There are not solutions to cluster under the tolerance set' #print '**********************************' #print 'SHERLOCK sorted_by_mpe' #for ele in sorted_by_mpe: # print ele #print '**********************************' if c + 1 != 1: print 'HEY, THIS HAPPENED IN A CYCLE DIFFERENT FROM CYCLE 1' print 'Error, please report to bugs-borges@ibmb.csic.es' sys.exit(0) dictio_clusters[path_phi] = {sorted_by_mpe[0][0]: {'wMPE': sorted_by_mpe[0][1], 'diff_wMPE': dict_wmpes_sel[sorted_by_mpe[0][0]][ 'diff_wMPE'], 'mapcc': dict_wmpes_sel[sorted_by_mpe[0][0]][ 'mapcc'], 'shift': [0.0, 0.0, 0.0]}} # Write the phases with the resolution cut to a file file_phi = open(path_phi, 'w') pos = list_names_phs.index(sorted_by_mpe[0][0]) # Position of the reference array in the reading nreflections = len(list_arrays_va_to_modify[pos]) for r in xrange(nreflections): # 3I4,F9.2,F8.4,F8.1,F8.2 file_phi.write('%4i%4i%4i%9.2f%8.4f%8.1f%8.2f\n' % ( array_miller_indices[r][0], array_miller_indices[r][1], array_miller_indices[r][2], array_f_sigf[r][0], list_arrays_va_to_modify[pos][r], list_arrays_vb_to_modify[pos][r], array_f_sigf[r][1])) del file_phi return dictio_clusters # Combine phases for cluster # 1) Generate clean arrays of contributions VA and VB and for the combination va = [0.0 for _ in range(len(array_ref))] # fom vb = [0.0 for _ in range(len(array_ref))] # phases wa_combi = [0.0 for _ in range(len(array_ref))] # 2) Iterate over the phs files (consider sorting) for ind in range(len(sorted_by_mpe)): current_phs = sorted_by_mpe[ind][0] current_wmpe = sorted_by_mpe[ind][1] current_dif_wmpe = dict_wmpes_sel[current_phs]['diff_wMPE'] current_mapcc = dict_wmpes_sel[current_phs]['mapcc'] pos = list_names_phs.index(current_phs) if current_wmpe < tolerance: print ' ', ind + 1, '\t', "{0:.1f}".format(current_wmpe), '\t', "{0:.1f}".format( current_dif_wmpe), '\t', "{0:.2f}".format(current_mapcc), '\t', current_phs dictio_clusters[path_phi][c + 1][current_phs] = copy.deepcopy(dict_wmpes_sel[current_phs]) for r in xrange(nreflections): list_arrays_to_modify[pos][r][1] = list_arrays_vb_to_modify[pos][r] t = 0.0174533 * list_arrays_to_modify[pos][r][1] # 0.0174533 is 2pi/360 s = list_arrays_va_to_modify[pos][r] va[r] = va[r] + (s * (numpy.cos(t))) vb[r] = vb[r] + (s * (numpy.sin(t))) t = numpy.sqrt(1.0 / (ind + 1)) for r in xrange(nreflections): wa_combi[r] = min(1.0, numpy.sqrt(math.pow(va[r], 2) + math.pow(vb[r], 2)) * t) # Use either structure factors or normalized structure factors for the combined phases list_arrays_to_modify[pos][r][0] = wa_combi[r] * array_evalues[ r] # e-weighted,as it is in the fortran prototype ## if f_fom==True: ## list_arrays_to_modify[pos][r][0]=array_f_sigf[r][0]*wa_combi[r] # StructureFactor * new_weight ## elif f_fom==False: ## alist_arrays_to_modify[pos][r][0]=array_evalues[r]*wa_combi[r] # E-value * new_weight if wa_combi[r] > 0.0001: list_arrays_to_modify[pos][r][1] = (720.0 + 57.29578 * numpy.arctan2(vb[r], va[r])) % 360 # NOTE: experimenting what is happening here #current_weights[r] = list_arrays_to_modify[pos][r][0] # Save the new weight # NOTE: experimenting what is happening here # Compute again the shift, because now we changed the phases sorted_list_wmpe2, sorted_list_mapcc2 = al.apply_origin_shift_and_compute_wMPE_and_CC(n_reflections =nreflections, symops=symops, array_ref= array_ref, array_miller_indices=array_miller_indices, array_f_sigf=array_f_sigf, current_array=list_arrays_to_modify[ pos], current_weights=current_weights, array_evalues=array_evalues, array_aux=array_aux, sg_number=sg_number, f_fom=f_fom) shift_to_apply2 = sorted_list_wmpe2[0][1] if shift_to_apply2 != [0.0, 0.0, 0.0]: # Check if there ever is another shift after if not (abs(shift_to_apply2[0]) < 0.01 and abs(shift_to_apply2[1]) < 0.01 and abs( shift_to_apply2[2]) < 0.01): print "There is another shift to apply after phase merging. CHECK!", shift_to_apply2 sys.exit(0) for r in xrange(nreflections): list_arrays_to_modify[pos][r][1] = ((720.0 + (360.0 * ((( shift_to_apply2[0] * array_miller_indices[r][ 0]) + ( shift_to_apply2[1] * array_miller_indices[r][ 1]) + ( shift_to_apply2[2] * array_miller_indices[r][ 2])) % 1.0)) + list_arrays_vb_to_modify[pos][r]) % 360.0) # Change the phases in the reference array so that they correspond to the combined map list_arrays_to_modify[pos][r][1]) % 360.0) for r in xrange(nreflections): array_ref[r][1] = list_arrays_to_modify[pos][r][1] # Write the cumulative phases to a file file_phi = open(path_phi, 'w') # print 'path_phi',path_phi for r in xrange(len(array_ref)): # 3I4,F9.2,F8.4,F8.1,F8.2 file_phi.write('%4i%4i%4i%9.2f%8.4f%8.1f%8.2f\n' % ( array_miller_indices[r][0], array_miller_indices[r][1], array_miller_indices[r][2], array_f_sigf[r][0], wa_combi[r], array_ref[r][1], array_f_sigf[r][1])) del file_phi print '\n Cleaning up....' for fich in os.listdir(clust_fold): if fich.endswith('.pkl'): os.remove(os.path.join(clust_fold, fich)) # Return just the last cycle results dictio_clusters_third = {} for key in dictio_clusters.keys(): dictio_clusters_third[key] = dictio_clusters[key][cycles] return dictio_clusters_third end = datetime.datetime.now() print " end - start time ", end - start def startALIXEforARCIMBOLDO(clust_fold, cluster_id, reference_hkl, cell, sg_symbol, tolerance_list=[60.0, 88.0], resolution=2.0, cycles=3, f_fom=True, run='SHREDDER', mode='one_step', fishing_folder=None, confibor=None, ncores=1): """ Performs ALIXE phase clustering internally within the ARCIMBOLDO programs. Keyword arguments: clust_fold -- path to the folder where to perform the clustering cluster_id -- integer with the rotation cluster to evaluate or list with the cluster ids if mode=two_steps reference_hkl -- path to hkl file cell -- list of 6 elements with the unit cell parameters (STR! CONVERT TO FLOAT) sg_symbol -- string with the space group symbol tolerance -- list of floats with tolerances for mean phase error cycles -- integer, number of cycles of phase combination to be performed f_fom -- boolean. If true, mpe are weighted by Fvalues, otherwhise, by E values run -- string. Can be 'ARCIMBOLDO', 'BORGES', 'SHREDDER' mode -- string. Can be 'fish','full','one_step','two_steps' fishing_folder -- string. Path to the folder in which ALIXE has to fish in case of that mode. confibor -- ConfigParser object from the main program that is calling ALIXE ncores -- number of real cores that can be used in the machine """ print '\n ALIXE started: ', time.strftime("%c") skip_FOM_reading = False ent_present = False hard_limit_phs = 100 skip_copyfiles = False skip_ins_generation = False if mode not in ['fish', 'full', 'one_step', 'two_steps']: print "Sorry, you need to provide a valid mode for alixe: either 'fish','full','one_step' or 'two_steps' " sys.exit(0) # Check if alixe has been already performed on that folder and act accordingly depending on mode basepath_clustfold = clust_fold print "\n Root folder for clustering ", basepath_clustfold print '\n Mode for clustering is ',mode if mode=='one_step': clust_fold = os.path.join(basepath_clustfold, str(cluster_id)) print "\n Currently processing folder ", clust_fold path_for_tuple = os.path.join(clust_fold, 'final_alixe_tuple.pkl') #print 'SHERLOCK line 641 path_for_tuple',path_for_tuple if os.path.exists(path_for_tuple): # Then first round was completed back_final_tuple = open(path_for_tuple, 'rb') (path_ins, final_dict) = cPickle.load(back_final_tuple) print "\n Alixe first round has run succesfully already on this folder" print "\n Returning the results" # Return only the clusters that contain more than one phase set so that they are expanded for cluster in final_dict.keys(): if final_dict[cluster]['n_phs'] == 1: del final_dict[cluster] print "\n",len(final_dict),' phase clusters contain more than one phase set and will be expanded' return (path_ins, final_dict) else: # Either we have an incomplete run or it was not created yet print 'Alixe was not finished in this folder ',clust_fold if not os.path.exists(clust_fold): os.makedirs(clust_fold) # Generate the directory if it is not already there else: # Otherwhise remove it and generate it from the beginning shutil.rmtree(clust_fold) os.makedirs(clust_fold) elif mode == 'two_steps': second_round_clufold = os.path.join(basepath_clustfold, 'ALIXE_SECOND_ROUND') # First check whether we were finished and can proceed directly to expansions path_for_tuple = os.path.join(second_round_clufold, 'tuple_second_round_alixe.pkl') #print 'SHERLOCK path_for_tuple',path_for_tuple if os.path.exists(path_for_tuple): # Then first round was completed back_final_tuple = open(path_for_tuple, 'rb') (path_ins, final_dict) = cPickle.load(back_final_tuple) print "\n Alixe second round has run succesfully already on this folder" print "\n Returning the results" # Return only the clusters that contain more than one phase set so that they are expanded for cluster in final_dict.keys(): if final_dict[cluster]['n_phs'] == 1: del final_dict[cluster] print "\n",len(final_dict),' phase clusters contain more than one phase set and will be expanded' return (path_ins, final_dict) # In any case we do not need to reperform steps skip_FOM_reading = True skip_copyfiles = True skip_ins_generation = True # Generate a fake ins that SHELXE can read for expansions cell = [ float(cell[i]) for i in range(len(cell)) ] sg_number = al.get_space_group_number_from_symbol(sg_symbol) if not skip_ins_generation: path_ins = os.path.join(clust_fold, 'name.ins') al.generate_fake_ins_for_shelxe(path_ins, cell, sg_number) if run == 'SHREDDER' or run == 'BORGES': if not isinstance(confibor, ConfigParser.ConfigParser): print "Sorry, you need to provide a valid bor object in order to run ALIXE on ", run # Check the bor sys.exit() topexp = confibor.get('ARCIMBOLDO-BORGES', 'topexp') basepath = confibor.get('GENERAL', 'working_directory') if not skip_copyfiles: list_chosen = al.get_files_from_9_EXP_BORGES(basepath, clust_fold, cluster_id=cluster_id, mode=9, hard_limit_phs=hard_limit_phs) #print 'SHERLOCK tenemos que salir para ver que pasa con los phs' #sys.exit(0) subrun = 'BORGES' # To know how to read the solutions. Spherical is generating a BORGESARCI job if confibor.has_option('GENERAL', 'ent_path'): # therefore the lst will have post mortem info ent_path = confibor.get('GENERAL', 'ent_path') if os.path.exists(ent_path): ent_present = True # elif run == 'ARCIMBOLDO': # if not confibor.isinstance(ConfigParser.ConfigParser): # print "Sorry, you need to provide a valid bor object in order to run ALIXE on ", run # Check the bor # sys.exit() # # TODO: Get the link to the files and save them in the CLUSTERING folder # # TODO: Check if there is an ent file provided in the bor and therefore the lst will have post mortem info else: print "Sorry, ALIXE can just run on ARCIMBOLDO_SHREDDER spheres, or ARCIMBOLDO_BORGES runs" sys.exit(0) # Read the FOMs of the PHS files in the phaser and shelxe steps if not skip_FOM_reading: phs_files = al.list_files_by_extension(clust_fold, 'phs') dictio_fragments = {} for phs in phs_files: dictio_fragments[phs[:-4]] = {'rot_cluster': None, 'llg': None, 'zscore': None, 'initcc': None, 'efom': None, 'pseudocc': None, 'list_MPE': None} dictio_fragments = al.get_FOMs_from_lst_files_in_folder(clust_fold=clust_fold, dictio_fragments=dictio_fragments, ent_present=ent_present) dictio_fragments = al.get_FOMs_from_sum_files_in_folder(wd=basepath, clust_fold=clust_fold, dictio_fragments=dictio_fragments, ent_present=ent_present, program=subrun, fragment=1) # First round if mode == 'one_step': tolerance = tolerance_list[0] dict_first_round = {} # To save the results per rotation cluster # Call the function clustering_all_in_ALIXE_under_a_tolerance per each of them and save the results dict_first_round = clustering_all_in_ALIXE_under_a_tolerance(clust_fold, reference_hkl, list_chosen, cell, sg_symbol, tolerance, resolution, cycles, f_fom, ncores=topexp) # Save the FOM information for priorization in second round if there is for cluster in dict_first_round.keys(): if dict_first_round[cluster]['n_phs'] > 1: list_llg = [] list_zscore = [] for phs in dict_first_round[cluster]['dictio_result'].keys(): # For each phs in the cluster list_llg.append(dictio_fragments[phs[:-4]]['llg']) list_zscore.append(dictio_fragments[phs[:-4]]['zscore']) # We add the top LLG or ZSCORE as representative of the cluster dict_first_round[cluster]['dict_FOMs']['llg'] = (sorted(list_llg, reverse=True))[0] dict_first_round[cluster]['dict_FOMs']['zscore'] = (sorted(list_zscore, reverse=True))[0] elif dict_first_round[cluster]['n_phs'] == 1: name_file = ((dict_first_round[cluster]['dictio_result'].keys())[0])[:-4] dict_first_round[cluster]['dict_FOMs']['llg'] = dictio_fragments[name_file]['llg'] dict_first_round[cluster]['dict_FOMs']['zscore'] = dictio_fragments[name_file]['zscore'] save_final_tuple = open(os.path.join(clust_fold, 'final_alixe_tuple.pkl'), 'wb') cPickle.dump((path_ins, dict_first_round), save_final_tuple) save_final_tuple.close() # Write a summary output table with useful information from the first round that can be sortable global_table=open(os.path.join(clust_fold, 'global_clusters_table.txt'), 'w') global_table.write('%-40s %-10s %-10s %-10s\n' % ('Cluster', 'n_phs', 'topLLG', 'topZSCORE')) for cluster in dict_first_round.keys(): basenameclu = os.path.basename(cluster) n_phs = dict_first_round[cluster]['n_phs'] topLLG = dict_first_round[cluster]['dict_FOMs']['llg'] topZscore = dict_first_round[cluster]['dict_FOMs']['zscore'] global_table.write('%-40s %-10s %-10s %-10s\n' % (basenameclu, n_phs, topLLG, topZscore)) del global_table # Prepare to return only the clusters that contain more than one phase set so that they are expanded print " \n There are ", len(dict_first_round)," phase sets after the first round of phase combination" for cluster in dict_first_round.keys(): if dict_first_round[cluster]['n_phs'] == 1: del dict_first_round[cluster] print " \n There are ",len(dict_first_round)," phase sets that contained more than one solution" # For every cluster to be expanded, write a summary table file with the information of its contents for cluster in dict_first_round.keys(): path_clu=os.path.join(clust_fold, os.path.basename(cluster)[:-4]+'.sum') fileforclu=open(path_clu, 'w') fileforclu.write('%-40s %-10s %-10s %-10s %-20s\n' % ('Name of the phs file', 'wMPD', 'diff wMPD', 'map CC', 'shift')) for phaseset in dict_first_round[cluster]['dictio_result'].keys(): name = os.path.basename(phaseset) wmpe = round(dict_first_round[cluster]['dictio_result'][phaseset]['wMPE'],2) diffwmpe = round(dict_first_round[cluster]['dictio_result'][phaseset]['diff_wMPE'],2) mapcc = round(dict_first_round[cluster]['dictio_result'][phaseset]['mapcc'],2) shiftval = dict_first_round[cluster]['dictio_result'][phaseset]['shift'] shift='x = '+str(shiftval[0])+' y = '+str(shiftval[1])+' z = '+str(shiftval[2]) fileforclu.write( '%-40s %-10s %-10s %-10s %-20s\n' % (name, wmpe, diffwmpe, mapcc, shift)) del(fileforclu) return (path_ins, dict_first_round) elif mode == 'full': print "full mode is still not implemented" sys.exit(0) # Second round if mode == 'two_steps': #print "SHERLOCK en construcción... Two steps mode ;)" tolerance = tolerance_list[1] # Generate the folder in which we will perform the combination second_round_clufold = os.path.join(basepath_clustfold,'ALIXE_SECOND_ROUND') try: os.makedirs(second_round_clufold) except: # Si da algún error, mejor empezar desde el principio #print "Error" #print sys.exc_info() #traceback.print_exc(file=sys.stdout) #sys.exit() shutil.rmtree(second_round_clufold) os.makedirs(second_round_clufold) # Read all dictionaries from all rotation clusters list_phis_to_evaluate = [] # List with tuples (rotclu,cc,phi) for sorting the files according to FOM for clu in cluster_id: path_pkl = os.path.join(basepath_clustfold +str(clu)+'/final_alixe_tuple.pkl') pkl_first_round = open(path_pkl, 'rb') path_ins, dictio_clust_first_round = cPickle.load(pkl_first_round) for phi_file in dictio_clust_first_round.keys(): list_phis_to_evaluate.append((phi_file, dictio_clust_first_round[phi_file]['dict_FOMs']['llg'])) # NOTE CM: at the moment I am using the topLLG of the solutions in every cluster from the first round, I should # include other foms and evaluate which ones work better sorted_list_phis_to_evaluate = sorted(list_phis_to_evaluate, key=operator.itemgetter(1),reverse=True) sorted_list_phis = [ele[0] for ele in sorted_list_phis_to_evaluate] dict_second_round = clustering_all_in_ALIXE_under_a_tolerance(clust_fold=second_round_clufold, reference_hkl=reference_hkl, list_phs=sorted_list_phis, cell=cell, sg_symbol=sg_symbol, tolerance=tolerance,resolution=resolution, cycles=cycles,fom_weigth=f_fom,ncores=topexp) #for key in dict_second_round.keys(): # print 'SHERLOCK ***',key, dict_second_round[key] #sys.exit(0) # Write a summary output table with useful information from the first round that can be sortable global_table=open(os.path.join(second_round_clufold, 'global_clusters_table_second_round.txt'), 'w') global_table.write('%-40s %-10s\n' % ('Cluster', 'n_phs')) for cluster in dict_second_round.keys(): basenameclu = os.path.basename(cluster) n_phs = dict_second_round[cluster]['n_phs'] global_table.write('%-40s %-10s \n' % (basenameclu, n_phs)) del global_table # Write a pkl file to avoid recomputation if it is not required save_second_round_tuple = open(os.path.join(second_round_clufold, 'tuple_second_round_alixe.pkl'), 'wb') cPickle.dump((path_ins, dict_second_round), save_second_round_tuple) save_second_round_tuple.close() # Prepare to return only the clusters that contain more than one phase set so that they are expanded print " \n There are ", len(dict_second_round)," phase sets after the first round of phase combination" for cluster in dict_second_round.keys(): if dict_second_round[cluster]['n_phs'] == 1: del dict_second_round[cluster] print " \n There are ",len(dict_second_round)," phase sets that contained more than one solution" # For every cluster to be expanded, write a summary table file with the information of its contents for cluster in dict_second_round.keys(): path_clu=os.path.join(second_round_clufold, os.path.basename(cluster)[:-4]+'.sum') fileforclu=open(path_clu, 'w') fileforclu.write('%-40s %-10s %-10s %-10s %-20s\n' % ('Name of the phs file', 'wMPD', 'diff wMPD', 'map CC', 'shift')) for phaseset in dict_second_round[cluster]['dictio_result'].keys(): name = os.path.basename(phaseset) wmpe = round(dict_second_round[cluster]['dictio_result'][phaseset]['wMPE'],2) diffwmpe = round(dict_second_round[cluster]['dictio_result'][phaseset]['diff_wMPE'],2) mapcc = round(dict_second_round[cluster]['dictio_result'][phaseset]['mapcc']) shiftval = dict_second_round[cluster]['dictio_result'][phaseset]['shift'] shift='x = '+str(shiftval[0])+' y = '+str(shiftval[1])+' z = '+str(shiftval[2]) fileforclu.write( '%-40s %-10s %-10s %-10s %-20s\n' % (name, wmpe, diffwmpe, mapcc, shift)) del(fileforclu) return (path_ins, dict_second_round) def clustering_all_in_ALIXE_under_a_tolerance(clust_fold, reference_hkl, list_phs, cell, sg_symbol, tolerance, resolution, cycles, fom_weigth, ncores): '''Sequentially calls startALIXE in order to generate all possible clusters under a certain tolerance. :param clust_fold: :type clust_fold: str :param reference_hkl: :type reference_hkl: str :param list_phs: :type list_phs: list :param cell: :type cell: list :param sg_symbol: :type sg_symbol: str :param tolerance: :type tolerance: float :param resolution: :type resolution: float :param cycles: :type cycles: int :param fom_weigth: if True, weight applied is by Structure Factors, if False, by E-values :type fom_weigth: bool :param ncores: :type ncores: int :return dictio_clusters: a dictionary with the results of the clustering. Keys are phi paths, and value is another dictionary with the following structure: :rtype dictio_clusters: dict ''' # Initialize required variables dictio_clusters = {} sg_number = al.get_space_group_number_from_symbol(sg_symbol) # Main loop, until all the phs in the list have been processed or we will not expand them anyway while (len(list_phs) > 0) and (len(dictio_clusters.keys()) <= ncores): dict_phs = {} for i in range(len(list_phs)): if i == 0: dict_phs[list_phs[i]] = True else: dict_phs[list_phs[i]] = False results = startALIXEasPHSTAT(clust_fold, reference_hkl, dict_phs, cell, sg_number, tolerance, resolution, cycles, fom_weigth) # I can safely assume that if I just give one reference, # the results dictionary keys list will have just one phi name phi_name = results.keys()[0] phs_to_remove = results[phi_name].keys() n_phs = len(phs_to_remove) #print 'SHERLOCK results', results dictio_clusters[phi_name] = {'dictio_result': results[phi_name], 'n_phs': n_phs, 'dict_FOMs': {}} #print 'SHERLOCK phi_name',phi_name #for key in dictio_clusters[phi_name].keys(): # print 'SHERLOCK key ', key # print 'SHERLOCK dictio_clusters[phi_name][key]',dictio_clusters[phi_name][key] #sys.exit(0) #table_clust_first_round = open(os.path.join(clust_fold, "info_clust_table"), 'a') # del table_clust_first_round #file_debug_first_round = open(os.path.join(clust_fold, "clusters_dictionary.txt"), 'a') #file_debug_first_round.write( # phi_name + '\t' + 'dictio_result ' + str(results[phi_name]) + 'n_phs ' + str(n_phs) + "\n") #phs_basenames = [os.path.basename(x) for x in phs_to_remove ] #file_debug_first_round.write(os.path.basename(phi_name)+'\t'+str(n_phs)+str(phs_basenames)) #file_debug_first_round.write("**********************************************") #file_debug_first_round.close() new_list_phs = [] for phs in list_phs: if phs in phs_to_remove: continue else: new_list_phs.append(phs) list_phs = new_list_phs return dictio_clusters # Main module if __name__ == "__main__": if len(sys.argv) == 1: print "\nUsage: ALIXE.py name.ls [options]" print "\nOptions:" print "\n-s=name_reference Use the name_reference phase file in the ls file as reference for clustering. It also accepts a list separated by commas." print "\n-t= Use a tolerance of N degrees for the mean phase difference between the phase sets compared" print "\n-r=N Use data to a resolution of N angstroms. Default is 2.0" print "\n-c=N Do N macrocycles of phase clustering between the sets" # print "-h=hkl_filename Reference hkl to calculate evalues" # TODO: it is better that symmetry is given either as a number of the sg or a line in the ls (if maps are given) print "\n-p=pdb_filename Read symmetry information from a pdb file" print "\n-w=f or -w=e Use f-weights or e-weights for the calculation of mean phase errors. Default is f" print "\n-d=working_directory Indicate the folder were to put the results. Default is the current working directory" sys.exit(0) print "\n Starting date & time " + time.strftime("%c") # Print initial time # Read ls file and save the files in a list name_ls = sys.argv[1] fich_ls = open(name_ls, 'r') list_phs = fich_ls.read().splitlines() dictio_phs = {} for phs in list_phs: dictio_phs[phs] = False n_phs = len(list_phs) print "\n ", str(n_phs), " phase files found in ", name_ls # Read the arguments or set the defaults list_options = sys.argv[2:] # Defaults reference_set = list_phs[0] # The first one in the ls position_ref = 1 tolerance = 60 resolution = 2.0 dictio_phs[reference_set] = True # Default is to use the first phs in the list cycles = 3 f_fom = True reference_hkl = None # TEMPORARY, NEED TO CHECK WITH ISABEL clust_fold = os.getcwd() cell = None for option in list_options: if option.startswith("-s"): # Check if they have given a single reference or a list separated by comma # if len(option[3:].split(','))>1: # list_references=option[3:].split(',') # for reference in list_references: # dictio_phs[reference]=True # elif len(option[3:].split(','))==1: # Then we have a single reference # print 'option[3:]',option[3:] ref = option[3:] dictio_phs[ref] = True # If it is not the same one, remove te True from the default if ref != reference_set: dictio_phs[reference_set] = False elif option.startswith("-t"): tolerance = float(option[3:]) elif option.startswith("-r"): resolution = float(option[3:]) elif option.startswith("-h"): reference_hkl = option[3:] elif option.startswith("-c"): cycles = int(option[3:]) elif option.startswith("-w"): weight_fom = option[3:] if weight_fom == 'e': f_fom = False elif option.startswith("-p"): # Read the symmetry from a pdb file pdb_path = str(option[3:]) cell, sg = al.read_cell_and_sg_from_pdb(pdb_path) # Cell is a list of floats sg_number = al.get_space_group_number_from_symbol(sg) elif option.startswith("-d"): clust_fold = str(option[3:]) # Check we have the symmetry info if cell == None: print "Sorry, you need to provide some pdb file to extract the symmetry information" sys.exit(0) # Generate a fake ins that SHELXE can read for expansions path_ins = os.path.join(clust_fold, 'name.ins') al.generate_fake_ins_for_shelxe(path_ins, cell, sg_number) # print 'dictio_phs before start alixe',dictio_phs # Call the startALIXE function with the appropiate input startALIXEasPHSTAT(clust_fold, reference_hkl, dictio_phs, cell, sg_number, tolerance, resolution, cycles, f_fom) print "\n Ending date & time " + time.strftime("%c") # Print initial time