#!/usr/bin/env python # -*- coding: utf-8 -*- # Python imports from __future__ import division import ConfigParser import cPickle import copy import datetime import math import operator import os import sys import time import numpy import alixe_library as al # Get a copy of the space group dictionary dictio_space_groups = al.get_spacegroup_dictionary() # NOTE: Maybe directly access from the module to avoid having it in memory? def fishing_round_by_prio_list(dict_first_round_by_rotclu, reference_hkl, sg_symbol, f_fom, phstat_version, path_phstat, ncores, clust_fold, path_ins, cell, resolution, cycles, tolerance): list_prio = [] list_to_remove = [] sg_number = al.get_space_group_number_from_symbol(sg_symbol) seed = 0 # We give the input sorted so in the fortran phstat we want it to use the first one as ref for rotclu in dict_first_round_by_rotclu.keys(): topllg = 0 name_clust_topllg = '' topzscore = 0 name_clust_topzscore = '' for cluster in dict_first_round_by_rotclu[rotclu].keys(): if dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['llg'] > topllg: topllg = dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['llg'] name_clust_topllg = cluster if dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['zscore'] > topzscore: topzscore = dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['zscore'] name_clust_topzscore = cluster if name_clust_topllg not in list_prio: list_prio.append(name_clust_topllg) if name_clust_topzscore not in list_prio: list_prio.append(name_clust_topzscore) list_rest = [] for rotclu in dict_first_round_by_rotclu.keys(): for cluster in dict_first_round_by_rotclu[rotclu]: if cluster not in list_prio: list_rest.append((cluster, dict_first_round_by_rotclu[rotclu][cluster]['dict_FOMs']['llg'])) sorted_list_rest = sorted(list_rest, key=lambda x: x[1]) for i in range(len(sorted_list_rest)): sorted_list_rest[i] = sorted_list_rest[i][0] list_prio.extend(sorted_list_rest) number_of_trials = int(ncores) max_number_of_trials = len(list_prio) dict_result_by_trial = {} raw_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_raw"), 'w') del raw_clust_second_round table_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_table"), 'w') table_clust_second_round.write('%-40s %-5s %-10s %-10s\n' % ('Cluster', 'n_phs', 'wmpd_max', 'wmpd_min')) del table_clust_second_round for i in xrange(number_of_trials): # if i==1: # print "list_to_remove",list_to_remove if i >= max_number_of_trials: break # We can't use more references, we finished the list! reference = list_prio[i] if (os.path.split(reference))[1] in list_to_remove: print "This reference, ", reference, " was already fished with another reference. Skipping this cycle" continue # We go to the next iteration, because this reference has been fished already # print "CHECK: len(list_prio) ",len(list_prio) # print "CHECK: len(list_to_remove)",len(list_to_remove) name_ref = os.path.split(reference)[1] name_phi = reference[:-4] + "_ref.phi" path_phi = os.path.join(clust_fold, reference[:-4] + "_ref.phi") if phstat_version == 'python': dict_phs = {} dict_phs[reference] = True for j in xrange(len(list_prio)): name_file = (os.path.split(list_prio[j]))[1] if (list_prio[j] not in dict_phs.keys()) and (name_file not in list_to_remove): dict_phs[list_prio[j]] = False dict_result_by_trial[list_prio[i]] = startALIXEasPHSTAT(clust_fold, reference_hkl, dict_phs, cell, sg_number, tolerance=tolerance, resolution=resolution, cycles=cycles, f_fom=f_fom) elif phstat_version == 'fortran': # NOTE: I need to use a relative path because fortran does not accept such long paths path = os.path.normpath(clust_fold) list_path = path.split(os.sep) relative_path_clust_fold = './' + list_path[-1] # 1) Write an ls file with the list of phs, putting first the reference path_ls = os.path.join(clust_fold, reference[:-4] + "_ref.ls") relative_ref = os.path.join(relative_path_clust_fold, name_ref) lsrotfile = open(path_ls, 'w') lsrotfile.write(relative_ref + '\n') for j in xrange(len(list_prio)): phs_namefile = (os.path.split(list_prio[j]))[1] phs_relative_path = os.path.join(relative_path_clust_fold, phs_namefile) # if phs_namefile in list_to_remove: # print 'phs_namefile ',phs_namefile,' is in list_to_remove' if phs_relative_path != relative_ref and (phs_namefile not in list_to_remove): lsrotfile.write(phs_relative_path + '\n') lsrotfile.close() # 2) Link the ins file os.link(path_ins, os.path.join(clust_fold, path_ls[:-3] + ".ins")) # 3) Launch the function in alixe_library complete_output = al.call_phstat_print_for_clustering(path_ls[:-3], relative_path_clust_fold, path_phstat, resolution, seed, tolerance, cycles) print complete_output ls_file = open(path_ls, 'r') ls_content = ls_file.read() del ls_file dict_result_by_trial[reference] = {name_phi: None} dict_result_by_trial[reference][name_phi] = al.read_phstat_print_clusterization_output(ls_content, complete_output, cycles) # NOTE: Because of the relative paths, in the fortran casse I need to modify the dictionary now to contain the full paths raw_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_raw"), 'a') table_clust_second_round = open(os.path.join(clust_fold, "info_clust_second_round_table"), 'a') raw_clust_second_round.write( "*********************************************************************************************************************************************\n") n_phs = len(dict_result_by_trial[list_prio[i]][name_phi].keys()) raw_clust_second_round.write("CLUSTER fishing with reference " + name_ref + ' , found in file ' + name_ref[ :-4] + '_ref.phi, containing ' + str( n_phs) + ' phase files' + '\n') for key1 in dict_result_by_trial[list_prio[i]].keys(): wmpe_max = 0.0 wmpe_min = 90.0 for key2 in dict_result_by_trial[list_prio[i]][key1].keys(): if phstat_version == 'fortran': new_key2 = os.path.join(clust_fold, os.path.split(key2)[1]) dict_result_by_trial[list_prio[i]][key1][new_key2] = copy.deepcopy( dict_result_by_trial[list_prio[i]][key1][key2]) del dict_result_by_trial[list_prio[i]][key1][key2] raw_clust_second_round.write( new_key2 + "\t" + str(dict_result_by_trial[list_prio[i]][key1][new_key2]) + '\n') wmpe = dict_result_by_trial[list_prio[i]][key1][new_key2]['wMPE'] else: raw_clust_second_round.write( key2 + "\t" + str(dict_result_by_trial[list_prio[i]][key1][key2]) + '\n') wmpe = dict_result_by_trial[list_prio[i]][key1][key2]['wMPE'] list_to_remove.append(os.path.split(key2)[1]) if wmpe > wmpe_max: wmpe_max = wmpe if wmpe < wmpe_min and wmpe != 0.0: # Because 0.0 is to itself, the reference wmpe_min = wmpe name_cluster = (os.path.split(key1))[1] if wmpe_max == 0.0 and wmpe_min == 90.0: # Then we have not clustered them wmpe_max = 0.0 wmpe_min = 0.0 table_clust_second_round.write('%-40s %-5i %-10f %-10f \n' % (name_cluster, n_phs, wmpe_max, wmpe_min)) del raw_clust_second_round del table_clust_second_round return dict_result_by_trial def startALIXEasPHSTAT(clust_fold, reference_hkl, dict_phs, cell, sg_number, tolerance=75.0, resolution=2.0, cycles=3, f_fom=True): '''Minimal version of the ALIXE program that performs the same as phstat but with the possibility of testing more than one reference agains the same pool of solutions. Parameters: clust_fold = directory to perform the clustering reference_hkl = the path for the hkl given for shelxe, to filter by evalues the phs in case there is extrapolated data dict_phs = phs dictionary with values that are the phs and keys set to True if they have to be tested as references and False otherwhise cell = the unit cell parameters, in the form of a list of floats sg_number = integer with the space group number tolerance = float with the tolerance for the MPE value between the phase sets cycles = integer, number of cycles of phase combination to be performed f_fom = boolean. If true, mpe are weighted by Fvalues, otherwhise, by E values ''' print '\n ALIXE started: ', time.strftime("%c") start = datetime.datetime.now() dictio_clusters = {} # Dictionary to return the results # Get the symmetry information thanks to the sg_number symops = al.get_symops_from_sg_dictionary(sg_number) polar, origins = al.get_origins_from_sg_dictionary(sg_number) latt = al.get_latt_from_sg_dictionary(sg_number) # Calculate the coefficients needed for other calculations unit = cell[0] * cell[1] * cell[2] # Sides of the cell squared_sides = [] coef1 = [] coef2 = [] coef3 = [] list_cos = [] list_sin = [] for n in range(0, 3): angle_rad = 1.74533 * math.pow(10, -2) * (cell[n + 3]) # Convert to radian units the angles cos_angle = numpy.cos(angle_rad) list_cos.append(cos_angle) sin_angle = numpy.sin(angle_rad) list_sin.append(sin_angle) exp1 = 2.0 * unit * (cos_angle / cell[n]) # 2.0 * volume * (cos_angle/side) coef1.append(exp1) sside = cell[n] ** 2 squared_sides.append(sside) volume = unit * math.sqrt(1 - (list_cos[0]) ** 2 - (list_cos[1]) ** 2 - (list_cos[2]) ** 2 + ( 2 * list_cos[0] * list_cos[1] * list_cos[2])) # V = abc * (1-cos²α-cos²β-cos²γ+2cosαcosβcosγ)^(-1/2) unit = unit / volume for n in range(0, 3): exp2 = 0.25 * unit * ((list_sin[n] / cell[n]) ** 2) coef2.append(exp2) unit = unit / volume coef3.append(unit * cell[0] * (list_cos[1] * list_cos[2] - list_cos[0])) coef3.append(unit * cell[1] * (list_cos[0] * list_cos[2] - list_cos[1])) coef3.append(unit * cell[2] * (list_cos[0] * list_cos[1] - list_cos[2])) # TODO: Revise this part with Isabel. Should be filtering the phs according to what phases are not extrapolated ## # Set the reference for e-values calculation ## array_hkl=al.read_hkl_file(reference_hkl) ## print "BEGIN len(array_hkl)",len(array_hkl) ## array_filter=al.check_resolution_limit(array_hkl,resolution,coef2,coef3) ## file_hkl=open(reference_hkl[:-4]+'_filter_res.hkl','w') ## print "RESOLUTION len(array_hkl)",len(array_filter) ## array_filter_symm=al.find_equivalent_reflections(array_filter,symops,True) # Merge by symmetry ## print "SYMMETRY len(array_hkl)",len(array_filter_symm) ## array_filter_symm_sorted=al.sort_reflections_phs(array_filter_symm) ## for r in range(len(array_filter_symm_sorted)): ## # 3I4,2F8.2,I4 ## file_hkl.write('%4i%4i%4i%8.2f%8.2f\n' % (array_filter_symm_sorted[r][0],array_filter_symm_sorted[r][1],array_filter_symm_sorted[r][2],array_filter_symm_sorted[r][3],array_filter_symm_sorted[r][4])) ## del file_hkl ## sys.exit() # Generate a list with the phs from the dictionary that are references list_references = [] for _, phs in enumerate(dict_phs.keys()): if dict_phs[phs] == True: list_references.append(phs) # TEMPORARY: Use the first phs file to calculate the evalues ref_for_evalues = list_references[0] array_ref = al.read_phs_file(ref_for_evalues) # NOTE: My phi files are the same as phs files NOW array_ref = al.check_resolution_limit(array_ref, resolution, coef2, coef3) # We will use the coefficients saved in coef2 and coef3 array_ref = al.find_equivalent_reflections(array_ref, symops, False) array_ref = al.sort_reflections_phs(array_ref) # Find epsilon factor and 1/dsquared epsilon, onedsquared, res_max = al.get_epsilon_1overdsquared_and_resmax(array_ref, symops, coef2, coef3) # Estimate E-values (whats changes in phs is phases, not structure factors) array_evalues, array_aux = al.get_evalues_and_array_aux(epsilon, onedsquared, res_max, array_ref) # e-values only in the array_evalues # Save the information that is common to all phs to avoid overloading the memory array_miller_indices = [] array_f_sigf = [] for i, _ in enumerate(array_ref): array_miller_indices.append([array_ref[i][0], array_ref[i][1], array_ref[i][2]]) array_f_sigf.append([array_ref[i][3], array_ref[i][6]]) array_ref = None n_phs = len(dict_phs.keys()) print "\n Processing ", str(n_phs), " phase files" # I need two copies of the arrays because if there is a list with more than one reference # I do need to have the original values list_arrays_va_to_modify = [] list_arrays_vb_to_modify = [] list_arrays_to_modify = [] list_names_phs = [] for _, phs in enumerate(dict_phs.keys()): # TODO: Encapsulate this processing of the files in a function and then call threads. # TODO: be careful of either locking at the step of saving or changing the data structure where to save them print "\n Reading phs ", phs if phs[-4:] == ".phi": # NOTE: My phi files are the same as phs files NOW array_phs = al.read_phs_file(phs) elif phs[-4:] == ".phs": array_phs = al.read_phs_file(phs) else: print "ALIXE only supports clustering of SHELXE phi or phs files" sys.exit() # Save the name so that we have the correct order list_names_phs.append(phs) # phs is the full path name of the phs # Check resolution limit filter_res_array = al.check_resolution_limit(array_phs, resolution, coef2, coef3) # We will use the coefficients saved in coef2 and coef3 # Find equivalent reflections with standard indices and transform phases array_merged = al.find_equivalent_reflections(filter_res_array, symops, False) # Sort reflections in order of higher h,k and l indexes sorted_array = al.sort_reflections_phs(array_merged) ## # Filter by e-values of the hkl reference, so that we do not have extrapolated reflections ## evalue_sorted_array=al.filter_array_by_evalues(sorted_array,array_evalues) # Reduce the array to the phi and FOM values reduced_array = al.reduce_array_phs_to_PHI_and_FOM(sorted_array) # [FOM,PHI] list_arrays_to_modify.append(reduced_array) # For all, including the reference, obtain VA and VB arrva, arrvb = al.get_VA_and_VB(reduced_array) list_arrays_va_to_modify.append(arrva) list_arrays_vb_to_modify.append(arrvb) # Save the bigger arrays as pickle objects save_double_array = open(os.path.join(clust_fold, 'double_array.pkl'), 'wb') cPickle.dump(list_arrays_to_modify, save_double_array) save_double_array.close() save_arrva = open(os.path.join(clust_fold, 'arrva.pkl'), 'wb') cPickle.dump(list_arrays_va_to_modify, save_arrva) save_arrva.close() save_arrvb = open(os.path.join(clust_fold, 'arrvb.pkl'), 'wb') cPickle.dump(list_arrays_vb_to_modify, save_arrvb) save_arrvb.close() # Now test all references in the list for _, reference in enumerate(list_references): print "\n Testing reference", reference name_phi = (os.path.split(reference)[1])[:-4] + '_ref.phi' # print 'name_phi',name_phi path_phi = os.path.join(clust_fold, name_phi) # print 'path_phi',path_phi position = list_names_phs.index(reference) dictio_clusters[path_phi] = {} # We load the arrays again for each reference back_double_array = open(os.path.join(clust_fold, 'double_array.pkl'), 'rb') list_arrays_to_modify = cPickle.load(back_double_array) back_double_array.close() # print "position",position # print 'list_names_phs[position]',list_names_phs[position] array_ref = copy.deepcopy(list_arrays_to_modify[position]) back_arrva = open(os.path.join(clust_fold, 'arrva.pkl'), 'rb') list_arrays_va_to_modify = cPickle.load(back_arrva) back_arrva.close() back_arrvb = open(os.path.join(clust_fold, 'arrvb.pkl'), 'rb') list_arrays_vb_to_modify = cPickle.load(back_arrvb) back_arrvb.close() # Phase combination macrocycles for c in range(cycles): print '\n Cluster analysis cycle ', str(c + 1) print '\n N' + '\t' + 'wMPE' + '\t' + 'Dif' + '\t' + 'MapCC' + '\t' + 'Phase file' # print 'array_ref[0] beginning cycle',array_ref[0] dictio_clusters[path_phi][c + 1] = {} # We will save the results cycle by cycle # Find and apply origin shifts relative to reference phases dict_wmpes_sel = {} list_wmpes_sel = [] for pos, _ in enumerate(list_arrays_to_modify): name_current_phs = list_names_phs[pos] dict_wmpes_sel[name_current_phs] = {"wMPE": 0.0, "diff_wMPE": 0.0, "mapcc": 0.0} s = 0.0 t = 0.0 nreflections = len(list_arrays_to_modify[pos]) for r, _ in enumerate(list_arrays_to_modify[pos]): list_arrays_to_modify[pos][r][1] = list_arrays_vb_to_modify[pos][r] # Phases # Check the weight to apply to the structure factors # Now we have the hkl and f info in other arrays: array_miller_indices and array_f_sigf if f_fom == True: list_arrays_to_modify[pos][r][0] = array_f_sigf[r][0] * list_arrays_va_to_modify[pos][ r] # StructureFactor * weight current phs elif f_fom == False: list_arrays_to_modify[pos][r][0] = array_evalues[r] * list_arrays_va_to_modify[pos][ r] # E-value * weight current phs s = s + list_arrays_to_modify[pos][r][0] t = t + list_arrays_to_modify[pos][r][0] * abs( ((900.0 + list_arrays_to_modify[pos][r][1] - list_arrays_vb_to_modify[pos][r]) % 360.0) - 180.0) # Here is where when we have to apply the origin shifts and calculate the wMPE and the mapCC current_weights = list_arrays_va_to_modify[ pos] # array with the ORIGINAL weights of the current phs. Does NOT change with every cycle sorted_list_wmpe, sorted_list_mapcc = al.apply_origin_shift_and_compute_wMPE_and_CC(nreflections, symops, array_ref, array_miller_indices, array_f_sigf, list_arrays_to_modify[ pos], current_weights, array_evalues, array_aux, sg_number, f_fom) shift_to_apply = sorted_list_wmpe[0][1] wmpe_shift = sorted_list_wmpe[0][0] for i in range(len(sorted_list_mapcc)): # Get the mapCC that corresponds to that shift if sorted_list_mapcc[i][1] == shift_to_apply: map_CC = sorted_list_mapcc[i][0] # Get difference top best/second best for the wMPE if we are not in a polar sg if polar == False: diff_mpe = sorted_list_wmpe[1][0] - sorted_list_wmpe[0][0] elif polar == True: # NOTE: Maybe I can find another indicator,or search the way to produce sth similar diff_mpe = 0 dict_wmpes_sel[name_current_phs] = {"wMPE": wmpe_shift, "diff_wMPE": diff_mpe, "mapcc": map_CC, 'shift': shift_to_apply} list_wmpes_sel.append((name_current_phs, wmpe_shift)) # print 'name_current_phs ',name_current_phs,'sorted_list_wmpe ',sorted_list_wmpe for r, _ in enumerate(list_arrays_vb_to_modify[pos]): # Modify the phases saved list_arrays_vb_to_modify[pos][r] = ((720.0 + (360.0 * ((( shift_to_apply[0] * array_miller_indices[r][ 0]) + ( shift_to_apply[1] * array_miller_indices[r][ 1]) + ( shift_to_apply[2] * array_miller_indices[r][ 2])) % 1.0)) + list_arrays_vb_to_modify[pos][r]) % 360.0) # Sort the phase sets according to their mean phase error sorted_by_mpe = sorted(list_wmpes_sel, key=lambda x: x[1], reverse=False) # print 'sorted_by_mpe',sorted_by_mpe # Check if anything will cluster under the tolerance we set if len(sorted_by_mpe) > 1: # print "Entered len(sorted_by_mpe)>1" if sorted_by_mpe[1][1] > tolerance: # print "Entered sorted_by_mpe[1][1]>tolerance" print 'There are not solutions to cluster under the tolerance set' # print "path_phi",path_phi dictio_clusters[path_phi] = {sorted_by_mpe[0][0]: {'wMPE': sorted_by_mpe[0][1], 'diff_wMPE': dict_wmpes_sel[sorted_by_mpe[0][0]][ 'diff_wMPE'], 'mapcc': dict_wmpes_sel[sorted_by_mpe[0][0]][ 'mapcc'], 'shift': [0.0, 0.0, 0.0]}} # Write the phases with the resolution cut to a file file_phi = open(path_phi, 'w') pos = list_names_phs.index(sorted_by_mpe[0][0]) # Position of the reference array in the reading nreflections = len(list_arrays_va_to_modify[pos]) for r in xrange(nreflections): # 3I4,F9.2,F8.4,F8.1,F8.2 file_phi.write('%4i%4i%4i%9.2f%8.4f%8.1f%8.2f\n' % ( array_miller_indices[r][0], array_miller_indices[r][1], array_miller_indices[r][2], array_f_sigf[r][0], list_arrays_va_to_modify[pos][r], list_arrays_vb_to_modify[pos][r], array_f_sigf[r][1])) del file_phi return dictio_clusters # else: # print "Hurra! We have things to cluster" # Combine phases for cluster # 1) Generate clean arrays of contributions VA and VB and for the combination va = [0.0 for _ in range(len(array_ref))] # fom vb = [0.0 for _ in range(len(array_ref))] # phases wa_combi = [0.0 for _ in range(len(array_ref))] # fa_combi=[0.0 for _ in range(len(array_ref))] FORTRAN: FA(I)=WA(I)*ET(I) (no consistente) # 2) Iterate over the phs files (consider sorting) for ind in range(len(sorted_by_mpe)): current_phs = sorted_by_mpe[ind][0] current_wmpe = sorted_by_mpe[ind][1] current_dif_wmpe = dict_wmpes_sel[current_phs]['diff_wMPE'] current_mapcc = dict_wmpes_sel[current_phs]['mapcc'] # print "current_phs",current_phs, "current_wmpe",current_wmpe # I need to find which is the position of this phs in the arrays pos = list_names_phs.index(current_phs) if current_wmpe < tolerance: print ' ', ind + 1, '\t', "{0:.1f}".format(current_wmpe), '\t', "{0:.1f}".format( current_dif_wmpe), '\t', "{0:.2f}".format(current_mapcc), '\t', current_phs dictio_clusters[path_phi][c + 1][current_phs] = copy.deepcopy(dict_wmpes_sel[current_phs]) for r in xrange(nreflections): list_arrays_to_modify[pos][r][1] = list_arrays_vb_to_modify[pos][r] t = 0.0174533 * list_arrays_to_modify[pos][r][1] # 0.0174533 is 2pi/360 s = list_arrays_va_to_modify[pos][r] va[r] = va[r] + (s * (numpy.cos(t))) vb[r] = vb[r] + (s * (numpy.sin(t))) # print t,s,numpy.cos(t),numpy.sin(t) t = numpy.sqrt(1.0 / (ind + 1)) for r in xrange(nreflections): wa_combi[r] = min(1.0, numpy.sqrt(math.pow(va[r], 2) + math.pow(vb[r], 2)) * t) # Use either structure factors or normalized structure factors for the combined phases list_arrays_to_modify[pos][r][0] = wa_combi[r] * array_evalues[ r] # e-weighted,as it is in the fortran prototype ## if f_fom==True: ## list_arrays_to_modify[pos][r][0]=array_f_sigf[r][0]*wa_combi[r] # StructureFactor * new_weight ## elif f_fom==False: ## alist_arrays_to_modify[pos][r][0]=array_evalues[r]*wa_combi[r] # E-value * new_weight if wa_combi[r] > 0.0001: list_arrays_to_modify[pos][r][1] = (720.0 + 57.29578 * numpy.arctan2(vb[r], va[r])) % 360 # Compute again the shift, because now we changed the phases sorted_list_wmpe2, sorted_list_mapcc2 = al.apply_origin_shift_and_compute_wMPE_and_CC(nreflections, symops, array_ref, array_miller_indices, array_f_sigf, list_arrays_to_modify[ pos], current_weights, array_evalues, array_aux, sg_number, f_fom) shift_to_apply2 = sorted_list_wmpe2[0][1] wmpe_shift2 = sorted_list_wmpe2[0][0] if shift_to_apply2 != [0.0, 0.0, 0.0]: # Check if there ever is another shift after if not (abs(shift_to_apply2[0]) < 0.01 and abs(shift_to_apply2[1]) < 0.01 and abs( shift_to_apply2[2]) < 0.01): print "There is another shift to apply after phase merging. CHECK!", shift_to_apply2 for r in xrange(nreflections): list_arrays_to_modify[pos][r][1] = ((720.0 + (360.0 * (((shift_to_apply2[0] * array_miller_indices[r][0]) + ( shift_to_apply2[1] * array_miller_indices[r][1]) + ( shift_to_apply2[2] * array_miller_indices[r][ 2])) % 1.0)) + list_arrays_to_modify[pos][r][1]) % 360.0) for r in xrange( nreflections): # Change the phases in the reference array so that they correspond to the combined map array_ref[r][1] = list_arrays_to_modify[pos][r][1] # print 'array_ref[0] ending cycle',array_ref[0] # print "POST list_arrays_to_modify[0]",list_arrays_to_modify[0][0] # print "POST list_arrays_va_to_modify[0]",list_arrays_va_to_modify[0][0] # print "POST list_arrays_vb_to_modify[0]",list_arrays_vb_to_modify[0][0] # Write the cumulative phases to a file file_phi = open(path_phi, 'w') # print 'path_phi',path_phi for r in xrange(len(array_ref)): # 3I4,F9.2,F8.4,F8.1,F8.2 file_phi.write('%4i%4i%4i%9.2f%8.4f%8.1f%8.2f\n' % ( array_miller_indices[r][0], array_miller_indices[r][1], array_miller_indices[r][2], array_f_sigf[r][0], wa_combi[r], array_ref[r][1], array_f_sigf[r][1])) del file_phi print '\n Cleaning up....' for fich in os.listdir(clust_fold): if fich.endswith('.pkl'): os.remove(os.path.join(clust_fold, fich)) # Return just the last cycle results dictio_clusters_third = {} for key in dictio_clusters.keys(): dictio_clusters_third[key] = dictio_clusters[key][cycles] return dictio_clusters_third end = datetime.datetime.now() print " end - start time ", end - start def startALIXEforARCIMBOLDO(clust_fold, cluster_id, reference_hkl, cell, sg_symbol, tolerance_list=[60.0, 88.0], resolution=2.0, cycles=3, f_fom=True, run='SHREDDER', mode='one_step', fishing_folder=None, confibor=None, ncores=1): """ Function to call ALIXE internally from other program. The arguments needed are: clust_fold = directory to perform the clustering cluster_id = integer with the rotation cluster to evaluate reference_hkl = the path for the hkl given for shelxe, to filter by evalues the phs in case there is extrapolated data cell = the unit cell parameters, in the form of a list (STR! CONVERT TO FLOAT) sg_symbol = string with the space group symbol tolerance = list of floats with tolerances for mpe cycles = integer, number of cycles of phase combination to be performed f_fom = boolean. If true, mpe are weighted by Fvalues, otherwhise, by E values run = string. Can be 'ARCIMBOLDO', 'BORGES', 'SHREDDER' mode = string. Can be 'fish','full','one_step','two_steps' fishing_folder = string. Path to the folder in which ALIXE has to fish in case of that mode. confibor = ConfigParser object ncores = number of real cores in the machine, as read by ARCIMBOLDO or ARCIMBOLDO_BORGES""" print '\n ALIXE started: ', time.strftime("%c") dictio_clusters = {} # Dictionary to return the results skip_FOM_reading = False ent_present = False hard_limit_phs = 100 skip_copyfiles = False skip_ins_generation = False # Check if alixe has been already performed on that folder basepath_clustfold = clust_fold print "Root folder for clustering", basepath_clustfold clust_fold = os.path.join(basepath_clustfold, str(cluster_id)) print "Currrently processing folder ", clust_fold if not os.path.exists(clust_fold): os.makedirs(clust_fold) # Generate the directory if it is not already there path_for_tuple = os.path.join(clust_fold, 'final_alixe_tuple.pkl') if os.path.exists(path_for_tuple): # Then first round was completed if mode == 'one_step': back_final_tuple = open(path_for_tuple, 'rb') (path_ins, final_dict) = cPickle.load(back_final_tuple) print "Alixe has been succesfully run already. Returning the tuple with the results" return (path_ins, final_dict) elif mode == 'two_steps': # In any case we do not need to reperform steps print "We are in the level of setting the skips to True" skip_FOM_reading = True skip_copyfiles = True skip_ins_generation = True # Then, the first round was finished but we need to do the second path_list_clusters = os.path.join(basepath_clustfold, 'list_clustered.pkl') if os.path.exists(path_list_clusters): print "We did the second step already on some folder" back_list = open(path_list_clusters, 'rb') list_to_remove = cPickle.load( back_list) # This list contains the phi files that have been already clustered at some point in the second step elif not os.path.exists(path_list_clusters): # Then let's create it because we are going to append things to it after we cluster using the references of this rotation cluster list_to_remove = [] if mode not in ['fish', 'full', 'one_step', 'two_steps']: print "Sorry, you need to provide a valid mode for alixe: either 'fish','full','one_step' or 'two_steps' " sys.exit(0) # Generate a fake ins that SHELXE can read for expansions if not skip_ins_generation: for i in range(len(cell)): cell[i] = float(cell[i]) sg_number = al.get_space_group_number_from_symbol(sg_symbol) path_ins = os.path.join(clust_fold, 'name.ins') al.generate_fake_ins_for_shelxe(path_ins, cell, sg_number) if run == 'SHREDDER' or run == 'BORGES': if not isinstance(confibor, ConfigParser.ConfigParser): print "Sorry, you need to provide a valid bor object in order to run ALIXE on ", run # Check the bor sys.exit() topexp = confibor.get('ARCIMBOLDO-BORGES', 'topexp') basepath = confibor.get('GENERAL', 'working_directory') if not skip_copyfiles: list_chosen = al.get_files_from_9_EXP_BORGES(basepath, clust_fold, cluster_id=cluster_id, mode=9, hard_limit_phs=hard_limit_phs) subrun = 'BORGES' # To know how to read the solutions. Spherical is generating a BORGESARCI job if confibor.has_option('GENERAL', 'ent_path'): # therefore the lst will have post mortem info ent_path = confibor.get('GENERAL', 'ent_path') if os.path.exists(ent_path): ent_present = True elif run == 'ARCIMBOLDO': if not confibor.isinstance(ConfigParser.ConfigParser): print "Sorry, you need to provide a valid bor object in order to run ALIXE on ", run # Check the bor sys.exit() # TODO: Get the link to the files and save them in the CLUSTERING folder # TODO: Check if there is an ent file provided in the bor and therefore the lst will have post mortem info else: print "Sorry, ALIXE can just run on SHREDDER, ARCIMBOLDO or BORGES" sys.exit() # Read the FOMs of the PHS files in the phaser and shelxe steps if not skip_FOM_reading: phs_files = al.list_files_by_extension(clust_fold, 'phs') dictio_fragments = {} for phs in phs_files: dictio_fragments[phs[:-4]] = {'rot_cluster': None, 'llg': None, 'zscore': None, 'initcc': None, 'efom': None, 'pseudocc': None, 'list_MPE': None} dictio_fragments = al.get_FOMs_from_lst_files_in_folder(clust_fold, dictio_fragments, ent_present) dictio_fragments = al.get_FOMs_from_sum_files_in_folder(basepath, clust_fold, dictio_fragments, ent_present, subrun) # First round if mode == 'one_step': tolerance = tolerance_list[0] dict_first_round = {} # To save the results per rotation cluster # Call the function clustering_all_in_ALIXE_under_a_tolerance per each of them and save the results dict_first_round = clustering_all_in_ALIXE_under_a_tolerance(clust_fold, reference_hkl, list_chosen, cell, sg_symbol, tolerance, resolution, cycles, f_fom, ncores=topexp) # Save the FOM information for priorization in second round if there is for cluster in dict_first_round.keys(): if dict_first_round[cluster]['n_phs'] > 1: list_llg = [] list_zscore = [] for phs in dict_first_round[cluster]['dictio_result'].keys(): # For each phs in the cluster list_llg.append(dictio_fragments[phs[:-4]]['llg']) list_zscore.append(dictio_fragments[phs[:-4]]['zscore']) # We add the top LLG or ZSCORE as representative of the cluster dict_first_round[cluster]['dict_FOMs']['llg'] = (sorted(list_llg, reverse=True))[0] dict_first_round[cluster]['dict_FOMs']['zscore'] = (sorted(list_zscore, reverse=True))[0] elif dict_first_round[cluster]['n_phs'] == 1: name_file = ((dict_first_round[cluster]['dictio_result'].keys())[0])[:-4] dict_first_round[cluster]['dict_FOMs']['llg'] = dictio_fragments[name_file]['llg'] dict_first_round[cluster]['dict_FOMs']['zscore'] = dictio_fragments[name_file]['zscore'] save_final_tuple = open(os.path.join(clust_fold, 'final_alixe_tuple.pkl'), 'wb') cPickle.dump((path_ins, dict_first_round), save_final_tuple) save_final_tuple.close() # Return only the clusters that contain more than one phase set so that they are expanded print "len(dict_first_round) before cleaning the single clusters", len(dict_first_round) for cluster in dict_first_round.keys(): if dict_first_round[cluster]['n_phs'] == 1: del dict_first_round[cluster] print "len(dict_first_round) after cleaning the single clusters", len(dict_first_round) return (path_ins, dict_first_round) elif mode == 'full': print "full mode is still not implemented" sys.exit(0) # Second round if mode == 'two_steps': print "Two steps mode... Not yet implemented" sys.exit(0) tolerance = tolerance_list[1] # Read all dictionaries from all rotation clusters dictio_clust_by_rotclu = {} limit_ref = 0 list_phis_to_evaluate = [] # List with tuples (rotclu,cc,phi) for sorting the files according to FOM (topLLG?) and to remove the files we have already fished cluster_folders = [d for d in os.listdir(basepath_clustfold) if os.path.isdir(os.path.join(basepath_clustfold, d))] for cluster_folder in cluster_folders: path_cluster = os.path.join(basepath_clustfold, cluster_folder) path_pkl = os.path.join(path_cluster, 'final_alixe_tuple.pkl') pkl_first_round = open(path_pkl, 'rb') path_ins, dictio_clust_first_round = cPickle.load(pkl_first_round) dictio_clust_by_rotclu[cluster_folder] = dictio_clust_first_round for phi_file in dictio_clust_first_round.keys(): list_phis_to_evaluate.append( (phi_file, cluster_folder, dictio_clust_first_round[phi_file]['dict_FOMs']['llg'])) if cluster_folder == str(cluster_id): limit_ref = limit_ref + 1 sorted_list_phis_to_evaluate = sorted(list_phis_to_evaluate, key=operator.itemgetter(2, 1, 0)) final_list_to_evaluate = [] for i in range(len(sorted_list_phis_to_evaluate)): if sorted_list_phis_to_evaluate[i][0] not in list_to_remove: final_list_to_evaluate.append(sorted_list_phis_to_evaluate[i][0]) print "****** Lista a evaluar *******" for i in range(len(final_list_to_evaluate)): print i, final_list_to_evaluate[i] sys.exit(0) # dict_second_round=clustering_all_in_ALIXE_under_a_tolerance(clust_fold,reference_hkl,final_list_to_evaluate,cell,sg_symbol,tolerance,resolution,cycles,f_fom,ncores=limit_ref) print 'dict_second_round', dict_second_round sys.exit(0) # We need to generate a list of the things we do not need to cluster anymore because were already tried list_to_remove_file = open(path_list_clusters, 'wb') # We overwrite the pickle file with a new one that contains the current list_to_remove cPickle.dump(list_to_remove, list_to_remove_file) sys.exit(0) # TODO: Function to call recursively startALIXE in order to get all clusters under a tolerance def clustering_all_in_ALIXE_under_a_tolerance(clust_fold, reference_hkl, list_phs, cell, sg_symbol, tolerance, resolution, cycles, fom_weigth, ncores): '''This function calls recursively to startALIXE in order to generate all possible clusters under a certain tolerance It does it by discarding the phs that had already been clustered with a previous reference''' dictio_clusters = {} sg_number = al.get_space_group_number_from_symbol(sg_symbol) # Convert the cell to float for i in range(len(cell)): cell[i] = float(cell[i]) while (len(list_phs) > 0) and (len(dictio_clusters.keys()) <= ncores): dict_phs = {} for i in range(len(list_phs)): if i == 0: dict_phs[list_phs[i]] = True else: dict_phs[list_phs[i]] = False results = startALIXEasPHSTAT(clust_fold, reference_hkl, dict_phs, cell, sg_number, tolerance, resolution, cycles, fom_weigth) # I can safely assume that if I just give one reference, # the results dictionary keys list will have just one phi name phi_name = results.keys()[0] phs_to_remove = results[phi_name].keys() n_phs = len(phs_to_remove) dictio_clusters[phi_name] = {'dictio_result': results[phi_name], 'n_phs': n_phs, 'dict_FOMs': {}} file_debug_first_round = open(os.path.join(clust_fold, "clusters_dictionary.txt"), 'a') file_debug_first_round.write( phi_name + '\t' + 'dictio_result ' + str(results[phi_name]) + 'n_phs ' + str(n_phs) + "\n") file_debug_first_round.write("**********************************************") file_debug_first_round.close() new_list_phs = [] for phs in list_phs: if phs in phs_to_remove: continue else: new_list_phs.append(phs) list_phs = new_list_phs return dictio_clusters # Main module if __name__ == "__main__": if len(sys.argv) == 1: print "\nUsage: ALIXE.py name.ls [options]" print "\nOptions:" print "-s=name_reference Use the name_reference phase file in the ls file as reference for clustering. It also accepts a list separated by commas." print "-t= Use a tolerance of N degrees for the mean phase difference between the phase sets compared" print "-r=N Use data to a resolution of N angstroms. Default is 2.0" print "-c=N Do N macrocycles of phase clustering between the sets" # print "-h=hkl_filename Reference hkl to calculate evalues" # TODO: it is better that symmetry is given either as a number of the sg or a line in the ls (if maps are given) print "-p=pdb_filename Read symmetry information from a pdb file" print "-w=f or -w=e Use f-weights or e-weights for the calculation of mean phase errors. Default is f" print "-d=working_directory Indicate the folder were to put the results. Default is the current working directory" sys.exit() print "\n Starting date & time " + time.strftime("%c") # Print initial time # Read ls file and save the files in a list name_ls = sys.argv[1] fich_ls = open(name_ls, 'r') list_phs = fich_ls.read().splitlines() dictio_phs = {} for phs in list_phs: dictio_phs[phs] = False n_phs = len(list_phs) print "\n ", str(n_phs), " phase files found in ", name_ls # Read the arguments or set the defaults list_options = sys.argv[2:] # Defaults reference_set = list_phs[0] # The first one in the ls position_ref = 1 tolerance = 60 resolution = 2.0 dictio_phs[reference_set] = True # Default is to use the first phs in the list cycles = 3 f_fom = True reference_hkl = None # TEMPORARY, NEED TO CHECK WITH ISABEL clust_fold = os.getcwd() cell = None for option in list_options: if option.startswith("-s"): # Check if they have given a single reference or a list separated by comma # if len(option[3:].split(','))>1: # list_references=option[3:].split(',') # for reference in list_references: # dictio_phs[reference]=True # elif len(option[3:].split(','))==1: # Then we have a single reference # print 'option[3:]',option[3:] ref = option[3:] dictio_phs[ref] = True # If it is not the same one, remove te True from the default if ref != reference_set: dictio_phs[reference_set] = False elif option.startswith("-t"): tolerance = float(option[3:]) elif option.startswith("-r"): resolution = float(option[3:]) elif option.startswith("-h"): reference_hkl = option[3:] elif option.startswith("-c"): cycles = int(option[3:]) elif option.startswith("-w"): weight_fom = option[3:] if weight_fom == 'e': f_fom = False elif option.startswith("-p"): # Read the symmetry from a pdb file pdb_path = str(option[3:]) cell, sg = al.read_cell_and_sg_from_pdb(pdb_path) # Cell is a list of floats sg_number = al.get_space_group_number_from_symbol(sg) elif option.startswith("-d"): clust_fold = str(option[3:]) # Check we have the symmetry info if cell == None: print "Sorry, you need to provide some pdb file to extract the symmetry information" sys.exit(0) # Generate a fake ins that SHELXE can read for expansions path_ins = os.path.join(clust_fold, 'name.ins') al.generate_fake_ins_for_shelxe(path_ins, cell, sg_number) # print 'dictio_phs before start alixe',dictio_phs # Call the startALIXE function with the appropiate input startALIXEasPHSTAT(clust_fold, reference_hkl, dictio_phs, cell, sg_number, tolerance, resolution, cycles, f_fom) print "\n Ending date & time " + time.strftime("%c") # Print initial time