#!/usr/bin/env python

# python imports
import glob
import logging
import os
import re
import shutil
import sys

# our imports
try:
    from ample.util import ample_util
except:
    ample_util = None
    
logger = logging.getLogger(__name__)

class SpickerResult(object):
    """
    A class to hold the result of running Spicker
    """

    def __init__(self):

        self.pdb_file = None  # Path to a list of the pdbs for this cluster
        self.cluster_size = None
        self.cluster_centroid = "N/A"
        self.pdbs = []  # ordered list of the pdbs in their results directory
        self.r_cen = []  # ordered list of the distance from the cluster centroid for each pdb
        return

class Spickerer(object):

    def __init__(self, spicker_exe=None, run_dir=None):
        """Initialise from a dictionary of options"""
        
        if not spicker_exe:
            if 'CCP4' in os.environ: spicker_exe = os.path.join(os.environ['CCP4'], 'bin', 'spicker')
            else: raise RuntimeError, "Cannot find a spicker executable!"
        if not (os.path.exists(spicker_exe) and os.access(spicker_exe, os.X_OK)):
            raise RuntimeError, "Cannot find a valid spicker executable: {0}".format(spicker_exe)
        self.spicker_exe = spicker_exe
        self.run_dir = run_dir
        self.results = None
        return
        
    def get_length(self, pdb):
        pdb = open(pdb)
        counter = 0
        for line in pdb:
            pdb_pattern = re.compile('^ATOM')
            pdb_result = pdb_pattern.match(line)
            if pdb_result:
                atom = line[13:16]
                if re.search('CA', atom):
                    counter += 1
        return str(counter)

    def create_input_files(self, models, score_type='rmsd', score_matrix=None):
        """
        jmht
        Create the input files required to run spicker
        (See notes in spicker.f FORTRAN file for a description of the required files)
        """
        if not len(models):
            msg = "no models provided!"
            logger.critical(msg)
            raise RuntimeError, msg
        
        if score_type: score_type = score_type.lower()
        logger.debug("Using score_type: {0}".format(score_type))
        
        if score_type == 'read_matrix':
            if not (score_matrix and os.path.isfile(score_matrix)):
                msg = 'Cannot find score_matrix: {0}'.format(score_matrix)
                logger.critical(msg)
                raise RuntimeError, msg
            logger.debug("Using score_matrix: {0}".format(score_matrix))
            shutil.copy(score_matrix, os.path.join(self.run_dir,'score.matrix'))
        elif score_type == 'tm':
            # Create file so spicker knows to calculate TM scores
            with open('TM.score','w') as f: f.write('\n')
        elif score_type == 'rmsd':
            pass
        else:
            raise RuntimeError,"Unknown score_type: {0}".format(score_type)
        
        # read_out - Input file for spicker with coordinates of the CA atoms for each of the PDB structures
        #
        # file_list - a list of the full path of all PDBs - used so we can loop through it and copy the selected
        # ones to the relevant directory after we have run spicker - the order of these must match the order
        # of the structures in the rep1.tra1 file 
        list_string = ''
        counter = 0
        with open('rep1.tra1', "w") as read_out, open('file_list', "w") as file_list:
            for infile in models:
                pdbname = os.path.basename(infile)
                file_list.write(infile + '\n')
                list_string = list_string + pdbname + '\n'
                counter += 1
                length = self.get_length(infile)
                # 1st field is length, 2nd energy, 3rd & 4th don't seem to be used for anything
                read_out.write('\t' + length + '\t926.917       ' + str(counter) + '       ' + str(counter) + '\n')
                with open(infile) as read:
                    # Write out the coordinates of the CA atoms 
                    for line in read:
                        pattern = re.compile('^ATOM\s*(\d*)\s*(\w*)\s*(\w*)\s*(\w)\s*(\d*)\s*(.\d*.\d*)\s*(.\d*.\d*)\s*(.\d*.\d*)\s*(.\d*.\d*)')
                        result = re.match(pattern, line)
                        if result:
                            split = re.split(pattern, line)
                            if split[2] == 'CA':
                                read_out.write('     ' + split[6] + '     ' + split[7] + '     ' + split[8] + '\n')
    
        # from spicker.f
        # *       'rmsinp'---Mandatory, length of protein & piece for RMSD calculation;
        with open('rmsinp', "w") as rmsinp:
            rmsinp.write('1  ' + length + '\n\n')
            rmsinp.write(length + '\n')
        
        # make tra.in
        # from spicker.f
        # *       'tra.in'---Mandatory, list of trajectory names used for clustering.
        # *                  In the first line of 'tra.in', there are 3 parameters:
        # *                  par1: number of decoy files
        # *                  par2: 1, default cutoff, best for decoys from template-based
        # *                           modeling;
        # *                       -1, cutoff based on variation, best for decoys from
        # *                           ab initio modeling.
        # *                       -2, use TM scores for clustering
        # *                  par3: 1, closc from all decoys; -1, closc clustered decoys
        # *                  From second lines are the file names which contain coordinates
        # *                  of 3D structure decoys. All these files are mandatory
        par2 = '-2' if score_type == 'tm' else '-1'
        with open('tra.in', "w") as tra:
            tra.write('1 {0} 1 \nrep1.tra1\n'.format(par2))
    
        # Create the file with the sequence of the PDB structures
        # from spicker.f
        # *       'seq.dat'--Mandatory, sequence file, for output of PDB models.
        with open('seq.dat', "w") as seq, open(models[0], 'r') as a_pdb:
            for line in a_pdb:
                pattern = re.compile('^ATOM\s*(\d*)\s*(\w*)\s*(\w*)\s*(\w)\s*(\d*)\s*(\d*)\s')
                result = re.match(pattern, line)
                if result:
                    split = re.split(pattern, line)
                    if split[2] == 'CA':
                        seq.write('\t' + split[5] + '\t' + split[3] + '\n')
        return
    
    def cluster(self, models, run_dir=None, score_type='rmsd', score_matrix=None, nproc=1):
        """
        Run spicker to cluster the models
        """
        owd = os.getcwd()
        if run_dir: self.run_dir = os.path.abspath(run_dir)
        if not self.run_dir: self.run_dir = os.path.join(owd, 'spicker')
        if not os.path.isdir(self.run_dir): os.mkdir(self.run_dir)
        os.chdir(self.run_dir)
        
        logger.debug("Running spicker in directory: {0}".format(self.run_dir))
        logger.debug("Using executable: {0} on {1} processors".format(self.spicker_exe, nproc))
        
        self.create_input_files(models, score_type=score_type, score_matrix=score_matrix)
        
        # We need special care if we are running with tm scores as we will be using the OPENMP
        # version of spicker which requires increasing the stack size on linux and setting the 
        # OMP_NUM_THREADS environment variable on all platforms
        # The stack size on 64-bit linux seems to be 15Mb, so I guess asking for 50 seems reasonable
        # I'm assuming that the limit is in bytes and specified by an integer so 50Mb -> 50000000
        preexec_fn=None
        env = None
        if score_type == 'tm':
            env = { 'OMP_NUM_THREADS' : str(nproc)}
            if sys.platform.lower().startswith('linux'):
                def set_stack():
                    import resource
                    stack_bytes = 50000000 # 50Mb
                    #resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY,resource.RLIM_INFINITY))
                    resource.setrlimit(resource.RLIMIT_STACK, (stack_bytes,stack_bytes))
                preexec_fn=set_stack

        logfile = os.path.abspath("spicker.log")
        rtn = ample_util.run_command([self.spicker_exe], logfile=logfile, env=env, preexec_fn=preexec_fn)
        if not rtn == 0:
            raise RuntimeError,"Error running spicker, check logfile: {0}".format(logfile)
    
        # Read the log and generate the results
        self.results = self.process_log()
        
        # Always go back to where we started
        os.chdir(owd)
        return
                
    def process_log(self, logfile=None):
        """Read the spicker str.txt file and return a list of SpickerResults for each cluster.
        
        We use the R_nat value to order the files in the cluster
        """
        if not logfile: logfile = os.path.join(self.run_dir, 'str.txt')
        clusterCounts = []
        index2rcens = []
        
        # File with the spicker results for each cluster
        logger.debug("Processing spicker output file: {0}".format(logfile))
        f = open(logfile, 'r')
        line = f.readline()
        while line:
            line = line.strip()
            if line.startswith("#Cluster"):
                # skip 2 lines to Nstr
                f.readline()
                f.readline()
                line = f.readline().strip()
                if not line.startswith("Nstr="):
                    raise RuntimeError, "Problem reading file: {0}".format(logfile)
                
                ccount = int(line.split()[1])
                clusterCounts.append(ccount)
                
                # Loop through this cluster
                i2rcen = []
                line = f.readline().strip()
                while not line.startswith("------"):
                    fields = line.split()
                    #  i_cl   i_str  R_nat   R_cen  E    #str     traj
                    # tuple of: ( index in file , distance from centroid )
                    i2rcen.append((int(fields[5]), float(fields[3])))
                    line = f.readline().strip()
                index2rcens.append(i2rcen)
            line = f.readline()
                
        # Sort clusters by the R_cen - distance from cluster centroid
        for i, l in enumerate(index2rcens):
            # Sort by the distance form the centroid, so first becomes centroid
            sorted_by_rcen = sorted(l, key=lambda tup: tup[1])
            index2rcens[i] = sorted_by_rcen
    
        # Now map the indices to their files
        # Get ordered list of the pdb files
        flist = os.path.join(self.run_dir, 'file_list')
        pdb_list = [ line.strip() for  line in open(flist , 'r') ]
        
        results = []
        # create results
        for cluster in range(len(clusterCounts)):
            result = SpickerResult()
            result.cluster_size = clusterCounts[ cluster ]
            result.pdb_file = os.path.join(self.run_dir, "spicker_cluster_{0}.list".format(cluster + 1))
            with open(result.pdb_file, "w") as f:
                for i, (idx, rcen) in enumerate(index2rcens[ cluster ]):
                    pdb = pdb_list[idx - 1]
                    if i == 0: result.cluster_centroid = pdb
                    result.pdbs.append(pdb)
                    result.r_cen.append(rcen)
                    f.write(pdb + "\n")
            results.append(result)
        return results
        
    def results_summary(self):
        """Summarise the spicker results"""
        
        if not self.results: raise RuntimeError, "Could not find any results!"
        rstr = "---- Spicker Results ----\n\n"
        
        for i, r in enumerate(self.results):
            rstr += "Cluster: {0}\n".format(i + 1)
            rstr += "* number of models: {0}\n".format(r.cluster_size)
            rstr += "* files are listed in file: {0}\n".format(r.pdb_file)
            rstr += "* centroid model is: {0}\n".format(r.cluster_centroid)
            rstr += "\n"
            
        return rstr

if __name__ == "__main__":
    import argparse, subprocess, tempfile
    
    # For running as a stand-alone script
    def run_command(cmd, logfile=None, directory=None, dolog=True, stdin=None, check=False):
        """Execute a command and return the exit_error code.
    
        We take care of outputting stuff to the logs and opening/closing logfiles
    
        Args:
        cmd - command to run as a list
        stdin - a string to use as stdin for the command
        logfile (optional) - the path to the logfile
        directory (optional) - the directory to run the job in (cwd assumed)
        dolog: bool - whether to output info to the system log
        """
    
        assert type(cmd) is list
    
        if not directory:
            directory = os.getcwd()
    
        if dolog:
            logging.debug("In directory {0}\nRunning command: {1}".format(directory, " ".join(cmd)))
    
        if logfile:
            if dolog:
                logging.debug("Logfile is: {0}".format(logfile))
            logf = open(logfile, "w")
        else:
            logf = tempfile.TemporaryFile()
            
        if stdin != None:
            stdinstr = stdin
            stdin = subprocess.PIPE
    
        # Windows needs some special treatment
        kwargs = {}
        if os.name == "nt":
            kwargs = { 'bufsize': 0, 'shell' : "False" }
        p = subprocess.Popen(cmd, stdin=stdin, stdout=logf, stderr=subprocess.STDOUT, cwd=directory, **kwargs)
    
        if stdin != None:
            p.stdin.write(stdinstr)
            p.stdin.close()
            if dolog:
                logging.debug("stdin for cmd was: {0}".format(stdinstr))
    
        p.wait()
        logf.close()
        
        return p.returncode
    
    # Mock up ample_util for when we don't have CCP4 installed
    if ample_util is None:
        class Tmp(object):pass
        ample_util = Tmp()
        ample_util.run_command = run_command
    
    #
    # Run Spicker on a directory of PDB files
    #
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--executable',
                        help="spicker executable to use")
    parser.add_argument('-m', '--models', required=True,
                        help="Models to cluster")
    parser.add_argument('-s', '--score_type', default='rmsd',
                        help="Use TM score")
    args = parser.parse_args()
    
    models = args.models
    if not os.path.exists(models): raise RuntimeError("Cannot find models: {0}".format(models))
    
    if os.path.isdir(models):
        models = [ os.path.abspath(m) for m in glob.glob(os.path.join(models, "*.pdb")) ]
    elif os.path.isfile(models):
        with open(models) as f:
            models = [ l.strip() for l in f.readlines() if l.strip() ]
        
    if not len(models):
        print "Cannot find any pdbs in: {0}".format(models)
        sys.exit(1)

    spicker_exe=None
    if args.executable:
       spicker_exe=os.path.abspath(args.executable)
        
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)

    spicker = Spickerer(spicker_exe=spicker_exe)
    spicker.cluster(models, score_type=args.score_type)
    print spicker.results_summary()