# --- UCSF Chimera Copyright --- # Copyright (c) 2000 Regents of the University of California. # All rights reserved. This software provided pursuant to a # license agreement containing restrictions on its disclosure, # duplication and use. This notice must be embedded in or # attached to all copies, including partial copies, of the # software or any revisions or derivations thereof. # --- UCSF Chimera Copyright --- # # $Id: __init__.py 29790 2010-01-14 22:32:06Z pett $ def nw(s1, s2, scoreMatch=10, scoreMismatch=-3, scoreGap=0, scoreGapOpen=-40, gapChar=".", returnSeqs=False, scoreMatrix=None, similarityMatrix=None, frequencyMatrix=None, endsAreGaps=False, ssMatrix=None, ssFraction=0.9, gapOpenHelix=None, gapOpenStrand=None, gapOpenOther=None, debug=False): # if 'scoreMatrix', 'similarityMatrix', or 'frequencyMatrix' is # provided, then 'scoreMatch' and 'scoreMismatch' are ignored and # the matrix is used to evaluate matching between the sequences. # 'scoreMatrix' should be a two-dimensional array of size # len(s1) x len(s2). 'similarityMatrix' should be a dictionary # keyed with two-tuples of residue types. 'frequencyMatrix' should # be a list of length s2 of dictionaries, keyed by residue type. # # if 'ssFraction' is not None/False, then 'ssMatrix' should be a 3x3 # matrix keyed with 2-tuples of secondary structure types ('H': helix, # 'S': strand, 'O': other). The score will be a mixture of the # ss/similarity matrix scores weighted by the ssFraction # [ssFraction * ss score + (1 - ssFraction) * similarity score] # # if 'gapOpenHelix/Strand/Other' is not None and 'ssFraction' is not # None/False, then scoreGapOpen is ignored when an intra-helix/ # intra-strand/other gap is opened and the appropriate penalty # is applied instead # # if 'returnSeqs' is True, then instead of returning a match list # (a list of two-tuples) as the second value, a two-tuple of gapped # Sequences will be returned. In both cases, the first return value # is the match score. m = [] bt = [] for i1 in range(len(s1) + 1): m.append((len(s2) + 1) * [ 0 ]) bt.append((len(s2) + 1) * [None]) bt[i1][0] = 1 if endsAreGaps and i1 > 0: m[i1][0] = scoreGapOpen + i1 * scoreGap for i2 in range(len(s2) + 1): bt[0][i2] = 2 if endsAreGaps and i2 > 0: m[0][i2] = scoreGapOpen * i2 * scoreGap if similarityMatrix is not None: evaluate = lambda i1, i2: similarityMatrix[(s1[i1], s2[i2])] elif scoreMatrix is not None: evaluate = lambda i1, i2: scoreMatrix[i1][i2] elif frequencyMatrix is not None: evaluate = lambda i1, i2: frequencyMatrix[i2][s1[i1]] else: def evaluate(i1, i2): if s1[i1] == s2[i2]: return scoreMatch return scoreMismatch doingSS = ssFraction is not None and ssFraction is not False \ and ssMatrix is not None if doingSS: prevEval = evaluate simFraction = 1.0 - ssFraction def ssEval(i1, i2): if hasattr(s1, 'ssFreqs'): freqs1 = s1.ssFreqs[i1] else: freqs1 = {s1.ssType(i1): 1.0} if hasattr(s2, 'ssFreqs'): freqs2 = s2.ssFreqs[i2] else: freqs2 = {s2.ssType(i2): 1.0} val = 0.0 for ss1, freq1 in freqs1.items(): if ss1 == None: continue for ss2, freq2 in freqs2.items(): if ss2 == None: continue val += freq1 * freq2 * ssMatrix[(ss1, ss2)] return val evaluate = lambda i1, i2: ssFraction * ssEval(i1, i2) + \ simFraction * prevEval(i1, i2) # precompute appropriate gap-open penalties gapOpen1 = [scoreGapOpen] * (len(s1)+1) gapOpen2 = [scoreGapOpen] * (len(s2)+1) if endsAreGaps: if gapOpenOther is not None: gapOpen1[0] = gapOpen2[0] = gapOpenOther else: gapOpen1[0] = gapOpen2[0] = 0 if doingSS and gapOpenOther != None: for seq, gapOpens in [(s1, gapOpen1), (s2, gapOpen2)]: if hasattr(seq, 'gapFreqs'): for i, gapFreq in enumerate(seq.gapFreqs): gapOpens[i+1] = \ gapFreq['H'] * gapOpenHelix + \ gapFreq['S'] * gapOpenStrand + \ gapFreq['O'] * gapOpenOther else: ssTypes = [seq.ssType(i) for i in range(len(seq))] for i, ss in enumerate(ssTypes[:-1]): nextSS = ssTypes[i+1] if ss == nextSS and ss == 'H': gapOpens[i+1] = gapOpenHelix elif ss == nextSS and ss == 'S': gapOpens[i+1] = gapOpenStrand else: gapOpens[i+1] = gapOpenOther colGapStarts = [0] * len(s2) # don't care about column zero for i1 in range(len(s1)): rowGapPos = 0 for i2 in range(len(s2)): best = m[i1][i2] + evaluate(i1, i2) btType = 0 if i2 + 1 < len(s2) or endsAreGaps: colGapPos = colGapStarts[i2] skipSize = i1 + 1 - colGapPos if hasattr(s1, "occupancy"): totOcc = 0.0 for i in range(colGapPos, i1+1): totOcc += s1.occupancy[i] colSkipVal = totOcc * scoreGap else: colSkipVal = skipSize * scoreGap baseColGapVal = m[colGapPos][i2+1] + colSkipVal skip = baseColGapVal + gapOpen2[i2+1] else: skipSize = 1 colSkipVal = 0 skip = m[i1][i2+1] if skip > best: best = skip btType = skipSize if i1 + 1 < len(s1) or endsAreGaps: skipSize = i2 + 1 - rowGapPos if hasattr(s2, "occupancy"): totOcc = 0.0 for i in range(rowGapPos, i2+1): totOcc += s2.occupancy[i] rowSkipVal = totOcc * scoreGap else: rowSkipVal = skipSize * scoreGap baseRowGapVal = m[i1+1][rowGapPos] + rowSkipVal skip = baseRowGapVal + gapOpen1[i1+1] else: skipSize = 1 rowSkipVal = 0 skip = m[i1+1][i2] if skip > best: best = skip btType = 0 - skipSize m[i1+1][i2+1] = best bt[i1+1][i2+1] = btType if btType >= 0: # not gapping the row if best > baseRowGapVal: rowGapPos = i2 + 1 if btType <= 0: # not gapping the column if best > baseColGapVal: colGapStarts[i2] = i1 + 1 if debug: from chimera.selection import currentResidues cr = currentResidues(asDict=True) if cr: for fileName, matrix in [("scores", m), ("trace", bt)]: out = open("/home/socr/a/pett/rm/" + fileName, "w") print>>out, " ", for i2, r2 in enumerate(s2.residues): if r2 not in cr: continue print>>out, "%5d" % i2, print>>out print>>out, " ", for i2, r2 in enumerate(s2.residues): if r2 not in cr: continue print>>out, "%5s" % s2[i2], print>>out for i1, r1 in enumerate(s1.residues): if r1 not in cr: continue print>>out, "%3d" % i1, s1[i1], for i2, r2 in enumerate(s2.residues): if r2 not in cr: continue print>>out, "%5g" % ( matrix[i1+1][i2+1]), print>>out out.close() i1 = len(s1) i2 = len(s2) matchList = [] while i1 > 0 and i2 > 0: btType = bt[i1][i2] if btType == 0: matchList.append((i1-1, i2-1)) i1 = i1 - 1 i2 = i2 - 1 elif btType > 0: i1 = i1 - btType else: i2 = i2 + btType if returnSeqs: return m[len(s1)][len(s2)], matches2gappedSeqs(matchList, s1, s2, gapChar=gapChar) return m[len(s1)][len(s2)], matchList def matches2gappedSeqs(matches, s1, s2, gapChar=".", reverseSorts=True): gapped1 = cloneSeq(s1) gapped2 = cloneSeq(s2) prev1 = prev2 = -1 if reverseSorts: matches.reverse() else: matches.sort() for pos1, pos2 in matches: if pos1 > prev1 + 1 and pos2 > prev2 + 1: gapped1.extend(s1[prev1+1:pos1]) gapped2.extend(gapChar * (pos1 - prev1 - 1)) gapped1.extend(gapChar * (pos2 - prev2 - 1)) gapped1.append(s1[pos1]) gapped2.extend(s2[prev2+1:pos2+1]) else: if pos1 > prev1 + 1: gapped1.extend(s1[prev1+1:pos1+1]) gapped2.extend(gapChar * (pos1 - prev1 - 1)) gapped2.append(s2[pos2]) if pos2 > prev2 + 1: gapped1.extend(gapChar * (pos2 - prev2 - 1)) gapped1.append(s1[pos1]) gapped2.extend(s2[prev2+1:pos2+1]) if pos1 == prev1 + 1 and pos2 == prev2 + 1: gapped1.append(s1[pos1]) gapped2.append(s2[pos2]) prev1, prev2 = pos1, pos2 if prev1 < len(s1) - 1: gapped2.extend(gapChar * (len(s1) - prev1 - 1)) gapped1.extend(s1[prev1+1:]) if prev2 < len(s2) - 1: gapped1.extend(gapChar * (len(s2) - prev2 - 1)) gapped2.extend(s2[prev2+1:]) return gapped1, gapped2 def cloneSeq(seq): from copy import copy clone = copy(seq) if hasattr(clone, "molecule"): name = clone.molecule.name if not clone.name.startswith("principal"): name += ", " + clone.name clone.name = name clone[:] = "" return clone if __name__ == '__main__': print nw("abxxababxxab", "abab", endsAreGaps=True)