#!/usr/bin/env python
'''manipulate fasta for fastq format files.'''

#import built-in modules
import numpy
import re
import sys
from string import maketrans
from optparse import OptionParser
import collections
from qcmodule import FrameKmer
#import third-party modules

#changes to the paths

#changing history to this module


__author__ = "Liguo Wang"
__copyright__ = "Copyright 2010, Wei Li's Lab"
__credits__ = []
__license__ = "GPL"
__version__ = "1.0.1"
__maintainer__ = "Liguo Wang"
__email__ = "liguow@bcm.edu"
__status__ = "Development" #Prototype or Production


class Fasta:
	'''manipulate fasta or fastq format file
	'''
	
	def __init__(self,fastafile=None):
		'''initialize object, lowercase in sequence is automatically converted into uppercase'''
		self.seqs={}
		self.IDs=[]
		self.transtab = maketrans("ACGTNX","TGCANX")
		self.filename = fastafile
		tmpseq=''
		if fastafile is not None:
			for line in open(fastafile,'r'):
				line=line.strip(' \n')
				if line.startswith('>'):
					if(tmpseq):
						self.seqs[name]=tmpseq
					name=line[1:]
					tmpseq =''
					self.IDs.append(name)
					print >>sys.stderr,"\tloading "+name+' ...'
				else:
					tmpseq += line.upper()
			self.seqs[name]=tmpseq
				
	def addSeq(self,id,seq):
		'''add sequence to current data'''
		if self.seqs.has_key(id):
			print >>sys.stderr,id +" already exists!"
			return
		else:
			self.seqs[id]=seq.upper()
			self.IDs.append(id)
			
	def getNames (self,file=None):
		'''return all sequence IDs'''
		return self.IDs
		
	def getSeq(self,seqID=None):
		'''return sequence for sepcified seqID, otherwise all sequences are returned'''
		if seqID is None:
			return self.seqs.values()
		else:
			return self.seqs[seqID]

	def printSeqs(self,n=50):
		'''print all seqs '''
		for k,v in self.seqs.items():
			print '>' + k
			for i in xrange(0, len(v), n):
				print v[i:i+n]

			
	def getSeqLen(self,seqID=None):
		seqlen=collections.defaultdict(dict)
		if seqID is None:
			for (k,v) in self.seqs.items():
				seqlen[k]=len(v)
		else:
			try:
				seqlen[seqID]=len(self.seqs[seqID])
			except:
				print >>sys.stderr,"Not found"
		return seqlen
			
	def countBase(self,pattern=None):
		'''count occurence of substring (defined by pattern), otherwise count A,C,G,T,N,X
		NOTE: pattern is counted non-overlappingly'''
		if pattern is None:
			print "ID\tTotal\tA\tC\tG\tT\tN\tX"
			for (k,v) in self.seqs.items():
				print k+"\t",
				print len(v),"\t",
				print str(v.count('A'))+"\t",
				print str(v.count('C'))+"\t",
				print str(v.count('G'))+"\t",
				print str(v.count('T'))+"\t",
				print str(v.count('N'))+"\t",
				print v.count('X')
		else:
			for (k,v) in self.seqs.items():
				print k+"\t",
				print str(len(v))+"\t",
				print v.count(pattern)
	def cal_entropy(self,l=3):
		'''calculate entropy for each sequence'''
		for (id,seq) in self.seqs.items():
			entropy = 0
			dna_chars_uniq = FrameKmer.all_possible_kmer(l)
			dna_len = len(seq)
			
			for c in dna_chars_uniq:
				if 'N' in c:
					continue
				prop = seq.count(c)/(1.0*dna_len)
				if prop ==0:
					continue
				information = numpy.log2(1.0/prop)
				entropy += prop * information
			yield(id, entropy)
		
	def revComp(self,seqID=None):
		'''return reverse-complemented sequence for sepcified seqID, otherwise all sequences are 
		reverse-complemented'''
		if seqID is None:
			for (k,v) in self.seqs.items():
				print ">" + k + "_rev"
				tmp = v.translate(self.transtab)
				return tmp[::-1]			
		else:
			return self.seqs[seqID].translate(self.transtab)[::-1]

			
	def getUniqSeqs(self):
		'''remove redundancy from original fasta files.
		duplicated sequences will be only report once'''

		seq2Name={}
		seq2Count={}
		for (key,value) in self.seqs.items():
			seq2Name[value]=key
			if seq2Count.has_key(value):
				seq2Count[value]+=1
			else:
				seq2Count[value]=1
		for value in seq2Name.keys():
				print '>'+ str(seq2Name[value]) + '_' + str(seq2Count[value])
				print value


	def findPattern(self,pat,outfile,seqID=None,rev=True):
		''' find pattern in all sequence unless seqID is specified, coordinates will be returned as bed format file'''
		
		fout=open(outfile,'w')
		length=len(pat)	

		Pat=pat.upper()
		start=0
		
		
		if seqID is None:
			for (k,v) in self.seqs.items():
				loopSwitch=0
				start=0
				while loopSwitch !=-1:
					loopSwitch = v.find(Pat,start)
					if loopSwitch !=-1:
						print >>fout,k + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t+" 
						start = loopSwitch +1
					
		else:
			loopSwitch=0
			start=0
			while loopSwitch !=-1:
				loopSwitch = self.seqs[seqID].find(Pat,start)
				print >>fout, seqID + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t+" 
				start = loopSwitch +1

		if rev==True:
			Pat_rev=Pat.translate(self.transtab)[::-1]
			if seqID is None:
				for (k,v) in self.seqs.items():
					loopSwitch=0
					start=0
					while loopSwitch !=-1:
						loopSwitch = v.find(Pat_rev,start)
						if loopSwitch !=-1:
							print >>fout,k + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t-" 
							start = loopSwitch +1
						
			else:
				loopSwitch=0
				start=0
				while loopSwitch !=-1:
					loopSwitch = self.seqs[seqID].find(Pat_rev,start)
					print >>fout, seqID + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t-" 
					start = loopSwitch +1		

	def fetchSeq(self,chr=None,st=None,end=None,infile=None,outfile=None):
		''' Fetching sequence based on chrName (should be exactly the same as fasta file), St, End. 
		NOTE: the coordinate is 0-based,half-open. use infile to specify multiple coordinates. infile
		should be bed3, bed6 or bed12'''

		if (infile is not None) and (outfile is not None):
			fout=open(outfile,'w')
			for line in open(infile):
				fields=line.strip().split()
				if (len(fields)==3):
					print >>fout,fields[0]+":"+fields[1]+"-"+fields[2]+"\t"+"strand=+"
					print >>fout,self.seqs[fields[0]][int(fields[1]):int(fields[2])].upper()
				elif (len(fields)>3):
					if fields[5]=='-':
						print >>fout,fields[0]+":"+fields[1]+"-"+fields[2]+"\t"+"strand=-"
						print >>fout,self.seqs[fields[0]][int(fields[1]):int(fields[2])].translate(self.transtab)[::-1].upper()
					else:
						print >>fout,fields[0]+":"+fields[1]+"-"+fields[2]+"\t"+"strand=+"
						print >>fout,self.seqs[fields[0]][int(fields[1]):int(fields[2])].upper()
		else:
			try:
				return self.seqs[chr][st:end].upper()
			except:
				print >>sys.stderr, "cannot fetch sequence from " + self.filename + " for " + chr + ":" + str(st) + "-" + str(end)
				return ''
				#print >>sys.stderr, chr + "\t" + str(st) +'\t' + str(end) + "  Please input chr,st,end"
						
def main():
	parser = OptionParser()
	parser.add_option("-i","--input_file",dest="in_file",help="input file name")
	(options,args)=parser.parse_args()
	obj=Fasta(options.in_file)
	obj.printSeqs(n=80)

if __name__ == "__main__":
    main()