#!/opt/chipster/tools/Python-2.7.12/bin/python """ Read a maf file and print the regions covered to a set of bed files (one for each sequence source referenced in the maf). Only blocks with a positive percent identity are written out. TODO: Can this be generalized to be made more useful? usage: %prog bed_outfile_prefix < maf """ from __future__ import division, print_function import sys import bx.align.maf import psyco_full def block_pid( comp1, comp2 ): match = 0 total = 0 t1 = comp1.text.lower() t2 = comp2.text.lower() for i in range( 0, len(t1) ): a, b = t1[i], t2[i] if a == '-' or b == '-': continue elif a == b: match += 1 total += 1 if total == 0: return None return ( match / total ) def main(): out_prefix = sys.argv[1] print(out_prefix) out_files = dict() for block in bx.align.maf.Reader( sys.stdin ): ref_comp = block.components[0] ref_chrom = ref_comp.src.split('.')[1] for comp in block.components[1:]: comp_species, comp_chrom = comp.src.split('.')[:2] if comp_species not in out_files: f = open( "%s%s.bed" % ( out_prefix, comp_species ), "w" ) out_files[comp_species] = f pid = block_pid( ref_comp, comp ) if pid: out_files[comp_species].write( "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n" % ( ref_chrom, ref_comp.forward_strand_start, ref_comp.forward_strand_end, \ comp_chrom, comp.start, comp.end, comp.strand, pid ) ) for f in out_files.values(): f.close() if __name__ == "__main__": main()