#!/usr/bin/env perl # # Copyright (C) 2012-2014 Genome Research Ltd. # # Author: Petr Danecek # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. # Dependencies: # - matplotlib # http://matplotlib.sourceforge.net # - LaTex/xcolor.sty # Download .sty.gz LaTeX class style from http://www.ukern.de/tex/xcolor.html, # unpack and install system-wide or place elsewhere and make available by # setting the TEXINPUTS environment variable (note the colon) # export TEXINPUTS=some/dir: # The list of the recognised path can be obtained from `kpsepath tex` # # use strict; use warnings; use Carp; use Storable qw(dclone); my $opts = parse_params(); parse_vcfstats($opts); merge_vcfstats($opts) if @{$$opts{vcfstats}} > 1; if ( $$opts{make_plots} ) { init_plots($opts); plot_venn_bars($opts); plot_counts_by_AF($opts); plot_overlap_by_AF($opts); plot_concordance_by_AF($opts); plot_concordance_by_sample($opts); for my $id (file_ids($opts)) { # irc plots will be deprecated # plot_irc_by_AF($opts,$id); # plot_irc_by_rlen($opts,$id); plot_tstv_by_AF($opts,$id); plot_tstv_by_QUAL($opts,$id); plot_indel_distribution($opts,$id); plot_substitutions($opts,$id); plot_per_sample_stats($opts,$id); plot_DP($opts,$id); plot_hwe($opts,$id); } plot($opts); } create_pdf($opts) unless !$$opts{make_pdf}; exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "Usage: plot-vcfstats [OPTIONS] file.chk ...\n", " plot-vcfstats -p outdir/ file.chk ...\n", "Options:\n", " -m, --merge Merge vcfstats files to STDOUT, skip plotting.\n", " -p, --prefix The output files prefix, add a slash to create new directory.\n", " -P, --no-PDF Skip the PDF creation step.\n", " -r, --rasterize Rasterize PDF images for fast rendering.\n", " -s, --sample-names Use sample names for xticks rather than numeric IDs.\n", " -t, --title Identify files by these titles in plots. Can be given multiple times.\n", " -T, --main-title Main title for the PDF.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; my $opts = { pdf_plots => 1, use_sample_names => 0, verbose => 1, make_pdf => 1, make_plots => 1, merge => 0, args => join(' ',$0,@ARGV), img_width => 11/2.54, img_height => 10/2.54, id2col => [ 'orange', 'red', 'darkgreen' ], tex => { slide3v => { height1 => '7cm', height2 => '7cm', height3 => '4.5cm' }, slide3h => { width1 => '15cm', width2 => '10cm', width3 => '8cm' }, }, # for file version sanity check sections => [ { id=>'ID', header=>'Definition of sets', exp=>"# ID\t[2]id\t[3]tab-separated file names" }, { id=>'SN', header=>'SN, Summary numbers', exp=>"# SN [2]id [3]key [4]value" }, { id=>'TSTV', header=>'# TSTV, transition/transversions:', exp=>"# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)" }, { id=>'SiS', header=>'Sis, Singleton stats', exp=>"# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable" }, { id=>'AF', header=>'AF, Stats by non-reference allele frequency', exp=>"# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable" }, { id=>'IDD', header=>'IDD, InDel distribution', exp=>"# IDD\t[2]id\t[3]length (deletions negative)\t[4]count" }, { id=>'ST', header=>'ST, Substitution types', exp=>"# ST\t[2]id\t[3]type\t[4]count" }, { id=>'GCsAF', header=>'GCsAF, Genotype concordance by non-reference allele frequency (SNPs)', exp=>"# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites" }, { id=>'GCiAF', header=>'GCiAF, Genotype concordance by non-reference allele frequency (indels)', exp=>"# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites" }, { id=>'NRDs', header=>'Non-Reference Discordance (NRD), SNPs', exp=>"# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance" }, { id=>'NRDi', header=>'Non-Reference Discordance (NRD), indels', exp=>"# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance" }, { id=>'GCsS', header=>'GCsS, Genotype concordance by sample (SNPs)', exp=>"# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches" }, { id=>'GCiS', header=>'GCiS, Genotype concordance by sample (indels)', exp=>"# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches" }, { id=>'PSC', header=>'PSC, Per-sample counts', exp=>"# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" }, { id=>'PSI', header=>'PSI, Per-sample Indels', exp=>"# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio" }, { id=>'DP', header=>'DP, Depth distribution', exp=>"# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%)\t[6]number of sites\t[7]fraction of sites (%)" }, { id=>'FS', header=>'FS, Indel frameshifts', exp=>"# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)" }, { id=>'ICS', header=>'ICS, Indel context summary', exp=>"# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio" }, { id=>'ICL', header=>'ICL, Indel context by length', exp=>"# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio" }, { id=>'QUAL', header=>'QUAL, Stats by quality', exp=>"# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels" }, { id=>'HWE', header=>'HWE', exp=>"# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile", }, ], }; for my $sec (@{$$opts{sections}}) { $$opts{exp}{$$sec{id}} = $$sec{exp}; $$opts{id2sec}{$$sec{id}} = $sec; } while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '--no-plots' ) { $$opts{make_plots}=0; next; } if ( $arg eq '-P' || $arg eq '--no-PDF' ) { $$opts{make_pdf}=0; next; } if ( $arg eq '-r' || $arg eq '--rasterize' ) { $$opts{rasterize}=1; $$opts{pdf_plots} = 0; next; } if ( $arg eq '-m' || $arg eq '--merge' ) { $$opts{make_plots}=0; $$opts{make_pdf}=0; $$opts{merge}=1; next; } if ( $arg eq '-s' || $arg eq '--sample-names' ) { $$opts{use_sample_names}=1; next; } if ( $arg eq '-t' || $arg eq '--title' ) { push @{$$opts{titles}},shift(@ARGV); next; } if ( $arg eq '-T' || $arg eq '--main-title' ) { $$opts{main_title} = shift(@ARGV); next; } if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{vcfstats}},$arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{vcfstats}) ) { error(); } if ( !exists($$opts{prefix}) ) { if ( !$$opts{merge} ) { error("Expected -p parameter.\n") } $$opts{prefix} = './'; } elsif ( $$opts{merge} ) { error("Only one of -p or -m should be given.\n"); } if ( $$opts{merge} && @{$$opts{vcfstats}} < 2 ) { error("Nothing to merge\n") } $$opts{dir} = $$opts{prefix}; $$opts{dir} =~ s{/[^/]+$}{/}; if ( !($$opts{dir}=~m{/$}) ) { $$opts{dir} = './'; } if ( !($$opts{prefix}=~m{/$}) && !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } $$opts{lprefix} = $$opts{prefix}; $$opts{lprefix} =~ s{^.+/}{}; $$opts{logfile} = "$$opts{lprefix}plot-vcfstats.log"; if ( !-d $$opts{dir} ) { `mkdir -p $$opts{dir}`; } `> $$opts{dir}$$opts{logfile}` unless $$opts{merge}; return $opts; } sub plot { my ($opts) = @_; if ( !exists($$opts{plt_fh}) ) { return; } close($$opts{plt_fh}) or error("close $$opts{plt_fh}"); my $cmd = "python $$opts{plt_file}"; print STDERR "Plotting graphs: $cmd\n" unless !$$opts{verbose}; system($cmd); if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } } sub parse_vcfstats { my ($opts) = @_; for (my $i=0; $i<@{$$opts{vcfstats}}; $i++) { parse_vcfstats1($opts,$i); } # Check sanity if ( !exists($$opts{dat}{ID}{0}) ) { error("Sanity check failed: no stats found by vcfstats??\n"); } # Set titles my %file2title; my %title2file; if ( exists($$opts{titles}) ) { for (my $i=0; $i<@{$$opts{titles}}; $i++) { if ( !exists($$opts{dat}{ID}{$i}) ) { next; } $file2title{$$opts{dat}{ID}{$i}[0][0]} = $$opts{titles}[$i]; $title2file{$$opts{titles}[$i]} = $$opts{dat}{ID}{$i}[0][0]; } } for my $id (file_ids($opts)) { if ( @{$$opts{dat}{ID}{$id}[0]}>1 ) { next; } # shared stats of two files my $file = $$opts{dat}{ID}{$id}[0][0]; if ( !exists($file2title{$file}) ) # create short title { my $bname = $file; $bname =~ s{^.*/}{}; $bname =~ s{\.vcf\.gz$}{}i; if ( length($bname) > 5 ) { $bname = substr($bname,0,5); } my $i = 0; my $title = $bname; while ( exists($title2file{$title}) ) { $title = $bname.chr(66+$i); $i++; } $file2title{$file} = $title; $title2file{$title} = $file; } } for my $id (file_ids($opts)) { my @titles; for my $file (@{$$opts{dat}{ID}{$id}[0]}) { push @titles, $file2title{$file} if $file2title{$file}; } $$opts{title}{$id} = join(' + ',@titles); } # mapping from file names to list of IDs for my $id (file_ids($opts)) { my @files; for my $file (@{$$opts{dat}{ID}{$id}[0]}) { push @{$$opts{file2ids}{$file}}, $id; } } # check sanity of the file merge: were the correct files merged? if ( exists($$opts{coalesced_files}) && $$opts{verbose} ) { print STDERR "The vcfstats outputs have been merged as follows:\n"; my %printed; for my $id (keys %{$$opts{coalesced_files}}) { for (my $i=0; $i<@{$$opts{coalesced_files}{$id}}; $i++) { for (my $j=0; $j<@{$$opts{coalesced_files}{$id}[$i]}; $j++) { if ( exists($printed{$$opts{dat}{ID}{$id}[$i][$j]}) ) { next; } print STDERR "\t$$opts{dat}{ID}{$id}[$i][$j]\n"; for my $file (keys %{$$opts{coalesced_files}{$id}[$i][$j]}) { my $n = $$opts{coalesced_files}{$id}[$i][$j]{$file}; print STDERR "\t\t$file", ($n>1 ? "\t..\t${n}x" : ''),"\n"; } $printed{$$opts{dat}{ID}{$id}[$i][$j]} = 1; } } } } } sub add_to_values { my ($dst,$src,$cmp) = @_; my $id = 0; my $is = 0; while ($is<@$src) { while ( $id<@$dst && &$cmp($$src[$is][0],$$dst[$id][0])>0 ) { $id++; } if ( $id<@$dst && !&$cmp($$src[$is][0],$$dst[$id][0]) ) { for (my $j=1; $j<@{$$src[$is]}; $j++) { $$dst[$id][$j] += $$src[$is][$j]; } } else { splice(@$dst,$id,0,$$src[$is]); } $is++; } } sub add_to_sample_values { my ($dst,$src) = @_; my %id2i; for (my $i=0; $i<@$dst; $i++) { $id2i{$$dst[$i][0]} = $i; } for (my $i=0; $i<@$src; $i++) { if ( !exists($id2i{$$src[$i][0]}) ) { error("Whoops, no such dst sample: $$src[$i][0]\n"); } my $di = $id2i{$$src[$i][0]}; for (my $j=1; $j<@{$$src[$i]}; $j++) { $$dst[$di][$j] += $$src[$i][$j]; } } } sub merge_PSC { my ($a,$b,$n) = @_; for (my $i=0; $i<@$a; $i++) { $$a[$i][7] *= $n; } # average DP add_to_sample_values($a,$b); for (my $i=0; $i<@$a; $i++) { $$a[$i][7] /= $n+1; } } sub merge_PSI { my ($a,$b,$n) = @_; add_to_sample_values($a,$b); for (my $i=0; $i<@$b; $i++) { $$a[$i][4] = sprintf "%.2f", ($$a[$i][1]+$$a[$i][2] ? $$a[$i][2]/($$a[$i][1]+$$a[$i][2]) : 0); } } sub rglob { my ($a,$b) = @_; if ( $a eq $b ) { return $a; } $a =~ s/\*//; my $la = length($a); my $lb = length($b); my $i = 0; while ( $i<$la && $i<$lb && substr($a,$i,1) eq substr($b,$i,1) ) { $i++; } $la--; $lb--; while ( $la>$i && $lb>$i && substr($a,$la,1) eq substr($b,$lb,1) ) { $la--; $lb--; } $la = $la==$i && $lb==$i ? 1 : $la-$i; substr($a,$i,$la,'*'); return $a; } sub merge_id { # merge id filenames my ($opts,$dst,$src,$id) = @_; for (my $i=0; $i<@{$$src{$id}}; $i++) { for (my $j=0; $j<@{$$src{$id}[$i]}; $j++) { my $gname = rglob($$dst{$id}[$i][$j], $$src{$id}[$i][$j]); $$dst{$id}[$i][$j] = $gname; $$opts{coalesced_files}{$id}[$i][$j]{$$src{$id}[$i][$j]}++; } } } sub add_to_avg { my ($dst,$src,$n) = @_; for (my $i=0; $i<@$src; $i++) { if ( ref($$dst[$i]) eq 'ARRAY' ) { for (my $j=0; $j<@{$$dst[$i]}; $j++) { $$dst[$i][$j] = ($n*$$dst[$i][$j]+$$src[$i][$j])/($n+1); } } else { $$dst[$i] = ($n*$$dst[$i]+$$src[$i])/($n+1); } } } sub cmp_str($$) { my ($a,$b) = @_; return $a cmp $b; } sub cmp_num($$) { my ($a,$b) = @_; return $a <=> $b; } sub cmp_num_op($$) { # numeric compare with operators # Cases like <3, >500 make it complicated my ($a,$b) = @_; my $xa = '='; my $xb = '='; if ( $a=~/^(\D+)/ ) { $xa = $1; $a = $'; } if ( $b=~/^(\D+)/ ) { $xb = $1; $b = $'; } if ( $a==$b ) { return $xa cmp $xb; } $a <=> $b; } sub merge_dp { my ($a,$b) = @_; add_to_values($a,$b,\&cmp_num_op); # recalculate fraction of GTs, cannot be simply summed my $sum = 0; for (my $i=0; $i<@$a; $i++) { $sum += $$a[$i][1]; } for (my $i=0; $i<@$a; $i++) { $$a[$i][2] = $$a[$i][1]*100./$sum; } } sub merge_GCsS { my ($a,$b,$n) = @_; # average the non-ref discordance rate for (my $i=0; $i<@$a; $i++) { $$a[$i][1] *= $n; } add_to_sample_values($a,$b); for (my $i=0; $i<@$a; $i++) { $$a[$i][1] /= $n+1; } } sub merge_FS { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=0; $j<3; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][3] = sprintf "%.2f", ($$a[$i][0] + $$a[$i][1]) ? $$a[$i][1]/($$a[$i][0] + $$a[$i][1]) : 0; for (my $j=4; $j<7; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][7] = sprintf "%.2f", ($$a[$i][4] + $$a[$i][5]) ? $$a[$i][5]/($$a[$i][4] + $$a[$i][5]) : 0; } } sub merge_ICS { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=0; $j<3; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][3] = sprintf "%.4f", ($$a[$i][0] + $$a[$i][1]) ? $$a[$i][0]/($$a[$i][0] + $$a[$i][1]) : 0; } } sub merge_ICL { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=1; $j<5; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][5] = sprintf "%.4f", ($$a[$i][2] + $$a[$i][4]) ? ($$a[$i][1] + $$a[$i][3])/($$a[$i][1] + $$a[$i][2] + $$a[$i][3] + $$a[$i][4]) : 0; } } sub merge_TSTV { my ($a,$b) = @_; for (my $i=0; $i<@$a; $i++) { for (my $j=0; $j<2; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][2] = sprintf "%.2f", $$a[$i][1] ? $$a[$i][0]/$$a[$i][1] : 0; for (my $j=3; $j<5; $j++) { $$a[$i][$j] += $$b[$i][$j]; } $$a[$i][5] = sprintf "%.2f", $$a[$i][4] ? $$a[$i][3]/$$a[$i][4] : 0; } } sub merge_GCsAF { my ($a,$b) = @_; # recalculate r2 for (my $i=0; $i<@$a; $i++) { $$a[$i][7] *= $$a[$i][8]; } for (my $i=0; $i<@$b; $i++) { $$b[$i][7] *= $$b[$i][8]; } add_to_values($a,$b,\&cmp_num_op); for (my $i=0; $i<@$a; $i++) { $$a[$i][7] /= $$a[$i][8]; } } sub parse_vcfstats1 { my ($opts,$i) = @_; my $file = $$opts{vcfstats}[$i]; print STDERR "Parsing bcftools stats output: $file\n" unless !$$opts{verbose}; open(my $fh,'<',$file) or error("$file: $!"); my $line = <$fh>; if ( !$line or !($line=~/^# This file was produced by \S*/) ) { error("Sanity check failed: was this file generated by bcftools stats?"); } my %dat; while ($line=<$fh>) { $line =~ s/\s*$//; if ( $line=~/^#\s+(\S+)\t/ ) { $$opts{def_line}{$1} = $line; next; } if ( $line=~/^#/ ) { next; } my @items = split(/\t/,$line); if ( $items[0] eq 'SN' ) { $dat{$items[1]}{$items[2]} = splice(@items,3); next; } push @{$dat{$items[0]}{$items[1]}}, [splice(@items,2)]; } close($fh); for my $a (keys %dat) { if ( !exists($$opts{dat}{$a}) ) { $$opts{dat}{$a} = $dat{$a}; next; } # first vcfstats file for my $b (keys %{$dat{$a}}) { # Merging multiple vcfstats files. Honestly, this is quite hacky. if ( !exists($$opts{dat}{$a}{$b}) ) { $$opts{dat}{$a}{$b} = $dat{$a}{$b}; next; } # copy all, first occurance if ( $a eq 'ID' ) { merge_id($opts,$$opts{dat}{$a},$dat{$a},$b); } elsif ( ref($dat{$a}{$b}) ne 'ARRAY' ) { $$opts{dat}{$a}{$b} += $dat{$a}{$b} unless $b eq 'number of samples:'; } # SN, Summary numbers, do not sum sample counts elsif ( $a eq 'NRDs' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'NRDi' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'DP' ) { merge_dp($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'GCsS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'GCiS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'GCsAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'GCiAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'ST' ) { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_str); } elsif ( $a eq 'PSC') { merge_PSC($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'PSI') { merge_PSI($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'IDD') { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num); } elsif ( $a eq 'FS') { merge_FS($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'ICS') { merge_ICS($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'ICL') { merge_ICL($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'TSTV') { merge_TSTV($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'DBG' ) { next; } else { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num_op); } # SiS AF IDD } } } sub file_ids { my ($opts) = @_; my $id = 0; my @out; while ( exists($$opts{dat}{ID}) && exists($$opts{dat}{ID}{$id}) ) { push @out, $id++; } return @out; } sub tprint { my ($fh,@txt) = @_; for my $txt (@txt) { $txt =~ s/\n[ \t]+/\n/g; # eat leading tabs while ( ($txt =~ /\n\t*\\t/) ) { $txt =~ s/(\n\t*)\\t/$1\t/g; # replace escaped tabs (\\t) with tabs } print $fh $txt; } } sub init_plots { my ($opts) = @_; $$opts{plt_file} = "$$opts{prefix}plot.py"; my $titles = "# Title abbreviations:\n"; for my $id (file_ids($opts)) { $titles .= "# \t $id .. $$opts{title}{$id} .. $$opts{dat}{ID}{$id}[0][0]\n"; } $titles .= "#"; open(my $fh,'>',$$opts{plt_file}) or error("$$opts{plt_file}: $!"); tprint $fh, qq[ # This file was produced by plot-vcfstats, the command line was: # $$opts{args} # # Edit as necessary and recreate the plots by running # python $$opts{plt_file} # $titles # Set to 1 to plot in PDF instead of PNG pdf_plots = $$opts{pdf_plots} # Plots to generate, set to 0 to disable plot_venn_snps = 1 plot_venn_indels = 1 plot_tstv_by_sample = 1 plot_hethom_by_sample = 1 plot_snps_by_sample = 1 plot_indels_by_sample = 1 plot_singletons_by_sample = 1 plot_depth_by_sample = 1 plot_SNP_count_by_af = 1 plot_Indel_count_by_af = 1 plot_SNP_overlap_by_af = 1 plot_Indel_overlap_by_af = 1 plot_dp_dist = 1 plot_hwe = 1 plot_concordance_by_af = 1 plot_r2_by_af = 1 plot_discordance_by_sample = 1 plot_tstv_by_af = 1 plot_indel_dist = 1 plot_tstv_by_qual = 1 plot_substitutions = 1 # Set to 1 to use sample names for xticks instead of numeric sequential IDs # and adjust margins and font properties if necessary sample_names = $$opts{use_sample_names} sample_margins = {'right':0.98, 'left':0.07, 'bottom':0.2} sample_font = {'rotation':45, 'ha':'right', 'fontsize':8} if sample_names==0: sample_margins=(); sample_font=(); #------------------------------------------------- import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt import csv csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE) import numpy def smooth(x,window_len=11,window='hanning'): \\tif x.ndim != 1: raise ValueError, "smooth only accepts 1 dimension arrays." \\tif x.size < window_len: return x \\tif window_len<3: return x \\tif not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']: raise ValueError, "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'" \\ts = numpy.r_[x[window_len-1:0:-1],x,x[-1:-window_len:-1]] \\tif window == 'flat': # moving average \\t\\tw = numpy.ones(window_len,'d') \\telse: \\t\\tw = eval('numpy.'+window+'(window_len)') \\ty = numpy.convolve(w/w.sum(),s,mode='valid') \\treturn y[(window_len/2-1):-(window_len/2)] ]; $$opts{plt_fh} = $fh; } sub percentile { my ($p,@vals) = @_; my $N = 0; for my $val (@vals) { $N += $val; } my $n = $p*($N+1)/100.; my $k = int($n); my $d = $n-$k; if ( $k<=0 ) { return 0; } if ( $k>=$N ) { return scalar @vals-1; } my $cnt; for (my $i=0; $i<@vals; $i++) { $cnt += $vals[$i]; if ( $cnt>=$k ) { return $i; } } error("FIXME: this should not happen [percentile]\n"); } sub get_values { my ($opts,$id,$key,$i,$j) = @_; if ( !exists($$opts{dat}{$key}) ) { return (); } if ( !exists($$opts{dat}{$key}{$id}) ) { return (); } my $fields_ok = 1; if ( !exists($$opts{exp}{$key}) ) { error("todo: sanity check for $key\n"); } if ( exists($$opts{def_line}{$key}) && $$opts{def_line}{$key} ne $$opts{exp}{$key} && !$$opts{def_line_warned}{$key} ) { warn("Warning: Possible version mismatch, the definition line differs\n\texpected: $$opts{exp}{$key}\n\tfound: $$opts{def_line}{$key}\n"); $$opts{def_line_warned}{$key} = 1; } if ( defined $i ) { if ( defined $j ) { return $$opts{dat}{$key}{$id}[$i][$j]; } return (@{$$opts{dat}{$key}{$id}[$i]}); } return (@{$$opts{dat}{$key}{$id}}); } sub get_value { my ($opts,$id,$key) = @_; if ( !exists($$opts{dat}{$id}) ) { return undef; } if ( !exists($$opts{dat}{$id}{$key}) ) { return undef} return $$opts{dat}{$id}{$key}; } sub plot_venn_bars { my ($opts) = @_; my @ids = file_ids($opts); if ( @ids != 3 ) { return; } my (@snps,@indels,@tstv,@snp_titles,@indel_titles); for my $id (0..2) { push @snps, get_value($opts,$id,'number of SNPs:'); push @indels, get_value($opts,$id,'number of indels:'); push @tstv, sprintf("%.2f",get_values($opts,$id,'TSTV',0,5)); push @snp_titles, "$$opts{title}{$id}\\nts/tv $tstv[$id]\\n" .bignum($snps[$id]); my @fs = get_values($opts,$id,'FS'); my $fs = @fs ? "frm $fs[0][3]\\n" : ''; push @indel_titles, "$$opts{title}{$id}\\n$fs" .bignum($indels[$id]); } my $fh = $$opts{plt_fh}; tprint $fh, qq[ if plot_venn_snps: \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax1.bar([1,2,3],[$snps[0],$snps[2],$snps[1]],align='center',color='$$opts{id2col}[0]',width=0.3) \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tax1.set_xlim(0.5,3.5) \\tplt.xticks([1,2,3],('$snp_titles[0]','$snp_titles[2]','$snp_titles[1]')) \\tplt.title('Number of SNPs') \\tplt.subplots_adjust(right=0.95,bottom=0.15) \\tplt.savefig('$$opts{prefix}venn_bars.snps.png') \\tif pdf_plots: plt.savefig('$$opts{prefix}venn_bars.snps.pdf') \\tplt.close() if plot_venn_indels: \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax1.bar([1,2,3],[$indels[0],$indels[2],$indels[1]],align='center',color='$$opts{id2col}[1]',width=0.3) \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tax1.set_xlim(0.5,3.5) \\tplt.xticks([1,2,3],('$indel_titles[0]','$indel_titles[2]','$indel_titles[1]')) \\tplt.title('Number of indels') \\tplt.subplots_adjust(right=0.95,bottom=0.15) \\tplt.savefig('$$opts{prefix}venn_bars.indels.png') \\tif pdf_plots: plt.savefig('$$opts{prefix}venn_bars.indels.pdf') \\tplt.close() ]; } sub plot_per_sample_stats { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'PSC'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}tstv_by_sample.$id"; my $img2 = "$$opts{prefix}hets_by_sample.$id"; my $img3 = "$$opts{prefix}snps_by_sample.$id"; my $img4 = "$$opts{prefix}indels_by_sample.$id"; my $img5 = "$$opts{prefix}singletons_by_sample.$id"; my $img6 = "$$opts{prefix}dp_by_sample.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Sample ID\t[2]ts/tv\t[3]het/hom\t[4]nSNPs\t[5]nIndels\t[6]Average depth\t[7]nSingletons\t[8]Sample name\n"; for (my $i=0; $i<@vals; $i++) { my $tstv = $vals[$i][5] ? $vals[$i][4]/$vals[$i][5] : 0; my $hethom = $vals[$i][2] ? $vals[$i][3]/$vals[$i][2] : 0; printf $tfh "%d\t%f\t%f\t%d\t%d\t%f\t%d\t%s\n", $i, $tstv, $hethom, $vals[$i][4]+$vals[$i][5], $vals[$i][6], $vals[$i][7], $vals[$i][8], $vals[$i][0]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append(row) if plot_tstv_by_sample: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[1] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') \\tax1.set_ylabel('Ts/Tv') \\tax1.set_ylim(min(float(row[1]) for row in dat)-0.1,max(float(row[1]) for row in dat)+0.1) \\tif sample_names: \\t\\t plt.xticks([row[0] for row in dat],[row[7] for row in dat],**sample_font) \\t\\t plt.subplots_adjust(**sample_margins) \\telse: \\t\\t plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) \\t\\t ax1.set_xlabel('Sample ID') \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() if plot_hethom_by_sample: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[2] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') \\tax1.set_ylabel('nHet(RA) / nHom(AA)') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tif sample_names: \\t\\t plt.xticks([row[0] for row in dat],[row[7] for row in dat],**sample_font) \\t\\t plt.subplots_adjust(**sample_margins) \\telse: \\t\\t plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) \\t\\t ax1.set_xlabel('Sample ID') \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img2.png') \\tif pdf_plots: plt.savefig('$img2.pdf') \\tplt.close() if plot_snps_by_sample: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[3] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') \\tax1.set_ylabel('Number of SNPs') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tif sample_names: \\t\\t plt.xticks([row[0] for row in dat],[row[7] for row in dat],**sample_font) \\t\\t plt.subplots_adjust(**sample_margins) \\telse: \\t\\t plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) \\t\\t ax1.set_xlabel('Sample ID') \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img3.png') \\tif pdf_plots: plt.savefig('$img3.pdf') \\tplt.close() if plot_indels_by_sample: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[4] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') \\tax1.set_ylabel('Number of indels') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tif sample_names: \\t\\t plt.xticks([row[0] for row in dat],[row[7] for row in dat],**sample_font) \\t\\t plt.subplots_adjust(**sample_margins) \\telse: \\t\\t plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) \\t\\t ax1.set_xlabel('Sample ID') \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img4.png') \\tif pdf_plots: plt.savefig('$img4.pdf') \\tplt.close() if plot_singletons_by_sample: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[6] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') \\tax1.set_ylabel('Number of singletons') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tif sample_names: \\t\\t plt.xticks([row[0] for row in dat],[row[7] for row in dat],**sample_font) \\t\\t plt.subplots_adjust(**sample_margins) \\telse: \\t\\t plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) \\t\\t ax1.set_xlabel('Sample ID') \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img5.png') \\tif pdf_plots: plt.savefig('$img5.pdf') \\tplt.close() if plot_depth_by_sample: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[5] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') \\tax1.set_ylabel('Average depth') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tif sample_names: \\t\\t plt.xticks([row[0] for row in dat],[row[7] for row in dat],**sample_font) \\t\\t plt.subplots_adjust(**sample_margins) \\telse: \\t\\t plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) \\t\\t ax1.set_xlabel('Sample ID') \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img6.png') \\tif pdf_plots: plt.savefig('$img6.pdf') \\tplt.close() "; } sub plot_DP { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'DP'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}depth.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Depth\t[2]Cumulative number of genotypes\t[3]Number of genotypes\n"; my $sum = 0; for (my $i=0; $i<@vals; $i++) { if ( $sum>99. ) { last; } if ( !($vals[$i][0]=~/^\d+$/) ) { next; } # DP ">500" case $sum += $vals[$i][2]; printf $tfh "%d\t%f\t%f\n", $vals[$i][0], $sum, $vals[$i][2]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append(row) if plot_dp_dist: \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[2] for row in dat], '-^', color='k') \\tax1.set_ylabel('Number of genotypes [%]',color='k') \\tax1.set_xlabel('Depth') \\tax2 = ax1.twinx() \\tax2.plot([row[0] for row in dat], [row[1] for row in dat], '-o', color='$$opts{id2col}[$id]') \\tax2.set_ylabel('Cumulative number of genotypes [%]',color='$$opts{id2col}[$id]') \\tfor tl in ax2.get_yticklabels(): tl.set_color('$$opts{id2col}[$id]') \\tplt.subplots_adjust(left=0.15,bottom=0.15,right=0.87) \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub plot_hwe { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'HWE'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}hwe.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele Frequency\t[2]Depth\t[3]Number of hets (median)\t[4]Number of hets (25-75th percentile)\n"; for (my $i=0; $i<@vals; $i++) { if ( !$vals[$i][1] ) { next; } print $tfh join("\t", @{$vals[$i]}), "\n"; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append(row) if plot_hwe and len(dat)>1: \\tx = [float(row[0]) for row in dat] \\ty1 = smooth(numpy.array([float(row[2]) for row in dat]),40,'hanning') \\ty2 = smooth(numpy.array([float(row[3]) for row in dat]),40,'hanning') \\ty3 = smooth(numpy.array([float(row[4]) for row in dat]),40,'hanning') \\tdp = smooth(numpy.array([float(row[1]) for row in dat]),40,'hanning') \\thwe = [] \\tfor af in x: hwe.append(2*af/100.*(1-af/100.)) \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tplots = ax1.plot(x,hwe,'--',color='#ff9900',label='HWE') \\tplots += ax1.plot(x,y2,color='#ff9900',label='Median') \\tplots += ax1.plot(x,y3,color='#ffe0b2',label='25-75th percentile') \\tax1.fill_between(x,y1,y3, facecolor='#ffeacc',edgecolor='#ffe0b2') \\tax1.set_ylabel('Fraction of hets',color='#ff9900') \\tax1.set_xlabel('Allele frequency [%]') \\tfor tl in ax1.get_yticklabels(): tl.set_color('#ff9900') \\tax2 = ax1.twinx() \\tplots += ax2.plot(x,dp, 'k', label='Number of sites') \\tax2.set_ylabel('Number of sites') \\tax2.set_yscale('log') \\tlabels = [l.get_label() for l in plots] \\tplt.legend(plots,labels,numpoints=1,markerscale=2,loc='center',prop={'size':9},frameon=False) \\tplt.subplots_adjust(left=0.15,bottom=0.15,right=0.86) \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub plot_irc_by_rlen { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'ICL'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}irc_by_rlen.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Repeat type\t[2]Total count\t[3]repeat consistency: deletions\t[4]repeat consistency: insertions\n"; for (my $i=0; $i<@vals; $i++) { printf $tfh "%d\t%d\t%f\t%f\n", $vals[$i][0], # repeat type $vals[$i][1]+$vals[$i][2]+$vals[$i][3]+$vals[$i][4], # total count $vals[$i][2] ? $vals[$i][1]/($vals[$i][1]+$vals[$i][2]) : 0, # IRC of deletions $vals[$i][4] ? $vals[$i][3]/($vals[$i][3]+$vals[$i][4]) : 0 # IRC of insertions ; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append([float(x) for x in row]) if plot_irc_by_len and len(dat): \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax2 = ax1.twinx() \\tplots = ax1.plot([row[0] for row in dat], [row[2] for row in dat], 'o-', ms=3, color='r', mec='r', label='Deletions'); \\tplots += ax1.plot([row[0] for row in dat], [row[3] for row in dat], 'o-', ms=3, color='g', mec='g', label='Insertions'); \\tplots += ax2.plot([row[0] for row in dat], [row[1] for row in dat], '^--', ms=3, color='k', mec='k', label='Number of sites'); \\tax2.ticklabel_format(style='sci',scilimits=(-3,4)) \\tax1.set_xlabel('Repeat type'); \\tax1.set_ylabel('Repeat Consistency'); \\tax2.set_ylabel('Number of sites'); \\tlabels = [l.get_label() for l in plots] \\tax1.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':9},frameon=False) \\tplt.subplots_adjust(right=0.88,left=0.15,bottom=0.11) \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub plot_irc_by_AF { error("plot_irc_by_AF: deprecated\n"); my ($opts,$id) = @_; my @vals = get_values($opts,$id,'AF'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}irc_by_af.$id"; my $vals = rebin_values(\@vals,8,0); open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Repeat type\t[2]Total count\t[3]repeat consistency: deletions\t[4]repeat consistency: insertions\n"; my $has_vals = 0; tprint $fh, " dat = [ "; for (my $i=0; $i<@$vals; $i++) { if ( $$vals[$i][5] + $$vals[$i][6] == 0 ) { next; } $has_vals++; tprint $fh, sprintf("\t[ %f, %d, %f, %f ],\n", $$vals[$i][0]/100., $$vals[$i][5] + $$vals[$i][6], $$vals[$i][6] ? $$vals[$i][5]/($$vals[$i][5] + $$vals[$i][6]) : 0, ($$vals[$i][6] + $$vals[$i][7]) ? $$vals[$i][5]/($$vals[$i][5] + $$vals[$i][6] + $$vals[$i][7]) : 0); } tprint $fh, "]"; if ( $has_vals<2 ) { return; } tprint $fh, " fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) ax1.plot([row[0] for row in dat], [row[1] for row in dat], '-o',color='k',mec='k',markersize=3) ax1.set_ylabel('Number of sites',color='k') ax1.set_yscale('log') #ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') for tl in ax1.get_yticklabels(): tl.set_color('k') ax1.set_xlabel('Non-ref allele frequency') ax2 = ax1.twinx() ax2.plot([row[0] for row in dat], [row[2] for row in dat], '-o',color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]',markersize=3,label='c/(c+i)') ax2.plot([row[0] for row in dat], [row[3] for row in dat], '--^',color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]',markersize=3,label='c/(c+i+n/a)') ax2.legend(numpoints=1,markerscale=2,loc='best',prop={'size':10},frameon=False) ax2.set_ylabel('Indel Repeat Consistency',color='$$opts{id2col}[$id]') ax2.set_ylim(0,1) ax1.set_xlim(0,1) for tl in ax2.get_yticklabels(): tl.set_color('$$opts{id2col}[$id]') plt.subplots_adjust(right=0.88,left=0.15,bottom=0.11) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') if pdf_plots: plt.savefig('$img.pdf') plt.close() "; } sub plot_tstv_by_AF { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'AF'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}tstv_by_af.$id"; my $vals = rebin_values(\@vals,8,0); open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele frequency\t[2]Number of sites\t[3]ts/tv\n"; for (my $i=0; $i<@$vals; $i++) { if ( $$vals[$i][2] + $$vals[$i][3] == 0 ) { next; } printf $tfh "%f\t%d\t%f\n", $$vals[$i][0]/100., $$vals[$i][2] + $$vals[$i][3], $$vals[$i][3] ? $$vals[$i][2]/$$vals[$i][3]: 0; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append([float(x) for x in row]) if plot_tstv_by_af and len(dat)>2: \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[1] for row in dat], '-o',color='k',mec='k',markersize=3) \\tax1.set_ylabel('Number of sites',color='k') \\tax1.set_yscale('log') \\t#ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tfor tl in ax1.get_yticklabels(): tl.set_color('k') \\tax1.set_xlabel('Non-ref allele frequency') \\tax2 = ax1.twinx() \\tax2.plot([row[0] for row in dat], [row[2] for row in dat], '-o',color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]',markersize=3) \\tax2.set_ylabel('Ts/Tv',color='$$opts{id2col}[$id]') \\tax2.set_ylim(0,0.5+max(3,max(row[2] for row in dat))) \\tax1.set_xlim(0,1) \\tfor tl in ax2.get_yticklabels(): tl.set_color('$$opts{id2col}[$id]') \\tplt.subplots_adjust(right=0.88,left=0.15,bottom=0.11) \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub plot_tstv_by_QUAL { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'QUAL'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}tstv_by_qual.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Quality\t[2]Number of sites\t[3]Marginal Ts/Tv\n"; my @dat = (); my $ntot = 0; for my $val (@vals) { push @dat, [ $$val[0], $$val[2], $$val[3] ]; # qual, nts, ntv $ntot += $$val[2] + $$val[3]; } my @sdat = sort { $$b[0] <=> $$a[0] } @dat; push @sdat, [-1]; my $dn = $ntot*0.05; my $qprev = $sdat[0][0]; my $nts = 0; my $ntv = 0; my $nout = 0; for my $rec (@sdat) { if ( $$rec[0]==-1 or $nts+$ntv > $dn ) { if ( $ntv ) { printf $tfh "$qprev\t%d\t%f\n", $nts+$ntv+$nout,$nts/$ntv; } if ( $$rec[0]==-1 ) { last; } $nout += $nts+$ntv; $nts = 0; $ntv = 0; $qprev = $$rec[0]; } $nts += $$rec[1]; $ntv += $$rec[2]; } close($tfh) or error("close $img.dat"); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append([float(x) for x in row]) if plot_tstv_by_qual: \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[1] for row in dat], [row[2] for row in dat], '^-', ms=3, mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]') \\tax1.set_ylabel('Ts/Tv',fontsize=10) \\tax1.set_xlabel('Number of sites\\n(sorted by QUAL, descending)',fontsize=10) \\tax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='x') \\tax1.set_ylim(min(2,min(row[2] for row in dat))-0.3,0.3+max(2.2,max(row[2] for row in dat))) \\tplt.subplots_adjust(right=0.88,left=0.15,bottom=0.15) \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub rebin_values { my ($vals,$bin_size,$col,%args) = @_; my %avg = exists($args{avg}) ? map {$_=>1} @{$args{avg}} : (); my $prev = $$vals[0][$col]; my $iout = 0; my $nsum = 0; my (@dat,@out); for (my $i=0; $i<@$vals; $i++) { for (my $icol=0; $icol<@{$$vals[$i]}; $icol++) { if ( $icol==$col ) { next; } $dat[$icol] += $$vals[$i][$icol]; } $nsum++; if ( $i+1<@$vals && $$vals[$i][$col] - $prev < $bin_size ) { next; } $dat[$col] = $prev; for (my $icol=0; $icol<@{$$vals[$i]}; $icol++) { $out[$iout][$icol] = $dat[$icol] ? $dat[$icol] : 0; if ( $avg{$icol} && $nsum ) { $out[$iout][$icol] /= $nsum; } } $nsum = 0; @dat = (); $iout++; $prev = $$vals[$i][$col]; } return \@out; } sub plot_concordance_by_AF { my ($opts) = @_; my @vals = get_values($opts,2,'GCsAF'); if ( !@vals ) { return; } # create a local copy and prepare r2 for rebinning @vals = @{ dclone(\@vals) }; for (my $i=0; $i<@vals; $i++) { $vals[$i][7] *= $vals[$i][8]; } my $vals = rebin_values(\@vals,1,0); my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}gts_by_af"; my $img2 = "$$opts{prefix}r2_by_af"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele Frequency\t[2]RR concordance\t[3]RA concordance\t[4]AA concordance\t[5]nRR\t[6]nRA\t[7]nAA\t[8]R^2\t[9]Number of sites\n"; for (my $i=0; $i<@$vals; $i++) { printf $tfh "%f\t%f\t%f\t%f\t%d\t%d\t%d\t%f\t%d\n", $$vals[$i][0]/100., $$vals[$i][1]+$$vals[$i][4] ? $$vals[$i][1]/($$vals[$i][1]+$$vals[$i][4]) : 1, $$vals[$i][2]+$$vals[$i][5] ? $$vals[$i][2]/($$vals[$i][2]+$$vals[$i][5]) : 1, $$vals[$i][3]+$$vals[$i][6] ? $$vals[$i][3]/($$vals[$i][3]+$$vals[$i][6]) : 1, $$vals[$i][1]+$$vals[$i][4], $$vals[$i][2]+$$vals[$i][5], $$vals[$i][3]+$$vals[$i][6], $$vals[$i][8] ? $$vals[$i][7]/$$vals[$i][8] : 1, $$vals[$i][8]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append(row) if plot_concordance_by_af and len(dat)>1: \\tfig = plt.figure(figsize=($$opts{img_width}*1.2,$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[1] for row in dat],'.',color='$$opts{id2col}[1]',label='Hom RR') \\tax1.plot([row[0] for row in dat], [row[2] for row in dat],'.',color='$$opts{id2col}[0]',label='Het RA') \\tax1.plot([row[0] for row in dat], [row[3] for row in dat],'.',color='k',label='Hom AA') \\tax1.set_xlabel('Non-ref allele frequency') \\tax1.set_ylabel('Concordance') \\tleg = ax1.legend(title='Concordance:',numpoints=1,markerscale=2,loc='best',prop={'size':9}) \\tleg.draw_frame(False) \\tplt.setp(leg.get_title(),fontsize=9) \\tax2 = ax1.twinx() \\tax2.plot([row[0] for row in dat], [row[4] for row in dat],color='$$opts{id2col}[1]') \\tax2.plot([row[0] for row in dat], [row[5] for row in dat],color='$$opts{id2col}[0]') \\tax2.plot([row[0] for row in dat], [row[6] for row in dat],color='k') \\tax2.set_ylabel('Number of genotypes') \\tax2.set_yscale('log') \\tplt.subplots_adjust(left=0.15,right=0.83,bottom=0.11) \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() if plot_r2_by_af and len(dat)>1: \\tfig = plt.figure(figsize=($$opts{img_width}*1.3,$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax2 = ax1.twinx() \\tax1.set_zorder(ax2.get_zorder()+1) \\tax1.patch.set_visible(False) \\tax2.plot([row[0] for row in dat], [row[8] for row in dat], '-o', color='r',mec='r',markersize=3) \\tax1.plot([row[0] for row in dat], [row[7] for row in dat], '-^', color='k',markersize=3) \\tfor tl in ax2.get_yticklabels(): tl.set_color('r') \\tax2.set_ylabel('Number of sites', color='r') \\tax2.set_yscale('log') \\tax1.set_ylabel('Mean allelic R\$^2\$', color='k') \\tax1.set_xlabel('Non-ref allele frequency') \\tplt.subplots_adjust(left=0.19,right=0.83,bottom=0.11) \\tplt.savefig('$img2.png') \\tif pdf_plots: plt.savefig('$img2.pdf') \\tplt.close() "; } sub plot_concordance_by_sample { my ($opts) = @_; my @vals = get_values($opts,2,'GCsS'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}gts_by_sample"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Sample ID\t[2]Discordance\t[3]Sample Name\n"; for (my $i=0; $i<@vals; $i++) { printf $tfh "%d\t%f\t%s\n", $i, $vals[$i][1], $vals[$i][0]; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append(row) if plot_discordance_by_sample: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[1] for row in dat],'.',color='orange') \\tax1.set_ylabel('Non-ref discordance') \\tax1.set_ylim(0,) \\tif sample_names: \\t\\t plt.xticks([row[0] for row in dat],[row[2] for row in dat],**sample_font) \\t\\t plt.subplots_adjust(**sample_margins) \\telse: \\t\\t plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) \\t\\t ax1.set_xlabel('Sample ID') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub plot_counts_by_AF { my ($opts) = @_; plot_counts_by_AF_col($opts,1,'SNP'); plot_counts_by_AF_col($opts,4,'Indel'); } sub plot_counts_by_AF_col { my ($opts,$col,$title) = @_; my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}counts_by_af.".lc($title)."s"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]id\t[2]Nonref Allele Frequency\t[3]Number of sites\n"; for my $id (file_ids($opts)) { my @tmp = get_values($opts,$id,'AF'); my $vals = rebin_values(\@tmp,1,0); for my $val (@$vals) { if ( !$$val[$col] ) { next; } print $tfh "$id\t$$val[0]\t$$val[$col]\n"; } } close($tfh); tprint $fh, " dat = {} with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] == '#': continue \\t\\tid = int(row[0]) \\t\\tif id not in dat: dat[id] = [] \\t\\tdat[id].append([float(row[1]),float(row[2])]) if plot_${title}_count_by_af: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.set_ylabel('Number of sites') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tax1.set_yscale('log') \\tax1.set_xlabel('Non-reference allele frequency') \\tax1.set_xlim(-0.05,1.05) \\thas_data = 0 "; for my $id (file_ids($opts)) { tprint $fh, " \\tif $id in dat and len(dat[$id])>2: \\t\\tax1.plot([row[0]/100. for row in dat[$id]], [row[1] for row in dat[$id]], '-o',markersize=3, color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]',label='$$opts{title}{$id}') \\t\\thas_data = 1 "; } tprint $fh, " \\tif has_data: \\t\\tax1.legend(numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False) \\t\\tplt.title('$title count by AF') \\t\\tplt.subplots_adjust(bottom=0.2,left=0.1,right=0.95) \\t\\tplt.savefig('$img.png') \\t\\tif pdf_plots: plt.savefig('$img.pdf') \\t\\tplt.close() "; } sub plot_overlap_by_AF { my ($opts) = @_; plot_overlap_by_AF_col($opts,1,'SNP'); plot_overlap_by_AF_col($opts,4,'Indel'); } sub plot_overlap_by_AF_col { my ($opts,$col,$title) = @_; my @ids = file_ids($opts); if ( @ids != 3 ) { return; } my ($ia,$ib,$iab); for (my $i=0; $i<@ids; $i++) { if ( @{$$opts{dat}{ID}{$ids[$i]}[0]}>1 ) { $iab = $i; next; } if ( !defined $ia ) { $ia = $i; next; } $ib = $i; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}overlap_by_af.".lc($title)."s"; my @has_vals; my @vals_a = get_values($opts,$ia,'AF'); my @vals_b = get_values($opts,$ib,'AF'); my @vals_ab = get_values($opts,$iab,'AF'); my (%afs,%af_a,%af_ab); for my $val (@vals_a) { $afs{$$val[0]} = $$val[$col]; $af_a{$$val[0]} = $$val[$col]; } for my $val (@vals_ab) { $afs{$$val[0]} = $$val[$col]; $af_ab{$$val[0]} = $$val[$col]; } open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele frequency\t[2]Fraction of sites from $$opts{title}{$ids[$ia]} also in $$opts{title}{$ids[$ib]}\t[3]Number of sites\n"; for my $af (sort { $a<=>$b } keys %afs) { my $a = exists($af_a{$af}) ? $af_a{$af} : 0; my $ab = exists($af_ab{$af}) ? $af_ab{$af} : 0; my $yval = ($a+$ab) ? $ab * 100. / ($a + $ab) : 0; print $tfh "$af\t$yval\t" .($a+$ab). "\n"; } close($tfh) or error("close $img.dat"); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append(row) if plot_${title}_overlap_by_af and len(dat)>1: \\tfig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) \\tax1 = fig.add_subplot(111) \\tax1.plot([row[0] for row in dat], [row[1] for row in dat],'-o',markersize=3, color='$$opts{id2col}[1]',mec='$$opts{id2col}[1]') \\tax1.set_ylabel('Fraction found in $$opts{title}{$ib} [%]') \\tax1.set_xscale('log') \\tax1.set_xlabel('Non-reference allele frequency in $$opts{title}{$ia} [%]') \\tax1.set_xlim(0,11) \\tax1.set_xticks([0.1,0.2,0.5,1,2,5,10]) \\tax1.set_xticklabels([0.1,0.2,0.5,1,2,5,10]) \\tplt.title('$title overlap by AF') \\tplt.subplots_adjust(bottom=0.2,left=0.1,right=0.95) \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub plot_indel_distribution { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'IDD'); if ( !@vals ) { return; } # Set xlim to show 99 of indels but ignore outliers my @tmp; for my $id (file_ids($opts)) { my @v = get_values($opts,$id,'IDD'); for my $v (@v) { $tmp[ abs($$v[0]) ] += $$v[1]; } } my $n; for my $t (@tmp) { $n += $t ? $t : 0; } my ($sum,$xlim); for ($xlim=0; $xlim<@tmp; $xlim++) { $sum += $tmp[$xlim] ? $tmp[$xlim] : 0; if ( $sum/$n >= 0.99 ) { last; } } if ( $xlim<20 ) { $xlim=20; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}indels.$id"; open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Indel length\t[2]Count\n"; for my $val (@vals) { print $tfh "$$val[0]\t$$val[1]\n"; } close($tfh); tprint $fh, " dat = [] with open('$img.dat', 'rb') as f: \\treader = csv.reader(f, 'tab') \\tfor row in reader: \\t\\tif row[0][0] != '#': dat.append([float(x) for x in row]) if plot_indel_dist and len(dat)>0: \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tax1 = fig.add_subplot(111) \\tax1.bar([row[0]-0.5 for row in dat], [row[1] for row in dat], color='$$opts{id2col}[0]')# , edgecolor='$$opts{id2col}[0]') \\tax1.set_xlabel('InDel Length') \\tax1.set_ylabel('Count') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tax1.set_xlim(-$xlim,$xlim) \\tplt.subplots_adjust(bottom=0.17) \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub plot_substitutions { my ($opts,$id) = @_; my @vals = get_values($opts,$id,'ST'); if ( !@vals ) { return; } my $fh = $$opts{plt_fh}; my $img = "$$opts{prefix}substitutions.$id"; tprint $fh, " dat = [ "; for (my $i=0; $i<@vals; $i++) { my $val=$vals[$i]; tprint $fh, "\t[$i,'$$val[0]',$$val[1]],\n"; } tprint $fh, "] if plot_substitutions: \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) \\tcm = mpl.cm.get_cmap('autumn') \\tn = 12 \\tcol = range(n) \\tfor i in range(n): col[i] = cm(1.*i/n) \\tax1 = fig.add_subplot(111) \\tax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col) \\tax1.set_ylabel('Count') \\tax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') \\tax1.set_xlim(-0.5,n+0.5) \\tplt.xticks([row[0] for row in dat],[row[1] for row in dat],rotation=45) \\tplt.title('$$opts{title}{$id}') \\tplt.savefig('$img.png') \\tif pdf_plots: plt.savefig('$img.pdf') \\tplt.close() "; } sub singletons { my ($opts,$id) = @_; my @si_vals = get_values($opts,$id,'SiS'); my $si_snps = $si_vals[0][1]; my $si_indels = $si_vals[0][4]; my $si_irc = sprintf "%.3f", $si_vals[0][6] ? $si_vals[0][5]/($si_vals[0][5]+$si_vals[0][6]) : 0; my $si_tstv = sprintf "%.2f", $si_vals[0][3] ? $si_vals[0][2]/$si_vals[0][3] : 0; my @all_vals = get_values($opts,$id,'AF'); my $nsnps = 0; my $nindels = 0; for my $val (@all_vals) { $nsnps += $$val[1]; $nindels += $$val[4]; } $si_snps = sprintf "%.1f", $nsnps ? $si_snps*100./$nsnps : 0; $si_indels = sprintf "%.1f", $nindels ? $si_indels*100./$nindels : 0; return { snps=>$si_snps, indels=>$si_indels, tstv=>$si_tstv, irc=>$si_irc }; } sub calc_3n_n3n { my (@vals) = @_; my $n3 = 0; my $nn3 = 0; for my $val (@vals) { if ( !($$val[0]%3) ) { $n3++; } else { $nn3++; } } if ( !$nn3 ) { return '-'; } return sprintf("%.2f", $n3/$nn3); } sub fmt_slide3v { my ($opts, $image, $title) = @_; my $n = 0; for my $id (0..2) { if ( -e "$image.$id.$$opts{fmt}" ) { $n++; } } if ( !$n ) { return ''; } my $h = $$opts{tex}{slide3v}{"height$n"}; my $slide = q[\vbox{]; for my $id (0..2) { if ( !-e "$image.$id.$$opts{fmt}" ) { next; } $slide .= qq[\\centerline{\\includegraphics[$$opts{ext},height=$h]{$image.$id}}]; } $slide .= '}'; return qq[ % $title % \\hslide{$title}{$slide} ]; } sub fmt_slide3h { my ($opts, $image, $title) = @_; my $n = 0; for my $id (0..2) { if ( -e "$image.$id.$$opts{fmt}" ) { $n++; } } if ( !$n ) { return ''; } my $w = $$opts{tex}{slide3h}{"width$n"}; my $slide = ''; for my $id (0..2) { if ( !-e "$image.$id.$$opts{fmt}" ) { next; } $slide .= qq[\\includegraphics[$$opts{ext},width=$w]{$image.$id}]; } return qq[ % $title % \\hslide{$title}{$slide} ]; } sub bignum { my ($num) = @_; if ( !defined $num ) { return ''; } if ( !($num=~/^\d+$/) ) { return $num; } my $len = length($num); my $out; for (my $i=0; $i<$len; $i++) { $out .= substr($num,$i,1); if ( $i+1<$len && !(($len-$i-1)%3) ) { $out .= ','; } } return $out; } sub create_pdf { my ($opts) = @_; chdir($$opts{dir}); my @ids = file_ids($opts); my $width = "25.4cm"; # todo: move all this to $$opts{tex} my $height = "19cm"; my $height1 = "13cm"; my $width1 = "23cm"; my $width2 = @ids==3 ? "10.5cm" : "10.5cm"; my $width3 = @ids==3 ? "8cm" : "15cm"; my $fmt = $$opts{rasterize} ? 'png' : 'pdf'; my $ext = "type=$fmt,ext=.$fmt,read=.$fmt"; my $args = { ext=>$ext, width3=>$width3, n=>scalar @ids }; $$opts{fmt} = $fmt; $$opts{ext} = $ext; # Check that xcolor is available my @has_xcolor = `kpsewhich xcolor.sty`; if ( !@has_xcolor ) { warn("Note: The xcolor.sty package not available, black and white tables only...\n\n"); } my $tex_file = "$$opts{lprefix}summary.tex"; my $pdf_file = "$$opts{prefix}summary.pdf"; open(my $tex,'>',$tex_file) or error("$tex_file: $!"); tprint $tex, qq[ % This file was produced by plot-vcfstats, the command line was: % $$opts{args} % % Edit as necessary and recreate the PDF by running % pdflatex $tex_file % % Slides style and dimensions % \\nonstopmode \\documentclass[17pt]{memoir} \\setstocksize{$height}{$width} \\settrimmedsize{\\stockheight}{\\stockwidth}{*} \\settrims{0pt}{0pt} \\setlrmarginsandblock{1cm}{*}{*} \\setulmarginsandblock{1.5cm}{*}{*} \\setheadfoot{1mm}{1cm} \\setlength{\\parskip}{0pt} \\setheaderspaces{*}{1mm}{*} \\setmarginnotes{1mm}{1mm}{1mm} \\checkandfixthelayout[fixed] \\usepackage{charter} % font \\pagestyle{plain} \\makeevenfoot{plain}{}{}{\\thepage} \\makeoddfoot{plain}{}{}{\\thepage} \\usepackage{graphicx} % For colored tables. If xcolor.sty is not available on your system, % download xcolor.sty.gz LaTeX class style from % http://www.ukern.de/tex/xcolor.html % Unpack and install system-wide or place elsewhere and make available by % setting the TEXINPUTS environment variable (note the colon) % export TEXINPUTS=some/dir: % The list of the recognised path can be obtained by running `kpsepath tex` % \\usepackage{multirow} \\setlength{\\tabcolsep}{0.6em} \\renewcommand{\\arraystretch}{1.2} ]; if ( @has_xcolor ) { tprint $tex, '\usepackage[table]{xcolor}'; } else { tprint $tex, qq[ \\newcommand{\\definecolor}[3]{} \\newcommand{\\columncolor}[1]{} \\newcommand{\\rowcolors}[4]{} \\newcommand{\\arrayrulecolor}[1]{} ]; } tprint $tex, qq[ \\definecolor{hcol1}{rgb}{1,0.6,0} \\definecolor{hcol2}{rgb}{1,0.68,0.2} \\definecolor{row1}{rgb}{1,0.88,0.7} \\definecolor{row2}{rgb}{1,0.92,0.8} % #FFEBCC \\setlength{\\arrayrulewidth}{1.5pt} % Slide headings \\newcommand*{\\head}[1]{{\\Large\\centerline{#1}\\vskip0.5em}} % Slide definition \\newcommand*{\\hslide}[2]{% \\head{#1}% \\begin{vplace}[0.5]\\centerline{#2}\\end{vplace}\\newpage} \\newcommand{\\pdf}[2]{\\IfFileExists{#2.$fmt}{\\includegraphics[#1]{#2}}{}} % The actual slides \\begin{document} ]; # Table with summary numbers my $slide .= q[ \begin{minipage}{\textwidth}\centering \small \rowcolors*{3}{row2}{row1} \arrayrulecolor{black} \begin{tabular}{l | r r r | r r r | r | r} \multicolumn{1}{>{\columncolor{hcol1}}l|}{} & \multicolumn{3}{>{\columncolor{hcol1}}c|}{SNPs} & \multicolumn{3}{>{\columncolor{hcol1}}c|}{indels} & \multicolumn{1}{>{\columncolor{hcol1}}c|}{MNPs} & \multicolumn{1}{>{\columncolor{hcol1}}c}{others} \\\\ % \multicolumn{1}{>{\columncolor{hcol2}}l|}{Callset} & \multicolumn{1}{>{\columncolor{hcol2}}c}{n} & \multicolumn{1}{>{\columncolor{hcol2}}c }{ts/tv} & \multicolumn{1}{>{\columncolor{hcol2}}c|}{\\footnotesize(1st ALT)} & \multicolumn{1}{>{\columncolor{hcol2}}c}{n} & \multicolumn{1}{>{\columncolor{hcol2}}c}{frm$^*$} & \multicolumn{1}{>{\columncolor{hcol2}}c|}{rc$^{**}$} & \multicolumn{1}{>{\columncolor{hcol2}}c|}{} & \multicolumn{1}{>{\columncolor{hcol2}}c}{} \\\\ \hline ]; my %tex_titles; for my $id (@ids) { my $snps = get_value($opts,$id,'number of SNPs:'); my $indels = get_value($opts,$id,'number of indels:'); my $mnps = get_value($opts,$id,'number of MNPs:'); my $others = get_value($opts,$id,'number of others:'); my $tstv = sprintf "%.2f",get_values($opts,$id,'TSTV',0,2); my $tstv1 = sprintf "%.2f",get_values($opts,$id,'TSTV',0,5); my @frsh = get_values($opts,$id,'FS'); my $frsh = @frsh ? $frsh[0][3] : '--'; my @rc = get_values($opts,$id,'ICS'); my $rc = @rc ? sprintf("%.3f",$rc[0][3]) : '--'; my $title = $$opts{title}{$id}; $title =~ s/_/\\_/g; $title =~ s/^\s*\*/\$*\$/; # leading asterisks is eaten by TeX $tex_titles{$id} = $title; $slide .= qq[ $title & ] . bignum($snps) . qq[ & $tstv & $tstv1 & ] . bignum($indels) . qq[ & $frsh & $rc & ] . bignum($mnps) . ' & ' . bignum($others) . qq[ \\\\ \n]; } $slide .= q[% \multicolumn{8}{r}{$^*$ frameshift ratio: out/(out+in);\hspace{1em} $^{**}$ repeat-consistency} \\\\ \end{tabular} \\\\ \vspace{1em} \begin{tabular}{l | r r r r | r r} \multicolumn{1}{>{\columncolor{hcol1}}l|}{} & \multicolumn{4}{>{\columncolor{hcol1}}c|}{singletons {\footnotesize(AC=1)}} & \multicolumn{2}{>{\columncolor{hcol1}}c}{multiallelic} \\\\ % \multicolumn{1}{>{\columncolor{hcol2}}l|}{Callset} & \multicolumn{1}{>{\columncolor{hcol2}}c}{SNPs} & \multicolumn{1}{>{\columncolor{hcol2}}c}{ts/tv} & \multicolumn{1}{>{\columncolor{hcol2}}c}{indels} & \multicolumn{1}{>{\columncolor{hcol2}}c|}{rc} & \multicolumn{1}{>{\columncolor{hcol2}}c}{sites} & \multicolumn{1}{>{\columncolor{hcol2}}c}{SNPs} \\\\ \hline ]; for my $id (@ids) { my $snps = get_value($opts,$id,'number of SNPs:'); my $s = singletons($opts,$id); my $mals = get_value($opts,$id,'number of multiallelic sites:'); my $msnps = get_value($opts,$id,'number of multiallelic SNP sites:'); my $title = $tex_titles{$id}; $slide .= qq[ $title & $$s{snps}\\% & $$s{tstv} & $$s{indels}\\% & $$s{irc} & ] . bignum($mals) . ' &' . bignum($msnps) . qq[ \\\\ \n]; } $slide .= q[ \\end{tabular} \\vspace{2em} \\begin{itemize}[-] \\setlength{\\itemsep}{0pt} ]; for my $id (@ids) { my $fname = $$opts{dat}{ID}{$id}[0][0]; if ( $$opts{title}{$id} =~ / \+ / ) { next; } $fname =~ s/.{80}/$&\\\\\\hskip2em /g; $fname =~ s/_/\\_/g; $slide .= qq[\\item $tex_titles{$id} .. \\texttt{\\footnotesize $fname}\n]; } $slide .= q[\\end{itemize}\\end{minipage}]; my $title = exists($$opts{main_title}) ? $$opts{main_title} : 'Summary Numbers'; tprint $tex, qq[ % Table with summary numbers % \\hslide{$title}{$slide} ]; # Venn bars if ( @ids==3 ) { tprint $tex, qq[% % Venn numbers % \\hslide{Total counts}{% \\includegraphics[$ext,width=$width2]{$$opts{lprefix}venn_bars.snps}% \\includegraphics[$ext,width=$width2]{$$opts{lprefix}venn_bars.indels} } ]; } tprint $tex, fmt_slide3v($opts, "$$opts{lprefix}tstv_by_sample", 'Ts/Tv by sample'); tprint $tex, fmt_slide3v($opts, "$$opts{lprefix}hets_by_sample", 'Hets vs non-ref Homs by sample'); tprint $tex, fmt_slide3v($opts, "$$opts{lprefix}singletons_by_sample", 'Singletons by sample {\normalsize(hets and homs)}'); tprint $tex, fmt_slide3v($opts, "$$opts{lprefix}dp_by_sample", 'Average depth by sample'); tprint $tex, fmt_slide3v($opts, "$$opts{lprefix}snps_by_sample", 'Number of SNPs by sample'); tprint $tex, fmt_slide3v($opts, "$$opts{lprefix}indels_by_sample", 'Number of indels by sample'); if ( scalar get_values($opts,2,'GCsS') ) { tprint $tex, qq[ % Genotype discordance by sample % \\hslide{Genotype discordance by sample}{\\pdf{$ext,width=$width1}{$$opts{lprefix}gts_by_sample}} ]; } if ( scalar get_values($opts,2,'GCsAF') ) { my @vals = get_values($opts,2,'NRDs'); my $nrd = sprintf "%.2f", $vals[0][0]; my $rr = sprintf "%.2f", $vals[0][1]; my $ra = sprintf "%.2f", $vals[0][2]; my $aa = sprintf "%.2f", $vals[0][3]; my $nsamples = get_value($opts,2,'number of samples:'); my $table = qq[% {\\small \\rowcolors*{1}{row2}{row1}\\arrayrulecolor{black} \\begin{tabular}{c | c | c | c } \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/REF} & \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/ALT} & \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{ALT/ALT} & \\multicolumn{1}{>{\\columncolor{hcol1}}c}{NRDs} \\\\ \\hline $rr\\% & $ra\\% & $aa\\% & $nrd\\% \\\\ \\end{tabular}}]; tprint $tex, qq[ % Genotype discordance by AF % \\head{Genotype discordance by AF}\\begin{vplace}[0.7]\\centerline{$table}% \\centerline{\\pdf{$ext,height=$height1}{$$opts{lprefix}gts_by_af}}\\end{vplace} \\newpage % dosage r2 by AF % \\hslide{Allelic R\$^2\$ by AF}{\\pdf{$ext,height=$height1}{$$opts{lprefix}r2_by_af}} ]; } if ( -e "$$opts{lprefix}counts_by_af.snps.$fmt" && -e "$$opts{lprefix}counts_by_af.indels.$fmt" ) { tprint $tex, qq[ % SNP and indel counts by AF % \\hslide{}{\\vbox{\\noindent\\includegraphics[$ext,width=$width1]{$$opts{lprefix}counts_by_af.snps}\\\\% \\noindent\\includegraphics[$ext,width=$width1]{$$opts{lprefix}counts_by_af.indels}}} ]; } if ( -e "$$opts{lprefix}overlap_by_af.snps.$fmt" && -e "$$opts{lprefix}overlap_by_af.indels.$fmt" ) { tprint $tex, qq[ % SNP and indel overlap by AF % \\hslide{}{\\vbox{\\noindent\\includegraphics[$ext,width=$width1]{$$opts{lprefix}overlap_by_af.snps}\\\\% \\noindent\\includegraphics[$ext,width=$width1]{$$opts{lprefix}overlap_by_af.indels}}} ]; } tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}tstv_by_af", 'Ts/Tv by AF'); tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}tstv_by_qual", 'Ts/Tv stratified by QUAL'); tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}indels", 'Indel distribution'); tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}depth", 'Depth distribution'); tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}hwe", 'Number of HETs by AF'); tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}substitutions", 'Substitution types'); tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}irc_by_af", 'Indel Repeat Consistency by AF'); tprint $tex, fmt_slide3h($opts, "$$opts{lprefix}irc_by_rlen", 'Indel Consistency by Repeat Type'); tprint $tex, "\n\n\\end{document}\n"; close($tex); $tex_file =~ s{^.+/}{}; my $cmd = "pdflatex $tex_file >$$opts{logfile} 2>&1"; print STDERR "Creating PDF: $cmd\n" unless !$$opts{verbose}; system($cmd); if ( $? ) { error("The command exited with non-zero status, please consult the output of pdflatex: $$opts{dir}$$opts{logfile}\n\n"); } print STDERR "Finished: $pdf_file\n" unless !$$opts{verbose}; } sub merge_vcfstats { my ($opts) = @_; my $fh = *STDOUT; if ( !$$opts{merge} ) { open($fh,'>',"$$opts{prefix}merge.chk") or error("$$opts{prefix}merge.chk: $!\n"); } print $fh "# This file was produced by plot-vcfstats, the command line was:\n# $$opts{args}\n#\n"; for my $sec (@{$$opts{sections}}) { my $sid = $$sec{id}; if ( !exists($$opts{dat}{$sid}) ) { next; } print $fh "# $$sec{header}\n$$sec{exp}\n"; for my $id (sort keys %{$$opts{dat}{$sid}}) { for my $rec (@{$$opts{dat}{$sid}{$id}}) { print $fh "$sid\t$id\t", join("\t",@$rec), "\n"; } } if ( $sid eq 'ID' ) { print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}\n"; # output summary numbers here for my $id (keys %{$$opts{dat}}) { if ( exists($$opts{exp}{$id}) ) { next; } for my $key (keys %{$$opts{dat}{$id}}) { print $fh "SN\t$id\t$key\t$$opts{dat}{$id}{$key}\n"; } } } } }