/* * variant_file.h * * Created on: Dec 12, 2012 * Author: amarcketta */ #ifndef VARIANT_FILE_H_ #define VARIANT_FILE_H_ #if HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "parameters.h" #include "entry.h" #include "gamma.h" #include "vcf_entry.h" #include "bcf_entry.h" #include "header.h" #if HAVE_LIBLAPACK # include "dgeev.h" #endif extern output_log LOG; using namespace std; class variant_file { public: string filename; bool compressed; istream *file_in; ifstream file_tmp; unsigned int gzMAX_LINE_LEN; gzFile gzfile_in; header meta_data; vector include_indv; unsigned int N_entries; unsigned int N_kept_entries; int N_kept_individuals() const; int N_kept_sites() const; int N_total_sites() const; virtual void open() = 0; virtual void open_gz() = 0; virtual void close() = 0; virtual bool eof() = 0; virtual void get_entry(vector &out) = 0; virtual entry* get_entry_object() = 0; void ByteSwap(unsigned char *b, int n) const; static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); }; void apply_filters(const parameters ¶ms); void filter_individuals(const set &indv_to_keep, const set &indv_to_exclude, const vector &indv_to_keep_filename, const vector &indv_to_exclude_filename, bool keep_then_exclude=true); void filter_individuals_by_keep_list(const set &indv_to_keep, const vector &indv_to_keep_filenames); void filter_individuals_by_exclude_list(const set &indv_to_exclude, const vector &indv_to_exclude_filenames); void filter_individuals_randomly(int max_N_indv); void output_frequency(const parameters ¶ms, bool output_counts=false); void output_individuals_by_mean_depth(const parameters ¶ms); void output_site_depth(const parameters ¶ms, bool output_mean=true); void output_genotype_depth(const parameters ¶ms); void output_het(const parameters ¶ms); void output_hwe(const parameters ¶ms); void output_SNP_density(const parameters ¶ms); void output_indv_missingness(const parameters ¶ms); void output_indv_burden(const parameters ¶ms); void output_indv_freq_burden(const parameters ¶ms, int double_count_hom_alt=0); void output_site_missingness(const parameters ¶ms); void output_haplotype_r2(const parameters ¶ms); void output_genotype_r2(const parameters ¶ms); void output_genotype_chisq(const parameters ¶ms, double min_pval); void output_interchromosomal_genotype_r2(const parameters ¶ms); void output_interchromosomal_haplotype_r2(const parameters & params); void output_haplotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms); void output_haplotype_count(const parameters ¶ms); void output_genotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms); void output_singletons(const parameters ¶ms); void output_TsTv(const parameters ¶ms); void output_TsTv_by_count(const parameters ¶ms); void output_TsTv_by_quality(const parameters ¶ms); void output_TsTv_summary(const parameters ¶ms); void output_per_site_nucleotide_diversity(const parameters ¶ms); void output_windowed_nucleotide_diversity(const parameters ¶ms); void output_Tajima_D(const parameters ¶ms); void output_site_quality(const parameters ¶ms); void output_FILTER_summary(const parameters ¶ms); void output_kept_sites(const parameters ¶ms); void output_removed_sites(const parameters ¶ms); void output_LROH(const parameters ¶ms); void output_indv_relatedness_Yang(const parameters ¶ms); void output_indv_relatedness_Manichaikul(const parameters ¶ms); void output_PCA(const parameters ¶ms); void output_PCA_SNP_loadings(const parameters ¶ms); void output_indel_hist(const parameters ¶ms); void output_as_012_matrix(const parameters ¶ms); void output_as_plink(const parameters ¶ms); void output_as_plink_tped(const parameters ¶ms); void output_BEAGLE_genotype_likelihoods(const parameters ¶ms, int GL_or_PL=0); void output_as_IMPUTE(const parameters ¶ms); void output_as_LDhat_phased(const parameters ¶ms); void output_as_LDhat_unphased(const parameters ¶ms); void output_FORMAT_information(const parameters ¶ms); void output_weir_and_cockerham_fst(const parameters ¶ms); void output_windowed_weir_and_cockerham_fst(const parameters ¶ms); void output_sites_in_files(const parameters ¶ms, variant_file &diff_vcf_file); void output_indv_in_files(const parameters ¶ms, variant_file &diff_vcf_file); void output_discordance_by_site(const parameters ¶ms, variant_file &diff_vcf_file); void output_discordance_matrix(const parameters ¶ms, variant_file &diff_vcf_file); void output_discordance_by_indv(const parameters ¶ms, variant_file &diff_vcf_file); void output_switch_error(const parameters ¶ms, variant_file &diff_vcf_file); void output_INFO_for_each_site(const parameters ¶ms); void output_mendel_inconsistencies(const parameters ¶ms); void write_stats(const parameters ¶ms); virtual void print(const parameters ¶ms) = 0; virtual void print_bcf(const parameters ¶ms) = 0; void calc_hap_r2(vector > >1, vector > >2, double &r2, double &D, double &Dprime, int &chr_count); void calc_geno_r2(vector > >1, vector > >2, double &r2, int &indv_count); void calc_r2_em(entry *e, entry *e2, double &r2, int &indv_count); void calc_geno_chisq(vector > >1, vector > >2, int &N0, int &N1, double &chisq, double &dof, double &pval, int &indv_count); void read_temp_site(ifstream &tmp_file, string &CHROM, int &POS, vector< pair > >s); void read_big_temp_site(ifstream &tmp_file, string &CHROM, int &POS, int &alleles, vector< pair > >s); void return_indv_union(variant_file &file2, map > &combined_individuals, const string &indv_ID_map_file=""); void get_contigs(const std::string &contigs_file, vector &contig_vector); virtual ~variant_file(); }; #endif /* VARIANT_FILE_H_ */