/* * vcf_entry.cpp * * Created on: Aug 19, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "vcf_entry.h" string vcf_entry::convert_line; vcf_entry::vcf_entry(header &meta_data, vector &include_individual) { N_indv = meta_data.N_indv; include_indv = include_individual; include_genotype = vector(N_indv, true); basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; CHROM = ""; POS = -1; REF = ""; QUAL = -1; passed_filters = true; parsed_FORMAT_binary = false; N_INFO_removed = 0; N_FORMAT_removed = 0; parsed_GT = vector(N_indv, false); parsed_GQ = vector(N_indv, false); parsed_DP = vector(N_indv, false); parsed_FT = vector(N_indv, false); GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; FORMAT_positions.resize(N_indv); FORMAT_types.resize(N_indv); FORMAT_sizes.resize(N_indv); FORMAT_skip.resize(N_indv); FORMAT_keys.resize(N_indv); convert_line.clear(); data_stream.str(""); entry_header = meta_data; } vcf_entry::~vcf_entry() {} // Reset the VCF entry object with a new data line void vcf_entry::reset(const vector &data_line) { basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; parsed_FORMAT_binary = false; passed_filters = true; data_stream.clear(); line = data_line; convert_line.assign(line.begin(), line.end()); data_stream.str(convert_line); fill(parsed_GT.begin(), parsed_GT.end(), 0); fill(parsed_GQ.begin(), parsed_GQ.end(), 0); fill(parsed_DP.begin(), parsed_DP.end(), 0); fill(parsed_FT.begin(), parsed_FT.end(), 0); fill(include_genotype.begin(), include_genotype.end(), 1); N_INFO_removed = 0; N_FORMAT_removed = 0; FORMAT_positions.clear(); FORMAT_types.clear(); FORMAT_sizes.clear(); FORMAT_skip.clear(); FORMAT_keys.clear(); } // Tokenize the basic information in a VCF data line (at the tab level) void vcf_entry::parse_basic_entry(bool parse_ALT, bool parse_FILTER, bool parse_INFO) { if(!basic_parsed) { getline(data_stream, CHROM, '\t'); getline(data_stream, ID, '\t'); POS = atoi(ID.c_str()); getline(data_stream, ID, '\t'); getline(data_stream, REF, '\t'); getline(data_stream, ALT_str, '\t'); getline(data_stream, QUAL_str, '\t'); getline(data_stream, FILTER_str, '\t'); getline(data_stream, INFO_str, '\t'); QUAL = header::str2double(QUAL_str); // Convert to uppercase for consistency // Note that VCF v4.1 allows mixtures of lower/upper case in REF and ALT. // However, the spec specifically states that tools using VCF are not required // to preserve the case. std::transform(REF.begin(), REF.end(), REF.begin(), ::toupper); std::transform(ALT_str.begin(), ALT_str.end(),ALT_str.begin(), ::toupper); } basic_parsed = true; if (parse_ALT && !parsed_ALT) set_ALT(ALT_str); if (parse_FILTER && !parsed_FILTER) set_FILTER(FILTER_str); if (parse_INFO && !parsed_INFO) set_INFO(INFO_str); } // Tokenize the genotype information (at the 'tab' level) in the VCF entry void vcf_entry::parse_full_entry(bool parse_FORMAT) { if (fully_parsed) return; if (basic_parsed == false) parse_basic_entry(); getline(data_stream, FORMAT_str, '\t'); if (parse_FORMAT) set_FORMAT(FORMAT_str); string tmpstr; tmpstr.reserve(64); GENOTYPE_str.resize(N_indv, tmpstr); for (unsigned int ui=0; ui tmp_vector; vector tmp_split; vector< vector > format_matrix(N_indv); unsigned int type, number, size, position=0; tmp_split.resize(FORMAT.size()); for (unsigned int ui=0; ui &INFO_to_keep, bool keep_all_INFO) { if (fully_parsed == false) parse_full_entry(); out << get_CHROM() << '\t' << POS << '\t' << get_ID() << '\t' << REF << '\t' << get_ALT(); out << '\t' << header::double2str(QUAL); out << '\t' << get_FILTER(); if (keep_all_INFO == false) out << '\t' << get_INFO(INFO_to_keep); else out << '\t' << INFO_str; pair genotype; string GFILTER_tmp; if (FORMAT.size() > 0) { char PHASE; out << '\t' << get_FORMAT(); for (unsigned int ui=0; ui &INFO_to_keep, bool keep_all_INFO) { if (fully_parsed == false) parse_full_entry(); if (parsed_FORMAT_binary == false) parse_FORMAT(); vector out_vector, tmp_vector; out_vector.resize(8*sizeof(int32_t)); int vector_pos = 2*sizeof(uint32_t); string tmp_string; int index; vector filter_vector; vector > tmp_info; tmp_string = get_CHROM(); if (tmp_string == "." or tmp_string == " " or tmp_string == "") LOG.error("CHROM value must be defined for all entries.",0); if (entry_header.CONTIG_reverse_map.find(tmp_string) == entry_header.CONTIG_reverse_map.end() ) LOG.error("CHROM value " + tmp_string + " is not defined on contig dictionary.",0); int32_t chrom = (int32_t)entry_header.CONTIG_reverse_map[tmp_string]; memcpy(&out_vector[vector_pos], &chrom, sizeof(chrom)); vector_pos += sizeof(chrom); get_POS_binary(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_rlen(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_QUAL_binary(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_ID_binary(tmp_vector); out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end()); tmp_vector.resize(0); get_ALLELES_binary(tmp_vector); out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end()); tmp_vector.resize(0); get_FILTER_vector(filter_vector); if (filter_vector.empty()) make_typed_int_vector(tmp_vector, filter_vector); else { vector index_vector; for(unsigned int ui=0; ui max_depth)) include_genotype[ui] = false; } } } // Filter specific genotypes by quality void vcf_entry::filter_genotypes_by_quality(double min_genotype_quality) { if (fully_parsed == false) parse_full_entry(); if (GQ_idx != -1) { // Have quality info double quality; for (unsigned int ui=0; ui &filter_flags_to_remove, bool remove_all) { if (fully_parsed == false) parse_full_entry(); vector GFILTERs; if (FT_idx != -1) { // Have GFilter info for (unsigned int ui=0; ui