/*
 * Member-function definitions for the `header` class: handling of "##" meta
 * lines and of the "#CHROM ..." column line, plus small string/number helpers.
 *
 * NOTE(review): this copy looks damaged and does not compile as stored:
 *   - line breaks are collapsed, so every `//` comment is followed by code on
 *     the same physical line;
 *   - text that presumably sat between '<' and '>' is missing (for example
 *     `vector tokens;` and for-loops that stop at `ui`). Most of
 *     add_INFO_descriptor, everything between it and reparse(), the loop in
 *     reparse(), and the signature of tokenize() are gone.
 *   The code is left untouched below; restore it from a clean copy before
 *   relying on it.
 *
 * header::header()
 *   Sets has_contigs, has_file_format, has_genotypes, has_header and has_idx
 *   to false, and contig_index and N_indv to 0.
 *
 * void header::parse_meta(const string &line, unsigned int &line_index)
 *   Appends `line` to `lines`, then dispatches on its prefix:
 *     "##fileformat=" : sets has_file_format; calls LOG.error unless the text
 *                       after the prefix is VCFv4.0, VCFv4.1 or VCFv4.2.
 *     "##INFO="       : passes line.substr(8, ...) to add_INFO_descriptor and
 *                       adds its return value to line_index.
 *     "##FILTER="     : same with line.substr(10, ...) and add_FILTER_descriptor.
 *     "##FORMAT="     : same with line.substr(10, ...) and add_FORMAT_descriptor.
 *     "##contig="     : passes line.substr(10, ...) and contig_index to
 *                       add_CONTIG_descriptor, increments contig_index and
 *                       sets has_contigs.
 *     anything else   : stores the text before the first '=' in Field and the
 *                       text after it in Other of a Field_description that is
 *                       appended to parsed_lines.
 *   The substr start offsets skip one character more than the compared prefix
 *   (presumably the opening '<' of the descriptor - TODO confirm).
 *   NOTE(review): the three substr(10, line.size()-8) calls ask for more
 *   characters than remain; std::string::substr clamps the count, so they
 *   still read to the end of the line. A line shorter than the start offset
 *   would make substr throw std::out_of_range.
 *   NOTE(review): in the fallback branch, if no '=' is present `found` is
 *   npos, `found+1` wraps to 0, and both Field and Other receive the whole line.
 *
 * void header::parse_header(const string &line)
 *   Calls LOG.warning if has_header is already true, then sets it. Resets
 *   has_genotypes to false and splits `line` on tab characters. Fields 0-8 are
 *   compared with #CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO and FORMAT; a
 *   mismatch only produces a warning. A matching FORMAT field sets
 *   has_genotypes. Every later field is appended to `indv` and counted; the
 *   count is stored in N_indv. Warns when has_genotypes is set but N_indv is 0.
 *   NOTE(review): `if (count <= 8)` inside `default` cannot be reached because
 *   cases 0-8 handle those values.
 *   NOTE(review): the loop tests eof() before each getline, so an empty line
 *   or a trailing tab is still processed once as an empty field.
 *   NOTE(review): `indv` is not cleared here - confirm whether a repeated
 *   header line is expected to append to it.
 */
/* * header.cpp * * Created on: Apr 29, 2013 * Author: amarcketta */ #include "header.h" header::header() { has_contigs = false; has_file_format = false; has_genotypes = false; has_header = false; has_idx = false; contig_index = 0; N_indv = 0; } void header::parse_meta(const string &line, unsigned int &line_index) { lines.push_back(line); if (line.compare(0,13,"##fileformat=")==0) { has_file_format = true; string version = line.substr(13); if ((version != "VCFv4.0") && (version != "VCFv4.1") && (version != "VCFv4.2")) LOG.error("VCF version must be v4.0, v4.1 or v4.2:\nYou are using version " + version); } else if (line.compare(0,7,"##INFO=")==0) { // Found an INFO descriptor line_index += add_INFO_descriptor(line.substr(8, line.size()-8), line_index); } else if (line.compare(0,9,"##FILTER=")==0) { // Found a FILTER descriptor line_index += add_FILTER_descriptor(line.substr(10, line.size()-8), line_index); } else if (line.compare(0,9,"##FORMAT=")==0) { // Found a genotype filter descriptor line_index += add_FORMAT_descriptor(line.substr(10, line.size()-8), line_index); } else if (line.compare(0,9,"##contig=")==0) { // Found a contig descriptor add_CONTIG_descriptor(line.substr(10, line.size()-8), contig_index); contig_index++; has_contigs = true; } else { Field_description I; size_t found = line.find_first_of("="); I.Field = line.substr(0,found); I.Other = line.substr(found+1); parsed_lines.push_back(I); } } void header::parse_header(const string &line) { // #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... 
/*
 * The next physical line opens with a stray ')' that is the tail of the
 * collapsed "#CHROM ..." comment, followed by the body of parse_header
 * (documented at the top of the file).
 *
 * int header::add_INFO_descriptor(const string &in, int index)
 *   Visible part only: takes the text of `in` before its last '>', creates a
 *   Field_description with Field = "INFO", splits the text on ',' with
 *   tokenize(), and calls LOG.error when fewer than 4 tokens result. The rest
 *   of the body is missing from this copy, so its return value cannot be
 *   described from here.
 *
 * NOTE(review): after `for (unsigned int ui=0; ui` the text is only
 * fragments. The remaining descriptor functions called from parse_meta and
 * at least one function ending in `lines.push_back(new_line.str());` are not
 * recoverable from this copy.
 *
 * void header::reparse()
 *   Visible part only: resets index, has_idx and contig_index, copies `lines`
 *   into old_lines, empties `lines`, clears the INFO/FILTER/FORMAT/CONTIG maps
 *   and their reverse maps, and registers "ID=PASS,Description=PASS" through
 *   add_FILTER_descriptor. The loop that follows is cut off after `ui`.
 */
) if (has_header == true) LOG.warning("Multiple Header lines."); has_header = true; istringstream header(line); int count = 0; string tmp_str; unsigned int N_header_indv = 0; has_genotypes = false; while (!header.eof()) { getline(header, tmp_str, '\t'); switch (count) { case 0: if (tmp_str != "#CHROM") LOG.warning("First Header entry should be #CHROM: " + tmp_str); break; case 1: if (tmp_str != "POS") LOG.warning("Second Header entry should be POS: " + tmp_str); break; case 2: if (tmp_str != "ID") LOG.warning("Third Header entry should be ID: " + tmp_str); break; case 3: if (tmp_str != "REF") LOG.warning("Fourth Header entry should be REF: " + tmp_str); break; case 4: if (tmp_str != "ALT") LOG.warning("Fifth Header entry should be ALT: " + tmp_str); break; case 5: if (tmp_str != "QUAL") LOG.warning("Sixth Header entry should be QUAL: " + tmp_str); break; case 6: if (tmp_str != "FILTER") LOG.warning("Seventh Header entry should be FILTER: " + tmp_str); break; case 7: if (tmp_str != "INFO") LOG.warning("Eighth Header entry should be INFO: " + tmp_str); break; case 8: if (tmp_str != "FORMAT") LOG.warning("Ninth Header entry should be FORMAT: " + tmp_str); else has_genotypes = true; break; default: { if (count <= 8) LOG.error("Incorrectly formatted header."); indv.push_back(tmp_str); N_header_indv++; } break; } count++; } N_indv = N_header_indv; if ((has_genotypes == true ) && (N_indv == 0)) LOG.warning("FORMAT field without genotypes?"); } int header::add_INFO_descriptor(const string &in, int index) { size_t found_end=in.find_last_of(">"); string details = in.substr(0, found_end); Field_description I; I.Field = "INFO"; vector tokens; tokenize(details, ',', tokens); if (tokens.size() < 4) LOG.error("Expected at least 4 parts in INFO definition: " + in); vector entry; for (unsigned int ui=0; ui entry; for (unsigned int ui=0; ui entry; for (unsigned int ui=0; ui entry; for (unsigned int ui=0; ui"; } lines.push_back(new_line.str()); } } void header::reparse() { unsigned 
/*
 * The next physical line continues reparse() (see above) and then holds the
 * helpers below.
 *
 * tokenize (signature missing from this copy; called above as
 * tokenize(text, delimiter, out))
 *   Empties `out`, then appends each piece of `in` obtained by calling getline
 *   with delimiter `token`.
 *
 * void header::split(const string &text, char sep, vector &tokens)
 *   Splits `text` on `sep`. The first tokens.size() fields overwrite the
 *   existing elements in place and further fields are appended. The text after
 *   the last separator is always stored as the final field. `tokens` is never
 *   shrunk, so elements beyond the number of fields keep their old values.
 *   NOTE(review): `start` and `end` are int while string::find returns size_t;
 *   the npos test relies on integer conversion - confirm inputs stay well
 *   below INT_MAX characters.
 *
 * string header::int2str(const int in, const int missing_value)
 *   Returns "." when in == missing_value; otherwise the decimal text produced
 *   by streaming `in` into a function-local static ostringstream that is reset
 *   on every call.
 *   NOTE(review): the static stream is shared by all callers - confirm this is
 *   never called concurrently.
 *
 * int header::str2int(const string &in, const int missing_value)
 *   Returns missing_value for an empty string or "."; otherwise atoi(in),
 *   which reports no error for non-numeric text.
 *
 * double header::str2double(const string &in, const double missing_value)
 *   Same contract as str2int, using atof.
 *
 * string header::double2str(const double in, const double missing_value)
 *   Returns "." when in == missing_value (exact floating-point comparison);
 *   otherwise the text produced by streaming `in` with default formatting into
 *   a function-local static ostringstream (same sharing caveat as int2str).
 */
int index = 0; has_idx = false; contig_index = 0; vector old_lines(lines.size(),""); copy(lines.begin(), lines.end(), old_lines.begin()); lines.resize(0); INFO_map.clear(); INFO_reverse_map.clear(); FILTER_map.clear(); FILTER_reverse_map.clear(); FORMAT_map.clear(); FORMAT_reverse_map.clear(); CONTIG_map.clear(); CONTIG_reverse_map.clear(); index += add_FILTER_descriptor("ID=PASS,Description=PASS", index); for (unsigned int ui=0; ui &out) { out.resize(0); istringstream ss(in); string tmp; while( getline(ss, tmp, token) ) { out.push_back(tmp); } } void header::split(const string &text, char sep, vector &tokens) { int start = 0, end = 0, idx = 0, max = tokens.size(); while ((end = text.find(sep, start)) != string::npos) { if (idx < max) tokens[idx] = text.substr(start, end - start); else tokens.push_back(text.substr(start, end - start)); start = end + 1; idx++; } if (idx < max) tokens[idx] = text.substr(start); else tokens.push_back(text.substr(start)); } string header::int2str(const int in, const int missing_value) { if (in == missing_value) return "."; else { static ostringstream out; out.str(""); out.clear(); out << in; return out.str(); } } int header::str2int(const string &in, const int missing_value) { if ((in.size() == 0) || (in == ".")) return missing_value; else return atoi(in.c_str()); } double header::str2double(const string &in, const double missing_value) { if ((in.size() == 0) || (in == ".")) return missing_value; else return atof(in.c_str()); } string header::double2str(const double in, const double missing_value) { if (in == missing_value) return "."; else { static ostringstream out; out.str(""); out.clear(); out << in; return out.str(); } }