/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
*
* ===========================================================================
*
*/

/*==========================================================================
 * VDB Alignment types, functions and tables
 */
version 1;

include 'vdb/vdb.vschema';
include 'ncbi/seq.vschema';
include 'ncbi/sra.vschema';
include 'ncbi/stats.vschema';
include 'align/seq.vschema';
include 'align/qstat.vschema';
include 'sra/abi.vschema';
include 'align/mate-cache.vschema';


/*--------------------------------------------------------------------------
 * data types
 */

/* ploidy
 *  the number of sets of chromosomes in a cell
 */
typedef U32 NCBI:align:ploidy;

/* ro_type
 *  the type of event causing ref-offset
 *
 *  declared as an alias of U8; the constants below enumerate the
 *  recognized event kinds
 *
 *  each trailing "//" remark must end at its own line break, otherwise it
 *  comments out the declarations that follow it
 */
typedef U8 NCBI:align:ro_type;
const NCBI:align:ro_type NCBI:align:ro_normal            = 0; // normal ref-offset
const NCBI:align:ro_type NCBI:align:ro_soft_clip         = 1; // soft-clipping
const NCBI:align:ro_type NCBI:align:ro_intron_plus       = 2; // intron on positive strand
const NCBI:align:ro_type NCBI:align:ro_intron_minus      = 3; // intron on negative strand
const NCBI:align:ro_type NCBI:align:ro_intron_unknown    = 4; // intron strand not specified
const NCBI:align:ro_type NCBI:align:ro_complete_genomics = 5; // NOTE(review): meaning is not documented in this file


/*--------------------------------------------------------------------------
 * functions
 */

/* cigar
 *  construct "cigar" alignment string or length arrays
 *
 *  "ctype" [ CONST ] - select variant of format
 *      0 => both matches and mismatches represented as M
 *      1 => matches represented as '=' mismatches as 'X'
 *
 *  "has_mismatch" [ DATA ] - a boolean for each base in aligned sequence
 *      where a value of false means the base aligned to the reference
 *
 *  "has_ref_offset" [ DATA ] - a boolean for each base in the aligned sequence
 *      where a value of true means there is a corresponding offset to position on reference
 *
 *  "ref_offset" [ DATA ] - a packed sequence of signed offsets to aligned position
 *      one entry for every true in "has_ref_offset"
 *
 *  "read_len" [ DATA ] - v2: elem_count defines PLOIDY and
values are an actual length of reads in spot
 *
 *  NOTE(review): version #1 below declares an optional "ref_len" and no
 *  "read_len"; only version #2 accepts "read_len".
 */
extern function
ascii NCBI:align:cigar #1 < U8 ctype > ( bool has_mismatch, bool has_ref_offset,
    I32 ref_offset, * INSDC:coord:len ref_len )
    = ALIGN:cigar;

/* history:
 *  2.1 - added "ref_offset_type" optional parameter
 *  NB - reverting to 2.0 due to linker bug in older code
 *
 *  version 2 is type-parameterized ( "T" selects the result type ); both
 *  "ref_len" and "ref_offset_type" follow the '*' and are therefore optional
 */
extern function < type T >
T NCBI:align:cigar #2.0 < U8 ctype > ( bool has_mismatch, bool has_ref_offset,
    I32 ref_offset, INSDC:coord:len read_len, * INSDC:coord:len ref_len,
    NCBI:align:ro_type ref_offset_type )
    = ALIGN:cigar_2;

/* edit_distance
 *  NOTE(review): presumably the edit distance between the alignment and the
 *  reference; the implementation is external to this file - confirm
 *
 *  version 1 takes "has_mismatch", "has_ref_offset" and "ref_offset"
 *  version 2 adds "ref_len" and an optional "read_len"
 *  version 3 takes "ref_offset_type" and "read_len" instead of "ref_len"
 */
extern function
U32 NCBI:align:edit_distance #1 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset );

extern function
U32 NCBI:align:edit_distance #2 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset,
    INSDC:coord:len ref_len, *INSDC:coord:len read_len)
    = NCBI:align:edit_distance_2;

extern function
U32 NCBI:align:edit_distance #3 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset,
    NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len)
    = NCBI:align:edit_distance_3;

/* rna_orientation
 *  reads column REF_OFFSET_TYPE
 *  returns '+' if has:
 *      at least one NCBI:align:ro_intron_plus
 *      none of NCBI:align:ro_intron_minus
 *  returns '-' if has:
 *      at least one NCBI:align:ro_intron_minus
 *      none of NCBI:align:ro_intron_plus
 *  returns empty string otherwise
 */
extern function
ascii NCBI:align:rna_orientation #1 ( NCBI:align:ro_type ref_offset_type );

/* project_from_sequence
 *  projects column from SEQUENCE
 *
 *  "T" [ TYPE ]
 *
 *  "col" [ CONST ]
 *  "use_read_len" [ CONST ] whether subset by read_len or by read_id only
 *      NOTE(review): "use_read_len" does not appear in the declaration
 *      below, which takes only the "col" constant - confirm
 *
 *  "seq_spot_id" [ DATA ]
 *
 *  "seq_read_id" [ DATA ]
 */
extern function < type T >
T NCBI:align:project_from_sequence #1 < ascii col> ( I64 seq_spot_id, INSDC:coord:one seq_read_id )
    = ALIGN:project_from_sequence;

/* align_restore_read
 *  restores read by applying alignment-based difference to ref_read
 *
 *  "ref_read" [ DATA ]
 *
 *  "has_mismatch" [ DATA ] and "mismatch" [ DATA ]
 *
 *  "has_ref_offset" [
DATA ] and "ref_offset" [ DATA ]
 *
 *  "read_len" [ DATA, OPTIONAL ] - follows the '*' in the declaration
 */
extern function
INSDC:4na:bin NCBI:align:align_restore_read #1 ( INSDC:4na:bin ref_read, bool has_mismatch,
    INSDC:4na:bin mismatch, bool has_ref_offset, I32 ref_offset * INSDC:coord:len read_len)
    = ALIGN:align_restore_read;

/* raw_restore_read
 *  restores read by applying alignment-based difference to align_read
 *
 *  "align_read" [ DATA ]
 *
 *  "ref_orientation" [ DATA ]
 */
extern function
INSDC:4na:bin NCBI:align:raw_restore_read #1 ( INSDC:4na:bin align_read, bool ref_orientation )
    = ALIGN:raw_restore_read;

/* raw_restore_qual
 *  restores quality by applying alignment-based difference to align_qual
 *
 *  "align_qual" [ DATA ]
 *
 *  "ref_orientation" [ DATA ]
 */
extern function
INSDC:quality:phred NCBI:align:raw_restore_qual #1 ( INSDC:quality:phred align_qual, bool ref_orientation );

/* ref_sub_select
 *  projects reference from sequence
 *
 *  "id" [ DATA ]
 *
 *  "start" [ DATA ] and "len" [ DATA ]
 *
 *  "ref_ploidy" [ DATA, OPTIONAL ]
 */
extern function
INSDC:4na:bin NCBI:align:ref_sub_select #1 ( I64 id, INSDC:coord:zero start,
    INSDC:coord:len len * U32 ref_ploidy)
    = ALIGN:ref_sub_select;

/* ref_restore_read
 *  restores read from central storage
 *
 *  "cmp_rd" [ DATA ]
 *
 *  "seq_id" [ DATA ]
 *
 *  "seq_start" [ DATA ] and "seq_len" [ DATA ]
 */
extern function
INSDC:4na:bin NCBI:align:ref_restore_read #1 ( INSDC:4na:bin cmp_rd, ascii seq_id,
    INSDC:coord:one seq_start, INSDC:coord:len seq_len)
    = ALIGN:ref_restore_read;

/* seq_restore_read
 *  projects read from align_deflate table to SEQUENCE
 *
 *  "cmp_rd" [ DATA ]
 *
 *  "align_id" [ DATA ]
 *
 *  "read_len" [ DATA ]
 *
 *  "rd_type" [ DATA ]
 */
extern function
INSDC:4na:bin NCBI:align:seq_restore_read #1 ( INSDC:4na:bin cmp_rd, I64 align_id,
    INSDC:coord:len read_len, INSDC:SRA:xread_type rd_type )
    = ALIGN:seq_restore_read;

/* seq_restore_linkage_group
 *  projects LINKAGE_GROUP from PRIMARY_ALIGNMENT table to SEQUENCE
 *
 *  "cmp_linkage_group" [ DATA ]
 *
 *  "align_id" [ DATA ]
 */
extern function
ascii NCBI:align:seq_restore_linkage_group #1 ( ascii cmp_linkage_group, I64 align_id )
    = ALIGN:seq_restore_linkage_group;

/* generate_has_mismatch
 *  generates has mismatch by doing actual compare of reference and subject,
 *  ref_offsets move comparisons reference-wise
 *
 *  "reference" [ DATA ]
 *
 *  "subject" [ DATA ]
 *
 *  "has_ref_offset" [ DATA ]
 *
 *  "ref_offset" [ DATA ]
 */
extern function
bool NCBI:align:generate_has_mismatch #1 ( INSDC:4na:bin reference,
    INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset)
    = ALIGN:generate_has_mismatch;

/* generate_mismatch
 *  takes the same inputs as generate_has_mismatch but yields INSDC:4na:bin
 *  NOTE(review): presumably the mismatching bases themselves - confirm
 *
 *  "reference" [ DATA ]
 *
 *  "subject" [ DATA ]
 *
 *  "has_ref_offset" [ DATA ]
 *
 *  "ref_offset" [ DATA ]
 */
extern function
INSDC:4na:bin NCBI:align:generate_mismatch #1 ( INSDC:4na:bin reference,
    INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset )
    = ALIGN:generate_mismatch;

/* ref_pos
 *  retrieves the alignment's positions on the reference
 *  one per PLOIDY
 *
 *  "ref_id" [ DATA ]
 *
 *  "ref_start" [ DATA ] - one per PLOIDY
 */
extern function
INSDC:coord:zero NCBI:align:ref_pos #1 ( I64 ref_id, INSDC:coord:zero ref_start );

/* ref_name
 *  retrieve the name from the reference
 *
 *  "ref_id" [ DATA ]
 */
extern function
ascii NCBI:align:ref_name #1 ( I64 ref_id );

/* ref_seq_id
 *  retrieve the seq_id from the reference
 *
 *  "ref_id" [ DATA ]
 */
extern function
ascii NCBI:align:ref_seq_id #1 ( I64 ref_id );

/* local_ref_id
 *  convert global ref_start into ref_id
 */
extern function
I64 NCBI:align:local_ref_id #1 ( U64 global_ref_start );

/* local_ref_start
 *  convert global ref_start into a zero-based ref_start
 *  ( companion of local_ref_id; yields INSDC:coord:zero rather than an id )
 */
extern function
INSDC:coord:zero NCBI:align:local_ref_start #1 ( U64 global_ref_start );

/* not_my_row
 *  removes current row_id from the list
 */
extern function
I64 NCBI:align:not_my_row #1 ( I64 list );

/* template_len
 *  compute template length, i.e.
the distance from the left-most to the
 *  right-most matching reference position
 */
extern function
I32 NCBI:align:template_len #1 ( INSDC:coord:zero pos, INSDC:coord:zero mate_pos,
    INSDC:coord:len reflen, INSDC:coord:len mate_reflen,
    ascii ref_name, ascii mate_ref_name, INSDC:coord:one read_id);

/* get_sam_flags
 *  compute the flags that would be in a SAM file
 *
 *  version 1 works with full Alignment databases.
 *  version 2 works with Alignment databases that have had SEQUENCE removed.
 *
 *  the two versions differ in their first parameter ( "read_len" vs
 *  "mate_id" ); "filter" is optional in both
 */
extern function
U32 NCBI:align:get_sam_flags #1 ( INSDC:coord:len read_len, INSDC:coord:one read_id,
    I32 template_len, bool strand, bool mate_strand, bool is_secondary,
    * INSDC:SRA:read_filter filter);

extern function
U32 NCBI:align:get_sam_flags #2 ( I64 mate_id, INSDC:coord:one read_id,
    I32 template_len, bool strand, bool mate_strand, bool is_secondary,
    * INSDC:SRA:read_filter filter)
    = NCBI:align:get_sam_flags_2;

/* get_left_soft_clip
 *  compute the length of the soft clip on the left edge of the alignment
 *
 *  version 2 additionally takes "read_len"
 */
extern function
INSDC:coord:len NCBI:align:get_left_soft_clip #1 ( bool has_ref_offset, I32 ref_offset );

extern function
INSDC:coord:len NCBI:align:get_left_soft_clip #2 ( bool has_ref_offset, I32 ref_offset,
    INSDC:coord:len read_len )
    = NCBI:align:get_left_soft_clip_2;

/* get_right_soft_clip
 *  compute the length of the soft clip on the right edge of the alignment
 *
 *  versions differ only in the inputs they take:
 *    1 - "has_mismatch", "left_clip", optional "has_ref_offset"
 *    2 - "has_mismatch", "left_clip", "has_ref_offset", "ref_offset"
 *    3 - "has_ref_offset", "ref_offset", "ref_len"
 *    4 - "has_ref_offset", "ref_offset", "read_len", "ref_len"
 *    5 - "has_ref_offset", "ref_offset", "ref_offset_type", "read_len"
 */
extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #1 ( bool has_mismatch,
    INSDC:coord:len left_clip * bool has_ref_offset );

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #2 ( bool has_mismatch,
    INSDC:coord:len left_clip, bool has_ref_offset, I32 ref_offset )
    = NCBI:align:get_right_soft_clip_2;

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #3 ( bool has_ref_offset, I32 ref_offset,
    INSDC:coord:len ref_len )
    = NCBI:align:get_right_soft_clip_3;

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #4 ( bool has_ref_offset, I32 ref_offset,
    INSDC:coord:len read_len, INSDC:coord:len ref_len )
    = NCBI:align:get_right_soft_clip_4;

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #5 ( bool has_ref_offset, I32 ref_offset,
    NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len )
    = NCBI:align:get_right_soft_clip_5;

/* get_clipped_cigar
 *  compute the CIGAR string with the soft clipping removed
 *
 *  version 2 is type-parameterized and also takes "cigar_len"
 */
extern function
ascii NCBI:align:get_clipped_cigar #1 ( ascii cigar );

extern function < type T >
T NCBI:align:get_clipped_cigar #2 ( ascii cigar, INSDC:coord:len cigar_len )
    = NCBI:align:get_clipped_cigar_2;

/* get_clipped_ref_offset
 *  compute the reference offsets with the soft clipping removed
 */
extern function
I32 NCBI:align:get_clipped_ref_offset #1 ( bool has_ref_offset, I32 ref_offset );

/* clip
 *  remove the soft clipped bases (or qualities, or has_mismatch, et cetera)
 *  works with things whose lengths are the same as SEQUENCE.READ
 *
 *  version 2 additionally takes "read_len"
 */
extern function < type T >
T NCBI:align:clip #1 ( T object, INSDC:coord:len left_clip, INSDC:coord:len right_clip);

extern function < type T >
T NCBI:align:clip #2 ( T object, INSDC:coord:len read_len,
    INSDC:coord:len left_clip, INSDC:coord:len right_clip)
    = NCBI:align:clip_2;

/* get_ref_len
 *  compute reference length from alignment information
 *
 *  the second declaration is a separately named function
 *  ( "get_ref_len_2", version 2 ) that takes no "right_clip"
 */
extern function
INSDC:coord:len NCBI:align:get_ref_len #1 ( bool has_ref_offset, I32 ref_offset,
    * INSDC:coord:len right_clip );

extern function
INSDC:coord:len NCBI:align:get_ref_len_2 #2 ( bool has_ref_offset, I32 ref_offset)
    = NCBI:align:get_ref_len_2;

/* get_mismatch_read
 *  generate the READ with matching bases replaced with '='
 */
extern function
ascii NCBI:align:get_mismatch_read #1 ( bool has_mismatch, INSDC:dna:text mismatch );

/* get_ref_mismatch
 *  shows mismatch positions in reference space
 */
function
bool NCBI:align:get_ref_mismatch #1 ( bool has_mismatch, bool has_ref_offset,
    I32 ref_offset, INSDC:coord:len ref_len );

/* get_ref_insert
 *  shows positions of inserts in reference space
 *  i.e.
an insert occurs between each pair of true's
 */
function
bool NCBI:align:get_ref_insert #1 ( bool has_mismatch, bool has_ref_offset,
    I32 ref_offset, INSDC:coord:len ref_len );

/* get_ref_delete
 *  shows positions of deleted bases in reference space
 */
function
bool NCBI:align:get_ref_delete #1 ( bool has_mismatch, bool has_ref_offset,
    I32 ref_offset, INSDC:coord:len ref_len );

/* compress_quality
 * decompress_quality
 *  a pair of quality transforms driven by a per-element "preserved" flag;
 *  decompress_quality also takes a constant "restored_qual_value"
 *  NOTE(review): presumably non-preserved qualities are dropped on
 *  compression and replaced by "restored_qual_value" on decompression -
 *  confirm against the implementation
 */
extern function
INSDC:quality:phred NCBI:align:compress_quality #1 ( INSDC:quality:phred quality, bool preserved );

extern function
INSDC:quality:phred NCBI:align:decompress_quality #1 < INSDC:quality:phred restored_qual_value >
    ( INSDC:quality:phred cmp_quality, bool preserved );

/* make_read_start
 *  takes "read_len" and yields zero-based coordinates
 *  NOTE(review): presumably the start offset of each read derived from the
 *  lengths - confirm
 */
extern function
INSDC:coord:zero NCBI:align:make_read_start #1 (INSDC:coord:len read_len);

/* make_cmp_read_desc
 *  determines whether an element of "operand" is aligned
 *  by looking at the corresponding element of "align_id"
 *
 *  zeros out unaligned elements of operand, unless "invert" is true,
 *  in which case it zeros out aligned elements.
 *
 *  NOTE(review): the declaration that follows this comment takes no
 *  "invert" constant - confirm whether the description is stale
 *
 *  "T" [ TYPE ] - type of operand
 *
 *  "invert" [ CONST ] - if true, invert the logic of which elements
 *  to zero out.
*
*  "operand" [ DATA ] - uncompressed data
*
*  "align_id" [ DATA ] - indication of alignment
*/
extern function < type T >
T NCBI:align:make_cmp_read_desc #1 (T operand, I64 align_id);

/* seq_construct_read
 *  assembles read from aligned and unaligned parts
 */
extern function < type T >
T NCBI:align:seq_construct_read #1 ( T aligned, INSDC:coord:len aligned_read_len,
    T unaligned, INSDC:coord:len unaligned_read_len );

/* get_mate_align_id
 *  takes a "spot_id" and yields an I64
 *  NOTE(review): presumably the alignment row id of the mate - confirm
 */
extern function
I64 NCBI:align:get_mate_align_id #1 ( I64 spot_id );


/*--------------------------------------------------------------------------
 * tables
 *
 *  NB - every "//" remark inside the tables must end at a line break;
 *  when lines are joined the remark swallows the productions after it
 */

/* ref_block_cmn
 *  common implementation ancestor for reference block
 *
 *  relies on productions supplied by descendants: out_ref_id,
 *  out_ref_start, out_global_ref_start, out_has_ref_offset, out_ref_offset
 */
table NCBI:align:tbl:ref_block_cmn #1.0.0
{
    // name of the reference table: taken from metadata, else 'REFERENCE'
    readonly column ascii REF_TABLE
        = < ascii > meta:read < "CONFIG/REF_TABLE" > ()
        | < ascii > echo < 'REFERENCE' > ();

    // REF_ID is rowid in Reference Table REF_TABLE
    extern column I64 REF_ID = out_ref_id;

    // this is a redefinition of REF_START
    // REF_START is the offset within REFERENCE.READ
    extern column INSDC:coord:zero REF_START = out_ref_start;

    // global REF_START
    extern column U64 GLOBAL_REF_START = out_global_ref_start;

    // REF_LEN the length of a read projection on reference
    INSDC:coord:len out_ref_len_internal
        = NCBI:align:get_ref_len_2 ( out_has_ref_offset, out_ref_offset )
        | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset );
    // the stored value wins; otherwise fall back to the computed length
    INSDC:coord:len out_ref_len
        = .REF_LEN
     /* | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset, out_right_clip ) */
        | out_ref_len_internal;
    physical column < INSDC:coord:len > izip_encoding .REF_LEN = REF_LEN;
    extern column INSDC:coord:len REF_LEN = out_ref_len;

    // REF_ORIENTATION - relative orientation of original raw read to the reference
    // false -> same orientation, true -> opposite orientation
    // alignment and reference are always in the same orientation
    extern column bool_encoding REF_ORIENTATION;

    // REF_PLOIDY
    extern column < U32 > izip_encoding REF_PLOIDY;

    /* REF_POS
     *  per PLOIDY
     */
    readonly column INSDC:coord:zero REF_POS
        = NCBI:align:ref_pos ( out_ref_id, out_ref_start );

    /* REF_NAME
     *  the name of the reference
     */
    readonly column ascii REF_NAME
        = NCBI:align:ref_name ( out_ref_id );

    /* REF_SEQ_ID
     *  falls back to an empty string
     */
    readonly column ascii REF_SEQ_ID
        = NCBI:align:ref_seq_id ( out_ref_id )
        | < ascii > echo < '' > ();
};

/* global_ref_block
 *  reference block favoring global ref-start
 *
 *  stores .GLOBAL_REF_START and derives out_ref_id / out_ref_start from it
 */
table NCBI:align:tbl:global_ref_block #1.0.0 = NCBI:align:tbl:ref_block_cmn #1.0.0
{
    U64 out_global_ref_start = .GLOBAL_REF_START;
    physical < U64 > izip_encoding .GLOBAL_REF_START = GLOBAL_REF_START;

    I64 out_ref_id = NCBI:align:local_ref_id ( .GLOBAL_REF_START );
    INSDC:coord:zero out_ref_start = NCBI:align:local_ref_start ( .GLOBAL_REF_START );
};

/* local_ref_block
 *  reference block favoring local ref-start
 *
 *  stores .REF_ID and .REF_START directly
 */
table NCBI:align:tbl:local_ref_block #1.0.0 = NCBI:align:tbl:ref_block_cmn #1.0.0
{
    I64 out_ref_id = .REF_ID;
    physical < I64 > izip_encoding .REF_ID = REF_ID;

    INSDC:coord:zero out_ref_start = .REF_START;
    physical < INSDC:coord:zero > izip_encoding .REF_START = REF_START;
};


/* align_cmn
 *  common interface and implementation for alignment object
 *
 *  relies on productions supplied by descendants, among them:
 *  out_is_secondary, out_has_mismatch, out_mismatch_dna_text, out_4na_bin,
 *  out_raw_read, out_mate_align_id, out_mate_ref_id, out_mate_ref_pos,
 *  out_mate_ref_len, out_mate_ref_orientation, out_template_len
 *
 * History:
 *  2.1 - added REF_OFFSET_TYPE and RNA_ORIENTATION columns
 *        updated all cigar calculations
 */
table NCBI:align:tbl:align_cmn #2.1
    = NCBI:tbl:base_space_common #1.0.3
    , NCBI:SRA:tbl:stats #1.2.0
    , NCBI:align:tbl:ref_block_cmn #1.0.0
{
    bool is_secondary = out_is_secondary;

    // temporary key
    extern column < U32 > izip_encoding TMP_KEY_ID;

    // explicit element type, as for every other zip_encoding column here
    extern column < ascii > zip_encoding LINKAGE_GROUP;

    /* Raw Sequence Block */

    // Points to sequence table, which may contain more information about the raw sequence.
    // row id in SEQUENCE table; 0 if not linked
    extern column < I64 > izip_encoding SEQ_SPOT_ID;

    // read number in SEQUENCE table; { SEQ_SPOT_ID, SEQ_READ_ID } is the unique link to the sequence
    extern column < INSDC:coord:one > izip_encoding SEQ_READ_ID;

    /* Soft-Clipped data block */
    readonly column INSDC:coord:len LEFT_SOFT_CLIP
        = NCBI:align:get_left_soft_clip ( HAS_REF_OFFSET, REF_OFFSET, out_read_len );

    // newest version first; older versions are fallbacks
    INSDC:coord:len out_right_clip
        = NCBI:align:get_right_soft_clip #5 ( out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len )
        | NCBI:align:get_right_soft_clip #4 ( out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len )
        | NCBI:align:get_right_soft_clip #3 ( out_has_ref_offset, out_ref_offset, out_ref_len )
        | NCBI:align:get_right_soft_clip #2 ( out_has_mismatch, LEFT_SOFT_CLIP, out_has_ref_offset, out_ref_offset );
    readonly column INSDC:coord:len RIGHT_SOFT_CLIP = out_right_clip;

    readonly column ascii CLIPPED_CIGAR_LONG
        = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN );
    readonly column INSDC:coord:len CLIPPED_CIGAR_LONG_LEN
        = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN );
    readonly column ascii CLIPPED_CIGAR_SHORT
        = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN );
    readonly column INSDC:coord:len CLIPPED_CIGAR_SHORT_LEN
        = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN );

    bool out_clipped_has_mismatch
        = < bool > NCBI:align:clip (out_has_mismatch, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);
    readonly column ascii CLIPPED_HAS_MISMATCH
        = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_clipped_has_mismatch );
    readonly column bool CLIPPED_HAS_MISMATCH = out_clipped_has_mismatch;

    bool out_clipped_has_ref_offset
        = < bool > NCBI:align:clip (HAS_REF_OFFSET, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);
    readonly column ascii CLIPPED_HAS_REF_OFFSET
        = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_clipped_has_ref_offset );
    readonly column bool CLIPPED_HAS_REF_OFFSET = out_clipped_has_ref_offset;

    // TBD cannot be computed right unless HAS_MISMATCH and READ_LEN are used
    readonly column INSDC:dna:text CLIPPED_MISMATCH
        = < INSDC:dna:text > NCBI:align:clip #1 ( out_mismatch_dna_text, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);
    readonly column I32 CLIPPED_REF_OFFSET
        = NCBI:align:get_clipped_ref_offset ( HAS_REF_OFFSET, REF_OFFSET );

    readonly column INSDC:quality:phred CLIPPED_QUALITY
        = < INSDC:quality:phred > NCBI:align:clip (out_qual_phred, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);
    readonly column INSDC:dna:text CLIPPED_READ
        = < INSDC:dna:text > NCBI:align:clip (READ, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);

    /* Sequence Block */
    extern column < NCBI:align:ploidy > izip_encoding PLOIDY;

    // Number of reads per spot; corresponds to the number of alternative alignments
    // all alternative alignments are computed against the same reference region
    U32 out_nreads = .PLOIDY | < U32 > echo < 1 > ();

    // READ_START and READ_LEN are position and length of the sequence
    physical < INSDC:coord:zero > izip_encoding .READ_START = READ_START;
    INSDC:coord:zero out_read_start = .READ_START | < INSDC:coord:zero > echo < 0 > ();
    physical < INSDC:coord:len > izip_encoding .READ_LEN = READ_LEN;
    INSDC:coord:len align_spot_len = ( INSDC:coord:len ) row_len ( out_has_ref_offset );
    INSDC:coord:len out_read_len = .READ_LEN | align_spot_len;

    // associated qualities
    extern column INSDC:quality:phred CMP_QUALITY = .CMP_QUALITY | out_cmp_quality;
    physical column < INSDC:quality:phred > zip_encoding .CMP_QUALITY = CMP_QUALITY;
    INSDC:quality:phred out_raw_qual
        = < INSDC:quality:phred > NCBI:align:project_from_sequence < '( INSDC:quality:phred ) QUALITY'> ( .SEQ_SPOT_ID, .SEQ_READ_ID );
    // falls back to a constant quality of 30 sized like out_4na_bin
    INSDC:quality:phred out_qual_phred
        = NCBI:align:raw_restore_qual ( out_raw_qual, .REF_ORIENTATION )
        | < INSDC:quality:phred > echo < 30 > ( out_4na_bin );
    readonly column INSDC:quality:text:phred_33 SAM_QUALITY = QUALITY ;

    // project read group and name
    ascii out_spot_group
        = < ascii > simple_sub_select < 'SEQUENCE','SPOT_GROUP'> (.SEQ_SPOT_ID);
    INSDC:SRA:spotid_t tmp_seq_spot_id = cast ( .SEQ_SPOT_ID ) ;
    // explicit element type, as for every other zip_encoding column here
    physical < ascii > zip_encoding .SEQ_NAME = SEQ_NAME;
    extern column ascii SEQ_NAME
        = .SEQ_NAME
        | < ascii > simple_sub_select < 'SEQUENCE','NAME'> (.SEQ_SPOT_ID)
        | sprintf < "%u" > ( tmp_seq_spot_id );

    // compute sam flags
    /* blows up parser: starts at schema-tbl.c:2138
    readonly column U32 SAM_FLAGS = NCBI:align:get_sam_flags(MATE_ALIGN_ID, .SEQ_READ_ID, out_template_len, REF_ORIENTATION, out_mate_ref_orientation, is_secondary);
    */
    INSDC:coord:len projected_read_len
        = < INSDC:coord:len > simple_sub_select < 'SEQUENCE', 'READ_LEN' > ( .SEQ_SPOT_ID );
    readonly column U32 SAM_FLAGS
        = NCBI:align:get_sam_flags #1 (projected_read_len, .SEQ_READ_ID, out_template_len, REF_ORIENTATION, out_mate_ref_orientation, is_secondary, out_rd_filter)
        | NCBI:align:get_sam_flags #2 (out_mate_align_id, .SEQ_READ_ID, out_template_len, REF_ORIENTATION, out_mate_ref_orientation, is_secondary, out_rd_filter);

    ascii out_name_fmt = < ascii > echo < '$R' > ();

    INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len trim_len = align_spot_len;

    ascii out_label = .LABEL | < ascii > echo < "ploidy1" > ();
    INSDC:coord:zero out_label_start = .LABEL_START | < INSDC:coord:zero > echo < 0 > ();
    // 7 is the length of the default label "ploidy1"
    INSDC:coord:len out_label_len = .LABEL_LEN | < INSDC:coord:len > echo < 7 > ();

    physical < INSDC:SRA:read_filter > zip_encoding .RD_FILTER = READ_FILTER;
    INSDC:SRA:read_filter out_rd_filter
        = .RD_FILTER
        | < INSDC:SRA:read_filter > NCBI:align:project_from_sequence < 'READ_FILTER' > ( .SEQ_SPOT_ID, .SEQ_READ_ID )
        | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_len );

    INSDC:SRA:platform_id out_platform
        = .PLATFORM
        | < INSDC:SRA:platform_id > simple_sub_select < 'SEQUENCE','PLATFORM'> (.SEQ_SPOT_ID)
        | < INSDC:SRA:platform_id > echo < SRA_PLATFORM_UNDEFINED > ();

    // project_from_sequence is declared < type T >; name the type explicitly
    U8 out_alignment_count
        = < U8 > NCBI:align:project_from_sequence < 'ALIGNMENT_COUNT' > ( .SEQ_SPOT_ID, .SEQ_READ_ID );

    /* out_read_type
     *  set to SRA_READ_TYPE_FORWARD + SRA_READ_TYPE_BIOLOGICAL
     *  which has a constant value of 3
     */
    INSDC:SRA:xread_type out_read_type
        = < INSDC:SRA:xread_type > echo < 3 > ( out_read_len );

    // stats inputs
    bool in_stats_bin = HAS_REF_OFFSET;
    INSDC:coord:len _alt_in_read_len
        = READ_LEN
        | ( INSDC:coord:len ) row_len #1 ( HAS_REF_OFFSET );
    INSDC:SRA:xread_type _alt_in_read_type
        = READ_TYPE
        | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (_alt_in_read_len);

    readonly column ascii MISMATCH_READ
        = NCBI:align:get_mismatch_read ( out_has_mismatch, out_mismatch_dna_text );

    /* Alignment block */

    // MAPQ - single value quality of the mapping; the scale is submitter specific
    extern column < I32 > izip_encoding MAPQ;

    extern column INSDC:coord:zero MATE_REF_POS = out_mate_ref_pos;
    extern column INSDC:coord:len MATE_REF_LEN = out_mate_ref_len;
    extern column I64 MATE_REF_ID = out_mate_ref_id;
    extern column I32 TEMPLATE_LEN = out_template_len;
    extern column bool MATE_REF_ORIENTATION = out_mate_ref_orientation;
    readonly column ascii MATE_REF_NAME = NCBI:align:ref_name ( out_mate_ref_id );
    readonly column ascii MATE_REF_SEQ_ID = NCBI:align:ref_seq_id( out_mate_ref_id );
    readonly column U8 ALIGNMENT_COUNT = out_alignment_count;

    /********************************
     * Columns representing CIGARs
     ********************************/

    // one value per base i.e. length is same as sum of READ_LEN
    // partitioned by READ_START and READ_LEN into alternative alignments
    // flags the shifts in reference position preceding the base
    // if sequence of a partitioned read starts with a ref_offset and one or more mismatches
    // then it represents a left soft clip
    // any run of mismatches at the end represents a right soft clip
    readonly column ascii HAS_REF_OFFSET
        = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_ref_offset );
    extern column bool_encoding HAS_REF_OFFSET;
    bool out_has_ref_offset = .HAS_REF_OFFSET;

    // has number of elements equal to number of true elements in HAS_REF_OFFSET
    extern column < I32 > izip_encoding REF_OFFSET;
    I32 out_ref_offset = .REF_OFFSET;

    // the type of offset recorded in REF_OFFSET
    extern column < NCBI:align:ro_type > izip_encoding REF_OFFSET_TYPE;
    NCBI:align:ro_type out_ro_type = .REF_OFFSET_TYPE;

    // DISPLAY Columns
    readonly column I64 ALIGN_ID = row_id ();

    // get projection of the reference
    readonly column INSDC:dna:text REF_READ
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( REF_READ );
    readonly column INSDC:4na:bin REF_READ
        = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len, .REF_PLOIDY)
        | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len );
    // same projection, but sized by the computed ( not the stored ) ref length
    INSDC:4na:bin ref_read_internal
        = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal, .REF_PLOIDY)
        | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal);

    // text forms of reads
    INSDC:dna:text out_dna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin );
    readonly column INSDC:dna:text RAW_READ
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_raw_read );
    readonly column INSDC:4na:bin RAW_READ = out_raw_read;

    // CIGARs
    // each column tries the richest argument list first, then shorter ones
    // ctype 1 => long form ( '=' / 'X' ), ctype 0 => short form ( 'M' )
    readonly column ascii CIGAR_LONG
        = < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;
    readonly column INSDC:coord:len CIGAR_LONG_LEN
        = < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;
    readonly column ascii CIGAR_SHORT
        = < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;
    readonly column INSDC:coord:len CIGAR_SHORT_LEN
        = < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;

    readonly column ascii RNA_ORIENTATION
        = NCBI:align:rna_orientation ( out_ro_type )
        ;

    readonly column U32 EDIT_DISTANCE
        = NCBI:align:edit_distance #3 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len)
        | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len, out_read_len)
        | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len)
        | NCBI:align:edit_distance #1 (out_has_mismatch, out_has_ref_offset, out_ref_offset);

    readonly column ascii HAS_MISMATCH
        = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_mismatch );

    // needed for backward compatibility
    readonly column ascii SEQ_SPOT_GROUP = out_spot_group;

    /* These columns are purely informational. */
    bool out_ref_mismatch
        = NCBI:align:get_ref_mismatch ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len );
    readonly column ascii REF_MISMATCH
        = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_mismatch );
    readonly column bool REF_MISMATCH = out_ref_mismatch;

    bool out_ref_insert
        = NCBI:align:get_ref_insert ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len );
    readonly column ascii REF_INSERT
        = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_insert );
    readonly column bool REF_INSERT = out_ref_insert;

    bool out_ref_delete
        = NCBI:align:get_ref_delete ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len );
    readonly column ascii REF_DELETE
        = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_delete );
    readonly column bool REF_DELETE = out_ref_delete;
};

/* align_full
 *  aligns externally stored sequence against reference
 *  alignment transcript is calculated
 *
 * History:
 *  1.1 - respond to changes in base table
 */
table NCBI:align:tbl:align_full #1.1 = NCBI:align:tbl:align_cmn #2.1
{
    // explicit result type, as for every other echo in this file
    bool out_is_secondary = < bool > echo < true > ();

    // restore reads to its raw form (orientation is restored)
    INSDC:4na:bin out_raw_read
        = < INSDC:4na:bin > simple_sub_select < 'PRIMARY_ALIGNMENT', '( INSDC:4na:bin ) RAW_READ' > (.PRIMARY_ALIGNMENT_ID)
        | < INSDC:4na:bin > NCBI:align:project_from_sequence < '( INSDC:4na:bin ) READ'> ( .SEQ_SPOT_ID, .SEQ_READ_ID );

    INSDC:4na:bin out_4na_bin
        = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset, .READ_LEN )
        | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset )
        | NCBI:align:raw_restore_read ( out_raw_read, .REF_ORIENTATION );

    // flags mismatches with the reference
    // produced by actual comparison of REF_READ and READ
    // TMP_HAS_MISMATCH is a hack to speed up retrieval during coverage recalculation
    column bool_encoding TMP_HAS_MISMATCH;
    bool out_has_mismatch
        = .TMP_HAS_MISMATCH
        | NCBI:align:generate_has_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset );
    readonly column bool HAS_MISMATCH = out_has_mismatch;

    INSDC:4na:bin out_mismatch_4na_bin
        = NCBI:align:generate_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset );
    INSDC:4na:bin tmp_out_mismatch_4na_bin
        = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( .TMP_MISMATCH );

    // temporary column for reference coverage calculation
    column < INSDC:dna:text> zip_encoding TMP_MISMATCH;
    INSDC:dna:text out_mismatch_dna_text
        = .TMP_MISMATCH
        | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin );
    readonly column INSDC:dna:text MISMATCH = out_mismatch_dna_text;
    readonly column INSDC:4na:bin MISMATCH = out_mismatch_4na_bin;

    // mate information: stored value first, else looked up via MATE_ALIGN_ID
    physical column < INSDC:coord:zero > izip_encoding .MATE_REF_POS = MATE_REF_POS;
    INSDC:coord:zero out_mate_ref_pos
        = .MATE_REF_POS
        | < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID);

    physical column < I64 > izip_encoding .MATE_REF_ID = MATE_REF_ID;
    I64 out_mate_ref_id
        = .MATE_REF_ID
        | < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID);

    INSDC:coord:len out_mate_ref_len
        = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID);

    physical column < I32 > izip_encoding .TEMPLATE_LEN = TEMPLATE_LEN;
    I32 out_template_len
        = .TEMPLATE_LEN
        | NCBI:align:template_len(REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID);

    physical column < bool > izip_encoding .MATE_REF_ORIENTATION = MATE_REF_ORIENTATION;
    bool out_mate_ref_orientation
        = .MATE_REF_ORIENTATION
        | < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID);

    I64 out_mate_align_id = .MATE_ALIGN_ID;
    // explicit element type, matching the I64 MATE_ALIGN_ID column
    physical column < I64 > izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID;
    extern column I64 MATE_ALIGN_ID = out_mate_align_id;

    physical column < I64 > izip_encoding .PRIMARY_ALIGNMENT_ID = PRIMARY_ALIGNMENT_ID;
    I32 read_idx = cast (.SEQ_READ_ID);
    // explicit result type, as for every other simple_sub_select in this file
    extern column I64 PRIMARY_ALIGNMENT_ID
        = .PRIMARY_ALIGNMENT_ID
        | < I64 > simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID' > (.SEQ_SPOT_ID,.SEQ_READ_ID);
};

/* compressed_by_reference
 *  aligns internally represented sequence against reference
 *  alignment transcript is stored
 *  original sequence is reconstructed
 *
 * History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:compressed_by_reference #1.2 = NCBI:align:tbl:align_cmn #2.1
{
    // explicit result type, as for every other echo in this file
    bool out_is_secondary = < bool > echo < false > ();

    // one value per base i.e. length is same as sum of READ_LEN
    // partitioned by READ_START and READ_LEN into alternative alignments
    // flags mismatches with the reference
    extern default column bool_encoding HAS_MISMATCH;
    bool out_has_mismatch = .HAS_MISMATCH;

    // has number of elements equal to number of true elements in HAS_MISMATCH
    extern column INSDC:dna:text MISMATCH
    {
        read = out_mismatch_dna_text;
        validate = < INSDC:dna:text > compare ( in_mismatch_dna_text, out_mismatch_dna_text );
    }
    // normalize written text: lower case to upper case, '.' to 'N'
    INSDC:dna:text in_mismatch_dna_text
        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( MISMATCH );
    INSDC:4na:bin in_mismatch_4na_bin
        = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_mismatch_dna_text );

    extern column < ascii > zip_encoding ALIGN_GROUP;

    physical column < INSDC:4na:bin > zip_encoding .MISMATCH = in_mismatch_4na_bin;
    INSDC:4na:bin out_mismatch_4na_bin = .MISMATCH;
    INSDC:dna:text out_mismatch_dna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin );

    I64 out_mate_align_id
        = .MATE_ALIGN_ID
        | NCBI:align:get_mate_align_id (.SEQ_SPOT_ID);
    // explicit element type, matching the I64 MATE_ALIGN_ID column
    physical column < I64 > izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID;
    extern column I64 MATE_ALIGN_ID = out_mate_align_id;

    // restore reads from alignment columns and the reference
    // optional .READ_LEN size defines PLOIDY
    INSDC:4na:bin out_4na_bin
        = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset, .READ_LEN )
        | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset );

    // restore reads to its raw form (orientation is restored)
    INSDC:4na:bin out_raw_read
        = NCBI:align:raw_restore_read (out_4na_bin,.REF_ORIENTATION);

    I64 primary_align_pair
        = < I64 > simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID'> (.SEQ_SPOT_ID);

    // mate information is always looked up through MATE_ALIGN_ID
    I64 out_mate_ref_id
        = < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID);
    bool out_mate_ref_orientation
        = < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID);
    INSDC:coord:zero out_mate_ref_pos
        = < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID);
    INSDC:coord:len out_mate_ref_len
        = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID);

    readonly column U32 MATE_EDIT_DISTANCE
        = < U32 > simple_sub_select < '','EDIT_DISTANCE'> (MATE_ALIGN_ID);
    readonly column ascii MATE_CIGAR_LONG
        = < ascii > simple_sub_select < '','CIGAR_LONG'> (MATE_ALIGN_ID);
    readonly column ascii MATE_CIGAR_SHORT
        = < ascii > simple_sub_select < '','CIGAR_SHORT'> (MATE_ALIGN_ID);
    readonly column INSDC:coord:len MATE_CIGAR_LONG_LEN
        = < INSDC:coord:len > simple_sub_select < '','CIGAR_LONG_LEN'> (MATE_ALIGN_ID);
    readonly column INSDC:coord:len MATE_CIGAR_SHORT_LEN
        = < INSDC:coord:len > simple_sub_select < '','CIGAR_SHORT_LEN'> (MATE_ALIGN_ID);

    I32 out_template_len
        = NCBI:align:template_len (REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID);
};

/* align_sorted
 *  deflated alignment data sorted against reference
 *
 *
* History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:align_sorted #1.2
    = NCBI:align:tbl:compressed_by_reference #1.2
    , NCBI:align:tbl:global_ref_block #1.0.0
{
    // 128K
    column default limit = 131072;
};


/* align_unsorted
 *  deflated alignment unsorted data
 *
 * History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:align_unsorted #1.2
    = NCBI:align:tbl:compressed_by_reference #1.2
    , NCBI:align:tbl:local_ref_block #1.0.0
{
    // 128K
    column default limit = 131072;
};


/* align_mate_sorted
 *
 * History:
 *  1.1 - respond to changes in base table
 */
table NCBI:align:tbl:align_mate_sorted #1.1
    = NCBI:align:tbl:align_full #1.1
    , NCBI:align:tbl:global_ref_block #1.0.0
{
    // 128K
    column default limit = 131072;
};


/* align_mate_unsorted
 *
 * History:
 *  1.1 - respond to changes in base table
 */
table NCBI:align:tbl:align_mate_unsorted #1.1
    = NCBI:align:tbl:align_full #1.1
    , NCBI:align:tbl:local_ref_block #1.0.0
{
    // 128K
    column default limit = 131072;
};


/* align_allele
 *  alleles coverage extension
 *
 * History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:align_allele #1.2 = NCBI:align:tbl:align_unsorted #1.2
{
    extern column < I64 > izip_encoding EVIDENCE_ALIGNMENT_IDS;
    /* INSDC:quality:phred out_qual_phred = < INSDC:quality:phred > echo < 30 > ( out_4na_bin ); */
};


/*--------------------------------------------------------------------------
 * seq
 *  alignment sequence table
 */

/* alignment_id_encoding
 *  encode: outlier_encode < 0 > followed by izip
 *  decode: iunzip followed by outlier_decode < 0 >
 */
physical I64 NCBI:align:sorted:alignment_id_encoding #1.0
{
    decode
    {
        I64 outliers_removed = iunzip ( @ );
        return < I64 > outlier_decode < 0 > ( outliers_removed );
    }

    encode
    {
        I64 outliers_removed = < I64 > outlier_encode < 0 > ( @ );
        return izip ( outliers_removed );
    }
}

table NCBI:align:tbl:seq #1.1
    = NCBI:tbl:base_space #2.0.3
    , NCBI:tbl:phred_quality #2.0.4
    , NCBI:align:tbl:cmp_base_space #1
    , NCBI:SRA:tbl:spotdesc #1.0.2
    , NCBI:SRA:tbl:stats #1.2.0
{
    // 128K
    column default limit = 131072;

    // gets primary record in alignment table (size of column is NREADS)
    // if sorted - should use special encoding
    // element type is I64, matching the PRIMARY_ALIGNMENT_ID column
    // declared by NCBI:align:tbl:cs_seq
    extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_ID;

    INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len trim_len = _spot_len;

    // size is NREADS
    extern column < U8 > zip_encoding ALIGNMENT_COUNT;

    // auto-generate name from row-id
    ascii out_name_fmt = < ascii > echo < '$R' > ();

    // temporary column
    extern column < U64 > izip_encoding TMP_KEY_ID;

    // restored READ
    INSDC:4na:bin out_dcmp_4na_bin
        = NCBI:align:seq_restore_read (out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE);

    extern column < U64 > izip_encoding TI;

    // element type is ascii: the column is read back as ascii below
    extern column < ascii > zip_encoding CMP_LINKAGE_GROUP;

    // restored LINKAGE_GROUP
    readonly column ascii LINKAGE_GROUP
        = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID)
        | .CMP_LINKAGE_GROUP;
};

table NCBI:align:tbl:cs_seq #1.2
{
    /* writable columns */
    extern column INSDC:color:text CMP_CSREAD = out_cmp_color_text ;
    extern column < INSDC:dna:text > zip_encoding CS_KEY;
    extern default column < INSDC:quality:phred > zip_encoding QUALITY;
    extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_ID;
    extern column < U8 > zip_encoding ALIGNMENT_COUNT;
    extern column < INSDC:SRA:platform_id > zip_encoding PLATFORM;
    extern column < ascii > zip_encoding LABEL;
    extern column < INSDC:coord:zero > izip_encoding LABEL_START;
    extern column < INSDC:coord:len > izip_encoding LABEL_LEN;
    extern column < INSDC:SRA:xread_type > zip_encoding READ_TYPE;
    extern column < INSDC:coord:zero > izip_encoding READ_START;
    extern column < INSDC:coord:len > izip_encoding READ_LEN;
    extern column < INSDC:SRA:read_filter > zip_encoding READ_FILTER;
    extern column < U64 > izip_encoding TMP_KEY_ID;
    extern column < ascii > zip_encoding SPOT_GROUP;
    extern column < U64 > izip_encoding TI;

    /* writing rules */
    // the incoming color text is split into a packed 2cs column
    // (code 4 mapped to 0) and an "alt" column that keeps only code 4
    INSDC:x2cs:bin in_cmp_x2cs_bin
        = < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( CMP_CSREAD ) ;
    INSDC:2cs:bin in_cmp_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin ) ;
    INSDC:x2cs:bin in_cmp_alt_x2cs_bin
        = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin ) ;

    physical column INSDC:2cs:packed .CMP_CSREAD
        = ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin ) ;
    physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD
        = < INSDC:x2cs:bin > trim < ALIGN_LEFT, 0 > ( in_cmp_alt_x2cs_bin ) ;

    /* reading rules */
    INSDC:2cs:packed phys_cmp_2cs_packed = .CMP_CSREAD ;
    INSDC:x2cs:bin phys_cmp_alt_x2cs_bin = .CMP_ALTCSREAD ;
    INSDC:2cs:packed phys_2cs_packed = .CSREAD ;
    INSDC:x2cs:bin phys_alt_x2cs_bin = .ALTCSREAD ;

    INSDC:2cs:bin out_cmp_2cs_bin
        = ( INSDC:2cs:bin ) unpack ( phys_cmp_2cs_packed ) ;
    INSDC:2cs:bin out_2cs_bin
        = ( INSDC:2cs:bin ) unpack ( phys_2cs_packed ) ;

    // recombine the 2cs data with the "alt" column when it is available,
    // otherwise use the 2cs data alone
    INSDC:x2cs:bin out_cmp_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, phys_cmp_alt_x2cs_bin )
        | ( INSDC:x2cs:bin ) out_cmp_2cs_bin ;
    INSDC:x2cs:bin out_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, phys_alt_x2cs_bin )
        | ( INSDC:x2cs:bin ) out_2cs_bin ;

    INSDC:color:text out_cmp_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin ) ;
    INSDC:color:text out_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ) ;

    /* triggers from stats */
    INSDC:quality:phred in_qual_phred = QUALITY ;
    INSDC:coord:len in_read_len = READ_LEN ;
    INSDC:SRA:xread_type in_read_type = READ_TYPE ;
    ascii in_spot_group = SPOT_GROUP ;

    // the variant with the spot group is tried first
    trigger meta_stats
        = NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type, in_spot_group )
        | NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type ) ;
    trigger qual_stats
        = NCBI:SRA:phred_stats_trigger #1 ( in_qual_phred ) ;

    // element type is ascii: the column is read back as ascii below
    extern column < ascii > zip_encoding CMP_LINKAGE_GROUP;

    // restored LINKAGE_GROUP
    readonly column ascii LINKAGE_GROUP
        = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID)
        | .CMP_LINKAGE_GROUP;
};

table NCBI:align:view:cs_seq #1.1 = NCBI:align:tbl:cs_seq #1.2
{
    // various READ columns
    // each prefers the restored ("dcmp") data and falls back to the
    // directly converted data
    default readonly column INSDC:dna:text READ
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_dcmp_4na_bin )
        | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ) ;
    readonly column INSDC:4na:bin READ
        = out_dcmp_4na_bin
        | out_4na_bin;
    readonly column INSDC:4na:packed READ
        = pack ( out_dcmp_4na_bin )
        | pack ( out_4na_bin );
    readonly column INSDC:x2na:bin READ
        = out_dcmp_x2na_bin
        | out_x2na_bin;
    readonly column INSDC:2na:bin READ
        = out_dcmp_2na_bin
        | out_2na_bin;

    INSDC:2na:bin out_dcmp_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2na_bin ) ;
    INSDC:2na:bin out_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ) ;
    readonly column INSDC:2na:packed READ
        = pack ( out_dcmp_2na_bin )
        | pack ( out_2na_bin );

    // decompression in base space
    INSDC:coord:len cmp_read_len
        = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < true > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) ;
    INSDC:coord:zero cmp_read_start
        = NCBI:align:make_read_start #1 ( cmp_read_len ) ;
    INSDC:x2na:bin out_cmp_x2na_bin
        = NCBI:dna_from_color #1 ( out_cmp_x2cs_bin, cmp_read_start, cmp_read_len, .CS_KEY, color_matrix ) ;
    INSDC:x2na:bin out_x2na_bin
        = NCBI:dna_from_color #1 ( out_x2cs_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) ;
    INSDC:4na:bin out_cmp_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_cmp_x2na_bin ) ;
    INSDC:4na:bin out_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ) ;
    INSDC:4na:bin out_dcmp_4na_bin
        = NCBI:align:seq_restore_read ( out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE ) ;

    // various CSREAD columns
    default readonly column INSDC:color:text CSREAD
        = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_dcmp_x2cs_bin )
        | out_color_text;
    readonly column INSDC:x2cs:bin CSREAD
        = out_dcmp_x2cs_bin
        | out_x2cs_bin;
    readonly column INSDC:2cs:bin CSREAD
        = out_dcmp_2cs_bin
        | out_2cs_bin;

    INSDC:2cs:bin out_dcmp_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2cs_bin ) ;
    // both alternatives are packed so that each one yields
    // INSDC:2cs:packed, as the INSDC:2na:packed READ column does above
    readonly column INSDC:2cs:packed CSREAD
        = pack ( out_dcmp_2cs_bin )
        | pack ( out_2cs_bin );

    // decompression in color space
    INSDC:x2na:bin out_dcmp_x2na_bin
        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin ) ;
    INSDC:x2cs:bin out_dcmp_x2na_x2cs_bin
        = NCBI:color_from_dna #1 ( out_dcmp_x2na_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) ;
    INSDC:coord:len aligned_read_len
        = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < false > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) ;
    INSDC:x2cs:bin out_dcmp_x2cs_bin
        = < INSDC:x2cs:bin > NCBI:align:seq_construct_read #1 ( out_dcmp_x2na_x2cs_bin, .READ_LEN, out_cmp_x2cs_bin, cmp_read_len ) ;

    // CS_NATIVE - dynamic
    // row length of the stored CMP_CSREAD is clipped to 0..1 and mapped to
    // bool; the map input must be the cmp_csread_not_zero production
    // declared here
    U32 cmp_csread_row_len
        = row_len #1 ( phys_cmp_2cs_packed ) ;
    U32 cmp_csread_not_zero
        = < U32 > clip < 0, 1 > ( cmp_csread_row_len ) ;
    readonly column bool CS_NATIVE
        = < U32, bool > map < [ 0, 1 ], [ false, true ] > ( cmp_csread_not_zero ) ;

    // COLOR_MATRIX
    readonly column U8 COLOR_MATRIX = color_matrix ;
    U8 color_matrix = < U8 > echo < INSDC:color:default_matrix > () ;

    // various QUALITY types
    readonly column INSDC:quality:text:phred_33 QUALITY
        = out_qual_text_phred_33
        | ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( .QUALITY );
    readonly column INSDC:quality:text:phred_64 QUALITY
        = out_qual_text_phred_64
        | ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( .QUALITY );

    // SPOT_LEN
    INSDC:coord:len spot_len
        = ( INSDC:coord:len ) row_len ( out_dcmp_4na_bin )
        | ( INSDC:coord:len ) row_len ( out_4na_bin ) ;
    readonly column INSDC:coord:len SPOT_LEN = spot_len;

    // TRIM_START
    readonly column INSDC:coord:zero TRIM_START = < INSDC:coord:zero > echo < 0 > () ;
    readonly column INSDC:coord:one TRIM_START = < INSDC:coord:one > echo < 1 > () ;

    // TRIM_LEN
    readonly column INSDC:coord:len TRIM_LEN = spot_len;

    // MIN_SPOT_ID
    readonly column INSDC:SRA:spotid_t MIN_SPOT_ID
        = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MIN" > () ;

    // MAX_SPOT_ID
    readonly column INSDC:SRA:spotid_t MAX_SPOT_ID
        = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MAX" > () ;

    // SPOT_COUNT
    readonly column U64 SPOT_COUNT
        = < U64 > meta:value < "STATS/TABLE/SPOT_COUNT" > () ;

    // BASE_COUNT
    U64 base_count = < U64 > meta:value < "STATS/TABLE/BASE_COUNT" > () ;
    readonly column U64 BASE_COUNT = base_count;

    // BIO_BASE_COUNT
    readonly column U64 BIO_BASE_COUNT
        = < U64 > meta:value < "STATS/TABLE/BIO_BASE_COUNT" > () ;

    // CMP_BASE_COUNT
    readonly column U64 CMP_BASE_COUNT
        = < U64 > meta:value < "STATS/TABLE/CMP_BASE_COUNT" > ()
        | base_count ;

    // various PLATFORM
    // TBD

    // SPOT_ID
    I64 rowid_64 = row_id ();
    readonly column INSDC:SRA:spotid_t SPOT_ID = cast ( rowid_64 ) ;
    readonly column ascii NAME = sprintf < "%u" > ( SPOT_ID ) ;
};

/***********************************
 * Reference table - to store reference sequences
 * Sequences are divided in chunks. Two sequences never share a chunk.
 * SEQ_LEN - real size of a chunk should never exceed MAX_SEQ_LEN when it is set
 * READ - inherited from NCBI:tbl:base_space
 * CMP_READ,CMP_ALTREAD - are inherited from NCBI:align:tbl:cmp_base_space
 * SEQ_ID,SEQ_START,SEQ_LEN are inherited from NCBI:tbl:seqloc
 * .skey contains NAME of the chunk - it corresponds to actual name used in BAM (chr1,chr2, etc....)
*
 * SEQ_START,SEQ_LEN,MAX_SEQ_LEN,SEQID and rowlen(READ) operate the following way
 *  - SEQ_LEN < MAX_SEQ_LEN - should only happen on the last chunk of the sequence
 *  - .READ is absent - there should be a retrieval from external services by SEQ_ID,SEQ_START,SEQ_LEN
 *  - rowlen(.READ) = 0 && SEQ_START==0 (used as flag) - the sequence is SEQ_LEN repetition of 'N'
 *  - rowlen(.READ) = 0 && SEQ_START >= 1 - the sequence has to be fetched from external sources
 *  - 0 < rowlen(.READ) < SEQ_LEN -- the sequence has to be filled with 'N's
 ***********************************/
table NCBI:align:tbl:reference #2
    = NCBI:align:tbl:cmp_base_space #1
    , NCBI:tbl:base_space #2.0.3
    , NCBI:tbl:seqloc #1
    , NCBI:SRA:tbl:stats #1.2.0
{
    // constant quality value of 30, produced via echo over the restored read
    INSDC:quality:phred out_qual_phred
        = < INSDC:quality:phred > echo < 30 > ( out_dcmp_4na_bin );

    // MAX_SEQ_LEN - should be a constant == static column
    extern column < U32 > izip_encoding MAX_SEQ_LEN;

    // indicates if sequence has circular structure
    // copied from refSeq
    extern column bool_encoding CIRCULAR;

    // make CS_KEY writable
    // (lower-case input is mapped to upper case before storage)
    INSDC:dna:text in_cs_key
        = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn', 'ACGTN' > ( CS_KEY );
    physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key;

    U32 in_spot_len = SEQ_LEN;
    INSDC:coord:len _alt_in_read_len
        = READ_LEN
        | SEQ_LEN;
    INSDC:SRA:xread_type _alt_in_read_type
        = READ_TYPE
        | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > ();

    // extra columns needed for CS conversion
    INSDC:coord:zero out_read_start = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len out_read_len = .SEQ_LEN;

    // NAME is written through and read back from the 'i_name' text index
    extern column utf8 NAME = out_spot_name_utf8;
    physical utf8 .NAME = idx:text:insert #1.0 < 'i_name' > ( NAME );
    utf8 out_spot_name_utf8 = idx:text:project #1.0 < 'i_name' > (.NAME );
    ascii out_spot_name = cast ( out_spot_name_utf8 );

    INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len trim_len = base_space_spot_len;

    // fixed label: the text "reference", start 0, length 9
    ascii out_label = < ascii > echo < "reference" > ();
    INSDC:coord:zero out_label_start = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len out_label_len = < INSDC:coord:len > echo < 9 > ();

    // fixed read description: one read per row
    U32 out_nreads = < U32 > echo < 1 > ();
    INSDC:SRA:xread_type out_read_type = < INSDC:SRA:xread_type > echo < 3 > ();
    INSDC:SRA:read_filter out_rd_filter = < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ();

    // Columns of computed coverages by alignment
    // TBD: use percentiles instead of min/max?

    // maximum value clipped at 255 of the coverage density
    // for a chunk
    extern column < U8 > izip_encoding CGRAPH_HIGH;

    // minimum value clipped at 255 of the coverage density
    // for a chunk
    extern column < U8 > izip_encoding CGRAPH_LOW;

    // count of the number of mismatches in the chunk
    extern column < U32 > izip_encoding CGRAPH_MISMATCHES;

    // count of the number of inserts and deletes in the chunk
    extern column < U32 > izip_encoding CGRAPH_INDELS;

    // List of row ids from alignment tables
    extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS;
    extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS;
    extern column < I64 > izip_encoding EVIDENCE_INTERVAL_IDS;

    // both OVERLAP_REF_* columns are array of three elements, matching number of *_IDS columns above.
    // points back to an offset where the alignments to this chunk start
    extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS;

    // indicates the length of the longest tail of the alignment to this chunk which start in previous chunks
    // if value of an element in this col is zero corresponding value of OVERLAP_REF_POS is meaningless
    extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN;

    // Mechanism to search for NAME
    readonly column vdb:row_id_range NAME_RANGE
        = idx:text:lookup #1.0 < 'i_name', 'QUERY_SEQ_NAME' > ();

    // Fully instantiates READ
    INSDC:4na:bin out_dcmp_4na_bin
        = NCBI:align:ref_restore_read (out_cmp_4na_bin, .SEQ_ID, .SEQ_START, .SEQ_LEN);
}


// THE DATABASES
// each database lists its member tables as: table <type> <member name>;

database NCBI:align:db:alignment_sorted #1.3
{
    table NCBI:align:tbl:reference #2 REFERENCE;
    table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1 SEQUENCE;
    table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0 QUAL_STAT;
};

database NCBI:align:db:alignment_unsorted #1.3
{
    table NCBI:align:tbl:reference #2 REFERENCE;
    table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1 SEQUENCE;
    table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0 QUAL_STAT;
};

database NCBI:align:db:alignment_evidence #1.3
{
    table NCBI:align:tbl:reference #2 REFERENCE;
    table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL;
    table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1 SEQUENCE;
    table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0 QUAL_STAT;
};

database NCBI:align:db:alignment_evidence_sorted #1.2
{
    table NCBI:align:tbl:reference #2 REFERENCE;
    table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL;
    table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1 SEQUENCE;
    table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0 QUAL_STAT;
};

database NCBI:align:db:unaligned #1
{
    table NCBI:align:tbl:seq #1.1 SEQUENCE;
    table NCBI:SRA:ABI:tbl:v2 #1.0.4 CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0 QUAL_STAT;
};