/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * General read table which will be inherited by others
 */
version 1;

include 'vdb/vdb.vschema';
include 'insdc/insdc.vschema';
include 'csra2/stats.vschema';


/*--------------------------------------------------------------------------
 * tables
 */
/* NCBI:csra2:tbl:read
 *  read table holding base calls ( READ ) and quality ( QUALITY ), plus the
 *  optional CHUNK_SZ, RD_ID, RD_GROUP and RD_FILTER columns.
 *
 *  inherits from NCBI:csra2:tbl:read_stats #1. the in_stats_* productions
 *  defined below are labelled "feed the statistics"; presumably the parent
 *  table consumes them - confirm against csra2/stats.vschema.
 *
 *  layout of the body:
 *   - extern columns:   what a client reads or writes
 *   - input rules:      in_*  productions, derived from written columns
 *   - physical columns: what is stored
 *   - output rules:     out_* productions, derived from stored columns
 *
 *  a rule with several alternatives separated by '|' lists them in priority
 *  order; a later alternative is used when an earlier one cannot be resolved.
 */
table NCBI:csra2:tbl:read #1.0 = NCBI:csra2:tbl:read_stats #1
{
    /* CHUNK_SZ
     *  describes the maximum number of bases in any row
     *
     *  if present, allows a single sequence to be broken into multiple rows
     *  where this value gives the limit on the number of bases in any row.
     *
     *  the sequence will be split across some number of rows, depending upon
     *  the value of CHUNK_SZ. if length ( seq ) > CHUNK_SZ, then there will
     *  be multiple rows, where all but the last will have a length of CHUNK_SZ.
     *  the last ( or only ) row will have a length of length(seq)%CHUNK_SZ.
     *
     *  NOTE(review): when length(seq) is an exact multiple of CHUNK_SZ the
     *  formula above yields 0; presumably the last row then holds a full
     *  CHUNK_SZ bases - confirm against the loader.
     *
     *  declared without a rule: the column has no read or write expression
     *  in this table.
     */
    extern column INSDC:coord:len CHUNK_SZ;


    /* READ
     *  base calls
     *
     *  two representations are exposed under the same column name;
     *  the text form is marked as the default.
     */

    // textual representation
    //  reading yields out_dna_text; the validate rule compares the
    //  normalized input text ( in_dna_text ) against out_dna_text
    extern default column INSDC:dna:text READ
    {
        read = out_dna_text;
        validate = < INSDC:dna:text > compare ( in_dna_text, out_dna_text );
    }

    // 4na representation - unpacked
    extern column INSDC:4na:bin READ
        = out_4na_bin
        ;


    /* QUALITY
     *  phred-score quality values
     *
     *  the binary phred form is the default. the two text forms are
     *  produced from the same out_qual_phred values by applying
     *  sum < 33 > or sum < 64 > and casting to the matching text type.
     */
    extern default column INSDC:quality:phred QUALITY
        = out_qual_phred
        ;
    extern column INSDC:quality:text:phred_33 QUALITY
        = ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( out_qual_phred )
        ;
    extern column INSDC:quality:text:phred_64 QUALITY
        = ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( out_qual_phred )
        ;

    /* ---------------------------- optional columns ---------------------------- */

    /* RD_ID
     * RD_GROUP
     *  reports group and id of current row
     */
    extern column I64 RD_ID;
    extern column ascii RD_GROUP;

    /* RD_FILTER
     *  records filter value if used
     */
    extern column INSDC:SRA:read_filter RD_FILTER;


    /* ---------------------------- input rules ---------------------------- */

    // input text
    //  normalizes the written text: '.' becomes 'N' and each lower-case
    //  code in the first set becomes the upper-case code at the same
    //  position in the second set
    INSDC:dna:text in_dna_text
        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ )
        ;

    // input 4na bin
    //  first alternative: READ written as 4na bin, passed through
    //  range_validate < 0, 15 >
    //  second alternative: READ written as text, converted from in_dna_text
    //  using the CHARSET -> BINSET map
    INSDC:4na:bin in_4na_bin
        = < INSDC:4na:bin > range_validate < 0, 15 > ( READ )
        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text )
        ;

    // input 2na bin
    //  NOTE(review): INSDC:SEQ:rand_4na_2na is defined outside this file;
    //  going by its name it presumably reduces 4na to 2na by choosing a
    //  random base for ambiguity codes - confirm
    INSDC:2na:bin in_2na_bin
        = INSDC:SEQ:rand_4na_2na ( in_4na_bin )
        ;

    // input 4na alt-read ( ambiguities )
    //  assuming INSDC:4na:map:BINSET lists 0..15 in order ( TODO confirm ):
    //  the single-bit values 1, 2, 4 and 8 map to 0, the value 0 maps
    //  to 15, and every other value maps to itself. the result is non-zero
    //  only where the input is not a single-bit value.
    INSDC:4na:bin in_alt_4na_bin
        = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin )
        ;

    // feed the statistics
    INSDC:4na:bin in_stats_seq = in_4na_bin;
    
    // quality
    //  one production per text encoding that may be written to QUALITY
    INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY;
    INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY;

    // binary phred input
    //  taken directly from QUALITY, otherwise derived from the phred_33
    //  text form via diff < 33 >, otherwise from the phred_64 text form
    //  via diff < 64 >
    INSDC:quality:phred in_qual_phred
        = QUALITY
        | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 )
        | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 )
        ;

    // feed the statistics
    INSDC:quality:phred in_stats_qual_phred = in_qual_phred;

    // read group for the statistics
    //  in_stats_spot_group is not defined in this file ( presumably it
    //  comes from the parent table - confirm ); RD_GROUP is the second
    //  alternative
    ascii in_stats_read_group
        = in_stats_spot_group
        | RD_GROUP
        ;


    /* ---------------------------- physical columns ---------------------------- */

    // stored bases: in_2na_bin packed into INSDC:2na:packed
    physical column INSDC:2na:packed .READ
        = ( INSDC:2na:packed ) pack ( in_2na_bin )
        ;

    // stored ambiguities: in_alt_4na_bin after trim < 0, 0 >, zip-encoded
    //  NOTE(review): trim is defined outside this file; presumably it
    //  removes zero values from the ends of the row - confirm
    physical column < INSDC:4na:bin > zip_encoding .ALTREAD
        = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin )
        ;

    // stored quality: binary phred values using delta_average_zip_encoding
    physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY
        = in_qual_phred
        ;


    /* ---------------------------- output rules ---------------------------- */

    // output 2na packed
    INSDC:2na:packed out_2na_packed
        = .READ
        ;

    // output 2na bin
    INSDC:2na:bin out_2na_bin
        = ( INSDC:2na:bin ) unpack ( out_2na_packed )
        ;

    // output 2na->4na bin
    //  each value of INSDC:2na:map:BINSET maps, by position, to one of
    //  the single-bit 4na values 1, 2, 4, 8
    INSDC:4na:bin out_2na_4na_bin
        = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin )
        ;

    // output 4na bin
    //  first alternative combines the stored bases with .ALTREAD using
    //  bit_or < ALIGN_RIGHT >; second alternative is the stored bases
    //  alone ( presumably used when .ALTREAD is absent - confirm )
    INSDC:4na:bin out_4na_bin
        = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD )
        | out_2na_4na_bin
        ;
    
    // output text
    //  inverse of the text -> 4na input conversion: BINSET -> CHARSET
    INSDC:dna:text out_dna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin )
        ;
    
    // output quality
    //  stored .QUALITY if available, otherwise the constant 30 produced
    //  by echo with out_4na_bin as its argument ( presumably so the
    //  result has one value per base - confirm )
    INSDC:quality:phred out_qual_phred
        = .QUALITY
        | < INSDC:quality:phred > echo < 30 > ( out_4na_bin )
        ;
}


/*--------------------------------------------------------------------------
 * views
 */
/* NCBI:csra2:view:read
 *  read-side extension of NCBI:csra2:tbl:read #1.0.
 *
 *  declared with the 'table' keyword and adds only readonly columns:
 *  further READ representations, and long-named columns ( CHUNK_SIZE,
 *  READ_ID, READ_GROUP, READ_FILTER ) that fall back to a generated value
 *  when the corresponding physical column of the parent cannot be resolved.
 *
 *  in each rule the alternatives separated by '|' are listed in priority
 *  order.
 */
table NCBI:csra2:view:read #1.0 =
    NCBI:csra2:tbl:read #1.0
{
    /* CHUNK_SIZE
     *  describes the maximum number of bases in any row
     *
     *  if present, allows a single sequence to be broken into multiple rows
     *  where this value gives the limit on the number of bases in any row.
     *
     *  the sequence will be split across some number of rows, depending upon
     *  the value of CHUNK_SIZE. if length ( seq ) > CHUNK_SIZE, then there will
     *  be multiple rows, where all but the last will have a length of CHUNK_SIZE.
     *  the last ( or only ) row will have a length of length(seq)%CHUNK_SIZE.
     *
     *  NOTE(review): when length(seq) is an exact multiple of CHUNK_SIZE the
     *  formula above yields 0; presumably the last row then holds a full
     *  CHUNK_SIZE bases - confirm against the loader.
     *
     *  the value comes from the stored .CHUNK_SZ column; the fallback is
     *  the constant 0xFFFFFFFF.
     */
    readonly column INSDC:coord:len CHUNK_SIZE
        = .CHUNK_SZ
        | < INSDC:coord:len > echo < 0xFFFFFFFF > ()
        ;
    
    /* READ
     *  generate remaining 4 types
     *
     *  4na packed, x2na bin, 2na bin and 2na packed, all derived from the
     *  out_* productions of the parent table.
     */
    readonly column INSDC:4na:packed READ
        = ( INSDC:4na:packed ) pack ( out_4na_bin )
        ;
    // x2na: assuming INSDC:4na:map:BINSET lists 0..15 in order ( TODO
    // confirm ), the single-bit values 1, 2, 4, 8 map to 0, 1, 2, 3 and
    // every other value maps to 4
    readonly column INSDC:x2na:bin READ
        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin )
        ;
    readonly column INSDC:2na:bin READ
        = out_2na_bin
        ;
    readonly column INSDC:2na:packed READ
        = out_2na_packed
        ;

    /* READ_ID
     * READ_GROUP
     *  reports group and id of current row
     *
     *  READ_ID falls back to row_id () and READ_GROUP to the empty string
     *  when .RD_ID / .RD_GROUP cannot be resolved.
     */
    readonly column I64 READ_ID
        = .RD_ID
        | row_id ()
        ;
    readonly column ascii READ_GROUP
        = .RD_GROUP
        | < ascii > echo < '' > ()
        ;

    /* READ_FILTER
     *  records filter value if used
     *
     *  falls back to the constant SRA_READ_FILTER_PASS, which is not
     *  defined in this file ( presumably supplied by an included
     *  schema - confirm ).
     */
    readonly column INSDC:SRA:read_filter READ_FILTER
        = .RD_FILTER
        | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ()
        ;
}