// ===========================================================================
//
// PUBLIC DOMAIN NOTICE
// National Center for Biotechnology Information (NCBI)
//
// This software/database is a "United States Government Work" under the
// terms of the United States Copyright Act. It was written as part of
// the author's official duties as a United States Government employee and
// thus cannot be copyrighted. This software/database is freely available
// to the public for use. The National Library of Medicine and the U.S.
// Government do not place any restriction on its use or reproduction.
// We would, however, appreciate having the NCBI and the author cited in
// any work or product based on this material.
//
// Although all reasonable efforts have been taken to ensure the accuracy
// and reliability of the software and data, the NLM and the U.S.
// Government do not and cannot warrant the performance or results that
// may be obtained by using this software or data. The NLM and the U.S.
// Government disclaim all warranties, express or implied, including
// warranties of performance, merchantability or fitness for any particular
// purpose.
//
// ===========================================================================
//
// File Name: xtract.go
//
// Author: Jonathan Kans
//
// ==========================================================================
/*
test for presence of go compiler, cross-compile xtract executables, and pack into archive, by running:
if hash go 2>/dev/null
then
env GOOS=darwin GOARCH=amd64 go build -o xtract.Darwin -v xtract.go
env GOOS=linux GOARCH=amd64 go build -o xtract.Linux -v xtract.go
env GOOS=windows GOARCH=386 go build -o xtract.CYGWIN_NT -v xtract.go
tar -czf archive.tar.gz xtract.[A-Z]*
rm xtract.[A-Z]*
fi
*/
package main
import (
"bytes"
"container/heap"
"fmt"
"html"
"io"
"math"
"os"
"runtime"
"runtime/debug"
"runtime/pprof"
"strconv"
"strings"
"sync"
"time"
"unicode"
)
// VERSION AND HELP MESSAGE TEXT
// xtractVersion is the release number of this xtract build (the help text
// lists a -version option that prints the version number).
const xtractVersion = "5.80"
const xtractHelp = `
Overview
Xtract uses command-line arguments to convert XML data into a tab-delimited table.
-pattern places the data from individual records into separate rows.
-element extracts values from specified fields into separate columns.
-group, -block, and -subset limit element exploration to selected XML subregions.
Processing
-cleanup Fix non-ASCII spaces
-compress Compress runs of spaces
-input Read from file instead of stdin
Exploration Argument Hierarchy
-pattern Name of record within set
-group Use of different argument
-block names allows command-line
-subset control of nested looping
Exploration Constructs
Object DateCreated
Parent/Child Book/AuthorList
Heterogeneous "PubmedArticleSet/*"
Nested "*/Taxon"
Recursive "**/Gene-commentary"
Conditional Execution
-if Element [@attribute] required
-unless Skip if element matches
-and All tests must pass
-or Any passing test suffices
-else Execute if conditional test failed
-position Must be at given location in list
String Constraints
-equals String must match exactly
-contains Substring must be present
-starts-with Substring must be at beginning
-ends-with Substring must be at end
-is-not String must not match
Numeric Constraints
-gt Greater than
-ge Greater than or equal to
-lt Less than
-le Less than or equal to
-eq Equal to
-ne Not equal to
Format Customization
-ret Override line break between patterns
-tab Replace tab character between fields
-sep Separator between group members
-pfx Prefix to print before group
-sfx Suffix to print after group
-clr Clear queued tab separator
-pfc Preface combines -clr and -pfx
-rst Reset -sep, -pfx, and -sfx
-lbl Insert arbitrary text
Element Selection
-element Print all items that match tag name
-first Only print value of first item
-last Only print value of last item
-encode URL-encode <, >, &, ", and ' characters
-NAME Record value in named variable
-element Constructs
Tag Caption
Group Initials,LastName
Parent/Child MedlineCitation/PMID
Attribute DescriptorName@MajorTopicYN
Recursive "**/Gene-commentary_accession"
Object Count "#Author"
Item Length "%Title"
Element Depth "^PMID"
Parent Index "+"
XML Subtree "*"
Variable "&NAME"
Numeric Selection
-num Count
-len Length
-sum Sum
-min Minimum
-max Maximum
-inc Increment
-dec Decrement
-sub Difference
-avg Average
-dev Deviation
Sequence Coordinates
-0-based Zero-Based
-1-based One-Based
-ucsc Half-Open
Command Generator
-insd Generate INSDSeq extraction commands
-insd Argument Order
Descriptors INSDSeq_sequence INSDSeq_definition INSDSeq_division
Flags complete or partial [optional]
Feature(s) CDS,mRNA
Qualifiers INSDFeature_key "#INSDInterval" gene product
Miscellaneous
-head Print before first record
-tail Print after last record
Reformatting
-format [compact|indent|expand]
Modification
-filter Object [retain|remove|encode|decode|shrink] [content|cdata|comment|object|attributes]
Validation
-verify Report XML data integrity problems
Summary
-outline Display outline of XML structure
-synopsis Display count of unique XML paths
Documentation
-help Print this document
-examples Examples of EDirect and xtract usage
-version Print version number
Notes
String constraints use case-insensitive comparisons.
Numeric constraints and selection arguments use integer values.
-num and -len selections are synonyms for Object Count (#) and Item Length (%).
Examples
-pattern DocumentSummary -element Id -first Name Title
-pattern "PubmedArticleSet/*" -block Author -sep " " -element Initials,LastName
-pattern PubmedArticle -block MeshHeading -if "@MajorTopicYN" -equals Y -sep " / " -element DescriptorName,QualifierName
-pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop
-pattern Taxon -block "*/Taxon" -unless Rank -equals "no rank" -tab "\n" -element Rank,ScientificName
-pattern Entrezgene -block "**/Gene-commentary"
-block INSDReference -position 2
-if Author -and Title
-if "#Author" -lt 6 -and "%Title" -le 70
-if DateCreated/Year -gt 2005
-if ChrStop -lt ChrStart
-if CommonName -contains mouse
-if "&ABST" -starts-with "Transposable elements"
-if MapLocation -element MapLocation -else -lbl "\-"
-min ChrStart,ChrStop
-max ExonCount
-inc @aaPosition -element @residue
-1-based ChrStart
-insd CDS gene product protein_id translation
-insd complete mat_peptide "%peptide" product peptide
-filter ExpXml decode content
-filter LocationHist remove object
`
const xtractInternal = `
ReadBlocks -> SplitPattern => StreamTokens => ParseXML => ProcessQuery -> MergeResults
Performance Default Overrides
-proc Number of CPU processors used
-cons Ratio of parsers to processors
-serv Concurrent parser instances
-chan Communication channel depth
-heap Order restoration heap size
-farm Node allocation buffer length
-gogc Garbage collection tuning knob
Debugging
-debug Display run-time parameter summary
-empty Flag records with no output
-index Print record index numbers
-stats Show processing time for each record
-timer Report processing duration and rate
-trial Optimize -proc value, requires -input
Internal Component Performance
-chunk ReadBlocks
-split ReadBlocks -> SplitPattern
-drain ReadBlocks -> SplitPattern -> ConcurrencyChannel
-token ReadBlocks -> StreamTokens
Documentation
-keys Keyboard navigation shortcuts
-unix Common Unix commands
Sample File Download
ftp-cp ftp.ncbi.nlm.nih.gov /entrez/entrezdirect/samples carotene.xml.zip
unzip carotene.xml.zip
rm carotene.xml.zip
Performance Tuning Script
XtractTrials() {
echo -e ""
for tries in {1..5}
do
xtract -debug -input "$1" -proc "$2" -pattern PubmedArticle -element LastName
done
echo -e ""
}
for proc in {1..8}
do
XtractTrials "carotene.xml" "$proc" |
xtract -pattern Trials -lbl "$proc" -avg Rate -dev Rate
done
Processor Titration Results
1 10455 124
2 18674 684
3 23654 1371
4 30527 521
5 35132 2349
6 40289 907
7 45369 1370
8 47397 1141
Execution Profiling
xtract -profile -input carotene.xml -pattern PubmedArticle -element LastName
go tool pprof --pdf ./xtract ./cpu.pprof > ./callgraph.pdf
`
const xtractExamples = `
Author Frequency
esearch -db pubmed -query "rattlesnake phospholipase" |
efetch -format docsum |
xtract -pattern DocumentSummary -sep "\n" -element Name |
sort-uniq-count-rank
39 Marangoni S
31 Toyama MH
26 Soares AM
25 Bon C
...
Publications
efetch -db pubmed -id 6271474,5685784,4882854,6243420 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID "#Author" \
-block Author -position first -sep " " -element Initials,LastName \
-block Article -element ArticleTitle
6271474 5 MJ Casadaban Tn3: transposition and control.
5685784 2 RK Mortimer Suppressors and suppressible mutations in yeast.
4882854 2 ED Garber Proteins and enzymes as taxonomic tools.
6243420 1 NR Cozzarelli DNA gyrase and the supercoiling of DNA.
Formatted Authors
efetch -db pubmed -id 1413997,6301692,781293 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block DateCreated -sep "-" -element Year,Month,Day \
-block Author -sep " " -tab "" \
-element "&COM" Initials,LastName -COM "(, )"
1413997 1992-11-25 RK Mortimer, CR Contopoulou, JS King
6301692 1983-06-17 MA Krasnow, NR Cozzarelli
781293 1976-10-02 MJ Casadaban
Medical Subject Headings
efetch -db pubmed -id 6092233,2539356,1937004 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block MeshHeading \
-subset DescriptorName -pfc "\n" -sep "|" -element @MajorTopicYN,DescriptorName \
-subset QualifierName -pfc " / " -sep "|" -element @MajorTopicYN,QualifierName |
sed -e 's/N|//g' -e 's/Y|/*/g'
6092233
Base Sequence
DNA Restriction Enzymes
DNA, Fungal / genetics / *isolation & purification
*Genes, Fungal
...
Peptide Sequences
esearch -db protein -query "conotoxin AND mat_peptide [FKEY]" |
efetch -format gpc |
xtract -insd complete mat_peptide "%peptide" product peptide |
grep -i conotoxin | sort -t $'\t' -u -k 2,2n | head -n 8
ADB43131.1 15 conotoxin Cal 1b LCCKRHHGCHPCGRT
AIC77099.1 16 conotoxin Im1.2 GCCSHPACNVNNPHIC
AIC77105.1 17 conotoxin Lt1.4 GCCSHPACDVNNPDICG
AIC77103.1 18 conotoxin Lt1.2 PRCCSNPACNANHAEICG
AIC77083.1 20 conotoxin Bt14.6 KDCTYCMHSSCSMMYEKCRP
AIC77085.1 21 conotoxin Bt14.8 NECDNCMRSFCSMIYEKCRLK
AIC77093.1 22 conotoxin Bt14.16 GDCKPCMHPDCRFNPGRCRPRE
AIC77154.1 23 conotoxin Bt14.19 VREKDCPPHPVPGMHKCVCLKTC
Chromosome Locations
esearch -db gene -query "calmodulin [PFN] AND mammalia [ORGN]" |
efetch -format docsum |
xtract -pattern DocumentSummary -MAP "(-)" -MAP MapLocation \
-element Id Name "&MAP" ScientificName
801 CALM1 14q32.11 Homo sapiens
808 CALM3 19q13.2-q13.3 Homo sapiens
805 CALM2 2p21 Homo sapiens
24242 Calm1 6q31-q32 Rattus norvegicus
12313 Calm1 12 E Mus musculus
326597 CALM - Bos taurus
50663 Calm2 6q11-q12 Rattus norvegicus
24244 Calm3 1q22 Rattus norvegicus
12315 Calm3 7 9.15 cM Mus musculus
12314 Calm2 17 E4 Mus musculus
617095 CALM1 - Bos taurus
396838 CALM3 6 Sus scrofa
...
Gene Regions
esearch -db gene -query "DDT [GENE] AND mouse [ORGN]" |
efetch -format docsum |
xtract -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop |
xargs -n 3 sh -c 'efetch -db nuccore -format gb \
-id "$0" -chr_start "$1" -chr_stop "$2"'
LOCUS NC_000076 2142 bp DNA linear CON 09-FEB-2015
DEFINITION Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J.
ACCESSION NC_000076 REGION: complement(75771233..75773374) GPC_000000783
VERSION NC_000076.6 GI:372099100
...
FEATURES Location/Qualifiers
source 1..2142
/organism="Mus musculus"
/mol_type="genomic DNA"
/strain="C57BL/6J"
/db_xref="taxon:10090"
/chromosome="10"
gene 1..2142
/gene="Ddt"
mRNA join(1..159,462..637,1869..2142)
/gene="Ddt"
/product="D-dopachrome tautomerase"
/transcript_id="NM_010027.1"
CDS join(52..159,462..637,1869..1941)
/gene="Ddt"
/codon_start=1
/product="D-dopachrome decarboxylase"
/protein_id="NP_034157.1"
/translation="MPFVELETNLPASRIPAGLENRLCAATATILDKPEDRVSVTIRP
GMTLLMNKSTEPCAHLLVSSIGVVGTAEQNRTHSASFFKFLTEELSLDQDRIVIRFFP
...
Taxonomic Names
esearch -db taxonomy -query "txid10090 [SBTR] OR camel [COMN]" |
efetch -format docsum |
xtract -pattern DocumentSummary -if CommonName \
-element Id ScientificName CommonName
57486 Mus musculus molossinus Japanese wild mouse
39442 Mus musculus musculus eastern European house mouse
35531 Mus musculus bactrianus southwestern Asian house mouse
10092 Mus musculus domesticus western European house mouse
10091 Mus musculus castaneus southeastern Asian house mouse
10090 Mus musculus house mouse
9838 Camelus dromedarius Arabian camel
9837 Camelus bactrianus Bactrian camel
Structural Similarity
esearch -db structure -query "crotalus [ORGN] AND phospholipase A2" |
elink -related |
efilter -query "archaea [ORGN]" |
efetch -format docsum |
xtract -pattern DocumentSummary \
-if PdbClass -equals Hydrolase \
-element PdbAcc PdbDescr
3VV2 Crystal Structure Of Complex Form Between S324a-subtilisin And Mutant Tkpro
3VHQ Crystal Structure Of The Ca6 Site Mutant Of Pro-Sa-Subtilisin
2ZWP Crystal Structure Of Ca3 Site Mutant Of Pro-S324a
2ZWO Crystal Structure Of Ca2 Site Mutant Of Pro-S324a
...
Multiple Links
esearch -db pubmed -query "conotoxin AND dopamine [MAJR]" |
elink -target protein -cmd neighbor |
xtract -pattern LinkSet -if Link/Id -element IdList/Id Link/Id
23624852 17105332
14657161 27532980 27532978
12944511 31542395
11222635 144922602
Gene Comments
esearch -db gene -query "rbcL [GENE] AND maize [ORGN]" |
efetch -format xml |
xtract -pattern Entrezgene -block "**/Gene-commentary" \
-if Gene-commentary_type@value -equals genomic \
-tab "\n" -element Gene-commentary_accession |
sort | uniq
NC_001666
X86563
Z11973
Vitamin Biosynthesis
esearch -db pubmed -query "tomato lycopene cyclase" |
elink -related |
elink -target protein |
efilter -organism mammals |
efetch -format gpc |
xtract -pattern INSDSeq -if INSDSeq_definition -contains carotene \
-element INSDSeq_accession-version INSDSeq_definition
NP_573480.1 beta,beta-carotene 9',10'-oxygenase [Mus musculus]
NP_001156500.1 beta,beta-carotene 15,15'-dioxygenase isoform 2 [Mus musculus]
NP_067461.2 beta,beta-carotene 15,15'-dioxygenase isoform 1 [Mus musculus]
NP_001297121.1 beta-carotene oxygenase 2 [Mustela putorius furo]
AAS20392.1 carotene-9',10'-monooxygenase [Mustela putorius furo]
Indexed Fields
einfo -db pubmed |
xtract -pattern Field \
-if IsDate -equals Y -and IsHidden -equals N \
-pfx "[" -sep "]\t" -element Name,FullName |
sort -t $'\t' -k 2f
[CDAT] Date - Completion
[CRDT] Date - Create
[EDAT] Date - Entrez
[MHDA] Date - MeSH
[MDAT] Date - Modification
[PDAT] Date - Publication
Author Numbers
esearch -db pubmed -query "conotoxin" |
efetch -format xml |
xtract -pattern PubmedArticle -num Author |
sort-uniq-count -n |
reorder-columns 2 1 |
head -n 15 |
xy-plot auth.png
0 11
1 193
2 854
3 844
4 699
5 588
6 439
7 291
8 187
9 124
10 122
11 58
12 33
13 18
900 +
| ********
800 + * **
| * *
700 + * ***
| * **
600 + * *
| * ***
500 + * **
| * ***
400 + * **
| * *
300 + * ***
| * *
200 + * ******
| * *********
100 + ** *
| * **********
0 + * ******
+---------+---------+---------+---------+---------+---------+---------+
0 2 4 6 8 10 12 14
Record Counts
echo "diphtheria measles pertussis polio tuberculosis" |
xargs -n 1 sh -c 'esearch -db pubmed -query "$0 [MESH]" |
efilter -days 365 -datetype PDAT |
xtract -pattern ENTREZ_DIRECT -lbl "$0" -element Count'
diphtheria 18
measles 166
pertussis 98
polio 75
tuberculosis 1386
Gene Products
for sym in HBB DMD TTN ATP7B HFE BRCA2 CFTR PAH PRNP RAG1
do
esearch -db gene -query "$sym [GENE] AND human [ORGN]" |
efilter -query "alive [PROP]" | efetch -format docsum |
xtract -pattern GenomicInfoType \
-element ChrAccVer ChrStart ChrStop |
while read acc str stp
do
efetch -db nuccore -format gbc \
-id "$acc" -chr_start "$str" -chr_stop "$stp" |
xtract -insd CDS,mRNA INSDFeature_key "#INSDInterval" \
gene "%transcription" "%translation" \
product transcription translation |
grep -i $'\t'"$sym"$'\t'
done
done
NC_000011.10 mRNA 3 HBB 626 hemoglobin, beta ACATTTGCTT...
NC_000011.10 CDS 3 HBB 147 hemoglobin subunit beta MVHLTPEEKS...
NC_000023.11 mRNA 78 DMD 13805 dystrophin, transcript variant X2 AGGAAGATGA...
NC_000023.11 mRNA 77 DMD 13794 dystrophin, transcript variant X6 ACTTTCCCCC...
NC_000023.11 mRNA 77 DMD 13800 dystrophin, transcript variant X5 ACTTTCCCCC...
NC_000023.11 mRNA 77 DMD 13785 dystrophin, transcript variant X7 ACTTTCCCCC...
NC_000023.11 mRNA 74 DMD 13593 dystrophin, transcript variant X8 ACTTTCCCCC...
NC_000023.11 mRNA 75 DMD 13625 dystrophin, transcript variant X9 ACTTTCCCCC...
...
Genome Range
esearch -db gene -query "Homo sapiens [ORGN] AND Y [CHR]" |
efilter -status alive | efetch -format docsum |
xtract -pattern DocumentSummary -NAME Name -DESC Description \
-block GenomicInfoType -if ChrLoc -equals Y \
-min ChrStart,ChrStop -element "&NAME" "&DESC" |
sort -k 1,1n | cut -f 2- |
between-two-genes ASMT IL3RA
IL3RA interleukin 3 receptor subunit alpha
LOC101928032 uncharacterized LOC101928032
LOC101928055 uncharacterized LOC101928055
SLC25A6 solute carrier family 25 member 6
LOC105373102 uncharacterized LOC105373102
LINC00106 long intergenic non-protein coding RNA 106
ASMTL-AS1 ASMTL antisense RNA 1
ASMTL acetylserotonin O-methyltransferase-like
P2RY8 purinergic receptor P2Y8
AKAP17A A-kinase anchoring protein 17A
ASMT acetylserotonin O-methyltransferase
Amino Acid Substitutions
ApplySNPs() {
seq=""
last=""
while read rsid accn pos res
do
if [ "$accn" != "$last" ]
then
insd=$(efetch -db protein -id "$accn" -format gbc < /dev/null)
seq=$(echo $insd | xtract -pattern INSDSeq -element INSDSeq_sequence)
last=$accn
fi
pos=$((pos+1))
pfx=""
sfx=""
echo ">rs$rsid [$accn $res@$pos]"
if [ $pos -gt 1 ]
then
pfx=$(echo ${seq:0:$pos-1})
fi
if [ $pos -lt ${#seq} ]
then
sfx=$(echo ${seq:$pos})
fi
echo "$pfx$res$sfx" | fold -w 50
done
}
esearch -db gene -query "OPN1MW [GENE] AND human [ORGN]" |
elink -target snp | efetch -format xml |
xtract -pattern Rs -RSID Rs@rsId \
-block FxnSet -if @fxnClass -equals missense \
-sep "." -element "&RSID" @protAcc,@protVer @aaPosition \
-tab "\n" -element @residue |
sort -t $'\t' -k 2,2 -k 3,3n -k 4,4 | uniq |
ApplySNPs
>rs104894915 [NP_000504.1 K@94]
maqqwslqrlagrhpqdsyedstqssiftytnsnstrgpfegpnyhiapr
wvyhltsvwmifvviasvftnglvlaatmkfkklrhplnwilvKlavadl
aetviastisvvnqvygyfvlghpmcvlegytvslcgitglwslaiiswe
...
Amino Acid Composition
#!/bin/bash -norc
abbrev=( Ala Asx Cys Asp Glu Phe Gly His Ile \
Xle Lys Leu Met Asn Pyl Pro Gln Arg \
Ser Thr Sec Val Trp Xxx Tyr Glx )
AminoAcidComp() {
local count
while read num lttr
do
idx=$(printf %i "'$lttr'")
ofs=$((idx-97))
count[$ofs]="$num"
done <<< "$1"
for i in {0..25}
do
echo -e "${abbrev[$i]}\t${count[$i]-0}"
done |
sort
}
AminoAcidJoin() {
result=""
while read acc seq gene
do
comp="$(echo "$seq" | tr A-Z a-z | sed 's/[^a-z]//g' | fold -w 1 | sort-uniq-count)"
current=$(AminoAcidComp "$comp")
current=$(echo -e "GENE\t$gene\n$current")
if [ -n "$result" ]
then
result=$(join -t $'\t' <(echo "$result") <(echo "$current"))
else
result=$current
fi
done
echo "$result" |
grep -e "GENE" -e "[1-9]"
}
ids="NP_001172026,NP_000509,NP_004001,NP_001243779"
efetch -db protein -id "$ids" -format gpc |
xtract -insd INSDSeq_sequence CDS gene |
AminoAcidJoin
GENE INS HBB DMD TTN
Ala 10 15 210 2084
Arg 5 3 193 1640
Asn 3 6 153 1111
Asp 2 7 185 1720
Cys 6 2 35 513
Gln 7 3 301 942
Glu 8 8 379 3193
Gly 12 13 104 2066
His 2 9 84 478
Ile 2 0 165 2062
Leu 20 18 438 2117
Lys 2 11 282 2943
Met 2 2 79 398
Phe 3 8 77 908
Pro 6 7 130 2517
Ser 5 5 239 2463
Thr 3 7 194 2546
Trp 2 2 67 466
Tyr 4 3 61 999
Val 6 18 186 3184
Phrase Searching
entrez-phrase-search -db pubmed -field WORD \
selective serotonin reuptake inhibitor + monoamine oxidase inhibitor |
efetch -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block Keyword -pfc "\n " -element Keyword
24657329
Antidepressant
Organic cation transporter 2
Piperine
Uptake 2
24280122
5-HIAA
5-HT
5-HTP
5-hydroxyindoleacetic acid
5-hydroxytryptophan
...
`
const pubMedArtSample = `
6301692
1983
06
17
1983
06
17
2007
11
14
0092-8674
32
4
1983
Apr
Cell
Cell
Site-specific relaxation and recombination by the Tn3 resolvase: recognition of the DNA path between oriented res sites.
1313-24
A model in which one subunit of a dimeric resolvase is bound at one res site,
while the other searches along adjacent DNA until it encounters the second site,
would account for the ability of resolvase to distinguish intramolecular from intermolecular sites,
to sense the relative orientation of sites and to produce singly interlinked catenanes.
Because resolvase is a type 1 topoisomerase, we infer that it makes the required duplex bDNA breaks of recombination one strand at a time.
Krasnow
Mark A
MA
Cozzarelli
Nicholas R
NR
eng
GM-07281
GM
NIGMS NIH HHS
United States
Journal Article
Research Support, U.S. Gov't, P.H.S.
UNITED STATES
Cell
0413066
0092-8674
0
DNA, Bacterial
0
DNA, Superhelical
0
DNA, Viral
EC 2.7.7.-
Nucleotidyltransferases
EC 2.7.7.-
Transposases
EC 5.99.1.2
DNA Topoisomerases, Type I
IM
DNA Topoisomerases, Type I
metabolism
DNA, Bacterial
metabolism
DNA, Superhelical
metabolism
DNA, Viral
metabolism
Models, Genetic
Nucleic Acid Conformation
Nucleotidyltransferases
isolation & purification
metabolism
Plasmids
Recombination, Genetic
Repetitive Sequences, Nucleic Acid
Simian virus 40
Transposases
1983
4
1
1983
4
1
0
1
1983
4
1
0
0
ppublish
6301692
0092-8674(83)90312-4
`
const insdSeqSample = `
AF480315_1
67
AA
linear
INV
25-JUL-2016
31-DEC-2003
four-loop conotoxin preproprotein, partial [Conus purpurascens]
AAQ05867
AAQ05867.1
gb|AAQ05867.1|AF480315_1
gi|33320307
Conus purpurascens
Conus purpurascens
Eukaryota; Metazoa; Lophotrochozoa; Mollusca; Gastropoda; Caenogastropoda; Hypsogastropoda; Neogastropoda; Conoidea; Conidae; Conus
1
1..67
Duda,T.F. Jr.
Palumbi,S.R.
Convergent evolution of venoms and feeding ecologies among polyphyletic piscivorous Conus species
Unpublished
2
1..67
Duda,T.F. Jr.
Palumbi,S.R.
Direct Submission
Submitted (04-FEB-2002) Naos Marine Lab, Smithsonian Tropical Research Institute, Apartado 2072, Balboa, Ancon, Panama, Republic of Panama
Method: conceptual translation supplied by author.
accession AF480315.1
source
1
67
AAQ05867.1
organism
Conus purpurascens
isolate
purpurascens-2c
db_xref
taxon:41690
clone_lib
venom duct cDNA library
country
Panama
note
isolated from the Bay of Panama
Protein
<1..67
1
67
AAQ05867.1
product
four-loop conotoxin preproprotein
mat_peptide
41..67
41
67
AAQ05867.1
product
four-loop conotoxin
calculated_mol_wt
3008
peptide
PCKKTGRKCFPHQKDCCGRACIITICP
CDS
1..67
1
67
AAQ05867.1
coded_by
AF480315.1:<1..205
codon_start
2
vvivavlfltacqlitaddsrrtqkhralrsttkratsnrpckktgrkcfphqkdccgraciiticp
`
const geneDocSumSample = `
3581
IL9R
interleukin 9 receptor
0
0
X, Y
genomic
Xq28 and Yq12
CD129, IL-9R
interleukin-9 receptor|IL-9 receptor
IL9R
interleukin 9 receptor
Official
300007
X
NC_000023.11
155997580
156013016
14
Y
NC_000024.10
57184100
57199536
14
5425
The protein encoded by this gene is a cytokine receptor that specifically mediates the biological effects of interleukin 9 (IL9).
The functional IL9 receptor complex requires this protein as well as the interleukin 2 receptor, gamma (IL2RG), a common gamma subunit shared by the receptors of many different cytokines.
The ligand binding of this receptor leads to the activation of various JAK kinases and STAT proteins, which connect to different biologic responses.
This gene is located at the pseudoautosomal regions of X and Y chromosomes.
Genetic studies suggested an association of this gene with the development of asthma.
Multiple pseudogenes on chromosome 9, 10, 16, and 18 have been described.
Alternatively spliced transcript variants have been found for this gene.
X
155997580
Homo sapiens
human
9606
`
const keyboardShortcuts = `
Command History
Ctrl-n Next command
Ctrl-p Previous command
Move Cursor Forward
Ctrl-e To end of line
Ctrl-f By one character
Esc-f By one argument
Move Cursor Backward
Ctrl-a To beginning of line
Ctrl-b By one character
Esc-b By one argument
Delete
Del Previous character
Ctrl-d Next character
Ctrl-k To end of line
Ctrl-u Entire line
Ctrl-w Previous word
Esc-Del Previous argument
Esc-d Next argument
Autocomplete
Tab Completes directory or file names
Program Control
Ctrl-c Quit running program
^x^y Run last command replacing x with y
`
const unixCommands = `
Process by Contents
sort Sorts lines of text
-f Ignore case
-n Numeric comparison
-r Reverse result order
-k Field key (start,stop or first)
-u Unique lines with identical keys
-b Ignore leading blanks
-s Stable sort
-t Specify field separator
uniq Removes repeated lines
-c Count occurrences
-i Ignore case
-f Ignore first n fields
-s Ignore first n characters
-d Only output repeated lines
-u Only output non-repeated lines
grep Matches patterns using regular expressions
-i Ignore case
-v Invert search
-w Search expression as a word
-x Search expression as whole line
-e Specify individual pattern
-c Only count number of matches
-n Print line numbers
Regular Expressions
Characters
. Any single character (except newline)
\w Alphabetic [A-Za-z], numeric [0-9], or underscore (_)
\s Whitespace (space or tab)
\ Escapes special characters
[] Matches any enclosed characters
Positions
^ Beginning of line
$ End of line
\b Word boundary
Repeat Matches
? 0 or 1
* 0 or more
+ 1 or more
{n} Exactly n
Escape Sequences
\n Line break
\t Tab character
Modify Contents
sed Replaces text strings
-e Specify individual expression
tr Translates characters
-d Delete character
rev Reverses characters on line
Format Contents
column Aligns columns by content width
-s Specify field separator
-t Create table
expand Aligns columns to specified positions
-t Tab positions
fold Wraps lines at a specific width
-w Line width
Filter by Position
cut Removes parts of lines
-c Characters to keep
-f Fields to keep
-d Specify field separator
-s Suppress lines with no delimiters
head Prints first lines
-n Number of lines
tail Prints last lines
-n Number of lines
Miscellaneous
wc Counts words, lines, or characters
-c Characters
-l Lines
-w Words
xargs Constructs arguments
-n Number of words per batch
File Compression
tar Archive files
-c Create archive
-f Name of output file
-z Compress archive with gzip
gzip Compress file
-k Keep original file
unzip Decompress .zip archive
-p Pipe to stdout
gzcat Decompress .gz archive and pipe to stdout
Directory and File Navigation
cd Changes directory
/ Root
~ Home
. Current
.. Parent
- Previous
ls Lists file names
-1 One entry per line
-a Show files beginning with dot (.)
-l List in long format
-R Recursively explore subdirectories
-S Sort files by size
-t Sort by most recently modified
pwd Prints working directory path
`
// TYPED CONSTANTS
// LevelType identifies one of the exploration nesting levels that can be
// named on the command line (-unit, -subset, ... -pattern; see levelTypeIs).
type LevelType int

// Exploration levels. Numbering starts at 1, so the zero value of LevelType
// does not correspond to any named level.
const (
	UNIT LevelType = iota + 1
	SUBSET
	SECTION
	BLOCK
	BRANCH
	GROUP
	DIVISION
	PATTERN
)
// IndentType enumerates layout styles.
// NOTE(review): presumably these select how XML is indented/wrapped by the
// -format option (compact|indent|expand) — confirm against the formatter.
type IndentType int
// Layout styles, numbered from 0 (SINGULARITY is the zero value).
const (
SINGULARITY IndentType = iota
COMPACT
FLUSH
INDENT
SUBTREE
WRAPPED
)
// SideType selects a side (left or right). SplitInTwoAt uses it to decide
// which half receives the whole string when the separator is absent.
type SideType int

// Sides. Numbering starts at 1, so the zero value of SideType is neither
// LEFT nor RIGHT.
const (
	LEFT SideType = iota + 1
	RIGHT
)
// TagType enumerates tag/token categories.
// NOTE(review): the names suggest the kinds of items produced while scanning
// XML (start tag, self-closing tag, stop tag, attributes, content, CDATA,
// comment, ...) — confirm against the tokenizer, which is outside this view.
type TagType int
// Tag categories, numbered from 0 (NOTAG is the zero value).
const (
NOTAG TagType = iota
STARTTAG
SELFTAG
STOPTAG
ATTRIBTAG
CONTENTTAG
CDATATAG
COMMENTTAG
OBJECTTAG
ISCLOSED
)
// OpType identifies a command-line operation or step kind. Values ELEMENT
// through ELSE are the targets of the opTypeIs flag map; UNSET is the zero
// value.
type OpType int
// The numeric values follow declaration order (iota), so entries must not be
// reordered without checking every use.
const (
UNSET OpType = iota
// Element selection: -element, -first, -last, -encode.
ELEMENT
FIRST
LAST
ENCODE
// Format customization: -pfx, -sfx, -sep, -tab, -ret, -lbl, -clr, -pfc, -rst.
PFX
SFX
SEP
TAB
RET
LBL
CLR
PFC
RST
// Conditional execution: -position, -if, -unless, -match, -avoid, -and, -or.
POSITION
IF
UNLESS
MATCH
AVOID
AND
OR
// String constraints: -equals, -contains, -starts-with, -ends-with, -is-not.
EQUALS
CONTAINS
STARTSWITH
ENDSWITH
ISNOT
// Numeric constraints: -gt, -ge, -lt, -le, -eq, -ne.
GT
GE
LT
LE
EQ
NE
// Numeric selection: -num, -len, -sum, -min, -max, -inc, -dec, -sub, -avg, -dev.
NUM
LEN
SUM
MIN
MAX
INC
DEC
SUB
AVG
DEV
// Sequence coordinates: -0-based/-zero-based, -1-based/-one-based, -ucsc.
ZEROBASED
ONEBASED
UCSC
// -else.
ELSE
// The remaining values have no entry in opTypeIs.
// NOTE(review): presumably VARIABLE..INDEX classify the -element constructs
// listed in the help text ("&NAME", "*", "#", "%", "^", "+") and
// UNRECOGNIZED marks an unknown argument — confirm against the parser.
VARIABLE
VALUE
STAR
COUNT
LENGTH
DEPTH
INDEX
UNRECOGNIZED
)
// ArgumentType is the broad category of a command-line flag, as assigned by
// the argTypeIs map.
type ArgumentType int

// Flag categories. Numbering starts at 1, so the zero value of ArgumentType
// (what a failed argTypeIs lookup yields) matches none of them.
const (
	EXPLORATION ArgumentType = iota + 1
	CONDITIONAL
	EXTRACTION
	CUSTOMIZATION
)
// SpecialType enumerates special processing modes.
// NOTE(review): the names line up with the -format, -outline, -synopsis,
// -verify, -filter, -chunk, -split, -drain and -token options described in
// the help text — confirm against the argument parser.
type SpecialType int
// Special modes, numbered from 0 (NOPROCESS is the zero value).
const (
NOPROCESS SpecialType = iota
DOFORMAT
DOOUTLINE
DOSYNOPSIS
DOVERIFY
DOFILTER
DOCHUNK
DOSPLIT
DODRAIN
DOTOKEN
)
// SeqEndType says which kind of sequence coordinate a field holds: the start
// of an interval, the stop of an interval, or a single position.
type SeqEndType int

// Coordinate kinds. Numbering starts at 1, so the zero value of SeqEndType
// matches none of them.
const (
	ISSTART SeqEndType = iota + 1
	ISSTOP
	ISPOS
)
// SequenceType describes a sequence-coordinate field (see sequenceTypeIs).
type SequenceType struct {
// Based is 0 or 1 in every sequenceTypeIs entry.
// NOTE(review): presumably the native numbering base of the field — confirm.
Based int
// Which tells whether the field is an interval start, stop, or a position.
Which SeqEndType
}
// ARGUMENT MAPS
// argTypeIs maps a command-line flag to its ArgumentType category. A flag
// that is absent from the map yields the zero ArgumentType, which is none of
// the named categories.
var argTypeIs = map[string]ArgumentType{
// Exploration flags; each is listed in lowercase and capitalized form.
"-unit": EXPLORATION,
"-Unit": EXPLORATION,
"-subset": EXPLORATION,
"-Subset": EXPLORATION,
"-section": EXPLORATION,
"-Section": EXPLORATION,
"-block": EXPLORATION,
"-Block": EXPLORATION,
"-branch": EXPLORATION,
"-Branch": EXPLORATION,
"-group": EXPLORATION,
"-Group": EXPLORATION,
"-division": EXPLORATION,
"-Division": EXPLORATION,
"-pattern": EXPLORATION,
"-Pattern": EXPLORATION,
// Conditional tests and their string/numeric constraints.
"-position": CONDITIONAL,
"-if": CONDITIONAL,
"-unless": CONDITIONAL,
"-match": CONDITIONAL,
"-avoid": CONDITIONAL,
"-and": CONDITIONAL,
"-or": CONDITIONAL,
"-equals": CONDITIONAL,
"-contains": CONDITIONAL,
"-starts-with": CONDITIONAL,
"-ends-with": CONDITIONAL,
"-is-not": CONDITIONAL,
"-gt": CONDITIONAL,
"-ge": CONDITIONAL,
"-lt": CONDITIONAL,
"-le": CONDITIONAL,
"-eq": CONDITIONAL,
"-ne": CONDITIONAL,
// Extraction commands, including numeric selection and coordinate conversion.
"-element": EXTRACTION,
"-first": EXTRACTION,
"-last": EXTRACTION,
"-encode": EXTRACTION,
"-num": EXTRACTION,
"-len": EXTRACTION,
"-sum": EXTRACTION,
"-min": EXTRACTION,
"-max": EXTRACTION,
"-inc": EXTRACTION,
"-dec": EXTRACTION,
"-sub": EXTRACTION,
"-avg": EXTRACTION,
"-dev": EXTRACTION,
"-0-based": EXTRACTION,
"-zero-based": EXTRACTION,
"-1-based": EXTRACTION,
"-one-based": EXTRACTION,
"-ucsc": EXTRACTION,
"-else": EXTRACTION,
// Output format customization.
"-pfx": CUSTOMIZATION,
"-sfx": CUSTOMIZATION,
"-sep": CUSTOMIZATION,
"-tab": CUSTOMIZATION,
"-ret": CUSTOMIZATION,
"-lbl": CUSTOMIZATION,
"-clr": CUSTOMIZATION,
"-pfc": CUSTOMIZATION,
"-rst": CUSTOMIZATION,
}
// opTypeIs maps a command-line flag to the OpType it requests. Exploration
// flags (-pattern, -block, ...) are not listed here; they are in levelTypeIs.
// A flag that is absent from the map yields UNSET, the zero OpType.
var opTypeIs = map[string]OpType{
"-element": ELEMENT,
"-first": FIRST,
"-last": LAST,
"-encode": ENCODE,
"-pfx": PFX,
"-sfx": SFX,
"-sep": SEP,
"-tab": TAB,
"-ret": RET,
"-lbl": LBL,
"-clr": CLR,
"-pfc": PFC,
"-rst": RST,
"-position": POSITION,
"-if": IF,
"-unless": UNLESS,
"-match": MATCH,
"-avoid": AVOID,
"-and": AND,
"-or": OR,
"-equals": EQUALS,
"-contains": CONTAINS,
"-starts-with": STARTSWITH,
"-ends-with": ENDSWITH,
"-is-not": ISNOT,
"-gt": GT,
"-ge": GE,
"-lt": LT,
"-le": LE,
"-eq": EQ,
"-ne": NE,
"-num": NUM,
"-len": LEN,
"-sum": SUM,
"-min": MIN,
"-max": MAX,
"-inc": INC,
"-dec": DEC,
"-sub": SUB,
"-avg": AVG,
"-dev": DEV,
// Two spellings are accepted for each of the zero- and one-based flags.
"-0-based": ZEROBASED,
"-zero-based": ZEROBASED,
"-1-based": ONEBASED,
"-one-based": ONEBASED,
"-ucsc": UCSC,
"-else": ELSE,
}
// levelTypeIs maps an exploration flag to its LevelType. Each flag is listed
// in lowercase and capitalized form, both mapping to the same level. A flag
// that is absent from the map yields the zero LevelType, which is none of the
// named levels.
var levelTypeIs = map[string]LevelType{
"-unit": UNIT,
"-Unit": UNIT,
"-subset": SUBSET,
"-Subset": SUBSET,
"-section": SECTION,
"-Section": SECTION,
"-block": BLOCK,
"-Block": BLOCK,
"-branch": BRANCH,
"-Branch": BRANCH,
"-group": GROUP,
"-Group": GROUP,
"-division": DIVISION,
"-Division": DIVISION,
"-pattern": PATTERN,
"-Pattern": PATTERN,
}
// sequenceTypeIs describes known sequence-coordinate fields. Each key has the
// form "Name:Field", where Field is an element name or an @attribute name;
// each value gives the field's Based number (0 or 1) and whether it is an
// interval start, an interval stop, or a single position.
// NOTE(review): presumably Name is the enclosing record/pattern name and this
// table drives the -0-based, -1-based and -ucsc conversions — confirm against
// the code that reads it.
var sequenceTypeIs = map[string]SequenceType{
"INSDSeq:INSDInterval_from": {1, ISSTART},
"INSDSeq:INSDInterval_to": {1, ISSTOP},
"DocumentSummary:ChrStart": {0, ISSTART},
"DocumentSummary:ChrStop": {0, ISSTOP},
"DocumentSummary:Chr_start": {1, ISSTART},
"DocumentSummary:Chr_end": {1, ISSTOP},
"DocumentSummary:Chr_inner_start": {1, ISSTART},
"DocumentSummary:Chr_inner_end": {1, ISSTOP},
"DocumentSummary:Chr_outer_start": {1, ISSTART},
"DocumentSummary:Chr_outer_end": {1, ISSTOP},
"DocumentSummary:start": {1, ISSTART},
"DocumentSummary:stop": {1, ISSTOP},
"DocumentSummary:display_start": {1, ISSTART},
"DocumentSummary:display_stop": {1, ISSTOP},
"Entrezgene:Seq-interval_from": {0, ISSTART},
"Entrezgene:Seq-interval_to": {0, ISSTOP},
"GenomicInfoType:ChrStart": {0, ISSTART},
"GenomicInfoType:ChrStop": {0, ISSTOP},
"Rs:@aaPosition": {0, ISPOS},
"Rs:@asnFrom": {0, ISSTART},
"Rs:@asnTo": {0, ISSTOP},
"Rs:@end": {0, ISSTOP},
"Rs:@leftContigNeighborPos": {0, ISSTART},
"Rs:@physMapInt": {0, ISPOS},
"Rs:@protLoc": {0, ISPOS},
"Rs:@rightContigNeighborPos": {0, ISSTOP},
"Rs:@start": {0, ISSTART},
"Rs:@structLoc": {0, ISPOS},
}
// DATA OBJECTS
// Tables bundles four 256-entry boolean lookup tables with two integer
// settings.
// NOTE(review): the tables are presumably indexed by byte value to classify
// characters (blank, alternate blank, valid first character of a name, valid
// character within a name), and ChanDepth/FarmSize presumably hold the -chan
// and -farm settings from the internal help text — confirm where the struct
// is populated.
type Tables struct {
InBlank [256]bool
AltBlank [256]bool
InFirst [256]bool
InElement [256]bool
ChanDepth int
FarmSize int
}
// Node is a linked record with a name, parent name, contents and attributes,
// plus pointers to other Nodes.
// NOTE(review): presumably one parsed XML element, with Children pointing at
// its first child and Next at its following sibling; Attributes looks like
// the raw attribute text and Attribs its parsed form — confirm against the
// parser.
type Node struct {
Name string
Parent string
Contents string
Attributes string
Attribs []string
Children *Node
Next *Node
}
// Step is a single stage of an Operation (see Operation.Stages).
// NOTE(review): presumably Value is the original argument text and
// Parent/Match/Attrib are its parsed Parent/Child and @attribute parts, with
// Wild set for wildcard matches — confirm against the argument parser.
type Step struct {
Type OpType
Value string
Parent string
Match string
Attrib string
Wild bool
}
// Operation pairs an OpType with a string value and an ordered list of Steps.
// NOTE(review): presumably one parsed command-line command (for example
// -element with its arguments) — confirm against the argument parser.
type Operation struct {
Type OpType
Value string
Stages []*Step
}
// Block describes one exploration level of the command line. Blocks nest
// through Subtasks to mirror the nesting of exploration commands.
type Block struct {
	// Visit is the object name given after the exploration command;
	// Parent and Match are the two sides of Visit split at the slash
	Visit, Parent, Match string
	// Working holds arguments not yet assigned to a level,
	// Parsed holds the arguments that belong to this level
	Working, Parsed []string
	// Position is the argument that followed -position
	Position string
	// Conditions are the conditional tests, Commands the extraction
	// commands, and Failure the commands that followed -else
	Conditions, Commands, Failure []*Operation
	// Subtasks are the nested blocks at the next lower levels
	Subtasks []*Block
}
// UTILITIES
// IsNotJustWhitespace reports whether str contains at least one character
// other than space, tab, newline, carriage return, or form feed.
func IsNotJustWhitespace(str string) bool {
	for _, r := range str {
		switch r {
		case ' ', '\t', '\n', '\r', '\f':
			// blank character, keep scanning
		default:
			return true
		}
	}
	return false
}
// HasAmpOrNotASCII reports whether str contains an ampersand or any
// character outside the 7-bit ASCII range.
func HasAmpOrNotASCII(str string) bool {
	// every character above 127 is encoded with bytes that have the high bit
	// set, so a byte scan finds the same strings as a scan by character
	for i := 0; i < len(str); i++ {
		if c := str[i]; c == '&' || c >= 0x80 {
			return true
		}
	}
	return false
}
// IsAllCapsOrDigits reports whether every character of str is an upper case
// letter or a digit. An empty string yields true.
func IsAllCapsOrDigits(str string) bool {
	for _, r := range str {
		if unicode.IsUpper(r) || unicode.IsDigit(r) {
			continue
		}
		return false
	}
	return true
}
// CompressRunsOfSpaces replaces every run of one or more Unicode white space
// characters with a single space. Leading and trailing runs are reduced to
// one space each, not removed.
func CompressRunsOfSpaces(str string) string {
	var out bytes.Buffer
	inRun := false
	for _, r := range str {
		blank := unicode.IsSpace(r)
		if !blank {
			out.WriteRune(r)
		} else if !inRun {
			// first white space character of a run
			out.WriteRune(' ')
		}
		inRun = blank
	}
	return out.String()
}
// HasFlankingSpace reports whether the first or the last byte of str is a
// space, tab, newline, carriage return, or form feed.
func HasFlankingSpace(str string) bool {
	if len(str) == 0 {
		return false
	}
	isBlank := func(c byte) bool {
		switch c {
		case ' ', '\t', '\n', '\r', '\f':
			return true
		}
		return false
	}
	return isBlank(str[0]) || isBlank(str[len(str)-1])
}
// HasBadSpace reports whether str contains any Unicode white space character
// other than the ordinary space.
func HasBadSpace(str string) bool {
	for _, r := range str {
		if r == ' ' {
			continue
		}
		if unicode.IsSpace(r) {
			return true
		}
	}
	return false
}
// CleanupBadSpaces returns str with every Unicode white space character
// replaced by an ordinary space. Runs are not shortened.
func CleanupBadSpaces(str string) string {
	var out bytes.Buffer
	for _, r := range str {
		if unicode.IsSpace(r) {
			r = ' '
		}
		out.WriteRune(r)
	}
	return out.String()
}
// SplitInTwoAt divides str at the first occurrence of chr and returns the
// text before and after it. When chr does not occur, the whole string is
// returned in the first result if side is LEFT, otherwise in the second,
// and the other result is empty.
func SplitInTwoAt(str, chr string, side SideType) (string, string) {
	parts := strings.SplitN(str, chr, 2)
	switch {
	case len(parts) > 1:
		return parts[0], parts[1]
	case side == LEFT:
		return str, ""
	default:
		return "", str
	}
}
// ConvertSlash expands backslash escape sequences in a command-line argument.
// \n, \r, \t, \f, and \a become line feed, carriage return, horizontal tab,
// form feed, and bell; a backslash before any other character yields that
// character unchanged. A lone backslash at the very end of str is dropped.
//
// The string is scanned byte by byte so that multi-byte UTF-8 characters are
// copied through intact. Scanning by rune and storing byte(rune) would
// truncate every non-ASCII character to a single invalid byte.
func ConvertSlash(str string) string {
	if str == "" {
		return str
	}
	// the result is never longer than the input
	res := make([]byte, 0, len(str))
	isSlash := false
	for i := 0; i < len(str); i++ {
		ch := str[i]
		switch {
		case isSlash:
			switch ch {
			case 'n':
				// line feed
				ch = '\n'
			case 'r':
				// carriage return
				ch = '\r'
			case 't':
				// horizontal tab
				ch = '\t'
			case 'f':
				// form feed
				ch = '\f'
			case 'a':
				// audible bell from terminal (undocumented)
				ch = '\x07'
			}
			res = append(res, ch)
			isSlash = false
		case ch == '\\':
			// remember the backslash, the next byte decides what to emit
			isSlash = true
		default:
			res = append(res, ch)
		}
	}
	return string(res)
}
// ParseFlag classifies a command-line argument. Known flags return their
// entry in opTypeIs. Otherwise a dash followed by one or more upper case
// letters or digits is a VARIABLE, any other argument starting with a dash
// is UNRECOGNIZED, and everything else is UNSET.
func ParseFlag(str string) OpType {
	if op, found := opTypeIs[str]; found {
		return op
	}
	if len(str) == 0 || str[0] != '-' {
		return UNSET
	}
	if len(str) > 1 && IsAllCapsOrDigits(str[1:]) {
		return VARIABLE
	}
	return UNRECOGNIZED
}
// CREATE COMMON DRIVER TABLES
// InitTables creates lookup tables to simplify the tokenizer
func InitTables() *Tables {
	// a newly allocated Tables has every entry false, so only the
	// characters that belong to each class need to be marked
	tbls := new(Tables)

	// blank characters, the alternative version omits newline so that
	// newlines can be counted
	for _, ch := range []byte{' ', '\t', '\r', '\f'} {
		tbls.InBlank[ch] = true
		tbls.AltBlank[ch] = true
	}
	tbls.InBlank['\n'] = true

	// letters and underscore may start an element name and may also
	// appear in the rest of the name
	for lower := byte('a'); lower <= 'z'; lower++ {
		upper := lower - 'a' + 'A'
		tbls.InFirst[lower] = true
		tbls.InFirst[upper] = true
		tbls.InElement[lower] = true
		tbls.InElement[upper] = true
	}
	tbls.InFirst['_'] = true
	tbls.InElement['_'] = true

	// digit, dash, and period cannot be the first character,
	// colon is included for namespace
	for digit := byte('0'); digit <= '9'; digit++ {
		tbls.InElement[digit] = true
	}
	for _, ch := range []byte{'-', '.', ':'} {
		tbls.InElement[ch] = true
	}

	return tbls
}
// DebugBlock writes the structure of parsed arguments to stderr, visiting
// the block and then each of its subtasks one level deeper (undocumented)
func DebugBlock(blk *Block, depth int) {
	// writes one space for each level beyond the first
	indent := func(count int) {
		for n := count; n > 1; n-- {
			fmt.Fprintf(os.Stderr, " ")
		}
	}

	indent(depth)

	if blk.Visit != "" {
		indent(depth + 1)
		fmt.Fprintf(os.Stderr, " %s \n", blk.Visit)
	}

	if len(blk.Parsed) > 0 {
		indent(depth + 1)
		fmt.Fprintf(os.Stderr, "")
		for _, arg := range blk.Parsed {
			fmt.Fprintf(os.Stderr, " %s", arg)
		}
		fmt.Fprintf(os.Stderr, " \n")
	}

	// ranging over an empty list does nothing, no length test is needed
	for _, child := range blk.Subtasks {
		DebugBlock(child, depth+1)
	}
}
// PARSE COMMAND-LINE ARGUMENTS
// ParseArguments parses nested exploration instruction from command-line arguments
//
// args holds the command-line arguments. pttrn is the pattern name that is joined
// with an element or attribute name to form the sequenceTypeIs lookup key when a
// sequence coordinate command (ZEROBASED, ONEBASED, UCSC) is parsed.
//
// The result is the single top-level Block, with lower exploration levels nested
// in Subtasks. nil is returned when the first pass does not produce exactly one
// top-level block. Malformed arguments are reported on stderr and end the program
// through os.Exit(1).
func ParseArguments(args []string, pttrn string) *Block {
// different names of exploration control arguments allow multiple levels of nested "for" loops in linear command line
// (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions)
// both slices are indexed by LevelType, the empty first entry keeps index 0 unused
var (
lcname = []string{
"",
"-unit",
"-subset",
"-section",
"-block",
"-branch",
"-group",
"-division",
"-pattern",
}
ucname = []string{
"",
"-Unit",
"-Subset",
"-Section",
"-Block",
"-Branch",
"-Group",
"-Division",
"-Pattern",
}
)
/*
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block DateCreated -sep "-" -element Year,Month,Day \
-block Author -sep " " -tab "" -element "&COM" Initials,LastName -COM "(, )"
PubmedArticle
-element MedlineCitation/PMID
DateCreated
-sep "-" -element Year,Month,Day
Author
-sep " " -tab "" -element &COM Initials,LastName -COM "(, )"
*/
// parseCommands recursive definition
var parseCommands func(parent *Block, startLevel LevelType)
// parseCommands does initial parsing of exploration command structure
// it divides parent.Working among new child blocks, recurses into each child, then clears parent.Working
parseCommands = func(parent *Block, startLevel LevelType) {
// function to find next highest level exploration argument
// starts at the given level and steps downward until a level whose flag occurs in args is found
// returns level 0 and empty tags if there are fewer than two arguments or no flag is present
findNextLevel := func(args []string, level LevelType) (LevelType, string, string) {
if len(args) > 1 {
for {
if level < UNIT {
break
}
lctag := lcname[level]
uctag := ucname[level]
for _, txt := range args {
if txt == lctag || txt == uctag {
return level, lctag, uctag
}
}
level--
}
}
return 0, "", ""
}
arguments := parent.Working
level, lctag, uctag := findNextLevel(arguments, startLevel)
if level < UNIT {
// break recursion
return
}
// function to group arguments at a given exploration level
// the first argument is taken to be the exploration flag and the second the name of the object to visit
subsetCommands := func(args []string) *Block {
max := len(args)
visit := ""
// extract name of object to visit
if max > 1 {
visit = args[1]
args = args[2:]
max -= 2
}
// partition ends up at the first exploration command, or at the end of args if there is none
partition := 0
for cur, str := range args {
// record point of next exploration command
partition = cur + 1
// skip if not a command
if len(str) < 1 || str[0] != '-' {
continue
}
if argTypeIs[str] == EXPLORATION {
partition = cur
break
}
}
// parse parent/child construct
// colon indicates a namespace prefix in any or all of the components
prnt, match := SplitInTwoAt(visit, "/", RIGHT)
// promote arguments parsed at this level
// arguments from the next exploration command onward stay in Working for the recursive call
return &Block{Visit: visit, Parent: prnt, Match: match, Parsed: args[0:partition], Working: args[partition:]}
}
cur := 0
// search for positions of current exploration command
for idx, txt := range arguments {
if txt == lctag || txt == uctag {
if idx == 0 {
continue
}
// arguments between the previous cut point and this flag form one child block
blk := subsetCommands(arguments[cur:idx])
parseCommands(blk, level-1)
parent.Subtasks = append(parent.Subtasks, blk)
cur = idx
}
}
// remaining arguments after the last cut point form the final child block
if cur < len(arguments) {
blk := subsetCommands(arguments[cur:])
parseCommands(blk, level-1)
parent.Subtasks = append(parent.Subtasks, blk)
}
// clear execution arguments from parent after subsetting
parent.Working = nil
}
// parseConditionals converts the conditional arguments of one block into a list of operations
// the argument following -position is stored in cmds.Position instead of becoming an operation
parseConditionals := func(cmds *Block, arguments []string) []*Operation {
max := len(arguments)
if max < 1 {
return nil
}
// check for missing condition command
txt := arguments[0]
if txt != "-if" && txt != "-unless" && txt != "-match" && txt != "-avoid" && txt != "-position" {
fmt.Fprintf(os.Stderr, "\nERROR: Missing -if command before '%s'\n", txt)
os.Exit(1)
}
if txt == "-position" && max > 2 {
fmt.Fprintf(os.Stderr, "\nERROR: Cannot combine -position with -if or -unless commands\n")
os.Exit(1)
}
// check for missing argument after last condition
txt = arguments[max-1]
if len(txt) > 0 && txt[0] == '-' {
fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt)
os.Exit(1)
}
cond := make([]*Operation, 0, max)
// status holds the command awaiting its argument, UNSET means a command is expected next
status := UNSET
// function to parse conditional clause into execution step
parseStep := func(op *Operation, elementColonValue bool) {
if op == nil {
return
}
str := op.Value
// this status is local to parseStep and does not change the parser state above
status := ELEMENT
// check for ampersand, pound, percent, or caret character at beginning of name
if len(str) > 1 {
switch str[0] {
case '&':
if IsAllCapsOrDigits(str[1:]) {
status = VARIABLE
str = str[1:]
} else if strings.Contains(str, ":") {
fmt.Fprintf(os.Stderr, "\nERROR: Unsupported construct '%s', use -if &VARIABLE -equals VALUE instead\n", str)
os.Exit(1)
} else {
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", str)
os.Exit(1)
}
case '#':
status = COUNT
str = str[1:]
case '%':
status = LENGTH
str = str[1:]
case '^':
status = DEPTH
str = str[1:]
default:
}
} else if str == "+" {
status = INDEX
}
// parse parent/element@attribute construct
// colon indicates a namespace prefix in any or all of the components
prnt, match := SplitInTwoAt(str, "/", RIGHT)
match, attrib := SplitInTwoAt(match, "@", LEFT)
val := ""
// leading colon indicates namespace prefix wildcard
wildcard := false
if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") {
wildcard = true
}
if elementColonValue {
// allow parent/element@attribute:value construct for deprecated -match and -avoid, and for subsequent -and and -or commands
// the value after the first colon is split off before the name is divided again
match, val = SplitInTwoAt(str, ":", LEFT)
prnt, match = SplitInTwoAt(match, "/", RIGHT)
match, attrib = SplitInTwoAt(match, "@", LEFT)
}
tsk := &Step{Type: status, Value: str, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard}
op.Stages = append(op.Stages, tsk)
// transform old -match "element:value" to -match element -equals value
if val != "" {
tsk := &Step{Type: EQUALS, Value: val}
op.Stages = append(op.Stages, tsk)
}
}
idx := 0
// conditionals should alternate between command and object/value
expectDash := true
last := ""
// op is the most recent condition that can still accept a constraint, nil once one has been attached
var op *Operation
// flag to allow element-colon-value for deprecated -match and -avoid commands, otherwise colon is for namespace prefixes
// once set it stays set for the remaining arguments of this block
elementColonValue := false
// parse command strings into operation structure
for idx < max {
str := arguments[idx]
idx++
// conditionals should alternate between command and object/value
if expectDash {
if len(str) < 1 || str[0] != '-' {
fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' argument after '%s'\n", str, last)
os.Exit(1)
}
expectDash = false
} else {
if len(str) > 0 && str[0] == '-' {
fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' command after '%s'\n", str, last)
os.Exit(1)
}
expectDash = true
}
last = str
switch status {
case UNSET:
// str is a command, remember it so the next argument is handled as its value
status = ParseFlag(str)
case POSITION:
cmds.Position = str
status = UNSET
case MATCH, AVOID:
elementColonValue = true
fallthrough
case IF, UNLESS, AND, OR:
op = &Operation{Type: status, Value: str}
cond = append(cond, op)
parseStep(op, elementColonValue)
status = UNSET
case EQUALS, CONTAINS, STARTSWITH, ENDSWITH, ISNOT:
if op != nil {
if len(str) > 1 && str[0] == '\\' {
// first character may be backslash protecting dash (undocumented)
str = str[1:]
}
tsk := &Step{Type: status, Value: str}
op.Stages = append(op.Stages, tsk)
op = nil
} else {
fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match constraints\n")
os.Exit(1)
}
status = UNSET
case GT, GE, LT, LE, EQ, NE:
if op != nil {
if len(str) > 1 && str[0] == '\\' {
// first character may be backslash protecting minus sign (undocumented)
str = str[1:]
}
if len(str) < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Empty numeric match constraints\n")
os.Exit(1)
}
ch := str[0]
if (ch >= '0' && ch <= '9') || ch == '-' || ch == '+' {
// literal numeric constant
tsk := &Step{Type: status, Value: str}
op.Stages = append(op.Stages, tsk)
} else if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') {
// numeric test allows element as second argument
prnt, match := SplitInTwoAt(str, "/", RIGHT)
match, attrib := SplitInTwoAt(match, "@", LEFT)
wildcard := false
if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") {
wildcard = true
}
tsk := &Step{Type: status, Value: str, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard}
op.Stages = append(op.Stages, tsk)
} else {
fmt.Fprintf(os.Stderr, "\nERROR: Unexpected numeric match constraints\n")
os.Exit(1)
}
op = nil
} else {
fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent numeric match constraints\n")
os.Exit(1)
}
status = UNSET
case UNRECOGNIZED:
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str)
os.Exit(1)
default:
fmt.Fprintf(os.Stderr, "\nERROR: Unexpected argument '%s'\n", str)
os.Exit(1)
}
}
return cond
}
// parseExtractions converts extraction and customization arguments of one block into a list of operations
// the cmds parameter is not used in the body
parseExtractions := func(cmds *Block, arguments []string) []*Operation {
max := len(arguments)
if max < 1 {
return nil
}
// check for missing -element (or -first, etc.) command
txt := arguments[0]
if len(txt) < 1 || txt[0] != '-' {
fmt.Fprintf(os.Stderr, "\nERROR: Missing -element command before '%s'\n", txt)
os.Exit(1)
}
// check for missing argument after last -element (or -first, etc.) command
// -clr is the only command accepted as the final argument
txt = arguments[max-1]
if len(txt) > 0 && txt[0] == '-' {
if txt == "-rst" {
fmt.Fprintf(os.Stderr, "\nERROR: Unexpected position for %s command\n", txt)
os.Exit(1)
} else if txt != "-clr" {
fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt)
os.Exit(1)
}
}
comm := make([]*Operation, 0, max)
// status holds the command awaiting its argument, UNSET means a command is expected next
status := UNSET
// function to parse next argument
// it assigns the enclosing status variable and also returns the new value
nextStatus := func(str string) OpType {
status = ParseFlag(str)
switch status {
case VARIABLE:
// -NAME starts a variable definition, the following argument supplies its value
op := &Operation{Type: status, Value: str[1:]}
comm = append(comm, op)
status = VALUE
case CLR, RST:
// commands without an argument are recorded immediately
op := &Operation{Type: status, Value: ""}
comm = append(comm, op)
status = UNSET
case ELEMENT, FIRST, LAST, ENCODE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSC:
case TAB, RET, PFX, SFX, SEP, LBL, PFC:
case UNSET:
fmt.Fprintf(os.Stderr, "\nERROR: No -element before '%s'\n", str)
os.Exit(1)
case UNRECOGNIZED:
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str)
os.Exit(1)
default:
fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str)
os.Exit(1)
}
return status
}
// function to parse extraction clause into individual steps
parseSteps := func(op *Operation, pttrn string) {
if op == nil {
return
}
stat := op.Type
str := op.Value
// element names combined with commas are treated as a prefix-separator-suffix group
comma := strings.Split(str, ",")
for _, item := range comma {
status := stat
// check for special character at beginning of name
if len(item) > 1 {
switch item[0] {
case '&':
if IsAllCapsOrDigits(item[1:]) {
status = VARIABLE
item = item[1:]
} else {
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", item)
os.Exit(1)
}
case '#':
status = COUNT
item = item[1:]
case '%':
status = LENGTH
item = item[1:]
case '^':
status = DEPTH
item = item[1:]
case '*':
// this loop has no effect, any item starting with an asterisk becomes STAR
for _, ch := range item {
if ch != '*' {
break
}
}
status = STAR
default:
}
} else if item == "*" {
status = STAR
} else if item == "+" {
status = INDEX
}
// parse parent/element@attribute construct
// colon indicates a namespace prefix in any or all of the components
prnt, match := SplitInTwoAt(item, "/", RIGHT)
match, attrib := SplitInTwoAt(match, "@", LEFT)
// leading colon indicates namespace prefix wildcard
wildcard := false
if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") {
wildcard = true
}
// sequence coordinate adjustments
switch status {
case ZEROBASED, ONEBASED, UCSC:
// lookup key is pattern:@attribute if an attribute was given, otherwise pattern:element
seq := pttrn + ":"
if attrib != "" {
seq += "@"
seq += attrib
} else if match != "" {
seq += match
}
// confirm -0-based or -1-based arguments are known sequence position elements or attributes
seqtype, ok := sequenceTypeIs[seq]
if !ok {
fmt.Fprintf(os.Stderr, "\nERROR: Element '%s' is not suitable for sequence coordinate conversion\n", item)
os.Exit(1)
}
// replace the conversion request with ELEMENT, INC, or DEC depending on the native coordinate base
switch status {
case ZEROBASED:
status = ELEMENT
// if 1-based coordinates, decrement to get 0-based value
if seqtype.Based == 1 {
status = DEC
}
case ONEBASED:
status = ELEMENT
// if 0-based coordinates, increment to get 1-based value
if seqtype.Based == 0 {
status = INC
}
case UCSC:
status = ELEMENT
// half-open intervals, start is 0-based, stop is 1-based
if seqtype.Based == 0 && seqtype.Which == ISSTOP {
status = INC
} else if seqtype.Based == 1 && seqtype.Which == ISSTART {
status = DEC
}
default:
status = ELEMENT
}
default:
}
tsk := &Step{Type: status, Value: item, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard}
op.Stages = append(op.Stages, tsk)
}
}
idx := 0
// parse command strings into operation structure
for idx < max {
str := arguments[idx]
idx++
if argTypeIs[str] == CONDITIONAL {
fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str)
os.Exit(1)
}
switch status {
case UNSET:
status = nextStatus(str)
case ELEMENT, FIRST, LAST, ENCODE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSC:
// consume every following argument up to the next one that starts with a dash
for !strings.HasPrefix(str, "-") {
// create one operation per argument, even if under a single -element statement
op := &Operation{Type: status, Value: str}
comm = append(comm, op)
parseSteps(op, pttrn)
if idx >= max {
break
}
str = arguments[idx]
idx++
}
status = UNSET
// NOTE(review): if the scan above stopped on a dash-prefixed command that is also
// the final argument, idx equals max here and that command is never passed to
// nextStatus, so a trailing -clr after element names appears to be dropped - confirm intended
if idx < max {
status = nextStatus(str)
}
case TAB, RET, PFX, SFX, SEP, LBL, PFC:
// customization values may contain backslash escapes
op := &Operation{Type: status, Value: ConvertSlash(str)}
comm = append(comm, op)
status = UNSET
case VARIABLE:
op := &Operation{Type: status, Value: str[1:]}
comm = append(comm, op)
status = VALUE
case VALUE:
op := &Operation{Type: status, Value: str}
comm = append(comm, op)
parseSteps(op, pttrn)
status = UNSET
case UNRECOGNIZED:
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str)
os.Exit(1)
default:
}
}
return comm
}
// parseOperations recursive definition
var parseOperations func(parent *Block)
// parseOperations converts parsed arguments to operations lists
// it fills Conditions, Commands, and Failure of the block and then does the same for each subtask
parseOperations = func(parent *Block) {
args := parent.Parsed
// partition ends up at the first command that is not a conditional, or at the end of args
partition := 0
for cur, str := range args {
// record junction between conditional and extraction commands
partition = cur + 1
// skip if not a command
if len(str) < 1 || str[0] != '-' {
continue
}
if argTypeIs[str] != CONDITIONAL {
partition = cur
break
}
}
// split arguments into conditional tests and extraction or customization commands
conditionals := args[0:partition]
args = args[partition:]
partition = 0
foundElse := false
for cur, str := range args {
// record junction at -else command
partition = cur + 1
// skip if not a command
if len(str) < 1 || str[0] != '-' {
continue
}
if str == "-else" {
partition = cur
foundElse = true
break
}
}
extractions := args[0:partition]
alternative := args[partition:]
// drop the -else command itself from the alternative commands
if len(alternative) > 0 && alternative[0] == "-else" {
alternative = alternative[1:]
}
// validate argument structure and convert to operations lists
parent.Conditions = parseConditionals(parent, conditionals)
parent.Commands = parseExtractions(parent, extractions)
parent.Failure = parseExtractions(parent, alternative)
// reality checks on placement of -else command
// -else needs a preceding condition, at least one following command, and a block without subtasks
if foundElse {
if len(conditionals) < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n")
os.Exit(1)
}
if len(alternative) < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n")
os.Exit(1)
}
if len(parent.Subtasks) > 0 {
fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n")
os.Exit(1)
}
}
for _, sub := range parent.Subtasks {
parseOperations(sub)
}
}
// ParseArguments
// head is an empty placeholder whose Working list receives a copy of all arguments
head := &Block{}
for _, txt := range args {
head.Working = append(head.Working, txt)
}
// initial parsing of exploration command structure
parseCommands(head, PATTERN)
if len(head.Subtasks) != 1 {
return nil
}
// skip past empty placeholder
head = head.Subtasks[0]
// convert command strings to array of operations for faster processing
parseOperations(head)
// check for no -element or multiple -pattern commands
// these checks run after parseOperations, so errors found there are reported first
noElement := true
numPatterns := 0
for _, txt := range args {
if argTypeIs[txt] == EXTRACTION {
noElement = false
}
if txt == "-pattern" || txt == "-Pattern" {
numPatterns++
}
}
if numPatterns < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n")
os.Exit(1)
}
if numPatterns > 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Only one -pattern command is permitted\n")
os.Exit(1)
}
if noElement {
fmt.Fprintf(os.Stderr, "\nERROR: No -element statement in argument list\n")
os.Exit(1)
}
return head
}
// READ XML INPUT FILE INTO SET OF BLOCKS
// XMLReader reads XML text from an io.Reader in blocks that end with a
// > character.
type XMLReader struct {
	// source of the XML text
	Reader io.Reader
	// working storage for one read, including space for carried-over text
	Buffer []byte
	// text after the last > of the previous read, prepended to the next read
	Remainder string
	// Closed is set once the input has ended, Docompress and Docleanup
	// select optional white space processing of each block
	Closed, Docompress, Docleanup bool
}
// NewXMLReader creates an XMLReader for the given input with its read buffer
// allocated. doCompress and doCleanup are stored as the Docompress and
// Docleanup settings. A nil input yields a nil reader.
func NewXMLReader(in io.Reader, doCompress, doCleanup bool) *XMLReader {
	if in == nil {
		return nil
	}

	const (
		// 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin
		// increasing size of buffer when input is from a file does not improve program performance
		readChunk = 65536
		// additional 16384 bytes are reserved for copying previous remainder to start of buffer before next read
		carryOver = 16384
	)

	return &XMLReader{
		Reader:     in,
		Buffer:     make([]byte, readChunk+carryOver),
		Docompress: doCompress,
		Docleanup:  doCleanup,
	}
}
// NextBlock reads buffer, concatenates if necessary to place long element content into a single string
// all result strings end in > character that is used as a sentinel in subsequent code
//
// An empty string is returned when the receiver is nil or when the input has
// ended. Text after the final > character of the input is never returned.
// Depending on the Docompress and Docleanup settings, the block is passed
// through CompressRunsOfSpaces and CleanupBadSpaces before it is returned.
func (rdr *XMLReader) NextBlock() string {
	if rdr == nil {
		return ""
	}

	// number of bytes at the front of the buffer set aside for the remainder of the previous read
	const reserved = 16384

	// read one buffer, trim at last > and retain remainder for next call, signal if no > character
	// results are the text, a flag that more text must be read to reach a >, and a flag that input has ended
	nextBuffer := func() (string, bool, bool) {
		if rdr.Closed {
			return "", false, true
		}

		// prepend previous remainder to beginning of buffer
		m := copy(rdr.Buffer, rdr.Remainder)
		rdr.Remainder = ""
		if m > reserved {
			// previous remainder is larger than reserved section, write and signal need to continue reading
			// only the m bytes just copied are valid, the rest of the buffer still holds the previous read
			return string(rdr.Buffer[:m]), true, false
		}

		// read next block, append behind copied remainder from previous read
		// with data piped through stdin, read function may not always return the same number of bytes each time
		n, err := rdr.Reader.Read(rdr.Buffer[m:])
		if err != nil {
			// end of file (or read failure), no further reads will be attempted
			rdr.Closed = true
			if n < 1 {
				// do not send final remainder (not terminated by right angle bracket that is used as a sentinel)
				return "", false, true
			}
			// a reader may deliver its last bytes together with the error, process them before stopping
		}

		// slice of actual characters read
		bufr := rdr.Buffer[:n+m]

		// trim back to last > character, save remainder for next buffer
		if pos := bytes.LastIndexByte(bufr, '>'); pos >= 0 {
			pos++
			rdr.Remainder = string(bufr[pos:])
			return string(bufr[:pos]), false, false
		}

		if rdr.Closed {
			// input ended without another sentinel, the unterminated text is not sent
			return "", false, true
		}

		// no > found, signal need to continue reading long content
		return string(bufr), true, false
	}

	// read next buffer
	line, cont, closed := nextBuffer()
	if closed {
		// no sentinel in remainder at end of file
		return ""
	}

	// if buffer does not end with > character
	if cont {
		var buff bytes.Buffer
		// keep reading long content blocks
		for {
			if line != "" {
				buff.WriteString(line)
			}
			if !cont {
				// last buffer ended with sentinel
				break
			}
			line, cont, closed = nextBuffer()
			if closed {
				// no sentinel in multi-block buffer at end of file
				return ""
			}
		}
		// concatenate blocks
		line = buff.String()
	}

	// trimming spaces here would throw off line tracking

	// optionally compress/cleanup tags/attributes and contents
	if rdr.Docompress {
		line = CompressRunsOfSpaces(line)
	}
	if rdr.Docleanup {
		if HasBadSpace(line) {
			line = CleanupBadSpaces(line)
		}
	}

	return line
}
// PARSE XML BLOCK STREAM INTO STRINGS FROM <PATTERN> TO </PATTERN>
// PartitionPattern splits XML input by pattern and sends individual records to a callback
//
// pat is the element name that delimits a record. star selects the mode: an empty
// string collects each pat element, "*" collects each object found below the pat
// element instead, and any other value does nothing. rdr supplies the XML text in
// blocks. proc is called once per record with a record number that starts at 1
// and the text of the record.
//
// The function returns without doing anything if pat is empty or rdr or proc is nil.
func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string)) {
if pat == "" || rdr == nil || proc == nil {
return
}
// Scanner holds a search pattern together with its precomputed skip table
type Scanner struct {
Pattern string
PatLength int
CharSkip [256]int
}
// function to initialize the scanner
newScanner := func(pattern string) *Scanner {
if pattern == "" {
return nil
}
scr := &Scanner{Pattern: pattern}
patlen := len(pattern)
scr.PatLength = patlen
// position of last character in pattern
last := patlen - 1
// initialize bad character displacement table
// characters not in the pattern shift by the full pattern length
for i := range scr.CharSkip {
scr.CharSkip[i] = patlen
}
// characters in the pattern, except its last position, shift by their distance from the end
for i := 0; i < last; i++ {
ch := pattern[i]
scr.CharSkip[ch] = last - i
}
return scr
}
// function check surroundings of match candidate
// lf is the index just before the candidate, rt the index just after it, mx the length of text
// candidate must follow < or </ and be followed by >, a space, or />
isAnElement := func(text string, lf, rt, mx int) bool {
if (lf >= 0 && text[lf] == '<') || (lf > 0 && text[lf] == '/' && text[lf-1] == '<') {
if (rt < mx && (text[rt] == '>' || text[rt] == ' ')) || (rt+1 < mx && text[rt] == '/' && text[rt+1] == '>') {
return true
}
}
return false
}
// modified Boyer-Moore-Horspool search function
// returns the index after the first character of the matched name, the index of the
// enclosing left angle bracket, and the index just past the right angle bracket
// all three results are -1 if there is no further match
findNextMatch := func(scr *Scanner, text string, offset int) (int, int, int) {
if scr == nil || text == "" {
return -1, -1, -1
}
// copy values into local variables for speed
txtlen := len(text)
pattern := scr.Pattern[:]
patlen := scr.PatLength
max := txtlen - patlen
last := patlen - 1
skip := scr.CharSkip[:]
i := offset
for i <= max {
// compare pattern against text from right to left
j := last
k := i + last
for j >= 0 && text[k] == pattern[j] {
j--
k--
}
// require match candidate to be element name, i.e., <pattern ... />, <pattern>, or </pattern>
if j < 0 && isAnElement(text, i-1, i+patlen, txtlen) {
// find positions of flanking brackets
lf := i - 1
for lf > 0 && text[lf] != '<' {
lf--
}
rt := i + patlen
for rt < txtlen && text[rt] != '>' {
rt++
}
return i + 1, lf, rt + 1
}
// find character in text above last character in pattern
ch := text[i+last]
// displacement table can shift pattern by one or more positions
i += skip[ch]
}
return -1, -1, -1
}
// PatternType classifies the tag found by nextPattern
type PatternType int
const (
NOPATTERN PatternType = iota
STARTPATTERN
SELFPATTERN
STOPPATTERN
)
// function to find next element with pattern name
// returns the kind of tag, a position within text, and the offset for the next search
// for a stop tag the position is just past its >, otherwise it is the index of its <
nextPattern := func(scr *Scanner, text string, pos int) (PatternType, int, int) {
if scr == nil || text == "" {
return NOPATTERN, 0, 0
}
prev := pos
// every path through the loop body returns, so it runs at most once
for {
next, start, stop := findNextMatch(scr, text, prev)
if next < 0 {
return NOPATTERN, 0, 0
}
prev = next + 1
if text[start+1] == '/' {
return STOPPATTERN, stop, prev
} else if text[stop-2] == '/' {
return SELFPATTERN, start, prev
} else {
return STARTPATTERN, start, prev
}
}
}
// -pattern Object construct
doNormal := func() {
// current depth of -pattern objects
level := 0
// index in the current block where the text of the current record starts
begin := 0
inPattern := false
line := ""
// collects the pieces of a record that spans more than one block
var accumulator bytes.Buffer
match := NOPATTERN
pos := 0
next := 0
// number of records sent so far
rec := 0
scr := newScanner(pat)
if scr == nil {
return
}
for {
begin = 0
next = 0
line = rdr.NextBlock()
if line == "" {
return
}
for {
match, pos, next = nextPattern(scr, line, next)
if match == STARTPATTERN {
// only the outermost start tag marks the beginning of a record
if level == 0 {
inPattern = true
begin = pos
}
level++
} else if match == STOPPATTERN {
level--
if level == 0 {
inPattern = false
accumulator.WriteString(line[begin:pos])
// read and process one -pattern object at a time
str := accumulator.String()
if str != "" {
rec++
proc(rec, str[:])
}
// reset accumulator
accumulator.Reset()
}
} else if match == SELFPATTERN {
// self-closing pattern element is ignored
} else {
// no more tags in this block, keep the unfinished part of the record and read the next block
if inPattern {
accumulator.WriteString(line[begin:])
}
break
}
}
}
}
// -pattern Parent/* construct now works with catenated files, but not if components
// are recursive or self-closing objects, process those through xtract -format first
doStar := func() {
// current depth of -pattern objects
level := 0
begin := 0
inPattern := false
line := ""
var accumulator bytes.Buffer
match := NOPATTERN
pos := 0
next := 0
rec := 0
scr := newScanner(pat)
if scr == nil {
return
}
// name the scanner is currently built for
last := pat
// read to first element
for {
next = 0
line = rdr.NextBlock()
if line == "" {
break
}
match, pos, next = nextPattern(scr, line, next)
if match == STARTPATTERN {
break
}
}
if match != STARTPATTERN {
return
}
// function to find next element in XML
// returns the name of the next start tag at or after pos, "/" for the end tag of
// the parent pattern, or an empty string if neither is found
nextElement := func(text string, pos int) string {
txtlen := len(text)
tag := ""
for i := pos; i < txtlen; i++ {
if text[i] == '<' {
tag = text[i+1:]
break
}
}
if tag == "" {
return ""
}
if tag[0] == '/' {
if strings.HasPrefix(tag[1:], pat) {
//should be at end, want to continue if catenated files
return "/"
}
return ""
}
// name ends at the first >, space, or slash
for i, ch := range tag {
if ch == '>' || ch == ' ' || ch == '/' {
return tag[0:i]
}
}
return ""
}
// read and process heterogeneous objects immediately below parent
for {
tag := nextElement(line, next)
if tag == "" {
// nothing left in this block, try once more with the next block
begin = 0
next = 0
line = rdr.NextBlock()
if line == "" {
break
}
tag = nextElement(line, next)
}
if tag == "" {
return
}
// check for catenated parent set files
if tag[0] == '/' {
scr = newScanner(pat)
if scr == nil {
return
}
last = pat
// confirm end just found
match, pos, next = nextPattern(scr, line, next)
if match != STOPPATTERN {
return
}
// now look for a new start tag
for {
match, pos, next = nextPattern(scr, line, next)
if match == STARTPATTERN {
break
}
next = 0
line = rdr.NextBlock()
if line == "" {
break
}
}
if match != STARTPATTERN {
return
}
// continue with processing loop
continue
}
// build a new scanner only when the object name differs from the previous one
if tag != last {
scr = newScanner(tag)
if scr == nil {
return
}
last = tag
}
// collect one complete object with the current name
for {
match, pos, next = nextPattern(scr, line, next)
if match == STARTPATTERN {
if level == 0 {
inPattern = true
begin = pos
}
level++
} else if match == STOPPATTERN {
level--
if level == 0 {
inPattern = false
accumulator.WriteString(line[begin:pos])
// read and process one -pattern/* object at a time
str := accumulator.String()
if str != "" {
rec++
proc(rec, str[:])
}
// reset accumulator
accumulator.Reset()
break
}
} else {
// no further tag with this name in the block, keep the unfinished part and read the next block
if inPattern {
accumulator.WriteString(line[begin:])
}
begin = 0
next = 0
line = rdr.NextBlock()
if line == "" {
break
}
}
}
}
}
// call appropriate handler
if star == "" {
doNormal()
} else if star == "*" {
doStar()
}
}
// XML VALIDATION AND FORMATTING FUNCTIONS
// ProcessXMLStream tokenizes and runs designated operations on an entire XML file
func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action SpecialType) (int, int) {
if in == nil || tbls == nil {
return 0, 0
}
blockCount := 0
// token parser variables
Text := ""
Txtlen := 0
Idx := 0
Line := 1
// variables to track comments or CDATA sections that span reader blocks
Which := NOTAG
SkipTo := ""
nextToken := func(idx int) (TagType, string, string, int, int) {
if Text == "" {
// if buffer is empty, read next block
Text = in.NextBlock()
Txtlen = len(Text)
Idx = 0
idx = 0
blockCount++
}
if Text == "" {
return ISCLOSED, "", "", Line, 0
}
// lookup table array pointers
inBlank := &tbls.AltBlank
inFirst := &tbls.InFirst
inElement := &tbls.InElement
text := Text[:]
txtlen := Txtlen
line := Line
if Which != NOTAG && SkipTo != "" {
which := Which
// previous block ended inside CDATA object or comment
start := idx
found := strings.Index(text[:], SkipTo)
if found < 0 {
// no stop signal found in next block
// count lines
for i := 0; i < txtlen; i++ {
if text[i] == '\n' {
line++
}
}
Line = line
str := text[:]
if HasFlankingSpace(str) {
str = strings.TrimSpace(str)
}
// signal end of current block
Text = ""
// leave Which and SkipTo values unchanged as another continuation signal
// send CDATA or comment contents
return which, str[:], "", Line, 0
}
// otherwise adjust position past end of skipTo string and return to normal processing
idx += found
// count lines
for i := 0; i < idx; i++ {
if text[i] == '\n' {
line++
}
}
Line = line
str := text[start:idx]
if HasFlankingSpace(str) {
str = strings.TrimSpace(str)
}
idx += len(SkipTo)
// clear tracking variables
Which = NOTAG
SkipTo = ""
// send CDATA or comment contents
return which, str[:], "", Line, idx
}
// all blocks end with > character, acts as sentinel to check if past end of text
if idx >= txtlen {
// signal end of current block, will read next block on next call
Text = ""
Line = line
return NOTAG, "", "", Line, 0
}
// skip past leading blanks
ch := text[idx]
for {
for inBlank[ch] {
idx++
ch = text[idx]
}
if ch != '\n' {
break
}
line++
idx++
ch = text[idx]
}
Line = line
start := idx
if ch == '<' {
// at start of element
idx++
ch = text[idx]
// check for legal first character of element
if inFirst[ch] {
// read element name
start = idx
idx++
ch = text[idx]
for inElement[ch] {
idx++
ch = text[idx]
}
str := text[start:idx]
switch ch {
case '>':
// end of element
idx++
return STARTTAG, str[:], "", Line, idx
case '/':
// self-closing element without attributes
idx++
ch = text[idx]
if ch != '>' {
fmt.Fprintf(os.Stderr, "\nSelf-closing element missing right angle bracket\n")
}
idx++
return SELFTAG, str[:], "", Line, idx
case '\n':
line++
fallthrough
case ' ', '\t', '\r', '\f':
// attributes
idx++
start = idx
ch = text[idx]
for {
for ch != '<' && ch != '>' && ch != '\n' {
idx++
ch = text[idx]
}
if ch != '\n' {
break
}
line++
idx++
ch = text[idx]
}
Line = line
if ch != '>' {
fmt.Fprintf(os.Stderr, "\nAttributes not followed by right angle bracket\n")
}
if text[idx-1] == '/' {
// self-closing
atr := text[start : idx-1]
idx++
return SELFTAG, str[:], atr[:], Line, idx
}
atr := text[start:idx]
idx++
return STARTTAG, str[:], atr[:], Line, idx
default:
fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
return STARTTAG, str[:], "", Line, idx
}
} else {
// punctuation character immediately after first angle bracket
switch ch {
case '/':
// at start of end tag
idx++
start = idx
ch = text[idx]
// expect legal first character of element
if inFirst[ch] {
idx++
ch = text[idx]
for inElement[ch] {
idx++
ch = text[idx]
}
str := text[start:idx]
if ch != '>' {
fmt.Fprintf(os.Stderr, "\nUnexpected characters after end element name\n")
}
idx++
return STOPTAG, str[:], "", Line, idx
}
fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
case '?':
// skip ?xml and ?processing instructions
idx++
ch = text[idx]
for ch != '>' {
idx++
ch = text[idx]
}
idx++
return NOTAG, "", "", Line, idx
case '!':
// skip !DOCTYPE, !comment, and ![CDATA[
idx++
start = idx
ch = text[idx]
Which := NOTAG
SkipTo := ""
if ch == '[' && strings.HasPrefix(text[idx:], "[CDATA[") {
Which = CDATATAG
SkipTo = "]]>"
start += 7
} else if ch == '-' && strings.HasPrefix(text[idx:], "--") {
Which = COMMENTTAG
SkipTo = "-->"
start += 2
}
if Which != NOTAG && SkipTo != "" {
which := Which
// CDATA or comment block may contain internal angle brackets
found := strings.Index(text[idx:], SkipTo)
if found < 0 {
// string stops in middle of CDATA or comment
// count lines
for i := start; i < txtlen; i++ {
if text[i] == '\n' {
line++
}
}
Line = line
str := text[start:]
if HasFlankingSpace(str) {
str = strings.TrimSpace(str)
}
// signal end of current block
Text = ""
// leave Which and SkipTo values unchanged as another continuation signal
// send CDATA or comment contents
return which, str[:], "", Line, 0
}
// adjust position past end of CDATA or comment
idx += found
// count lines
for i := start; i < idx; i++ {
if text[i] == '\n' {
line++
}
}
Line = line
str := text[start:idx]
if HasFlankingSpace(str) {
str = strings.TrimSpace(str)
}
idx += len(SkipTo)
// clear tracking variables
Which = NOTAG
SkipTo = ""
// send CDATA or comment contents
return which, str[:], "", Line, idx
}
// otherwise just skip to next right angle bracket
for ch != '>' {
if ch == '\n' {
line++
}
idx++
ch = text[idx]
}
Line = line
idx++
return NOTAG, "", "", Line, idx
default:
fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
}
}
} else if ch != '>' {
// at start of contents
start = idx
// find end of contents
for {
for ch != '<' && ch != '>' && ch != '\n' {
idx++
ch = text[idx]
}
if ch != '\n' {
break
}
line++
idx++
ch = text[idx]
}
Line = line
// trim back past trailing blanks
lst := idx - 1
ch = text[lst]
for inBlank[ch] && lst > start {
lst--
ch = text[lst]
}
str := text[start : lst+1]
return CONTENTTAG, str[:], "", Line, idx
}
// signal end of current block, will read next block on next call
Text = ""
Line = line
return NOTAG, "", "", Line, 0
}
// output buffer shared by the text-producing closures below, which append to it
// and periodically write it to stdout
var buffer bytes.Buffer

// loop iterations counted by those closures since the buffer was last written out
var count int
// processOutline displays an outline of the XML structure: every start tag and
// self-closing tag is written on its own line, preceded by one space per level
// of nesting. A fixed set of top-level container names is not printed.
processOutline := func() {
	// isWrapper reports whether the name is one of the containers left out of the outline
	isWrapper := func(nm string) bool {
		switch nm {
		case "eSummaryResult",
			"eLinkResult",
			"eInfoResult",
			"PubmedArticleSet",
			"DocumentSummarySet",
			"INSDSet",
			"Entrezgene-Set",
			"TaxaSet":
			return true
		}
		return false
	}

	// writeName emits the element name behind its indentation
	writeName := func(nm string, depth int) {
		for n := depth; n > 0; n-- {
			buffer.WriteString(" ")
		}
		buffer.WriteString(nm)
		buffer.WriteString("\n")
	}

	// flush prints whatever is currently buffered
	flush := func() {
		if pending := buffer.String(); pending != "" {
			fmt.Fprintf(os.Stdout, "%s", pending)
		}
	}

	depth := 0

	for {
		tag, name, _, _, next := nextToken(Idx)
		Idx = next

		if tag == ISCLOSED {
			// print final buffer
			flush()
			return
		}

		switch tag {
		case STARTTAG:
			// skipped containers neither print nor deepen the outline
			if !isWrapper(name) {
				writeName(name, depth)
				depth++
			}
		case SELFTAG:
			writeName(name, depth)
		case STOPTAG:
			depth--
		}

		// print current buffered output after every 1000 iterations
		count++
		if count > 1000 {
			count = 0
			flush()
			buffer.Reset()
		}
	}
}
// processSynopsis displays the slash-separated path to every XML element, one
// path per line. A fixed set of top-level container names is left out of the paths.
processSynopsis := func() {
	// isWrapper reports whether the name is one of the containers left out of the paths
	isWrapper := func(nm string) bool {
		switch nm {
		case "eSummaryResult",
			"eLinkResult",
			"eInfoResult",
			"PubmedArticleSet",
			"DocumentSummarySet",
			"INSDSet",
			"Entrezgene-Set",
			"TaxaSet":
			return true
		}
		return false
	}

	// joinPath appends an element name to its parent path
	joinPath := func(parent, nm string) string {
		if parent == "" {
			return nm
		}
		return parent + "/" + nm
	}

	// flush prints whatever is currently buffered
	flush := func() {
		if pending := buffer.String(); pending != "" {
			fmt.Fprintf(os.Stdout, "%s", pending)
		}
	}

	// walk handles one nesting level, returns true once the input is exhausted
	var walk func(string) bool

	walk = func(parent string) bool {
		for {
			tag, name, _, _, next := nextToken(Idx)
			Idx = next

			switch tag {
			case STOPTAG:
				// end of this level, hand control back to the caller
				return false
			case ISCLOSED:
				// print final buffer
				flush()
				return true
			case STARTTAG:
				if isWrapper(name) {
					break
				}
				path := joinPath(parent, name)
				buffer.WriteString(path)
				buffer.WriteString("\n")
				// descend, propagating end of input upward
				if walk(path) {
					return true
				}
			case SELFTAG:
				buffer.WriteString(joinPath(parent, name))
				buffer.WriteString("\n")
			}

			// print current buffered output after every 1000 iterations
			count++
			if count > 1000 {
				count = 0
				flush()
				buffer.Reset()
			}
		}
	}

	// may have concatenated XMLs, keep restarting at the top level until input ends
	for !walk("") {
	}
}
// processVerify checks for well-formed XML. It walks the token stream
// recursively, one call per nesting level, and writes a diagnostic line to
// stdout for each problem it sees: a start tag following text contents, text
// contents following a child element, an end tag that does not match its start
// tag, an end tag at level zero, or input that ends inside an open element.
// If a pattern name follows the command, it also reports the maximum nesting
// depth and the pattern instance spanning the most input blocks.
processVerify := func() {

	type VerifyType int

	// what was last seen at the current nesting level
	const (
		_ VerifyType = iota
		START
		STOP
		CHAR
		OTHER
	)

	// skip past command name
	args = args[1:]

	// optional pattern name
	pttrn := ""
	if len(args) > 0 {
		pttrn = args[0]
		args = args[1:]
	}

	// if pattern supplied, report maximum nesting depth and record spanning the most blocks
	maxDepth := 0
	depthLine := 0
	maxBlocks := 0
	blockLine := 0
	startLine := 0

	// verifyLevel recursive definition
	var verifyLevel func(string, int)

	// verify integrity of XML object nesting (well-formed)
	verifyLevel = func(parent string, level int) {

		status := START

		for {
			// use alternative low-level tokenizer
			tag, name, _, line, idx := nextToken(Idx)
			Idx = idx

			// remember the deepest level reached and where it occurred
			if level > maxDepth {
				maxDepth = level
				depthLine = line
			}

			switch tag {
			case STARTTAG:
				if status == CHAR {
					fmt.Fprintf(os.Stdout, "<%s> not expected after contents, line %d\n", name, line)
				}
				if name == pttrn {
					// restart the block counter at the start of each pattern instance
					blockCount = 1
					startLine = line
				}
				verifyLevel(name, level+1)
				// returns here after recursion
				status = STOP
			case SELFTAG:
				status = OTHER
			case STOPTAG:
				if name == pttrn {
					if blockCount > maxBlocks {
						maxBlocks = blockCount
						blockLine = startLine
					}
				}
				// closing tags are shown with their leading "</" so the
				// messages mirror the "<name>" form used for start tags
				if parent != name && parent != "" {
					fmt.Fprintf(os.Stdout, "Expected </%s>, found </%s>, line %d\n", parent, name, line)
				}
				if level < 1 {
					fmt.Fprintf(os.Stdout, "Unexpected </%s> at end of XML, line %d\n", name, line)
				}
				// break recursion
				return
			case CONTENTTAG:
				if status != START {
					fmt.Fprintf(os.Stdout, "Contents not expected before </%s>, line %d - status %d\n", parent, line, status)
				}
				status = CHAR
			case CDATATAG, COMMENTTAG:
				status = OTHER
			case NOTAG:
			case ISCLOSED:
				if level > 0 {
					fmt.Fprintf(os.Stdout, "Unexpected end of data\n")
				}
				return
			default:
				status = OTHER
			}
		}
	}

	verifyLevel("", 0)

	if pttrn != "" {
		fmt.Fprintf(os.Stdout, "Maximum nesting (%d levels) at line %d\n", maxDepth, depthLine)
		fmt.Fprintf(os.Stdout, "Longest pattern (%d blocks) at line %d\n", maxBlocks, blockLine)
	}
}
// processFilter modifies XML content, comments, CDATA, attributes, or whole
// objects inside a named pattern. The arguments following the command name are
//
//	pattern action target
//
// where action is retain, remove, encode, decode, or shrink, and target is
// attribute(s), content(s), cdata/CDATA, comment(s), or object. The stream is
// written to stdout with one tag or text item per line. Outside the pattern,
// tags and contents are copied through; CDATA and comments are printed only
// when they are the target inside the pattern and the action keeps them.
// The process exits with status 1 on missing or unrecognized arguments.
processFilter := func() {

	// skip past command name
	args = args[1:]

	if len(args) < 1 {
		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -filter\n")
		os.Exit(1)
	}

	pttrn := args[0]

	args = args[1:]

	// both an action and a target must follow the pattern
	if len(args) < 2 {
		fmt.Fprintf(os.Stderr, "\nERROR: No object name supplied to xtract -filter\n")
		os.Exit(1)
	}

	type ActionType int

	const (
		NOACTION ActionType = iota
		DORETAIN
		DOREMOVE
		DOENCODE
		DODECODE
		DOSHRINK
	)

	action := args[0]

	what := NOACTION
	switch action {
	case "retain":
		what = DORETAIN
	case "remove":
		what = DOREMOVE
	case "encode":
		what = DOENCODE
	case "decode":
		what = DODECODE
	case "shrink":
		what = DOSHRINK
	default:
		fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized action '%s' supplied to xtract -filter\n", action)
		os.Exit(1)
	}

	trget := args[1]

	which := NOTAG
	switch trget {
	case "attribute", "attributes":
		which = ATTRIBTAG
	case "content", "contents":
		which = CONTENTTAG
	case "cdata", "CDATA":
		which = CDATATAG
	case "comment", "comments":
		which = COMMENTTAG
	case "object":
		// object normally retained
		which = OBJECTTAG
	default:
		fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized target '%s' supplied to xtract -filter\n", trget)
		os.Exit(1)
	}

	// true when everything from the pattern start tag through its end tag is dropped
	removeObject := which == OBJECTTAG && what == DOREMOVE

	// attributes are printed unless "remove attributes" was requested
	// (this applies to every tag, not only those inside the pattern)
	keepAttrs := which != ATTRIBTAG || what != DOREMOVE

	// writeOpenTag prints a start or self-closing tag, closer is ">\n" or "/>\n"
	writeOpenTag := func(name, attr, closer string) {
		buffer.WriteString("<")
		buffer.WriteString(name)
		if attr != "" && keepAttrs {
			attr = strings.TrimSpace(attr)
			attr = CompressRunsOfSpaces(attr)
			buffer.WriteString(" ")
			buffer.WriteString(attr)
		}
		buffer.WriteString(closer)
	}

	// applyAction transforms targeted text, the boolean is false if the text is to be dropped
	applyAction := func(str string) (string, bool) {
		switch what {
		case DORETAIN:
			return str, true
		case DOENCODE:
			return html.EscapeString(str), true
		case DODECODE:
			return html.UnescapeString(str), true
		case DOSHRINK:
			return CompressRunsOfSpaces(str), true
		}
		// DOREMOVE and anything unexpected
		return "", false
	}

	// writeText prints trimmed text on its own line
	writeText := func(str string) {
		if HasFlankingSpace(str) {
			str = strings.TrimSpace(str)
		}
		buffer.WriteString(str)
		buffer.WriteString("\n")
	}

	inPattern := false

	for {
		tag, name, attr, _, idx := nextToken(Idx)
		Idx = idx

		switch tag {
		case STARTTAG:
			if name == pttrn {
				inPattern = true
			}
			if inPattern && removeObject {
				continue
			}
			writeOpenTag(name, attr, ">\n")
		case SELFTAG:
			if inPattern && removeObject {
				continue
			}
			writeOpenTag(name, attr, "/>\n")
		case STOPTAG:
			if name == pttrn {
				inPattern = false
				// the end tag of a removed object is removed with it
				if removeObject {
					continue
				}
			}
			if inPattern && removeObject {
				continue
			}
			// end tag needs its "</" prefix to be valid XML
			buffer.WriteString("</")
			buffer.WriteString(name)
			buffer.WriteString(">\n")
		case CONTENTTAG:
			if inPattern && removeObject {
				continue
			}
			if inPattern && which == tag {
				// retain is the default behavior for content, so
				// -filter X retain content is a no-op
				var keep bool
				name, keep = applyAction(name)
				if !keep {
					continue
				}
			}
			// content normally printed
			writeText(name)
		case CDATATAG, COMMENTTAG:
			if inPattern && removeObject {
				continue
			}
			// cdata and comment normally removed, printing requires an explicit
			// retain (or transforming) command naming them as the target
			if inPattern && which == tag {
				str, keep := applyAction(name)
				if !keep {
					continue
				}
				writeText(str)
			}
		case NOTAG:
		case ISCLOSED:
			txt := buffer.String()
			if txt != "" {
				// print final buffer
				fmt.Fprintf(os.Stdout, "%s", txt)
			}
			return
		default:
		}

		count++
		if count > 1000 {
			count = 0
			txt := buffer.String()
			if txt != "" {
				// print current buffered output
				fmt.Fprintf(os.Stdout, "%s", txt)
			}
			buffer.Reset()
		}
	}
}
// processFormat reformats XML for ease of reading and writes it to stdout.
// An optional argument after the command name selects the style:
//
//	compact, compacted, compress, compressed, terse, *   one record per line
//	expand, expanded, verbose, @                         each attribute on its own line
//	indent, indented, normal                             default indented style
//
// Any other argument is an error and the process exits with status 1.
// An XML declaration and a DOCTYPE line naming the first start tag are printed
// ahead of it, repeated copies of that outermost tag are dropped, and its
// closing tag is printed once at the end of input. Start-end pairs with
// attributes but no contents become self-closing tags; pairs (and self-closing
// tags) with neither attributes nor contents are suppressed. CDATA and
// comments are not printed.
processFormat := func() {

	// skip past command name
	args = args[1:]

	compRecrd := false
	wrapAttrs := false
	ret := "\n"
	frst := true

	if len(args) > 0 {
		switch args[0] {
		case "compact", "compacted", "compress", "compressed", "terse", "*":
			// compress to one record per line
			compRecrd = true
			ret = ""
		case "expand", "expanded", "verbose", "@":
			// each attribute on its own line
			wrapAttrs = true
		case "indent", "indented", "normal":
		default:
			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -format command\n")
			os.Exit(1)
		}
	}

	type FormatType int

	const (
		NOTSET FormatType = iota
		START
		STOP
		CHAR
		OTHER
	)

	// NOTE(review): two spaces per nesting level is assumed - confirm against
	// the expected -format output
	const indentUnit = "  "

	// array to speed up indentation, entry i holds the indentation for i levels;
	// built with strings.Repeat so each entry really differs in width
	indentSpaces := make([]string, 10)
	for i := range indentSpaces {
		indentSpaces[i] = strings.Repeat(indentUnit, i)
	}

	// indentation for ten levels at once, used for deeply nested elements
	tenLevels := strings.Repeat(indentUnit, 10)

	indent := 0

	// parent used to detect first start tag, will place in doctype line
	parent := ""

	status := NOTSET

	// delay printing right bracket of start tag to support self-closing tag style
	needsRightBracket := ""

	// delay printing start tag if no attributes, suppress empty start-end pair if followed by end
	justStartName := ""
	justStartIndent := 0

	// function to indent a specified number of levels (no-op in compact mode)
	doIndent := func(indt int) {
		if compRecrd {
			return
		}
		i := indt
		for i > 9 {
			buffer.WriteString(tenLevels)
			i -= 10
		}
		if i < 0 {
			return
		}
		buffer.WriteString(indentSpaces[i])
	}

	// function to handle delayed start tag
	doDelayedName := func() {
		if needsRightBracket != "" {
			buffer.WriteString(">")
			needsRightBracket = ""
		}
		if justStartName != "" {
			doIndent(justStartIndent)
			buffer.WriteString("<")
			buffer.WriteString(justStartName)
			buffer.WriteString(">")
			justStartName = ""
		}
	}

	// closing tag of the outermost element, printed once at end of input
	closingTag := ""

	// function to print attributes
	printAttributes := func(attr string) {

		attr = strings.TrimSpace(attr)
		attr = CompressRunsOfSpaces(attr)

		if wrapAttrs {

			start := 0
			idx := 0

			attlen := len(attr)

			for idx < attlen {
				ch := attr[idx]
				if ch == '=' {
					str := attr[start:idx]
					buffer.WriteString("\n")
					doIndent(indent)
					buffer.WriteString(" ")
					buffer.WriteString(str)
					// skip past equal sign and leading double quote
					idx += 2
					start = idx
				} else if ch == '"' {
					str := attr[start:idx]
					buffer.WriteString("=\"")
					buffer.WriteString(str)
					buffer.WriteString("\"")
					// skip past trailing double quote and (possible) space
					idx += 2
					start = idx
				} else {
					idx++
				}
			}

			buffer.WriteString("\n")
			doIndent(indent)

		} else {

			buffer.WriteString(" ")
			buffer.WriteString(attr)
		}
	}

	for {
		tag, name, attr, _, idx := nextToken(Idx)
		Idx = idx

		switch tag {
		case STARTTAG:
			doDelayedName()
			if status == START {
				buffer.WriteString(ret)
			}
			// remove internal copies of tags
			if parent != "" && name == parent && indent == 1 {
				continue
			}

			// detect first start tag, print xml and doctype parent
			if indent == 0 && parent == "" {
				parent = name
				buffer.WriteString("<?xml version=\"1.0\"?>\n")
				buffer.WriteString("<!DOCTYPE ")
				buffer.WriteString(parent)
				buffer.WriteString(">\n")
				// now filtering internal tags, so queue printing of closing tag
				closingTag = fmt.Sprintf("</%s>\n", parent)
				// already past test, so opening tag will print normally
			}

			// check for attributes
			if attr != "" {
				doIndent(indent)

				buffer.WriteString("<")
				buffer.WriteString(name)

				printAttributes(attr)

				needsRightBracket = name

			} else {
				justStartName = name
				justStartIndent = indent
			}

			if compRecrd && frst && indent == 0 {
				frst = false
				doDelayedName()
				buffer.WriteString("\n")
			}

			indent++

			status = START
		case SELFTAG:
			doDelayedName()
			if status == START {
				buffer.WriteString(ret)
			}

			// suppress self-closing tag without attributes
			if attr != "" {
				doIndent(indent)

				buffer.WriteString("<")
				buffer.WriteString(name)

				printAttributes(attr)

				buffer.WriteString("/>")
				buffer.WriteString(ret)
			}

			status = STOP
		case STOPTAG:
			// if end immediately follows start, turn into self-closing tag if there were attributes, otherwise suppress empty tag
			if needsRightBracket != "" {
				if status == START && name == needsRightBracket {
					// end immediately follows start, produce self-closing tag
					buffer.WriteString("/>")
					buffer.WriteString(ret)
					needsRightBracket = ""
					indent--
					status = STOP
					break
				}
				buffer.WriteString(">")
				needsRightBracket = ""
			}
			if justStartName != "" {
				if status == START && name == justStartName {
					// end immediately follows delayed start with no attributes, suppress
					justStartName = ""
					indent--
					status = STOP
					break
				}
				doIndent(justStartIndent)
				buffer.WriteString("<")
				buffer.WriteString(justStartName)
				buffer.WriteString(">")
				justStartName = ""
			}

			// remove internal copies of tags
			if parent != "" && name == parent && indent == 1 {
				continue
			}
			indent--

			// end tag stays on the same line directly after contents or its own
			// start tag, otherwise it is indented on a line of its own
			if status != CHAR && status != START {
				doIndent(indent)
			}
			buffer.WriteString("</")
			buffer.WriteString(name)
			buffer.WriteString(">")
			buffer.WriteString(ret)

			status = STOP
			if compRecrd && indent == 1 {
				buffer.WriteString("\n")
			}
		case CONTENTTAG:
			doDelayedName()
			if len(name) > 0 && IsNotJustWhitespace(name) {
				if HasFlankingSpace(name) {
					name = strings.TrimSpace(name)
				}
				buffer.WriteString(name)
				status = CHAR
			}
		case CDATATAG, COMMENTTAG:
			// ignore
		case NOTAG:
		case ISCLOSED:
			doDelayedName()
			if closingTag != "" {
				buffer.WriteString(closingTag)
			}
			txt := buffer.String()
			if txt != "" {
				// print final buffer
				fmt.Fprintf(os.Stdout, "%s", txt)
			}
			return
		default:
			doDelayedName()
			status = OTHER
		}

		count++
		if count > 1000 {
			count = 0
			txt := buffer.String()
			if txt != "" {
				// print current buffered output
				fmt.Fprintf(os.Stdout, "%s", txt)
			}
			buffer.Reset()
		}
	}
}
// totals set by the performance test closures below (chunk, split, drain, token),
// these are the values returned once the selected closure has run
var recordCount, byteCount int
// processChunk reads XML blocks until an empty block is returned, counting
// the blocks read and the total number of bytes they contain
processChunk := func() {
	for blk := in.NextBlock(); blk != ""; blk = in.NextBlock() {
		recordCount++
		byteCount += len(blk)
	}
}
// processSplit partitions XML by pattern, counting the records delivered to
// the callback and the total number of bytes they contain
processSplit := func() {

	// skip past -split if followed by -pattern
	if len(args) > 1 && (args[1] == "-pattern" || args[1] == "-Pattern") {
		args = args[1:]
	}

	if len(args) < 2 {
		fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -split command\n")
		os.Exit(1)
	}

	// tally is called once for each partitioned record
	tally := func(rec int, str string) {
		recordCount++
		byteCount += len(str)
	}

	PartitionPattern(args[1], "", in, tally)
}
// processDrain partitions XML by pattern in a separate goroutine, sends each
// record down a buffered channel, and counts the records received and the
// total number of bytes they contain
processDrain := func() {

	if len(args) > 1 {
		if args[1] == "-pattern" || args[1] == "-Pattern" {
			// skip past -drain if followed by -pattern
			args = args[1:]
		}
	}
	if len(args) < 2 {
		fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -drain command\n")
		os.Exit(1)
	}
	pat := args[1]

	// make never returns a nil channel, so no failure check is needed here
	chn := make(chan string, tbls.ChanDepth)

	// sendPatterns is the producer, it closes the channel when partitioning
	// finishes so that the receiving loop below terminates
	sendPatterns := func(pat string, out chan<- string) {
		defer close(out)
		PartitionPattern(pat, "", in,
			func(rec int, str string) {
				out <- str
			})
	}

	go sendPatterns(pat, chn)

	// drain the channel until the producer closes it
	for str := range chn {
		recordCount++
		byteCount += len(str)
	}
}
// processToken tokenizes the XML block stream, counting every token returned
// before the stream closes and the combined length of its name and attribute strings
processToken := func() {
	for {
		tag, name, attr, _, next := nextToken(Idx)
		Idx = next

		// stop counting once the tokenizer reports the stream is closed
		if tag == ISCLOSED {
			return
		}

		recordCount++
		byteCount += len(name) + len(attr)
	}
}
// ProcessXMLStream

// run the closure that implements the requested action, an action without a
// case below runs nothing
switch action {
case DOFORMAT:
	processFormat()
case DOOUTLINE:
	processOutline()
case DOSYNOPSIS:
	processSynopsis()
case DOVERIFY:
	processVerify()
case DOFILTER:
	processFilter()
case DOCHUNK:
	processChunk()
case DOSPLIT:
	processSplit()
case DODRAIN:
	processDrain()
case DOTOKEN:
	processToken()
}

// counts remain zero unless one of the performance test closures ran
return recordCount, byteCount
}
// INSDSEQ EXTRACTION COMMAND GENERATOR
// e.g., xtract -insd complete mat_peptide "%peptide" product peptide
// ProcessINSD generates extraction commands for GenBank/RefSeq records in INSDSet format.
//
// args holds the words following -insd: optional leading INSDSeq element names
// (descriptors), an optional completeness flag ("+" or "complete", "-" or
// "partial"), a feature key (several may be joined with '+' or ','), and then
// qualifier names or further INSD element names. A qualifier may be prefixed
// with '%', and any item with '#', '%', or '^', which is ignored when checking
// it against the legal vocabulary.
//
// isPipe selects unquoted arguments; otherwise arguments containing special
// characters are wrapped in double quotes. addDash adds a "-" placeholder
// label for features lacking a requested qualifier.
//
// The return value is the generated xtract argument list. On an illegal or
// mis-capitalized item, or when required items are missing, an error is
// written to stderr and the process exits with status 1.
func ProcessINSD(args []string, isPipe, addDash bool) []string {

	// legal GenBank / GenPept / RefSeq features

	features := []string{
		"-10_signal",
		"-35_signal",
		"3'clip",
		"3'UTR",
		"5'clip",
		"5'UTR",
		"allele",
		"assembly_gap",
		"attenuator",
		"Bond",
		"C_region",
		"CAAT_signal",
		"CDS",
		"centromere",
		"conflict",
		"D_segment",
		"D-loop",
		"enhancer",
		"exon",
		"gap",
		"GC_signal",
		"gene",
		"iDNA",
		"intron",
		"J_segment",
		"LTR",
		"mat_peptide",
		"misc_binding",
		"misc_difference",
		"misc_feature",
		"misc_recomb",
		"misc_RNA",
		"misc_signal",
		"misc_structure",
		"mobile_element",
		"modified_base",
		"mRNA",
		"mutation",
		"N_region",
		"ncRNA",
		"old_sequence",
		"operon",
		"oriT",
		"polyA_signal",
		"polyA_site",
		"precursor_RNA",
		"prim_transcript",
		"primer_bind",
		"promoter",
		"propeptide",
		"protein_bind",
		"Protein",
		"RBS",
		"Region",
		"regulatory",
		"rep_origin",
		"repeat_region",
		"repeat_unit",
		"rRNA",
		"S_region",
		"satellite",
		"scRNA",
		"sig_peptide",
		"Site",
		"snoRNA",
		"snRNA",
		"source",
		"stem_loop",
		"STS",
		"TATA_signal",
		"telomere",
		"terminator",
		"tmRNA",
		"transit_peptide",
		"tRNA",
		"unsure",
		"V_region",
		"V_segment",
		"variation",
	}

	// legal GenBank / GenPept / RefSeq qualifiers

	qualifiers := []string{
		"allele",
		"altitude",
		"anticodon",
		"artificial_location",
		"bio_material",
		"bond_type",
		"bound_moiety",
		"breed",
		"calculated_mol_wt",
		"cell_line",
		"cell_type",
		"chloroplast",
		"chromoplast",
		"chromosome",
		"citation",
		"clone_lib",
		"clone",
		"coded_by",
		"codon_start",
		"codon",
		"collected_by",
		"collection_date",
		"compare",
		"cons_splice",
		"country",
		"cultivar",
		"culture_collection",
		"cyanelle",
		"db_xref",
		"derived_from",
		"dev_stage",
		"direction",
		"EC_number",
		"ecotype",
		"encodes",
		"endogenous_virus",
		"environmental_sample",
		"estimated_length",
		"evidence",
		"exception",
		"experiment",
		"focus",
		"frequency",
		"function",
		"gap_type",
		"gdb_xref",
		"gene_synonym",
		"gene",
		"germline",
		"haplogroup",
		"haplotype",
		"host",
		"identified_by",
		"inference",
		"insertion_seq",
		"isolate",
		"isolation_source",
		"kinetoplast",
		"lab_host",
		"label",
		"lat_lon",
		"linkage_evidence",
		"locus_tag",
		"macronuclear",
		"map",
		"mating_type",
		"metagenome_source",
		"metagenomic",
		"mitochondrion",
		"mobile_element_type",
		"mobile_element",
		"mod_base",
		"mol_type",
		"name",
		"nat_host",
		"ncRNA_class",
		"non_functional",
		"note",
		"number",
		"old_locus_tag",
		"operon",
		"organelle",
		"organism",
		"partial",
		"PCR_conditions",
		"PCR_primers",
		"peptide",
		"phenotype",
		"plasmid",
		"pop_variant",
		"product",
		"protein_id",
		"proviral",
		"pseudo",
		"pseudogene",
		"rearranged",
		"recombination_class",
		"region_name",
		"regulatory_class",
		"replace",
		"ribosomal_slippage",
		"rpt_family",
		"rpt_type",
		"rpt_unit_range",
		"rpt_unit_seq",
		"rpt_unit",
		"satellite",
		"segment",
		"sequenced_mol",
		"serotype",
		"serovar",
		"sex",
		"site_type",
		"specific_host",
		"specimen_voucher",
		"standard_name",
		"strain",
		"structural_class",
		"sub_clone",
		"sub_species",
		"sub_strain",
		"tag_peptide",
		"tissue_lib",
		"tissue_type",
		"trans_splicing",
		"transcript_id",
		"transcription",
		"transgenic",
		"transl_except",
		"transl_table",
		"translation",
		"transposon",
		"type_material",
		"UniProtKB_evidence",
		"usedin",
		"variety",
		"virion",
	}

	// legal INSDSeq XML fields

	insdtags := []string{
		"INSDAltSeqData_items",
		"INSDAltSeqData",
		"INSDAltSeqItem_first-accn",
		"INSDAltSeqItem_gap-comment",
		"INSDAltSeqItem_gap-length",
		"INSDAltSeqItem_gap-linkage",
		"INSDAltSeqItem_gap-type",
		"INSDAltSeqItem_interval",
		"INSDAltSeqItem_isgap",
		"INSDAltSeqItem_last-accn",
		"INSDAltSeqItem_value",
		"INSDAltSeqItem",
		"INSDAuthor",
		"INSDComment_paragraphs",
		"INSDComment_type",
		"INSDComment",
		"INSDCommentParagraph",
		"INSDFeature_intervals",
		"INSDFeature_key",
		"INSDFeature_location",
		"INSDFeature_operator",
		"INSDFeature_partial3",
		"INSDFeature_partial5",
		"INSDFeature_quals",
		"INSDFeature_xrefs",
		"INSDFeature",
		"INSDFeatureSet_annot-source",
		"INSDFeatureSet_features",
		"INSDFeatureSet",
		"INSDInterval_accession",
		"INSDInterval_from",
		"INSDInterval_interbp",
		"INSDInterval_iscomp",
		"INSDInterval_point",
		"INSDInterval_to",
		"INSDInterval",
		"INSDKeyword",
		"INSDQualifier_name",
		"INSDQualifier_value",
		"INSDQualifier",
		"INSDReference_authors",
		"INSDReference_consortium",
		"INSDReference_journal",
		"INSDReference_position",
		"INSDReference_pubmed",
		"INSDReference_reference",
		"INSDReference_remark",
		"INSDReference_title",
		"INSDReference_xref",
		"INSDReference",
		"INSDSecondary-accn",
		"INSDSeq_accession-version",
		"INSDSeq_alt-seq",
		"INSDSeq_comment-set",
		"INSDSeq_comment",
		"INSDSeq_contig",
		"INSDSeq_create-date",
		"INSDSeq_create-release",
		"INSDSeq_database-reference",
		"INSDSeq_definition",
		"INSDSeq_division",
		"INSDSeq_entry-version",
		"INSDSeq_feature-set",
		"INSDSeq_feature-table",
		"INSDSeq_keywords",
		"INSDSeq_length",
		"INSDSeq_locus",
		"INSDSeq_moltype",
		"INSDSeq_organism",
		"INSDSeq_other-seqids",
		"INSDSeq_primary-accession",
		"INSDSeq_primary",
		"INSDSeq_project",
		"INSDSeq_references",
		"INSDSeq_secondary-accessions",
		"INSDSeq_segment",
		"INSDSeq_sequence",
		"INSDSeq_source-db",
		"INSDSeq_source",
		"INSDSeq_strandedness",
		"INSDSeq_struc-comments",
		"INSDSeq_taxonomy",
		"INSDSeq_topology",
		"INSDSeq_update-date",
		"INSDSeq_update-release",
		"INSDSeq_xrefs",
		"INSDSeq",
		"INSDSeqid",
		"INSDSet",
		"INSDStrucComment_items",
		"INSDStrucComment_name",
		"INSDStrucComment",
		"INSDStrucCommentItem_tag",
		"INSDStrucCommentItem_url",
		"INSDStrucCommentItem_value",
		"INSDStrucCommentItem",
		"INSDXref_dbname",
		"INSDXref_id",
		"INSDXref",
	}

	// checkAgainstVocabulary returns quietly if str is in arry, otherwise it
	// reports a capitalization or vocabulary error and exits
	checkAgainstVocabulary := func(str, objtype string, arry []string) {

		if str == "" || arry == nil {
			return
		}

		// skip past pound, percent, or caret character at beginning of string
		if len(str) > 1 {
			switch str[0] {
			case '#', '%', '^':
				str = str[1:]
			default:
			}
		}

		for _, txt := range arry {
			if str == txt {
				return
			}
			// same letters but different case
			if strings.EqualFold(str, txt) {
				fmt.Fprintf(os.Stderr, "\nERROR: Incorrect capitalization of '%s' %s, change to '%s'\n", str, objtype, txt)
				os.Exit(1)
			}
		}

		fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", str, objtype)
		os.Exit(1)
	}

	var acc []string

	// pending counts the arguments still to be handled; it is NOT reduced while
	// leading INSD descriptors are consumed below, so it can overstate what is
	// left and len(args) is consulted as well before the slice is indexed
	pending := len(args)
	if pending < 1 {
		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -insd\n")
		os.Exit(1)
	}

	acc = append(acc, "-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version")

	printAccn := true

	// collect descriptors

	if strings.HasPrefix(args[0], "INSD") {

		if isPipe {
			acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
			acc = append(acc, "-group", "INSDSeq", "-sep", "|", "-element")
		} else {
			acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"")
			acc = append(acc, "-group", "INSDSeq", "-sep", "\"|\"", "-element")
		}
		printAccn = false

		for {
			// descriptors alone form a complete request
			if len(args) < 1 {
				return acc
			}
			str := args[0]
			if !strings.HasPrefix(args[0], "INSD") {
				break
			}
			checkAgainstVocabulary(str, "element", insdtags)
			acc = append(acc, str)
			args = args[1:]
		}

	} else if strings.HasPrefix(strings.ToUpper(args[0]), "INSD") {

		// report capitalization or vocabulary failure
		checkAgainstVocabulary(args[0], "element", insdtags)

		// program should not get to this point, but warn and exit anyway
		fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", args[0], "element")
		os.Exit(1)
	}

	// collect qualifiers

	partial := false
	complete := false

	if args[0] == "+" || args[0] == "complete" {
		complete = true
		args = args[1:]
		pending--
	} else if args[0] == "-" || args[0] == "partial" {
		partial = true
		args = args[1:]
		pending--
	}

	// the slice itself must be non-empty, otherwise a request such as
	// "INSDSeq_length complete" would index an empty slice below
	if pending < 1 || len(args) < 1 {
		fmt.Fprintf(os.Stderr, "\nERROR: No feature key supplied to xtract -insd\n")
		os.Exit(1)
	}

	acc = append(acc, "-group", "INSDFeature")

	// limit to designated features

	feature := args[0]

	fcmd := "-if"

	// can specify multiple features separated by plus sign (e.g., CDS+mRNA) or comma (e.g., CDS,mRNA)
	plus := strings.Split(feature, "+")
	for _, pls := range plus {
		comma := strings.Split(pls, ",")
		for _, cma := range comma {

			checkAgainstVocabulary(cma, "feature", features)
			acc = append(acc, fcmd, "INSDFeature_key", "-equals", cma)

			fcmd = "-or"
		}
	}

	// NOTE(review): because pending is not reduced by leading descriptors, this
	// test only fires when none were given; kept as is so requests accepted
	// today keep working - confirm whether a qualifier should always be required
	if pending < 2 {
		// still need at least one qualifier even on legal feature
		fmt.Fprintf(os.Stderr, "\nERROR: Feature '%s' must be followed by at least one qualifier\n", feature)
		os.Exit(1)
	}

	args = args[1:]

	if complete {
		acc = append(acc, "-unless", "INSDFeature_partial5", "-or", "INSDFeature_partial3")
	} else if partial {
		acc = append(acc, "-if", "INSDFeature_partial5", "-or", "INSDFeature_partial3")
	}

	if printAccn {
		if isPipe {
			acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
		} else {
			acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"")
		}
	}

	for _, str := range args {

		upper := strings.ToUpper(str)

		switch {
		case strings.HasPrefix(str, "INSD"):

			checkAgainstVocabulary(str, "element", insdtags)
			if isPipe {
				acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
			} else {
				acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element")
			}
			acc = append(acc, str)

		case strings.HasPrefix(str, "#INSD"):

			checkAgainstVocabulary(str, "element", insdtags)
			if isPipe {
				acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
				acc = append(acc, str)
			} else {
				acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element")
				ql := fmt.Sprintf("\"%s\"", str)
				acc = append(acc, ql)
			}

		case strings.HasPrefix(upper, "INSD"), strings.HasPrefix(upper, "#INSD"):

			// element name (with or without leading pound sign) in the wrong
			// case, report capitalization or vocabulary failure
			checkAgainstVocabulary(str, "element", insdtags)

		default:

			acc = append(acc, "-block", "INSDQualifier")

			checkAgainstVocabulary(str, "qualifier", qualifiers)
			if len(str) > 2 && str[0] == '%' {
				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str[1:])
				if isPipe {
					acc = append(acc, "-element", "%INSDQualifier_value")
				} else {
					acc = append(acc, "-element", "\"%INSDQualifier_value\"")
				}
				if addDash {
					acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str[1:])
					if isPipe {
						acc = append(acc, "-lbl", "\\-")
					} else {
						acc = append(acc, "-lbl", "\"\\-\"")
					}
				}
			} else {
				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
				acc = append(acc, "-element", "INSDQualifier_value")
				if addDash {
					acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str)
					if isPipe {
						acc = append(acc, "-lbl", "\\-")
					} else {
						acc = append(acc, "-lbl", "\"\\-\"")
					}
				}
			}
		}
	}

	return acc
}
// HYDRA CITATION MATCHER COMMAND GENERATOR
// ProcessHydra generates extraction commands for NCBI's in-house citation matcher (undocumented).
// The returned arguments select Id elements whose score attribute is exactly "1"
// or starts with "0.9" or "0.8". The isPipe argument is not consulted.
func ProcessHydra(isPipe bool) []string {

	// acceptable scores are 0.8 or higher, exact match on "1" rejects low value in scientific notation with minus sign present
	return []string{
		"-pattern", "Id",
		"-if", "@score", "-equals", "1",
		"-or", "@score", "-starts-with", "0.9",
		"-or", "@score", "-starts-with", "0.8",
		"-element", "Id",
	}
}
// COLLECT AND FORMAT REQUESTED XML VALUES
// ExploreElements returns matching element values to the callback.
//
// The subtree rooted at curr is walked depth-first. A node is a candidate when
// its name equals match, or, with wildcard set and match starting with a
// colon, when its name ends with match, or when match is empty and an
// attribute is requested. If prnt is not empty the node's parent name must
// satisfy the same test against prnt. For a candidate node, proc receives the
// value of the first attribute matching attrib, or else the node's contents
// (HTML-unescaped if needed), or else an empty string for a container node.
// After an attribute or contents value is sent, the node's children are not
// explored. proc is always called with the level passed to ExploreElements.
// Children named mask are not explored unless prnt was "*" or "**".
func ExploreElements(curr *Node, mask, prnt, match, attrib string, wildcard bool, level int, proc func(string, int)) {

	if curr == nil || proc == nil {
		return
	}

	// **/Object performs deep exploration of recursive data (*/Object also supported)
	deep := prnt == "**" || prnt == "*"
	if deep {
		prnt = ""
	}

	// isBlank recognizes the whitespace that may precede an attribute name
	isBlank := func(ch byte) bool {
		switch ch {
		case ' ', '\n', '\t', '\r', '\f':
			return true
		}
		return false
	}

	// splitAttributes is only run if attribute values are requested in element statements,
	// it returns the attributes as [ tag, value, tag, value, ... ]
	splitAttributes := func(raw string) []string {

		if raw == "" {
			return nil
		}

		// two slots for every equal sign
		slots := 2 * strings.Count(raw, "=")
		if slots < 1 {
			return nil
		}

		pairs := make([]string, slots)

		size := len(raw)
		from, pos, filled := 0, 0, 0

		// place tag and value in successive array slots
		for pos < size && filled < slots {
			switch raw[pos] {
			case '=':
				// skip past possible leading blanks
				for from < size && isBlank(raw[from]) {
					from++
				}
				// tag name ends at the equal sign
				pairs[filled] = raw[from:pos]
				filled++
				// skip past equal sign and leading double quote
				pos += 2
				from = pos
			case '"':
				// value ends at the closing double quote
				pairs[filled] = raw[from:pos]
				filled++
				// skip past trailing double quote and (possible) space
				pos += 2
				from = pos
			default:
				pos++
			}
		}

		return pairs
	}

	// matches tests for equality, wildcard matches any namespace prefix
	matches := func(value, pattern string) bool {
		if value == pattern {
			return true
		}
		return wildcard && strings.HasPrefix(pattern, ":") && strings.HasSuffix(value, pattern)
	}

	// visit recursive definition
	var visit func(node *Node, skip string, depth int)

	visit = func(node *Node, skip string, depth int) {

		if !deep && node.Name == skip {
			// do not explore within recursive object
			return
		}

		nameOK := matches(node.Name, match) || (match == "" && attrib != "")

		if nameOK && (prnt == "" || matches(node.Parent, prnt)) {

			switch {
			case attrib != "":
				if node.Attributes != "" && node.Attribs == nil {
					// parse attributes on-the-fly if queried
					node.Attribs = splitAttributes(node.Attributes)
				}
				for i := 0; i+1 < len(node.Attribs); i += 2 {
					if matches(node.Attribs[i], attrib) {
						proc(node.Attribs[i+1], level)
						return
					}
				}
			case node.Contents != "":
				text := node.Contents
				if HasAmpOrNotASCII(text) {
					// processing of <, >, &, ", and ' characters is now delayed until element contents is requested
					text = html.UnescapeString(text)
				}
				proc(text, level)
				return
			case node.Children != nil:
				// for XML container object, send empty string to callback to increment count
				proc("", level)
				// and continue exploring
			}
		}

		for child := node.Children; child != nil; child = child.Next {
			// inner exploration is subject to recursive object exclusion
			visit(child, mask, depth+1)
		}
	}

	visit(curr, "", level)
}
// PrintSubtree supports compression styles selected by -element "*" through "****".
//
// The subtree rooted at node is written as XML text through successive calls to proc.
// With style COMPACT no newlines or indentation are emitted. With INDENT each tag is
// placed on its own line, indented two spaces per nesting level, and the root starts
// one level in to allow for a parent tag. SUBTREE is the same as INDENT but the root
// is not offset. WRAPPED is SUBTREE with each attribute placed on its own line.
// Attributes are only printed when printAttrs is true. An element with no attributes,
// contents, or children is suppressed. Nothing is done if node or proc is nil.
func PrintSubtree(node *Node, style IndentType, printAttrs bool, proc func(string)) {
	if node == nil || proc == nil {
		return
	}

	// WRAPPED is SUBTREE plus each attribute on its own line
	wrapped := false
	if style == WRAPPED {
		style = SUBTREE
		wrapped = true
	}

	// INDENT is offset by two spaces to allow for parent tag, SUBTREE is not offset
	initial := 1
	if style == SUBTREE {
		style = INDENT
		initial = 0
	}

	// number of spaces per nesting level
	const indentWidth = 2

	// padding for ten levels, built once and sliced to speed up indentation
	tenLevels := strings.Repeat(" ", 10*indentWidth)

	// function to indent a specified number of levels
	doIndent := func(indt int) {
		for indt > 9 {
			proc(tenLevels)
			indt -= 10
		}
		if indt < 0 {
			return
		}
		proc(tenLevels[:indt*indentWidth])
	}

	// doSubtree recursive definition
	var doSubtree func(*Node, int)

	doSubtree = func(curr *Node, depth int) {

		// suppress if it would be an empty self-closing tag
		if !IsNotJustWhitespace(curr.Attributes) && curr.Contents == "" && curr.Children == nil {
			return
		}

		if style == INDENT {
			doIndent(depth)
		}

		proc("<")
		proc(curr.Name)

		if printAttrs {

			attr := strings.TrimSpace(curr.Attributes)
			attr = CompressRunsOfSpaces(attr)

			if attr != "" {

				if wrapped {

					start := 0
					idx := 0
					inQuote := false

					attlen := len(attr)

					for idx < attlen {
						ch := attr[idx]
						switch {
						case ch == '=' && !inQuote:
							// equal sign outside of a quoted value ends the attribute name
							str := attr[start:idx]
							proc("\n")
							doIndent(depth)
							proc(" ")
							proc(str)
							// skip past equal sign and leading double quote
							idx += 2
							start = idx
							inQuote = true
						case ch == '"':
							// closing double quote ends the value, which may itself contain equal signs
							str := attr[start:idx]
							proc("=\"")
							proc(str)
							proc("\"")
							// skip past trailing double quote and (possible) space
							idx += 2
							start = idx
							inQuote = false
						default:
							idx++
						}
					}

					proc("\n")
					doIndent(depth)

				} else {

					proc(" ")
					proc(attr)
				}
			}
		}

		// see if suitable for self-closing tag
		if curr.Contents == "" && curr.Children == nil {
			proc("/>")
			if style != COMPACT {
				proc("\n")
			}
			return
		}

		proc(">")

		if curr.Contents != "" {

			// contents are written as stored, on the same line as the tags
			proc(curr.Contents)

		} else {

			if style != COMPACT {
				proc("\n")
			}

			for chld := curr.Children; chld != nil; chld = chld.Next {
				doSubtree(chld, depth+1)
			}

			// closing tag of a container lines up with its opening tag
			if style == INDENT {
				doIndent(depth)
			}
		}

		proc("<")
		proc("/")
		proc(curr.Name)
		proc(">")

		if style != COMPACT {
			proc("\n")
		}
	}

	doSubtree(node, initial)
}
// ProcessClause handles comma-separated -element arguments.
//
// Every stage is evaluated against curr and the resulting strings are combined
// according to status. The text returned is prev, then pfx, then the combined
// values (joined by sep when there are several), then sfx. The boolean result
// is false, with an empty string, when curr or stages is nil or when the clause
// produced no usable value.
func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep string, status OpType, index, level int, variables map[string]string) (string, bool) {
	if curr == nil || stages == nil {
		return "", false
	}

	// collect evaluates each stage in order and hands every resulting string to acc,
	// element names combined with commas are treated as a prefix-separator-suffix group
	collect := func(acc func(string)) {
		if acc == nil {
			return
		}

		for _, stage := range stages {

			// visit forwards the selectors of the current stage to ExploreElements
			visit := func(proc func(string, int)) {
				ExploreElements(curr, mask, stage.Parent, stage.Match, stage.Attrib, stage.Wild, level, proc)
			}

			// adjust reports each integer value shifted by delta, skipping anything else
			adjust := func(delta int) {
				visit(func(str string, _ int) {
					if str == "" {
						return
					}
					num, err := strconv.Atoi(str)
					if err != nil {
						return
					}
					acc(strconv.Itoa(num + delta))
				})
			}

			switch stage.Type {

			case ELEMENT, VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV:
				visit(func(str string, _ int) {
					if str != "" {
						acc(str)
					}
				})

			case ENCODE:
				visit(func(str string, _ int) {
					if str != "" {
						acc(html.EscapeString(str))
					}
				})

			case FIRST:
				// keep only the first non-empty value
				chosen := ""
				visit(func(str string, _ int) {
					if chosen == "" {
						chosen = str
					}
				})
				if chosen != "" {
					acc(chosen)
				}

			case LAST:
				// keep only the final value seen
				chosen := ""
				visit(func(str string, _ int) {
					chosen = str
				})
				if chosen != "" {
					acc(chosen)
				}

			case VARIABLE:
				// use value of stored variable
				if val, found := variables[stage.Match]; found {
					acc(val)
				}

			case NUM, COUNT:
				// number of element objects
				total := 0
				visit(func(_ string, _ int) {
					total++
				})
				acc(strconv.Itoa(total))

			case LENGTH:
				// length of element strings
				total := 0
				visit(func(str string, _ int) {
					total += len(str)
				})
				acc(strconv.Itoa(total))

			case DEPTH:
				// level reported by the explorer for each element in scope
				visit(func(_ string, lvl int) {
					acc(strconv.Itoa(lvl))
				})

			case INDEX:
				// -element "+" prints index of current XML object
				acc(strconv.Itoa(index))

			case INC:
				// -inc, or component of -0-based, -1-based, or -ucsc
				adjust(1)

			case DEC:
				// -dec, or component of -0-based, -1-based, or -ucsc
				adjust(-1)

			case STAR:
				// -element "*" prints current XML subtree on a single line,
				// each additional asterisk selects the next style, "@" suppresses attributes
				style := SINGULARITY
				printAttrs := true

				for _, ch := range stage.Value {
					if ch == '*' {
						style++
					} else if ch == '@' {
						printAttrs = false
					}
				}
				if style > WRAPPED {
					style = WRAPPED
				}
				if style < COMPACT {
					style = COMPACT
				}

				var subtree bytes.Buffer

				PrintSubtree(curr, style, printAttrs,
					func(str string) {
						if str != "" {
							subtree.WriteString(str)
						}
					})

				if txt := subtree.String(); txt != "" {
					acc(txt)
				}
			}
		}
	}

	ok := false

	// format results in buffer
	var buffer bytes.Buffer

	buffer.WriteString(prev)
	buffer.WriteString(pfx)

	between := ""

	// emit appends one value, preceded by the separator if it is not the first
	emit := func(val string) {
		buffer.WriteString(between)
		buffer.WriteString(val)
		between = sep
	}

	// eachInteger calls fn for every collected value that parses as an integer
	eachInteger := func(fn func(int)) {
		collect(func(str string) {
			if value, err := strconv.Atoi(str); err == nil {
				fn(value)
			}
		})
	}

	switch status {

	case ELEMENT, ENCODE, VALUE, NUM, INC, DEC, ZEROBASED, ONEBASED, UCSC:
		collect(func(str string) {
			if str == "" {
				return
			}
			ok = true
			emit(str)
		})

	case FIRST:
		chosen := ""
		collect(func(str string) {
			ok = true
			if chosen == "" {
				chosen = str
			}
		})
		if chosen != "" {
			emit(chosen)
		}

	case LAST:
		chosen := ""
		collect(func(str string) {
			ok = true
			chosen = str
		})
		if chosen != "" {
			emit(chosen)
		}

	case LEN:
		// length of element strings
		total := 0
		collect(func(str string) {
			ok = true
			total += len(str)
		})
		emit(strconv.Itoa(total))

	case SUM:
		// sum of element values
		total := 0
		eachInteger(func(value int) {
			total += value
			ok = true
		})
		if ok {
			emit(strconv.Itoa(total))
		}

	case MIN:
		// minimum of element values
		lowest := 0
		eachInteger(func(value int) {
			if !ok || value < lowest {
				lowest = value
			}
			ok = true
		})
		if ok {
			emit(strconv.Itoa(lowest))
		}

	case MAX:
		// maximum of element values
		highest := 0
		eachInteger(func(value int) {
			if !ok || value > highest {
				highest = value
			}
			ok = true
		})
		if ok {
			emit(strconv.Itoa(highest))
		}

	case SUB:
		// difference of element values, must have exactly 2 elements
		var operands [2]int
		seen := 0
		eachInteger(func(value int) {
			seen++
			if seen <= 2 {
				operands[seen-1] = value
			}
		})
		if seen == 2 {
			ok = true
			emit(strconv.Itoa(operands[0] - operands[1]))
		}

	case AVG:
		// average of element values, truncated to an integer
		total := 0
		seen := 0
		eachInteger(func(value int) {
			total += value
			seen++
			ok = true
		})
		if ok {
			avg := int(float64(total) / float64(seen))
			emit(strconv.Itoa(avg))
		}

	case DEV:
		// standard deviation of element values, must have at least 2 elements
		seen := 0
		mean := 0.0
		m2 := 0.0
		eachInteger(func(value int) {
			// Welford algorithm for one-pass standard deviation
			seen++
			x := float64(value)
			delta := x - mean
			mean += delta / float64(seen)
			m2 += delta * (x - mean)
		})
		if seen > 1 {
			ok = true
			vrc := m2 / float64(seen-1)
			emit(strconv.Itoa(int(math.Sqrt(vrc))))
		}
	}

	buffer.WriteString(sfx)

	if !ok {
		return "", false
	}

	return buffer.String(), true
}
// ProcessInstructions performs extraction commands on a subset of XML.
//
// Commands are executed in order against curr. Extraction commands send their
// formatted text to accum, while formatting commands change the separator, prefix,
// suffix, column, and line strings used by later commands in the same call.
// The incoming tab is placed before the next value written, and the returned pair
// is the tab and ret to use after this call. If accum is nil the incoming tab and
// ret are returned unchanged.
func ProcessInstructions(commands []*Operation, curr *Node, mask, tab, ret string, index, level int, variables map[string]string, accum func(string)) (string, string) {
	if accum == nil {
		return tab, ret
	}

	// formatting state, local to this call
	var (
		sep     = "\t"
		pfx     = ""
		sfx     = ""
		col     = "\t"
		lin     = "\n"
		varname = ""
	)

	// process commands
	for _, op := range commands {

		arg := op.Value

		switch op.Type {

		case ELEMENT, FIRST, LAST, ENCODE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSC:
			if txt, ok := ProcessClause(curr, op.Stages, mask, tab, pfx, sfx, sep, op.Type, index, level, variables); ok {
				// after the first value, later values are preceded by the column string
				tab = col
				ret = lin
				accum(txt)
			}

		case TAB:
			col = arg

		case RET:
			lin = arg

		case PFX:
			pfx = arg

		case SFX:
			sfx = arg

		case SEP:
			sep = arg

		case LBL:
			// label text is written as is, preceded by the pending tab
			accum(tab)
			accum(arg)
			tab = col
			ret = lin

		case PFC:
			// preface clears previous tab and sets prefix in one command
			pfx = arg
			tab = ""

		case CLR:
			// clear previous tab after the fact
			tab = ""

		case RST:
			pfx = ""
			sfx = ""
			sep = "\t"

		case VARIABLE:
			// remember the name, the value arrives in the following VALUE command
			varname = arg

		case VALUE:
			switch n := len(arg); {
			case n > 1 && arg[0] == '(' && arg[n-1] == ')':
				// set variable from literal text inside parentheses, e.g., -COM "(, )",
				// -if "&VARIABLE" will succeed if set to blank with empty parentheses "()"
				variables[varname] = arg[1 : n-1]
			case arg == "":
				// -if "&VARIABLE" will fail if initialized with empty string ""
				delete(variables, varname)
			default:
				// variable takes the extracted value, and is left alone if nothing was found
				if txt, ok := ProcessClause(curr, op.Stages, mask, "", pfx, sfx, sep, op.Type, index, level, variables); ok {
					variables[varname] = txt
				}
			}
			varname = ""
		}
	}

	return tab, ret
}
// CONDITIONAL EXECUTION USES -if AND -unless STATEMENT, WITH SUPPORT FOR DEPRECATED -match AND -avoid STATEMENTS
// ConditionsAreSatisfied tests a set of conditions to determine if extraction should proceed.
//
// The conditions are read in order. An IF or MATCH starts a group in which every
// IF, MATCH, and AND adds one requirement, and each IF, MATCH, AND, or OR whose test
// succeeds counts as one observation. An UNLESS or AVOID starts a group that fails
// as soon as any of its tests succeeds. A group is judged when the next group starts
// and again after the last condition. The result is false if curr is nil or if any
// group fails, and true otherwise, including when there are no conditions.
func ConditionsAreSatisfied(conditions []*Operation, curr *Node, mask string, index, level int, variables map[string]string) bool {
	if curr == nil {
		return false
	}

	// testConstraint applies a string or numeric comparison to one value
	testConstraint := func(str string, constraint *Step) bool {
		if str == "" || constraint == nil {
			return false
		}

		val := constraint.Value
		kind := constraint.Type

		switch kind {

		case EQUALS, CONTAINS, STARTSWITH, ENDSWITH, ISNOT:
			// string tests on element values ignore case
			subject := strings.ToUpper(str)
			target := strings.ToUpper(val)

			switch kind {
			case EQUALS:
				return subject == target
			case CONTAINS:
				return strings.Contains(subject, target)
			case STARTSWITH:
				return strings.HasPrefix(subject, target)
			case ENDSWITH:
				return strings.HasSuffix(subject, target)
			case ISNOT:
				return subject != target
			}
			return false

		case GT, GE, LT, LE, EQ, NE:
			// second argument of numeric test can be element specifier,
			// in which case the last integer value found replaces the literal
			if constraint.Parent != "" || constraint.Match != "" || constraint.Attrib != "" {
				ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, level,
					func(candidate string, _ int) {
						if candidate == "" {
							return
						}
						if _, err := strconv.Atoi(candidate); err == nil {
							val = candidate
						}
					})
			}

			// both arguments must resolve to integers
			x, errx := strconv.Atoi(str)
			y, erry := strconv.Atoi(val)
			if errx != nil || erry != nil {
				return false
			}

			switch kind {
			case GT:
				return x > y
			case GE:
				return x >= y
			case LT:
				return x < y
			case LE:
				return x <= y
			case EQ:
				return x == y
			case NE:
				return x != y
			}
			return false
		}

		return false
	}

	// matchFound tests an individual condition, the first stage selects what to
	// examine and the optional second stage is the constraint on its value
	matchFound := func(stages []*Step) bool {
		if len(stages) < 1 {
			return false
		}

		stage := stages[0]

		var constraint *Step
		if len(stages) > 1 {
			constraint = stages[1]
		}

		// visit forwards the selectors of the first stage to ExploreElements
		visit := func(proc func(string, int)) {
			ExploreElements(curr, mask, stage.Parent, stage.Match, stage.Attrib, stage.Wild, level, proc)
		}

		found := false
		number := ""

		switch stage.Type {

		case ELEMENT:
			visit(func(str string, _ int) {
				// match to XML container object sends empty string, so do not check for str != "" here
				// test every selected element individually if value is specified
				if constraint == nil || testConstraint(str, constraint) {
					found = true
				}
			})

		case VARIABLE:
			// use value of stored variable, -if &VARIABLE -equals VALUE is the supported construct
			if str, present := variables[stage.Match]; present {
				if constraint == nil || testConstraint(str, constraint) {
					found = true
				}
			}

		case COUNT:
			// number of element objects
			total := 0
			visit(func(_ string, _ int) {
				total++
				found = true
			})
			number = strconv.Itoa(total)

		case LENGTH:
			// length of element strings
			total := 0
			visit(func(str string, _ int) {
				total += len(str)
				found = true
			})
			number = strconv.Itoa(total)

		case DEPTH:
			// level reported for the last element in scope
			last := 0
			visit(func(_ string, lvl int) {
				last = lvl
				found = true
			})
			number = strconv.Itoa(last)

		case INDEX:
			// index of explored parent object
			number = strconv.Itoa(index)
			found = true
		}

		// without a computed number the outcome of the element or variable test stands
		if number == "" {
			return found
		}

		return constraint == nil || testConstraint(number, constraint)
	}

	// state of the group of conditions currently being evaluated
	var (
		required  = 0
		observed  = 0
		forbidden = 0
		isMatch   = false
		isAvoid   = false
	)

	// groupFailed reports whether the current group has not met its requirements
	groupFailed := func() bool {
		return (isMatch && observed < required) || (isAvoid && forbidden > 0)
	}

	// startGroup clears the counters and records which kind of group follows
	startGroup := func(matching bool) {
		required = 0
		observed = 0
		forbidden = 0
		isMatch = matching
		isAvoid = !matching
	}

	// tally records a successful test for both kinds of group
	tally := func(stages []*Step) {
		if matchFound(stages) {
			observed++
			// record presence of forbidden element if in -unless clause
			forbidden++
		}
	}

	// test conditional arguments
	for _, op := range conditions {

		switch op.Type {

		case IF, MATCH:
			// -if tests for presence of element (deprecated -match can test element:value),
			// checking for failure here allows for multiple -if [ -and / -or ] clauses
			if groupFailed() {
				return false
			}
			startGroup(true)
			required++
			tally(op.Stages)

		case AND:
			required++
			tally(op.Stages)

		case OR:
			tally(op.Stages)

		case UNLESS, AVOID:
			// -unless tests for absence of element, or presence but with failure of subsequent
			// value test (deprecated -avoid can test element:value)
			if groupFailed() {
				return false
			}
			startGroup(false)
			if matchFound(op.Stages) {
				forbidden++
			}
		}
	}

	return !groupFailed()
}
// RECURSIVELY PROCESS EXPLORATION COMMANDS AND XML DATA STRUCTURE
// ProcessCommands visits XML nodes, performs conditional tests, and executes data extraction instructions.
//
// Nodes under curr that are selected by cmds.Match and cmds.Parent are located,
// optionally narrowed to one node by cmds.Position ("first", "last", or a number).
// For each chosen node the conditions are tested, then either the commands and
// subtasks, or the failure commands, are run. Output goes to accum, and the tab
// and ret values to use next are returned. If accum is nil nothing is done.
// An unrecognized position prints an error and exits the program.
func ProcessCommands(cmds *Block, curr *Node, tab, ret string, index, level int, variables map[string]string, accum func(string)) (string, string) {
	if accum == nil {
		return tab, ret
	}

	prnt := cmds.Parent
	match := cmds.Match

	// leading colon indicates namespace prefix wildcard
	wildcard := strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":")

	// **/Object performs deep exploration of recursive data
	deep := prnt == "**"
	if deep {
		prnt = "*"
	}

	// processNode is a closure over tab and ret, so results of one node carry over to the next
	processNode := func(node *Node, idx, lvl int) {

		// apply -if or -unless tests
		if !ConditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) {
			// execute commands after -else statement
			if len(cmds.Failure) > 0 {
				tab, ret = ProcessInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, accum)
			}
			return
		}

		// execute data extraction commands
		if len(cmds.Commands) > 0 {
			tab, ret = ProcessInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, accum)
		}

		// process sub commands on child node
		for _, sub := range cmds.Subtasks {
			tab, ret = ProcessCommands(sub, node, tab, ret, 1, lvl, variables, accum)
		}
	}

	// exploreNodes recursive definition
	var exploreNodes func(*Node, int, int, func(*Node, int, int)) int

	// exploreNodes visits all nodes that match the selection criteria, and returns the next index to assign
	exploreNodes = func(node *Node, indx, levl int, proc func(*Node, int, int)) int {
		if node == nil || proc == nil {
			return indx
		}

		// match is "*" for heterogeneous data constructs, e.g., -group PubmedArticleSet/*
		// wildcard matches any namespace prefix
		nameMatches := node.Name == match ||
			match == "*" ||
			(wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(node.Name, match))

		if nameMatches {
			parentMatches := prnt == "" ||
				node.Parent == prnt ||
				(wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(node.Parent, prnt))

			if parentMatches {
				proc(node, indx, levl)
				indx++
				if !deep {
					// do not explore within recursive object
					return indx
				}
			}
		}

		// clearing prnt "*" now allows nested exploration within recursive data, e.g., -pattern Taxon -block */Taxon
		if prnt == "*" {
			prnt = ""
		}

		// explore child nodes
		for chld := node.Children; chld != nil; chld = chld.Next {
			indx = exploreNodes(chld, indx, levl+1, proc)
		}

		return indx
	}

	// without a -position test every selected node is processed as it is found
	if cmds.Position == "" {
		exploreNodes(curr, index, level, processNode)
		return tab, ret
	}

	// with a -position test a single node is picked first, then processed
	var single *Node
	ind := 0
	lev := 0

	remember := func(node *Node, idx, lvl int) {
		single = node
		ind = idx
		lev = lvl
	}

	switch cmds.Position {

	case "first":
		exploreNodes(curr, index, level,
			func(node *Node, idx, lvl int) {
				if single == nil {
					remember(node, idx, lvl)
				}
			})

	case "last":
		exploreNodes(curr, index, level, remember)

	default:
		// use numeric position
		number, err := strconv.Atoi(cmds.Position)
		if err != nil {
			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position)
			os.Exit(1)
		}

		pos := 0
		exploreNodes(curr, index, level,
			func(node *Node, idx, lvl int) {
				pos++
				if pos == number {
					remember(node, idx, lvl)
				}
			})
	}

	if single != nil {
		processNode(single, ind, lev)
	}

	return tab, ret
}
// PROCESS ONE XML COMPONENT RECORD
// ParseXML combines the tokenizer and parser to work on a single partitioned XML string.
//
// Text holds one XML object and is expected to end with a > character, which the
// tokenizer relies on as a sentinel instead of checking the length before each read.
// The first start tag found becomes the root node, with parent recorded as the name
// of its parent. Child elements are linked in document order through Children and Next.
// Processing instructions, DOCTYPE declarations, comments, and CDATA sections are
// skipped, as are self-closing tags without attributes. Problems in the markup are
// reported on stderr and parsing continues.
//
// The root node and true are returned on success, nil and false if Text is empty,
// tbls is nil, or no start tag is present.
func ParseXML(Text, parent string, tbls *Tables) (*Node, bool) {
	if Text == "" || tbls == nil {
		return nil, false
	}

	// node farm variables
	FarmPos := 0
	FarmMax := tbls.FarmSize
	if FarmMax < 1 {
		// a farm without slots could never supply a node
		FarmMax = 1
	}
	FarmItems := make([]Node, FarmMax)

	// function to allocate multiple nodes in a large array for memory management efficiency
	nextNode := func(strt, attr, prnt string) *Node {

		// if farm array slots used up, allocate new array
		if FarmPos >= FarmMax {
			FarmItems = make([]Node, FarmMax)
			FarmPos = 0
		}

		// take node from next available slot in farm array
		node := &FarmItems[FarmPos]

		node.Name = strt
		node.Attributes = attr
		node.Parent = prnt

		FarmPos++

		return node
	}

	// token parser variables
	Txtlen := len(Text)
	Idx := 0

	// function to get next XML token, returns the token type, its name or contents,
	// its attribute string, and the position at which to look for the following token
	nextToken := func(idx int) (TagType, string, string, int) {

		// lookup table array pointers
		inBlank := &tbls.InBlank
		inFirst := &tbls.InFirst
		inElement := &tbls.InElement

		text := Text
		txtlen := Txtlen

		// XML string ends with > character, acts as sentinel to check if past end of text
		if idx >= txtlen {
			// signal end of XML string
			return ISCLOSED, "", "", 0
		}

		// skip past leading blanks
		ch := text[idx]
		for inBlank[ch] {
			idx++
			ch = text[idx]
		}

		start := idx

		if ch == '<' {

			// at start of element
			idx++
			ch = text[idx]

			// check for legal first character of element
			if inFirst[ch] {

				// read element name
				start = idx
				idx++

				ch = text[idx]
				for inElement[ch] {
					idx++
					ch = text[idx]
				}

				str := text[start:idx]

				switch ch {
				case '>':
					// end of element
					idx++

					return STARTTAG, str, "", idx
				case '/':
					// self-closing element without attributes
					idx++
					ch = text[idx]
					if ch != '>' {
						fmt.Fprintf(os.Stderr, "\nSelf-closing element missing right angle bracket\n")
					}
					idx++

					return SELFTAG, str, "", idx
				case ' ', '\t', '\n', '\r', '\f':
					// attributes
					idx++
					start = idx
					ch = text[idx]
					for ch != '<' && ch != '>' {
						idx++
						ch = text[idx]
					}
					if ch != '>' {
						fmt.Fprintf(os.Stderr, "\nAttributes not followed by right angle bracket\n")
					}
					if text[idx-1] == '/' {
						// self-closing
						atr := text[start : idx-1]
						idx++
						return SELFTAG, str, atr, idx
					}
					atr := text[start:idx]
					idx++
					return STARTTAG, str, atr, idx
				default:
					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
					return STARTTAG, str, "", idx
				}

			} else {

				// punctuation character immediately after first angle bracket
				switch ch {
				case '/':
					// at start of end tag
					idx++
					start = idx
					ch = text[idx]
					// expect legal first character of element
					if inFirst[ch] {
						idx++
						ch = text[idx]
						for inElement[ch] {
							idx++
							ch = text[idx]
						}
						str := text[start:idx]
						if ch != '>' {
							fmt.Fprintf(os.Stderr, "\nUnexpected characters after end element name\n")
						}
						idx++

						return STOPTAG, str, "", idx
					}
					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
				case '?':
					// skip ?xml and ?processing instructions
					idx++
					ch = text[idx]
					for ch != '>' {
						idx++
						ch = text[idx]
					}
					idx++
				case '!':
					// skip !DOCTYPE, !comment, and ![CDATA[
					idx++
					ch = text[idx]
					skipTo := ""
					if ch == '[' && strings.HasPrefix(text[idx:], "[CDATA[") {
						skipTo = "]]>"
					} else if ch == '-' && strings.HasPrefix(text[idx:], "--") {
						skipTo = "-->"
					}
					if skipTo != "" {
						// CDATA or comment block may contain internal angle brackets
						found := strings.Index(text[idx:], skipTo)
						if found < 0 {
							// string stops in middle of CDATA or comment
							return ISCLOSED, "", "", idx
						}
						// adjust position past end of CDATA or comment
						idx += found + len(skipTo)
					} else {
						// otherwise just skip to next right angle bracket
						for ch != '>' {
							idx++
							ch = text[idx]
						}
						idx++
					}
				default:
					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
				}
			}

		} else if ch != '>' {

			// at start of contents
			start = idx

			// find end of contents
			for ch != '<' && ch != '>' {
				idx++
				ch = text[idx]
			}

			// trim back past trailing blanks
			lst := idx - 1
			ch = text[lst]
			for inBlank[ch] && lst > start {
				lst--
				ch = text[lst]
			}

			str := text[start : lst+1]

			return CONTENTTAG, str, "", idx

		} else {

			// right angle bracket that does not close a tag, step over it so that
			// the caller always makes progress and cannot loop on the same position
			idx++
		}

		return NOTAG, "", "", idx
	}

	// parseLevel recursive definition
	var parseLevel func(string, string, string) (*Node, bool)

	// parseLevel creates the node for a start tag, then consumes tokens up to the
	// matching end tag or the end of the text, attaching children as they are read
	parseLevel = func(strt, attr, prnt string) (*Node, bool) {

		// obtain next node from farm
		node := nextNode(strt, attr, prnt)
		if node == nil {
			return nil, false
		}

		var lastNode *Node

		// adding next child to end of linked list gives better performance than appending to slice of nodes
		addChild := func(obj *Node) {
			if node.Children == nil {
				node.Children = obj
			}
			if lastNode != nil {
				lastNode.Next = obj
			}
			lastNode = obj
		}

		for {
			tag, name, atr, idx := nextToken(Idx)
			if tag == ISCLOSED {
				break
			}
			Idx = idx

			switch tag {
			case STARTTAG:
				// read sub tree, a failure below is passed up to the caller
				obj, good := parseLevel(name, atr, node.Name)
				if !good {
					return nil, false
				}
				addChild(obj)
			case STOPTAG:
				// pop out of recursive call
				return node, true
			case CONTENTTAG:
				node.Contents = name
			case SELFTAG:
				if atr == "" {
					// ignore if self-closing tag has no attributes
					continue
				}
				// self-closing tag has no contents, just create child node and continue on same level
				obj := nextNode(name, atr, node.Name)
				if obj == nil {
					return nil, false
				}
				addChild(obj)
			default:
			}
		}

		// text ended before the end tag was seen, return what was read
		return node, true
	}

	// skip anything before the first start tag
	for {
		tag, name, attr, idx := nextToken(Idx)
		if tag == ISCLOSED {
			break
		}
		Idx = idx

		if tag != STARTTAG {
			continue
		}

		// call recursive function from beginning of XML
		return parseLevel(name, attr, parent)
	}

	return nil, false
}
// ProcessQuery calls XML combined tokenizer parser on a partitioned string.
//
// The text is parsed into a node tree, the command tree in cmds is run on it with
// a fresh set of variables, and everything the commands write is returned as one
// string, followed by the final line terminator. A single leading newline is removed.
// An empty string is returned if text is empty, cmds or tbls is nil, or parsing fails.
func ProcessQuery(text, parent string, index int, cmds *Block, tbls *Tables) string {
	if text == "" || cmds == nil || tbls == nil {
		return ""
	}

	// node structure for current XML object is local to this call
	root, parsed := ParseXML(text, parent, tbls)
	if !parsed {
		return ""
	}

	// map of recorded variables for current -pattern is also local to this call
	variables := make(map[string]string)

	var output bytes.Buffer

	// gather appends each piece of extracted text to the output
	gather := func(str string) {
		if str != "" {
			output.WriteString(str)
		}
	}

	// start processing at top of command tree and top of XML subregion selected by -pattern
	_, ret := ProcessCommands(cmds, root, "", "", index, 1, variables, gather)

	// finish with the pending line terminator, writing an empty string adds nothing
	output.WriteString(ret)

	result := output.String()

	// remove leading newline (-insd -pfx artifact)
	if len(result) > 0 && result[0] == '\n' {
		return result[1:]
	}

	// return consolidated result string
	return result
}
// UNSHUFFLER USES HEAP TO RESTORE OUTPUT OF MULTIPLE CONSUMERS TO ORIGINAL RECORD ORDER
// Extract pairs a record index with the text belonging to that record.
type Extract struct {
	Index int
	Text  string
}

// ExtractHeap is a slice of Extract values with the methods that satisfy heap.Interface,
// ordered so that the smallest Index comes first.
type ExtractHeap []Extract

// Len returns the number of items held.
func (h ExtractHeap) Len() int {
	return len(h)
}

// Less reports whether item i has a smaller record index than item j.
func (h ExtractHeap) Less(i, j int) bool {
	a, b := h[i], h[j]
	return a.Index < b.Index
}

// Swap exchanges items i and j.
func (h ExtractHeap) Swap(i, j int) {
	tmp := h[i]
	h[i] = h[j]
	h[j] = tmp
}

// Push appends x, which must be an Extract, to the end of the slice.
func (h *ExtractHeap) Push(x interface{}) {
	item := x.(Extract)
	*h = append(*h, item)
}

// Pop removes the final item of the slice and returns it.
func (h *ExtractHeap) Pop() interface{} {
	last := len(*h) - 1
	item := (*h)[last]
	*h = (*h)[:last]
	return item
}
// CONCURRENT CONSUMER GOROUTINES PARSE AND PROCESS PARTITIONED XML OBJECTS
// ReadBlocks -> SplitPattern => StreamTokens => ParseXML => ProcessQuery -> MergeResults
// XMLProducer sends partitioned XML strings through channel.
//
// PartitionPattern is run over the reader with the given pattern arguments, and each
// record it reports is sent on out together with its record number. The channel is
// closed when the function returns, so consumers can range over it.
func XMLProducer(pat, star string, rdr *XMLReader, out chan<- Extract) {

	// close channel when all records have been processed
	defer close(out)

	// send hands one partitioned XML substring to whichever consumer is available
	send := func(rec int, str string) {
		out <- Extract{Index: rec, Text: str}
	}

	// partition all input by pattern
	PartitionPattern(pat, star, rdr, send)
}
// CreateProducer starts a single XMLProducer goroutine and returns the channel it
// writes to, buffered to tbls.ChanDepth. It returns nil if tbls is nil.
func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extract {
	if tbls == nil {
		return nil
	}

	records := make(chan Extract, tbls.ChanDepth)
	if records == nil {
		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create producer channel\n")
		os.Exit(1)
	}

	// launch single producer goroutine, which closes the channel when input is exhausted
	go XMLProducer(pat, star, rdr, records)

	return records
}
// XMLConsumer reads partitioned XML from channel and calls parser for processing.
//
// Every record received on inp produces exactly one record on out with the same
// index, holding the result of ProcessQuery, or an empty string if the input text
// was empty. The wait group is marked done when inp has been closed and drained.
func XMLConsumer(cmds *Block, tbls *Tables, parent string, wg *sync.WaitGroup, inp <-chan Extract, out chan<- Extract) {

	// report when this consumer has no more records to process
	defer wg.Done()

	// read partitioned XML from producer channel
	for rec := range inp {

		result := rec.Text

		// should never see empty input data, but if so it is passed through untouched
		if result != "" {
			result = ProcessQuery(result, parent, rec.Index, cmds, tbls)
		}

		// send even if empty to get all record counts for reordering
		out <- Extract{Index: rec.Index, Text: result}
	}
}
// CreateConsumers starts numServers XMLConsumer goroutines that all read from inp,
// and returns the single channel they write to, buffered to tbls.ChanDepth. The
// returned channel is closed once every consumer has finished. It returns nil if
// tbls is nil.
func CreateConsumers(cmds *Block, tbls *Tables, parent string, numServers int, inp <-chan Extract) <-chan Extract {
	if tbls == nil {
		return nil
	}

	results := make(chan Extract, tbls.ChanDepth)
	if results == nil {
		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumer channel\n")
		os.Exit(1)
	}

	var pending sync.WaitGroup

	// launch multiple consumer goroutines
	pending.Add(numServers)
	for n := numServers; n > 0; n-- {
		go XMLConsumer(cmds, tbls, parent, &pending, inp, results)
	}

	// launch separate anonymous goroutine to wait until all consumers are done,
	// then close single output channel, so unshuffler can range over channel
	go func() {
		pending.Wait()
		close(results)
	}()

	return results
}
// MAIN FUNCTION
// e.g., xtract -pattern PubmedArticle -element MedlineCitation/PMID -block Author -sep " " -element Initials,LastName
func main() {
// skip past executable name
args := os.Args[1:]
if len(args) < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: No command-line arguments supplied to xtract\n")
os.Exit(1)
}
// CONCURRENCY, CLEANUP, AND DEBUGGING FLAGS
// do these first because -defcpu and -maxcpu can be sent from wrapper before other arguments
ncpu := runtime.NumCPU()
if ncpu < 1 {
ncpu = 1
}
// wrapper can limit maximum number of processors to use (undocumented)
maxProcs := ncpu
defProcs := 0
// concurrent performance tuning parameters, can be overridden by -proc and -cons
numProcs := 0
serverRatio := 4
// number of servers usually calculated by -cons server ratio, but can be overridden by -serv
numServers := 0
// number of channels usually equals number of servers, but can be overridden by -chan
chanDepth := 0
// miscellaneous tuning parameters
heapSize := 16
farmSize := 64
// garbage collector control can be set by environment variable or default value with -gogc 0
goGc := 600
// XML data cleanup
doCompress := false
doCleanup := false
// read data from file instead of stdin
fileName := ""
// debugging
dbug := false
mpty := false
indx := false
stts := false
timr := false
// profiling
prfl := false
// alternative source of sample record, processed a designated number of times, looping for each -proc from 1 to nCPU (undocumented)
testCount := 0
testType := ""
testString := ""
// repeat the specified extraction 5 times for each -proc from 1 to nCPU
trial := false
// function to get numeric value
getNumericArg := func(name string, zer, min, max int) int {
if len(args) < 2 {
fmt.Fprintf(os.Stderr, "\nERROR: %s is missing\n", name)
os.Exit(1)
}
value, err := strconv.Atoi(args[1])
if err != nil {
fmt.Fprintf(os.Stderr, "\nERROR: %s (%s) is not an integer\n", name, args[1])
os.Exit(1)
}
// skip past first of two arguments
args = args[1:]
// special case for argument value of 0
if value < 1 {
return zer
}
// limit value to between specified minimum and maximum
if value < min {
return min
}
if value > max {
return max
}
return value
}
inSwitch := true
// get concurrency, cleanup, and debugging flags in any order
for {
inSwitch = true
switch args[0] {
// concurrency override arguments can be passed in by local wrapper script (undocumented)
case "-maxcpu":
maxProcs = getNumericArg("Maximum number of processors", 1, 1, ncpu)
case "-defcpu":
defProcs = getNumericArg("Default number of processors", ncpu, 1, ncpu)
// performance tuning flags
case "-proc":
numProcs = getNumericArg("Number of processors", ncpu, 1, ncpu)
case "-cons":
serverRatio = getNumericArg("Parser to processor ratio", 4, 1, 32)
case "-serv":
numServers = getNumericArg("Concurrent parser count", 0, ncpu, 128)
case "-chan":
chanDepth = getNumericArg("Communication channel depth", 0, ncpu, 128)
case "-heap":
heapSize = getNumericArg("Unshuffler heap size", 8, 8, 64)
case "-farm":
farmSize = getNumericArg("Node buffer length", 4, 4, 2048)
case "-gogc":
goGc = getNumericArg("Garbage collection percentage", 0, 100, 1000)
// read data from file
case "-input":
if len(args) < 2 {
fmt.Fprintf(os.Stderr, "\nERROR: Input file name is missing\n")
os.Exit(1)
}
fileName = args[1]
// skip past first of two arguments
args = args[1:]
// data cleanup flags
case "-compress":
doCompress = true
case "-cleanup":
doCleanup = true
// debugging flags
case "-debug":
dbug = true
case "-empty":
mpty = true
case "-index":
indx = true
case "-stats", "-stat":
stts = true
case "-timer":
timr = true
case "-profile":
prfl = true
case "-trial":
trial = true
case "-test":
testCount = getNumericArg("Test data counter", 0, 0, 1000000)
if len(args) > 1 {
next := args[1]
// if next argument is not another flag
if next != "" && next[0] != '-' {
// get optional data source specifier
testType = next
// skip past second of three arguments
args = args[1:]
}
}
default:
// if not any of the controls, set flag to break out of for loop
inSwitch = false
}
if !inSwitch {
break
}
// skip past argument
args = args[1:]
if len(args) < 1 {
break
}
}
// reality checks on number of processors to use
// performance degrades if capacity is above maximum number of partitions per second (context switching?)
if numProcs == 0 {
if defProcs > 0 {
numProcs = defProcs
} else {
// best performance measurement with current code is obtained when 4 to 6 processors are assigned,
// varying slightly among queries on PubmedArticle, gene DocumentSummary, and INSDSeq sequence records
numProcs = 4
}
}
if numProcs > ncpu {
numProcs = ncpu
}
if numProcs > maxProcs {
numProcs = maxProcs
}
// allow simultaneous threads for multiplexed go routines
runtime.GOMAXPROCS(numProcs)
// adjust garbage collection target percentage
if goGc >= 100 {
debug.SetGCPercent(goGc)
}
// explicit -serv argument overrides -cons ratio
if numServers > 0 {
serverRatio = numServers / numProcs
// if numServers / numProcs is not a whole number, do not print serverRatio in -stats
if numServers != numProcs*serverRatio {
serverRatio = 0
}
} else {
numServers = numProcs * serverRatio
}
// server limits
if numServers > 128 {
numServers = 128
} else if numServers < 1 {
numServers = numProcs
}
// explicit -chan argument overrides default to number of servers
if chanDepth == 0 {
chanDepth = numServers
}
// -stats prints number of CPUs and performance tuning values if no other arguments (undocumented)
if stts && len(args) < 1 {
fmt.Fprintf(os.Stderr, "CPUs %d\n", ncpu)
fmt.Fprintf(os.Stderr, "Proc %d\n", numProcs)
if serverRatio > 0 {
fmt.Fprintf(os.Stderr, "Cons %d\n", serverRatio)
}
fmt.Fprintf(os.Stderr, "Serv %d\n", numServers)
fmt.Fprintf(os.Stderr, "Chan %d\n", chanDepth)
fmt.Fprintf(os.Stderr, "Heap %d\n", heapSize)
fmt.Fprintf(os.Stderr, "Farm %d\n", farmSize)
if goGc >= 100 {
fmt.Fprintf(os.Stderr, "Gogc %d\n", goGc)
}
fmt.Fprintf(os.Stderr, "\n")
return
}
// -test N [pubmed|protein|insd|gene] repeats simple query on local XML to measure performance independent of stdin (undocumented)
if testCount > 0 {
var acc []string
// select internal XML data source
switch testType {
case "pubmed":
testString = pubMedArtSample
case "protein", "sequence":
testString = insdSeqSample
case "insd":
testString = insdSeqSample
case "gene", "docsum":
testString = geneDocSumSample
default:
testString = pubMedArtSample
}
// default commands if no other arguments
if len(args) < 1 {
switch testType {
case "pubmed":
acc = append(acc, "-pattern", "PubmedArticle", "-element", "LastName")
case "protein", "sequence":
acc = append(acc, "-pattern", "INSDSeq", "-element", "INSDSeq_accession-version")
case "insd":
acc = append(acc, "-insd", "mat_peptide", "%peptide", "product", "peptide")
case "gene", "docsum":
acc = append(acc, "-pattern", "DocumentSummary", "-element", "Name")
default:
acc = append(acc, "-pattern", "PubmedArticle", "-element", "LastName")
}
}
// otherwise use remaining arguments for extraction commands
for len(args) > 0 {
acc = append(acc, args[0])
args = args[1:]
}
args = acc
}
if len(args) < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n")
os.Exit(1)
}
// DOCUMENTATION COMMANDS
inSwitch = true
switch args[0] {
case "-version":
fmt.Printf("%s\n", xtractVersion)
case "-help", "-extras", "-extra":
fmt.Printf("xtract %s\n%s\n", xtractVersion, xtractHelp)
case "-examples", "-example", "-scripts", "-script":
fmt.Printf("xtract %s\n%s\n", xtractVersion, xtractExamples)
case "-internal", "-internals":
fmt.Printf("xtract %s\n%s\n", xtractVersion, xtractInternal)
case "-sample", "-samples":
// -sample [pubmed|protein|gene] sends specified sample record to stdout (undocumented)
if len(args) > 1 {
testType = args[1]
}
switch testType {
case "pubmed":
fmt.Printf("%s\n", pubMedArtSample)
case "protein", "sequence", "insd":
fmt.Printf("%s\n", insdSeqSample)
case "gene", "docsum":
fmt.Printf("%s\n", geneDocSumSample)
default:
fmt.Printf("%s\n", pubMedArtSample)
}
case "-keys":
fmt.Printf("%s\n", keyboardShortcuts)
case "-unix":
fmt.Printf("%s\n", unixCommands)
default:
// if not any of the documentation commands, keep going
inSwitch = false
}
if inSwitch {
return
}
// INITIALIZE TABLES
tbls := InitTables()
if tbls == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Problem creating token streamer lookup tables\n")
os.Exit(1)
}
// additional fields passed in master table
tbls.ChanDepth = chanDepth
tbls.FarmSize = farmSize
// FILE NAME CAN BE SUPPLIED WITH -input COMMAND
in := os.Stdin
// check for data being piped into stdin
fi, _ := os.Stdin.Stat()
isPipe := bool((fi.Mode() & os.ModeCharDevice) == 0)
usingFile := false
if fileName != "" {
inFile, err := os.Open(fileName)
if err != nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName)
os.Exit(1)
}
defer inFile.Close()
// use indicated file instead of stdin
in = inFile
usingFile = true
if isPipe {
fmt.Fprintf(os.Stderr, "\nERROR: Input data from both stdin and file '%s'\n", fileName)
os.Exit(1)
}
}
// check for -input command after extraction arguments
for _, str := range args {
if str == "-input" {
fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -input command\n")
os.Exit(1)
}
}
// CREATE XML BLOCK READER FROM STDIN OR FILE
rdr := NewXMLReader(in, doCompress, doCleanup)
if rdr == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create XML Block Reader\n")
os.Exit(1)
}
// SEQUENCE RECORD EXTRACTION COMMAND GENERATOR
// -insd simplifies extraction of INSDSeq qualifiers
if args[0] == "-insd" || args[0] == "-insd-" {
addDash := true
// -insd- variant suppresses use of dash as placeholder for missing qualifiers (undocumented)
if args[0] == "-insd-" {
addDash = false
}
args = args[1:]
insd := ProcessINSD(args, isPipe || usingFile || testCount > 0, addDash)
if !isPipe && !usingFile && testCount < 1 {
// no piped input, so write output instructions
fmt.Printf("xtract")
for _, str := range insd {
fmt.Printf(" %s", str)
}
fmt.Printf("\n")
return
}
// data in pipe, so replace arguments, execute dynamically
args = insd
}
// CITATION MATCHER EXTRACTION COMMAND GENERATOR
// -hydra filters HydraResponse output by relevance score (undocumented)
if args[0] == "-hydra" {
hydra := ProcessHydra(isPipe || usingFile)
if !isPipe && !usingFile {
// no piped input, so write output instructions
fmt.Printf("xtract")
for _, str := range hydra {
fmt.Printf(" %s", str)
}
fmt.Printf("\n")
return
}
// data in pipe, so replace arguments, execute dynamically
args = hydra
}
// XML DATA FORMATTING/COMPRESSION COMMAND GENERATOR
// -reformat takes a parent pattern and compresses each object for fastest processing (undocumented)
if args[0] == "-reformat" {
args = args[1:]
max := len(args)
if max < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -reformat command\n")
os.Exit(1)
}
// required first argument is parent pattern, will explore using Parent/* construct, write component with -element "*"
prnt := args[0]
if prnt == "" {
fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -reformat command\n")
os.Exit(1)
}
if prnt == "-xml" || prnt == "-doctype" || prnt == "-pfx" || prnt == "-sfx" {
fmt.Fprintf(os.Stderr, "\nERROR: Deprecated argument '%s' used in -reformat command\n", prnt)
os.Exit(1)
}
// optional second argument controls XML expansion or compression level
// asterisks MUST be quoted to avoid interpretation as file wildcard by Unix shell
elm := "*"
addRet := false
hideDoctype := false
hideWrapper := false
if max > 1 {
// * = compact, ** = flush, *** = indented, **** = subtree, ***** = attributes on separate lines
// @ = remove attributes
// ^ = suppress xml and doctype, ^^ = also suppress parent set wrapper
numStars := 0
numCarets := 0
hideAttrs := false
for _, ch := range args[1] {
if ch == '*' {
numStars++
} else if ch == '@' {
hideAttrs = true
} else if ch == '^' {
numCarets++
}
}
if numStars > 1 {
addRet = true
}
if numCarets > 0 {
hideDoctype = true
if numCarets > 1 {
hideWrapper = true
}
}
// construct legal element argument for PrintSubtree
switch numStars {
case 1:
elm = "*"
case 2:
elm = "**"
case 3:
elm = "***"
case 4:
elm = "****"
case 5:
elm = "*****"
default:
elm = "*"
}
if hideAttrs {
elm += "@"
}
}
// optional third argument provides detailed DOCTYPE construct
doctype := ""
if max > 2 {
str := ConvertSlash(args[2])
if strings.HasPrefix(str, "") {
doctype = str
}
}
if !isPipe && !usingFile {
// no piped input, so write output instructions (without -head and -tail arguments)
if addRet {
fmt.Printf("xtract -pattern %s/* -ret \"\" -element \"%s\"\n", prnt, elm)
} else {
fmt.Printf("xtract -pattern %s/* -element \"%s\"\n", prnt, elm)
}
return
}
// add xml, DOCTYPE, and lines at the beginning
hd := fmt.Sprintf("\n\n<%s>", prnt, prnt)
if doctype != "" {
// use supplied DOCTYPE argument
hd = fmt.Sprintf("\n%s\n<%s>", doctype, prnt)
}
if hideDoctype {
// or just the opening parent tag line, when xml and doctype are suppressed
hd = fmt.Sprintf("<%s>", prnt)
}
// add line at the end
tl := fmt.Sprintf("%s>", prnt)
// use -pattern Parent/* construct
prnt += "/*"
var acc []string
if !hideWrapper {
acc = append(acc, "-head", hd, "-tail", tl)
}
acc = append(acc, "-pattern", prnt)
if addRet {
acc = append(acc, "-ret", "")
}
acc = append(acc, "-element", elm)
// data in pipe, so replace arguments, execute dynamically
args = acc
}
// CONFIRM INPUT DATA AVAILABILITY AFTER RUNNING COMMAND GENERATORS
if testCount < 1 && !usingFile && !isPipe {
fmt.Fprintf(os.Stderr, "\nERROR: No data supplied to xtract from stdin or file\n")
os.Exit(1)
}
// START PROFILING IF REQUESTED
if prfl {
dbug = true
f, err := os.Create("cpu.pprof")
if err != nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create profile output file\n")
os.Exit(1)
}
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
}
// INITIALIZE PROCESS TIMER AND RECORD COUNT
startTime := time.Now()
recordCount := 0
byteCount := 0
// printDuration writes a one-line processing summary to stderr: the number of
// items handled (labeled with name) and the seconds elapsed since startTime,
// followed, when at least a millisecond has passed and something was counted,
// by the item rate and, if byteCount is non-zero, the byte rate
printDuration := func(name string) {
	elapsed := time.Since(startTime)
	seconds := float64(elapsed.Nanoseconds()) / 1e9

	// counts of a million or more are reported in whole millions
	if recordCount < 1000000 {
		fmt.Fprintf(os.Stderr, "\nXtract processed %d %s in %.3f seconds", recordCount, name, seconds)
	} else {
		fmt.Fprintf(os.Stderr, "\nXtract processed %d million %s in %.3f seconds", recordCount/1000000, name, seconds)
	}

	// rates are skipped for sub-millisecond runs and for empty counts
	if seconds >= 0.001 && recordCount > 0 {
		itemsPerSecond := int(float64(recordCount) / seconds)
		switch {
		case itemsPerSecond >= 1000000:
			fmt.Fprintf(os.Stderr, " (%d mega%s/second", itemsPerSecond/1000000, name)
		default:
			fmt.Fprintf(os.Stderr, " (%d %s/second", itemsPerSecond, name)
		}

		// byte throughput is scaled to the largest unit that fits
		if byteCount > 0 {
			bytesPerSecond := int(float64(byteCount) / seconds)
			switch {
			case bytesPerSecond >= 1000000:
				fmt.Fprintf(os.Stderr, ", %d megabytes/second", bytesPerSecond/1000000)
			case bytesPerSecond >= 1000:
				fmt.Fprintf(os.Stderr, ", %d kilobytes/second", bytesPerSecond/1000)
			default:
				fmt.Fprintf(os.Stderr, ", %d bytes/second", bytesPerSecond)
			}
		}

		fmt.Fprint(os.Stderr, ")")
	}

	fmt.Fprint(os.Stderr, "\n\n")
}
// PERFORMANCE TIMING COMMANDS
inSwitch = true
action := NOPROCESS
recordType := ""
switch args[0] {
case "-chunk":
action = DOCHUNK
recordType = "blocks"
case "-split":
action = DOSPLIT
recordType = "patterns"
case "-drain":
action = DODRAIN
recordType = "patterns"
case "-token":
action = DOTOKEN
recordType = "tokens"
default:
// if not any of the formatting commands, keep going
inSwitch = false
}
if inSwitch {
recordCount, byteCount = ProcessXMLStream(rdr, tbls, args, action)
printDuration(recordType)
return
}
// SPECIAL FORMATTING COMMANDS
inSwitch = true
action = NOPROCESS
switch args[0] {
case "-format":
action = DOFORMAT
case "-outline":
action = DOOUTLINE
case "-synopsis":
action = DOSYNOPSIS
case "-verify", "-validate":
action = DOVERIFY
case "-filter":
action = DOFILTER
default:
// if not any of the formatting commands, keep going
inSwitch = false
}
if inSwitch {
ProcessXMLStream(rdr, tbls, args, action)
return
}
// SPECIFY STRINGS TO GO BEFORE AND AFTER ENTIRE OUTPUT
head := ""
tail := ""
for {
inSwitch = true
switch args[0] {
case "-head":
if len(args) < 2 {
fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -head command\n")
os.Exit(1)
}
head = ConvertSlash(args[1])
case "-tail":
if len(args) < 2 {
fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tail command\n")
os.Exit(1)
}
tail = ConvertSlash(args[1])
default:
// if not any of the controls, set flag to break out of for loop
inSwitch = false
}
if !inSwitch {
break
}
// skip past arguments
args = args[2:]
if len(args) < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n")
os.Exit(1)
}
}
// ENSURE PRESENCE OF PATTERN ARGUMENT
if len(args) < 1 {
fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n")
os.Exit(1)
}
// make sure top-level -pattern command is next
if args[0] != "-pattern" && args[0] != "-Pattern" {
fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n")
os.Exit(1)
}
if len(args) < 2 {
fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n")
os.Exit(1)
}
topPat := args[1]
if topPat == "" {
fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n")
os.Exit(1)
}
if strings.HasPrefix(topPat, "-") {
fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", topPat)
os.Exit(1)
}
// look for -pattern Parent/* construct for heterogeneous data, e.g., -pattern PubmedArticleSet/*
topPattern, star := SplitInTwoAt(topPat, "/", LEFT)
if topPattern == "" {
return
}
parent := ""
if star == "*" {
parent = topPattern
} else if star != "" {
fmt.Fprintf(os.Stderr, "\nERROR: -pattern Parent/Child construct is not supported\n")
os.Exit(1)
}
// PARSE AND VALIDATE EXTRACTION ARGUMENTS
// parse nested exploration instruction from command-line arguments
cmds := ParseArguments(args, topPattern)
if cmds == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Problem parsing command-line arguments\n")
os.Exit(1)
}
// PERFORMANCE TIMING COMMAND
// -stats with an extraction command prints XML size and processing time for each record
if stts {
legend := "REC\tSIZE\tTIME"
PartitionPattern(topPattern, star, rdr,
func(rec int, str string) {
beginTime := time.Now()
ProcessQuery(str[:], parent, rec, cmds, tbls)
endTime := time.Now()
duration := endTime.Sub(beginTime)
micro := int(float64(duration.Nanoseconds()) / 1e3)
if legend != "" {
fmt.Printf("%s\n", legend)
legend = ""
}
fmt.Printf("%d\t%d\t%d\n", rec, len(str), micro)
})
return
}
// PERFORMANCE OPTIMIZATION FUNCTIONS
// -test N runs a test extraction N times for each -proc from 1 to nCPU (undocumented)
if testCount > 0 && testString != "" {
// clean up copy of sample string included in source code
sample := strings.TrimSpace(testString)
sample = CleanupBadSpaces(sample)
legend := "CPU\tTIME\tRATE"
for numServ := 1; numServ <= ncpu; numServ++ {
runtime.GOMAXPROCS(numServ)
// alternative producer sends sample XML through channel N times
xmlq := make(chan Extract, chanDepth)
go func(out chan<- Extract) {
for rec := 1; rec <= testCount; rec++ {
out <- Extract{rec, sample}
}
close(out)
}(xmlq)
tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq)
if xmlq == nil || tblq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
os.Exit(1)
}
begTime := time.Now()
recordCount = 0
for _ = range tblq {
recordCount++
runtime.Gosched()
}
debug.FreeOSMemory()
endTime := time.Now()
expended := endTime.Sub(begTime)
secs := float64(expended.Nanoseconds()) / 1e9
if secs >= 0.000001 && recordCount > 0 {
speed := int(float64(recordCount) / secs)
if legend != "" {
fmt.Printf("%s\n", legend)
legend = ""
}
fmt.Printf("%d\t%.3f\t%d\n", numServ, secs, speed)
}
}
return
}
// -trial -input fileName runs the specified extraction for each -proc from 1 to nCPU
if trial && fileName != "" {
legend := "CPU\tRATE\tDEV"
for numServ := 1; numServ <= ncpu; numServ++ {
runtime.GOMAXPROCS(numServ)
sum := 0
count := 0
mean := 0.0
m2 := 0.0
// calculate mean and standard deviation of processing rate
for trials := 0; trials < 5; trials++ {
inFile, err := os.Open(fileName)
if err != nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName)
os.Exit(1)
}
rdr := NewXMLReader(inFile, doCompress, doCleanup)
if rdr == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to read input file\n")
os.Exit(1)
}
xmlq := CreateProducer(topPattern, star, rdr, tbls)
tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq)
if xmlq == nil || tblq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
os.Exit(1)
}
begTime := time.Now()
recordCount = 0
for _ = range tblq {
recordCount++
runtime.Gosched()
}
inFile.Close()
debug.FreeOSMemory()
endTime := time.Now()
expended := endTime.Sub(begTime)
secs := float64(expended.Nanoseconds()) / 1e9
if secs >= 0.000001 && recordCount > 0 {
speed := int(float64(recordCount) / secs)
sum += speed
count++
x := float64(speed)
delta := x - mean
mean += delta / float64(count)
m2 += delta * (x - mean)
}
}
if legend != "" {
fmt.Printf("%s\n", legend)
legend = ""
}
if count > 1 {
vrc := m2 / float64(count-1)
dev := int(math.Sqrt(vrc))
fmt.Printf("%d\t%d\t%d\n", numServ, sum/count, dev)
}
}
return
}
// PROCESS SINGLE SELECTED RECORD IF -pattern ARGUMENT IS IMMEDIATELY FOLLOWED BY -position COMMAND
if cmds.Visit == topPat && cmds.Position != "" {
qry := ""
idx := 0
if cmds.Position == "first" {
PartitionPattern(topPattern, star, rdr,
func(rec int, str string) {
if rec == 1 {
qry = str
idx = rec
}
})
} else if cmds.Position == "last" {
PartitionPattern(topPattern, star, rdr,
func(rec int, str string) {
qry = str
idx = rec
})
} else {
// use numeric position
number, err := strconv.Atoi(cmds.Position)
if err != nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position)
os.Exit(1)
}
PartitionPattern(topPattern, star, rdr,
func(rec int, str string) {
if rec == number {
qry = str
idx = rec
}
})
}
if qry == "" {
return
}
// clear position on top node to prevent condition test failure
cmds.Position = ""
// process single selected record
res := ProcessQuery(qry[:], parent, idx, cmds, tbls)
if res != "" {
fmt.Printf("%s\n", res)
}
return
}
// LAUNCH PRODUCER AND CONSUMER SERVERS
// launch producer goroutine to partition XML by pattern
xmlq := CreateProducer(topPattern, star, rdr, tbls)
if xmlq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create producer\n")
os.Exit(1)
}
// launch consumer goroutines to parse and explore partitioned XML objects
tblq := CreateConsumers(cmds, tbls, parent, numServers, xmlq)
if tblq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumers\n")
os.Exit(1)
}
// PERFORMANCE SUMMARY
if dbug {
// drain results, but suppress extraction output
for ext := range tblq {
byteCount += len(ext.Text)
recordCount++
runtime.Gosched()
}
// force garbage collection, return memory to operating system
debug.FreeOSMemory()
// print processing parameters as XML object
stopTime := time.Now()
duration := stopTime.Sub(startTime)
seconds := float64(duration.Nanoseconds()) / 1e9
// Threads is a more easily explained concept than GOMAXPROCS
fmt.Printf("\n")
fmt.Printf(" %d\n", numProcs)
fmt.Printf(" %d\n", numServers)
fmt.Printf(" \n", seconds)
if seconds >= 0.001 && recordCount > 0 {
rate := int(float64(recordCount) / seconds)
fmt.Printf(" %d\n", rate)
}
fmt.Printf("\n")
return
}
// DRAIN OUTPUT CHANNEL TO EXECUTE EXTRACTION COMMANDS, RESTORE OUTPUT ORDER WITH HEAP
// initialize empty heap
hp := &ExtractHeap{}
heap.Init(hp)
// index of next desired result
next := 1
delay := 0
var buffer bytes.Buffer
count := 0
okay := false
if head != "" {
buffer.WriteString(head[:])
buffer.WriteString("\n")
}
// printResult appends the output for one extracted pattern to the buffer and
// writes the buffer to stdout once more than 1000 outputs have accumulated.
// With mpty set, only records whose text is empty are reported, by index;
// otherwise non-empty text is saved, preceded by index and tab when indx is set.
// okay is set as soon as anything has been buffered
printResult := func(curr Extract) {
	text := curr.Text

	switch {
	case mpty && text == "":
		// report the index of a record that produced no output
		okay = true
		buffer.WriteString(strconv.Itoa(curr.Index))
		buffer.WriteString("\n")
		count++
	case !mpty && text != "":
		okay = true
		if indx {
			// prefix output with record index
			buffer.WriteString(strconv.Itoa(curr.Index))
			buffer.WriteString("\t")
		}
		// save output to byte buffer
		buffer.WriteString(text)
		count++
	}

	// keep accumulating until enough outputs are pending
	if count <= 1000 {
		return
	}

	count = 0
	if buffer.Len() > 0 {
		// print current buffer
		os.Stdout.WriteString(buffer.String())
	}
	buffer.Reset()
}
for ext := range tblq {
// push result onto heap
heap.Push(hp, ext)
// read several values before checking to see if next record to print has been processed
if delay < heapSize {
delay++
continue
}
delay = 0
for hp.Len() > 0 {
// remove lowest item from heap, use interface type assertion
curr := heap.Pop(hp).(Extract)
if curr.Index == next {
// if this is the desired item, send to output
printResult(curr)
recordCount++
// increment index
next++
// and keep checking heap to see if next result is already available
} else {
// otherwise push back onto heap
heap.Push(hp, curr)
// and go back to waiting on input channel
break
}
}
}
// send remainder of heap to output
for hp.Len() > 0 {
curr := heap.Pop(hp).(Extract)
printResult(curr)
recordCount++
}
if tail != "" {
buffer.WriteString(tail[:])
buffer.WriteString("\n")
}
// do not print head or tail if no extraction output
if okay {
txt := buffer.String()
if txt != "" {
// print final buffer
os.Stdout.WriteString(txt[:])
}
}
buffer.Reset()
// force garbage collection and return memory before calculating processing rate
debug.FreeOSMemory()
if timr {
printDuration("records")
}
}