// =========================================================================== // // PUBLIC DOMAIN NOTICE // National Center for Biotechnology Information (NCBI) // // This software/database is a "United States Government Work" under the // terms of the United States Copyright Act. It was written as part of // the author's official duties as a United States Government employee and // thus cannot be copyrighted. This software/database is freely available // to the public for use. The National Library of Medicine and the U.S. // Government do not place any restriction on its use or reproduction. // We would, however, appreciate having the NCBI and the author cited in // any work or product based on this material. // // Although all reasonable efforts have been taken to ensure the accuracy // and reliability of the software and data, the NLM and the U.S. // Government do not and cannot warrant the performance or results that // may be obtained by using this software or data. The NLM and the U.S. // Government disclaim all warranties, express or implied, including // warranties of performance, merchantability or fitness for any particular // purpose. 
// // =========================================================================== // // File Name: xtract.go // // Author: Jonathan Kans // // ========================================================================== /* test for presence of go compiler, cross-compile xtract executables, and pack into archive, by running: if hash go 2>/dev/null then env GOOS=darwin GOARCH=amd64 go build -o xtract.Darwin -v xtract.go env GOOS=linux GOARCH=amd64 go build -o xtract.Linux -v xtract.go env GOOS=windows GOARCH=386 go build -o xtract.CYGWIN_NT -v xtract.go tar -czf archive.tar.gz xtract.[A-Z]* rm xtract.[A-Z]* fi */ package main import ( "bytes" "container/heap" "fmt" "html" "io" "math" "os" "runtime" "runtime/debug" "runtime/pprof" "strconv" "strings" "sync" "time" "unicode" ) // VERSION AND HELP MESSAGE TEXT const xtractVersion = "5.80" const xtractHelp = ` Overview Xtract uses command-line arguments to convert XML data into a tab-delimited table. -pattern places the data from individual records into separate rows. -element extracts values from specified fields into separate columns. -group, -block, and -subset limit element exploration to selected XML subregions. 
Processing -cleanup Fix non-ASCII spaces -compress Compress runs of spaces -input Read from file instead of stdin Exploration Argument Hierarchy -pattern Name of record within set -group Use of different argument -block names allows command-line -subset control of nested looping Exploration Constructs Object DateCreated Parent/Child Book/AuthorList Heterogeneous "PubmedArticleSet/*" Nested "*/Taxon" Recursive "**/Gene-commentary" Conditional Execution -if Element [@attribute] required -unless Skip if element matches -and All tests must pass -or Any passing test suffices -else Execute if conditional test failed -position Must be at given location in list String Constraints -equals String must match exactly -contains Substring must be present -starts-with Substring must be at beginning -ends-with Substring must be at end -is-not String must not match Numeric Constraints -gt Greater than -ge Greater than or equal to -lt Less than -le Less than or equal to -eq Equal to -ne Not equal to Format Customization -ret Override line break between patterns -tab Replace tab character between fields -sep Separator between group members -pfx Prefix to print before group -sfx Suffix to print after group -clr Clear queued tab separator -pfc Preface combines -clr and -pfx -rst Reset -sep, -pfx, and -sfx -lbl Insert arbitrary text Element Selection -element Print all items that match tag name -first Only print value of first item -last Only print value of last item -encode URL-encode <, >, &, ", and ' characters -NAME Record value in named variable -element Constructs Tag Caption Group Initials,LastName Parent/Child MedlineCitation/PMID Attribute DescriptorName@MajorTopicYN Recursive "**/Gene-commentary_accession" Object Count "#Author" Item Length "%Title" Element Depth "^PMID" Parent Index "+" XML Subtree "*" Variable "&NAME" Numeric Selection -num Count -len Length -sum Sum -min Minimum -max Maximum -inc Increment -dec Decrement -sub Difference -avg Average -dev Deviation Sequence 
Coordinates -0-based Zero-Based -1-based One-Based -ucsc Half-Open Command Generator -insd Generate INSDSeq extraction commands -insd Argument Order Descriptors INSDSeq_sequence INSDSeq_definition INSDSeq_division Flags complete or partial [optional] Feature(s) CDS,mRNA Qualifiers INSDFeature_key "#INSDInterval" gene product Miscellaneous -head Print before first record -tail Print after last record Reformatting -format [compact|indent|expand] Modification -filter Object [retain|remove|encode|decode|shrink] [content|cdata|comment|object|attributes] Validation -verify Report XML data integrity problems Summary -outline Display outline of XML structure -synopsis Display count of unique XML paths Documentation -help Print this document -examples Examples of EDirect and xtract usage -version Print version number Notes String constraints use case-insensitive comparisons. Numeric constraints and selection arguments use integer values. -num and -len selections are synonyms for Object Count (#) and Item Length (%). 
Examples -pattern DocumentSummary -element Id -first Name Title -pattern "PubmedArticleSet/*" -block Author -sep " " -element Initials,LastName -pattern PubmedArticle -block MeshHeading -if "@MajorTopicYN" -equals Y -sep " / " -element DescriptorName,QualifierName -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop -pattern Taxon -block "*/Taxon" -unless Rank -equals "no rank" -tab "\n" -element Rank,ScientificName -pattern Entrezgene -block "**/Gene-commentary" -block INSDReference -position 2 -if Author -and Title -if "#Author" -lt 6 -and "%Title" -le 70 -if DateCreated/Year -gt 2005 -if ChrStop -lt ChrStart -if CommonName -contains mouse -if "&ABST" -starts-with "Transposable elements" -if MapLocation -element MapLocation -else -lbl "\-" -min ChrStart,ChrStop -max ExonCount -inc @aaPosition -element @residue -1-based ChrStart -insd CDS gene product protein_id translation -insd complete mat_peptide "%peptide" product peptide -filter ExpXml decode content -filter LocationHist remove object ` const xtractInternal = ` ReadBlocks -> SplitPattern => StreamTokens => ParseXML => ProcessQuery -> MergeResults Performance Default Overrides -proc Number of CPU processors used -cons Ratio of parsers to processors -serv Concurrent parser instances -chan Communication channel depth -heap Order restoration heap size -farm Node allocation buffer length -gogc Garbage collection tuning knob Debugging -debug Display run-time parameter summary -empty Flag records with no output -index Print record index numbers -stats Show processing time for each record -timer Report processing duration and rate -trial Optimize -proc value, requires -input Internal Component Performance -chunk ReadBlocks -split ReadBlocks -> SplitPattern -drain ReadBlocks -> SplitPattern -> ConcurrencyChannel -token ReadBlocks -> StreamTokens Documentation -keys Keyboard navigation shortcuts -unix Common Unix commands Sample File Download ftp-cp ftp.ncbi.nlm.nih.gov /entrez/entrezdirect/samples 
carotene.xml.zip unzip carotene.xml.zip rm carotene.xml.zip Performance Tuning Script XtractTrials() { echo -e "" for tries in {1..5} do xtract -debug -input "$1" -proc "$2" -pattern PubmedArticle -element LastName done echo -e "" } for proc in {1..8} do XtractTrials "carotene.xml" "$proc" | xtract -pattern Trials -lbl "$proc" -avg Rate -dev Rate done Processor Titration Results 1 10455 124 2 18674 684 3 23654 1371 4 30527 521 5 35132 2349 6 40289 907 7 45369 1370 8 47397 1141 Execution Profiling xtract -profile -input carotene.xml -pattern PubmedArticle -element LastName go tool pprof --pdf ./xtract ./cpu.pprof > ./callgraph.pdf ` const xtractExamples = ` Author Frequency esearch -db pubmed -query "rattlesnake phospholipase" | efetch -format docsum | xtract -pattern DocumentSummary -sep "\n" -element Name | sort-uniq-count-rank 39 Marangoni S 31 Toyama MH 26 Soares AM 25 Bon C ... Publications efetch -db pubmed -id 6271474,5685784,4882854,6243420 -format xml | xtract -pattern PubmedArticle -element MedlineCitation/PMID "#Author" \ -block Author -position first -sep " " -element Initials,LastName \ -block Article -element ArticleTitle 6271474 5 MJ Casadaban Tn3: transposition and control. 5685784 2 RK Mortimer Suppressors and suppressible mutations in yeast. 4882854 2 ED Garber Proteins and enzymes as taxonomic tools. 6243420 1 NR Cozzarelli DNA gyrase and the supercoiling of DNA. 
Formatted Authors efetch -db pubmed -id 1413997,6301692,781293 -format xml | xtract -pattern PubmedArticle -element MedlineCitation/PMID \ -block DateCreated -sep "-" -element Year,Month,Day \ -block Author -sep " " -tab "" \ -element "&COM" Initials,LastName -COM "(, )" 1413997 1992-11-25 RK Mortimer, CR Contopoulou, JS King 6301692 1983-06-17 MA Krasnow, NR Cozzarelli 781293 1976-10-02 MJ Casadaban Medical Subject Headings efetch -db pubmed -id 6092233,2539356,1937004 -format xml | xtract -pattern PubmedArticle -element MedlineCitation/PMID \ -block MeshHeading \ -subset DescriptorName -pfc "\n" -sep "|" -element @MajorTopicYN,DescriptorName \ -subset QualifierName -pfc " / " -sep "|" -element @MajorTopicYN,QualifierName | sed -e 's/N|//g' -e 's/Y|/*/g' 6092233 Base Sequence DNA Restriction Enzymes DNA, Fungal / genetics / *isolation & purification *Genes, Fungal ... Peptide Sequences esearch -db protein -query "conotoxin AND mat_peptide [FKEY]" | efetch -format gpc | xtract -insd complete mat_peptide "%peptide" product peptide | grep -i conotoxin | sort -t $'\t' -u -k 2,2n | head -n 8 ADB43131.1 15 conotoxin Cal 1b LCCKRHHGCHPCGRT AIC77099.1 16 conotoxin Im1.2 GCCSHPACNVNNPHIC AIC77105.1 17 conotoxin Lt1.4 GCCSHPACDVNNPDICG AIC77103.1 18 conotoxin Lt1.2 PRCCSNPACNANHAEICG AIC77083.1 20 conotoxin Bt14.6 KDCTYCMHSSCSMMYEKCRP AIC77085.1 21 conotoxin Bt14.8 NECDNCMRSFCSMIYEKCRLK AIC77093.1 22 conotoxin Bt14.16 GDCKPCMHPDCRFNPGRCRPRE AIC77154.1 23 conotoxin Bt14.19 VREKDCPPHPVPGMHKCVCLKTC Chromosome Locations esearch -db gene -query "calmodulin [PFN] AND mammalia [ORGN]" | efetch -format docsum | xtract -pattern DocumentSummary -MAP "(-)" -MAP MapLocation \ -element Id Name "&MAP" ScientificName 801 CALM1 14q32.11 Homo sapiens 808 CALM3 19q13.2-q13.3 Homo sapiens 805 CALM2 2p21 Homo sapiens 24242 Calm1 6q31-q32 Rattus norvegicus 12313 Calm1 12 E Mus musculus 326597 CALM - Bos taurus 50663 Calm2 6q11-q12 Rattus norvegicus 24244 Calm3 1q22 Rattus norvegicus 12315 Calm3 
7 9.15 cM Mus musculus 12314 Calm2 17 E4 Mus musculus 617095 CALM1 - Bos taurus 396838 CALM3 6 Sus scrofa ... Gene Regions esearch -db gene -query "DDT [GENE] AND mouse [ORGN]" | efetch -format docsum | xtract -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop | xargs -n 3 sh -c 'efetch -db nuccore -format gb \ -id "$0" -chr_start "$1" -chr_stop "$2"' LOCUS NC_000076 2142 bp DNA linear CON 09-FEB-2015 DEFINITION Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J. ACCESSION NC_000076 REGION: complement(75771233..75773374) GPC_000000783 VERSION NC_000076.6 GI:372099100 ... FEATURES Location/Qualifiers source 1..2142 /organism="Mus musculus" /mol_type="genomic DNA" /strain="C57BL/6J" /db_xref="taxon:10090" /chromosome="10" gene 1..2142 /gene="Ddt" mRNA join(1..159,462..637,1869..2142) /gene="Ddt" /product="D-dopachrome tautomerase" /transcript_id="NM_010027.1" CDS join(52..159,462..637,1869..1941) /gene="Ddt" /codon_start=1 /product="D-dopachrome decarboxylase" /protein_id="NP_034157.1" /translation="MPFVELETNLPASRIPAGLENRLCAATATILDKPEDRVSVTIRP GMTLLMNKSTEPCAHLLVSSIGVVGTAEQNRTHSASFFKFLTEELSLDQDRIVIRFFP ... 
Taxonomic Names esearch -db taxonomy -query "txid10090 [SBTR] OR camel [COMN]" | efetch -format docsum | xtract -pattern DocumentSummary -if CommonName \ -element Id ScientificName CommonName 57486 Mus musculus molossinus Japanese wild mouse 39442 Mus musculus musculus eastern European house mouse 35531 Mus musculus bactrianus southwestern Asian house mouse 10092 Mus musculus domesticus western European house mouse 10091 Mus musculus castaneus southeastern Asian house mouse 10090 Mus musculus house mouse 9838 Camelus dromedarius Arabian camel 9837 Camelus bactrianus Bactrian camel Structural Similarity esearch -db structure -query "crotalus [ORGN] AND phospholipase A2" | elink -related | efilter -query "archaea [ORGN]" | efetch -format docsum | xtract -pattern DocumentSummary \ -if PdbClass -equals Hydrolase \ -element PdbAcc PdbDescr 3VV2 Crystal Structure Of Complex Form Between S324a-subtilisin And Mutant Tkpro 3VHQ Crystal Structure Of The Ca6 Site Mutant Of Pro-Sa-Subtilisin 2ZWP Crystal Structure Of Ca3 Site Mutant Of Pro-S324a 2ZWO Crystal Structure Of Ca2 Site Mutant Of Pro-S324a ... 
Multiple Links esearch -db pubmed -query "conotoxin AND dopamine [MAJR]" | elink -target protein -cmd neighbor | xtract -pattern LinkSet -if Link/Id -element IdList/Id Link/Id 23624852 17105332 14657161 27532980 27532978 12944511 31542395 11222635 144922602 Gene Comments esearch -db gene -query "rbcL [GENE] AND maize [ORGN]" | efetch -format xml | xtract -pattern Entrezgene -block "**/Gene-commentary" \ -if Gene-commentary_type@value -equals genomic \ -tab "\n" -element Gene-commentary_accession | sort | uniq NC_001666 X86563 Z11973 Vitamin Biosynthesis esearch -db pubmed -query "tomato lycopene cyclase" | elink -related | elink -target protein | efilter -organism mammals | efetch -format gpc | xtract -pattern INSDSeq -if INSDSeq_definition -contains carotene \ -element INSDSeq_accession-version INSDSeq_definition NP_573480.1 beta,beta-carotene 9',10'-oxygenase [Mus musculus] NP_001156500.1 beta,beta-carotene 15,15'-dioxygenase isoform 2 [Mus musculus] NP_067461.2 beta,beta-carotene 15,15'-dioxygenase isoform 1 [Mus musculus] NP_001297121.1 beta-carotene oxygenase 2 [Mustela putorius furo] AAS20392.1 carotene-9',10'-monooxygenase [Mustela putorius furo] Indexed Fields einfo -db pubmed | xtract -pattern Field \ -if IsDate -equals Y -and IsHidden -equals N \ -pfx "[" -sep "]\t" -element Name,FullName | sort -t $'\t' -k 2f [CDAT] Date - Completion [CRDT] Date - Create [EDAT] Date - Entrez [MHDA] Date - MeSH [MDAT] Date - Modification [PDAT] Date - Publication Author Numbers esearch -db pubmed -query "conotoxin" | efetch -format xml | xtract -pattern PubmedArticle -num Author | sort-uniq-count -n | reorder-columns 2 1 | head -n 15 | xy-plot auth.png 0 11 1 193 2 854 3 844 4 699 5 588 6 439 7 291 8 187 9 124 10 122 11 58 12 33 13 18 900 + | ******** 800 + * ** | * * 700 + * *** | * ** 600 + * * | * *** 500 + * ** | * *** 400 + * ** | * * 300 + * *** | * * 200 + * ****** | * ********* 100 + ** * | * ********** 0 + * ****** 
+---------+---------+---------+---------+---------+---------+---------+ 0 2 4 6 8 10 12 14 Record Counts echo "diphtheria measles pertussis polio tuberculosis" | xargs -n 1 sh -c 'esearch -db pubmed -query "$0 [MESH]" | efilter -days 365 -datetype PDAT | xtract -pattern ENTREZ_DIRECT -lbl "$0" -element Count' diphtheria 18 measles 166 pertussis 98 polio 75 tuberculosis 1386 Gene Products for sym in HBB DMD TTN ATP7B HFE BRCA2 CFTR PAH PRNP RAG1 do esearch -db gene -query "$sym [GENE] AND human [ORGN]" | efilter -query "alive [PROP]" | efetch -format docsum | xtract -pattern GenomicInfoType \ -element ChrAccVer ChrStart ChrStop | while read acc str stp do efetch -db nuccore -format gbc \ -id "$acc" -chr_start "$str" -chr_stop "$stp" | xtract -insd CDS,mRNA INSDFeature_key "#INSDInterval" \ gene "%transcription" "%translation" \ product transcription translation | grep -i $'\t'"$sym"$'\t' done done NC_000011.10 mRNA 3 HBB 626 hemoglobin, beta ACATTTGCTT... NC_000011.10 CDS 3 HBB 147 hemoglobin subunit beta MVHLTPEEKS... NC_000023.11 mRNA 78 DMD 13805 dystrophin, transcript variant X2 AGGAAGATGA... NC_000023.11 mRNA 77 DMD 13794 dystrophin, transcript variant X6 ACTTTCCCCC... NC_000023.11 mRNA 77 DMD 13800 dystrophin, transcript variant X5 ACTTTCCCCC... NC_000023.11 mRNA 77 DMD 13785 dystrophin, transcript variant X7 ACTTTCCCCC... NC_000023.11 mRNA 74 DMD 13593 dystrophin, transcript variant X8 ACTTTCCCCC... NC_000023.11 mRNA 75 DMD 13625 dystrophin, transcript variant X9 ACTTTCCCCC... ... 
Genome Range esearch -db gene -query "Homo sapiens [ORGN] AND Y [CHR]" | efilter -status alive | efetch -format docsum | xtract -pattern DocumentSummary -NAME Name -DESC Description \ -block GenomicInfoType -if ChrLoc -equals Y \ -min ChrStart,ChrStop -element "&NAME" "&DESC" | sort -k 1,1n | cut -f 2- | between-two-genes ASMT IL3RA IL3RA interleukin 3 receptor subunit alpha LOC101928032 uncharacterized LOC101928032 LOC101928055 uncharacterized LOC101928055 SLC25A6 solute carrier family 25 member 6 LOC105373102 uncharacterized LOC105373102 LINC00106 long intergenic non-protein coding RNA 106 ASMTL-AS1 ASMTL antisense RNA 1 ASMTL acetylserotonin O-methyltransferase-like P2RY8 purinergic receptor P2Y8 AKAP17A A-kinase anchoring protein 17A ASMT acetylserotonin O-methyltransferase Amino Acid Substitutions ApplySNPs() { seq="" last="" while read rsid accn pos res do if [ "$accn" != "$last" ] then insd=$(efetch -db protein -id "$accn" -format gbc < /dev/null) seq=$(echo $insd | xtract -pattern INSDSeq -element INSDSeq_sequence) last=$accn fi pos=$((pos+1)) pfx="" sfx="" echo ">rs$rsid [$accn $res@$pos]" if [ $pos -gt 1 ] then pfx=$(echo ${seq:0:$pos-1}) fi if [ $pos -lt ${#seq} ] then sfx=$(echo ${seq:$pos}) fi echo "$pfx$res$sfx" | fold -w 50 done } esearch -db gene -query "OPN1MW [GENE] AND human [ORGN]" | elink -target snp | efetch -format xml | xtract -pattern Rs -RSID Rs@rsId \ -block FxnSet -if @fxnClass -equals missense \ -sep "." -element "&RSID" @protAcc,@protVer @aaPosition \ -tab "\n" -element @residue | sort -t $'\t' -k 2,2 -k 3,3n -k 4,4 | uniq | ApplySNPs >rs104894915 [NP_000504.1 K@94] maqqwslqrlagrhpqdsyedstqssiftytnsnstrgpfegpnyhiapr wvyhltsvwmifvviasvftnglvlaatmkfkklrhplnwilvKlavadl aetviastisvvnqvygyfvlghpmcvlegytvslcgitglwslaiiswe ... 
Amino Acid Composition #!/bin/bash -norc abbrev=( Ala Asx Cys Asp Glu Phe Gly His Ile \ Xle Lys Leu Met Asn Pyl Pro Gln Arg \ Ser Thr Sec Val Trp Xxx Tyr Glx ) AminoAcidComp() { local count while read num lttr do idx=$(printf %i "'$lttr'") ofs=$((idx-97)) count[$ofs]="$num" done <<< "$1" for i in {0..25} do echo -e "${abbrev[$i]}\t${count[$i]-0}" done | sort } AminoAcidJoin() { result="" while read acc seq gene do comp="$(echo "$seq" | tr A-Z a-z | sed 's/[^a-z]//g' | fold -w 1 | sort-uniq-count)" current=$(AminoAcidComp "$comp") current=$(echo -e "GENE\t$gene\n$current") if [ -n "$result" ] then result=$(join -t $'\t' <(echo "$result") <(echo "$current")) else result=$current fi done echo "$result" | grep -e "GENE" -e "[1-9]" } ids="NP_001172026,NP_000509,NP_004001,NP_001243779" efetch -db protein -id "$ids" -format gpc | xtract -insd INSDSeq_sequence CDS gene | AminoAcidJoin GENE INS HBB DMD TTN Ala 10 15 210 2084 Arg 5 3 193 1640 Asn 3 6 153 1111 Asp 2 7 185 1720 Cys 6 2 35 513 Gln 7 3 301 942 Glu 8 8 379 3193 Gly 12 13 104 2066 His 2 9 84 478 Ile 2 0 165 2062 Leu 20 18 438 2117 Lys 2 11 282 2943 Met 2 2 79 398 Phe 3 8 77 908 Pro 6 7 130 2517 Ser 5 5 239 2463 Thr 3 7 194 2546 Trp 2 2 67 466 Tyr 4 3 61 999 Val 6 18 186 3184 Phrase Searching entrez-phrase-search -db pubmed -field WORD \ selective serotonin reuptake inhibitor + monoamine oxidase inhibitor | efetch -format xml | xtract -pattern PubmedArticle -element MedlineCitation/PMID \ -block Keyword -pfc "\n " -element Keyword 24657329 Antidepressant Organic cation transporter 2 Piperine Uptake 2 24280122 5-HIAA 5-HT 5-HTP 5-hydroxyindoleacetic acid 5-hydroxytryptophan ... ` const pubMedArtSample = ` 6301692 1983 06 17 1983 06 17 2007 11 14
0092-8674 32 4 1983 Apr Cell Cell Site-specific relaxation and recombination by the Tn3 resolvase: recognition of the DNA path between oriented res sites. 1313-24 A model in which one subunit of a dimeric resolvase is bound at one res site, while the other searches along adjacent DNA until it encounters the second site, would account for the ability of resolvase to distinguish intramolecular from intermolecular sites, to sense the relative orientation of sites and to produce singly interlinked catenanes. Because resolvase is a type 1 topoisomerase, we infer that it makes the required duplex bDNA breaks of recombination one strand at a time. Krasnow Mark A MA Cozzarelli Nicholas R NR eng GM-07281 GM NIGMS NIH HHS United States Journal Article Research Support, U.S. Gov't, P.H.S.
UNITED STATES Cell 0413066 0092-8674 0 DNA, Bacterial 0 DNA, Superhelical 0 DNA, Viral EC 2.7.7.- Nucleotidyltransferases EC 2.7.7.- Transposases EC 5.99.1.2 DNA Topoisomerases, Type I IM DNA Topoisomerases, Type I metabolism DNA, Bacterial metabolism DNA, Superhelical metabolism DNA, Viral metabolism Models, Genetic Nucleic Acid Conformation Nucleotidyltransferases isolation & purification metabolism Plasmids Recombination, Genetic Repetitive Sequences, Nucleic Acid Simian virus 40 Transposases
1983 4 1 1983 4 1 0 1 1983 4 1 0 0 ppublish 6301692 0092-8674(83)90312-4
` const insdSeqSample = ` AF480315_1 67 AA linear INV 25-JUL-2016 31-DEC-2003 four-loop conotoxin preproprotein, partial [Conus purpurascens] AAQ05867 AAQ05867.1 gb|AAQ05867.1|AF480315_1 gi|33320307 Conus purpurascens Conus purpurascens Eukaryota; Metazoa; Lophotrochozoa; Mollusca; Gastropoda; Caenogastropoda; Hypsogastropoda; Neogastropoda; Conoidea; Conidae; Conus 1 1..67 Duda,T.F. Jr. Palumbi,S.R. Convergent evolution of venoms and feeding ecologies among polyphyletic piscivorous Conus species Unpublished 2 1..67 Duda,T.F. Jr. Palumbi,S.R. Direct Submission Submitted (04-FEB-2002) Naos Marine Lab, Smithsonian Tropical Research Institute, Apartado 2072, Balboa, Ancon, Panama, Republic of Panama Method: conceptual translation supplied by author. accession AF480315.1 source 1 67 AAQ05867.1 organism Conus purpurascens isolate purpurascens-2c db_xref taxon:41690 clone_lib venom duct cDNA library country Panama note isolated from the Bay of Panama Protein <1..67 1 67 AAQ05867.1 product four-loop conotoxin preproprotein mat_peptide 41..67 41 67 AAQ05867.1 product four-loop conotoxin calculated_mol_wt 3008 peptide PCKKTGRKCFPHQKDCCGRACIITICP CDS 1..67 1 67 AAQ05867.1 coded_by AF480315.1:<1..205 codon_start 2 vvivavlfltacqlitaddsrrtqkhralrsttkratsnrpckktgrkcfphqkdccgraciiticp ` const geneDocSumSample = ` 3581 IL9R interleukin 9 receptor 0 0 X, Y genomic Xq28 and Yq12 CD129, IL-9R interleukin-9 receptor|IL-9 receptor IL9R interleukin 9 receptor Official 300007 X NC_000023.11 155997580 156013016 14 Y NC_000024.10 57184100 57199536 14 5425 The protein encoded by this gene is a cytokine receptor that specifically mediates the biological effects of interleukin 9 (IL9). The functional IL9 receptor complex requires this protein as well as the interleukin 2 receptor, gamma (IL2RG), a common gamma subunit shared by the receptors of many different cytokines. 
The ligand binding of this receptor leads to the activation of various JAK kinases and STAT proteins, which connect to different biologic responses. This gene is located at the pseudoautosomal regions of X and Y chromosomes. Genetic studies suggested an association of this gene with the development of asthma. Multiple pseudogenes on chromosome 9, 10, 16, and 18 have been described. Alternatively spliced transcript variants have been found for this gene. X 155997580 Homo sapiens human 9606 ` const keyboardShortcuts = ` Command History Ctrl-n Next command Ctrl-p Previous command Move Cursor Forward Ctrl-e To end of line Ctrl-f By one character Esc-f By one argument Move Cursor Backward Ctrl-a To beginning of line Ctrl-b By one character Esc-b By one argument Delete Del Previous character Ctrl-d Next character Ctrl-k To end of line Ctrl-u Entire line Ctrl-w Previous word Esc-Del Previous argument Esc-d Next argument Autocomplete Tab Completes directory or file names Program Control Ctrl-c Quit running program ^x^y Run last command replacing x with y ` const unixCommands = ` Process by Contents sort Sorts lines of text -f Ignore case -n Numeric comparison -r Reverse result order -k Field key (start,stop or first) -u Unique lines with identical keys -b Ignore leading blanks -s Stable sort -t Specify field separator uniq Removes repeated lines -c Count occurrences -i Ignore case -f Ignore first n fields -s Ignore first n characters -d Only output repeated lines -u Only output non-repeated lines grep Matches patterns using regular expressions -i Ignore case -v Invert search -w Search expression as a word -x Search expression as whole line -e Specify individual pattern -c Only count number of matches -n Print line numbers Regular Expressions Characters . 
Any single character (except newline) \w Alphabetic [A-Za-z], numeric [0-9], or underscore (_) \s Whitespace (space or tab) \ Escapes special characters [] Matches any enclosed characters Positions ^ Beginning of line $ End of line \b Word boundary Repeat Matches ? 0 or 1 * 0 or more + 1 or more {n} Exactly n Escape Sequences \n Line break \t Tab character Modify Contents sed Replaces text strings -e Specify individual expression tr Translates characters -d Delete character rev Reverses characters on line Format Contents column Aligns columns by content width -s Specify field separator -t Create table expand Aligns columns to specified positions -t Tab positions fold Wraps lines at a specific width -w Line width Filter by Position cut Removes parts of lines -c Characters to keep -f Fields to keep -d Specify field separator -s Suppress lines with no delimiters head Prints first lines -n Number of lines tail Prints last lines -n Number of lines Miscellaneous wc Counts words, lines, or characters -c Characters -l Lines -w Words xargs Constructs arguments -n Number of words per batch File Compression tar Archive files -c Create archive -f Name of output file -z Compress archive with gzip gzip Compress file -k Keep original file unzip Decompress .zip archive -p Pipe to stdout gzcat Decompress .gz archive and pipe to stdout Directory and File Navigation cd Changes directory / Root ~ Home . Current .. Parent - Previous ls Lists file names -1 One entry per line -a Show files beginning with dot (.) 
-l List in long format -R Recursively explore subdirectories -S Sort files by size -t Sort by most recently modified pwd Prints working directory path `

// TYPED CONSTANTS

// LevelType ranks the exploration arguments (-unit through -pattern) by
// nesting depth, with PATTERN as the outermost record level.
type LevelType int

const (
	_ LevelType = iota
	UNIT
	SUBSET
	SECTION
	BLOCK
	BRANCH
	GROUP
	DIVISION
	PATTERN
)

// IndentType selects the XML reformatting style used when printing subtrees.
type IndentType int

const (
	SINGULARITY IndentType = iota
	COMPACT
	FLUSH
	INDENT
	SUBTREE
	WRAPPED
)

// SideType says which side of a delimiter SplitInTwoAt should assign the
// whole string to when the delimiter is absent.
type SideType int

const (
	_ SideType = iota
	LEFT
	RIGHT
)

// TagType classifies tokens produced by the XML tokenizer.
type TagType int

const (
	NOTAG TagType = iota
	STARTTAG
	SELFTAG
	STOPTAG
	ATTRIBTAG
	CONTENTTAG
	CDATATAG
	COMMENTTAG
	OBJECTTAG
	ISCLOSED
)

// OpType enumerates every command-line operation code, covering extraction,
// conditional, numeric, coordinate, and customization flags.
type OpType int

const (
	UNSET OpType = iota
	ELEMENT
	FIRST
	LAST
	ENCODE
	PFX
	SFX
	SEP
	TAB
	RET
	LBL
	CLR
	PFC
	RST
	POSITION
	IF
	UNLESS
	MATCH
	AVOID
	AND
	OR
	EQUALS
	CONTAINS
	STARTSWITH
	ENDSWITH
	ISNOT
	GT
	GE
	LT
	LE
	EQ
	NE
	NUM
	LEN
	SUM
	MIN
	MAX
	INC
	DEC
	SUB
	AVG
	DEV
	ZEROBASED
	ONEBASED
	UCSC
	ELSE
	VARIABLE
	VALUE
	STAR
	COUNT
	LENGTH
	DEPTH
	INDEX
	UNRECOGNIZED
)

// ArgumentType groups command-line flags into broad functional categories.
type ArgumentType int

const (
	_ ArgumentType = iota
	EXPLORATION
	CONDITIONAL
	EXTRACTION
	CUSTOMIZATION
)

// SpecialType selects an alternative whole-file processing mode
// (-format, -outline, -filter, and the internal performance probes).
type SpecialType int

const (
	NOPROCESS SpecialType = iota
	DOFORMAT
	DOOUTLINE
	DOSYNOPSIS
	DOVERIFY
	DOFILTER
	DOCHUNK
	DOSPLIT
	DODRAIN
	DOTOKEN
)

// SeqEndType marks whether a recognized sequence coordinate field is a
// start position, a stop position, or a single point.
type SeqEndType int

const (
	_ SeqEndType = iota
	ISSTART
	ISSTOP
	ISPOS
)

// SequenceType records the numbering convention (0- or 1-based) and the
// endpoint role of a recognized sequence coordinate field.
type SequenceType struct {
	Based int
	Which SeqEndType
}

// ARGUMENT MAPS

// argTypeIs classifies each command-line flag; capitalized exploration
// variants (e.g., -Block) are accepted as synonyms.
var argTypeIs = map[string]ArgumentType{
	"-unit": EXPLORATION, "-Unit": EXPLORATION,
	"-subset": EXPLORATION, "-Subset": EXPLORATION,
	"-section": EXPLORATION, "-Section": EXPLORATION,
	"-block": EXPLORATION, "-Block": EXPLORATION,
	"-branch": EXPLORATION, "-Branch": EXPLORATION,
	"-group": EXPLORATION, "-Group": EXPLORATION,
	"-division": EXPLORATION, "-Division": EXPLORATION,
	"-pattern": EXPLORATION, "-Pattern": EXPLORATION,
	"-position": CONDITIONAL,
	"-if":       CONDITIONAL,
	"-unless":   CONDITIONAL,
	"-match":    CONDITIONAL,
	"-avoid":    CONDITIONAL,
	"-and":      CONDITIONAL,
	"-or":       CONDITIONAL,
	"-equals":      CONDITIONAL,
	"-contains":    CONDITIONAL,
	"-starts-with": CONDITIONAL,
	"-ends-with":   CONDITIONAL,
	"-is-not":      CONDITIONAL,
	"-gt": CONDITIONAL,
	"-ge": CONDITIONAL,
	"-lt": CONDITIONAL,
	"-le": CONDITIONAL,
	"-eq": CONDITIONAL,
	"-ne": CONDITIONAL,
	"-element": EXTRACTION,
	"-first":   EXTRACTION,
	"-last":    EXTRACTION,
	"-encode":  EXTRACTION,
	"-num":     EXTRACTION,
	"-len":     EXTRACTION,
	"-sum":     EXTRACTION,
	"-min":     EXTRACTION,
	"-max":     EXTRACTION,
	"-inc":     EXTRACTION,
	"-dec":     EXTRACTION,
	"-sub":     EXTRACTION,
	"-avg":     EXTRACTION,
	"-dev":     EXTRACTION,
	"-0-based":    EXTRACTION,
	"-zero-based": EXTRACTION,
	"-1-based":    EXTRACTION,
	"-one-based":  EXTRACTION,
	"-ucsc":       EXTRACTION,
	"-else":       EXTRACTION,
	"-pfx": CUSTOMIZATION,
	"-sfx": CUSTOMIZATION,
	"-sep": CUSTOMIZATION,
	"-tab": CUSTOMIZATION,
	"-ret": CUSTOMIZATION,
	"-lbl": CUSTOMIZATION,
	"-clr": CUSTOMIZATION,
	"-pfc": CUSTOMIZATION,
	"-rst": CUSTOMIZATION,
}

// opTypeIs maps each extraction, conditional, or customization flag to its
// operation code.
var opTypeIs = map[string]OpType{
	"-element": ELEMENT,
	"-first":   FIRST,
	"-last":    LAST,
	"-encode":  ENCODE,
	"-pfx":     PFX,
	"-sfx":     SFX,
	"-sep":     SEP,
	"-tab":     TAB,
	"-ret":     RET,
	"-lbl":     LBL,
	"-clr":     CLR,
	"-pfc":     PFC,
	"-rst":     RST,
	"-position": POSITION,
	"-if":       IF,
	"-unless":   UNLESS,
	"-match":    MATCH,
	"-avoid":    AVOID,
	"-and":      AND,
	"-or":       OR,
	"-equals":      EQUALS,
	"-contains":    CONTAINS,
	"-starts-with": STARTSWITH,
	"-ends-with":   ENDSWITH,
	"-is-not":      ISNOT,
	"-gt": GT,
	"-ge": GE,
	"-lt": LT,
	"-le": LE,
	"-eq": EQ,
	"-ne": NE,
	"-num": NUM,
	"-len": LEN,
	"-sum": SUM,
	"-min": MIN,
	"-max": MAX,
	"-inc": INC,
	"-dec": DEC,
	"-sub": SUB,
	"-avg": AVG,
	"-dev": DEV,
	"-0-based":    ZEROBASED,
	"-zero-based": ZEROBASED,
	"-1-based":    ONEBASED,
	"-one-based":  ONEBASED,
	"-ucsc":       UCSC,
	"-else":       ELSE,
}

// levelTypeIs maps each exploration flag (and its capitalized synonym) to
// its hierarchy level.
var levelTypeIs = map[string]LevelType{
	"-unit": UNIT, "-Unit": UNIT,
	"-subset": SUBSET, "-Subset": SUBSET,
	"-section": SECTION, "-Section": SECTION,
	"-block": BLOCK, "-Block": BLOCK,
	"-branch": BRANCH, "-Branch": BRANCH,
	"-group": GROUP, "-Group": GROUP,
	"-division": DIVISION, "-Division": DIVISION,
	"-pattern": PATTERN, "-Pattern": PATTERN,
}

// sequenceTypeIs recognizes pattern:element paths known to hold sequence
// coordinates, recording the numbering basis and endpoint role of each.
var sequenceTypeIs = map[string]SequenceType{
	"INSDSeq:INSDInterval_from": {1, ISSTART},
	"INSDSeq:INSDInterval_to": {1,
ISSTOP},
	"DocumentSummary:ChrStart":        {0, ISSTART},
	"DocumentSummary:ChrStop":         {0, ISSTOP},
	"DocumentSummary:Chr_start":       {1, ISSTART},
	"DocumentSummary:Chr_end":         {1, ISSTOP},
	"DocumentSummary:Chr_inner_start": {1, ISSTART},
	"DocumentSummary:Chr_inner_end":   {1, ISSTOP},
	"DocumentSummary:Chr_outer_start": {1, ISSTART},
	"DocumentSummary:Chr_outer_end":   {1, ISSTOP},
	"DocumentSummary:start":           {1, ISSTART},
	"DocumentSummary:stop":            {1, ISSTOP},
	"DocumentSummary:display_start":   {1, ISSTART},
	"DocumentSummary:display_stop":    {1, ISSTOP},
	"Entrezgene:Seq-interval_from":    {0, ISSTART},
	"Entrezgene:Seq-interval_to":      {0, ISSTOP},
	"GenomicInfoType:ChrStart":        {0, ISSTART},
	"GenomicInfoType:ChrStop":         {0, ISSTOP},
	"Rs:@aaPosition":                  {0, ISPOS},
	"Rs:@asnFrom":                     {0, ISSTART},
	"Rs:@asnTo":                       {0, ISSTOP},
	"Rs:@end":                         {0, ISSTOP},
	"Rs:@leftContigNeighborPos":       {0, ISSTART},
	"Rs:@physMapInt":                  {0, ISPOS},
	"Rs:@protLoc":                     {0, ISPOS},
	"Rs:@rightContigNeighborPos":      {0, ISSTOP},
	"Rs:@start":                       {0, ISSTART},
	"Rs:@structLoc":                   {0, ISPOS},
}

// DATA OBJECTS

// Tables holds per-run character lookup tables and tuning parameters shared
// by the parsing pipeline.
type Tables struct {
	InBlank   [256]bool
	AltBlank  [256]bool
	InFirst   [256]bool
	InElement [256]bool
	ChanDepth int
	FarmSize  int
}

// Node is one element of the parsed XML tree, stored in first-child,
// next-sibling form.
type Node struct {
	Name       string
	Parent     string
	Contents   string
	Attributes string
	Attribs    []string
	Children   *Node
	Next       *Node
}

// Step is one stage of a parsed extraction argument (e.g., one segment of a
// Parent/Child@attribute specification).
type Step struct {
	Type   OpType
	Value  string
	Parent string
	Match  string
	Attrib string
	Wild   bool
}

// Operation is a single command-line operation with its parsed stages.
type Operation struct {
	Type   OpType
	Value  string
	Stages []*Step
}

// Block is one level of the exploration hierarchy, holding its conditions,
// commands, -else failure commands, and nested subtasks.
type Block struct {
	Visit      string
	Parent     string
	Match      string
	Working    []string
	Parsed     []string
	Position   string
	Conditions []*Operation
	Commands   []*Operation
	Failure    []*Operation
	Subtasks   []*Block
}

// UTILITIES

// IsNotJustWhitespace reports whether str contains any character other than
// space, tab, newline, carriage return, or form feed.
func IsNotJustWhitespace(str string) bool {
	for _, ch := range str {
		if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' && ch != '\f' {
			return true
		}
	}
	return false
}

// HasAmpOrNotASCII reports whether str contains an ampersand (a possible XML
// entity) or any non-ASCII character, signaling a need for decoding.
func HasAmpOrNotASCII(str string) bool {
	for _, ch := range str {
		if ch == '&' || ch > 127 {
			return true
		}
	}
	return false
}

// IsAllCapsOrDigits reports whether every rune in str is an upper-case letter
// or a decimal digit; an empty string vacuously qualifies. Used to recognize
// -NAME variable flags. (Loop variable renamed from "rune", which shadowed
// the predeclared type identifier.)
func IsAllCapsOrDigits(str string) bool {
	for _, ch := range str {
		if !unicode.IsUpper(ch) && !unicode.IsDigit(ch) {
			return false
		}
	}
	return true
}

// CompressRunsOfSpaces collapses each run of Unicode whitespace into a single
// ASCII space.
func CompressRunsOfSpaces(str string) string {
	whiteSpace := false
	var buffer bytes.Buffer
	for _, ch := range str {
		if unicode.IsSpace(ch) {
			if !whiteSpace {
				buffer.WriteRune(' ')
			}
			whiteSpace = true
		} else {
			buffer.WriteRune(ch)
			whiteSpace = false
		}
	}
	return buffer.String()
}

// HasFlankingSpace reports whether str begins or ends with an ASCII
// whitespace character (space, tab, newline, carriage return, or form feed).
func HasFlankingSpace(str string) bool {
	if str == "" {
		return false
	}
	ch := str[0]
	if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f' {
		return true
	}
	strlen := len(str)
	ch = str[strlen-1]
	if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f' {
		return true
	}
	return false
}

// HasBadSpace reports whether str contains a Unicode space character other
// than plain ASCII space (e.g., a non-breaking space).
func HasBadSpace(str string) bool {
	for _, ch := range str {
		if unicode.IsSpace(ch) && ch != ' ' {
			return true
		}
	}
	return false
}

// CleanupBadSpaces replaces every Unicode whitespace rune with a plain ASCII
// space, leaving all other characters untouched.
func CleanupBadSpaces(str string) string {
	var buffer bytes.Buffer
	for _, ch := range str {
		if unicode.IsSpace(ch) {
			buffer.WriteRune(' ')
		} else {
			buffer.WriteRune(ch)
		}
	}
	return buffer.String()
}

// SplitInTwoAt splits str at the first occurrence of chr. If chr is absent,
// the whole string is returned on the requested side, with the other half
// empty.
func SplitInTwoAt(str, chr string, side SideType) (string, string) {
	slash := strings.SplitN(str, chr, 2)
	if len(slash) > 1 {
		return slash[0], slash[1]
	}
	if side == LEFT {
		return str, ""
	}
	return "", str
}

// ConvertSlash interprets backslash escapes (\n, \r, \t, \f, and \a for an
// audible bell) in a command-line argument. Any other escaped character is
// passed through unchanged, and a trailing lone backslash is dropped.
//
// Fix: the previous version stored byte(r) into a byte slice, which corrupted
// multi-byte UTF-8 runes; writing whole runes preserves non-ASCII input while
// leaving all ASCII behavior identical.
func ConvertSlash(str string) string {
	if str == "" {
		return str
	}
	var buffer bytes.Buffer
	isSlash := false
	for _, ch := range str {
		if isSlash {
			switch ch {
			case 'n':
				// line feed
				buffer.WriteRune('\n')
			case 'r':
				// carriage return
				buffer.WriteRune('\r')
			case 't':
				// horizontal tab
				buffer.WriteRune('\t')
			case 'f':
				// form feed
				buffer.WriteRune('\f')
			case 'a':
				// audible bell from terminal (undocumented)
				buffer.WriteRune('\x07')
			default:
				buffer.WriteRune(ch)
			}
			isSlash = false
		} else if ch == '\\' {
			isSlash = true
		} else {
			buffer.WriteRune(ch)
		}
	}
	return buffer.String()
}

// ParseFlag maps a command-line flag to its operation code; a leading dash
// followed by all capitals or digits denotes a -NAME variable.
// NOTE(review): this definition continues beyond the visible chunk.
func ParseFlag(str string) OpType {
	op, ok := opTypeIs[str]
	if ok {
		return op
	}
	if len(str) > 1 && str[0] == '-' && IsAllCapsOrDigits(str[1:])
{ return VARIABLE } if len(str) > 0 && str[0] == '-' { return UNRECOGNIZED } return UNSET } // CREATE COMMON DRIVER TABLES // InitTables creates lookup tables to simplify the tokenizer func InitTables() *Tables { tbls := &Tables{} for i := range tbls.InBlank { tbls.InBlank[i] = false } tbls.InBlank[' '] = true tbls.InBlank['\t'] = true tbls.InBlank['\n'] = true tbls.InBlank['\r'] = true tbls.InBlank['\f'] = true // alternative version of InBlank allows newlines to be counted for i := range tbls.AltBlank { tbls.AltBlank[i] = false } tbls.AltBlank[' '] = true tbls.AltBlank['\t'] = true tbls.AltBlank['\r'] = true tbls.AltBlank['\f'] = true // first character of element cannot be a digit, dash, or period for i := range tbls.InFirst { tbls.InFirst[i] = false } for ch := 'A'; ch <= 'Z'; ch++ { tbls.InFirst[ch] = true } for ch := 'a'; ch <= 'z'; ch++ { tbls.InFirst[ch] = true } tbls.InFirst['_'] = true // remaining characters also includes colon for namespace for i := range tbls.InElement { tbls.InElement[i] = false } for ch := 'A'; ch <= 'Z'; ch++ { tbls.InElement[ch] = true } for ch := 'a'; ch <= 'z'; ch++ { tbls.InElement[ch] = true } for ch := '0'; ch <= '9'; ch++ { tbls.InElement[ch] = true } tbls.InElement['_'] = true tbls.InElement['-'] = true tbls.InElement['.'] = true tbls.InElement[':'] = true return tbls } // examine structure of parsed arguments (undocumented) func DebugBlock(blk *Block, depth int) { doIndent := func(indt int) { for i := 1; i < indt; i++ { fmt.Fprintf(os.Stderr, " ") } } doIndent(depth) if blk.Visit != "" { doIndent(depth + 1) fmt.Fprintf(os.Stderr, " %s \n", blk.Visit) } if len(blk.Parsed) > 0 { doIndent(depth + 1) fmt.Fprintf(os.Stderr, "") for _, str := range blk.Parsed { fmt.Fprintf(os.Stderr, " %s", str) } fmt.Fprintf(os.Stderr, " \n") } if len(blk.Subtasks) > 0 { for _, sub := range blk.Subtasks { DebugBlock(sub, depth+1) } } } // PARSE COMMAND-LINE ARGUMENTS // ParseArguments parses nested exploration instruction from command-line 
arguments func ParseArguments(args []string, pttrn string) *Block { // different names of exploration control arguments allow multiple levels of nested "for" loops in linear command line // (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions) var ( lcname = []string{ "", "-unit", "-subset", "-section", "-block", "-branch", "-group", "-division", "-pattern", } ucname = []string{ "", "-Unit", "-Subset", "-Section", "-Block", "-Branch", "-Group", "-Division", "-Pattern", } ) /* xtract -pattern PubmedArticle -element MedlineCitation/PMID \ -block DateCreated -sep "-" -element Year,Month,Day \ -block Author -sep " " -tab "" -element "&COM" Initials,LastName -COM "(, )" PubmedArticle -element MedlineCitation/PMID DateCreated -sep "-" -element Year,Month,Day Author -sep " " -tab "" -element &COM Initials,LastName -COM "(, )" */ // parseCommands recursive definition var parseCommands func(parent *Block, startLevel LevelType) // parseCommands does initial parsing of exploration command structure parseCommands = func(parent *Block, startLevel LevelType) { // function to find next highest level exploration argument findNextLevel := func(args []string, level LevelType) (LevelType, string, string) { if len(args) > 1 { for { if level < UNIT { break } lctag := lcname[level] uctag := ucname[level] for _, txt := range args { if txt == lctag || txt == uctag { return level, lctag, uctag } } level-- } } return 0, "", "" } arguments := parent.Working level, lctag, uctag := findNextLevel(arguments, startLevel) if level < UNIT { // break recursion return } // function to group arguments at a given exploration level subsetCommands := func(args []string) *Block { max := len(args) visit := "" // extract name of object to visit if max > 1 { visit = args[1] args = args[2:] max -= 2 } partition := 0 for cur, str := range args { // record point of next exploration command partition = cur + 1 // skip if not a command if len(str) < 
1 || str[0] != '-' { continue } if argTypeIs[str] == EXPLORATION { partition = cur break } } // parse parent/child construct // colon indicates a namespace prefix in any or all of the components prnt, match := SplitInTwoAt(visit, "/", RIGHT) // promote arguments parsed at this level return &Block{Visit: visit, Parent: prnt, Match: match, Parsed: args[0:partition], Working: args[partition:]} } cur := 0 // search for positions of current exploration command for idx, txt := range arguments { if txt == lctag || txt == uctag { if idx == 0 { continue } blk := subsetCommands(arguments[cur:idx]) parseCommands(blk, level-1) parent.Subtasks = append(parent.Subtasks, blk) cur = idx } } if cur < len(arguments) { blk := subsetCommands(arguments[cur:]) parseCommands(blk, level-1) parent.Subtasks = append(parent.Subtasks, blk) } // clear execution arguments from parent after subsetting parent.Working = nil } parseConditionals := func(cmds *Block, arguments []string) []*Operation { max := len(arguments) if max < 1 { return nil } // check for missing condition command txt := arguments[0] if txt != "-if" && txt != "-unless" && txt != "-match" && txt != "-avoid" && txt != "-position" { fmt.Fprintf(os.Stderr, "\nERROR: Missing -if command before '%s'\n", txt) os.Exit(1) } if txt == "-position" && max > 2 { fmt.Fprintf(os.Stderr, "\nERROR: Cannot combine -position with -if or -unless commands\n") os.Exit(1) } // check for missing argument after last condition txt = arguments[max-1] if len(txt) > 0 && txt[0] == '-' { fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt) os.Exit(1) } cond := make([]*Operation, 0, max) status := UNSET // function to parse conditional clause into execution step parseStep := func(op *Operation, elementColonValue bool) { if op == nil { return } str := op.Value status := ELEMENT // check for pound, percent, or caret character at beginning of name if len(str) > 1 { switch str[0] { case '&': if IsAllCapsOrDigits(str[1:]) { status = VARIABLE 
str = str[1:] } else if strings.Contains(str, ":") { fmt.Fprintf(os.Stderr, "\nERROR: Unsupported construct '%s', use -if &VARIABLE -equals VALUE instead\n", str) os.Exit(1) } else { fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", str) os.Exit(1) } case '#': status = COUNT str = str[1:] case '%': status = LENGTH str = str[1:] case '^': status = DEPTH str = str[1:] default: } } else if str == "+" { status = INDEX } // parse parent/element@attribute construct // colon indicates a namespace prefix in any or all of the components prnt, match := SplitInTwoAt(str, "/", RIGHT) match, attrib := SplitInTwoAt(match, "@", LEFT) val := "" // leading colon indicates namespace prefix wildcard wildcard := false if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") { wildcard = true } if elementColonValue { // allow parent/element@attribute:value construct for deprecated -match and -avoid, and for subsequent -and and -or commands match, val = SplitInTwoAt(str, ":", LEFT) prnt, match = SplitInTwoAt(match, "/", RIGHT) match, attrib = SplitInTwoAt(match, "@", LEFT) } tsk := &Step{Type: status, Value: str, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard} op.Stages = append(op.Stages, tsk) // transform old -match "element:value" to -match element -equals value if val != "" { tsk := &Step{Type: EQUALS, Value: val} op.Stages = append(op.Stages, tsk) } } idx := 0 // conditionals should alternate between command and object/value expectDash := true last := "" var op *Operation // flag to allow element-colon-value for deprecated -match and -avoid commands, otherwise colon is for namespace prefixes elementColonValue := false // parse command strings into operation structure for idx < max { str := arguments[idx] idx++ // conditionals should alternate between command and object/value if expectDash { if len(str) < 1 || str[0] != '-' { fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' argument after '%s'\n", str, last) 
os.Exit(1) } expectDash = false } else { if len(str) > 0 && str[0] == '-' { fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' command after '%s'\n", str, last) os.Exit(1) } expectDash = true } last = str switch status { case UNSET: status = ParseFlag(str) case POSITION: cmds.Position = str status = UNSET case MATCH, AVOID: elementColonValue = true fallthrough case IF, UNLESS, AND, OR: op = &Operation{Type: status, Value: str} cond = append(cond, op) parseStep(op, elementColonValue) status = UNSET case EQUALS, CONTAINS, STARTSWITH, ENDSWITH, ISNOT: if op != nil { if len(str) > 1 && str[0] == '\\' { // first character may be backslash protecting dash (undocumented) str = str[1:] } tsk := &Step{Type: status, Value: str} op.Stages = append(op.Stages, tsk) op = nil } else { fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match constraints\n") os.Exit(1) } status = UNSET case GT, GE, LT, LE, EQ, NE: if op != nil { if len(str) > 1 && str[0] == '\\' { // first character may be backslash protecting minus sign (undocumented) str = str[1:] } if len(str) < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Empty numeric match constraints\n") os.Exit(1) } ch := str[0] if (ch >= '0' && ch <= '9') || ch == '-' || ch == '+' { // literal numeric constant tsk := &Step{Type: status, Value: str} op.Stages = append(op.Stages, tsk) } else if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') { // numeric test allows element as second argument prnt, match := SplitInTwoAt(str, "/", RIGHT) match, attrib := SplitInTwoAt(match, "@", LEFT) wildcard := false if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") { wildcard = true } tsk := &Step{Type: status, Value: str, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard} op.Stages = append(op.Stages, tsk) } else { fmt.Fprintf(os.Stderr, "\nERROR: Unexpected numeric match constraints\n") os.Exit(1) } op = nil } else { fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent numeric match 
constraints\n") os.Exit(1) } status = UNSET case UNRECOGNIZED: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str) os.Exit(1) default: fmt.Fprintf(os.Stderr, "\nERROR: Unexpected argument '%s'\n", str) os.Exit(1) } } return cond } parseExtractions := func(cmds *Block, arguments []string) []*Operation { max := len(arguments) if max < 1 { return nil } // check for missing -element (or -first, etc.) command txt := arguments[0] if len(txt) < 1 || txt[0] != '-' { fmt.Fprintf(os.Stderr, "\nERROR: Missing -element command before '%s'\n", txt) os.Exit(1) } // check for missing argument after last -element (or -first, etc.) command txt = arguments[max-1] if len(txt) > 0 && txt[0] == '-' { if txt == "-rst" { fmt.Fprintf(os.Stderr, "\nERROR: Unexpected position for %s command\n", txt) os.Exit(1) } else if txt != "-clr" { fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt) os.Exit(1) } } comm := make([]*Operation, 0, max) status := UNSET // function to parse next argument nextStatus := func(str string) OpType { status = ParseFlag(str) switch status { case VARIABLE: op := &Operation{Type: status, Value: str[1:]} comm = append(comm, op) status = VALUE case CLR, RST: op := &Operation{Type: status, Value: ""} comm = append(comm, op) status = UNSET case ELEMENT, FIRST, LAST, ENCODE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSC: case TAB, RET, PFX, SFX, SEP, LBL, PFC: case UNSET: fmt.Fprintf(os.Stderr, "\nERROR: No -element before '%s'\n", str) os.Exit(1) case UNRECOGNIZED: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str) os.Exit(1) default: fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str) os.Exit(1) } return status } // function to parse extraction clause into individual steps parseSteps := func(op *Operation, pttrn string) { if op == nil { return } stat := op.Type str := op.Value // element names combined with commas are treated as a prefix-separator-suffix group comma := 
strings.Split(str, ",") for _, item := range comma { status := stat // check for special character at beginning of name if len(item) > 1 { switch item[0] { case '&': if IsAllCapsOrDigits(item[1:]) { status = VARIABLE item = item[1:] } else { fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", item) os.Exit(1) } case '#': status = COUNT item = item[1:] case '%': status = LENGTH item = item[1:] case '^': status = DEPTH item = item[1:] case '*': for _, ch := range item { if ch != '*' { break } } status = STAR default: } } else if item == "*" { status = STAR } else if item == "+" { status = INDEX } // parse parent/element@attribute construct // colon indicates a namespace prefix in any or all of the components prnt, match := SplitInTwoAt(item, "/", RIGHT) match, attrib := SplitInTwoAt(match, "@", LEFT) // leading colon indicates namespace prefix wildcard wildcard := false if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") { wildcard = true } // sequence coordinate adjustments switch status { case ZEROBASED, ONEBASED, UCSC: seq := pttrn + ":" if attrib != "" { seq += "@" seq += attrib } else if match != "" { seq += match } // confirm -0-based or -1-based arguments are known sequence position elements or attributes seqtype, ok := sequenceTypeIs[seq] if !ok { fmt.Fprintf(os.Stderr, "\nERROR: Element '%s' is not suitable for sequence coordinate conversion\n", item) os.Exit(1) } switch status { case ZEROBASED: status = ELEMENT // if 1-based coordinates, decrement to get 0-based value if seqtype.Based == 1 { status = DEC } case ONEBASED: status = ELEMENT // if 0-based coordinates, increment to get 1-based value if seqtype.Based == 0 { status = INC } case UCSC: status = ELEMENT // half-open intervals, start is 0-based, stop is 1-based if seqtype.Based == 0 && seqtype.Which == ISSTOP { status = INC } else if seqtype.Based == 1 && seqtype.Which == ISSTART { status = DEC } default: status = ELEMENT } default: } tsk := 
&Step{Type: status, Value: item, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard} op.Stages = append(op.Stages, tsk) } } idx := 0 // parse command strings into operation structure for idx < max { str := arguments[idx] idx++ if argTypeIs[str] == CONDITIONAL { fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str) os.Exit(1) } switch status { case UNSET: status = nextStatus(str) case ELEMENT, FIRST, LAST, ENCODE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSC: for !strings.HasPrefix(str, "-") { // create one operation per argument, even if under a single -element statement op := &Operation{Type: status, Value: str} comm = append(comm, op) parseSteps(op, pttrn) if idx >= max { break } str = arguments[idx] idx++ } status = UNSET if idx < max { status = nextStatus(str) } case TAB, RET, PFX, SFX, SEP, LBL, PFC: op := &Operation{Type: status, Value: ConvertSlash(str)} comm = append(comm, op) status = UNSET case VARIABLE: op := &Operation{Type: status, Value: str[1:]} comm = append(comm, op) status = VALUE case VALUE: op := &Operation{Type: status, Value: str} comm = append(comm, op) parseSteps(op, pttrn) status = UNSET case UNRECOGNIZED: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str) os.Exit(1) default: } } return comm } // parseOperations recursive definition var parseOperations func(parent *Block) // parseOperations converts parsed arguments to operations lists parseOperations = func(parent *Block) { args := parent.Parsed partition := 0 for cur, str := range args { // record junction between conditional and extraction commands partition = cur + 1 // skip if not a command if len(str) < 1 || str[0] != '-' { continue } if argTypeIs[str] != CONDITIONAL { partition = cur break } } // split arguments into conditional tests and extraction or customization commands conditionals := args[0:partition] args = args[partition:] partition = 0 foundElse := false for cur, str := range args { // record junction at 
-else command partition = cur + 1 // skip if not a command if len(str) < 1 || str[0] != '-' { continue } if str == "-else" { partition = cur foundElse = true break } } extractions := args[0:partition] alternative := args[partition:] if len(alternative) > 0 && alternative[0] == "-else" { alternative = alternative[1:] } // validate argument structure and convert to operations lists parent.Conditions = parseConditionals(parent, conditionals) parent.Commands = parseExtractions(parent, extractions) parent.Failure = parseExtractions(parent, alternative) // reality checks on placement of -else command if foundElse { if len(conditionals) < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n") os.Exit(1) } if len(alternative) < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n") os.Exit(1) } if len(parent.Subtasks) > 0 { fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n") os.Exit(1) } } for _, sub := range parent.Subtasks { parseOperations(sub) } } // ParseArguments head := &Block{} for _, txt := range args { head.Working = append(head.Working, txt) } // initial parsing of exploration command structure parseCommands(head, PATTERN) if len(head.Subtasks) != 1 { return nil } // skip past empty placeholder head = head.Subtasks[0] // convert command strings to array of operations for faster processing parseOperations(head) // check for no -element or multiple -pattern commands noElement := true numPatterns := 0 for _, txt := range args { if argTypeIs[txt] == EXTRACTION { noElement = false } if txt == "-pattern" || txt == "-Pattern" { numPatterns++ } } if numPatterns < 1 { fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n") os.Exit(1) } if numPatterns > 1 { fmt.Fprintf(os.Stderr, "\nERROR: Only one -pattern command is permitted\n") os.Exit(1) } if noElement { fmt.Fprintf(os.Stderr, "\nERROR: No -element statement in argument list\n") os.Exit(1) } return head } // READ XML INPUT FILE INTO SET OF BLOCKS type XMLReader struct { 
Reader io.Reader Buffer []byte Remainder string Closed bool Docompress bool Docleanup bool } func NewXMLReader(in io.Reader, doCompress, doCleanup bool) *XMLReader { if in == nil { return nil } rdr := &XMLReader{Reader: in, Docompress: doCompress, Docleanup: doCleanup} // 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin // increasing size of buffer when input is from a file does not improve program performance // additional 16384 bytes are reserved for copying previous remainder to start of buffer before next read const XMLBUFSIZE = 65536 + 16384 rdr.Buffer = make([]byte, XMLBUFSIZE) return rdr } // NextBlock reads buffer, concatenates if necessary to place long element content into a single string // all result strings end in > character that is used as a sentinel in subsequent code func (rdr *XMLReader) NextBlock() string { if rdr == nil { return "" } // read one buffer, trim at last > and retain remainder for next call, signal if no > character nextBuffer := func() (string, bool, bool) { if rdr.Closed { return "", false, true } // prepend previous remainder to beginning of buffer m := copy(rdr.Buffer, rdr.Remainder) rdr.Remainder = "" if m > 16384 { // previous remainder is larger than reserved section, write and signal need to continue reading return string(rdr.Buffer[:]), true, false } // read next block, append behind copied remainder from previous read n, err := rdr.Reader.Read(rdr.Buffer[m:]) // with data piped through stdin, read function may not always return the same number of bytes each time if err != nil { // end of file rdr.Closed = true // do not send final remainder (not terminated by right angle bracket that is used as a sentinel) return "", false, true } // slice of actual characters read bufr := rdr.Buffer[:n+m] // look for last > character pos := -1 for pos = len(bufr) - 1; pos >= 0; pos-- { if bufr[pos] == '>' { break } } // trim back to last > character, save remainder for next buffer 
if pos > -1 { pos++ rdr.Remainder = string(bufr[pos:]) return string(bufr[:pos]), false, false } // no > found, signal need to continue reading long content return string(bufr[:]), true, false } // read next buffer line, cont, closed := nextBuffer() if closed { // no sentinel in remainder at end of file return "" } // if buffer does not end with > character if cont { var buff bytes.Buffer // keep reading long content blocks for { if line != "" { buff.WriteString(line) } if !cont { // last buffer ended with sentinel break } line, cont, closed = nextBuffer() if closed { // no sentinel in multi-block buffer at end of file return "" } } // concatenate blocks line = buff.String() } // trimming spaces here would throw off line tracking // optionally compress/cleanup tags/attributes and contents if rdr.Docompress { line = CompressRunsOfSpaces(line) } if rdr.Docleanup { if HasBadSpace(line) { line = CleanupBadSpaces(line) } } return line } // PARSE XML BLOCK STREAM INTO STRINGS FROM TO // PartitionPattern splits XML input by pattern and sends individual records to a callback func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string)) { if pat == "" || rdr == nil || proc == nil { return } type Scanner struct { Pattern string PatLength int CharSkip [256]int } // function to initialize to scanner newScanner := func(pattern string) *Scanner { if pattern == "" { return nil } scr := &Scanner{Pattern: pattern} patlen := len(pattern) scr.PatLength = patlen // position of last character in pattern last := patlen - 1 // initialize bad character displacement table for i := range scr.CharSkip { scr.CharSkip[i] = patlen } for i := 0; i < last; i++ { ch := pattern[i] scr.CharSkip[ch] = last - i } return scr } // function check surroundings of match candidate isAnElement := func(text string, lf, rt, mx int) bool { if (lf >= 0 && text[lf] == '<') || (lf > 0 && text[lf] == '/' && text[lf-1] == '<') { if (rt < mx && (text[rt] == '>' || text[rt] == ' ')) || (rt+1 < mx && 
text[rt] == '/' && text[rt+1] == '>') { return true } } return false } // modified Boyer-Moore-Horspool search function findNextMatch := func(scr *Scanner, text string, offset int) (int, int, int) { if scr == nil || text == "" { return -1, -1, -1 } // copy values into local variables for speed txtlen := len(text) pattern := scr.Pattern[:] patlen := scr.PatLength max := txtlen - patlen last := patlen - 1 skip := scr.CharSkip[:] i := offset for i <= max { j := last k := i + last for j >= 0 && text[k] == pattern[j] { j-- k-- } // require match candidate to be element name, i.e., , , or if j < 0 && isAnElement(text, i-1, i+patlen, txtlen) { // find positions of flanking brackets lf := i - 1 for lf > 0 && text[lf] != '<' { lf-- } rt := i + patlen for rt < txtlen && text[rt] != '>' { rt++ } return i + 1, lf, rt + 1 } // find character in text above last character in pattern ch := text[i+last] // displacement table can shift pattern by one or more positions i += skip[ch] } return -1, -1, -1 } type PatternType int const ( NOPATTERN PatternType = iota STARTPATTERN SELFPATTERN STOPPATTERN ) // function to find next element with pattern name nextPattern := func(scr *Scanner, text string, pos int) (PatternType, int, int) { if scr == nil || text == "" { return NOPATTERN, 0, 0 } prev := pos for { next, start, stop := findNextMatch(scr, text, prev) if next < 0 { return NOPATTERN, 0, 0 } prev = next + 1 if text[start+1] == '/' { return STOPPATTERN, stop, prev } else if text[stop-2] == '/' { return SELFPATTERN, start, prev } else { return STARTPATTERN, start, prev } } } // -pattern Object construct doNormal := func() { // current depth of -pattern objects level := 0 begin := 0 inPattern := false line := "" var accumulator bytes.Buffer match := NOPATTERN pos := 0 next := 0 rec := 0 scr := newScanner(pat) if scr == nil { return } for { begin = 0 next = 0 line = rdr.NextBlock() if line == "" { return } for { match, pos, next = nextPattern(scr, line, next) if match == STARTPATTERN { if 
level == 0 { inPattern = true begin = pos } level++ } else if match == STOPPATTERN { level-- if level == 0 { inPattern = false accumulator.WriteString(line[begin:pos]) // read and process one -pattern object at a time str := accumulator.String() if str != "" { rec++ proc(rec, str[:]) } // reset accumulator accumulator.Reset() } } else if match == SELFPATTERN { } else { if inPattern { accumulator.WriteString(line[begin:]) } break } } } } // -pattern Parent/* construct now works with catenated files, but not if components // are recursive or self-closing objects, process those through xtract -format first doStar := func() { // current depth of -pattern objects level := 0 begin := 0 inPattern := false line := "" var accumulator bytes.Buffer match := NOPATTERN pos := 0 next := 0 rec := 0 scr := newScanner(pat) if scr == nil { return } last := pat // read to first element for { next = 0 line = rdr.NextBlock() if line == "" { break } match, pos, next = nextPattern(scr, line, next) if match == STARTPATTERN { break } } if match != STARTPATTERN { return } // function to find next element in XML nextElement := func(text string, pos int) string { txtlen := len(text) tag := "" for i := pos; i < txtlen; i++ { if text[i] == '<' { tag = text[i+1:] break } } if tag == "" { return "" } if tag[0] == '/' { if strings.HasPrefix(tag[1:], pat) { //should be at end, want to continue if catenated files return "/" } return "" } for i, ch := range tag { if ch == '>' || ch == ' ' || ch == '/' { return tag[0:i] } } return "" } // read and process heterogeneous objects immediately below parent for { tag := nextElement(line, next) if tag == "" { begin = 0 next = 0 line = rdr.NextBlock() if line == "" { break } tag = nextElement(line, next) } if tag == "" { return } // check for catenated parent set files if tag[0] == '/' { scr = newScanner(pat) if scr == nil { return } last = pat // confirm end just found match, pos, next = nextPattern(scr, line, next) if match != STOPPATTERN { return } // now 
look for a new start tag for { match, pos, next = nextPattern(scr, line, next) if match == STARTPATTERN { break } next = 0 line = rdr.NextBlock() if line == "" { break } } if match != STARTPATTERN { return } // continue with processing loop continue } if tag != last { scr = newScanner(tag) if scr == nil { return } last = tag } for { match, pos, next = nextPattern(scr, line, next) if match == STARTPATTERN { if level == 0 { inPattern = true begin = pos } level++ } else if match == STOPPATTERN { level-- if level == 0 { inPattern = false accumulator.WriteString(line[begin:pos]) // read and process one -pattern/* object at a time str := accumulator.String() if str != "" { rec++ proc(rec, str[:]) } // reset accumulator accumulator.Reset() break } } else { if inPattern { accumulator.WriteString(line[begin:]) } begin = 0 next = 0 line = rdr.NextBlock() if line == "" { break } } } } } // call appropriate handler if star == "" { doNormal() } else if star == "*" { doStar() } } // XML VALIDATION AND FORMATTING FUNCTIONS // ProcessXMLStream tokenizes and runs designated operations on an entire XML file func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action SpecialType) (int, int) { if in == nil || tbls == nil { return 0, 0 } blockCount := 0 // token parser variables Text := "" Txtlen := 0 Idx := 0 Line := 1 // variables to track comments or CDATA sections that span reader blocks Which := NOTAG SkipTo := "" nextToken := func(idx int) (TagType, string, string, int, int) { if Text == "" { // if buffer is empty, read next block Text = in.NextBlock() Txtlen = len(Text) Idx = 0 idx = 0 blockCount++ } if Text == "" { return ISCLOSED, "", "", Line, 0 } // lookup table array pointers inBlank := &tbls.AltBlank inFirst := &tbls.InFirst inElement := &tbls.InElement text := Text[:] txtlen := Txtlen line := Line if Which != NOTAG && SkipTo != "" { which := Which // previous block ended inside CDATA object or comment start := idx found := strings.Index(text[:], SkipTo) if 
found < 0 { // no stop signal found in next block // count lines for i := 0; i < txtlen; i++ { if text[i] == '\n' { line++ } } Line = line str := text[:] if HasFlankingSpace(str) { str = strings.TrimSpace(str) } // signal end of current block Text = "" // leave Which and SkipTo values unchanged as another continuation signal // send CDATA or comment contents return which, str[:], "", Line, 0 } // otherwise adjust position past end of skipTo string and return to normal processing idx += found // count lines for i := 0; i < idx; i++ { if text[i] == '\n' { line++ } } Line = line str := text[start:idx] if HasFlankingSpace(str) { str = strings.TrimSpace(str) } idx += len(SkipTo) // clear tracking variables Which = NOTAG SkipTo = "" // send CDATA or comment contents return which, str[:], "", Line, idx } // all blocks end with > character, acts as sentinel to check if past end of text if idx >= txtlen { // signal end of current block, will read next block on next call Text = "" Line = line return NOTAG, "", "", Line, 0 } // skip past leading blanks ch := text[idx] for { for inBlank[ch] { idx++ ch = text[idx] } if ch != '\n' { break } line++ idx++ ch = text[idx] } Line = line start := idx if ch == '<' { // at start of element idx++ ch = text[idx] // check for legal first character of element if inFirst[ch] { // read element name start = idx idx++ ch = text[idx] for inElement[ch] { idx++ ch = text[idx] } str := text[start:idx] switch ch { case '>': // end of element idx++ return STARTTAG, str[:], "", Line, idx case '/': // self-closing element without attributes idx++ ch = text[idx] if ch != '>' { fmt.Fprintf(os.Stderr, "\nSelf-closing element missing right angle bracket\n") } idx++ return SELFTAG, str[:], "", Line, idx case '\n': line++ fallthrough case ' ', '\t', '\r', '\f': // attributes idx++ start = idx ch = text[idx] for { for ch != '<' && ch != '>' && ch != '\n' { idx++ ch = text[idx] } if ch != '\n' { break } line++ idx++ ch = text[idx] } Line = line if ch != '>' { 
fmt.Fprintf(os.Stderr, "\nAttributes not followed by right angle bracket\n") } if text[idx-1] == '/' { // self-closing atr := text[start : idx-1] idx++ return SELFTAG, str[:], atr[:], Line, idx } atr := text[start:idx] idx++ return STARTTAG, str[:], atr[:], Line, idx default: fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n") return STARTTAG, str[:], "", Line, idx } } else { // punctuation character immediately after first angle bracket switch ch { case '/': // at start of end tag idx++ start = idx ch = text[idx] // expect legal first character of element if inFirst[ch] { idx++ ch = text[idx] for inElement[ch] { idx++ ch = text[idx] } str := text[start:idx] if ch != '>' { fmt.Fprintf(os.Stderr, "\nUnexpected characters after end element name\n") } idx++ return STOPTAG, str[:], "", Line, idx } fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n") case '?': // skip ?xml and ?processing instructions idx++ ch = text[idx] for ch != '>' { idx++ ch = text[idx] } idx++ return NOTAG, "", "", Line, idx case '!': // skip !DOCTYPE, !comment, and ![CDATA[ idx++ start = idx ch = text[idx] Which := NOTAG SkipTo := "" if ch == '[' && strings.HasPrefix(text[idx:], "[CDATA[") { Which = CDATATAG SkipTo = "]]>" start += 7 } else if ch == '-' && strings.HasPrefix(text[idx:], "--") { Which = COMMENTTAG SkipTo = "-->" start += 2 } if Which != NOTAG && SkipTo != "" { which := Which // CDATA or comment block may contain internal angle brackets found := strings.Index(text[idx:], SkipTo) if found < 0 { // string stops in middle of CDATA or comment // count lines for i := start; i < txtlen; i++ { if text[i] == '\n' { line++ } } Line = line str := text[start:] if HasFlankingSpace(str) { str = strings.TrimSpace(str) } // signal end of current block Text = "" // leave Which and SkipTo values unchanged as another continuation signal // send CDATA or comment contents return which, str[:], "", Line, 0 } // adjust position past end of CDATA or comment idx += found // 
count lines for i := start; i < idx; i++ { if text[i] == '\n' { line++ } } Line = line str := text[start:idx] if HasFlankingSpace(str) { str = strings.TrimSpace(str) } idx += len(SkipTo) // clear tracking variables Which = NOTAG SkipTo = "" // send CDATA or comment contents return which, str[:], "", Line, idx } // otherwise just skip to next right angle bracket for ch != '>' { if ch == '\n' { line++ } idx++ ch = text[idx] } Line = line idx++ return NOTAG, "", "", Line, idx default: fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n") } } } else if ch != '>' { // at start of contents start = idx // find end of contents for { for ch != '<' && ch != '>' && ch != '\n' { idx++ ch = text[idx] } if ch != '\n' { break } line++ idx++ ch = text[idx] } Line = line // trim back past trailing blanks lst := idx - 1 ch = text[lst] for inBlank[ch] && lst > start { lst-- ch = text[lst] } str := text[start : lst+1] return CONTENTTAG, str[:], "", Line, idx } // signal end of current block, will read next block on next call Text = "" Line = line return NOTAG, "", "", Line, 0 } // common output buffer var buffer bytes.Buffer count := 0 // processOutline displays outline of XML structure processOutline := func() { indent := 0 for { tag, name, _, _, idx := nextToken(Idx) Idx = idx switch tag { case STARTTAG: if name == "eSummaryResult" || name == "eLinkResult" || name == "eInfoResult" || name == "PubmedArticleSet" || name == "DocumentSummarySet" || name == "INSDSet" || name == "Entrezgene-Set" || name == "TaxaSet" { break } for i := 0; i < indent; i++ { buffer.WriteString(" ") } buffer.WriteString(name) buffer.WriteString("\n") indent++ case SELFTAG: for i := 0; i < indent; i++ { buffer.WriteString(" ") } buffer.WriteString(name) buffer.WriteString("\n") case STOPTAG: indent-- case NOTAG: case ISCLOSED: txt := buffer.String() if txt != "" { // print final buffer fmt.Fprintf(os.Stdout, "%s", txt) } return default: } count++ if count > 1000 { count = 0 txt := buffer.String() 
if txt != "" { // print current buffered output fmt.Fprintf(os.Stdout, "%s", txt) } buffer.Reset() } } } // processSynopsis displays paths to XML elements processSynopsis := func() { // synopsisLevel recursive definition var synopsisLevel func(string) bool synopsisLevel = func(parent string) bool { for { tag, name, _, _, idx := nextToken(Idx) Idx = idx switch tag { case STARTTAG: if name == "eSummaryResult" || name == "eLinkResult" || name == "eInfoResult" || name == "PubmedArticleSet" || name == "DocumentSummarySet" || name == "INSDSet" || name == "Entrezgene-Set" || name == "TaxaSet" { break } if parent != "" { buffer.WriteString(parent) buffer.WriteString("/") } buffer.WriteString(name) buffer.WriteString("\n") path := parent if path != "" { path += "/" } path += name if synopsisLevel(path) { return true } case SELFTAG: if parent != "" { buffer.WriteString(parent) buffer.WriteString("/") } buffer.WriteString(name) buffer.WriteString("\n") case STOPTAG: // break recursion return false case NOTAG: case ISCLOSED: txt := buffer.String() if txt != "" { // print final buffer fmt.Fprintf(os.Stdout, "%s", txt) } return true default: } count++ if count > 1000 { count = 0 txt := buffer.String() if txt != "" { // print current buffered output fmt.Fprintf(os.Stdout, "%s", txt) } buffer.Reset() } } } for { // may have concatenated XMLs, loop through all if synopsisLevel("") { return } } } // processVerify checks for well-formed XML processVerify := func() { type VerifyType int const ( _ VerifyType = iota START STOP CHAR OTHER ) // skip past command name args = args[1:] pttrn := "" if len(args) > 0 { pttrn = args[0] args = args[1:] } // if pattern supplied, report maximum nesting depth and record spanning the most blocks maxDepth := 0 depthLine := 0 maxBlocks := 0 blockLine := 0 startLine := 0 // verifyLevel recursive definition var verifyLevel func(string, int) // verify integrity of XML object nesting (well-formed) verifyLevel = func(parent string, level int) { status := 
START for { // use alternative low-level tokenizer tag, name, _, line, idx := nextToken(Idx) Idx = idx if level > maxDepth { maxDepth = level depthLine = line } switch tag { case STARTTAG: if status == CHAR { fmt.Fprintf(os.Stdout, "<%s> not expected after contents, line %d\n", name, line) } if name == pttrn { blockCount = 1 startLine = line } verifyLevel(name, level+1) // returns here after recursion status = STOP case SELFTAG: status = OTHER case STOPTAG: if name == pttrn { if blockCount > maxBlocks { maxBlocks = blockCount blockLine = startLine } } if parent != name && parent != "" { fmt.Fprintf(os.Stdout, "Expected , found , line %d\n", parent, name, line) } if level < 1 { fmt.Fprintf(os.Stdout, "Unexpected at end of XML, line %d\n", name, line) } // break recursion return case CONTENTTAG: if status != START { fmt.Fprintf(os.Stdout, "Contents not expected before , line %d - status %d\n", parent, line, status) } status = CHAR case CDATATAG, COMMENTTAG: status = OTHER case NOTAG: case ISCLOSED: if level > 0 { fmt.Fprintf(os.Stdout, "Unexpected end of data\n") } return default: status = OTHER } } } verifyLevel("", 0) if pttrn != "" { fmt.Fprintf(os.Stdout, "Maximum nesting (%d levels) at line %d\n", maxDepth, depthLine) fmt.Fprintf(os.Stdout, "Longest pattern (%d blocks) at line %d\n", maxBlocks, blockLine) } } // processFilter modifies XML content, comments, or CDATA processFilter := func() { // skip past command name args = args[1:] max := len(args) if max < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -filter\n") os.Exit(1) } pttrn := args[0] args = args[1:] max-- if max < 2 { fmt.Fprintf(os.Stderr, "\nERROR: No object name supplied to xtract -filter\n") os.Exit(1) } type ActionType int const ( NOACTION ActionType = iota DORETAIN DOREMOVE DOENCODE DODECODE DOSHRINK ) action := args[0] what := NOACTION switch action { case "retain": what = DORETAIN case "remove": what = DOREMOVE case "encode": what = DOENCODE case 
"decode": what = DODECODE case "shrink": what = DOSHRINK default: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized action '%s' supplied to xtract -filter\n", action) os.Exit(1) } trget := args[1] which := NOTAG switch trget { case "attribute", "attributes": which = ATTRIBTAG case "content", "contents": which = CONTENTTAG case "cdata", "CDATA": which = CDATATAG case "comment", "comments": which = COMMENTTAG case "object": // object normally retained which = OBJECTTAG default: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized target '%s' supplied to xtract -filter\n", trget) os.Exit(1) } inPattern := false for { tag, name, attr, _, idx := nextToken(Idx) Idx = idx switch tag { case STARTTAG: if name == pttrn { inPattern = true } if inPattern && which == OBJECTTAG && what == DOREMOVE { continue } buffer.WriteString("<") buffer.WriteString(name) if attr != "" { if which != ATTRIBTAG || what != DOREMOVE { attr = strings.TrimSpace(attr) attr = CompressRunsOfSpaces(attr) buffer.WriteString(" ") buffer.WriteString(attr) } } buffer.WriteString(">\n") case SELFTAG: if inPattern && which == OBJECTTAG && what == DOREMOVE { continue } buffer.WriteString("<") buffer.WriteString(name) if attr != "" { if which != ATTRIBTAG || what != DOREMOVE { attr = strings.TrimSpace(attr) attr = CompressRunsOfSpaces(attr) buffer.WriteString(" ") buffer.WriteString(attr) } } buffer.WriteString("/>\n") case STOPTAG: if name == pttrn { inPattern = false if which == OBJECTTAG && what == DOREMOVE { continue } } if inPattern && which == OBJECTTAG && what == DOREMOVE { continue } buffer.WriteString("\n") case CONTENTTAG: if inPattern && which == OBJECTTAG && what == DOREMOVE { continue } if inPattern && which == tag { switch what { case DORETAIN: // default behavior for content - can use -filter X retain content as a no-op case DOREMOVE: continue case DOENCODE: name = html.EscapeString(name) case DODECODE: name = html.UnescapeString(name) case DOSHRINK: name = CompressRunsOfSpaces(name) default: continue } } 
// content normally printed if HasFlankingSpace(name) { name = strings.TrimSpace(name) } buffer.WriteString(name) buffer.WriteString("\n") case CDATATAG, COMMENTTAG: if inPattern && which == OBJECTTAG && what == DOREMOVE { continue } if inPattern && which == tag { switch what { case DORETAIN: // cdata and comment require explicit retain command case DOREMOVE: continue case DOENCODE: name = html.EscapeString(name) case DODECODE: name = html.UnescapeString(name) case DOSHRINK: name = CompressRunsOfSpaces(name) default: continue } // cdata and comment normally removed if HasFlankingSpace(name) { name = strings.TrimSpace(name) } buffer.WriteString(name) buffer.WriteString("\n") } case NOTAG: case ISCLOSED: txt := buffer.String() if txt != "" { // print final buffer fmt.Fprintf(os.Stdout, "%s", txt) } return default: } count++ if count > 1000 { count = 0 txt := buffer.String() if txt != "" { // print current buffered output fmt.Fprintf(os.Stdout, "%s", txt) } buffer.Reset() } } } // processFormat reformats XML for ease of reading processFormat := func() { // skip past command name args = args[1:] compRecrd := false wrapAttrs := false ret := "\n" frst := true if len(args) > 0 { switch args[0] { case "compact", "compacted", "compress", "compressed", "terse", "*": // compress to one record per line compRecrd = true ret = "" case "expand", "expanded", "verbose", "@": // each attribute on its own line wrapAttrs = true case "indent", "indented", "normal": default: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -format command\n") os.Exit(1) } } type FormatType int const ( NOTSET FormatType = iota START STOP CHAR OTHER ) // array to speed up indentation indentSpaces := []string{ "", " ", " ", " ", " ", " ", " ", " ", " ", " ", } indent := 0 // parent used to detect first start tag, will place in doctype line parent := "" status := NOTSET // delay printing right bracket of start tag to support self-closing tag style needsRightBracket := "" // delay printing start 
tag if no attributes, suppress empty start-end pair if followed by end justStartName := "" justStartIndent := 0 // function to indent a specified number of spaces doIndent := func(indt int) { if compRecrd { return } i := indt for i > 9 { buffer.WriteString(" ") i -= 10 } if i < 0 { return } buffer.WriteString(indentSpaces[i]) } // function to handle delayed start tag doDelayedName := func() { if needsRightBracket != "" { buffer.WriteString(">") needsRightBracket = "" } if justStartName != "" { doIndent(justStartIndent) buffer.WriteString("<") buffer.WriteString(justStartName) buffer.WriteString(">") justStartName = "" } } closingTag := "" // function to print attributes printAttributes := func(attr string) { attr = strings.TrimSpace(attr) attr = CompressRunsOfSpaces(attr) if wrapAttrs { start := 0 idx := 0 attlen := len(attr) for idx < attlen { ch := attr[idx] if ch == '=' { str := attr[start:idx] buffer.WriteString("\n") doIndent(indent) buffer.WriteString(" ") buffer.WriteString(str) // skip past equal sign and leading double quote idx += 2 start = idx } else if ch == '"' { str := attr[start:idx] buffer.WriteString("=\"") buffer.WriteString(str) buffer.WriteString("\"") // skip past trailing double quote and (possible) space idx += 2 start = idx } else { idx++ } } buffer.WriteString("\n") doIndent(indent) } else { buffer.WriteString(" ") buffer.WriteString(attr) } } for { tag, name, attr, _, idx := nextToken(Idx) Idx = idx switch tag { case STARTTAG: doDelayedName() if status == START { buffer.WriteString(ret) } // remove internal copies of tags if parent != "" && name == parent && indent == 1 { continue } // detect first start tag, print xml and doctype parent if indent == 0 && parent == "" { parent = name buffer.WriteString("\n") buffer.WriteString("\n") // now filtering internal tags, so queue printing of closing tag closingTag = fmt.Sprintf("\n", parent) // already past test, so opening tag will print normally } // check for attributes if attr != "" { 
doIndent(indent) buffer.WriteString("<") buffer.WriteString(name) printAttributes(attr) needsRightBracket = name } else { justStartName = name justStartIndent = indent } if compRecrd && frst && indent == 0 { frst = false doDelayedName() buffer.WriteString("\n") } indent++ status = START case SELFTAG: doDelayedName() if status == START { buffer.WriteString(ret) } // suppress self-closing tag without attributes attributes if attr != "" { doIndent(indent) buffer.WriteString("<") buffer.WriteString(name) printAttributes(attr) buffer.WriteString("/>") buffer.WriteString(ret) } status = STOP case STOPTAG: // if end immediately follows start, turn into self-closing tag if there were attributes, otherwise suppress empty tag if needsRightBracket != "" { if status == START && name == needsRightBracket { // end immediately follows start, produce self-closing tag buffer.WriteString("/>") buffer.WriteString(ret) needsRightBracket = "" indent-- status = STOP break } buffer.WriteString(">") needsRightBracket = "" } if justStartName != "" { if status == START && name == justStartName { // end immediately follows delayed start with no attributes, suppress justStartName = "" indent-- status = STOP break } doIndent(justStartIndent) buffer.WriteString("<") buffer.WriteString(justStartName) buffer.WriteString(">") justStartName = "" } // remove internal copies of tags if parent != "" && name == parent && indent == 1 { continue } indent-- if status == CHAR { buffer.WriteString("") buffer.WriteString(ret) } else if status == START { buffer.WriteString("") buffer.WriteString(ret) } else { doIndent(indent) buffer.WriteString("") buffer.WriteString(ret) } status = STOP if compRecrd && indent == 1 { buffer.WriteString("\n") } case CONTENTTAG: doDelayedName() if len(name) > 0 && IsNotJustWhitespace(name) { if HasFlankingSpace(name) { name = strings.TrimSpace(name) } buffer.WriteString(name) status = CHAR } case CDATATAG, COMMENTTAG: // ignore case NOTAG: case ISCLOSED: doDelayedName() if 
closingTag != "" { buffer.WriteString(closingTag) } txt := buffer.String() if txt != "" { // print final buffer fmt.Fprintf(os.Stdout, "%s", txt) } return default: doDelayedName() status = OTHER } count++ if count > 1000 { count = 0 txt := buffer.String() if txt != "" { // print current buffered output fmt.Fprintf(os.Stdout, "%s", txt) } buffer.Reset() } } } // return variables set by performance test functions recordCount := 0 byteCount := 0 // processChunk reads a set of XML blocks processChunk := func() { for { str := in.NextBlock() if str == "" { break } recordCount++ byteCount += len(str) } } // processSplit partitions XML by pattern processSplit := func() { if len(args) > 1 { if args[1] == "-pattern" || args[1] == "-Pattern" { // skip past -split if followed by -pattern args = args[1:] } } if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -split command\n") os.Exit(1) } pat := args[1] PartitionPattern(pat, "", in, func(rec int, str string) { recordCount++ byteCount += len(str) }) } // processDrain partitions XML by pattern and sends them down a channel processDrain := func() { if len(args) > 1 { if args[1] == "-pattern" || args[1] == "-Pattern" { // skip past -drain if followed by -pattern args = args[1:] } } if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -drain command\n") os.Exit(1) } pat := args[1] chn := make(chan string, tbls.ChanDepth) if chn == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create output channel\n") os.Exit(1) } sendPatterns := func(pat string, out chan<- string) { defer close(out) PartitionPattern(pat, "", in, func(rec int, str string) { out <- str }) } go sendPatterns(pat, chn) for str := range chn { recordCount++ byteCount += len(str) } } // processToken tokenizes the XML block stream processToken := func() { for { tag, name, attr, _, idx := nextToken(Idx) Idx = idx if tag == ISCLOSED { break } recordCount++ byteCount += len(name) + len(attr) } } // ProcessXMLStream // call 
specific function switch action { case DOFORMAT: processFormat() case DOOUTLINE: processOutline() case DOSYNOPSIS: processSynopsis() case DOVERIFY: processVerify() case DOFILTER: processFilter() case DOCHUNK: processChunk() case DOSPLIT: processSplit() case DODRAIN: processDrain() case DOTOKEN: processToken() default: } return recordCount, byteCount } // INSDSEQ EXTRACTION COMMAND GENERATOR // e.g., xtract -insd complete mat_peptide "%peptide" product peptide // ProcessINSD generates extraction commands for GenBank/RefSeq records in INSDSet format func ProcessINSD(args []string, isPipe, addDash bool) []string { // legal GenBank / GenPept / RefSeq features features := []string{ "-10_signal", "-35_signal", "3'clip", "3'UTR", "5'clip", "5'UTR", "allele", "assembly_gap", "attenuator", "Bond", "C_region", "CAAT_signal", "CDS", "centromere", "conflict", "D_segment", "D-loop", "enhancer", "exon", "gap", "GC_signal", "gene", "iDNA", "intron", "J_segment", "LTR", "mat_peptide", "misc_binding", "misc_difference", "misc_feature", "misc_recomb", "misc_RNA", "misc_signal", "misc_structure", "mobile_element", "modified_base", "mRNA", "mutation", "N_region", "ncRNA", "old_sequence", "operon", "oriT", "polyA_signal", "polyA_site", "precursor_RNA", "prim_transcript", "primer_bind", "promoter", "propeptide", "protein_bind", "Protein", "RBS", "Region", "regulatory", "rep_origin", "repeat_region", "repeat_unit", "rRNA", "S_region", "satellite", "scRNA", "sig_peptide", "Site", "snoRNA", "snRNA", "source", "stem_loop", "STS", "TATA_signal", "telomere", "terminator", "tmRNA", "transit_peptide", "tRNA", "unsure", "V_region", "V_segment", "variation", } // legal GenBank / GenPept / RefSeq qualifiers qualifiers := []string{ "allele", "altitude", "anticodon", "artificial_location", "bio_material", "bond_type", "bound_moiety", "breed", "calculated_mol_wt", "cell_line", "cell_type", "chloroplast", "chromoplast", "chromosome", "citation", "clone_lib", "clone", "coded_by", "codon_start", 
"codon", "collected_by", "collection_date", "compare", "cons_splice", "country", "cultivar", "culture_collection", "cyanelle", "db_xref", "derived_from", "dev_stage", "direction", "EC_number", "ecotype", "encodes", "endogenous_virus", "environmental_sample", "estimated_length", "evidence", "exception", "experiment", "focus", "frequency", "function", "gap_type", "gdb_xref", "gene_synonym", "gene", "germline", "haplogroup", "haplotype", "host", "identified_by", "inference", "insertion_seq", "isolate", "isolation_source", "kinetoplast", "lab_host", "label", "lat_lon", "linkage_evidence", "locus_tag", "macronuclear", "map", "mating_type", "metagenome_source", "metagenomic", "mitochondrion", "mobile_element_type", "mobile_element", "mod_base", "mol_type", "name", "nat_host", "ncRNA_class", "non_functional", "note", "number", "old_locus_tag", "operon", "organelle", "organism", "partial", "PCR_conditions", "PCR_primers", "peptide", "phenotype", "plasmid", "pop_variant", "product", "protein_id", "proviral", "pseudo", "pseudogene", "rearranged", "recombination_class", "region_name", "regulatory_class", "replace", "ribosomal_slippage", "rpt_family", "rpt_type", "rpt_unit_range", "rpt_unit_seq", "rpt_unit", "satellite", "segment", "sequenced_mol", "serotype", "serovar", "sex", "site_type", "specific_host", "specimen_voucher", "standard_name", "strain", "structural_class", "sub_clone", "sub_species", "sub_strain", "tag_peptide", "tissue_lib", "tissue_type", "trans_splicing", "transcript_id", "transcription", "transgenic", "transl_except", "transl_table", "translation", "transposon", "type_material", "UniProtKB_evidence", "usedin", "variety", "virion", } // legal INSDSeq XML fields insdtags := []string{ "INSDAltSeqData_items", "INSDAltSeqData", "INSDAltSeqItem_first-accn", "INSDAltSeqItem_gap-comment", "INSDAltSeqItem_gap-length", "INSDAltSeqItem_gap-linkage", "INSDAltSeqItem_gap-type", "INSDAltSeqItem_interval", "INSDAltSeqItem_isgap", "INSDAltSeqItem_last-accn", 
"INSDAltSeqItem_value", "INSDAltSeqItem", "INSDAuthor", "INSDComment_paragraphs", "INSDComment_type", "INSDComment", "INSDCommentParagraph", "INSDFeature_intervals", "INSDFeature_key", "INSDFeature_location", "INSDFeature_operator", "INSDFeature_partial3", "INSDFeature_partial5", "INSDFeature_quals", "INSDFeature_xrefs", "INSDFeature", "INSDFeatureSet_annot-source", "INSDFeatureSet_features", "INSDFeatureSet", "INSDInterval_accession", "INSDInterval_from", "INSDInterval_interbp", "INSDInterval_iscomp", "INSDInterval_point", "INSDInterval_to", "INSDInterval", "INSDKeyword", "INSDQualifier_name", "INSDQualifier_value", "INSDQualifier", "INSDReference_authors", "INSDReference_consortium", "INSDReference_journal", "INSDReference_position", "INSDReference_pubmed", "INSDReference_reference", "INSDReference_remark", "INSDReference_title", "INSDReference_xref", "INSDReference", "INSDSecondary-accn", "INSDSeq_accession-version", "INSDSeq_alt-seq", "INSDSeq_comment-set", "INSDSeq_comment", "INSDSeq_contig", "INSDSeq_create-date", "INSDSeq_create-release", "INSDSeq_database-reference", "INSDSeq_definition", "INSDSeq_division", "INSDSeq_entry-version", "INSDSeq_feature-set", "INSDSeq_feature-table", "INSDSeq_keywords", "INSDSeq_length", "INSDSeq_locus", "INSDSeq_moltype", "INSDSeq_organism", "INSDSeq_other-seqids", "INSDSeq_primary-accession", "INSDSeq_primary", "INSDSeq_project", "INSDSeq_references", "INSDSeq_secondary-accessions", "INSDSeq_segment", "INSDSeq_sequence", "INSDSeq_source-db", "INSDSeq_source", "INSDSeq_strandedness", "INSDSeq_struc-comments", "INSDSeq_taxonomy", "INSDSeq_topology", "INSDSeq_update-date", "INSDSeq_update-release", "INSDSeq_xrefs", "INSDSeq", "INSDSeqid", "INSDSet", "INSDStrucComment_items", "INSDStrucComment_name", "INSDStrucComment", "INSDStrucCommentItem_tag", "INSDStrucCommentItem_url", "INSDStrucCommentItem_value", "INSDStrucCommentItem", "INSDXref_dbname", "INSDXref_id", "INSDXref", } checkAgainstVocabulary := func(str, objtype string, 
arry []string) { if str == "" || arry == nil { return } // skip past pound, percent, or caret character at beginning of string if len(str) > 1 { switch str[0] { case '#', '%', '^': str = str[1:] default: } } for _, txt := range arry { if str == txt { return } if strings.ToUpper(str) == strings.ToUpper(txt) { fmt.Fprintf(os.Stderr, "\nERROR: Incorrect capitalization of '%s' %s, change to '%s'\n", str, objtype, txt) os.Exit(1) } } fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", str, objtype) os.Exit(1) } var acc []string max := len(args) if max < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -insd\n") os.Exit(1) } acc = append(acc, "-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version") printAccn := true // collect descriptors if strings.HasPrefix(args[0], "INSD") { if isPipe { acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN") acc = append(acc, "-group", "INSDSeq", "-sep", "|", "-element") } else { acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"") acc = append(acc, "-group", "INSDSeq", "-sep", "\"|\"", "-element") } printAccn = false for { if len(args) < 1 { return acc } str := args[0] if !strings.HasPrefix(args[0], "INSD") { break } checkAgainstVocabulary(str, "element", insdtags) acc = append(acc, str) args = args[1:] } } else if strings.HasPrefix(strings.ToUpper(args[0]), "INSD") { // report capitalization or vocabulary failure checkAgainstVocabulary(args[0], "element", insdtags) // program should not get to this point, but warn and exit anyway fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", args[0], "element") os.Exit(1) } // collect qualifiers partial := false complete := false if args[0] == "+" || args[0] == "complete" { complete = true args = args[1:] max-- } else if args[0] == "-" || args[0] == "partial" { partial = true args = args[1:] max-- } if max < 1 { fmt.Fprintf(os.Stderr, "\nERROR: No feature key supplied to xtract -insd\n") 
os.Exit(1) } acc = append(acc, "-group", "INSDFeature") // limit to designated features feature := args[0] fcmd := "-if" // can specify multiple features separated by plus sign (e.g., CDS+mRNA) or comma (e.g., CDS,mRNA) plus := strings.Split(feature, "+") for _, pls := range plus { comma := strings.Split(pls, ",") for _, cma := range comma { checkAgainstVocabulary(cma, "feature", features) acc = append(acc, fcmd, "INSDFeature_key", "-equals", cma) fcmd = "-or" } } if max < 2 { // still need at least one qualifier even on legal feature fmt.Fprintf(os.Stderr, "\nERROR: Feature '%s' must be followed by at least one qualifier\n", feature) os.Exit(1) } args = args[1:] if complete { acc = append(acc, "-unless", "INSDFeature_partial5", "-or", "INSDFeature_partial3") } else if partial { acc = append(acc, "-if", "INSDFeature_partial5", "-or", "INSDFeature_partial3") } if printAccn { if isPipe { acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN") } else { acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"") } } for _, str := range args { if strings.HasPrefix(str, "INSD") { checkAgainstVocabulary(str, "element", insdtags) if isPipe { acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element") } else { acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element") } acc = append(acc, str) } else if strings.HasPrefix(str, "#INSD") { checkAgainstVocabulary(str, "element", insdtags) if isPipe { acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element") acc = append(acc, str) } else { acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element") ql := fmt.Sprintf("\"%s\"", str) acc = append(acc, ql) } } else if strings.HasPrefix(strings.ToUpper(str), "#INSD") || strings.HasPrefix(strings.ToUpper(str), "#INSD") { // report capitalization or vocabulary failure checkAgainstVocabulary(str, "element", insdtags) } else { acc = append(acc, "-block", "INSDQualifier") checkAgainstVocabulary(str, "qualifier", qualifiers) if 
len(str) > 2 && str[0] == '%' { acc = append(acc, "-if", "INSDQualifier_name", "-equals", str[1:]) if isPipe { acc = append(acc, "-element", "%INSDQualifier_value") } else { acc = append(acc, "-element", "\"%INSDQualifier_value\"") } if addDash { acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str[1:]) if isPipe { acc = append(acc, "-lbl", "\\-") } else { acc = append(acc, "-lbl", "\"\\-\"") } } } else { acc = append(acc, "-if", "INSDQualifier_name", "-equals", str) acc = append(acc, "-element", "INSDQualifier_value") if addDash { acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str) if isPipe { acc = append(acc, "-lbl", "\\-") } else { acc = append(acc, "-lbl", "\"\\-\"") } } } } } return acc } // HYDRA CITATION MATCHER COMMAND GENERATOR // ProcessHydra generates extraction commands for NCBI's in-house citation matcher (undocumented) func ProcessHydra(isPipe bool) []string { var acc []string // acceptable scores are 0.8 or higher, exact match on "1" rejects low value in scientific notation with minus sign present acc = append(acc, "-pattern", "Id") acc = append(acc, "-if", "@score", "-equals", "1") acc = append(acc, "-or", "@score", "-starts-with", "0.9") acc = append(acc, "-or", "@score", "-starts-with", "0.8") acc = append(acc, "-element", "Id") return acc } // COLLECT AND FORMAT REQUESTED XML VALUES // ExploreElements returns matching element values to callback func ExploreElements(curr *Node, mask, prnt, match, attrib string, wildcard bool, level int, proc func(string, int)) { if curr == nil || proc == nil { return } // **/Object performs deep exploration of recursive data (*/Object also supported) deep := false if prnt == "**" || prnt == "*" { prnt = "" deep = true } // exploreElements recursive definition var exploreElements func(curr *Node, skip string, lev int) exploreElements = func(curr *Node, skip string, lev int) { if !deep && curr.Name == skip { // do not explore within 
// recursive object
			return
		}

		// parseAttributes is only run if attribute values are requested in element statements;
		// returns a flat array of alternating [ tag, value, tag, value, ... ] slots
		parseAttributes := func(attrb string) []string {

			if attrb == "" {
				return nil
			}

			attlen := len(attrb)

			// count equal signs (two slots, tag and value, per attribute)
			num := 0
			for i := 0; i < attlen; i++ {
				if attrb[i] == '=' {
					num += 2
				}
			}
			if num < 1 {
				return nil
			}

			// allocate array of proper size
			arry := make([]string, num)
			if arry == nil {
				return nil
			}

			start := 0
			idx := 0
			itm := 0

			// place tag and value in successive array slots
			for idx < attlen && itm < num {
				ch := attrb[idx]
				if ch == '=' {
					// skip past possible leading blanks
					for start < attlen {
						ch = attrb[start]
						if ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' || ch == '\f' {
							start++
						} else {
							break
						}
					}
					// equal sign marks the end of the tag name
					arry[itm] = attrb[start:idx]
					itm++
					// skip past equal sign and leading double quote
					idx += 2
					start = idx
				} else if ch == '"' {
					// closing double quote marks the end of the value
					arry[itm] = attrb[start:idx]
					itm++
					// skip past trailing double quote and (possible) space
					idx += 2
					start = idx
				} else {
					idx++
				}
			}

			return arry
		}

		// wildcard matches any namespace prefix
		if curr.Name == match ||
			(wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) ||
			(match == "" && attrib != "") {

			if prnt == "" ||
				curr.Parent == prnt ||
				(wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) {

				if attrib != "" {

					if curr.Attributes != "" && curr.Attribs == nil {
						// parse attributes on-the-fly if queried
						curr.Attribs = parseAttributes(curr.Attributes)
					}

					for i := 0; i < len(curr.Attribs)-1; i += 2 {
						// attributes now parsed into array as [ tag, value, tag, value, tag, value, ... ]
						if curr.Attribs[i] == attrib ||
							(wildcard && strings.HasPrefix(attrib, ":") && strings.HasSuffix(curr.Attribs[i], attrib)) {
							proc(curr.Attribs[i+1], level)
							return
						}
					}

				} else if curr.Contents != "" {

					str := curr.Contents[:]

					if HasAmpOrNotASCII(str) {
						// processing of <, >, &, ", and ' characters is now delayed until element contents is requested
						str = html.UnescapeString(str)
					}

					proc(str, level)
					return

				} else if curr.Children != nil {

					// for XML container object, send empty string to callback to increment count
					proc("", level)
					// and continue exploring
				}
			}
		}

		for chld := curr.Children; chld != nil; chld = chld.Next {
			// inner exploration is subject to recursive object exclusion
			exploreElements(chld, mask, lev+1)
		}
	}

	exploreElements(curr, "", level)
}

// PrintSubtree supports compression styles selected by -element "*" through "****"
func PrintSubtree(node *Node, style IndentType, printAttrs bool, proc func(string)) {

	if node == nil || proc == nil {
		return
	}

	// WRAPPED is SUBTREE plus each attribute on its own line
	wrapped := false
	if style == WRAPPED {
		style = SUBTREE
		wrapped = true
	}

	// INDENT is offset by two spaces to allow for parent tag, SUBTREE is not offset
	initial := 1
	if style == SUBTREE {
		style = INDENT
		initial = 0
	}

	// array to speed up indentation
	indentSpaces := []string{
		"",
		"  ",
		"    ",
		"      ",
		"        ",
		"          ",
		"            ",
		"              ",
		"                ",
		"                  ",
	}

	// function to indent a specified number of spaces
	doIndent := func(indt int) {
		i := indt
		for i > 9 {
			proc("                    ")
			i -= 10
		}
		if i < 0 {
			return
		}
		proc(indentSpaces[i])
	}

	// doSubtree recursive definition
	var doSubtree func(*Node, int)

	doSubtree = func(curr *Node, depth int) {

		// suppress if it would be an empty self-closing tag
		if !IsNotJustWhitespace(curr.Attributes) && curr.Contents == "" && curr.Children == nil {
			return
		}

		if style == INDENT {
			doIndent(depth)
		}

		proc("<")
		proc(curr.Name)

		if printAttrs {

			attr := strings.TrimSpace(curr.Attributes)
			attr = CompressRunsOfSpaces(attr)

			if attr != "" {

				if wrapped {

					// each attribute on its own line, tag and value split at the equal sign
					start := 0
					idx := 0

					attlen := len(attr)

					for idx < attlen {
						ch := attr[idx]
						if ch == '=' {
							str := attr[start:idx]
							proc("\n")
							doIndent(depth)
							proc(" ")
							proc(str)
							// skip past equal sign and leading double quote
							idx += 2
							start = idx
						} else if ch == '"' {
							str := attr[start:idx]
							proc("=\"")
							proc(str)
							proc("\"")
							// skip past trailing double quote and (possible) space
							idx += 2
							start = idx
						} else {
							idx++
						}
					}

					proc("\n")
					doIndent(depth)

				} else {

					proc(" ")
					proc(attr)
				}
			}
		}

		// see if suitable for self-closing tag
		if curr.Contents == "" && curr.Children == nil {
			proc("/>")
			if style != COMPACT {
				proc("\n")
			}
			return
		}

		proc(">")

		if curr.Contents != "" {

			proc(curr.Contents[:])

		} else {

			if style != COMPACT {
				proc("\n")
			}

			for chld := curr.Children; chld != nil; chld = chld.Next {
				doSubtree(chld, depth+1)
			}

			if style == INDENT {
				i := depth
				for i > 9 {
					proc("                    ")
					i -= 10
				}
				proc(indentSpaces[i])
			}
		}

		proc("<")
		proc("/")
		proc(curr.Name)
		proc(">")

		if style != COMPACT {
			proc("\n")
		}
	}

	doSubtree(node, initial)
}

// ProcessClause handles comma-separated -element arguments
func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep string, status OpType, index, level int, variables map[string]string) (string, bool) {

	if curr == nil || stages == nil {
		return "", false
	}

	// processElement handles individual -element constructs
	processElement := func(acc func(string)) {

		if acc == nil {
			return
		}

		// element names combined with commas are treated as a prefix-separator-suffix group
		for _, stage := range stages {

			stat := stage.Type
			item := stage.Value
			prnt := stage.Parent
			match := stage.Match
			attrib := stage.Attrib
			wildcard := stage.Wild

			// exploreElements is a wrapper for ExploreElements, obtaining most arguments as closures
			exploreElements := func(proc func(string, int)) {
				ExploreElements(curr, mask, prnt, match, attrib, wildcard, level, proc)
			}

			switch stat {
			case ELEMENT, VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV:
				exploreElements(func(str string, lvl int) {
					if str != "" {
						acc(str)
					}
				})
			case ENCODE:
exploreElements(func(str string, lvl int) { if str != "" { str = html.EscapeString(str) acc(str) } }) case FIRST: single := "" exploreElements(func(str string, lvl int) { if single == "" { single = str } }) if single != "" { acc(single) } case LAST: single := "" exploreElements(func(str string, lvl int) { single = str }) if single != "" { acc(single) } case VARIABLE: // use value of stored variable val, ok := variables[match] if ok { acc(val) } case NUM, COUNT: count := 0 exploreElements(func(str string, lvl int) { count++ }) // number of element objects val := strconv.Itoa(count) acc(val) case LENGTH: length := 0 exploreElements(func(str string, lvl int) { length += len(str) }) // length of element strings val := strconv.Itoa(length) acc(val) case DEPTH: exploreElements(func(str string, lvl int) { // depth of each element in scope val := strconv.Itoa(lvl) acc(val) }) case INDEX: // -element "+" prints index of current XML object val := strconv.Itoa(index) acc(val) case INC: // -inc, or component of -0-based, -1-based, or -ucsc exploreElements(func(str string, lvl int) { if str != "" { num, err := strconv.Atoi(str) if err == nil { // increment value num++ val := strconv.Itoa(num) acc(val) } } }) case DEC: // -dec, or component of -0-based, -1-based, or -ucsc exploreElements(func(str string, lvl int) { if str != "" { num, err := strconv.Atoi(str) if err == nil { // decrement value num-- val := strconv.Itoa(num) acc(val) } } }) case STAR: // -element "*" prints current XML subtree on a single line style := SINGULARITY printAttrs := true for _, ch := range item { if ch == '*' { style++ } else if ch == '@' { printAttrs = false } } if style > WRAPPED { style = WRAPPED } if style < COMPACT { style = COMPACT } var buffer bytes.Buffer PrintSubtree(curr, style, printAttrs, func(str string) { if str != "" { buffer.WriteString(str) } }) txt := buffer.String() if txt != "" { acc(txt) } default: } } } ok := false // format results in buffer var buffer bytes.Buffer 
buffer.WriteString(prev) buffer.WriteString(pfx) between := "" switch status { case ELEMENT, ENCODE, VALUE, NUM, INC, DEC, ZEROBASED, ONEBASED, UCSC: processElement(func(str string) { if str != "" { ok = true buffer.WriteString(between) buffer.WriteString(str) between = sep } }) case FIRST: single := "" processElement(func(str string) { ok = true if single == "" { single = str } }) if single != "" { buffer.WriteString(between) buffer.WriteString(single) between = sep } case LAST: single := "" processElement(func(str string) { ok = true single = str }) if single != "" { buffer.WriteString(between) buffer.WriteString(single) between = sep } case LEN: length := 0 processElement(func(str string) { ok = true length += len(str) }) // length of element strings val := strconv.Itoa(length) buffer.WriteString(between) buffer.WriteString(val) between = sep case SUM: sum := 0 processElement(func(str string) { value, err := strconv.Atoi(str) if err == nil { sum += value ok = true } }) if ok { // sum of element values val := strconv.Itoa(sum) buffer.WriteString(between) buffer.WriteString(val) between = sep } case MIN: min := 0 processElement(func(str string) { value, err := strconv.Atoi(str) if err == nil { if !ok || value < min { min = value } ok = true } }) if ok { // minimum of element values val := strconv.Itoa(min) buffer.WriteString(between) buffer.WriteString(val) between = sep } case MAX: max := 0 processElement(func(str string) { value, err := strconv.Atoi(str) if err == nil { if !ok || value > max { max = value } ok = true } }) if ok { // maximum of element values val := strconv.Itoa(max) buffer.WriteString(between) buffer.WriteString(val) between = sep } case SUB: first := 0 second := 0 count := 0 processElement(func(str string) { value, err := strconv.Atoi(str) if err == nil { count++ if count == 1 { first = value } else if count == 2 { second = value } } }) if count == 2 { // must have exactly 2 elements ok = true // difference of element values val := 
strconv.Itoa(first - second) buffer.WriteString(between) buffer.WriteString(val) between = sep } case AVG: sum := 0 count := 0 processElement(func(str string) { value, err := strconv.Atoi(str) if err == nil { sum += value count++ ok = true } }) if ok { // average of element values avg := int(float64(sum) / float64(count)) val := strconv.Itoa(avg) buffer.WriteString(between) buffer.WriteString(val) between = sep } case DEV: count := 0 mean := 0.0 m2 := 0.0 processElement(func(str string) { value, err := strconv.Atoi(str) if err == nil { // Welford algorithm for one-pass standard deviation count++ x := float64(value) delta := x - mean mean += delta / float64(count) m2 += delta * (x - mean) } }) if count > 1 { // must have at least 2 elements ok = true // standard deviation of element values vrc := m2 / float64(count-1) dev := int(math.Sqrt(vrc)) val := strconv.Itoa(dev) buffer.WriteString(between) buffer.WriteString(val) between = sep } default: } buffer.WriteString(sfx) if !ok { return "", false } txt := buffer.String() return txt, true } // ProcessInstructions performs extraction commands on a subset of XML func ProcessInstructions(commands []*Operation, curr *Node, mask, tab, ret string, index, level int, variables map[string]string, accum func(string)) (string, string) { if accum == nil { return tab, ret } sep := "\t" pfx := "" sfx := "" col := "\t" lin := "\n" varname := "" // process commands for _, op := range commands { str := op.Value switch op.Type { case ELEMENT, FIRST, LAST, ENCODE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSC: txt, ok := ProcessClause(curr, op.Stages, mask, tab, pfx, sfx, sep, op.Type, index, level, variables) if ok { tab = col ret = lin accum(txt) } case TAB: col = str case RET: lin = str case PFX: pfx = str case SFX: sfx = str case SEP: sep = str case LBL: lbl := str accum(tab) accum(lbl) tab = col ret = lin case PFC: // preface clears previous tab and sets prefix in one command pfx = str fallthrough case 
CLR:
			// clear previous tab after the fact
			tab = ""
		case RST:
			// reset prefix, suffix, and separator to defaults
			pfx = ""
			sfx = ""
			sep = "\t"
		case VARIABLE:
			varname = str
		case VALUE:
			length := len(str)
			if length > 1 && str[0] == '(' && str[length-1] == ')' {
				// set variable from literal text inside parentheses, e.g., -COM "(, )"
				variables[varname] = str[1 : length-1]
				// -if "&VARIABLE" will succeed if set to blank with empty parentheses "()"
			} else if str == "" {
				// -if "&VARIABLE" will fail if initialized with empty string ""
				delete(variables, varname)
			} else {
				txt, ok := ProcessClause(curr, op.Stages, mask, "", pfx, sfx, sep, op.Type, index, level, variables)
				if ok {
					variables[varname] = txt
				}
			}
			varname = ""
		default:
		}
	}

	return tab, ret
}

// CONDITIONAL EXECUTION USES -if AND -unless STATEMENT, WITH SUPPORT FOR DEPRECATED -match AND -avoid STATEMENTS

// ConditionsAreSatisfied tests a set of conditions to determine if extraction should proceed.
// Conditions are evaluated left to right; -if/-and/-or clauses accumulate required and
// observed counts via fallthrough, while -unless clauses record forbidden matches.
func ConditionsAreSatisfied(conditions []*Operation, curr *Node, mask string, index, level int, variables map[string]string) bool {

	if curr == nil {
		return false
	}

	required := 0
	observed := 0
	forbidden := 0
	isMatch := false
	isAvoid := false

	// function to test string or numeric constraints
	testConstraint := func(str string, constraint *Step) bool {

		if str == "" || constraint == nil {
			return false
		}

		val := constraint.Value
		stat := constraint.Type

		switch stat {
		case EQUALS, CONTAINS, STARTSWITH, ENDSWITH, ISNOT:
			// substring test on element values (case-insensitive via upper-casing both sides)
			str = strings.ToUpper(str)
			val = strings.ToUpper(val)

			switch stat {
			case EQUALS:
				if str == val {
					return true
				}
			case CONTAINS:
				if strings.Contains(str, val) {
					return true
				}
			case STARTSWITH:
				if strings.HasPrefix(str, val) {
					return true
				}
			case ENDSWITH:
				if strings.HasSuffix(str, val) {
					return true
				}
			case ISNOT:
				if str != val {
					return true
				}
			default:
			}
		case GT, GE, LT, LE, EQ, NE:
			// second argument of numeric test can be element specifier
			if constraint.Parent != "" || constraint.Match != "" || constraint.Attrib != "" {
				ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, level, func(str string, lvl int) {
					if str != "" {
						// only adopt the element value if it parses as an integer
						_, errz := strconv.Atoi(str)
						if errz == nil {
							val = str
						}
					}
				})
			}

			// numeric tests on element values
			x, errx := strconv.Atoi(str)
			y, erry := strconv.Atoi(val)

			// both arguments must resolve to integers
			if errx != nil || erry != nil {
				return false
			}

			switch stat {
			case GT:
				if x > y {
					return true
				}
			case GE:
				if x >= y {
					return true
				}
			case LT:
				if x < y {
					return true
				}
			case LE:
				if x <= y {
					return true
				}
			case EQ:
				if x == y {
					return true
				}
			case NE:
				if x != y {
					return true
				}
			default:
			}
		default:
		}

		return false
	}

	// matchFound tests individual conditions
	matchFound := func(stages []*Step) bool {

		if stages == nil || len(stages) < 1 {
			return false
		}

		stage := stages[0]

		// optional second stage is the value/numeric constraint
		var constraint *Step
		if len(stages) > 1 {
			constraint = stages[1]
		}

		status := stage.Type
		prnt := stage.Parent
		match := stage.Match
		attrib := stage.Attrib
		wildcard := stage.Wild

		found := false
		number := ""

		// exploreElements is a wrapper for ExploreElements, obtaining most arguments as closures
		exploreElements := func(proc func(string, int)) {
			ExploreElements(curr, mask, prnt, match, attrib, wildcard, level, proc)
		}

		switch status {
		case ELEMENT:
			exploreElements(func(str string, lvl int) {
				// match to XML container object sends empty string, so do not check for str != "" here
				// test every selected element individually if value is specified
				if constraint == nil || testConstraint(str, constraint) {
					found = true
				}
			})
		case VARIABLE:
			// use value of stored variable
			str, ok := variables[match]
			if ok {
				// -if &VARIABLE -equals VALUE is the supported construct
				if constraint == nil || testConstraint(str, constraint) {
					found = true
				}
			}
		case COUNT:
			count := 0
			exploreElements(func(str string, lvl int) {
				count++
				found = true
			})
			// number of element objects
			number = strconv.Itoa(count)
		case LENGTH:
			length := 0
			exploreElements(func(str string, lvl int) {
				length += len(str)
				found = true
			})
			// length of element strings
			number = strconv.Itoa(length)
		case DEPTH:
			depth := 0
			exploreElements(func(str string, lvl int) {
				depth = lvl
				found = true
			})
			// depth of last element in scope
			number = strconv.Itoa(depth)
		case INDEX:
			// index of explored parent object
			number = strconv.Itoa(index)
			found = true
		default:
		}

		if number == "" {
			return found
		}

		// numeric results (count/length/depth/index) go through the constraint test
		if constraint == nil || testConstraint(number, constraint) {
			return true
		}

		return false
	}

	// test conditional arguments
	for _, op := range conditions {

		switch op.Type {
		// -if tests for presence of element (deprecated -match can test element:value)
		case IF, MATCH:
			// checking for failure here allows for multiple -if [ -and / -or ] clauses
			if isMatch && observed < required {
				return false
			}
			if isAvoid && forbidden > 0 {
				return false
			}
			required = 0
			observed = 0
			forbidden = 0
			isMatch = true
			isAvoid = false
			// continue on to next two cases
			fallthrough
		case AND:
			required++
			// continue on to next case
			fallthrough
		case OR:
			if matchFound(op.Stages) {
				observed++
				// record presence of forbidden element if in -unless clause
				forbidden++
			}
		// -unless tests for absence of element, or presence but with failure of subsequent value test (deprecated -avoid can test element:value)
		case UNLESS, AVOID:
			if isMatch && observed < required {
				return false
			}
			if isAvoid && forbidden > 0 {
				return false
			}
			required = 0
			observed = 0
			forbidden = 0
			isMatch = false
			isAvoid = true
			if matchFound(op.Stages) {
				forbidden++
			}
		default:
		}
	}

	// final evaluation of the last open clause
	if isMatch && observed < required {
		return false
	}
	if isAvoid && forbidden > 0 {
		return false
	}

	return true
}

// RECURSIVELY PROCESS EXPLORATION COMMANDS AND XML DATA STRUCTURE

// ProcessCommands visits XML nodes, performs conditional tests, and executes data extraction instructions
func ProcessCommands(cmds *Block, curr *Node, tab, ret string, index, level int, variables map[string]string, accum func(string)) (string, string) {

	if accum == nil {
		return tab, ret
	}

	prnt := cmds.Parent
	match := cmds.Match

	// leading colon
// indicates namespace prefix wildcard
wildcard := false
	if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") {
		wildcard = true
	}

	// **/Object performs deep exploration of recursive data
	deep := false
	if prnt == "**" {
		prnt = "*"
		deep = true
	}

	// closure passes local variables to callback, which can modify caller tab and ret values
	processNode := func(node *Node, idx, lvl int) {

		// apply -if or -unless tests
		if ConditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) {

			// execute data extraction commands
			if len(cmds.Commands) > 0 {
				tab, ret = ProcessInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, accum)
			}

			// process sub commands on child node
			// NOTE(review): index restarts at 1 for each subtask block — confirm intended
			for _, sub := range cmds.Subtasks {
				tab, ret = ProcessCommands(sub, node, tab, ret, 1, lvl, variables, accum)
			}

		} else {

			// execute commands after -else statement
			if len(cmds.Failure) > 0 {
				tab, ret = ProcessInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, accum)
			}
		}
	}

	// exploreNodes recursive definition
	var exploreNodes func(*Node, int, int, func(*Node, int, int)) int

	// exploreNodes visits all nodes that match the selection criteria
	exploreNodes = func(curr *Node, indx, levl int, proc func(*Node, int, int)) int {

		if curr == nil || proc == nil {
			return indx
		}

		// match is "*" for heterogeneous data constructs, e.g., -group PubmedArticleSet/*
		// wildcard matches any namespace prefix
		if curr.Name == match || match == "*" || (wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) {
			if prnt == "" || curr.Parent == prnt || (wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) {
				proc(curr, indx, levl)
				indx++
				if !deep {
					// do not explore within recursive object
					return indx
				}
			}
		}

		// clearing prnt "*" now allows nested exploration within recursive data, e.g., -pattern Taxon -block */Taxon
		if prnt == "*" {
			prnt = ""
		}

		// explore child nodes
		for chld := curr.Children; chld != nil; chld = chld.Next {
			indx = exploreNodes(chld, indx, levl+1, proc)
		}

		return indx
	}

	// apply -position test
	if cmds.Position == "" {

		exploreNodes(curr, index, level, processNode)

	} else {

		// with -position, only one selected node is processed
		var single *Node
		lev := 0
		ind := 0

		if cmds.Position == "first" {

			exploreNodes(curr, index, level,
				func(node *Node, idx, lvl int) {
					if single == nil {
						single = node
						ind = idx
						lev = lvl
					}
				})

		} else if cmds.Position == "last" {

			exploreNodes(curr, index, level,
				func(node *Node, idx, lvl int) {
					single = node
					ind = idx
					lev = lvl
				})

		} else {

			// use numeric position
			number, err := strconv.Atoi(cmds.Position)
			if err == nil {
				pos := 0
				exploreNodes(curr, index, level,
					func(node *Node, idx, lvl int) {
						pos++
						if pos == number {
							single = node
							ind = idx
							lev = lvl
						}
					})
			} else {
				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position)
				os.Exit(1)
			}
		}

		if single != nil {
			processNode(single, ind, lev)
		}
	}

	return tab, ret
}

// PROCESS ONE XML COMPONENT RECORD

// ParseXML combines the tokenizer and parser to work on a single partitioned XML string
func ParseXML(Text, parent string, tbls *Tables) (*Node, bool) {

	if Text == "" || tbls == nil {
		return nil, false
	}

	// node farm variables
	FarmPos := 0
	FarmMax := tbls.FarmSize
	FarmItems := make([]Node, FarmMax)

	// function to allocate multiple nodes in a large array for memory management efficiency
	nextNode := func(strt, attr, prnt string) *Node {

		// if farm array slots used up, allocate new array
		if FarmPos >= FarmMax {
			FarmItems = make([]Node, FarmMax)
			FarmPos = 0
		}

		if FarmItems == nil {
			return nil
		}

		// take node from next available slot in farm array
		node := &FarmItems[FarmPos]

		node.Name = strt[:]
		node.Attributes = attr[:]
		node.Parent = prnt[:]

		FarmPos++

		return node
	}

	// token parser variables
	Txtlen := len(Text)
	Idx := 0

	// function to get next XML token
	nextToken := func(idx int) (TagType, string, string, int) {

		// lookup table array pointers
		inBlank := &tbls.InBlank
		inFirst := &tbls.InFirst
		inElement := &tbls.InElement

		text := Text[:]
		txtlen := Txtlen

		// XML
// string ends with > character, acts as sentinel to check if past end of text
if idx >= txtlen {
			// signal end of XML string
			return ISCLOSED, "", "", 0
		}

		// skip past leading blanks
		ch := text[idx]
		for inBlank[ch] {
			idx++
			ch = text[idx]
		}

		start := idx

		if ch == '<' {

			// at start of element
			idx++
			ch = text[idx]

			// check for legal first character of element
			if inFirst[ch] {

				// read element name
				start = idx
				idx++
				ch = text[idx]
				for inElement[ch] {
					idx++
					ch = text[idx]
				}
				str := text[start:idx]

				switch ch {
				case '>':
					// end of element
					idx++
					return STARTTAG, str[:], "", idx
				case '/':
					// self-closing element without attributes
					idx++
					ch = text[idx]
					if ch != '>' {
						fmt.Fprintf(os.Stderr, "\nSelf-closing element missing right angle bracket\n")
					}
					idx++
					return SELFTAG, str[:], "", idx
				case ' ', '\t', '\n', '\r', '\f':
					// attributes
					idx++
					start = idx
					ch = text[idx]
					for ch != '<' && ch != '>' {
						idx++
						ch = text[idx]
					}
					if ch != '>' {
						fmt.Fprintf(os.Stderr, "\nAttributes not followed by right angle bracket\n")
					}
					if text[idx-1] == '/' {
						// self-closing
						atr := text[start : idx-1]
						idx++
						return SELFTAG, str[:], atr[:], idx
					}
					atr := text[start:idx]
					idx++
					return STARTTAG, str[:], atr[:], idx
				default:
					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
					return STARTTAG, str[:], "", idx
				}

			} else {

				// punctuation character immediately after first angle bracket
				switch ch {
				case '/':
					// at start of end tag
					idx++
					start = idx
					ch = text[idx]
					// expect legal first character of element
					if inFirst[ch] {
						idx++
						ch = text[idx]
						for inElement[ch] {
							idx++
							ch = text[idx]
						}
						str := text[start:idx]
						if ch != '>' {
							fmt.Fprintf(os.Stderr, "\nUnexpected characters after end element name\n")
						}
						idx++
						return STOPTAG, str[:], "", idx
					}
					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
				case '?':
					// skip ?xml and ?processing instructions
					idx++
					ch = text[idx]
					for ch != '>' {
						idx++
						ch = text[idx]
					}
					idx++
				case '!':
					// skip !DOCTYPE, !comment, and ![CDATA[
					idx++
					start = idx
					ch = text[idx]
					which := NOTAG
					skipTo := ""
					if ch == '[' && strings.HasPrefix(text[idx:], "[CDATA[") {
						which = CDATATAG
						skipTo = "]]>"
						start += 7
					} else if ch == '-' && strings.HasPrefix(text[idx:], "--") {
						which = COMMENTTAG
						skipTo = "-->"
						start += 2
					}
					if which != NOTAG && skipTo != "" {
						// CDATA or comment block may contain internal angle brackets
						found := strings.Index(text[idx:], skipTo)
						if found < 0 {
							// string stops in middle of CDATA or comment
							return ISCLOSED, "", "", idx
						}
						// adjust position past end of CDATA or comment
						idx += found + len(skipTo)
					} else {
						// otherwise just skip to next right angle bracket
						for ch != '>' {
							idx++
							ch = text[idx]
						}
						idx++
					}
				default:
					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
				}
			}

		} else if ch != '>' {

			// at start of contents
			start = idx

			// find end of contents
			for ch != '<' && ch != '>' {
				idx++
				ch = text[idx]
			}

			// trim back past trailing blanks
			lst := idx - 1
			ch = text[lst]
			for inBlank[ch] && lst > start {
				lst--
				ch = text[lst]
			}

			str := text[start : lst+1]

			return CONTENTTAG, str[:], "", idx
		}

		return NOTAG, "", "", idx
	}

	// parseLevel recursive definition
	var parseLevel func(string, string, string) (*Node, bool)

	// parseLevel builds one element and its children from the token stream
	parseLevel = func(strt, attr, prnt string) (*Node, bool) {

		ok := true

		// obtain next node from farm
		node := nextNode(strt, attr, prnt)
		if node == nil {
			return nil, false
		}

		var lastNode *Node

		for {
			tag, name, attr, idx := nextToken(Idx)
			if tag == ISCLOSED {
				break
			}
			Idx = idx

			switch tag {
			case STARTTAG:
				// read sub tree
				// NOTE(review): ok is shadowed by := here, and break only exits
				// the switch, so a failed sub-parse is silently ignored — confirm intended
				obj, ok := parseLevel(name, attr, node.Name)
				if !ok {
					break
				}
				// adding next child to end of linked list gives better performance than appending to slice of nodes
				if node.Children == nil {
					node.Children = obj
				}
				if lastNode != nil {
					lastNode.Next = obj
				}
				lastNode = obj
			case STOPTAG:
				// pop out of recursive call
				return node, ok
			case CONTENTTAG:
				node.Contents = name
			case SELFTAG:
				if attr == "" {
					// ignore if self-closing tag has no attributes
					continue
				}
				// self-closing tag has no contents, just create child node
obj := nextNode(name, attr, node.Name)
				if node.Children == nil {
					node.Children = obj
				}
				if lastNode != nil {
					lastNode.Next = obj
				}
				lastNode = obj
				// continue on same level
			default:
			}
		}

		return node, ok
	}

	// find the first start tag and parse the record from there
	for {
		tag, name, attr, idx := nextToken(Idx)
		if tag == ISCLOSED {
			break
		}
		Idx = idx
		if tag != STARTTAG {
			continue
		}
		// call recursive function from beginning of XML
		return parseLevel(name, attr, parent)
	}

	return nil, false
}

// ProcessQuery calls XML combined tokenizer parser on a partitioned string.
// It returns the consolidated extraction result for one record (empty string
// if the record failed to parse or produced no output).
func ProcessQuery(text, parent string, index int, cmds *Block, tbls *Tables) string {

	if text == "" || cmds == nil || tbls == nil {
		return ""
	}

	// exit from function will collect garbage of node structure for current XML object
	pat, ok := ParseXML(text, parent, tbls)
	if !ok {
		return ""
	}

	// exit from function will also free map of recorded variables for current -pattern
	variables := make(map[string]string)

	var buffer bytes.Buffer

	// start processing at top of command tree and top of XML subregion selected by -pattern
	_, ret := ProcessCommands(cmds, pat, "", "", index, 1, variables,
		func(str string) {
			if str != "" {
				buffer.WriteString(str)
			}
		})

	if ret != "" {
		buffer.WriteString(ret)
	}

	txt := buffer.String()

	// remove leading newline (-insd -pfx artifact)
	if txt != "" && txt[0] == '\n' {
		txt = txt[1:]
	}

	// return consolidated result string
	return txt
}

// UNSHUFFLER USES HEAP TO RESTORE OUTPUT OF MULTIPLE CONSUMERS TO ORIGINAL RECORD ORDER

// Extract pairs one record's result text with its original record index.
type Extract struct {
	Index int
	Text  string
}

// ExtractHeap is a min-heap of Extract ordered by record index.
type ExtractHeap []Extract

// methods that satisfy heap.Interface
func (h ExtractHeap) Len() int {
	return len(h)
}
func (h ExtractHeap) Less(i, j int) bool {
	return h[i].Index < h[j].Index
}
func (h ExtractHeap) Swap(i, j int) {
	h[i], h[j] = h[j], h[i]
}
func (h *ExtractHeap) Push(x interface{}) {
	*h = append(*h, x.(Extract))
}
func (h *ExtractHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[0 : n-1]
	return x
}

// CONCURRENT CONSUMER GOROUTINES PARSE AND PROCESS
// PARTITIONED XML OBJECTS

// ReadBlocks -> SplitPattern => StreamTokens => ParseXML => ProcessQuery -> MergeResults

// XMLProducer sends partitioned XML strings through channel
func XMLProducer(pat, star string, rdr *XMLReader, out chan<- Extract) {

	// close channel when all records have been processed, so consumers can range over channel
	defer close(out)

	// partition all input by pattern and send XML substring to available consumer through channel
	PartitionPattern(pat, star, rdr,
		func(rec int, str string) {
			out <- Extract{rec, str}
		})
}

// CreateProducer launches the single producer goroutine and returns its output channel.
func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extract {

	if tbls == nil {
		return nil
	}

	out := make(chan Extract, tbls.ChanDepth)
	// NOTE(review): make on a channel never returns nil, so this check cannot fire
	if out == nil {
		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create producer channel\n")
		os.Exit(1)
	}

	// launch single producer goroutine
	go XMLProducer(pat, star, rdr, out)

	return out
}

// XMLConsumer reads partitioned XML from channel and calls parser for processing
func XMLConsumer(cmds *Block, tbls *Tables, parent string, wg *sync.WaitGroup, inp <-chan Extract, out chan<- Extract) {

	// report when this consumer has no more records to process
	defer wg.Done()

	// read partitioned XML from producer channel
	for ext := range inp {

		idx := ext.Index
		text := ext.Text

		if text == "" {
			// should never see empty input data
			out <- Extract{idx, text}
			continue
		}

		str := ProcessQuery(text[:], parent, idx, cmds, tbls)

		// send even if empty to get all record counts for reordering
		out <- Extract{idx, str}
	}
}

// CreateConsumers launches numServers consumer goroutines sharing one output channel.
func CreateConsumers(cmds *Block, tbls *Tables, parent string, numServers int, inp <-chan Extract) <-chan Extract {

	if tbls == nil {
		return nil
	}

	out := make(chan Extract, tbls.ChanDepth)
	// NOTE(review): make on a channel never returns nil, so this check cannot fire
	if out == nil {
		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumer channel\n")
		os.Exit(1)
	}

	var wg sync.WaitGroup

	// launch multiple consumer goroutines
	for i := 0; i < numServers; i++ {
		wg.Add(1)
		go XMLConsumer(cmds, tbls, parent, &wg, inp, out)
	}

	// launch separate anonymous goroutine to wait until
all consumers are done, then close single output channel, so unshuffler can range over channel go func() { wg.Wait() close(out) }() return out } // MAIN FUNCTION // e.g., xtract -pattern PubmedArticle -element MedlineCitation/PMID -block Author -sep " " -element Initials,LastName func main() { // skip past executable name args := os.Args[1:] if len(args) < 1 { fmt.Fprintf(os.Stderr, "\nERROR: No command-line arguments supplied to xtract\n") os.Exit(1) } // CONCURRENCY, CLEANUP, AND DEBUGGING FLAGS // do these first because -defcpu and -maxcpu can be sent from wrapper before other arguments ncpu := runtime.NumCPU() if ncpu < 1 { ncpu = 1 } // wrapper can limit maximum number of processors to use (undocumented) maxProcs := ncpu defProcs := 0 // concurrent performance tuning parameters, can be overridden by -proc and -cons numProcs := 0 serverRatio := 4 // number of servers usually calculated by -cons server ratio, but can be overridden by -serv numServers := 0 // number of channels usually equals number of servers, but can be overridden by -chan chanDepth := 0 // miscellaneous tuning parameters heapSize := 16 farmSize := 64 // garbage collector control can be set by environment variable or default value with -gogc 0 goGc := 600 // XML data cleanup doCompress := false doCleanup := false // read data from file instead of stdin fileName := "" // debugging dbug := false mpty := false indx := false stts := false timr := false // profiling prfl := false // alternative source of sample record, processed a designated number of times, looping for each -proc from 1 to nCPU (undocumented) testCount := 0 testType := "" testString := "" // repeat the specified extraction 5 times for each -proc from 1 to nCPU trial := false // function to get numeric value getNumericArg := func(name string, zer, min, max int) int { if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: %s is missing\n", name) os.Exit(1) } value, err := strconv.Atoi(args[1]) if err != nil { fmt.Fprintf(os.Stderr, 
"\nERROR: %s (%s) is not an integer\n", name, args[1]) os.Exit(1) } // skip past first of two arguments args = args[1:] // special case for argument value of 0 if value < 1 { return zer } // limit value to between specified minimum and maximum if value < min { return min } if value > max { return max } return value } inSwitch := true // get concurrency, cleanup, and debugging flags in any order for { inSwitch = true switch args[0] { // concurrency override arguments can be passed in by local wrapper script (undocumented) case "-maxcpu": maxProcs = getNumericArg("Maximum number of processors", 1, 1, ncpu) case "-defcpu": defProcs = getNumericArg("Default number of processors", ncpu, 1, ncpu) // performance tuning flags case "-proc": numProcs = getNumericArg("Number of processors", ncpu, 1, ncpu) case "-cons": serverRatio = getNumericArg("Parser to processor ratio", 4, 1, 32) case "-serv": numServers = getNumericArg("Concurrent parser count", 0, ncpu, 128) case "-chan": chanDepth = getNumericArg("Communication channel depth", 0, ncpu, 128) case "-heap": heapSize = getNumericArg("Unshuffler heap size", 8, 8, 64) case "-farm": farmSize = getNumericArg("Node buffer length", 4, 4, 2048) case "-gogc": goGc = getNumericArg("Garbage collection percentage", 0, 100, 1000) // read data from file case "-input": if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: Input file name is missing\n") os.Exit(1) } fileName = args[1] // skip past first of two arguments args = args[1:] // data cleanup flags case "-compress": doCompress = true case "-cleanup": doCleanup = true // debugging flags case "-debug": dbug = true case "-empty": mpty = true case "-index": indx = true case "-stats", "-stat": stts = true case "-timer": timr = true case "-profile": prfl = true case "-trial": trial = true case "-test": testCount = getNumericArg("Test data counter", 0, 0, 1000000) if len(args) > 1 { next := args[1] // if next argument is not another flag if next != "" && next[0] != '-' { // get optional 
data source specifier testType = next // skip past second of three arguments args = args[1:] } } default: // if not any of the controls, set flag to break out of for loop inSwitch = false } if !inSwitch { break } // skip past argument args = args[1:] if len(args) < 1 { break } } // reality checks on number of processors to use // performance degrades if capacity is above maximum number of partitions per second (context switching?) if numProcs == 0 { if defProcs > 0 { numProcs = defProcs } else { // best performance measurement with current code is obtained when 4 to 6 processors are assigned, // varying slightly among queries on PubmedArticle, gene DocumentSummary, and INSDSeq sequence records numProcs = 4 } } if numProcs > ncpu { numProcs = ncpu } if numProcs > maxProcs { numProcs = maxProcs } // allow simultaneous threads for multiplexed go routines runtime.GOMAXPROCS(numProcs) // adjust garbage collection target percentage if goGc >= 100 { debug.SetGCPercent(goGc) } // explicit -serv argument overrides -cons ratio if numServers > 0 { serverRatio = numServers / numProcs // if numServers / numProcs is not a whole number, do not print serverRatio in -stats if numServers != numProcs*serverRatio { serverRatio = 0 } } else { numServers = numProcs * serverRatio } // server limits if numServers > 128 { numServers = 128 } else if numServers < 1 { numServers = numProcs } // explicit -chan argument overrides default to number of servers if chanDepth == 0 { chanDepth = numServers } // -stats prints number of CPUs and performance tuning values if no other arguments (undocumented) if stts && len(args) < 1 { fmt.Fprintf(os.Stderr, "CPUs %d\n", ncpu) fmt.Fprintf(os.Stderr, "Proc %d\n", numProcs) if serverRatio > 0 { fmt.Fprintf(os.Stderr, "Cons %d\n", serverRatio) } fmt.Fprintf(os.Stderr, "Serv %d\n", numServers) fmt.Fprintf(os.Stderr, "Chan %d\n", chanDepth) fmt.Fprintf(os.Stderr, "Heap %d\n", heapSize) fmt.Fprintf(os.Stderr, "Farm %d\n", farmSize) if goGc >= 100 { 
fmt.Fprintf(os.Stderr, "Gogc %d\n", goGc) } fmt.Fprintf(os.Stderr, "\n") return } // -test N [pubmed|protein|insd|gene] repeats simple query on local XML to measure performance independent of stdin (undocumented) if testCount > 0 { var acc []string // select internal XML data source switch testType { case "pubmed": testString = pubMedArtSample case "protein", "sequence": testString = insdSeqSample case "insd": testString = insdSeqSample case "gene", "docsum": testString = geneDocSumSample default: testString = pubMedArtSample } // default commands if no other arguments if len(args) < 1 { switch testType { case "pubmed": acc = append(acc, "-pattern", "PubmedArticle", "-element", "LastName") case "protein", "sequence": acc = append(acc, "-pattern", "INSDSeq", "-element", "INSDSeq_accession-version") case "insd": acc = append(acc, "-insd", "mat_peptide", "%peptide", "product", "peptide") case "gene", "docsum": acc = append(acc, "-pattern", "DocumentSummary", "-element", "Name") default: acc = append(acc, "-pattern", "PubmedArticle", "-element", "LastName") } } // otherwise use remaining arguments for extraction commands for len(args) > 0 { acc = append(acc, args[0]) args = args[1:] } args = acc } if len(args) < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n") os.Exit(1) } // DOCUMENTATION COMMANDS inSwitch = true switch args[0] { case "-version": fmt.Printf("%s\n", xtractVersion) case "-help", "-extras", "-extra": fmt.Printf("xtract %s\n%s\n", xtractVersion, xtractHelp) case "-examples", "-example", "-scripts", "-script": fmt.Printf("xtract %s\n%s\n", xtractVersion, xtractExamples) case "-internal", "-internals": fmt.Printf("xtract %s\n%s\n", xtractVersion, xtractInternal) case "-sample", "-samples": // -sample [pubmed|protein|gene] sends specified sample record to stdout (undocumented) if len(args) > 1 { testType = args[1] } switch testType { case "pubmed": fmt.Printf("%s\n", pubMedArtSample) case "protein", "sequence", 
"insd": fmt.Printf("%s\n", insdSeqSample) case "gene", "docsum": fmt.Printf("%s\n", geneDocSumSample) default: fmt.Printf("%s\n", pubMedArtSample) } case "-keys": fmt.Printf("%s\n", keyboardShortcuts) case "-unix": fmt.Printf("%s\n", unixCommands) default: // if not any of the documentation commands, keep going inSwitch = false } if inSwitch { return } // INITIALIZE TABLES tbls := InitTables() if tbls == nil { fmt.Fprintf(os.Stderr, "\nERROR: Problem creating token streamer lookup tables\n") os.Exit(1) } // additional fields passed in master table tbls.ChanDepth = chanDepth tbls.FarmSize = farmSize // FILE NAME CAN BE SUPPLIED WITH -input COMMAND in := os.Stdin // check for data being piped into stdin fi, _ := os.Stdin.Stat() isPipe := bool((fi.Mode() & os.ModeCharDevice) == 0) usingFile := false if fileName != "" { inFile, err := os.Open(fileName) if err != nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName) os.Exit(1) } defer inFile.Close() // use indicated file instead of stdin in = inFile usingFile = true if isPipe { fmt.Fprintf(os.Stderr, "\nERROR: Input data from both stdin and file '%s'\n", fileName) os.Exit(1) } } // check for -input command after extraction arguments for _, str := range args { if str == "-input" { fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -input command\n") os.Exit(1) } } // CREATE XML BLOCK READER FROM STDIN OR FILE rdr := NewXMLReader(in, doCompress, doCleanup) if rdr == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create XML Block Reader\n") os.Exit(1) } // SEQUENCE RECORD EXTRACTION COMMAND GENERATOR // -insd simplifies extraction of INSDSeq qualifiers if args[0] == "-insd" || args[0] == "-insd-" { addDash := true // -insd- variant suppresses use of dash as placeholder for missing qualifiers (undocumented) if args[0] == "-insd-" { addDash = false } args = args[1:] insd := ProcessINSD(args, isPipe || usingFile || testCount > 0, addDash) if !isPipe && !usingFile && testCount < 1 { // no piped input, 
so write output instructions fmt.Printf("xtract") for _, str := range insd { fmt.Printf(" %s", str) } fmt.Printf("\n") return } // data in pipe, so replace arguments, execute dynamically args = insd } // CITATION MATCHER EXTRACTION COMMAND GENERATOR // -hydra filters HydraResponse output by relevance score (undocumented) if args[0] == "-hydra" { hydra := ProcessHydra(isPipe || usingFile) if !isPipe && !usingFile { // no piped input, so write output instructions fmt.Printf("xtract") for _, str := range hydra { fmt.Printf(" %s", str) } fmt.Printf("\n") return } // data in pipe, so replace arguments, execute dynamically args = hydra } // XML DATA FORMATTING/COMPRESSION COMMAND GENERATOR // -reformat takes a parent pattern and compresses each object for fastest processing (undocumented) if args[0] == "-reformat" { args = args[1:] max := len(args) if max < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -reformat command\n") os.Exit(1) } // required first argument is parent pattern, will explore using Parent/* construct, write component with -element "*" prnt := args[0] if prnt == "" { fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -reformat command\n") os.Exit(1) } if prnt == "-xml" || prnt == "-doctype" || prnt == "-pfx" || prnt == "-sfx" { fmt.Fprintf(os.Stderr, "\nERROR: Deprecated argument '%s' used in -reformat command\n", prnt) os.Exit(1) } // optional second argument controls XML expansion or compression level // asterisks MUST be quoted to avoid interpration as file wildcard by Unix shell elm := "*" addRet := false hideDoctype := false hideWrapper := false if max > 1 { // * = compact, ** = flush, *** = indented, **** = subtree, ***** = attributes on separate lines // @ = remove attributes // ^ = suppress xml and doctype, ^^ = also suppress parent set wrapper numStars := 0 numCarets := 0 hideAttrs := false for _, ch := range args[1] { if ch == '*' { numStars++ } else if ch == '@' { hideAttrs = true } else if ch == '^' { numCarets++ } } if 
numStars > 1 { addRet = true } if numCarets > 0 { hideDoctype = true if numCarets > 1 { hideWrapper = true } } // construct legal element argument for PrintSubtree switch numStars { case 1: elm = "*" case 2: elm = "**" case 3: elm = "***" case 4: elm = "****" case 5: elm = "*****" default: elm = "*" } if hideAttrs { elm += "@" } } // optional third argument provides detailed DOCTYPE construct doctype := "" if max > 2 { str := ConvertSlash(args[2]) if strings.HasPrefix(str, "") { doctype = str } } if !isPipe && !usingFile { // no piped input, so write output instructions (without -head and -tail arguments) if addRet { fmt.Printf("xtract -pattern %s/* -ret \"\" -element \"%s\"\n", prnt, elm) } else { fmt.Printf("xtract -pattern %s/* -element \"%s\"\n", prnt, elm) } return } // add xml, DOCTYPE, and lines at the beginning hd := fmt.Sprintf("\n\n<%s>", prnt, prnt) if doctype != "" { // use supplied DOCTYPE argument hd = fmt.Sprintf("\n%s\n<%s>", doctype, prnt) } if hideDoctype { // or just line hd = fmt.Sprintf("<%s>", prnt) } // add line at the end tl := fmt.Sprintf("", prnt) // use -pattern Parent/* construct prnt += "/*" var acc []string if !hideWrapper { acc = append(acc, "-head", hd, "-tail", tl) } acc = append(acc, "-pattern", prnt) if addRet { acc = append(acc, "-ret", "") } acc = append(acc, "-element", elm) // data in pipe, so replace arguments, execute dynamically args = acc } // CONFIRM INPUT DATA AVAILABILITY AFTER RUNNING COMMAND GENERATORS if testCount < 1 && !usingFile && !isPipe { fmt.Fprintf(os.Stderr, "\nERROR: No data supplied to xtract from stdin or file\n") os.Exit(1) } // START PROFILING IF REQUESTED if prfl { dbug = true f, err := os.Create("cpu.pprof") if err != nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create profile output file\n") os.Exit(1) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } // INITIALIZE PROCESS TIMER AND RECORD COUNT startTime := time.Now() recordCount := 0 byteCount := 0 // function to print processing rate 
and program duration printDuration := func(name string) { stopTime := time.Now() duration := stopTime.Sub(startTime) seconds := float64(duration.Nanoseconds()) / 1e9 if recordCount >= 1000000 { fmt.Fprintf(os.Stderr, "\nXtract processed %d million %s in %.3f seconds", recordCount/1000000, name, seconds) } else { fmt.Fprintf(os.Stderr, "\nXtract processed %d %s in %.3f seconds", recordCount, name, seconds) } if seconds >= 0.001 && recordCount > 0 { rate := int(float64(recordCount) / seconds) if rate >= 1000000 { fmt.Fprintf(os.Stderr, " (%d mega%s/second", rate/1000000, name) } else { fmt.Fprintf(os.Stderr, " (%d %s/second", rate, name) } if byteCount > 0 { rate := int(float64(byteCount) / seconds) if rate >= 1000000 { fmt.Fprintf(os.Stderr, ", %d megabytes/second", rate/1000000) } else if rate >= 1000 { fmt.Fprintf(os.Stderr, ", %d kilobytes/second", rate/1000) } else { fmt.Fprintf(os.Stderr, ", %d bytes/second", rate) } } fmt.Fprintf(os.Stderr, ")") } fmt.Fprintf(os.Stderr, "\n\n") } // PERFORMANCE TIMING COMMANDS inSwitch = true action := NOPROCESS recordType := "" switch args[0] { case "-chunk": action = DOCHUNK recordType = "blocks" case "-split": action = DOSPLIT recordType = "patterns" case "-drain": action = DODRAIN recordType = "patterns" case "-token": action = DOTOKEN recordType = "tokens" default: // if not any of the formatting commands, keep going inSwitch = false } if inSwitch { recordCount, byteCount = ProcessXMLStream(rdr, tbls, args, action) printDuration(recordType) return } // SPECIAL FORMATTING COMMANDS inSwitch = true action = NOPROCESS switch args[0] { case "-format": action = DOFORMAT case "-outline": action = DOOUTLINE case "-synopsis": action = DOSYNOPSIS case "-verify", "-validate": action = DOVERIFY case "-filter": action = DOFILTER default: // if not any of the formatting commands, keep going inSwitch = false } if inSwitch { ProcessXMLStream(rdr, tbls, args, action) return } // SPECIFY STRINGS TO GO BEFORE AND AFTER ENTIRE OUTPUT head := 
"" tail := "" for { inSwitch = true switch args[0] { case "-head": if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -head command\n") os.Exit(1) } head = ConvertSlash(args[1]) case "-tail": if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tail command\n") os.Exit(1) } tail = ConvertSlash(args[1]) default: // if not any of the controls, set flag to break out of for loop inSwitch = false } if !inSwitch { break } // skip past arguments args = args[2:] if len(args) < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n") os.Exit(1) } } // ENSURE PRESENCE OF PATTERN ARGUMENT if len(args) < 1 { fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n") os.Exit(1) } // make sure top-level -pattern command is next if args[0] != "-pattern" && args[0] != "-Pattern" { fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n") os.Exit(1) } if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n") os.Exit(1) } topPat := args[1] if topPat == "" { fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n") os.Exit(1) } if strings.HasPrefix(topPat, "-") { fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", topPat) os.Exit(1) } // look for -pattern Parent/* construct for heterogeneous data, e.g., -pattern PubmedArticleSet/* topPattern, star := SplitInTwoAt(topPat, "/", LEFT) if topPattern == "" { return } parent := "" if star == "*" { parent = topPattern } else if star != "" { fmt.Fprintf(os.Stderr, "\nERROR: -pattern Parent/Child construct is not supported\n") os.Exit(1) } // PARSE AND VALIDATE EXTRACTION ARGUMENTS // parse nested exploration instruction from command-line arguments cmds := ParseArguments(args, topPattern) if cmds == nil { fmt.Fprintf(os.Stderr, "\nERROR: Problem parsing command-line arguments\n") os.Exit(1) } // PERFORMANCE TIMING COMMAND // -stats with an extraction 
command prints XML size and processing time for each record if stts { legend := "REC\tSIZE\tTIME" PartitionPattern(topPattern, star, rdr, func(rec int, str string) { beginTime := time.Now() ProcessQuery(str[:], parent, rec, cmds, tbls) endTime := time.Now() duration := endTime.Sub(beginTime) micro := int(float64(duration.Nanoseconds()) / 1e3) if legend != "" { fmt.Printf("%s\n", legend) legend = "" } fmt.Printf("%d\t%d\t%d\n", rec, len(str), micro) }) return } // PERFORMANCE OPTIMIZATION FUNCTIONS // -test N runs a test extraction N times for each -proc from 1 to nCPU (undocumented) if testCount > 0 && testString != "" { // clean up copy of sample string included in source code sample := strings.TrimSpace(testString) sample = CleanupBadSpaces(sample) legend := "CPU\tTIME\tRATE" for numServ := 1; numServ <= ncpu; numServ++ { runtime.GOMAXPROCS(numServ) // alternative producer sends sample XML through channel N times xmlq := make(chan Extract, chanDepth) go func(out chan<- Extract) { for rec := 1; rec <= testCount; rec++ { out <- Extract{rec, sample} } close(out) }(xmlq) tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq) if xmlq == nil || tblq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n") os.Exit(1) } begTime := time.Now() recordCount = 0 for _ = range tblq { recordCount++ runtime.Gosched() } debug.FreeOSMemory() endTime := time.Now() expended := endTime.Sub(begTime) secs := float64(expended.Nanoseconds()) / 1e9 if secs >= 0.000001 && recordCount > 0 { speed := int(float64(recordCount) / secs) if legend != "" { fmt.Printf("%s\n", legend) legend = "" } fmt.Printf("%d\t%.3f\t%d\n", numServ, secs, speed) } } return } // -trial -input fileName runs the specified extraction for each -proc from 1 to nCPU if trial && fileName != "" { legend := "CPU\tRATE\tDEV" for numServ := 1; numServ <= ncpu; numServ++ { runtime.GOMAXPROCS(numServ) sum := 0 count := 0 mean := 0.0 m2 := 0.0 // calculate mean and standard deviation of processing rate for 
trials := 0; trials < 5; trials++ { inFile, err := os.Open(fileName) if err != nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName) os.Exit(1) } rdr := NewXMLReader(inFile, doCompress, doCleanup) if rdr == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to read input file\n") os.Exit(1) } xmlq := CreateProducer(topPattern, star, rdr, tbls) tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq) if xmlq == nil || tblq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n") os.Exit(1) } begTime := time.Now() recordCount = 0 for _ = range tblq { recordCount++ runtime.Gosched() } inFile.Close() debug.FreeOSMemory() endTime := time.Now() expended := endTime.Sub(begTime) secs := float64(expended.Nanoseconds()) / 1e9 if secs >= 0.000001 && recordCount > 0 { speed := int(float64(recordCount) / secs) sum += speed count++ x := float64(speed) delta := x - mean mean += delta / float64(count) m2 += delta * (x - mean) } } if legend != "" { fmt.Printf("%s\n", legend) legend = "" } if count > 1 { vrc := m2 / float64(count-1) dev := int(math.Sqrt(vrc)) fmt.Printf("%d\t%d\t%d\n", numServ, sum/count, dev) } } return } // PROCESS SINGLE SELECTED RECORD IF -pattern ARGUMENT IS IMMEDIATELY FOLLOWED BY -position COMMAND if cmds.Visit == topPat && cmds.Position != "" { qry := "" idx := 0 if cmds.Position == "first" { PartitionPattern(topPattern, star, rdr, func(rec int, str string) { if rec == 1 { qry = str idx = rec } }) } else if cmds.Position == "last" { PartitionPattern(topPattern, star, rdr, func(rec int, str string) { qry = str idx = rec }) } else { // use numeric position number, err := strconv.Atoi(cmds.Position) if err != nil { fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position) os.Exit(1) } PartitionPattern(topPattern, star, rdr, func(rec int, str string) { if rec == number { qry = str idx = rec } }) } if qry == "" { return } // clear position on top node to prevent condition test failure cmds.Position = 
"" // process single selected record res := ProcessQuery(qry[:], parent, idx, cmds, tbls) if res != "" { fmt.Printf("%s\n", res) } return } // LAUNCH PRODUCER AND CONSUMER SERVERS // launch producer goroutine to partition XML by pattern xmlq := CreateProducer(topPattern, star, rdr, tbls) if xmlq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create producer\n") os.Exit(1) } // launch consumer goroutines to parse and explore partitioned XML objects tblq := CreateConsumers(cmds, tbls, parent, numServers, xmlq) if tblq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumers\n") os.Exit(1) } // PERFORMANCE SUMMARY if dbug { // drain results, but suppress extraction output for ext := range tblq { byteCount += len(ext.Text) recordCount++ runtime.Gosched() } // force garbage collection, return memory to operating system debug.FreeOSMemory() // print processing parameters as XML object stopTime := time.Now() duration := stopTime.Sub(startTime) seconds := float64(duration.Nanoseconds()) / 1e9 // Threads is a more easily explained concept than GOMAXPROCS fmt.Printf("\n") fmt.Printf(" %d\n", numProcs) fmt.Printf(" %d\n", numServers) fmt.Printf(" \n", seconds) if seconds >= 0.001 && recordCount > 0 { rate := int(float64(recordCount) / seconds) fmt.Printf(" %d\n", rate) } fmt.Printf("\n") return } // DRAIN OUTPUT CHANNEL TO EXECUTE EXTRACTION COMMANDS, RESTORE OUTPUT ORDER WITH HEAP // initialize empty heap hp := &ExtractHeap{} heap.Init(hp) // index of next desired result next := 1 delay := 0 var buffer bytes.Buffer count := 0 okay := false if head != "" { buffer.WriteString(head[:]) buffer.WriteString("\n") } // printResult prints output for current pattern, handles -empty and -index flags, and periodically flushes buffer printResult := func(curr Extract) { str := curr.Text if mpty { if str == "" { okay = true idx := curr.Index val := strconv.Itoa(idx) buffer.WriteString(val[:]) buffer.WriteString("\n") count++ } } else if str != "" { okay = true if indx { 
idx := curr.Index val := strconv.Itoa(idx) buffer.WriteString(val[:]) buffer.WriteString("\t") } // save output to byte buffer buffer.WriteString(str[:]) count++ } if count > 1000 { count = 0 txt := buffer.String() if txt != "" { // print current buffer os.Stdout.WriteString(txt[:]) } buffer.Reset() } } for ext := range tblq { // push result onto heap heap.Push(hp, ext) // read several values before checking to see if next record to print has been processed if delay < heapSize { delay++ continue } delay = 0 for hp.Len() > 0 { // remove lowest item from heap, use interface type assertion curr := heap.Pop(hp).(Extract) if curr.Index == next { // if this is the desired item, send to output printResult(curr) recordCount++ // increment index next++ // and keep checking heap to see if next result is already available } else { // otherwise push back onto heap heap.Push(hp, curr) // and go back to waiting on input channel break } } } // send remainder of heap to output for hp.Len() > 0 { curr := heap.Pop(hp).(Extract) printResult(curr) recordCount++ } if tail != "" { buffer.WriteString(tail[:]) buffer.WriteString("\n") } // do not print head or tail if no extraction output if okay { txt := buffer.String() if txt != "" { // print final buffer os.Stdout.WriteString(txt[:]) } } buffer.Reset() // force garbage collection and return memory before calculating processing rate debug.FreeOSMemory() if timr { printDuration("records") } }