# # enaGroupGet.py # # # Copyright 2017 EMBL-EBI, Hinxton outstation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import argparse import os import sys import sequenceGet import assemblyGet import readGet import utils import traceback def set_parser(): parser = argparse.ArgumentParser(prog='enaGroupGet', description = 'Download data for a given study or sample, or (for sequence and assembly) taxon') parser.add_argument('accession', help='Study or sample accession or NCBI tax ID to fetch data for') parser.add_argument('-g', '--group', default='read', choices=['sequence', 'wgs', 'assembly', 'read', 'analysis'], help='Data group to be downloaded for this study/sample/taxon (default is read)') parser.add_argument('-f', '--format', default=None, choices=['embl', 'fasta', 'submitted', 'fastq', 'sra'], help="""File format required. Format requested must be permitted for data group selected. sequence, assembly and wgs groups: embl and fasta formats. read group: submitted, fastq and sra formats. analysis group: submitted only.""") parser.add_argument('-d', '--dest', default='.', help='Destination directory (default is current running directory)') parser.add_argument('-w', '--wgs', action='store_true', help='Download WGS set for each assembly if available (default is false)') parser.add_argument('-e', '--extract-wgs', action='store_true', help='Extract WGS scaffolds for each assembly if available (default is false)') parser.add_argument('-exp', '--expanded', action='store_true', help='Expand CON scaffolds when downloading embl format (default is false)') parser.add_argument('-m', '--meta', action='store_true', help='Download read or analysis XML in addition to data files (default is false)') parser.add_argument('-i', '--index', action='store_true', help="""Download CRAM index files with submitted CRAM files, if any (default is false). This flag is ignored for fastq and sra format options. """) parser.add_argument('-a', '--aspera', action='store_true', help='Use the aspera command line client to download, instead of FTP.') parser.add_argument('-as', '--aspera-settings', default=None, help="""Use the provided settings file, will otherwise check for environment variable or default settings file location.""") parser.add_argument('-t', '--subtree', action='store_true', help='Include subordinate taxa (taxon subtree) when querying with NCBI tax ID (default is false)') parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.5.3') return parser def download_report(group, result, accession, temp_file, subtree): search_url = utils.get_group_search_query(group, result, accession, subtree) response = utils.get_report_from_portal(search_url) f = open(temp_file, 'wb') for line in response: f.write(line) f.flush() f.close() def download_data(group, data_accession, output_format, group_dir, fetch_wgs, extract_wgs, expanded, fetch_meta, fetch_index, aspera): if group == utils.WGS: print ('Fetching ' + data_accession[:6]) sequenceGet.download_wgs(group_dir, data_accession[:6], output_format) else: print ('Fetching ' + data_accession) if group == utils.ASSEMBLY: assemblyGet.download_assembly(group_dir, data_accession, output_format, fetch_wgs, extract_wgs, expanded, True) elif group in [utils.READ, utils.ANALYSIS]: readGet.download_files(data_accession, output_format, group_dir, fetch_index, fetch_meta, aspera) def download_data_group(group, accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded): temp_file_path = os.path.join(group_dir, accession + '_temp.txt') download_report(group, utils.get_group_result(group), accession, temp_file_path, subtree) header = True with open(temp_file_path) as f: for line in f: if header: header = False continue data_accession = line.strip() download_data(group, data_accession, output_format, group_dir, fetch_wgs, extract_wgs, expanded, fetch_meta, fetch_index, aspera) os.remove(temp_file_path) def download_sequence_result(dest_file, group_dir, result, accession, subtree, update_accs, expanded): temp_file_path = os.path.join(group_dir, 'temp.txt') download_report(utils.SEQUENCE, result, accession, temp_file_path, subtree) header = True with open(temp_file_path) as f: for line in f: if header: header = False continue data_accession = line.strip() write_record = False if result == utils.SEQUENCE_UPDATE_RESULT: update_accs.append(data_accession) write_record = True elif result == utils.SEQUENCE_RELEASE_RESULT: if data_accession not in update_accs: write_record = True if write_record: sequenceGet.write_record(dest_file, data_accession, output_format) dest_file.flush() os.remove(temp_file_path) return update_accs def download_sequence_group(accession, output_format, group_dir, subtree, expanded): print ('Downloading sequences') update_accs = [] dest_file_path = os.path.join(group_dir, utils.get_filename(accession + '_sequences', output_format)) dest_file = open(dest_file_path, 'wb') #sequence update update_accs = download_sequence_result(dest_file, group_dir, utils.SEQUENCE_UPDATE_RESULT, accession, subtree, update_accs, expanded) #sequence release update_accs = download_sequence_result(dest_file, group_dir, utils.SEQUENCE_RELEASE_RESULT, accession, subtree, update_accs, expanded) dest_file.close() def download_group(accession, group, output_format, dest_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded): group_dir = os.path.join(dest_dir, accession) utils.create_dir(group_dir) if group == utils.SEQUENCE: download_sequence_group(accession, output_format, group_dir, subtree, expanded) else: download_data_group(group, accession, output_format, group_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded) if __name__ == '__main__': parser = set_parser() args = parser.parse_args() accession = args.accession group = args.group output_format = args.format dest_dir = args.dest fetch_wgs = args.wgs extract_wgs = args.extract_wgs expanded = args.expanded fetch_meta = args.meta fetch_index = args.index aspera = args.aspera aspera_settings = args.aspera_settings subtree = args.subtree if aspera or aspera_settings is not None: aspera = utils.set_aspera(aspera_settings) if not utils.is_available(accession): sys.stderr.write('ERROR: Study/sample does not exist or is not available for accession provided.\n') sys.stderr.write('If you believe that it should be, please contact datasubs@ebi.ac.uk for assistance.\n') sys.exit(1) if not utils.is_study(accession) and not utils.is_sample(accession) and not utils.is_taxid(accession): sys.stderr.write( 'ERROR: Invalid accession. Only sample and study/project accessions or NCBI tax ID supported\n' ) sys.exit(1) if output_format is None: if group in (utils.READ, utils.ANALYSIS): output_format = utils.SUBMITTED_FORMAT else: output_format = utils.EMBL_FORMAT elif not utils.group_format_allowed(group, output_format, aspera): sys.stderr.write('ERROR: Illegal group and format combination provided. Allowed:\n') sys.stderr.write('sequence, assembly and wgs groups: embl and fasta formats\n') sys.stderr.write('read group: submitted, fastq and sra formats\n') sys.stderr.write('analysis group: submitted format only\n') sys.exit(1) try: # disable read and analysis retrieval for taxon until added in size calculation and user response if utils.is_taxid(accession) and group in ['read', 'analysis']: print('Sorry, tax ID retrieval not yet supported for read and analysis') sys.exit(1) download_group(accession, group, output_format, dest_dir, fetch_wgs, extract_wgs, fetch_meta, fetch_index, aspera, subtree, expanded) print ('Completed') except Exception: traceback.print_exc() utils.print_error() sys.exit(1)