#!/usr/bin/env python
"""
Executable for producing properly randomized bunches from reseeded files

To check the output, use:

for index in `seq 0 1 n_bunches`; do
    head -$(($index * 16)) test_file_2.txt | tail -16 | \
        awk -F'[" "_]' '{printf $5" "$11"\n"}' | sort | uniq -d
done
"""
from argparse import ArgumentParser
from collections import OrderedDict
import random
import os


def main():
    """ Function that does the file randomization """
    parser = ArgumentParser(prog="./bunchrandomizer.py",
                            description="Script to properly randomize a list"
                                        " of files, accounting for version"
                                        " number. It randomizes the input"
                                        " list such that no two versions of"
                                        " the same file are in the same"
                                        " bunch. The output lists the files"
                                        " randomized by bunch, where every N"
                                        " lines is one bunch, N set by the"
                                        " option (-b).")
    parser.add_argument("in_files", metavar="", type=str,
                        help="The input list of files to randomize."
                             " This should have the path to one input file"
                             " per line.")
    parser.add_argument("-o", "--output", default="randomized_bunches.txt",
                        dest="outfile", type=str,
                        help="The path to the output file list.")
    parser.add_argument("-u", "--unused", default=None,
                        dest="unused_outfile", type=str,
                        help="The path to which any unused runs are written.")
    parser.add_argument("-b", "--bunch", dest="files_per_bunch", default=25,
                        type=int,
                        help="Define the number of files needed in each bunch.")
    parser.add_argument("-t", "--target", dest="target_n_bunches",
                        default=None, type=int,
                        help="Define the target number of bunches.")
    parser.add_argument("-r", "--resample", dest="resample", default=1,
                        type=int,
                        help="Number of times one file can be used.")
    parser.add_argument("-s", "--seed", dest="seed", default=None, type=int,
                        help="Set the random number seed.")

    # Get arguments
    args = parser.parse_args()

    # Set the random number seed if it exists
    if args.seed is not None:
        random.seed(args.seed)

    # Build a dictionary of all the runs in the file, grouping the versions
    # of each file under its run number
    dict_of_runs = OrderedDict()
    n_in_files = 0

    # Allow for resampling
    with open(args.in_files) as all_in_files:
        for this_file in all_in_files.readlines():
            # Remove the newline character
            in_file = this_file.strip("\n")
            # Get the run number
            this_run = in_file.split("/")[-1].split("_")[3]
            # If we expect multiple lists from concatenated files, add them in
            # here
            this_list = in_file.split(" ")[-1]
            if this_list != in_file:
                this_run = this_run + "_" + str(this_list)
            # Otherwise just add the file name to the existing run number
            # dictionary entry
            for _ in range(args.resample):
                dict_of_runs[this_run] = \
                    dict_of_runs.get(this_run, list()) + [in_file]
                n_in_files += 1

    # Count the total number of files over all runs
    total_files = 0
    n_in_runs = len(dict_of_runs.keys())
    for list_of_versions in dict_of_runs.values():
        total_files += len(list_of_versions)

    # Define which runs we are going to use
    dict_of_unused_runs = OrderedDict()
    if args.target_n_bunches is not None:
        target_files = args.files_per_bunch * args.target_n_bunches
        files_to_remove = total_files - target_files
        # If this is more bunches than we have
        if files_to_remove < 0:
            print("Requested {} bunches, ".format(args.target_n_bunches) +
                  "need {} files, ".format(target_files) +
                  "only have {} files.\n".format(total_files) +
                  "Will create maximum number of bunches possible.")
        # Otherwise keep popping lists from the dict until we have the right
        # number of files
        else:
            while total_files > target_files + args.files_per_bunch:
                unused_run, unused_files = dict_of_runs.popitem()
                dict_of_unused_runs[unused_run] = unused_files
                total_files -= len(unused_files)

    # Initialize the bunch counter
    n_bunches = 0

    # Build as many bunches as possible
    available_runs = list(dict_of_runs.keys())
    all_bunch_lists = list()
    while len(available_runs) >= args.files_per_bunch:
        # Ensure we don't get trapped in a loop
        this_bunch_list = list()

        # Build one bunch
        while len(this_bunch_list) < args.files_per_bunch:
            # Store the number of runs left
            num_runs_left = len(available_runs)
            # Get a random run and remove it from the available runs
            rand_run = available_runs.pop(random.randrange(0, num_runs_left, 1))
            rand_file_list = dict_of_runs[rand_run]
            assert len(rand_file_list) != 0, "{} {}".format(rand_run,
                                                            rand_file_list)
            # Get a random file
            rand_file = rand_file_list.pop(
                random.randrange(0, len(rand_file_list), 1))
            # If the used run has no files left, remove it from the dict
            if len(rand_file_list) == 0:
                dict_of_runs.pop(rand_run)
            # Append the random file
            this_bunch_list.append(rand_file)

        # Append the list of files in this bunch to the list of all bunches
        all_bunch_lists.append(this_bunch_list)
        n_bunches += 1

        # If we've sampled all runs, reset the available run list
        if len(available_runs) < args.files_per_bunch:
            available_runs = list(dict_of_runs.keys())

    # Print some information
    print("All input files : {}".format(n_in_files))
    print("All runs : {}".format(n_in_runs))
    print("Maximum bunches : {}".format(n_in_files // args.files_per_bunch))
    print("Generated bunches : {}".format(n_bunches))

    # Write the mixture to the correct output
    with open(args.outfile, mode='w+') as random_bunch_out:
        for bunch in all_bunch_lists:
            for bunch_file in bunch:
                random_bunch_out.write(bunch_file + "\n")

    # Write all unused runs/files to the unused file list
    if args.unused_outfile is None:
        in_files_name, in_files_extension = os.path.splitext(args.in_files)
        args.unused_outfile = in_files_name + "_unused" + in_files_extension

    # If we excluded some runs, let us know
    if dict_of_unused_runs:
        left_over_runs = len(dict_of_unused_runs.keys())
        left_over_files = 0
        with open(args.unused_outfile, mode='w+') as unused_file_out:
            for run in dict_of_unused_runs.values():
                left_over_files += len(run)
                for run_file in run:
                    unused_file_out.write(run_file + "\n")
        print("Left over runs/files/bunches : {}/{}/{}".format(
            left_over_runs, left_over_files,
            left_over_files // args.files_per_bunch))


if __name__ == '__main__':
    main()
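# A minimal usage sketch (the list name and option values below are
# hypothetical, not taken from this repository): the input list holds one
# file path per line, and the run number is read from the fourth
# underscore-separated field of each file's basename.
#
#   ./bunchrandomizer.py input_file_list.txt -b 16 -t 100 -s 1234 \
#       -o randomized_bunches.txt -u unused_runs.txt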