#!/usr/bin/python
"""
POST-GARLI CIPRES SCRIPT
~~~~~~~ beta v1.0 ~~~~~~

To be used when Garli has elected multiple best trees (from being run on
parallel cores), and multiple .best.tre files exist in the current
directory. This script collects the Garli score from each of these best
tree files and ranks the trees by score from highest to lowest. This
allows the user to quickly determine which of the "best trees" is truly
the best tree.

Personal Notes:
    (data file: nexus or relaxed-phylip)
    --> searchreps=N (N=1-1000+)
    --> for each searchrep, Garli elects a best tree; because the
        searchreps run on parallel cores, the trees can't be compared
        until they are dumped into a directory
    --> You get thousands of these: prefix.best.tre
        - inside each file there is a likelihood value
        - the user has to grep across all trees to find this value
    --> WANT: a script that takes all likelihood values and pools them
        into a single file that ranks them from most to least likelihood

Execute this file inside the directory containing all of your
prefix.best.tre files.
"""

import os
import re

# Name of the ranking file written into the current directory.
OUTPUT_FILENAME = 'garli_scores.txt'

# Matches "GarliScore <number>" and captures the number. The sign and the
# fractional part are both optional, so unsigned and integer scores match too.
GARLI_SCORE_RE = re.compile(r'GarliScore\s+(-?\d+(?:\.\d+)?)')


def collect_score(regex_score, line):
    """
    Collects the GarliScore from the given line using a regex.

    Parameters:
        regex_score - compiled regex whose first group captures the score
        line - line to collect the score from
    Returns:
        The collected GarliScore as a float, or a score of 0 if none is found
    """
    match = regex_score.search(line)
    if match:
        return float(match.group(1))
    # 0 is the "no score found" sentinel that callers test for
    return 0


def read_score(filename, regex_score=GARLI_SCORE_RE):
    """
    Scans one Garli tree file for its GarliScore.

    Every file is searched on its own rather than assuming the score sits on
    the same line number in all files, so files with a different number of
    header lines are still handled.

    Parameters:
        filename - path of the tree file to read
        regex_score - compiled regex whose first group captures the score
    Returns:
        The first GarliScore found in the file, or 0 if the file contains
        none (including when the file is empty)
    """
    # plain 'r' mode: the old 'rU' mode is rejected by Python 3.11+
    with open(filename, 'r') as fin:
        for line in fin:
            score = collect_score(regex_score, line.rstrip())
            if score != 0:
                return score
    return 0


def make_out_file(score_dict):
    """
    Creates the output file containing scores and their corresponding
    filenames in descending order of score.

    Parameters:
        score_dict - dictionary keyed by score; each value is either a
                     single filename or a list/tuple of filenames that
                     share that score
    Returns:
        nothing
    """
    # the with-block guarantees the file is flushed and closed
    with open(OUTPUT_FILENAME, 'w') as output_file:
        output_file.write('### RANK OF GARLI SCORES (highest to lowest) ### \n'
                          'Score:\tfilename \n')

        # sort scores in descending order, write to output file
        for gs in sorted(score_dict, reverse=True):
            filenames = score_dict[gs]
            # accept a bare filename as well as a collection of tied files
            if not isinstance(filenames, (list, tuple)):
                filenames = [filenames]
            for filename in filenames:
                output_file.write('%s: %s \n' % (gs, filename))


def main():
    """
    Ranks the Garli scores of all *.best.tre files in the current directory.

    Writes garli_scores.txt only when a score was collected from every
    file; otherwise prints the list of files no score could be read from.
    """
    print('\n############### Ranking Garli Scores ############### \n')

    # scores mapped to the list of files that produced them, so that files
    # with identical scores are all kept instead of overwriting each other
    score_dict = {}

    # Garli output files from which no score could be retrieved
    problem_files = []

    # loop through all *.best.tre files in current directory
    print('Processing files... \n')
    files = sorted(f for f in os.listdir('.') if f.endswith('.best.tre'))

    if not files:
        print('No *.best.tre files were found in the current directory.\n')
        return

    for filename in files:
        score = read_score(filename)

        # files for which score retrieval was unsuccessful are recorded as
        # problematic; all others are added to the ranking
        if score == 0:
            problem_files.append(filename)
        else:
            score_dict.setdefault(score, []).append(filename)

    # report problem files (if any), otherwise write the ranking
    if problem_files:
        print('Errors were encountered when processing the\n' +
              'following files: \n')
        print(problem_files)
    else:
        # add scores with corresponding filename to garli_scores.txt
        make_out_file(score_dict)
        print('All scores were collected successfully, please see\n' +
              'garli_scores.txt for ranked Garli scores.\n')


if __name__ == '__main__':
    main()