#!/usr/bin/python ##!/opt/python/bin/python import sys import ab_out # process a blast file for indels and generate an output file from it # default data file: "/projects/stsi2/acarson/indels/Abyss/indels/BLAT/indels.blast" #in_file = "indels-test.blast" in_file = "indels.blast" #out_file = "ab-ind-test.txt" out_file = "ab-ind.txt" q_group_count = 0 q_group = [] try: fi = open(in_file) except IOError: print "%s%s" % ("Error: can\'t open input file ", in_file) sys.exit("\nInput file open error\n\n") l = fi.readline() fo = ab_out.open_outfile(out_file, in_file) # collect all the lines up to the next "BLASTN" in a list named q_group # process that group, then collect the lines for the next group while len(l) > 0: # collect all the lines from a query group # the group ends with the string 'Database:' indented by 2 spaces while not l.startswith(' Database:') and len(l) > 0: q_group.append(l) l = fi.readline() # at this point we have a complete query group q_group_count = q_group_count + 1 print "%s%s" % ("Query group #:", q_group_count) next_contig = False qname = "" score_set = False ident_set = False strand_set = False query_in_progress = False sbjct_in_progress = False q_line = "" s_line = "" segm= "" q_start = 0 q_end = 0 s_start = 0 s_end = 0 qry = [] sbj = [] final_sbjct_string = "" final_query_string = "" out_line = "" # there are three conditions where we will end processing and write this out # (in addition to the query_in_progress being True): # 1. "Score = " # 2. ">" # 3. " Database " # This condition must be checked before the others because we want to output # as soon as we have the query data collected for line in q_group: # print "%s%s" % ("line:", line) if line.strip() == "": continue if query_in_progress and (line.lstrip().startswith("Score = ") or line.lstrip().startswith(">") or line.lstrip().startswith("Database: ")): ab_out.write_outline(qname, chr, q_start, q_end, s_start, s_end, final_query_string, final_sbjct_string, fo) next_contig = True query_in_progress = False sbjct_in_progress = False print break if line.lstrip().startswith("Query: ") and not query_in_progress: # start a new query line with current line # set initial start and end values # set query in progress qry = line.split(":") segm = qry[1] q_line = segm.split() q_start = q_line[0] q_end = q_line[2] final_query_string = q_line[1] query_in_progress = True continue if line.lstrip().startswith("Sbjct: ") and not sbjct_in_progress: # start a new sbjct line with current line # set initial start and end values # set query in progress sbj = line.split(":") segm = sbj[1] s_line = segm.split() s_start = s_line[0] s_end = s_line[2] final_sbjct_string = s_line[1] sbjct_in_progress = True continue if line.lstrip().startswith("Query: ") and query_in_progress: # append to current query line qry = line.split(":") segm = qry[1] q_line = segm.split() final_query_string = final_query_string + q_line[1] # update end value q_end = q_line[2] continue if line.lstrip().startswith("Sbjct: ") and sbjct_in_progress: # append to current sbjct line sbj = line.split(":") segm = sbj[1] s_line = segm.split() final_sbjct_string = final_sbjct_string + s_line[1] # update end value s_end = s_line[2] continue elif line.startswith("Query="): qvars = line.split("=") qname = qvars[1] print "%s%s" % ("Contig name:", qname.rstrip()) continue elif line.lstrip().startswith("Length = "): leng = line.split("=") seq_len = leng[1] # print "%s%s" % ("Sequence length:", seq_len.rstrip()) continue elif line.startswith(">"): chrs = line.split(">") chr = chrs[1].rstrip() # print "%s%s" % ("Best chromosome hit: ", chr) continue elif line.lstrip().startswith("Identities = ") and not ident_set: ids = line.split("=") idents = ids[1] # print "%s%s" % ("Identities:", idents.rstrip()) ident_set = True continue elif line.lstrip().startswith("Score = ") and not score_set: score_tokens = line.split(",") scr = score_tokens[0] score = scr.split('=') # print "%s%s" % ("Score: ", score[1].strip()) expct = score_tokens[1] expect = expct.split('=') # print "%s%s" % ("Expect: ", expect[1].strip()) score_set = True continue elif line.lstrip().startswith("Strand = ") and not strand_set: strnd = line.split("=") strand = strnd[1] # print "%s%s" % ("Strand:", strand.rstrip()) strand_set = True continue else: continue if next_contig: break # If next_contig is true, we are done processing the query; this value will be # set after the last line is processed for this group, which will be after the # last set of Sbjct and Query data lines have been collected for the best hit. # We have to cleanup here for the case when there is only a single query # since this case is not handled inside the main loop if query_in_progress: ab_out.write_outline(qname, chr, q_start, q_end, s_start, s_end, final_query_string, final_sbjct_string, fo) # after processing and outputting the group, begin collecting the next one l = fi.readline() q_group = [] sys.exit("Processing complete\n\n") """ try: fo = open('ab-ind.txt', 'w') except IOError: print "%s%s" % ("Error: can\'t open output file ", out_file) sys.exit("Output file open error") else: print "Opened the output file successfully" try: fo.write("##ABySS Insertion-Deletion Identification file\n") fo.write("##Input file: " + in_file + "\n\n") except IOError: print "%s%s" % ("Error: can\'t write to output file ", out_file) sys.exit("Output file write error") else: print "Wrote content in the output file successfully" """