#!/usr/bin/env python """ """ import sys import os import re import string import subprocess import tempfile import getopt import fastaExtract from datetime import datetime import time class DataException(Exception) : def __init__(self, msg): self.message = msg def __str__(self): return self.message def readAtacHeader(file): """ Seek to beginning of file, skip lines that don't start with "/". Create dictionary of name/val pairs from lines starting with "/". Return dictionary and leave file positioned to read first data row. """ retval = {} startOfLine = 0 file.seek(0) while True: startOfLine = file.tell() line = file.readline() if not line: break line = line.strip() if line[0] != "/": if len(retval) == 0: # Skip the comments at head of file continue else: # Done with header, put the line back so we're positioned at start of first data row. file.seek(startOfLine) break # Store the header rows pair = line.split("=") key = pair[0][1:] value = pair[1] retval[key] = value return retval class AtacRow: def __init__( self, row): global refFile, contigFile self.fields = row.split() self.row = row self.rowClass = self.fields[0] self.matchType = self.fields[1] self.matchIndex = self.fields[2] self.parentIndex = self.fields[3] self.contigID = self.fields[4] self.cMatchStart = int(self.fields[5]) self.cMatchLen = int(self.fields[6]) self.cDir = int(self.fields[7]) self.refID = self.fields[8] self.rMatchStart = int(self.fields[9]) self.rMatchLen = int(self.fields[10]) self.rDir = int(self.fields[11]) self.cMatchEnd = self.cMatchStart + self.cMatchLen - 1 self.rMatchEnd = self.rMatchStart + self.rMatchLen - 1 self.cMatchSeq = contigFile.getSequence(self.contigID, self.cMatchStart, self.cMatchLen) self.rMatchSeq = refFile.getSequence(self.refID, self.rMatchStart, self.rMatchLen) if self.rDir == -1: self.rMatchSeq = fastaExtract.reverse_complement(self.rMatchSeq) def __str__(self): return self.row class AtacPair: def __init__( self, row1, row2): global refFile, contigFile if (row1.rDir != row2.rDir): raise DataException("%s and %s have different reference directions" % (row1, row2)); if (row1.cDir != row2.cDir) or (row2.cDir != 1): raise DataException("%s and %s have unexpected contig directions" % (row1, row2)); if (row1.refID != row2.refID): raise DataException("%s and %s have different refIDs" % (row1, row2)); self.row1 = row1 self.row2 = row2 self.cGap = (row2.cMatchStart - row1.cMatchEnd) - 1 if (row1.rDir == 1): self.rGap = (row2.rMatchStart - row1.rMatchEnd) - 1 else: self.rGap = (row1.rMatchStart - row2.rMatchEnd) - 1 if (self.cGap < 0): raise DataException("cGap is negative") if (self.rGap < 0): raise DataException("rGap is negative") if (self.cGap > 1000): raise DataException("cGap is %d" % self.cGap) if (self.rGap > 1000): raise DataException("rGap is %d" % self.rGap) self.cGapStart = row1.cMatchEnd + 1 self.cGapEnd = row2.cMatchStart - 1 self.cEndLowerMatch = row1.cMatchEnd self.cStartUpperMatch = row2.cMatchStart self.cGapSeq = contigFile.getSequence(row1.contigID, self.cGapStart, self.cGap) if (row1.rDir == 1): self.rGapStart = row1.rMatchEnd + 1 self.rGapEnd = row2.rMatchStart - 1 self.rEndLowerMatch = row1.rMatchEnd self.rStartUpperMatch = row2.rMatchStart self.rGapSeq = refFile.getSequence(row1.refID, self.rGapStart, self.rGap) else: self.rGapStart = row2.rMatchEnd + 1 self.rGapEnd = row1.rMatchStart - 1 self.rEndLowerMatch = row2.rMatchEnd self.rStartUpperMatch = row1.rMatchStart self.rGapSeq = fastaExtract.reverse_complement(refFile.getSequence(row1.refID, self.rGapStart, self.rGap)) # If we correct a BRI or BRD case we store the new pair here. self.correctedPair = None # If True, indicates that this pair is not original; it's a correction. self.isCorrected = False """ Starting at current offset in the file, read groups of rows that have the same contig ID and that are of type "M u". Leave the file position ready to read the first row with a different contig ID. Returns a list containing one or more atac rows. Returns an empty list on EOF. """ def getFromAtac(atacFile): atacRows = [] contig = prevContig = None while True: pos = atacFile.tell() line = atacFile.readline() # eof if not line: break line = line.strip() row = line.split() # skip lines that aren't the right format if not ((len(row) > 4) and (row[0] == 'M' and row[1] == 'u')): continue contig = row[4] if (prevContig): if prevContig == contig: atacRows.append(AtacRow(line)) else: atacFile.seek(pos) break else: atacRows.append(AtacRow(line)) prevContig = contig if len(atacRows): contig = atacRows[0].contigID return atacRows, contig def printAligned(cStr, rStr, oStr, rFirstPos, rLastPos): print formatString(cStr) aList = [] for i in range(0, len(cStr)): if ((cStr[i] == "-") or (rStr[i] == "-") or (cStr[i] != rStr[i])): aList.append(" ") else: aList.append("|") print formatString("".join(aList)) print formatString(rStr) print "%d -> %d" % (rFirstPos, rLastPos) print print oStr def formatString(str): retval = "" for i in range(0, len(str)): if i and ((i % 10) == 0): retval += " " retval += str[i] return retval def correctBR0(pair): print("Original rgap=%d, original cgap=%d" % (pair.rGap, pair.cGap)) len = pair.rGap mismatches = 0 # Find out how many bps of cgap and rgap are different for i in range(0, len): if pair.rGapSeq[i] != pair.cGapSeq[i]: mismatches += 1 # If there are any mismatches, we can't correct the indel, but if cgap = rgap, we log some info. if not mismatches: percentage = 0 else: percentage = (float(mismatches) / float(pair.rGap)) * 100.0 print "BR0:%s\t%d\t%d\t%.2f" % (pair.row1.contigID, pair.rGap, mismatches, percentage) if not mismatches: return True return False """ atac atac row1 row2 (f1) (f2) contig: 0-----10 g g g 14-------n ref: 0-----10 g g - 14-------n Extend the first match (f1) on both contig and reference. """ def justifyPlusRight(pair, f1, f2, smallerGapLen): # Add to the length of the contig f1[6] = str(int(f1[6]) + smallerGapLen) # Add to the length of the ref f1[10] = str(int(f1[10]) + smallerGapLen) pair.correctedPair = AtacPair(AtacRow("\t".join(f1)), AtacRow("\t".join(f2))) pair.correctedPair.isCorrected = True return "right" """ atac atac row1 row2 (f1) (f2) contig: 0-----10 g g g 14-------n ref: 0-----10 - g g 14-------n Extend the second match (f2) back toward the left in the diagram above, on contig and reference """ def justifyPlusLeft(pair, f1, f2, smallerGapLen): # adust cmatchstart and cmatchlen f2[5] = str(int(f2[5]) - smallerGapLen) f2[6] = str(int(f2[6]) + smallerGapLen) # adjust rmatchstart and rmatchlen f2[9] = str(int(f2[9]) - smallerGapLen) f2[10] = str(int(f2[10]) + smallerGapLen) pair.correctedPair = AtacPair(AtacRow("\t".join(f1)), AtacRow("\t".join(f2))) pair.correctedPair.isCorrected = True return "left" """ atac atac row1 row2 (f1) (f2) contig: 0-----10 g g g 14-------n ref: n-----14 - g g 10-------0 Extend the second match (f2) back toward the left """ def justifyMinusRight(pair, f1, f2, smallerGapLen): # adjust cmatch start and len f2[5] = str(int(f2[5]) - smallerGapLen) f2[6] = str(int(f2[6]) + smallerGapLen) # adjust rmatchLen (e.g. instead of being 11 log, it's now 13 long, in the diagram above). f2[10] = str(int(f2[10]) + smallerGapLen) pair.correctedPair = AtacPair(AtacRow("\t".join(f1)), AtacRow("\t".join(f2))) pair.correctedPair.isCorrected = True return "right" """ atac atac row1 row2 (f1) (f2) contig: 0-----10 g g g 14-------n ref: n-----14 g g - 10-------0 Extend the first match (f1) on both contig and reference. """ def justifyMinusLeft(pair, f1, f2, smallerGapLen): # Add to the length of the contig, on the first match row f1[6] = str(int(f1[6]) + smallerGapLen) # extend the rmatch f1[10] = str(int(f1[10]) + smallerGapLen) f1[9] = str(int(f1[9]) - smallerGapLen) pair.correctedPair = AtacPair(AtacRow("\t".join(f1)), AtacRow("\t".join(f2))) pair.correctedPair.isCorrected = True return "left" """ Right justification means push the hyphen to the right, sequence to the left. In other words, hyphens are pushed toward the higher number positions. When the reference direction is "-", we display the reference sequence from n - 0, but apparently Sanger indels and other programs would display the reference from 0 - n, so they right justify by putting the hyphen toward the higher number. To be consistent with them we need to do the same, even though we show the higher numbered ref positions on the left. """ def findBestAlignment(pair): global preferredJustification smallerGapLen = min(pair.rGap, pair.cGap) prefix_mismatches = suffix_mismatches = 0 for i in range(0, smallerGapLen): if pair.rGapSeq[i] != pair.cGapSeq[i]: prefix_mismatches += 1 for i in range(1, smallerGapLen + 1): if pair.rGapSeq[-i] != pair.cGapSeq[-i]: suffix_mismatches += 1 print("Original rgap=%d, original cgap=%d, prefix_mismatches = %d, suffix_mismatches = %d" % (pair.rGap, pair.cGap, prefix_mismatches, suffix_mismatches)) f1 = list(pair.row1.fields) f2 = list(pair.row2.fields) justify = preferredJustification if (pair.row1.rDir == 1): if (preferredJustification == "right"): if not prefix_mismatches: print "case 1" justify = justifyPlusRight(pair, f1, f2, smallerGapLen) elif not suffix_mismatches: print "case 2" justify = justifyPlusLeft(pair, f1, f2, smallerGapLen) else: if not suffix_mismatches: print "case 3" justify = justifyPlusLeft(pair, f1, f2, smallerGapLen) elif not prefix_mismatches: print "case 4" justify = justifyPlusRight(pair, f1, f2, smallerGapLen) else: if (preferredJustification == "right"): if not suffix_mismatches: print "case 5" justify = justifyMinusRight(pair, f1, f2, smallerGapLen) elif not prefix_mismatches: print "case 6" justify = justifyMinusLeft(pair, f1, f2, smallerGapLen) else: if not prefix_mismatches: print "case 7" justify = justifyMinusLeft(pair, f1, f2, smallerGapLen) elif not suffix_mismatches: print "case 8" justify = justifyMinusRight(pair, f1, f2, smallerGapLen) if pair.cGap > pair.rGap: type = "BRI" cStr = "%s%s%s" % (pair.row1.cMatchSeq, pair.cGapSeq.lower(), pair.row2.cMatchSeq) if (justify == "right" and pair.row1.rDir == 1) or (justify == "left" and pair.row1.rDir == -1): rStr = "%s%s%s%s" % ( pair.row1.rMatchSeq, pair.rGapSeq.lower(), "-" * (pair.cGap - pair.rGap), pair.row2.rMatchSeq) else: rStr = "%s%s%s%s" % ( pair.row1.rMatchSeq, "-" * (pair.cGap - pair.rGap), pair.rGapSeq.lower(), pair.row2.rMatchSeq) else: type = "BRD" rStr = "%s%s%s" % (pair.row1.rMatchSeq, pair.rGapSeq.lower(), pair.row2.rMatchSeq) if (justify == "right" and pair.row1.rDir == 1) or (justify == "left" and pair.row1.rDir == -1): cStr = "%s%s%s%s" % ( pair.row1.cMatchSeq, pair.cGapSeq.lower(), "-" * (pair.rGap - pair.cGap), pair.row2.cMatchSeq) else: cStr = "%s%s%s%s" % ( pair.row1.cMatchSeq, "-" * (pair.rGap - pair.cGap), pair.cGapSeq.lower(), pair.row2.cMatchSeq) oStr = "B\t%s\t%d\t%s\t%d\t%d\t%s" % ( pair.row1.refID, pair.rGapStart, pair.row1.rDir, pair.rGapStart, pair.rGapEnd + 1, pair.row1.rDir) gffStr = "%s\t%s\t%s\t%d\t%d\t%d:%d\t%s\t%s:%d-%d\t%s/%s" % ( pair.row1.refID, ".", type, pair.rEndLowerMatch + 2, pair.rStartUpperMatch, pair.cGap, pair.rGap, ("+", "-")[pair.row1.rDir == -1], pair.row1.contigID, pair.cEndLowerMatch + 2, pair.cStartUpperMatch, (pair.cGapSeq, "-")[len(pair.cGapSeq) == 0], (pair.rGapSeq, "-")[len(pair.rGapSeq) == 0]) return cStr, rStr, oStr, gffStr def processPair(pair): if (not pair.cGap and not pair.rGap): raise DataException("No indel found, cGap=0 and rGap=0") if (pair.cGap == pair.rGap): cStr, rStr, oStr, gffStr = handleBR0(pair) if (correctBR0(pair)): raise DataException("No indel found, cGap=0 and rGap=0") elif (pair.rGap != 0 and pair.cGap != 0): cStr, rStr, oStr, gffStr = handleBRIndel(pair) elif (pair.cGap != 0): cStr, rStr, oStr, gffStr = handleINS(pair) else: cStr, rStr, oStr, gffStr = handleDEL(pair) if (pair.row1.rDir == 1): printAligned(cStr, rStr, oStr, pair.row1.rMatchStart, pair.row2.rMatchEnd) else: printAligned(cStr, rStr, oStr, pair.row1.rMatchEnd, pair.row2.rMatchStart) print return pair.correctedPair, oStr, gffStr def handleBR0(pair): global bFlag if not bFlag: raise DataException("cgap=%d and rgap=%d" % (pair.cGap, pair.rGap)) rStr = "%s%s%s" % (pair.row1.rMatchSeq, pair.rGapSeq.lower(), pair.row2.rMatchSeq) cStr = "%s%s%s" % (pair.row1.cMatchSeq, pair.cGapSeq.lower(), pair.row2.cMatchSeq) oStr = "B\t%s\t%d\t%s\t%d\t%d\t%s" % ( pair.row1.refID, pair.rGapStart, pair.row1.rDir, pair.rGapStart, pair.rGapEnd + 1, pair.row1.rDir) type = "BR0" gffStr = "%s\t%s\t%s\t%d\t%d\t%d:%d\t%s\t%s:%d-%d\t%s/%s" % ( pair.row1.refID, ".", type, pair.rEndLowerMatch + 2, pair.rStartUpperMatch, pair.cGap, pair.rGap, ("+", "-")[pair.row1.rDir == -1], pair.row1.contigID, pair.cEndLowerMatch + 2, pair.cStartUpperMatch, (pair.cGapSeq, "-")[len(pair.cGapSeq) == 0], (pair.rGapSeq, "-")[len(pair.rGapSeq) == 0]) return cStr, rStr, oStr, gffStr def handleBRIndel(pair): global bFlag if not bFlag: raise DataException("cgap=%d and rgap=%d" % (pair.cGap, pair.rGap)) return findBestAlignment(pair) def handleINS(pair): cStr = "%s%s%s" % (pair.row1.cMatchSeq, pair.cGapSeq, pair.row2.cMatchSeq) rStr = "%s%s%s" % (pair.row1.rMatchSeq, "-" * pair.cGap, pair.row2.rMatchSeq) oStr = "I\t%s\t%d\t%s" % (pair.row1.refID, pair.rGapStart, pair.row1.rDir) type = ("INS", "INSC")[pair.isCorrected] gffStr = "%s\t%s\t%s\t%d\t%d\t%d:%d\t%s\t%s:%d-%d\t%s/%s" % ( pair.row1.refID, ".", type, pair.rEndLowerMatch + 1, pair.rStartUpperMatch + 1, pair.cGap, pair.rGap, ("+", "-")[pair.row1.rDir == -1], pair.row1.contigID, pair.cEndLowerMatch + 2, pair.cStartUpperMatch, (pair.cGapSeq, "-")[len(pair.cGapSeq) == 0], (pair.rGapSeq, "-")[len(pair.rGapSeq) == 0]) return cStr, rStr, oStr, gffStr def handleDEL(pair): cStr = "%s%s%s" % (pair.row1.cMatchSeq, "-" * pair.rGap, pair.row2.cMatchSeq) rStr = "%s%s%s" % (pair.row1.rMatchSeq, pair.rGapSeq, pair.row2.rMatchSeq) oStr = "D\t%s\t%d\t%d\t%s" % (pair.row1.refID, pair.rGapStart, pair.rGapEnd + 1, pair.row1.rDir) type = ("DEL", "DELC")[pair.isCorrected] gffStr = "%s\t%s\t%s\t%d\t%d\t%d:%d\t%s\t%s:%d-%d\t%s/%s" % ( pair.row1.refID, ".", type, pair.rEndLowerMatch + 2, pair.rStartUpperMatch, pair.cGap, pair.rGap, ("+", "-")[pair.row1.rDir == -1], pair.row1.contigID, pair.cEndLowerMatch + 1, pair.cStartUpperMatch + 1, (pair.cGapSeq, "-")[len(pair.cGapSeq) == 0], (pair.rGapSeq, "-")[len(pair.rGapSeq) == 0]) return cStr, rStr, oStr, gffStr def leftJustifyIndel(pair): if (pair.rGap != 0 and pair.cGap != 0): print "Not left justifying BR rows" return None shifted = False corrected = pair.isCorrected f1 = list(pair.row1.fields) f2 = list(pair.row2.fields) if (pair.row1.rDir == 1): if (pair.cGap != 0): bp = pair.cGapSeq[-1] else: bp = pair.rGapSeq[-1] # Shift indel left, making first match shorter and second match longer while bp == pair.row1.rMatchSeq[-1]: if pair.row1.cMatchLen == 1 or pair.row1.rMatchLen == 1: print "leftJustifyIndel at end of atac match" break f1[6] = str(int(f1[6]) - 1) f1[10] = str(int(f1[10]) - 1) f2[5] = str(int(f2[5]) - 1) f2[9] = str(int(f2[9]) - 1) f2[6] = str(int(f2[6]) + 1) f2[10] = str(int(f2[10]) + 1) shifted = True pair = AtacPair(AtacRow("\t".join(f1)), AtacRow("\t".join(f2))) if (pair.cGap != 0): bp = pair.cGapSeq[-1] else: bp = pair.rGapSeq[-1] else: # Shift indel toward lower number bp on ref to be consistent with "+" match case. # When we show the alignment with reference running from n to 0, this means we're # actually shifting the indel toward the right. Making the first match longer and # and the 2nd shorter. if (pair.cGap != 0): bp = pair.cGapSeq[0] else: bp = pair.rGapSeq[0] while bp == pair.row2.rMatchSeq[0]: print "bp=%s, end of ref match=%s" % (bp, pair.row2.rMatchSeq) if pair.row2.cMatchLen == 1 or pair.row2.rMatchLen == 1: print "leftJustifyIndel at end of atac match" break f1[6] = str(int(f1[6]) + 1) f1[9] = str(int(f1[9]) - 1) f1[10] = str(int(f1[10]) + 1) f2[5] = str(int(f2[5]) + 1) f2[6] = str(int(f2[6]) - 1) f2[10] = str(int(f2[10]) - 1) shifted = True pair = AtacPair(AtacRow("\t".join(f1)), AtacRow("\t".join(f2))) if (pair.cGap != 0): bp = pair.cGapSeq[0] else: bp = pair.rGapSeq[0] if shifted: pair.isCorrected = corrected return pair return None # For negative matches, return modified gffline, where inserted/deleted sequence is reverse complemented, # For positive matches, just return the gffLine as is. def reverseRefGff(gffLine): fields = gffLine.split() if (fields[6] == "-"): lastFields = fields[8].split("/") if not lastFields[0].startswith("-"): lastFields[0] = fastaExtract.reverse_complement(lastFields[0]) if not lastFields[1].startswith("-"): lastFields[1] = fastaExtract.reverse_complement(lastFields[1]) fields[8] = "/".join(lastFields) return "\t".join(fields) return gffLine contigFile = refFile = None bFlag = True preferredJustification = "right" def main(argv=None): if argv is None: argv = sys.argv global contigFile, refFile, atacFile, bFlag, preferredJustification contigFilename = refFilename = indelFilename = refDirectory = atacFilename = outFilename = None options, remainder = getopt.getopt(argv[1:], "br:c:i:a:o:j:") for opt, arg in options: if opt in ("-r"): refFilename = arg elif opt in ("-c"): contigFilename = arg elif opt in ("-i"): indelFilename = arg elif opt in ("-a"): atacFilename = arg elif opt in ("-o"): outFilename = arg elif opt in ("-b"): bFlag = False elif opt in ("-j"): preferredJustification = arg print time.asctime(time.localtime(time.time())) print "preferred justification of BRI/BRD cases is %s." % preferredJustification sys.stdout.flush() atacFile = open(atacFilename, "r") atacHeader = readAtacHeader(atacFile) # fd.close() print time.asctime(time.localtime(time.time())) print "Indexing the reference, this will take a few minutes ..." sys.stdout.flush() fastaExtract.FastaExtract.debug = True refFile = fastaExtract.FastaExtract(refFilename) print time.asctime(time.localtime(time.time())) print "Reading contigs" sys.stdout.flush() fastaExtract.FastaExtract.debug = False contigFile = fastaExtract.FastaExtract(contigFilename, fixedLineLength = False) print time.asctime(time.localtime(time.time())) print "Processing indel candidates." sys.stdout.flush() if not outFilename: outFilename = "%s.vs.%s.atac.variants" % (atacHeader["assemblyId1"], atacHeader["assemblyId2"]) outFile = open(outFilename, "w") gffFilename = outFilename + ".gff" gffFile = open(gffFilename, "w") gffFile.write("#contig/chr_assembly2\t.\tvariant_type\tSTART\tEND\tsize(assembly1:assembly2)\t" + "strand\tassembly1_coordinates\tsequence(assembly1:assembly2)\n") gffRFilename = outFilename + ".r.gff" gffRFile = open(gffRFilename, "w") gffRFile.write("#contig/chr_assembly2\t.\tvariant_type\tSTART\tEND\tsize(assembly1:assembly2)\t" + "strand\tassembly1_coordinates\tsequence(assembly1:assembly2)\n") contigID = None # Get groups of rows with same contig IDs from atac output (they are consecutive in the atac.uids file). while True: atacRows, contigID = getFromAtac(atacFile) # Check for eof if not len(atacRows): break if len(atacRows) < 2: # print("\tSkipping %s, only found %d atac rows with this contigID" % (contigID, len(atacRows))) continue print print "PROCESSING CONTIG %s. There are %d atac rows with this contig" % (contigID, len(atacRows)) sys.stdout.flush() for i in range(0, len(atacRows) - 1): row1 = atacRows[i] row2 = atacRows[i + 1] try: print "Showing alignment of atac rows:\n\t%s\n\t%s" % (row1, row2) pair = AtacPair(row1, row2) correctedPair, oStr, gffStr = processPair(pair) if correctedPair: print "Corrected Alignment:" if (correctedPair.rGap != 0 and correctedPair.cGap != 0): raise DataException("Internal Error, still have BR after doing correction." "Corrected rgap=%d, Corrected cgap=%d.\nCorrected row1=%s\nCorrected row2=%s\n" % ( correctedPair.rGap, correctedPair.cGap, correctedPair.row1, correctedPair.row2)) pair = correctedPair correctedPair, oStr, gffStr = processPair(correctedPair) outFile.write(oStr + "\n") outFile.flush() # If we can left justify it, print the new alignment and gffstr. Only justifies pure INS[C]/DEL[C] rows. justifiedPair = leftJustifyIndel(pair) if justifiedPair: print "Left Justified Alignment:" justifiedPair, oStr, justifiedGffStr = processPair(justifiedPair) gffFile.write(justifiedGffStr + "\n") gffRFile.write(reverseRefGff(justifiedGffStr) + "\n") else: gffFile.write(gffStr + "\n") gffRFile.write(reverseRefGff(gffStr) + "\n") gffFile.flush() gffRFile.flush() except DataException, err : print "Skipping %s because: %s" % (contigID, err) print continue if __name__ == "__main__": sys.exit(main())