#!/usr/bin/env python """ This program counts the errors at each position of each read. It distinguishes between insertions, deletions and bp mismatches and keeps separate counts for 1st and 2nd reads. The output is a tab delimited table: columns are the positions 1 - readLen. There are 8 rows: R1_A, R1_M, R1_D, R1_I, R2_A, ...", where R1_A gives the percent of (All) errors at each position for all the first reads, R1_M, gives the percent of bp mismatch errors at each position for all the first reads, etc. Where percent is percent of total reads with a given type of error. If the -r option is specified and the R program is in your path (try /opt/R/bin), a jpg graph of the output is also produced. Arguments: -o Required. Name of output file. With -r option .jpg is also created. -f [1|2] Optional. Indicates which MD format is used in the input data. Default is 1 which is the format that uses ^ to mark insertions. Format 2 is the format described in the SAM reference and uses ^ to mark deletions. -r Optional. Create the usual tab delimited output and then run R to produce a .jpg graph of the results. The .sam file must be piped to this program. Temporary files (containing generated awk and R scripts) are created in the current directory and removed on exit but may be left behind if an error occurs. Each input file is split into a first read file having suffix .R1 and and 2nd read file having suffix .R2, in the current directory. These are also removed on exit. This version handles .sam files were the read length varies from line to line. """ import sys import os import re import string import subprocess import tempfile import getopt initialized = 0 maxReadLen = 0 maxPossibleReadLen = 1000 # debug = True debug = False # debug2 = True debug2 = False outFile = None parseMDFn = None def parseCigar(theCigar): """ Cigar gives the position (where position is between 1 and readLen) of each insertion and deletion, but it considers both matches of basepairs and mismatches the same (i.e. both are counted as "M"s). We parse the cigar and return an insertions array, deletions array and softclips array. Insertions array has tuples giving the position and length of each insertion. Deletions array gives position of each deletion. Softclips array: 1st element is number of bps clipped from start, 2nd element is number clipped from end. """ readLen = 0 deletions = [] insertions = [] softclips = [0,0] # split cigar into tokens and remove any empty strings for example, # splits "1M1I2M1D4MI1I65M" as # ['1', 'M', '1', 'I', '2', 'M', '1', 'D', '4', 'M', '1', 'I', '65', 'M' ] tokens = re.split(r'(\D)', theCigar) tokens = filter(lambda x: len(x) > 0, tokens) startpos = 0 t = 0 while (t < (len(tokens) - 1)): num = int(tokens[t]) token = tokens[t + 1] t += 2 if (token == "M"): startpos += num elif (token == "S"): if (t == 2): softclips[0] = num else: softclips[1] = num startpos += num elif (token == "D"): deletions.append(startpos + 1) elif (token == "I"): insertions.append({'pos': startpos + 1, 'count': num, 'used': False}) startpos += num else: raise Exception, "Cigar Parse Error: %s" % theCigar readLen = startpos return readLen, insertions, deletions, softclips def adjustForInserts(theInsertions, thePos): """ When parsing the MD to find position of mismatched basepairs, we need to see if we've moved over a place where an insert occurred and increment our position accordingly. We start at the end of the insertion array (the hightest insertion position) and move back until we find an insertion position that is earlier than our current position AND that hasn't been accounted for yet. """ retval = thePos for i in range(len(theInsertions)-1, -1, -1): insert = theInsertions[i] if (thePos >= insert['pos']): if (not insert['used']): retval += insert['count'] insert['used'] = True else: break if (retval > thePos): retval = adjustForInserts(theInsertions, retval) return retval def adjustForDeletions(thePos, theToken, theDeletions): """ thePos is our current position as we parse the MD, theToken is a sequence of digits Second form of the command is if you want to postprocess an earlier run representing the number of matching bps, but we don't know how to parse it, since, "531" could mean 5 matches, a delete, 3 matches, a delete, 1 match or 53 matches, a delete, 1 match. Etc. So we try it one digit at a time and see if we land on a deletion, then march thru the rest of the string. We return the new postion, just past the mismatches described by the token. """ if (debug2): print >> outFile, "adjustForDeletions pos=%d, token=%s" % (thePos, theToken) num = 0 # Keep adding longer substrings of theToken to our pos until we either hit a deletion # or use the whole number. for digits in range(1, len(theToken) + 1): num = int(theToken[0:digits]) if (debug2): print >> outFile, "num is %d, newpos would be %d" % (num, thePos + num) if (thePos + num) in theDeletions: if (debug2): print >> outFile, "recursive call set thePos = %d"% thePos return adjustForDeletions(thePos + num, theToken[digits:], theDeletions) elif (digits == len(theToken)): thePos += num if (debug2): print >> outFile, "adjustForDeletions returning %d" % (thePos) return thePos def isInteger(str): try: int(str) return True except ValueError: return False def parseMD1(theMD, readLen, theInsertions, theDeletions, theSoftClips): """ Some programs (including bfast, I think) use an MD format where "^" introduces insertions instead of deletions, although this differs from the MD definition in the SAM reference manual. In this case MD is a series of numbers, representing a number of matching bps, ACGT or N, representing mismatched bps and ^ characters, which represent that one or more bps were inserted into the reference. The ACGT or Ns that follow a ^ are the inserted bps. According to the SAM reference manual the MD always starts with a number. For example, 10A5^AC6 means there are 10 matching bps. Then there's an A in the reference and a different bp in the read. Then there are 5 matching bps. Then there's a 2bp (A and C) insertion, etc. Since deletions aren't marked, any time we see a multi-digit number such as "25", we need to figure out whether it represents 2 matches, followed by one or more deletions, and 5 matches, or just 25 matches. Returns an array of integers representing the position (between 1 and readLen) of each mismatched basepair in the read. theInsertions array is unused. Warning: I haven't tested this method with soft clipping. """ mismatches = [] # convert MD string such as '64T6^TT2' to # ['64', 'T', '6', '^', 'TT', '2' ] theMD = theMD.replace("MD:Z:", "") tokens = re.split(r'([0-9]+|[A-Z]+)', theMD) tokens = filter(lambda x: len(x) > 0, tokens) if (debug2): print >> outFile, tokens startpos = 1 + theSoftClips[0] num = 0 tokenIndex = 0 currentToken = None while (tokenIndex < len(tokens)): currentToken = tokens[tokenIndex]; tokenIndex += 1 if (debug2): print >> outFile, "StartPos=%d, token=%s" % (startpos, currentToken) # Digits represent matching bps if (isInteger(currentToken)): num = int(currentToken) startpos = adjustForDeletions(startpos, currentToken, theDeletions) # ^ introduces insertion elif (currentToken == "^"): if (tokenIndex == len(tokens)): raise Exception, "MD Parse Exception, no more tokens after ^: ", theMD currentToken = tokens[tokenIndex]; tokenIndex += 1 if (not re.match(r'[ACGTNM]+', currentToken)): raise Exception, "MD Parse Exception, unexpected character after ^: ", theMD startpos += len(currentToken) if (debug2): print >> outFile, "StartPos=%d after inserts" % startpos # Apparently there can be a sequence of letters each of which represents a mismatch. elif (re.match(r'[ACGTNM]+', currentToken)): # if (len(currentToken) != 1): # raise Exception, "MD Parse Exception, unexpected multiple bp mismatches in a row: at position %d of %s" % (startpos, theMD) for i in range(0, len(currentToken)): mismatches.append(startpos) startpos += 1 if (debug2): print >> outFile, "StartPos=%d after mismatch" % startpos else: raise Exception, "MD Parse Exception, unexpected character at position %d : ", (startpos, theMD) startpos += theSoftClips[1] if (startpos != (readLen + 1)): raise Exception, "MD Parse Error, after parsing, startpos=%d, should be %d : %s" % \ (startpos, readLen + 1, theMD) return mismatches def parseMD2(theMD, readLen, theInsertions, theDeletions, theSoftClips): """ This method parses MDs that conform to the format described in the SAM reference manual: [0-9]+(([ACGTN]|\^[ACGTN]+)[0-9]+)* In this case, MD is a series of numbers, representing a number of matching bps, ACGT or N, representing a mismatched bp and ^ characters, which represent that one or more bps were deleted from the reference. The ACGT or Ns that follow a ^ are the deleted bps. For example, 10A5^AC6 means there are 10 matching bps. Then there's an A in the reference and a different bp in the read. Then there are 5 matching bps. Then there's a 2bp (A and C) deletion from the reference, etc. Note that consequetive mismatches will be shown with a 0 preceeding the ACGT or N, eg "0A0C", not "AC". The problem with using the MD alone to determine the position of mismatched bps is that it doesn't take insertions into account. For that we need to also use info obtained from the CIGAR and seen here in theInsertions array. Returns an array of integers representing the position (between 1 and readLen) of each mismatched basepair in the read. theDeletions array is unused. """ mismatches = [] # convert string such as '64T6^TT2' to # ['64', 'T', '6', '^', 'TT', '2' ] theMD = theMD.replace("MD:Z:", "") tokens = re.split(r'([0-9]+|[A-Z]+)', theMD) tokens = filter(lambda x: len(x) > 0, tokens) num = 0 startpos = theSoftClips[0] t = 0 while (t < len(tokens)): num = int(tokens[t]) t += 1 if (t == len(tokens)): break str = tokens[t] t += 1 if (re.match(r'[ACGTNM]+', str)): pos = startpos + num + 1 pos = adjustForInserts(theInsertions, pos) mismatches.append(pos) startpos = pos elif (str == "^"): # skip the next token after the ^ tmp = tokens[t] t += 1 if (re.match(r'[ACGTNM]+', tmp) == None): raise Exception, "MD Parse Exception after ^: ", theMD startpos += num pos = startpos + num pos = adjustForInserts(theInsertions, pos) # print "\tadjustForInserts: %s, %d\n" % (theInsertions, pos) # We've processed the whole MD and found the last bp to be at pos. # It's possible however, that at the next position of the read, there's an insert. # Let's see where the last insert falls. if (len(theInsertions)): lastInsertPos = theInsertions[len(theInsertions) - 1]['pos'] lastInsertUsed = theInsertions[len(theInsertions) - 1]['used'] lastInsertCount = theInsertions[len(theInsertions) - 1]['count'] if ((pos + 1 == lastInsertPos) and not lastInsertUsed): pos = pos + lastInsertCount pos += theSoftClips[1] if (pos != readLen): raise Exception, "MD Parse Error, after parsing, pos=%d, should be %d : %s" % \ (pos, readLen, theMD) return mismatches def printRow(theTitle, theTotal, theRow): global outFile, maxReadLen outFile.write("%s\t" % theTitle) for i in range(0, maxReadLen): outFile.write("%02.4f\t" % ((float(theRow[i]) /theTotal[i]) * 100)) outFile.write("\n") def isReversed(flag): return int(flag) & 0x0010 def reversePosition(pos, readLen): return (readLen + 1) - pos def reversePositionIfNeeded(flag, pos, readLen): if isReversed(flag): return reversePosition(pos, readLen) return pos def perPositionReadCoverage(totalReads, readLen): for i in range(0, readLen): totalReads[i] += 1 def tally(theTitle, theFile): global maxReadLen, outFile readLen = 0 # This keeps a count of the total number or reads that covers the position totalReads = [ 0 for i in range(maxPossibleReadLen) ] # Counts of number of insertion, deletion, modification, and all errors at each position totalI = [ 0 for i in range(maxPossibleReadLen) ] totalD = [ 0 for i in range(maxPossibleReadLen) ] totalM = [ 0 for i in range(maxPossibleReadLen) ] totalErrors = [ 0 for i in range(maxPossibleReadLen) ] if debug: print >> outFile, "Data for %s Reads" % theTitle inputFile = open(theFile) for line in inputFile: row = line.split() # print row readID = row[0] flag = row[1] cigar = row[2] MD = row[3] if (debug2): if isReversed(flag): print >> outFile, "READ: %s\t%s\t%s\tREVERSED" % (readID, cigar, MD) else: print >> outFile, "READ: %s\t%s\t%s" % (readID, cigar, MD) readLen, insertions, deletions, softclips = parseCigar(cigar) perPositionReadCoverage(totalReads, readLen) if readLen > maxReadLen: maxReadLen = readLen for d in deletions: d = reversePositionIfNeeded(flag, d, readLen) if debug: print >> outFile, "D\t%d\t%s\t%s\t%s" % (d, readID, cigar, MD) totalD[d - 1] += 1 totalErrors[d - 1] += 1 for i in insertions: ipos = reversePositionIfNeeded(flag, i['pos'], readLen) if debug: print >> outFile, "I\t%d\t%s\t%s\t%s" % (ipos, readID, cigar, MD) totalI[ipos - 1] += 1 totalErrors[ipos - 1] += 1 mismatches = parseMDFn(MD, readLen, insertions, deletions, softclips) for m in mismatches: m = reversePositionIfNeeded(flag, m, readLen) if debug: print >> outFile, "M\t%d\t%s\t%s\t%s" % (m, readID, cigar, MD) totalM[m - 1] += 1 totalErrors[m - 1] += 1 inputFile.close() if debug: print >> outFile, "TotalReads=%d" % totalReads return totalReads, totalErrors, totalM, totalD, totalI def findMD(fields): """ Return index of field that starts with "MD:" """ for i in range(len(fields)): if re.match("^MD:", fields[i]): return i return -1 def splitReads(outputFile): """ Splits the input into a pair of files, one with only 'first' reads, the other with 'second' reads. The filenames are the basename of the outputFile with either .R1 or .R2 appended. Throws out the columns we don't care about so that the resulting files contain just the columns: read id, cigar, MD. Returns the r1 and r2 filenames. """ r1file = os.path.basename(os.path.normpath(outputFile)) + ".R1" r2file = os.path.basename(os.path.normpath(outputFile)) + ".R2" ifd = sys.stdin ofd1 = open(r1file, "w") ofd2 = open(r2file, "w") while True: line = ifd.readline() if not line: break line = line.strip() if not len(line): break if re.match("^@", line): continue fields = line.split() # Pick output file depending on whether 1st or 2nd read. if int(fields[1]) <= 124: ofd = ofd1 elif int(fields[1]) >= 128: ofd = ofd2 else: print >> sys.stderr, "skipping %s\t%s\t%s because this is neither 1st nor 2nd read.\n" % (fields[0], fields[1], fields[5]) continue if fields[5] == "*" or not re.match("^[0-9MIDS]+$", fields[5]): print >> sys.stderr, "skipping %s\t%s\n" % (fields[0], fields[5]) else: i = findMD(fields) if i == -1: print >> sys.stderr, "skipping %s\t%s because MD field is missing.\n" % (fields[0], fields[5]) else: ofd.write("%s %s %s %s\n" % (fields[0], fields[1], fields[5], fields[i])) ofd1.close() ofd2.close() ofd.close() # not sure if this is necessary, refers to same file as ofd1 or ofd2 return (r1file, r2file) # numberOfPlots must be either 2 or 4 def invokeR(theFile, numberOfPlots): global maxReadLen script = """ infile <- commandArgs(TRUE)[1] outfile <- paste(infile, ".jpg", sep="") data <- read.table(infile, header=T) jpeg(outfile, width = 1000, height = 1000) plot.new() split.screen(c(%d,2)) for (i in 1:%d) { screen(i) #go through each figure and plot something plot(1:%d,data[i,], type = "l", main = rownames(data[i,]), ylab="Error Rate", xlab="Base (1:%d)") } close.screen(all=TRUE) dev.off() """ % (numberOfPlots / 2, numberOfPlots, maxReadLen, maxReadLen) tmp = tempfile.mkstemp(dir=os.getcwd()) tmpfile = os.fdopen(tmp[0], "w") tmpfile.write(script) tmpfile.close() cmd = "R --vanilla --args %s < %s" % (theFile, tmp[1]) p = subprocess.Popen(cmd, shell=True) p.wait() os.remove(tmp[1]) def main(argv=None): """ Usage: """ global outFile, parseMDFn if argv is None: argv = sys.argv runR = False outFileName = None mdFormat = 0 parseMDFn = parseMD1 options, remainder = getopt.getopt(argv[1:], "ro:f:") for opt, arg in options: if opt in ("-r"): runR = True elif opt in ("-o"): outFileName = arg elif opt in ("-f"): mdFormat = int(arg) if (mdFormat == 1): parseMDFn = parseMD1 elif (mdFormat == 2): parseMDFn = parseMD2 if outFileName == None: print >> sys.stderr, "No output filename specified." print >> sys.stderr, __doc__ return outFile = open(outFileName, "w") # Split each input file "f" into "f.R1" and "f.R2", so that .R1 files have only first # reads and .R2 files have only 2nd reads. We'll tally all the first reads together # and all the 2nd reads together, but keep them separate from each other. r1file, r2file = splitReads(outFileName) r1Title = "R1_" r2Title = "R2_" totalReads1, totalErrors1, totalM1, totalD1, totalI1 = tally(r1Title, r1file) totalReads2, totalErrors2, totalM2, totalD2, totalI2 = tally(r2Title, r2file) header = [str(i + 1) for i in range(maxReadLen) ] # this is list containing "1", "2", ... "maxeadLen" print >> outFile, "\t%s" % ("\t".join(header)) numberOfPlots = 0 if (totalReads1[0] > 0): printRow("%sA" % r1Title, totalReads1, totalErrors1) printRow("%sM" % r1Title, totalReads1, totalM1) printRow("%sD" % r1Title, totalReads1, totalD1) printRow("%sI" % r1Title, totalReads1, totalI1) numberOfPlots += 4 if (totalReads2[0]): printRow("%sA" % r2Title, totalReads2, totalErrors2) printRow("%sM" % r2Title, totalReads2, totalM2) printRow("%sD" % r2Title, totalReads2, totalD2) printRow("%sI" % r2Title, totalReads2, totalI2) numberOfPlots += 4 outFile.close() os.remove(r1file) os.remove(r2file) if runR: invokeR(outFileName, numberOfPlots) if __name__ == "__main__": sys.exit(main())