#!/usr/bin/env python """ This program counts the errors at each position of each read. It distinguishes between insertions, deletions and bp mismatches and keeps separate counts for 1st and 2nd reads. The output is a tab delimited table: columns are the positions 1 - readLen (which is hardcoded). There are 8 rows: R1_A, R1_M, R1_D, R1_I, R2_A, ...", where R1_A gives the total errors at each position for all the first reads, R1_M, gives the total number of bp mismatch errors at each position for all the first reads, etc. If the -r option is specified and the R program is in your path (try /opt/R/bin), a jpg graph of the output is also produced. Arguments: -o Required. Name of output file. With -r option .jpg is also created. -l Required. -f [1|2] Optional. Indicates which MD format is used in the input data. Default is 1 which is the format that uses ^ to mark insertions. Format 2 is the format described in the SAM reference and uses ^ to mark deletions. -r Optional. Create the usual tab delimited output and then run R to produce a .jpg graph of the results. infiles Required. Names of one or more .sam files. Temporary files (containing generated awk and R scripts) are created in the current directory and removed on exit but may be left behind if an error occurs. Each input file is split into a first read file having suffix .R1 and and 2nd read file having suffix .R2, in the current directory. These are also removed on exit. """ import sys import os import re import string import subprocess import tempfile import getopt readLen = 0 # debug = True debug = False # debug2 = True debug2 = False outFile = None parseMDFn = None def parseCigar(theCigar): """ Cigar gives the position (where position is between 1 and readLen) of each insertion and deletion, but it considers both matches of basepairs and mismatches the same (i.e. both are counted as "M"s). We parse the cigar and return an insertions array, deletions array and softclips array. Insertions array has tuples giving the position and length of each insertion. Deletions array gives position of each deletion. Softclips array: 1st element is number of bps clipped from start, 2nd element is number clipped from end. """ global readLen deletions = [] insertions = [] softclips = [0,0] # split cigar into tokens and remove any empty strings for example, # splits "1M1I2M1D4MI1I65M" as # ['1', 'M', '1', 'I', '2', 'M', '1', 'D', '4', 'M', '1', 'I', '65', 'M' ] tokens = re.split(r'(\D)', theCigar) tokens = filter(lambda x: len(x) > 0, tokens) startpos = 0 t = 0 while (t < (len(tokens) - 1)): num = int(tokens[t]) token = tokens[t + 1] t += 2 if (token == "M"): startpos += num elif (token == "S"): if (t == 2): softclips[0] = num else: softclips[1] = num startpos += num elif (token == "D"): deletions.append(startpos + 1) elif (token == "I"): insertions.append({'pos': startpos + 1, 'count': num, 'used': False}) startpos += num else: raise Exception, "Cigar Parse Error: %s" % theCigar if (startpos != readLen): raise Exception, "Cigar Parse Error, after parsing startpos=%d, should be %d : %s" % \ (startpos, readLen, theCigar) return insertions, deletions, softclips def adjustForInserts(theInsertions, thePos): """ When parsing the MD to find position of mismatched basepairs, we need to see if we've moved over a place where an insert occurred and increment our position accordingly. We start at the end of the insertion array (the hightest insertion position) and move back until we find an insertion position that is earlier than our current position AND that hasn't been accounted for yet. """ retval = thePos for i in range(len(theInsertions)-1, -1, -1): insert = theInsertions[i] if (thePos >= insert['pos']): if (not insert['used']): retval += insert['count'] insert['used'] = True else: break if (retval > thePos): retval = adjustForInserts(theInsertions, retval) return retval def adjustForDeletions(thePos, theToken, theDeletions): """ thePos is our current position as we parse the MD, theToken is a sequence of digits representing the number of matching bps, but we don't know how to parse it, since, "531" could mean 5 matches, a delete, 3 matches, a delete, 1 match or 53 matches, a delete, 1 match. Etc. So we try it one digit at a time and see if we land on a deletion, then march thru the rest of the string. We return the new postion, just past the mismatches described by the token. """ if (debug2): print >> outFile, "adjustForDeletions pos=%d, token=%s" % (thePos, theToken) num = 0 # Keep adding longer substrings of theToken to our pos until we either hit a deletion # or use the whole number. for digits in range(1, len(theToken) + 1): num = int(theToken[0:digits]) if (debug2): print >> outFile, "num is %d, newpos would be %d" % (num, thePos + num) if (thePos + num) in theDeletions: if (debug2): print >> outFile, "recursive call set thePos = %d"% thePos return adjustForDeletions(thePos + num, theToken[digits:], theDeletions) elif (digits == len(theToken)): thePos += num if (debug2): print >> outFile, "adjustForDeletions returning %d" % (thePos) return thePos def isInteger(str): try: int(str) return True except ValueError: return False def parseMD1(theMD, theInsertions, theDeletions, theSoftClips): """ Some programs (including bfast, I think) use an MD format where "^" introduces insertions instead of deletions, although this differs from the MD definition in the SAM reference manual. In this case MD is a series of numbers, representing a number of matching bps, ACGT or N, representing mismatched bps and ^ characters, which represent that one or more bps were inserted into the reference. The ACGT or Ns that follow a ^ are the inserted bps. According to the SAM reference manual the MD always starts with a number. For example, 10A5^AC6 means there are 10 matching bps. Then there's an A in the reference and a different bp in the read. Then there are 5 matching bps. Then there's a 2bp (A and C) insertion, etc. Since deletions aren't marked, any time we see a multi-digit number such as "25", we need to figure out whether it represents 2 matches, followed by one or more deletions, and 5 matches, or just 25 matches. Returns an array of integers representing the position (between 1 and readLen) of each mismatched basepair in the read. theInsertions array is unused. Warning: I haven't tested this method with soft clipping. """ global readLen mismatches = [] # convert MD string such as '64T6^TT2' to # ['64', 'T', '6', '^', 'TT', '2' ] theMD = theMD.replace("MD:Z:", "") tokens = re.split(r'([0-9]+|[A-Z]+)', theMD) tokens = filter(lambda x: len(x) > 0, tokens) if (debug2): print >> outFile, tokens startpos = 1 + theSoftClips[0] num = 0 tokenIndex = 0 currentToken = None while (tokenIndex < len(tokens)): currentToken = tokens[tokenIndex]; tokenIndex += 1 if (debug2): print >> outFile, "StartPos=%d, token=%s" % (startpos, currentToken) # Digits represent matching bps if (isInteger(currentToken)): num = int(currentToken) startpos = adjustForDeletions(startpos, currentToken, theDeletions) # ^ introduces insertion elif (currentToken == "^"): if (tokenIndex == len(tokens)): raise Exception, "MD Parse Exception, no more tokens after ^: ", theMD currentToken = tokens[tokenIndex]; tokenIndex += 1 if (not re.match(r'[ACGTNM]+', currentToken)): raise Exception, "MD Parse Exception, unexpected character after ^: ", theMD startpos += len(currentToken) if (debug2): print >> outFile, "StartPos=%d after inserts" % startpos # Apparently there can be a sequence of letters each of which represents a mismatch. elif (re.match(r'[ACGTNM]+', currentToken)): # if (len(currentToken) != 1): # raise Exception, "MD Parse Exception, unexpected multiple bp mismatches in a row: at position %d of %s" % (startpos, theMD) for i in range(0, len(currentToken)): mismatches.append(startpos) startpos += 1 if (debug2): print >> outFile, "StartPos=%d after mismatch" % startpos else: raise Exception, "MD Parse Exception, unexpected character at position %d : ", (startpos, theMD) startpos += theSoftClips[1] if (startpos != (readLen + 1)): raise Exception, "MD Parse Error, after parsing, startpos=%d, should be %d : %s" % \ (startpos, readLen + 1, theMD) return mismatches def parseMD2(theMD, theInsertions, theDeletions, theSoftClips): """ This method parses MDs that conform to the format described in the SAM reference manual. In this case, MD is a series of numbers, representing a number of matching bps, ACGT or N, representing mismatched bps and ^ characters, which represent that one or more bps were deleted from the reference. The ACGT or Ns that follow a ^ are the deleted bps. For example, 10A5^AC6 means there are 10 matching bps. Then there's an A in the reference and a different bp in the read. Then there are 5 matching bps. Then there's a 2bp (A and C) deletion from the reference, etc. The problem with using the MD alone to determine the position of mismatched bps is that it doesn't take insertions into account. For that we need to also use info obtained from the CIGAR and seen here in theInsertions array. Returns an array of integers representing the position (between 1 and readLen) of each mismatched basepair in the read. theDeletions array is unused. """ global readLen mismatches = [] # convert string such as '64T6^TT2' to # ['64', 'T', '6', '^', 'TT', '2' ] theMD = theMD.replace("MD:Z:", "") tokens = re.split(r'([0-9]+|[A-Z]+)', theMD) tokens = filter(lambda x: len(x) > 0, tokens) num = 0 startpos = theSoftClips[0] t = 0 while (t < len(tokens)): num = int(tokens[t]) t += 1 if (t == len(tokens)): break str = tokens[t] t += 1 if (re.match(r'[ACGTNM]+', str)): pos = startpos + num + 1 pos = adjustForInserts(theInsertions, pos) mismatches.append(pos) startpos = pos elif (str == "^"): # skip the next token after the ^ tmp = tokens[t] t += 1 if (re.match(r'[ACGTNM]+', tmp) == None): raise Exception, "MD Parse Exception after ^: ", theMD startpos += num pos = startpos + num pos = adjustForInserts(theInsertions, pos) pos += theSoftClips[1] if (pos != readLen): raise Exception, "MD Parse Error, after parsing, pos=%d, should be %d : %s" % \ (pos, readLen, theMD) return mismatches def printRow(theTitle, theTotal, theRow): global outFile outFile.write("%s\t" % theTitle) for i in range(0, readLen): outFile.write("%02.4f\t" % ((float(theRow[i]) /theTotal) * 100)) outFile.write("\n") def isReversed(flag): return int(flag) & 0x0010 def reversePosition(pos): global readLen return (readLen + 1) - pos def reversePositionIfNeeded(flag, pos): if isReversed(flag): return reversePosition(pos) return pos def tally(theTitle, theFiles): global readLen, outFile totalReads = 0 totalErrors = [ 0 for i in range(readLen) ] totalI = [ 0 for i in range(readLen) ] totalD = [ 0 for i in range(readLen) ] totalM = [ 0 for i in range(readLen) ] if debug: print >> outFile, "Data for %s Reads" % theTitle for file in theFiles: inputFile = open(file) for line in inputFile: printedSomething = False totalReads += 1 row = line.split() readID = row[0] flag = row[1] cigar = row[2] MD = row[3] if (debug2): if isReversed(flag): print >> outFile, "READ: %s\t%s\t%s\tREVERSED" % (readID, cigar, MD) else: print >> outFile, "READ: %s\t%s\t%s" % (readID, cigar, MD) insertions, deletions, softclips = parseCigar(cigar) for d in deletions: d = reversePositionIfNeeded(flag, d) if debug: print >> outFile, "D\t%d\t%s\t%s\t%s" % (d, readID, cigar, MD) printedSomething = True totalD[d - 1] += 1 totalErrors[d - 1] += 1 for i in insertions: ipos = reversePositionIfNeeded(flag, i['pos']) if debug: print >> outFile, "I\t%d\t%s\t%s\t%s" % (ipos, readID, cigar, MD) printedSomething = True totalI[ipos - 1] += 1 totalErrors[ipos - 1] += 1 mismatches = parseMDFn(MD, insertions, deletions, softclips) for m in mismatches: m = reversePositionIfNeeded(flag, m) if debug: print >> outFile, "M\t%d\t%s\t%s\t%s" % (m, readID, cigar, MD) printedSomething = True totalM[m - 1] += 1 totalErrors[m - 1] += 1 if printedSomething: print >> outFile inputFile.close() if debug: print >> outFile, "TotalReads=%d" % totalReads if totalReads > 0: printRow("%sA" % theTitle, totalReads, totalErrors) printRow("%sM" % theTitle, totalReads, totalM) printRow("%sD" % theTitle, totalReads, totalD) printRow("%sI" % theTitle, totalReads, totalI) def findMD(fields): """ Return index of field that starts with "MD:" """ for i in range(len(fields)): if re.match("^MD:", fields[i]): return i return -1 def splitReads(theFiles): """ Splits each input file into a pair of files, one with only 'first' reads, the other with 'second' reads. The filenames are the basename of the input file with either .R1 or .R2 appended. Throws out the columns we don't care about so that the resulting files contain just the columns: read id, cigar, MD. Returns a list of the r1 fileanmes and a list of the r2 filenames, as a (2 element) list of lists of filenames. """ r1files = [] r2files = [] for ifile in theFiles: f1 = os.path.basename(os.path.normpath(ifile)) + ".R1" f2 = os.path.basename(os.path.normpath(ifile)) + ".R2" r1files.append(f1) r2files.append(f2) ifd = open(ifile, "r") ofd1 = open(f1, "w") ofd2 = open(f2, "w") while True: line = ifd.readline() if not line: break line = line.strip() if not len(line): break if re.match("^@", line): continue fields = line.split() # Pick output file depending on whether 1st or 2nd read. if int(fields[1]) <= 124: ofd = ofd1 elif int(fields[1]) >= 128: ofd = ofd2 else: print >> sys.stderr, "skipping %s\t%s\t%s because this is neither 1st nor 2nd read.\n" % (fields[0], fields[1], fields[5]) continue if fields[5] == "*" or not re.match("^[0-9MIDS]+$", fields[5]): print >> sys.stderr, "skipping %s\t%s\n" % (fields[0], fields[5]) else: i = findMD(fields) if i == -1: print >> sys.stderr, "skipping %s\t%s because MD field is missing.\n" % (fields[0], fields[5]) else: ofd.write("%s %s %s %s\n" % (fields[0], fields[1], fields[5], fields[i])) ifd.close() ofd1.close() ofd2.close() ofd.close() # not sure if this is necessary, refers to same file as ofd1 or ofd2 return (r1files, r2files) def invokeR(theFile): script = """ infile <- commandArgs(TRUE)[1] outfile <- paste(infile, ".jpg", sep="") data <- read.table(infile, header=T) jpeg(outfile, width = 1000, height = 1000) plot.new() split.screen(c(4,2)) for (i in 1:8) { screen(i) #go through each figure and plot something plot(1:%d,data[i,], type = "l", main = rownames(data[i,]), ylab="Error Rate", xlab="Base (1:%d)") } close.screen(all=TRUE) dev.off() """ % (readLen, readLen) tmp = tempfile.mkstemp(dir=os.getcwd()) tmpfile = os.fdopen(tmp[0], "w") tmpfile.write(script) tmpfile.close() cmd = "R --vanilla --args %s < %s" % (theFile, tmp[1]) p = subprocess.Popen(cmd, shell=True) p.wait() os.remove(tmp[1]) def main(argv=None): """ Usage: """ global outFile, readLen, parseMDFn if argv is None: argv = sys.argv runR = False outFileName = None mdFormat = 0 parseMDFn = parseMD1 options, remainder = getopt.getopt(argv[1:], "ro:l:f:") for opt, arg in options: if opt in ("-r"): runR = True elif opt in ("-o"): outFileName = arg elif opt in ("-l"): readLen = int(arg) elif opt in ("-f"): mdFormat = int(arg) if (mdFormat == 1): parseMDFn = parseMD1 elif (mdFormat == 2): parseMDFn = parseMD2 inputFiles = remainder if len(inputFiles) == 0: print >> sys.stderr, "No inputFiles specified." print >> sys.stderr, __doc__ return if outFileName == None: print >> sys.stderr, "No output filename specified." print >> sys.stderr, __doc__ return if readLen == 0: print >> sys.stderr, "Read len not specified." print >> sys.stderr, __doc__ return outFile = open(outFileName, "w") # Split each input file "f" into "f.R1" and "f.R2", so that .R1 files have only first # reads and .R2 files have only 2nd reads. We'll tally all the first reads together # and all the 2nd reads together, but keep them separate from each other. r1files, r2files = splitReads(inputFiles) header = [str(i + 1) for i in range(readLen) ] # this is list containing "1", "2", ... "readLen" print >> outFile, "\t%s" % ("\t".join(header)) tally("R1_", r1files) tally("R2_", r2files) outFile.close() for f in r1files: os.remove(f) for f in r2files: os.remove(f) if runR: invokeR(outFileName) if __name__ == "__main__": sys.exit(main())