#!/usr/bin/env python import sys import os import re # Fields of each row ('x' indicates a field we're not concerned with): # read_id, x, rname, pos, mapq/score, x, seq, filename # where rname and pos taken together constitute the "mapping". # Created the file by merging two runs (filename will differ). read_id should # be unique in each of the input files, so we'll be selecting "groups" of two # rows, each with the same read_id but from a different run and seeing if the # mapping differs. # Run as "findcases.py file scorePrefix" # # Each row ends with the filename of the file that the row came from before the merge # We want to use the score from the rows that come from the run w/ fewer indicies # so we specify that filename (scorePrefix) on the command line. It must match # the last column exactly. def printTheGroup(theGroup): for line in theGroup: print line print def gotMatch(cigar): return cigar == "100M" totalReads = 0 totalDifferences = 0 score1Count = 0 score2Count = 0 score3Count = 0 score4Count = 0 score5Count = 0 f1 = open(sys.argv[1] + "_1.score", "w"); f2 = open(sys.argv[1] + "_2.score", "w"); f3 = open(sys.argv[1] + "_3.score", "w"); f4 = open(sys.argv[1] + "_4.score", "w"); f5 = open(sys.argv[1] + "_5.score", "w"); def score(theRows): global score1Count, score2Count, score3Count, score4Count, score5Count global f1, f2, f3, f4, f5 global scorePrefix if (theRows[0][7] == scorePrefix): score = int(theRows[0][4]) elif (theRows[1][7] == scorePrefix): score = int(theRows[1][4]) else: print >> sys.stderr, "neither row matches ", scorePrefix, ":" for r in theRows: print r return if (score < 51): score1Count = score1Count + 1 handle = f1 elif (score < 101): score2Count = score2Count + 1 handle = f2 elif (score < 151): score3Count = score3Count + 1 handle = f3 elif (score < 201): score4Count = score4Count + 1 handle = f4 else: score5Count = score5Count+ 1 handle = f5 handle.write("\t".join(theRows[0]) + "\n") handle.write("\t".join(theRows[1]) + "\n") def processGroup(theGroup): global totalReads, totalDifferences if len(theGroup) == 1: print >> sys.stderr, "Unique read_id ", "".join(theGroup[0]) return elif len(theGroup) == 2: line1 = theGroup[0] line2 = theGroup[1] if line1[7] == line2[7]: print >> sys.stderr, "Same read id occurs more than once in file, skipping these:" print >> sys.stderr, line1 print >> sys.stderr, line2 return totalReads = totalReads + 1 # If the mapping differs, inc our counter and group the rows based on their score if (line1[2] != line2[2] or line1[3] != line2[3]): totalDifferences = totalDifferences + 1 score(theGroup) else : print >> sys.stderr, "Too many rows with same read_id, excluding these: " for g in theGroup: print >> sys.stderr, g f = open(sys.argv[1]) scorePrefix = sys.argv[2] group = [] prevRow = -1 for line in f: row = line.split() if (len(row) < 4): print >> sys.stderr, "row is too short", row continue # If current row doesn't belong in current group based on match of readid, # i.e. the first column if (len(group) and (row[0] != group[prevRow][0] )): processGroup(group) prevRow = -1 group = [] group.append(row) prevRow = prevRow + 1 if (len(group)): processGroup(group) f.close() f1.close() f2.close() f3.close() f4.close() f5.close() print "Total Differences = %d" % totalDifferences print "Total Read Ids = %d" % totalReads print "Percent Differences = %f" % ((float(totalDifferences) / float(totalReads)) * 100.0) print print "Number of pairs with different mappings, by score:" print "0-50 : %d" % score1Count print "51-100: %d" % score2Count print "101-150 : %d" % score3Count print "151-201 : %d" % score4Count print "201-255 : %d" % score5Count