#!/usr/bin/env python
import sys
import os
import re


# Fields of each row ('x' indicates a field we're not concerned with):
# read_id, x, rname, pos, mapq/score, x, seq, filename
# where rname and pos taken together constitute the "mapping".

# Created the file by merging two runs (filename will differ).  read_id should
# be unique in each of the input files, so we'll be selecting "groups" of two
# rows, each with the same read_id but from a different run and seeing if the
# mapping differs.


# Run as "findcases.py file scorePrefix" 
#
# Each row ends with the filename of the file that the row came from before the merge
# We want to use the score from the rows that come from the run w/ fewer indicies
# so we specify that filename (scorePrefix) on the command line.  It must match
# the last column exactly.


def printTheGroup(theGroup):
    for line in theGroup:
        print line
    print 

def gotMatch(cigar):
    return cigar == "100M" 
    
totalReads = 0
totalDifferences = 0
score1Count = 0
score2Count = 0
score3Count = 0
score4Count = 0
score5Count = 0

f1 = open(sys.argv[1] + "_1.score", "w");
f2 = open(sys.argv[1] + "_2.score", "w");
f3 = open(sys.argv[1] + "_3.score", "w");
f4 = open(sys.argv[1] + "_4.score", "w");
f5 = open(sys.argv[1] + "_5.score", "w");

def score(theRows):
    global score1Count, score2Count, score3Count, score4Count, score5Count
    global f1, f2, f3, f4, f5
    global scorePrefix
    if (theRows[0][7] == scorePrefix):
        score = int(theRows[0][4])
    elif (theRows[1][7] == scorePrefix):
        score = int(theRows[1][4])
    else:
        print >> sys.stderr,  "neither row matches ", scorePrefix, ":"
        for r in theRows:
            print r
            return
    if (score < 51):
        score1Count = score1Count + 1
        handle = f1
    elif (score < 101): 
        score2Count = score2Count + 1
        handle = f2
    elif (score < 151): 
        score3Count = score3Count + 1
        handle = f3
    elif (score < 201): 
        score4Count = score4Count + 1
        handle = f4
    else:
        score5Count = score5Count+ 1
        handle = f5
    handle.write("\t".join(theRows[0]) + "\n")
    handle.write("\t".join(theRows[1]) + "\n")

def processGroup(theGroup):
    global totalReads, totalDifferences
    if len(theGroup) == 1:
        print >> sys.stderr, "Unique read_id ", "".join(theGroup[0])
        return
    elif len(theGroup) == 2:
        line1 = theGroup[0]
        line2 = theGroup[1]
        if line1[7] == line2[7]:
            print >> sys.stderr, "Same read id occurs more than once in file, skipping these:"
            print >> sys.stderr, line1
            print >> sys.stderr, line2
            return
        totalReads = totalReads + 1
        # If the mapping differs, inc our counter and group the rows based on their score 
        if (line1[2] != line2[2] or line1[3] != line2[3]):
            totalDifferences = totalDifferences + 1
            score(theGroup)
    else :
        print >> sys.stderr, "Too many rows with same read_id, excluding these: "
        for g in theGroup:
            print >> sys.stderr, g

f = open(sys.argv[1])
scorePrefix = sys.argv[2]

group = []
prevRow = -1 

for line in f:
    row = line.split()
    if (len(row) < 4):
        print >> sys.stderr, "row is too short", row
        continue

    # If current row doesn't belong in current group based on match of readid,
    # i.e. the first column
    if (len(group) and (row[0] != group[prevRow][0] )):
        processGroup(group)
        prevRow = -1 
        group = []
    group.append(row)
    prevRow = prevRow + 1

if (len(group)):
    processGroup(group)

f.close()
f1.close()
f2.close()
f3.close()
f4.close()
f5.close()

print "Total Differences = %d" %  totalDifferences
print "Total Read Ids = %d" %  totalReads
print "Percent Differences = %f" %  ((float(totalDifferences) / float(totalReads)) * 100.0)
print
print "Number of pairs with different mappings, by score:"
print "0-50 : %d" % score1Count
print "51-100: %d" % score2Count
print "101-150 : %d" % score3Count
print "151-201 : %d" % score4Count
print "201-255 : %d" % score5Count