#!/usr/bin/env python
"""
    Converts atac file to gff.  Usage:
        atacToGff -t flag <atac_filename> 

        flag is "u" or "r" to process either the type u or type r rows of the atac.uids file

        Results are written to <assembly1>.vs.<assembly2>.atac.gff in the current directory.

"""

""" 
    Normally we run atac with assembly1 = contig and assembly2 = reference genome
    so variables in this program are named accordingly.

    From Andrew's email:

    As for the gff, I think all the information you need is in the atac
    output. However, if I remember correctly, there is some redundancy in
    the output? For example, if I grep "14435260" in the provided file I get
    2 redundant lines:

    M u H839 r647 517942 0 162 1 22 14435260 162 -1
    M r r647 . 517942 0 162 1 22 14435260 162 -1

    I can't remember why ATAC has this double output, but it seems there are
    more lines with "u" in the 2nd column, rather than "r" in the 2nd
    column. So perhaps we should focus on those lines? If you know more
    about what is going on there, please let me know.

    Anyways, here is the gff format as I see it for this output:

    Chr     source  type    start   end     score   strand  phase
    attributes

    Chr is the chromosome of the hit. Can get this from the 9th column

    Source would be the sample name. If this isn't know, I think we can get
    it from the atac output. Line 20 contains:
    /assemblyId1=GIST.s_78.K47con
    Which gives the sample name as "GIST.s_78.K47con"

    Type could give the name of the 2nd assembly that is being used to align
    the contigs. For most this will be ncbi reference (eg: ncbi36.fasta),
    but in this case it is chr22.fasta. You can also get this information
    from the header in the atac result.

    Start and end are the coordinates. Note that in ATAC, these are in ZERO
    based format, but in GFF we want ONE based. So we need to do a quick
    conversion at this step: Start is the value in column 10 + 1. End is
    column 10 + column 11.

    Score could be the length of the sequence hit. This is given in column
    11.

    Strand is the direction of the alignment. If column 12 is 1, give +; if
    column 12 is -1, give -

    Phase. Perhaps we can give the contig name/number here. This is given in
    column 5.

    Attributes. We can use the field to give the bps hit in the contig given
    in Phase. Again, we need to convert these from zero based to one based.
    For the start, we would give column 6 + 1. End we would give column 6 +
    column 7.

    Example:
    M u H20 r13 460852 40 102 1 22 30276483 102 1
    M u H22 r14 460978 0 142 1 22 21730610 142 1
    M u H23 r15 461022 0 139 1 22 29830489 139 -1
    M u H24 r16 461214 0 142 1 22 14663990 142 1
    M u H25 r17 461462 12 115 1 22 22370145 115 1
    M u H26 r18 461688 1 141 1 22 15849050 141 1

    GFF:
    22      GIST.s_78.K47con        chr22.fasta     30276484        30276585
    102     +       contig_460852   41-142
    22      GIST.s_78.K47con        chr22.fasta     21730611        21730752
    142     +       contig_460978   1-142
    22      GIST.s_78.K47con        chr22.fasta     29830490        29830628
    139     -       contig_461022   1-139
    22      GIST.s_78.K47con        chr22.fasta     14663991        14664132
    142     +       contig_461214   1-142
    22      GIST.s_78.K47con        chr22.fasta     22370146        22370260
    115     +       contig_461462   13-127
    22      GIST.s_78.K47con        chr22.fasta     15849051        15849191
    141     +       contig_461688   2-142

"""

import sys
import os
import re
import string
import subprocess
import tempfile
import getopt

class DataException(Exception) : 
    def __init__(self, msg):
        self.message = msg
    def __str__(self):
        return self.message

class AtacRow:
    def __init__(   self, row):
        fields = row.split()
        self.row = row
        self.rowClass = fields[0]
        self.matchType = fields[1]
        self.matchIndex = fields[2]
        self.parentIndex = fields[3]
        self.contigID = fields[4]
        self.cMatchStart = int(fields[5])
        self.cMatchLen = int(fields[6])
        self.cDir = int(fields[7])
        self.refID = fields[8]
        self.rMatchStart = int(fields[9])
        self.rMatchLen = int(fields[10])
        self.rDir = int(fields[11])

        self.cMatchEnd = self.cMatchStart + self.cMatchLen - 1
        self.rMatchEnd = self.rMatchStart + self.rMatchLen - 1

    def __str__(self):
        return self.row

def readAtacHeader(file):
    """
        Seek to beginning of file, skip lines that don't start with "/".
        Create dictionary of name/val pairs from lines starting with "/". 
        Return dictionary and leave file positioned to read first data row.
    """
    retval = {}
    startOfLine = 0
    file.seek(0)
    while True:
        startOfLine = file.tell()
        line = file.readline()
        if not line:
            break
        line = line.strip()
        if line[0] != "/":
            if len(retval) == 0:
                # Skip the comments at head of file
                continue
            else:
                # Done with header, put the line back so we're positioned at start of first data row.
                file.seek(startOfLine)
                break
        # Store the header rows
        pair = line.split("=")
        key = pair[0][1:]
        value = pair[1] 
        retval[key] = value
    return retval

def doSort(filename):
    cmd = "sort -o %s -k 3,3 -k 1,1 -k 4,4n -k 5,5n %s" % (filename, filename)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    outerr = p.communicate()
    if (p.returncode != 0):
       raise SystemError("Error running %s. retval=%d, stderr='%s'" % (cmd, p.returncode, outerr[1])) 

def main(argv=None):
    if argv is None:
        argv = sys.argv
    tflag = "u"
    infile = None
    outfile = None
    outfileName =  None 

    options, remainder = getopt.getopt(argv[1:], "ht:12")
    for opt, arg in options:
        # specify the type of row (2nd column in atac output) that we're mapping
        if opt in ("-t"):
            tflag = arg  
        if opt in ("-h"):
            print __doc__ 
            return 0
    if remainder:
        infile = open(remainder[0], "r")
    if not infile:
        print __doc__
        return 1

    atacFile = infile 
    atacHeader = readAtacHeader(atacFile)   

    outfileName = "%s.vs.%s.atac.gff" % (atacHeader["assemblyId1"], atacHeader["assemblyId2"])
    outfile = open(outfileName, "w")
    print >> outfile, "contig/chr\t.\tassembly\tSTART\tEND\t.\tStrand\t.\tcoordinates_on_aligned_assembly"

    # print atacHeader
    source = atacHeader["assemblyId1"]
    type = atacHeader["assemblyId2"]
    for line in atacFile:
        arow = AtacRow(line.strip())
        if (arow.matchType != tflag):
            continue
        # Print using coordinates of the 2nd assembly (which is usually the reference)
        print >>outfile, "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s:%d-%d" % (arow.refID,
            ".",
            type,
            arow.rMatchStart + 1,
            arow.rMatchEnd + 1,
            ".",
            ("-", "+")[(arow.rDir == 1)],
            ".",
            arow.contigID,
            arow.cMatchStart + 1,
            arow.cMatchEnd + 1)
        # Print using coordinates of the 1st assembly (which is usually the conting/sample)
        print >>outfile, "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s:%d-%d" % (arow.contigID,
            ".",
            source,
            arow.cMatchStart + 1,
            arow.cMatchEnd + 1,
            ".",
            ("-", "+")[(arow.rDir == 1)],
            ".",
            arow.refID,
            arow.rMatchStart + 1,
            arow.rMatchEnd + 1)
    infile.close()
    outfile.close()
    doSort(outfileName)
    
            
if __name__ == "__main__":
    sys.exit(main())