#!/usr/bin/env python """ Converts atac file to gff. Usage: atacToGff -t flag flag is "u" or "r" to process either the type u or type r rows of the atac.uids file Results are written to .vs..atac.gff in the current directory. """ """ Normally we run atac with assembly1 = contig and assembly2 = reference genome so variables in this program are named accordingly. From Andrew's email: As for the gff, I think all the information you need is in the atac output. However, if I remember correctly, there is some redundancy in the output? For example, if I grep "14435260" in the provided file I get 2 redundant lines: M u H839 r647 517942 0 162 1 22 14435260 162 -1 M r r647 . 517942 0 162 1 22 14435260 162 -1 I can't remember why ATAC has this double output, but it seems there are more lines with "u" in the 2nd column, rather than "r" in the 2nd column. So perhaps we should focus on those lines? If you know more about what is going on there, please let me know. Anyways, here is the gff format as I see it for this output: Chr source type start end score strand phase attributes Chr is the chromosome of the hit. Can get this from the 9th column Source would be the sample name. If this isn't know, I think we can get it from the atac output. Line 20 contains: /assemblyId1=GIST.s_78.K47con Which gives the sample name as "GIST.s_78.K47con" Type could give the name of the 2nd assembly that is being used to align the contigs. For most this will be ncbi reference (eg: ncbi36.fasta), but in this case it is chr22.fasta. You can also get this information from the header in the atac result. Start and end are the coordinates. Note that in ATAC, these are in ZERO based format, but in GFF we want ONE based. So we need to do a quick conversion at this step: Start is the value in column 10 + 1. End is column 10 + column 11. Score could be the length of the sequence hit. This is given in column 11. Strand is the direction of the alignment. If column 12 is 1, give +; if column 12 is -1, give - Phase. Perhaps we can give the contig name/number here. This is given in column 5. Attributes. We can use the field to give the bps hit in the contig given in Phase. Again, we need to convert these from zero based to one based. For the start, we would give column 6 + 1. End we would give column 6 + column 7. Example: M u H20 r13 460852 40 102 1 22 30276483 102 1 M u H22 r14 460978 0 142 1 22 21730610 142 1 M u H23 r15 461022 0 139 1 22 29830489 139 -1 M u H24 r16 461214 0 142 1 22 14663990 142 1 M u H25 r17 461462 12 115 1 22 22370145 115 1 M u H26 r18 461688 1 141 1 22 15849050 141 1 GFF: 22 GIST.s_78.K47con chr22.fasta 30276484 30276585 102 + contig_460852 41-142 22 GIST.s_78.K47con chr22.fasta 21730611 21730752 142 + contig_460978 1-142 22 GIST.s_78.K47con chr22.fasta 29830490 29830628 139 - contig_461022 1-139 22 GIST.s_78.K47con chr22.fasta 14663991 14664132 142 + contig_461214 1-142 22 GIST.s_78.K47con chr22.fasta 22370146 22370260 115 + contig_461462 13-127 22 GIST.s_78.K47con chr22.fasta 15849051 15849191 141 + contig_461688 2-142 """ import sys import os import re import string import subprocess import tempfile import getopt class DataException(Exception) : def __init__(self, msg): self.message = msg def __str__(self): return self.message class AtacRow: def __init__( self, row): fields = row.split() self.row = row self.rowClass = fields[0] self.matchType = fields[1] self.matchIndex = fields[2] self.parentIndex = fields[3] self.contigID = fields[4] self.cMatchStart = int(fields[5]) self.cMatchLen = int(fields[6]) self.cDir = int(fields[7]) self.refID = fields[8] self.rMatchStart = int(fields[9]) self.rMatchLen = int(fields[10]) self.rDir = int(fields[11]) self.cMatchEnd = self.cMatchStart + self.cMatchLen - 1 self.rMatchEnd = self.rMatchStart + self.rMatchLen - 1 def __str__(self): return self.row def readAtacHeader(file): """ Seek to beginning of file, skip lines that don't start with "/". Create dictionary of name/val pairs from lines starting with "/". Return dictionary and leave file positioned to read first data row. """ retval = {} startOfLine = 0 file.seek(0) while True: startOfLine = file.tell() line = file.readline() if not line: break line = line.strip() if line[0] != "/": if len(retval) == 0: # Skip the comments at head of file continue else: # Done with header, put the line back so we're positioned at start of first data row. file.seek(startOfLine) break # Store the header rows pair = line.split("=") key = pair[0][1:] value = pair[1] retval[key] = value return retval def doSort(filename): cmd = "sort -o %s -k 3,3 -k 1,1 -k 4,4n -k 5,5n %s" % (filename, filename) p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) outerr = p.communicate() if (p.returncode != 0): raise SystemError("Error running %s. retval=%d, stderr='%s'" % (cmd, p.returncode, outerr[1])) def main(argv=None): if argv is None: argv = sys.argv tflag = "u" infile = None outfile = None outfileName = None options, remainder = getopt.getopt(argv[1:], "ht:12") for opt, arg in options: # specify the type of row (2nd column in atac output) that we're mapping if opt in ("-t"): tflag = arg if opt in ("-h"): print __doc__ return 0 if remainder: infile = open(remainder[0], "r") if not infile: print __doc__ return 1 atacFile = infile atacHeader = readAtacHeader(atacFile) outfileName = "%s.vs.%s.atac.gff" % (atacHeader["assemblyId1"], atacHeader["assemblyId2"]) outfile = open(outfileName, "w") print >> outfile, "contig/chr\t.\tassembly\tSTART\tEND\t.\tStrand\t.\tcoordinates_on_aligned_assembly" # print atacHeader source = atacHeader["assemblyId1"] type = atacHeader["assemblyId2"] for line in atacFile: arow = AtacRow(line.strip()) if (arow.matchType != tflag): continue # Print using coordinates of the 2nd assembly (which is usually the reference) print >>outfile, "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s:%d-%d" % (arow.refID, ".", type, arow.rMatchStart + 1, arow.rMatchEnd + 1, ".", ("-", "+")[(arow.rDir == 1)], ".", arow.contigID, arow.cMatchStart + 1, arow.cMatchEnd + 1) # Print using coordinates of the 1st assembly (which is usually the conting/sample) print >>outfile, "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s:%d-%d" % (arow.contigID, ".", source, arow.cMatchStart + 1, arow.cMatchEnd + 1, ".", ("-", "+")[(arow.rDir == 1)], ".", arow.refID, arow.rMatchStart + 1, arow.rMatchEnd + 1) infile.close() outfile.close() doSort(outfileName) if __name__ == "__main__": sys.exit(main())