#!/usr/bin/env python """ """ import sys import os import re import string import subprocess import tempfile import getopt import fastaExtract from datetime import datetime import time class Alleles: debug = False def __init__(self, gffName=None, flanking=3, linelen=0, refName=None, refFile=None): self.gffName = gffName self.gffFile = None self.flanking = flanking self.linelen = linelen self.close = False if refFile: self.refFile = ref elif refName: if Alleles.debug: fastaExtract.FastaExtract.debug = True self.refFile= fastaExtract.FastaExtract(refName) self.close = True else: raise Exception("Either refName or refFile argument must be passed to Alleles ctor") # skip over the gff file header. if self.gffName: self.gffFile = open(self.gffName, "r") self.gffFile.readline() def close(self): if self.close and self.refFile: self.refFile.close() if self.gffFile: self.gffFile.close() def breakLines(self, str, length=100): """ Splits long lines at specified length. Returns a string that's the same as the input string, with newlines inserted. """ lines = [str[i:i+length] for i in range(0, len(str), length)] return "\n".join(lines) return str def getAlleles(self, gffLine): rowI = rowD = None fields = gffLine.split() chr = fields[0] start = int(fields[3]) end = int(fields[4]) """ We always Use position before inserted or deleted character in the name. When we have an insert, we create the sequence for allele with the insert by getting the inserted bps from the contig. We have that info in the first part of the last field of the .variants.gff line. When its a reverse match, the contig aligns to the reverse complement of the reference, or alternatively you could say that the (forward version) of the reference aligns with the reverse complement of the contig. Since in alleles.fasta we always show the reference in the forward direction, we need to get the reverse complement of the inserted bps. """ if (fields[2].startswith("INS")): name = "%s_%d_INS" % (chr, start) insbps = fields[8].split("/")[0] if (fields[6] == "-"): insbps = fastaExtract.reverse_complement(insbps) data = "%s%s%s" % (self.refFile.getSequence1Base(chr, start - (self.flanking - 1), self.flanking), insbps, self.refFile.getSequence1Base(chr, end, self.flanking)) rowI = ">%s\n%s\n" % (name, self.breakLines(data )) name = "%s_%d_REF" % (chr, start) data = "%s%s" % (self.refFile.getSequence1Base(chr, start - (self.flanking - 1), self.flanking), self.refFile.getSequence1Base(chr, end, self.flanking)) rowD = ">%s\n%s\n" % (name, self.breakLines(data)) elif (fields[2].startswith("DEL")): pos = int(fields[3]) - 1 name = "%s_%d_REF" % (chr, pos) delLen = (end - start) + 1 data = self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking * 2 + delLen) rowI = ">%s\n%s\n" % (name, self.breakLines(data)) name = "%s_%d_DEL" % (chr, pos) data = "%s%s" % (self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking), self.refFile.getSequence1Base(chr, end + 1, self.flanking)) rowD = ">%s\n%s\n" % (name, self.breakLines(data)) elif (fields[2] == "BRI"): pos = int(fields[3]) - 1 name = "%s_%d_BRI" % (chr, pos) insbps = fields[8].split("/")[0] if (fields[6] == "-"): insbps = fastaExtract.reverse_complement(insbps) data = "%s%s%s" % (self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking), insbps, self.refFile.getSequence1Base(chr, end + 1, self.flanking)) rowI = ">%s\n%s\n" % (name, self.breakLines(data)) name = "%s_%d_REF" % (chr, pos) insbps = fields[8].split("/")[1] if (fields[6] == "-"): insbps = fastaExtract.reverse_complement(insbps) data = "%s%s%s" % (self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking), insbps, self.refFile.getSequence1Base(chr, end + 1, self.flanking)) rowD = ">%s\n%s\n" % (name, self.breakLines(data)) elif (fields[2] == "BRD"): pos = int(fields[3]) - 1 name = "%s_%d_REF" % (chr, pos) insbps = fields[8].split("/")[1] if (fields[6] == "-"): insbps = fastaExtract.reverse_complement(insbps) data = "%s%s%s" % (self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking), insbps, self.refFile.getSequence1Base(chr, end + 1, self.flanking)) rowI = ">%s\n%s\n" % (name, self.breakLines(data)) name = "%s_%d_BRD" % (chr, pos) insbps = fields[8].split("/")[0] if (fields[6] == "-"): insbps = fastaExtract.reverse_complement(insbps) data = "%s%s%s" % (self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking), insbps, self.refFile.getSequence1Base(chr, end + 1, self.flanking)) rowD = ">%s\n%s\n" % (name, self.breakLines(data)) elif (fields[2] == "BR0"): pos = int(fields[3]) - 1 name = "%s_%d_BR" % (chr, pos) insbps = fields[8].split("/")[1] if (fields[6] == "-"): insbps = fastaExtract.reverse_complement(insbps) data = "%s%s%s" % (self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking), insbps, self.refFile.getSequence1Base(chr, end + 1, self.flanking)) rowI = ">%s\n%s\n" % (name, self.breakLines(data)) name = "%s_%d_BA" % (chr, pos) insbps = fields[8].split("/")[0] if (fields[6] == "-"): insbps = fastaExtract.reverse_complement(insbps) data = "%s%s%s" % (self.refFile.getSequence1Base(chr, start - self.flanking, self.flanking), insbps, self.refFile.getSequence1Base(chr, end + 1, self.flanking)) rowD = ">%s\n%s\n" % (name, self.breakLines(data)) else: return ("", "") return (rowI,rowD) def nextAllele(self): """ You can't call this method unless you passed gfffile to ctor. There are 2 ways to use this class: 1) you create class with gffFile=None and you iterate over lines of gfffile calling Alleles.getAlleles() 2) you create class with gffFile=name and you call nextAllele() to iterate. """ while True: line = self.gffFile.readline() if not line: return None if not len(line.strip()): continue break return self.getAlleles(line) def main(argv=None): flankingLen = 3 if argv is None: argv = sys.argv options, remainder = getopt.getopt(argv[1:], "g:r:f:") for opt, arg in options: if opt in ("-r"): refFilename = arg elif opt in ("-g"): gffFilename = arg elif opt in ("-f"): flankingLen = int(arg) a = Alleles(gffFilename, flanking=flankingLen, refName=refFilename) while True: lines = a.nextAllele() if not lines: break if lines[0]: sys.stdout.write(lines[0]) sys.stdout.write(lines[1]) if __name__ == "__main__": sys.exit(main())