#!/usr/bin/env python """ Usage: extract.py """ import sys import os import re import string import subprocess import tempfile import getopt """ """ def getFromReference(refID, offset, length): reffile = ReferenceFastaFile(refID) seq = reffile.getSequence(offset, length) reffile.close() return seq class ReferenceFastaFile: def __init__(self, refID): self.filename = ReferenceFastaFile.getFilename(refID) self.fd = open(self.filename, "r") self.start = 0 self.lineLength = 0 self.getInfo() def getFilename(refID): global refFilename, refDirectory if refDirectory: filename = "%s/chr%s.fa" % (refDirectory, refID) else: filename = refFilename return filename getFilename = staticmethod(getFilename) def getInfo(self): """ determines offset in file of first byte of sequence data (by skipping the comment line) determines length of data lines returns both """ self.fd.seek(0, 0) line = self.fd.readline(); self.start = self.fd.tell(); line = self.fd.readline(); line = line.rstrip("\n"); self.lineLength = len(line); def getSequence(self, offset, length): start = offset + int(offset / self.lineLength) + self.start self.fd.seek(start, 0) got = 0 result = [] while got < length: ch = self.fd.read(1) if (ch == '\n'): continue result.append(ch) got += 1 return ''.join(result) def close(self): self.fd.close() refFilename = refDirectory = None def main(argv=None): if argv is None: argv = sys.argv global refFilename, refDirectory options, remainder = getopt.getopt(argv[1:], "h") for opt, arg in options: if opt in ("-h"): print __doc__ return 0 if len(remainder) != 2: print __doc__ return 1 refDirectory = remainder[0] coordFilename = remainder[1] coordFile = open(coordFilename) for line in coordFile: line.strip() field = line.split() seq = getFromReference(field[0], int(field[1]), int(field[2])) print seq if __name__ == "__main__": sys.exit(main())