#!/usr/bin/env python """ Usage is: bamerrors.py -M email -A account -N jobname [-q queue] [-h runHours] -r -f [1|2] -c cutoff list_of_bamfiles -M, -A, -N are the same as the corresponding qsub arguments. -A and -N are mandatory. -q is optional. Default is "small". -h runHours is optional. Default is no limit specified (will run as long as needed up to the queue limit). I don't know if this is time for the whole job array or for each job in the array. This value is also given to the job that runs the final step of finding errors above a cutoff value. -c cutoff After errors are counted in each bam file this looks for read positions in each output file that are above the specified cutoff and writes them to bamerrors_cutoff.txt. NOTE that the -c cutoff argument doesn't work yet due to problems with having the job depend on an array of jobs! You'll need to run bamerrors_cutoff.py after all the bamerrors.py jobs have finished. A submission script for running bamerrors_cutoff.py will be generated for you as .finalize.run. -r run R to generate graphical output -f MD format (see samerrors3.py) Use absolute or relative paths to specify the bamfiles. The bamfilenames (not including the path) must be distinct and should end with ".bam". Results are produced in the current directory. Job will be submitted as a pbs job array. With -M you'll be emailed as each job in the array completes. I'm not sure what happens if you submit more files than the number of jobs you're allowed to run. """ import sys import os import getopt import subprocess import re from decimal import * samtoolsCmd = "samtools" # samErrorsCmd = "/home/teschwartz/stsi/sam_errors/samerrors2.py" samErrorsCmd = "samerrors3.py" # cutoffCmd = "/home/teschwartz/stsi/sam_errors/bamerrors_cutoff.py" cutoffCmd = "bamerrors_cutoff.py" # Set this to empty string if you don't want to run head. 
# headCmd = "head -n 1000 | " headCmd = " " def main(argv=None): if argv is None: argv=sys.argv rOption = "" mdFormat = "1" userEmail = userAccount = userRunHours = userJobname = None userQueue = "small" cutoff = None options, bamfiles= getopt.getopt(argv[1:], "M:A:N:q:h:f:c:r") for opt, arg in options: if opt in ("-M"): userEmail = arg if opt in ("-A"): userAccount = arg if opt in ("-q"): userQueue = arg if opt in ("-h"): userRunHours = arg if opt in ("-f"): mdFormat= arg if opt in ("-N"): userJobname = arg if opt in ("-r"): rOption = "-r" if opt in ("-c"): cutoff = arg Decimal(cutoff) # should throw exception if cutoff can't be parsed. if (not userAccount or not userJobname or not cutoff): print >> sys.stderr, __doc__ return scriptName = createScriptFile(bamfiles, userJobname, userEmail, userAccount, userQueue, userRunHours, mdFormat, rOption) jobid = submitJob(scriptName) if not jobid: print "Error getting jobid from qstub." return scriptName = createFinalizeScriptFile(bamfiles, userJobname, userEmail, userAccount, userQueue, userRunHours, cutoff, jobid) # This cutoff job should depend on completion of the array but I haven't been able to get that to # work, so instead we'll just create the job submission script and let the user submit it when he # knows the job array is finished. 
def createFinalizeScriptFile(bamfiles, userJobname, userEmail, userAccount,
                             userQueue, userRunHours, cutoff, jobid):
    """Write the PBS script that runs the final cutoff step.

    The generated script invokes bamerrors_cutoff.py -c <cutoff> on the
    ".out" file produced for each bam file.  The afterokarray dependency on
    the array job (jobid) is left commented out because it has not worked;
    the user submits this script manually once the array finishes.

    Returns the script filename, <jobname>.finalize.run.
    """
    scriptName = userJobname + ".finalize.run"
    userJobname = userJobname + ".finalize"
    text = """#!/bin/bash
#PBS -q %s
#PBS -N %s
#PBS -j oe
#PBS -o %s.stdout
#PBS -V
#PBS -A %s
##PBS -W depend=afterokarray:%s[]
""" % (userQueue, userJobname, userJobname, userAccount, jobid)
    if userEmail:
        text += """
#PBS -M %s
#PBS -m ae
""" % (userEmail)
    if userRunHours:
        text += """
#PBS -l walltime=%s:00:00
""" % (userRunHours)
    text += """
source /etc/profile
cd %s
""" % (os.getcwd())
    cmdline = """
%s -c %s """ % (cutoffCmd, cutoff)
    for filename in bamfiles:
        # Each array task wrote <basename>.out into the current directory.
        cmdline += os.path.basename(filename) + ".out" + " "
    # "with" guarantees the script is flushed and closed even on error.
    with open(scriptName, "w") as sfile:
        sfile.write(text)
        sfile.write(cmdline + "\n")
    return scriptName


def createScriptFile(bamfiles, userJobname, userEmail, userAccount,
                     userQueue, userRunHours, mdFormat, rOption):
    """Write the PBS job-array script that counts errors in each bam file.

    One array task per bam file (#PBS -t 0-<n-1>); task $PBS_ARRAYID pipes
    "samtools view -h <bamfile>" into the samerrors script, producing
    <basename>.out in the current directory.

    Returns the script filename, <jobname>.run.
    """
    scriptName = userJobname + ".run"
    text = """#!/bin/bash
#PBS -q %s
#PBS -N %s
#PBS -j oe
#PBS -o %s.stdout
#PBS -V
#PBS -A %s
#PBS -t 0-%d
""" % (userQueue, userJobname, userJobname, userAccount, len(bamfiles) - 1)
    if userEmail:
        text += """
#PBS -M %s
#PBS -m ae
""" % (userEmail)
    if userRunHours:
        text += """
#PBS -l walltime=%s:00:00
""" % (userRunHours)
    text += "bamfiles=(" + " ".join(bamfiles) + ")\n"
    text += """
source /etc/profile
module load R
cd %s
""" % (os.getcwd())
    # headCmd is normally just whitespace; set it to "head -n 1000 | " at the
    # top of this file to limit each run to the first lines of the bam dump.
    cmdline = """
%s view -h %s | %s %s -o %s -f %s %s
""" % (samtoolsCmd, "${bamfiles[$PBS_ARRAYID]}", headCmd, samErrorsCmd,
       "`basename ${bamfiles[$PBS_ARRAYID]}.out`", mdFormat, rOption)
    with open(scriptName, "w") as sfile:
        sfile.write(text)
        sfile.write(cmdline)
    return scriptName


def submitJob(scriptName):
    """Submit scriptName with qsub and return the full jobid, or None on failure.

    Echoes qsub's output (or its error output on failure) to stdout.
    """
    try:
        # Argument list + shell=False: no word-splitting or shell injection
        # if scriptName contains spaces or metacharacters.
        p = subprocess.Popen(["qsub", scriptName],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True)
    except OSError as e:
        # qsub itself is missing/unrunnable; report it like any other failure.
        sys.stdout.write("Error submitting job:\n\n%s\n" % e)
        return None
    output, err = p.communicate()
    if p.returncode != 0:
        sys.stdout.write("Error submitting job:\n\n")
        sys.stdout.write(err)
        return None
    sys.stdout.write(output)
    # qsub prints a jobid like "1234567.triton-NN.sdsc.edu".  The first dot is
    # the literal separator between the job number and the hostname; the later
    # unescaped "." is kept from the original pattern to allow characters such
    # as "-" after "triton".
    m = re.search(r"^(\d+)\.triton.\S+", output, re.M)
    if m is not None:
        # group(0) is the full jobid; group(1) would be the bare number.
        return m.group(0)
    return None


if __name__ == "__main__":
    sys.exit(main())