#!/usr/bin/env python
# Program: cipres_data_parse
#
# Description: Standalone Python program to parse data files uploaded
# to CIPRES gateway. Currently configued to handle BEAST, BEAST2 and
# Migrate input files, but will be extended to other file formats
#
# Note that all files are opened with universal newlines support
# ('rU') so that we can handle files created using the Linux, Windows
# and Mac formats
#
# File parsing routines have very minimal error checking capabilities,
# definitely no substitute for a comprehensive file format checker
#
# Author: Robert Sinkovits, SDSC
#
# Usage: cipres_data_parse [file_name] <-t file_type>
import re
import argparse
# Process the command line arguments
file_types = ['beast', 'beast2', 'migrate_parm', 'migrate_infile']
parser = argparse.ArgumentParser(description='Process file name and file type cmd line args')
parser.add_argument(dest='file_name')
parser.add_argument('-t', '--type', dest='file_type', choices=file_types, default='unknown')
args = parser.parse_args()
file_name = args.file_name
file_type = args.file_type
def process_beast(file_name):
# Process BEAST files and return following
#
# datatype (nucleotide or amino acid)
# codon_partioning (True or False)
# nu_partitions (number of partitions)
# nu_patterns (number of patterns)
# Initialize results
datatype = 'unknown'
codon_partitioning = False
nu_partitions = 0
pattern_count = 0
# Start by assuming successful parsing of file
err_code = 0
# Define the regex that will be used to identify
# and parse the dataType, npatterns and codon lines
regex_datatype = '.*alignment.*data[tT]ype\s*=\s*"'
regex_patterns = '.*npatterns\s*=\s*'
regex_codon = 'codon'
# Compile the regex. Probably doesn't make a performance difference
# since python will cache recently used regex, but doesn't hurt
cregex_datatype = re.compile('.*alignment.*data[tT]ype\s*=\s*"')
cregex_patterns = re.compile('.*npatterns\s*=\s*')
cregex_codon = re.compile('codon')
with open(file_name, 'rU') as fin:
for line in fin:
line = line.rstrip()
# Process the dataType lines
if cregex_datatype.search(line):
line = re.sub(regex_datatype, '', line)
line = re.sub('".*', '', line)
datatype = re.sub('\s*', '', line)
# Process the lines that list number of patterns
# Increment pattern and partition counts
if cregex_patterns.search(line):
line = re.sub(regex_patterns, '', line)
line = re.sub('\D.*', '', line)
pattern_count += int(line)
nu_partitions += 1
# Look for lines that contain the string "codon"
if cregex_codon.search(line):
codon_partitioning = True
# Test for errors:
# Data type is not set to aminoacid or nucleotide
if datatype != 'aminoacid' and datatype != 'nucleotide':
err_code = 1
# Number of partitions or pattern counts non-positive numbers
if nu_partitions <= 0 or pattern_count <= 0:
err_code = 1
return err_code, datatype, codon_partitioning, nu_partitions, pattern_count
def process_beast2(file_name):
# Process BEAST2 files and return following
#
# nu_partitions (number of partitions)
# Partition count is determined from the number of pairs appearing AFTER the tag starting with
# = 0:
start_counting = True
continue
# Start counting partitions
if start_counting and line.find('= 0 and line.find('/>') < 0:
nu_partitions += 1
# Test for errors:
# Number of partitions non-positive number
if nu_partitions <= 0:
err_code = 1
return err_code, nu_partitions
def process_migrate_parm(file_name):
# Process Migrate parmfile and return following
#
# num_reps (Number of replicates)
# Replicate count is determined from the line
# replicate=< NO | YES: >
# Initialize replicates to 1
num_reps = 1
# Start by assuming successful parsing of file
err_code = 0
with open(file_name, 'rU') as fin:
for line in fin:
line = line.rstrip()
# Look for the starting line
if line.find('replicate=YES') >= 0:
p1, num_reps = line.split(':')
break
# Test for errors:
# number of replicates is not a valid number
if not str(num_reps).isdigit():
err_code = 1
return err_code, num_reps
def process_migrate_infile(file_name):
# Process Migrate infile and return following
#
# Number of num_loci
# Num_Loci count is determined from 1st record, 2nd field
# Initialize num_loci to 1
num_loci = 1
# Start by assuming successful parsing of file
err_code = 0
with open(file_name, 'rU') as fin:
for line in fin:
line = line.strip()
pline = line.split()
num_loci = pline[1]
break
# Test for errors:
# number of loci is not a valid number
if not str(num_loci).isdigit():
err_code = 1
return err_code, num_loci
#--------------------------------------------------------------
# ------------------- Start main program ----------------------
#--------------------------------------------------------------
# Determine the input file type if not already set
if file_type == 'unknown':
with open(file_name, 'rU') as fin:
for line in fin:
if line.find('BEAUTi') >= 0:
file_type = 'beast'
break
if line.find('= 0 and line.find('version="2.0">') >= 0:
file_type = 'beast2'
break
if line.find('Parmfile for Migrate') >= 0:
file_type = 'migrate_parm'
break
# Process BEAST files
if file_type == 'beast':
err_code, datatype, codon_partitioning, nu_partitions, pattern_count = process_beast(file_name)
results = 'file_type=' + file_type + '\n'
results += 'err_code=' + str(err_code) + '\n'
results += 'datatype=' + datatype + '\n'
results += 'codon_partitioning=' + str(codon_partitioning) + '\n'
results += 'nu_partitions=' + str(nu_partitions) +'\n'
results += 'pattern_count=' + str(pattern_count)
# Process BEAST2 files
if file_type == 'beast2':
err_code, nu_partitions = process_beast2(file_name)
results = 'file_type=' + file_type + '\n'
results += 'err_code=' + str(err_code) + '\n'
results += 'nu_partitions=' + str(nu_partitions)
# Process Migrate parmfile
if file_type == 'migrate_parm':
err_code, num_reps = process_migrate_parm(file_name)
results = 'file_type=' + file_type + '\n'
results += 'err_code=' + str(err_code) + '\n'
results += 'num_reps=' + str(num_reps)
# Process Migrate infile
if file_type == 'migrate_infile':
err_code, num_loci = process_migrate_infile(file_name)
results = 'file_type=' + file_type + '\n'
results += 'err_code=' + str(err_code) + '\n'
results += 'num_loci=' + str(num_loci)
# Unknown or unidentifiable file type
if file_type == 'unknown':
results = 'file_type=' + file_type + '\n'
results += 'err_code=1'
print results