/*
 * GenbankParser.java
 */
package org.ngbw.utils;


import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.regex.Pattern;


/**
 * Parser for GenBank flat-file records. Each call to {@link #nextRecord()}
 * locates the next <code>LOCUS</code> line and collects the DEFINITION,
 * ACCESSION, VERSION and SOURCE fields found before the <code>//</code>
 * record terminator. All other fields are skipped.
 *
 * @author Paul Hoover
 *
 */
class GenbankParser extends Parser {
    // A line that starts in column 0 with a non-whitespace character begins a
    // new top-level field; indented lines continue the current field.
    private static final Pattern m_fieldStartPattern = Pattern.compile("^\\S+.*$");
    private static final Pattern m_locusPattern = Pattern.compile("^LOCUS.*$");
    private static final Pattern m_definitionPattern = Pattern.compile("^DEFINITION.*$");
    private static final Pattern m_accessionPattern = Pattern.compile("^ACCESSION.*$");
    private static final Pattern m_versionPattern = Pattern.compile("^VERSION.*$");
    private static final Pattern m_sourcePattern = Pattern.compile("^SOURCE.*$");
    private static final Pattern m_recordEndPattern = Pattern.compile("^\\s*//\\s*$");
    // Handed to the Parser base class by every constructor. Presumably the
    // base class uses it to decide which lines are left out of the filtered
    // source -- confirm against Parser.
    private static final Pattern m_filterPattern = Pattern.compile("^\\s+/translation=.*$|^ORIGIN.*$");


    /**
     * Creates a parser that reads GenBank text from a string.
     *
     * @param input the GenBank text to parse
     */
    public GenbankParser(String input) {
        super(input, m_filterPattern);
    }

    /**
     * Creates a parser that reads GenBank text from a byte array.
     *
     * @param input the GenBank data to parse
     */
    public GenbankParser(byte[] input) {
        super(input, m_filterPattern);
    }

    /**
     * Creates a parser that reads GenBank text from a stream.
     *
     * @param input the stream supplying the GenBank data
     */
    public GenbankParser(InputStream input) {
        super(input, m_filterPattern);
    }

    /**
     * Creates a parser that reads GenBank text from a reader.
     *
     * @param input the reader supplying the GenBank text
     */
    public GenbankParser(Reader input) {
        super(input, m_filterPattern);
    }

    /**
     * Parses the next record from the input.
     *
     * @return the parsed record, or <code>null</code> if no further
     *         <code>LOCUS</code> line is found or the input ends before the
     *         <code>//</code> record terminator is reached
     * @throws IOException if reading the input fails
     */
    public SequenceRecord nextRecord() throws IOException {
        if (!findFirstLine(m_locusPattern)) {
            return null;
        }

        SequenceRecord record = new SequenceRecord();

        readAndStoreLine();

        while (true) {
            // Input ran out before the terminator: the record is incomplete.
            if (m_line == null) {
                return null;
            }

            // Every branch except the terminator leaves m_line positioned on
            // a line that has not been examined yet.
            if (m_definitionPattern.matcher(m_line).matches()) {
                parseDefinition(record);
            } else if (m_accessionPattern.matcher(m_line).matches()) {
                parseAccession(record);
            } else if (m_versionPattern.matcher(m_line).matches()) {
                parseVersion(record);
            } else if (m_sourcePattern.matcher(m_line).matches()) {
                parseSource(record);
            } else if (m_recordEndPattern.matcher(m_line).matches()) {
                record.completeSource = getCompleteSource();
                record.filteredSource = getFilteredSource();
                break;
            } else {
                ignoreField();
            }
        }

        return record;
    }

    /**
     * Reads the DEFINITION field, joining its continuation lines with single
     * spaces, and stores the result in <code>record.name</code>. On return,
     * the current line is the start of the next field, or <code>null</code>
     * at end of input.
     *
     * @param record the record that receives the definition text
     * @throws IOException if reading the input fails
     */
    private void parseDefinition(SequenceRecord record) throws IOException {
        StringBuilder field = new StringBuilder();
        String[] subStrings = m_line.split("\\s+", 2);

        if (subStrings.length > 1) {
            field.append(subStrings[1]);
        }

        while (true) {
            readAndStoreLine();

            if (m_line == null || m_fieldStartPattern.matcher(m_line).matches()) {
                record.name = field.toString();
                return;
            }

            // Separate pieces with one space, but do not start the name with
            // a space when the keyword line carried no text of its own.
            if (field.length() > 0) {
                field.append(' ');
            }

            field.append(m_line.trim());
        }
    }

    /**
     * Reads the ACCESSION line and stores the first accession listed on it in
     * <code>record.primaryId</code>, then advances to the next line.
     *
     * @param record the record that receives the accession
     * @throws IOException if reading the input fails
     */
    private void parseAccession(SequenceRecord record) throws IOException {
        String[] subStrings = m_line.split("\\s+");

        if (subStrings.length > 1) {
            record.primaryId = subStrings[1];
        }

        readAndStoreLine();
    }

    /**
     * Reads the VERSION line, then advances to the next line. The second
     * token is stored in <code>record.version</code>. If a third token of the
     * form <code>GI:nnn</code> is present, the text after the prefix is stored
     * in <code>record.alternativeId</code>.
     *
     * @param record the record that receives the version information
     * @throws IOException if reading the input fails
     */
    private void parseVersion(SequenceRecord record) throws IOException {
        String[] subStrings = m_line.split("\\s+");

        // The GI token is optional: GenBank flat files may carry only the
        // accession.version token here, and the version must still be kept.
        if (subStrings.length > 1) {
            record.version = subStrings[1];
        }

        if (subStrings.length > 2 && subStrings[2].startsWith("GI:")) {
            record.alternativeId = subStrings[2].substring(3);
        }

        readAndStoreLine();
    }

    /**
     * Reads the SOURCE line and stores the text after the keyword in
     * <code>record.organism</code>, then skips the rest of the field.
     *
     * @param record the record that receives the organism text
     * @throws IOException if reading the input fails
     */
    private void parseSource(SequenceRecord record) throws IOException {
        String[] subStrings = m_line.split("\\s+", 2);

        if (subStrings.length > 1) {
            record.organism = subStrings[1];
        }

        ignoreField();
    }

    /**
     * Skips the remainder of the current field. Lines are read until one
     * starts a new top-level field or the input is exhausted.
     *
     * @throws IOException if reading the input fails
     */
    private void ignoreField() throws IOException {
        while (true) {
            readAndStoreLine();

            if (m_line == null || m_fieldStartPattern.matcher(m_line).matches()) {
                return;
            }
        }
    }
}