/*
 * UniprotParser.java
 */
package org.ngbw.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads UniProt flat-file entries and fills a {@link SequenceRecord} from the
 * ID, AC, DT, DE and OS line types of each entry.
 * <p>
 * Line reading, source capture and filtering are delegated to {@link Parser};
 * this class only decides which lines contribute to which record field.
 * NOTE(review): the exact semantics of the inherited helpers
 * ({@code findFirstLine}, {@code readAndStoreLine}, {@code getCompleteSource},
 * {@code getFilteredSource}) and of the filter pattern handed to the
 * superclass constructor are defined in {@code Parser} and should be
 * confirmed there.
 *
 * @author Paul Hoover
 *
 */
class UniprotParser extends Parser {

    private static final Pattern m_idPattern = Pattern.compile("^ID\\s+.*$");
    private static final Pattern m_accessionPattern = Pattern.compile("^AC\\s+.*$");
    private static final Pattern m_datePattern = Pattern.compile("^DT\\s+.*$");
    private static final Pattern m_versionPattern = Pattern.compile(".*entry version\\s+(\\d+).*");
    private static final Pattern m_descriptionPattern = Pattern.compile("^DE\\s+.*$");
    private static final Pattern m_organismPattern = Pattern.compile("^OS\\s+.*$");
    private static final Pattern m_recordEndPattern = Pattern.compile("^\\s*//\\s*$");
    // Matches "CC" comment lines made up only of dashes; passed to Parser as the filter pattern.
    private static final Pattern m_filterPattern = Pattern.compile("^CC\\s+-+$");

    /**
     * Creates a parser over UniProt text held in a string.
     *
     * @param input the text to parse
     */
    public UniprotParser(String input) {
        super(input, m_filterPattern);
    }

    /**
     * Creates a parser over UniProt text held in a byte array.
     *
     * @param input the bytes to parse
     */
    public UniprotParser(byte[] input) {
        super(input, m_filterPattern);
    }

    /**
     * Creates a parser over UniProt text supplied by a stream.
     *
     * @param input the stream to parse
     */
    public UniprotParser(InputStream input) {
        super(input, m_filterPattern);
    }

    /**
     * Creates a parser over UniProt text supplied by a reader.
     *
     * @param input the reader to parse
     */
    public UniprotParser(Reader input) {
        super(input, m_filterPattern);
    }

    /**
     * Parses the next entry from the input.
     * <p>
     * Parsing starts at the next line matching the ID pattern and stops at the
     * record terminator line ({@code //}). Lines of types other than AC, DT,
     * DE and OS are read and stored without being interpreted.
     *
     * @return the populated record, or {@code null} when no ID line is found
     *         or the input ends before a terminator line is reached
     * @throws IOException if reading from the underlying input fails
     */
    public SequenceRecord nextRecord() throws IOException {
        if (!findFirstLine(m_idPattern)) {
            return null;
        }

        SequenceRecord record = new SequenceRecord();

        parseIdentity(record);

        while (true) {
            // Input ended without a terminator line: the partial record is discarded.
            if (m_line == null) {
                return null;
            }

            // Each parse* helper leaves m_line on the first line it did not consume.
            if (m_accessionPattern.matcher(m_line).matches()) {
                parseAccession(record);
            } else if (m_datePattern.matcher(m_line).matches()) {
                parseDate(record);
            } else if (m_descriptionPattern.matcher(m_line).matches()) {
                parseDescription(record);
            } else if (m_organismPattern.matcher(m_line).matches()) {
                parseOrganism(record);
            } else if (m_recordEndPattern.matcher(m_line).matches()) {
                record.completeSource = getCompleteSource();
                record.filteredSource = getFilteredSource();

                break;
            } else {
                readAndStoreLine();
            }
        }

        return record;
    }

    /**
     * Stores the first whitespace-delimited token after the {@code ID} line
     * code as the record's alternative id, then advances to the next line.
     *
     * @param record the record being populated
     * @throws IOException if reading from the underlying input fails
     */
    private void parseIdentity(SequenceRecord record) throws IOException {
        String[] subStrings = m_line.substring(3).trim().split("\\s+");

        record.alternativeId = subStrings[0];

        readAndStoreLine();
    }

    /**
     * Stores the first accession number of the current {@code AC} line as the
     * record's primary id, then skips any directly following {@code AC} lines.
     *
     * @param record the record being populated
     * @throws IOException if reading from the underlying input fails
     */
    private void parseAccession(SequenceRecord record) throws IOException {
        String[] subStrings = m_line.substring(3).split(";\\s*");

        record.primaryId = subStrings[0].trim();

        // Further AC lines carry secondary accessions only; consume them unread.
        while (true) {
            readAndStoreLine();

            if (m_line == null || !m_accessionPattern.matcher(m_line).matches()) {
                break;
            }
        }
    }

    /**
     * Scans a run of consecutive {@code DT} lines and stores the number that
     * follows "entry version" as the record's version, when such a line exists.
     *
     * @param record the record being populated
     * @throws IOException if reading from the underlying input fails
     */
    private void parseDate(SequenceRecord record) throws IOException {
        while (true) {
            Matcher matcher = m_versionPattern.matcher(m_line);

            if (matcher.matches()) {
                record.version = matcher.group(1);
            }

            readAndStoreLine();

            if (m_line == null || !m_datePattern.matcher(m_line).matches()) {
                break;
            }
        }
    }

    /**
     * Joins the text of a run of consecutive {@code DE} lines with single
     * spaces and stores the result as the record's name.
     *
     * @param record the record being populated
     * @throws IOException if reading from the underlying input fails
     */
    private void parseDescription(SequenceRecord record) throws IOException {
        StringBuilder field = new StringBuilder(m_line.substring(3).trim());

        while (true) {
            readAndStoreLine();

            if (m_line == null || !m_descriptionPattern.matcher(m_line).matches()) {
                break;
            }

            field.append(' ');
            field.append(m_line.substring(3).trim());
        }

        record.name = field.toString();
    }

    /**
     * Joins the text of a run of consecutive {@code OS} lines with single
     * spaces and stores the result as the record's organism.
     * <p>
     * An organism name may wrap over several {@code OS} lines. Handling only
     * one line per call let each continuation line overwrite the previous
     * value, so only the last fragment survived.
     *
     * @param record the record being populated
     * @throws IOException if reading from the underlying input fails
     */
    private void parseOrganism(SequenceRecord record) throws IOException {
        StringBuilder field = new StringBuilder(m_line.substring(3).trim());

        while (true) {
            readAndStoreLine();

            if (m_line == null || !m_organismPattern.matcher(m_line).matches()) {
                break;
            }

            field.append(' ');
            field.append(m_line.substring(3).trim());
        }

        record.organism = field.toString();
    }
}