/*
 * FastaParser.java
 */
package org.ngbw.utils;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.regex.Pattern;

/**
 * Pulls records out of FASTA-style text one at a time.
 * <p>
 * A record starts at a header line, i.e. a line consisting of optional leading
 * whitespace followed by {@code '>'}, and runs up to (but not including) the
 * next header line or the end of the input. Lines that appear before the first
 * header line are skipped.
 * <p>
 * The parser keeps one line of look-ahead between calls and never closes the
 * underlying reader; closing the source is left to the caller.
 *
 * @author Paul Hoover
 *
 */
class FastaParser {
    /** Matches a header line: optional leading whitespace, then {@code '>'}. */
    private static final Pattern m_headerPattern = Pattern.compile("^\\s*>.*$");

    /**
     * Charset used to decode byte-oriented input. Fixed explicitly so that
     * decoding does not vary with the JVM's default charset; plain ASCII data
     * decodes identically under UTF-8.
     */
    private static final Charset m_inputCharset = Charset.forName("UTF-8");

    private final BufferedReader m_reader;
    /** Text of the record currently being assembled, one {@code '\n'} per stored line. */
    private StringBuilder m_completeSource;
    /** Look-ahead line; {@code null} before the first read and once the input is exhausted. */
    private String m_line;

    /**
     * Creates a parser over text held in a string.
     *
     * @param input the text to parse
     */
    public FastaParser(String input) {
        this(new StringReader(input));
    }

    /**
     * Creates a parser over text held in a byte array. The bytes are decoded
     * as UTF-8.
     *
     * @param input the encoded text to parse
     */
    public FastaParser(byte[] input) {
        this(new ByteArrayInputStream(input));
    }

    /**
     * Creates a parser over a byte stream. The bytes are decoded as UTF-8
     * rather than with the platform default charset, so the same bytes always
     * yield the same text.
     *
     * @param input the stream to read encoded text from
     */
    public FastaParser(InputStream input) {
        this(new InputStreamReader(input, m_inputCharset));
    }

    /**
     * Creates a parser over a character stream. The reader is wrapped in a
     * {@link BufferedReader} so that it can be consumed line by line.
     *
     * @param input the reader to read text from
     */
    public FastaParser(Reader input) {
        m_reader = new BufferedReader(input);
    }

    /**
     * Reads the next record from the input.
     * <p>
     * On return, {@code completeSource} of the record holds the header line
     * and every following line up to the next header line or the end of the
     * input, each terminated by {@code '\n'}; {@code filteredSource} holds the
     * header line only.
     *
     * @return the next record, or {@code null} if no further header line is found
     * @throws IOException if reading from the underlying reader fails
     */
    public SequenceRecord nextRecord() throws IOException {
        if (!findFirstLine())
            return null;

        SequenceRecord record = new SequenceRecord();

        // Consumes the header line and leaves m_line on the line after it.
        parseHeader(record);

        // Everything up to the next header (or end of input) belongs to this record.
        while (m_line != null && !m_headerPattern.matcher(m_line).matches()) {
            storeLine();
            readLine();
        }

        record.completeSource = m_completeSource.toString();

        return record;
    }

    /**
     * Starts a fresh record buffer and advances to the next header line,
     * discarding any non-header lines on the way. The header line itself is
     * stored in the buffer and left in the look-ahead.
     *
     * @return {@code true} if a header line was found, {@code false} if the
     *         input ended first
     * @throws IOException if reading from the underlying reader fails
     */
    private boolean findFirstLine() throws IOException {
        m_completeSource = new StringBuilder();

        // Nothing in the look-ahead yet (first call, or input previously exhausted).
        if (m_line == null)
            readLine();

        while (m_line != null) {
            if (m_headerPattern.matcher(m_line).matches()) {
                storeLine();
                return true;
            }

            readLine();
        }

        return false;
    }

    /**
     * Replaces the look-ahead with the next line of input, or {@code null} at
     * the end of the input.
     *
     * @throws IOException if reading from the underlying reader fails
     */
    private void readLine() throws IOException {
        m_line = m_reader.readLine();
    }

    /**
     * Appends the look-ahead line, followed by {@code '\n'}, to the buffer of
     * the current record.
     */
    private void storeLine() {
        m_completeSource.append(m_line);
        m_completeSource.append('\n');
    }

    /**
     * Fills in the header-derived fields of a record from the look-ahead line
     * and then advances past it.
     * <p>
     * The line, minus its first character, is split on {@code '|'}. Only when
     * that yields exactly five pieces are {@code primaryId}, {@code organism}
     * and {@code name} assigned (from pieces 1, 3 and 4); otherwise this
     * method leaves those fields untouched. {@code filteredSource} is always
     * set to the header line plus {@code '\n'}.
     *
     * @param record the record to populate
     * @throws IOException if reading the line after the header fails
     */
    private void parseHeader(SequenceRecord record) throws IOException {
        // Piece 0 is never read, so it does not matter whether the dropped
        // first character is '>' or leading whitespace.
        String[] subStrings = m_line.substring(1).split("\\|");

        if (subStrings.length == 5) {
            record.primaryId = subStrings[1];
            record.organism = subStrings[3];
            record.name = subStrings[4];
        }

        record.filteredSource = m_line + '\n';

        readLine();
    }
}