// UniprotParser.cpp #include #include #include #include "UniprotParser.hpp" namespace NGBW { // UniprotParser bool UniprotParser::Parse(UniprotRecord &record) { record.clear(); while (1) { ReadNextLine(); if (m_input.Eof()) return false; if (std::memcmp(m_line, "ID ", 3) == 0) { ParseIdentity(record); break; } } while (1) { if (m_input.Eof()) return false; if (std::memcmp(m_line, "AC ", 3) == 0) { ParseAccession(record); break; } ReadNextLine(); } while (1) { if (m_input.Eof()) return false; if (std::memcmp(m_line, "DT ", 3) == 0) { ParseDate(record); break; } ReadNextLine(); } while (1) { if (m_input.Eof()) return false; if (std::memcmp(m_line, "DE ", 3) == 0) { ParseDescription(record); break; } ReadNextLine(); } while (1) { if (m_input.Eof()) return false; if (std::memcmp(m_line, "OS ", 3) == 0) { ParseOrganism(record); break; } ReadNextLine(); } while (1) { if (m_input.Eof()) return false; if (std::memcmp(m_line, "SQ ", 3) == 0) { ParseSequence(record); break; } ReadNextLine(); } while (1) { if (m_input.Eof()) return false; if (FindEndToken()) break; ReadNextLine(); } return true; } void UniprotParser::ParseIdentity(UniprotRecord &record) { size_t first = FindFirstNotSpace(3); if (first == std::string::npos) return; size_t last = FindFirstSpace(first + 1); if (last == std::string::npos) last = std::strlen(m_line); record.identity.assign(m_line + first, last - first); ReadNextLine(); } void UniprotParser::ParseAccession(UniprotRecord &record) { size_t first = FindFirstNotSpace(3); if (first == std::string::npos) return; size_t last = FindFirstOf(';', first + 1); if (last == std::string::npos) last = std::strlen(m_line); record.accession.assign(m_line + first, last - first); ReadNextLine(); } void UniprotParser::ParseDate(UniprotRecord &record) { while (1) { const char *start = std::strstr(m_line + 3, "entry version"); if (start != NULL) { size_t first = FindFirstDigit(start - m_line); size_t last = FindFirstNotDigit(first + 1); if (last == std::string::npos) last = std::strlen(m_line); record.version.assign(m_line + first, last - first); } ReadNextLine(); if (m_input.Eof() || std::memcmp(m_line, "DT ", 3) != 0) break; } } void UniprotParser::ParseDescription(UniprotRecord &record) { size_t first = FindFirstNotSpace(3); size_t last; if (first != std::string::npos) { last = FindLastNotSpace() + 1; record.description.assign(m_line + first, last - first); } while (1) { ReadNextLine(); if (m_input.Eof() || std::memcmp(m_line, "DE ", 3) != 0) break; first = FindFirstNotSpace(3); if (first != std::string::npos) { last = FindLastNotSpace() + 1; if (std::memcmp(m_line + first, "Includes:", 9) == 0 || std::memcmp(m_line + first, "Contains:", 9) == 0 || std::memcmp(m_line + first, "Flags:", 6) == 0) record.description.push_back('\n'); else record.description.push_back(' '); record.description.append(m_line + first, last - first); } } } void UniprotParser::ParseOrganism(UniprotRecord &record) { size_t first = FindFirstNotSpace(3); if (first == std::string::npos) return; size_t last = std::strlen(m_line); record.organism.assign(m_line + first, last - first); ReadNextLine(); } void UniprotParser::ParseSequence(UniprotRecord &record) { while (1) { ReadNextLine(); if (m_input.Eof() || !std::isspace(m_line[0])) return; size_t first = FindFirstNotSpace(1); if (m_line[first] == '/' && m_line[first + 1] == '/') break; size_t last = first - 1; size_t length = std::strlen(m_line); while (1) { first = FindFirstNotSpace(last + 1); last = FindFirstSpace(first + 1); if (last == std::string::npos) last = length; record.sequence.append(m_line + first, last - first); if (last == length) break; } record.sequence.push_back('\n'); } } bool UniprotParser::FindEndToken() { size_t first = FindFirstNotSpace(); if (first == std::string::npos) return false; if (m_line[first] == '/' && m_line[first + 1] == '/') return true; return false; } } // namespace NGBW