// Implementation of SyntaxHighlighter: looks up a grammar by language name and
// splits a text into a TokenList by repeatedly applying the grammar's patterns.
//
// NOTE(review): three expressions in this file carry no template/type argument
// as written: `std::make_shared()`, the `std::map` return type of languages(),
// and `dynamic_cast(*currentNode)`. The arguments were presumably lost
// somewhere; confirm the intended types against the headers before building.
#include "SyntaxHighlighter.h"
#include "LanguageTree.h"
#include "TokenList.h"

// Creates the object stored in m_tree and calls load(path) on it.
//
// path: forwarded unchanged to m_tree->load().
SyntaxHighlighter::SyntaxHighlighter(const std::string& path)
{
    // NOTE(review): no template argument is given to make_shared here;
    // presumably the tree type declared in LanguageTree.h -- confirm.
    m_tree = std::make_shared();
    m_tree->load(path);
}

// Tokenizes `text` using the grammar registered under `language`.
//
// text:     the text to tokenize.
// language: key passed to m_tree->find() to obtain the grammar.
//
// Returns the result of the grammar-based tokenize() overload when a grammar
// is found; otherwise a TokenList constructed directly from `text`.
TokenList SyntaxHighlighter::tokenize(const std::string& text, const std::string& language)
{
    const Grammar* grammar = m_tree->find(language);
    if (grammar)
    {
        return tokenize(text, grammar);
    }
    // find() yielded a null grammar: fall back to a list built from the raw text
    return TokenList(text);
}

// Returns whatever m_tree->keys() returns.
// NOTE(review): the map's key/value types are not visible in this file.
std::map SyntaxHighlighter::languages() const
{
    return m_tree->keys();
}

// Tokenizes `text` with an explicit grammar.
//
// text:    the text to tokenize.
// grammar: grammar whose tokens are applied; dereferenced by matchGrammar()
//          without a null check, so it must not be null.
//
// Returns the TokenList built from `text` after matchGrammar() has processed it.
TokenList SyntaxHighlighter::tokenize(std::string_view text, const Grammar* grammar)
{
    TokenList tokenList(text);
    // start at the list head, text offset 0, and with no rematch restriction
    matchGrammar(text, tokenList, grammar, tokenList.head, 0, nullptr);
    return tokenList;
}

// Applies every pattern of every token in `grammar` to the non-syntax nodes of
// `tokenList`, replacing matched text with new nodes in place.
//
// text:      the complete text being tokenized; greedy patterns are matched
//            against it rather than against a single node's text.
// tokenList: list that is modified in place (addAfter / removeRange).
// grammar:   provides `tokens`; each token is iterated for its patterns.
// startNode: scanning begins at startNode->next.
// startPos:  initial value of the running text position `pos`, which is then
//            advanced by each visited node's length().
// rematch:   null for a normal pass. When non-null the call returns as soon as
//            it reaches the pattern identified by (rematch->token, rematch->j),
//            stops scanning a pattern once pos >= rematch->reach, and raises
//            rematch->reach when a match extends beyond it.
void SyntaxHighlighter::matchGrammar(std::string_view text, TokenList& tokenList, const Grammar* grammar, TokenListPtr startNode, size_t startPos, RematchOptions* rematch)
{
    for (const auto& token : grammar->tokens)
    {
        // x counts the patterns of this token visited so far; it is compared
        // with rematch->j and stored in nestedRematch.j below
        int x = 0;
        for (auto j = token.cbegin(); j != token.cend(); ++j)
        {
            // during a rematch, stop once we are back at the pattern that caused it
            if (rematch && rematch->j == x && rematch->token == token.name())
            {
                return;
            }
            const auto& pattern = *j;
            const auto& inside = pattern->inside();
            const bool greedy = pattern->greedy();
            size_t pos = startPos;
            // iterate the token list and keep track of the current token/string position
            // (the walk ends when it arrives at tokenList.head)
            for (TokenListPtr currentNode = startNode->next; currentNode != tokenList.head; pos += currentNode->length(), currentNode = currentNode->next)
            {
                if (rematch && pos >= rematch->reach)
                {
                    break;
                }
                // sanity guard: tokenList.length (presumably the node count -- confirm)
                // exceeding the number of characters is treated as a fatal state
                if (tokenList.length > text.length())
                {
                    // Something went terribly wrong, ABORT, ABORT!
                    return;
                }
                // syntax nodes are skipped; only the remaining nodes are matched
                if (currentNode->isSyntax())
                {
                    continue;
                }
                // NOTE(review): the dynamic_cast target type is missing as written;
                // the result is only used for its value() -- confirm the type.
                const auto& currentText = dynamic_cast(*currentNode);
                std::string_view str = currentText.value();
                auto removeCount = 1; // number of nodes this match replaces; passed to tokenList.removeRange() below
                std::string_view match;
                // matchSuccess and matchIndex are read right after pattern->match(),
                // so they appear to be filled in by that call
                bool matchSuccess = false;
                size_t matchIndex = pos;
                if (greedy)
                {
                    // greedy: match against the whole text, with matchIndex starting at pos
                    match = pattern->match(matchSuccess, matchIndex, text);
                    if (!matchSuccess || matchIndex >= text.length())
                    {
                        break;
                    }
                    auto from = matchIndex;
                    auto to = matchIndex + match.length();
                    auto p = pos;
                    // find the node that contains the match
                    p += currentNode->length();
                    while (from >= p)
                    {
                        currentNode = currentNode->next;
                        p += currentNode->length();
                    }
                    // adjust pos (and p)
                    // (both now hold the text offset at which currentNode starts)
                    p -= currentNode->length();
                    pos = p;
                    // if the current node is a syntax node, the match starts inside an existing token, which is invalid
                    if (currentNode->isSyntax())
                    {
                        continue;
                    }
                    // find the last node which is affected by this match
                    for (TokenListPtr k = currentNode; k != tokenList.head && (p < to || !k->isSyntax()); k = k->next)
                    {
                        removeCount++;
                        p += k->length();
                    }
                    // removeCount started at 1, so undo the extra count
                    removeCount--;
                    // replace with the new match
                    // str becomes the text covered by all affected nodes, and
                    // matchIndex becomes relative to the start of str
                    str = text.substr(pos, p - pos);
                    matchIndex -= pos;
                }
                else
                {
                    // non-greedy: match only within the current node's own text
                    matchIndex = 0;
                    match = pattern->match(matchSuccess, matchIndex, str);
                    if (!matchSuccess)
                    {
                        continue;
                    }
                }
                // split str into the text before the match, the match, and the text after it
                auto from = matchIndex;
                auto before = str.substr(0, from);
                auto after = str.substr(from + match.length());
                // reach is the text offset just past the region covered by str
                auto reach = pos + str.length();
                if (rematch && reach > rematch->reach)
                {
                    rematch->reach = reach;
                }
                TokenListPtr removeFrom = currentNode->prev;
                if (before.size())
                {
                    // keep the unmatched prefix as its own node and advance pos past it
                    removeFrom = tokenList.addAfter(removeFrom, before);
                    pos += before.length();
                }
                tokenList.removeRange(removeFrom, removeCount);
                // content of the new node: the match tokenized with the nested
                // grammar when `inside` is set, otherwise a list built from the match
                TokenList tokenEntries = [&]() {
                    if (inside)
                    {
                        return tokenize(match, inside);
                    }
                    else
                    {
                        return TokenList(match);
                    }
                }();
                // insert the node for the match; currentNode now refers to it so the
                // loop increment continues from the node that follows
                currentNode = tokenList.addAfter(removeFrom, token.name(), std::move(tokenEntries), pattern->alias(), match.size());
                if (after.size())
                {
                    tokenList.addAfter(currentNode, after);
                }
                if (removeCount > 1)
                {
                    // at least one Token object was removed, so we have to do some rematching
                    // this can only happen if the current pattern is greedy
                    RematchOptions nestedRematch = { .token = token.name(), .reach = reach, .j = x };
                    matchGrammar(text, tokenList, grammar, currentNode->prev, pos, &nestedRematch);
                    // the reach might have been extended because of the rematching
                    if (rematch && nestedRematch.reach > rematch->reach)
                    {
                        rematch->reach = nestedRematch.reach;
                    }
                }
            }
            ++x;
        }
    }
}