/**************************************************************************** ** Copyright (C) 2017 Ford Motor Company. ** All rights reserved. ** ** Copyright (C) 2017 The Qt Company Ltd. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtRemoteObjects module of the Qt Toolkit. ** ** $QT_BEGIN_LICENSE:LGPL$ ** Commercial License Usage ** Licensees holding valid commercial Qt licenses may use this file in ** accordance with the commercial license agreement provided with the ** Software or, alternatively, in accordance with the terms contained in ** a written agreement between you and The Qt Company. For licensing terms ** and conditions see https://www.qt.io/terms-conditions. For further ** information use the contact form at https://www.qt.io/contact-us. ** ** GNU Lesser General Public License Usage ** Alternatively, this file may be used under the terms of the GNU Lesser ** General Public License version 3 as published by the Free Software ** Foundation and appearing in the file LICENSE.LGPL3 included in the ** packaging of this file. Please review the following information to ** ensure the GNU Lesser General Public License version 3 requirements ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. ** ** GNU General Public License Usage ** Alternatively, this file may be used under the terms of the GNU ** General Public License version 2.0 or (at your option) the GNU General ** Public license version 3 or any later version approved by the KDE Free ** Qt Foundation. The licenses are as published by the Free Software ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 ** included in the packaging of this file. Please review the following ** information to ensure the GNU General Public License requirements will ** be met: https://www.gnu.org/licenses/gpl-2.0.html and ** https://www.gnu.org/licenses/gpl-3.0.html. ** ** $QT_END_LICENSE$ ** ****************************************************************************/ #ifndef QREGEXPARSER_H #define QREGEXPARSER_H #include #include #include #ifdef QT_BOOTSTRAPPED # include # define REGEX QRegExp #else # include # define REGEX QRegularExpression #endif #include #include #include #include struct MatchCandidate { MatchCandidate(const QString &n, const QString &t, int i) : name(n), matchText(t), index(i) {} QString name; QString matchText; int index; }; QT_BEGIN_NAMESPACE template class QRegexParser: protected _Table { public: QRegexParser(int maxMatchLen=4096); virtual ~QRegexParser(); virtual bool parse(); virtual void reset() {} inline QVariant &sym(int index); void setBuffer(const QString &buffer); void setBufferFromDevice(QIODevice *device); void setDebug(); QString errorString() const { return m_errorString; } void setErrorString(const QString &error) { m_errorString = error; qWarning() << m_errorString; } inline const QMap& captured() const { return m_captured; } inline bool isDebug() const { return m_debug; } inline int lineNumber() const { return m_lineno; } private: int nextToken(); inline bool consumeRule(int rule) { return static_cast<_Parser*> (this)->consumeRule(rule); } enum { DefaultStackSize = 128 }; struct Data: public QSharedData { Data(): stackSize (DefaultStackSize), tos (0) {} QVarLengthArray stateStack; QVarLengthArray parseStack; int stackSize; int tos; void reallocateStack() { stackSize <<= 1; stateStack.resize(stackSize); parseStack.resize(stackSize); } }; inline QString escapeString(QString s) { return s.replace(QLatin1Char('\n'), QLatin1String("\\n")).replace(QLatin1Char('\t'), QLatin1String("\\t")); } QSharedDataPointer d; QList m_regexes; #ifndef QT_BOOTSTRAPPED QMap > regexCandidates; #endif QList m_tokens; QString m_buffer, m_lastMatchText; int m_loc, m_lastNewlinePosition; int m_lineno; int m_debug; QStringList m_tokenNames; QMap m_captured; int m_maxMatchLen; QString m_errorString; QVector > m_names; //storage for match names }; template inline QVariant &QRegexParser<_Parser, _Table>::sym(int n) { return d->parseStack [d->tos + n - 1]; } template QRegexParser<_Parser, _Table>::~QRegexParser() { } template bool QRegexParser<_Parser, _Table>::parse() { m_errorString.clear(); reset(); const int INITIAL_STATE = 0; d->tos = 0; d->reallocateStack(); int act = d->stateStack[++d->tos] = INITIAL_STATE; int token = -1; Q_FOREVER { if (token == -1 && - _Table::TERMINAL_COUNT != _Table::action_index[act]) token = nextToken(); act = _Table::t_action(act, token); if (d->stateStack[d->tos] == _Table::ACCEPT_STATE) return true; else if (act > 0) { if (++d->tos == d->stackSize) d->reallocateStack(); d->parseStack[d->tos] = d->parseStack[d->tos - 1]; d->stateStack[d->tos] = act; token = -1; } else if (act < 0) { int r = - act - 1; d->tos -= _Table::rhs[r]; act = d->stateStack[d->tos++]; if (!consumeRule(r)) return false; act = d->stateStack[d->tos] = _Table::nt_action(act, _Table::lhs[r] - _Table::TERMINAL_COUNT); } else break; } setErrorString(QStringLiteral("Unknown token encountered")); return false; } template QRegexParser<_Parser, _Table>::QRegexParser(int maxMatchLen) : d(new Data()), m_loc(0), m_lastNewlinePosition(0), m_lineno(1), m_debug(0), m_maxMatchLen(maxMatchLen) { REGEX re(QStringLiteral("\\[([_a-zA-Z][_0-9a-zA-Z]*)(,\\s*M)?\\](.+)$")); #ifdef QT_BOOTSTRAPPED REGEX nameMatch(QStringLiteral("\\((\\?<(.*)>).+\\)")); nameMatch.setMinimal(true); #else re.optimize(); #endif QMap token_lookup; QMap names; for (int i = 1; i < _Table::lhs[0]; i++) { const QString text = QLatin1String(_Table::spell[i]); names.clear(); #ifdef QT_BOOTSTRAPPED if (re.indexIn(text) == 0) { const QString token = re.cap(1); const bool multiline = re.cap(2).length() > 0; QString pattern = re.cap(3); //We need to identify/remove any match names in the pattern, since //QRegExp doesn't support that feature int pos = 0, counter = 1, loc = nameMatch.indexIn(pattern, pos); while (loc >= 0) { const QString res = nameMatch.cap(2); if (!res.isEmpty()) { names.insert(counter, res); pattern.remove(nameMatch.cap(1)); } pos += loc + nameMatch.matchedLength() - nameMatch.cap(1).length(); loc = nameMatch.indexIn(pattern, pos); ++counter; } //We need to use indexIn, but that will search past the location we //pass in. So prepend '^' and use QRegExp::CaretAtOffset. if (pattern.at(0) != QChar(QLatin1Char('^'))) pattern.prepend(QChar(QLatin1Char('^'))); #else QRegularExpressionMatch match = re.match(text, 0, QRegularExpression::NormalMatch, QRegularExpression::DontCheckSubjectStringMatchOption); if (match.hasMatch()) { const QString token = match.captured(1); const bool multiline = match.captured(2).length() > 0; const QString pattern = match.captured(3); #endif m_tokenNames.append(token); int index = i; if (token_lookup.contains(token)) index = token_lookup[token]; else token_lookup[token] = i; #ifdef QT_BOOTSTRAPPED if (multiline) qWarning() << "The multiline grammar option is ignore in force_bootstrap mode."; #endif REGEX pat(pattern); #ifndef QT_BOOTSTRAPPED if (multiline) pat.setPatternOptions(QRegularExpression::DotMatchesEverythingOption); #endif if (!pat.isValid()) qCritical() << "Pattern error for token #" << i << "for" << text << "pattern =" << pat << ":" << pat.errorString(); else { #ifndef QT_BOOTSTRAPPED pat.optimize(); int counter = 0; const auto namedCaptureGroups = pat.namedCaptureGroups(); for (const QString &name : namedCaptureGroups) { if (!name.isEmpty()) names.insert(counter, name); ++counter; } #endif m_names.append(names); m_regexes.append(pat); if (token.startsWith(QLatin1String("ignore"))) m_tokens.append(-1); else m_tokens.append(index); } } else { qCritical() << "Error parsing regex at token #" << i << "for" << text << "Invalid syntax"; } } } template void QRegexParser<_Parser, _Table>::setBuffer(const QString &buffer) { m_buffer = buffer; } template void QRegexParser<_Parser, _Table>::setBufferFromDevice(QIODevice *device) { QTextStream in(device); m_buffer = in.readAll(); } template void QRegexParser<_Parser, _Table>::setDebug() { m_debug = true; for (int r = 0; r < _Table::RULE_COUNT; ++r) { int ridx = _Table::rule_index[r]; int _rhs = _Table::rhs[r]; qDebug("%3d) %s ::=", r + 1, _Table::spell[_Table::rule_info[ridx]]); ++ridx; for (int i = ridx; i < ridx + _rhs; ++i) { int symbol = _Table::rule_info[i]; if (symbol > 0 && symbol < _Table::lhs[0]) qDebug(" token_%s (pattern = %s)",qPrintable(m_tokenNames[symbol-1]),qPrintable(m_regexes[symbol-1].pattern())); else if (const char *name = _Table::spell[symbol]) qDebug(" %s", name); else qDebug(" #%d", symbol); } qDebug(); } } template int QRegexParser<_Parser, _Table>::nextToken() { static const REGEX newline(QLatin1String("(\\n)")); int token = -1; while (token < 0) { if (m_loc == m_buffer.size()) return _Table::EOF_SYMBOL; //Check m_lastMatchText for newlines and update m_lineno //This isn't necessary, but being able to provide the line # and character # //where the match is failing sure makes building/debugging grammars easier. #ifdef QT_BOOTSTRAPPED int loc = 0, pos = newline.indexIn(m_lastMatchText, loc); while (pos >= 0) { m_lineno++; loc += pos + 1; m_lastNewlinePosition += pos + 1; pos = newline.indexIn(m_lastMatchText, loc); } #else //QT_BOOTSTRAPPED QRegularExpressionMatchIterator matches = newline.globalMatch(m_lastMatchText); while (matches.hasNext()) { m_lineno++; QRegularExpressionMatch match = matches.next(); if (!matches.hasNext()) m_lastNewlinePosition += match.capturedEnd(); } #endif //!QT_BOOTSTRAPPED if (m_debug) { qDebug(); qDebug() << "nextToken loop, line =" << m_lineno << "line position =" << m_loc - m_lastNewlinePosition << "next 5 characters =" << escapeString(m_buffer.mid(m_loc, 5)); } int best = -1, maxLen = -1; #ifndef QT_BOOTSTRAPPED QRegularExpressionMatch bestRegex; #endif //Find the longest match. //If more than one are the same (longest) length, return the first one in //the order defined. QList candidates; #ifndef QT_BOOTSTRAPPED { //We used PCRE's PartialMatch to eliminate most of the regexes by the first //character, so we keep a regexCandidates map with the list of possible regexes //based on initial characters found so far. const QChar nextChar = m_buffer.at(m_loc); //Populate the list if we haven't seeen this character before if (!regexCandidates.contains(nextChar)) { # if (QT_VERSION >= QT_VERSION_CHECK(5, 5, 0)) const QStringRef tmp = m_buffer.midRef(m_loc,1); # else const QString tmp = m_buffer.mid(m_loc,1); # endif int i = 0; regexCandidates[nextChar] = QList(); for (const QRegularExpression &re : qAsConst(m_regexes)) { QRegularExpressionMatch match = re.match(tmp, 0, QRegularExpression::PartialPreferFirstMatch, QRegularExpression::DontCheckSubjectStringMatchOption); //qDebug() << nextChar << tmp << match.hasMatch() << match.hasPartialMatch() << re.pattern(); if (match.hasMatch() || match.hasPartialMatch()) regexCandidates[nextChar] << i; i++; } } const auto indices = regexCandidates.value(nextChar); for (int i : indices) { //Seems like I should be able to run the regex on the entire string, but performance is horrible //unless I use a substring. //QRegularExpressionMatch match = m_regexes[i].match(m_buffer, m_loc, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption); # if (QT_VERSION >= QT_VERSION_CHECK(5, 5, 0)) QRegularExpressionMatch match = m_regexes.at(i).match(m_buffer.midRef(m_loc, m_maxMatchLen), 0, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption | QRegularExpression::DontCheckSubjectStringMatchOption); # else QRegularExpressionMatch match = m_regexes.at(i).match(m_buffer.mid(m_loc, m_maxMatchLen), 0, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption | QRegularExpression::DontCheckSubjectStringMatchOption); # endif if (match.hasMatch()) { if (m_debug) candidates << MatchCandidate(m_tokenNames[i], match.captured(), i); if (match.capturedLength() > maxLen) { best = i; maxLen = match.capturedLength(); bestRegex = match; } } } } #else { int i = 0; for (const QRegExp &r : qAsConst(m_regexes)) { if (r.indexIn(m_buffer, m_loc, QRegExp::CaretAtOffset) == m_loc) { if (m_debug) candidates << MatchCandidate(m_tokenNames[i], r.cap(0), i); if (r.matchedLength() > maxLen) { best = i; maxLen = r.matchedLength(); } } ++i; } } #endif if (best < 0) { setErrorString(QLatin1String("Error generating tokens from file, next characters >%1<").arg(m_buffer.midRef(m_loc, 15))); return -1; } else { const QMap &map = m_names.at(best); if (!map.isEmpty()) m_captured.clear(); for (auto iter = map.cbegin(), end = map.cend(); iter != end; ++iter) { #ifdef QT_BOOTSTRAPPED m_captured.insert(iter.value(), m_regexes.at(best).cap(iter.key())); #else m_captured.insert(iter.value(), bestRegex.captured(iter.key())); #endif } if (m_debug) { qDebug() << "Match candidates:"; for (const MatchCandidate &m : qAsConst(candidates)) { QLatin1String result = m.index == best ? QLatin1String(" * ") : QLatin1String(" "); qDebug() << qPrintable(result) << qPrintable(m.name) << qPrintable(escapeString(m.matchText)); } } m_loc += maxLen; if (m_tokens.at(best) >= 0) token = m_tokens.at(best); #ifdef QT_BOOTSTRAPPED m_lastMatchText = m_regexes.at(best).cap(0); #else m_lastMatchText = bestRegex.captured(0); #endif } } return token; } QT_END_NAMESPACE #endif // QREGEXPARSER_H