///////////////////////////////////////////////////////////////////////////// // // Copyright (c) 2005-2017 Dawson Dean // // Permission is hereby granted, free of charge, to any person obtaining a // copy of this software and associated documentation files (the "Software"), // to deal in the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // ///////////////////////////////////////////////////////////////////////////// // // String Parsing Module // // This module defines CParseGrammar - A parser object that parses strings // using a grammar definition. The grammar syntax is based on the IETF // Augmented BNF used in RFCs and W3C specs. // // [RFC2234] Crocker, D.H. and P. Overell, // "Augmented BNF for Syntax Specifications: ABNF", // RFC 2234, November 1997. // // This pattern matching mechanism resembles a graph search more than anything. // I replaced the stack of a previous grammar with a "path" that is all the // steps we have taken so far. The goal is to find a path through the space // of states from the initial state to a terminating state. Each step will // go to a new state. Each state represents one state in the grammar. For // example, to match "CAT", there are 3 states, one for "C", one for "A", // and one for "T". This probably has the exact same effect as a typical // parser, but it was easier for me to reason about. // ///////////////////////////////////////////////////////////////////////////// #include "osIndependantLayer.h" #include "config.h" #include "log.h" #include "debugging.h" #include "memAlloc.h" #include "refCount.h" #include "threads.h" #include "stringLib.h" #include "stringParse.h" FILE_DEBUGGING_GLOBALS(LOG_LEVEL_DEFAULT, 0); class CCharTypeName { public: NEWEX_IMPL() const char *m_pPattern; int32 m_PatternLength; int32 m_Flags; }; static CCharTypeName g_AlternativePatterns[] = { { "whitespace", -1, CStringLib::WHITE_SPACE_CHAR }, { "newline", -1, CStringLib::NEWLINE_CHAR }, { "hex", -1, CStringLib::HEX_CHAR }, { "ALPHANUM", -1, CStringLib::ALPHANUM_CHAR }, { "DIGIT", -1, CStringLib::NUMBER_CHAR }, { "WORD_CHAR", -1, CStringLib::WORD_CHAR }, { "URL_PATH_CHAR", -1, CStringLib::URL_PATH_CHAR }, { "URL_HOST_CHAR", -1, CStringLib::URL_HOST_CHAR }, { "URL_FRAGMENT_CHAR", -1, CStringLib::URL_FRAGMENT_CHAR }, { "URL_QUERY_CHAR", -1, CStringLib::URL_QUERY_CHAR }, { NULL, -1, 0 } }; static const char g_KleeneStarChar = '*'; static const char g_OpenAlternativeChar = '('; static const char g_CloseAlternativeChar = ')'; static const char g_AlternativeSeparatorChar = '/'; static const char g_OpenOptionalChar = '['; static const char g_CloseOptionalChar = ']'; static const char g_OpenVariableNameChar = '<'; static const char g_CloseVariableNameChar = '>'; /////////////////////////////////////////// class CReentrantParsingCallback : public CParsingCallback { public: virtual ErrVal OnParseToken( void *pCallbackContext, int32 tokenId, int32 intValue, const char *pStr, int32 length); int32 m_TokenId; int32 m_IntValue; }; // CReentrantParsingCallback //////////////////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////////////////// CParseGrammar::CParsingContext::CParsingContext() { m_pCallback = NULL; m_pPath = m_DefaultPath; m_MaxPathLength = CParseGrammar::DEFAULT_PATH_LENGTH; m_PathLength = 0; m_pPendingCallbacks = m_DefaultPendingCallbacks; m_MaxNumPendingCallbacks = CParseGrammar::DEFAULT_MAX_PENDING_CALLBACKS; m_NumPendingCallbacks = 0; m_pStateAtEndOfPreviousBuffer = NULL; } // CParsingContext //////////////////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////////////////// CParseGrammar::CParseGrammar(CParseGrammar::CParseRule *pRuleList) { m_InitializedRules = false; m_pRuleList = pRuleList; m_pVariableList = NULL; m_Options = CStringLib::IGNORE_CASE; m_StartVariable = NULL; } // CParseGrammar //////////////////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////////////////// CParseGrammar::~CParseGrammar() { } // ~CParseGrammar //////////////////////////////////////////////////////////////////////////////// // // [InitializeContext] // //////////////////////////////////////////////////////////////////////////////// void CParseGrammar::InitializeContext( CParsingContext *pContext, CParsingCallback *pCallback, void *pCallbackContext) { if (NULL == pContext) { return; } pContext->m_pCallback = pCallback; pContext->m_pCallbackContext = pCallbackContext; pContext->m_pPath = pContext->m_DefaultPath; pContext->m_MaxPathLength = DEFAULT_PATH_LENGTH; pContext->m_PathLength = 0; pContext->m_pPendingCallbacks = pContext->m_DefaultPendingCallbacks; pContext->m_MaxNumPendingCallbacks = DEFAULT_MAX_PENDING_CALLBACKS; pContext->m_NumPendingCallbacks = 0; pContext->m_pStateAtEndOfPreviousBuffer = NULL; } // InitializeContext //////////////////////////////////////////////////////////////////////////////// // // [ShutdownContext] // //////////////////////////////////////////////////////////////////////////////// void CParseGrammar::ShutdownContext(CParsingContext *pContext) { if (NULL == pContext) { return; } if ((pContext->m_pPendingCallbacks) && (pContext->m_pPendingCallbacks != pContext->m_DefaultPendingCallbacks)) { memFree(pContext->m_pPendingCallbacks); } } // ShutdownContext //////////////////////////////////////////////////////////////////////////////// // // [ParseString] // //////////////////////////////////////////////////////////////////////////////// bool CParseGrammar::ParseString( const char *pText, int32 textLength, CParsingCallback *pCallback, void *pContext) { return(ParseStringEx(pText, textLength, pCallback, pContext, NULL)); } // ParseString //////////////////////////////////////////////////////////////////////////////// // // [ParseStringEx] // //////////////////////////////////////////////////////////////////////////////// bool CParseGrammar::ParseStringEx( const char *pText, int32 textLength, CParsingCallback *pCallback, void *pContext, char **ppEndPtr) { ErrVal err = ENoErr; bool fMatch = false; CParsingContext context; const char *pEndText; InitializeContext(&context, pCallback, pContext); // If this is a different set of rules, or if we have not // been called with rules yet, then convert the list of rules // to a finite-state-machine. if (!m_InitializedRules) { err = InitializeRules(); if (err) { gotoErr(err); } } // if (!m_InitializedRules) // Start at the beginning state. pEndText = (const char *) (((char *) pText) + textLength); context.m_pStateAtEndOfPreviousBuffer = m_StartVariable->m_pStartState; context.m_PathLength = 0; context.m_NumPendingCallbacks = 0; err = GoToNextStep( &context, m_StartVariable->m_pStartState, pText, pEndText, (const char **) &pText); if (err) { gotoErr(err); } // Be careful. Sometimes the first state will have some optional rules that // match the whole string. In that case, we are done. if (pText < pEndText) { fMatch = ParseSubString( &context, NULL, pText, pEndText, (const char **) ppEndPtr); } else { fMatch = true; } if (fMatch) { FirePendingCallbacks(&context); } abort: ShutdownContext(&context); return(fMatch); } // ParseStringEx //////////////////////////////////////////////////////////////////////////////// // // [InitializeRules] // //////////////////////////////////////////////////////////////////////////////// ErrVal CParseGrammar::InitializeRules() { ErrVal err = ENoErr; CParseGrammar::CParseRule *pCurrentRule; // If this is a different set of rules, or if we have not // been called with rules yet, then convert the list of rules // to a finite-state-machine. if (!m_InitializedRules) { m_pStateList = NULL; pCurrentRule = m_pRuleList; while ((NULL != pCurrentRule) && (NULL != pCurrentRule->m_pPattern)) { err = AddRule(pCurrentRule); if (err) { gotoErr(err); } pCurrentRule++; } if (NULL == m_StartVariable) { gotoErr(EFail); } m_InitializedRules = true; } // if (!m_InitializedRules) abort: returnErr(err); } // InitializeRules //////////////////////////////////////////////////////////////////////////////// // // [GoToNextStep] // //////////////////////////////////////////////////////////////////////////////// ErrVal CParseGrammar::GoToNextStep( CParsingContext *pContext, CParseState *pNewState, const char *pText, const char *pEndText, const char **ppResumeText) { ErrVal err = ENoErr; CPathStep *pStep; int32 index; int32 numOptionals; int32 savedCallPathLength; CStateTransition * optionalList[MAX_OPTIONALS_PER_RULE]; bool fOneOptionalMatched = false; bool fSubMatch = false; CStateTransition *pOptionalTransition; if (NULL != ppResumeText) { *ppResumeText = pText; } if (pContext->m_PathLength >= pContext->m_MaxPathLength) { gotoErr(EFail); } pStep = &(pContext->m_pPath[pContext->m_PathLength]); pContext->m_PathLength += 1; pStep->m_Function = STATE_TRANSITION; if (NULL != pNewState) { pStep->m_pDebugString = pNewState->m_pDebugString; } else { pStep->m_pDebugString = NULL; } pStep->m_pStartText = pText; pStep->m_pStateAfterVariable = NULL; pStep->m_pCurrentState = pNewState; if (NULL != pNewState) { pStep->m_pCurrentTransition = pNewState->m_TransitionList; } else { pStep->m_pCurrentTransition = NULL; } pStep->m_NumPendingCallbacks = pContext->m_NumPendingCallbacks; if ((NULL == pNewState) || (NULL == pNewState->m_pOptionalList)) { gotoErr(ENoErr); } // Make a private list. We will remove non-Kleene optionals from this // list as they are matched. numOptionals = 0; pOptionalTransition = pNewState->m_pOptionalList; while ((NULL != pOptionalTransition) && (numOptionals < MAX_OPTIONALS_PER_RULE)) { optionalList[numOptionals] = pOptionalTransition; pOptionalTransition = pOptionalTransition->m_pNextOptionalTransition; numOptionals++; } // Keep looping while there are optionals and at least one matches. while ((numOptionals > 0) && (pText < pEndText)) { fSubMatch = false; // Try to match each optional. index = 0; while (index < numOptionals) { pOptionalTransition = optionalList[index]; index++; // First, do a test run to see if there is a match. savedCallPathLength = pContext->m_PathLength; err = GoToNextStep(pContext, NULL, pText, pEndText, NULL); if (err) { gotoErr(err); } pStep = &(pContext->m_pPath[pContext->m_PathLength - 1]); pStep->m_Function = EXPANDING_OPTIONAL_PATTERN; pStep->m_pDebugString = pOptionalTransition->m_pDebugString; fOneOptionalMatched = ParseSubString( pContext, pOptionalTransition, pText, pEndText, &pText); pContext->m_PathLength = savedCallPathLength; if (fOneOptionalMatched) { fSubMatch = true; if (NULL != ppResumeText) { *ppResumeText = pText; } // If this is not a kleene, then it can only match once. if (!(pOptionalTransition->m_fKleene)) { // Go back to the optional we just matched. index--; if (index < (numOptionals - 1)) { optionalList[index] = optionalList[numOptionals - 1]; } numOptionals--; index = 0; } // if (!(pOptionalTransition->m_fKleene)) } } // while (index < numOptionals) // If no optionals matched in this entire iteration, then stop trying. if (!fSubMatch) { break; } } // while (numOptionals > 0) abort: returnErr(err); } // GoToNextStep //////////////////////////////////////////////////////////////////////////////// // // [FinishPathAfterMatch] // // This does a series of removals. Basically, we want to remove completed steps // until we find a state that has a transition that we have not yet taken. //////////////////////////////////////////////////////////////////////////////// ErrVal CParseGrammar::FinishPathAfterMatch( CParsingContext *pContext, int32 firstStep, const char *pText, const char *pEndText, const char **ppResumeText) { ErrVal err = ENoErr; CPathStep *pStep; CPathStep *pStartStep; int32 startStepIndex; CPendingCallback *pPendingCallback; const char *pStartText; if (NULL != ppResumeText) { *ppResumeText = pText; } while (pContext->m_PathLength >= firstStep) //<> This used to be > { pStep = &(pContext->m_pPath[pContext->m_PathLength - 1]); // If this was a match, and we just completed this state, // then fire the rule for the state we just completed. if ((NULL != pStep->m_pCurrentState) && (NULL != pStep->m_pCurrentState->m_pFinishRule) && (SILENT_RULE != pStep->m_pCurrentState->m_pFinishRule->m_RuleType) && (NULL != pContext->m_pCallback)) { // Grow the list of callbacks if we need to. if (pContext->m_NumPendingCallbacks >= pContext->m_MaxNumPendingCallbacks) { int32 newNumPendingCallbacks; CPendingCallback *pNewCallbackList = NULL; newNumPendingCallbacks = pContext->m_MaxNumPendingCallbacks * 2; // We cannot do a realloc, since the initial list may not be // allocated but instead from the built-in array. pNewCallbackList = (CPendingCallback *) memAlloc(newNumPendingCallbacks * sizeof(CPendingCallback)); if (NULL == pNewCallbackList) { gotoErr(EFail); } memcpy( pNewCallbackList, pContext->m_pPendingCallbacks, pContext->m_NumPendingCallbacks * sizeof(CPendingCallback)); if (pContext->m_pPendingCallbacks != pContext->m_DefaultPendingCallbacks) { memFree(pContext->m_pPendingCallbacks); } pContext->m_pPendingCallbacks = pNewCallbackList; pContext->m_MaxNumPendingCallbacks = newNumPendingCallbacks; } pStartText = pStep->m_pStartText; for (startStepIndex = pContext->m_PathLength - 1; startStepIndex >= 0; startStepIndex--) { pStartStep = &(pContext->m_pPath[startStepIndex]); if (EXPANDING_VARIABLE == pStartStep->m_Function) { pStartText = pStartStep->m_pStartText; break; } } pPendingCallback = &(pContext->m_pPendingCallbacks[pContext->m_NumPendingCallbacks]); pContext->m_NumPendingCallbacks += 1; pPendingCallback->m_pRule = pStep->m_pCurrentState->m_pFinishRule; pPendingCallback->m_pStartText = pStartText; pPendingCallback->m_pStopText = pText; } // fire a rule // Since we had a match, we are done exploring the possibilities of // this state. If this is a variable, then we will take this transition. // If it is not a variable, then we are done with this state. if (EXPANDING_VARIABLE == pStep->m_Function) { pStep->m_Function = STATE_TRANSITION; err = GoToNextStep( pContext, pStep->m_pStateAfterVariable, pText, pEndText, ppResumeText); if (err) { gotoErr(err); } break; } pContext->m_PathLength = pContext->m_PathLength - 1; } // while (pContext->m_PathLength > 0) abort: returnErr(err); } // FinishPathAfterMatch //////////////////////////////////////////////////////////////////////////////// // // [BacktrackFromMismatch] // // This is called after we get a mismatch. It does a series of backtracks, and // tries to find the next state that has a possible different transition. //////////////////////////////////////////////////////////////////////////////// void CParseGrammar::BacktrackFromMismatch( CParsingContext *pContext, int32 firstStep) { CPathStep *pStep; // If we didn't get a match, then find the next state we can try. // Note, we mismatched, so we can close any open variable. We only // want to find the next state that has an alternative. while (pContext->m_PathLength >= firstStep) { pStep = &(pContext->m_pPath[pContext->m_PathLength - 1]); pContext->m_NumPendingCallbacks = pStep->m_NumPendingCallbacks; if (EXPANDING_OPTIONAL_PATTERN == pStep->m_Function) { break; } if ((NULL != pStep->m_pCurrentTransition) && (NULL != pStep->m_pCurrentTransition->m_pNextTransition)) { pStep->m_pCurrentTransition = pStep->m_pCurrentTransition->m_pNextTransition; break; } // No more possibilities at this level. Backtrack one step. pContext->m_PathLength = pContext->m_PathLength - 1; } // while (TRUE) } // BacktrackFromMismatch //////////////////////////////////////////////////////////////////////////////// // // [ParseSubString] // // This is the main component of the pattern matching code. // It basically does a depth-first-search through the space of states. //////////////////////////////////////////////////////////////////////////////// bool CParseGrammar::ParseSubString( void *pVoidContext, CStateTransition *pFirstTransition, const char *pText, const char *pEndText, const char **ppEndPtr) { ErrVal err = ENoErr; bool fMatch = false; bool fSubMatch = false; CPathStep *pStep; CStateTransition *pTransition; CParsingContext *pContext = (CParsingContext *) pVoidContext; int32 firstStep; bool fFollowingTransitionFromState = false; if ((NULL == pContext) || (NULL == pText) || (NULL == pEndText)) { gotoErr(EFail); } if (NULL != ppEndPtr) { *ppEndPtr = (char *) pText; } // There are cases, like when the initial rule is just optional patterns, // that there is nothing to do once we start parsing the real pattern. // If this is the case, then do nothing. if (pContext->m_PathLength <= 0) { gotoErr(ENoErr); } // Now, do the actual parsing. Keep looping until we hit a final state. firstStep = pContext->m_PathLength; while (1) { pStep = &(pContext->m_pPath[pContext->m_PathLength - 1]); if (NULL != pFirstTransition) { pTransition = pFirstTransition; pFirstTransition = NULL; fFollowingTransitionFromState = false; } else { // if (NULL == pFirstTransition) // Get the next possible state transition. // If there are none, then we have to stop. This is either a // partial match, complete match, or a mismatch. if (NULL == pStep->m_pCurrentTransition) { // If the current state has no transitions at all, then this // is either a partial match or a complete match. Backtrack the path // until we either hit the first path step, or else hit a variable. // If we hit a variable, then this is just a partial match. If // we hit the top of the path, then this is a complete match. if ((NULL == pStep->m_pCurrentState) || (NULL == pStep->m_pCurrentState->m_TransitionList)) { err = FinishPathAfterMatch( pContext, firstStep, pText, pEndText, &pText); if (err) { gotoErr(err); } } else { BacktrackFromMismatch(pContext, firstStep); if (pContext->m_PathLength >= firstStep) { pStep = &(pContext->m_pPath[pContext->m_PathLength - 1]); pText = pStep->m_pStartText; } } // Check if this was a complete match. pStep = &(pContext->m_pPath[pContext->m_PathLength - 1]); if ((pContext->m_PathLength < firstStep) //<> This used to be <= || (EXPANDING_OPTIONAL_PATTERN == pStep->m_Function)) { if ((fMatch) && (NULL != ppEndPtr)) { *ppEndPtr = (char *) pText; } goto abort; } continue; } // if (NULL == pStep->m_pCurrentState->m_TransitionList) // Otherwise, we have a possible transition from the current state. pTransition = pStep->m_pCurrentTransition; fFollowingTransitionFromState = true; } // if (NULL == pFirstTransition) // Check if this transition will match. Note, this may be a weak // attempt. The transition may appear to match, but it will later // lead to a mismatch in some future state. fSubMatch = MatchTransition( pContext, pTransition, pText, pEndText, &pText); // If it didn't match, then try the next possible transition from // the current state. if (fSubMatch) { fMatch = true; if ((VARIABLE_PATTERN == pTransition->m_TransitionType) && (NULL != pTransition->m_pPatternVariable) && (NULL != pTransition->m_pPatternVariable->m_pStartState)) { err = GoToNextStep( pContext, pTransition->m_pPatternVariable->m_pStartState, pText, pEndText, &pText); if (err) { gotoErr(err); } pStep = &(pContext->m_pPath[pContext->m_PathLength - 1]); pStep->m_Function = EXPANDING_VARIABLE; pStep->m_pStateAfterVariable = pTransition->m_pNextState; pStep->m_pDebugString = pTransition->m_pDebugString; } else { // Follow the transition. err = GoToNextStep( pContext, pTransition->m_pNextState, pText, pEndText, &pText); if (err) { gotoErr(err); } } } else { // if (!fSubMatch) fMatch = false; if (fFollowingTransitionFromState) { pStep->m_pCurrentTransition = pTransition->m_pNextTransition; } else { break; } } } // while (TRUE) abort: if ((fMatch) && (NULL != ppEndPtr)) { *ppEndPtr = (char *) pText; } return(fMatch); } // ParseSubString //////////////////////////////////////////////////////////////////////////////// // // [MatchTransition] // //////////////////////////////////////////////////////////////////////////////// bool CParseGrammar::MatchTransition( CParsingContext *pContext, CStateTransition *pTransition, const char *pText, const char *pEndText, const char **ppNewText) { bool fTotalMatch = false; bool fSingleMatch = false; int32 bytesInChar; int32 compareResult; int32 charNum; //////////////////////////////////////////////////////////////// if ((CHAR_PATTERN == pTransition->m_TransitionType) && (pText < pEndText)) { // If a transition is a Kleene, then it can match as many times // as possible. In this case, keep iterating until it no longer matches. // Otherwise, it may match at most once. // Each iteration matches any one alternative once. while (pText < pEndText) { bytesInChar = 1; fSingleMatch = false; // It may match one of a list of 1 or more specific characters. for (charNum = 0; charNum < pTransition->m_NumPatternChars; charNum++) { compareResult = CStringLib::UnicodeStrcmp( pText, 1, &(pTransition->m_pPatternCharList[charNum]), 1, CStringLib::IGNORE_CASE); if (0 == compareResult) { fSingleMatch = true; break; } // if (0 == compareResult) } // for (charNum = 0; charNum < pTransition->m_NumPatternChars; charNum++) // Alternatively, it may have a general character type. // For example, you can use this to specify "all whitespace characters", // or "all numbers". if ((!fSingleMatch) && (0 != pTransition->m_PatternCharType) && (CStringLib::IsByte(*pText, pTransition->m_PatternCharType))) { fSingleMatch = true; } // Stop looking when there is a mismatch. // Note, that if this is a Kleene, then fTotalMatch may // still be true. if (!fSingleMatch) { break; } // Otherwise, we got a match. We may or may not try to get more matches. fTotalMatch = true; pText = pText + bytesInChar; if (!(pTransition->m_fKleene)) { break; } } // while (pText < pEndText) } // (CHAR_PATTERN == pTransition->m_TransitionType) //////////////////////////////////////////////////////////////// else if ((VARIABLE_PATTERN == pTransition->m_TransitionType) && (NULL != pTransition->m_pPatternVariable) && (NULL != pTransition->m_pPatternVariable->m_pStartState) && (pContext->m_PathLength < pContext->m_MaxPathLength)) { fTotalMatch = true; } if ((fTotalMatch) && (NULL != ppNewText)) { *ppNewText = pText; } return(fTotalMatch); } // MatchTransition //////////////////////////////////////////////////////////////////////////////// // // [FirePendingCallbacks] // //////////////////////////////////////////////////////////////////////////////// ErrVal CParseGrammar::FirePendingCallbacks(CParsingContext *pContext) { ErrVal err = ENoErr; CParseRule *pRule; const char *pStartText; const char *pStopText; int32 numberValue; int textLength; CPendingCallback *pPendingCallback; int callbackNum; if (NULL == pContext->m_pCallback) { goto abort; } for (callbackNum = 0; callbackNum < pContext->m_NumPendingCallbacks; callbackNum++) { pPendingCallback = &(pContext->m_pPendingCallbacks[callbackNum]); pRule = pPendingCallback->m_pRule; pStartText = pPendingCallback->m_pStartText; pStopText = pPendingCallback->m_pStopText; textLength = (int32) (((char *) pStopText) - (char *) (pStartText)); switch (pRule->m_RuleType) { case SILENT_RULE: break; case TOKEN_RULE: pContext->m_pCallback->OnParseToken( pContext->m_pCallbackContext, pRule->m_TokenId, pRule->m_TokenValue, pStartText, textLength); break; case INTEGER_RULE: err = CStringLib::StringToNumber((const char *) pStartText, textLength, &numberValue); if (err) { gotoErr(err); } pContext->m_pCallback->OnParseToken( pContext->m_pCallbackContext, pRule->m_TokenId, numberValue, (const char *) pStartText, textLength); break; default: break; } } pContext->m_NumPendingCallbacks = 0; abort: returnErr(err); } // FirePendingCallbacks //////////////////////////////////////////////////////////////////////////////// // // [AddRule] // //////////////////////////////////////////////////////////////////////////////// ErrVal CParseGrammar::AddRule(CParseGrammar::CParseRule *pRule) { ErrVal err = ENoErr; CParseState *pState = NULL; CParserVariable *pVariable; if (NULL == pRule) { gotoErr(EFail); } // Look for the state that this rule starts from. pVariable = AddVariable(pRule->m_pVariableName, NULL); if (NULL == pVariable) { gotoErr(EFail); } // If there is no state already defined, then create one. pState = pVariable->m_pStartState; if (NULL == pState) { pState = newex CParseState; if (NULL == pState) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pState); pState->m_TransitionList = NULL; pState->m_pOptionalList = NULL; pState->m_pFinishRule = NULL; pState->m_pDebugString = NULL; pState->m_pNextStateInGrammar = m_pStateList; m_pStateList = pState; pVariable->m_pStartState = pState; } // if (NULL == pState) // Now, expand the string into states. err = ParseRuleExpansion(pRule->m_pPattern, pState, &pState); if (err) { gotoErr(err); } if (NULL != pState) { ASSERT(NULL == pState->m_pFinishRule); pState->m_pFinishRule = pRule; } abort: returnErr(err); } // AddRule //////////////////////////////////////////////////////////////////////////////// // // [ParseRuleExpansion] // //////////////////////////////////////////////////////////////////////////////// ErrVal CParseGrammar::ParseRuleExpansion( const char *pString, CParseState *pState, CParseState **ppFinishState) { ErrVal err = ENoErr; CStateTransition *pTransition = NULL; CStateTransition *pOptionalPattern = NULL; CParseState *pNewState = NULL; CParserVariable *pVariable = NULL; TransitionType transitionType; const char *pSaveString = NULL; int32 compareResult; int32 matchCharFlags; char matchCharList[256]; bool fKleene; int32 numChars; int32 charNum; // Now, expand the string into states. while ((pState) && (*pString)) { // Reset the description of a transition before defining the // next transition. Every new transition may not override everything. matchCharFlags = 0; numChars = 0; fKleene = false; pSaveString = pString; if (NULL == pState->m_pDebugString) { pState->m_pDebugString = pString; } ///////////////////////////////////////// if (g_OpenVariableNameChar == *pString) { pVariable = AddVariable(pString, &pString); if (NULL == pVariable) { gotoErr(EFail); } transitionType = VARIABLE_PATTERN; } // if (g_OpenVariableNameChar == *pString) ///////////////////////////////////////// else if (' ' == *pString) { pOptionalPattern = ParseWhitespacePattern(pString, &pString); if (NULL == pOptionalPattern) { gotoErr(EFail); } // These hang off the current state, they are not transitions to a // new and different state. pOptionalPattern->m_pNextOptionalTransition = pState->m_pOptionalList; pState->m_pOptionalList = pOptionalPattern; continue; } ///////////////////////////////////////// else if ((g_OpenAlternativeChar == *pString) || ((g_KleeneStarChar == pString[0]) && (g_OpenAlternativeChar == pString[1]))) { err = ParseAlternativeList( pString, &matchCharFlags, matchCharList, &numChars, &fKleene, &pString); if (err) { gotoErr(err); } transitionType = CHAR_PATTERN; } ///////////////////////////////////////// else if ((g_OpenOptionalChar == *pString) || ((g_KleeneStarChar == pString[0]) && (g_OpenOptionalChar == pString[1]))) { pOptionalPattern = ParseOptionalPattern(pString, &pString); if (NULL == pOptionalPattern) { gotoErr(EFail); } // These hang off the current state, they are not transitions to a // new and different state. pOptionalPattern->m_pNextOptionalTransition = pState->m_pOptionalList; pState->m_pOptionalList = pOptionalPattern; continue; } ///////////////////////////////////////// else { matchCharList[0] = GetPatternChar(&pString); numChars = 1; transitionType = CHAR_PATTERN; } // Check if a transition for this character already exists. If // so, then we just share the next state with this new string. pTransition = pState->m_TransitionList; while (pTransition) { if (transitionType == pTransition->m_TransitionType) { if ((CHAR_PATTERN == pTransition->m_TransitionType) && (pTransition->m_NumPatternChars > 0) && (1 == numChars) && (0 == matchCharFlags)) { for (charNum = 0; charNum < pTransition->m_NumPatternChars; charNum++) { compareResult = CStringLib::UnicodeStrcmp( &(matchCharList[0]), 1, &(pTransition->m_pPatternCharList[charNum]), 1, m_Options); if (compareResult == 0) { break; } } // for (charNum = 0; charNum < pTransition->m_NumPatternChars; charNum++) } // if (CHAR_PATTERN == pTransition->m_TransitionType) else if ((VARIABLE_PATTERN == pTransition->m_TransitionType) && (pVariable == pTransition->m_pPatternVariable)) { break; } // if (VARIABLE_PATTERN == pTransition->m_TransitionType) } // if (transitionType == pTransition->m_TransitionType) pTransition = pTransition->m_pNextTransition; } // while (pTransition) // If there is no transition, then make one. We will also have to make a new state. if (NULL == pTransition) { pTransition = newex CStateTransition; if (NULL == pTransition) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pTransition); pTransition->m_fKleene = fKleene; pTransition->m_pNextOptionalTransition = NULL; pTransition->m_pDebugString = pSaveString; pTransition->m_TransitionType = transitionType; pTransition->m_PatternCharType = matchCharFlags; pTransition->m_NumPatternChars = numChars; if (CHAR_PATTERN == transitionType) { pTransition->m_NumPatternChars = numChars; pTransition->m_pPatternCharList = new char[numChars]; if (NULL == pTransition->m_pPatternCharList) { gotoErr(EFail); } for (charNum = 0; charNum < numChars; charNum++) { pTransition->m_pPatternCharList[charNum] = matchCharList[charNum]; } pTransition->m_PatternCharType = matchCharFlags; pTransition->m_pPatternVariable = NULL; } else if (VARIABLE_PATTERN == pTransition->m_TransitionType) { pTransition->m_pPatternVariable = pVariable; } pNewState = newex CParseState; if (NULL == pNewState) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pNewState); pNewState->m_pFinishRule = NULL; pNewState->m_TransitionList = NULL; pNewState->m_pOptionalList = NULL; pNewState->m_pDebugString = NULL; pNewState->m_pNextStateInGrammar = m_pStateList; m_pStateList = pNewState; // This is a transition from the old state to the new state. pTransition->m_pNextState = pNewState; pTransition->m_pNextTransition = pState->m_TransitionList; pState->m_TransitionList = pTransition; } // if (NULL == pTransition) pState = pTransition->m_pNextState; } // while ((pState) && (*pString)) if (NULL != ppFinishState) { *ppFinishState = pState; } abort: returnErr(err); } // ParseRuleExpansion //////////////////////////////////////////////////////////////////////////////// // // [AddVariable] // //////////////////////////////////////////////////////////////////////////////// CParseGrammar::CParserVariable * CParseGrammar::AddVariable( const char *pStartName, const char **pResultStopName) { ErrVal err = ENoErr; CParserVariable *pVariable; bool fStartVariable = false; const char *pStopName; int nameLength = 0; // Look for the state that this rule starts from. if ((NULL == pStartName) || (0 == *pStartName)) { fStartVariable = true; pVariable = m_StartVariable; } else { fStartVariable = false; while (g_OpenVariableNameChar == *pStartName) { pStartName++; } pStopName = pStartName; while ((*pStopName) && (g_CloseVariableNameChar != *pStopName)) { pStopName++; } nameLength = pStopName - pStartName; if (NULL != pResultStopName) { *pResultStopName = pStopName; if (g_CloseVariableNameChar == *pStopName) { *pResultStopName += 1; } } pVariable = m_pVariableList; while (NULL != pVariable) { if ((NULL != pVariable->m_pName) && (nameLength == pVariable->m_NameLength) && (0 == strncasecmpex(pStartName, pVariable->m_pName, nameLength))) { break; } pVariable = pVariable->m_pNextVariable; } // while (NULL != pVariable) } // If we don't have a variable for this rule yet, then create one. if (NULL == pVariable) { pVariable = newex CParserVariable; if (NULL == pVariable) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pVariable); pVariable->m_pStartState = NULL; if (fStartVariable) { pVariable->m_pName = NULL; pVariable->m_NameLength = -1; pVariable->m_pNextVariable = NULL; m_StartVariable = pVariable; } else { pVariable->m_pName = strndupex(pStartName, nameLength); if (NULL == pVariable->m_pName) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pVariable->m_pName); pVariable->m_NameLength = nameLength; pVariable->m_pNextVariable = m_pVariableList; m_pVariableList = pVariable; } } abort: return(pVariable); } // AddVariable //////////////////////////////////////////////////////////////////////////////// // // [ParseAlternativeList] // //////////////////////////////////////////////////////////////////////////////// ErrVal CParseGrammar::ParseAlternativeList( const char *pStartAlternativeList, int32 *pMatchFlags, char *pCharList, int32 *pNumChars, bool *pfKleene, const char **ppResultStopAlternative) { ErrVal err = ENoErr; const char *pStartAlternate; const char *pStopAlternate; CCharTypeName *pPattern; int32 alternateLength; int32 numMatchChars = 0; *pMatchFlags = 0; *pfKleene = false; *pNumChars = 0; if (g_KleeneStarChar == *pStartAlternativeList) { *pfKleene = true; pStartAlternativeList++; } if (g_OpenAlternativeChar == *pStartAlternativeList) { pStartAlternativeList++; } // Now, read through every alternate character. // Each iteration reads one alternate. pStartAlternate = pStartAlternativeList; while ((*pStartAlternate) && (g_CloseAlternativeChar != *pStartAlternate)) { // Skip whitespace and separators. while ((g_AlternativeSeparatorChar == *pStartAlternate) || (' ' == *pStartAlternate) || ('\n' == *pStartAlternate) || ('\r' == *pStartAlternate) || ('\t' == *pStartAlternate)) { pStartAlternate += 1; } // Find the end of this alternate. pStopAlternate = pStartAlternate; while ((g_AlternativeSeparatorChar != *pStopAlternate) && (g_CloseAlternativeChar != *pStopAlternate) && ('\r' != *pStopAlternate) && ('\n' != *pStopAlternate) && ('\t' != *pStopAlternate) && (' ' != *pStopAlternate)) { if ('\\' == *pStopAlternate) { pStopAlternate++; } pStopAlternate += 1; } alternateLength = pStopAlternate - pStartAlternate; if (alternateLength <= 0) { break; } // Check if this matches any fixed pattern. pPattern = g_AlternativePatterns; while (NULL != pPattern->m_pPattern) { // Initialize it the first time we use it. if (-1 == pPattern->m_PatternLength) { pPattern->m_PatternLength = strlen(pPattern->m_pPattern); } if ((alternateLength == pPattern->m_PatternLength) && (0 == strncasecmpex(pStartAlternate, pPattern->m_pPattern, alternateLength))) { break; } pPattern++; } // while (NULL != pPattern->m_pPattern) // Initialize this alternate if ((NULL != pPattern) && (NULL != pPattern->m_pPattern)) { *pMatchFlags |= pPattern->m_Flags; } else { pCharList[numMatchChars] = GetPatternChar(&pStartAlternate); numMatchChars += 1; } // Resume parsing on the next alternate. pStartAlternate = pStopAlternate; } // while (pStartAlternate < pStopPtr) if (NULL != ppResultStopAlternative) { if (g_CloseAlternativeChar == *pStartAlternate) { pStartAlternate++; } *ppResultStopAlternative = pStartAlternate; } pCharList[numMatchChars] = 0; *pNumChars = numMatchChars; return(err); } // ParseAlternativeList //////////////////////////////////////////////////////////////////////////////// // // [ParseOptionalPattern] // //////////////////////////////////////////////////////////////////////////////// CParseGrammar::CStateTransition * CParseGrammar::ParseOptionalPattern( const char *pStartOptional, const char **ppResultStopOptional) { ErrVal err = ENoErr; CStateTransition *pTransition = NULL; const char *pStopOptional; bool fKleeneStar = false; CParseState *pFinishState = NULL; CParseState startState; char *pOptionPattern = NULL; int nesting; int optionalLength; if (g_KleeneStarChar == *pStartOptional) { fKleeneStar = true; pStartOptional++; while (' ' == *pStartOptional) { pStartOptional++; } } if (g_OpenOptionalChar == *pStartOptional) { pStartOptional++; } // Find the end of this optional. Be careful, since // several optionals may be nested. pStopOptional = pStartOptional; nesting = 1; while ((*pStopOptional) && (nesting > 0)) { if ('\\' == *pStopOptional) { pStopOptional++; } else if (g_CloseOptionalChar == *pStopOptional) { nesting -= 1; } else if (g_OpenOptionalChar == *pStopOptional) { nesting += 1; } pStopOptional += 1; } // while (g_CloseOptionalChar != *pStopOptional) // Make this substring a NULL-terminated string. Resume // parsing just after it finishes. optionalLength = pStopOptional - pStartOptional; if ((optionalLength > 1) && (g_CloseOptionalChar == pStartOptional[optionalLength - 1])) { optionalLength--; } pOptionPattern = strndupex(pStartOptional, optionalLength); if (NULL == pOptionPattern) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pOptionPattern); // Parse this optional pattern, and hang its transitions off a temporary // state. startState.m_pFinishRule = NULL; startState.m_TransitionList = NULL; startState.m_pOptionalList = NULL; err = ParseRuleExpansion(pOptionPattern, &startState, &pFinishState); if (err) { gotoErr(err); } // Now, put the transitions into the optional object. pTransition = startState.m_TransitionList; pTransition->m_fKleene = fKleeneStar; ASSERT(NULL == startState.m_pFinishRule); if (NULL != ppResultStopOptional) { *ppResultStopOptional = pStopOptional; } abort: // Leak pOptionPattern. The parsing code assumes we can point // to variable names in the grammar. if (err) { delete pTransition; pTransition = NULL; } return(pTransition); } // ParseOptionalPattern //////////////////////////////////////////////////////////////////////////////// // // [ParseWhitespacePattern] // //////////////////////////////////////////////////////////////////////////////// CParseGrammar::CStateTransition * CParseGrammar::ParseWhitespacePattern( const char *pStartOptional, const char **ppResultStopOptional) { ErrVal err = ENoErr; CStateTransition *pTransition = NULL; CParseState *pNewState = NULL; while (CStringLib::IsByte(*pStartOptional, CStringLib::WHITE_SPACE_CHAR)) { pStartOptional++; } if (NULL != ppResultStopOptional) { *ppResultStopOptional = pStartOptional; } pTransition = newex CStateTransition; if (NULL == pTransition) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pTransition); pTransition->m_fKleene = true; pTransition->m_pNextOptionalTransition = NULL; pTransition->m_pDebugString = "SPECIAL Kleene for a space"; pTransition->m_TransitionType = CHAR_PATTERN; pTransition->m_PatternCharType = CStringLib::WHITE_SPACE_CHAR; pTransition->m_NumPatternChars = 0; pTransition->m_pPatternVariable = NULL; pTransition->m_pNextTransition = NULL; pNewState = newex CParseState; if (NULL == pNewState) { gotoErr(EFail); } g_MainMem.DontCountMemoryAsLeaked((char *) pNewState); pNewState->m_pFinishRule = NULL; pNewState->m_TransitionList = NULL; pNewState->m_pOptionalList = NULL; pNewState->m_pDebugString = "Finish state for special Kleene for a space"; pNewState->m_pNextStateInGrammar = m_pStateList; m_pStateList = pNewState; // This is a transition from the old state to the new state. pTransition->m_pNextState = pNewState; abort: if (err) { delete pTransition; pTransition = NULL; } return(pTransition); } // ParseWhitespacePattern //////////////////////////////////////////////////////////////////////////////// // // [GetPatternChar] // //////////////////////////////////////////////////////////////////////////////// char CParseGrammar::GetPatternChar(const char **pPattern) { char c; const char *pStr; pStr = *pPattern; c = *pStr; pStr += 1; if ('\\' == c) { c = *pStr; pStr += 1; if ('n' == c) { c = '\n'; } else if ('r' == c) { c = '\r'; } else if ('t' == c) { c = '\t'; } } *pPattern = pStr; return(c); } // GetPatternChar //////////////////////////////////////////////////////////////////////////////// // // [OnParseToken] // //////////////////////////////////////////////////////////////////////////////// ErrVal CReentrantParsingCallback::OnParseToken( void *pCallbackContext, int32 tokenId, int32 intValue, const char *pStr, int32 length) { pCallbackContext = pCallbackContext; // Unused pStr = pStr; // Unused. length = length; // Unused. m_TokenId = tokenId; m_IntValue = intValue; return(ENoErr); }; // OnParseToken