///////////////////////////////////////////////////////////////////////////// // // Copyright (c) 2005-2017 Dawson Dean // // Permission is hereby granted, free of charge, to any person obtaining a // copy of this software and associated documentation files (the "Software"), // to deal in the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // ///////////////////////////////////////////////////////////////////////////// // See the corresponding .cpp file for a description of this module. ///////////////////////////////////////////////////////////////////////////// #ifndef _STRING_PARSE_H_ #define _STRING_PARSE_H_ class CParsingContext; //////////////////////////////////////////////////////////////////////////////// // This is the callback that is invoked by the parser whenever it finds a // token. class CParsingCallback { public: virtual ErrVal OnParseToken( void *pCallbackContext, int32 tokenId, int32 intValue, const char *pStr, int32 length) { pCallbackContext = pCallbackContext; // Unused. tokenId = tokenId; // Unused. intValue = intValue; // Unused. pStr = pStr; // Unused. length = length; // Unused. return(ENoErr); } }; // CParsingCallback //////////////////////////////////////////////////////////////////////////////// class CParseGrammar { public: /////////////////////////// enum RuleType { STOP_RULE, SILENT_RULE, TOKEN_RULE, INTEGER_RULE }; // RuleType /////////////////////////// enum CParseGrammarConstants { DEFAULT_PATH_LENGTH = 1024, DEFAULT_MAX_PENDING_CALLBACKS = 64, }; // CParseGrammarConstants /////////////////////////// class CParseRule { public: NEWEX_IMPL() RuleType m_RuleType; const char *m_pVariableName; const char *m_pPattern; int32 m_TokenId; int32 m_TokenValue; }; // CParseRule CParseGrammar(CParseGrammar::CParseRule *pRuleList); virtual ~CParseGrammar(); NEWEX_IMPL() bool ParseString( const char *pText, int32 textLength, CParsingCallback *pCallback, void *pContext); // These are used for more advanced applications, like when // we need to parse a string several times. bool ParseStringEx( const char *pText, int32 textLength, CParsingCallback *pCallback, void *pContext, char **ppEndPtr); private: class CParseState; class CParserVariable; class CStateTransition; /////////////////////////// enum CStateTransitionFlags { MAX_RULES_PER_STATE = 20, MAX_OPTIONALS_PER_RULE = 32, }; /////////////////////////// enum TransitionType { CHAR_PATTERN, VARIABLE_PATTERN, }; /////////////////////////// enum StackFrameFunction { STATE_TRANSITION, EXPANDING_VARIABLE, EXPANDING_OPTIONAL_PATTERN, }; // StackFrameFunction /////////////////////////// class CPathStep { public: StackFrameFunction m_Function; const char *m_pDebugString; const char *m_pStartText; CParseState *m_pStateAfterVariable; int32 m_NumPendingCallbacks; CParseState *m_pCurrentState; CStateTransition *m_pCurrentTransition; }; // CPathStep /////////////////////////// class CPendingCallback { public: CParseRule *m_pRule; const char *m_pStartText; const char *m_pStopText; }; // CPendingCallback /////////////////////////// class CParsingContext { public: CParsingContext(); NEWEX_IMPL() CParseState *m_pStateAtEndOfPreviousBuffer; CParsingCallback *m_pCallback; void *m_pCallbackContext; CPathStep m_DefaultPath[DEFAULT_PATH_LENGTH]; CPathStep *m_pPath; int32 m_MaxPathLength; int32 m_PathLength; CPendingCallback m_DefaultPendingCallbacks[DEFAULT_MAX_PENDING_CALLBACKS]; CPendingCallback *m_pPendingCallbacks; int32 m_MaxNumPendingCallbacks; int32 m_NumPendingCallbacks; }; // CParsingContext /////////////////////////// // This is a single state transition. class CStateTransition { public: NEWEX_IMPL() TransitionType m_TransitionType; bool m_fKleene; char *m_pPatternCharList; int32 m_NumPatternChars; int32 m_PatternCharType; CParserVariable *m_pPatternVariable; CParseState *m_pNextState; const char *m_pDebugString; CStateTransition *m_pNextTransition; CStateTransition *m_pNextOptionalTransition; }; // CStateTransition /////////////////////////// // This is one possible state of the tokenizer. class CParseState { public: NEWEX_IMPL() CParseRule *m_pFinishRule; CStateTransition *m_TransitionList; CStateTransition *m_pOptionalList; const char *m_pDebugString; CParseState *m_pNextStateInGrammar; }; // CParseState /////////////////////////// // This is a variable that is bound to a substring in the text we parse. class CParserVariable { public: NEWEX_IMPL() const char *m_pName; int32 m_NameLength; CParseState *m_pStartState; CParserVariable *m_pNextVariable; }; // CParserVariable /////////////////////////////////////////////// // Grammar Methods /////////////////////////////////////////////// ErrVal InitializeRules(); ErrVal AddRule(CParseRule *pRule); ErrVal ParseRuleExpansion( const char *pString, CParseState *pState, CParseState **ppFinishState); CParserVariable *AddVariable( const char *pStartName, const char **pStopName); ErrVal ParseAlternativeList( const char *pStartAlternative, int32 *pMatchFlags, char *pCharList, int32 *pNumChars, bool *pfKleene, const char **ppResultStopAlternative); CStateTransition *ParseOptionalPattern( const char *pStartAlternative, const char **ppResultStopAlternative); CStateTransition *ParseWhitespacePattern( const char *pStartAlternative, const char **ppResultStopAlternative); char GetPatternChar(const char **pPattern); /////////////////////////////////////////////// // Parsing Methods /////////////////////////////////////////////// bool ParseSubString( void *pContext, CStateTransition *pFirstTransition, const char *pText, const char *pEndText, const char **pEndStr); bool MatchTransition( CParsingContext *pContext, CStateTransition *pTransition, const char *pText, const char *pEndText, const char **ppNewText); ErrVal FirePendingCallbacks( CParsingContext *pContext); ErrVal GoToNextStep( CParsingContext *pContext, CParseState *pNewState, const char *pText, const char *pEndText, const char **ppResumeText); ErrVal FinishPathAfterMatch( CParsingContext *pContext, int32 topStackFrame, const char *pText, const char *pEndText, const char **ppResumeText); void BacktrackFromMismatch( CParsingContext *pContext, int32 topStackFrame); void InitializeContext( CParsingContext *pContext, CParsingCallback *pCallback, void *pCallbackContext); void ShutdownContext(CParsingContext *pContext); int32 m_Options; bool m_InitializedRules; CParseRule *m_pRuleList; CParserVariable *m_pVariableList; CParseState *m_pStateList; CParserVariable *m_StartVariable; }; // CParseGrammar // These macros allow you to define grammars. // A sample grammar definition is: // DEFINE_GRAMMAR(g_UrlGrammar) // DEFINE_RULE("", "://[:]/"), // // DEFINE_TOKEN("schemeName", "http", SCHEME_TOKEN, URL_SCHEME_HTTP), // DEFINE_TOKEN("schemeName", "https", SCHEME_TOKEN, URL_SCHEME_HTTPS), // // DEFINE_TOKEN("hostname", HOSTNAME_TOKEN, 0), // DEFINE_TOKEN("path", PATH_TOKEN, 0), // DEFINE_TOKEN("port", PORTNUM_TOKEN, 0), // STOP_GRAMMAR; // // The following characters are reserved, and need to be escaped when they // appear in a grammar definition: <, >, [, ] // #define DEFINE_GRAMMAR(varName) static CParseGrammar::CParseRule varName##RuleList[] = { #define STOP_GRAMMAR(varName) { CParseGrammar::STOP_RULE, NULL, NULL, -1, -1 } }; CParseGrammar varName(varName##RuleList); #define DEFINE_RULE(pVarName, pValue) { CParseGrammar::SILENT_RULE, pVarName, pValue, -1, -1 }, #define DEFINE_TOKEN(pVarName, pValue, tokenId, tokenValue) { CParseGrammar::TOKEN_RULE, pVarName, pValue, tokenId, tokenValue }, #define DEFINE_INTEGER_TOKEN(pVarName, pValue, tokenId) { CParseGrammar::INTEGER_RULE, pVarName, pValue, tokenId, -1 }, #endif // _STRING_PARSE_H_