#ifndef __Parse_h_ #define __Parse_h_ #include #include #include #include #include #include #include #include "../shared/ThreadSafeRefCount.h" #include "DataFormat.h" namespace Parse { // Several operations related to parsing can throw an exception. class Exception : public std::exception { private: const std::string _message; const std::string _toParse; const int _location; public: // Standard / required. virtual const char* what() const throw(); Exception(std::string const &message, std::string const &toParse, int location) : _message(message), _toParse(toParse), _location(location) { } // Standard / required. The default would not include "throw()" and // would upset the compiler. ~Exception() throw() {}; // Same as what(), but as a std::string. Internally we always use // a std::string. std::string const &getMessage() const { return _message; } // The original string that we tried to parse. std::string const &getToParse() const { return _toParse; } // The location of the error. This is a byte number, or index into the // string. 0 means the beginning of the string. This is typically // the beginning of the error. int getLocation() const { return _location; } // The part of the original document before the error. std::string getBeforeError() const; // The part of the original document after and including the error. // getBeforeError() + getAfterError() == getToParse(). std::string getAfterError() const; std::string simpleLocationMessage() const { return getBeforeError() + "⊰HERE⊱" + getAfterError(); } }; // These are the small pieces that make up a program. I.e. "1.3" or "price". // This includes information like the location of the token in the original // program, which can be useful when reporting errors. class Token { public: enum Type { ttPunctuation, /* "(", ",", "+", etc. */ ttFilter, /* This refers to a filter that's available in the config * window, i.e. "[Price]". */ ttAltField, /* A special syntax for grabbing the extra fields that * are available in the description. I.e. "[=p]" means * the price of certain things. */ ttSymbol, /* The name of a function or a well known field in the * database. */ ttInteger, /* I.e. "4" */ ttDecimal, /* I.e. "4.5" */ ttString, /* A literal string. Same rules as C++. Starts and ends * with a double quote. Most escape sequences work. */ ttAlert, /* We have a special syntax for discussing alerts, i.e. * "{NHP,NLP}" to say new highs and lows. */ ttEnd /* End of input. We return this if you try to read at or past * the last real token. This makes life easier than if we * returned a null or threw an exception. This is a legitimate * place to report an error. */ }; private: Type _type; int _position; std::string _original; std::string _value; std::string _completeSource; public: Token(Type type, int position, std::string const &original, std::string const &value, std::string const &completeSource) : _type(type), _position(position), _original(original), _value(value), _completeSource("") { } explicit Token(std::string const &value) : _type(ttEnd), _position(0), _original("“" + value + "”"), _value(value) { } Type getType() const { return _type; } // This is the first byte of the token. This is a valid index into the // original string we are parsing. int getPosition() const { return _position; } // This is the original form of the token, exactly as written by the user. // I.e. "[Price]". std::string const &getOriginal() const { return _original; } // This is the important part of the token. For the price filter, // getOriginal() would say "[Price]" but getValue would say "Price". std::string const &getValue() const { return _value; } // getPosition() is relative to this. std::string const &getCompleteSource() const { return _completeSource; } // The token is of type puncutation, and the value is as given. bool matchesP(std::string const &value) const { return (_type == ttPunctuation) && (_value == value); } // Mostly aimed at debugging. static std::string asString(Type type); // A short description of the token. This is helpful when listing a lot // of tokens together. std::string shortDebug() const; // Report an error, and say how that relates to the user's input. // This is one of the main uses of this class! If we didn't care about // this, we wouldn't need to save so much information in the token. Exception makeException(std::string const &message) const; // When a tree node needs a token, but there isn't one. This is never // the default. If you don't have a token, you have to explicitly ask // for this. static const Token EMPTY; }; enum class OptimizationContext { // No special assumptions. If you are implementing an optimize() routine, // and you don't understand the input, it's safe to treat it as // NORMAL. NORMAL, // This assumes that the caller will treat the result as a boolean. Every // value will be interpreted as true or false. (EMPTY and 0 both become // false. Interpreting a string as a Boolean is not well defined.) Note // that some functions, like && and ||, treat all their inputs as true or // false. But others, like !, treat their inputs as true, false, or EMPTY. // This function is aimed at the former case. FOR_BOOLEAN }; class TreeNode; // Note that trees are always read only. That is in part to help with // sharing, and in part to help when a tree is used as a key in a set or // map. In large part that is to prevent circular references that might // cause our referencing counting to fail! typedef TSRefCount< TreeNode > Tree; // The parser should spit out a somewhat abstract tree. This holds all of // the information about the structure of the user's program. Other parts // of the code will convert this into something more usable. This // framework should allow a lot of separation between the parsing algorithm // and the code that runs the program. This should also make it easy to // set up an intermediate layer that can do various types of optimization. // // Trees can be useful outside of the context of parsing text. I'd imagine // that the output of reading a config string will also be tree, built from // these same classes. This would be handed over to the optimizer, etc., // and the routines receiving these trees wouldn't know where they came from. // // TreeNode objects are immutable. This is important because we reuse trees // and sub-trees a lot. This also means that Trees and TreeNode objects are // thread safe. class TreeNode { public: typedef std::vector< Tree > Args; private: const Token _token; const Args _children; int64_t _hashCode; void createHashCode() const; protected: // This is the heart of compareSameType(). static int compareChildren(Args const &a, Args const &b); // Each class can take over the comparison process. When this is called // we know the two objects are of identical type. The default operation // is to compare the children, recursively. That doesn't work if the class // has other data, in addition to the children. For example, an integer // node has to store the integer in a new field. An integer cannot be a // child because it doesn't inherit from TreeNode. virtual int compareSameType(TreeNode const &other) const; TreeNode(Token const &token) : _token(token), _hashCode(0) { } TreeNode(Token const &token, Args const &children) : _token(token), _children(children), _hashCode(0) { } // Recursively call optimize, and fix the nulls. Args optimizeChildren(OptimizationContext context) const; // TreeNode::hash_code() automatically uses information about the type of // this object and the hash codes of the object's children. However, if // a class contains additional information, it should override this // function. The return value from this function should be a hash of // that additional information. // // If you override compareSameType() you should probably override this, and // vice versa. // // Note that hash values are automatically cached. So each object will // probably get this call only once. So it can be somewhat thorough. virtual uint64_t getHashContributaion() const { return 0; } // This may help you build values in getHashContributaion(). template < typename T > static uint64_t sHash(T const &t) { return std::hash< T >()(t); } template < typename T, typename U > static uint64_t sHash(T const &t, U const &u) { return sHash(t) - sHash(u); } public: // This is useful to various algorithms that traverse the tree. As much // as possible, all data should be in the form of children. Args const &getChildren() const { return _children; } // This returns the standard -1, 0, 1 to say how two objects compare. // Two objects are always equal if they have the same types and contain // the same data. This is often a recursive operation. // If two pointers are identical, then the two items are equal. // Else if only one of the items isConstant(), that item is less than // the other item. // Else if the two items have different classes, we compare the classes // in some predictable and repeatable way. // Else we call compareSameType() to allow each class to compare its own // objects to each other. int compare(TreeNode const &other) const; // Similar to above but this allows either input to be NULL. static int compare(Tree a, Tree b); // Do not call this on an object before all of it's constructors have // finished. int64_t hash_code() const { if (!_hashCode) createHashCode(); return _hashCode; } // Create a new object of the same type but with different children. // This is used, for example, in the replace() routine. Since these // objects are read only, replace will have to create a new object if // something changes. This should return NULL if the inputs are not valid. virtual Tree construct(Args const &children) const; // Return true if this is a compile time constant. The number 4 should // return true. The field [Price] should return false. virtual bool isConstant() const { return false; } // If this is not a constant, throw an exception. virtual ValueBox getConstantValue() const { throw getToken().makeException("Not a constant."); } // This means that we don't want to cache the value. Originally I was // thinking about making this a virtual function, but this defintion is // simpler and probably does what I want. bool easyToCompute() const { return isConstant(); } // This is mostly useful for generating error messages. You can point // to the source of the problem. This is typically not used in the // comparison routines. The main purpose of comparing two trees is to see // if the user wrote the same thing twice! We want to compute the // expression just once and reuse the result. However, some speciallized // subclasses use the token to store useful information. In this case the // token value and type might be used. But not the location. Token const &getToken() const { return _token; } // This should be recursive. virtual std::string shortDebug() const; template< typename T > T const *as() const { return dynamic_cast< T const * >(this); } // Some nodes are temporary. When the parser sees an identifier, for // example, it only checks that the identifier is made of legal characters. // It doesn't check if the identifier actually means something. The idea // is that another phase of the process will replace each good identifier // with the thing that it represents. If any identifiers are left after // that phase, they must be errors. This function will recursively scan // the tree for temporary nodes. If none are found, it returns with no // side effects. If one is found, it immediately throws an exception. // The exception will point to the exact place where the error occurred, // and will have a message appropriate for an end user. // // Note that Parse::AppropriatePrice does not override this method. While // those objects are temporary, they are usually replaceded much later in // the compilation process. Also, if you see one of those when you don't // expect it, that's almost certainly an error in our C++ code. If you // miss an AppropriatePrice you will get an exception when you call // Exectuion::Executable::create(). virtual void assertDone() const; // Try to return a tree with the same meaning but less effort. Constant // folding would be a good example. Returning null means that the object // should not change. virtual Tree optimize(OptimizationContext context = OptimizationContext::NORMAL) const; // A simple wrapper that automatically gets rid of NULLs. static Tree optimize(Tree input, OptimizationContext context = OptimizationContext::NORMAL); // 1 if this node has no children. Otherwise, the maximum height of all // of it's children + 1. int height() const; virtual ~TreeNode() {} }; inline TclList &operator <<(TclList &destination, Tree const &tree) { destination<shortDebug(); return destination; } // This is suitable for use in a map or set. By default trees will be sorted // by pointer value. That's like using == on a String in Java. This // compares things based on the meaning. If you create two different objects // that both represent the exact same thing, this will say they are equal. struct TreeCompare { bool operator ()(Tree const &a, Tree const &b) const { // Interesting. There is no ==. This function is like <. Maybe C++ // compares each thing twice. If neither is less than the other, the // they must be equal. return a->compare(*b) < 0; } }; // TreeHashHelper allows you to use a Tree in a hashtable. Make sure you // list this type twice, once for the hash function and a second time for // the equals function. This performs the same service that TreeComapre // does for ordered sets and maps. These are consistent; you should be // able to switch between an ordered set and an unordered set without any // change in functionality. struct TreeHashHelper { uint64_t operator()(Tree const &tree) const { return tree->hash_code(); } bool operator()(Tree const &a, Tree const &b) const { return !a->compare(*b); } }; class NullValue : public TreeNode { public: virtual bool isConstant() const { return true; } virtual ValueBox getConstantValue() const { return ValueBox(); } NullValue(Token const &token) : TreeNode(token) { } static Tree const &noToken(); }; class IntTreeNode : public TreeNode { private: const int64_t _value; IntTreeNode(Token const &token, int64_t value) : TreeNode(token), _value(value) { } protected: virtual int compareSameType(TreeNode const &other) const; virtual uint64_t getHashContributaion() const { return _value; } public: virtual bool isConstant() const { return true; } virtual ValueBox getConstantValue() const { return _value; } virtual std::string shortDebug() const; static Tree create(Token const &token); static Tree create(int64_t value); static const Tree TRUE; static const Tree FALSE; int64_t getValue() const { return _value; } }; class DoubleTreeNode : public TreeNode { private: const double _value; DoubleTreeNode(Token const &token, double value) : TreeNode(token), _value(value) { } DoubleTreeNode(Token &token, double value) : TreeNode(token), _value(value) { } protected: virtual int compareSameType(TreeNode const &other) const; virtual uint64_t getHashContributaion() const { return sHash(_value); } public: virtual bool isConstant() const { return true; } virtual ValueBox getConstantValue() const { return _value; } virtual std::string shortDebug() const; static Tree create(Token const &token); static Tree create(double value); double getValue() const { return _value; } }; class StringTreeNode : public TreeNode { private: protected: virtual int compareSameType(TreeNode const &other) const; virtual uint64_t getHashContributaion() const { return sHash(getValue()); } public: virtual bool isConstant() const { return true; } virtual ValueBox getConstantValue() const { return getValue(); } virtual std::string shortDebug() const; StringTreeNode(Token const &token) : TreeNode(token) { } StringTreeNode(std::string const &value) : TreeNode(Token(value)) { } std::string const &getValue() const; // This exists more for documentation than anything else. I don't // expect anyone to call this. StringTreeNode(std::string const &debug, std::string const &value) : TreeNode(Token(Token::ttString, 0, debug, value, "")) {} }; // The parser will treat all functions the same way. We have nothing but // the token to tell one from the next. Operators have a slightly different // syntax in the source code, but they are also translated into // SimpleFunctionNode objects. class SimpleFunctionNode : public TreeNode { protected: virtual int compareSameType(TreeNode const &other) const; virtual uint64_t getHashContributaion() const { return sHash(getToken().getValue()); } public: virtual Tree construct(Args const &children) const; virtual std::string shortDebug() const; SimpleFunctionNode(Token const &token, Args const &arguments) : TreeNode(token, arguments) { } // These are intended to be temporary objects. The parser will create // these without doing much verification. I.e. does a function have the // wrong number of aruments, or does it even exist at all? The consumer // should translate these into something meaningful, verifying the // arguments as it goes. If one of these objects didn't get replaced by // something else, thats an error, an unknown function. virtual void assertDone() const; }; // This is how the parser encapsulates the remaining types of items. I.e. // filters, database fields, alert specifications, etc. All of the // information about the specific item is stored in the Token. class SimpleItemNode : public TreeNode { protected: virtual int compareSameType(TreeNode const &other) const; virtual uint64_t getHashContributaion() const { return sHash((char)getToken().getType(), getToken().getValue()); } public: virtual std::string shortDebug() const; SimpleItemNode(Token const &token) : TreeNode(token) { } // These are intended to be temporary objects. The parser will create // these without doing much verification. I.e. if the user mispells a // word, the parser will still create a SimpleItemNode. The consumer // should translate these objects into something meaningful. If one of // these objects didn't get replaced by something else, thats an error, an // unknown identifier. virtual void assertDone() const; }; // Use this to parse a program. class Parser { private: typedef TreeNode::Args Args; // One time setup. static pcre *_whiteSpacePattern; static pcre *_filterPattern; static pcre *_altFieldPattern; static pcre *_symbolPattern; static pcre *_numberPattern; static pcre *_alertPattern; static pcre *_quotedStringPattern; static std::set< std::string > _punctuation2; // Do the one time setup. Initialize the variables that I want to make // static. static pthread_once_t _hasBeenInitialized; static void initParser(); // Try to compile a regular expression. Ideally these would never fail. // These are the same every time and are part this C++ program, not part // of any user input. A failure here could be treated as an assertion // failure. A failure here will throw an exception. static pcre *tryCompile(std::string const &pattern); // Create an exception. The message comes from a parameter, but the // other parts of the exception come from the fields below. Exception tokenizerException(std::string const &message) const; // Examine the return code from pcre_exec. If a match was found, return // true. If no match was found, but there are no other errors, return // false. Otherwise throw an exception describing the problem. The error // messages are somewhat user friendly. bool matched(int rc) const; // How far we are in makeTokenList(). This is an index into _toParse. std::string::size_type _position; // The initial string we are trying to parse. const std::string _toParse; // Used by the regular expression library. int _ovector[30]; // Creates a new token based on the last call to match(). This uses the // entire token as the value of the token. I.e. "+" ==> "+". This updates // the state so the next call to match() will start after this token. Token makeTokenAll(Token::Type type); // Creates a new token based on the last call to match(). This uses the // first parenthesized sub expression as the value of the token. I.e. // "[Price]" ==> "Price". This updates the state so the next call to // match() will start after this token. Token makeTokenFirst(Token::Type type); // Creates a new token based on the quoted string regular expression. // This involves more work than most of our tokens. The regualar // expression only says if something is valid, and where's the start // and end. This routine converts the escape characters into the final // string. Token makeTokenQuotedString(); // A wrapper around pcre_exec(). Most of the arguments are the same // every time. Some inputs and outputs are call member variables; that // helps make support routines that don't need so many arguments. This // does no error checking. The result from pcre_exec() is returned as // the result from match(). int match(pcre *pattern); typedef std::vector< Token > TokenList; // Convert the string input into a list of tokens. It is safe to call // this more than once. TokenList makeTokenList(); TokenList _tokens; int _nextToken; const Token _endToken; Token const &peekToken(int skip = 0) const; Token const &popToken(); Exception currentTokenException(std::string const &message) const; // We used recursive descent. The calls go down. The items on the bottom // have the highest priority. // // The exact Operator Precedence comes from MySQL. The original formula // editor kept this order because it was easier to implement. We keep // that order to be in compatible with the formual editor. // http://dev.mysql.com/doc/refman/5.0/en/operator-precedence.html Tree parseExpression(); Tree parseBinaryOps1(); // ?? Tree parseBinaryOps2(); // || Tree parseBinaryOps3(); // && Tree parseBinaryOps4(); // <, >, !=, <=, >=, == Tree parseBinaryOps5(); // +, - Tree parseBinaryOps6(); // *, / Tree parseUnaryOps(); // !, - Tree parseBase(); public: // Create a new parser each time you want to parse a new program. Parser(std::string const &toParse); Tree parse(); // Only for debugging. Calls makeTokenList() then turns the result into // a readable string. Excpetions are passed on to the caller. std::string getTokenListDebug(); }; // This supports the tree-replace algorithm. Given a tree, you can make a // set of rules, and the algorithm will recursively go through the tree // applying the rules, making replacements. For example, a rule might say // that if you see the filter "[Price]" repalce it with a function that looks // up the "price" field in the database. We might generate a lot of these // rules at once, after we read the definations of the various filters. // Filters are just an example; tree replace is very valuable for a lot of // reasons. So we make it easy to make new rules. class ReplaceRule { public: // After examinging a node, a rule can make one of these requests. enum Status { sStop, /* Do not look at this item or it's children any more. Keep * the result of this rule and move on. Note: one of the * ancestors of this rule might request that we take another * look at this item or its childen, but presumably only if * something else changed. */ sContinue, /* Keep looking at additional rules. This is mostly * aimed at the RuleList class. If a rule doesn't apply * to an item, the rule should leave the item alone, and * the status should be sContinue. That tells the RuleList * to try the next rule in the list. That's the default * action if a rule doesn't do anything. It is also * possible to change the item, but to continue processing * starting from the next item in the list, although that's * more complicated and you probably don't want to do that. */ sRetryTop, /* After replacing the current item, you can try to reapply * all of the rules to the new item. This will only look * at the new item; it will not recursively process the * item. It is an error to request a retry if you don't * change the item. That would obviously cause an infinite * loop. */ sRetryRecursive /* After replacing the current item, you can try to * reapply all of the rules to the new item. This will * recursively process the new item. It is an error to * request a retry if you don't change the item. That * would obviously cause an infinite loop. */ }; protected: // See notes at replaceSafely(). virtual void replaceImpl(Tree &tree, Status &status) const =0; Exception makeException(std::string const &message) const; public: // This is the heart of the rule. // o tree is the item to inspect. The rule can leave it in tact, or // replace it with something new. (Trees are, by their nature, // immutable, so the rule certainly can't change the tree.) tree should // not be null before or after the call. // o status says what to do next. This is initially set to sContinue. // Note that this function is const. The idea is that rules can get // applied and reapplied a lot and the order can be hard to gaurentee. // Things are a lot simpler when the rules don't change. // // Note, replaceImpl() does the bulk of the work. replaceSafely() calls // replaceImpl() then checks for errors. By checking here, rather than // in Parse::replace(), we check more often, but we can report an error // in the exact rule where the failure occurred. void replaceSafely(Tree &tree, Status &status) const; // Possibly used to make better error messages. Possibly used in // debugging. The default is based on the class name. virtual std::string shortDebug() const; virtual ~ReplaceRule() {} typedef TSRefCount< ReplaceRule > Ref; }; // Parse::replace() always takes in a single rule. Presumably most of the // time you have a lot of possible rules. This is a simple way to create // a compound rule from a lot of indivual rules. The rules are applied in // order until one changes to the status. class RuleList : public ReplaceRule { protected: // Apply the sub rules. virtual void replaceImpl(Tree &tree, Status &status) const; public: // These are the rules. You cannot change this while a rule is being // applied. (And the rule is const inside of Parse::replace(), so that // should be enforced by the compiler.) Otherwise, pretty much anything // goes. A rule should not be NULL. std::vector< ReplaceRule::Ref > rules; // The result from shortDebug(). Leave it blank to use the default. std::string name; // Automatically generated based on name. virtual std::string shortDebug() const; }; // This is an easy and efficient way to build a rule. Each thing we // match must be a symbol. If we see the symbol, we apply the rule. The // rule might include other constraints, or it might apply to everything, // since this class has already inspected the inital symbol. I.e. the symbol // "price" could be replaced with a call to a function that looks for a field // called "price" in the current record. A symbol "advol" could be replaced // with a call to a function that looks for a field called "advol" in the // alerts daily record. We expect a lot of these types of rules. class SymbolRules : public ReplaceRule { protected: // Do it! virtual void replaceImpl(Tree &tree, Status &status) const; public: // The key is the value (i.e. Token::getValue()) of the symbol we want // to replace. std::map< std::string, ReplaceRule::Ref > rules; // The result from shortDebug(). Leave it blank to use the default. std::string name; // Automatically generated based on name. virtual std::string shortDebug() const; }; // This is an easy and efficient way to build a rule. Each thing we // match must be a filter, i.e. [Price]. class FilterRules : public ReplaceRule { protected: // Do it! virtual void replaceImpl(Tree &tree, Status &status) const; public: // The key is the "value" of a filter, i.e. "Price", not "[Price]". The // value is a tree, which is copied as is. That's different from most of // our rules. Most of them create new trees which include the token of // the thing that we're replacing. std::map< std::string, Tree > filters; // The result from shortDebug(). Leave it blank to use the default. std::string name; // Automatically generated based on name. virtual std::string shortDebug() const; }; // Recursively apply the rule to each node in the tree. We start applying // rules at the bottom of the tree. As the stack unwinds, we work our way // up the tree. If none of an item's children are modified by the rule, we // examine the original item as is. If one or more of the children are // modified, we rebuild the item using construct(), and then apply the rule // to that new item. Tree replace(Tree tree, ReplaceRule const &rule); } #endif