RegexNode.cs source code in C# .NET

Source code for the .NET framework in C#

                        

Code:

/ 4.0 / 4.0 / untmp / DEVDIV_TFS / Dev10 / Releases / RTMRel / ndp / fx / src / Regex / System / Text / RegularExpressions / RegexNode.cs / 1305376 / RegexNode.cs

                            //------------------------------------------------------------------------------ 
// 
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// 
//----------------------------------------------------------------------------- 

// This RegexNode class is internal to the Regex package. 
// It is built into a parsed tree for a regular expression. 

// Implementation notes: 
//
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than 
// space efficiency.
// 
// RegexNodes are built into a tree, linked by the _children list. 
// Each node also has a _parent and _ichild member indicating
// its parent and which child # it is in its parent's list. 
//
// RegexNodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "rept", "group". There are also node types for basic 
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
// 
// Because perl 5 allows "lookback" groups that scan backwards, 
// each node also gets a "direction". Normally the value of
// boolean _backward = false. 
//
// During parsing, top-level nodes are also stacked onto a parse
// stack (a stack of trees). For this purpose we have a _next
// pointer. [Note that to save a few bytes, we could overload the 
// _parent pointer instead.]
// 
// On the parse stack, each tree has a "role" - basically, the 
// nonterminal in the grammar that the parser has currently
// assigned to the tree. That code is stored in _role. 
//
// Finally, some of the different kinds of nodes have data.
// Two integers (for the looping constructs) are stored in
// _operands, an an object (either a string or a set) 
// is stored in _data
 
 
namespace System.Text.RegularExpressions {
 
    using System.Collections;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.Globalization; 

    internal sealed class RegexNode { 
        /* 
         * RegexNode types
         */ 

        // the following are leaves, and correspond to primitive operations

        //    static final int Onerep     = RegexCode.Onerep;     // c,n      a {n} 
        //    static final int Notonerep  = RegexCode.Notonerep;  // c,n      .{n}
        //    static final int Setrep     = RegexCode.Setrep;     // set,n    \d {n} 
 
        internal const int Oneloop    = RegexCode.Oneloop;    // c,n      a*
        internal const int Notoneloop = RegexCode.Notoneloop; // c,n      .* 
        internal const int Setloop    = RegexCode.Setloop;    // set,n    \d*

        internal const int Onelazy    = RegexCode.Onelazy;    // c,n      a*?
        internal const int Notonelazy = RegexCode.Notonelazy; // c,n      .*? 
        internal const int Setlazy    = RegexCode.Setlazy;    // set,n    \d*?
 
        internal const int One        = RegexCode.One;        // char     a 
        internal const int Notone     = RegexCode.Notone;     // char     . [^a]
        internal const int Set        = RegexCode.Set;        // set      [a-z] \w \s \d 

        internal const int Multi      = RegexCode.Multi;      // string   abcdef
        internal const int Ref        = RegexCode.Ref;        // index    \1
 
        internal const int Bol        = RegexCode.Bol;        //          ^
        internal const int Eol        = RegexCode.Eol;        //          $ 
        internal const int Boundary   = RegexCode.Boundary;   //          \b 
        internal const int Nonboundary= RegexCode.Nonboundary;//          \B
        internal const int ECMABoundary   = RegexCode.ECMABoundary;    // \b 
        internal const int NonECMABoundary= RegexCode.NonECMABoundary; // \B
        internal const int Beginning  = RegexCode.Beginning;  //          \A
        internal const int Start      = RegexCode.Start;      //          \G
        internal const int EndZ       = RegexCode.EndZ;       //          \Z 
        internal const int End        = RegexCode.End;        //          \z
 
        // (note: End               = 21;) 

        // interior nodes do not correpond to primitive operations, but 
        // control structures compositing other operations

        // concat and alternate take n children, and can run forward or backwards
 
        internal const int Nothing    = 22;                   //          []
        internal const int Empty      = 23;                   //          () 
 
        internal const int Alternate  = 24;                   //          a|b
        internal const int Concatenate= 25;                   //          ab 

        internal const int Loop       = 26;                   // m,x      * + ? {,}
        internal const int Lazyloop   = 27;                   // m,x      *? +? ?? {,}?
 
        internal const int Capture    = 28;                   // n        ()
        internal const int Group      = 29;                   //          (?:) 
        internal const int Require    = 30;                   //          (?=) (?<=) 
        internal const int Prevent    = 31;                   //          (?!) (?) (?<) 
        internal const int Testref    = 33;                   //          (?(n) | )
        internal const int Testgroup  = 34;                   //          (?(...) | )

        /* 
         * RegexNode data members
         * 
         */ 

        internal int            _type; 

        internal List      _children;

        internal String         _str; 
        internal char           _ch;
        internal int            _m; 
        internal int            _n; 
        internal RegexOptions   _options;
 
        internal RegexNode   _next;

        internal RegexNode(int type, RegexOptions options) {
            _type = type; 
            _options = options;
        } 
 
        internal RegexNode(int type, RegexOptions options, char ch) {
            _type = type; 
            _options = options;
            _ch = ch;
        }
 
        internal RegexNode(int type, RegexOptions options, String str) {
            _type = type; 
            _options = options; 
            _str = str;
        } 

        internal RegexNode(int type, RegexOptions options, int m) {
            _type = type;
            _options = options; 
            _m = m;
        } 
 
        internal RegexNode(int type, RegexOptions options, int m, int n) {
            _type = type; 
            _options = options;
            _m = m;
            _n = n;
        } 

        internal bool UseOptionR() { 
            return(_options & RegexOptions.RightToLeft) != 0; 
        }
 
        internal RegexNode ReverseLeft() {
            if (UseOptionR() && _type == Concatenate && _children != null) {
                _children.Reverse(0, _children.Count);
            } 

            return this; 
        } 

 
        // Pass type as OneLazy or OneLoop
        internal void MakeRep(int type, int min, int max) {
            _type += (type - One);
            _m = min; 
            _n = max;
        } 
 
        /*
         * Reduce 
         *
         * Removes redundant nodes from the subtree, and returns a reduced subtree.
         */
        internal RegexNode Reduce() { 
            RegexNode n;
 
            switch (Type()) { 
                case Alternate:
                    n = ReduceAlternation(); 
                    break;

                case Concatenate:
                    n = ReduceConcatenation(); 
                    break;
 
                case Loop: 
                case Lazyloop:
                    n = ReduceRep(); 
                    break;

                case Group:
                    n = ReduceGroup(); 
                    break;
 
                case Set: 
                case Setloop:
                    n = ReduceSet(); 
                    break;

                default:
                    n = this; 
                    break;
            } 
 
            return n;
        } 


        /*
         * StripEnation: 
         *
         * Simple optimization. If a concatenation or alternation has only 
         * one child strip out the intermediate node. If it has zero children, 
         * turn it into an empty.
         * 
         */

        internal RegexNode StripEnation(int emptyType) {
            switch (ChildCount()) { 
                case 0:
                    return new RegexNode(emptyType, _options); 
                case 1: 
                    return Child(0);
                default: 
                    return this;
            }
        }
 
        /*
         * ReduceGroup: 
         * 
         * Simple optimization. Once parsed into a tree, noncapturing groups
         * serve no function, so strip them out. 
         */

        internal RegexNode ReduceGroup() {
            RegexNode u; 

            for (u = this; u.Type() == Group; ) 
                u = u.Child(0); 

            return u; 
        }

        /*
         * ReduceRep: 
         *
         * Nested repeaters just get multiplied with each other if they're not 
         * too lumpy 
         */
 
        internal RegexNode ReduceRep() {
            RegexNode u;
            RegexNode child;
            int type; 
            int min;
            int max; 
 
            u = this;
            type = Type(); 
            min = _m;
            max = _n;

            for (;;) { 
                if (u.ChildCount() == 0)
                    break; 
 
                child = u.Child(0);
 
                // multiply reps of the same type only
                if (child.Type() != type) {
                    int childType = child.Type();
 
                    if (!(childType >= Oneloop && childType <= Setloop && type == Loop ||
                          childType >= Onelazy && childType <= Setlazy && type == Lazyloop)) 
                        break; 
                }
 
                // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
                // [but things like (a {2,})+ are not too lumpy...]
                if (u._m == 0 && child._m > 1 || child._n < child._m * 2)
                    break; 

                u = child; 
                if (u._m > 0) 
                    u._m = min = ((Int32.MaxValue - 1) / u._m < min) ? Int32.MaxValue : u._m * min;
                if (u._n > 0) 
                    u._n = max = ((Int32.MaxValue - 1) / u._n < max) ? Int32.MaxValue : u._n * max;
            }

            return min == Int32.MaxValue ? new RegexNode(Nothing, _options) : u; 
        }
 
        /* 
         * ReduceSet:
         * 
         * Simple optimization. If a set is a singleton, an inverse singleton,
         * or empty, it's transformed accordingly.
         */
 
        internal RegexNode ReduceSet() {
            // Extract empty-set, one and not-one case as special 
 
            if (RegexCharClass.IsEmpty(_str)) {
                _type = Nothing; 
                _str = null;
            }
            else if (RegexCharClass.IsSingleton(_str)) {
                _ch = RegexCharClass.SingletonChar(_str); 
                _str = null;
                _type += (One - Set); 
            } 
            else if (RegexCharClass.IsSingletonInverse(_str)) {
                _ch = RegexCharClass.SingletonChar(_str); 
                _str = null;
                _type += (Notone - Set);
            }
 
            return this;
        } 
 
        /*
         * ReduceAlternation: 
         *
         * Basic optimization. Single-letter alternations can be replaced
         * by faster set specifications, and nested alternations with no
         * intervening operators can be flattened: 
         *
         * a|b|c|def|g|h -> [a-c]|def|[gh] 
         * apple|(?:orange|pear)|grape -> apple|orange|pear|grape 
         *
         * < 
*/

        internal RegexNode ReduceAlternation() {
            // Combine adjacent sets/chars 

            bool wasLastSet; 
            bool lastNodeCannotMerge; 
            RegexOptions optionsLast;
            RegexOptions optionsAt; 
            int i;
            int j;
            RegexNode at;
            RegexNode prev; 

            if (_children == null) 
                return new RegexNode(RegexNode.Nothing, _options); 

            wasLastSet = false; 
            lastNodeCannotMerge = false;
            optionsLast = 0;

            for (i = 0, j = 0; i < _children.Count; i++, j++) { 
                at = _children[i];
 
                if (j < i) 
                    _children[j] = at;
 
                for (;;) {
                    if (at._type == Alternate) {
                        for (int k = 0; k < at._children.Count; k++)
                            at._children[k]._next = this; 

                        _children.InsertRange(i + 1, at._children); 
                        j--; 
                    }
                    else if (at._type == Set || at._type == One) { 
                        // Cannot merge sets if L or I options differ, or if either are negated.
                        optionsAt = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);

 
                        if (at._type == Set) {
                            if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at._str)) { 
                                wasLastSet = true; 
                                lastNodeCannotMerge = !RegexCharClass.IsMergeable(at._str);
                                optionsLast = optionsAt; 
                                break;
                            }
                        }
                        else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge) { 
                            wasLastSet = true;
                            lastNodeCannotMerge = false; 
                            optionsLast = optionsAt; 
                            break;
                        } 


                        // The last node was a Set or a One, we're a Set or One and our options are the same.
                        // Merge the two nodes. 
                        j--;
                        prev = _children[j]; 
 
                        RegexCharClass prevCharClass;
                        if (prev._type == RegexNode.One) { 
                            prevCharClass = new RegexCharClass();
                            prevCharClass.AddChar(prev._ch);
                        }
                        else { 
                            prevCharClass = RegexCharClass.Parse(prev._str);
                        } 
 
                        if (at._type == RegexNode.One) {
                            prevCharClass.AddChar(at._ch); 
                        }
                        else {
                            RegexCharClass atCharClass = RegexCharClass.Parse(at._str);
                            prevCharClass.AddCharClass(atCharClass); 
                        }
 
                        prev._type = RegexNode.Set; 
                        prev._str  = prevCharClass.ToStringClass();
 
                    }
                    else if (at._type == RegexNode.Nothing) {
                        j--;
                    } 
                    else {
                        wasLastSet = false; 
                        lastNodeCannotMerge = false; 
                    }
                    break; 
                }
            }

            if (j < i) 
                _children.RemoveRange(j, i - j);
 
            return StripEnation(RegexNode.Nothing); 
        }
 
        /*
         * ReduceConcatenation:
         *
         * Basic optimization. Adjacent strings can be concatenated. 
         *
         * (?:abc)(?:def) -> abcdef 
         */ 

        internal RegexNode ReduceConcatenation() { 
            // Eliminate empties and concat adjacent strings/chars

            bool wasLastString;
            RegexOptions optionsLast; 
            RegexOptions optionsAt;
            int i; 
            int j; 

            if (_children == null) 
                return new RegexNode(RegexNode.Empty, _options);

            wasLastString = false;
            optionsLast = 0; 

            for (i = 0, j = 0; i < _children.Count; i++, j++) { 
                RegexNode at; 
                RegexNode prev;
 
                at = _children[i];

                if (j < i)
                    _children[j] = at; 

                if (at._type == RegexNode.Concatenate && 
                    ((at._options & RegexOptions.RightToLeft) == (_options & RegexOptions.RightToLeft))) { 
                    for (int k = 0; k < at._children.Count; k++)
                        at._children[k]._next = this; 

                    _children.InsertRange(i + 1, at._children);
                    j--;
                } 
                else if (at._type == RegexNode.Multi ||
                         at._type == RegexNode.One) { 
                    // Cannot merge strings if L or I options differ 
                    optionsAt = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
 
                    if (!wasLastString || optionsLast != optionsAt) {
                        wasLastString = true;
                        optionsLast = optionsAt;
                        continue; 
                    }
 
                    prev = _children[--j]; 

                    if (prev._type == RegexNode.One) { 
                        prev._type = RegexNode.Multi;
                        prev._str = Convert.ToString(prev._ch, CultureInfo.InvariantCulture);
                    }
 
                    if ((optionsAt & RegexOptions.RightToLeft) == 0) {
                        if (at._type == RegexNode.One) 
                            prev._str += at._ch.ToString(); 
                        else
                            prev._str += at._str; 
                    }
                    else {
                        if (at._type == RegexNode.One)
                            prev._str = at._ch.ToString() + prev._str; 
                        else
                            prev._str = at._str + prev._str; 
                    } 

                } 
                else if (at._type == RegexNode.Empty) {
                    j--;
                }
                else { 
                    wasLastString = false;
                } 
            } 

            if (j < i) 
                _children.RemoveRange(j, i - j);

            return StripEnation(RegexNode.Empty);
        } 

        internal RegexNode MakeQuantifier(bool lazy, int min, int max) { 
            RegexNode result; 

            if (min == 0 && max == 0) 
                return new RegexNode(RegexNode.Empty, _options);

            if (min == 1 && max == 1)
                return this; 

            switch (_type) { 
                case RegexNode.One: 
                case RegexNode.Notone:
                case RegexNode.Set: 

                    MakeRep(lazy ? RegexNode.Onelazy : RegexNode.Oneloop, min, max);
                    return this;
 
                default:
                    result = new RegexNode(lazy ? RegexNode.Lazyloop : RegexNode.Loop, _options, min, max); 
                    result.AddChild(this); 
                    return result;
            } 
        }

        internal void AddChild(RegexNode newChild) {
            RegexNode reducedChild; 

            if (_children == null) 
                _children = new List(4); 

            reducedChild = newChild.Reduce(); 

            _children.Add(reducedChild);
            reducedChild._next = this;
        } 
        internal RegexNode Child(int i) {
            return _children[i]; 
        } 

        internal int ChildCount() { 
            return _children == null ? 0 : _children.Count;
        }

        internal int Type() { 
            return _type;
        } 
 
#if DBG
        internal static String[] TypeStr = new String[] { 
            "Onerep", "Notonerep", "Setrep",
            "Oneloop", "Notoneloop", "Setloop",
            "Onelazy", "Notonelazy", "Setlazy",
            "One", "Notone", "Set", 
            "Multi", "Ref",
            "Bol", "Eol", "Boundary", "Nonboundary", 
            "ECMABoundary", "NonECMABoundary", 
            "Beginning", "Start", "EndZ", "End",
            "Nothing", "Empty", 
            "Alternate", "Concatenate",
            "Loop", "Lazyloop",
            "Capture", "Group", "Require", "Prevent", "Greedy",
            "Testref", "Testgroup"}; 

        internal String Description() { 
 
            StringBuilder ArgSb = new StringBuilder();
 
            ArgSb.Append(TypeStr[_type]);

            if ((_options & RegexOptions.ExplicitCapture) != 0)
                ArgSb.Append("-C"); 
            if ((_options & RegexOptions.IgnoreCase) != 0)
                ArgSb.Append("-I"); 
            if ((_options & RegexOptions.RightToLeft) != 0) 
                ArgSb.Append("-L");
            if ((_options & RegexOptions.Multiline) != 0) 
                ArgSb.Append("-M");
            if ((_options & RegexOptions.Singleline) != 0)
                ArgSb.Append("-S");
            if ((_options & RegexOptions.IgnorePatternWhitespace) != 0) 
                ArgSb.Append("-X");
            if ((_options & RegexOptions.ECMAScript) != 0) 
                ArgSb.Append("-E"); 

            switch (_type) { 
                case Oneloop:
                case Notoneloop:
                case Onelazy:
                case Notonelazy: 
                case One:
                case Notone: 
                    ArgSb.Append("(Ch = " + RegexCharClass.CharDescription(_ch) + ")"); 
                    break;
                case Capture: 
                    ArgSb.Append("(index = " + _m.ToString(CultureInfo.InvariantCulture) + ", unindex = " + _n.ToString(CultureInfo.InvariantCulture) + ")");
                    break;
                case Ref:
                case Testref: 
                    ArgSb.Append("(index = " + _m.ToString(CultureInfo.InvariantCulture) + ")");
                    break; 
                case Multi: 
                    ArgSb.Append("(String = " + _str + ")");
                    break; 
                case Set:
                case Setloop:
                case Setlazy:
                    ArgSb.Append("(Set = " + RegexCharClass.SetDescription(_str) + ")"); 
                    break;
            } 
 
            switch (_type) {
                case Oneloop: 
                case Notoneloop:
                case Onelazy:
                case Notonelazy:
                case Setloop: 
                case Setlazy:
                case Loop: 
                case Lazyloop: 
                    ArgSb.Append("(Min = " + _m.ToString(CultureInfo.InvariantCulture) + ", Max = " + (_n == Int32.MaxValue ? "inf" : Convert.ToString(_n, CultureInfo.InvariantCulture)) + ")");
 		    break; 
            }

            return ArgSb.ToString();
        } 

        internal const String Space = "                                "; 
 
        internal void Dump() {
            List Stack = new List(); 
            RegexNode CurNode;
            int CurChild;

            CurNode = this; 
            CurChild = 0;
 
            Debug.WriteLine(CurNode.Description()); 

            for (;;) { 
                if (CurNode._children != null && CurChild < CurNode._children.Count) {
                    Stack.Add(CurChild + 1);
                    CurNode = CurNode._children[CurChild];
                    CurChild = 0; 

                    int Depth = Stack.Count; 
                    if (Depth > 32) 
                        Depth = 32;
 
                    Debug.WriteLine(Space.Substring(0, Depth) + CurNode.Description());
                }
                else {
                    if (Stack.Count == 0) 
                        break;
 
                    CurChild = Stack[Stack.Count - 1]; 
                    Stack.RemoveAt(Stack.Count - 1);
                    CurNode = CurNode._next; 
                }
            }
        }
#endif 

    } 
 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
                        

Link Menu

Network programming in C#, Network Programming in VB.NET, Network Programming in .NET
This book is available now!
Buy at Amazon US or
Buy at Amazon UK