RegexNode.cs source code in C# .NET

Source code for the .NET framework in C#

                        

Code:

/ DotNET / DotNET / 8.0 / untmp / whidbey / REDBITS / ndp / fx / src / Regex / System / Text / RegularExpressions / RegexNode.cs / 1 / RegexNode.cs

                            //------------------------------------------------------------------------------ 
// 
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// 
//----------------------------------------------------------------------------- 

// This RegexNode class is internal to the Regex package. 
// It is built into a parsed tree for a regular expression. 

// Implementation notes: 
//
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than 
// space efficiency.
// 
// RegexNodes are built into a tree, linked by the _children list. 
// Each node also has a _parent and _ichild member indicating
// its parent and which child # it is in its parent's list. 
//
// RegexNodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "rept", "group". There are also node types for basic 
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
// 
// Because perl 5 allows "lookback" groups that scan backwards, 
// each node also gets a "direction". Normally the value of
// boolean _backward = false. 
//
// During parsing, top-level nodes are also stacked onto a parse
// stack (a stack of trees). For this purpose we have a _next
// pointer. [Note that to save a few bytes, we could overload the 
// _parent pointer instead.]
// 
// On the parse stack, each tree has a "role" - basically, the 
// nonterminal in the grammar that the parser has currently
// assigned to the tree. That code is stored in _role. 
//
// Finally, some of the different kinds of nodes have data.
// Two integers (for the looping constructs) are stored in
// _operands, an an object (either a string or a set) 
// is stored in _data
 
 
namespace System.Text.RegularExpressions {
 
    using System.Collections;
    using System.Diagnostics;
    using System.Globalization;
 
    internal sealed class RegexNode {
        /* 
         * RegexNode types 
         */
 
        // the following are leaves, and correspond to primitive operations

        //    static final int Onerep     = RegexCode.Onerep;     // c,n      a {n}
        //    static final int Notonerep  = RegexCode.Notonerep;  // c,n      .{n} 
        //    static final int Setrep     = RegexCode.Setrep;     // set,n    \d {n}
 
        internal const int Oneloop    = RegexCode.Oneloop;    // c,n      a* 
        internal const int Notoneloop = RegexCode.Notoneloop; // c,n      .*
        internal const int Setloop    = RegexCode.Setloop;    // set,n    \d* 

        internal const int Onelazy    = RegexCode.Onelazy;    // c,n      a*?
        internal const int Notonelazy = RegexCode.Notonelazy; // c,n      .*?
        internal const int Setlazy    = RegexCode.Setlazy;    // set,n    \d*? 

        internal const int One        = RegexCode.One;        // char     a 
        internal const int Notone     = RegexCode.Notone;     // char     . [^a] 
        internal const int Set        = RegexCode.Set;        // set      [a-z] \w \s \d
 
        internal const int Multi      = RegexCode.Multi;      // string   abcdef
        internal const int Ref        = RegexCode.Ref;        // index    \1

        internal const int Bol        = RegexCode.Bol;        //          ^ 
        internal const int Eol        = RegexCode.Eol;        //          $
        internal const int Boundary   = RegexCode.Boundary;   //          \b 
        internal const int Nonboundary= RegexCode.Nonboundary;//          \B 
        internal const int ECMABoundary   = RegexCode.ECMABoundary;    // \b
        internal const int NonECMABoundary= RegexCode.NonECMABoundary; // \B 
        internal const int Beginning  = RegexCode.Beginning;  //          \A
        internal const int Start      = RegexCode.Start;      //          \G
        internal const int EndZ       = RegexCode.EndZ;       //          \Z
        internal const int End        = RegexCode.End;        //          \z 

        // (note: End               = 21;) 
 
        // interior nodes do not correpond to primitive operations, but
        // control structures compositing other operations 

        // concat and alternate take n children, and can run forward or backwards

        internal const int Nothing    = 22;                   //          [] 
        internal const int Empty      = 23;                   //          ()
 
        internal const int Alternate  = 24;                   //          a|b 
        internal const int Concatenate= 25;                   //          ab
 
        internal const int Loop       = 26;                   // m,x      * + ? {,}
        internal const int Lazyloop   = 27;                   // m,x      *? +? ?? {,}?

        internal const int Capture    = 28;                   // n        () 
        internal const int Group      = 29;                   //          (?:)
        internal const int Require    = 30;                   //          (?=) (?<=) 
        internal const int Prevent    = 31;                   //          (?!) (?) (?<)
        internal const int Testref    = 33;                   //          (?(n) | ) 
        internal const int Testgroup  = 34;                   //          (?(...) | )

        /*
         * RegexNode data members 
         *
         */ 
 
        internal int            _type;
 
        internal ArrayList      _children;

        internal String         _str;
        internal char           _ch; 
        internal int            _m;
        internal int            _n; 
        internal RegexOptions   _options; 

        internal RegexNode   _next; 

        internal RegexNode(int type, RegexOptions options) {
            _type = type;
            _options = options; 
        }
 
        internal RegexNode(int type, RegexOptions options, char ch) { 
            _type = type;
            _options = options; 
            _ch = ch;
        }

        internal RegexNode(int type, RegexOptions options, String str) { 
            _type = type;
            _options = options; 
            _str = str; 
        }
 
        internal RegexNode(int type, RegexOptions options, int m) {
            _type = type;
            _options = options;
            _m = m; 
        }
 
        internal RegexNode(int type, RegexOptions options, int m, int n) { 
            _type = type;
            _options = options; 
            _m = m;
            _n = n;
        }
 
        internal bool UseOptionR() {
            return(_options & RegexOptions.RightToLeft) != 0; 
        } 

        internal RegexNode ReverseLeft() { 
            if (UseOptionR() && _type == Concatenate && _children != null) {
                _children.Reverse(0, _children.Count);
            }
 
            return this;
        } 
 

        // Pass type as OneLazy or OneLoop 
        internal void MakeRep(int type, int min, int max) {
            _type += (type - One);
            _m = min;
            _n = max; 
        }
 
        /* 
         * Reduce
         * 
         * Removes redundant nodes from the subtree, and returns a reduced subtree.
         */
        internal RegexNode Reduce() {
            RegexNode n; 

            switch (Type()) { 
                case Alternate: 
                    n = ReduceAlternation();
                    break; 

                case Concatenate:
                    n = ReduceConcatenation();
                    break; 

                case Loop: 
                case Lazyloop: 
                    n = ReduceRep();
                    break; 

                case Group:
                    n = ReduceGroup();
                    break; 

                case Set: 
                case Setloop: 
                    n = ReduceSet();
                    break; 

                default:
                    n = this;
                    break; 
            }
 
            return n; 
        }
 

        /*
         * StripEnation:
         * 
         * Simple optimization. If a concatenation or alternation has only
         * one child strip out the intermediate node. If it has zero children, 
         * turn it into an empty. 
         *
         */ 

        internal RegexNode StripEnation(int emptyType) {
            switch (ChildCount()) {
                case 0: 
                    return new RegexNode(emptyType, _options);
                case 1: 
                    return Child(0); 
                default:
                    return this; 
            }
        }

        /* 
         * ReduceGroup:
         * 
         * Simple optimization. Once parsed into a tree, noncapturing groups 
         * serve no function, so strip them out.
         */ 

        internal RegexNode ReduceGroup() {
            RegexNode u;
 
            for (u = this; u.Type() == Group; )
                u = u.Child(0); 
 
            return u;
        } 

        /*
         * ReduceRep:
         * 
         * Nested repeaters just get multiplied with each other if they're not
         * too lumpy 
         */ 

        internal RegexNode ReduceRep() { 
            RegexNode u;
            RegexNode child;
            int type;
            int min; 
            int max;
 
            u = this; 
            type = Type();
            min = _m; 
            max = _n;

            for (;;) {
                if (u.ChildCount() == 0) 
                    break;
 
                child = u.Child(0); 

                // multiply reps of the same type only 
                if (child.Type() != type) {
                    int childType = child.Type();

                    if (!(childType >= Oneloop && childType <= Setloop && type == Loop || 
                          childType >= Onelazy && childType <= Setlazy && type == Lazyloop))
                        break; 
                } 

                // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})? 
                // [but things like (a {2,})+ are not too lumpy...]
                if (u._m == 0 && child._m > 1 || child._n < child._m * 2)
                    break;
 
                u = child;
                if (u._m > 0) 
                    u._m = min = ((Int32.MaxValue - 1) / u._m < min) ? Int32.MaxValue : u._m * min; 
                if (u._n > 0)
                    u._n = max = ((Int32.MaxValue - 1) / u._n < max) ? Int32.MaxValue : u._n * max; 
            }

            return min == Int32.MaxValue ? new RegexNode(Nothing, _options) : u;
        } 

        /* 
         * ReduceSet: 
         *
         * Simple optimization. If a set is a singleton, an inverse singleton, 
         * or empty, it's transformed accordingly.
         */

        internal RegexNode ReduceSet() { 
            // Extract empty-set, one and not-one case as special
 
            if (RegexCharClass.IsEmpty(_str)) { 
                _type = Nothing;
                _str = null; 
            }
            else if (RegexCharClass.IsSingleton(_str)) {
                _ch = RegexCharClass.SingletonChar(_str);
                _str = null; 
                _type += (One - Set);
            } 
            else if (RegexCharClass.IsSingletonInverse(_str)) { 
                _ch = RegexCharClass.SingletonChar(_str);
                _str = null; 
                _type += (Notone - Set);
            }

            return this; 
        }
 
        /* 
         * ReduceAlternation:
         * 
         * Basic optimization. Single-letter alternations can be replaced
         * by faster set specifications, and nested alternations with no
         * intervening operators can be flattened:
         * 
         * a|b|c|def|g|h -> [a-c]|def|[gh]
         * apple|(?:orange|pear)|grape -> apple|orange|pear|grape 
         * 
         * <
*/ 

        internal RegexNode ReduceAlternation() {
            // Combine adjacent sets/chars
 
            bool wasLastSet;
            bool lastNodeCannotMerge; 
            RegexOptions optionsLast; 
            RegexOptions optionsAt;
            int i; 
            int j;
            RegexNode at;
            RegexNode prev;
 
            if (_children == null)
                return new RegexNode(RegexNode.Nothing, _options); 
 
            wasLastSet = false;
            lastNodeCannotMerge = false; 
            optionsLast = 0;

            for (i = 0, j = 0; i < _children.Count; i++, j++) {
                at = (RegexNode)_children[i]; 

                if (j < i) 
                    _children[j] = at; 

                for (;;) { 
                    if (at._type == Alternate) {
                        for (int k = 0; k < at._children.Count; k++)
                            ((RegexNode)at._children[k])._next = this;
 
                        _children.InsertRange(i + 1, at._children);
                        j--; 
                    } 
                    else if (at._type == Set || at._type == One) {
                        // Cannot merge sets if L or I options differ, or if either are negated. 
                        optionsAt = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);


                        if (at._type == Set) { 
                            if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at._str)) {
                                wasLastSet = true; 
                                lastNodeCannotMerge = !RegexCharClass.IsMergeable(at._str); 
                                optionsLast = optionsAt;
                                break; 
                            }
                        }
                        else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge) {
                            wasLastSet = true; 
                            lastNodeCannotMerge = false;
                            optionsLast = optionsAt; 
                            break; 
                        }
 

                        // The last node was a Set or a One, we're a Set or One and our options are the same.
                        // Merge the two nodes.
                        j--; 
                        prev = (RegexNode)_children[j];
 
                        RegexCharClass prevCharClass; 
                        if (prev._type == RegexNode.One) {
                            prevCharClass = new RegexCharClass(); 
                            prevCharClass.AddChar(prev._ch);
                        }
                        else {
                            prevCharClass = RegexCharClass.Parse(prev._str); 
                        }
 
                        if (at._type == RegexNode.One) { 
                            prevCharClass.AddChar(at._ch);
                        } 
                        else {
                            RegexCharClass atCharClass = RegexCharClass.Parse(at._str);
                            prevCharClass.AddCharClass(atCharClass);
                        } 

                        prev._type = RegexNode.Set; 
                        prev._str  = prevCharClass.ToStringClass(); 

                    } 
                    else if (at._type == RegexNode.Nothing) {
                        j--;
                    }
                    else { 
                        wasLastSet = false;
                        lastNodeCannotMerge = false; 
                    } 
                    break;
                } 
            }

            if (j < i)
                _children.RemoveRange(j, i - j); 

            return StripEnation(RegexNode.Nothing); 
        } 

        /* 
         * ReduceConcatenation:
         *
         * Basic optimization. Adjacent strings can be concatenated.
         * 
         * (?:abc)(?:def) -> abcdef
         */ 
 
        internal RegexNode ReduceConcatenation() {
            // Eliminate empties and concat adjacent strings/chars 

            bool wasLastString;
            RegexOptions optionsLast;
            RegexOptions optionsAt; 
            int i;
            int j; 
 
            if (_children == null)
                return new RegexNode(RegexNode.Empty, _options); 

            wasLastString = false;
            optionsLast = 0;
 
            for (i = 0, j = 0; i < _children.Count; i++, j++) {
                RegexNode at; 
                RegexNode prev; 

                at = (RegexNode)_children[i]; 

                if (j < i)
                    _children[j] = at;
 
                if (at._type == RegexNode.Concatenate &&
                    ((at._options & RegexOptions.RightToLeft) == (_options & RegexOptions.RightToLeft))) { 
                    for (int k = 0; k < at._children.Count; k++) 
                        ((RegexNode)at._children[k])._next = this;
 
                    _children.InsertRange(i + 1, at._children);
                    j--;
                }
                else if (at._type == RegexNode.Multi || 
                         at._type == RegexNode.One) {
                    // Cannot merge strings if L or I options differ 
                    optionsAt = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase); 

                    if (!wasLastString || optionsLast != optionsAt) { 
                        wasLastString = true;
                        optionsLast = optionsAt;
                        continue;
                    } 

                    prev = (RegexNode)_children[--j]; 
 
                    if (prev._type == RegexNode.One) {
                        prev._type = RegexNode.Multi; 
                        prev._str = Convert.ToString(prev._ch, CultureInfo.InvariantCulture);
                    }

                    if ((optionsAt & RegexOptions.RightToLeft) == 0) { 
                        if (at._type == RegexNode.One)
                            prev._str += at._ch.ToString(); 
                        else 
                            prev._str += at._str;
                    } 
                    else {
                        if (at._type == RegexNode.One)
                            prev._str = at._ch.ToString() + prev._str;
                        else 
                            prev._str = at._str + prev._str;
                    } 
 
                }
                else if (at._type == RegexNode.Empty) { 
                    j--;
                }
                else {
                    wasLastString = false; 
                }
            } 
 
            if (j < i)
                _children.RemoveRange(j, i - j); 

            return StripEnation(RegexNode.Empty);
        }
 
        internal RegexNode MakeQuantifier(bool lazy, int min, int max) {
            RegexNode result; 
 
            if (min == 0 && max == 0)
                return new RegexNode(RegexNode.Empty, _options); 

            if (min == 1 && max == 1)
                return this;
 
            switch (_type) {
                case RegexNode.One: 
                case RegexNode.Notone: 
                case RegexNode.Set:
 
                    MakeRep(lazy ? RegexNode.Onelazy : RegexNode.Oneloop, min, max);
                    return this;

                default: 
                    result = new RegexNode(lazy ? RegexNode.Lazyloop : RegexNode.Loop, _options, min, max);
                    result.AddChild(this); 
                    return result; 
            }
        } 

        internal void AddChild(RegexNode newChild) {
            RegexNode reducedChild;
 
            if (_children == null)
                _children = new ArrayList(4); 
 
            reducedChild = newChild.Reduce();
 
            _children.Add(reducedChild);
            reducedChild._next = this;
        }
        internal RegexNode Child(int i) { 
            return(RegexNode)_children[i];
        } 
 
        internal int ChildCount() {
            return _children == null ? 0 : _children.Count; 
        }

        internal int Type() {
            return _type; 
        }
 
#if DBG 
        internal static String[] TypeStr = new String[] {
            "Onerep", "Notonerep", "Setrep", 
            "Oneloop", "Notoneloop", "Setloop",
            "Onelazy", "Notonelazy", "Setlazy",
            "One", "Notone", "Set",
            "Multi", "Ref", 
            "Bol", "Eol", "Boundary", "Nonboundary",
            "ECMABoundary", "NonECMABoundary", 
            "Beginning", "Start", "EndZ", "End", 
            "Nothing", "Empty",
            "Alternate", "Concatenate", 
            "Loop", "Lazyloop",
            "Capture", "Group", "Require", "Prevent", "Greedy",
            "Testref", "Testgroup"};
 
        internal String Description() {
 
            StringBuilder ArgSb = new StringBuilder(); 

            ArgSb.Append(TypeStr[_type]); 

            if ((_options & RegexOptions.ExplicitCapture) != 0)
                ArgSb.Append("-C");
            if ((_options & RegexOptions.IgnoreCase) != 0) 
                ArgSb.Append("-I");
            if ((_options & RegexOptions.RightToLeft) != 0) 
                ArgSb.Append("-L"); 
            if ((_options & RegexOptions.Multiline) != 0)
                ArgSb.Append("-M"); 
            if ((_options & RegexOptions.Singleline) != 0)
                ArgSb.Append("-S");
            if ((_options & RegexOptions.IgnorePatternWhitespace) != 0)
                ArgSb.Append("-X"); 
            if ((_options & RegexOptions.ECMAScript) != 0)
                ArgSb.Append("-E"); 
 
            switch (_type) {
                case Oneloop: 
                case Notoneloop:
                case Onelazy:
                case Notonelazy:
                case One: 
                case Notone:
                    ArgSb.Append("(Ch = " + RegexCharClass.CharDescription(_ch) + ")"); 
                    break; 
                case Capture:
                    ArgSb.Append("(index = " + _m.ToString(CultureInfo.InvariantCulture) + ", unindex = " + _n.ToString(CultureInfo.InvariantCulture) + ")"); 
                    break;
                case Ref:
                case Testref:
                    ArgSb.Append("(index = " + _m.ToString(CultureInfo.InvariantCulture) + ")"); 
                    break;
                case Multi: 
                    ArgSb.Append("(String = " + _str + ")"); 
                    break;
                case Set: 
                case Setloop:
                case Setlazy:
                    ArgSb.Append("(Set = " + RegexCharClass.SetDescription(_str) + ")");
                    break; 
            }
 
            switch (_type) { 
                case Oneloop:
                case Notoneloop: 
                case Onelazy:
                case Notonelazy:
                case Setloop:
                case Setlazy: 
                case Loop:
                case Lazyloop: 
                    ArgSb.Append("(Min = " + _m.ToString(CultureInfo.InvariantCulture) + ", Max = " + (_n == Int32.MaxValue ? "inf" : Convert.ToString(_n, CultureInfo.InvariantCulture)) + ")"); 
 		    break;
            } 

            return ArgSb.ToString();
        }
 
        internal const String Space = "                                ";
 
        internal void Dump() { 
            ArrayList Stack = new ArrayList();
            RegexNode CurNode; 
            int CurChild;

            CurNode = this;
            CurChild = 0; 

            Debug.WriteLine(CurNode.Description()); 
 
            for (;;) {
                if (CurNode._children != null && CurChild < CurNode._children.Count) { 
                    Stack.Add(CurChild + 1);
                    CurNode = (RegexNode)CurNode._children[CurChild];
                    CurChild = 0;
 
                    int Depth = Stack.Count;
                    if (Depth > 32) 
                        Depth = 32; 

                    Debug.WriteLine(Space.Substring(0, Depth) + CurNode.Description()); 
                }
                else {
                    if (Stack.Count == 0)
                        break; 

                    CurChild = (Int32)Stack[Stack.Count - 1]; 
                    Stack.RemoveAt(Stack.Count - 1); 
                    CurNode = CurNode._next;
                } 
            }
        }
#endif
 
    }
 
}
                        

Link Menu

Network programming in C#, Network Programming in VB.NET, Network Programming in .NET
This book is available now!
Buy at Amazon US or
Buy at Amazon UK