Classification.cs source code in C# .NET

Source code for the .NET framework in C#

                        

Code:

/ 4.0 / 4.0 / untmp / DEVDIV_TFS / Dev10 / Releases / RTMRel / wpf / src / Core / CSharp / MS / Internal / Classification.cs / 1599983 / Classification.cs

                            //------------------------------------------------------------------------ 
//
//  Microsoft Windows Client Platform
//  Copyright (c) Microsoft Corporation.  All rights reserved.
// 
//  File:      Classification.cs
// 
//  Contents:  Unicode classification entry point 
//
//  Created:   7-14-2002 Tarek Mahmoud Sayed ([....]) 
//
//-----------------------------------------------------------------------

using System; 
using System.Diagnostics;
using MS.Internal; 
using System.Windows; 
using System.Security;
using System.Collections; 
using System.Runtime.InteropServices;
using System.Windows.Media.TextFormatting;

namespace MS.Internal 
{
    /// <summary>
    /// This class is used as a level of indirection so that classes in managed C++ can use the methods
    /// of the static class Classification.
    /// We cannot make MC++ reference PresentationCore.dll, since that would result in a circular reference.
    /// </summary>
    internal class ClassificationUtility : MS.Internal.Text.TextInterface.IClassification
    {
        // Per-script flag table. GetCharAttribute indexes it with the integer script value
        // of a character and returns the entry as needsCaretInfo, so the position of every
        // entry matters: inserting, removing or reordering lines changes which script each
        // flag applies to.
        // NOTE(review): the labels presumably follow the order of the ScriptID enumeration - confirm.
        //
        // We have restored this list from WPF 3.x.
        // The original list can be found under
        // $/Dev10/pu/WPF/wpf/src/Core/CSharp/MS/Internal/Shaping/Script.cs
        internal static readonly bool[] ScriptCaretInfo = new bool[]
        {
            /* Default              */    false,
            /* Arabic               */    false,
            /* Armenian             */    false,
            /* Bengali              */    true,
            /* Bopomofo             */    false,
            /* Braille              */    false,
            /* Buginese             */    true,
            /* Buhid                */    false,
            /* CanadianSyllabics    */    false,
            /* Cherokee             */    false,
            /* CJKIdeographic       */    false,
            /* Coptic               */    false,
            /* CypriotSyllabary     */    false,
            /* Cyrillic             */    false,
            /* Deseret              */    false,
            /* Devanagari           */    true,
            /* Ethiopic             */    false,
            /* Georgian             */    false,
            /* Glagolitic           */    false,
            /* Gothic               */    false,
            /* Greek                */    false,
            /* Gujarati             */    true,
            /* Gurmukhi             */    true,
            /* Hangul               */    true,
            /* Hanunoo              */    false,
            /* Hebrew               */    true,
            /* Kannada              */    true,
            /* Kana                 */    false,
            /* Kharoshthi           */    true,
            /* Khmer                */    true,
            /* Lao                  */    true,
            /* Latin                */    false,
            /* Limbu                */    true,
            /* LinearB              */    false,
            /* Malayalam            */    true,
            /* MathematicalAlphanumericSymbols */ false,
            /* Mongolian            */    true,
            /* MusicalSymbols       */    false,
            /* Myanmar              */    true,
            /* NewTaiLue            */    true,
            /* Ogham                */    false,
            /* OldItalic            */    false,
            /* OldPersianCuneiform  */    false,
            /* Oriya                */    true,
            /* Osmanya              */    false,
            /* Runic                */    false,
            /* Shavian              */    false,
            /* Sinhala              */    true,
            /* SylotiNagri          */    true,
            /* Syriac               */    false,
            /* Tagalog              */    false,
            /* Tagbanwa             */    false,
            /* TaiLe                */    false,
            /* Tamil                */    true,
            /* Telugu               */    true,
            /* Thaana               */    true,
            /* Thai                 */    true,
            /* Tibetan              */    true,
            /* Tifinagh             */    false,
            /* UgariticCuneiform    */    false,
            /* Yi                   */    false,
            /* Digit                */    false,
            /* Control              */    false,
            /* Mirror               */    false,
        };

        // Single shared instance handed out by the Instance property. The class declares
        // no instance fields, so one object can serve every caller. The field is readonly
        // so the reference cannot be replaced after type initialization.
        private static readonly ClassificationUtility _classificationUtilityInstance = new ClassificationUtility();

        /// <summary>
        /// Gets the shared <see cref="ClassificationUtility"/> instance.
        /// </summary>
        internal static ClassificationUtility Instance
        {
            get
            {
                return _classificationUtilityInstance;
            }
        }

        /// <summary>
        /// Looks up the character attribute record of a Unicode scalar value and reports
        /// a handful of boolean properties derived from its item class and script.
        /// </summary>
        /// <param name="unicodeScalar">Unicode scalar value to classify.</param>
        /// <param name="isCombining">True when the item class is SimpleMarkClass or ComplexMarkClass.</param>
        /// <param name="needsCaretInfo">The ScriptCaretInfo entry selected by the character's script value.</param>
        /// <param name="isIndic">True when the script is not Latin and IsScriptIndic accepts it.</param>
        /// <param name="isDigit">True when the script is ScriptID.Digit.</param>
        /// <param name="isLatin">True when the script is ScriptID.Latin.</param>
        /// <param name="isStrong">True when the item class is StrongClass.</param>
        public void GetCharAttribute(
                                    int unicodeScalar,
                                    out bool isCombining,
                                    out bool needsCaretInfo,
                                    out bool isIndic,
                                    out bool isDigit,
                                    out bool isLatin,
                                    out bool isStrong
                                    )
        {
            short unicodeClass = Classification.GetUnicodeClass(unicodeScalar);
            CharacterAttribute attribute = Classification.CharAttributeOf((int)unicodeClass);

            // Item-class based answers.
            byte cls = attribute.ItemClass;
            bool isSimpleMark = cls == (byte)ItemClass.SimpleMarkClass;
            isCombining = isSimpleMark || cls == (byte)ItemClass.ComplexMarkClass;
            isStrong = cls == (byte)ItemClass.StrongClass;

            // Script based answers. The script value doubles as the index into ScriptCaretInfo.
            int scriptIndex = attribute.Script;
            needsCaretInfo = ScriptCaretInfo[scriptIndex];

            ScriptID scriptId = (ScriptID)scriptIndex;
            isDigit = (scriptId == ScriptID.Digit);
            isLatin = (scriptId == ScriptID.Latin);

            // Latin is answered without consulting the Indic list.
            isIndic = !isLatin && IsScriptIndic(scriptId);
        }

        /// <summary>
        /// Tells whether the given script is one of the Indic scripts recognised here:
        /// Bengali, Devanagari, Gurmukhi, Gujarati, Kannada, Malayalam, Oriya, Tamil or Telugu.
        /// </summary>
        /// <param name="scriptId">Script to test.</param>
        /// <returns>True for the scripts listed above, false for every other script.</returns>
        private static bool IsScriptIndic(ScriptID scriptId)
        {
            return scriptId == ScriptID.Bengali
                || scriptId == ScriptID.Devanagari
                || scriptId == ScriptID.Gurmukhi
                || scriptId == ScriptID.Gujarati
                || scriptId == ScriptID.Kannada
                || scriptId == ScriptID.Malayalam
                || scriptId == ScriptID.Oriya
                || scriptId == ScriptID.Tamil
                || scriptId == ScriptID.Telugu;
        }
    } 
    /// <summary>
    /// Holds the classification table pointers.
    /// </summary>
    internal static class Classification
    {
        /// <summary>
        /// Description of the combining-marks classification tables: three table addresses,
        /// each followed by the integer sizes that go with it.
        /// This structure has a clone on the unmanaged side. Any change made to this
        /// structure must be mirrored on the unmanaged side too, which is why the layout
        /// is sequential and the field order must not be altered.
        /// </summary>
        [StructLayout(LayoutKind.Sequential)]
        internal unsafe struct CombiningMarksClassificationData
        {
            internal IntPtr CombiningCharsIndexes; // Two-dimensional array of base char classes
            internal int    CombiningCharsIndexesTableLength;
            internal int    CombiningCharsIndexesTableSegmentLength;

            internal IntPtr CombiningMarkIndexes; // Combining mark classes array (NOTE(review): its length is presumably CombiningMarkIndexesTableLength - confirm)
            internal int    CombiningMarkIndexesTableLength;

            internal IntPtr CombinationChars; // Two-dimensional array of combined characters
            internal int    CombinationCharsBaseCount;
            internal int    CombinationCharsMarkCount;
        }

        /// <summary>
        /// Set of raw table addresses filled in by MILGetClassificationTables and consumed
        /// by the static constructor of this class.
        /// This structure has a clone on the unmanaged side. Any change made to this
        /// structure must be mirrored on the unmanaged side too.
        /// </summary>
        [StructLayout(LayoutKind.Sequential)]
        internal unsafe struct RawClassificationTables
        {
            internal IntPtr UnicodeClasses;
            internal IntPtr CharacterAttributes;
            internal IntPtr Mirroring;
            internal CombiningMarksClassificationData CombiningMarksClassification;
        };
 
        /// <summary>
        /// Native entry point that fills <paramref name="ct"/> with the addresses of the
        /// classification tables.
        /// </summary>
        /// <remarks>
        /// Critical - as this code performs an elevation.
        /// Safe - This is an entry point that grabs several RO shared pages for text layout
        /// purpose. Reading it directly won't yield any readable content to the reader, nor
        /// will it yield any useful information to outside world as these are just loads of
        /// Unicode classification hex data that only Text engine knows how to use - hence,
        /// not an Information Disclosure threat.
        /// </remarks>
        [SecurityCritical, SecurityTreatAsSafe]
        [SuppressUnmanagedCodeSecurity]
        [DllImport(Microsoft.Internal.DllImport.PresentationNative, EntryPoint="MILGetClassificationTables")]
        internal static extern void MILGetClassificationTables(out RawClassificationTables ct);
        /// <summary>
        /// Type initializer: asks the native side for the classification tables once and
        /// stores each returned value in its static field.
        /// </summary>
        /// <remarks>
        ///    Critical: This accesses unsafe code and retrieves pointers that it stores locally.
        ///    The pointers retrieved are not validated for correctness and they are later dereferenced.
        ///    TreatAsSafe: The constructor is safe since it simply stores these pointers, and
        ///    the stored pointers are not settable from outside.
        /// </remarks>
        [SecurityCritical,SecurityTreatAsSafe]
        static Classification()
        {
            unsafe
            {
                // The native call fills in every member; no prior initialization is needed for an out argument.
                RawClassificationTables tables;
                MILGetClassificationTables(out tables);

                _unicodeClassTable = new SecurityCriticalData(tables.UnicodeClasses);
                _charAttributeTable = new SecurityCriticalData(tables.CharacterAttributes);
                _mirroredCharTable = new SecurityCriticalData(tables.Mirroring);
                _combiningMarksClassification = new SecurityCriticalData(tables.CombiningMarksClassification);
            }
        }

        /// <summary>
        /// Looks up the Unicode character class of a single UTF16 code unit, using only
        /// entry 0 of the class table.
        /// </summary>
        /// <param name="codepoint">UTF16 code unit to classify.</param>
        /// <returns>The character class of <paramref name="codepoint"/>.</returns>
        /// <remarks>
        ///    A table slot whose value is below UnicodeClass.Max is not an address: it is the
        ///    class shared by the whole page and is returned as is. Otherwise the slot points
        ///    to a page that is indexed with the low byte of the code unit.
        ///    Critical: This accesses unsafe code and dereferences a location in
        ///    a prepopulated array. The risk is that a bogus memory location is dereferenced.
        ///    TreatAsSafe: The code unit is reduced to a page number and an offset within the
        ///    page, each one of 256 possible values. Also this information is ok to expose.
        /// </remarks>
        [SecurityCritical, SecurityTreatAsSafe]
        static public short GetUnicodeClassUTF16(char codepoint)
        {
            unsafe
            {
                short** pages = UnicodeClassTable[0];

                // Entry 0 must be a real page table, never an inlined class value.
                Invariant.Assert((long)pages >= (long)UnicodeClass.Max);

                short* page = pages[codepoint >> 8];
                if ((long)page < (long)UnicodeClass.Max)
                {
                    // The slot carries the class itself instead of an address.
                    return (short)page;
                }

                return page[codepoint & 0xFF];
            }
        }


        /// <summary>
        /// Looks up the Unicode character class of a Unicode scalar value.
        /// </summary>
        /// <param name="unicodeScalar">Scalar value in the range 0 .. 0x10FFFF (asserted).</param>
        /// <returns>The character class of <paramref name="unicodeScalar"/>.</returns>
        /// <remarks>
        ///    The table is walked in up to three steps: plane, page within the plane, entry
        ///    within the page. At the first two steps a slot value below UnicodeClass.Max is
        ///    the class itself rather than an address, and is returned immediately.
        ///    Critical: This accesses unsafe code and dereferences a pointer retrieved from unmanaged code.
        ///    TreatAsSafe: There is bounds checking in place and this dereferences a valid structure which
        ///    is guaranteed to be populated.
        /// </remarks>
        [SecurityCritical,SecurityTreatAsSafe]
        static public short GetUnicodeClass(int unicodeScalar)
        {
            unsafe
            {
                Invariant.Assert(unicodeScalar >= 0 && unicodeScalar <= 0x10FFFF);

                int planeIndex = ((unicodeScalar >> 16) & 0xFF) % 17;
                short** plane = UnicodeClassTable[planeIndex];
                if ((long)plane < (long)UnicodeClass.Max)
                {
                    // Whole plane shares one class.
                    return (short)plane;
                }

                int pageIndex = (unicodeScalar & 0xFFFF) >> 8;
                short* page = plane[pageIndex];
                if ((long)page < (long)UnicodeClass.Max)
                {
                    // Whole page shares one class.
                    return (short)page;
                }

                return page[unicodeScalar & 0xFF];
            }
        }
 

        /// <summary>
        /// Computes the Unicode scalar value found at the start of a UTF16 character range.
        /// </summary>
        /// <param name="unicodeString">Character range to read; must be backed by a buffer and be non-empty (asserted).</param>
        /// <param name="sizeofChar">Number of UTF16 code units consumed: 2 for a surrogate pair, 1 otherwise.</param>
        /// <returns>
        /// The combined scalar value when the range starts with a high surrogate followed by a
        /// low surrogate; otherwise the first code unit unchanged.
        /// </returns>
        static internal int UnicodeScalar(
            CharacterBufferRange unicodeString,
            out int              sizeofChar
            )
        {
            Invariant.Assert(unicodeString.CharacterBuffer != null && unicodeString.Length > 0);

            int lead = unicodeString[0];
            sizeofChar = 1;

            // Not enough characters for a pair, or the first one is not a high surrogate.
            if (unicodeString.Length < 2 || (lead & 0xFC00) != 0xD800)
            {
                return lead;
            }

            int trail = unicodeString[1];
            if ((trail & 0xFC00) != 0xDC00)
            {
                // High surrogate without a matching low surrogate: returned as is.
                return lead;
            }

            sizeofChar = 2;
            return (((lead & 0x03FF) << 10) | (trail & 0x3FF)) + 0x10000;
        }

 
        /// <summary>
        /// Checks whether the character is a combining mark, that is whether its item class
        /// is SimpleMarkClass or ComplexMarkClass.
        /// </summary>
        /// <param name="unicodeScalar">Unicode scalar value to test.</param>
        /// <returns>True when the character is a combining mark.</returns>
        /// <remarks>
        ///    Critical: This code reads the character attribute table through a raw pointer and
        ///    indexes it with the class returned by GetUnicodeClass without a further check.
        ///    TreatAsSafe: This information is safe to expose, and GetUnicodeClass asserts that
        ///    unicodeScalar is within bounds.
        /// </remarks>
        [SecurityCritical,SecurityTreatAsSafe]
        static public bool IsCombining(int unicodeScalar)
        {
            unsafe
            {
                CharacterAttribute* attributes = Classification.CharAttributeTable;
                byte cls = attributes[GetUnicodeClass(unicodeScalar)].ItemClass;

                if (cls == (byte)ItemClass.SimpleMarkClass)
                {
                    return true;
                }

                return cls == (byte)ItemClass.ComplexMarkClass;
            }
        }
 
        /// <summary>
        /// Checks whether the character is a joiner character, that is whether its item class
        /// is JoinerClass.
        /// </summary>
        /// <param name="unicodeScalar">Unicode scalar value to test.</param>
        /// <returns>True when the character is a joiner.</returns>
        /// <remarks>
        ///    Critical: This code reads the character attribute table through a raw pointer and
        ///    indexes it with the class returned by GetUnicodeClass without a further check.
        ///    TreatAsSafe: This information is safe to expose, and GetUnicodeClass asserts that
        ///    unicodeScalar is within bounds.
        /// </remarks>
        [SecurityCritical,SecurityTreatAsSafe]
        static public bool IsJoiner(int unicodeScalar)
        {
            unsafe
            {
                CharacterAttribute* attributes = Classification.CharAttributeTable;
                byte cls = attributes[GetUnicodeClass(unicodeScalar)].ItemClass;

                return cls == (byte)ItemClass.JoinerClass;
            }
        }
 
        /// <summary>
        /// Scans a UTF16 character string until a character whose attribute flags intersect
        /// <paramref name="mask"/> is found.
        /// </summary>
        /// <param name="charBuffer">Buffer holding the characters.</param>
        /// <param name="offsetToFirstChar">Index in <paramref name="charBuffer"/> of the first character to examine.</param>
        /// <param name="stringLength">Maximum number of characters to examine.</param>
        /// <param name="mask">Flags that stop the scan.</param>
        /// <param name="charFlags">Union of the flags of every character skipped before the scan stopped.</param>
        /// <returns>
        /// Index, relative to <paramref name="offsetToFirstChar"/>, of the first character matching the mask,
        /// or <paramref name="stringLength"/> when no character matches.
        /// </returns>
        /// <remarks>
        ///    Critical: This code reads the character attribute table through a raw pointer and
        ///    keeps reading the buffer without validating the offset and length passed in.
        ///    TreatAsSafe: This information is safe to expose, as in the worst case it tells you
        ///    where the next matching UTF16 character is. Also a character buffer can be built from one of three
        ///    sources: a string, a char array or an unmanaged char*. The third case is critical and tightly
        ///    controlled, so the risk of a bogus length is significantly mitigated.
        /// </remarks>
        [SecurityCritical,SecurityTreatAsSafe]
        static public int AdvanceUntilUTF16(
            CharacterBuffer     charBuffer,
            int                 offsetToFirstChar,
            int                 stringLength,
            ushort              mask,
            out ushort          charFlags
            )
        {
            int end = offsetToFirstChar + stringLength;
            int index;

            charFlags = 0;

            for (index = offsetToFirstChar; index < end; index++)
            {
                unsafe
                {
                    int charClass = (int)GetUnicodeClassUTF16(charBuffer[index]);
                    ushort flags = (ushort)Classification.CharAttributeTable[charClass].Flags;

                    if ((flags & mask) != 0)
                    {
                        // The matching character's own flags are not accumulated.
                        break;
                    }

                    charFlags |= flags;
                }
            }

            return index - offsetToFirstChar;
        }

        /// <summary>
        /// Scans a character string until a character that is not of the specified ItemClass is found.
        /// Surrogate pairs are read as one character.
        /// </summary>
        /// <param name="unicodeString">Character range to scan.</param>
        /// <param name="itemClass">Item class the scanned characters are expected to have.</param>
        /// <returns>
        /// Index, in UTF16 code units, of the first character that is not of the specified ItemClass,
        /// or the length of the range when every character is of that class.
        /// </returns>
        /// <remarks>
        ///    Critical: This code reads the character attribute table through a raw pointer and
        ///    indexes it with the class returned by GetUnicodeClass.
        ///    TreatAsSafe: This code only exposes the index at which a run of one item class ends,
        ///    which is ok to expose.
        /// </remarks>
        [SecurityCritical,SecurityTreatAsSafe]
        static public int AdvanceWhile(
            CharacterBufferRange unicodeString,
            ItemClass            itemClass
            )
        {
            int length = unicodeString.Length;
            int position = 0;

            while (position < length)
            {
                int charSize;
                CharacterBufferRange remaining = new CharacterBufferRange(unicodeString, position, length - position);
                int scalar = Classification.UnicodeScalar(remaining, out charSize);

                unsafe
                {
                    byte cls = (byte)Classification.CharAttributeTable[(int)GetUnicodeClass(scalar)].ItemClass;
                    if (cls != (byte)itemClass)
                    {
                        return position;
                    }
                }

                position += charSize;
            }

            return position;
        }
 
        /// <summary>
        /// The stored Unicode class table value, viewed as a three-level pointer to short.
        /// </summary>
        /// <remarks>
        ///    Critical: This accesses unsafe code and returns a pointer
        /// </remarks>
        private static unsafe short*** UnicodeClassTable
        {
            [SecurityCritical]
            get
            {
                return (short***)_unicodeClassTable.Value;
            }
        }
        /// <summary>
        /// The stored character attribute table value, viewed as a pointer to CharacterAttribute records.
        /// </summary>
        /// <remarks>
        ///    Critical: This accesses unsafe code and returns a pointer
        /// </remarks>
        private static unsafe CharacterAttribute* CharAttributeTable
        {
            [SecurityCritical]
            get
            {
                return (CharacterAttribute*)_charAttributeTable.Value;
            }
        }

        /// <summary>
        /// Returns the character attribute record of a character class.
        /// </summary>
        /// <param name="charClass">Character class; asserted to be non-negative and below UnicodeClass.Max.</param>
        /// <returns>A copy of the attribute record stored for <paramref name="charClass"/>.</returns>
        /// <remarks>
        ///    Critical: This accesses unsafe code and indexes into an array
        ///    Safe    : This method does a bound check on the input char class.
        /// </remarks>
        [SecurityCritical, SecurityTreatAsSafe]
        internal static CharacterAttribute CharAttributeOf(int charClass)
        {
            bool outOfRange = charClass < 0 || charClass >= (int)UnicodeClass.Max;
            Invariant.Assert(!outOfRange);

            unsafe
            {
                CharacterAttribute* attributes = CharAttributeTable;
                return attributes[charClass];
            }
        }
 
        // Values captured once by the static constructor from the structure filled in by
        // MILGetClassificationTables; they are never reassigned afterwards.
        // NOTE(review): SecurityCriticalData appears here without a type argument; the
        // original declaration is presumably generic and the argument was lost in this listing - confirm.

        // Holds RawClassificationTables.UnicodeClasses; read through UnicodeClassTable.
        static private readonly SecurityCriticalData  _unicodeClassTable;
        // Holds RawClassificationTables.CharacterAttributes; read through CharAttributeTable.
        static private readonly SecurityCriticalData _charAttributeTable;
        // Holds RawClassificationTables.Mirroring; not read anywhere in this class.
        static private readonly SecurityCriticalData _mirroredCharTable;
        // Holds RawClassificationTables.CombiningMarksClassification; not read anywhere in this class.
        static private readonly SecurityCriticalData _combiningMarksClassification;
    } 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.


                        

Link Menu

Network programming in C#, Network Programming in VB.NET, Network Programming in .NET
This book is available now!
Buy at Amazon US or
Buy at Amazon UK