00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_Text_Unicode_h
00025 #define Tanl_Text_Unicode_h
00026
00027 #include "text.h"
00028
00029 namespace Tanl {
00030 namespace Text {
00031 namespace Unicode {
00032
/**
 * Bit masks and composite codes used to classify a character.
 *
 * Layout, as established by the values below:
 *  - bits 8..14 : one bit per major class (Letter .. Other);
 *  - bits 4..7  : Uppercase / Lowercase / Titlecase / Whitespace;
 *  - bits 0..3  : a small ordinal that distinguishes the sub-classes of a
 *                 major class. It is NOT a bit set (e.g. Pe is ordinal 3),
 *                 so compare sub-class codes for equality instead of
 *                 testing them with '&'.
 *
 * The two-letter names presumably follow the Unicode General_Category
 * abbreviations -- confirm against the table generator.
 *
 * Several sub-class codes are numerically equal to their bare class bit
 * (Lo == Letter, Mn == Mark, Pc == Punctuation, Sm == Symbol, Cc == Other).
 */
enum Masks
{
  // Major classes: one bit each.
  Letter      = 1 << 8,
  Number      = 1 << 9,
  Mark        = 1 << 10,
  Punctuation = 1 << 11,
  Symbol      = 1 << 12,
  Separator   = 1 << 13,
  Other       = 1 << 14,

  // Property bits.
  Uppercase   = 1 << 4,
  Lowercase   = 1 << 5,
  Titlecase   = 1 << 6,
  Whitespace  = 1 << 7,

  // Letters.
  Lu = Letter | Uppercase,
  Ll = Letter | Lowercase,
  Lt = Letter | Titlecase,
  // NOTE(review): 0x0080 is the same bit as Whitespace, so a value of Lm
  // also satisfies (flags & Whitespace). Confirm whether this is intended.
  Lm = Letter | 0x0080,
  Lo = Letter,

  // Numbers.
  Nd = Number | 0x1,
  No = Number | 0x2,
  Nl = Number | Letter,

  // Marks.
  Mn = Mark,
  Mc = Mark | 0x1,
  Me = Mark | 0x2,

  // Punctuation.
  Pc = Punctuation,
  Pd = Punctuation | 0x1,
  Ps = Punctuation | 0x2,
  Pe = Punctuation | 0x3,
  Pi = Punctuation | 0x4,
  Pf = Punctuation | 0x5,
  Po = Punctuation | 0x6,

  // Symbols.
  Sm = Symbol,
  Sc = Symbol | 0x1,
  Sk = Symbol | 0x2,
  So = Symbol | 0x3,

  // Separators.
  Zs = Separator | 0x1,
  Zl = Separator | 0x2,
  Zp = Separator | 0x4,

  // Other.
  Cc = Other,
  Cf = Other | 0x1,
  Cs = Other | 0x2,
  Co = Other | 0x3,
  Cn = Other | 0x4
};
00123
/**
 * Dense character category codes, numbered contiguously from 0 to 29.
 *
 * The names presumably mirror the Unicode General_Category values (the
 * trailing comments give the matching abbreviation) -- confirm against the
 * table generator. Enumerators rely on implicit sequential numbering, so
 * their order must not be changed.
 */
enum Category
{
  UppercaseLetter = 0,        //  0  (Lu)
  LowercaseLetter,            //  1  (Ll)
  TitlecaseLetter,            //  2  (Lt)
  ModifierLetter,             //  3  (Lm)
  OtherLetter,                //  4  (Lo)
  NonSpacingMark,             //  5  (Mn)
  SpaceCombiningMark,         //  6  (Mc)
  EnclosingMark,              //  7  (Me)
  DecimalDigitNumber,         //  8  (Nd)
  LetterNumber,               //  9  (Nl)
  OtherNumber,                // 10  (No)
  SpaceSeparator,             // 11  (Zs)
  LineSeparator,              // 12  (Zl)
  ParagraphSeparator,         // 13  (Zp)
  Control,                    // 14  (Cc)
  Format,                     // 15  (Cf)
  Surrogate,                  // 16  (Cs)
  PrivateUse,                 // 17  (Co)
  ConnectorPunctuation,       // 18  (Pc)
  DashPunctuation,            // 19  (Pd)
  OpenPunctuation,            // 20  (Ps)
  ClosePunctuation,           // 21  (Pe)
  InitialQuotePunctuation,    // 22  (Pi)
  FinalQuotePunctuation,      // 23  (Pf)
  OtherPunctuation,           // 24  (Po)
  MathSymbol,                 // 25  (Sm)
  CurrencySymbol,             // 26  (Sc)
  ModifierSymbol,             // 27  (Sk)
  OtherSymbol,                // 28  (So)
  OtherNotAssigned            // 29  (Cn)
};
00157
/**
 * Identifier for a named Unicode block.
 *
 * Codes are dense: NoBlock is 0, every named block gets the next integer,
 * and BlockCount is the number of codes (including NoBlock).
 *
 * Enumerators use implicit sequential numbering on purpose. The previous
 * hand-written values assigned 56 to both MathematicalOperators and
 * MiscellaneousTechnical, so the two blocks were indistinguishable and
 * BlockCount was one less than the number of entries. With implicit
 * numbering every block has a unique code; MiscellaneousTechnical and all
 * later blocks are one higher than before.
 *
 * NOTE(review): BlockTable is presumably indexed by BlockCode -- confirm
 * that its entries are in this same order. Do not reorder or insert
 * enumerators without updating that table.
 */
enum BlockCode {

  NoBlock = 0,
  BasicLatin,                             // 1
  Latin1Supplement,
  LatinExtendedA,
  LatinExtendedB,
  IPAExtensions,
  SpacingModifierLetters,
  CombiningDiacriticalMarks,
  Greek,
  Cyrillic,
  CyrillicSupplement,                     // 10
  Armenian,
  Hebrew,
  Arabic,
  Syriac,
  Thaana,
  Devanagari,
  Bengali,
  Gurmukhi,
  Gujarati,
  Oriya,                                  // 20
  Tamil,
  Telugu,
  Kannada,
  Malayalam,
  Sinhala,
  Thai,
  Lao,
  Tibetan,
  Myanmar,
  Georgian,                               // 30
  HangulJamo,
  Ethiopic,
  Cherokee,
  UnifiedCanadianAboriginalSyllabics,
  Ogham,
  Runic,
  Tagalog,
  Hanunoo,
  Buhid,
  Tagbanwa,                               // 40
  Khmer,
  Mongolian,
  Limbu,
  TaiLe,
  KhmerSymbols,
  PhoneticExtensions,
  LatinExtendedAdditional,
  GreekExtended,
  GeneralPunctuation,
  SuperscriptsAndSubscripts,              // 50
  CurrencySymbols,
  CombiningDiacriticalMarksForSymbols,
  LetterlikeSymbols,
  NumberForms,
  Arrows,
  MathematicalOperators,                  // 56
  MiscellaneousTechnical,                 // 57 (previously collided with 56)
  ControlPictures,
  OpticalCharacterRecognition,
  EnclosedAlphanumerics,                  // 60
  BoxDrawing,
  BlockElements,
  GeometricShapes,
  MiscellaneousSymbols,
  Dingbats,
  MiscellaneousMathematicalSymbolsA,
  SupplementalArrowsA,
  BraillePatterns,
  SupplementalArrowsB,
  MiscellaneousMathematicalSymbolsB,      // 70
  SupplementalMathematicalOperators,
  MiscellaneousSymbolsAndArrows,
  CJKRadicalsSupplement,
  KangxiRadicals,
  IdeographicDescriptionCharacters,
  CJKSymbolsAndPunctuation,
  Hiragana,
  Katakana,
  Bopomofo,
  HangulCompatibilityJamo,                // 80
  Kanbun,
  BopomofoExtended,
  KatakanaPhoneticExtensions,
  EnclosedCJKLettersAndMonths,
  CJKCompatibility,
  CJKUnifiedIdeographsExtensionA,
  YijingHexagramSymbols,
  CJKUnifiedIdeographs,
  YiSyllables,
  YiRadicals,                             // 90
  HangulSyllables,
  HighSurrogates,
  HighPrivateUseSurrogates,
  LowSurrogates,
  PrivateUseArea,
  CJKCompatibilityIdeographs,
  AlphabeticPresentationForms,
  ArabicPresentationFormsA,
  VariationSelectors,
  CombiningHalfMarks,                     // 100
  CJKCompatibilityForms,
  SmallFormVariants,
  ArabicPresentationFormsB,
  HalfwidthAndFullwidthForms,
  Specials,
  LinearBSyllabary,
  LinearBIdeograms,
  AegeanNumbers,
  OldItalic,
  Gothic,                                 // 110
  Ugaritic,
  Deseret,
  Shavian,
  Osmanya,
  CypriotSyllabary,
  ByzantineMusicalSymbols,
  MusicalSymbols,
  TaiXuanJingSymbols,
  MathematicalAlphanumericSymbols,
  CJKUnifiedIdeographsExtensionB,         // 120
  CJKCompatibilityIdeographsSupplement,
  Tags,
  VariationSelectorsSupplement,
  SupplementaryPrivateUseAreaA,
  SupplementaryPrivateUseAreaB,           // 125
  BlockCount                              // 126: number of codes, not a block
};
00291
00292 struct BlockInfo
00293 {
00294 UCS4 first;
00295 UCS4 last;
00296 char const* name;
00297 };
00298
00299 extern BlockInfo BlockTable[];
00300
00304 BlockInfo& Block(UCS2 uc);
00305
00309 BlockCode GetBlockCode(UCS2 uc);
00310
00311 struct UnicodeInfo
00312 {
00313 short flags;
00314 short category;
00315
00316 UCS2* decomp;
00317
00318 int decval;
00319 UCS2 upcase;
00320 UCS2 downcase;
00321 UCS2 titlecase;
00322 };
00323
00324 extern UnicodeInfo UnicodeTable[65536];
00325
00329 extern UCS2 decompositionMap[];
00330
00331 }
00332 }
00333 }
00334
00335 #endif //Tanl_Text_Unicode_h