DeSR Dependency Parser |
Classes | |
struct | BlockLess |
struct | BlockInfo |
struct | UnicodeInfo |
Typedefs | |
typedef unsigned char | byte |
Enumerations | |
enum | Masks { Letter = 0x0100, Number = 0x0200, Mark = 0x0400, Punctuation = 0x0800, Symbol = 0x1000, Separator = 0x2000, Other = 0x4000, Uppercase = 0x0010, Lowercase = 0x0020, Titlecase = 0x0040, Whitespace = 0x0080, Lu = 0x0110, Ll = 0x0120, Lt = 0x0140, Lm = 0x0180, Lo = 0x0100, Nd = 0x0201, No = 0x0202, Nl = 0x0300, Mn = 0x0400, Mc = 0x0401, Me = 0x0402, Pc = 0x0800, Pd = 0x0801, Ps = 0x0802, Pe = 0x0803, Pi = 0x0804, Pf = 0x0805, Po = 0x0806, Sm = 0x1000, Sc = 0x1001, Sk = 0x1002, So = 0x1003, Zs = 0x2001, Zl = 0x2002, Zp = 0x2004, Cc = 0x4000, Cf = 0x4001, Cs = 0x4002, Co = 0x4003, Cn = 0x4004 } |
enum | Category { UppercaseLetter = 0, LowercaseLetter = 1, TitlecaseLetter = 2, ModifierLetter = 3, OtherLetter = 4, NonSpacingMark = 5, SpaceCombiningMark = 6, EnclosingMark = 7, DecimalDigitNumber = 8, LetterNumber = 9, OtherNumber = 10, SpaceSeparator = 11, LineSeparator = 12, ParagraphSeparator = 13, Control = 14, Format = 15, Surrogate = 16, PrivateUse = 17, ConnectorPunctuation = 18, DashPunctuation = 19, OpenPunctuation = 20, ClosePunctuation = 21, InitialQuotePunctuation = 22, FinalQuotePunctuation = 23, OtherPunctuation = 24, MathSymbol = 25, CurrencySymbol = 26, ModifierSymbol = 27, OtherSymbol = 28, OtherNotAssigned = 29 } |
enum | BlockCode { NoBlock = 0, BasicLatin = 1, Latin1Supplement = 2, LatinExtendedA = 3, LatinExtendedB = 4, IPAExtensions = 5, SpacingModifierLetters = 6, CombiningDiacriticalMarks = 7, Greek = 8, Cyrillic = 9, CyrillicSupplement = 10, Armenian = 11, Hebrew = 12, Arabic = 13, Syriac = 14, Thaana = 15, Devanagari = 16, Bengali = 17, Gurmukhi = 18, Gujarati = 19, Oriya = 20, Tamil = 21, Telugu = 22, Kannada = 23, Malayalam = 24, Sinhala = 25, Thai = 26, Lao = 27, Tibetan = 28, Myanmar = 29, Georgian = 30, HangulJamo = 31, Ethiopic = 32, Cherokee = 33, UnifiedCanadianAboriginalSyllabics = 34, Ogham = 35, Runic = 36, Tagalog = 37, Hanunoo = 38, Buhid = 39, Tagbanwa = 40, Khmer = 41, Mongolian = 42, Limbu = 43, TaiLe = 44, KhmerSymbols = 45, PhoneticExtensions = 46, LatinExtendedAdditional = 47, GreekExtended = 48, GeneralPunctuation = 49, SuperscriptsAndSubscripts = 50, CurrencySymbols = 51, CombiningDiacriticalMarksForSymbols = 52, LetterlikeSymbols = 53, NumberForms = 54, Arrows = 55, MathematicalOperators = 56, MiscellaneousTechnical = 56, ControlPictures = 57, OpticalCharacterRecognition = 58, EnclosedAlphanumerics = 59, BoxDrawing = 60, BlockElements = 61, GeometricShapes = 62, MiscellaneousSymbols = 63, Dingbats = 64, MiscellaneousMathematicalSymbolsA = 65, SupplementalArrowsA = 66, BraillePatterns = 67, SupplementalArrowsB = 68, MiscellaneousMathematicalSymbolsB = 69, SupplementalMathematicalOperators = 70, MiscellaneousSymbolsAndArrows = 71, CJKRadicalsSupplement = 72, KangxiRadicals = 73, IdeographicDescriptionCharacters = 74, CJKSymbolsAndPunctuation = 75, Hiragana = 76, Katakana = 77, Bopomofo = 78, HangulCompatibilityJamo = 79, Kanbun = 80, BopomofoExtended = 81, KatakanaPhoneticExtensions = 82, EnclosedCJKLettersAndMonths = 83, CJKCompatibility = 84, CJKUnifiedIdeographsExtensionA = 85, YijingHexagramSymbols = 86, CJKUnifiedIdeographs = 87, YiSyllables = 88, YiRadicals = 89, HangulSyllables = 90, HighSurrogates = 91, HighPrivateUseSurrogates = 92, 
LowSurrogates = 93, PrivateUseArea = 94, CJKCompatibilityIdeographs = 95, AlphabeticPresentationForms = 96, ArabicPresentationFormsA = 97, VariationSelectors = 98, CombiningHalfMarks = 99, CJKCompatibilityForms = 100, SmallFormVariants = 101, ArabicPresentationFormsB = 102, HalfwidthAndFullwidthForms = 103, Specials = 104, LinearBSyllabary = 105, LinearBIdeograms = 106, AegeanNumbers = 107, OldItalic = 108, Gothic = 109, Ugaritic = 110, Deseret = 111, Shavian = 112, Osmanya = 113, CypriotSyllabary = 114, ByzantineMusicalSymbols = 115, MusicalSymbols = 116, TaiXuanJingSymbols = 117, MathematicalAlphanumericSymbols = 118, CJKUnifiedIdeographsExtensionB = 119, CJKCompatibilityIdeographsSupplement = 120, Tags = 121, VariationSelectorsSupplement = 122, SupplementaryPrivateUseAreaA = 123, SupplementaryPrivateUseAreaB = 124, BlockCount } |
Constants for Unicode blocks. | |
Functions | |
BlockInfo & | Block (UCS2 uc) |
Return the code block to which the character uc belongs. | |
BlockCode | GetBlockCode (UCS2 uc) |
Return the code for the block to which the character uc belongs. | |
void | incUtfPtr (const char *&ptr, const char *endptr) |
void | decUtfPtr (const char *&ptr, const char *beginptr) |
size_t | utfDiff (const char *end, const char *begin) |
bool | isAscii (const char *begin, const char *end) |
UCS2 | fetchChar (const char *&it, const char *end) |
int | byteLength (const char *it) |
int | utfLength (UCS2 ch) |
Returns the number of bytes a Unicode character needs. | |
int | utfLength (const UCS2 *begin, const UCS2 *end) |
returns the number of bytes a unicode string needs | |
int | ucLength (const UCS2 *ch) |
returns the length of a 0-terminated Unicode character string | |
int | uc4Length (const UCS4 *ch) |
returns the length of a 0-terminated UCS4 stream | |
int | ToUtf8 (byte *&it, byte *end, UCS2 ucc) |
writes a Unicode char into the given char buffer. | |
int | ToUtf8 (byte *&it, const size_t max, const UCS2 *ucbegin, const size_t len) |
UCS2 * | ToUc (UCS2 *buffer, const char *source, int length) |
converts a char stream to a UCS2 stream. | |
UCS2 * | ToUc (UCS2 *buffer, const UCS4 *source, int length=-1) |
converts a uc4 character stream to a UCS2 stream. | |
UCS2 * | ToUc (UCS2 *buffer, const wchar_t *source, int length=-1) |
converts a wchar character stream to a UCS2 stream. | |
UCS2 * | utf8ToUc (UCS2 *buffer, const char *source, int length=-1) |
Convert a UTF-8 stream into Unicode UCS2. | |
UCS2 | toChar (const char *begin, const char *end) |
int | uc2Length (const UCS2 *ch) |
int | ToUtf8 (byte *it, const size_t max, const UCS2 *ucbegin, const size_t len) |
Writes the given Unicode character range into a char buffer range. | |
size_t | stringLength (const char *str) |
size_t | stringLength (const UCS2 *str) |
size_t | stringLength (const UCS4 *str) |
template<typename ToCharType , typename FromCharType > | |
ToCharType * | convertTo (ToCharType *buffer, const FromCharType *source, int length=-1) |
Variables | |
UCS2 | decompositionMap [] |
Decomposition sequences. | |
UnicodeInfo | UnicodeTable [] |
This table is generated from "UnicodeData.txt" by means of mkUnicodeTable.pl. | |
BlockInfo | BlockTable [] |
Category Flags
Lu = Letter, Uppercase; Ll = Letter, Lowercase; Lt = Letter, Titlecase; Lm = Letter, Modifier; Lo = Letter, Other; Mn = Mark, Non-Spacing; Mc = Mark, Spacing Combining; Me = Mark, Enclosing; Nd = Number, Decimal Digit; Nl = Number, Letter; No = Number, Other; Pc = Punctuation, Connector; Pd = Punctuation, Dash; Ps = Punctuation, Open; Pe = Punctuation, Close; Pi = Punctuation, Initial quote; Pf = Punctuation, Final quote; Po = Punctuation, Other; Sm = Symbol, Math; Sc = Symbol, Currency; Sk = Symbol, Modifier; So = Symbol, Other; Zs = Separator, Space; Zl = Separator, Line; Zp = Separator, Paragraph; Cc = Other, Control; Cf = Other, Format; Cs = Other, Surrogate; Co = Other, Private Use; Cn = Other, Not Assigned
UCS2 * Tanl::Text::Unicode::ToUc | ( | UCS2 * | buffer, | |
const wchar_t * | source, | |||
int | length = -1 | |||
) |
converts a wchar character stream to a UCS2 stream.
wchar chars that cannot be mapped are mapped to 0xFFFF.
buffer must have enough space to hold length + 1 chars.
buffer | target to write | |
source | 0-terminated wchar_t stream | |
length | length of source, -1 if source is a 0-terminated wchar_t string |
Definition at line 214 of file Utf8Utils.cpp.
UCS2 * Tanl::Text::Unicode::ToUc | ( | UCS2 * | buffer, | |
const UCS4 * | source, | |||
int | length = -1 | |||
) |
converts a uc4 character stream to a UCS2 stream.
uc4 chars that cannot be mapped are mapped to 0xFFFF.
buffer must have enough space to hold length + 1 chars.
buffer | target to write | |
source | 0 terminated UCS4 stream | |
length | length of source, -1 if 0 terminated uc4 string |
Definition at line 202 of file Utf8Utils.cpp.
References uc4Length().
UCS2 * Tanl::Text::Unicode::ToUc | ( | UCS2 * | buffer, | |
const char * | source, | |||
int | length | |||
) |
converts a char stream to a UCS2 stream.
buffer must have enough space to hold length + 1 chars.
buffer | target to write | |
source | 0 terminated char string | |
length | length of source, -1 if 0 terminated string |
Definition at line 190 of file Utf8Utils.cpp.
int Tanl::Text::Unicode::ToUtf8 | ( | byte * | it, | |
const size_t | max, | |||
const UCS2 * | ucbegin, | |||
const size_t | len | |||
) |
Writes the given Unicode character range into a char buffer range.
it | start position to write. | |
max | size of buffer, if 0 no checking for end is done. |
int Tanl::Text::Unicode::ToUtf8 | ( | byte *& | it, | |
byte * | end, | |||
UCS2 | ucc | |||
) |
writes a Unicode char into the given char buffer.
it | start position to write. After the call, it will point to next writing position. | |
end | end position in buffer, if 0 no checking for end is done. |
Definition at line 150 of file Utf8Utils.cpp.
References utfLength().
Decomposition sequences.
Provides the decomposition mapping for those characters which have it.
Definition at line 34 of file Unicode.cpp.
UnicodeInfo Tanl::Text::Unicode::UnicodeTable |
This table is generated from "UnicodeData.txt" by means of mkUnicodeTable.pl.
The latest version of the file can be found at the following URL:
http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
Definition at line 1418 of file Unicode.cpp.
Referenced by Tanl::Text::Char::IsPunctuation(), Tanl::Text::Char::IsSpace(), and Tanl::Text::Char::IsWhitespace().