Tanl::Text::Unicode Namespace Reference

Utilities to handle UTF-8 strings. More...

Classes

struct BlockLess

struct BlockInfo

struct UnicodeInfo

Typedefs

typedef unsigned char byte

Enumerations

enum Masks {
  Letter = 0x0100, Number = 0x0200, Mark = 0x0400, Punctuation = 0x0800,
  Symbol = 0x1000, Separator = 0x2000, Other = 0x4000, Uppercase = 0x0010,
  Lowercase = 0x0020, Titlecase = 0x0040, Whitespace = 0x0080, Lu = 0x0110,
  Ll = 0x0120, Lt = 0x0140, Lm = 0x0180, Lo = 0x0100,
  Nd = 0x0201, No = 0x0202, Nl = 0x0300, Mn = 0x0400,
  Mc = 0x0401, Me = 0x0402, Pc = 0x0800, Pd = 0x0801,
  Ps = 0x0802, Pe = 0x0803, Pi = 0x0804, Pf = 0x0805,
  Po = 0x0806, Sm = 0x1000, Sc = 0x1001, Sk = 0x1002,
  So = 0x1003, Zs = 0x2001, Zl = 0x2002, Zp = 0x2004,
  Cc = 0x4000, Cf = 0x4001, Cs = 0x4002, Co = 0x4003,
  Cn = 0x4004
}

enum Category {
  UppercaseLetter = 0, LowercaseLetter = 1, TitlecaseLetter = 2, ModifierLetter = 3,
  OtherLetter = 4, NonSpacingMark = 5, SpaceCombiningMark = 6, EnclosingMark = 7,
  DecimalDigitNumber = 8, LetterNumber = 9, OtherNumber = 10, SpaceSeparator = 11,
  LineSeparator = 12, ParagraphSeparator = 13, Control = 14, Format = 15,
  Surrogate = 16, PrivateUse = 17, ConnectorPunctuation = 18, DashPunctuation = 19,
  OpenPunctuation = 20, ClosePunctuation = 21, InitialQuotePunctuation = 22, FinalQuotePunctuation = 23,
  OtherPunctuation = 24, MathSymbol = 25, CurrencySymbol = 26, ModifierSymbol = 27,
  OtherSymbol = 28, OtherNotAssigned = 29
}

enum BlockCode {
  NoBlock = 0, BasicLatin = 1, Latin1Supplement = 2, LatinExtendedA = 3,
  LatinExtendedB = 4, IPAExtensions = 5, SpacingModifierLetters = 6, CombiningDiacriticalMarks = 7,
  Greek = 8, Cyrillic = 9, CyrillicSupplement = 10, Armenian = 11,
  Hebrew = 12, Arabic = 13, Syriac = 14, Thaana = 15,
  Devanagari = 16, Bengali = 17, Gurmukhi = 18, Gujarati = 19,
  Oriya = 20, Tamil = 21, Telugu = 22, Kannada = 23,
  Malayalam = 24, Sinhala = 25, Thai = 26, Lao = 27,
  Tibetan = 28, Myanmar = 29, Georgian = 30, HangulJamo = 31,
  Ethiopic = 32, Cherokee = 33, UnifiedCanadianAboriginalSyllabics = 34, Ogham = 35,
  Runic = 36, Tagalog = 37, Hanunoo = 38, Buhid = 39,
  Tagbanwa = 40, Khmer = 41, Mongolian = 42, Limbu = 43,
  TaiLe = 44, KhmerSymbols = 45, PhoneticExtensions = 46, LatinExtendedAdditional = 47,
  GreekExtended = 48, GeneralPunctuation = 49, SuperscriptsAndSubscripts = 50, CurrencySymbols = 51,
  CombiningDiacriticalMarksForSymbols = 52, LetterlikeSymbols = 53, NumberForms = 54, Arrows = 55,
  MathematicalOperators = 56, MiscellaneousTechnical = 56, ControlPictures = 57, OpticalCharacterRecognition = 58,
  EnclosedAlphanumerics = 59, BoxDrawing = 60, BlockElements = 61, GeometricShapes = 62,
  MiscellaneousSymbols = 63, Dingbats = 64, MiscellaneousMathematicalSymbolsA = 65, SupplementalArrowsA = 66,
  BraillePatterns = 67, SupplementalArrowsB = 68, MiscellaneousMathematicalSymbolsB = 69, SupplementalMathematicalOperators = 70,
  MiscellaneousSymbolsAndArrows = 71, CJKRadicalsSupplement = 72, KangxiRadicals = 73, IdeographicDescriptionCharacters = 74,
  CJKSymbolsAndPunctuation = 75, Hiragana = 76, Katakana = 77, Bopomofo = 78,
  HangulCompatibilityJamo = 79, Kanbun = 80, BopomofoExtended = 81, KatakanaPhoneticExtensions = 82,
  EnclosedCJKLettersAndMonths = 83, CJKCompatibility = 84, CJKUnifiedIdeographsExtensionA = 85, YijingHexagramSymbols = 86,
  CJKUnifiedIdeographs = 87, YiSyllables = 88, YiRadicals = 89, HangulSyllables = 90,
  HighSurrogates = 91, HighPrivateUseSurrogates = 92, LowSurrogates = 93, PrivateUseArea = 94,
  CJKCompatibilityIdeographs = 95, AlphabeticPresentationForms = 96, ArabicPresentationFormsA = 97, VariationSelectors = 98,
  CombiningHalfMarks = 99, CJKCompatibilityForms = 100, SmallFormVariants = 101, ArabicPresentationFormsB = 102,
  HalfwidthAndFullwidthForms = 103, Specials = 104, LinearBSyllabary = 105, LinearBIdeograms = 106,
  AegeanNumbers = 107, OldItalic = 108, Gothic = 109, Ugaritic = 110,
  Deseret = 111, Shavian = 112, Osmanya = 113, CypriotSyllabary = 114,
  ByzantineMusicalSymbols = 115, MusicalSymbols = 116, TaiXuanJingSymbols = 117, MathematicalAlphanumericSymbols = 118,
  CJKUnifiedIdeographsExtensionB = 119, CJKCompatibilityIdeographsSupplement = 120, Tags = 121, VariationSelectorsSupplement = 122,
  SupplementaryPrivateUseAreaA = 123, SupplementaryPrivateUseAreaB = 124, BlockCount
}

Constants for Unicode blocks.

Functions

BlockInfo & Block (UCS2 uc)

Return the code block to which the character uc belongs.

BlockCode GetBlockCode (UCS2 uc)

Return the code for the block to which the character uc belongs.

void incUtfPtr (const char *&ptr, const char *endptr)

void decUtfPtr (const char *&ptr, const char *beginptr)

size_t utfDiff (const char *end, const char *begin)

bool isAscii (const char *begin, const char *end)

UCS2 fetchChar (const char *&it, const char *end)

int byteLength (const char *it)

int utfLength (UCS2 ch)

Returns the number of bytes a Unicode character needs.

int utfLength (const UCS2 *begin, const UCS2 *end)

returns the number of bytes a unicode string needs

int ucLength (const UCS2 *ch)

Returns the length of a 0-terminated Unicode character string.

int uc4Length (const UCS4 *ch)

Returns the length of a 0-terminated UCS4 stream.

int ToUtf8 (byte *&it, byte *end, UCS2 ucc)

writes a Unicode char into the given char buffer.

int ToUtf8 (byte *&it, const size_t max, const UCS2 *ucbegin, const size_t len)

UCS2 * ToUc (UCS2 *buffer, const char *source, int length)

Converts a char stream to a UCS2 stream.

UCS2 * ToUc (UCS2 *buffer, const UCS4 *source, int length=-1)

Converts a UCS4 character stream to a UCS2 stream.

UCS2 * ToUc (UCS2 *buffer, const wchar_t *source, int length=-1)

converts a wchar character stream to an UCS2 stream.

UCS2 * utf8ToUc (UCS2 *buffer, const char *source, int length=-1)

Converts a UTF-8 stream into Unicode UCS2.

UCS2 toChar (const char *begin, const char *end)

int uc2Length (const UCS2 *ch)

int ToUtf8 (byte *it, const size_t max, const UCS2 *ucbegin, const size_t len)

Writes the given Unicode character range into a char buffer range.

size_t stringLength (const char *str)

size_t stringLength (const UCS2 *str)

size_t stringLength (const UCS4 *str)

template<typename ToCharType , typename FromCharType >

ToCharType * convertTo (ToCharType *buffer, const FromCharType *source, int length=-1)

Variables

UCS2 decompositionMap []

Decomposition sequences.

UnicodeInfo UnicodeTable []

This table is generated from "UnicodeData.txt" by means of mkUnicodeTable.pl.

BlockInfo BlockTable []

Detailed Description

Utilities to handle UTF-8 strings.

Enumeration Type Documentation

enum Tanl::Text::Unicode::Masks

See also: http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.html

Category Flags

Lu: Letter, Uppercase; Ll: Letter, Lowercase; Lt: Letter, Titlecase; Lm: Letter, Modifier; Lo: Letter, Other; Mn: Mark, Non-Spacing; Mc: Mark, Spacing Combining; Me: Mark, Enclosing; Nd: Number, Decimal Digit; Nl: Number, Letter; No: Number, Other; Pc: Punctuation, Connector; Pd: Punctuation, Dash; Ps: Punctuation, Open; Pe: Punctuation, Close; Pi: Punctuation, Initial quote; Pf: Punctuation, Final quote; Po: Punctuation, Other; Sm: Symbol, Math; Sc: Symbol, Currency; Sk: Symbol, Modifier; So: Symbol, Other; Zs: Separator, Space; Zl: Separator, Line; Zp: Separator, Paragraph; Cc: Other, Control; Cf: Other, Format; Cs: Other, Surrogate; Co: Other, Private Use; Cn: Other, Not Assigned.

Definition at line 71 of file Unicode.h.

Function Documentation

UCS2 * Tanl::Text::Unicode::ToUc	(	UCS2 *	buffer,
		const wchar_t *	source,
		int	length = `-1`
	)

converts a wchar character stream to an UCS2 stream.

wchar chars that cannot be mapped are mapped to 0xFFFF

buffer must have enough space to get all length + 1 chars

Parameters:

	buffer	target to write
	source	0-terminated wchar_t stream
	length	length of source, -1 if source is a 0-terminated wchar_t string

Returns: the buffer.

Definition at line 214 of file Utf8Utils.cpp.

UCS2 * Tanl::Text::Unicode::ToUc	(	UCS2 *	buffer,
		const UCS4 *	source,
		int	length = `-1`
	)

converts an uc4 character stream to an UCS2 stream.

uc4 chars that cannot be mapped are mapped to 0xFFFF

buffer must have enough space to get all length + 1 chars

Parameters:

	buffer	target to write
	source	0 terminated UCS4 stream
	length	length of source, -1 if 0 terminated uc4 string

Returns: the buffer.

Definition at line 202 of file Utf8Utils.cpp.

References uc4Length().

UCS2 * Tanl::Text::Unicode::ToUc	(	UCS2 *	buffer,
		const char *	source,
		int	length
	)

converts an char stream to an UCS2 stream.

buffer must have enough space to get all length + 1 chars

Parameters:

	buffer	target to write
	source	0 terminated char string
	length	length of source, -1 if 0 terminated string

Returns: the buffer.

Definition at line 190 of file Utf8Utils.cpp.

int Tanl::Text::Unicode::ToUtf8	(	byte *	it,
		const size_t	max,
		const UCS2 *	ucbegin,
		const size_t	len
	)

Writes the given Unicode character range into a char buffer range.

Parameters:

	it	start position to write.
	max	size of buffer, if 0 no checking for end is done.

Returns: the number of bytes written, or -1 on error.

int Tanl::Text::Unicode::ToUtf8	(	byte *&	it,
		byte *	end,
		UCS2	ucc
	)

writes a Unicode char into the given char buffer.

Parameters:

	it	start position to write. After the call, it will point to next writing position.
	end	end position in buffer, if 0 no checking for end is done.

Returns: 0 on success; a value > 0 is the number of chars needed to encode the given Unicode character. When the return value is != 0, it is not changed.

Definition at line 150 of file Utf8Utils.cpp.

References utfLength().

Variable Documentation

UCS2 Tanl::Text::Unicode::decompositionMap

Decomposition sequences.

Provides the decomposition mapping for those characters which have it.

Definition at line 34 of file Unicode.cpp.

UnicodeInfo Tanl::Text::Unicode::UnicodeTable

This table is generated from "UnicodeData.txt" by means of mkUnicodeTable.pl.

The latest version of the file can be found at the following URL:

http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

Definition at line 1418 of file Unicode.cpp.

Referenced by Tanl::Text::Char::IsPunctuation(), Tanl::Text::Char::IsSpace(), and Tanl::Text::Char::IsWhitespace().


Classes
struct	BlockLess
struct	BlockInfo
struct	UnicodeInfo
Typedefs
typedef unsigned char	byte
Enumerations
enum	Masks { Letter = 0x0100, Number = 0x0200, Mark = 0x0400, Punctuation = 0x0800, Symbol = 0x1000, Separator = 0x2000, Other = 0x4000, Uppercase = 0x0010, Lowercase = 0x0020, Titlecase = 0x0040, Whitespace = 0x0080, Lu = 0x0110, Ll = 0x0120, Lt = 0x0140, Lm = 0x0180, Lo = 0x0100, Nd = 0x0201, No = 0x0202, Nl = 0x0300, Mn = 0x0400, Mc = 0x0401, Me = 0x0402, Pc = 0x0800, Pd = 0x0801, Ps = 0x0802, Pe = 0x0803, Pi = 0x0804, Pf = 0x0805, Po = 0x0806, Sm = 0x1000, Sc = 0x1001, Sk = 0x1002, So = 0x1003, Zs = 0x2001, Zl = 0x2002, Zp = 0x2004, Cc = 0x4000, Cf = 0x4001, Cs = 0x4002, Co = 0x4003, Cn = 0x4004 }
enum	Category { UppercaseLetter = 0, LowercaseLetter = 1, TitlecaseLetter = 2, ModifierLetter = 3, OtherLetter = 4, NonSpacingMark = 5, SpaceCombiningMark = 6, EnclosingMark = 7, DecimalDigitNumber = 8, LetterNumber = 9, OtherNumber = 10, SpaceSeparator = 11, LineSeparator = 12, ParagraphSeparator = 13, Control = 14, Format = 15, Surrogate = 16, PrivateUse = 17, ConnectorPunctuation = 18, DashPunctuation = 19, OpenPunctuation = 20, ClosePunctuation = 21, InitialQuotePunctuation = 22, FinalQuotePunctuation = 23, OtherPunctuation = 24, MathSymbol = 25, CurrencySymbol = 26, ModifierSymbol = 27, OtherSymbol = 28, OtherNotAssigned = 29 }
enum	BlockCode { NoBlock = 0, BasicLatin = 1, Latin1Supplement = 2, LatinExtendedA = 3, LatinExtendedB = 4, IPAExtensions = 5, SpacingModifierLetters = 6, CombiningDiacriticalMarks = 7, Greek = 8, Cyrillic = 9, CyrillicSupplement = 10, Armenian = 11, Hebrew = 12, Arabic = 13, Syriac = 14, Thaana = 15, Devanagari = 16, Bengali = 17, Gurmukhi = 18, Gujarati = 19, Oriya = 20, Tamil = 21, Telugu = 22, Kannada = 23, Malayalam = 24, Sinhala = 25, Thai = 26, Lao = 27, Tibetan = 28, Myanmar = 29, Georgian = 30, HangulJamo = 31, Ethiopic = 32, Cherokee = 33, UnifiedCanadianAboriginalSyllabics = 34, Ogham = 35, Runic = 36, Tagalog = 37, Hanunoo = 38, Buhid = 39, Tagbanwa = 40, Khmer = 41, Mongolian = 42, Limbu = 43, TaiLe = 44, KhmerSymbols = 45, PhoneticExtensions = 46, LatinExtendedAdditional = 47, GreekExtended = 48, GeneralPunctuation = 49, SuperscriptsAndSubscripts = 50, CurrencySymbols = 51, CombiningDiacriticalMarksForSymbols = 52, LetterlikeSymbols = 53, NumberForms = 54, Arrows = 55, MathematicalOperators = 56, MiscellaneousTechnical = 56, ControlPictures = 57, OpticalCharacterRecognition = 58, EnclosedAlphanumerics = 59, BoxDrawing = 60, BlockElements = 61, GeometricShapes = 62, MiscellaneousSymbols = 63, Dingbats = 64, MiscellaneousMathematicalSymbolsA = 65, SupplementalArrowsA = 66, BraillePatterns = 67, SupplementalArrowsB = 68, MiscellaneousMathematicalSymbolsB = 69, SupplementalMathematicalOperators = 70, MiscellaneousSymbolsAndArrows = 71, CJKRadicalsSupplement = 72, KangxiRadicals = 73, IdeographicDescriptionCharacters = 74, CJKSymbolsAndPunctuation = 75, Hiragana = 76, Katakana = 77, Bopomofo = 78, HangulCompatibilityJamo = 79, Kanbun = 80, BopomofoExtended = 81, KatakanaPhoneticExtensions = 82, EnclosedCJKLettersAndMonths = 83, CJKCompatibility = 84, CJKUnifiedIdeographsExtensionA = 85, YijingHexagramSymbols = 86, CJKUnifiedIdeographs = 87, YiSyllables = 88, YiRadicals = 89, HangulSyllables = 90, HighSurrogates = 91, HighPrivateUseSurrogates = 92, 
LowSurrogates = 93, PrivateUseArea = 94, CJKCompatibilityIdeographs = 95, AlphabeticPresentationForms = 96, ArabicPresentationFormsA = 97, VariationSelectors = 98, CombiningHalfMarks = 99, CJKCompatibilityForms = 100, SmallFormVariants = 101, ArabicPresentationFormsB = 102, HalfwidthAndFullwidthForms = 103, Specials = 104, LinearBSyllabary = 105, LinearBIdeograms = 106, AegeanNumbers = 107, OldItalic = 108, Gothic = 109, Ugaritic = 110, Deseret = 111, Shavian = 112, Osmanya = 113, CypriotSyllabary = 114, ByzantineMusicalSymbols = 115, MusicalSymbols = 116, TaiXuanJingSymbols = 117, MathematicalAlphanumericSymbols = 118, CJKUnifiedIdeographsExtensionB = 119, CJKCompatibilityIdeographsSupplement = 120, Tags = 121, VariationSelectorsSupplement = 122, SupplementaryPrivateUseAreaA = 123, SupplementaryPrivateUseAreaB = 124, BlockCount }
	Constants for Unicode blocks.
Functions
BlockInfo &	Block (UCS2 uc)
	Return the code block to which the character uc belongs.
BlockCode	GetBlockCode (UCS2 uc)
	Return the code for the block to which the character uc belongs.
void	incUtfPtr (const char *&ptr, const char *endptr)
void	decUtfPtr (const char *&ptr, const char *beginptr)
size_t	utfDiff (const char *end, const char *begin)
bool	isAscii (const char *begin, const char *end)
UCS2	fetchChar (const char *&it, const char *end)
int	byteLength (const char *it)
int	utfLength (UCS2 ch)
	Returns the number of bytes a Unicode character needs.
int	utfLength (const UCS2 *begin, const UCS2 *end)
	Returns the number of bytes a Unicode string needs.
int	ucLength (const UCS2 *ch)
	Returns the length of a 0-terminated Unicode character string.
int	uc4Length (const UCS4 *ch)
	Returns the length of a 0-terminated UCS4 stream.
int	ToUtf8 (byte *&it, byte *end, UCS2 ucc)
	Writes a Unicode char into the given char buffer.
int	ToUtf8 (byte *&it, const size_t max, const UCS2 *ucbegin, const size_t len)
UCS2 *	ToUc (UCS2 *buffer, const char *source, int length)
	Converts a char stream to a UCS2 stream.
UCS2 *	ToUc (UCS2 *buffer, const UCS4 *source, int length=-1)
	Converts a UCS4 character stream to a UCS2 stream.
UCS2 *	ToUc (UCS2 *buffer, const wchar_t *source, int length=-1)
	Converts a wchar character stream to a UCS2 stream.
UCS2 *	utf8ToUc (UCS2 *buffer, const char *source, int length=-1)
	Converts a UTF-8 stream into Unicode UCS2.
UCS2	toChar (const char *begin, const char *end)
int	uc2Length (const UCS2 *ch)
int	ToUtf8 (byte *it, const size_t max, const UCS2 *ucbegin, const size_t len)
	Writes the given Unicode character range into a char buffer range.
size_t	stringLength (const char *str)
size_t	stringLength (const UCS2 *str)
size_t	stringLength (const UCS4 *str)
template<typename ToCharType , typename FromCharType >
ToCharType *	convertTo (ToCharType *buffer, const FromCharType *source, int length=-1)
Variables
UCS2	decompositionMap []
	Decomposition sequences.
UnicodeInfo	UnicodeTable []
	This table is generated from "UnicodeData.txt" by means of mkUnicodeTable.pl.
BlockInfo	BlockTable []