00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "Utf8Utils.h"
00025 #include <wchar.h>
00026
00027 namespace Tanl {
00028 namespace Text {
00029
00030 namespace Unicode {
00031
/**
 * Move @a ptr forward to the start of the next UTF-8 character.
 *
 * The byte under @a ptr is always stepped over; afterwards every
 * continuation byte (bit pattern 10xxxxxx) is skipped as well.
 *
 * @param ptr     in/out cursor into a UTF-8 byte sequence.
 * @param endptr  one past the last byte that may be inspected, or 0 to
 *                scan without an upper bound (the scan then stops at the
 *                first byte that is not a continuation byte).
 */
void incUtfPtr(const char*& ptr, const char* endptr)
{
  // The current byte is consumed unconditionally.
  ++ptr;

  if (endptr == 0) {
    // Unbounded variant: only the byte pattern terminates the scan.
    while ((*ptr & 0xc0) == 0x80)
      ++ptr;
    return;
  }

  // Bounded variant: never dereference at or beyond endptr.
  while (ptr < endptr && (*ptr & 0xc0) == 0x80)
    ++ptr;
}
00041
/**
 * Move @a ptr backward to the start of the previous UTF-8 character.
 *
 * The cursor is decremented until it rests on a byte that is not a
 * continuation byte (10xxxxxx) or until it reaches @a beginptr.
 *
 * @param ptr       in/out cursor into a UTF-8 byte sequence.
 * @param beginptr  lowest position the cursor may take; if @a ptr is
 *                  already at or before it, nothing happens.
 */
void decUtfPtr(const char*& ptr, const char* beginptr)
{
  if (ptr <= beginptr)
    return;

  // Step back at least once, then keep going over continuation bytes.
  do {
    --ptr;
  } while (ptr > beginptr && (*ptr & 0xc0) == 0x80);
}
00050
00051 size_t utfDiff(const char* end, const char* begin)
00052 {
00053 size_t count;
00054 for (count = 0; begin < end; incUtfPtr(begin, end))
00055 ++count;
00056 return count;
00057 }
00058
/**
 * Tell whether every byte in [begin, end) is a 7-bit ASCII value.
 *
 * @param begin  first byte to inspect.
 * @param end    one past the last byte to inspect.
 * @return true when no byte has its high bit set (an empty range is
 *         therefore ASCII), false otherwise.
 */
bool isAscii(const char* begin, const char* end)
{
  while (begin < end) {
    // A set high bit means the byte value is above 127.
    if (static_cast<unsigned char>(*begin) & 0x80)
      return false;
    ++begin;
  }
  return true;
}
00066
00067 UCS2 fetchChar(const char*& it, const char* end)
00068 {
00069 if (it >= end)
00070 return 0;
00071 int char_byte_length;
00072 int byte_mask = 0;
00073 if ((unsigned char)*it < 128) { char_byte_length = 1; byte_mask = 0x7f; }
00074 else if ((*it & 0xe0) == 0xc0) { char_byte_length = 2; byte_mask = 0x1f; }
00075 else if ((*it & 0xf0) == 0xe0) { char_byte_length = 3; byte_mask = 0x0f; }
00076 else if ((*it & 0xf8) == 0xf0) { char_byte_length = 4; byte_mask = 0x07; }
00077 else if ((*it & 0xfc) == 0xf8) { char_byte_length = 5; byte_mask = 0x03; }
00078 else if ((*it & 0xfe) == 0xfc) { char_byte_length = 6; byte_mask = 0x01; }
00079 else { char_byte_length = -1; }
00080
00081 UCS2 ret = it[0] & byte_mask;
00082 int i;
00083 for (i = 1; i < char_byte_length; ++i) {
00084 if ((it[i] & 0xc0) != 0x80) {
00085 ret = 0xFFFF;
00086 break;
00087 }
00088 ret <<= 6;
00089 ret |= (it[i] & 0x3f);
00090 }
00091 it += char_byte_length;
00092
00093 return ret;
00094 }
00095
/**
 * Return the length in bytes of the UTF-8 sequence introduced by the
 * lead byte at @a it.
 *
 * @param it  pointer to the first byte of a UTF-8 sequence; only this
 *            single byte is read.
 * @return 1 to 6 according to the lead-byte pattern, or -1 when the byte
 *         is not a valid lead byte (a continuation byte, 0xFE or 0xFF).
 */
int byteLength(const char* it)
{
  const unsigned char c = static_cast<unsigned char>(*it);
  if (c < 0x80) return 1;                 // 0xxxxxxx
  if ((c & 0xe0) == 0xc0) return 2;       // 110xxxxx
  if ((c & 0xf0) == 0xe0) return 3;       // 1110xxxx
  if ((c & 0xf8) == 0xf0) return 4;       // 11110xxx
  if ((c & 0xfc) == 0xf8) return 5;       // 111110xx
  if ((c & 0xfe) == 0xfc) return 6;       // 1111110x
  return -1;
}
00107
00108 int utfLength(UCS2 ucc)
00109 {
00110 int neededBytes;
00111 if (ucc < 0x80)
00112 neededBytes = 1;
00113 else if (ucc < 0x800)
00114 neededBytes = 2;
00115 else
00116 neededBytes = 3;
00117
00118
00119
00120
00121
00122
00123 return neededBytes;
00124 }
00125
00126 int utfLength(const UCS2* begin, const UCS2* end)
00127 {
00128 int len = 0;
00129 for (const UCS2* it = begin; it < end; ++it)
00130 len += utfLength(*it);
00131 return len;
00132 }
00133
00134 int ucLength(const UCS2* ch)
00135 {
00136 const UCS2* begin = ch;
00137 while (*ch++ != 0)
00138 ;
00139 return ch - begin - 1;
00140 }
00141
00142 int uc4Length(const UCS4* ptr)
00143 {
00144 const UCS4* begin = ptr;
00145 while (*ptr++ != 0)
00146 ;
00147 return ptr - begin - 1;
00148 }
00149
00150 int ToUtf8(byte*& it, byte* end, UCS2 ucc)
00151 {
00152 static const int firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
00153 const int byteMask = 0x3F;
00154 const int byteMark = 0x80;
00155 if (ucc < 0x80) {
00156 if (it >= end)
00157 return 1;
00158 *it++ = (unsigned char)ucc;
00159 return 0;
00160 }
00161 int neededBytes = utfLength(ucc);
00162 if (it + neededBytes >= end)
00163 return neededBytes;
00164 it += neededBytes;
00165 byte* tptr = it;
00166 switch (neededBytes) {
00167 case 6: *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00168 case 5: *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00169 case 4: *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00170 case 3: *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00171 case 2: *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00172 case 1: *--tptr = ucc | firstByteMark[neededBytes];
00173 }
00174 return 0;
00175 }
00176
00177 int ToUtf8(byte*& it, const size_t max, const UCS2* ucbegin, const size_t len)
00178 {
00179 byte* end = it + max;
00180 const UCS2* ucend = ucbegin + len;
00181 int needed = 0;
00182 for (const UCS2* uit = ucbegin; uit < ucend; ++uit) {
00183 int reqired = ToUtf8(it, end, *uit);
00184 if (reqired > 0)
00185 needed += reqired;
00186 }
00187 return needed;
00188 }
00189
00190 UCS2* ToUc(UCS2* buffer, const char* source, int length)
00191 {
00192 if (length == -1)
00193 length = ::strlen(source);
00194 UCS2* it = buffer;
00195 for (const char* end = source + length; source < end; ++source, ++it) {
00196 *it = (UCS2)*source;
00197 }
00198 *it = 0;
00199 return buffer;
00200 }
00201
00202 UCS2* ToUc(UCS2* buffer, const UCS4* source, int length)
00203 {
00204 if (length == -1)
00205 length = uc4Length(source);
00206 UCS2* it = buffer;
00207 for (const UCS4* end = source + length; source < end; ++source, ++it) {
00208 *it = (UCS2)*source;
00209 }
00210 *it = 0;
00211 return buffer;
00212 }
00213
00214 UCS2* ToUc(UCS2* buffer, const wchar_t* source, int length)
00215 {
00216 if (length == -1)
00217 length = wcslen(source);
00218 UCS2* it = buffer;
00219 for (const wchar_t* end = source + length; source < end; ++source, ++it) {
00220 *it = (UCS2)*source;
00221 }
00222 *it = 0;
00223 return buffer;
00224 }
00225
00226 UCS2* utf8ToUc(UCS2* buffer, const char* source, int length)
00227 {
00228 UCS2* tptr = buffer;
00229 const char* sptr = source;
00230 if (length == -1)
00231 length = strlen(source);
00232 for (int i = 0; i < length; ++i, ++sptr, ++tptr) {
00233 *tptr = *sptr;
00234 }
00235 *tptr = 0;
00236 return buffer;
00237 }
00238
00239 }
00240 }
00241 }