00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
#include "Encoding.h"

// standard
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <map>
#include <iconv.h>
00031
00032 using namespace std;
00033
00034 namespace Tanl {
00035 namespace Text {
00036
00037 namespace {
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084 #define E(n,i,a,m) (new Tanl::Text::Encoding(n,i,(float)a,m))
00085
00086 static Tanl::Text::Encoding* const encodingTable[] = {
00087
00088
00089 E("ISO-8859-1", 0, 1.0, 1.0),
00090 E("ISO-8859-2", 1, 1.0, 1.0),
00091 E("ISO-8859-3", 2, 1.0, 1.0),
00092 E("ISO-8859-4", 3, 1.0, 1.0),
00093 E("ISO-8859-5", 4, 1.0, 1.0),
00094 E("ISO-8859-6", 5, 1.0, 1.0),
00095 E("ISO-8859-7", 6, 1.0, 1.0),
00096 E("ISO-8859-8", 7, 1.0, 1.0),
00097 E("ISO-8859-9", 8, 1.0, 1.0),
00098 E("ISO-8859-10", 9, 1.0, 1.0),
00099 E("ISO-8859-13", 10, 1.0, 1.0),
00100 E("ISO-8859-14", 11, 1.0, 1.0),
00101 E("ISO-8859-15", 12, 1.0, 1.0),
00102 E("ISO-8859-16", 13, 1.0, 1.0),
00103 E("US-ASCII", 16, 1.0, 1.0),
00104 E("UTF-8", 17, 1.2, 6.0),
00105 E("UCS-2", 18, 2.1, 6.0),
00106 E("UCS-2BE", 19, 2.1, 6.0),
00107 E("UCS-2LE", 20, 2.1, 6.0),
00108 E("UCS-4", 21, 4.0, 6.0),
00109 E("UCS-4BE", 22, 4.0, 6.0),
00110 E("UCS-4LE", 23, 4.0, 6.0),
00111 E("UTF-16", 24, 2.1, 6.0),
00112 E("UTF-16BE", 25, 2.1, 6.0),
00113 E("UTF-16LE", 26, 2.1, 6.0),
00114 E("UTF-32", 27, 4.0, 6.0),
00115 E("UTF-32BE", 28, 4.0, 6.0),
00116 E("UTF-32LE", 29, 4.0, 6.0),
00117 E("UTF-7", 30, 1.0, 6.0),
00118 E("UCS-2-INTERNAL", 31, 2.0, 6.0),
00119 E("UCS-2-SWAPPED", 32, 2.0, 6.0),
00120 E("UCS-4-INTERNAL", 33, 4.0, 6.0),
00121 E("UCS-4-SWAPPED", 34, 4.0, 6.0),
00122 E("C99", 35, 1.2, 10.0),
00123 E("JAVA", 36, 1.2, 6.0),
00124 E("KOI8-R", 37, 1.5, 6.0),
00125 E("KOI8-U", 38, 1.5, 6.0),
00126 E("KOI8-RU", 39, 1.5, 6.0),
00127 E("KOI8-T", 40, 1.5, 6.0),
00128 E("CP850", 41, 1.0, 1.0),
00129 E("CP862", 42, 1.0, 1.0),
00130 E("CP866", 43, 1.0, 1.0),
00131 E("CP874", 44, 1.5, 6.0),
00132 E("CP932", 45, 1.5, 6.0),
00133 E("CP949", 46, 1.5, 6.0),
00134 E("CP950", 47, 1.5, 6.0),
00135 E("CP1133", 48, 1.0, 1.0),
00136 E("CP1250", 49, 1.0, 1.0),
00137 E("CP1251", 50, 1.0, 1.0),
00138 E("CP1252", 51, 1.0, 1.0),
00139 E("CP1253", 52, 1.0, 1.0),
00140 E("CP1254", 53, 1.0, 1.0),
00141 E("CP1255", 54, 1.0, 1.0),
00142 E("CP1256", 55, 1.0, 1.0),
00143 E("CP1257", 56, 1.0, 1.0),
00144 E("CP1258", 57, 1.0, 1.0),
00145 E("MacRoman", 58, 1.5, 6.0),
00146 E("MacCentralEuope", 59, 1.5, 6.0),
00147 E("MacIceland", 60, 1.5, 6.0),
00148 E("MacCroatian", 61, 1.5, 6.0),
00149 E("MacRomania", 62, 1.5, 6.0),
00150 E("MacCyrillic", 63, 1.5, 6.0),
00151 E("MacUkraine", 64, 1.5, 6.0),
00152 E("MacGreek", 65, 1.5, 6.0),
00153 E("MacTurkish", 66, 1.5, 6.0),
00154 E("MacHebrew", 67, 1.5, 6.0),
00155 E("MacArabic", 68, 1.5, 6.0),
00156 E("MacThai", 69, 1.5, 6.0),
00157 E("HP-ROMAN8", 70, 1.5, 6.0),
00158 E("NEXTSTEP", 71, 1.5, 6.0),
00159 E("ARMSCII-8", 72, 1.5, 6.0),
00160 E("GEORGIAN-ACADEMY", 73, 1.5, 6.0),
00161 E("GEORGIAN-PS", 74, 1.5, 6.0),
00162 E("MULELAO-1", 75, 1.5, 6.0),
00163 E("TIS-620", 76, 1.5, 6.0),
00164 E("ISO-IR-166", 77, 1.5, 6.0),
00165 E("VISCII", 78, 1.5, 6.0),
00166 E("TCVN", 79, 1.5, 6.0),
00167 E("JIS_C6220-1969-RO",80, 1.5, 6.0),
00168 E("JIS_X0201", 81, 1.5, 6.0),
00169 E("JIS_X0208", 82, 1.5, 6.0),
00170 E("JIS_X0212", 83, 1.5, 6.0),
00171 E("GB_1988-80", 84, 1.5, 6.0),
00172 E("GB_2312-80", 85, 1.5, 6.0),
00173 E("ISO-IR-165", 86, 1.5, 6.0),
00174 E("KSC_5601", 87, 1.5, 6.0),
00175 E("EUC-JP", 88, 1.5, 6.0),
00176 E("SHIFT_JIS", 89, 1.5, 6.0),
00177 E("ISO-2022-JP", 90, 1.5, 6.0),
00178 E("ISO-2022-JP-1", 91, 1.5, 6.0),
00179 E("ISO-2022-JP-2", 92, 1.5, 6.0),
00180 E("EUC-CN", 93, 1.5, 6.0),
00181 E("GBK", 94, 1.5, 6.0),
00182 E("GB18030", 95, 1.5, 6.0),
00183 E("ISO-2022-CN", 96, 1.5, 6.0),
00184 E("ISO-2022-CN-EXT", 97, 1.5, 6.0),
00185 E("HZ", 98, 1.5, 6.0),
00186 E("EUC-TW", 99, 1.5, 6.0),
00187 E("BIG5", 100, 1.5, 6.0),
00188 E("BIG5-HKSCS", 101, 1.5, 6.0),
00189 E("EUC-KR", 102, 1.5, 6.0),
00190 E("JOHAB", 103, 1.5, 6.0),
00191 0
00192 };
00193
/**
 * Alias groups for the built-in encodings.
 *
 * Layout: a flat sequence of groups.  Each group is a canonical name followed
 * by its aliases and closed by a null pointer; one extra null pointer closes
 * the whole table.  Group order is significant when the same alias occurs
 * more than once, so it must not be rearranged.
 */
static char const* const aliasTable[] = {
  "US-ASCII",
      "ASCII", "ISO646-US", "ISO_646.IRV:1991", "ISO-IR-6", "ANSI_X3.4-1968",
      "ANSI_X3.4-1986", "CP367", "IBM367", "US", "csASCII", 0,
  "UCS-2",
      "ISO-10646-UCS-2", "csUnicode", 0,
  "UCS-2BE",
      "UNICODEBIG", "UNICODE-1-1", "csUnicode11", 0,
  "UCS-2LE",
      "UNICODELITTLE", 0,
  "UCS-4",
      "ISO-10646-UCS-4", "csUCS4", 0,
  "UTF-7",
      "UNICODE-1-1-UTF-7", "csUnicode11UTF7", 0,
  "ISO-8859-1",
      "ISO_8859-1", "ISO_8859-1:1987", "ISO-IR-100", "CP819", "IBM819",
      "LATIN1", "L1", "csISOLatin1", "ISO8859-1", 0,
  "ISO-8859-2",
      "ISO_8859-2", "ISO_8859-2:1987", "ISO-IR-101", "LATIN2", "L2",
      "csISOLatin2", "ISO8859-2", 0,
  "ISO-8859-3",
      "ISO_8859-3", "ISO_8859-3:1988", "ISO-IR-109", "LATIN3", "L3",
      "csISOLatin3", "ISO8859-3", 0,
  "ISO-8859-4",
      "ISO_8859-4", "ISO_8859-4:1988", "ISO-IR-110", "LATIN4", "L4",
      "csISOLatin4", "ISO8859-4", 0,
  "ISO-8859-5",
      "ISO_8859-5", "ISO_8859-5:1988", "ISO-IR-144", "CYRILLIC",
      "csISOLatinCyrillic", "ISO8859-5", 0,
  "ISO-8859-6",
      "ISO_8859-6", "ISO_8859-6:1987", "ISO-IR-127", "ECMA-114", "ASMO-708",
      "ARABIC", "csISOLatinArabic", "ISO8859-6", 0,
  "ISO-8859-7",
      "ISO_8859-7", "ISO_8859-7:1987", "ISO-IR-126", "ECMA-118", "ELOT_928",
      "GREEK8", "GREEK", "csISOLatinGreek", "ISO8859-7", 0,
  "ISO-8859-8",
      "ISO_8859-8", "ISO_8859-8:1988", "ISO-IR-138", "HEBREW",
      "csISOLatinHebrew", "ISO8859-8", 0,
  "ISO-8859-9",
      "ISO_8859-9", "ISO_8859-9:1989", "ISO-IR-148", "LATIN5", "L5",
      "csISOLatin5", "ISO8859-9", 0,
  "ISO-8859-10",
      "ISO_8859-10", "ISO_8859-10:1992", "ISO-IR-157", "LATIN6", "L6",
      "csISOLatin6", "ISO8859-10", 0,
  "ISO-8859-13",
      "ISO_8859-13", "ISO-IR-179", "LATIN7", "L7", "ISO8859-13", 0,
  "ISO-8859-14",
      "ISO_8859-14", "ISO_8859-14:1998", "ISO-IR-199", "LATIN8", "L8",
      "ISO-CELTIC", "ISO8859-14", 0,
  "ISO-8859-15",
      "ISO_8859-15", "ISO_8859-15:1998", "ISO-IR-203", "ISO8859-15", 0,
  "ISO-8859-16",
      "ISO_8859-16", "ISO_8859-16:2000", "ISO-IR-226", "ISO8859-16", 0,
  "KOI8-R",
      "csKOI8R", 0,
  "CP1250",
      "WINDOWS-1250", "MS-EE", 0,
  "CP1251",
      "WINDOWS-1251", "MS-CYRL", 0,
  "CP1252",
      "WINDOWS-1252", "MS-ANSI", 0,
  "CP1253",
      "WINDOWS-1253", "MS-GREEK", 0,
  "CP1254",
      "WINDOWS-1254", "MS-TURK", 0,
  "CP1255",
      "WINDOWS-1255", "MS-HEBR", 0,
  "CP1256",
      "WINDOWS-1256", "MS-ARAB", 0,
  "CP1257",
      "WINDOWS-1257", "WINBALTRIM", 0,
  "CP1258",
      "WINDOWS-1258", 0,
  "CP850",
      "IBM850", "850", "csPC850Multilingual", 0,
  "CP862",
      "IBM862", "862", "csPC862LatinHebrew", 0,
  "CP866",
      "IBM866", "866", "csIBM866", 0,
  "MacRoman",
      "MACINTOSH", "MAC", "csMacintosh", 0,
  "HP-ROMAN8",
      "ROMAN8", "R8", "csHPRoman8", 0,
  "CP1133",
      "IBM-CP1133", 0,
  "TIS-620",
      "TIS620", "TIS620-0", "TIS620.2529-1", "TIS620.2533-0", "TIS620.2533-1",
      "ISO-IR-166", 0,
  "CP874",
      "WINDOWS-874", 0,
  "VISCII",
      "VISCII1.1-1", "csVISCII", 0,
  "TCVN",
      "TCVN-5712", "TCVN5712-1", "TCVN5712-1:1993", 0,
  "JIS_C6220-1969-RO",
      "ISO646-JP", "ISO-IR-14", "JP", "csISO14JISC6220ro", 0,
  "JIS_X0201",
      "JISX0201-1976", "X0201", "csHalfWidthKatakana", 0,
  "JIS_X0208",
      "JIS_X0208-1983", "JIS_X0208-1990", "JIS0208", "X0208", "ISO-IR-87",
      "JIS_C6226-1983", "csISO87JISX0208", 0,
  "JIS_X0212",
      "JIS_X0212.1990-0", "JIS_X0212-1990", "X0212", "ISO-IR-159",
      "csISO159JISX02121990", 0,
  "GB_1988-80",
      "ISO646-CN", "ISO-IR-57", "CN", "csISO57GB1988", 0,
  "GB_2312-80",
      "ISO-IR-58", "csISO58GB231280", "CHINESE", 0,
  "ISO-IR-165",
      "CN-GB-ISOIR165", 0,
  "KSC_5601",
      "KS_C_5601-1987", "KS_C_5601-1989", "ISO-IR-149", "csKSC56011987",
      "KOREAN", 0,
  "EUC-JP",
      "EUCJP", "Extended_UNIX_Code_Packed_Format_for_Japanese",
      "csEUCPkdFmtJapanese", 0,
  "SHIFT_JIS",
      "SHIFT-JIS", "SJIS", "MS_KANJI", "csShiftJIS", 0,
  "ISO-2022-JP",
      "csISO2022JP", 0,
  "ISO-2022-JP-2",
      "csISO2022JP2", 0,
  "EUC-CN",
      "EUCCN", "GB2312", "CN-GB", "csGB2312", 0,
  "GBK",
      "CP936", 0,
  "ISO-2022-CN",
      "csISO2022CN", 0,
  "HZ",
      "HZ-GB-2312", 0,
  "EUC-TW",
      "EUCTW", "csEUCTW", 0,
  "BIG5",
      "BIG-5", "BIG-FIVE", "BIGFIVE", "CN-BIG5", "csBig5", 0,
  "BIG5-HKSCS",
      "BIG5HKSCS", 0,
  "EUC-KR",
      "EUCKR", "csEUCKR", 0,
  "CP949",
      "UHC", 0,
  "JOHAB",
      "CP1361", 0,

  0   // end-of-table sentinel
};
00262
/**
 * Strict weak ordering on C strings that ignores letter case.
 *
 * Used as the key comparator of the encoder map so that lookups such as
 * "utf-8" and "UTF-8" hit the same entry.  Both arguments must be non-null.
 *
 * The struct no longer derives from std::binary_function (deprecated in
 * C++11, removed in C++17); the nested typedefs that base used to supply are
 * spelled out so any code relying on them keeps compiling.
 */
struct lessCase
{
  typedef char const* first_argument_type;
  typedef char const* second_argument_type;
  typedef bool        result_type;

  /** @return true when @a a sorts before @a b, compared case-insensitively. */
  bool operator ()(char const* a, char const* b) const {
    return ::strcasecmp(a, b) < 0;
  }
};
00269
00270 class Encoders : public map<char const*, Tanl::Text::Encoding*, lessCase>
00271 {
00272 public:
00273 Encoders() {
00274
00275 for (Encoding* const* e = encodingTable; *e; ++e) {
00276 (*this)[::strdup((*e)->name.c_str())] = *e;
00277 }
00278
00279 for (char const* const* names = aliasTable; *names; ) {
00280 char const* name = names[0];
00281 Encoding* e = 0;
00282 const_iterator found = find(name);
00283 if (found != end())
00284 e = found->second;
00285 char const* const* alias = names + 1;
00286 for (; *alias; ++alias) {
00287 if (e)
00288 (*this)[::strdup(*alias)] = e;
00289 }
00290 names = alias + 1;
00291 }
00292 }
00293 };
00294
00295 static Encoders& getEncoders()
00296 {
00297 static Encoders encoders;
00298 return encoders;
00299 }
00300
00301 }
00302
00303 Encoding* Encoding::get(char const* name)
00304 {
00305 Encoders& encoders = getEncoders();
00306 Encoders::const_iterator found = encoders.find(name);
00307 return (found == encoders.end()) ? 0 : found->second;
00308 }
00309
00310 Encoding* Encoding::get(ID id)
00311 {
00312 Encoders& encoders = getEncoders();
00313 for (Encoders::const_iterator eit = encoders.begin();
00314 eit != encoders.end(); ++eit) {
00315 if (eit->second->id == id)
00316 return eit->second;
00317 }
00318 return 0;
00319 }
00320
00321
00322 void
00323 Encoding::Register(Encoding* encoding)
00324 {
00325 Encoders& e = getEncoders();
00326 Encoders::iterator eit = e.find(encoding->name.c_str());
00327 if (eit != e.end())
00328 return;
00329 encoding->id = e.size();
00330 e[::strdup(encoding->name.c_str())] = encoding;
00331 }
00332
00333 void
00334 Encoding::Register(char const* alias, char const* canonical)
00335 {
00336 Encoding* e = Encoding::get(canonical);
00337 if (e)
00338 getEncoders()[::strdup(alias)] = e;
00339 }
00340
00341 size_t Encoding::Encode(Encoding* from, char const* in, size_t inlen,
00342 char*& out, size_t outlen)
00343 {
00344 char* allocated = 0;
00345 if (outlen == 0) {
00346 outlen = size_t(inlen / from->averageBytesPerChar * averageBytesPerChar);
00347 if (outlen < inlen)
00348 outlen = inlen;
00349 out = allocated = (char*)malloc(outlen);
00350 }
00351 char* outend = out;
00352 iconv_t converter = iconv_open(name.c_str(), from->name.c_str());
00353
00354 #ifdef _WIN32
00355 size_t rlen = iconv(converter, &in, &inlen, &outend, &outlen);
00356 #else
00357 size_t rlen = iconv(converter, (char **)&in, &inlen, &outend, &outlen);
00358 #endif
00359
00360 iconv_close(converter);
00361 if (rlen < 0) {
00362 free(allocated);
00363 return 0;
00364 }
00365 return outend - out;
00366 }
00367
00368 }
00369 }