desr: desr: text/Utf8Utils.cpp Source File

00001 /*
00002 **  Tanl C++ Library
00003 **  text/Utf8Utils.cpp: utilities for handling UTF-8
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2002  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of DeSR.
00009 **
00010 **  DeSR is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  DeSR is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #include "Utf8Utils.h"
00025 #include <wchar.h>
00026 
00027 namespace Tanl {
00028 namespace Text {
00029 
00030 namespace Unicode {
00031 
/**
 * Move @a ptr forward to the next byte that is not a UTF-8 continuation
 * byte (a byte whose two top bits are 10).
 *
 * The pointer is always advanced by at least one byte before any test
 * is made.
 *
 * @param ptr     position to advance (updated in place).
 * @param endptr  when non-null, scanning stops as soon as @a ptr reaches
 *                it; when null, no bound is applied and scanning stops
 *                only at the first non-continuation byte.
 */
void incUtfPtr(const char*& ptr, const char* endptr)
{
  ++ptr;

  if (endptr == 0) {
    // Unbounded scan: relies on a non-continuation byte being present.
    while ((*ptr & 0xc0) == 0x80)
      ++ptr;
    return;
  }

  while (ptr < endptr && (*ptr & 0xc0) == 0x80)
    ++ptr;
}
00041 
/**
 * Move @a ptr backward to the closest preceding byte that is not a UTF-8
 * continuation byte, never going below @a beginptr.
 *
 * @param ptr       position to move back (updated in place); left
 *                  unchanged when it is already at or before @a beginptr.
 * @param beginptr  lower bound of the scan.
 */
void decUtfPtr(const char*& ptr, const char* beginptr)
{
  for (;;) {
    if (ptr <= beginptr)
      break;
    --ptr;
    const bool isContinuation = (*ptr & 0xc0) == 0x80;
    if (!isContinuation)
      break;
  }
}
00050 
00051 size_t utfDiff(const char* end, const char* begin)
00052 {
00053   size_t count;
00054   for (count = 0; begin < end; incUtfPtr(begin, end))
00055     ++count;
00056   return count;
00057 }
00058 
/**
 * Tell whether every byte in [begin, end) is a 7-bit ASCII value.
 *
 * @return true when no byte has its high bit set (also for an empty
 *         range), false as soon as one such byte is found.
 */
bool isAscii(const char* begin, const char* end)
{
  const char* cursor = begin;
  while (cursor < end) {
    // A set top bit means the byte value is above 127.
    if ((*cursor & 0x80) != 0)
      return false;
    ++cursor;
  }
  return true;
}
00066 
00067 UCS2 fetchChar(const char*& it, const char* end)
00068 {
00069   if (it >= end)
00070     return 0;
00071   int char_byte_length;
00072   int byte_mask = 0;
00073   if ((unsigned char)*it < 128) { char_byte_length = 1; byte_mask = 0x7f; }
00074   else if ((*it & 0xe0) == 0xc0) { char_byte_length = 2; byte_mask = 0x1f; }
00075   else if ((*it & 0xf0) == 0xe0) { char_byte_length = 3; byte_mask = 0x0f; }
00076   else if ((*it & 0xf8) == 0xf0) { char_byte_length = 4; byte_mask = 0x07; }
00077   else if ((*it & 0xfc) == 0xf8) { char_byte_length = 5; byte_mask = 0x03; }
00078   else if ((*it & 0xfe) == 0xfc) { char_byte_length = 6; byte_mask = 0x01; }
00079   else { char_byte_length = -1; }
00080 
00081   UCS2 ret = it[0] & byte_mask;
00082   int i;
00083   for (i = 1; i < char_byte_length; ++i) {
00084     if ((it[i] & 0xc0) != 0x80) {
00085       ret = 0xFFFF;
00086       break;
00087     }
00088     ret <<= 6;
00089     ret |= (it[i] & 0x3f);
00090   }
00091   it += char_byte_length;
00092   //  ret = Number::fromLittleEndian(ret);
00093   return ret;
00094 }
00095 
/**
 * Length in bytes of the UTF-8 sequence announced by the lead byte at
 * @a it.
 *
 * @param it  pointer to the first byte of a sequence; only that byte is
 *            read.
 * @return 1 to 6 according to the lead byte pattern (the 5- and 6-byte
 *         forms of the original UTF-8 definition are still recognised),
 *         or -1 when the byte cannot start a sequence (continuation byte
 *         0x80..0xBF, or 0xFE/0xFF).
 */
int byteLength(const char* it)
{
  const unsigned char c = static_cast<unsigned char>(*it);
  if (c < 0x80) return 1;
  if ((c & 0xe0) == 0xc0) return 2;
  // 1110xxxx: the pattern to match is 0xe0 (it used to be compared with
  // 0xf0, which rejected every 3-byte lead and reported 3 for 0xF0..0xFF).
  if ((c & 0xf0) == 0xe0) return 3;
  if ((c & 0xf8) == 0xf0) return 4;
  if ((c & 0xfc) == 0xf8) return 5;
  if ((c & 0xfe) == 0xfc) return 6;
  return -1;
}
00107 
00108 int utfLength(UCS2 ucc) 
00109 {
00110   int neededBytes;
00111   if (ucc < 0x80)
00112     neededBytes = 1;
00113   else if (ucc < 0x800)
00114     neededBytes = 2;
00115   else
00116     neededBytes = 3;
00117 /*
00118       if (ucc < 0x10000)  neededBytes = 3; // FIXME: always true 
00119         else if (ucc < 0x200000) neededBytes = 4;
00120         else if (ucc < 0x400000) neededBytes = 5;
00121         else neededBytes = 6;
00122 */
00123   return neededBytes;
00124 }
00125 
00126 int utfLength(const UCS2* begin, const UCS2* end)
00127 {
00128   int len = 0;
00129   for (const UCS2* it = begin; it < end; ++it)
00130     len += utfLength(*it);
00131   return len;
00132 }
00133 
00134 int ucLength(const UCS2* ch)
00135 {
00136   const UCS2* begin = ch;
00137   while (*ch++ != 0) 
00138     ;
00139   return ch - begin - 1;
00140 }
00141 
00142 int uc4Length(const UCS4* ptr)
00143 {
00144   const UCS4* begin = ptr;
00145   while (*ptr++ != 0) 
00146     ;
00147   return ptr - begin - 1;
00148 }
00149 
00150 int ToUtf8(byte*& it, byte* end, UCS2 ucc)
00151 {
00152   static const int firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
00153   const int byteMask = 0x3F;
00154   const int byteMark = 0x80; 
00155   if (ucc < 0x80) {
00156     if (it >= end)
00157       return 1;
00158     *it++ = (unsigned char)ucc;
00159     return 0;
00160   }
00161   int neededBytes = utfLength(ucc);
00162   if (it + neededBytes >= end)
00163     return neededBytes;
00164   it += neededBytes;
00165   byte* tptr = it;
00166   switch (neededBytes) {
00167   case 6:       *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00168   case 5:       *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00169   case 4:       *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00170   case 3:       *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00171   case 2:       *--tptr = (ucc & byteMask) | byteMark; ucc >>= 6;
00172   case 1:       *--tptr =  ucc | firstByteMark[neededBytes];
00173   }
00174   return 0;
00175 }
00176 
00177 int ToUtf8(byte*& it, const size_t max, const UCS2* ucbegin, const size_t len)
00178 {
00179   byte* end = it + max;
00180   const UCS2* ucend = ucbegin + len;
00181   int needed = 0;
00182   for (const UCS2* uit = ucbegin; uit < ucend; ++uit) {
00183     int reqired = ToUtf8(it, end, *uit);
00184     if (reqired > 0)
00185       needed += reqired;
00186   }
00187   return needed;
00188 }
00189 
00190 UCS2* ToUc(UCS2* buffer, const char* source, int length)
00191 {
00192   if (length == -1)
00193     length = ::strlen(source);
00194   UCS2* it = buffer;
00195   for (const char* end = source + length; source < end; ++source, ++it) {
00196     *it = (UCS2)*source; // FIXME: check for overflow
00197   }
00198   *it = 0;
00199   return buffer;
00200 }
00201 
00202 UCS2* ToUc(UCS2* buffer, const UCS4* source, int length)
00203 {
00204   if (length == -1)
00205     length = uc4Length(source);
00206   UCS2* it = buffer;
00207   for (const UCS4* end = source + length; source < end; ++source, ++it) {
00208     *it = (UCS2)*source; // FIXME: check for overflow
00209   }
00210   *it = 0;
00211   return buffer;
00212 }
00213 
00214 UCS2* ToUc(UCS2* buffer, const wchar_t* source, int length)
00215 {
00216   if (length == -1)
00217     length = wcslen(source);
00218   UCS2* it = buffer;
00219   for (const wchar_t* end = source + length; source < end; ++source, ++it) {
00220     *it = (UCS2)*source; // FIXME: check for overflow
00221   }
00222   *it = 0;
00223   return buffer;
00224 }
00225 
00226 UCS2* utf8ToUc(UCS2* buffer, const char* source, int length)
00227 {
00228   UCS2* tptr = buffer;
00229   const char* sptr = source;
00230   if (length == -1)
00231       length = strlen(source);
00232   for (int i = 0; i < length; ++i, ++sptr, ++tptr) {
00233     *tptr = *sptr; // FIXME: to implement
00234   }
00235   *tptr  = 0;
00236   return buffer; 
00237 }
00238 
00239 } // namespace Unicode
00240 } // namespace Text
00241 } // namespace Tanl