00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #include "text/RegExp.h"
00018 #ifdef UNICODE
00019 # include "text/Utf8CharIterator.h"
00020 #endif
00021
00022 #include "text/strings.h"
00023
00024 using namespace std;
00025
00026 namespace Tanl {
00027 namespace Text {
00028 namespace RegExp {
00029
00030 const unsigned char* Pattern::setLocale(char const* locale)
00031 {
00032 free((void*)CharTables);
00033 char* oldLocale = setlocale(LC_CTYPE, NULL);
00034 setlocale(LC_CTYPE, locale);
00035 CharTables = pcre_maketables();
00036 setlocale(LC_CTYPE, oldLocale);
00037 return CharTables;
00038 }
00039
00040 const unsigned char* Pattern::CharTables = Pattern::setLocale(setlocale(LC_CTYPE, 0));
00041
00042 Pattern::Pattern(char const* regExpr, int cflags) :
00043 _errorCode(0), _pcre_extra(0)
00044 {
00045 const char* errtext = 0;
00046 int offset = -1;
00047 _pcre = pcre_compile(regExpr, cflags, &errtext, &offset, CharTables);
00048 if (_pcre == 0)
00049 throw ParseException(string("Error compiling RegExp: [") + regExpr +
00050 "] at " + offset + ": " + errtext);
00051 if (!(cflags & Anchored))
00052
00053 _pcre_extra = pcre_study(_pcre, 0, &errtext);
00054 pcre_fullinfo(_pcre, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns);
00055 ++subpatterns;
00056 }
00057
00058 Pattern::Pattern(string& regExpr, int cflags) :
00059 _errorCode(0), _pcre_extra(0)
00060 {
00061 const char* errtext = 0;
00062 int offset = -1;
00063 _pcre = pcre_compile(regExpr.c_str(), cflags, &errtext, &offset, CharTables);
00064 if (_pcre == 0)
00065 throw ParseException("Error compiling RegExp: [" + regExpr + "] at " + offset + ": " + errtext);
00066 if (!(cflags & Anchored))
00067
00068 _pcre_extra = pcre_study(_pcre, 0, &errtext);
00069 pcre_fullinfo(_pcre, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns);
00070 }
00071
00072 Pattern::~Pattern()
00073 {
00074 if (_pcre) {
00075 pcre_free(_pcre);
00076 _pcre = 0;
00077 }
00078 if (_pcre_extra) {
00079 pcre_free(_pcre_extra);
00080 _pcre_extra = 0;
00081 }
00082 }
00083
00084 bool Pattern::test(string const& str, int eflags)
00085 {
00086 MatchGroups matches(subpatterns);
00087 return pcre_exec(_pcre, _pcre_extra, (const char*)&str[0], str.length(),
00088 0, eflags, (int*)&matches[0], 3 * subpatterns) >= 0;
00089 }
00090
00091 bool Pattern::test(char const* str, size_t len, int eflags)
00092 {
00093 MatchGroups matches(subpatterns);
00094 if (len == 0)
00095 len = strlen(str);
00096 return pcre_exec(_pcre, _pcre_extra, str, len,
00097 0, eflags, (int*)&matches[0], 3 * subpatterns) >= 0;
00098 }
00099
00100 #ifdef UNICODE
00101 inline
00102 static int utf8ByteToCharOffset(const char* begin, int offset)
00103 {
00104 CharBuffer<Utf8Char>::iterator it1(begin);
00105 CharBuffer<Utf8Char>::iterator it2(begin + offset);
00106 return it2 - it1;
00107 }
00108
00109 static void utf8ByteToCharOffsets(const char* begin, int offscount,
00110 MatchGroups& matches)
00111 {
00112 for (int i = 0; i < offscount; i += 2) {
00113 matches[i].first = utf8ByteToCharOffset(begin, matches[i].first);
00114 matches[i].second = utf8ByteToCharOffset(begin, matches[i].second);
00115 }
00116 }
00117 #endif
00118
00119 int Pattern::matchSize(string const& text, int eflags)
00120 {
00121 const char* it = (const char*)&*text.begin();
00122 const char* end = (const char*)&*text.end();
00123
00124 MatchGroups matches(subpatterns);
00125
00126 int mcount = pcre_exec(_pcre, _pcre_extra, it, end - it, 0, eflags,
00127 (int*)&matches[0], 3 * subpatterns);
00128 if (mcount < 0)
00129 return 0;
00130 # ifdef UNICODE
00131 if (eflags & PCRE_UTF8)
00132 utf8ByteToCharOffsets(it, mcount, matches);
00133 # endif
00134 return matches[0].second - matches[0].first;
00135 }
00136
00137 int Pattern::match(string const& text, MatchGroups& pos, int eflags)
00138 {
00139 const char* it = text.c_str();
00140 const char* end = it + text.length();
00141 return match(it, end, pos, eflags);
00142 }
00143
00144 int Pattern::match(const char* it, const char* end, MatchGroups& pos,
00145 int eflags)
00146 {
00147 int slotnum = pos.size() * 3;
00148 int mcount = pcre_exec(_pcre, _pcre_extra, it, end - it, 0, eflags,
00149 (int*)&pos[0], slotnum);
00150 if (mcount < 0)
00151 return 0;
00152 # ifdef UNICODE
00153 if (eflags & PCRE_UTF8)
00154 utf8ByteToCharOffsets(it, mcount, pos);
00155 # endif
00156 return mcount;
00157 }
00158
00159
00160 vector<string> Pattern::match(string const& str, int eflags)
00161 {
00162 MatchGroups matches(subpatterns);
00163 int mcount = match(str, matches, eflags);
00164 if (mcount < 0)
00165 return vector<string>(0);
00166 vector<string> sa(mcount);
00167 for (int i = 0; i < mcount; ++i)
00168 sa[i] = str.substr(matches[i].first, matches[i].second);
00169 return sa;
00170 }
00171
00172 string Pattern::replace(string& text, string& replwith, bool replaceAll)
00173 {
00174 string rest = text;
00175 MatchGroups sa(1);
00176
00177 string ret;
00178 while (match(rest, sa, 0)) {
00179 ret += rest.substr(0, sa[0].first) + replwith;
00180 rest = rest.substr(sa[0].second);
00181 if (!replaceAll)
00182 break;
00183 }
00184 ret += rest;
00185 return ret;
00186 }
00187
00188
00189 string Pattern::escape(string& str)
00190 {
00191 string sb;
00192 string::iterator end = str.end();
00193 for (string::iterator it = str.begin(); it < end; it++) {
00194 char c = *it;
00195 if (!((c >= 'a' && c <= 'z') ||
00196 (c >= 'A' && c <= 'Z') ||
00197 (c >= '0' && c <= '9')))
00198 sb += '\\';
00199 sb += c;
00200 }
00201 return sb;
00202 }
00203
00204 }
00205 }
00206 }