00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "MorphExtractor.h"
00025 #include "conf/conf_bool.h"
00026
00027 namespace Tanl {
00028
/**
 * Boolean configuration variable registered under the name "MorphExtract"
 * (second constructor argument is false — presumably the default value;
 * confirm against IXE::conf).
 * When it evaluates to true, several language-specific extractors in this
 * file overwrite Features::full with the concatenation of the individual
 * features they extracted.
 */
IXE::conf<bool> MorphExtract("MorphExtract", false);
00033
/**
 * Copy at most @a n characters of @a src into @a dest and always
 * NUL-terminate the result.
 *
 * @param dest  destination buffer; must have room for n + 1 bytes, since the
 *              terminator is written at dest[n].
 * @param src   source characters; copying stops early at a NUL.
 * @param n     maximum number of characters to copy.
 * @return      @a dest.
 */
char* strnzcpy(char *dest, const char *src, size_t n) {
  char* const terminator = dest + n;
  // strncpy pads with NULs when src is shorter than n, but does not
  // terminate when src is n characters or longer.
  strncpy(dest, src, n);
  *terminator = '\0';
  return dest;
}
00039
00040 void MorphExtractor::operator() (char const* start, char const* end,
00041 Features& mf) const
00042 {
00043 mf.gender[0] = '\0';
00044 mf.number[0] = '\0';
00045 mf.person[0] = '\0';
00046 if (end - start == 1 && !strncmp(start, "_", 1))
00047 mf.full[0] = '\0';
00048 else
00049 strnzcpy(mf.full, start, end - start);
00050 }
00051
00052
00053
00054
00055
00056 Tanl::Text::RegExp::Pattern ArabicMorphExtractor::reCase("case=(\\d)");
00057 Tanl::Text::RegExp::Pattern ArabicMorphExtractor::reGender("gen=(\\w)");
00058 Tanl::Text::RegExp::Pattern ArabicMorphExtractor::reNumber("num=(\\w)");
00059 Tanl::Text::RegExp::Pattern ArabicMorphExtractor::rePerson("pers=(\\d)");
00060
00061 void ArabicMorphExtractor::operator() (char const* start, char const* end,
00062 Features& mf) const
00063 {
00064 MorphExtractor::operator() (start, end, mf);
00065
00066 Tanl::Text::RegExp::MatchGroups match(2);
00067
00068 if (reGender.match(start, end, match) > 0)
00069 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00070 if (reNumber.match(start, end, match) > 0)
00071 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00072 if (rePerson.match(start, end, match) > 0)
00073 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00074 if (reCase.match(start, end, match) > 0)
00075 strnzcpy(mf.Case, start + match[1].first, match[1].second - match[1].first);
00076 }
00077
00078
00079
00080
00081
00082 Tanl::Text::RegExp::Pattern BasqueMorphExtractor::reNumber("NUM([SP])");
00083
00084 void BasqueMorphExtractor::operator() (char const* start, char const* end,
00085 Features& mf) const
00086 {
00087 MorphExtractor::operator() (start, end, mf);
00088 Tanl::Text::RegExp::MatchGroups match(2);
00089
00090 if (reNumber.match(start, end, match) > 0)
00091 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00092 }
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102 Tanl::Text::RegExp::Pattern CzechMorphExtractor::reGender("Gen=(\\w)");
00103 Tanl::Text::RegExp::Pattern CzechMorphExtractor::reNumber("Num=(\\w)");
00104 Tanl::Text::RegExp::Pattern CzechMorphExtractor::rePerson("Per=(\\d)");
00105 Tanl::Text::RegExp::Pattern CzechMorphExtractor::reCase("Cas=(\\d)");
00106 Tanl::Text::RegExp::Pattern CzechMorphExtractor::reNegative("Neg=(\\w)");
00107 Tanl::Text::RegExp::Pattern CzechMorphExtractor::reGra("Gra=(\\w)");
00108
00109 void CzechMorphExtractor::operator() (char const* start, char const* end,
00110 Features& mf) const
00111 {
00112 MorphExtractor::operator() (start, end, mf);
00113 Tanl::Text::RegExp::MatchGroups match(2);
00114
00115 # ifdef CZ_FEATS
00116 if (reGender.match(start, end, match) > 0)
00117 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00118 if (reNumber.match(start, end, match) > 0)
00119 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00120 if (rePerson.match(start, end, match) > 0)
00121 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00122 # endif
00123 if (reCase.match(start, end, match) > 0)
00124 strnzcpy(mf.Case, start + match[1].first, match[1].second - match[1].first);
00125 }
00126
00127
00128
00129
00130
00131 Tanl::Text::RegExp::Pattern BulgarianMorphExtractor::reGender("gen=(\\w)");
00132 Tanl::Text::RegExp::Pattern BulgarianMorphExtractor::reNumber("num=(\\w)");
00133 Tanl::Text::RegExp::Pattern BulgarianMorphExtractor::rePerson("pers=(\\d)");
00134
00135 void BulgarianMorphExtractor::operator() (char const* start, char const* end,
00136 Features& mf) const
00137 {
00138 MorphExtractor::operator() (start, end, mf);
00139 Tanl::Text::RegExp::MatchGroups match(2);
00140
00141 if (reGender.match(start, end, match) > 0)
00142 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00143 if (reNumber.match(start, end, match) > 0)
00144 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00145 if (rePerson.match(start, end, match) > 0)
00146 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00147 }
00148
00149
00150
00151
00152
00153 Tanl::Text::RegExp::Pattern DanishMorphExtractor::reGender("gender=(\\w)");
00154 Tanl::Text::RegExp::Pattern DanishMorphExtractor::reNumber("number=(\\w)");
00155 Tanl::Text::RegExp::Pattern DanishMorphExtractor::rePerson("person=(\\d)");
00156 Tanl::Text::RegExp::Pattern DanishMorphExtractor::reCase("case=(\\w)");
00157
00158 void DanishMorphExtractor::operator() (char const* start, char const* end,
00159 Features& mf) const
00160 {
00161 MorphExtractor::operator() (start, end, mf);
00162 Tanl::Text::RegExp::MatchGroups match(2);
00163
00164 if (reGender.match(start, end, match) > 0)
00165 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00166 if (reNumber.match(start, end, match) > 0)
00167 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00168 if (rePerson.match(start, end, match) > 0)
00169 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00170 if (reCase.match(start, end, match) > 0)
00171 strnzcpy(mf.Case, start + match[1].first, match[1].second - match[1].first);
00172 }
00173
00174
00175
00176
00177
00178 Tanl::Text::RegExp::Pattern DutchMorphExtractor::reGender("\\|(onzijd|onzijd)\\|");
00179 Tanl::Text::RegExp::Pattern DutchMorphExtractor::reNumber("\\|(e|m)v\\|");
00180 Tanl::Text::RegExp::Pattern DutchMorphExtractor::rePerson("\\|(\\d)\\|");
00181 Tanl::Text::RegExp::Pattern DutchMorphExtractor::reCase("(neut|gen|dat)");
00182
00183 void DutchMorphExtractor::operator() (char const* start, char const* end,
00184 Features& mf) const
00185 {
00186 MorphExtractor::operator() (start, end, mf);
00187 Tanl::Text::RegExp::MatchGroups match(2);
00188
00189 if (reGender.match(start, end, match) > 0)
00190 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00191 if (reNumber.match(start, end, match) > 0)
00192 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00193 if (rePerson.match(start, end, match) > 0)
00194 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00195 if (reCase.match(start, end, match) > 0)
00196 strnzcpy(mf.Case, start + match[1].first, match[1].second - match[1].first);
00197 if (MorphExtract)
00198
00199 sprintf(mf.full, "%s%s%s%s", mf.gender, mf.number, mf.person, mf.Case);
00200 }
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212 Tanl::Text::RegExp::Pattern GreekMorphExtractor::reGender("(Fe|Ma|Ne)\\|");
00213 Tanl::Text::RegExp::Pattern GreekMorphExtractor::reNumber("\\|(Sg|Pl)");
00214 Tanl::Text::RegExp::Pattern GreekMorphExtractor::rePerson("\\|(0\\d)");
00215 Tanl::Text::RegExp::Pattern GreekMorphExtractor::reCase("\\|(Ge|Ac|Da|Vo)");
00216
00217 void GreekMorphExtractor::operator() (char const* start, char const* end,
00218 Features& mf) const
00219 {
00220 MorphExtractor::operator() (start, end, mf);
00221 Tanl::Text::RegExp::MatchGroups match(2);
00222
00223 if (reGender.match(start, end, match) > 0)
00224 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00225 if (reNumber.match(start, end, match) > 0)
00226 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00227 if (rePerson.match(start, end, match) > 0)
00228 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00229 if (reCase.match(start, end, match) > 0)
00230 strnzcpy(mf.Case, start + match[1].first, match[1].second - match[1].first);
00231 if (MorphExtract)
00232
00233 sprintf(mf.full, "%s%s%s%s", mf.gender, mf.number, mf.person, mf.Case);
00234 }
00235
00236
00237
00238
00239
00240 Tanl::Text::RegExp::Pattern HungarianMorphExtractor::reNumber("n=(\\w)+");
00241 Tanl::Text::RegExp::Pattern HungarianMorphExtractor::rePerson("p=(\\d)");
00242 Tanl::Text::RegExp::Pattern HungarianMorphExtractor::reCase("c=(\\w)+");
00243
00244 void HungarianMorphExtractor::operator() (char const* start, char const* end,
00245 Features& mf) const
00246 {
00247 MorphExtractor::operator() (start, end, mf);
00248 Tanl::Text::RegExp::MatchGroups match(2);
00249
00250 if (reNumber.match(start, end, match) > 0)
00251 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00252 if (rePerson.match(start, end, match) > 0)
00253 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00254 if (reCase.match(start, end, match) > 0)
00255 strnzcpy(mf.Case, start + match[1].first, match[1].second - match[1].first);
00256 }
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reCase("cas=([A-Z+]+)");
00267 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reGender("gen=(\\w)");
00268 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reMode("mod=(\\w)");
00269 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reNumber("num=(\\w)");
00270 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::rePerson("per=(\\d)");
00271 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reSem("sem=(\\w+)");
00272 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reTense("tmp=(\\w)");
00273 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reTrans("trans=(\\w)");
00274 Tanl::Text::RegExp::Pattern ItalianTutMorphExtractor::reVTrans("v-trans=(\\w)");
00275
00276 void ItalianTutMorphExtractor::operator() (char const* start, char const* end,
00277 Features& mf) const
00278 {
00279 MorphExtractor::operator() (start, end, mf);
00280 Tanl::Text::RegExp::MatchGroups match(2);
00281
00282 if (reCase.match(start, end, match) > 0)
00283 strnzcpy(mf.extra, start + match[1].first, match[1].second - match[1].first);
00284 if (reGender.match(start, end, match) > 0)
00285 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00286 if (reMode.match(start, end, match) > 0)
00287 strnzcpy(mf.mode, start + match[1].first, match[1].second - match[1].first);
00288 if (reNumber.match(start, end, match) > 0)
00289 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00290 if (rePerson.match(start, end, match) > 0)
00291 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00292
00293
00294
00295
00296 if (reTense.match(start, end, match) > 0)
00297 strnzcpy(mf.tense, start + match[1].first, match[1].second - match[1].first);
00298 if (reTrans.match(start, end, match) > 0)
00299 strnzcpy(mf.trans, start + match[1].first, match[1].second - match[1].first);
00300
00301 if (reVTrans.match(start, end, match) > 0) {
00302 mf.trans[0] = tolower(*(start + match[1].first));
00303 mf.trans[1] = '\0';
00304 }
00305 if (MorphExtract)
00306
00307 sprintf(mf.full, "%s%s%s", mf.gender, mf.number, mf.person);
00308 }
00309
00310
00311
00312
00313
00314 Tanl::Text::RegExp::Pattern PortugueseMorphExtractor::reGender("([MF])\\|");
00315 Tanl::Text::RegExp::Pattern PortugueseMorphExtractor::reNumber("\\|([SP])");
00316 Tanl::Text::RegExp::Pattern PortugueseMorphExtractor::rePerson("\\|(\\d[SP])\\|");
00317
00318 void PortugueseMorphExtractor::operator() (char const* start, char const* end,
00319 Features& mf) const
00320 {
00321 MorphExtractor::operator() (start, end, mf);
00322 Tanl::Text::RegExp::MatchGroups match(2);
00323
00324 if (reGender.match(start, end, match) > 0)
00325 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00326 if (reNumber.match(start, end, match) > 0)
00327 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00328 if (rePerson.match(start, end, match) > 0)
00329 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00330 if (MorphExtract)
00331
00332 sprintf(mf.full, "%s%s%s", mf.gender, mf.number, mf.person);
00333 }
00334
00335
00336
00337
00338
00339 Tanl::Text::RegExp::Pattern SloveneMorphExtractor::reCase("Case=(\\w)");
00340 Tanl::Text::RegExp::Pattern SloveneMorphExtractor::reGender("Gender=(\\w)");
00341 Tanl::Text::RegExp::Pattern SloveneMorphExtractor::reNegative("Negative=(\\w)");
00342 Tanl::Text::RegExp::Pattern SloveneMorphExtractor::reNumber("Number=(\\w)");
00343 Tanl::Text::RegExp::Pattern SloveneMorphExtractor::rePerson("Person=(\\w)");
00344
00345 void SloveneMorphExtractor::operator() (char const* start, char const* end,
00346 Features& mf) const
00347 {
00348 MorphExtractor::operator() (start, end, mf);
00349 Tanl::Text::RegExp::MatchGroups match(2);
00350
00351 if (reGender.match(start, end, match) > 0)
00352 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00353 if (reNumber.match(start, end, match) > 0)
00354 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00355 if (rePerson.match(start, end, match) > 0)
00356 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00357 if (reCase.match(start, end, match) > 0)
00358 strnzcpy(mf.Case, start + match[1].first, match[1].second - match[1].first);
00359 if (MorphExtract)
00360
00361 sprintf(mf.full, "%s%s%s%s", mf.gender, mf.number, mf.person, mf.Case);
00362 }
00363
00364
00365
00366
00367
00368 Tanl::Text::RegExp::Pattern SpanishMorphExtractor::reGender("gen=(\\w)");
00369 Tanl::Text::RegExp::Pattern SpanishMorphExtractor::reNumber("num=(\\w)");
00370 Tanl::Text::RegExp::Pattern SpanishMorphExtractor::rePerson("per=(\\d)");
00371
00372 void SpanishMorphExtractor::operator() (char const* start, char const* end,
00373 Features& mf) const
00374 {
00375 MorphExtractor::operator() (start, end, mf);
00376 Tanl::Text::RegExp::MatchGroups match(2);
00377
00378 if (reGender.match(start, end, match) > 0)
00379 strnzcpy(mf.gender, start + match[1].first, match[1].second - match[1].first);
00380 if (reNumber.match(start, end, match) > 0)
00381 strnzcpy(mf.number, start + match[1].first, match[1].second - match[1].first);
00382 if (rePerson.match(start, end, match) > 0)
00383 strnzcpy(mf.person, start + match[1].first, match[1].second - match[1].first);
00384 if (MorphExtract)
00385
00386 sprintf(mf.full, "%s%s%s", mf.gender, mf.number, mf.person);
00387 }
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398 Tanl::Text::RegExp::Pattern TurkishMorphExtractor::reAFeats("A\\d\\w");
00399 Tanl::Text::RegExp::Pattern TurkishMorphExtractor::rePFeats("P\\d\\w");
00400
00401 void TurkishMorphExtractor::operator() (char const* start, char const* end,
00402 Features& mf) const
00403 {
00404 MorphExtractor::operator() (start, end, mf);
00405 Tanl::Text::RegExp::MatchGroups match(2);
00406
00407 if (reAFeats.match(start, end, match) > 0) {
00408 strnzcpy(mf.person, start + match[1].first + 1, 1);
00409 strnzcpy(mf.number, start + match[1].first + 2, 1);
00410 }
00411
00412 if (MorphExtract)
00413
00414 sprintf(mf.full, "%s%s%s", mf.gender, mf.number, mf.person);
00415 }
00416
00417 }