#include "SentenceReader.h"

#include <iostream>
#include <iomanip>

#include "text/Utf8Utils.h"
#include "Corpus.h"

using namespace std;
using namespace Tanl::Text;

#define CONLL08
#ifdef CONLL08
#include "conf/conf_bool.h"
// configuration option: when set, the gold POS column overrides the predicted one (CoNLL-08)
IXE::conf<bool> useGold("UseGoldPos", false);
#endif

namespace Tanl {

SentenceReader::SentenceReader(istream* is, Corpus* corpus) :
  is(is),
  corpus(corpus)
{ }

// matches a single tab-separated field on the current line
static RegExp::Pattern reTab("([^\t\n]+)");

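/**
 * Reads the next sentence: one token per line, with tab-separated fields
 * as configured in corpus->tokenFields; an empty line (or end of input)
 * terminates the sentence. Link and label fields are collected into
 * TokenLinks; negative heads are placeholders that are resolved to
 * predicate token ids once the whole sentence has been read.
 */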
bool SentenceReader::MoveNext()
{
  string line;
  if (!getline(*is, line) || line.empty())
    return false;

  sentence = new Sentence(&corpus->language);
  vector<int> preds;            // ids of predicate tokens, in order of appearance

  int id = 1;
  RegExp::MatchGroups match(2);

  do {
    Attributes attributes(&corpus->index);
    string form;
    int fields = corpus->tokenFields.size();
    TokenLinks links;
    unordered_map<string, int> linkMap;   // link/label name -> index into links
    int argNo = 0;
    int i = 0;
    char const* cur = line.c_str();
    char const* end = cur + line.size();
    while (reTab.match(cur, end, match) > 0) {
      TokenField const& tf = corpus->tokenFields[i];
      char const* fieldStart = cur + match[1].first;
      int fieldLen = match[1].second - match[1].first;
      string field(fieldStart, fieldLen);

      // the default marker (e.g. "_") stands for an empty value
      if (field == tf.default_)
        field = "";

      if (tf.use != TokenField::ignore) {
        if (!tf.link.empty()) {
          // the field holds the head of a link
          int head = field.empty() ? -1 : atoi(field.c_str());
          if (linkMap.find(tf.link) == linkMap.end()) {
            // first part of the link seen: create it
            linkMap[tf.link] = links.size();
            links.push_back(TokenLink(head));
          } else
            // complete the link started by its label field
            links[linkMap[tf.link]].head = head;
        } else if (!tf.label.empty()) {
          if (field.empty())
            --argNo;
          else {
            // the field holds the label of a link
            if (linkMap.find(tf.label) == linkMap.end()) {
              // first part of the link seen: create it with a placeholder head
              linkMap[tf.label] = links.size();
              // negative head: refers to the n-th predicate, resolved below
              links.push_back(TokenLink(--argNo, field.c_str()));
            } else
              // complete the link started by its head field
              links[linkMap[tf.label]].label = field;
          }
        } else {
          attributes[i] = field;
#         ifdef CONLL08
          // with UseGoldPos, the gold POS column replaces the predicted one
          if (useGold && i == 7 && !attributes[3].empty())
            attributes[7] = attributes[3];
#         endif
          switch (tf.role) {
          case TokenField::form:
            form = field; break;
          case TokenField::predicate:
            if (!field.empty())
              preds.push_back(id);
            break;
          default:
            break;
          }
        }
      }
      i++;
      cur += match[0].second;
      if (i == fields || cur == end)
        break;
    }
    TreeToken* token = new TreeToken(id++, form, attributes, links);
    sentence->push_back(token);
  } while (getline(*is, line) && !line.empty());

  // resolve placeholder heads: -n refers to the n-th predicate token
  FOR_EACH (Sentence, *sentence, sit) {
    TO_EACH (TokenLinks, (*sit)->token->links, tit) {
      if (tit->head < 0)
        tit->head = preds[-tit->head - 1];
    }
  }
  return true;
}

Sentence* SentenceReader::Current()
{
  return sentence;
}

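// A CoNLL-X token line has up to ten tab-separated columns:
//   ID FORM LEMMA CPOSTAG POSTAG FEATS HEAD DEPREL PHEAD PDEPREL
// where "_" marks an empty value and the last four columns may be missing.
// An illustrative line (not taken from any particular corpus):
//   1	Pierre	Pierre	NNP	NNP	_	2	NMOD	_	_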
static RegExp::Pattern reCoNLL("(\\d+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t\n]+)(?:\t([^\t]+)\t([^\t]+)(?:\t([^\t]+)\t([^\t]+)\t?)?)?");

ConllXSentenceReader::ConllXSentenceReader(istream* is, Corpus* corpus) :
  SentenceReader(is, corpus),
  morphExtractor(*corpus->language.morphExtractor)
{ }

bool ConllXSentenceReader::MoveNext()
{
  string line;
  if (!getline(*is, line) || line.empty())
    return false;

  sentence = new Sentence(&corpus->language);
  MorphExtractor::Features mf;
  vector<char const*>& names = corpus->index.names;

  RegExp::MatchGroups match(11);
  do {
    int matches = reCoNLL.match(line, match);
    if (matches > 0) {
      char const* start = line.c_str();
      // extract morphological features from the FEATS column (group 6)
      morphExtractor(start + match[6].first, start + match[6].second, mf);
      Attributes attributes(&corpus->index);
      int id = atoi(start + match[1].first);
      attributes.insert(names[0], string(start + match[1].first, match[1].second - match[1].first));
      string form(start + match[2].first, match[2].second - match[2].first);
      attributes.insert(names[1], form);
      int head = 0;
      string deprel;
      for (int i = 3; i < matches; i++) {
        char const* tagStart = start + match[i].first;
        int tagLen = match[i].second - match[i].first;
        string attr;
        // "_" denotes an empty value
        if (tagLen != 1 || tagStart[0] != '_')
          attr = string(tagStart, tagLen);
        attributes.insert(names[i-1], attr);
        if (i == 7)                 // HEAD column
          head = atoi(tagStart);
        else if (i == 8)            // DEPREL column
          deprel = attr;
      }
      // a token may not be its own head
      if (id == head)
        head = 0;
      TokenLinks links(1, TokenLink(head, deprel.c_str()));
      TreeToken* token = new TreeToken(id, form, attributes, links);
      token->token->morpho.set(mf);
      sentence->push_back(token);
    }
  } while (getline(*is, line) && !line.empty());

  return true;
}

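/**
 * Reads a sentence from a DGA-style XML stream: each sentence is an <s>
 * element containing <tok> elements; a token carries an "id" attribute, an
 * <orth> child with the word form, an optional self-closing <dep> child
 * with "head" and "type" attributes, and any other child element is stored
 * as a token attribute. An illustrative fragment (not taken from any
 * particular corpus):
 *
 *   <s>
 *     <tok id="1">
 *       <orth>Pierre</orth>
 *       <pos>NNP</pos>
 *       <dep head="2" type="NMOD"/>
 *     </tok>
 *   </s>
 */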
DgaSentenceReader::DgaSentenceReader(istream* is, Corpus* corpus) :
  SentenceReader(is, corpus),
  reader(*is)
{ }

bool DgaSentenceReader::MoveNext()
{
  // expect the start of a sentence element
  if (!reader.Read() || reader.NodeType != Tanl::XML::Element ||
      reader.Name != "s")
    return false;

  sentence = new Sentence(&corpus->language);
  while (reader.Read()) {
    if (reader.NodeType == Tanl::XML::Whitespace)
      continue;
    if (reader.NodeType == Tanl::XML::EndElement)
      break;                    // </s>
    if (reader.NodeType == Tanl::XML::Element &&
        reader.Name == "tok") {
      Attributes attributes(&corpus->index);
      int id = 0;
      string form;
      int head = 0;
      string deprel;
      while (reader.MoveToNextAttribute()) {
        if (reader.Name == "id")
          id = atoi(reader.Value.c_str());
      }
      // read the children of <tok>
      while (reader.Read()) {
        if (reader.NodeType == Tanl::XML::Whitespace)
          continue;
        if (reader.NodeType == Tanl::XML::EndElement)
          break;                // </tok>
        if (reader.NodeType != Tanl::XML::Element)
          goto fail;
        string& name = reader.Name;
        if (name == "dep") {
          // dependency link: head and type attributes
          while (reader.MoveToNextAttribute()) {
            string& name = reader.Name;
            if (name == "head")
              head = atoi(reader.Value.c_str());
            else if (name == "type")
              deprel = reader.Value;
          }
        } else if (name == "orth") {
          // word form: <orth>form</orth>
          if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
            goto fail;
          form = reader.Value;
          if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
              reader.Name != "orth")
            goto fail;
        } else {
          // any other child becomes a token attribute: <tag>value</tag>
          string tag = reader.Name;
          if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
            goto fail;
          attributes.insert(tag.c_str(), reader.Value);
          if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
              reader.Name != tag)
            goto fail;
        }
      }
      // the inner loop must have stopped at </tok>
      if (reader.Name != "tok")
        goto fail;
      TokenLinks links(1, TokenLink(head, deprel.c_str()));
      TreeToken* tok = new TreeToken(id, form, attributes, links);
      sentence->push_back(tok);
    } else
      goto fail;
  }

  // the outer loop must have stopped at </s>
  if (reader.Name == "s")
    return true;
 fail:
  delete sentence;
  sentence = 0;
  return false;
}

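/**
 * Reads sentences from plain tokenized text: whitespace-separated tokens,
 * possibly spanning several lines, with an empty line ending the sentence.
 */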
TokenSentenceReader::TokenSentenceReader(istream* is, Corpus* corpus) :
  SentenceReader(is, corpus)
{
# ifdef STEMMER
  if (corpus && corpus->language)
    stemmer = sb_stemmer_new(corpus->language, 0);
# endif
}

// matches the next whitespace-delimited token
RegExp::Pattern TokenSentenceReader::reTok("\\s*([^\\s]+)");

bool TokenSentenceReader::MoveNext()
{
  string line;
  if (!getline(*is, line) || line.empty())
    return false;

  sentence = new Sentence(&corpus->language);
  int id = 1;
  RegExp::MatchGroups matches(2);
  do {
    char const* cur = line.c_str();
    char const* endSent = cur + line.size();
    while (reTok.match(cur, endSent, matches) > 0) {
      char const* tokStart = cur + matches[1].first;
      int tokSize = matches[1].second - matches[1].first;
      cur += matches[0].second;
      string form(tokStart, tokSize);
      TreeToken* tok = new TreeToken(id++, form.c_str(), &corpus->index);
      sentence->push_back(tok);
    }
  } while (getline(*is, line) && line.size());
  return true;
}

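/**
 * Wraps another SentenceReader and, when a tagger is configured, adds or
 * completes the POS and LEMMA attributes of each token from the tagger output.
 */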
bool TaggedSentenceReader::MoveNext()
{
  if (!reader->MoveNext())
    return false;
  sentence = reader->Current();
  if (tagger) {
    // collect the forms and any POS tags already present
    Parser::Tagged tagSentence;
    FOR_EACH (Sentence, *sentence, sit) {
      TreeToken* token = *sit;
      tagSentence.words.push_back(token->token->form.c_str());
      string const* pos = token->get("POS");
      tagSentence.tags.push_back(pos ? pos->c_str() : 0);
    }
    if (tagger->tag(tagSentence)) {
      for (unsigned i = 0; i < sentence->size(); i++) {
        (*sentence)[i]->set("POS", tagSentence.tags[i]);
        TreeToken* node = (*sentence)[i];
        string const* lemma = node->get("LEMMA");
        if (lemma && *lemma == "") {
          // fill in missing lemmas from the tagger output
          char const* tagged = tagSentence.lemmas[i];
          if (strcmp(tagged, "<unknown>"))
            node->set("LEMMA", tagged);
          else
            node->set("LEMMA", node->token->form);
        }
      }
    }
  }
  return true;
}

} // namespace Tanl