00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "Corpus.h"
00025 #include "text/XmlReader.h"
00026 #include "SplitSentenceReader.h"
00027
00028
00029 #include "io/File.h"
00030
00031
00032 #include <sstream>
00033
00034 using namespace std;
00035
00036 namespace Tanl {
00037
00038
00039
00040
00041 Corpus::Corpus(Language const& lang, char const* formatFile) :
00042 language(lang)
00043 {
00044 CorpusFormat* cf = parseFormat(formatFile);
00045 if (cf) {
00046 tokenFields = cf->tokenFields;
00047 index = cf->index;
00048 delete cf;
00049 } else
00050 throw CorpusFormatError(string("Reading file: ") + formatFile);
00051 }
00052
00053 Corpus* Corpus::create(Language const& language, char const* inputFormat)
00054 {
00055 Corpus* corpus;
00056 CorpusFactory* cfactory = CorpusMap::get(inputFormat);
00057 if (cfactory)
00058 corpus = cfactory(language, 0);
00059 else if (IXE::io::File(inputFormat).exists()) {
00060 CorpusFormat* inFormat = Corpus::parseFormat(inputFormat);
00061 if (inFormat) {
00062 cerr << "Bad format file: " << *inputFormat << endl;
00063 return 0;
00064 }
00065 cfactory = CorpusMap::get(inFormat->name.c_str());
00066 if (cfactory)
00067 corpus = cfactory(language, inFormat);
00068 else
00069 corpus = new Corpus(language, *inFormat);
00070 }
00071 return corpus;
00072 }
00073
00074 CorpusFormat* Corpus::parseFormat(char const* formatFile)
00075 {
00076 ifstream fmt(formatFile);
00077 return parseFormat(fmt);
00078 }
00079
00080 CorpusFormat* Corpus::parseFormat(istream& fmt)
00081 {
00082 Tanl::XML::XmlReader reader(fmt);
00083 CorpusFormat* format = new CorpusFormat();
00084 AttributeIndex& index = format->index;
00085 while (reader.Read()) {
00086 switch (reader.NodeType) {
00087 case Tanl::XML::Element:
00088 string& name = reader.Name;
00089 if (name == "CorpusFormat") {
00090 if (reader.MoveToFirstAttribute()) {
00091 do {
00092 string& name = reader.Name;
00093 string& value = reader.Value;
00094 if (name == "name") {
00095 format->name = value;
00096 }
00097 } while (reader.MoveToNextAttribute());
00098 }
00099 } else if (name == "field") {
00100
00101 TokenField field;
00102 if (reader.MoveToFirstAttribute()) {
00103 do {
00104 string& name = reader.Name;
00105 string& value = reader.Value;
00106 if (name == "name") {
00107 field.name = value;
00108 index.insert(value.c_str());
00109 } else if (name == "use")
00110 field.use = (value == "INPUT" ? TokenField::input :
00111 (value == "OUTPUT" ? TokenField::output :
00112 (value == "ECHO" ? TokenField::echo :
00113 (value == "IGNORE" ? TokenField::ignore :
00114 TokenField::input))));
00115 else if (name == "value")
00116 field.value = (value == "STRING" ? TokenField::string :
00117 (value == "INTEGER" ? TokenField::integer : TokenField::string));
00118 else if (name == "role")
00119 field.role = (value == "FORM" ? TokenField::form :
00120 (value == "HEAD" ? TokenField::head :
00121 (value == "DEPREL" ? TokenField::deprel :
00122 (value == "PREDICATE" ? TokenField::predicate :
00123 TokenField::none))));
00124 else if (name == "link")
00125 field.link = value;
00126 else if (name == "label")
00127 field.label = value;
00128 else if (name == "default")
00129 field.default_ = value;
00130 } while (reader.MoveToNextAttribute());
00131 }
00132 format->tokenFields.push_back(field);
00133 } else {
00134 delete format;
00135 return 0;
00136 }
00137 }
00138 }
00139 return format;
00140 }
00141
00142 SentenceReader* Corpus::sentenceReader(istream* is) {
00143 return new SentenceReader(is, this);
00144 }
00145
00146 void Corpus::print(ostream& os, Sentence const& sent) const
00147 {
00148 FOR_EACH (Sentence, sent, tit) {
00149 (*tit)->printTab(os, *this);
00150 os << endl;
00151 }
00152 }
00153
00154
00155
00156
00157 void Conll08Corpus::print(ostream& os, Sentence const& sent) const
00158 {
00159
00160 vector<int> preds;
00161 FOR_EACH (Sentence, sent, tit) {
00162 if (!(*tit)->get("PRED")->empty())
00163 preds.push_back((*tit)->id);
00164 }
00165 FOR_EACH (Sentence, sent, tit) {
00166 (*tit)->printConll08(os, *this, &preds);
00167 os << endl;
00168 }
00169 }
00170
00174 Corpus* Conll08CorpusFactory(Language const& lang, CorpusFormat* format)
00175 {
00176 return new Corpus(lang, *format);
00177 }
00178
00179 REGISTER_CORPUS(conll08, Conll08CorpusFactory);
00180
00181
00182
00183
00190 void CombCorpus::print(ostream& os, Sentence const& sent) const
00191 {
00192 FOR_EACH (Sentence, sent, tit) {
00193 bool first = true;
00194 TreeToken const& node = **tit;
00195 Token const& tok = *node.token;
00196
00197 TokenLink const* arg = 0;
00198 if (!tok.links.empty()) {
00199 FOR_EACH (TokenLinks, tok.links, lit)
00200 if (lit->head != node.id) {
00201 arg = &*lit;
00202 break;
00203 }
00204 }
00205 TokenFields::const_iterator tfit = tokenFields.begin();
00206 FOR_EACH (vector<char const*>, index.names, it) {
00207 string const* tag = tok.attributes.get(*it);
00208 if (first)
00209 first = false;
00210 else
00211 os << "\t";
00212 switch (tfit->role) {
00213 case TokenField::head:
00214 os << node.linkHead();
00215 break;
00216 case TokenField::deprel:
00217 if (arg && arg->label.substr(0, 2) != "SU")
00218 # ifdef PATH_DIFF
00219 {
00220 if (arg->head == node.linkHead())
00221 os << arg->label;
00222 else {
00223
00224 int upDep = node.linkHead();
00225 int upArg = arg->head;
00226 if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00227 os << arg->label << "!";
00228 break;
00229 } else if (upArg && upDep == sent[upArg-1]->linkHead()) {
00230 os << arg->label << "!^";
00231 break;
00232 } else if (upDep == node.id) {
00233 os << arg->label << "!=";
00234 break;
00235 } else if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00236 os << arg->label << "!!";
00237 break;
00238 } else if (upArg && upDep == sent[upArg-1]->linkHead()) {
00239 os << arg->label << "!!^";
00240 break;
00241 } else if (upDep == node.id) {
00242 os << arg->label << "!!=";
00243 break;
00244 } else if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00245 os << arg->label << "!!!";
00246 break;
00247 } else if (upDep == node.id) {
00248 os << arg->label << "!!!=";
00249 break;
00250 } else if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00251 os << arg->label << "!!!!";
00252 break;
00253 } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00254 os << arg->label << "^";
00255 break;
00256 } else if (upArg == node.id) {
00257 os << arg->label << "^=";
00258 break;
00259 } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00260 os << arg->label << "^^";
00261 break;
00262 } else if (upArg == node.id) {
00263 os << arg->label << "^^=";
00264 break;
00265 } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00266 os << arg->label << "^^^";
00267 break;
00268 } else if (upArg == node.id) {
00269 os << arg->label << "^^^=";
00270 break;
00271 } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00272 os << arg->label << "^^^^";
00273 break;
00274 }
00275
00276
00277 int dist = arg->head - (node.linkHead() ? node.linkHead() : node.id);
00278 char buf[20];
00279 snprintf(buf, sizeof(buf), "%s%+i",
00280 arg->label.c_str(), dist);
00281 os << buf;
00282 }
00283 }
00284 # else
00285 os << arg->label;
00286 # endif
00287 else
00288 os << node.linkLabel();
00289 break;
00290 default:
00291 os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00292 }
00293 ++tfit;
00294 }
00295 os << endl;
00296 }
00297 }
00298
00299
00300
00301
/**
 * Built-in XML description of the CoNLL-X corpus format, in the syntax
 * accepted by Corpus::parseFormat().
 * The pieces below are concatenated without line breaks; fields are
 * separated by a single blank.
 */
char const* conllXFormat =
  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
  "<CorpusFormat name=\"conllX\"> "
  "<field name=\"ID\" use=\"ECHO\" value=\"INTEGER\"/> "
  "<field name=\"FORM\" use=\"INPUT\" value=\"STRING\" role=\"FORM\"/> "
  "<field name=\"LEMMA\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"CPOSTAG\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"POSTAG\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"FEATS\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"HEAD\" use=\"OUTPUT\" link=\"DEP\" role=\"HEAD\"/> "
  "<field name=\"DEPREL\" use=\"OUTPUT\" label=\"DEP\" role=\"DEPREL\"/> "
  "<field name=\"PHEAD\" use=\"IGNORE\" default=\"_\"/> "
  "<field name=\"PDEPREL\" use=\"IGNORE\" default=\"_\"/>"
  "</CorpusFormat>";
00316
00317 ConllXCorpus::ConllXCorpus(Language const& lang) :
00318 Corpus(lang)
00319 {
00320 istringstream iss(conllXFormat);
00321 CorpusFormat* cf = parseFormat(iss);
00322 if (cf) {
00323 tokenFields = cf->tokenFields;
00324 index = cf->index;
00325 delete cf;
00326 } else
00327 throw CorpusFormatError("conllXFormat");
00328 }
00329
00330 SentenceReader* ConllXCorpus::sentenceReader(istream* is) {
00331 return new ConllXSentenceReader(is, this);
00332 }
00333
00334 void ConllXCorpus::print(ostream& os, Sentence const& sent) const
00335 {
00336 FOR_EACH (Sentence, sent, tit) {
00337 (*tit)->printTab(os, *this);
00338 os << endl;
00339 }
00340 }
00341
00345 Corpus* ConllXCorpusFactory(Language const& lang, CorpusFormat* format)
00346 {
00347 return new ConllXCorpus(lang);
00348 }
00349
00350 REGISTER_CORPUS(CoNLL, ConllXCorpusFactory);
00351
00352
00353
00354
/**
 * Built-in XML description of the DGA corpus format, in the syntax accepted
 * by Corpus::parseFormat().
 * The pieces below are concatenated without line breaks; fields are
 * separated by a single blank.
 */
char const* DgaFormat =
  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
  "<CorpusFormat name=\"DGA\"> "
  "<field name=\"id\" use=\"ECHO\" value=\"INTEGER\"/> "
  "<field name=\"orth\" use=\"INPUT\" value=\"STRING\" role=\"FORM\"/> "
  "<field name=\"lemma\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"pos\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"gen\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"num\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"per\" use=\"INPUT\" value=\"STRING\"/> "
  "<field name=\"head\" use=\"OUTPUT\" link=\"DEP\" role=\"HEAD\"/> "
  "<field name=\"type\" use=\"OUTPUT\" label=\"DEP\" role=\"DEPREL\"/>"
  "</CorpusFormat>";
00368
00369 DgaCorpus::DgaCorpus(Language const& lang) :
00370 Corpus(lang)
00371 {
00372 istringstream iss(DgaFormat);
00373 CorpusFormat* cf = parseFormat(iss);
00374 if (cf) {
00375 tokenFields = cf->tokenFields;
00376 index = cf->index;
00377 delete cf;
00378 } else
00379 throw CorpusFormatError("DgaFormat");
00380 }
00381
00382 SentenceReader* DgaCorpus::sentenceReader(istream* is)
00383 {
00384 return new DgaSentenceReader(is, this);
00385 }
00386
00387 void DgaCorpus::print(ostream& os, Sentence const& sent) const
00388 {
00389 FOR_EACH (Sentence, sent, tit) {
00390 (*tit)->print(os);
00391 os << endl;
00392 }
00393 }
00394
00398 Corpus* DgaXMLCorpusFactory(Language const& lang, CorpusFormat* format)
00399 {
00400 return new DgaCorpus(lang);
00401 }
00402
00403 REGISTER_CORPUS(DgaXML, DgaXMLCorpusFactory);
00404
00405
00406
00407
00408 SentenceReader* TextCorpus::sentenceReader(istream* is)
00409 {
00410 return new SplitSentenceReader(is, this);
00411 }
00412
00416 Corpus* TextCorpusFactory(Language const& lang, CorpusFormat* format)
00417 {
00418 return new TextCorpus(lang);
00419 }
00420
00421 REGISTER_CORPUS(Text, TextCorpusFactory);
00422
00423
00424
00425
00426 SentenceReader* TokenizedTextCorpus::sentenceReader(istream* is)
00427 {
00428 return new TokenSentenceReader(is, this);
00429 }
00430
00434 Corpus* TokenizedTextCorpusFactory(Language const& lang, CorpusFormat* format)
00435 {
00436 return new TokenizedTextCorpus(lang);
00437 }
00438
00439 REGISTER_CORPUS(TokenizedText, TokenizedTextCorpusFactory);
00440
00441
00442
00443
00451 std::map<char const*, CorpusFactory*>& CorpusMap::get()
00452 {
00453 static std::map<char const*, CorpusFactory*> corpusFor;
00454 return corpusFor;
00455 }
00456
00457 CorpusFactory* CorpusMap::get(char const* type)
00458 {
00459 std::map<char const*, CorpusFactory*>& corpusMap = get();
00460 map<char const*, CorpusFactory*>::const_iterator rit = corpusMap.find(type);
00461 return (rit == corpusMap.end()) ? 0 : rit->second;
00462 }
00463
00464 }