desr: src/Corpus.cpp Source File

00001 /*
00002 **  DeSR
00003 **  src/Corpus.cpp
00004 ** ----------------------------------------------------------------------
00005 **  Copyright (c) 2008  Giuseppe Attardi (attardi@di.unipi.it).
00006 ** ----------------------------------------------------------------------
00007 **
00008 **  This file is part of DeSR.
00009 **
00010 **  DeSR is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  DeSR is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #include "Corpus.h"
00025 #include "text/XmlReader.h"
00026 #include "SplitSentenceReader.h"
00027 
00028 // library
00029 #include "io/File.h"
00030 
00031 // standard
00032 #include <sstream>
00033 
00034 using namespace std;
00035 
00036 namespace Tanl {
00037 
00038 // ======================================================================
00039 // Corpus
00040 
00041 Corpus::Corpus(Language const& lang, char const* formatFile) :
00042   language(lang)
00043 {
00044   CorpusFormat* cf = parseFormat(formatFile);
00045   if (cf) {
00046     tokenFields = cf->tokenFields;
00047     index = cf->index;
00048     delete cf;                  // no longer needed
00049   } else
00050     throw CorpusFormatError(string("Reading file: ") + formatFile);
00051 }
00052 
00053 Corpus* Corpus::create(Language const& language, char const* inputFormat)
00054 {
00055   Corpus* corpus;
00056   CorpusFactory* cfactory = CorpusMap::get(inputFormat);
00057   if (cfactory)
00058     corpus = cfactory(language, 0); // builtin corpus
00059   else if (IXE::io::File(inputFormat).exists()) {
00060     CorpusFormat* inFormat = Corpus::parseFormat(inputFormat);
00061     if (inFormat) {
00062       cerr << "Bad format file: " << *inputFormat << endl;
00063       return 0;
00064     }
00065     cfactory = CorpusMap::get(inFormat->name.c_str());
00066     if (cfactory)
00067       corpus = cfactory(language, inFormat);
00068     else
00069       corpus = new Corpus(language, *inFormat); // default tab format
00070   }
00071   return corpus;
00072 }
00073 
00074 CorpusFormat* Corpus::parseFormat(char const* formatFile)
00075 {
00076   ifstream fmt(formatFile);
00077   return parseFormat(fmt);
00078 }
00079 
00080 CorpusFormat* Corpus::parseFormat(istream& fmt)
00081 {
00082   Tanl::XML::XmlReader reader(fmt);
00083   CorpusFormat* format = new CorpusFormat();
00084   AttributeIndex& index = format->index;
00085   while (reader.Read()) {
00086     switch (reader.NodeType) {
00087     case Tanl::XML::Element:
00088       string& name = reader.Name;
00089       if (name == "CorpusFormat") {
00090         if (reader.MoveToFirstAttribute()) {
00091           do {
00092             string& name = reader.Name;
00093             string& value = reader.Value;
00094             if (name == "name") {
00095               format->name = value;
00096             }
00097           } while (reader.MoveToNextAttribute());
00098         }
00099       } else if (name == "field") {
00100         // read field
00101         TokenField field;
00102         if (reader.MoveToFirstAttribute()) {
00103           do {
00104             string& name = reader.Name;
00105             string& value = reader.Value;
00106             if (name == "name") {
00107               field.name = value;
00108               index.insert(value.c_str()); // names of attributes go into index
00109             } else if (name == "use")
00110               field.use = (value == "INPUT" ? TokenField::input :
00111                            (value == "OUTPUT" ? TokenField::output :
00112                             (value == "ECHO" ? TokenField::echo :
00113                              (value == "IGNORE" ? TokenField::ignore :
00114                               TokenField::input))));
00115             else if (name == "value")
00116               field.value = (value == "STRING" ? TokenField::string :
00117                              (value == "INTEGER" ? TokenField::integer : TokenField::string));
00118             else if (name == "role")
00119               field.role = (value == "FORM" ? TokenField::form :
00120                             (value == "HEAD" ? TokenField::head :
00121                              (value == "DEPREL" ? TokenField::deprel :
00122                               (value == "PREDICATE" ? TokenField::predicate :
00123                                TokenField::none))));
00124             else if (name == "link")
00125               field.link = value;
00126             else if (name == "label")
00127               field.label = value;
00128             else if (name == "default")
00129               field.default_ = value;
00130           } while (reader.MoveToNextAttribute());
00131         }
00132         format->tokenFields.push_back(field);
00133       } else {
00134         delete format;
00135         return 0;
00136       }
00137     }
00138   }
00139   return format;
00140 }
00141 
00142 SentenceReader* Corpus::sentenceReader(istream* is) {
00143   return new SentenceReader(is, this);
00144 }
00145 
00146 void Corpus::print(ostream& os, Sentence const& sent) const
00147 {
00148   FOR_EACH (Sentence, sent, tit) {
00149     (*tit)->printTab(os, *this);
00150     os << endl;
00151   }
00152 }
00153 
00154 // ======================================================================
00155 // Conll08Corpus
00156 
00157 void Conll08Corpus::print(ostream& os, Sentence const& sent) const
00158 {
00159   // collect attributes except PRED:
00160   vector<int> preds;
00161   FOR_EACH (Sentence, sent, tit) {
00162     if (!(*tit)->get("PRED")->empty())
00163       preds.push_back((*tit)->id);
00164   }
00165   FOR_EACH (Sentence, sent, tit) {
00166     (*tit)->printConll08(os, *this, &preds);
00167     os << endl;
00168   }
00169 }
00170 
00174 Corpus* Conll08CorpusFactory(Language const& lang, CorpusFormat* format)
00175 {
00176   return new Corpus(lang, *format);
00177 }
00178 
00179 REGISTER_CORPUS(conll08, Conll08CorpusFactory);
00180 
00181 // ======================================================================
00182 // CombCorpus
00183 
00190 void CombCorpus::print(ostream& os, Sentence const& sent) const
00191 {
00192   FOR_EACH (Sentence, sent, tit) {
00193     bool first = true;
00194     TreeToken const& node = **tit;
00195     Token const& tok = *node.token;
00196     // print ARG if present and not reflexive
00197     TokenLink const* arg = 0;
00198     if (!tok.links.empty()) {
00199       FOR_EACH (TokenLinks, tok.links, lit)
00200         if (lit->head != node.id) { // non reflexive
00201           arg = &*lit;
00202           break;
00203         }
00204     }
00205     TokenFields::const_iterator tfit = tokenFields.begin();
00206     FOR_EACH (vector<char const*>, index.names, it) {
00207       string const* tag = tok.attributes.get(*it);
00208       if (first)
00209         first = false;
00210       else
00211         os << "\t";
00212       switch (tfit->role) {
00213       case TokenField::head:
00214           os << node.linkHead();        // dont change or else cycles may arise
00215         break;
00216       case TokenField::deprel:
00217         if (arg && arg->label.substr(0, 2) != "SU") // SU are not scored
00218 #         ifdef PATH_DIFF
00219           {
00220           if (arg->head == node.linkHead())
00221             os << arg->label;
00222           else {
00223             // tree distance
00224             int upDep = node.linkHead();
00225             int upArg = arg->head;
00226             if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00227               os << arg->label << "!"; // one down
00228               break;
00229             } else if (upArg && upDep == sent[upArg-1]->linkHead()) {
00230               os << arg->label << "!^"; // one down, one up
00231               break;
00232             } else if (upDep == node.id) {
00233               os << arg->label << "!="; // one reverse down
00234               break;
00235             } else if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00236               os << arg->label << "!!"; // two down
00237               break;
00238             } else if (upArg && upDep == sent[upArg-1]->linkHead()) {
00239               os << arg->label << "!!^"; // two down, one up
00240               break;
00241             } else if (upDep == node.id) {
00242               os << arg->label << "!!="; // two reverse down
00243               break;
00244             } else if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00245               os << arg->label << "!!!"; // three down
00246               break;
00247             } else if (upDep == node.id) {
00248               os << arg->label << "!!!="; // three reverse down
00249               break;
00250             } else if (upDep && (upDep = sent[upDep-1]->linkHead()) == arg->head) {
00251               os << arg->label << "!!!!"; // four down
00252               break;
00253             } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00254               os << arg->label << "^"; // one up
00255               break;
00256             } else if (upArg == node.id) {
00257               os << arg->label << "^="; // one reverse up
00258               break;
00259             } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00260               os << arg->label << "^^"; // two up
00261               break;
00262             } else if (upArg == node.id) {
00263               os << arg->label << "^^="; // two reverse up
00264               break;
00265             } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00266               os << arg->label << "^^^"; // three up
00267               break;
00268             } else if (upArg == node.id) {
00269               os << arg->label << "^^^="; // three reverse up
00270               break;
00271             } else if (upArg && (upArg = sent[upArg-1]->linkHead()) == node.linkHead()) {
00272               os << arg->label << "^^^^"; // four up
00273               break;
00274             }
00275             // linear distance
00276             // if root token, consider distance from itself
00277             int dist = arg->head - (node.linkHead() ? node.linkHead() : node.id);
00278             char buf[20];
00279             snprintf(buf, sizeof(buf), "%s%+i",
00280                      arg->label.c_str(), dist);
00281             os << buf;
00282           }
00283         }
00284 #       else
00285           os << arg->label;
00286 #       endif
00287         else
00288           os << node.linkLabel();
00289         break;
00290       default:
00291         os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00292       }
00293       ++tfit;
00294     }
00295     os << endl;
00296   }
00297 }
00298 
00299 // ======================================================================
00300 // ConllXCorpus
00301 
00302 char const* conllXFormat =
00303   "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
00304   "<CorpusFormat name=\"conllX\">"
00305   "     <field name=\"ID\" use=\"ECHO\" value=\"INTEGER\"/>"
00306   "     <field name=\"FORM\" use=\"INPUT\" value=\"STRING\" role=\"FORM\"/>"
00307   "     <field name=\"LEMMA\" use=\"INPUT\" value=\"STRING\"/>"
00308   "     <field name=\"CPOSTAG\" use=\"INPUT\" value=\"STRING\"/>"
00309   "     <field name=\"POSTAG\" use=\"INPUT\" value=\"STRING\"/>"
00310   "     <field name=\"FEATS\" use=\"INPUT\" value=\"STRING\"/>"
00311   "     <field name=\"HEAD\" use=\"OUTPUT\" link=\"DEP\" role=\"HEAD\"/>"
00312   "     <field name=\"DEPREL\" use=\"OUTPUT\" label=\"DEP\" role=\"DEPREL\"/>"
00313   "     <field name=\"PHEAD\" use=\"IGNORE\" default=\"_\"/>"
00314   "     <field name=\"PDEPREL\" use=\"IGNORE\" default=\"_\"/>"
00315   "</CorpusFormat>";
00316 
00317 ConllXCorpus::ConllXCorpus(Language const& lang) :
00318   Corpus(lang)
00319 {
00320   istringstream iss(conllXFormat);
00321   CorpusFormat* cf = parseFormat(iss);
00322   if (cf) {
00323     tokenFields = cf->tokenFields;
00324     index = cf->index;
00325     delete cf;
00326   } else
00327     throw CorpusFormatError("conllXFormat");
00328 }
00329 
00330   SentenceReader* ConllXCorpus::sentenceReader(istream* is) {
00331   return new ConllXSentenceReader(is, this);
00332 }
00333 
00334 void ConllXCorpus::print(ostream& os, Sentence const& sent) const
00335 {
00336   FOR_EACH (Sentence, sent, tit) {
00337     (*tit)->printTab(os, *this);
00338     os << endl;
00339   }
00340 }
00341 
00345 Corpus* ConllXCorpusFactory(Language const& lang, CorpusFormat* format)
00346 {
00347   return new ConllXCorpus(lang);
00348 }
00349 
00350 REGISTER_CORPUS(CoNLL, ConllXCorpusFactory);
00351 
00352 // ======================================================================
00353 // DGACorpus
00354 
00355 char const* DgaFormat =
00356 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
00357 "<CorpusFormat name=\"DGA\">"
00358 "       <field name=\"id\" use=\"ECHO\" value=\"INTEGER\"/>"
00359 "       <field name=\"orth\" use=\"INPUT\" value=\"STRING\" role=\"FORM\"/>"
00360 "       <field name=\"lemma\" use=\"INPUT\" value=\"STRING\"/>"
00361 "       <field name=\"pos\" use=\"INPUT\" value=\"STRING\"/>"
00362 "       <field name=\"gen\" use=\"INPUT\" value=\"STRING\"/>"
00363 "       <field name=\"num\" use=\"INPUT\" value=\"STRING\"/>"
00364 "       <field name=\"per\" use=\"INPUT\" value=\"STRING\"/>"
00365 "       <field name=\"head\" use=\"OUTPUT\" link=\"DEP\" role=\"HEAD\"/>"
00366 "       <field name=\"type\" use=\"OUTPUT\" label=\"DEP\" role=\"DEPREL\"/>"
00367 "</CorpusFormat>";
00368 
00369 DgaCorpus::DgaCorpus(Language const& lang) :
00370   Corpus(lang)
00371 {
00372   istringstream iss(DgaFormat);
00373   CorpusFormat* cf = parseFormat(iss);
00374   if (cf) {
00375     tokenFields = cf->tokenFields;
00376     index = cf->index;
00377     delete cf;
00378   } else
00379     throw CorpusFormatError("DgaFormat");
00380 }
00381 
00382 SentenceReader* DgaCorpus::sentenceReader(istream* is)
00383 {
00384   return new DgaSentenceReader(is, this);
00385 }
00386 
00387 void DgaCorpus::print(ostream& os, Sentence const& sent) const
00388 {
00389   FOR_EACH (Sentence, sent, tit) {
00390     (*tit)->print(os);
00391     os << endl;
00392   }
00393 }
00394 
00398 Corpus* DgaXMLCorpusFactory(Language const& lang, CorpusFormat* format)
00399 {
00400   return new DgaCorpus(lang);
00401 }
00402 
00403 REGISTER_CORPUS(DgaXML, DgaXMLCorpusFactory);
00404 
00405 // ======================================================================
00406 // TextCorpus
00407 
00408 SentenceReader* TextCorpus::sentenceReader(istream* is)
00409 {
00410   return new SplitSentenceReader(is, this);
00411 }
00412 
00416 Corpus* TextCorpusFactory(Language const& lang, CorpusFormat* format)
00417 {
00418   return new TextCorpus(lang);
00419 }
00420 
00421 REGISTER_CORPUS(Text, TextCorpusFactory);
00422 
00423 // ======================================================================
00424 // TokenizedTextCorpus
00425 
00426 SentenceReader* TokenizedTextCorpus::sentenceReader(istream* is)
00427 {
00428   return new TokenSentenceReader(is, this);
00429 }
00430 
00434 Corpus* TokenizedTextCorpusFactory(Language const& lang, CorpusFormat* format)
00435 {
00436   return new TokenizedTextCorpus(lang);
00437 }
00438 
00439 REGISTER_CORPUS(TokenizedText, TokenizedTextCorpusFactory);
00440 
00441 // ======================================================================
00442 // Corpus Factory
00443 
00451 std::map<char const*, CorpusFactory*>& CorpusMap::get()
00452 {
00453   static std::map<char const*, CorpusFactory*> corpusFor;
00454   return corpusFor;
00455 }
00456 
00457 CorpusFactory* CorpusMap::get(char const* type)
00458 {
00459   std::map<char const*, CorpusFactory*>& corpusMap = get();
00460   map<char const*, CorpusFactory*>::const_iterator rit = corpusMap.find(type);
00461   return (rit == corpusMap.end()) ? 0 : rit->second;
00462 }
00463 
00464 } // namespace Tanl