#include "SentenceReader.h"

#include <iostream>
#include <iomanip>

#include "text/Utf8Utils.h"
#include "Corpus.h"

using namespace std;
using namespace Tanl::Text;

#define CONLL08
#ifdef CONLL08
#include "conf/conf_bool.h"
// configuration option: when set, the gold POS column overrides the predicted one (CoNLL-08)
IXE::conf<bool> useGold("UseGoldPos", false);
#endif

namespace Tanl {

SentenceReader::SentenceReader(istream* is, Corpus* corpus) :
  is(is),
  corpus(corpus)
{ }

// matches a single tab-separated field on the current line
static RegExp::Pattern reTab("([^\t\n]+)");

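/**
 * Reads the next sentence: one token per line, with tab-separated fields
 * as configured in corpus->tokenFields; an empty line (or end of input)
 * terminates the sentence. Link and label fields are collected into
 * TokenLinks; negative heads are placeholders that are resolved to
 * predicate token ids once the whole sentence has been read.
 */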
bool SentenceReader::MoveNext()
{
  string line;
  if (!getline(*is, line) || line.empty())
    return false;

  sentence = new Sentence(&corpus->language);
  vector<int> preds;            // ids of predicate tokens, in order of appearance

  int id = 1;
  RegExp::MatchGroups match(2);

  do {
    Attributes attributes(&corpus->index);
    string form;
    int fields = corpus->tokenFields.size();
    TokenLinks links;
    unordered_map<string, int> linkMap;   // link/label name -> index into links
    int argNo = 0;
    int i = 0;
    char const* cur = line.c_str();
    char const* end = cur + line.size();
    while (reTab.match(cur, end, match) > 0) {
      TokenField const& tf = corpus->tokenFields[i];
      char const* fieldStart = cur + match[1].first;
      int fieldLen = match[1].second - match[1].first;
      string field(fieldStart, fieldLen);

      // the default marker (e.g. "_") stands for an empty value
      if (field == tf.default_)
        field = "";

      if (tf.use != TokenField::ignore) {
        if (!tf.link.empty()) {
          // the field holds the head of a link
          int head = field.empty() ? -1 : atoi(field.c_str());
          if (linkMap.find(tf.link) == linkMap.end()) {
            // first part of the link seen: create it
            linkMap[tf.link] = links.size();
            links.push_back(TokenLink(head));
          } else
            // complete the link started by its label field
            links[linkMap[tf.link]].head = head;
        } else if (!tf.label.empty()) {
          if (field.empty())
            --argNo;
          else {
            // the field holds the label of a link
            if (linkMap.find(tf.label) == linkMap.end()) {
              // first part of the link seen: create it with a placeholder head
              linkMap[tf.label] = links.size();
              // negative head: refers to the n-th predicate, resolved below
              links.push_back(TokenLink(--argNo, field.c_str()));
            } else
              // complete the link started by its head field
              links[linkMap[tf.label]].label = field;
          }
        } else {
          attributes[i] = field;
#         ifdef CONLL08
          // with UseGoldPos, the gold POS column replaces the predicted one
          if (useGold && i == 7 && !attributes[3].empty())
            attributes[7] = attributes[3];
#         endif
          switch (tf.role) {
          case TokenField::form:
            form = field; break;
          case TokenField::predicate:
            if (!field.empty())
              preds.push_back(id);
            break;
          default:
            break;
          }
        }
      }
      i++;
      cur += match[0].second;
      if (i == fields || cur == end)
        break;
    }
    TreeToken* token = new TreeToken(id++, form, attributes, links);
    sentence->push_back(token);
  } while (getline(*is, line) && !line.empty());

  // resolve placeholder heads: -n refers to the n-th predicate token
  FOR_EACH (Sentence, *sentence, sit) {
    TO_EACH (TokenLinks, (*sit)->token->links, tit) {
      if (tit->head < 0)
        tit->head = preds[-tit->head - 1];
    }
  }
  return true;
}

Sentence* SentenceReader::Current()
{
  return sentence;
}

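// A CoNLL-X token line has up to ten tab-separated columns:
//   ID FORM LEMMA CPOSTAG POSTAG FEATS HEAD DEPREL PHEAD PDEPREL
// where "_" marks an empty value and the last four columns may be missing.
// An illustrative line (not taken from any particular corpus):
//   1	Pierre	Pierre	NNP	NNP	_	2	NMOD	_	_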
static RegExp::Pattern reCoNLL("(\\d+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t\n]+)(?:\t([^\t]+)\t([^\t]+)(?:\t([^\t]+)\t([^\t]+)\t?)?)?");

ConllXSentenceReader::ConllXSentenceReader(istream* is, Corpus* corpus) :
  SentenceReader(is, corpus),
  morphExtractor(*corpus->language.morphExtractor)
{ }

bool ConllXSentenceReader::MoveNext()
{
  string line;
  if (!getline(*is, line) || line.empty())
    return false;

  sentence = new Sentence(&corpus->language);
  MorphExtractor::Features mf;
  vector<char const*>& names = corpus->index.names;

  RegExp::MatchGroups match(11);
  do {
    int matches = reCoNLL.match(line, match);
    if (matches > 0) {
      char const* start = line.c_str();
      // extract morphological features from the FEATS column (group 6)
      morphExtractor(start + match[6].first, start + match[6].second, mf);
      Attributes attributes(&corpus->index);
      int id = atoi(start + match[1].first);
      attributes.insert(names[0], string(start + match[1].first, match[1].second - match[1].first));
      string form(start + match[2].first, match[2].second - match[2].first);
      attributes.insert(names[1], form);
      int head = 0;
      string deprel;
      for (int i = 3; i < matches; i++) {
        char const* tagStart = start + match[i].first;
        int tagLen = match[i].second - match[i].first;
        string attr;
        // "_" denotes an empty value
        if (tagLen != 1 || tagStart[0] != '_')
          attr = string(tagStart, tagLen);
        attributes.insert(names[i-1], attr);
        if (i == 7)                 // HEAD column
          head = atoi(tagStart);
        else if (i == 8)            // DEPREL column
          deprel = attr;
      }
      // a token may not be its own head
      if (id == head)
        head = 0;
      TokenLinks links(1, TokenLink(head, deprel.c_str()));
      TreeToken* token = new TreeToken(id, form, attributes, links);
      token->token->morpho.set(mf);
      sentence->push_back(token);
    }
  } while (getline(*is, line) && !line.empty());

  return true;
}

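/**
 * Reads a sentence from a DGA-style XML stream: each sentence is an <s>
 * element containing <tok> elements; a token carries an "id" attribute, an
 * <orth> child with the word form, an optional self-closing <dep> child
 * with "head" and "type" attributes, and any other child element is stored
 * as a token attribute. An illustrative fragment (not taken from any
 * particular corpus):
 *
 *   <s>
 *     <tok id="1">
 *       <orth>Pierre</orth>
 *       <pos>NNP</pos>
 *       <dep head="2" type="NMOD"/>
 *     </tok>
 *   </s>
 */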
DgaSentenceReader::DgaSentenceReader(istream* is, Corpus* corpus) :
  SentenceReader(is, corpus),
  reader(*is)
{ }

bool DgaSentenceReader::MoveNext()
{
  // expect the start of a sentence element
  if (!reader.Read() || reader.NodeType != Tanl::XML::Element ||
      reader.Name != "s")
    return false;

  sentence = new Sentence(&corpus->language);
  while (reader.Read()) {
    if (reader.NodeType == Tanl::XML::Whitespace)
      continue;
    if (reader.NodeType == Tanl::XML::EndElement)
      break;                    // </s>
    if (reader.NodeType == Tanl::XML::Element &&
        reader.Name == "tok") {
      Attributes attributes(&corpus->index);
      int id = 0;
      string form;
      int head = 0;
      string deprel;
      while (reader.MoveToNextAttribute()) {
        if (reader.Name == "id")
          id = atoi(reader.Value.c_str());
      }
      // read the children of <tok>
      while (reader.Read()) {
        if (reader.NodeType == Tanl::XML::Whitespace)
          continue;
        if (reader.NodeType == Tanl::XML::EndElement)
          break;                // </tok>
        if (reader.NodeType != Tanl::XML::Element)
          goto fail;
        string& name = reader.Name;
        if (name == "dep") {
          // dependency link: head and type attributes
          while (reader.MoveToNextAttribute()) {
            string& name = reader.Name;
            if (name == "head")
              head = atoi(reader.Value.c_str());
            else if (name == "type")
              deprel = reader.Value;
          }
        } else if (name == "orth") {
          // word form: <orth>form</orth>
          if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
            goto fail;
          form = reader.Value;
          if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
              reader.Name != "orth")
            goto fail;
        } else {
          // any other child becomes a token attribute: <tag>value</tag>
          string tag = reader.Name;
          if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
            goto fail;
          attributes.insert(tag.c_str(), reader.Value);
          if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
              reader.Name != tag)
            goto fail;
        }
      }
      // the inner loop must have stopped at </tok>
      if (reader.Name != "tok")
        goto fail;
      TokenLinks links(1, TokenLink(head, deprel.c_str()));
      TreeToken* tok = new TreeToken(id, form, attributes, links);
      sentence->push_back(tok);
    } else
      goto fail;
  }

  // the outer loop must have stopped at </s>
  if (reader.Name == "s")
    return true;
 fail:
  delete sentence;
  sentence = 0;
  return false;
}

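/**
 * Reads sentences from plain tokenized text: whitespace-separated tokens,
 * possibly spanning several lines, with an empty line ending the sentence.
 */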
TokenSentenceReader::TokenSentenceReader(istream* is, Corpus* corpus) :
  SentenceReader(is, corpus)
{
# ifdef STEMMER
  if (corpus && corpus->language)
    stemmer = sb_stemmer_new(corpus->language, 0);
# endif
}

// matches the next whitespace-delimited token
RegExp::Pattern TokenSentenceReader::reTok("\\s*([^\\s]+)");

bool TokenSentenceReader::MoveNext()
{
  string line;
  if (!getline(*is, line) || line.empty())
    return false;

  sentence = new Sentence(&corpus->language);
  int id = 1;
  RegExp::MatchGroups matches(2);
  do {
    char const* cur = line.c_str();
    char const* endSent = cur + line.size();
    while (reTok.match(cur, endSent, matches) > 0) {
      char const* tokStart = cur + matches[1].first;
      int tokSize = matches[1].second - matches[1].first;
      cur += matches[0].second;
      string form(tokStart, tokSize);
      TreeToken* tok = new TreeToken(id++, form.c_str(), &corpus->index);
      sentence->push_back(tok);
    }
  } while (getline(*is, line) && line.size());
  return true;
}

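/**
 * Wraps another SentenceReader and, when a tagger is configured, adds or
 * completes the POS and LEMMA attributes of each token from the tagger output.
 */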
bool TaggedSentenceReader::MoveNext()
{
  if (!reader->MoveNext())
    return false;
  sentence = reader->Current();
  if (tagger) {
    // collect the forms and any POS tags already present
    Parser::Tagged tagSentence;
    FOR_EACH (Sentence, *sentence, sit) {
      TreeToken* token = *sit;
      tagSentence.words.push_back(token->token->form.c_str());
      string const* pos = token->get("POS");
      tagSentence.tags.push_back(pos ? pos->c_str() : 0);
    }
    if (tagger->tag(tagSentence)) {
      for (unsigned i = 0; i < sentence->size(); i++) {
        (*sentence)[i]->set("POS", tagSentence.tags[i]);
        TreeToken* node = (*sentence)[i];
        string const* lemma = node->get("LEMMA");
        if (lemma && *lemma == "") {
          // fill in missing lemmas from the tagger output
          char const* tagged = tagSentence.lemmas[i];
          if (strcmp(tagged, "<unknown>"))
            node->set("LEMMA", tagged);
          else
            node->set("LEMMA", node->token->form);
        }
      }
    }
  }
  return true;
}

} // namespace Tanl