desr: desr: src/Token.cpp Source File

00001 /*
00002 **  DeSR
00003 **  src/Token.cpp
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2005  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of DeSR.
00009 **
00010 **  DeSR is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  DeSR is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #include "platform.h"
00025 
00026 #include "Sentence.h"
00027 #include "Corpus.h"
00028 
00029 // standard
00030 #include <iostream>
00031 #include <iomanip>
00032 
00033 // library
00034 #include "conf/conf_bool.h"
00035 
00036 using namespace std;
00037 using namespace Parser;
00038 
00039 namespace Tanl {
00040 
00041 AttributeIndex* Attributes::emptyAttrIndex = new AttributeIndex();
00042 
00043 std::ostream& operator <<(std::ostream& os, AttributeIndex const& ai)
00044 {
00045   FOR_EACH (Tanl::Text::WordIndex, ai, it)
00046     os << '<' << it->first << ", " << it->second << "> ";;
00047   return os << std::endl;
00048 }
00049 
00050 ostream& operator <<(ostream& os, Token const& tok)
00051 {
00052   os << "<tok";
00053   FOR_EACH(Attributes, tok.attributes, fit) {
00054     os << "\" " << (*fit).first << "=\"" << (*fit).second;
00055   }
00056   return os << '>';
00057 }
00058 
00059 void Token::print(ostream& os, int indent) const
00060 {
00061   os << setw(indent) << ' ' << "<tok "
00062      << "form=\"" << attributes.get("FORM")
00063      << "\" lemma=\"" << attributes.get("LEMMA")
00064      << "\" cpos=\"" << attributes.get("CPOS")
00065      << "\" pos=\"" << attributes.get("POS")
00066      << "\" morph=\"" << attributes.get("FEATS")
00067      << "\">" << endl;
00068 }
00069 
00070 //======================================================================
00071 // Token
00072 
#define CONLL08                 // FIXME: for the time being

// Names of the attributes consulted by the Token predicates and accessors
// below. The dependency-relation name is the same in both configurations;
// the lemma and POS names depend on whether CONLL08 is defined.
static char const* DeprelTag = "DEPREL";
#ifdef CONLL08
static char const* LemmaTag = "SPLIT_LEMMA";
static char const* PosTag = "PPOSS";
#else
static char const* LemmaTag = "LEMMA";
static char const* PosTag = "CPOS";
#endif
00083 
00084 bool Token::isNoun(Language const* lang)
00085 {
00086   string const* pos = get(PosTag);
00087   return pos && strStartsWith(pos->c_str(), lang->nounCPos);
00088 }
00089 
00090 bool Token::isPreposition(Language const* lang)
00091 {
00092   string const* pos = get(PosTag);
00093   return pos && strStartsWith(pos->c_str(), lang->prepCPos);
00094 }
00095 
00096 bool Token::isTime(Language const* lang)
00097 {
00098   // from gold deprel
00099   string const* label = ((Token*)this)->get(DeprelTag);
00100   return label && strStartsWith(label->c_str(),lang->timeDep);
00101 }
00102 
00103 bool Token::isLocation(Language const* lang)
00104 {
00105   // from gold deprel
00106   string const* label = ((Token*)this)->get(DeprelTag);
00107   return label && strStartsWith(label->c_str(),lang->locDep);
00108 }
00109 
00110 bool Token::isVerb(Language const* lang)
00111 {
00112   string const* pos = get(PosTag);
00113   return pos && strStartsWith(pos->c_str(), lang->verbCPos);
00114 }
00115 
00116 std::string const* Token::getLemma() const
00117 {
00118   return get(LemmaTag);
00119 }
00120 
00121 std::string const* Token::getPos() const
00122 {
00123   return get(PosTag);
00124 }
00125 
00126 //======================================================================
00127 // TreeToken
00128 
00129 TreeToken* TreeToken::ancestorCPos(char const* pos, Sentence& tokens)
00130 {
00131   for (int nid = linkHead(); nid != 0; ) {
00132     TreeToken* tokn = (TreeToken*)tokens[nid-1];
00133     std::string const* attr = tokn->get("CPOS");
00134     if (attr && *attr == pos)
00135       return tokn;
00136     nid = tokn->linkHead();
00137   }
00138   return 0;
00139 }
00140 
00141 TreeToken* TreeToken::follow(Parser::TokenPath const& tp, Sentence& sentence)
00142 {
00143   TreeToken* tok = this;
00144   FOR_EACH(vector<TokenPath::Direction>, tp.path, pit) {
00145     switch (*pit) {
00146     case TokenPath::leftChild:
00147       // move to the furthest left child (as in Nivre), if present
00148       tok = tok->left.empty() ? 0 : tok->left.front();
00149       break;
00150 
00151     case TokenPath::rightChild:
00152       // move to the furthest right child (as in Nivre), if present
00153       tok = tok->right.empty() ? 0 : tok->right.front();
00154       break;
00155 
00156     case TokenPath::parent:
00157       if (tok->linkHead() == 0) {
00158         tok = 0;
00159         break;
00160       }
00161       tok = sentence[tok->linkHead() - 1];
00162       break;
00163 
00164     case TokenPath::leftSibling: {
00165       if (tok->linkHead() == 0) {
00166         tok = 0;
00167         break;
00168       }
00169       TreeToken* parent = sentence[tok->linkHead() - 1];
00170       TreeToken* sibl = 0;
00171       // find token following tok in left children
00172       FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00173         if (*cit == tok) {
00174           if (++cit != parent->left.end()) {
00175             sibl = *cit;
00176             break;
00177           }
00178         }
00179       }
00180       if (sibl) {
00181         tok = sibl;
00182         break;
00183       }
00184       // find token preceding tok in right children
00185       FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00186         if (*cit == tok) {
00187           tok = 0;
00188           break;
00189         }
00190         sibl = *cit;
00191       }
00192       tok = (tok == 0) ? sibl : 0;
00193       break;
00194     }
00195 
00196     case TokenPath::rightSibling: {
00197       if (tok->linkHead() == 0) {
00198         tok = 0;
00199         break;
00200       }
00201       TreeToken* parent = sentence[tok->linkHead() - 1];
00202       TreeToken* sibl = 0;
00203       // find token following tok in right children
00204       FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00205         if (*cit == tok) {
00206           if (++cit != parent->left.end()) {
00207             sibl = *cit;
00208             break;
00209           }
00210         }
00211       }
00212       if (sibl) {
00213         tok = sibl;
00214         break;
00215       }
00216       // find token preceding tok in left children
00217       FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00218         if (*cit == tok) {
00219           tok = 0;
00220           break;
00221         }
00222         sibl = *cit;
00223       }
00224       tok = (tok == 0) ? sibl : 0;
00225       break;
00226     }
00227 
00228     case TokenPath::previous:
00229       tok =  (tok->id > 1) ? sentence[tok->id - 2] : 0;
00230       break;
00231 
00232     case TokenPath::next:
00233       tok = (tok->id < sentence.size()) ? sentence[tok->id] : 0;
00234       break;
00235     }
00236 
00237     if (!tok) break;
00238   }
00239   return tok;
00240 }
00241 
00242 void TreeToken::print(ostream& os, int indent) const
00243 {
00244   Attributes& attributes = token->attributes;
00245   os << setw(indent) << ' ' << "<tok id=\"" << id
00246      << "\" form=\"" << attributes.get("FORM")
00247      << "\" lemma=\"" << attributes.get("LEMMA")
00248      << "\" cpos=\"" << attributes.get("CPOS")
00249      << "\" pos=\"" << attributes.get("POS")
00250      << "\" morph=\"" << attributes.get("FEATS")
00251      << "\" head=\"" << linkHead() << "\" deprel=\"" << linkLabel() << "\">" << endl;
00252   indent += 2;
00253   FOR_EACH (vector<TreeToken*>, left, it)
00254     (*it)->print(os, indent);
00255   FOR_EACH (vector<TreeToken*>, right, it)
00256     (*it)->print(os, indent);
00257 }
00258 
00259 void TreeToken::printTab(ostream& os)
00260 {
00261   Attributes& attributes = token->attributes;
00262   bool first = true;
00263   FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00264     string const* tag = attributes.get(*it);
00265     if (first)
00266       first = false;
00267     else
00268       os << "\t";
00269     os << ((!tag || tag->empty()) ? "_" : *tag);
00270   }
00271 }
00272 
00273 void TreeToken::printTab(ostream& os, Corpus const& corpus)
00274 {
00275   Attributes& attributes = token->attributes;
00276   TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00277   bool first = true;
00278   FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00279     if (first)
00280       first = false;
00281     else
00282       os << "\t";
00283     switch (tfit->role) {
00284     case TokenField::head:
00285       os << linkHead(); break;
00286     case TokenField::deprel:
00287       os << (linkLabel().empty() ? tfit->default_ : linkLabel()); break;
00288     default:
00289       if (tfit->use == TokenField::ignore)
00290         os << tfit->default_;
00291       else {
00292         string const* tag = attributes.get(*it);
00293         os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00294     }
00295     }
00296     ++tfit;
00297   }
00298 }
00299 
00300 void TreeToken::printConll08(ostream& os, Corpus const& corpus, vector<int>* preds)
00301 {
00302   Attributes& attributes = token->attributes;
00303   TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00304   TokenLinks::const_iterator tlit = token->links.begin();
00305   bool first = true;
00306   FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00307     string const* tag = attributes.get(*it);
00308     if (first)
00309       first = false;
00310     else
00311       os << "\t";
00312     if (!tfit->link.empty())
00313       os << tlit->head;
00314     else if (!tfit->label.empty()) {
00315       if (tlit != token->links.end()) {
00316         os << (tlit->label.empty() ? tfit->default_ : tlit->label);
00317         ++tlit;
00318       }
00319     } else {
00320       os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00321     }
00322     ++tfit;
00323   }
00324   // print links
00325 # ifdef COMPACT
00326   preds = 0;
00327 # endif
00328   if (preds) {
00329     unsigned predNo = 0;
00330     for (; tlit != token->links.end(); ++tlit) {
00331       int head = tlit->head;
00332       while (head != (*preds)[predNo++])
00333         os << "\t_";
00334       os << '\t' << tlit->label;
00335     }
00336     while (predNo++ < preds->size())
00337       os << "\t_";
00338   } else {
00339     for (; tlit != token->links.end(); ++tlit)
00340       os << '\t' << tlit->label << '[' << tlit->head << ']';
00341   }
00342 }
00343 
00344 } // namespace Tanl