00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "platform.h"
00025
00026 #include "Sentence.h"
00027 #include "Corpus.h"
00028
00029
00030 #include <iostream>
00031 #include <iomanip>
00032
00033
00034 #include "conf/conf_bool.h"
00035
00036 using namespace std;
00037 using namespace Parser;
00038
00039 namespace Tanl {
00040
// Definition of the static member Attributes::emptyAttrIndex: one
// default-constructed AttributeIndex allocated on the heap during static
// initialization. Nothing in this file deletes it.
// NOTE(review): presumably the index shared by Attributes objects that carry
// no attributes -- confirm against the declaration in the header.
00041 AttributeIndex* Attributes::emptyAttrIndex = new AttributeIndex();
00042
00043 std::ostream& operator <<(std::ostream& os, AttributeIndex const& ai)
00044 {
00045 FOR_EACH (Tanl::Text::WordIndex, ai, it)
00046 os << '<' << it->first << ", " << it->second << "> ";;
00047 return os << std::endl;
00048 }
00049
00050 ostream& operator <<(ostream& os, Token const& tok)
00051 {
00052 os << "<tok";
00053 FOR_EACH(Attributes, tok.attributes, fit) {
00054 os << "\" " << (*fit).first << "=\"" << (*fit).second;
00055 }
00056 return os << '>';
00057 }
00058
00059 void Token::print(ostream& os, int indent) const
00060 {
00061 os << setw(indent) << ' ' << "<tok "
00062 << "form=\"" << attributes.get("FORM")
00063 << "\" lemma=\"" << attributes.get("LEMMA")
00064 << "\" cpos=\"" << attributes.get("CPOS")
00065 << "\" pos=\"" << attributes.get("POS")
00066 << "\" morph=\"" << attributes.get("FEATS")
00067 << "\">" << endl;
00068 }
00069
00070 //======================================================================
00071 // Token
00072
// Names of the attributes read by the Token predicates and accessors below.
// CONLL08 is defined unconditionally on the next line, so the first set of
// names (DEPREL / SPLIT_LEMMA / PPOSS) is the one that gets compiled; the
// #else set (DEPREL / LEMMA / CPOS) is currently inactive.
00073 #define CONLL08 // FIXME: for the time being
00074 #ifdef CONLL08
00075 static char const* DeprelTag = "DEPREL";
00076 static char const* LemmaTag = "SPLIT_LEMMA";
00077 static char const* PosTag = "PPOSS";
00078 #else
00079 static char const* DeprelTag = "DEPREL";
00080 static char const* LemmaTag = "LEMMA";
00081 static char const* PosTag = "CPOS";
00082 #endif
00083
00084 bool Token::isNoun(Language const* lang)
00085 {
00086 string const* pos = get(PosTag);
00087 return pos && strStartsWith(pos->c_str(), lang->nounCPos);
00088 }
00089
00090 bool Token::isPreposition(Language const* lang)
00091 {
00092 string const* pos = get(PosTag);
00093 return pos && strStartsWith(pos->c_str(), lang->prepCPos);
00094 }
00095
00096 bool Token::isTime(Language const* lang)
00097 {
00098 // from gold deprel
00099 string const* label = ((Token*)this)->get(DeprelTag);
00100 return label && strStartsWith(label->c_str(),lang->timeDep);
00101 }
00102
00103 bool Token::isLocation(Language const* lang)
00104 {
00105 // from gold deprel
00106 string const* label = ((Token*)this)->get(DeprelTag);
00107 return label && strStartsWith(label->c_str(),lang->locDep);
00108 }
00109
00110 bool Token::isVerb(Language const* lang)
00111 {
00112 string const* pos = get(PosTag);
00113 return pos && strStartsWith(pos->c_str(), lang->verbCPos);
00114 }
00115
00116 std::string const* Token::getLemma() const
00117 {
00118 return get(LemmaTag);
00119 }
00120
00121 std::string const* Token::getPos() const
00122 {
00123 return get(PosTag);
00124 }
00125
00126 //======================================================================
00127 // TreeToken
00128
00129 TreeToken* TreeToken::ancestorCPos(char const* pos, Sentence& tokens)
00130 {
00131 for (int nid = linkHead(); nid != 0; ) {
00132 TreeToken* tokn = (TreeToken*)tokens[nid-1];
00133 std::string const* attr = tokn->get("CPOS");
00134 if (attr && *attr == pos)
00135 return tokn;
00136 nid = tokn->linkHead();
00137 }
00138 return 0;
00139 }
00140
00141 TreeToken* TreeToken::follow(Parser::TokenPath const& tp, Sentence& sentence)
00142 {
00143 TreeToken* tok = this;
00144 FOR_EACH(vector<TokenPath::Direction>, tp.path, pit) {
00145 switch (*pit) {
00146 case TokenPath::leftChild:
00147 // move to the furthest left child (as in Nivre), if present
00148 tok = tok->left.empty() ? 0 : tok->left.front();
00149 break;
00150
00151 case TokenPath::rightChild:
00152 // move to the furthest right child (as in Nivre), if present
00153 tok = tok->right.empty() ? 0 : tok->right.front();
00154 break;
00155
00156 case TokenPath::parent:
00157 if (tok->linkHead() == 0) {
00158 tok = 0;
00159 break;
00160 }
00161 tok = sentence[tok->linkHead() - 1];
00162 break;
00163
00164 case TokenPath::leftSibling: {
00165 if (tok->linkHead() == 0) {
00166 tok = 0;
00167 break;
00168 }
00169 TreeToken* parent = sentence[tok->linkHead() - 1];
00170 TreeToken* sibl = 0;
00171 // find token following tok in left children
00172 FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00173 if (*cit == tok) {
00174 if (++cit != parent->left.end()) {
00175 sibl = *cit;
00176 break;
00177 }
00178 }
00179 }
00180 if (sibl) {
00181 tok = sibl;
00182 break;
00183 }
00184 // find token preceding tok in right children
00185 FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00186 if (*cit == tok) {
00187 tok = 0;
00188 break;
00189 }
00190 sibl = *cit;
00191 }
00192 tok = (tok == 0) ? sibl : 0;
00193 break;
00194 }
00195
00196 case TokenPath::rightSibling: {
00197 if (tok->linkHead() == 0) {
00198 tok = 0;
00199 break;
00200 }
00201 TreeToken* parent = sentence[tok->linkHead() - 1];
00202 TreeToken* sibl = 0;
00203 // find token following tok in right children
00204 FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00205 if (*cit == tok) {
00206 if (++cit != parent->left.end()) {
00207 sibl = *cit;
00208 break;
00209 }
00210 }
00211 }
00212 if (sibl) {
00213 tok = sibl;
00214 break;
00215 }
00216 // find token preceding tok in left children
00217 FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00218 if (*cit == tok) {
00219 tok = 0;
00220 break;
00221 }
00222 sibl = *cit;
00223 }
00224 tok = (tok == 0) ? sibl : 0;
00225 break;
00226 }
00227
00228 case TokenPath::previous:
00229 tok = (tok->id > 1) ? sentence[tok->id - 2] : 0;
00230 break;
00231
00232 case TokenPath::next:
00233 tok = (tok->id < sentence.size()) ? sentence[tok->id] : 0;
00234 break;
00235 }
00236
00237 if (!tok) break;
00238 }
00239 return tok;
00240 }
00241
00242 void TreeToken::print(ostream& os, int indent) const
00243 {
00244 Attributes& attributes = token->attributes;
00245 os << setw(indent) << ' ' << "<tok id=\"" << id
00246 << "\" form=\"" << attributes.get("FORM")
00247 << "\" lemma=\"" << attributes.get("LEMMA")
00248 << "\" cpos=\"" << attributes.get("CPOS")
00249 << "\" pos=\"" << attributes.get("POS")
00250 << "\" morph=\"" << attributes.get("FEATS")
00251 << "\" head=\"" << linkHead() << "\" deprel=\"" << linkLabel() << "\">" << endl;
00252 indent += 2;
00253 FOR_EACH (vector<TreeToken*>, left, it)
00254 (*it)->print(os, indent);
00255 FOR_EACH (vector<TreeToken*>, right, it)
00256 (*it)->print(os, indent);
00257 }
00258
00259 void TreeToken::printTab(ostream& os)
00260 {
00261 Attributes& attributes = token->attributes;
00262 bool first = true;
00263 FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00264 string const* tag = attributes.get(*it);
00265 if (first)
00266 first = false;
00267 else
00268 os << "\t";
00269 os << ((!tag || tag->empty()) ? "_" : *tag);
00270 }
00271 }
00272
00273 void TreeToken::printTab(ostream& os, Corpus const& corpus)
00274 {
00275 Attributes& attributes = token->attributes;
00276 TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00277 bool first = true;
00278 FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00279 if (first)
00280 first = false;
00281 else
00282 os << "\t";
00283 switch (tfit->role) {
00284 case TokenField::head:
00285 os << linkHead(); break;
00286 case TokenField::deprel:
00287 os << (linkLabel().empty() ? tfit->default_ : linkLabel()); break;
00288 default:
00289 if (tfit->use == TokenField::ignore)
00290 os << tfit->default_;
00291 else {
00292 string const* tag = attributes.get(*it);
00293 os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00294 }
00295 }
00296 ++tfit;
00297 }
00298 }
00299
00300 void TreeToken::printConll08(ostream& os, Corpus const& corpus, vector<int>* preds)
00301 {
00302 Attributes& attributes = token->attributes;
00303 TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00304 TokenLinks::const_iterator tlit = token->links.begin();
00305 bool first = true;
00306 FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00307 string const* tag = attributes.get(*it);
00308 if (first)
00309 first = false;
00310 else
00311 os << "\t";
00312 if (!tfit->link.empty())
00313 os << tlit->head;
00314 else if (!tfit->label.empty()) {
00315 if (tlit != token->links.end()) {
00316 os << (tlit->label.empty() ? tfit->default_ : tlit->label);
00317 ++tlit;
00318 }
00319 } else {
00320 os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00321 }
00322 ++tfit;
00323 }
00324 // print links
00325 # ifdef COMPACT
00326 preds = 0;
00327 # endif
00328 if (preds) {
00329 unsigned predNo = 0;
00330 for (; tlit != token->links.end(); ++tlit) {
00331 int head = tlit->head;
00332 while (head != (*preds)[predNo++])
00333 os << "\t_";
00334 os << '\t' << tlit->label;
00335 }
00336 while (predNo++ < preds->size())
00337 os << "\t_";
00338 } else {
00339 for (; tlit != token->links.end(); ++tlit)
00340 os << '\t' << tlit->label << '[' << tlit->head << ']';
00341 }
00342 }
00343
00344 } // namespace Tanl