00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef DeSR_Token_H
00025 #define DeSR_Token_H
00026
00027
00028 #include <string>
00029 #include <vector>
00030
00031
00032 #include "text/WordIndex.h"
00033 #include "Common/util.h"
00034
00035
00036 #include "Language.h"
00037 #include "TokenPath.h"
00038
00039 namespace Tanl {
00040
00041 class Sentence;
00042 class Corpus;
00043
00044 typedef unsigned AttributeId;
00045
00049 struct AttributeIndex : public Text::WordIndex
00050 {
00051 std::vector<char const*> names;
00052
00057 AttributeId insert(const char* name) {
00058 AttributeId id = Tanl::Text::WordIndex::insert(name);
00059 if (id >= names.size()) {
00060
00061 names.resize(id + 1);
00062 names[id] = ::strdup(name);
00063 }
00064 return id;
00065 }
00066
00067 AttributeIndex& operator =(AttributeIndex const& ai) {
00068 Tanl::Text::WordIndex::operator=(ai);
00069 for (unsigned i = 0; i < names.size(); i++)
00070 free((void*)names[i]);
00071 names.resize(ai.size());
00072 for (unsigned i = 0; i < ai.names.size(); i++)
00073 names[i] = strdup(ai.names[i]);
00074 return *this;
00075 }
00076
00077 ~AttributeIndex() {
00078 for (unsigned i = 0; i < names.size(); i++)
00079 free((void*)names[i]);
00080 }
00081
00082 };
00083
00084 std::ostream& operator <<(std::ostream& os, AttributeIndex const& ai);
00085
00092 struct Attributes
00093 {
00094 typedef std::string Attribute;
00095
00096 AttributeIndex* attributeIndex;
00097 std::vector<Attribute> values;
00098
00099 static AttributeIndex* emptyAttrIndex;
00100
00101 Attributes(AttributeIndex* attributeIndex) :
00102 attributeIndex(attributeIndex)
00103 {
00104 if (attributeIndex)
00105 values.resize(attributeIndex->size());
00106 else
00107 this->attributeIndex = emptyAttrIndex;
00108 }
00109
00110 Attributes(AttributeIndex* attributeIndex, std::vector<Attribute>& values) :
00111 attributeIndex(attributeIndex),
00112 values(values)
00113 { }
00114
00115 Attribute& operator [](int i) { return values[i]; }
00116
00117 struct const_iterator {
00118 const_iterator(Attributes const& attributes, int fit = 0) :
00119 attributes(attributes),
00120 fit(fit)
00121 { }
00122
00123 bool operator ==(const const_iterator& other) {
00124 return &attributes == &other.attributes && fit == other.fit;
00125 }
00126 bool operator !=(const const_iterator& other) { return !(*this == other); }
00127 const_iterator& operator ++() { fit++; return *this; }
00128 const_iterator operator ++(int) {
00129 const_iterator tmp = *this;
00130 ++*this;
00131 return tmp;
00132 }
00133
00134
00135 std::pair<char const*, std::string const*> operator*() {
00136 std::string const& val = attributes.values[fit];
00137 return std::make_pair(attributes.attributeIndex->names[fit], &val);
00138 }
00139
00140 Attributes const& attributes;
00141 int fit;
00142 };
00143
00144 const_iterator begin() const { return const_iterator(*this); }
00145 const_iterator end() const { return const_iterator(*this, values.size()); }
00146
00150 std::string const* get(std::string& name) const {
00151 AttributeId id = attributeIndex->index(name.c_str());
00152 return (id == Tanl::Text::WordIndex::None) ? 0 : &values[id];
00153 }
00154
00158 std::string const* get(char const* name) const {
00159 AttributeId id = attributeIndex->index(name);
00160 return (id == Tanl::Text::WordIndex::None) ? 0 : &values[id];
00161 }
00162
00166 AttributeId index(char const* key) const {
00167 return attributeIndex->index(key);
00168 }
00169
00173 void insert(char const* key, char const* value) {
00174 AttributeId id = attributeIndex->insert(key);
00175 values.resize(id + 1);
00176 values[id] = value;
00177 }
00178
00182 void insert(char const* key, std::string const& value) {
00183 AttributeId id = attributeIndex->insert(key);
00184 values.resize(id + 1);
00185 values[id] = value;
00186 }
00187 };
00188
/**
 *  A (head, label) pair.
 *  A default constructed link has head -1 and an empty label.
 */
struct TokenLink
{
  int		head;	///< the head of the link, -1 by default
  std::string	label;	///< the label of the link

  /**
   *  @param head	value for @c head.
   *  @param label	C string copied into @c label.
   */
  TokenLink(int head = -1, char const* label = "") :
    head(head), label(label)
  { }
};
00202
00203 typedef std::vector<TokenLink> TokenLinks;
00204
00208 struct Token
00209 {
00210
00212 struct Morpho {
00213 std::string morph;
00214 char Case;
00215 char gender;
00216 char mode;
00217 char negative;
00218 char number;
00219 char person;
00220 char tense;
00221 char transitive;
00222 char extra[20];
00223
00224 Morpho(const char* morph = "", size_t morphSize = 0) :
00225 morph(morph, morphSize),
00226 Case(0),
00227 gender(0),
00228 mode(0),
00229 negative(0),
00230 number(0),
00231 person(0),
00232 tense(0),
00233 transitive(0)
00234 {
00235 extra[0] = '\0';
00236 }
00237
00239 void set(MorphExtractor::Features& mf) {
00240 Case = mf.Case[0];
00241 gender = mf.gender[0];
00242 number = mf.number[0];
00243 person = mf.person[0];
00244 mode = mf.mode[0];
00245 negative = mf.negative[0];
00246 number = mf.number[0];
00247 person = mf.person[0];
00248 tense = mf.tense[0];
00249 transitive = mf.trans[0];
00250 int extraLen = sizeof(extra) - 1;
00251 strncpy(extra, mf.extra, extraLen);
00252 extra[extraLen] = '\0';
00253 }
00254 };
00255
00256 Token(std::string form, Attributes& attributes) :
00257 form(form),
00258 attributes(attributes),
00259 links(1, TokenLink(-1))
00260 { }
00261
00262 Token(std::string form, Attributes& attributes, TokenLinks& links) :
00263 form(form),
00264 attributes(attributes),
00265 links(links)
00266 { }
00267
00268 Token(Attributes& attributes) :
00269 attributes(attributes),
00270 links(1, TokenLink(-1))
00271 { }
00272
00273 Token(std::string& form, AttributeIndex* attributeIndex) :
00274 form(form),
00275 attributes(attributeIndex),
00276 links(1, TokenLink(-1))
00277 { }
00278
00279 Token(char const* form, AttributeIndex* attributeIndex) :
00280 form(form),
00281 attributes(attributeIndex),
00282 links(1, TokenLink(-1))
00283 { }
00284
00285 Token(AttributeIndex* attributeIndex) :
00286 attributes(attributeIndex),
00287 links(1, TokenLink(-1))
00288 { }
00289
00291 std::string const* get(char const* key) const {
00292 return attributes.get(key);
00293 }
00294
00296 void set(char const* key, char const* value) {
00297 attributes.insert(key, value);
00298 }
00299
00301 void set(char const* key, std::string const& value) {
00302 attributes.insert(key, value);
00303 }
00304
00306 int attrIndex(char const* name) const {
00307 std::vector<char const*> const& names = attributes.attributeIndex->names;
00308
00309
00310
00311
00312
00313 unsigned idx = 0;
00314 FOR_EACH (std::vector<char const*>, names, it) {
00315 if (!::strcmp(*it, name))
00316 break;
00317 idx++;
00318 }
00319 return (idx >= names.size()) ? -1 : idx;
00320 }
00321
00322 bool isNoun(Language const* lang);
00323 bool isPreposition(Language const* lang);
00324 bool isTime(Language const* lang);
00325 bool isLocation(Language const* lang);
00326 bool isVerb(Language const* lang);
00327 std::string const* getLemma() const;
00328 std::string const* getPos() const;
00329
00331 void print(std::ostream& os, int indent = 0) const;
00332
00333 std::string form;
00334 Morpho morpho;
00335 Attributes attributes;
00336 TokenLinks links;
00337
00338 };
00339
00343 std::ostream& operator <<(std::ostream& os, Token const& tok);
00344
00348 struct TreeToken
00349 {
00350 TreeToken(std::string& form, AttributeIndex* ai) :
00351 token(new Token(form, ai))
00352 { }
00353
00354 TreeToken(int id, char const* form, AttributeIndex* attributeIndex = 0) :
00355 token(new Token(form, attributeIndex)),
00356 id(id)
00357 { }
00358
00359 TreeToken(int id, std::string& form, Attributes& attributes,
00360 TokenLinks& links) :
00361 token(new Token(form, attributes, links)),
00362 id(id)
00363 { }
00364
00365 TreeToken(int id, std::string& form, Attributes& attributes) :
00366 token(new Token(form, attributes)),
00367 id(id)
00368 { }
00369
00370 TreeToken(int id, Token* token) :
00371 token(token),
00372 id(id)
00373 { }
00374
00375 TreeToken(TreeToken const& tok) :
00376 token(new Token(*tok.token)),
00377 id(tok.id),
00378 left(tok.left),
00379 right(tok.right)
00380 { }
00381
00382 virtual ~TreeToken() { delete token; }
00383
00384 Token* token;
00385 unsigned id;
00386 std::vector<TreeToken*> left;
00387 std::vector<TreeToken*> right;
00388
00390 std::string const* get(char const* key) const {
00391
00392 return (strcmp(key, "DEPREL")) ? token->attributes.get(key) : &linkLabel();
00393 }
00394
00396 void set(char const* key, char const* value) {
00397 token->set(key, value);
00398 }
00399
00401 void set(char const* key, std::string const& value) {
00402 token->set(key, value);
00403 }
00404
00405 int linkHead(unsigned i = 0) const {
00406 return token->links[i].head;
00407 }
00408 int linkHead(int h, unsigned i = 0) {
00409 return token->links[i].head = h;
00410 }
00411 std::string const& linkLabel(unsigned i = 0) const {
00412 return token->links[i].label;
00413 }
00414 std::string const& linkLabel(std::string& l, unsigned i = 0) {
00415 return token->links[i].label = l;
00416 }
00417 std::string const& linkLabel(char const* l, unsigned i = 0) {
00418 return token->links[i].label = l;
00419 }
00420
00422 int size() {
00423 int count = 0;
00424 FOR_EACH(std::vector<TreeToken*>, left, it)
00425 count += (*it)->size();
00426 FOR_EACH(std::vector<TreeToken*>, right, it)
00427 count += (*it)->size();
00428 return count + 1;
00429 }
00430
00436 TreeToken* follow(Parser::TokenPath const& tp, Sentence& sentence);
00437
00438 int leftmostDescendant() {
00439 int lmd = id;
00440 FOR_EACH(std::vector<TreeToken*>, left, it)
00441 lmd = MIN(lmd, (*it)->leftmostDescendant());
00442 return lmd;
00443 }
00444
00445 int rightmostDescendant() {
00446 int lmd = id;
00447 FOR_EACH(std::vector<TreeToken*>, right, it)
00448 lmd = MAX(lmd, (*it)->rightmostDescendant());
00449 return lmd;
00450 }
00451
00455 TreeToken* ancestorCPos(char const* pos, Sentence& sent);
00456
00460 TreeToken* descendantCPos(char const* pos) {
00461
00462 FOR_EACH(std::vector<TreeToken*>, left, it) {
00463 TreeToken* tokn = *it;
00464 std::string const* attr = tokn->get("CPOS");
00465 if (attr && *attr == pos)
00466 return tokn;
00467 tokn = tokn->descendantCPos(pos);
00468 if (tokn)
00469 return tokn;
00470 }
00471 FOR_EACH(std::vector<TreeToken*>, right, it) {
00472 TreeToken* tokn = *it;
00473 std::string const* attr = tokn->get("CPOS");
00474 if (attr && *attr == pos)
00475 return tokn;
00476 tokn = tokn->descendantCPos(pos);
00477 if (tokn)
00478 return tokn;
00479 }
00480 return 0;
00481 }
00482
00484
00485 void print(std::ostream& os, int indent = 0) const;
00486
00488 void printTab(std::ostream& os);
00489
00490 void printTab(std::ostream& os, Corpus const& corpus);
00491
00496 void printConll08(std::ostream& os, Corpus const& corpus,
00497 std::vector<int>* preds = 0);
00498 };
00499
00500 }
00501
00502 #endif // DeSR_Token_H