00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "ReviseEventStream.h"
00026
00027
00028 #include "conf/conf_string.h"
00029 #include "conf/conf_bool.h"
00030
00031 using namespace std;
00032 using namespace Tanl::Classifier;
00033
00034 namespace Parser {
00035
00036 extern IXE::conf<string> outputFormat;
00037 extern IXE::conf<bool> SecondOrder;
00038
00039 ReviseStream::ReviseStream(SentenceReader* reader) :
00040 reader(reader),
00041 sentence(0),
00042 cur(0)
00043 { }
00044
00045 void ReviseStream::clear()
00046 {
00047 cur = 0;
00048 actions.clear();
00049 }
00050
00051 void ReviseStream::childFeatures(TreeToken* tok, char const* tag,
00052 Features& preds)
00053 {
00054 TreeToken* parent = sentence->parent(tok);
00055 if (!parent)
00056 return;
00057 if (tok->linkHead() == parent->id) {
00058 char feature[256];
00059 if (tok->id < parent->id) {
00060 sprintf(feature, "L%s<%s", tag, tok->get("LEMMA")->c_str());
00061 preds.push_back(feature);
00062 sprintf(feature, "P%s<%s", tag, tok->get("POS")->c_str());
00063 preds.push_back(feature);
00064 sprintf(feature, "D%s<%s", tag, tok->get("DEPREL")->c_str());
00065 preds.push_back(feature);
00066 } else {
00067 sprintf(feature, "L%s>%s", tag, tok->get("LEMMA")->c_str());
00068 preds.push_back(feature);
00069 sprintf(feature, "P%s>%s", tag, tok->get("POS")->c_str());
00070 preds.push_back(feature);
00071 sprintf(feature, "D%s>%s", tag, tok->get("DEPREL")->c_str());
00072 preds.push_back(feature);
00073 }
00074 }
00075 }
00076
00077 void ReviseStream::predicates(Features& preds)
00078 {
00079 preds.clear();
00080 TreeToken* tok = (*sentence)[cur];
00081 TreeToken* parent = sentence->parent(tok);
00082 TreeToken* grandParent = sentence->parent(parent);
00083 TreeToken* ggParent = sentence->parent(grandParent);
00084 char feature[256];
00085
00086 sprintf(feature, "L%s", tok->get("LEMMA")->c_str());
00087 preds.push_back(feature);
00088 sprintf(feature, "P%s", tok->get("POS")->c_str());
00089 preds.push_back(feature);
00090 sprintf(feature, "D%s", tok->get("DEPREL")->c_str());
00091 preds.push_back(feature);
00092 childFeatures(tok, "", preds);
00093
00094 if (cur > 0) {
00095 TreeToken* tokn = (*sentence)[cur-1];
00096 sprintf(feature, "-1L%s", tokn->get("LEMMA")->c_str());
00097 preds.push_back(feature);
00098 sprintf(feature, "-1P%s", tokn->get("POS")->c_str());
00099 preds.push_back(feature);
00100 sprintf(feature, "-1D%s", tokn->get("DEPREL")->c_str());
00101 preds.push_back(feature);
00102 }
00103
00104 if (cur < sentence->size() - 1) {
00105 TreeToken* tokn = (*sentence)[cur+1];
00106 sprintf(feature, "+1L%s", tokn->get("LEMMA")->c_str());
00107 preds.push_back(feature);
00108 sprintf(feature, "+1P%s", tokn->get("POS")->c_str());
00109 preds.push_back(feature);
00110 sprintf(feature, "+1D%s", tokn->get("DEPREL")->c_str());
00111 preds.push_back(feature);
00112 }
00113
00114 int lmd = tok->leftmostDescendant();
00115 if (lmd > 1) {
00116 TreeToken* tokn = (*sentence)[lmd - 2];
00117 sprintf(feature, "<L%s", tokn->get("LEMMA")->c_str());
00118 preds.push_back(feature);
00119 sprintf(feature, "<P%s", tokn->get("POS")->c_str());
00120 preds.push_back(feature);
00121 sprintf(feature, "<D%s", tokn->get("DEPREL")->c_str());
00122 preds.push_back(feature);
00123 tokn = sentence->parent(tokn);
00124 if (tokn) {
00125 sprintf(feature, "<-1L%s", tokn->get("LEMMA")->c_str());
00126 preds.push_back(feature);
00127 sprintf(feature, "<-1P%s", tokn->get("POS")->c_str());
00128 preds.push_back(feature);
00129 sprintf(feature, "<-1D%s", tokn->get("DEPREL")->c_str());
00130 preds.push_back(feature);
00131 }
00132 }
00133
00134 unsigned rmd = tok->rightmostDescendant();
00135 if (rmd < sentence->size()) {
00136 TreeToken* tokn = (*sentence)[rmd];
00137 sprintf(feature, ">L%s", tokn->get("LEMMA")->c_str());
00138 preds.push_back(feature);
00139 sprintf(feature, ">P%s", tokn->get("POS")->c_str());
00140 preds.push_back(feature);
00141 sprintf(feature, ">D%s", tokn->get("DEPREL")->c_str());
00142 preds.push_back(feature);
00143 tokn = sentence->parent(tokn);
00144 if (tokn) {
00145 sprintf(feature, ">+1L%s", tokn->get("LEMMA")->c_str());
00146 preds.push_back(feature);
00147 sprintf(feature, ">+1P%s", tokn->get("POS")->c_str());
00148 preds.push_back(feature);
00149 sprintf(feature, ">+1D%s", tokn->get("DEPREL")->c_str());
00150 preds.push_back(feature);
00151 }
00152 }
00153 childFeatures(tok, "", preds);
00154
00155 if (parent) {
00156 childFeatures(parent, "u", preds);
00157 FOR_EACH (Sentence, *sentence, sibl) {
00158 if ((*sibl)->linkHead() == parent->id) {
00159 childFeatures(*sibl, "uudd", preds);
00160 if (grandParent)
00161 childFeatures(grandParent, "uu", preds);
00162 }
00163 }
00164 }
00165
00166 if (grandParent) {
00167 sprintf(feature, "GL%s", tok->get("LEMMA")->c_str());
00168 preds.push_back(feature);
00169 sprintf(feature, "GP%s", tok->get("POS")->c_str());
00170 preds.push_back(feature);
00171 sprintf(feature, "GD%s", tok->get("DEPREL")->c_str());
00172 preds.push_back(feature);
00173 }
00174
00175 if (ggParent) {
00176 sprintf(feature, "GGL%s", tok->get("LEMMA")->c_str());
00177 preds.push_back(feature);
00178 sprintf(feature, "GGP%s", tok->get("POS")->c_str());
00179 preds.push_back(feature);
00180 sprintf(feature, "GGD%s", tok->get("DEPREL")->c_str());
00181 preds.push_back(feature);
00182 }
00183 if (SecondOrder) {
00184
00185 size_t predNo = preds.size();
00186 for (unsigned i = 0; i < predNo; i++) {
00187 for (unsigned j = i+1; j < predNo; j++) {
00188
00189 string combo = (preds[i].compare(preds[j]) < 0) ?
00190 preds[i] + '#' + preds[j] : preds[j] + '#' + preds[i];
00191 preds.push_back(combo.c_str());
00192 }
00193 }
00194 }
00195 }
00196
00197
00198 bool ReviseContextStream::hasNext()
00199 {
00200 if (sentence) {
00201 if (cur < sentence->size())
00202 return true;
00203
00204 revise();
00205
00206 sentence->print(cout, *reader->corpus);
00207 cout << endl;
00208 }
00209 while (reader->MoveNext()) {
00210 delete sentence;
00211 sentence = reader->Current();
00212
00213
00214 FOR_EACH (vector<TreeToken*>, *sentence, it) {
00215 TreeToken* tok = *it;
00216 int head = tok->linkHead();
00217 if (head && tok->id > head)
00218 (*sentence)[head - 1]->right.push_back(tok);
00219 }
00220 for (vector<TreeToken*>::reverse_iterator it = sentence->rbegin();
00221 it != sentence->rend(); ++it) {
00222 TreeToken* tok = *it;
00223 int head = tok->linkHead();
00224 if (head && tok->id < head)
00225 (*sentence)[head - 1]->left.push_back(tok);
00226 }
00227 clear();
00228 return true;
00229 }
00230 return false;
00231 }
00232
00233 Context* ReviseContextStream::next()
00234 {
00235 Features preds;
00236 predicates(preds);
00237
00238 context.clear();
00239 FOR_EACH (Features, preds, it) {
00240 if (predIndex.find(it->c_str()) != predIndex.end())
00241 context.push_back(predIndex[it->c_str()]);
00242 }
00243 ++cur;
00244 return &context;
00245 }
00246
00247 char const* ReviseContextStream::Outcome() { return 0; }
00248
00249 int ReviseContextStream::performAction(TreeToken* tok, int from, char const* action)
00250 {
00251 int revision = (*sentence)[from]->linkHead();
00252 if (!strcmp(action, "OK") || !strcmp(action, "-"))
00253 return revision;
00254 TreeToken* to = tok;
00255 char const* origAction = action;
00256 while (*action) {
00257 switch (action[0]) {
00258 case 'r':
00259 return 0;
00260 case 'u':
00261 to = sentence->parent(to);
00262 ++action;
00263 break;
00264 case 'd': {
00265
00266 if (*++action == 0) {
00267 to = 0;
00268 break;
00269 }
00270 char* end = (char*)action;
00271 int nth = strtol(action, &end, 10);
00272 if (action == end) {
00273
00274 char sign = *action++;
00275 char POS = *action++;
00276 switch (POS) {
00277 case '-':
00278 to = (TreeToken*)sentence->child(to, INT_MIN);
00279 break;
00280 case '+':
00281 to = (TreeToken*)sentence->child(to, INT_MAX);
00282 break;
00283 case '\0':
00284 to = 0;
00285 break;
00286 default:
00287 to = (TreeToken*)sentence->child(to, sign, POS);
00288 }
00289 } else {
00290 action = end;
00291 to = (TreeToken*)sentence->child(to, nth);
00292 }
00293 break;
00294 }
00295 case '-': {
00296
00297 int nth = strtol(action, (char**)&action, 10);
00298 nth = to->id - 1 + nth;
00299 if (nth >= 0)
00300 to = (*sentence)[nth];
00301 else
00302 to = 0;
00303 break;
00304 }
00305 case '+': {
00306
00307 int nth = strtol(action, (char**)&action, 10);
00308 nth += to->id - 1;
00309 if (nth < sentence->size())
00310 to = (*sentence)[nth];
00311 else
00312 to = 0;
00313 break;
00314 }
00315 case '<': {
00316 ++action;
00317
00318 int lmd = to->leftmostDescendant();
00319 if (lmd > 1)
00320 to = (*sentence)[lmd - 2];
00321 else
00322 to = 0;
00323 break;
00324 }
00325 case '>': {
00326 ++action;
00327
00328 int rmd = to->rightmostDescendant();
00329 if (rmd < sentence->size())
00330 to = (*sentence)[rmd];
00331 else
00332 to = 0;
00333 break;
00334 }
00335 case 'v': {
00336 ++action;
00337
00338 to = (TreeToken*)to->ancestorCPos("V", *sentence);
00339 break;
00340 }
00341 case 'V': {
00342 ++action;
00343
00344
00345 to = (TreeToken*)to->descendantCPos("V");
00346 break;
00347 }
00348 case '[': {
00349 ++action;
00350
00351 int tlc = to->leftmostDescendant() - 1;
00352 if (tlc == 0) {
00353 to = 0;
00354 break;
00355 }
00356 to = (*sentence)[tlc-1];
00357
00358 while (to->rightmostDescendant() == tlc) {
00359 TreeToken* p = sentence->parent(to);
00360 if (p)
00361 to = p;
00362 else
00363 break;
00364 }
00365 break;
00366 }
00367 default:
00368 to = 0;
00369 }
00370 if (to == 0) {
00371 cerr << "Bad change: " << origAction << " " << tok->token->form << endl;
00372 return revision;
00373 }
00374 }
00375
00376 return to->id;
00377 }
00378
00379 void ReviseContextStream::revise()
00380 {
00381
00382 vector<int> revisions;
00383 int i = 0;
00384 FOR_EACH (vector<string>, actions, it) {
00385 char const* action = it->c_str();
00386 TreeToken* tok = (*sentence)[i];
00387 int head = performAction(tok, i++, action);
00388 revisions.push_back(head);
00389 }
00390
00391 i = 0;
00392 FOR_EACH (vector<int>, revisions, rit) {
00393 TreeToken* tok = (*sentence)[i++];
00394 tok->linkHead(*rit);
00395 }
00396 }
00397
00398 }