00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "State.h"
00025 #include "Parser.h"
00026 #include "conf_feature.h"
00027
00028
00029
00030
00031 #include "text/WordSet.h"
00032
00033 using namespace std;
00034 using namespace IXE;
00035 using namespace Tanl::Text;
00036 using namespace Tanl::Classifier;
00037
00038 namespace Parser {
00039
00040
00041 conf_feature features("Features");
00042
00043
00044 conf_feature SplitFeature("SplitFeature");
00045
00047 conf<bool> ClosestChildren("ClosestChildren", false);
00048
00050 conf<bool> PrepChildEntityType("PrepChildEntityType", false);
00051
00052 conf<bool> StackSize("StackSize", true);
00053 conf<bool> InputSize("InputSize", false);
00054 conf<bool> InPunct("InPunct", false);
00055 conf<bool> VerbCount("VerbCount", true);
00057 conf<bool> UseChildPunct("UseChildPunct", true);
00058 conf<int> PastActions("PastActions", 1);
00059 conf<bool> WordDistance("WordDistance", true);
00060 conf<bool> PunctCount("PunctCount", true);
00062 conf<bool> MorphoAgreement("MorphoAgreement", false);
00064 conf<bool> MorphoSplit("MorphoSplit", false);
00066 conf<bool> LexChildNonWord("LexChildNonWord", true);
00067
00069 conf<bool> CompositeActions("CompositeActions", false);
00070
00071
00072 conf<bool> SecondOrder("SecondOrder", false);
00073 conf<bool> FeatPairs("FeatPairs", false);
00074
00076 conf<bool> RightToLeft("RightToLeft", false);
00077
00078 conf<bool> ShowActions("ShowActions", false);
00079
00080 WordSet actionTable;
00081
00082
00083 RegExp::Pattern State::ispunct("^\\p{P}+$",
00084 PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
00085 RegExp::Pattern State::nonWordAscii("^[^$0-9_-zA-Z]+$");
00086
00087 char const* mkAction(char const* a, string const& dep)
00088 {
00089 if (CompositeActions) {
00090 char action[128];
00091 sprintf(action, "%s%s", a, dep.c_str());
00092 return *actionTable.insert(action).first;
00093 } else {
00094 return *actionTable.insert(a).first;
00095 }
00096 }
00097
00098 char const* actionString(char const* a)
00099 {
00100 return *actionTable.insert(a).first;
00101 }
00102
00103
00104
00105
00106 SentenceInfo::SentenceInfo(Sentence& sentence, GlobalInfo* info) :
00107 globalInfo(info)
00108 {
00109 if (sentence.empty())
00110 return;
00111 if (RightToLeft)
00112 sentence.reverse();
00113
00114 punctCount.push_back(State::ispunct.test(sentence[0]->token->form));
00115 for (unsigned i = 1; i < sentence.size(); ++i) {
00116 Token* token = sentence[i]->token;
00117 punctCount.push_back(punctCount[i-1] + State::ispunct.test(token->form));
00118 }
00119 }
00120
00121
00122
00123
00124 State::State(Sentence const& sent, GlobalInfo* info) :
00125 sentence(sent),
00126 rootNode(new TreeToken(0, "#NULL")),
00127 sentenceInfo(new SentenceInfo(sentence, info)),
00128 action(0),
00129 previous(0)
00130 {
00131
00132 input.resize(sentence.size());
00133 std::copy(sentence.rbegin(), sentence.rend(), input.begin());
00134
00135 stack.push_back(rootNode);
00136 }
00137
00138 bool State::hasNext()
00139 {
00140 return !input.empty();
00141 }
00142
00146 inline State* State::Shift()
00147 {
00148 TreeToken* next = input.back();
00149 stack.push_back(next);
00150 input.pop_back();
00151 action = "S";
00152 return this;
00153 }
00154
00155 inline State* State::Right(Action action)
00156 {
00157
00158 if (stack.size() == 1)
00159 return 0;
00160 TreeToken* top = copy(stack.back());
00161 stack.pop_back();
00162 TreeToken* next = copy(input.back());
00163 input.back() = next;
00164 next->left.push_back(top);
00165 top->linkHead(next->id, 0);
00166 if (CompositeActions)
00167 top->linkLabel(action+1);
00168 this->action = actionString(action);
00169 return this;
00170 }
00171
00172 inline State* State::Left(Action action)
00173 {
00174
00175 TreeToken* top = copy(stack.back());
00176 TreeToken* next = copy(input.back());
00177 top->right.push_back(next);
00178 if (CompositeActions) {
00179 if (stack.size() > 1) {
00180 stack.pop_back();
00181 input.back() = top;
00182 } else {
00183
00184 input.pop_back();
00185 }
00186 } else {
00187 if (stack.size()) {
00188 stack.pop_back();
00189 input.back() = top;
00190 }
00191 }
00192 next->linkHead(top->id, 0);
00193 if (CompositeActions)
00194 next->linkLabel(action+1);
00195 this->action = actionString(action);
00196 return this;
00197 }
00198
00199 inline State* State::right(Action action)
00200 {
00201 int n = action[1] - '0';
00202
00203 if (stack.size() <= n) {
00204
00205
00206 return 0;
00207 }
00208 TreeToken* nthTop = copy(stack[stack.size() - n]);
00209 stack.erase(stack.end() - n);
00210 TreeToken* next = copy(input.back());
00211 next->left.push_back(nthTop);
00212 nthTop->linkHead(next->id, 0);
00213 if (CompositeActions) {
00214 nthTop->linkLabel(action+2);
00215
00216 input.push_back(stack.back());
00217 stack.pop_back();
00218 }
00219 this->action = actionString(action);
00220 return this;
00221 }
00222
00223 inline State* State::left(char const* action)
00224 {
00225 int n = action[1] - '0';
00226
00227
00228 if (stack.size() <= n) {
00229
00230 return 0;
00231 }
00232 TreeToken* nthTop = copy(stack[stack.size() - n]);
00233 TreeToken* next = copy(input.back());
00234 nthTop->right.push_back(next);
00235 next->linkHead(nthTop->id, 0);
00236 if (CompositeActions)
00237 next->linkLabel(action+2);
00238
00239 input.back() = stack.back();
00240 stack.pop_back();
00241 if (CompositeActions)
00242 if (n == stack.size())
00243 n--;
00244
00245 for (int i = 0; i < n-2; i++) {
00246 input.push_back(stack.back());
00247 stack.pop_back();
00248 }
00249 input.push_back(nthTop);
00250 stack.pop_back();
00251 this->action = actionString(action);
00252 return this;
00253 }
00254
00255 inline State* State::DepLink(Action action)
00256 {
00257 TreeToken* next = input.back();
00258 switch (action[0]) {
00259 case 'R':
00260 case 'r':
00261
00262 next->left.back()->linkLabel(action+1);
00263
00264
00265 if (this->action[0] == 'r') {
00266 input.push_back(stack.back());
00267 stack.pop_back();
00268 }
00269 this->action = actionString(action);
00270 return this;
00271
00272 case 'L':
00273 case 'l':
00274
00275 next->right.back()->linkLabel(action+1);
00276
00277 if (stack.empty()) {
00278 input.pop_back();
00279 stack.push_back(next);
00280 }
00281 this->action = actionString(action);
00282 return this;
00283 }
00284 return this;
00285 }
00286
00287 State* State::Extract()
00288 {
00289
00290 if (stack.size() < 3 ||
00291 input.size() < 1) {
00292 cerr << "Improper action " << action << endl;
00293 return 0;
00294 }
00295 TreeToken* nthStack = stack[stack.size() - 2];
00296 extracted.push_back(nthStack);
00297 stack.erase(stack.end() - 2);
00298
00299 TreeToken* next = input.back();
00300 stack.push_back(next);
00301 input.pop_back();
00302 action = "E";
00303 return this;
00304 }
00305
00306 State* State::Insert()
00307 {
00308
00309 if (extracted.empty()) {
00310 cerr << "Improper action " << action << endl;
00311 return 0;
00312 }
00313 input.push_back(extracted.back());
00314 extracted.pop_back();
00315 action = "I";
00316 return this;
00317 }
00318
00336 State* State::transition(Action action)
00337 {
00338 # ifdef DEBUG_1
00339 showStatus();
00340 cerr << "Action: " << action << endl;
00341 # endif
00342 switch (action[0]) {
00343 case 'S':
00344 if (input.empty())
00345 return this;
00346 return Shift();
00347 case 'R':
00348 if (stack.size() == 1) {
00349
00350
00351 return Shift();
00352 }
00353 return Right(action);
00354 case 'L':
00355 return Left(action);
00356 case 'r':
00357 return right(action);
00358 case 'l':
00359 return left(action);
00360 case 'D':
00361 return DepLink(action);
00362 case 'E':
00363 return Extract();
00364 case 'I':
00365 return Insert();
00366 }
00367 return 0;
00368 }
00369
00370
00371 void State::predicates(Features& preds, Action action)
00372 {
00373 preds.clear();
00374
00375 if (stack.empty()) {
00376 preds.push_back("(");
00377 if (CompositeActions)
00378 return;
00379 }
00380
00381 if (input.empty()) {
00382 preds.push_back(")");
00383 return;
00384 }
00385
00386
00387 tokenFeatures(preds);
00388
00389
00390 char feature[256];
00391 if (extracted.size()) {
00392 Token* tok = extracted.back()->token;
00393 string const* lemma = tok->getLemma();
00394 if (lemma && !lemma->empty()) {
00395 sprintf(feature, "EL%s", lemma->c_str());
00396 preds.push_back(feature);
00397 } else {
00398 sprintf(feature, "EW%s", tok->form.c_str());
00399 preds.push_back(feature);
00400 }
00401 string const* pos = tok->getPos();
00402 if (pos && !pos->empty()) {
00403 sprintf(feature, "EP%s", pos->c_str());
00404 preds.push_back(feature);
00405 }
00406 }
00407
00408
00409 if (MorphoAgreement && stack.size() > 1) {
00410 Token* top = stack.back()->token;
00411 Token* next = input.back()->token;
00412 if (top->morpho.number && top->morpho.number == next->morpho.number)
00413 preds.push_back("=N");
00414 if (top->morpho.gender && top->morpho.gender == next->morpho.gender)
00415 preds.push_back("=G");
00416 }
00417
00418
00419 if (StackSize && stack.size() > 2)
00420 preds.push_back("((");
00421 if (InputSize && input.size() > 1)
00422 preds.push_back("))");
00423 if (VerbCount) {
00424 Language const* lang = sentence.language;
00425 int vc = 0;
00426 FOR_EACH (vector<TreeToken*>, stack, it) {
00427 if ((*it)->token->isVerb(lang))
00428 vc++;
00429 }
00430 if (vc) {
00431 sprintf(feature, "VC%d", vc);
00432 preds.push_back(feature);
00433 }
00434 }
00435
00436
00437 int id = input.back()->id;
00438 if (id > 1) {
00439
00440 if (InPunct && sentenceInfo->punctCount[id-2]%2)
00441 preds.push_back(".");
00442
00443 if (PunctCount && sentenceInfo->punctCount[id-2]) {
00444 sprintf(feature, ".%d", sentenceInfo->punctCount[id-2]);
00445 preds.push_back(feature);
00446 }
00447 }
00448 if (UseChildPunct) {
00449
00450
00451
00452 if (stack.size() > 1) {
00453 TreeToken* top = stack.back();
00454 FOR_EACH (vector<TreeToken*>, top->left, it) {
00455 if (ispunct.test((*it)->token->form)) {
00456 sprintf(feature, "1.<%s", (*it)->token->form.c_str());
00457 preds.push_back(feature);
00458 break;
00459 }
00460 }
00461 for (vector<TreeToken*>::reverse_iterator it = top->right.rbegin();
00462 it != top->right.rend(); it++) {
00463 if (ispunct.test((*it)->token->form)) {
00464 sprintf(feature, "1.>%s", (*it)->token->form.c_str());
00465 preds.push_back(feature);
00466 break;
00467 }
00468 }
00469 }
00470 if (input.size()) {
00471 TreeToken* next = input.back();
00472
00473 FOR_EACH (vector<TreeToken*>, next->left, it) {
00474 if (ispunct.test((*it)->token->form)) {
00475 sprintf(feature, ".<0%s", (*it)->token->form.c_str());
00476 preds.push_back(feature);
00477 break;
00478 }
00479 }
00480 for (vector<TreeToken*>::reverse_iterator it = next->right.rbegin();
00481 it != next->right.rend(); it++) {
00482 if (ispunct.test((*it)->token->form)) {
00483 sprintf(feature, ".>0%s", (*it)->token->form.c_str());
00484 preds.push_back(feature);
00485 break;
00486 }
00487 }
00488 }
00489 }
00490
00491 State const* s = this;
00492 for (int i = 0; i < PastActions && s; i++, s = s->previous) {
00493 if (s->action) {
00494 sprintf(feature, "A%d%s", i, s->action);
00495 preds.push_back(feature);
00496 }
00497 }
00498
00499 if (WordDistance && stack.size()) {
00500 int d = abs((int)input.back()->id - (int)stack.back()->id) - 1;
00501 sprintf(feature, "%d", min(d, 4));
00502 preds.push_back(feature);
00503 }
00504
00505
00506
00507 if (PrepChildEntityType)
00508 prepChildEntities(preds);
00509
00510 if (SecondOrder) {
00511
00512 size_t predNo = preds.size();
00513 for (unsigned i = 0; i < predNo; i++) {
00514 for (unsigned j = i+1; j < predNo; j++) {
00515
00516 string combo = (preds[i].compare(preds[j]) < 0) ?
00517 preds[i] + '#' + preds[j] : preds[j] + '#' + preds[i];
00518 preds.push_back(combo.c_str());
00519 }
00520 }
00521 }
00522
00523 if (!CompositeActions) {
00524
00525 switch (action[0]) {
00526 case 'R':
00527 case 'r': {
00528 TreeToken* next = input.back();
00529 string const* npos = next->token->getPos();
00530 string const* nlpos = next->left.back()->token->getPos();
00531 if (npos && nlpos) {
00532 sprintf(feature, "d%s%s", nlpos->c_str(), npos->c_str());
00533 preds.push_back(feature);
00534 }
00535 break;
00536 }
00537 case 'L':
00538 case 'l': {
00539 TreeToken* next = input.back();
00540 string const* npos = next->token->getPos();
00541 string const* nrpos = next->right.back()->token->getPos();
00542 if (npos && nrpos) {
00543 sprintf(feature, "D%s%s", nrpos->c_str(), npos->c_str());
00544 preds.push_back(feature);
00545 }
00546 break;
00547 }
00548 }
00549 }
00550 }
00551
00552
00553
00554 void State::tokenFeatures(Features& preds)
00555 {
00556 char feature[256];
00557 Token* next = input.back()->token;
00558 set<TreeToken*> lexChildNonWordTokens;
00559
00560 FOR_EACH (FeatureSpecs, features.value, fit) {
00561 char const* attrName = fit->first;
00562 int attrIndex = next->attrIndex(attrName);
00563 char featId = 'A' + attrIndex;
00564 FOR_EACH (set<TokenPath*>, fit->second, tit) {
00565
00566 TokenPath const& tp = **tit;
00567 TreeToken* tok;
00568 if (tp.root < 0) {
00569 if (-tp.root > (int)stack.size() - 1)
00570 continue;
00571 tok = stack[stack.size() + tp.root];
00572 } else {
00573 if (tp.root >= (int)input.size())
00574 continue;
00575 tok = input[input.size() - 1 - tp.root];
00576 }
00577 tok = tok->follow(tp, sentence);
00578 if (tok) {
00579 string const* item = tok->get(attrName);
00580 if (item && !item->empty()) {
00581
00582 if (tp.root < 0)
00583 sprintf(feature, "%d%c%s%s", -tp.root, featId, tp.Code(), item->c_str());
00584 else
00585 sprintf(feature, "%c%d%s%s", featId, tp.root, tp.Code(), item->c_str());
00586 preds.push_back(feature);
00587
00588 if (LexChildNonWord &&
00589 lexChildNonWordTokens.find(tok) == lexChildNonWordTokens.end()) {
00590
00591 lexChildNonWordTokens.insert(tok);
00592 FOR_EACH (vector<TreeToken*>, tok->left, it) {
00593 if (nonWordAscii.test((*it)->token->form)) {
00594
00595 sprintf(feature, ".%d/", tp.root);
00596 preds.push_back(feature);
00597 break;
00598 }
00599 }
00600 for (vector<TreeToken*>::reverse_iterator it = tok->right.rbegin();
00601 it != tok->right.rend(); it++) {
00602 if (nonWordAscii.test((*it)->token->form)) {
00603
00604 sprintf(feature, ".%d\\", tp.root);
00605 preds.push_back(feature);
00606 break;
00607 }
00608 }
00609 }
00610 }
00611 }
00612 }
00613 }
00614
00615
00616 FeatureSpecs splits = SplitFeature.value;
00617 FeatureSpecs::const_iterator split = splits.begin();
00618 if (split != splits.end()) {
00619 char const* attrName = split->first;
00620 FOR_EACH (set<TokenPath*>, split->second, tit) {
00621
00622 TokenPath const& tp = **tit;
00623 TreeToken* tok;
00624 if (tp.root < 0) {
00625 if (-tp.root > (int)stack.size() - 1)
00626 continue;
00627 tok = stack[stack.size() + tp.root];
00628 } else {
00629 if (tp.root >= (int)input.size())
00630 continue;
00631 tok = input[input.size() - 1 - tp.root];
00632 }
00633 tok = tok->follow(tp, sentence);
00634 if (tok) {
00635 string const* feat = tok->get(attrName);
00636 if (feat)
00637 splitFeature = *feat;
00638 else
00639 cerr << "Missing split feature" << endl;
00640 }
00641 }
00642 }
00643 }
00644
00646 void State::prepChildEntities(Features& preds)
00647 {
00648
00649 Language const* lang = sentence.language;
00650 GlobalInfo* info = sentenceInfo->globalInfo;
00651 if (stack.size() > 1) {
00652 TreeToken* top = stack.back();
00653 if (top->token->isPreposition(lang)) {
00654
00655
00656 FOR_EACH (vector<TreeToken*>, top->right, it) {
00657 if ((*it)->token->isNoun(lang)) {
00658 string const* noun = (*it)->token->getLemma();
00659 if (noun && !noun->empty()) {
00660 int tc = info->timeLemmas.count(*noun);
00661 int lc = info->locLemmas.count(*noun);
00662
00663 if (tc >= info->freqRatio * lc)
00664 preds.push_back("1TIME");
00665 if (lc >= info->freqRatio * tc)
00666 preds.push_back("1LOC");
00667 if (tc || lc)
00668 break;
00669 }
00670 }
00671 }
00672 }
00673 }
00674
00675 TreeToken* next = input.back();
00676 if (next->token->isPreposition(lang)) {
00677
00678
00679 FOR_EACH (vector<TreeToken*>, next->right, it) {
00680 if ((*it)->token->isNoun(lang)) {
00681 string const* noun = (*it)->token->getLemma();
00682 if (noun && !noun->empty()) {
00683 int tc = info->timeLemmas.count(*noun);
00684 int lc = info->locLemmas.count(*noun);
00685
00686 if (tc >= info->freqRatio * lc)
00687 preds.push_back("TIME0");
00688 if (lc >= info->freqRatio * tc)
00689 preds.push_back("LOC0");
00690 if (tc || lc)
00691 break;
00692 }
00693 }
00694 }
00695 }
00696 }
00697
00698 void State::showStatus() {
00699 cerr << "Stack:" << endl;
00700 FOR_EACH (vector<TreeToken*>, stack, it)
00701 (*it)->print(cerr);
00702 cerr << "Next:" << endl;
00703 if (input.size())
00704 input.back()->print(cerr);
00705 }
00706
00707
00708
00709
00710 TrainState::TrainState(Sentence const& sent, GlobalInfo* info) :
00711 State(sent, info),
00712 annotated(sentence)
00713 {
00714
00715 dependents.resize(sentence.size());
00716 FOR_EACH (Sentence, sentence, sit) {
00717 int head = (*sit)->linkHead();
00718 if (head)
00719 dependents[head-1]++;
00720 }
00721
00722
00723
00724 FOR_EACH (vector<TreeToken*>, input, sit) {
00725 (*sit)->linkHead(0);
00726 (*sit)->linkLabel("");
00727 }
00728
00729 if (PrepChildEntityType)
00730 info->extract(sentence);
00731 }
00732
00733 #define ORIG(tok) annotated[(tok)->id-1]
00734
00739 Action TrainState::nextAction()
00740 {
00741 if (input.size() == 0)
00742 return 0;
00743 TreeToken* next = input.back();
00744 int nextHead = annotated[next->id-1]->linkHead();
00745 string const& nextLabel = annotated[next->id-1]->linkLabel();
00746 if (stack.empty())
00747 return "S";
00748 TreeToken* top = stack.back();
00749 if (top->id && ORIG(top)->linkHead() == next->id) {
00750
00751 if (input.size() > 1 &&
00752 ORIG(input[input.size()-2])->linkHead() == top->id) {
00753
00754
00755 return "S";
00756 } else if (top->id &&
00757 dependents[top->id-1]) {
00758
00759 return "S";
00760 } else {
00761
00762 dependents[next->id - 1]--;
00763 return mkAction("R", ORIG(top)->linkLabel());
00764 }
00765 } else if (nextHead == top->id) {
00766
00767 if (stack.size() > 2 &&
00768 ORIG(stack[stack.size()-2])->linkHead() == next->id) {
00769
00770 TreeToken* dep = stack[stack.size()-2];
00771 if (dependents[dep->id-1]) {
00772 return "E";
00773 } else {
00774
00775 dependents[next->id - 1]--;
00776 return mkAction("r2", ORIG(dep)->linkLabel());
00777 }
00778 } else if (stack.size() > 3 &&
00779 ORIG(stack[stack.size()-3])->linkHead() == next->id &&
00780 dependents[stack[stack.size()-3]->id - 1] == 0) {
00781
00782
00783 dependents[next->id - 1]--;
00784 TreeToken* dep = stack[stack.size()-3];
00785 return mkAction("r3", ORIG(dep)->linkLabel());
00786 } else if (dependents[next->id - 1]) {
00787
00788 return "S";
00789 } else {
00790
00791
00792 if (stack.size() > 1) {
00793 dependents[top->id - 1]--;
00794 }
00795 return mkAction("L", nextLabel);
00796 }
00797 } else if (stack.size() > 1 &&
00798 stack[stack.size()-2]->id == nextHead &&
00799 dependents[next->id - 1] == 0) {
00800
00801
00802
00803 TreeToken* nthTop = stack[stack.size()-2];
00804
00805 if (stack.size() == 2) {
00806
00807
00808
00809
00810
00811 } else {
00812 dependents[nthTop->id - 1]--;
00813
00814 }
00815 return mkAction("l2", nextLabel);
00816 } else if (stack.size() > 2 &&
00817 stack[stack.size()-3]->id == nextHead &&
00818 dependents[next->id - 1] == 0) {
00819
00820
00821 if (stack.size() > 3)
00822 dependents[stack[stack.size()-3]->id - 1]--;
00823 return mkAction("l3", nextLabel);
00824 } else if (stack.size() > 3 &&
00825 stack[stack.size()-4]->id == nextHead &&
00826 dependents[next->id - 1] == 0) {
00827
00828
00829 if (stack.size() > 4)
00830 dependents[stack[stack.size()-4]->id - 1]--;
00831 return mkAction("l4", nextLabel);
00832 } else if (stack.size() > 2 &&
00833 ORIG(stack[stack.size()-2])->linkHead() == next->id &&
00834 dependents[stack[stack.size()-2]->id - 1] == 0) {
00835
00836
00837 dependents[next->id - 1]--;
00838 TreeToken* dep = stack[stack.size()-2];
00839 return mkAction("r2", ORIG(dep)->linkLabel());
00840 } else if (stack.size() > 3 &&
00841 ORIG(stack[stack.size()-3])->linkHead() == next->id &&
00842 dependents[stack[stack.size()-3]->id - 1] == 0) {
00843
00844
00845 dependents[next->id - 1]--;
00846 TreeToken* dep = stack[stack.size()-3];
00847 return mkAction("r3", ORIG(dep)->linkLabel());
00848 } else if (extracted.size() &&
00849 nextHead == extracted.back()->id) {
00850
00851
00852 return "I";
00853 } else
00854 return "S";
00855 }
00856
00857 Event* TrainState::next()
00858 {
00859 Action action = nextAction();
00860 Event* ev = new Event(action);
00861 predicates(ev->features, action);
00862 return ev;
00863 }
00864
00865
00866
00867
00868 bool ParseState::hasNext()
00869 {
00870 bool res = State::hasNext();
00871 if (!res) {
00872
00873 if (stack.size() > 2) {
00874
00875 # ifdef DEBUG
00876 cerr << "Multiple roots: " << stack.size() << endl;
00877 # endif
00878 Language const* lang = sentence.language;
00879
00880 int root = 0;
00881 int rootSize = 0;
00882 FOR_EACH (vector<TreeToken*>, stack, sit) {
00883 TreeToken* node = *sit;
00884 if (node->linkHead() == 0) {
00885 int size = node->size();
00886
00887 string const* tokPos = node->token->getPos();
00888 if (size > rootSize && tokPos && lang->rootPos(*tokPos)) {
00889 root = node->id;
00890 rootSize = size;
00891 }
00892 }
00893 }
00894 TO_EACH (vector<TreeToken*>, stack, sit) {
00895 TreeToken* node = *sit;
00896 if (node->linkHead() == 0 && node->id != root) {
00897 node->linkHead(root);
00898 if (node->linkLabel().empty())
00899 node->linkLabel(Language::DEFAULT_ROOT_LABEL);
00900 }
00901 }
00902 }
00903 }
00904 return res;
00905 }
00906
00907 Context* ParseState::next()
00908 {
00909 Features preds;
00910 predicates(preds);
00911
00912 context.clear();
00913 FOR_EACH (Features, preds, it) {
00914 string const& pred = *it;
00915 if (predIndex.find(pred.c_str()) != predIndex.end())
00916 context.add(predIndex[pred.c_str()]);
00917 }
00918 return &context;
00919 }
00920
00921 ParseState* ParseState::transition(Action action)
00922 {
00923
00924 if (extracted.size() && input.size() &&
00925 (action[0] == 'S' || action[0] == 'L') &&
00926 ispunct.test(input.back()->token->form))
00927 action = "I";
00928
00929 return (ParseState*)(new ParseState(*this))->State::transition(action);
00930 }
00931
00932 }