00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef DeSR_State_H
00025 #define DeSR_State_H
00026
00027
00028
00029
00030 #include "text/RegExp.h"
00031
00032
00033 #include "Classifier.h"
00034
00035
00036 #include <vector>
00037 #include <limits>
00038
00039
00040 #include "Parser.h"
00041 #include "Iterator.h"
00042
00043 namespace Parser {
00044
00045 extern IXE::conf<bool> RightToLeft;
00046
00047 typedef char const* Action;
00048
00049 typedef std::vector<std::string> Features;
00050
00051 class Parser;
00052
00056 class SentenceInfo
00057 {
00058 public:
00059
00060 SentenceInfo(Sentence& sentence, GlobalInfo* info);
00061
00062 GlobalInfo* globalInfo;
00063
00065 std::vector<int> punctCount;
00066
00067 };
00068
00072 class State
00073 {
00074 public:
00080 State(Sentence const& sentence, GlobalInfo* globalInfo);
00081
00082 State(State const& state) :
00083 sentenceInfo(state.sentenceInfo),
00084 sentence(state.sentence.language),
00085 rootNode(state.rootNode),
00086 previous(&state),
00087 verbCount(state.verbCount),
00088 action(0),
00089 input(state.input),
00090 stack(state.stack),
00091 extracted(state.extracted)
00092 {
00093
00094 sentence.resize(state.sentence.size());
00095 std::copy(state.sentence.begin(), state.sentence.end(), sentence.begin());
00096 }
00097
00098 virtual ~State() {
00099 delete rootNode;
00100 if (!previous)
00101 delete sentenceInfo;
00102 }
00103
00105 bool hasNext();
00106
00110 State* transition(Action action);
00111
00112 State* Shift();
00113
00114 State* Right(Action action);
00115
00116 State* Left(Action action);
00117
00118 State* right(Action action);
00119
00120 State* left(char const* action);
00121
00122 State* DepLink(Action action);
00123
00124 State* Extract();
00125
00126 State* Insert();
00127
00128 Sentence* getSentence() {
00129 Sentence* sen = new Sentence(sentence);
00130 if (RightToLeft)
00131 sen->reverse();
00132 return sen;
00133 }
00134
00135 void showStatus();
00136
00137 Sentence sentence;
00138
00139 TreeToken* rootNode;
00140
00142 static Tanl::Text::RegExp::Pattern ispunct;
00143
00144 std::string splitFeature;
00145
00146 protected:
00147
00148
00149 virtual TreeToken* copy(TreeToken* x) { return x; }
00150
00151 SentenceInfo* sentenceInfo;
00152
00156 void predicates(Features& preds, Action action = 0);
00157
00161 std::vector<TreeToken*> input;
00165 std::vector<TreeToken*> stack;
00169 std::vector<TreeToken*> extracted;
00170
00174 State const* previous;
00175
00179 Action action;
00180
00182 int verbCount;
00183
00185 static Tanl::Text::RegExp::Pattern nonWordAscii;
00186
00187 private:
00188 void tokenFeatures(Features& preds);
00189 void prepChildEntities(Features& preds);
00190 };
00191
00195 class TrainState : public State, public Iterator<Tanl::Classifier::Event*>
00196 {
00197 public:
00198
00199 TrainState(Sentence const& sent, GlobalInfo* info);
00200
00201 bool hasNext() { return State::hasNext(); }
00202
00207 Tanl::Classifier::Event* next();
00208
00209 protected:
00213 Action nextAction();
00214
00215 Sentence annotated;
00216
00217 std::vector<int> dependents;
00218 };
00219
00224 class ParseState : public State, public Iterator<Tanl::Classifier::Context*>
00225 {
00226 public:
00227
00228 ParseState(Sentence& sent, GlobalInfo* globalInfo, WordIndex& predIndex) :
00229 State(sent, globalInfo),
00230 predIndex(predIndex),
00231 lprob(0),
00232 refCount(0)
00233 { }
00234
00235 ParseState(ParseState& prev) :
00236 State((State)prev),
00237 predIndex(prev.predIndex),
00238 lprob(0),
00239 refCount(0)
00240 {
00241 prev.refCount++;
00242 }
00243
00244 ~ParseState() {
00245 if (refCount == 0) {
00246 for (int i = 0; i < sentence.size(); i++) {
00247 if (previous && sentence[i] == previous->sentence[i])
00248 sentence[i] = 0;
00249 }
00250 if (previous && rootNode == previous->rootNode)
00251 rootNode = 0;
00252 if (previous && --((ParseState*)previous)->refCount == 0)
00253 delete previous;
00254 }
00255 }
00256
00257 void dispose() { if (refCount == 0) delete this; }
00258
00259 bool hasNext();
00260 Tanl::Classifier::Context* next();
00261
00265 ParseState* transition(Action action);
00266
00267 TreeToken* copy(TreeToken* x) {
00268 TreeToken* tok = new TreeToken(*x);
00269 int id = x->id;
00270 if (id)
00271 return sentence[id - 1] = tok;
00272 else
00273 return rootNode = tok;
00274 }
00275
00276 double lprob;
00277
00278 protected:
00279
00280 WordIndex& predIndex;
00281 Tanl::Classifier::Context context;
00282
00283 int refCount;
00284 };
00285
00286 }
00287
00288 #endif // DeSR_State_H