00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00026
00027 #ifdef _WIN32
00028 #include "lib/strtok_r.h"
00029 #endif
00030
00031 #include "conf/conf_int.h"
00032 #include "Parser.h"
00033 #include "EventStream.h"
00034 #include "State.h"
00035
00036 #include <boost/numeric/ublas/matrix.hpp>
00037 #include <boost/numeric/ublas/matrix_proxy.hpp>
00038
00039 using namespace std;
00040 using namespace boost::numeric::ublas;
00041
00042 namespace Parser {
00043
00044 #define BEAM
00045
00046 #define MAX_LINE_LEN 8196
00047
00048
00049 typedef boost::numeric::ublas::vector<double> Vector;
00050 typedef matrix<double> Matrix;
00051
00052 void softsign(Vector& x)
00053 {
00054 for (int i = 0; i < x.size(); i++)
00055 x[i] /= 1.0 + fabs(x[i]);
00056 }
00057
// Read one text line from the enclosing scope's `ifs` stream into `line`;
// throw a FileError naming `file` if the stream is exhausted or the line
// overflows MAX_LINE_LEN.  NOTE: `ifs` is captured implicitly, not passed.
#define READLINE(line, file) if (!ifs.getline(line, MAX_LINE_LEN)) \
    throw IXE::FileError(string("Wrong file format: ") + file)

// Read sizeof(w) raw binary bytes from `ifs` into weight `w`;
// throw a FileError naming `file` on a short read.
#define READ_WEIGHT(w, file) if (!ifs.read((char*)&w, sizeof(w))) \
    throw IXE::FileError(string("Wrong file format: ") + file)
00063
00064 struct DlModel : public Classifier::Classifier
00065 {
00066 DlModel() { }
00067
00068 DlModel(int numFeatures, int numOutcomes, int numHidden) :
00069 w1(numFeatures, numHidden),
00070 b1(numHidden),
00071 w2(numHidden, numOutcomes),
00072 b2(numOutcomes)
00073 { }
00074
00075 ~DlModel() {
00076 FOR_EACH (std::vector<char const*>, outcomeLabels, it)
00077 free((void*)*it);
00078 }
00079
00090 int crossentropy_softmax_argmax(Vector& x, double sm[]) {
00091
00092 double m = 0.0;
00093 int am = 0;
00094 for (int i = 0; i < numOutcomes; i++)
00095 if (x[i] > m) {
00096 am = i;
00097 m = x[i];
00098 }
00099
00100 double sum_j = 0.0;
00101 for (int i = 0; i < numOutcomes; i++)
00102 sum_j += sm[i] = exp(x[i] - m);
00103
00104
00105 for (int i = 0; i < numOutcomes; i++)
00106 sm[i] /= sum_j;
00107
00108 return am;
00109 }
00110
00111 void estimate(Tanl::Classifier::Context& features, double prob[]) {
00112 int numFeatures = predIndex.size();
00113 Vector xw1(numHidden);
00114 for (int i = 0; i < numHidden; i++)
00115 xw1[i] = 0.0;
00116 for (int f = 0; f < features.size(); f++)
00117 xw1 += row(w1, features[f]);
00118 xw1 += b1;
00119 softsign(xw1);
00120 Vector h(numOutcomes);
00121 h = prod(xw1, w2);
00122 h += b2;
00123 int argmax = crossentropy_softmax_argmax(h, prob);
00124 }
00125
00126 void load(ifstream& ifs, char const* file = "") {
00127
00128 char line[MAX_LINE_LEN];
00129 READLINE(line, file);
00130 int len = atoi(line);
00131 numOutcomes = len;
00132 outcomeLabels.resize(numOutcomes);
00133 int n = 0;
00134 while (len--) {
00135 READLINE(line, file);
00136 outcomeLabels[n] = strdup(line);
00137 outcomeIndex[(char const*)line] = n++;
00138 }
00139
00140 READLINE(line, file);
00141 int numFeatures = len = atoi(line);
00142 predLabels.resize(numFeatures);
00143 n = 0;
00144 while (len--) {
00145 READLINE(line, file);
00146 predLabels[n] = line;
00147 predIndex[(char const*)line] = n++;
00148 }
00149
00150
00151 READLINE(line, file);
00152 numHidden = atoi(line);
00153
00154 w1.resize(numFeatures, numHidden);
00155
00156
00157
00158 READLINE(line, file);
00159 len = atoi(line);
00160 n = 0;
00161 double w;
00162 while (len--) {
00163 READ_WEIGHT(w, file);
00164 w1(n / numHidden, n % numHidden) = w;
00165 n++;
00166 }
00167
00168
00169 b1.resize(numHidden);
00170 READLINE(line, file);
00171 len = atoi(line);
00172 n = 0;
00173 while (len--) {
00174 READ_WEIGHT(w, file);
00175 b1[n++] = w;
00176 }
00177
00178
00179 w2.resize(numHidden, numOutcomes);
00180 READLINE(line, file);
00181 len = atoi(line);
00182 n = 0;
00183 while (len--) {
00184 READ_WEIGHT(w, file);
00185 w2(n / numOutcomes, n % numOutcomes) = w;
00186 n++;
00187 }
00188
00189
00190 b2.resize(numOutcomes);
00191 READLINE(line, file);
00192 len = atoi(line);
00193 n = 0;
00194 while (len--) {
00195 READ_WEIGHT(w, file);
00196 b2[n++] = w;
00197 }
00198 }
00199
00200 Matrix w1, w2;
00201 Vector b1, b2;
00202 int numHidden;
00203 WordIndex outcomeIndex;
00204 };
00205
/// Transition-based parser that scores parser actions with the DlModel MLP.
struct MlParser : public Parser
{
  /// Load a parser from a serialized model file (null = empty parser).
  MlParser(char const* modelFile);

  /// Native training is not implemented; prints instructions for the
  /// external Python pipeline.
  void train(SentenceReader* sentenceReader, char const* modelFile);

  /// Parse one sentence (beam search when BEAM is defined).
  Sentence* parse(Sentence* sentence);

  /// Revision pass; currently a no-op stub.
  void revise(SentenceReader* sentenceReader, char const* actionFile = 0);

  DlModel model;  // the underlying MLP classifier
};
00221
/// Factory for MlParser instances; registered below under the name "MLP".
/// @param modelFile path to a serialized model, or 0 for an empty parser.
Parser* MlParserFactory(char const* modelFile = 0)
{
  return new MlParser(modelFile);
}

// Make the "MLP" parser type available to the parser registry.
REGISTER_PARSER(MLP, MlParserFactory);
00231
/// Construct a parser from a serialized model file: common header,
/// network weights/labels, then global info.  A null modelFile yields an
/// empty (untrained) parser.
/// @throws IXE::FileError if the model file cannot be opened.
MlParser::MlParser(char const* modelFile) :
  // NOTE(review): `model` is a member, constructed AFTER the Parser base;
  // calling model.PredIndex() here touches it before its constructor runs.
  // Verify PredIndex() is safe on a not-yet-constructed model.
  Parser(model.PredIndex())
{
  if (!modelFile)
    return;
  ifstream ifs(modelFile);
  if (!ifs)
    throw IXE::FileError(string("Missing model file: ") + modelFile);

  readHeader(ifs);   // common DeSR model header
  model.load(ifs);   // MLP weights and label tables

  info.load(ifs);    // corpus/sentence info
  ifs.close();
}
00247
00248 void MlParser::train(SentenceReader* sentenceReader, char const* modelFile)
00249 {
00250 EventStream eventStream(sentenceReader, &info);
00251 cerr << "Training the Deep Learning is currently possible only with a Python script.\n"
00252 << "Generate the training examples with 'desr -d'\n"
00253 << "and then hadle them to the script train.py.\n"
00254 << "You then must convert the generated model into a DeSR model\n"
00255 << "using the script script/dumpPyModel.py" << endl;
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270 }
00271
00272 #ifdef BEAM
00273
/// Insert state `s` into `states`, which is kept sorted by descending
/// log-probability and capped at `beam` entries (the global beam width).
/// @return the log-probability of the worst state currently kept.
/// NOTE(review): when `s` is rejected (beam full, s->lprob below the worst
/// kept) it is neither stored nor deleted here — presumably the caller or
/// ParseState's own lifetime management covers it; confirm no leak.
static double addState(ParseState* s, std::vector<ParseState*>& states)
{
  int size = states.size();
  if (size == 0) {
    states.push_back(s);
    return s->lprob;
  }
  double worst = states[size-1]->lprob;
  // beam full and s weaker than everything kept: reject outright
  if (size == beam && s->lprob < worst)
    return worst;
  // scan for the insertion point (best-first order)
  TO_EACH (std::vector<ParseState*>, states, it)
    if (s->lprob > (*it)->lprob) {
      if (size == beam) {
        // make room by dropping the current worst state
        delete states.back();
        states.pop_back();
      }
      states.insert(it, s);
      return states.back()->lprob;
    }
  // s is worse than everything kept, but there is still room
  if (size < beam)
    states.push_back(s);
  return states.back()->lprob;
}
00297 #endif
00298
00299 Sentence* MlParser::parse(Sentence* sentence)
00300 {
00301 int numOutcomes = model.NumOutcomes();
00302 int numFeatures = model.PredIndex().size();
00303 double params[numOutcomes];
00304
00305 # ifdef BEAM
00306 std::vector<ParseState*> currStates; currStates.reserve(beam);
00307 std::vector<ParseState*> nextStates; nextStates.reserve(beam);
00308 std::vector<ParseState*>* bestStates = &currStates;
00309 std::vector<ParseState*>* bestNextStates = &nextStates;
00310 ParseState* state = new ParseState(*sentence, &info, model.PredIndex());
00311 addState(state, *bestStates);
00312
00313 while (true) {
00314 int finished = 0;
00315 int numBest = bestStates->size();
00316
00317 double worstProb = -numeric_limits<double>::infinity();
00318 for (int i = 0; i < numBest; i++) {
00319 state = (*bestStates)[i];
00320 if (state->hasNext()) {
00321 Tanl::Classifier::Context& features = *state->next();
00322
00323 model.estimate(features, params);
00324 for (int o = 0; o < numOutcomes; o++) {
00325 double prob = params[o];
00326 if (prob < 1e-4)
00327 continue;
00328 double lprob = log(prob) + state->lprob;
00329 if (bestNextStates->size() == beam && lprob < worstProb)
00330 continue;
00331 char const* outcome = model.OutcomeName(o);
00332 ParseState* next = state->transition(outcome);
00333 if (!next) {
00334
00335 state->dispose();
00336 continue;
00337 }
00338 next->lprob = lprob;
00339 worstProb = addState(next, *bestNextStates);
00340 }
00341 } else {
00342
00343 worstProb = addState(state, *bestNextStates);
00344 finished++;
00345 }
00346 }
00347 if (finished == numBest)
00348 break;
00349
00350 std::vector<ParseState*>* tmp = bestStates;
00351 bestStates = bestNextStates;
00352 bestNextStates = tmp;
00353 bestNextStates->clear();
00354 }
00355 Sentence* s = (*bestStates)[0]->getSentence();
00356 FOR_EACH (std::vector<ParseState*>, *bestStates, it)
00357 delete *it;
00358 return s;
00359 # else
00360 PyObject* pPredict = PyImport("predict");
00361 if (pPredict == 0)
00362 return 0;
00363 PyObject* predict = PyObject_GetAttrString(pPredict, "predict");
00364 ParseState* state = new ParseState(*sentence, &info, model.predIndex);
00365
00366 while (state->hasNext()) {
00367 Tanl::Classifier::Context& features = *state->next();
00368
00369 PyObject* pFeatures = PyTuple_New(features.size());
00370 for (int i = 0; i < features.size(); i++)
00371 PyTuple_SetItem(pFeatures, i, PyInt_FromLong(features[i]));
00372
00373 PyObject* pBest = PyObject_CallFunctionObjArgs(predict, pFeatures, pNumFeatures, model, 0);
00374 int best = PyInt_AsLong(pBest);
00375 Py_DECREF(pFeatures);
00376 Py_DECREF(pBest);
00377 char const* outcome = model.outcomeLabels[best];
00378 ParseState* nextState = state->transition(outcome);
00379 if (!nextState)
00380 nextState =state->transition("S");
00381 state = nextState;
00382 }
00383 Py_DECREF(pNumFeatures);
00384 Sentence* s = state->getSentence();
00385 delete state;
00386 return s;
00387 # endif
00388 }
00389
/// Revision pass over parsed sentences; not implemented for this parser
/// (intentional no-op stub).
void MlParser::revise(SentenceReader* sentenceReader, char const* actionFile)
{}
00392
00393 }