00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00026
00027 #ifdef _WIN32
00028 #include "lib/strtok_r.h"
00029 #endif
00030
00031 #include "conf/conf_int.h"
00032 #include "Parser.h"
00033 #include "EventStream.h"
00034 #include "State.h"
00035
00036 #include <boost/numeric/ublas/matrix.hpp>
00037 #include <boost/numeric/ublas/matrix_proxy.hpp>
00038
00039 using namespace std;
00040 using namespace boost::numeric::ublas;
00041
00042 namespace Parser {
00043
00044 #define BEAM
00045
00046 #define MAX_LINE_LEN 8196
00047
00048
00049 typedef boost::numeric::ublas::vector<double> Vector;
00050 typedef matrix<double> Matrix;
00051
00052 void softsign(Vector& x)
00053 {
00054 for (int i = 0; i < x.size(); i++)
00055 x[i] /= 1.0 + fabs(x[i]);
00056 }
00057
// Read one text line from the enclosing scope's `ifs` stream into `line`;
// throw a FileError naming `file` if the stream is exhausted or the line
// overflows MAX_LINE_LEN.  NOTE: `ifs` is captured implicitly, not passed.
#define READLINE(line, file) if (!ifs.getline(line, MAX_LINE_LEN)) \
    throw IXE::FileError(string("Wrong file format: ") + file)

// Read sizeof(w) raw binary bytes from `ifs` into weight `w`;
// throw a FileError naming `file` on a short read.
#define READ_WEIGHT(w, file) if (!ifs.read((char*)&w, sizeof(w))) \
    throw IXE::FileError(string("Wrong file format: ") + file)
00063
00064 struct DlModel : public Classifier::Classifier
00065 {
00066 DlModel() { }
00067
00068 DlModel(int numFeatures, int numOutcomes, int numHidden) :
00069 w1(numFeatures, numHidden),
00070 b1(numHidden),
00071 w2(numHidden, numOutcomes),
00072 b2(numOutcomes)
00073 { }
00074
00075 ~DlModel() {
00076 FOR_EACH (std::vector<char const*>, outcomeLabels, it)
00077 free((void*)*it);
00078 }
00079
00090 int crossentropy_softmax_argmax(Vector& x, double sm[]) {
00091
00092 double m = 0.0;
00093 int am = 0;
00094 for (int i = 0; i < numOutcomes; i++)
00095 if (x[i] > m) {
00096 am = i;
00097 m = x[i];
00098 }
00099
00100 double sum_j = 0.0;
00101 for (int i = 0; i < numOutcomes; i++)
00102 sum_j += sm[i] = exp(x[i] - m);
00103
00104
00105 for (int i = 0; i < numOutcomes; i++)
00106 sm[i] /= sum_j;
00107
00108 return am;
00109 }
00110
00111 void estimate(Tanl::Classifier::Context& features, double prob[]) {
00112 int numFeatures = predIndex.size();
00113 Vector xw1(numHidden);
00114 for (int i = 0; i < numHidden; i++)
00115 xw1[i] = 0.0;
00116 for (int f = 0; f < features.size(); f++)
00117 xw1 += row(w1, features[f]);
00118 xw1 += b1;
00119 softsign(xw1);
00120 Vector h(numOutcomes);
00121 h = prod(xw1, w2);
00122 h += b2;
00123 int argmax = crossentropy_softmax_argmax(h, prob);
00124 }
00125
00126 void load(ifstream& ifs, char const* file = "") {
00127
00128 char line[MAX_LINE_LEN];
00129 READLINE(line, file);
00130 int len = atoi(line);
00131 numOutcomes = len;
00132 outcomeLabels.resize(numOutcomes);
00133 int n = 0;
00134 while (len--) {
00135 READLINE(line, file);
00136 outcomeLabels[n] = strdup(line);
00137 outcomeIndex[(char const*)line] = n++;
00138 }
00139
00140 READLINE(line, file);
00141 int numFeatures = len = atoi(line);
00142 predLabels.resize(numFeatures);
00143 n = 0;
00144 while (len--) {
00145 READLINE(line, file);
00146 predLabels[n] = line;
00147 predIndex[(char const*)line] = n++;
00148 }
00149
00150
00151 READLINE(line, file);
00152 numHidden = atoi(line);
00153
00154 w1.resize(numFeatures, numHidden);
00155
00156
00157
00158 READLINE(line, file);
00159 len = atoi(line);
00160 n = 0;
00161 double w;
00162 while (len--) {
00163 READ_WEIGHT(w, file);
00164 w1(n / numHidden, n % numHidden) = w;
00165 n++;
00166 }
00167
00168
00169 b1.resize(numHidden);
00170 READLINE(line, file);
00171 len = atoi(line);
00172 n = 0;
00173 while (len--) {
00174 READ_WEIGHT(w, file);
00175 b1[n++] = w;
00176 }
00177
00178
00179 w2.resize(numHidden, numOutcomes);
00180 READLINE(line, file);
00181 len = atoi(line);
00182 n = 0;
00183 while (len--) {
00184 READ_WEIGHT(w, file);
00185 w2(n / numOutcomes, n % numOutcomes) = w;
00186 n++;
00187 }
00188
00189
00190 b2.resize(numOutcomes);
00191 READLINE(line, file);
00192 len = atoi(line);
00193 n = 0;
00194 while (len--) {
00195 READ_WEIGHT(w, file);
00196 b2[n++] = w;
00197 }
00198 }
00199
00200 Matrix w1, w2;
00201 Vector b1, b2;
00202 int numHidden;
00203 WordIndex outcomeIndex;
00204 };
00205
/// Transition-based parser that scores parser actions with the DlModel MLP.
struct MlParser : public Parser
{
  /// Load a parser from a serialized model file (null = empty parser).
  MlParser(char const* modelFile);

  /// Native training is not implemented; prints instructions for the
  /// external Python pipeline.
  void train(SentenceReader* sentenceReader, char const* modelFile);

  /// Parse one sentence (beam search when BEAM is defined).
  Sentence* parse(Sentence* sentence);

  /// Revision pass; currently a no-op stub.
  void revise(SentenceReader* sentenceReader, char const* actionFile = 0);

  DlModel model;  // the underlying MLP classifier
};
00221
/// Factory for MlParser instances; registered below under the name "MLP".
/// @param modelFile path to a serialized model, or 0 for an empty parser.
Parser* MlParserFactory(char const* modelFile = 0)
{
  return new MlParser(modelFile);
}

// Make the "MLP" parser type available to the parser registry.
REGISTER_PARSER(MLP, MlParserFactory);
00231
/// Construct a parser from a serialized model file: common header,
/// network weights/labels, then global info.  A null modelFile yields an
/// empty (untrained) parser.
/// @throws IXE::FileError if the model file cannot be opened.
MlParser::MlParser(char const* modelFile) :
  // NOTE(review): `model` is a member, constructed AFTER the Parser base;
  // calling model.PredIndex() here touches it before its constructor runs.
  // Verify PredIndex() is safe on a not-yet-constructed model.
  Parser(model.PredIndex())
{
  if (!modelFile)
    return;
  ifstream ifs(modelFile);
  if (!ifs)
    throw IXE::FileError(string("Missing model file: ") + modelFile);

  readHeader(ifs);   // common DeSR model header
  model.load(ifs);   // MLP weights and label tables

  info.load(ifs);    // corpus/sentence info
  ifs.close();
}
00247
00248 void MlParser::train(SentenceReader* sentenceReader, char const* modelFile)
00249 {
00250 EventStream eventStream(sentenceReader, &info);
00251 cerr << "Training the Deep Learning is currently possible only with a Python script.\n"
00252 << "Generate the training examples with 'desr -d'\n"
00253 << "and then hadle them to the script train.py.\n"
00254 << "You then must convert the generated model into a DeSR model\n"
00255 << "using the script script/dumpPyModel.py" << endl;
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270 }
00271
00272 #ifdef BEAM
00273
/// Insert state `s` into `states`, which is kept sorted by descending
/// log-probability and capped at `beam` entries (the global beam width).
/// @return the log-probability of the worst state currently kept.
/// NOTE(review): when `s` is rejected (beam full, s->lprob below the worst
/// kept) it is neither stored nor deleted here — presumably the caller or
/// ParseState's own lifetime management covers it; confirm no leak.
static double addState(ParseState* s, std::vector<ParseState*>& states)
{
  int size = states.size();
  if (size == 0) {
    states.push_back(s);
    return s->lprob;
  }
  double worst = states[size-1]->lprob;
  // beam full and s weaker than everything kept: reject outright
  if (size == beam && s->lprob < worst)
    return worst;
  // scan for the insertion point (best-first order)
  TO_EACH (std::vector<ParseState*>, states, it)
    if (s->lprob > (*it)->lprob) {
      if (size == beam) {
        // make room by dropping the current worst state
        delete states.back();
        states.pop_back();
      }
      states.insert(it, s);
      return states.back()->lprob;
    }
  // s is worse than everything kept, but there is still room
  if (size < beam)
    states.push_back(s);
  return states.back()->lprob;
}
00297 #endif
00298
00299 Sentence* MlParser::parse(Sentence* sentence)
00300 {
00301 int numOutcomes = model.NumOutcomes();
00302 int numFeatures = model.PredIndex().size();
00303 double params[numOutcomes];
00304
00305 # ifdef BEAM
00306 std::vector<ParseState*> currStates; currStates.reserve(beam);
00307 std::vector<ParseState*> nextStates; nextStates.reserve(beam);
00308 std::vector<ParseState*>* bestStates = &currStates;
00309 std::vector<ParseState*>* bestNextStates = &nextStates;
00310 ParseState* state = new ParseState(*sentence, &info, model.PredIndex());
00311 addState(state, *bestStates);
00312
00313 while (true) {
00314 int finished = 0;
00315 int numBest = bestStates->size();
00316
00317 double worstProb = -numeric_limits<double>::infinity();
00318 for (int i = 0; i < numBest; i++) {
00319 state = (*bestStates)[i];
00320 if (state->hasNext()) {
00321 Tanl::Classifier::Context& features = *state->next();
00322
00323 model.estimate(features, params);
00324 for (int o = 0; o < numOutcomes; o++) {
00325 double prob = params[o];
00326 if (prob < 1e-4)
00327 continue;
00328 double lprob = log(prob) + state->lprob;
00329 if (bestNextStates->size() == beam && lprob < worstProb)
00330 continue;
00331 char const* outcome = model.OutcomeName(o);
00332 ParseState* next = state->transition(outcome);
00333 if (!next) {
00334
00335 state->dispose();
00336 continue;
00337 }
00338 next->lprob = lprob;
00339 worstProb = addState(next, *bestNextStates);
00340 }
00341 } else {
00342
00343 worstProb = addState(state, *bestNextStates);
00344 finished++;
00345 }
00346 }
00347 if (finished == numBest)
00348 break;
00349
00350 std::vector<ParseState*>* tmp = bestStates;
00351 bestStates = bestNextStates;
00352 bestNextStates = tmp;
00353 bestNextStates->clear();
00354 }
00355 Sentence* s = (*bestStates)[0]->getSentence();
00356 FOR_EACH (std::vector<ParseState*>, *bestStates, it)
00357 delete *it;
00358 return s;
00359 # else
00360 PyObject* pPredict = PyImport("predict");
00361 if (pPredict == 0)
00362 return 0;
00363 PyObject* predict = PyObject_GetAttrString(pPredict, "predict");
00364 ParseState* state = new ParseState(*sentence, &info, model.predIndex);
00365
00366 while (state->hasNext()) {
00367 Tanl::Classifier::Context& features = *state->next();
00368
00369 PyObject* pFeatures = PyTuple_New(features.size());
00370 for (int i = 0; i < features.size(); i++)
00371 PyTuple_SetItem(pFeatures, i, PyInt_FromLong(features[i]));
00372
00373 PyObject* pBest = PyObject_CallFunctionObjArgs(predict, pFeatures, pNumFeatures, model, 0);
00374 int best = PyInt_AsLong(pBest);
00375 Py_DECREF(pFeatures);
00376 Py_DECREF(pBest);
00377 char const* outcome = model.outcomeLabels[best];
00378 ParseState* nextState = state->transition(outcome);
00379 if (!nextState)
00380 nextState =state->transition("S");
00381 state = nextState;
00382 }
00383 Py_DECREF(pNumFeatures);
00384 Sentence* s = state->getSentence();
00385 delete state;
00386 return s;
00387 # endif
00388 }
00389
/// Revision pass over parsed sentences; not implemented for this parser
/// (intentional no-op stub).
void MlParser::revise(SentenceReader* sentenceReader, char const* actionFile)
{}
00392
00393 }