00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifdef _WIN32
00025 #include "lib/strtok_r.h"
00026 #endif
00027
00028
00029 #include "conf/conf_int.h"
00030 #include "conf/conf_float.h"
00031 #include "text/WordIndex.h"
00032
00033 #include "Parser.h"
00034 #include "ap.h"
00035 #include "EventStream.h"
00036
00037
00038 #include <list>
00039
00040
00041
00042
00043 using namespace std;
00044
00045 namespace Parser {
00046
// Configuration parameter "ApIterations" (default 20). ApParserFactory passes
// its value to the ApParser constructor as the iteration count.
00048 IXE::conf<int> apIterations("ApIterations", 20);
// Configuration parameter "ApPercent" (default 0.1). Its value is copied into
// AP::updatePercent every time an ApParser is constructed.
00050 IXE::conf<float> apPercent("ApPercent", 0.1F);
// Configuration parameter "ApPartition" (default 1). It is not referenced by
// the ApParser code in this listing.
00052 IXE::conf<int> partitionSize("ApPartition", 1);
00053
00056
00060 struct ApParser : public Parser
00061 {
00062 ApParser(char const* modelFile, int iter = 0);
00063
00064 void train(SentenceReader* sentenceReader, char const* modelFile);
00065 Sentence* parse(Sentence* sentence);
00066 void revise(SentenceReader* sentenceReader, char const* actionFile = 0);
00067
00068 APSV ap;
00069 int iter;
00070 };
00071
00075 Parser* ApParserFactory(char const* modelFile = 0)
00076 {
00077 return new ApParser(modelFile, apIterations);
00078 }
00079
00080 REGISTER_PARSER(AP, ApParserFactory);
00081
00082 ApParser::ApParser(char const* modelFile, int iter) :
00083 Parser(ap.predIndex),
00084 iter(iter)
00085 {
00086 AP::verbose = Parser::verbose;
00087 AP::updatePercent = apPercent;
00088
00089
00090 if (!modelFile)
00091 return;
00092 ifstream ifs(modelFile);
00093 if (!ifs)
00094 throw IXE::FileError(string("Missing model file: ") + modelFile);
00095
00096 readHeader(ifs);
00097 ap.load(ifs);
00098
00099 info.load(ifs);
00100 ifs.close();
00101 }
00102
00103 void ApParser::train(SentenceReader* sentenceReader, char const* modelFile)
00104 {
00105 WordIndex labelIndex;
00106 vector<string> labels;
00107
00108 vector<string> predLabels;
00109
00110
00111 list<Tanl::Classifier::Event*> events;
00112
00113 WordCounts predCount;
00114 int evCount = 0;
00115 Tanl::Classifier::PID pID = 1;
00116
00117
00118 EventStream eventStream(sentenceReader, &info);
00119 while (eventStream.hasNext()) {
00120 Tanl::Classifier::Event* ev = eventStream.next();
00121 events.push_back(ev);
00122 evCount++;
00123 if (verbose) {
00124 if (evCount % 10000 == 0)
00125 cerr << '+' << flush;
00126 else if (evCount % 1000 == 0)
00127 cerr << '.' << flush;
00128 }
00129 vector<string>& ec = ev->features;
00130 for (unsigned j = 0; j < ec.size(); j++) {
00131 string& pred = ec[j];
00132
00133 if (predIndex.find(pred.c_str()) == predIndex.end()) {
00134
00135 WordCounts::iterator wcit = predCount.find(pred);
00136
00137 int count;
00138 if (wcit == predCount.end())
00139 count = predCount[pred] = 1;
00140 else
00141 count = ++wcit->second;
00142 if (count >= featureCutoff) {
00143 predLabels.push_back(pred);
00144 predIndex[pred.c_str()] = pID++;
00145 predCount.erase(pred);
00146 }
00147 }
00148 }
00149 }
00150 if (verbose)
00151 cerr << endl;
00152
00153
00154 Cases cases;
00155 cases.reserve(evCount);
00156 int n = 0;
00157 Tanl::Classifier::ClassID oID = 0;
00158 while (!events.empty()) {
00159 Tanl::Classifier::Event* ev = events.front();
00160 events.pop_front();
00161 cases.push_back(Case());
00162 X& x = cases[n].first;
00163
00164 vector<string>& ec = ev->features;
00165 char const* c = ev->className.c_str();
00166 for (unsigned j = 0; j < ec.size(); j++) {
00167 string& pred = ec[j];
00168 WordIndex::const_iterator pit = predIndex.find(pred.c_str());
00169 if (pit != predIndex.end()) {
00170 x.push_back(pit->second);
00171 }
00172 }
00173 if (x.size()) {
00174 if (labelIndex.find(c) == labelIndex.end()) {
00175 labelIndex[c] = oID++;
00176 labels.push_back(c);
00177 }
00178 cases[n].second = labelIndex[c];
00179 n++;
00180 if (verbose) {
00181 if (n % 10000 == 0)
00182 cerr << '+' << flush;
00183 else if (n % 1000 == 0)
00184 cerr << '.' << flush;
00185 }
00186 x.push_back(0);
00187 }
00188 delete ev;
00189 }
00190 cases.resize(n);
00191 if (verbose)
00192 cerr << endl;
00193
00194 int predSize = predLabels.size();
00195 predSize++;
00196 APSV ap(labels.size(), predSize);
00197
00198 ofstream ofs(modelFile, ios::binary | ios::trunc);
00199
00200 writeHeader(ofs);
00201
00202 ofs << labels.size() << endl;
00203 FOR_EACH (vector<string>, labels, pit)
00204 ofs << *pit << endl;
00205
00206 ofs << predLabels.size() << endl;
00207 FOR_EACH (vector<string>, predLabels, pit)
00208 ofs << *pit << endl;
00209
00210 predIndex.clear();
00211 predIndex = WordIndex();
00212 labelIndex.clear();
00213 labelIndex = WordIndex();
00214
00215 info.clearRareEntities();
00216
00217 ap.train(cases, iter);
00218
00219 ap.save(ofs);
00220
00221 info.save(ofs);
00222 }
00223
00224 Sentence* ApParser::parse(Sentence* sentence)
00225 {
00226 ParseState state(*sentence, &info, predIndex);
00227
00228 while (state.hasNext()) {
00229 Tanl::Classifier::Context& pIDs = *state.next();
00230 X x;
00231 x.push_back(0);
00232
00233 for (unsigned i = 0; i < pIDs.size(); i++) {
00234 x.push_back(pIDs[i]);
00235 }
00236 Y prediction = ap.predict(x);
00237 string& outcome = ap.labels[prediction];
00238 if (!state.transition(outcome.c_str()))
00239 state.transition("S");
00240 }
00241 return state.getSentence();
00242 }
00243
00244 void ApParser::revise(SentenceReader* sentenceReader, char const* actionFile)
00245 {
00246 if (actionFile) {
00247
00248 ifstream ifs(actionFile);
00249
00250 ReviseContextStream contextStream(sentenceReader, predIndex);
00251
00252 char line[4000];
00253 while (contextStream.hasNext()) {
00254 ++contextStream.cur;
00255 ifs.getline(line, sizeof(line));
00256 char* next = line;
00257 char const* outcome = strtok_r(0, " \t", &next);
00258 contextStream.actions.push_back(outcome);
00259 }
00260 } else {
00261 ReviseContextStream contextStream(sentenceReader, predIndex);
00262
00263 while (contextStream.hasNext()) {
00264 Tanl::Classifier::Context& pIDs = *contextStream.next();
00265 X x;
00266 x.push_back(0);
00267
00268 for (unsigned i = 0; i < pIDs.size(); i++) {
00269 x.push_back(pIDs[i]);
00270 }
00271 FeatureID prediction = ap.predict(x);
00272 string& outcome = ap.labels[prediction];
00273 contextStream.actions.push_back(outcome);
00274 }
00275 }
00276 }
00277
00278 }