00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "Common/OptionStream.h"
00026 #include "io/File.h"
00027 #include "include/Timer.h"
00028
00029
00030 #include "Parser.h"
00031 #include "EventStream.h"
00032 #include "Corpus.h"
00033 #include "WordCounts.h"
00034
00035
00036 #include <list>
00037 #ifdef _WIN32
00038 # include <io.h>
00039 # include <fcntl.h>
00040 #endif
00041
00042 using namespace Parser;
00043 using namespace Tanl;
00044 using namespace IXE;
00045 using namespace std;
00046
00047 namespace Parser {
00048
00049
00050 char const* configFileDefault = "desr.conf";
00051 conf<string> mode("Mode", "parse");
00052 conf<string> inputFormat("InputFormat", "CoNLL");
00053 conf<string> outputFormat("OutputFormat", "CoNLL");
00054 conf<bool> dump("DumpFeatures", false);
00055
00056 conf<int> SentenceCutoff("SentenceCutoff", INT_MAX);
00057
00059 conf<string> PosParameters("PosParameters", "");
00060 }
00061
00062
00063 Options::spec const commandOptions[] = {
00064 "help", Options::no_arg, 'h', "-h, --help : Print this help message",
00065 "algorithm", Options::req_arg, 'a', "-a, --algorithm alg : Learning algorithm [ME]",
00066 "conf", Options::req_arg, 'c', "-c, --conf : Configuration file [desr.conf]",
00067 "convert", Options::no_arg, 'C', "-C, --conv : Convert from input to output format",
00068 "dump", Options::no_arg, 'd', "-d, --dump : dump training features",
00069 "direction", Options::no_arg, 'R', "-R, --right-to-left : reading direction",
00070 "format", Options::req_arg, 'f', "-f, --input-format format : Input file format [CoNLL]",
00071 "output", Options::req_arg, 'F', "-F, --output-format format : Output file format [CoNLL]",
00072 "lang", Options::req_arg, 'l', "-l, --lang lang : Language to be used [en]",
00073 "model", Options::req_arg, 'm', "-m, --model modelFile : Model file",
00074 "revise", Options::req_arg, 'r', "-r, --revise model/actions : Revise parse tree according to revision model or actions file",
00075 "train", Options::no_arg, 't', "-t, --train : Train parser from input file",
00076 0
00077 };
00078
00079 ostream& usage(ostream& os = cerr)
00080 {
00081 os << "Usage: desr [options] [file]" << endl;
00082 os << " If file is missing, reads from standard input." << endl;
00083 return Options::usage(commandOptions, os);
00084 }
00085
00087
00088 void train(SentenceReader* sentenceReader, char const* modelFile)
00089 {
00090 ::Parser::Parser* parser = ::Parser::Parser::create();
00091 if (parser) {
00092 Timer timer;
00093 timer.start();
00094 parser->train(sentenceReader, modelFile);
00095 if (::Parser::Parser::verbose) {
00096 timer.stop();
00097 char buf[20];
00098 timer.duration(buf, sizeof buf);
00099 cerr << "Elapsed time: " << buf << endl;
00100 }
00101 delete parser;
00102 }
00103 }
00104
00105 void parse(char const* modelFile, SentenceReader* sentenceReader)
00106 {
00107 ::Parser::Parser* parser = ::Parser::Parser::create(modelFile);
00108 if (parser) {
00109 parser->parse(sentenceReader);
00110 delete parser;
00111 }
00112 }
00113
00114 void revise(char const* modelFile, SentenceReader* sentenceReader)
00115 {
00116 ::Parser::Parser* parser = ::Parser::Parser::create();
00117 if (parser) {
00118 parser->revise(sentenceReader, modelFile);
00119 delete parser;
00120 }
00121 }
00122
00123 char const* sentenceSeparator = "\n";
00124
00125 void dumpFeatures(SentenceReader* sentenceReader, int sentenceCutoff)
00126 {
00127 GlobalInfo info;
00128 EventStream eventStream(sentenceReader, &info, sentenceCutoff);
00129 # ifdef _WIN32
00130 _setmode(_fileno(stdout), _O_BINARY);
00131 # endif
00132 if (::Parser::Parser::featureCutoff) {
00133
00134 WordIndex predIndex;
00135
00136
00137 list<Classifier::Event*> events;
00138 WordCounts predCount;
00139 int evCount = 0;
00140 Classifier::PID pID = 1;
00141
00142
00143 while (eventStream.hasNext()) {
00144 Classifier::Event* ev = eventStream.next();
00145 events.push_back(ev);
00146 evCount++;
00147 if (::Parser::Parser::verbose) {
00148 if (evCount % 10000 == 0)
00149 cerr << '+' << flush;
00150 else if (evCount % 1000 == 0)
00151 cerr << '.' << flush;
00152 }
00153 vector<string>& ec = ev->features;
00154 for (unsigned j = 0; j < ec.size(); j++) {
00155 string& pred = ec[j];
00156
00157 if (predIndex.find(pred.c_str()) == predIndex.end()) {
00158
00159 WordCounts::iterator wcit = predCount.find(pred);
00160
00161 int count;
00162 if (wcit == predCount.end())
00163 count = predCount[pred] = 1;
00164 else
00165 count = ++wcit->second;
00166 if (count >= ::Parser::Parser::featureCutoff) {
00167 predIndex[pred.c_str()] = pID++;
00168 predCount.erase(pred);
00169 }
00170 }
00171 }
00172 }
00173 if (::Parser::Parser::verbose)
00174 cerr << endl;
00175 while (!events.empty()) {
00176 Classifier::Event* ev = events.front();
00177 events.pop_front();
00178 vector<string>& ec = ev->features;
00179 string line = ev->className;
00180 bool empty = true;
00181 for (unsigned j = 0; j < ec.size(); j++) {
00182 string& pred = ec[j];
00183 WordIndex::const_iterator pit = predIndex.find(pred.c_str());
00184 if (pit != predIndex.end()) {
00185 line += ' ';
00186 line += pred;
00187 empty = false;
00188 }
00189 }
00190 if (!empty)
00191 cout << line << endl;
00192 if (ev->className == "LROOT")
00193 cout << sentenceSeparator;
00194 delete ev;
00195 }
00196 } else {
00197 while (eventStream.hasNext()) {
00198 Classifier::Event* ev = eventStream.next();
00199 cout << *ev << endl;
00200 if (eventStream.EoS())
00201 cout << sentenceSeparator;
00202 delete ev;
00203 }
00204 }
00205 }
00206
00207 void convert(SentenceReader* sentenceReader, Corpus* corpus)
00208 {
00209 while (sentenceReader->MoveNext()) {
00210 Sentence* sent = sentenceReader->Current();
00211 corpus->print(cout, *sent);
00212 cout << endl;
00213 }
00214 }
00215
/** Operation selected in main(), from the command line or the "Mode" setting. */
enum Mode {
  Train  = 0,   ///< train a parser (-t)
  Parse  = 1,   ///< parse the input with a model
  Revise = 2,   ///< revise parse trees (-r)
  Dump   = 3    ///< dump training features (-d)
};
00222
00224
00225 int main(int argc, char* argv[])
00226 {
00227 char const* alg_arg = 0;
00228 char const* lang_arg = 0;
00229 char const* configFile = configFileDefault;
00230 char const* inputFormat_arg = 0;
00231 char const* outputFormat_arg = 0;
00232 string modelFile;
00233 string modelRevise;
00234 bool rightToLeft_opt = false;
00235 bool conv_opt = false;
00236 bool mode_opt = false;
00237 Mode mode_type = (*mode == "parse") ? Parse : Train;
00238
00239
00240 OptionStream options(argc, argv, commandOptions);
00241 for (OptionStream::Option opt; options >> opt;) {
00242 switch (opt) {
00243 case 'h':
00244 usage();
00245 return -1;
00246 case 'a':
00247 alg_arg = opt.arg();
00248 break;
00249 case 'c':
00250 configFile = opt.arg();
00251 break;
00252 case 'C':
00253 conv_opt = true;
00254 break;
00255 case 'd':
00256 mode_opt = true; mode_type = Dump;
00257 break;
00258 case 'f':
00259 inputFormat_arg = opt.arg();
00260 break;
00261 case 'F':
00262 outputFormat_arg = opt.arg();
00263 break;
00264 case 'l':
00265 lang_arg = opt.arg();
00266 break;
00267 case 'm':
00268 modelFile = opt.arg();
00269 break;
00270 case 'r':
00271 mode_opt = true; mode_type = Revise;
00272 modelRevise = opt.arg();
00273 break;
00274 case 'R':
00275 rightToLeft_opt = true;
00276 break;
00277 case 't':
00278 mode_opt = true; mode_type = Train;
00279 break;
00280 }
00281 }
00282 argc -= options.shift(), argv += options.shift();
00283
00284
00285 istream* is = 0;
00286
00287 switch (argc) {
00288 case 0:
00289 is = &cin;
00290 break;
00291 case 1: {
00292 char const* inputFile = argv[0];
00293 IXE::io::File file(inputFile);
00294 if (!file.exists()) {
00295 cerr << "No input file: " << inputFile << endl;
00296 return -1;
00297 }
00298 if (!file.length()) {
00299 cerr << "Empty input file: " << inputFile << endl;
00300 return -1;
00301 }
00302 is = new ifstream(inputFile);
00303 break;
00304 }
00305 default:
00306 cerr << usage;
00307 return -1;
00308 }
00309
00310 if (modelFile.empty() && !mode_opt) {
00311 cerr << "Model file is required" << endl;
00312 cerr << usage;
00313 return -2;
00314 }
00315
00316
00317
00318
00319 if (IXE::io::File(configFile).exists())
00320 Configuration::load(configFile);
00321 else if (configFile != configFileDefault) {
00322 cerr << "Missing config file: " << configFile << endl;
00323 return -2;
00324 }
00325
00326 if (!mode_opt) {
00327 if (*mode == "train")
00328 mode_type = Train;
00329 else if (*mode == "parse")
00330 mode_type = Parse;
00331 else if (*mode == "revise")
00332 mode_type = Revise;
00333 }
00334
00335
00336 if (mode_type == Parse) {
00337
00338 ifstream ifs(modelFile.c_str());
00339 if (!ifs) {
00340 cerr << "Missing model file: " << modelFile << endl;
00341 return -3;
00342 }
00343 ::Parser::Parser::readHeader(ifs);
00344 ifs.close();
00345 } else {
00346
00347 if (alg_arg) algorithm = alg_arg;
00348 if (lang_arg) lang = lang_arg;
00349 if (rightToLeft_opt) RightToLeft = true;
00350 }
00351
00352 if (inputFormat_arg) inputFormat = inputFormat_arg;
00353 if (outputFormat_arg) outputFormat = outputFormat_arg;
00354
00355 Language const* language = Language::get(lang->c_str());
00356 if (language == 0) {
00357 cerr << "Unknown language: " << *lang << endl;
00358 return -4;
00359 }
00360
00361
00362 Corpus* corpus = Corpus::create(*language, inputFormat);
00363 if (corpus == 0) {
00364 cerr << "Unknown format: " << *inputFormat << endl;
00365 return -1;
00366 }
00367 SentenceReader* sentenceReader = corpus->sentenceReader(is);
00368
00369 if (conv_opt) {
00370 Corpus* outCorpus = new CombCorpus(*language, inputFormat->c_str());
00371 convert(sentenceReader, outCorpus);
00372 delete outCorpus;
00373 delete corpus;
00374 delete sentenceReader;
00375 return 0;
00376 }
00377
00378 try {
00379 switch (mode_type) {
00380 case Train:
00381
00382 train(sentenceReader, modelFile.c_str());
00383 break;
00384 case Dump:
00385 dumpFeatures(sentenceReader, SentenceCutoff);
00386 break;
00387 case Parse: {
00388
00389
00390 SentenceReader* basicReader = sentenceReader;
00391 if (PosParameters->size()) {
00392 PosTagger* posTagger = 0;
00393 if (!strcmp(basicReader->corpus->language.code(), "en"))
00394 posTagger = new PennPosTagger(PosParameters->c_str());
00395 else
00396 posTagger = new PosTagger(PosParameters->c_str());
00397 sentenceReader = new TaggedSentenceReader(basicReader, posTagger);
00398 }
00399 parse(modelFile.c_str(), sentenceReader);
00400 if (basicReader != sentenceReader)
00401 delete basicReader;
00402 break;
00403 }
00404 case Revise: {
00405 if (!IXE::io::File(modelRevise.c_str()).exists()) {
00406 cerr << "No revision model: " << modelRevise << endl;
00407 return -1;
00408 }
00409 revise(modelRevise.c_str(), sentenceReader);
00410 break;
00411 }
00412 }
00413 } catch (IXE::Error& e) {
00414 cerr << "Error: " << e.message() << endl;
00415 } catch (exception& e) {
00416 cerr << "Error: " << e.what() << endl;
00417 }
00418 if (is != &cin)
00419 delete is;
00420 delete sentenceReader;
00421 delete corpus;
00422 }