00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "Parser.h"
00025 #include "ReviseEventStream.h"
00026 #include "Corpus.h"
00027 #include "version.h"
00028
00029
00030 #include "ixe/io/File.h"
00031
00032
00033 #include <iostream>
00034
00035 using namespace std;
00036 using namespace IXE;
00037
00038 #define MAX_LINE_LEN 8196
00039
00040 namespace Parser {
00041
00042 conf<string> algorithm("Algorithm", "SVM"); // parser algorithm name: selects the ParserFactory in Parser::create() and is written by Parser::writeHeader()
00043 conf<string> lang("Language", "en"); // language code handed to Language::get() when a ParserPipe is built
00044 conf<int> beam("Beam", 1, 1); // presumably the beam width; meaning of the third argument is not visible here -- TODO confirm against conf<>
00045
00046 conf<int> Parser::featureCutoff("FeatureCutoff", 0); // static member definition; not read anywhere in this file
00047 conf<bool> Parser::verbose("Verbose", false); // static member definition; not read anywhere in this file
00048
00049 float const GlobalInfo::freqRatio = 1.5; // dominance factor used by GlobalInfo::clearRareEntities() when comparing time and location counts
00050
00058 std::map<char const*, ParserFactory*>& ParserMap::get()
00059 {
00060 static std::map<char const*, ParserFactory*> parserFor;
00061 return parserFor;
00062 }
00063
00064 ParserFactory* ParserMap::get(char const* type)
00065 {
00066 std::map<char const*, ParserFactory*>& parserMap = get();
00067 map<char const*, ParserFactory*>::const_iterator rit = parserMap.find(type);
00068 return (rit == parserMap.end()) ? 0 : rit->second;
00069 }
00070
00071 Parser* Parser::create(char const* modelFile)
00072 {
00073 if (modelFile) {
00074
00075 ifstream ifs(modelFile);
00076 if (!ifs) {
00077 cerr << "Missing model file: " << modelFile << endl;
00078 return 0;
00079 }
00080 ::Parser::Parser::readHeader(ifs);
00081 ifs.close();
00082 }
00083
00084 ParserFactory* factory = ParserMap::get(algorithm->c_str());
00085 if (!factory) {
00086 cerr << "No such algorithm: " << *algorithm << endl;
00087 return 0;
00088 }
00089 ::Parser::Parser* parser = factory(modelFile);
00090 if (parser == 0) {
00091 cerr << "Could not load: " << modelFile << endl;
00092 return 0;
00093 }
00094 return parser;
00095 }
00096
00097 Enumerator<Sentence*>* Parser::pipe(Enumerator<std::vector<Token*>*>& tce)
00098 {
00099 return new ParserPipe(*this, tce);
00100 }
00101
00102 Enumerator<Sentence*>* Parser::pipe(Enumerator<Sentence*>& tce)
00103 {
00104 return new ParserSentPipe(*this, tce);
00105 }
00106
00107 void Parser::parse(SentenceReader* reader, ostream& os)
00108 {
00109 int las = 0;
00110 int uas = 0;
00111 int tot = 0;
00112 int sent = 0;
00113 # ifdef _WIN32
00114 if (os == cout)
00115 _setmode(_fileno(stdout), _O_BINARY);
00116 # endif
00117
00118 while (reader->MoveNext()) {
00119 Sentence* sin = reader->Current();
00120 Sentence* sout = parse(sin);
00121 reader->corpus->print(os, *sout);
00122 os << endl;
00123 ++sent;
00124
00125 Sentence::const_iterator soutIt = sout->begin();
00126 FOR_EACH (Sentence, *sin, sinIt) {
00127 TreeToken* tin = *sinIt;
00128 TreeToken* tout = *soutIt++;
00129 tot++;
00130 if (tin->linkHead() == tout->linkHead()) {
00131 uas++;
00132 if (tin->linkLabel() == tout->linkLabel())
00133 las++;
00134 }
00135 }
00136 delete sin;
00137 delete sout;
00138 }
00139 showEval(tot, las, uas, sent);
00140 }
00141
00142 void Parser::writeHeader(ostream& os)
00143 {
00144 os << "<desr alg='" << *algorithm << "' version='" << version << "'>" << endl;
00145 Configuration::Map& vars = Configuration::variables();
00146 FOR_EACH (Configuration::Map, vars, vit)
00147 vit->second->serialize(os);
00148 os << "</desr>" << endl;
00149 }
00150
00151 bool Parser::readHeader(istream& is)
00152 {
00153 Configuration::reset();
00154 Configuration::load(is);
00155 return true;
00156 }
00157
00158 void GlobalInfo::extract(Sentence const& sentence)
00159 {
00160 Language const* lang = sentence.language;
00161
00162 FOR_EACH (Sentence, sentence, sit) {
00163 TreeToken* node = *sit;
00164 Token& tok = *node->token;
00165 int head = node->linkHead();
00166 if (head == 0)
00167 continue;
00168 if (tok.isNoun(lang)) {
00169 string const* noun = tok.getLemma();
00170 if (noun && !noun->empty()) {
00171 if (tok.isTime(lang)) {
00172
00173 timeLemmas.add(*noun);
00174 } else if (tok.isLocation(lang)) {
00175
00176 locLemmas.add(*noun);
00177 }
00178
00179 Token* par = sentence[head - 1]->token;
00180 if (par->isPreposition(lang)) {
00181 if (par->isTime(lang)) {
00182
00183 timeLemmas.add(*noun);
00184 } else if (par->isLocation(lang)) {
00185
00186 locLemmas.add(*noun);
00187 }
00188 }
00189 }
00190 }
00191 }
00192 }
00193
00194 void GlobalInfo::clearRareEntities()
00195 {
00196 for (WordCounts::iterator pit = timeLemmas.begin();
00197 pit != timeLemmas.end(); ) {
00198 WordCounts::iterator cur = pit++;
00199 int tc = cur->second;
00200 int lc = locLemmas.count(cur->first);
00201 if (tc >= freqRatio * lc)
00202 locLemmas.erase(cur->first);
00203 else if (lc >= freqRatio * tc)
00204 timeLemmas.erase(cur);
00205 }
00206 }
00207
00208 void GlobalInfo::save(ofstream& ofs)
00209 {
00210
00211 ofs << timeLemmas.size() << endl;
00212 FOR_EACH (WordCounts, timeLemmas, pit)
00213 ofs << pit->first << endl;
00214
00215 ofs << locLemmas.size() << endl;
00216 FOR_EACH (WordCounts, locLemmas, pit)
00217 ofs << pit->first << endl;
00218 }
00219
00220 void GlobalInfo::load(ifstream& ifs)
00221 {
00222 char line[MAX_LINE_LEN];
00223
00224 if (ifs.getline(line, MAX_LINE_LEN)) {
00225 int n = atoi(line);
00226 while (n-- && ifs.getline(line, MAX_LINE_LEN))
00227 timeLemmas.add(line);
00228
00229 if (ifs.getline(line, MAX_LINE_LEN)) {
00230 n = atoi(line);
00231 while (n-- && ifs.getline(line, MAX_LINE_LEN))
00232 locLemmas.add(line);
00233 }
00234 }
00235 }
00236
00237 void Parser::showEval(int tokenCount, int las, int uas, int sentCount)
00238 {
00239 cerr << "UAS: " << float(uas)/tokenCount << " (" << uas << '/' << tokenCount << ')' << endl;
00240 cerr << "LAS: " << float(las)/tokenCount << " (" << las << '/' << tokenCount << ')' << endl;
00241 cerr << "Sentences: " << sentCount << endl;
00242 }
00243
00244
00245
00246 ParserPipe::ParserPipe(Parser& parser, Enumerator<std::vector<Token*>*>& tve) :
00247 parser(parser),
00248 tve(tve),
00249 language(Language::get(lang->c_str()))
00250 {
00251 }
00252
00253 bool ParserPipe::MoveNext()
00254 {
00255 return tve.MoveNext();
00256 }
00257
00258 Sentence* ParserPipe::Current()
00259 {
00260 vector<Token*>* sent = tve.Current();
00261 Sentence* sentence = new Sentence(language);
00262 int id = 1;
00263 FOR_EACH (vector<Token*>, *sent, vit) {
00264 Token* tok = *vit;
00265 TreeToken* token = new TreeToken(id++, tok->form, tok->attributes, tok->links);
00266 sentence->push_back(token);
00267 delete tok;
00268 }
00269 delete sent;
00270 return parser.parse(sentence);
00271 }
00272
00273
00274
00275 ParserSentPipe::ParserSentPipe(Parser& parser, Enumerator<Sentence*>& tve) :
00276 parser(parser),
00277 tve(tve)
00278 { }
00279
00280 bool ParserSentPipe::MoveNext()
00281 {
00282 return tve.MoveNext();
00283 }
00284
00285 Sentence* ParserSentPipe::Current()
00286 {
00287 Sentence* sentence = tve.Current();
00288 return parser.parse(sentence);
00289 }
00290
00291 }