00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "SplitSentenceReader.h"
00026
00027 namespace Tanl {
00028
00029 char const* SplitSentenceReader::BoundarySet::defaultBoundaryTokens[] =
00030 {".", "!", "?", 0 };
00031 char const* SplitSentenceReader::BoundarySet::defaultBoundaryFollowers[] =
00032 {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", 0 };
00033
00034 SplitSentenceReader::BoundarySet const SplitSentenceReader::defaultBS;
00035
00036
00037 SplitSentenceReader::SplitSentenceReader(std::istream* is,
00038 Corpus* corpus,
00039 BoundarySet const* bs) :
00040 SentenceReader(is, corpus),
00041 tokenizer(is),
00042 boundarySet(bs),
00043 insideRegion(false)
00044 {
00045 }
00046
00047 bool SplitSentenceReader::MoveNext()
00048 {
00049 if (!tokenizer.MoveNext())
00050 return false;
00051
00052 sentence = new Sentence;
00053 int id = 1;
00054 do {
00055 char const* scanTok = tokenizer.Current()->text;
00056 TreeToken* tok = new TreeToken(id++, scanTok);
00057 sentence->push_back(tok);
00058 if (boundarySet->ends(scanTok))
00059 break;
00060 } while (tokenizer.MoveNext());
00061 return true;
00062 }
00063
00064 }