00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef DeSR_SplitSentenceReader_H
00025 #define DeSR_SplitSentenceReader_H
00026
00027
00028 #include "Corpus.h"
00029 #include "Tokenizer.h"
00030 #include "SentenceReader.h"
00031
00032
00033 #include "text/less.h"
00034
00035
00036 #include <set>
00037
00038 namespace Tanl {
00039
00044 class SplitSentenceReader : public SentenceReader
00045 {
00046 public:
00047
00051 struct BoundarySet {
00052 static char const* defaultBoundaryTokens[];
00053 static char const* defaultBoundaryFollowers[];
00054
00055 BoundarySet(char const* boundaryTokens[] = defaultBoundaryTokens,
00056 char const* boundaryFollowers[] = defaultBoundaryFollowers,
00057 char const* regionBegin = 0,
00058 char const* regionEnd = 0) {
00059 for (char const** scan = boundaryTokens; *scan; scan++)
00060 boundaries.insert(*scan);
00061 for (char const** scan = boundaryFollowers; *scan; scan++)
00062 followers.insert(*scan);
00063 }
00064
00065 bool ends(char const* tok) const {
00066 return boundaries.find(tok) != boundaries.end();
00067 }
00068
00069 std::set<char const*> boundaries;
00070 std::set<char const*> followers;
00071 };
00072 static BoundarySet const defaultBS;
00073
00079 SplitSentenceReader(std::istream* is,
00080 Corpus* corpus = 0,
00081 BoundarySet const* bs = &defaultBS);
00082
00084 bool MoveNext();
00085
00087 void reset() { tokenizer.Reset(); insideRegion = false; }
00088
00089 private:
00090 Parser::Tokenizer tokenizer;
00091 BoundarySet const* boundarySet;
00092 bool insideRegion;
00093
00094 # ifdef STEMMER
00095 sb_stemmer* stemmer;
00096 # endif
00097 };
00098
00099 }
00100
00101 #endif // DeSR_SplitSentenceReader_H