desr: src/SplitSentenceReader.cpp Source File

00001 /*
00002 **  DeSR
00003 **  src/SplitSentenceReader.cpp
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2005  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of DeSR.
00009 **
00010 **  DeSR is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  DeSR is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 // local
00025 #include "SplitSentenceReader.h"
00026 
00027 namespace Tanl {
00028 
00029 char const* SplitSentenceReader::BoundarySet::defaultBoundaryTokens[] =
00030   {".", "!", "?", 0 };
00031 char const* SplitSentenceReader::BoundarySet::defaultBoundaryFollowers[] =
00032   {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", 0 };
00033 
00034 SplitSentenceReader::BoundarySet const SplitSentenceReader::defaultBS;
00035 
00036 
00037   SplitSentenceReader::SplitSentenceReader(std::istream* is,
00038                                          Corpus* corpus,
00039                                          BoundarySet const* bs) :
00040   SentenceReader(is, corpus),
00041   tokenizer(is),
00042   boundarySet(bs),
00043   insideRegion(false)
00044 {
00045 }
00046 
00047 bool SplitSentenceReader::MoveNext()
00048 {
00049   if (!tokenizer.MoveNext())
00050     return false;
00051 
00052   sentence = new Sentence;
00053   int id = 1;
00054   do {
00055     char const* scanTok = tokenizer.Current()->text;
00056     TreeToken* tok = new TreeToken(id++, scanTok);
00057     sentence->push_back(tok);
00058     if (boundarySet->ends(scanTok))
00059       break;
00060   } while (tokenizer.MoveNext());
00061   return true;
00062 }
00063 
00064 } // namespace Tanl