desr: text/HtmlTokenizer.cpp Source File

00001 /*
00002 **  IXE C++ Library
00003 **  ixe/text/HtmlTokenizer.cpp
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2002  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of DeSR.
00009 **
00010 **  DeSR is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  DeSR is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #include "text/HtmlTokenizer.h"
00025 
00026 #include <istream>
00027 #include <string.h>
00028 
00029 using namespace std;
00030 
00031 namespace Tanl {
00032 namespace Text {
00033 
00034 // Token separator set that also contains the line terminators: space, tab, '\n' and '\r'.
00035 char const HtmlTokenizer::delimitersNL[] = " \t\n\r";
00036 
/**
 *  Find the next token in the character range [start, end), skipping
 *  HTML markup.
 *
 *  Leading separator characters and whole tags (everything from '<' up to
 *  and including the next '>') are skipped.  The token then extends up to,
 *  but not including, the first separator character, the first '<' (a tag
 *  glued to the word, as in "word</b>") or @a end, whichever comes first.
 *
 *  Since ::strchr() also matches the terminator of @a sep, a NUL byte in
 *  the input behaves like a separator.
 *
 *  @param start  in: position where scanning begins;
 *                out: position just past the token, or @a end if there
 *                is no token.
 *  @param end    one past the last character to examine.
 *  @param sep    NUL-terminated set of separator characters.
 *  @return pointer to the first character of the token, or 0 if the range
 *          holds no token.  The token is not NUL-terminated: its length
 *          is (@a start - result) after the call.
 */
static
char const*
next_token(char const*& start, char const* end, const char* sep)
{
  // skip separator characters and markup preceding the token
  while (start < end) {
    if (::strchr(sep, *start))
      ++start;
    else if (*start == '<') {
      // discard the tag including its closing '>'; a tag which is not
      // closed within the range swallows the rest of the range
      while (start < end && *start++ != '>')
        ;
    } else
      break;
  }

  if (start >= end)
    return 0;

  char const* token = start;

  // scan for the end of the token: a separator (or NUL byte), the
  // opening of a tag, or the end of the range.
  // Stopping at '<' keeps adjacent markup out of the token; the tag
  // itself is discarded by the skip loop of the next call.
  while (++start < end && *start != '<' && !::strchr(sep, *start))
    ;

  return token;                 // return the position where the token starts
}
00074 
00075 char const* HtmlTokenizer::next()
00076 {
00077   if (hasNext()) {
00078     char const* res = tok;
00079     tok = 0;                    // consume token, forcing hasNext() to advance
00080     return res;
00081   }
00082   return 0;
00083 }
00084 
00085 char const* HtmlTokenizer::hasNext()
00086 {
00087   if (tok)
00088     return tok;
00089   count += toklen;      // count characters up to end of previous token
00090   char const* start = rest;
00091   if (rest && rest < end) { // continue on same line
00092     if ((tok = next_token(rest, end, delim))) {
00093       toklen = rest - tok;
00094       count += tok - start;     // count characters before next token
00095       return tok;
00096     }
00097   } else if (is) {
00098     if (count) count++; // (count previous \n)
00099     while (is->getline(line, MAX_LINE_LEN)) {
00100       lineNumber++;
00101       start = rest = line;
00102       end = rest + is->gcount() - 1; // discard newline
00103       if ((tok = next_token(rest, end, delim))) {
00104         toklen = rest - tok;
00105         count += tok - start; // count characters before next token
00106         return tok;
00107       }
00108     }
00109   }
00110   count += end - start; // got to the end
00111   return 0;
00112 }
00113 
00114 } // namespace Text
00115 } // namespace Tanl