00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "text/HtmlTokenizer.h"
00025
00026 #include <istream>
00027 #include <string.h>
00028
00029 using namespace std;
00030
00031 namespace Tanl {
00032 namespace Text {
00033
00034
00035 char const HtmlTokenizer::delimitersNL[] = " \t\n\r";
00036
/**
 * Find the next token in the range [@a start, @a end), skipping separators
 * and HTML tags.
 *
 * A tag is anything from a '<' up to and including the next '>' (or up to
 * @a end when the tag is not closed within the range).
 * A token is a maximal run of characters containing neither a separator nor
 * a '<', so markup directly attached to a word ("bold</b>") is not made
 * part of the token.
 *
 * @param start  position where scanning begins; on return it points just
 *               past the token found, or equals @a end when none was found.
 * @param end    one past the last character that may be examined.
 * @param sep    NUL-terminated set of separator characters.
 * @return pointer to the first character of the token (its length is
 *         @a start minus the returned pointer), or 0 if the range holds
 *         only separators and tags.
 */
static
char const*
next_token(char const*& start, char const* end, const char* sep)
{
  // Skip whatever precedes the token: separators and whole tags.
  // strchr() also matches the terminator of sep, hence a NUL byte in the
  // input is handled like a separator.
  while (start < end) {
    if (::strchr(sep, *start))
      ++start;
    else if (*start == '<') {
      // consume through the closing '>', or to the end if it is missing
      while (start < end && *start++ != '>')
        ;
    } else
      break;
  }

  if (start >= end)
    return 0;                   // nothing but separators and tags

  char const* token = start;

  // The token extends up to the next separator, NUL byte or tag opening.
  // Stopping at '<' leaves the tag to be skipped by the following call.
  while (++start < end && *start != '<' && !::strchr(sep, *start))
    ;

  return token;                 // return the position where the token starts
}
00074
00075 char const* HtmlTokenizer::next()
00076 {
00077 if (hasNext()) {
00078 char const* res = tok;
00079 tok = 0; // consume token, forcing hasNext() to advance
00080 return res;
00081 }
00082 return 0;
00083 }
00084
00085 char const* HtmlTokenizer::hasNext()
00086 {
00087 if (tok)
00088 return tok;
00089 count += toklen; // count characters up to end of previous token
00090 char const* start = rest;
00091 if (rest && rest < end) { // continue on same line
00092 if ((tok = next_token(rest, end, delim))) {
00093 toklen = rest - tok;
00094 count += tok - start; // count characters before next token
00095 return tok;
00096 }
00097 } else if (is) {
00098 if (count) count++; // (count previous \n)
00099 while (is->getline(line, MAX_LINE_LEN)) {
00100 lineNumber++;
00101 start = rest = line;
00102 end = rest + is->gcount() - 1; // discard newline
00103 if ((tok = next_token(rest, end, delim))) {
00104 toklen = rest - tok;
00105 count += tok - start; // count characters before next token
00106 return tok;
00107 }
00108 }
00109 }
00110 count += end - start; // got to the end
00111 return 0;
00112 }
00113
00114 } // namespace Text
00115 } // namespace Tanl