00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "Suffixes.h"
00025 #include <cassert>
00026 #include <algorithm>
00027
00028 using namespace std;
00029
00030 namespace Tanl {
00031 namespace Text {
00032
00033 #define MAX_LINE_LEN 4096
00034
00035 Suffixes::Suffixes(char const* file) {
00036 ifstream ifs(file);
00037 load(ifs);
00038 }
00039
00040 Suffixes::Suffixes(string& file) {
00041 ifstream ifs(file.c_str());
00042 load(ifs);
00043 }
00044
00045 void Suffixes::load(ifstream& ifs)
00046 {
00047 char line[MAX_LINE_LEN];
00048 while (ifs.getline(line, MAX_LINE_LEN)) {
00049 char* sl = line;
00050 while (isspace(*sl))
00051 sl++;
00052 int len = strlen(sl);
00053 if (len == 0)
00054 continue;
00055 char* rev = new char[len+1];
00056 char* back = sl + len - 1;
00057 char* s = rev;
00058 while (back >= sl)
00059 *s++ = *back--;
00060 *s = '\0';
00061 push_back(rev);
00062 }
00063 std::sort(begin(), end());
00064 }
00071 char const*
00072 Suffixes::match(char const* word)
00073 {
00074
00075 const_iterator first = begin();
00076 const_iterator last = end();
00077 const_iterator middle;
00078 int len = last - first;
00079 unsigned char mchar, vchar;
00080 register const char *smiddle, *suffix;
00081
00082 char const* revword = word + ::strlen(word) - 1;
00083
00084 while (len > 0) {
00085 int half = len / 2;
00086 middle = first + half;
00087
00088 smiddle = *middle;
00089 suffix = revword;
00090 assert (*smiddle);
00091 while (suffix >= word &&
00092 (mchar = *smiddle) == (vchar = *suffix))
00093 if (*++smiddle == '\0')
00094 return suffix;
00095 else
00096 --suffix;
00097 if (mchar < vchar) {
00098 first = middle + 1;
00099 len = len - half - 1;
00100 }
00101 else
00102 len = half;
00103 }
00104 return 0;
00105 }
00106
00107 void Suffixes::store(char const* file)
00108 {
00109 ofstream ofs(file, ios::binary);
00110
00111 for (const_iterator wit = begin(); wit != end(); wit++)
00112 ofs << *wit << endl;
00113 }
00114
00115 }
00116 }