00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_Text_WordSet_H
00025 #define Tanl_Text_WordSet_H
00026
#include "include/unordered_set.h"
#include "text/strings.h"

#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#include <fstream>
#include <new>
#include <string>
00033
00034 namespace Tanl {
00035 namespace Text {
00036
/**
 * Hash set of C strings that keeps its own heap copy of every stored word.
 *
 * Words added through insert() are duplicated with strdup() and released
 * with free() in the destructor.
 *
 * NOTE(review): a destructor is defined but no copy constructor or copy
 * assignment. A copied set would hold the same pointers and free them a
 * second time, so instances should not be copied.
 *
 * @tparam _Hash  hash functor applied to the const char* keys.
 * @tparam _Pred  equality functor applied to the const char* keys.
 */
template <class _Hash = hash<const char *>,
          class _Pred = std::equal_to<const char*> >
class WordSetBase :
  public unordered_set<const char*, _Hash, _Pred>
{
protected:
  typedef unordered_set<const char*, _Hash, _Pred> _Base;

public:

  /** Create an empty set. */
  WordSetBase() { }

  /**
   * Construct from @p file. Defined out of line (not visible in this
   * header); presumably loads the words contained in the file.
   */
  WordSetBase(char const* file);

  /** Same as above, taking the file name as a std::string. */
  WordSetBase(std::string& file);

  /** Remove every word from the set and free its storage. */
  ~WordSetBase() {
    for (typename _Base::iterator it = this->begin(); it != this->end();) {
      char const* word = *it;
      // `this->` is required: erase() lives in a dependent base class and is
      // not found by unqualified lookup. Advance before erasing so that the
      // iterator stays valid, and free only after the element is unlinked.
      this->erase(it++);
      free(const_cast<char*>(word));
    }
  }

  /**
   * @return true if @p word is present in the set.
   */
  bool contains(char const* word) const {
    return this->find(word) != this->end();
  }

  /**
   * @return true if @p word is present in the set.
   */
  bool contains(std::string const& word) const {
    return this->find(word.c_str()) != this->end();
  }

  /**
   * Insert a private copy of @p word unless an equal word is already stored.
   *
   * @return the iterator to the stored word and true when a new copy was
   *         inserted, or the iterator to the existing word and false.
   */
  std::pair<typename _Base::iterator, bool> insert(const char* word) {
    typename _Base::iterator fit = this->find(word);
    return (fit != this->end()) ?
      std::pair<typename _Base::iterator, bool>(fit, false) :
      _Base::insert(::strdup(word));
  }

  /**
   * Defined out of line (not visible in this header); presumably writes
   * the words to @p file.
   */
  void store(char const* file);

  /**
   * Defined out of line (not visible in this header); presumably reads
   * words from @p file into the set.
   */
  void load(char const* file);

protected:
  /** Stream variant of load(); defined out of line. */
  void load(std::ifstream& ifs);
};
00101
00106 class WordSet : public WordSetBase<>
00107 {
00108 public:
00109 WordSet() { }
00110 WordSet(char const* file) : WordSetBase<>(file) { }
00111 WordSet(std::string& file) : WordSetBase<>(file) { }
00112 };
00113
/**
 * Equality predicate on C strings that ignores letter case and skips every
 * non-alphanumeric character, so that e.g. "E-mail" equals "email".
 */
struct NormEqual
{
  /**
   * @return true if @p s1 and @p s2 contain the same sequence of
   *         alphanumeric characters, compared case-insensitively.
   */
  bool operator()(const char* s1, const char* s2) const
  {
    for (;;) {
      // The <ctype.h> functions have undefined behaviour for negative
      // arguments, so go through unsigned char before classifying.
      const unsigned char c1 = static_cast<unsigned char>(*s1);
      const unsigned char c2 = static_cast<unsigned char>(*s2);
      if (c1 == '\0' && c2 == '\0')
        return true;                    // both strings exhausted together
      if (c1 != '\0' && !isalnum(c1)) {
        ++s1;                           // ignore punctuation etc. in s1
        continue;
      }
      if (c2 != '\0' && !isalnum(c2)) {
        ++s2;                           // ignore punctuation etc. in s2
        continue;
      }
      // Also covers the case where only one string has ended: '\0' never
      // matches an alphanumeric character.
      if (tolower(c1) != tolower(c2))
        return false;
      ++s1;
      ++s2;
    }
  }
};
00141
00142 struct NormHash
00143 {
00144 size_t operator()(char const* str) const {
00145 std::string s = str;
00146 to_lower(s);
00147 return hash<char const *>()(s.c_str());
00148 }
00149 };
00150
00151 class NormWordSet : public WordSetBase<NormHash, NormEqual>
00152 {
00153 public:
00154 NormWordSet() { }
00155 NormWordSet(char const* file) : WordSetBase<NormHash, NormEqual>(file) { }
00156 NormWordSet(std::string& file) : WordSetBase<NormHash, NormEqual>(file) { }
00157
00161 std::pair<_Base::iterator, bool> insert(const char* word) {
00162 char* lower = (char*)malloc(strlen(word));
00163 to_lower(lower, word);
00164 _Base::iterator fit = this->find(lower);
00165 return (fit != this->end()) ?
00166 std::pair<_Base::iterator, bool>(fit, false) :
00167 _Base::insert(lower);
00168 }
00169
00173 bool contains(char const* word) {
00174 return this->find(word) != this->end();
00175 }
00176
00180 bool contains(std::string const& word) {
00181 return this->find(word.c_str()) != this->end();
00182 }
00183
00184 };
00185
00186 }
00187 }
00188
00189
00190 #ifndef NON_TEMPLATE
00191 # include "WordSet.cpp"
00192 #endif
00193
00194 #endif // Tanl_Text_WordSet_H