00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifndef _terimber_fuzzyphonetic_hpp_
00028 #define _terimber_fuzzyphonetic_hpp_
00029
00030 #include "fuzzyphonetic.h"
00031 #include "base/common.hpp"
00032
00033 BEGIN_TERIMBER_NAMESPACE
00034 #pragma pack(4)
00035
00036 namespace fuzzyphonetic
00037 {
00038
00039
00040
00041
00042
00043
00044 const ub1_t H_NOTE = 0;
00045 const ub1_t TH_NOTE = 1;
00046 const ub1_t F_NOTE = 2;
00047 const ub1_t P_NOTE = 3;
00048 const ub1_t T_NOTE = 4;
00049 const ub1_t K_NOTE = 5;
00050 const ub1_t S_NOTE = 6;
00051 const ub1_t X_NOTE = 7;
00052 const ub1_t B_NOTE = 8;
00053 const ub1_t N_NOTE = 9;
00054 const ub1_t L_NOTE = 10;
00055 const ub1_t M_NOTE = 11;
00056 const ub1_t W_NOTE = 12;
00057 const ub1_t Y_NOTE = 13;
00058 const ub1_t R_NOTE = 14;
00059 const ub1_t J_NOTE = 15;
00060 const ub1_t A_NOTE = 16;
00061
00062
00063 const ub1_t legal_mask = 0x10;
00064 const ub1_t metaphoneic_mask = 0x0F;
00065 const ub1_t lower_mask = 0x20;
00066 const ub1_t vowel_mask = 0x40;
00067
00068 static const ub1_t metaphoneic_rules[256] =
00069 {
00070
00071 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00072
00073 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00074
00075 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00076
00077 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00078
00079 0,
00080 legal_mask + vowel_mask,
00081 legal_mask,
00082 legal_mask,
00083 legal_mask,
00084 legal_mask + vowel_mask,
00085 legal_mask,
00086 legal_mask,
00087 legal_mask,
00088 legal_mask + vowel_mask,
00089 legal_mask,
00090 legal_mask,
00091 legal_mask,
00092 legal_mask,
00093 legal_mask,
00094 legal_mask + vowel_mask,
00095
00096 legal_mask,
00097 legal_mask,
00098 legal_mask,
00099 legal_mask,
00100 legal_mask,
00101 legal_mask + vowel_mask,
00102 legal_mask,
00103 legal_mask,
00104 legal_mask,
00105 legal_mask + vowel_mask,
00106 legal_mask,
00107 0, 0, 0, 0, 0,
00108
00109 0,
00110 legal_mask + vowel_mask + lower_mask,
00111 legal_mask + lower_mask,
00112 legal_mask + lower_mask,
00113 legal_mask + lower_mask,
00114 legal_mask + vowel_mask + lower_mask,
00115 legal_mask + lower_mask,
00116 legal_mask + lower_mask,
00117 legal_mask + lower_mask,
00118 legal_mask + vowel_mask + lower_mask,
00119 legal_mask + lower_mask,
00120 legal_mask + lower_mask,
00121 legal_mask + lower_mask,
00122 legal_mask + lower_mask,
00123 legal_mask + lower_mask,
00124 legal_mask + vowel_mask + lower_mask,
00125
00126 legal_mask + lower_mask,
00127 legal_mask + lower_mask,
00128 legal_mask + lower_mask,
00129 legal_mask + lower_mask,
00130 legal_mask + lower_mask,
00131 legal_mask + vowel_mask + lower_mask,
00132 legal_mask + lower_mask,
00133 legal_mask + lower_mask,
00134 legal_mask + lower_mask,
00135 legal_mask + vowel_mask + lower_mask,
00136 legal_mask + lower_mask,
00137 0, 0, 0, 0, 0,
00138
00139 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00140
00141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00142
00143 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00144
00145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00146
00147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00148
00149 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00150
00151 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00152
00153 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00154 };
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167 inline bool is_legal_char(char ch) { return (metaphoneic_rules[(unsigned char)ch] & legal_mask) != 0; }
00168 inline char to_upper(char ch) { return (metaphoneic_rules[(unsigned char)ch] & lower_mask) ? ch - 0x20 : ((metaphoneic_rules[(unsigned char)ch] & legal_mask)? ch : 0); }
00169 inline bool is_vowel(char ch) { return (metaphoneic_rules[(unsigned char)ch] & vowel_mask) != 0; }
00170 inline char to_lower(char ch) { return (metaphoneic_rules[(unsigned char)ch] & lower_mask) ? ch : ((metaphoneic_rules[(unsigned char)ch] & legal_mask) ? ch + 0x20 : ch); }
00171
00172
00173 inline bool check_sub_string(const char* word, const char* pend, const char* ethalon, size_t len)
00174 {
00175 if (!word)
00176 return false;
00177
00178 while (word != pend && len-- && *ethalon)
00179 {
00180 if (to_upper(*word) != *ethalon)
00181 return false;
00182
00183 ++word, ++ethalon;
00184 }
00185
00186 return word != pend;
00187 }
00188
00189 inline bool is_slavo_germanic(const char* word, size_t len)
00190 {
00191 while (len-- && *word)
00192 {
00193 switch (to_upper(*word))
00194 {
00195 case 'W':
00196 return true;
00197 case 'K':
00198 return true;
00199 case 'C':
00200 if (to_upper(*(word + 1)) == 'Z')
00201 return true;
00202 break;
00203 }
00204
00205 ++word;
00206 }
00207
00208 return false;
00209 }
00210
00211 inline
00212 size_t
00213 find_metaphone_distance(const metaphone_key& x, const metaphone_key& y, byte_allocator& tmp, size_t max_penalty)
00214 {
00215 return metaphone_distance((const char*)x._array, x._length, (const char*)y._array, y._length, tmp, max_penalty);
00216 }
00217
00218
00219 inline
00220 size_t find_reflection_distance(const reflection_key& x, const reflection_key& y, byte_allocator& tmp, size_t max_penalty)
00221 {
00222 return metaphone_distance((const char*)x._array, reflection_key::REFSIZE, (const char*)y._array, reflection_key::REFSIZE, tmp, max_penalty);
00223 }
00224
00225 inline
00226 size_t find_word_distance(const char* x, size_t xlen, const char* y, size_t ylen, byte_allocator& tmp, size_t max_penalty)
00227 {
00228 return metaphone_distance(x, xlen, y, ylen, tmp, max_penalty);
00229 }
00230
00231 template< class T >
00232 inline
00233 size_t
00234 metaphone_distance(const T* ax, size_t x, const T* ay, size_t y, byte_allocator& tmp, size_t max_penalty)
00235 {
00236 const size_t dim = sizeof(size_t) * (x + 1) * (y + 1);
00237 size_t* matrix = (size_t*)tmp.allocate(dim);
00238 if (!matrix)
00239 return 0;
00240
00241 memset(matrix, 0, dim);
00242
00243 size_t row, col;
00244 for (row = 0; row <= x; ++row)
00245 matrix[row * (y + 1)] = row;
00246 for (col = 1; col <= y; ++col)
00247 matrix[col] = col;
00248
00249 for (row = 1; row <= x; ++row)
00250 {
00251 register T x_row = ax[row - 1];
00252 for (col = 1; col <= y; ++col)
00253 {
00254 register T y_col = ay[col - 1];
00255
00256 size_t cost;
00257 if (x_row == y_col)
00258 {
00259 cost = 0;
00260 }
00261 else
00262 {
00263 cost = 1;
00264 }
00265
00266 register size_t above = matrix[(row - 1) * (y + 1) + col];
00267 register size_t left = matrix[row * (y + 1) + col - 1];
00268 register size_t diag = matrix[(row - 1) * (y + 1) + col - 1];
00269 register size_t cell = __min(above + 1, __min(left + 1, diag + cost));
00270
00271 if (row > 2 && col > 2)
00272 {
00273 register size_t trans = matrix[(row - 2) * (y + 1) + col - 2] + 1;
00274 if (ax[row - 2] != y_col)
00275 ++trans;
00276 if (x_row != ay[col - 2])
00277 ++trans;
00278 if (cell > trans)
00279 cell = trans;
00280 }
00281
00282 matrix[row * (y + 1) + col] = cell;
00283
00284
00285
00286 }
00287 }
00288
00289 return matrix[x * (y + 1) + y];
00290 }
00291
00292 inline
00293 size_t
00294 find_key_distance(const metaphone_key& x, const metaphone_key& y, byte_allocator& tmp)
00295 {
00296 return metaphone_distance((const size_t*)x._array, x._length, (const size_t*)y._array, y._length, tmp, 0xffffffff);
00297 }
00298
00299 }
00300
00301 #pragma pack()
00302 END_TERIMBER_NAMESPACE
00303
00304 #endif //