00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef _terimber_fuzzyimpl_h_
00029 #define _terimber_fuzzyimpl_h_
00030
00031 #include "fuzzy/fuzzyaccess.h"
00032
00033 #include "base/map.h"
00034 #include "base/vector.h"
00035 #include "base/stack.h"
00036 #include "base/string.h"
00037 #include "smart/byterep.h"
00038 #include "tokenizer/tokenizer.h"
00039
00040 BEGIN_TERIMBER_NAMESPACE
00041 #pragma pack(4)
00042
00043
00044 #define MAX_TOKEN_LENGTH (size_t)64
00045 #define MAX_PHRASE_TOKENS (size_t)64
00046
00049 class metaphone_key
00050 {
00051 public:
00053 metaphone_key() :
00054 _array(0),
00055 _length(0)
00056 {
00057 }
00058
00060 inline
00061 bool
00062 operator<(const metaphone_key& x) const
00063 {
00064 if (!_length)
00065 return x._length != 0;
00066 else if (!x._length)
00067 return false;
00068 else
00069 {
00070 ub4_t min_len = __min(_length, x._length);
00071 int res = memcmp(_array, x._array, min_len);
00072 return res != 0 ? res < 0 : _length < x._length;
00073 }
00074 }
00075
00076 ub1_t* _array;
00077 ub4_t _length;
00078 };
00079
00082 class metaphone_entry
00083 {
00084 public:
00086 metaphone_entry(size_t mpk
00087 ) :
00088 _mpk(mpk),
00089 _ref(1)
00090 {
00091 }
00092
00093
00094 size_t _mpk;
00095 size_t _ref;
00096 };
00097
00100 class reflection_key
00101 {
00102 public:
00105 enum reflection_size
00106 {
00107 REFSIZE = 17,
00108 REFMAX = 0xFF
00109 };
00110
00113 typedef ub1_t reflection_bytes_t[REFSIZE];
00114
00115 public:
00117 reflection_key()
00118 {
00119 memset(_array, 0, REFSIZE * sizeof(ub1_t));
00120 }
00121
00123 inline
00124 bool
00125 operator<(const reflection_key& x) const
00126 {
00127 return memcmp(_array, x._array, REFSIZE * sizeof(ub1_t)) < 0;
00128 }
00129
00130 reflection_bytes_t _array;
00131 };
00132
00135 class reflection_entry
00136 {
00137 public:
00139 reflection_entry(size_t rpk) :
00140 _rpk(rpk),
00141 _ref(1)
00142 {
00143 }
00144
00145
00146 size_t _rpk;
00147 size_t _ref;
00148 };
00149
00150
00153 typedef map< metaphone_key, metaphone_entry > metaphone_entry_t;
00156 typedef metaphone_entry_t::iterator metaphone_entry_iter_t;
00159 typedef metaphone_entry_t::const_iterator metaphone_entry_citer_t;
00160
00163 typedef map< reflection_key, reflection_entry > reflection_entry_t;
00166 typedef reflection_entry_t::iterator reflection_entry_iter_t;
00169 typedef reflection_entry_t::const_iterator reflection_entry_citer_t;
00170
00173 class word_key
00174 {
00175 public:
00177 word_key( const char* str,
00178 size_t len
00179 ) :
00180 _str(str),
00181 _len(len)
00182 {
00183 }
00184
00186 inline
00187 bool
00188 operator<(const word_key& x) const
00189 {
00190 if (!_str)
00191 return x._str != 0;
00192 else if (!x._str)
00193 return false;
00194 else
00195 {
00196 size_t min_len = __min(_len, x._len);
00197 int res = str_template::strnocasecmp(_str, x._str, min_len);
00198 return res != 0 ? res < 0 : _len < x._len;
00199 }
00200 }
00201
00202 const char* _str;
00203 size_t _len;
00204 };
00205
00208 class word_entry
00209 {
00210 public:
00212 word_entry( size_t vpk,
00213 metaphone_entry_iter_t miter
00214 ) :
00215 _vpk(vpk),
00216 _miter(miter),
00217 _ref(1)
00218 {
00219 }
00220
00221 size_t _vpk;
00222 metaphone_entry_iter_t _miter;
00223 size_t _ref;
00224 };
00225
00228 typedef map< word_key, word_entry > word_entry_t;
00231 typedef word_entry_t::iterator word_entry_iter_t;
00232
00235 typedef map< size_t, word_entry_iter_t > vpk_word_entry_iter_t;
00236
00239 typedef map< size_t, word_entry_iter_t, less< size_t >, true > mpk_word_entry_iter_t;
00240
00243 class ngram_key
00244 {
00245 public:
00247 inline
00248 bool
00249 operator<(const ngram_key& x) const
00250 {
00251 if (!_length)
00252 return x._length != 0;
00253 else if (!x._length)
00254 return false;
00255 else
00256 {
00257 ub4_t min_len = (ub4_t)__min(_length, x._length);
00258 int res = memcmp(_array, x._array, min_len * sizeof(size_t));
00259 return res != 0 ? res < 0 : _length < x._length;
00260 }
00261 }
00262
00264 inline
00265 bool
00266 operator==(const ngram_key& x) const
00267 {
00268 if (_length != x._length)
00269 return false;
00270 else
00271 return memcmp(_array, x._array, _length * sizeof(size_t)) == 0;
00272 }
00273
00274 size_t* _array;
00275 size_t _length;
00276 };
00277
00280 class ngram_entry
00281 {
00282 public:
00284 ngram_entry( size_t npk,
00285 reflection_entry_iter_t riter
00286 ) :
00287 _npk(npk),
00288 _riter(riter),
00289 _ref(1)
00290 {
00291 }
00292
00293 size_t _npk;
00294 reflection_entry_iter_t _riter;
00295 ngram_key _origin;
00296 size_t _ref;
00297 };
00298
00301 typedef map< ngram_key, ngram_entry > ngram_entry_t;
00304 typedef ngram_entry_t::iterator ngram_entry_iter_t;
00307 typedef map< size_t, ngram_entry_iter_t > npk_ngram_entry_iter_t;
00310 typedef map< size_t, ngram_entry_iter_t, less< size_t >, true > rpk_ngram_entry_iter_t;
00311
00314 class ngram_suggestions_info
00315 {
00316 public:
00318 ngram_suggestions_info(size_t vpk,
00319 size_t popularity,
00320 double score,
00321 size_t penalty
00322 ) :
00323 _vpk(vpk),
00324 _popularity(popularity),
00325 _score(score),
00326 _penalty(penalty)
00327 {
00328 }
00329
00330 size_t _vpk;
00331 size_t _popularity;
00332 double _score;
00333 size_t _penalty;
00334 };
00335
00338 typedef _map< double, ngram_suggestions_info > sorted_ngram_suggestions_t;
00341 typedef sorted_ngram_suggestions_t::const_iterator sorted_ngram_suggestions_citer_t;
00342
00345 class ngram_suggestions_iter_info
00346 {
00347 public:
00349 ngram_suggestions_iter_info(sorted_ngram_suggestions_citer_t begin,
00350 sorted_ngram_suggestions_citer_t end
00351 ) :
00352 _begin(begin),
00353 _current(begin),
00354 _end(end)
00355 {
00356 }
00357
00358 sorted_ngram_suggestions_citer_t _begin;
00359 sorted_ngram_suggestions_citer_t _current;
00360 sorted_ngram_suggestions_citer_t _end;
00361 };
00362
00365 typedef _list< ngram_suggestions_iter_info > ngram_suggestions_iter_list_t;
00366
00369 class string_desc
00370 {
00371 public:
00372 size_t _vpk;
00373 double _score;
00374 size_t _popularity;
00375 const char* _str;
00376 size_t _len;
00377 };
00378
00381 class fuzzy_matcher_impl : public fuzzy_matcher
00382 {
00385 class ngram_key_offset
00386 {
00387 public:
00389 ngram_key_offset(size_t offset,
00390 ngram_entry_iter_t iter
00391 ) :
00392 _offset(offset),
00393 _iter(iter),
00394 _pointer(0)
00395 {
00396 }
00397
00399 ngram_key_offset(size_t offset,
00400 const ngram_key* pointer
00401 ) :
00402 _offset(offset),
00403 _pointer(pointer)
00404 {
00405 }
00406
00408 inline
00409 bool
00410 operator<(const ngram_key_offset& x) const
00411 {
00412 if (_offset != x._offset)
00413 return _offset < x._offset;
00414 else
00415 {
00416 const ngram_key* this_ptr = 0;
00417 const ngram_key* x_ptr = 0;
00418 if (x._pointer)
00419 {
00420 assert(_pointer == 0);
00421 this_ptr = &_iter.key();
00422 x_ptr = x._pointer;
00423 }
00424 else if (_pointer)
00425 {
00426 assert(x._pointer == 0);
00427 this_ptr = _pointer;
00428 x_ptr = &x._iter.key();
00429 }
00430 else
00431 {
00432 assert(_pointer == 0);
00433 assert(x._pointer == 0);
00434 this_ptr = &_iter.key();
00435 x_ptr = &x._iter.key();
00436 }
00437
00438 ub4_t min_len = (ub4_t)__min(this_ptr->_length, x_ptr->_length);
00439 assert(min_len > _offset);
00440 int res = memcmp(this_ptr->_array + _offset, x_ptr->_array + _offset, (min_len - _offset) * sizeof(size_t));
00441 return res != 0 ? res < 0 : this_ptr->_length < x_ptr->_length;
00442 }
00443 }
00444
00445 size_t _offset;
00446 ngram_entry_iter_t _iter;
00447 const ngram_key* _pointer;
00448 };
00449
00452 typedef map< ngram_key_offset, bool, less< ngram_key_offset >, true > ngram_key_offset_multimap_t;
00453
00456 class candidate_info
00457 {
00458 public:
00461 candidate_info(double total_score,
00462 double intersection_score,
00463 size_t population,
00464 size_t word_penalty
00465 ) :
00466 _total_score(total_score),
00467 _intersection_score(intersection_score),
00468 _population(population),
00469 _word_penalty(word_penalty)
00470 {
00471 }
00472
00473 double _total_score;
00474 double _intersection_score;
00475 size_t _population;
00476 size_t _word_penalty;
00477 };
00478
00481 typedef _map< ngram_key, candidate_info > candidates_container_t;
00484 typedef candidates_container_t::const_iterator candidates_container_citer_t;
00487 typedef _vector< candidates_container_citer_t > vector_container_citer_t;
00488
00491 class candidate_sorter
00492 {
00495 class candidate_iter_less
00496 {
00497 public:
00499 bool
00500 operator()(const candidates_container_citer_t& x, const candidates_container_citer_t& y) const
00501 {
00502 return x->_total_score < y->_total_score;
00503 }
00504 };
00505
00506
00507 public:
00509 candidate_sorter(const candidates_container_t& container,
00510 vector_container_citer_t& vec,
00511 byte_allocator& tmp
00512 )
00513 {
00514 size_t len = container.size();
00515 if (len > 1)
00516 {
00517 vec.resize(tmp, len);
00518 size_t index = 0;
00519 for (candidates_container_citer_t iter = container.begin(); iter != container.end(); ++iter, ++index)
00520 {
00521 vec[index] = iter;
00522 }
00523
00524 candidate_iter_less pred;
00525 std::sort(vec.begin(), vec.end(), pred);
00526
00527 }
00528 else if (len > 0)
00529 {
00530 vec.resize(tmp, 1);
00531 vec[0] = container.begin();
00532 }
00533 }
00534 };
00535
00536 public:
00538 fuzzy_matcher_impl(size_t memory_usage
00539 );
00540
00542 virtual
00543 ~fuzzy_matcher_impl();
00544
00545
00546
00550 virtual
00551 size_t
00552 add( const char* phrase,
00553 byte_allocator& all
00554 );
00555
00558 virtual
00559 bool
00560 remove( const char* phrase,
00561 byte_allocator& all
00562 );
00565 virtual
00566 bool
00567 remove( size_t ident,
00568 byte_allocator& all
00569 );
00571 virtual
00572 bool
00573 match( ngram_quality nq,
00574 phonetic_quality pq,
00575 const char* phrase,
00576 byte_allocator& all,
00577 byte_allocator& tmp,
00578 _list< const char* >& suggestions
00579 ) const;
00580
00582 virtual
00583 bool match( ngram_quality nq,
00584 phonetic_quality fq,
00585 const char* phrase,
00586 byte_allocator& all,
00587 byte_allocator& tmp,
00588 _list< size_t >& suggestions
00589 ) const;
00590
00592 virtual
00593 void
00594 reset();
00595
00596
00597 private:
00599 bool
00600 _match ( ngram_quality nq,
00601 phonetic_quality fq,
00602 const char* phrase,
00603 byte_allocator& all,
00604 byte_allocator& tmp,
00605 candidates_container_t& candidates
00606 ) const;
00607
00609 void
00610 partial_intersect(ngram_quality nq,
00611 phonetic_quality fq,
00612 const ngram_key& key,
00613 const ngram_key& key_origin,
00614 double score,
00615 size_t popularity,
00616 size_t word_penalty,
00617 byte_allocator& all,
00618 byte_allocator& tmp,
00619 candidates_container_t& candidate
00620 ) const;
00621
00623 word_key
00624 reconstruct_string(const ngram_key& key,
00625 byte_allocator& tmp,
00626 bool preserve_case
00627 ) const;
00628
00630 static
00631 word_key
00632 make_string_lower(const char* str,
00633 size_t len,
00634 byte_allocator& tmp
00635 );
00637 static
00638 word_key
00639 make_string_lower(const char* str,
00640 size_t len,
00641 char* buf
00642 );
00644 static
00645 double
00646 calculate_nkey_score(const ngram_key& key1,
00647 const ngram_key& key2
00648 );
00650 static
00651 double
00652 calculate_score(double phonetic_score,
00653 double intersection_score,
00654 size_t popularity,
00655 size_t penalty
00656 );
00658 static
00659 ub4_t
00660 find_and_remove(const metaphone_key& key,
00661 ub1_t* array,
00662 ub4_t& len
00663 );
00664 private:
00665
00666 byte_repository_factory _memory_factory;
00667 tokenizer _tokenizer;
00668 unique_key_generator _vocabulary_pk_generator;
00669 unique_key_generator _ngram_pk_generator;
00670 unique_key_generator _metaphone_pk_generator;
00671 unique_key_generator _reflection_pk_generator;
00672 word_entry_t _word_vocabulary;
00673 vpk_word_entry_iter_t _vpk_word_vocabulary;
00674 metaphone_entry_t _metaphone_vocabulary;
00675 mpk_word_entry_iter_t _mpk_word_vocabulary;
00676 ngram_entry_t _ngram_vocabulary;
00677 npk_ngram_entry_iter_t _npk_ngram_vocabulary;
00678 reflection_entry_t _reflection_vocabulary;
00679 rpk_ngram_entry_iter_t _rpk_ngram_vocabulary;
00680 ngram_key_offset_multimap_t _ngram_partial_map;
00681 };
00682
00683 #pragma pack()
00684 END_TERIMBER_NAMESPACE
00685
00686 #endif //_terimber_fuzzyimpl_h_