#include "fuzzy/fuzzyimpl.h"
#include "base/memory.hpp"
#include "base/list.hpp"
#include "base/common.hpp"
#include "base/map.hpp"
#include "base/vector.hpp"
#include "base/stack.hpp"
#include "base/string.hpp"
#include "base/date.h"

#include "smart/byterep.hpp"

#include "fuzzy/fuzzyphonetic.hpp"

// factory entry point: creates the concrete matcher implementation
fuzzy_matcher*
fuzzy_matcher_factory::get_fuzzy_matcher(size_t memory_usage)
{
	return new TERIMBER::fuzzy_matcher_impl(memory_usage);
}
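
// Illustrative usage sketch (kept as a comment, not part of the library; it assumes the
// abstract fuzzy_matcher interface mirrors the implementation signatures in this file,
// that the nq_*/pq_* enumerators are visible to the caller, and that byte_allocator
// instances can be created directly for persistent ("all") and scratch ("tmp") memory):
//
//	fuzzy_matcher_factory factory;
//	fuzzy_matcher* matcher = factory.get_fuzzy_matcher(1024 * 1024);
//
//	byte_allocator all, tmp;
//	size_t npk = matcher->add("fuzzy phonetic matching", all);   // index a phrase, keep its pk
//
//	_list< const char* > suggestions;
//	matcher->match(nq_normal, pq_normal, "fuzy fonetic matchng", all, tmp, suggestions);
//
//	matcher->remove(npk, all);                                   // or remove by the original text
//	delete matcher;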

BEGIN_TERIMBER_NAMESPACE
#pragma pack(4)

// adapters that give both phonetic distance functions the uniform callable signature
// expected by the lookup_distance template below
inline
double wrapper_find_metaphone_distance(const metaphone_key& key, metaphone_entry_citer_t y, byte_allocator& tmp, size_t max_penalty)
{
	return (double)fuzzyphonetic::find_metaphone_distance(key, y.key(), tmp, max_penalty);
}


inline
double wrapper_find_reflection_distance(const reflection_key& key, reflection_entry_citer_t y, byte_allocator& tmp, size_t max_penalty)
{
	return (double)fuzzyphonetic::find_reflection_distance(key, y.key(), tmp, max_penalty);
}

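// lookup_distance enumerates entries of a sorted associative container that are close
// to a given key: stage 0 yields the exact equal_range() matches; when `extended` is
// set, stages 1 and 2 then walk outward from that range (backward, then forward) for
// as long as the neighbor's distance, computed by the supplied functor, stays within
// max_distance. next() returns container.end() once the enumeration is exhausted.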
template < class C, class P, class Func  >
class lookup_distance
{
public:
	inline
	lookup_distance(bool extended, size_t max_distance, const C& container, const P& key, Func f, byte_allocator& tmp) :
		_container(container),
		_key(key),
		_f(f),
		_tmp(tmp),
		_extended(extended),
		_max_distance(max_distance),
		_stage(0),
		_range(container.equal_range(key)),
		_save_range(_range)
	{
		// position the backward cursor just before the exact-match range
		if (_save_range.first != _container.end())
			--_save_range.first;
	}

	inline
	pair< TYPENAME C::const_iterator, double > next()
	{
		pair< TYPENAME C::const_iterator, double > ret(_container.end(), 0.0);

		// stage 0: exact matches returned by equal_range
		if (_stage == 0)
		{
			if (_range.first != _range.second)
			{
				ret.first = _range.first;
				++_range.first;
				return ret;
			}

			_stage = 1;
		}

		// stage 1: walk backward from the range while the distance stays acceptable
		if (_stage == 1 && _extended)
		{
			if (_save_range.first != _container.end())
			{
				_tmp.reset();
				ret.second = _f(_key, _save_range.first, _tmp, _max_distance + 1);
				if (ret.second <= _max_distance)
				{
					ret.first = _save_range.first;
					--_save_range.first;
					return ret;
				}
			}

			_stage = 2;
		}

		// stage 2: walk forward past the range while the distance stays acceptable
		if (_stage == 2 && _extended)
		{
			if (_save_range.second != _container.end())
			{
				_tmp.reset();
				ret.second = _f(_key, _save_range.second, _tmp, _max_distance + 1);
				if (ret.second <= _max_distance)
				{
					ret.first = _save_range.second;
					++_save_range.second;
					return ret;
				}
			}

			_stage = 3;
		}

		return ret;
	}


private:
	const C&		_container;
	const P&		_key;
	Func			_f;
	byte_allocator&	_tmp;
	bool			_extended;
	size_t			_max_distance;

	size_t			_stage;
	typename C::paircc_t	_range;
	typename C::paircc_t	_save_range;
};
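
// Typical driving loop (the identifiers here are placeholders; the same pattern is used
// in _match() below):
//
//	lookup_distance< metaphone_entry_t, metaphone_key,
//		double (*)(const metaphone_key&, metaphone_entry_citer_t, byte_allocator&, size_t) >
//		lookup(true, max_distance, vocabulary, key, wrapper_find_metaphone_distance, tmp);
//
//	pair< metaphone_entry_t::const_iterator, double > item;
//	while ((item = lookup.next()).first != vocabulary.end())
//	{
//		// item.first is the candidate entry; item.second is its distance (0.0 for exact matches)
//	}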

fuzzy_matcher_impl::fuzzy_matcher_impl(size_t memory_usage)
{
	assert(_memory_factory.is_ready());
}


fuzzy_matcher_impl::~fuzzy_matcher_impl()
{
}


// drops all indexed phrases: clears the pk generators, the tokenizer state,
// all vocabularies and secondary indexes, and resets the backing memory factory
void
fuzzy_matcher_impl::reset()
{
	_vocabulary_pk_generator.clear();
	_ngram_pk_generator.clear();
	_metaphone_pk_generator.clear();
	_reflection_pk_generator.clear();

	_tokenizer.clear_regex();
	_tokenizer.clear_abbr();

	_word_vocabulary.clear();
	_vpk_word_vocabulary.clear();
	_metaphone_vocabulary.clear();
	_mpk_word_vocabulary.clear();
	_ngram_vocabulary.clear();
	_npk_ngram_vocabulary.clear();
	_reflection_vocabulary.clear();
	_rpk_ngram_vocabulary.clear();
	_ngram_partial_map.clear();

	_memory_factory->reset();
}


// indexes a phrase and returns its ngram pk (0 for an empty/null phrase)
size_t
fuzzy_matcher_impl::add(const char* phrase, byte_allocator& all)
{
	size_t npk = 0;
	if (!phrase)
		return npk;

	tokenizer_output_sequence_t tokens;
	_tokenizer.tokenize(phrase, tokens, all, 0);

	// collect vocabulary pks for the usable tokens of the phrase
	_list< size_t > idents;
	size_t offset = 0;
	size_t count = 0;

	for (tokenizer_output_sequence_t::const_iterator it = tokens.begin(); count < MAX_PHRASE_TOKENS && it != tokens.end(); offset += it->_len, ++it)
	{
		switch (it->_type)
		{
			case TT_ALPHABETIC:
			case TT_DIGIT:
				break;
			default:
				continue;
		}

		++count;

		word_key wkey(phrase + offset, __min(it->_len, MAX_TOKEN_LENGTH));

		// is the word already in the vocabulary?
		word_entry_t::iterator it_word = _word_vocabulary.find(wkey);
		if (it_word == _word_vocabulary.end())
		{
			// new word: compute and register its metaphone key
			metaphone_key mkey = fuzzyphonetic::convert_to_metaphone(wkey._str, wkey._len, all);
			metaphone_entry_t::iterator miter = _metaphone_vocabulary.find(mkey);

			if (miter != _metaphone_vocabulary.end())
			{
				++miter->_ref;
			}
			else
			{
				// new metaphone key: copy it into persistent memory and insert
				size_t mpk = _metaphone_pk_generator.generate();

				metaphone_key mkey_new;
				mkey_new._array = (ub1_t*)_memory_factory->allocate(mkey_new._length = mkey._length);
				memcpy(mkey_new._array, mkey._array, mkey_new._length);

				metaphone_entry mentry(mpk);
				miter = _metaphone_vocabulary.insert(mkey_new, mentry).first;
			}

			// copy the word text into persistent memory
			char* wptr = _memory_factory->allocate(wkey._len + 1);
			memcpy(wptr, wkey._str, wkey._len);
			wptr[wkey._len] = 0;

			// assign a vocabulary pk and insert the word entry
			size_t vpk = _vocabulary_pk_generator.generate();

			word_key wkey_new(wptr, wkey._len);
			word_entry wentry(vpk, miter);

			it_word = _word_vocabulary.insert(wkey_new, wentry).first;

			// secondary indexes: vpk -> word, metaphone pk -> word
			_vpk_word_vocabulary.insert(it_word->_vpk, it_word);

			_mpk_word_vocabulary.insert(miter->_mpk, it_word);
		}
		else
		{
			// known word: just bump the reference count
			++it_word->_ref;
		}

		idents.push_back(all, it_word->_vpk);
	}

	// build the sorted ngram key plus a copy that preserves the original word order
	size_t ngram_length = idents.size();

	if (ngram_length)
	{
		ngram_key nkey, okey;
		okey._length = nkey._length = ngram_length;
		size_t index = 0;
		nkey._array = (size_t*)all.allocate(ngram_length * sizeof(size_t));
		okey._array = (size_t*)all.allocate(ngram_length * sizeof(size_t));
		for (_list< size_t >::const_iterator it_ident = idents.begin(); it_ident != idents.end(); ++it_ident, ++index)
			okey._array[index] = nkey._array[index] = *it_ident;

		// the stored ngram key is order-independent
		std::sort(nkey._array, nkey._array + nkey._length);

		ngram_entry_t::iterator it_ngram = _ngram_vocabulary.find(nkey);

		if (it_ngram == _ngram_vocabulary.end())
		{
			// new ngram: register the reflection key of the whole phrase
			size_t phrase_len = strlen(phrase);
			reflection_key rkey;
			fuzzyphonetic::convert_to_reflection(phrase, phrase_len, all, rkey);

			reflection_entry_t::iterator riter = _reflection_vocabulary.find(rkey);
			if (riter != _reflection_vocabulary.end())
			{
				++riter->_ref;
			}
			else
			{
				size_t rpk = _reflection_pk_generator.generate();
				reflection_entry rentry(rpk);
				riter = _reflection_vocabulary.insert(rkey, rentry).first;
			}

			npk = _ngram_pk_generator.generate();
			ngram_entry nentry(npk, riter);

			// copy both key variants into persistent memory
			size_t* nptr = (size_t*)_memory_factory->allocate(ngram_length * sizeof(size_t));
			memcpy(nptr, nkey._array, nkey._length * sizeof(size_t));
			size_t* optr = (size_t*)_memory_factory->allocate(ngram_length * sizeof(size_t));
			memcpy(optr, okey._array, okey._length * sizeof(size_t));

			nkey._array = nptr;
			nentry._origin._array = optr;
			nentry._origin._length = okey._length;

			it_ngram = _ngram_vocabulary.insert(nkey, nentry).first;

			// secondary indexes: ngram pk -> ngram, reflection pk -> ngram
			_npk_ngram_vocabulary.insert(it_ngram->_npk, it_ngram);

			_rpk_ngram_vocabulary.insert(riter->_rpk, it_ngram);

			// register partial keys so multi-token ngrams can be found by any token position
			if (ngram_length > 1)
			{
				for (size_t offset = 1; offset < ngram_length; ++offset)
				{
					ngram_key_offset pkey(offset, it_ngram);
					_ngram_partial_map.insert(pkey, true);
				}
			}
		}
		else
		{
			// known ngram: reuse its pk and bump the reference count
			npk = it_ngram->_npk;
			++it_ngram->_ref;
		}
	}

	return npk;
}
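
// Index structures maintained by add()/remove() (summary of the members used above):
//   _word_vocabulary        word text            -> word_entry (vpk, ref count, metaphone iterator)
//   _vpk_word_vocabulary    vocabulary pk (vpk)  -> word iterator
//   _metaphone_vocabulary   metaphone key        -> metaphone_entry (mpk, ref count)
//   _mpk_word_vocabulary    metaphone pk (mpk)   -> word iterator(s)
//   _ngram_vocabulary       sorted vpk ngram key -> ngram_entry (npk, ref count, origin key, reflection iterator)
//   _npk_ngram_vocabulary   ngram pk (npk)       -> ngram iterator
//   _reflection_vocabulary  reflection key       -> reflection_entry (rpk, ref count)
//   _rpk_ngram_vocabulary   reflection pk (rpk)  -> ngram iterator(s)
//   _ngram_partial_map      (offset, ngram iter) -> flag, so multi-token ngrams are reachable by any token position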



// removes a phrase previously registered with add(); reference counts keep shared
// words and ngrams alive until their last phrase is removed
bool
fuzzy_matcher_impl::remove(const char* phrase, byte_allocator& all)
{
	if (!phrase)
		return false;

	tokenizer_output_sequence_t tokens;
	_tokenizer.tokenize(phrase, tokens, all, 0);

	// collect vocabulary pks for the usable tokens of the phrase
	_list< size_t > idents;
	size_t offset = 0;
	size_t count = 0;

	for (tokenizer_output_sequence_t::const_iterator it = tokens.begin(); count < MAX_PHRASE_TOKENS && it != tokens.end(); offset += it->_len, ++it)
	{
		switch (it->_type)
		{
			case TT_ALPHABETIC:
			case TT_DIGIT:
				break;
			default:
				continue;
		}

		++count;

		word_key wkey(phrase + offset, __min(it->_len, MAX_TOKEN_LENGTH));
		word_entry_t::iterator it_word = _word_vocabulary.find(wkey);

		if (it_word != _word_vocabulary.end())
		{
			// remember the vpk before the entry can be erased below
			idents.push_back(all, it_word->_vpk);

			if (it_word->_ref == 1)
			{
				// drop the metaphone pk -> word link
				size_t mpk = it_word->_miter->_mpk;
				mpk_word_entry_iter_t::iterator iter_metaphone = _mpk_word_vocabulary.lower_bound(mpk);

				while (iter_metaphone != _mpk_word_vocabulary.end() && mpk == iter_metaphone.key())
				{
					if (*iter_metaphone == it_word)
					{
						_mpk_word_vocabulary.erase(iter_metaphone);
						break;
					}

					++iter_metaphone;
				}

				// release the word text and the vpk
				_memory_factory->deallocate((char*)it_word.key()._str);

				_vpk_word_vocabulary.erase(it_word->_vpk);

				_vocabulary_pk_generator.save(it_word->_vpk);

				// release the metaphone entry if this was its last word
				if (it_word->_miter->_ref == 1)
					_metaphone_vocabulary.erase(it_word->_miter);
				else
					--it_word->_miter->_ref;

				_word_vocabulary.erase(it_word);
			}
			else
				--it_word->_ref;
		}
	}

	// rebuild the sorted ngram key and remove the ngram if this was its last reference
	size_t ngram_length = idents.size();

	if (ngram_length)
	{
		ngram_key nkey;
		nkey._length = ngram_length;
		size_t index = 0;
		nkey._array = (size_t*)all.allocate(ngram_length * sizeof(size_t));
		for (_list< size_t >::const_iterator it_ident = idents.begin(); it_ident != idents.end(); ++it_ident, ++index)
			nkey._array[index] = *it_ident;

		std::sort(nkey._array, nkey._array + nkey._length);

		ngram_entry_t::iterator it_ngram = _ngram_vocabulary.find(nkey);

		if (it_ngram != _ngram_vocabulary.end())
		{
			if (it_ngram->_ref == 1)
			{
				// drop the reflection pk -> ngram link
				size_t rpk = it_ngram->_riter->_rpk;
				rpk_ngram_entry_iter_t::iterator iter_reflection = _rpk_ngram_vocabulary.lower_bound(rpk);

				while (iter_reflection != _rpk_ngram_vocabulary.end() && rpk == iter_reflection.key())
				{
					if (*iter_reflection == it_ngram)
					{
						_rpk_ngram_vocabulary.erase(iter_reflection);
						break;
					}

					++iter_reflection;
				}


				// drop the partial keys registered for multi-token ngrams
				if (ngram_length > 1)
				{
					for (size_t offset = 1; offset < ngram_length; ++offset)
					{
						ngram_key_offset pkey(offset, it_ngram);
						ngram_key_offset_multimap_t::iterator iter_lower = _ngram_partial_map.lower_bound(pkey);
						ngram_key_offset_multimap_t::iterator iter_upper = _ngram_partial_map.upper_bound(pkey);

						while (iter_lower != iter_upper)
						{
							if (iter_lower.key()._iter == it_ngram)
								iter_lower = _ngram_partial_map.erase(iter_lower);
							else
								++iter_lower;
						}
					}
				}

				// release both key copies
				_memory_factory->deallocate((char*)it_ngram->_origin._array);
				_memory_factory->deallocate((char*)it_ngram.key()._array);

				// release the npk
				_npk_ngram_vocabulary.erase(it_ngram->_npk);
				_ngram_pk_generator.save(it_ngram->_npk);

				// release the reflection entry if this was its last ngram
				if (it_ngram->_riter->_ref == 1)
					_reflection_vocabulary.erase(it_ngram->_riter);
				else
					--it_ngram->_riter->_ref;

				_ngram_vocabulary.erase(it_ngram);
			}
			else
				--it_ngram->_ref;
		}
	}

	return true;
}



// removes a phrase by its ngram pk (the value returned by add())
bool
fuzzy_matcher_impl::remove(size_t ident, byte_allocator& all)
{
	npk_ngram_entry_iter_t::iterator it_ident = _npk_ngram_vocabulary.find(ident);

	if (it_ident == _npk_ngram_vocabulary.end())
		return false;

	ngram_entry_iter_t it_ngram = *it_ident;
	size_t ngram_length = it_ngram.key()._length;

	if (it_ngram->_ref == 1)
	{
		// drop the reflection pk -> ngram link
		size_t rpk = it_ngram->_riter->_rpk;
		rpk_ngram_entry_iter_t::iterator iter_reflection = _rpk_ngram_vocabulary.lower_bound(rpk);

		while (iter_reflection != _rpk_ngram_vocabulary.end() && rpk == iter_reflection.key())
		{
			if (*iter_reflection == it_ngram)
			{
				_rpk_ngram_vocabulary.erase(iter_reflection);
				break;
			}

			++iter_reflection;
		}


		// drop the partial keys registered for multi-token ngrams
		if (ngram_length > 1)
		{
			for (size_t offset = 1; offset < ngram_length; ++offset)
			{
				ngram_key_offset pkey(offset, it_ngram);
				ngram_key_offset_multimap_t::iterator iter_lower = _ngram_partial_map.lower_bound(pkey);
				ngram_key_offset_multimap_t::iterator iter_upper = _ngram_partial_map.upper_bound(pkey);

				while (iter_lower != iter_upper)
				{
					if (iter_lower.key()._iter == it_ngram)
						iter_lower = _ngram_partial_map.erase(iter_lower);
					else
						++iter_lower;
				}
			}
		}

		// release both key copies
		_memory_factory->deallocate((char*)it_ngram->_origin._array);
		_memory_factory->deallocate((char*)it_ngram.key()._array);

		// release the npk
		_npk_ngram_vocabulary.erase(it_ngram->_npk);
		_ngram_pk_generator.save(it_ngram->_npk);

		// release the reflection entry if this was its last ngram
		if (it_ngram->_riter->_ref == 1)
			_reflection_vocabulary.erase(it_ngram->_riter);
		else
			--it_ngram->_riter->_ref;

		_ngram_vocabulary.erase(it_ngram);
	}
	else
		--it_ngram->_ref;

	return true;
}



// collects candidate ngrams, sorts them by score, and returns the reconstructed phrase texts
bool
fuzzy_matcher_impl::match(ngram_quality nq,
					phonetic_quality fq,
					const char* phrase,
					byte_allocator& all,
					byte_allocator& tmp,
					_list< const char* >& suggestions) const
{
	candidates_container_t candidates;

	if (!_match(nq, fq, phrase, all, tmp, candidates))
		return false;

	// order the candidates by score
	tmp.reset();
	vector_container_citer_t vec;
	candidate_sorter sorter(candidates, vec, tmp);

	for (vector_container_citer_t::const_iterator iter_candidate = vec.begin(); iter_candidate != vec.end(); ++iter_candidate)
	{
		// restore the original (unsorted) word order and rebuild the phrase text
		const ngram_key& ckey = (*iter_candidate).key();

		ngram_entry_t::const_iterator iter_ngram = _ngram_vocabulary.find(ckey);
		assert(iter_ngram != _ngram_vocabulary.end());

		const ngram_key& key_origin = iter_ngram->_origin;

		word_key wkey = reconstruct_string(key_origin, all, true);
		suggestions.push_back(all, wkey._str);
	}


	return true;
}
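
// The overload below runs the same search but returns the ngram pks produced by add()
// instead of reconstructed phrase text.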


bool
fuzzy_matcher_impl::match(ngram_quality nq,
					phonetic_quality fq,
					const char* phrase,
					byte_allocator& all,
					byte_allocator& tmp,
					_list< size_t >& suggestions) const
{
	candidates_container_t candidates;

	if (!_match(nq, fq, phrase, all, tmp, candidates))
		return false;

	tmp.reset();
	vector_container_citer_t vec;
	candidate_sorter sorter(candidates, vec, tmp);

	for (vector_container_citer_t::const_iterator iter_candidate = vec.begin(); iter_candidate != vec.end(); ++iter_candidate)
	{
		const ngram_key& ckey = (*iter_candidate).key();

		ngram_entry_t::const_iterator iter_pk = _ngram_vocabulary.find(ckey);
		assert(iter_pk != _ngram_vocabulary.end());

		suggestions.push_back(all, iter_pk->_npk);
	}

	return true;
}

// core matching routine: fills `candidates` with ngram keys that plausibly match the
// input phrase, using three passes:
//   1. whole-phrase metaphone lookup against single words,
//   2. per-token metaphone lookups combined into multi-token candidate ngrams,
//   3. whole-phrase reflection lookup against stored ngrams.
bool
fuzzy_matcher_impl::_match(ngram_quality nq,
					phonetic_quality fq,
					const char* phrase,
					byte_allocator& all,
					byte_allocator& tmp,
					candidates_container_t& candidates) const
{
	if (!phrase)
		return false;

	// quality settings translate into distance budgets
	size_t max_phonet_distance = (fq == pq_high ? 0 : (fq == pq_normal ? 1 : 2));
	size_t max_word_distance = (fq == pq_high ? 1 : (fq == pq_normal ? 2 : 4));

	// tokenize the input phrase
	tokenizer_output_sequence_t tokens;
	_tokenizer.tokenize(phrase, tokens, tmp, 0);

	size_t phrase_length = strlen(phrase);

	_list< string_desc > words;
	_list< metaphone_key > metaphones;

	// metaphone key of the whole phrase
	metaphone_key phrase_key = fuzzyphonetic::convert_to_metaphone(phrase, phrase_length, all);

	// reflection key of the whole phrase
	reflection_key phrase_reflection;
	fuzzyphonetic::convert_to_reflection(phrase, phrase_length, all, phrase_reflection);

	size_t offset = 0;
	size_t count = 0;

	// lower-case each usable token and compute its metaphone key
	for (tokenizer_output_sequence_t::const_iterator it = tokens.begin(); count < MAX_PHRASE_TOKENS && it != tokens.end(); offset += it->_len, ++it)
	{
		switch (it->_type)
		{
			case TT_ALPHABETIC:
			case TT_DIGIT:
				break;
			default:
				continue;
		}

		++count;

		size_t len = __min(it->_len, MAX_TOKEN_LENGTH);
		char* ptr = (char*)all.allocate(len + 1);
		for (size_t i = 0; i < len; ++i)
			ptr[i] = fuzzyphonetic::to_lower(*(phrase + offset + i));

		ptr[len] = 0;

		string_desc desc;
		desc._vpk = 0;
		desc._str = ptr;
		desc._len = len;

		words.push_back(all, desc);

		metaphone_key key = fuzzyphonetic::convert_to_metaphone(ptr, len, all);
		metaphones.push_back(all, key);
	}

	// nothing usable in the phrase
	if (count == 0)
		return false;

	word_key phraselow = make_string_lower(phrase, phrase_length, all);

	// pass 1: match the whole phrase against single-word metaphone keys
	lookup_distance< metaphone_entry_t, metaphone_key, double (*)(const metaphone_key&, metaphone_entry_citer_t, byte_allocator&, size_t) >
		lookup_metaphone(true, max_phonet_distance, _metaphone_vocabulary, phrase_key, wrapper_find_metaphone_distance, tmp);

	pair< metaphone_entry_t::const_iterator, double > iter_metaphone;
	while ((iter_metaphone = lookup_metaphone.next()).first != _metaphone_vocabulary.end())
	{
		size_t mpk = iter_metaphone.first->_mpk;

		double score = iter_metaphone.second;

		mpk_word_entry_iter_t::const_iterator iter_word = _mpk_word_vocabulary.lower_bound(mpk);

		while (iter_word != _mpk_word_vocabulary.end()
			&& iter_word.key() == mpk
			)
		{
			ngram_key nkey;
			size_t vpk = (*iter_word)->_vpk;
			nkey._array = &vpk;
			nkey._length = 1;

			size_t popularity = (*iter_word)->_ref;

			word_key wkey = make_string_lower((*iter_word).key()._str, (*iter_word).key()._len, tmp);

			size_t word_penalty = fuzzyphonetic::find_word_distance(phraselow._str, phraselow._len, wkey._str, wkey._len, tmp, max_phonet_distance + 1);

			if (word_penalty <= max_word_distance)
				partial_intersect(nq, fq, nkey, nkey, score, popularity, word_penalty + 3*(count - 1), all, tmp, candidates);
			++iter_word;
		}
	}


	// pass 2: for multi-token phrases, look up each token separately and combine the
	// best per-token suggestions into candidate ngrams
	if (count > 1)
	{
		_list< string_desc >::const_iterator wit = words.begin();
		_list< metaphone_key >::const_iterator mit = metaphones.begin();
		_list< sorted_ngram_suggestions_t > score_sugs;
		size_t items = 0;

		for (; wit != words.end(); ++wit, ++mit)
		{
			sorted_ngram_suggestions_t score_dummy;
			sorted_ngram_suggestions_t& score_ref = *score_sugs.push_back(all, score_dummy);

			// at high ngram quality only exact metaphone matches are considered
			lookup_distance< metaphone_entry_t, metaphone_key, double (*)(const metaphone_key&, metaphone_entry_citer_t, byte_allocator&, size_t) >
				lookup_metaphone(nq == nq_high ? false : true, max_phonet_distance, _metaphone_vocabulary, *mit, wrapper_find_metaphone_distance, tmp);

			pair< metaphone_entry_t::const_iterator, double > iter_metaphone;
			while ((iter_metaphone = lookup_metaphone.next()).first != _metaphone_vocabulary.end())
			{
				size_t mpk = iter_metaphone.first->_mpk;
				double score = iter_metaphone.second;

				mpk_word_entry_iter_t::const_iterator iter_word = _mpk_word_vocabulary.lower_bound(mpk);

				while (iter_word != _mpk_word_vocabulary.end()
					&& iter_word.key() == mpk
					)
				{
					size_t vpk = (*iter_word)->_vpk;
					size_t popularity = (*iter_word)->_ref;

					word_key wkey = make_string_lower((*iter_word).key()._str, (*iter_word).key()._len, tmp);

					size_t word_penalty = fuzzyphonetic::find_word_distance(wit->_str, wit->_len, wkey._str, wkey._len, tmp, max_phonet_distance + 1);

					if (word_penalty <= max_word_distance)
					{
						ngram_suggestions_info score_p(vpk, popularity, score, word_penalty);
						score_ref.insert(all, calculate_score(score, 1.0, popularity, word_penalty), score_p);
					}

					++iter_word;
				}
			}

			if (!score_ref.empty())
				++items;
			else
			{
				score_sugs.pop_back();
			}
		}

		// enumerate the cross-product of per-token suggestions
		if (items)
		{
			ngram_suggestions_iter_list_t iter_list;

			_list< sorted_ngram_suggestions_t >::iterator iter_score = score_sugs.begin();

			// cap the suggestions per token so the cross-product stays near 100 combinations
			size_t av_per_token = (size_t)pow(100.0, 1./items);

			if (av_per_token == 0)
				av_per_token = 1;

			for (; iter_score != score_sugs.end(); ++iter_score)
			{
				while (iter_score->size() > av_per_token)
				{
					iter_score->erase(--(iter_score->end()));
				}

				ngram_suggestions_iter_info info(iter_score->begin(), iter_score->end());
				iter_list.push_back(all, info);
			}

			ngram_key scorekey, scorekey_origin;
			scorekey_origin._length = scorekey._length = items;
			scorekey._array = (size_t*)all.allocate(scorekey._length * sizeof(size_t));
			scorekey_origin._array = (size_t*)all.allocate(scorekey_origin._length * sizeof(size_t));

			ngram_suggestions_iter_list_t::iterator fiter = iter_list.begin();

			// odometer-style iteration over all combinations of per-token suggestions
			while (true)
			{
				double score_score = 0.0;
				size_t score_pop = 0;
				size_t score_penalty = 0;
				size_t index = 0;

				for (ngram_suggestions_iter_list_t::iterator liter = iter_list.begin(); liter != iter_list.end(); ++liter, ++index)
				{
					score_score += liter->_current->_score;
					score_pop += liter->_current->_popularity;
					score_penalty += liter->_current->_penalty;

					scorekey_origin._array[index] = scorekey._array[index] = liter->_current->_vpk;
				}

				std::sort(scorekey._array, scorekey._array + index);

				// look for stored ngrams that intersect the current combination
				partial_intersect(nq, fq, scorekey, scorekey_origin, score_score / items, score_pop / items, score_penalty / items, all, tmp, candidates);

				// advance to the next combination
				while (fiter != iter_list.end())
				{
					sorted_ngram_suggestions_citer_t citer = fiter->_current;

					if (citer != fiter->_end
						&& ++citer != fiter->_end)
					{
						++(fiter->_current);

						// reset all lower-order positions
						if (fiter != iter_list.begin())
						{
							ngram_suggestions_iter_list_t::iterator startiter = iter_list.begin();

							do
							{
								startiter->_current = startiter->_begin;
								++startiter;
							}
							while (startiter != fiter);

							fiter = iter_list.begin();
						}

						break;
					}

					++fiter;
				}

				if (fiter == iter_list.end())
					break;
			}
		}
	}

	// pass 3: match the whole phrase against stored reflection keys
	lookup_distance< reflection_entry_t, reflection_key, double (*)(const reflection_key&, reflection_entry_citer_t, byte_allocator&, size_t) >
		lookup_reflection(true, max_phonet_distance, _reflection_vocabulary, phrase_reflection, wrapper_find_reflection_distance, tmp);

	pair< reflection_entry_t::const_iterator, double > iter_reflection;
	while ((iter_reflection = lookup_reflection.next()).first != _reflection_vocabulary.end())
	{
		size_t rpk = iter_reflection.first->_rpk;
		double score = iter_reflection.second;

		rpk_ngram_entry_iter_t::const_iterator iter_ngram = _rpk_ngram_vocabulary.lower_bound(rpk);

		while (iter_ngram != _rpk_ngram_vocabulary.end()
			&& iter_ngram.key() == rpk
			)
		{
			const ngram_key& nkey_sorted = (*iter_ngram).key();

			// skip ngrams that are already candidates
			if (candidates.end() == candidates.find(nkey_sorted))
			{
				const ngram_key& nkey_origin = (*iter_ngram)->_origin;
				size_t popularity = (*iter_ngram)->_ref;

				// single-word ngram: compare the words directly
				if (nkey_origin._length == 1)
				{
					word_key reckey = reconstruct_string(nkey_origin, tmp, false);
					size_t word_penalty = fuzzyphonetic::find_word_distance(phraselow._str, phraselow._len, reckey._str, reckey._len, tmp, max_phonet_distance + 1);

					if (word_penalty <= max_word_distance)
					{
						candidate_info info(calculate_score(score, count - 1.0, popularity, word_penalty + 3*(count - 1)), score, popularity, word_penalty);
						candidates.insert(all, nkey_sorted, info);
					}
				}
				else
				{
					// multi-word ngram: consume each word's metaphone from the phrase key
					// and check whether each word appears as a substring of the phrase
					char word[MAX_TOKEN_LENGTH + 1]; // +1 for the terminator written by make_string_lower
					ub1_t array[MAX_TOKEN_LENGTH];
					ub4_t length = __min((ub4_t)MAX_TOKEN_LENGTH, phrase_key._length);
					memcpy(array, phrase_key._array, length);
					memset(array + length, 0xff, MAX_TOKEN_LENGTH - length);

					size_t word_penalty = 0;
					size_t phone_penalty = 0;

					for (size_t p = 0; p < nkey_origin._length; ++p)
					{
						vpk_word_entry_iter_t::const_iterator it_p = _vpk_word_vocabulary.find(nkey_origin._array[p]);
						assert(it_p != _vpk_word_vocabulary.end());

						const metaphone_key& r_key = (*it_p)->_miter.key();

						phone_penalty += r_key._length - find_and_remove(r_key, array, length);

						word_key wkey = make_string_lower((*it_p).key()._str, (*it_p).key()._len, word);
						word_penalty += ((strstr(phraselow._str, wkey._str) != 0) ? 0 : 3);
					}

					if (phone_penalty <= max_phonet_distance
						&& word_penalty <= max_word_distance)
					{
						size_t count_diff = __max(nkey_origin._length, count) - __min(nkey_origin._length, count);
						candidate_info info(calculate_score(score, (double)count_diff, popularity, phone_penalty + 3*word_penalty), score, popularity, word_penalty);
						candidates.insert(all, nkey_sorted, info);
					}
				}
			}
			++iter_ngram;
		}
	}

	return true;
}

// given a candidate key (and its original word order), finds stored ngrams that share
// tokens with it and inserts the sufficiently similar ones into `candidates`
void
fuzzy_matcher_impl::partial_intersect(ngram_quality nq,
					phonetic_quality fq,
					const ngram_key& key,
					const ngram_key& key_origin,
					double score,
					size_t popularity,
					size_t word_penalty,
					byte_allocator& all,
					byte_allocator& tmp,
					candidates_container_t& candidates) const
{
	ngram_key mkey;
	mkey._length = 1;
	mkey._array = 0;

	// intersection-score threshold: lower quality tolerates weaker overlaps
	double nq_score = (nq == nq_high ? 0.1 : (nq == nq_normal ? 0.26 : 0.34));

	// stored ngrams whose first (smallest) token matches one of the key tokens
	for (size_t m = 0; m < key._length; ++m)
	{
		mkey._array = &key._array[m];

		ngram_entry_t::const_iterator iter_main = _ngram_vocabulary.lower_bound(mkey);
		while (iter_main != _ngram_vocabulary.end()
			&& mkey._array[0] == iter_main.key()._array[0]
			)
		{
			size_t pop = iter_main->_ref;
			const ngram_key& nkey = iter_main.key();
			const ngram_key& nkey_origin = iter_main->_origin;

			double interscore = calculate_nkey_score(nkey, key);
			if (interscore < nq_score
				&& candidates.end() == candidates.find(nkey)
				)
			{
				size_t len_diff = __max(nkey._length, key._length) - __min(nkey._length, key._length);

				double interscore_origin = calculate_nkey_score(nkey_origin, key_origin);
				double final_interscore = (interscore + interscore_origin) / 2.0;

				candidate_info info(calculate_score(score, final_interscore, (popularity + pop) / 2, word_penalty + 3*len_diff), (score + final_interscore) / 2.0, (popularity + pop) / 2, word_penalty);
				candidates.insert(all, nkey, info);
			}

			++iter_main;
		}
	}

	// stored ngrams that contain one of the key tokens at a non-leading position,
	// found through the partial-key map
	if (!_ngram_partial_map.empty())
	{
		ngram_key_offset_multimap_t::const_iterator iter_upper = _ngram_partial_map.end();
		--iter_upper;

		// the largest registered offset bounds the number of positions to probe
		size_t max_tokens = iter_upper.key()._offset;

		mkey._length = max_tokens + 1;
		mkey._array = (size_t*)tmp.allocate(mkey._length * sizeof(size_t));
		memset(mkey._array, 0, mkey._length * sizeof(size_t));


		for (size_t l = 0; l < key._length; ++l)
		{
			for (size_t i = 1; i <= max_tokens; ++i)
			{
				mkey._array[i] = key._array[l];
				mkey._length = i + 1;
				ngram_key_offset pkey(i, &mkey);
				ngram_key_offset_multimap_t::const_iterator it_partial = _ngram_partial_map.lower_bound(pkey);
				while (it_partial != _ngram_partial_map.end()
					&& it_partial.key()._offset == i
					&& it_partial.key()._iter.key()._array[i] == key._array[l]
					)
				{
					size_t pop = it_partial.key()._iter->_ref;
					const ngram_key& nkey = it_partial.key()._iter.key();
					const ngram_key& nkey_origin = it_partial.key()._iter->_origin;

					double interscore = calculate_nkey_score(nkey, key);

					if (interscore < nq_score
						&& candidates.end() == candidates.find(nkey)
						)
					{
						size_t len_diff = __max(nkey._length, key._length) - __min(nkey._length, key._length);

						double interscore_origin = calculate_nkey_score(nkey_origin, key_origin);
						double final_interscore = (interscore + interscore_origin) / 2.0;

						candidate_info info(calculate_score(score, final_interscore, (popularity + pop) / 2, word_penalty + 3*len_diff), (score + final_interscore) / 2.0, (popularity + pop) / 2, word_penalty);
						candidates.insert(all, nkey, info);
					}

					++it_partial;
				}
			}
		}
	}
}


double
fuzzy_matcher_impl::calculate_nkey_score(const ngram_key& key1, const ngram_key& key2)
{
	double ret = 0.0;
	size_t len1 = key1._length;
	size_t len2 = key2._length;

	for (size_t i1 = 0, i2 = 0; i1 < len1 && i2 < len2;)
	{
		if (key1._array[i1] < key2._array[i2])
			++i1;
		else if (key2._array[i2] < key1._array[i1])
			++i2;
		else
			++i1, ++i2, ret += 2;
	}

	return (len1 + len2 - ret) / (len1 + len2);
}
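
// Illustration: for sorted keys {3, 7, 9} and {3, 9, 12} two elements coincide, so
// ret = 4 and the score is (3 + 3 - 4) / (3 + 3) = 1/3; identical keys score 0.0 and
// disjoint keys score 1.0 (lower means a stronger overlap).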

// rebuilds the phrase text for an ngram key by concatenating its words
// (space-separated); the result is allocated from `tmp`
word_key
fuzzy_matcher_impl::reconstruct_string(const ngram_key& key, byte_allocator& tmp, bool preserve_case) const
{
	_list< word_key > words;
	size_t total_bytes = 0;

	for (size_t i = 0; i < key._length; ++i)
	{
		vpk_word_entry_iter_t::const_iterator iter_word_pk = _vpk_word_vocabulary.find(key._array[i]);
		assert(iter_word_pk != _vpk_word_vocabulary.end());

		word_key desc((*iter_word_pk).key());
		total_bytes += desc._len + 1;
		words.push_back(tmp, desc);
	}

	char* ret = (char*)tmp.allocate(total_bytes);
	size_t offset = 0;

	for (_list< word_key >::const_iterator it = words.begin(); it != words.end(); ++it)
	{
		if (preserve_case)
		{
			memcpy(ret + offset, it->_str, it->_len);
		}
		else
		{
			for (size_t c = 0; c < it->_len; ++c)
				ret[offset + c] = fuzzyphonetic::to_lower(it->_str[c]);
		}
		offset += it->_len;
		ret[offset++] = ' ';
	}

	// replace the trailing space with the terminator
	ret[total_bytes - 1] = 0;
	word_key r(ret, total_bytes - 1);
	return r;
}


// lower-cases `str` into a buffer allocated from `tmp` (zero-terminated)
word_key
fuzzy_matcher_impl::make_string_lower(const char* str, size_t len, byte_allocator& tmp)
{
	char* ret = (char*)tmp.allocate(len + 1);

	for (size_t c = 0; c < len; ++c)
		ret[c] = fuzzyphonetic::to_lower(str[c]);

	ret[len] = 0;
	word_key r(ret, len);
	return r;
}


// lower-cases `str` into the caller-provided buffer, which must hold at least len + 1 bytes
word_key
fuzzy_matcher_impl::make_string_lower(const char* str, size_t len, char* buf)
{
	for (size_t c = 0; c < len; ++c)
		buf[c] = fuzzyphonetic::to_lower(str[c]);

	buf[len] = 0;
	word_key r(buf, len);
	return r;
}


double
fuzzy_matcher_impl::calculate_score(double phonetic_score, double intersection_score, size_t popularity, size_t penalty)
{
	return (0.75 * phonetic_score + 0.9 * intersection_score + 0.25 * penalty + 1. / popularity);
}
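
// Note: every term above is a penalty-like quantity, so smaller values indicate closer
// matches; the weights blend the phonetic distance, the ngram-intersection score, and
// the edit penalty, while 1/popularity slightly favors entries that were added more often.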


// searches `array` (of length `len`) for the metaphone sequence held in `key`; when found,
// removes that subsequence in place, shrinks `len`, and returns the number of bytes
// removed, otherwise returns 0
ub4_t
fuzzy_matcher_impl::find_and_remove(const metaphone_key& key, ub1_t* array, ub4_t& len)
{
	register ub4_t len_substr = key._length;
	register ub4_t len_str = len;

	if (!len_substr || !len || len_substr > len_str)
		return 0;

	register ub1_t* p = key._array;
	register ub1_t* _this = array;
	register ub1_t* _next = 0;

	// scan for the first byte of the key, then verify the full sequence
	while ((_next = (ub1_t*)memchr(_this, *p, len_str)))
	{
		register ub4_t consumed = (ub4_t)((_next - array) + len_substr);
		if (len < consumed)
			return 0;

		if (!memcmp(_next, p, len_substr))
		{
			// shift the tail left over the matched sequence
			memmove(_next, _next + len_substr, len - ((_next - array) + len_substr));
			len -= len_substr;
			return len_substr;
		}
		else
		{
			_this = _next + 1;
			len_str = (ub4_t)(len - (_this - array));
		}
	}

	return 0;
}

#pragma pack()
END_TERIMBER_NAMESPACE