00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "fuzzy/fuzzyimpl.h"
00029 #include "base/memory.hpp"
00030 #include "base/list.hpp"
00031 #include "base/common.hpp"
00032 #include "base/map.hpp"
00033 #include "base/vector.hpp"
00034 #include "base/stack.hpp"
00035 #include "base/string.hpp"
00036 #include "base/date.h"
00037
00038 #include "smart/byterep.hpp"
00039
00040 #include "fuzzy/fuzzyphonetic.hpp"
00041
// Factory entry point: creates a new fuzzy matcher implementation.
// The caller owns the returned object and is responsible for deleting it.
// memory_usage - memory budget forwarded to the implementation
// (presumably consumed by the internal memory factory - see the class header).
fuzzy_matcher*
fuzzy_matcher_factory::get_fuzzy_matcher(size_t memory_usage)
{
	return new TERIMBER::fuzzy_matcher_impl(memory_usage);
}
00047
00048 BEGIN_TERIMBER_NAMESPACE
00049 #pragma pack(4)
00050
// Adapter for lookup_distance: exposes the metaphone distance function
// through the uniform (key, entry-iterator, allocator, limit) signature
// that the lookup_distance template expects from its Func parameter.
inline
double wrapper_find_metaphone_distance(const metaphone_key& key, metaphone_entry_citer_t y, byte_allocator& tmp, size_t max_penalty)
{
	return (double)fuzzyphonetic::find_metaphone_distance(key, y.key(), tmp, max_penalty);
}
00056
00057
// Adapter for lookup_distance: exposes the reflection distance function
// through the uniform (key, entry-iterator, allocator, limit) signature
// that the lookup_distance template expects from its Func parameter.
inline
double wrapper_find_reflection_distance(const reflection_key& key, reflection_entry_citer_t y, byte_allocator& tmp, size_t max_penalty)
{
	return (double)fuzzyphonetic::find_reflection_distance(key, y.key(), tmp, max_penalty);
}
00063
00064
00065 template < class C, class P, class Func >
00066 class lookup_distance
00067 {
00068 public:
00069 inline
00070 lookup_distance(bool extended, size_t max_distance, const C& container, const P& key, Func f, byte_allocator& tmp) :
00071 _extended(extended),
00072 _max_distance(max_distance),
00073 _container(container),
00074 _key(key),
00075 _tmp(tmp),
00076 _f(f),
00077 _stage(0),
00078 _range(container.equal_range(key)),
00079 _save_range(_range)
00080 {
00081 if (_save_range.first != _container.end())
00082 --_save_range.first;
00083 }
00084
00085 inline
00086 pair< TYPENAME C::const_iterator, double > next()
00087 {
00088 pair< TYPENAME C::const_iterator, double > ret(_container.end(), 0.0);
00089
00090 if (_stage == 0)
00091 {
00092 if (_range.first != _range.second)
00093 {
00094 ret.first = _range.first;
00095 ++_range.first;
00096 return ret;
00097 }
00098
00099 _stage = 1;
00100 }
00101
00102 if (_stage == 1 && _extended)
00103 {
00104 if (_save_range.first != _container.end())
00105 {
00106
00107 _tmp.reset();
00108 ret.second = _f(_key, _save_range.first, _tmp, _max_distance + 1);
00109 if (ret.second <= _max_distance)
00110 {
00111 ret.first = _save_range.first;
00112 --_save_range.first;
00113 return ret;
00114 }
00115 }
00116
00117 _stage = 2;
00118 }
00119
00120 if (_stage == 2 && _extended)
00121 {
00122 if (_save_range.second != _container.end())
00123 {
00124
00125 _tmp.reset();
00126 ret.second = _f(_key, _save_range.second, _tmp, _max_distance + 1);
00127 if (ret.second <= _max_distance)
00128 {
00129 ret.first = _save_range.second;
00130 ++_save_range.second;
00131 return ret;
00132 }
00133 }
00134
00135 _stage = 3;
00136 }
00137
00138 return ret;
00139 }
00140
00141
00142 private:
00143 const C& _container;
00144 const P& _key;
00145 Func _f;
00146 byte_allocator& _tmp;
00147 bool _extended;
00148 size_t _max_distance;
00149
00150 size_t _stage;
00151 typename C::paircc_t _range;
00152 typename C::paircc_t _save_range;
00153 };
00155
// Constructs the matcher.
// memory_usage - memory budget; it is not referenced in this body, so it
// is presumably consumed by a member initializer (_memory_factory) declared
// in the class header - TODO confirm against fuzzyimpl.h.
fuzzy_matcher_impl::fuzzy_matcher_impl(size_t memory_usage)
{
	assert(_memory_factory.is_ready());
}
00160
00161
// Destructor - intentionally empty: all storage is owned by member
// containers and the memory factory, which release it in their own
// destructors.
fuzzy_matcher_impl::~fuzzy_matcher_impl()
{
}
00165
00167
// Clears all vocabularies, secondary indexes, tokenizer settings and
// primary-key generators, returning the matcher to its freshly
// constructed state, then releases all memory held by the factory.
void
fuzzy_matcher_impl::reset()
{
	// primary-key generators
	_vocabulary_pk_generator.clear();
	_ngram_pk_generator.clear();
	_metaphone_pk_generator.clear();
	_reflection_pk_generator.clear();

	// user-installed tokenizer rules
	_tokenizer.clear_regex();
	_tokenizer.clear_abbr();

	// vocabularies and their secondary (pk -> entry) indexes
	_word_vocabulary.clear();
	_vpk_word_vocabulary.clear();
	_metaphone_vocabulary.clear();
	_mpk_word_vocabulary.clear();
	_ngram_vocabulary.clear();
	_npk_ngram_vocabulary.clear();
	_reflection_vocabulary.clear();
	_rpk_ngram_vocabulary.clear();
	_ngram_partial_map.clear();

	// note: containers above held factory-allocated blocks; the factory
	// reset reclaims them all at once
	_memory_factory->reset();
}
00192
00193
00194
00195
00196
00197 size_t
00198 fuzzy_matcher_impl::add(const char* phrase, byte_allocator& all)
00199 {
00200 size_t npk = 0;
00201 if (!phrase)
00202 return npk;
00203
00204 tokenizer_output_sequence_t tokens;
00205 _tokenizer.tokenize(phrase, tokens, all, 0);
00206
00207
00208 _list< size_t > idents;
00209 size_t offset = 0;
00210 size_t count = 0;
00211
00212 for (tokenizer_output_sequence_t::const_iterator it = tokens.begin(); count < MAX_PHRASE_TOKENS && it != tokens.end(); offset += it->_len, ++it)
00213 {
00214 switch (it->_type)
00215 {
00216 case TT_ALPHABETIC:
00217 case TT_DIGIT:
00218 break;
00219 default:
00220 continue;
00221 }
00222
00223 ++count;
00224
00225 word_key wkey(phrase + offset, __min(it->_len, MAX_TOKEN_LENGTH));
00226
00227
00228 word_entry_t::iterator it_word = _word_vocabulary.find(wkey);
00229 if (it_word == _word_vocabulary.end())
00230 {
00231
00232
00233
00234 metaphone_key mkey = fuzzyphonetic::convert_to_metaphone(wkey._str, wkey._len, all);
00235 metaphone_entry_t::iterator miter = _metaphone_vocabulary.find(mkey);
00236
00237 if (miter != _metaphone_vocabulary.end())
00238 {
00239 ++miter->_ref;
00240 }
00241 else
00242 {
00243
00244 size_t mpk = _metaphone_pk_generator.generate();
00245
00246 metaphone_key mkey_new;
00247 mkey_new._array = (ub1_t*)_memory_factory->allocate(mkey_new._length = mkey._length);
00248 memcpy(mkey_new._array, mkey._array, mkey_new._length);
00249
00250
00251 metaphone_entry mentry(mpk);
00252 miter = _metaphone_vocabulary.insert(mkey_new, mentry).first;
00253 }
00254
00255
00256 char* wptr = _memory_factory->allocate(wkey._len + 1);
00257 memcpy(wptr, wkey._str, wkey._len);
00258 wptr[wkey._len] = 0;
00259
00260
00261 size_t vpk = _vocabulary_pk_generator.generate();
00262
00263 word_key wkey(wptr, wkey._len);
00264 word_entry wentry(vpk, miter);
00265
00266 it_word = _word_vocabulary.insert(wkey, wentry).first;
00267
00268
00269 _vpk_word_vocabulary.insert(it_word->_vpk, it_word);
00270
00271 _mpk_word_vocabulary.insert(miter->_mpk, it_word);
00272 }
00273 else
00274 {
00275
00276 ++it_word->_ref;
00277 }
00278
00279 idents.push_back(all, it_word->_vpk);
00280 }
00281
00282
00283 size_t ngram_length = idents.size();
00284
00285 if (ngram_length)
00286 {
00287
00288 ngram_key nkey, okey;
00289 okey._length = nkey._length = ngram_length;
00290 size_t index = 0;
00291 nkey._array = (size_t*)all.allocate(ngram_length * sizeof(size_t));
00292 okey._array = (size_t*)all.allocate(ngram_length * sizeof(size_t));
00293 for (_list< size_t >::const_iterator it_ident = idents.begin(); it_ident != idents.end(); ++it_ident, ++index)
00294 okey._array[index] = nkey._array[index] = *it_ident;
00295
00296
00297 std::sort(nkey._array, nkey._array + nkey._length);
00298
00299 ngram_entry_t::iterator it_ngram = _ngram_vocabulary.find(nkey);
00300
00301 if (it_ngram == _ngram_vocabulary.end())
00302 {
00303
00304 size_t phrase_len = strlen(phrase);
00305 reflection_key rkey;
00306 fuzzyphonetic::convert_to_reflection(phrase, phrase_len, all, rkey);
00307
00308 reflection_entry_t::iterator riter = _reflection_vocabulary.find(rkey);
00309 if (riter != _reflection_vocabulary.end())
00310 {
00311 ++riter->_ref;
00312 }
00313 else
00314 {
00315
00316 size_t rpk = _reflection_pk_generator.generate();
00317 reflection_entry rentry(rpk);
00318 riter = _reflection_vocabulary.insert(rkey, rentry).first;
00319 }
00320
00321 npk = _ngram_pk_generator.generate();
00322 ngram_entry nentry(npk, riter);
00323
00324 size_t* nptr = (size_t*)_memory_factory->allocate(ngram_length * sizeof(size_t));
00325 memcpy(nptr, nkey._array, nkey._length * sizeof(size_t));
00326 size_t* optr = (size_t*)_memory_factory->allocate(ngram_length * sizeof(size_t));
00327 memcpy(optr, okey._array, okey._length * sizeof(size_t));
00328
00329 nkey._array = nptr;
00330 nentry._origin._array = optr;
00331 nentry._origin._length = okey._length;
00332
00333 it_ngram = _ngram_vocabulary.insert(nkey, nentry).first;
00334
00335
00336 _npk_ngram_vocabulary.insert(it_ngram->_npk, it_ngram);
00337
00338 _rpk_ngram_vocabulary.insert(riter->_rpk, it_ngram);
00339
00340
00341 if (ngram_length > 1)
00342 {
00343 for (size_t offset = 1; offset < ngram_length; ++offset)
00344 {
00345 ngram_key_offset pkey(offset, it_ngram);
00346 _ngram_partial_map.insert(pkey, true);
00347 }
00348 }
00349 }
00350 else
00351 {
00352 npk = it_ngram->_npk;
00353 ++it_ngram->_ref;
00354 }
00355 }
00356
00357 return npk;
00358 }
00359
00360
00361
00362
00363 bool
00364 fuzzy_matcher_impl::remove(const char* phrase, byte_allocator& all)
00365 {
00366 if (!phrase)
00367 return false;
00368
00369 tokenizer_output_sequence_t tokens;
00370 _tokenizer.tokenize(phrase, tokens, all, 0);
00371
00372
00373 _list< size_t > idents;
00374 size_t offset = 0;
00375 size_t count = 0;
00376
00377 for (tokenizer_output_sequence_t::const_iterator it = tokens.begin(); count < MAX_PHRASE_TOKENS && it != tokens.end(); offset += it->_len, ++it)
00378 {
00379 switch (it->_type)
00380 {
00381 case TT_ALPHABETIC:
00382 case TT_DIGIT:
00383 break;
00384 default:
00385 continue;
00386 }
00387
00388 ++count;
00389
00390 word_key wkey(phrase + offset, __min(it->_len, MAX_TOKEN_LENGTH));
00391 word_entry_t::iterator it_word = _word_vocabulary.find(wkey);
00392
00393 if (it_word != _word_vocabulary.end())
00394 {
00395 if (it_word->_ref == 1)
00396 {
00397
00398 size_t mpk = it_word->_miter->_mpk;
00399 mpk_word_entry_iter_t::iterator iter_metaphone = _mpk_word_vocabulary.lower_bound(mpk);
00400
00401 while (mpk == iter_metaphone.key())
00402 {
00403 if (*iter_metaphone == it_word)
00404 {
00405 _mpk_word_vocabulary.erase(iter_metaphone);
00406 break;
00407 }
00408
00409 ++iter_metaphone;
00410 }
00411
00412
00413 _memory_factory->deallocate((char*)it_word.key()._str);
00414
00415 _vpk_word_vocabulary.erase(it_word->_vpk);
00416
00417 _vocabulary_pk_generator.save(it_word->_vpk);
00418
00419
00420 if (it_word->_miter->_ref == 1)
00421 _metaphone_vocabulary.erase(it_word->_miter);
00422 else
00423 --it_word->_miter->_ref;
00424
00425 _word_vocabulary.erase(it_word);
00426 }
00427 else
00428 --it_word->_ref;
00429 }
00430
00431 idents.push_back(all, it_word->_vpk);
00432 }
00433
00434
00435 size_t ngram_length = idents.size();
00436
00437 if (ngram_length)
00438 {
00439
00440 ngram_key nkey;
00441 nkey._length = ngram_length;
00442 size_t index = 0;
00443 nkey._array = (size_t*)all.allocate(ngram_length * sizeof(size_t));
00444 for (_list< size_t >::const_iterator it_ident = idents.begin(); it_ident != idents.end(); ++it_ident, ++index)
00445 nkey._array[index] = *it_ident;
00446
00447 std::sort(nkey._array, nkey._array + nkey._length);
00448
00449 ngram_entry_t::iterator it_ngram = _ngram_vocabulary.find(nkey);
00450
00451 if (it_ngram != _ngram_vocabulary.end())
00452 {
00453 if (it_ngram->_ref == 1)
00454 {
00455 size_t rpk = it_ngram->_riter->_rpk;
00456 rpk_ngram_entry_iter_t::iterator iter_reflaction = _rpk_ngram_vocabulary.lower_bound(rpk);
00457
00458 while (rpk == iter_reflaction.key())
00459 {
00460 if (*iter_reflaction == it_ngram)
00461 {
00462 _rpk_ngram_vocabulary.erase(iter_reflaction);
00463 break;
00464 }
00465
00466 ++iter_reflaction;
00467 }
00468
00469
00470
00471 if (ngram_length > 1)
00472 {
00473 for (size_t offset = 1; offset < ngram_length; ++offset)
00474 {
00475 ngram_key_offset pkey(offset, it_ngram);
00476 ngram_key_offset_multimap_t::iterator iter_lower = _ngram_partial_map.lower_bound(pkey);
00477 ngram_key_offset_multimap_t::iterator iter_upper = _ngram_partial_map.upper_bound(pkey);
00478
00479
00480 while (iter_lower != iter_upper)
00481 {
00482 if (iter_lower.key()._iter == it_ngram)
00483 iter_lower = _ngram_partial_map.erase(iter_lower);
00484 else
00485 ++iter_lower;
00486 }
00487 }
00488 }
00489
00490
00491 _memory_factory->deallocate((char*)it_ngram->_origin._array);
00492 _memory_factory->deallocate((char*)it_ngram.key()._array);
00493
00494
00495 _npk_ngram_vocabulary.erase(it_ngram->_npk);
00496 _ngram_pk_generator.save(it_ngram->_npk);
00497
00498
00499 if (it_ngram->_riter->_ref == 1)
00500 _reflection_vocabulary.erase(it_ngram->_riter);
00501 else
00502 --it_ngram->_riter->_ref;
00503
00504 _ngram_vocabulary.erase(it_ngram);
00505 }
00506 else
00507 --it_ngram->_ref;
00508 }
00509 }
00510
00511 return true;
00512 }
00513
00514
00515
00516
// Removes a stored phrase by its ngram primary key (the value returned
// by add()). Decrements the ngram reference count and, when it drops to
// zero, erases the ngram together with all secondary-index records and
// persisted arrays. Returns false when the ident is unknown.
// NOTE(review): the 'all' allocator parameter is not used in this body -
// presumably kept for interface symmetry with remove(const char*, ...);
// confirm against the base-class contract.
bool fuzzy_matcher_impl::remove(size_t ident,
							byte_allocator& all)
{
	npk_ngram_entry_iter_t::iterator it_ident = _npk_ngram_vocabulary.find(ident);

	if (it_ident == _npk_ngram_vocabulary.end())
		return false;

	ngram_entry_iter_t it_ngram = *it_ident;
	size_t ngram_length = it_ngram.key()._length;

	if (it_ngram->_ref == 1)
	{
		// last reference: unlink from the rpk -> ngram index; the entry is
		// guaranteed present, so the scan terminates via the break
		size_t rpk = it_ngram->_riter->_rpk;
		rpk_ngram_entry_iter_t::iterator iter_reflaction = _rpk_ngram_vocabulary.lower_bound(rpk);

		while (rpk == iter_reflaction.key())
		{
			if (*iter_reflaction == it_ngram)
			{
				_rpk_ngram_vocabulary.erase(iter_reflaction);
				break;
			}

			++iter_reflaction;
		}

		// remove all partial-index entries (one per non-leading offset)
		// pointing at this ngram
		if (ngram_length > 1)
		{
			for (size_t offset = 1; offset < ngram_length; ++offset)
			{
				ngram_key_offset pkey(offset, it_ngram);
				ngram_key_offset_multimap_t::iterator iter_lower = _ngram_partial_map.lower_bound(pkey);
				ngram_key_offset_multimap_t::iterator iter_upper = _ngram_partial_map.upper_bound(pkey);

				while (iter_lower != iter_upper)
				{
					if (iter_lower.key()._iter == it_ngram)
						iter_lower = _ngram_partial_map.erase(iter_lower);
					else
						++iter_lower;
				}
			}
		}

		// release the persisted sorted/original key arrays
		_memory_factory->deallocate((char*)it_ngram->_origin._array);
		_memory_factory->deallocate((char*)it_ngram.key()._array);

		// drop the npk index entry and recycle the primary key
		_npk_ngram_vocabulary.erase(it_ngram->_npk);
		_ngram_pk_generator.save(it_ngram->_npk);

		// drop or unreference the reflection entry
		if (it_ngram->_riter->_ref == 1)
			_reflection_vocabulary.erase(it_ngram->_riter);
		else
			--it_ngram->_riter->_ref;

		_ngram_vocabulary.erase(it_ngram);
	}
	else
		--it_ngram->_ref;

	return true;
}
00586
00587
00588
00589 bool
00590 fuzzy_matcher_impl::match( ngram_quality nq,
00591 phonetic_quality fq,
00592 const char* phrase,
00593 byte_allocator& all,
00594 byte_allocator& tmp,
00595 _list< const char* >& suggestions) const
00596 {
00597 candidates_container_t candidates;
00598
00599 if (!_match(nq, fq, phrase, all, tmp, candidates))
00600 return false;
00601
00602
00603 tmp.reset();
00604 vector_container_citer_t vec;
00605 candidate_sorter sorter(candidates, vec, tmp);
00606
00607
00608 for (vector_container_citer_t::const_iterator iter_candidate = vec.begin(); iter_candidate != vec.end(); ++iter_candidate)
00609 {
00610
00611 const ngram_key& ckey = (*iter_candidate).key();
00612
00613
00614 ngram_entry_t::const_iterator iter_ngram = _ngram_vocabulary.find(ckey);
00615 assert(iter_ngram != _ngram_vocabulary.end());
00616
00617 const ngram_key& key_origin = iter_ngram->_origin;
00618
00619 word_key wkey = reconstruct_string(key_origin, all, true);
00620 suggestions.push_back(all, wkey._str);
00621 }
00622
00623
00624 return true;
00625 }
00626
00627
00628 bool
00629 fuzzy_matcher_impl::match( ngram_quality nq,
00630 phonetic_quality fq,
00631 const char* phrase,
00632 byte_allocator& all,
00633 byte_allocator& tmp,
00634 _list< size_t >& suggestions) const
00635 {
00636 candidates_container_t candidates;
00637
00638 if (!_match(nq, fq, phrase, all, tmp, candidates))
00639 return false;
00640
00641 tmp.reset();
00642 vector_container_citer_t vec;
00643 candidate_sorter sorter(candidates, vec, tmp);
00644
00645
00646 for (vector_container_citer_t::const_iterator iter_candidate = vec.begin(); iter_candidate != vec.end(); ++iter_candidate)
00647 {
00648
00649 const ngram_key& ckey = (*iter_candidate).key();
00650
00651 ngram_entry_t::const_iterator iter_pk = _ngram_vocabulary.find(ckey);
00652 assert(iter_pk != _ngram_vocabulary.end());
00653
00654 suggestions.push_back(all, iter_pk->_npk);
00655 }
00656
00657 return true;
00658 }
00659
00660 bool
00661 fuzzy_matcher_impl::_match (ngram_quality nq,
00662 phonetic_quality fq,
00663 const char* phrase,
00664 byte_allocator& all,
00665 byte_allocator& tmp,
00666 candidates_container_t& candidates) const
00667 {
00668 if (!phrase)
00669 return false;
00670
00671 size_t max_phonet_distance = (fq == pq_high ? 0 : (fq == pq_normal ? 1 : 2));
00672 size_t max_word_distance = (fq == pq_high ? 1 : (fq == pq_normal ? 2 : 4));
00673
00674
00675 tokenizer_output_sequence_t tokens;
00676 _tokenizer.tokenize(phrase, tokens, tmp, 0);
00677
00678 size_t phrase_length = strlen(phrase);
00679
00680 _list< string_desc > words;
00681 _list< metaphone_key > metaphones;
00682
00683
00684 metaphone_key phrase_key = fuzzyphonetic::convert_to_metaphone(phrase, phrase_length, all);
00685
00686
00687 reflection_key phrase_reflection;
00688 fuzzyphonetic::convert_to_reflection(phrase, phrase_length, all, phrase_reflection);
00689
00690 size_t offset = 0;
00691 size_t count = 0;
00692
00693
00694 for (tokenizer_output_sequence_t::const_iterator it = tokens.begin(); count < MAX_PHRASE_TOKENS && it != tokens.end(); offset += it->_len, ++it)
00695 {
00696 switch (it->_type)
00697 {
00698 case TT_ALPHABETIC:
00699 case TT_DIGIT:
00700 break;
00701 default:
00702 continue;
00703 }
00704
00705 ++count;
00706
00707
00708 size_t len = __min(it->_len, MAX_TOKEN_LENGTH);
00709 char* ptr = (char*)all.allocate(len + 1);
00710 for (size_t i = 0; i < len; ++i)
00711 ptr[i] = fuzzyphonetic::to_lower(*(phrase + offset + i));
00712
00713 ptr[len] = 0;
00714
00715 string_desc desc;
00716 desc._vpk = 0;
00717 desc._str = ptr;
00718 desc._len = len;
00719
00720
00721 words.push_back(all, desc);
00722
00723
00724 metaphone_key key = fuzzyphonetic::convert_to_metaphone(ptr, len, all);
00725 metaphones.push_back(all, key);
00726 }
00727
00728
00729 if (count == 0)
00730 return false;
00731
00732
00733
00734
00735
00736
00737
00738
00739
00740 word_key phraselow = make_string_lower(phrase, phrase_length, all);
00741
00742
00743 lookup_distance< metaphone_entry_t, metaphone_key, double (*)(const metaphone_key&, metaphone_entry_citer_t, byte_allocator&, size_t) >
00744 lookup_metaphone(true, max_phonet_distance, _metaphone_vocabulary, phrase_key, wrapper_find_metaphone_distance, tmp);
00745
00746 pair< metaphone_entry_t::const_iterator, double > iter_metaphone;
00747 while ((iter_metaphone = lookup_metaphone.next()).first != _metaphone_vocabulary.end())
00748 {
00749
00750 size_t mpk = iter_metaphone.first->_mpk;
00751
00752 double score = iter_metaphone.second;
00753
00754 mpk_word_entry_iter_t::const_iterator iter_word = _mpk_word_vocabulary.lower_bound(mpk);
00755
00756 while (iter_word != _mpk_word_vocabulary.end()
00757 && iter_word.key() == mpk
00758 )
00759 {
00760 ngram_key nkey;
00761 size_t vpk = (*iter_word)->_vpk;
00762 nkey._array = &vpk;
00763 nkey._length = 1;
00764
00765 size_t popularity = (*iter_word)->_ref;
00766
00767 word_key wkey = make_string_lower((*iter_word).key()._str, (*iter_word).key()._len, tmp);
00768
00769 size_t word_penalty = fuzzyphonetic::find_word_distance(phraselow._str, phraselow._len, wkey._str, wkey._len, tmp, max_phonet_distance + 1);
00770
00771 if (word_penalty <= max_word_distance)
00772 partial_intersect(nq, fq, nkey, nkey, score, popularity, word_penalty + 3*(count - 1), all, tmp, candidates);
00773 ++iter_word;
00774 }
00775 }
00776
00777
00778 if (count > 1)
00779 {
00780 _list< string_desc >::const_iterator wit = words.begin();
00781 _list< metaphone_key >::const_iterator mit = metaphones.begin();
00782 _list< sorted_ngram_suggestions_t > score_sugs;
00783 size_t items = 0;
00784
00785 for (; wit != words.end(); ++wit, ++mit)
00786 {
00787 sorted_ngram_suggestions_t score_dummy;
00788 sorted_ngram_suggestions_t& score_ref = *score_sugs.push_back(all, score_dummy);
00789
00790
00791
00792 lookup_distance< metaphone_entry_t, metaphone_key, double (*)(const metaphone_key&, metaphone_entry_citer_t, byte_allocator&, size_t) >
00793 lookup_metaphone(nq == nq_high ? false : true, max_phonet_distance, _metaphone_vocabulary, *mit, wrapper_find_metaphone_distance, tmp);
00794
00795 pair< metaphone_entry_t::const_iterator, double > iter_metaphone;
00796 while ((iter_metaphone = lookup_metaphone.next()).first != _metaphone_vocabulary.end())
00797 {
00798
00799 size_t mpk = iter_metaphone.first->_mpk;
00800 double score = iter_metaphone.second;
00801
00802 mpk_word_entry_iter_t::const_iterator iter_word = _mpk_word_vocabulary.lower_bound(mpk);
00803
00804 while (iter_word != _mpk_word_vocabulary.end()
00805 && iter_word.key() == mpk
00806 )
00807 {
00808 size_t vpk = (*iter_word)->_vpk;
00809 size_t popularity = (*iter_word)->_ref;
00810
00811 word_key wkey = make_string_lower((*iter_word).key()._str, (*iter_word).key()._len, tmp);
00812
00813 size_t word_penalty = fuzzyphonetic::find_word_distance(wit->_str, wit->_len, wkey._str, wkey._len, tmp, max_phonet_distance + 1);
00814
00815 if (word_penalty <= max_word_distance)
00816 {
00817 ngram_suggestions_info score_p(vpk, popularity, score, word_penalty);
00818 score_ref.insert(all, calculate_score(score, 1.0, popularity, word_penalty), score_p);
00819 }
00820
00821 ++iter_word;
00822 }
00823 }
00824
00825 if (!score_ref.empty())
00826 ++items;
00827 else
00828 {
00829 score_sugs.pop_back();
00830 }
00831 }
00832
00833
00834 if (items)
00835 {
00836
00837 ngram_suggestions_iter_list_t iter_list;
00838
00839 _list< sorted_ngram_suggestions_t >::iterator iter_score = score_sugs.begin();
00840
00841 size_t av_per_token = (size_t)pow(100.0, 1./items);
00842
00843 if (av_per_token == 0)
00844 av_per_token = 1;
00845
00846 for (; iter_score != score_sugs.end(); ++iter_score)
00847 {
00848 while (iter_score->size() > av_per_token)
00849 {
00850 iter_score->erase(--(iter_score->end()));
00851 }
00852
00853 ngram_suggestions_iter_info info(iter_score->begin(), iter_score->end());
00854 iter_list.push_back(all, info);
00855 }
00856
00857 ngram_key scorekey, scorekey_origin;
00858 scorekey_origin._length = scorekey._length = items;
00859 scorekey._array = (size_t*)all.allocate(scorekey._length * sizeof(size_t));
00860 scorekey_origin._array = (size_t*)all.allocate(scorekey_origin._length * sizeof(size_t));
00861
00862 ngram_suggestions_iter_list_t::iterator fiter = iter_list.begin();
00863
00864 while (true)
00865 {
00866
00867 double score_score = 0.0;
00868 size_t score_pop = 0;
00869 size_t score_penalty = 0;
00870 size_t index = 0;
00871
00872 for (ngram_suggestions_iter_list_t::iterator liter = iter_list.begin(); liter != iter_list.end(); ++liter, ++index)
00873 {
00874 score_score += liter->_current->_score;
00875 score_pop += liter->_current->_popularity;
00876 score_penalty += liter->_current->_penalty;
00877
00878 scorekey_origin._array[index] = scorekey._array[index] = liter->_current->_vpk;
00879 }
00880
00881 std::sort(scorekey._array, scorekey._array + index);
00882
00883
00884 partial_intersect(nq, fq, scorekey, scorekey_origin, score_score / items, score_pop / items, score_penalty / items, all, tmp, candidates);
00885
00886 while (fiter != iter_list.end())
00887 {
00888
00889 sorted_ngram_suggestions_citer_t citer = fiter->_current;
00890
00891 if (citer != fiter->_end
00892 && ++citer != fiter->_end)
00893 {
00894 ++(fiter->_current);
00895
00896
00897 if (fiter != iter_list.begin())
00898 {
00899 ngram_suggestions_iter_list_t::iterator startiter = iter_list.begin();
00900
00901 do
00902 {
00903 startiter->_current = startiter->_begin;
00904 ++startiter;
00905 }
00906 while (startiter != fiter);
00907
00908
00909 fiter = iter_list.begin();
00910 }
00911
00912 break;
00913 }
00914
00915 ++fiter;
00916 }
00917
00918 if (fiter == iter_list.end())
00919 break;
00920 }
00921 }
00922 }
00923
00924
00925 lookup_distance< reflection_entry_t, reflection_key, double (*)(const reflection_key&, reflection_entry_citer_t, byte_allocator&, size_t) >
00926 lookup_reflection(true, max_phonet_distance, _reflection_vocabulary, phrase_reflection, wrapper_find_reflection_distance, tmp);
00927
00928 pair< reflection_entry_t::const_iterator, double > iter_reflection;
00929 while ((iter_reflection = lookup_reflection.next()).first != _reflection_vocabulary.end())
00930 {
00931
00932 size_t rpk = iter_reflection.first->_rpk;
00933 double score = iter_reflection.second;
00934
00935 rpk_ngram_entry_iter_t::const_iterator iter_ngram = _rpk_ngram_vocabulary.lower_bound(rpk);
00936
00937 while (iter_ngram != _rpk_ngram_vocabulary.end()
00938 && iter_ngram.key() == rpk
00939 )
00940 {
00941 const ngram_key& nkey_sorted = (*iter_ngram).key();
00942
00943
00944 if (candidates.end() == candidates.find(nkey_sorted))
00945 {
00946 const ngram_key& nkey_origin = (*iter_ngram)->_origin;
00947 size_t popularity = (*iter_ngram)->_ref;
00948
00949
00950 if (nkey_origin._length == 1)
00951 {
00952 word_key reckey = reconstruct_string(nkey_origin, tmp, false);
00953 size_t word_penalty = fuzzyphonetic::find_word_distance(phraselow._str, phraselow._len, reckey._str, reckey._len, tmp, max_phonet_distance + 1);
00954
00955 if (word_penalty <= max_word_distance)
00956 {
00957 candidate_info info(calculate_score(score, count - 1.0, popularity, word_penalty+ 3*(count - 1)), score, popularity, word_penalty);
00958 candidates.insert(all, nkey_sorted, info);
00959 }
00960 }
00961 else
00962 {
00963
00964 char word[MAX_TOKEN_LENGTH];
00965 ub1_t array[MAX_TOKEN_LENGTH];
00966 ub4_t length = __min((ub4_t)MAX_TOKEN_LENGTH, phrase_key._length);
00967 memcpy(array, phrase_key._array, length);
00968 memset(array + length, 0xff, MAX_TOKEN_LENGTH - length);
00969
00970 size_t word_penalty = 0;
00971 size_t phone_penalty = 0;
00972
00973 for (size_t p = 0; p < nkey_origin._length; ++p)
00974 {
00975
00976 vpk_word_entry_iter_t::const_iterator it_p = _vpk_word_vocabulary.find(nkey_origin._array[p]);
00977 assert(it_p != _vpk_word_vocabulary.end());
00978
00979 const metaphone_key& r_key = (*it_p)->_miter.key();
00980
00981 phone_penalty += r_key._length - find_and_remove(r_key, array, length);
00982
00983 word_key wkey = make_string_lower((*it_p).key()._str, (*it_p).key()._len, word);
00984 word_penalty += ((strstr(phraselow._str, wkey._str) != 0) ? 0 : 3);
00985 }
00986
00987
00988
00989
00990
00991 if (phone_penalty <= max_phonet_distance
00992 && word_penalty <= max_word_distance)
00993 {
00994 size_t count_diff = __max(nkey_origin._length, count) - __min(nkey_origin._length, count);
00995 candidate_info info(calculate_score(score, (double)count_diff, popularity, phone_penalty+ 3*word_penalty), score, popularity, word_penalty);
00996 candidates.insert(all, nkey_sorted, info);
00997 }
00998 }
00999 }
01000 ++iter_ngram;
01001 }
01002 }
01003
01004 return true;
01005 }
01006
// Expands one candidate key into full candidate ngrams by probing the
// ngram vocabulary: first by the leading word of every rotation of 'key'
// (sorted lookup), then - via the partial map - by every non-leading
// position. A probe becomes a candidate when its Dice-style key
// dissimilarity (calculate_nkey_score) is below the quality threshold
// and it is not in 'candidates' yet.
// key / key_origin - sorted and original-order word-pk arrays;
// score / popularity / word_penalty - metrics of the probe source, mixed
// into each new candidate's final score.
void
fuzzy_matcher_impl::partial_intersect(ngram_quality nq,
									phonetic_quality fq,
									const ngram_key& key,
									const ngram_key& key_origin,
									double score,
									size_t popularity,
									size_t word_penalty,
									byte_allocator& all,
									byte_allocator& tmp,
									candidates_container_t& candidates) const
{
	// single-element probe key; _array is repointed per iteration below
	ngram_key mkey;
	mkey._length = 1;
	mkey._array = 0;

	// intersection-score threshold by requested ngram quality (lower = stricter)
	double nq_score = (nq == nq_high ? 0.1 : (nq == nq_normal ? 0.26 : 0.34));

	// pass 1: ngrams whose FIRST (sorted) word matches any word of 'key'
	for (size_t m = 0; m < key._length; ++m)
	{
		mkey._array = &key._array[m];

		ngram_entry_t::const_iterator iter_main = _ngram_vocabulary.lower_bound(mkey);
		while (iter_main != _ngram_vocabulary.end()
			&& mkey._array[0] == iter_main.key()._array[0]
			)
		{
			size_t pop = iter_main->_ref;
			const ngram_key& nkey = iter_main.key();
			const ngram_key& nkey_origin = iter_main->_origin;

			double interscore = calculate_nkey_score(nkey, key);
			if (interscore < nq_score
				&& candidates.end() == candidates.find(nkey)
				)
			{
				size_t len_diff = __max(nkey._length, key._length) - __min(nkey._length, key._length);

				// average the sorted and original-order dissimilarities
				double interscore_origin = calculate_nkey_score(nkey_origin, key_origin);
				double final_interscore = (interscore + interscore_origin) / 2.0;

				candidate_info info(calculate_score(score, final_interscore, (popularity + pop) / 2, word_penalty + 3*len_diff), (score + final_interscore) / 2.0, (popularity + pop) / 2, word_penalty);
				candidates.insert(all, nkey, info);
			}

			++iter_main;
		}
	}

	// pass 2: ngrams containing a word of 'key' at a non-leading position
	if (!_ngram_partial_map.empty())
	{
		// the largest recorded offset bounds how deep the probes must go
		ngram_key_offset_multimap_t::const_iterator iter_upper = _ngram_partial_map.end();
		--iter_upper;

		size_t max_tokens = iter_upper.key()._offset;

		// probe key sized for the deepest offset; slot [i] carries the
		// word being probed, earlier slots stay zero for ordering
		mkey._length = max_tokens + 1;
		mkey._array = (size_t*)tmp.allocate(mkey._length * sizeof(size_t));
		memset(mkey._array, 0, mkey._length * sizeof(size_t));

		for (size_t l = 0; l < key._length; ++l)
		{
			for (size_t i = 1; i <= max_tokens; ++i)
			{
				mkey._array[i] = key._array[l];
				mkey._length = i + 1;
				ngram_key_offset pkey(i, &mkey);
				ngram_key_offset_multimap_t::const_iterator it_partial = _ngram_partial_map.lower_bound(pkey);
				while (it_partial != _ngram_partial_map.end()
					&& it_partial.key()._offset == i
					&& it_partial.key()._iter.key()._array[i] == key._array[l]
					)
				{
					size_t pop = it_partial.key()._iter->_ref;
					const ngram_key& nkey = it_partial.key()._iter.key();
					const ngram_key& nkey_origin = it_partial.key()._iter->_origin;

					double interscore = calculate_nkey_score(nkey, key);

					if (interscore < nq_score
						&& candidates.end() == candidates.find(nkey)
						)
					{
						size_t len_diff = __max(nkey._length, key._length) - __min(nkey._length, key._length);

						// average the sorted and original-order dissimilarities
						double interscore_origin = calculate_nkey_score(nkey_origin, key_origin);
						double final_interscore = (interscore + interscore_origin) / 2.0;

						candidate_info info(calculate_score(score, final_interscore, (popularity + pop) / 2, word_penalty + 3*len_diff), (score + final_interscore) / 2.0, (popularity + pop) / 2, word_penalty);
						candidates.insert(all, nkey, info);
					}

					++it_partial;
				}
			}
		}
	}
}
01113
01114
01115
01116 double
01117 fuzzy_matcher_impl::calculate_nkey_score(const ngram_key& key1, const ngram_key& key2)
01118 {
01119 double ret = 0.0;
01120 size_t len1 = key1._length;
01121 size_t len2 = key2._length;
01122
01123 for (size_t i1 = 0, i2 = 0; i1 < len1 && i2 < len2;)
01124 {
01125 if (key1._array[i1] < key2._array[i2])
01126 ++i1;
01127 else if (key2._array[i2] < key1._array[i1])
01128 ++i2;
01129 else
01130 ++i1, ++i2, ret += 2;
01131 }
01132
01133 return (len1 + len2 - ret) / (len1 + len2);
01134 }
01135
01136 word_key
01137 fuzzy_matcher_impl::reconstruct_string(const ngram_key& key, byte_allocator& tmp, bool preserve_case) const
01138 {
01139 _list< word_key > words;
01140 size_t total_bytes = 0;
01141
01142 for (size_t i = 0; i < key._length; ++i)
01143 {
01144 vpk_word_entry_iter_t::const_iterator iter_word_pk = _vpk_word_vocabulary.find(key._array[i]);
01145 assert(iter_word_pk != _vpk_word_vocabulary.end());
01146
01147 word_key desc((*iter_word_pk).key());
01148 total_bytes += desc._len + 1;
01149 words.push_back(tmp, desc);
01150 }
01151
01152 char* ret = (char*)tmp.allocate(total_bytes);
01153 size_t offset = 0;
01154
01155 for (_list< word_key >::const_iterator it = words.begin(); it != words.end(); ++it)
01156 {
01157 if (preserve_case)
01158 {
01159 memcpy(ret + offset, it->_str, it->_len);
01160 }
01161 else
01162 {
01163 for (size_t c = 0; c < it->_len; ++c)
01164 ret[offset + c] = fuzzyphonetic::to_lower(it->_str[c]);
01165 }
01166 offset += it->_len;
01167 ret[offset++] = ' ';
01168 }
01169
01170 ret[total_bytes - 1] = 0;
01171 word_key r(ret, total_bytes - 1);
01172 return r;
01173 }
01174
01175
01176 word_key
01177 fuzzy_matcher_impl::make_string_lower(const char* str, size_t len, byte_allocator& tmp)
01178 {
01179 char* ret = (char*)tmp.allocate(len + 1);
01180 size_t offset = 0;
01181
01182 for (size_t c = 0; c < len; ++c)
01183 ret[offset + c] = fuzzyphonetic::to_lower(str[c]);
01184
01185 ret[len] = 0;
01186 word_key r(ret, len);
01187 return r;
01188 }
01189
01190
01191 word_key
01192 fuzzy_matcher_impl::make_string_lower(const char* str, size_t len, char* buf)
01193 {
01194 size_t offset = 0;
01195
01196 for (size_t c = 0; c < len; ++c)
01197 buf[offset + c] = fuzzyphonetic::to_lower(str[c]);
01198
01199 buf[len] = 0;
01200 word_key r(buf, len);
01201 return r;
01202 }
01203
01204
01205 double
01206 fuzzy_matcher_impl::calculate_score(double phonetic_score, double intersection_score, size_t popularity, size_t penalty)
01207 {
01208 return (0.75 * phonetic_score + 0.9 * intersection_score + 0.25 * penalty + 1. / popularity);
01209 }
01210
01211
01212 ub4_t
01213 fuzzy_matcher_impl::find_and_remove(const metaphone_key& key, ub1_t* array, ub4_t& len)
01214 {
01215 register ub4_t len_substr = key._length;
01216 register ub4_t len_str = len;
01217
01218 if (!len_substr || !len || len_substr > len_str)
01219 return 0;
01220
01221 register ub1_t* p = key._array;
01222 register ub1_t* _this = array;
01223 register ub1_t* _next = 0;
01224
01225
01226 while ((_next = (ub1_t*)memchr(_this, *p, len_str)))
01227 {
01228 register ub4_t consumed = (ub4_t)((_next - array) + len_substr);
01229 if (len < consumed)
01230 return 0;
01231
01232
01233 if (!memcmp(_next, p, len_substr))
01234 {
01235
01236 memmove(_next, _next + len_substr, len - ((_next - array) + len_substr));
01237 len -= len_substr;
01238 return len_substr;
01239 }
01240 else
01241 {
01242 _this = _next + 1;
01243 len_str = (ub4_t)(len - (_this - array));
01244 }
01245 }
01246
01247 return 0;
01248 }
01249
01250 #pragma pack()
01251 END_TERIMBER_NAMESPACE