|
Home / Open source / Terimber 2.0
fuzzy_matcher_impl Class Reference: fuzzy match library implementation
More...
#include <fuzzyimpl.h>
List of all members.
|
Public Member Functions |
| | fuzzy_matcher_impl (size_t memory_usage) |
| | constructor
|
| virtual | ~fuzzy_matcher_impl () |
| | destructor
|
| virtual size_t | add (const char *phrase, byte_allocator &all) |
| | adds a new n-gram to the internal repository (UTF-8); the caller can add the same n-gram many times, and the class maintains an internal reference count
|
| virtual bool | remove (const char *phrase, byte_allocator &all) |
| | removes the previously added n-gram; the caller can remove the same n-gram many times until the reference count goes to zero
|
| virtual bool | remove (size_t ident, byte_allocator &all) |
| | removes the previously added n-gram by ident; the caller can remove the same n-gram many times until the reference count goes to zero
|
| virtual bool | match (ngram_quality nq, phonetic_quality pq, const char *phrase, byte_allocator &all, byte_allocator &tmp, _list< const char * > &suggestions) const |
| | does the fuzzy match
|
| virtual bool | match (ngram_quality nq, phonetic_quality fq, const char *phrase, byte_allocator &all, byte_allocator &tmp, _list< size_t > &suggestions) const |
| | does the fuzzy match
|
| virtual void | reset () |
| | clean up engine
|
Private Types |
typedef map< ngram_key_offset,
bool, less< ngram_key_offset >
, true > | ngram_key_offset_multimap_t |
| | partial offset map for n-grams
|
typedef _map< ngram_key,
candidate_info > | candidates_container_t |
| | maps n-gram key to the candidate information
|
typedef
candidates_container_t::const_iterator | candidates_container_citer_t |
| | const iterator of candidates_container_t
|
typedef _vector
< candidates_container_citer_t > | vector_container_citer_t |
| | vector of iterators
|
Private Member Functions |
| bool | _match (ngram_quality nq, phonetic_quality fq, const char *phrase, byte_allocator &all, byte_allocator &tmp, candidates_container_t &candidates) const |
| | performs the fuzzy match
|
| void | partial_intersect (ngram_quality nq, phonetic_quality fq, const ngram_key &key, const ngram_key &key_origin, double score, size_t popularity, size_t word_penalty, byte_allocator &all, byte_allocator &tmp, candidates_container_t &candidate) const |
| | finds the ngrams that partially intersect
|
| word_key | reconstruct_string (const ngram_key &key, byte_allocator &tmp, bool preserve_case) const |
| | reconstructs the string from ngram keys
|
Static Private Member Functions |
| static word_key | make_string_lower (const char *str, size_t len, byte_allocator &tmp) |
| | creates a case insensitive word key
|
| static word_key | make_string_lower (const char *str, size_t len, char *buf) |
| | creates a case insensitive word key
|
| static double | calculate_nkey_score (const ngram_key &key1, const ngram_key &key2) |
| | calculates the cross score
|
| static double | calculate_score (double phonetic_score, double intersection_score, size_t popularity, size_t penalty) |
| | calculates the total score
|
| static ub4_t | find_and_remove (const metaphone_key &key, ub1_t *array, ub4_t &len) |
| | finds and removes the metaphone key from the reflection array
|
Private Attributes |
| byte_repository_factory | _memory_factory |
| | memory factory
|
| tokenizer | _tokenizer |
| | tokenizer
|
| unique_key_generator | _vocabulary_pk_generator |
| | word key generator
|
| unique_key_generator | _ngram_pk_generator |
| | ngram key generator
|
| unique_key_generator | _metaphone_pk_generator |
| | metaphone key generator
|
| unique_key_generator | _reflection_pk_generator |
| | reflection key generator
|
| word_entry_t | _word_vocabulary |
| | word map
|
| vpk_word_entry_iter_t | _vpk_word_vocabulary |
| | reverse word map
|
| metaphone_entry_t | _metaphone_vocabulary |
| | metaphone map
|
| mpk_word_entry_iter_t | _mpk_word_vocabulary |
| | reverse metaphone map
|
| ngram_entry_t | _ngram_vocabulary |
| | ngram map
|
| npk_ngram_entry_iter_t | _npk_ngram_vocabulary |
| | reverse ngram map
|
| reflection_entry_t | _reflection_vocabulary |
| | reflection map
|
| rpk_ngram_entry_iter_t | _rpk_ngram_vocabulary |
| | reverse reflection map
|
| ngram_key_offset_multimap_t | _ngram_partial_map |
| | partial ngram offsets map
|
Classes |
| class | candidate_info |
| | match candidate information More...
|
| class | candidate_sorter |
| | candidate sort predicate More...
|
| class | ngram_key_offset |
| | n-gram key offset More...
|
Detailed Description
fuzzy match library implementation
Definition at line 381 of file fuzzyimpl.h.
Member Typedef Documentation
maps n-gram key to the candidate information
Definition at line 481 of file fuzzyimpl.h.
const iterator of candidates_container_t
Definition at line 484 of file fuzzyimpl.h.
Constructor & Destructor Documentation
| fuzzy_matcher_impl::fuzzy_matcher_impl |
( |
size_t |
memory_usage |
) |
|
| fuzzy_matcher_impl::~fuzzy_matcher_impl |
( |
|
) |
[virtual] |
Member Function Documentation
| size_t fuzzy_matcher_impl::add |
( |
const char * |
phrase, |
|
|
byte_allocator & |
all | |
|
) |
| | [virtual] |
adds a new n-gram to the internal repository (UTF-8); the caller can add the same n-gram many times, and the class maintains an internal reference count
- Parameters:
-
| phrase |
input phrase |
| all |
external allocator |
Definition at line 198 of file fuzzyimpl.cpp.
References ngram_key::_array, metaphone_key::_array, word_key::_len, ngram_key::_length, metaphone_key::_length, _memory_factory, _metaphone_pk_generator, _metaphone_vocabulary, _mpk_word_vocabulary, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, ngram_entry::_origin, _reflection_pk_generator, _reflection_vocabulary, _rpk_ngram_vocabulary, word_key::_str, _tokenizer, _vocabulary_pk_generator, _vpk_word_vocabulary, _word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), fuzzyphonetic::convert_to_metaphone(), fuzzyphonetic::convert_to_reflection(), base_map< K, T, Pr, M >::end(), base_list< T >::end(), base_map< K, T, Pr, M >::find(), unique_key_generator::generate(), map< K, T, Pr, M >::insert(), base_map< reflection_key, reflection_entry, less< reflection_key >, false >::iterator, base_map< ngram_key, ngram_entry, less< ngram_key >, false >::iterator, base_map< metaphone_key, metaphone_entry, less< metaphone_key >, false >::iterator, base_map< word_key, word_entry, less< word_key >, false >::iterator, MAX_PHRASE_TOKENS, MAX_TOKEN_LENGTH, _list< T, A >::push_back(), _list< T, A >::size(), str_template::strlen(), tokenizer::tokenize(), TT_ALPHABETIC, and TT_DIGIT.
Referenced by fuzzy_wrapper_impl::add().
| bool fuzzy_matcher_impl::remove |
( |
const char * |
phrase, |
|
|
byte_allocator & |
all | |
|
) |
| | [virtual] |
removes the previously added n-gram; the caller can remove the same n-gram many times until the reference count goes to zero
- Parameters:
-
| phrase |
input phrase |
| all |
external allocator |
Definition at line 364 of file fuzzyimpl.cpp.
References ngram_key::_array, ngram_key::_length, _memory_factory, _metaphone_vocabulary, _mpk_word_vocabulary, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, _reflection_vocabulary, _rpk_ngram_vocabulary, _tokenizer, _vocabulary_pk_generator, _vpk_word_vocabulary, _word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), base_map< K, T, Pr, M >::end(), base_list< T >::end(), map< K, T, Pr, M >::erase(), base_map< K, T, Pr, M >::find(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::iterator, base_map< word_key, word_entry, less< word_key >, false >::iterator, base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), MAX_PHRASE_TOKENS, MAX_TOKEN_LENGTH, _list< T, A >::push_back(), unique_key_generator::save(), _list< T, A >::size(), tokenizer::tokenize(), TT_ALPHABETIC, TT_DIGIT, and base_map< K, T, Pr, M >::upper_bound().
Referenced by fuzzy_wrapper_impl::remove().
| bool fuzzy_matcher_impl::remove |
( |
size_t |
ident, |
|
|
byte_allocator & |
all | |
|
) |
| | [virtual] |
removes the previously added n-gram by ident; the caller can remove the same n-gram many times until the reference count goes to zero
- Parameters:
-
| ident |
input ident |
| all |
external allocator |
Definition at line 517 of file fuzzyimpl.cpp.
References _memory_factory, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, _reflection_vocabulary, _rpk_ngram_vocabulary, base_map< K, T, Pr, M >::end(), map< K, T, Pr, M >::erase(), base_map< K, T, Pr, M >::find(), base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), unique_key_generator::save(), and base_map< K, T, Pr, M >::upper_bound().
does the fuzzy match
- Parameters:
-
| nq |
ngram quality for matching |
| pq |
phonetic quality for matching |
| phrase |
input phrase |
| all |
external allocator for output container |
| tmp |
external temporary allocator |
| suggestions |
[out] output list of suggestions |
Definition at line 590 of file fuzzyimpl.cpp.
References _match(), _ngram_vocabulary, word_key::_str, base_vector< T >::begin(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::const_iterator, base_map< K, T, Pr, M >::end(), base_vector< T >::end(), base_map< K, T, Pr, M >::find(), _list< T, A >::push_back(), reconstruct_string(), and byte_allocator::reset().
Referenced by fuzzy_wrapper_impl::match().
does the fuzzy match
- Parameters:
-
| nq |
ngram quality for matching |
| fq |
phonetic quality for matching |
| phrase |
input phrase |
| all |
external allocator for output container |
| tmp |
external temporary allocator |
| suggestions |
[out] output list of suggestion idents |
Definition at line 629 of file fuzzyimpl.cpp.
References _match(), _ngram_vocabulary, base_vector< T >::begin(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::const_iterator, base_map< K, T, Pr, M >::end(), base_vector< T >::end(), base_map< K, T, Pr, M >::find(), _list< T, A >::push_back(), and byte_allocator::reset().
| void fuzzy_matcher_impl::reset |
( |
|
) |
[virtual] |
clean up engine
Implements fuzzy_matcher.
Definition at line 169 of file fuzzyimpl.cpp.
References _memory_factory, _metaphone_pk_generator, _metaphone_vocabulary, _mpk_word_vocabulary, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, _reflection_pk_generator, _reflection_vocabulary, _rpk_ngram_vocabulary, _tokenizer, _vocabulary_pk_generator, _vpk_word_vocabulary, _word_vocabulary, map< K, T, Pr, M >::clear(), unique_key_generator::clear(), tokenizer::clear_abbr(), and tokenizer::clear_regex().
performs the fuzzy match
- Parameters:
-
| nq |
ngram quality for matching |
| fq |
phonetic quality for matching |
| phrase |
input phrase |
| all |
external allocator for output container |
| tmp |
external temporary allocator |
| candidates |
[out] output candidate container |
Definition at line 661 of file fuzzyimpl.cpp.
References metaphone_key::_array, ngram_key::_array, word_key::_len, string_desc::_len, metaphone_key::_length, ngram_key::_length, _metaphone_vocabulary, _mpk_word_vocabulary, _reflection_vocabulary, _rpk_ngram_vocabulary, word_key::_str, string_desc::_str, _tokenizer, string_desc::_vpk, _vpk_word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), calculate_score(), fuzzyphonetic::convert_to_metaphone(), fuzzyphonetic::convert_to_reflection(), base_map< K, T, Pr, M >::empty(), base_map< K, T, Pr, M >::end(), base_list< T >::end(), _list< T, A >::erase(), base_map< K, T, Pr, M >::find(), find_and_remove(), fuzzyphonetic::find_word_distance(), pair< T1, T2 >::first, _map< K, T, A, Pr, M >::insert(), base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), make_string_lower(), MAX_PHRASE_TOKENS, MAX_TOKEN_LENGTH, lookup_distance< C, P, Func >::next(), nq_high, partial_intersect(), _list< T, A >::pop_back(), pq_high, pq_normal, _list< T, A >::push_back(), reconstruct_string(), pair< T1, T2 >::second, _list< T, A >::size(), str_template::strlen(), fuzzyphonetic::to_lower(), tokenizer::tokenize(), TT_ALPHABETIC, TT_DIGIT, wrapper_find_metaphone_distance(), and wrapper_find_reflection_distance().
Referenced by match().
finds the ngrams that partially intersect
- Parameters:
-
| nq |
ngram quality for matching |
| fq |
phonetic quality for matching |
| key |
input sorted n-gram key |
| key_origin |
input original n-gram key |
| score |
match score |
| popularity |
n-gram popularity |
| word_penalty |
word mismatch penalty |
| all |
external allocator for output container |
| tmp |
external temporary allocator |
| candidate |
[out] output candidate container |
Definition at line 1008 of file fuzzyimpl.cpp.
References ngram_key::_array, ngram_key::_length, _ngram_partial_map, _ngram_vocabulary, byte_allocator::allocate(), calculate_nkey_score(), calculate_score(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::const_iterator, base_map< K, T, Pr, M >::empty(), base_map< K, T, Pr, M >::end(), base_map< K, T, Pr, M >::find(), _map< K, T, A, Pr, M >::insert(), base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), nq_high, and nq_normal.
Referenced by _match().
reconstructs the string from ngram keys
- Parameters:
-
| key |
input ngram key |
| tmp |
external temporary allocator |
| preserve_case |
preserve word case |
Definition at line 1137 of file fuzzyimpl.cpp.
References ngram_key::_array, ngram_key::_length, _vpk_word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), base_list< T >::end(), base_map< K, T, Pr, M >::end(), base_map< K, T, Pr, M >::find(), _list< T, A >::push_back(), and fuzzyphonetic::to_lower().
Referenced by _match(), and match().
| word_key fuzzy_matcher_impl::make_string_lower |
( |
const char * |
str, |
|
|
size_t |
len, |
|
|
byte_allocator & |
tmp | |
|
) |
| | [static, private] |
| word_key fuzzy_matcher_impl::make_string_lower |
( |
const char * |
str, |
|
|
size_t |
len, |
|
|
char * |
buf | |
|
) |
| | [static, private] |
| double fuzzy_matcher_impl::calculate_nkey_score |
( |
const ngram_key & |
key1, |
|
|
const ngram_key & |
key2 | |
|
) |
| | [static, private] |
| double fuzzy_matcher_impl::calculate_score |
( |
double |
phonetic_score, |
|
|
double |
intersection_score, |
|
|
size_t |
popularity, |
|
|
size_t |
penalty | |
|
) |
| | [static, private] |
calculates the total score
- Parameters:
-
| phonetic_score |
phonetic score |
| intersection_score |
n-gram intersection score |
| popularity |
words popularity |
| penalty |
words mismatch penalty |
Definition at line 1206 of file fuzzyimpl.cpp.
Referenced by _match(), and partial_intersect().
Member Data Documentation
The documentation for this class was generated from the following files:
|
|