Home / Open source / Terimber 2.0
fuzzy_matcher_impl Class Reference — fuzzy match library implementation
More...
#include <fuzzyimpl.h>
List of all members.
|
Public Member Functions |
| fuzzy_matcher_impl (size_t memory_usage) |
| constructor
|
virtual | ~fuzzy_matcher_impl () |
| destructor
|
virtual size_t | add (const char *phrase, byte_allocator &all) |
| adds a new n-gram to the internal repository (utf-8); the caller can add the same n-gram many times, and the class maintains an internal reference count
|
virtual bool | remove (const char *phrase, byte_allocator &all) |
| removes the previously added n-gram; the caller can remove the same n-gram many times until the reference count goes to zero
|
virtual bool | remove (size_t ident, byte_allocator &all) |
| removes the previously added n-gram by ident; the caller can remove the same n-gram many times until the reference count goes to zero
|
virtual bool | match (ngram_quality nq, phonetic_quality pq, const char *phrase, byte_allocator &all, byte_allocator &tmp, _list< const char * > &suggestions) const |
| does the fuzzy match
|
virtual bool | match (ngram_quality nq, phonetic_quality fq, const char *phrase, byte_allocator &all, byte_allocator &tmp, _list< size_t > &suggestions) const |
| does the fuzzy match
|
virtual void | reset () |
| clean up engine
|
Private Types |
typedef map< ngram_key_offset,
bool, less< ngram_key_offset >
, true > | ngram_key_offset_multimap_t |
| partial offset map for n-grams
|
typedef _map< ngram_key,
candidate_info > | candidates_container_t |
| maps n-gram key to the candidate information
|
typedef
candidates_container_t::const_iterator | candidates_container_citer_t |
| const iterator of candidates_container_t
|
typedef _vector
< candidates_container_citer_t > | vector_container_citer_t |
| vector of iterators
|
Private Member Functions |
bool | _match (ngram_quality nq, phonetic_quality fq, const char *phrase, byte_allocator &all, byte_allocator &tmp, candidates_container_t &candidates) const |
| performs the fuzzy match
|
void | partial_intersect (ngram_quality nq, phonetic_quality fq, const ngram_key &key, const ngram_key &key_origin, double score, size_t popularity, size_t word_penalty, byte_allocator &all, byte_allocator &tmp, candidates_container_t &candidate) const |
| finds the ngrams that partially intersect
|
word_key | reconstruct_string (const ngram_key &key, byte_allocator &tmp, bool preserve_case) const |
| reconstructs the string from ngram keys
|
Static Private Member Functions |
static word_key | make_string_lower (const char *str, size_t len, byte_allocator &tmp) |
| creates a case insensitive word key
|
static word_key | make_string_lower (const char *str, size_t len, char *buf) |
| creates a case insensitive word key
|
static double | calculate_nkey_score (const ngram_key &key1, const ngram_key &key2) |
| calculates the cross score
|
static double | calculate_score (double phonetic_score, double intersection_score, size_t popularity, size_t penalty) |
| calculates the total score
|
static ub4_t | find_and_remove (const metaphone_key &key, ub1_t *array, ub4_t &len) |
| finds and removes the metaphone key from the reflection array
|
Private Attributes |
byte_repository_factory | _memory_factory |
| memory factory
|
tokenizer | _tokenizer |
| tokenizer
|
unique_key_generator | _vocabulary_pk_generator |
| work key generator
|
unique_key_generator | _ngram_pk_generator |
| ngram key generator
|
unique_key_generator | _metaphone_pk_generator |
| metaphone key generator
|
unique_key_generator | _reflection_pk_generator |
| reflection key generator
|
word_entry_t | _word_vocabulary |
| word map
|
vpk_word_entry_iter_t | _vpk_word_vocabulary |
| reverse word map
|
metaphone_entry_t | _metaphone_vocabulary |
| metaphone map
|
mpk_word_entry_iter_t | _mpk_word_vocabulary |
| reverse metaphone map
|
ngram_entry_t | _ngram_vocabulary |
| ngram map
|
npk_ngram_entry_iter_t | _npk_ngram_vocabulary |
| reverse ngram map
|
reflection_entry_t | _reflection_vocabulary |
| reflection map
|
rpk_ngram_entry_iter_t | _rpk_ngram_vocabulary |
| reverse reflection map
|
ngram_key_offset_multimap_t | _ngram_partial_map |
| partial ngram offsets map
|
Classes |
class | candidate_info |
| match candidate information More...
|
class | candidate_sorter |
| candidate sort predicate More...
|
class | ngram_key_offset |
| n-gram key offset More...
|
Detailed Description
fuzzy match library implementation
Definition at line 381 of file fuzzyimpl.h.
Member Typedef Documentation
maps n-gram key to the candidate information
Definition at line 481 of file fuzzyimpl.h.
const iterator of candidates_container_t
Definition at line 484 of file fuzzyimpl.h.
Constructor & Destructor Documentation
fuzzy_matcher_impl::fuzzy_matcher_impl |
( |
size_t |
memory_usage |
) |
|
fuzzy_matcher_impl::~fuzzy_matcher_impl |
( |
|
) |
[virtual] |
Member Function Documentation
size_t fuzzy_matcher_impl::add |
( |
const char * |
phrase, |
|
|
byte_allocator & |
all | |
|
) |
| | [virtual] |
adds a new n-gram to the internal repository (utf-8); the caller can add the same n-gram many times, and the class maintains an internal reference count
- Parameters:
-
phrase |
input phrase |
all |
external allocator |
Definition at line 198 of file fuzzyimpl.cpp.
References ngram_key::_array, metaphone_key::_array, word_key::_len, ngram_key::_length, metaphone_key::_length, _memory_factory, _metaphone_pk_generator, _metaphone_vocabulary, _mpk_word_vocabulary, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, ngram_entry::_origin, _reflection_pk_generator, _reflection_vocabulary, _rpk_ngram_vocabulary, word_key::_str, _tokenizer, _vocabulary_pk_generator, _vpk_word_vocabulary, _word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), fuzzyphonetic::convert_to_metaphone(), fuzzyphonetic::convert_to_reflection(), base_map< K, T, Pr, M >::end(), base_list< T >::end(), base_map< K, T, Pr, M >::find(), unique_key_generator::generate(), map< K, T, Pr, M >::insert(), base_map< reflection_key, reflection_entry, less< reflection_key >, false >::iterator, base_map< ngram_key, ngram_entry, less< ngram_key >, false >::iterator, base_map< metaphone_key, metaphone_entry, less< metaphone_key >, false >::iterator, base_map< word_key, word_entry, less< word_key >, false >::iterator, MAX_PHRASE_TOKENS, MAX_TOKEN_LENGTH, _list< T, A >::push_back(), _list< T, A >::size(), str_template::strlen(), tokenizer::tokenize(), TT_ALPHABETIC, and TT_DIGIT.
Referenced by fuzzy_wrapper_impl::add().
bool fuzzy_matcher_impl::remove |
( |
const char * |
phrase, |
|
|
byte_allocator & |
all | |
|
) |
| | [virtual] |
removes the previously added n-gram; the caller can remove the same n-gram many times until the reference count goes to zero
- Parameters:
-
phrase |
input phrase |
all |
external allocator |
Definition at line 364 of file fuzzyimpl.cpp.
References ngram_key::_array, ngram_key::_length, _memory_factory, _metaphone_vocabulary, _mpk_word_vocabulary, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, _reflection_vocabulary, _rpk_ngram_vocabulary, _tokenizer, _vocabulary_pk_generator, _vpk_word_vocabulary, _word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), base_map< K, T, Pr, M >::end(), base_list< T >::end(), map< K, T, Pr, M >::erase(), base_map< K, T, Pr, M >::find(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::iterator, base_map< word_key, word_entry, less< word_key >, false >::iterator, base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), MAX_PHRASE_TOKENS, MAX_TOKEN_LENGTH, _list< T, A >::push_back(), unique_key_generator::save(), _list< T, A >::size(), tokenizer::tokenize(), TT_ALPHABETIC, TT_DIGIT, and base_map< K, T, Pr, M >::upper_bound().
Referenced by fuzzy_wrapper_impl::remove().
bool fuzzy_matcher_impl::remove |
( |
size_t |
ident, |
|
|
byte_allocator & |
all | |
|
) |
| | [virtual] |
removes the previously added n-gram by ident; the caller can remove the same n-gram many times until the reference count goes to zero
- Parameters:
-
ident |
input ident |
all |
external allocator |
Definition at line 517 of file fuzzyimpl.cpp.
References _memory_factory, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, _reflection_vocabulary, _rpk_ngram_vocabulary, base_map< K, T, Pr, M >::end(), map< K, T, Pr, M >::erase(), base_map< K, T, Pr, M >::find(), base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), unique_key_generator::save(), and base_map< K, T, Pr, M >::upper_bound().
does the fuzzy match
- Parameters:
-
nq |
ngram quality for matching |
pq |
phonetic quality for matching |
phrase |
input phrase |
all |
external allocator for output container |
tmp |
external temporary allocator |
suggestions |
[out] output list of suggestions |
Definition at line 590 of file fuzzyimpl.cpp.
References _match(), _ngram_vocabulary, word_key::_str, base_vector< T >::begin(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::const_iterator, base_map< K, T, Pr, M >::end(), base_vector< T >::end(), base_map< K, T, Pr, M >::find(), _list< T, A >::push_back(), reconstruct_string(), and byte_allocator::reset().
Referenced by fuzzy_wrapper_impl::match().
does the fuzzy match
- Parameters:
-
nq |
ngram quality for matching |
fq |
phonetic quality for matching |
phrase |
input phrase |
all |
external allocator for output container |
tmp |
external temporary allocator |
suggestions |
[out] output list of suggestion idents |
Definition at line 629 of file fuzzyimpl.cpp.
References _match(), _ngram_vocabulary, base_vector< T >::begin(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::const_iterator, base_map< K, T, Pr, M >::end(), base_vector< T >::end(), base_map< K, T, Pr, M >::find(), _list< T, A >::push_back(), and byte_allocator::reset().
void fuzzy_matcher_impl::reset |
( |
|
) |
[virtual] |
clean up engine
Implements fuzzy_matcher.
Definition at line 169 of file fuzzyimpl.cpp.
References _memory_factory, _metaphone_pk_generator, _metaphone_vocabulary, _mpk_word_vocabulary, _ngram_partial_map, _ngram_pk_generator, _ngram_vocabulary, _npk_ngram_vocabulary, _reflection_pk_generator, _reflection_vocabulary, _rpk_ngram_vocabulary, _tokenizer, _vocabulary_pk_generator, _vpk_word_vocabulary, _word_vocabulary, map< K, T, Pr, M >::clear(), unique_key_generator::clear(), tokenizer::clear_abbr(), and tokenizer::clear_regex().
performs the fuzzy match
- Parameters:
-
nq |
ngram quality for matching |
fq |
phonetic quality for matching |
phrase |
input phrase |
all |
external allocator for output container |
tmp |
external temporary allocator |
candidates |
[out] output candidate container |
Definition at line 661 of file fuzzyimpl.cpp.
References metaphone_key::_array, ngram_key::_array, word_key::_len, string_desc::_len, metaphone_key::_length, ngram_key::_length, _metaphone_vocabulary, _mpk_word_vocabulary, _reflection_vocabulary, _rpk_ngram_vocabulary, word_key::_str, string_desc::_str, _tokenizer, string_desc::_vpk, _vpk_word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), calculate_score(), fuzzyphonetic::convert_to_metaphone(), fuzzyphonetic::convert_to_reflection(), base_map< K, T, Pr, M >::empty(), base_map< K, T, Pr, M >::end(), base_list< T >::end(), _list< T, A >::erase(), base_map< K, T, Pr, M >::find(), find_and_remove(), fuzzyphonetic::find_word_distance(), pair< T1, T2 >::first, _map< K, T, A, Pr, M >::insert(), base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), make_string_lower(), MAX_PHRASE_TOKENS, MAX_TOKEN_LENGTH, lookup_distance< C, P, Func >::next(), nq_high, partial_intersect(), _list< T, A >::pop_back(), pq_high, pq_normal, _list< T, A >::push_back(), reconstruct_string(), pair< T1, T2 >::second, _list< T, A >::size(), str_template::strlen(), fuzzyphonetic::to_lower(), tokenizer::tokenize(), TT_ALPHABETIC, TT_DIGIT, wrapper_find_metaphone_distance(), and wrapper_find_reflection_distance().
Referenced by match().
finds the ngrams that partially intersect
- Parameters:
-
nq |
ngram quality for matching |
fq |
phonetic quality for matching |
key |
input sorted n-gram key |
key_origin |
input original n-gram key |
score |
match score |
popularity |
n-gram popularity |
word_penalty |
word mismatch penalty |
all |
external allocator for output container |
tmp |
external temporary allocator |
candidate |
[out] output candidate container |
Definition at line 1008 of file fuzzyimpl.cpp.
References ngram_key::_array, ngram_key::_length, _ngram_partial_map, _ngram_vocabulary, byte_allocator::allocate(), calculate_nkey_score(), calculate_score(), base_map< ngram_key, ngram_entry, less< ngram_key >, false >::const_iterator, base_map< K, T, Pr, M >::empty(), base_map< K, T, Pr, M >::end(), base_map< K, T, Pr, M >::find(), _map< K, T, A, Pr, M >::insert(), base_map< K, T, Pr, M >::const_iterator::key(), base_map< K, T, Pr, M >::lower_bound(), nq_high, and nq_normal.
Referenced by _match().
reconstructs the string from ngram keys
- Parameters:
-
key |
input ngram key |
tmp |
external temporary allocator |
preserve_case |
preserve word case |
Definition at line 1137 of file fuzzyimpl.cpp.
References ngram_key::_array, ngram_key::_length, _vpk_word_vocabulary, byte_allocator::allocate(), base_list< T >::begin(), base_list< T >::end(), base_map< K, T, Pr, M >::end(), base_map< K, T, Pr, M >::find(), _list< T, A >::push_back(), and fuzzyphonetic::to_lower().
Referenced by _match(), and match().
word_key fuzzy_matcher_impl::make_string_lower |
( |
const char * |
str, |
|
|
size_t |
len, |
|
|
byte_allocator & |
tmp | |
|
) |
| | [static, private] |
word_key fuzzy_matcher_impl::make_string_lower |
( |
const char * |
str, |
|
|
size_t |
len, |
|
|
char * |
buf | |
|
) |
| | [static, private] |
double fuzzy_matcher_impl::calculate_nkey_score |
( |
const ngram_key & |
key1, |
|
|
const ngram_key & |
key2 | |
|
) |
| | [static, private] |
double fuzzy_matcher_impl::calculate_score |
( |
double |
phonetic_score, |
|
|
double |
intersection_score, |
|
|
size_t |
popularity, |
|
|
size_t |
penalty | |
|
) |
| | [static, private] |
calculates the total score
- Parameters:
-
phonetic_score |
phonetic score |
intersection_score |
n-gram intersection score |
popularity |
words popularity |
penalty |
words mismatch penalty |
Definition at line 1206 of file fuzzyimpl.cpp.
Referenced by _match(), and partial_intersect().
Member Data Documentation
The documentation for this class was generated from the following files:
|
|