#include "tokenizer/tokenizer.h"
#include "base/list.hpp"
#include "base/map.hpp"
#include "base/string.hpp"
#include "base/common.hpp"
#include "base/memory.hpp"
#include "xml/defxml.hpp"

BEGIN_TERIMBER_NAMESPACE
#pragma pack(4)

// case-insensitive hash over at most len characters of x;
// used to key the abbreviation map so lookups ignore case
static
inline
size_t
do_hash_lowercase(const char* x, size_t len)
{
	size_t res = 0;
	if (x) while (len-- && *x) res = (res << 1) ^ tolower(*x++);
	return res;
}
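// e.g. do_hash_lowercase("Dr", 2) and do_hash_lowercase("dR", 2) yield the
// same key, so an abbreviation stored as "dr" is found for the token "Dr".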

// classifies a single character into a coarse token category;
// any character not listed below is treated as alphabetic
static
inline
token_type
detect_token_type(char symbol)
{
	switch (symbol)
	{
		case ch_space:
		case ch_hor_tab:
		case ch_lf:
		case ch_ver_tab:
		case ch_ff:
		case ch_caret:
			return TT_WHITESPACE;
		case ch_dash:
		case ch_ampersand:
		case ch_asterisk:
		case ch_at:
		case ch_back_slash:
		case ch_close_angle:
		case ch_close_curly:
		case ch_close_square:
		case ch_close_paren:
		case ch_dollar_sign:
		case ch_double_quote:
		case ch_equal:
		case ch_forward_slash:
		case ch_grave:
		case ch_open_angle:
		case ch_open_curly:
		case ch_open_square:
		case ch_open_paren:
		case ch_percent:
		case ch_pipe:
		case ch_plus:
		case ch_pound:
		case ch_tilde:
		case ch_underscore:
		case ch_single_quote:
			return TT_SYMBOL;
		case ch_colon:
		case ch_semicolon:
		case ch_bang:
		case ch_question:
		case ch_comma:
			return TT_PUNCTUATION;
		case ch_period:
			return TT_DOT;
		case ch_0:
		case ch_1:
		case ch_2:
		case ch_3:
		case ch_4:
		case ch_5:
		case ch_6:
		case ch_7:
		case ch_8:
		case ch_9:
			return TT_DIGIT;
		default:
			return TT_ALPHABETIC;
	}
}
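// e.g. '-' maps to TT_SYMBOL, ';' to TT_PUNCTUATION, '.' to TT_DOT,
// '7' to TT_DIGIT, and an unlisted byte such as 'a' to TT_ALPHABETIC.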

tokenizer::tokenizer()
{
}

tokenizer::~tokenizer()
{
	clear();
}

void
tokenizer::clear()
{
	clear_abbr();
	clear_regex();
}


bool
tokenizer::add_regex(const char* regex, size_t key, size_t min, size_t max)
{
	const char* error = 0;
	int erroffset = 0;
	pcre* obj = pcre_compile(regex, PCRE_ANCHORED, &error, &erroffset, NULL);
	if (!obj)
	{
		_error = error;
		return false;
	}

	// study the pattern for faster matching; a NULL result is
	// acceptable and simply means no extra optimization data
	pcre_extra* extra = pcre_study(obj, 0, &error);

	pcre_key k(min, max);
	pcre_entry e(obj, extra, key);

	return _regex_map.end() != _regex_map.insert(k, e).first;
}
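// usage sketch; key is a caller-defined tag, min/max bound the token window
// (the error accessor shown here is assumed, only _error exists in this file):
//
//   tokenizer t;
//   if (!t.add_regex("[0-9]{3}-[0-9]{4}", 1, 1, 3))
//       report(t.get_last_error()); // hypothetical accessor for _error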

bool
tokenizer::load_regex(const char* file_name)
{
	FILE* f = fopen(file_name, "r");

	if (!f)
	{
		_error = "Can't open file: ";
		_error += file_name;
		return false; // without this the NULL handle would be passed to fgets
	}

	char buf[4096];

	// each line: regex TAB key TAB min TAB max TAB ...
	while (fgets(buf, 4095, f))
	{
		char* key_tab = strchr(buf, '\t');
		if (!key_tab)
		{
			_error = "Wrong file format, can't find key TAB after regular expression";
			fclose(f);
			return false;
		}

		char* min_tab = strchr(key_tab + 1, '\t');
		if (!min_tab)
		{
			_error = "Wrong file format, can't find min TAB after key";
			fclose(f);
			return false;
		}

		char* max_tab = strchr(min_tab + 1, '\t');
		if (!max_tab)
		{
			_error = "Wrong file format, can't find max TAB after min";
			fclose(f);
			return false;
		}

		// split the line in place
		*key_tab = 0;
		*min_tab = 0;
		*max_tab = 0;

		ub4_t key = 0;
		ub4_t min = 0;
		ub4_t max = 0;

		if (1 != str_template::strscan(key_tab + 1, 32, "%u", &key))
		{
			_error = "Wrong file data, can't convert key to a number";
			fclose(f);
			return false;
		}

		if (1 != str_template::strscan(min_tab + 1, 32, "%u", &min))
		{
			_error = "Wrong file data, can't convert min to a number";
			fclose(f);
			return false;
		}

		if (1 != str_template::strscan(max_tab + 1, 32, "%u", &max))
		{
			_error = "Wrong file data, can't convert max to a number";
			fclose(f);
			return false;
		}

		if (!min || min > max)
		{
			_error = "Wrong file data, min must be > 0 and <= max";
			fclose(f);
			return false;
		}

		if (!add_regex(buf, key, min, max))
		{
			fclose(f);
			return false;
		}
	}

	fclose(f);
	return true;
}
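// expected regex file layout: one rule per line, tab-separated, with a
// fourth TAB required after max (the tail after it is ignored), e.g.:
//
//   [0-9]{3}-[0-9]{4}<TAB>1<TAB>1<TAB>3<TAB>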

void
tokenizer::clear_regex()
{
	for (regex_map_t::iterator i = _regex_map.begin(); i != _regex_map.end(); ++i)
	{
		// the compiled pattern and study data come from the pcre
		// allocator, so release them through pcre_free, not free
		if (i->_this) pcre_free(i->_this);
		if (i->_extra) pcre_free(i->_extra);
	}

	_regex_map.clear();
}

bool
tokenizer::add_abbreviation(const char* abbr)
{
	size_t key = do_hash_lowercase(abbr, os_minus_one);
	string_t val(abbr);

	return _abbr_map.end() != _abbr_map.insert(key, val).first;
}

bool
tokenizer::load_abbr(const char* file_name)
{
	FILE* f = fopen(file_name, "r");

	if (!f)
	{
		_error = "Can't open file: ";
		_error += file_name;
		return false; // without this the NULL handle would be passed to fgets
	}

	char buf[512];

	// each line: abbreviation terminated by a dot
	while (fgets(buf, 511, f))
	{
		char* dot = strchr(buf, '.');
		if (!dot)
		{
			_error = "Wrong file data, can't find final dot";
			fclose(f);
			return false;
		}

		// cut the line at the dot, leaving just the abbreviation
		*dot = 0;

		if (!add_abbreviation(buf))
		{
			fclose(f);
			return false;
		}
	}

	fclose(f);
	return true;
}
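// expected abbreviation file layout: one entry per line, each terminated
// by a dot (anything after the first dot is ignored), e.g.:
//
//   Dr.
//   Mr.
//   etc.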

void
tokenizer::clear_abbr()
{
	_abbr_map.clear();
}

bool
tokenizer::tokenize(const char* str, tokenizer_output_sequence_t& out, byte_allocator& all, size_t flags) const
{
	if (!str)
	{
		return false;
	}

	const char* x = str;
	const char* begin = 0;

	tokenizer_output_sequence_t tmp_list;

	token_type prev_tt = TT_UNKNOWN, curr_tt = TT_UNKNOWN;

	// first pass: split the input into runs of uniform character classes
	while (*x)
	{
		switch (curr_tt = detect_token_type(*x))
		{
			case TT_WHITESPACE:
			case TT_ALPHABETIC:
			case TT_DIGIT:
				// these classes accumulate into multi-character tokens
				if (prev_tt == curr_tt)
					break;
				// fall through: the class changed, close the previous token
			default:
				if (begin)
				{
					token_info item(prev_tt, x - begin, 0);
					tmp_list.push_back(all, item);
				}

				begin = x;
		}

		prev_tt = curr_tt;
		++x;
	}

	// flush the trailing token, if any
	if (begin)
	{
		token_info item(prev_tt, x - begin, 0);
		tmp_list.push_back(all, item);
	}

	// second pass: optional merging phases
	if (flags & T_REGEX)
		do_regex(str, tmp_list);

	if (flags & T_ABBR)
		do_abbr(str, tmp_list);

	if (flags & T_HYPHEN)
		do_hyphen(str, tmp_list);

	out.clear();

	for (tokenizer_output_sequence_t::const_iterator i = tmp_list.begin(); i != tmp_list.end(); ++i)
		out.push_back(all, *i);

	return true;
}
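// usage sketch, assuming byte_allocator is default-constructible the way
// the base library's allocators usually are:
//
//   byte_allocator all;
//   tokenizer t;
//   t.load_abbr("abbr.txt");
//   tokenizer_output_sequence_t tokens;
//   t.tokenize("Dr. Smith arrived.", tokens, all, T_ABBR);
//   // tokens now holds "Dr." (TT_ABBR), whitespace, "Smith", ...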

// runs every pattern whose token-count window admits 'tokens' against the
// first 'len' bytes of x; returns the longest match length found and
// reports the winning pattern's key
size_t
tokenizer::match(const char* x, size_t len, size_t tokens, size_t& key) const
{
	size_t ret = 0;
	pcre_key k(1, tokens);
	regex_map_t::const_iterator i = _regex_map.lower_bound(k);

	while (i != _regex_map.end()
		&& i.key()._min <= tokens)
	{
		size_t length = 0;
		int ovector[30];
		int rc = pcre_exec(i->_this, i->_extra, x, (int)len, 0, PCRE_ANCHORED, ovector, 30);

		if (rc >= 0)
		{
			length = ovector[1] - ovector[0];
			if (length > ret)
			{
				ret = length;
				key = i->_key;
			}
		}

		++i;
	}

	return ret;
}

void
tokenizer::do_regex(const char* phrase, tokenizer_output_sequence_t& tokens) const
{
	if (_regex_map.empty())
		return;

	size_t offset = 0;
	tokenizer_output_sequence_t::iterator i_first = tokens.begin(), i_second = tokens.begin();

	// use the upper-bound entry to size the largest token window to try
	pcre_key k(1, 0xffffffff);
	regex_map_t::const_iterator u = _regex_map.upper_bound(k);
	assert(u != _regex_map.end());

	size_t max_count = u.key()._max;

	while (i_first != tokens.end())
	{
		tokenizer_output_sequence_t::iterator i_matched = tokens.end();
		size_t count = 1, len = 0, maxlen = 0;
		size_t maxmatch = 0;
		size_t maxkey = 0;

		// grow the window token by token and remember the longest match
		for (i_second = i_first; i_second != tokens.end() && count <= max_count; ++i_second, ++count)
		{
			len += i_second->_len;
			size_t key = 0;
			size_t matched = match(phrase + offset, len, count, key);

			if (matched > maxmatch)
			{
				i_matched = i_second;
				maxlen = len;
				maxkey = key;
				maxmatch = matched;
			}
		}

		if (i_matched != tokens.end())
		{
			// collapse the matched window into a single TT_REGEX token
			i_first->_len = maxlen;
			i_first->_type = TT_REGEX;
			i_first->_key = maxkey;

			if (i_first != i_matched)
			{
				i_second = i_first;
				tokens.erase(++i_second, ++i_matched);
			}

			offset += maxlen;
			++i_first;
		}
		else
		{
			offset += i_first->_len;
			++i_first;
		}
	}
}
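// e.g. with a rule like "[0-9]{3}-[0-9]{4}" registered over a 1..3 token
// window, the three tokens "555", "-", "1234" collapse into one TT_REGEX
// token covering "555-1234".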

void
tokenizer::do_abbr(const char* phrase, tokenizer_output_sequence_t& tokens) const
{
	size_t offset = 0;
	for (tokenizer_output_sequence_t::iterator i_first = tokens.begin(); i_first != tokens.end(); ++i_first)
	{
		switch (i_first->_type)
		{
			case TT_ALPHABETIC:
			{
				tokenizer_output_sequence_t::iterator i_second = i_first;
				++i_second;

				// check i_second, not i_first, before dereferencing it
				if (i_second != tokens.end()
					&& i_second->_type == TT_DOT)
				{
					// walk all abbreviations sharing this hash bucket
					size_t hash_value = do_hash_lowercase(phrase + offset, i_first->_len);
					abbreviation_map_t::const_iterator i_find = _abbr_map.lower_bound(hash_value);

					while (i_find != _abbr_map.end()
						&& i_find.key() == hash_value)
					{
						if (i_first->_len == i_find->length()
							&& !str_template::strnocasecmp((const char*)*i_find, phrase + offset, i_first->_len)
							)
						{
							// absorb the dot into the abbreviation token
							i_first->_len += 1;
							i_first->_type = TT_ABBR;
							tokens.erase(i_second);
							break;
						}

						++i_find;
					}
				}
			}
			break;
			default:
			break;
		}

		offset += i_first->_len;
	}
}
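// e.g. with "Dr." loaded, the token pair "Dr" + "." becomes a single
// TT_ABBR token of length 3; "Drive" + "." is left alone because both
// the hash and the length comparison fail.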

void
tokenizer::do_hyphen(const char* phrase, tokenizer_output_sequence_t& tokens) const
{
	size_t offset = 0;

	for (tokenizer_output_sequence_t::iterator i_first = tokens.begin(); i_first != tokens.end(); ++i_first)
	{
		switch (i_first->_type)
		{
			case TT_ALPHABETIC:
			case TT_DIGIT:
			{
				tokenizer_output_sequence_t::iterator i_second = i_first;
				// the flip-flop alternates between expecting a word and a dash
				bool dash_flip_flop = false;
				size_t count = 0;
				size_t len = 0;

				while (i_second != tokens.end()
					&& ((!dash_flip_flop && (i_second->_type == TT_ALPHABETIC || i_second->_type == TT_DIGIT))
						|| (dash_flip_flop && i_second->_type == TT_SYMBOL && *(phrase + offset + len) == ch_dash))
					)
				{
					if (!dash_flip_flop)
					{
						// count the word plus the dash that preceded it
						len += i_second->_len + (count ? 1 : 0);
						count += (count ? 2 : 1);
					}

					dash_flip_flop = !dash_flip_flop;
					++i_second;
				}

				// count > 3 means at least three dash-joined parts
				if (count > 3)
				{
					i_first->_len = len;
					i_first->_type = TT_COMPOSE;
					i_second = i_first;
					++i_second;
					while (--count && i_second != tokens.end())
						i_second = tokens.erase(i_second);
				}
			}
			break;
			default:
			break;
		}

		offset += i_first->_len;
	}
}
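// e.g. "state-of-the-art" (four parts, count == 7) collapses into one
// TT_COMPOSE token of length 16; a simple pair such as "full-time"
// (count == 3) stays split, since composition requires count > 3.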

#pragma pack()
END_TERIMBER_NAMESPACE