Terimber Corporation

00001 /*
00002  * The Software License
00003  * =================================================================================
00004  * Copyright (c) 2003-.The Terimber Corporation. All rights reserved.
00005  * =================================================================================
00006  * Redistributions of source code must retain the above copyright notice, 
00007  * this list of conditions and the following disclaimer.
00008  * Redistributions in binary form must reproduce the above copyright notice, 
00009  * this list of conditions and the following disclaimer in the documentation 
00010  * and/or other materials provided with the distribution.
00011  * The end-user documentation included with the redistribution, if any, 
00012  * must include the following acknowledgment:
00013  * "This product includes software developed by the Terimber Corporation."
00014  * =================================================================================
00015  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, 
00016  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
00017  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  
00018  * IN NO EVENT SHALL THE TERIMBER CORPORATION OR ITS CONTRIBUTORS BE LIABLE FOR 
00019  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
00020  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
00021  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
00022  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00023  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00024  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00025  * ================================================================================
00026 */
00027 
00028 #ifndef _terimber_tokenizer_h_
00029 #define _terimber_tokenizer_h_
00030 
00031 #include "allinc.h"
00032 #include "base/list.h"
00033 #include "base/map.h"
00034 #include "base/string.h"
00035 #include "pcre/pcre.h"
00036 
00037 BEGIN_TERIMBER_NAMESPACE
00038 #pragma pack(4)
00039 
// Bit flags accepted by the `flags` argument of tokenizer::tokenize(), whose default is T_ALL.
// NOTE(review): each flag presumably switches on the private pass of the same name
// (do_regex / do_abbr / do_hyphen) - confirm against the implementation.
00040 const size_t T_REGEX = 0x00000001;   // bit 0
00041 const size_t T_ABBR = 0x00000002;    // bit 1
00042 const size_t T_HYPHEN = 0x00000004;  // bit 2
00043 const size_t T_ALL = T_REGEX | T_ABBR | T_HYPHEN;  // bitwise OR of the three flags above (0x00000007)
00044 
// Category assigned to a token; this is the type of token_info::_type.
// Only TT_UNKNOWN carries an explicit value (0); the remaining enumerators are
// numbered consecutively after it (TT_REGEX == 1 ... TT_DOT == 9).
// NOTE(review): the per-enumerator meanings below are read off the names only - the
// classification rules are not visible in this header; confirm in the implementation.
00047 enum token_type
00048 {
00049         TT_UNKNOWN = 0,      // 0: presumably "not classified"
00050         TT_REGEX,            // 1: presumably a match of a registered regular expression
00051         TT_ABBR,             // 2: presumably a match of a registered abbreviation
00052         TT_COMPOSE,          // 3: presumably a composite token (possibly built by the hyphen pass) - confirm
00053         TT_ALPHABETIC,       // 4: presumably a run of letters
00054         TT_DIGIT,            // 5: presumably a run of digits
00055         TT_WHITESPACE,       // 6: presumably whitespace
00056         TT_SYMBOL,           // 7: presumably a symbol character
00057         TT_PUNCTUATION,      // 8: presumably punctuation
00058         TT_DOT               // 9: presumably the '.' character
00059 };
00060 
// Element type of tokenizer_output_sequence_t (declared in this header as _list< token_info >).
// Plain value holder: the three fields are public and are initialised by the constructor.
00063 class token_info
00064 {
00065 public:
        // Copies the three arguments unchanged into _type, _len and _key.
00067         token_info(             token_type type,      // stored in _type
00068                                         size_t len,   // stored in _len
00069                                         size_t key    // stored in _key
00070                                         ) :
00071                 _type(type),
00072                 _len(len),
00073                 _key(key)
00074         {
00075         }
00076
00077         token_type      _type;   // category of the token (see token_type)
00078         size_t          _len;    // token length; NOTE(review): the unit (presumably characters of the input string) is not shown here - confirm
00079         size_t          _key;    // NOTE(review): presumably the key of the regex/abbreviation that produced the token - confirm
00080 };
00081 
// Key type of regex_map_t. Holds a min/max pair of size_t values; ordering looks at _max only.
// NOTE(review): min/max presumably mirror the `min`/`max` arguments of tokenizer::add_regex;
// what they measure is not visible in this header - confirm.
00084 class pcre_key
00085 {
00086 public:
        // Copies the two arguments unchanged into _min and _max.
00088         pcre_key(               size_t min,   // stored in _min
00089                                         size_t max    // stored in _max
00090                                         ) :
00091                 _min(min),
00092                 _max(max)
00093         {
00094         }
00095
00096         size_t                  _min;   // not used by operator<
00097         size_t                  _max;   // the only field compared by operator<
00098
        // Strict "less than" on _max alone. _min takes no part in the comparison, so two
        // keys with the same _max are equivalent for ordering even if their _min differ.
00100         inline
00101         bool
00102         operator<(const pcre_key& x) const
00103         {
00104                 return _max < x._max;
00105         }
00106 };
00107 
// Value type of regex_map_t: a PCRE pattern handle (pcre*), an accompanying pcre_extra
// handle, and a size_t key. All fields are public.
// The class declares no destructor, so it does not release either pointer itself.
// NOTE(review): the pointers are presumably freed by the tokenizer (clear_regex / destructor) - confirm.
00110 class pcre_entry
00111 {
00112 public:
        // Copies the three arguments unchanged into _this, _extra and _key.
00114         pcre_entry(             pcre* th,             // stored in _this
00115                                         pcre_extra* ex,   // stored in _extra
00116                                         size_t key        // stored in _key
00117                                         ) : _this(th), _extra(ex), _key(key) {}
00118
00119         pcre*                   _this;    // PCRE pattern handle
00120         pcre_extra*             _extra;   // extra PCRE data for _this; NOTE(review): may presumably be null - confirm
00121         size_t                  _key;     // NOTE(review): presumably the `key` passed to tokenizer::add_regex - confirm
00122 };
00123 
// List of token_info records; the type of the `out`/`tokens` parameters of tokenizer::tokenize
// and of the private do_regex / do_abbr / do_hyphen members.
00126 typedef _list< token_info >                                                                     tokenizer_output_sequence_t;
// Abbreviation table: size_t key -> string_t, ordered by less< size_t >.
// NOTE(review): what the size_t key represents is not visible in this header - confirm.
00129 typedef map< size_t, string_t, less< size_t >, true >           abbreviation_map_t;
// Regex table: pcre_key -> pcre_entry, ordered by less< pcre_key >.
// NOTE(review): less< pcre_key > presumably uses pcre_key::operator< (i.e. _max only), and the
// trailing `true` template argument presumably permits duplicate keys - confirm in base/map.h.
00132 typedef map< pcre_key, pcre_entry, less< pcre_key >, true >     regex_map_t;
00133 
// Tokenizer: holds a regex table (_regex_map), an abbreviation table (_abbr_map) and an
// error string (_error), and declares tokenize() to fill a tokenizer_output_sequence_t.
// Only declarations are visible in this header, so behavioural notes below are hedged.
// NOTE(review): no copy constructor/assignment is declared although pcre_entry holds raw
// pointers - if the tokenizer frees them, copying a tokenizer would be unsafe; confirm.
00136 class tokenizer
00137 {
00138 public:
        // Default constructor.
00140         tokenizer();
        // Destructor. NOTE(review): presumably releases the tables (see clear()) - confirm.
00142         ~tokenizer();
00143
        // Adds one regular expression.
        // NOTE(review): the bool result presumably reports success, with the reason for a
        // failure available from get_last_error() - confirm.
00146         bool
00147         add_regex(              const char* regex,    // pattern text
00148                                         size_t key,           // NOTE(review): presumably ends up in pcre_entry::_key - confirm
00149                                         size_t min,           // NOTE(review): presumably ends up in pcre_key::_min - confirm
00150                                         size_t max            // NOTE(review): presumably ends up in pcre_key::_max - confirm
00151                                         );
00152
        // Loads regular expressions from a file.
        // NOTE(review): the file format and the meaning of the bool result are not visible here - confirm.
00156         bool
00157         load_regex(             const char* file_name   // path of the file to read
00158                                         );
00159
        // NOTE(review): presumably empties _regex_map - confirm.
00161         void
00162         clear_regex();
00163
        // Adds one abbreviation.
        // NOTE(review): presumably stored in _abbr_map; bool presumably reports success - confirm.
00165         bool
00166         add_abbreviation(const char* abbr   // abbreviation text
00167                                         );
00168
        // Loads abbreviations from a file.
        // NOTE(review): the file format and the meaning of the bool result are not visible here - confirm.
00172         bool
00173         load_abbr(              const char* file_name   // path of the file to read
00174                                         );
00175
        // NOTE(review): presumably empties _abbr_map - confirm.
00177         void
00178         clear_abbr();
00179
        // Tokenizes `str` into `out`. Declared const, and no data member is declared mutable.
        // NOTE(review): the meaning of the bool result is not visible here - confirm.
00181         bool
00182         tokenize(               const char* str,                      // input text
00183                                         tokenizer_output_sequence_t& out,     // receives the token_info records
00184                                         byte_allocator& all,                  // NOTE(review): presumably the allocator backing `out` - confirm
00185                                         size_t flags = T_ALL                  // combination of T_REGEX / T_ABBR / T_HYPHEN; defaults to all of them
00186                                         ) const;
00187
        // Returns a reference to the internal _error string (no copy is made), so the
        // reference stays valid only while this tokenizer object exists.
00189         const string_t&
00190         get_last_error() const
00191         {
00192                 return _error;
00193         }
00194 private:
        // NOTE(review): presumably resets both tables - confirm.
00196         void
00197         clear();
        // Regex pass over `phrase`, working on `tokens`.
        // NOTE(review): presumably the pass selected by T_REGEX - confirm.
00199         void
00200         do_regex(               const char* phrase,                   // text being tokenized
00201                                         tokenizer_output_sequence_t& tokens   // token sequence, modifiable by the pass
00202                                         ) const;
        // Abbreviation pass over `phrase`, working on `tokens`.
        // NOTE(review): presumably the pass selected by T_ABBR - confirm.
00204         void
00205         do_abbr(                const char* phrase,                   // text being tokenized
00206                                         tokenizer_output_sequence_t& tokens   // token sequence, modifiable by the pass
00207                                         ) const;
        // Hyphen pass over `phrase`, working on `tokens`.
        // NOTE(review): presumably the pass selected by T_HYPHEN - confirm.
00209         void
00210         do_hyphen(              const char* phrase,                   // text being tokenized
00211                                         tokenizer_output_sequence_t& tokens   // token sequence, modifiable by the pass
00212                                         ) const;
00213
        // Matching helper; writes a key through `key` and returns a size_t.
        // NOTE(review): the meaning of the return value, and whether it consults
        // _regex_map, are not visible in this header - confirm.
00215         size_t
00216         match(                  const char* x,    // text to match against
00217                                         size_t len,       // NOTE(review): presumably the length of `x` to consider - confirm
00218                                         size_t tokens,    // NOTE(review): presumably a token count - confirm
00219                                         size_t& key       // output: key of the match
00220                                         ) const;
00221 private:
00222         string_t                                        _error;       // returned by get_last_error()
00223         abbreviation_map_t                      _abbr_map;    // abbreviation table
00224         regex_map_t                                     _regex_map;   // regex table, keyed by pcre_key
00225
00226 };
00227 
00228 #pragma pack()
00229 END_TERIMBER_NAMESPACE
00230 
00231 #endif
tokenizer.h