tokenizer.h File Reference
#include "allinc.h"
#include "base/list.h"
#include "base/map.h"
#include "base/string.h"
#include "pcre/pcre.h"
Go to the source code of this file.
|
Classes |
class | token_info |
| token information More...
|
class | pcre_key |
| pcre key More...
|
class | pcre_entry |
| pcre entry More...
|
class | tokenizer |
| class that tokenizes an input string into atomic tokens More...
|
Typedefs |
typedef _list< token_info > | tokenizer_output_sequence_t |
| list of detected tokens
|
typedef map< size_t, string_t,
less< size_t >, true > | abbreviation_map_t |
| abbreviation map
|
typedef map< pcre_key,
pcre_entry, less< pcre_key >
, true > | regex_map_t |
| pcre expression multimap
|
Enumerations |
enum | token_type {
TT_UNKNOWN = 0,
TT_REGEX,
TT_ABBR,
TT_COMPOSE,
TT_ALPHABETIC,
TT_DIGIT,
TT_WHITESPACE,
TT_SYMBOL,
TT_PUNCTUATION,
TT_DOT
} |
| atomic token type More...
|
Variables |
BEGIN_TERIMBER_NAMESPACE const
size_t | T_REGEX = 0x00000001 |
const size_t | T_ABBR = 0x00000002 |
const size_t | T_HYPHEN = 0x00000004 |
const size_t | T_ALL = T_REGEX | T_ABBR | T_HYPHEN |
Typedef Documentation
Enumeration Type Documentation
atomic token type
- Enumerator:
-
TT_UNKNOWN |
unknown type |
TT_REGEX |
regular expression, e.g. 2005-11-11 12:00:00.333 |
TT_ABBR |
abbreviation, e.g. Dr., Mr., Ms., Gen. |
TT_COMPOSE |
composite token, e.g. semi-final, Cup-2005, F-117 |
TT_ALPHABETIC |
alphabetic, e.g. Terimber |
TT_DIGIT |
digits, e.g. 1, 345 |
TT_WHITESPACE |
white space: blanks, tabs, new lines, caret, ... |
TT_SYMBOL |
symbol #$^... |
TT_PUNCTUATION |
punctuation ,:;!?- |
TT_DOT |
dot |
Definition at line 47 of file tokenizer.h.
Variable Documentation
const size_t T_ABBR = 0x00000002 |
BEGIN_TERIMBER_NAMESPACE const size_t T_REGEX = 0x00000001 |