defxml.h File Reference

#include "xml/declxml.h"
#include "base/common.h"

Go to the source code of this file.

Functions

xml_forceinline bool usascii_to_utf8 (ub4_t in, ub1_t *out, size_t &count)

converts ascii char to utf-8 char sequence

xml_forceinline bool fixedN_to_utf8 (encodingSchema schema, const ub1_t *in, size_t count, ub1_t *out, size_t &converted, size_t &processed, size_t &more)

converts a fixed-width char buffer into a utf-8 buffer

xml_forceinline bool utf8_to_utf8 (const ub1_t *in, size_t count, size_t &processed, size_t &more)

checks if the input buffer is a valid utf-8 string

xml_forceinline void windowsN_to_utf8 (encodingSchema schema, const ub1_t *in, size_t count, ub1_t *out, size_t &converted, size_t &processed)

converts the Windows encoding to utf-8

xml_forceinline void isoN_to_utf8 (encodingSchema schema, const ub1_t *in, size_t count, ub1_t *out, size_t &converted, size_t &processed)

converts the ISO encoding to utf-8

void tokenValues (const char *x, _list< const char * > &values, byte_allocator &allocator_)

tokenizes the string into a list of tokens

Variables

BEGIN_TERIMBER_NAMESPACE const char str_xml [] = { ch_x, ch_m, ch_l, ch_null }

const char str_w3c_xml [] = { ch_h, ch_t, ch_t, ch_p, ch_colon, ch_forward_slash, ch_forward_slash, ch_w, ch_w, ch_w, ch_period, ch_w, ch_3, ch_period, ch_o, ch_r, ch_g, ch_forward_slash, ch_X, ch_M, ch_L, ch_forward_slash, ch_1, ch_9, ch_9, ch_8, ch_forward_slash, ch_n, ch_a, ch_m, ch_e, ch_s, ch_p, ch_a, ch_c, ch_e, ch_null }

const char str_xmlns [] = { ch_x, ch_m, ch_l, ch_n, ch_s, ch_null }

const char str_version [] = { ch_v, ch_e, ch_r, ch_s, ch_i, ch_o, ch_n, ch_null }

const char str_encoding [] = { ch_e, ch_n, ch_c, ch_o, ch_d, ch_i, ch_n, ch_g, ch_null }

const char str_standalone [] = { ch_s, ch_t, ch_a, ch_n, ch_d, ch_a, ch_l, ch_o, ch_n, ch_e, ch_null }

const char str_SYSTEM [] = { ch_S, ch_Y, ch_S, ch_T, ch_E, ch_M, ch_null }

const char str_PUBLIC [] = { ch_P, ch_U, ch_B, ch_L, ch_I, ch_C, ch_null }

const char str_DOCTYPE [] = { ch_D, ch_O, ch_C, ch_T, ch_Y, ch_P, ch_E, ch_null }

const char str_ELEMENT [] = { ch_E, ch_L, ch_E, ch_M, ch_E, ch_N, ch_T, ch_null }

const char str_ENTITY [] = { ch_E, ch_N, ch_T, ch_I, ch_T, ch_Y, ch_null }

const char str_ENTITIES [] = { ch_E, ch_N, ch_T, ch_I, ch_T, ch_I, ch_E, ch_S, ch_null }

const char str_ENTIT [] = { ch_E, ch_N, ch_T, ch_I, ch_T, ch_null }

const char str_ATTRLIST [] = { ch_A, ch_T, ch_T, ch_L, ch_I, ch_S, ch_T, ch_null }

const char str_NOTATION [] = { ch_N, ch_O, ch_T, ch_A, ch_T, ch_I, ch_O, ch_N, ch_null }

const char str_EMPTY [] = { ch_E, ch_M, ch_P, ch_T, ch_Y, ch_null }

const char str_ANY [] = { ch_A, ch_N, ch_Y, ch_null }

const char str__PCDATA [] = { ch_pound, ch_P, ch_C, ch_D, ch_A, ch_T, ch_A, ch_null }

const char str_yes [] = { ch_y, ch_e, ch_s, ch_null }

const char str_no [] = { ch_n, ch_o, ch_null }

const char str_CDATA [] = { ch_C, ch_D, ch_A, ch_T, ch_A, ch_null }

const char str_CTYPE [] = { ch_C, ch_T, ch_Y, ch_P, ch_E, ch_null }

const char str_PCDATA [] = { ch_P, ch_C, ch_D, ch_A, ch_T, ch_A, ch_null }

const char str_IGNORE [] = { ch_I, ch_G, ch_N, ch_O, ch_R, ch_E, ch_null }

const char str_INCLUDE [] = { ch_I, ch_N, ch_C, ch_L, ch_U, ch_D, ch_E, ch_null }

const char str_ID [] = { ch_I, ch_D, ch_null }

const char str_IDREF [] = { ch_I, ch_D, ch_R, ch_E, ch_F, ch_null }

const char str_IDREFS [] = { ch_I, ch_D, ch_R, ch_E, ch_F, ch_S, ch_null }

const char str_REF [] = { ch_R, ch_E, ch_F, ch_null }

const char str_NMTOKEN [] = { ch_N, ch_M, ch_T, ch_O, ch_K, ch_E, ch_N, ch_null }

const char str_NMTOKENS [] = { ch_N, ch_M, ch_T, ch_O, ch_K, ch_E, ch_N, ch_S, ch_null }

const char str_REQUIRED [] = { ch_R, ch_E, ch_Q, ch_U, ch_I, ch_R, ch_E, ch_D, ch_null }

const char str_IMPLIED [] = { ch_I, ch_M, ch_P, ch_L, ch_I, ch_E, ch_D, ch_null }

const char str_FIXED [] = { ch_F, ch_I, ch_X, ch_E, ch_D, ch_null }

const char str_NDATA [] = { ch_N, ch_D, ch_A, ch_T, ch_A, ch_null }

const char str_xml_space [] = { ch_x, ch_m, ch_l, ch_colon, ch_s, ch_p, ch_a, ch_c, ch_e, ch_null }

const char str_default [] = { ch_d, ch_e, ch_f, ch_a, ch_u, ch_l, ch_t, ch_null }

const char str_preserve [] = { ch_p, ch_r, ch_e, ch_s, ch_e, ch_r, ch_v, ch_e, ch_null }

const char str_apos [] = { ch_a, ch_p, ch_o, ch_s, ch_null }

const char str_quote [] = { ch_q, ch_u, ch_o, ch_t, ch_null }

const char str_amp [] = { ch_a, ch_m, ch_p, ch_null }

const char str_lt [] = { ch_l, ch_t, ch_null }

const char str_gt [] = { ch_g, ch_t, ch_null }

const char str_ch_apos [] = { ch_single_quote, ch_null }

const char str_ch_quote [] = { ch_double_quote, ch_null }

const char str_ch_amp [] = { ch_ampersand, ch_null }

const char str_ch_lt [] = { ch_open_angle, ch_null }

const char str_ch_gt [] = { ch_close_angle, ch_null }

const char str_ch_colon [] = { ch_colon, ch_null }

const ub1_t UTF8Pre [] = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C }

const ub1_t EBCDICPre [] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93 }

const ub1_t UTF16BPre [] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C }

const ub1_t UTF16LPre [] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00 }

const ub1_t UCS4BPre [] = { 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0x6C }

const ub1_t UCS4LPre [] = { 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00 }

const ub1_t UTF8BOM [] = { 0xEF, 0xBB, 0xBF }

const ub1_t UTF16BBOM [] = { 0xFE, 0xFF }

const ub1_t UTF16LBOM [] = { 0xFF, 0xFE }

const ub1_t UCS4BBOM [] = { 0x00, 0x00, 0xFE, 0xFF }

const ub1_t UCS4LBOM [] = { 0xFF, 0xFE, 0x00, 0x00 }

const ub1_t s_leadingByte [6] = {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}

const ub2_t encoding_table_winodws_1251 [256]

windows 1251 encoding table

Function Documentation

xml_forceinline bool fixedN_to_utf8	(	encodingSchema	schema,
		const ub1_t *	in,
		size_t	count,
		ub1_t *	out,
		size_t &	converted,
		size_t &	processed,
		size_t &	more
	)

converts a fixed-width char buffer into a utf-8 buffer

Parameters:

schema	encoding schema
in	input buffer
count	input buffer length
out	output buffer
converted	input chars converted
processed	output chars processed
more	more input chars required to process output utf-8 char, crop input chars

Definition at line 98 of file defxml.hpp.

References ch_cr, ch_hor_tab, ch_lf, s_leadingByte, UCS_4B, UCS_4BS, UCS_4L, UCS_4LS, UTF_16B, and UTF_16L.

Referenced by byte_source::convert_chars(), and byte_source::taste_buffer().

xml_forceinline void isoN_to_utf8	(	encodingSchema	schema,
		const ub1_t *	in,
		size_t	count,
		ub1_t *	out,
		size_t &	converted,
		size_t &	processed
	)

converts the ISO encoding to utf-8

Parameters:

schema	encoding schema
in	input ISO encoding buffer
count	input buffer length
out	output buffer
converted	input chars converted
processed	output chars processed

Definition at line 375 of file defxml.hpp.

References s_leadingByte, and UTF_ISO88591.

Referenced by byte_source::convert_chars().

void tokenValues	(	const char *	x,
		_list< const char * > &	values,
		byte_allocator &	allocator_
	)

tokenizes the string into a list of tokens

Parameters:

x	input string
values	[out] list of tokens
allocator_	external allocator

Definition at line 68 of file defxml.cpp.

References copy_string(), is_white_space(), os_def_size, paged_buffer::persist(), _list< T, A >::push_back(), paged_buffer::reset(), paged_buffer::size(), and xml_exception_throw().

Referenced by xml_document::assign_attribute_value(), and dtd_processor::parseAttDef().

xml_forceinline bool usascii_to_utf8	(	ub4_t	in,
		ub1_t *	out,
		size_t &	count
	)

converts ascii char to utf-8 char sequence

Parameters:

in	input ascii char
out	output utf-8 buffer
count	count of processed utf-8 chars

Definition at line 38 of file defxml.hpp.

References ch_cr, ch_hor_tab, ch_lf, and s_leadingByte.

Referenced by byte_manager::parseCharRef().

xml_forceinline bool utf8_to_utf8	(	const ub1_t *	in,
		size_t	count,
		size_t &	processed,
		size_t &	more
	)

checks if the input buffer is a valid utf-8 string

Parameters:

in	input utf-8 buffer
count	input buffer length
processed	input chars processed
more	more input chars required to process output utf-8 char, crop input chars

Definition at line 242 of file defxml.hpp.

References ch_cr, ch_hor_tab, and ch_lf.

Referenced by byte_source::convert_chars(), and byte_source::taste_buffer().

xml_forceinline void windowsN_to_utf8	(	encodingSchema	schema,
		const ub1_t *	in,
		size_t	count,
		ub1_t *	out,
		size_t &	converted,
		size_t &	processed
	)

converts the Windows encoding to utf-8

Parameters:

schema	encoding schema
in	input Windows encoding buffer
count	input buffer length
out	output buffer
converted	input chars converted
processed	output chars processed