00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "xml/storexml.hpp"
00029 #include "xml/declxml.hpp"
00030 #include "xml/defxml.hpp"
00031 #include "xml/miscxml.hpp"
00032 #include "xml/mngxml.hpp"
00033
00034 #include "base/memory.hpp"
00035 #include "base/list.hpp"
00036 #include "base/string.hpp"
00037 #include "base/template.hpp"
00038 #include "base/vector.hpp"
00039 #include "base/common.hpp"
00040
00041 BEGIN_TERIMBER_NAMESPACE
00042 #pragma pack(4)
00043
// Capacity, in bytes, passed to scanQuotedValue() when reading the quoted
// version, encoding and standalone values of an XML/text declaration.
// NOTE(review): those values are written into _convert_buffer, which is
// allocated with _xml_size bytes - confirm that os_def_size is at least 4096.
00044 const size_t xml_decl_max_len = 4096;
00046 byte_source::byte_source(mem_pool_t& small_pool, mem_pool_t& big_pool, size_t xml_size, const char* url, bool subset) :
00047 _small_pool(small_pool),
00048 _big_pool(big_pool),
00049 _xml_size(__max(xml_size, os_def_size)),
00050 _url(url),
00051 _subset(subset),
00052 _buffer_pos(_xml_size),
00053 _symbol(0),
00054 _line_counter(0),
00055 _char_counter(0),
00056 _pos_counter(0),
00057 _encodingSchema(AUTO),
00058 _version(1),
00059 _standalone(os_minus_one),
00060 _end(false)
00061 {
00062 _convert_allocator = _small_pool.loan_object();
00063 _list_allocator = _small_pool.loan_object();
00064
00065 if (_xml_size <= os_def_size)
00066 {
00067 _depot_allocator = _small_pool.loan_object();
00068 _store_allocator = _small_pool.loan_object();
00069 }
00070 else
00071 {
00072 _depot_allocator = _big_pool.loan_object(_xml_size);
00073 _store_allocator = _big_pool.loan_object(_xml_size);
00074 }
00075
00076 _buffer = (ub1_t*)_depot_allocator->allocate(_xml_size);
00077 _convert_buffer = (ub1_t*)_convert_allocator->allocate(_xml_size);
00078 }
00079
00080 byte_source::~byte_source()
00081 {
00082 _small_pool.return_object(_convert_allocator);
00083 _small_pool.return_object(_list_allocator);
00084 if (_xml_size <= os_def_size)
00085 {
00086 _small_pool.return_object(_depot_allocator);
00087 _small_pool.return_object(_store_allocator);
00088 }
00089 else
00090 {
00091 _big_pool.return_object(_depot_allocator);
00092 _big_pool.return_object(_store_allocator);
00093 }
00094 }
00095
00096 void
00097 byte_source::reset_buffer()
00098 {
00099 _active_store.clear();
00100 _used_store.clear();
00101 _store_allocator->reset();
00102 _list_allocator->reset();
00103
00104 _symbol = 0;
00105 _buffer_pos = _xml_size;
00106 _line_counter = 0;
00107 _pos_counter = 0;
00108 _char_counter = 0;
00109 _encodingSchema = AUTO;
00110 _version = 1;
00111 _standalone = os_minus_one;
00112 _end = false;
00113 }
00114
00115 void
00116 byte_source::throw_exception(const char* msg_text)
00117 {
00118 char x[64] = {0};
00119 str_template::strprint(x, 64, "Error on line: %d, position: %d", _line_counter + 1, _char_counter);
00120 string_t ex = x;
00121
00122 if (msg_text)
00123 {
00124 ex += ", error message: \"";
00125 ex.append(msg_text, __min(strlen(msg_text), (size_t)128));
00126 ex += "\"";
00127 }
00128
00129 exception::_throw(ex);
00130 }
00131
00132
00133 void
00134 byte_source::push(const ub1_t* x, size_t len)
00135 {
00136 size_t remain = len;
00137 while (remain)
00138 {
00139 size_t copy_len = __min(remain, _buffer_pos);
00140
00141 memcpy(_buffer + _buffer_pos - copy_len, x + remain - copy_len, copy_len);
00142
00143
00144 remain -= copy_len;
00145 _buffer_pos -= copy_len;
00146
00147 if (!_buffer_pos && remain)
00148 {
00149
00150 ub1_t* vec = 0;
00151
00152 if (!_used_store.empty())
00153 {
00154 vec = _used_store.front();
00155 _used_store.pop_front();
00156 }
00157 else
00158 vec = (ub1_t*)_store_allocator->allocate(_xml_size);
00159
00160 if (!vec)
00161 throw_exception("Not enough memory");
00162
00163
00164 _active_store.push_front(*_list_allocator, vec);
00165
00166 memcpy(vec, _buffer, _xml_size);
00167 _buffer_pos = _xml_size;
00168 }
00169 }
00170
00171 _pos_counter -= len;
00172 _symbol = _buffer_pos == _xml_size ? 0x00 : _buffer[_buffer_pos];
00173 }
00174
00175 size_t
00176 byte_source::pull(ub1_t* x, size_t len)
00177 {
00178 size_t requested = len;
00179 len = 0;
00180 size_t copy_len;
00181
00182 while (!_end && len < requested)
00183 {
00184 if (_buffer_pos == _xml_size)
00185 go_shopping();
00186 else
00187 {
00188 copy_len = __min(requested - len, _xml_size - _buffer_pos);
00189 memcpy(x + len, _buffer + _buffer_pos, copy_len);
00190 _buffer_pos += copy_len;
00191 len += copy_len;
00192 }
00193 }
00194
00195 _pos_counter += len;
00196 _char_counter += len;
00197
00198 return len;
00199 }
00200
00201 ub1_t
00202 byte_source::go_shopping()
00203 {
00204 if (_end)
00205 return 0;
00206
00207 if (!_active_store.empty())
00208 {
00209 memcpy(_buffer, _active_store.front(), _xml_size);
00210
00211 _used_store.push_front(*_list_allocator, _active_store.front());
00212 _active_store.pop_front();
00213
00214 _buffer_pos = 0;
00215 _symbol = _buffer[_buffer_pos];
00216 return _symbol;
00217 }
00218
00219
00220 _buffer_pos = 0;
00221
00222 switch (_encodingSchema)
00223 {
00224 case AUTO:
00225
00226
00227 _end = !taste_buffer();
00228 break;
00229 default:
00230
00231 _end = !auto_convert();
00232 break;
00233 }
00234
00235
00236 if (!_end)
00237 {
00238 if (_buffer_pos != _xml_size)
00239 memmove(_buffer + _xml_size - _buffer_pos, _buffer, _buffer_pos);
00240
00241 _buffer_pos = _xml_size - _buffer_pos;
00242 _symbol = _buffer[_buffer_pos];
00243 }
00244 else
00245 _symbol = 0;
00246
00247 return _symbol;
00248 }
00249
00250 bool
00251 byte_source::taste_buffer()
00252 {
00253 size_t len = 4;
00254 size_t more = 0;
00255 size_t processed = 0;
00256 size_t converted = 0;
00257
00258
00259
00260 if (!data_request(_buffer, len) || len != 4)
00261 {
00262 _encodingSchema = UTF_8;
00263 _buffer_pos = len;
00264 return _buffer_pos != 0;
00265 }
00266 else if (!memcmp(_buffer, UCS4BBOM, 4))
00267 {
00268 _encodingSchema = UCS_4B;
00269 if (!convert_chars(20) || _buffer_pos != 5 || memcmp(_buffer, UTF8Pre, 5))
00270 return true;
00271 else
00272 return parseXMLDeclInfo();
00273 }
00274 else if (!memcmp(_buffer, UCS4LBOM, 4))
00275 {
00276 _encodingSchema = UCS_4L;
00277 if (!convert_chars(20) || _buffer_pos != 5 || memcmp(_buffer, UTF8Pre, 5))
00278 return true;
00279 else
00280 return parseXMLDeclInfo();
00281 }
00282 else if (!memcmp(_buffer, UTF8BOM, 3))
00283 {
00284 _encodingSchema = UTF_8;
00285
00286
00287
00288 _buffer_pos = 1;
00289 _buffer[0] = _buffer[3];
00290 if (!convert_chars(4) || _buffer_pos != 5 || memcmp(_buffer, UTF8Pre, 5))
00291 return true;
00292 else
00293 return parseXMLDeclInfo();
00294 }
00295 else if (!memcmp(_buffer, UTF16BBOM, 2))
00296 {
00297 _encodingSchema = UTF_16B;
00298
00299
00300
00301
00302
00303 _convert_buffer[0] = _buffer[2];
00304 _convert_buffer[1] = _buffer[3];
00305 len = 8;
00306 if (!data_request(_convert_buffer + 2, len) || len != 8)
00307 return true;
00308
00309 _buffer_pos = 0;
00310 len = 10;
00311
00312 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer, len, _buffer + _buffer_pos, converted, processed, more))
00313 {
00314 _symbol = 0;
00315 throw_exception("Invalid char token while char conversion");
00316 }
00317
00318 if (!more)
00319 _buffer_pos += converted;
00320 else
00321 {
00322 size_t more_ = more;
00323 size_t processed_;
00324 size_t converted_;
00325
00326 if (!data_request(_convert_buffer + len, more_) || more != more_)
00327 return false;
00328
00329 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer + processed, len - processed + more, _buffer + _buffer_pos + converted, converted_, processed_, more_))
00330 {
00331 _symbol = 0;
00332 throw_exception("Invalid char token while char conversion");
00333 }
00334
00335 _buffer_pos += converted + converted_;
00336 }
00337
00338
00339 if (_buffer_pos != 5 || memcmp(_buffer, UTF8Pre, 5))
00340 return true;
00341 else
00342 return parseXMLDeclInfo();
00343 }
00344 else if (!memcmp(_buffer, UTF16LBOM, 2))
00345 {
00346 _encodingSchema = UTF_16L;
00347
00348
00349
00350
00351 _convert_buffer[0] = _buffer[2];
00352 _convert_buffer[1] = _buffer[3];
00353 len = 8;
00354 if (!data_request(_convert_buffer + 2, len) || len != 8)
00355 return true;
00356
00357 _buffer_pos = 0;
00358 len = 10;
00359
00360 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer, len, _buffer + _buffer_pos, converted, processed, more))
00361 {
00362 _symbol = 0;
00363 throw_exception("Invalid char token while char conversion");
00364 }
00365
00366 if (!more)
00367 _buffer_pos += converted;
00368 else
00369 {
00370 size_t more_ = more;
00371 size_t processed_;
00372 size_t converted_;
00373
00374 if (!data_request(_convert_buffer + len, more_) || more != more_)
00375 return false;
00376
00377 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer + processed, len - processed + more, _buffer + _buffer_pos + converted, converted_, processed_, more_))
00378 {
00379 _symbol = 0;
00380 throw_exception("Invalid char token while char conversion");
00381 }
00382
00383 _buffer_pos += converted + converted_;
00384 }
00385
00386
00387 if (_buffer_pos != 5 || memcmp(_buffer, UTF8Pre, 5))
00388 return true;
00389 else
00390 return parseXMLDeclInfo();
00391 }
00392 else
00393 {
00394
00395 len = 1;
00396 if (!data_request(_buffer + 4, len) || len != 1)
00397 {
00398 _encodingSchema = UTF_8;
00399 _buffer_pos = 4 + len;
00400 return true;
00401 }
00402
00403 if (!memcmp(_buffer, UTF8Pre, 5))
00404 {
00405 _encodingSchema = UTF_8;
00406 return parseXMLDeclInfo();
00407 }
00408
00409
00410
00411
00412
00413 else
00414 {
00415 len = 5;
00416 if (!data_request(_buffer + 5, len) || len != 5)
00417 {
00418 _encodingSchema = UTF_8;
00419 _buffer_pos = 5 + len;
00420
00421 if (!utf8_to_utf8(_buffer, _buffer_pos, processed, more))
00422 {
00423 _symbol = 0;
00424 throw_exception("Invalid char token while char conversion");
00425 }
00426
00427 return true;
00428 }
00429 else if (!memcmp(_buffer, UTF16BPre, 10))
00430 {
00431 _encodingSchema = UTF_16B;
00432 return parseXMLDeclInfo();
00433 }
00434 else if (_buffer_pos >= 10 && !memcmp(_buffer, UTF16LPre, 10))
00435 {
00436 _encodingSchema = UTF_16L;
00437 return parseXMLDeclInfo();
00438 }
00439 else
00440 {
00441 len = 10;
00442 if (!data_request(_buffer + 10, len) || len != 10)
00443 {
00444 _encodingSchema = UTF_8;
00445 _buffer_pos = 10 + len;
00446 return true;
00447 }
00448 else if (!memcmp(_buffer, UCS4BPre, 20))
00449 {
00450 _encodingSchema = UCS_4B;
00451 return parseXMLDeclInfo();
00452 }
00453 else if (!memcmp(_buffer, UCS4LPre, 20))
00454 {
00455 _encodingSchema= UCS_4L;
00456 return parseXMLDeclInfo();
00457 }
00458 }
00459 }
00460
00461 _buffer_pos = 20;
00462 _encodingSchema = UTF_8;
00463 return true;
00464 }
00465 }
00466
00467 bool
00468 byte_source::parseXMLDeclInfo()
00469 {
00470 _buffer_pos = 0;
00471
00472 if (!convert_chars(1))
00473 return false;
00474
00475
00476 size_t prev_pos = _buffer_pos - 1;
00477
00478 while (_buffer_pos < _xml_size - 6 && convert_chars(1))
00479 {
00480 if (_buffer_pos != prev_pos + 2)
00481 throw_exception("Invalid XML declaration syntax");
00482
00483 if ((char)_buffer[prev_pos] == ch_question
00484 && (char)_buffer[_buffer_pos - 1] == ch_close_angle)
00485 break;
00486 else
00487 ++prev_pos;
00488 }
00489
00490 if (_buffer_pos == _xml_size)
00491 throw_exception("Invalid XML declaration syntax");
00492
00493
00494
00495
00496 stream_input_memory stream(_buffer, _buffer_pos, _small_pool, _big_pool, 0, _subset);
00497 stream.set_encoding(UTF_8);
00498 encodingSchema detectedSchema = _subset ? stream.parseTextDecl() : stream.parseXMLDecl();
00499
00500
00501 if (detectedSchema != AUTO)
00502 {
00503 if (detectedSchema == UTF_16)
00504 {
00505 if (_encodingSchema != UTF_16B && _encodingSchema != UTF_16L)
00506 throw_exception("Unknown encoding schema");
00507 }
00508 else if (detectedSchema == UCS_4)
00509 {
00510 if (_encodingSchema != UCS_4B && _encodingSchema != UCS_4L && _encodingSchema != UCS_4BS && _encodingSchema != UCS_4LS)
00511 throw_exception("Unknown encoding schema");
00512 }
00513 else
00514 _encodingSchema = stream.get_encoding();
00515 }
00516
00517 _standalone = stream.get_standalone();
00518 _version = stream.get_version();
00519
00520 _buffer_pos = 0;
00521 return auto_convert();
00522 }
00523
00524 bool
00525 byte_source::auto_convert()
00526 {
00527 convert_chars(_xml_size - 6);
00528 return _buffer_pos != 0;
00529 }
00530
00531 bool
00532 byte_source::convert_chars(size_t count)
00533 {
00534 ub4_t in = 0, surrogate = 0;
00535 size_t len = 0;
00536 size_t more = 0;
00537 size_t processed = 0;
00538 size_t converted = 0;
00539 size_t available = 0;
00540 size_t actual = 0;
00541
00542 size_t counter = count;
00543
00544 if (!counter)
00545 return false;
00546
00547 while (counter > 0
00548 && (available = _xml_size - _buffer_pos - 6) > 0
00549 )
00550 {
00551 switch (_encodingSchema)
00552 {
00553
00554
00555
00556 case UTF_ISO88591:
00557 actual = available / 2;
00558 if (actual == 0)
00559 {
00560 counter = 0;
00561 break;
00562 }
00563
00564 if (counter == 1)
00565 len = 1;
00566 else if (counter == 2)
00567 len = 2;
00568 else
00569 len = counter / 2;
00570
00571
00572 if (len > actual)
00573 len = actual;
00574
00575
00576 if (!data_request(_convert_buffer, len))
00577 return false;
00578
00579 isoN_to_utf8(_encodingSchema, _convert_buffer, len, _buffer + _buffer_pos, converted, processed);
00580 _buffer_pos += converted;
00581 counter -= processed;
00582 break;
00583 case WINDOWS_1251:
00584
00585 actual = available / 3;
00586
00587 if (actual == 0)
00588 {
00589 counter = 0;
00590 break;
00591 }
00592
00593 if (counter == 1)
00594 len = 1;
00595 else if (counter == 2)
00596 len = 2;
00597 else
00598 len = counter / 3;
00599
00600
00601 if (len > actual)
00602 len = actual;
00603
00604 if (!data_request(_convert_buffer, len))
00605 return false;
00606
00607 windowsN_to_utf8(_encodingSchema, _convert_buffer, len, _buffer + _buffer_pos, converted, processed);
00608 _buffer_pos += converted;
00609 counter -= processed;
00610 break;
00611 case UTF_8:
00612 case US_ASCII:
00613 len = counter;
00614 if (!data_request(_buffer + _buffer_pos, len))
00615 return false;
00616
00617 if (!utf8_to_utf8(_buffer + _buffer_pos, len, processed, more))
00618 {
00619 _symbol = 0;
00620 string_t err = "Invalid utf8 char token: ";
00621 err.append((const char*)_buffer + _buffer_pos, __min(processed, (size_t)32));
00622 throw_exception(err);
00623 }
00624
00625 if (!more)
00626 {
00627 _buffer_pos += len;
00628 counter -= processed;
00629 }
00630 else
00631 {
00632 size_t more_ = more;
00633 size_t processed_;
00634
00635 if (!data_request(_buffer + _buffer_pos + len, more_))
00636 return false;
00637
00638 if (!utf8_to_utf8(_buffer + _buffer_pos + processed, len - processed + more, processed_, more_))
00639 {
00640 _symbol = 0;
00641 string_t err = "Invalid utf8 char token: ";
00642 err.append((const char*)_buffer + _buffer_pos + processed, __min(processed_, (size_t)32));
00643 throw_exception(err);
00644 }
00645
00646 _buffer_pos += len + more;
00647 counter -= __min(counter, len + more);
00648 }
00649 break;
00650 case UTF_16L:
00651 case UTF_16B:
00652
00653 actual = available / 4;
00654
00655 if (actual == 0)
00656 {
00657 counter = 0;
00658 break;
00659 }
00660
00661
00662 if (counter == 1)
00663 len = 2;
00664 else
00665 {
00666 len = counter * 2 / 4;
00667 len += len % 2 ? 1 : 0;
00668 }
00669
00670
00671 if (len > actual)
00672 len = actual;
00673
00674 if (!data_request(_convert_buffer, len))
00675 return false;
00676
00677 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer, len, _buffer + _buffer_pos, converted, processed, more))
00678 {
00679 _symbol = 0;
00680 string_t err = "Invalid utf16 char token: ";
00681 err.append((const char*)_convert_buffer, __min(processed, (size_t)32));
00682 throw_exception(err);
00683 }
00684
00685 if (!more)
00686 {
00687 _buffer_pos += converted;
00688 counter -= processed;
00689 }
00690 else
00691 {
00692 size_t more_ = more;
00693 size_t processed_;
00694 size_t converted_;
00695
00696 if (!data_request(_convert_buffer + len, more_))
00697 return false;
00698
00699 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer + processed, len - processed + more, _buffer + _buffer_pos + converted, converted_, processed_, more_))
00700 {
00701 _symbol = 0;
00702 string_t err = "Invalid utf16 char token: ";
00703 err.append((const char*)_convert_buffer + processed, __min(processed_, (size_t)32));
00704 throw_exception(err);
00705 }
00706
00707 _buffer_pos += converted + converted_;
00708 counter -= __min(counter, processed + processed_);
00709 }
00710 break;
00711 case UCS_4L:
00712 case UCS_4LS:
00713 case UCS_4B:
00714 case UCS_4BS:
00715
00716 actual = available / 6;
00717
00718 if (actual == 0)
00719 {
00720 counter = 0;
00721 break;
00722 }
00723
00724
00725 if (counter == 1)
00726 len = 4;
00727 else
00728 {
00729 len = counter * 4 / 6;
00730 len += len % 4 ? len % 4 : 0;
00731 }
00732
00733
00734 if (len > actual)
00735 len = actual;
00736
00737 if (!data_request(_convert_buffer, len))
00738 return false;
00739
00740 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer, len, _buffer + _buffer_pos, converted, processed, more))
00741 {
00742 _symbol = 0;
00743 string_t err = "Invalid utf32 char token: ";
00744 err.append((const char*)_convert_buffer, __min(processed, (size_t)32));
00745 throw_exception(err);
00746 }
00747
00748 if (!more)
00749 {
00750 _buffer_pos += converted;
00751 counter -= processed;
00752 }
00753 else
00754 {
00755 size_t more_ = more;
00756 size_t processed_;
00757 size_t converted_;
00758
00759 if (!data_request(_convert_buffer + len, more_))
00760 return false;
00761
00762 if (!fixedN_to_utf8(_encodingSchema, _convert_buffer + processed, len - processed + more, _buffer + _buffer_pos + converted, converted_, processed_, more_))
00763 {
00764 _symbol = 0;
00765 string_t err = "Invalid utf32 char token: ";
00766 err.append((const char*)_convert_buffer + processed, __min(processed_, (size_t)32));
00767 throw_exception(err);
00768 }
00769
00770 _buffer_pos += converted + converted_;
00771 counter -= __min(counter, processed + processed_);
00772 }
00773 break;
00774 default:
00775 throw_exception("Unsupported encoding schema");
00776 }
00777 }
00778 return true;
00779 }
00780
00781 void
00782 byte_source::parseVersion()
00783 {
00784 skip_string(str_version, "Invalid version syntax");
00785
00786 skip_sign(ch_equal, true, true, "Equal sign is expected");
00787
00788 scanQuotedValue(_convert_buffer, xml_decl_max_len, is_versionnum_char, "Invalid version value");
00789
00790
00791
00792
00793 if (str_template::strnocasecmp((const char*)_convert_buffer, "1.0", os_minus_one))
00794 throw_exception("Invalid xml version, must be 1.0");
00795
00796 _version = 1;
00797 }
00798
00799 encodingSchema
00800 byte_source::parseEncoding()
00801 {
00802 skip_string(str_encoding, "Invalid encoding syntax");
00803
00804 skip_sign(ch_equal, true, true, "Equal sign is expected");
00805
00806 scanQuotedValue(_convert_buffer, xml_decl_max_len, is_encname_char, "Invalid char in encoding value");
00807
00808
00809
00810 if (!is_letter(_convert_buffer[0]))
00811 throw_exception("Invalid first char in encoding value");
00812
00813
00814
00815
00816 return checkEncodingSchema((const char*)_convert_buffer);
00817 }
00818
00819 size_t
00820 byte_source::parseStandalone()
00821 {
00822 skip_string(str_standalone, "Invalid standalone syntax");
00823
00824 skip_sign(ch_equal, true, true, "Equal sign is expected");
00825
00826 scanQuotedValue(_convert_buffer, xml_decl_max_len, 0, 0);
00827
00828 if (!strcmp((const char*)_convert_buffer, str_yes))
00829 return 1;
00830 else if (!strcmp((const char*)_convert_buffer, str_no))
00831 return 0;
00832 else
00833 {
00834 throw_exception("Invalid standalone value");
00835 return os_minus_one;
00836 }
00837 }
00838
00839
00840 encodingSchema
00841 byte_source::parseXMLDecl()
00842 {
00843
00844 _version = 1;
00845
00846 encodingSchema schema = AUTO;
00847
00848
00849 skip_white_space(true, "Must be a separator after xml declaration open tag");
00850
00851
00852
00853 if (pick() != ch_v)
00854 throw_exception("VersionInfo is required part of xml declaration");
00855
00856 parseVersion();
00857
00858
00859 skip_white_space(ch_question != pick(), "Must be a separator in xml declaration");
00860
00861
00862
00863 if (pick() == ch_e)
00864 {
00865 schema = parseEncoding();
00866
00867 skip_white_space(ch_question != pick(), "Must be a separator in xml declaration");
00868 }
00869
00870 if (pick() == ch_s)
00871 {
00872 _standalone = parseStandalone();
00873 skip_white_space();
00874 }
00875
00876
00877
00878 if (pick() != ch_question || pop() != ch_close_angle)
00879 throw_exception("Invalid close tag in Text declaration");
00880
00881 pop();
00882
00883 if (schema != AUTO)
00884 _encodingSchema = schema;
00885
00886 return schema;
00887 }
00888
00889
00890 encodingSchema
00891 byte_source::parseTextDecl()
00892 {
00893
00894 _version = 1;
00895
00896
00897 skip_white_space(true, "Must be a separator after xml declaration open tag");
00898
00899
00900 if (pick() == ch_v)
00901 {
00902 parseVersion();
00903
00904 skip_white_space(ch_question != pick(), "Must be a separator in xml declaration");
00905 }
00906
00907
00908
00909 if (pick() != ch_e)
00910 throw_exception("Expected encoding attribute in xml declaration");
00911
00912 encodingSchema schema = parseEncoding();
00913 skip_white_space();
00914
00915
00916 if (pick() != ch_question || pop() != ch_close_angle)
00917 throw_exception("Invalid close tag in xml declaration");
00918
00919 pop();
00920 _encodingSchema = schema;
00921 return schema;
00922 }
00923
00924
00925 void
00926 byte_source::skip_string(const char* x, const char* message)
00927 {
00928 assert(x);
00929 for (ub1_t symbol = pick(); symbol && *x && symbol == *x; symbol = pop(), ++x);
00930
00931 if (*x)
00932 throw_exception(message);
00933 }
00934
00935 void
00936 byte_source::skip_sign(ub1_t symbol, bool skip_before, bool skip_after, const char* message)
00937 {
00938 if (skip_before)
00939 skip_white_space();
00940
00941 if (symbol != pip())
00942 throw_exception(message);
00943
00944 if (skip_after)
00945 skip_white_space();
00946 }
00947
00948 encodingSchema
00949 byte_source::checkEncodingSchema(const char* schema)
00950 {
00951 if (!str_template::strnocasecmp(schema, "UTF-8", os_minus_one) || !str_template::strnocasecmp(schema, "UTF8", os_minus_one))
00952 return UTF_8;
00953 else if (!str_template::strnocasecmp(schema, "US-ASCII", os_minus_one) || !str_template::strnocasecmp(schema, "USASCII", os_minus_one) || !str_template::strnocasecmp(schema, "ASCII", os_minus_one) || !str_template::strnocasecmp(schema, "US_ASCII", os_minus_one))
00954 return US_ASCII;
00955 else if (!str_template::strnocasecmp(schema, "UTF-16", os_minus_one))
00956 return UTF_16;
00957 else if (!str_template::strnocasecmp(schema, "UTF-16 (LE)", os_minus_one) || !str_template::strnocasecmp(schema, "UTF-16LE", os_minus_one))
00958 return UTF_16L;
00959 else if (!str_template::strnocasecmp(schema, "UTF-16 (BE)", os_minus_one) || !str_template::strnocasecmp(schema, "UTF-16BE", os_minus_one))
00960 return UTF_16B;
00961 else if (!str_template::strnocasecmp(schema, "USC-4 (LE)", os_minus_one) || !str_template::strnocasecmp(schema, "USC-4LE", os_minus_one))
00962 return UCS_4L;
00963 else if (!str_template::strnocasecmp(schema, "USC-4 (BE)", os_minus_one) || !str_template::strnocasecmp(schema, "USC-4BE", os_minus_one))
00964 return UCS_4B;
00965 else if (!str_template::strnocasecmp(schema, "USC-4", os_minus_one))
00966 return UCS_4;
00967 else if (!str_template::strnocasecmp(schema, "ISO-8859-1", os_minus_one) || !str_template::strnocasecmp(schema, "ISO88591", os_minus_one) || !str_template::strnocasecmp(schema, "ISO_8859-1", os_minus_one) || !str_template::strnocasecmp(schema, "ISO_8859_1", os_minus_one))
00968 return UTF_ISO88591;
00969 else if (!str_template::strnocasecmp(schema, "WINDOWS-1251", os_minus_one))
00970 return WINDOWS_1251;
00971 else
00972 return AUTO;
00973
00974
00975 return AUTO;
00976 }
00977
00978 ub1_t
00979 byte_source::skip_quote(ub1_t symbol)
00980 {
00981
00982 if (!symbol)
00983 {
00984
00985 if ((symbol = pick()) != ch_double_quote && symbol != ch_single_quote)
00986 throw_exception("Invalid open quote");
00987 }
00988 else if (symbol != pick())
00989 throw_exception("Invalid close quote");
00990
00991 pop();
00992 return symbol;
00993 }
00994
00995 void
00996 byte_source::scanQuotedValue(ub1_t* value, size_t len, bool (*fn)(ub1_t), const char* message)
00997 {
00998 ub1_t quote_symbol = skip_quote();
00999 size_t count = 0;
01000 ub1_t symbol = pick();
01001
01002 while (count < len - 1 && symbol && symbol != quote_symbol)
01003 {
01004 if (fn && !fn(symbol))
01005 throw_exception(message);
01006 value[count++] = symbol;
01007 symbol = pop();
01008 }
01009
01010 value[count] = 0;
01011
01012 skip_quote(quote_symbol);
01013 }
01014
01015
01018 byte_consumer::byte_consumer(mem_pool_t& small_pool, mem_pool_t& big_pool, size_t xml_size) :
01019 _small_pool(small_pool),
01020 _big_pool(big_pool),
01021 _xml_size(__max(xml_size, os_def_size)),
01022 _buffer_pos(0)
01023 {
01024
01025 if (_xml_size <= os_def_size)
01026 _depot_allocator = _small_pool.loan_object();
01027 else
01028 _depot_allocator = _big_pool.loan_object(_xml_size);
01029
01030 _buffer = (ub1_t*)_depot_allocator->allocate(_xml_size);
01031 }
01032
01033 byte_consumer::~byte_consumer()
01034 {
01035 if (_xml_size <= os_def_size)
01036 _small_pool.return_object(_depot_allocator);
01037 else
01038 _big_pool.return_object(_depot_allocator);
01039 }
01040
01041
01042 void
01043 byte_consumer::push(const ub1_t* x, size_t len)
01044 {
01045 size_t remain = len;
01046 while (remain)
01047 {
01048 size_t copy_len = __min(remain, _xml_size - _buffer_pos);
01049
01050 memcpy(_buffer + _buffer_pos, x + len - remain, copy_len);
01051
01052
01053 remain -= copy_len;
01054 _buffer_pos += copy_len;
01055
01056 if (_buffer_pos == _xml_size)
01057 {
01058 if (!data_persist(_buffer, _buffer_pos))
01059 exception::_throw("Can't persist data");
01060 _buffer_pos = 0;
01061 }
01062 }
01063 }
01064
01065 void
01066 byte_consumer::flush()
01067 {
01068 if (_buffer_pos)
01069 {
01070 if (!data_persist(_buffer, _buffer_pos))
01071 exception::_throw("Can't persist data");
01072
01073 _buffer_pos = 0;
01074 }
01075 }
01076
01077 #pragma pack()
01078 END_TERIMBER_NAMESPACE