00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "xml/dtdxml.h"
00029
00030 #include "base/memory.hpp"
00031 #include "base/list.hpp"
00032 #include "base/map.hpp"
00033 #include "base/stack.hpp"
00034 #include "base/string.hpp"
00035 #include "base/common.hpp"
00036 #include "base/template.hpp"
00037
00038 #include "xml/parsexml.hpp"
00039 #include "xml/declxml.hpp"
00040 #include "xml/defxml.hpp"
00041 #include "xml/miscxml.hpp"
00042 #include "xml/mngxml.hpp"
00043 #include "xml/sxml.hpp"
00044 #include "xml/sxs.hpp"
00045 #include "xml/storexml.hpp"
00046
00047 BEGIN_TERIMBER_NAMESPACE
00048 #pragma pack(4)
00049
00051 xml_processor::xml_processor(byte_source& stream,
00052 xml_document& doc,
00053 mem_pool_t& small_pool,
00054 mem_pool_t& big_pool,
00055 size_t xml_size,
00056 bool validate) :
00057 byte_manager(stream, doc, small_pool, big_pool, xml_size),
00058 _doc(doc),
00059 _validate(validate)
00060 {
00061 _white_space_allocator = _small_pool.loan_object();
00062 }
00063
00064 xml_processor::~xml_processor()
00065 {
00066 _small_pool.return_object(_white_space_allocator);
00067 }
00068
00069 const char*
00070 xml_processor::get_error() const
00071 {
00072 return _error;
00073 }
00074
00075
00076 bool
00077 xml_processor::parse()
00078 {
00079 try
00080 {
00081
00082 _doc.container_reset();
00083 _white_space_stack.clear();
00084 _white_space_allocator->reset();
00085
00086 _entity_map.clear();
00087 _entity_allocator->reset();
00088 _preserve_white_space = false;
00089
00090 _doc.add_escaped_symbols();
00091
00092
00093
00094
00095
00096
00097 parseDocument();
00098
00099
00100 skip_white_space();
00101
00102 if (pick() != 0)
00103 throw_exception("Unrecognized chars after document");
00104
00105 resolve_references();
00106 }
00107 catch (exception& x)
00108 {
00109 _error = x.what();
00110 return false;
00111 }
00112 catch (...)
00113 {
00114 _error = "Unexpected exception has been thrown";
00115 return false;
00116 }
00117
00118 return true;
00119 }
00120
00121 void
00122 xml_processor::parseDocument()
00123 {
00124
00125 parseProlog();
00126
00127
00128 if (_doc._standalone != os_minus_one)
00129 _doc._standalone = get_standalone();
00130
00131 parseElement();
00132
00133 if (_doc.container_peak())
00134 throw_exception("Invalid Root element syntax");
00135
00136
00137 if (!_doc.get_root_element()._decl)
00138 throw_exception("Can't find Root element");
00139
00140 parseMisc();
00141 }
00142
00143 void
00144 xml_processor::parseProlog()
00145 {
00146
00147
00148
00149
00150
00151
00152
00153 skip_white_space();
00154 while (pick() == ch_open_angle)
00155 {
00156 switch (pop())
00157 {
00158
00159
00160
00161
00162 case ch_question:
00163 parsePI();
00164 break;
00165 case ch_bang:
00166
00167
00168 switch (pop())
00169 {
00170 case ch_dash:
00171 parseComment();
00172 break;
00173 case ch_D:
00174 parseDocTypeDecl();
00175 break;
00176 default:
00177 return;
00178 }
00179 break;
00180 default:
00181
00182 push(ch_open_angle);
00183 return;
00184 }
00185
00186 skip_white_space();
00187
00188 reset_all_tmp(true);
00189 _doc.get_tmp_allocator().reset();
00190 }
00191 }
00192
00193 void
00194 xml_processor::parseDocTypeDecl()
00195 {
00196
00197 skip_string(str_DOCTYPE, "Invalid DOCTYPE syntax");
00198 skip_white_space(true, "Expected white char");
00199
00200
00201
00202 if (!_doc.set_doc_name(parseName()) && _doc.is_on_fly())
00203 throw_exception("Invalid DOCTYPE name");
00204
00205 _doc.container_start_doctype();
00206
00207 skip_white_space();
00208
00209
00210 if (pick() == ch_S || pick() == ch_P)
00211 {
00212 if (_doc._standalone == os_minus_one)
00213 {
00214 if (get_standalone() == 1)
00215 throw_exception("Illegal external DOCTYPE in standalone document");
00216 else if (get_standalone() == os_minus_one)
00217 _doc._standalone = 0;
00218 }
00219
00220
00221 parseExternalID(_doc._system_id, _doc._public_id, true);
00222 if (_doc.is_on_fly() && _doc._system_id.length())
00223 parseDTD(_doc._system_id);
00224 }
00225
00226
00227 skip_white_space();
00228 if (pick() == ch_open_square)
00229 {
00230
00231 pop();
00232 if (_doc.is_on_fly())
00233 parseDTD();
00234 else
00235 skipDTD();
00236
00237 skip_white_space();
00238 }
00239
00240 skip_sign(ch_close_angle, false, false, "Expected close tag for DOCTYPE");
00241
00242 if (!_doc.check_root())
00243 throw_exception("Invalid root element name");
00244
00245 _doc.container_stop_doctype();
00246 }
00247
00248 void
00249 xml_processor::parseMisc()
00250 {
00251
00252 skip_white_space();
00253
00254 while (pick() == ch_open_angle)
00255 {
00256 switch (pop())
00257 {
00258 case ch_question:
00259
00260 parsePI();
00261 break;
00262 case ch_bang:
00263 if (ch_dash == pop())
00264 {
00265
00266 parseComment();
00267 break;
00268 }
00269 default:
00270 throw_exception("Invalid markup instruction in Misc, PI or comment are allowed only");
00271 break;
00272 }
00273
00274 skip_white_space();
00275 reset_all_tmp(true);
00276 _doc.get_tmp_allocator().reset();
00277 }
00278 }
00279
00280 void
00281 xml_processor::parseElement()
00282 {
00283 skip_white_space();
00284 if (pick() == ch_ampersand)
00285 parseGeneralReference(true);
00286
00287
00288 while (pick() == ch_open_angle)
00289 {
00290 switch (pop())
00291 {
00292 case ch_forward_slash:
00293 pop();
00294 parseEndTag();
00295
00296 if (!_doc.container_peak())
00297 return;
00298
00299 parseContent();
00300 break;
00301 default:
00302
00303 parseStartTag();
00304
00305 if (!_doc.container_peak())
00306 return;
00307
00308
00309 parseContent();
00310 break;
00311 }
00312
00313 reset_all_tmp(true);
00314 _doc.get_tmp_allocator().reset();
00315 }
00316 }
00317
00318 void
00319 xml_processor::parseAttributes(xml_element& el)
00320 {
00321
00322 attr_states_map_t attrStates;
00323 attr_states_map_t::iterator iterState;
00324
00325
00326 for (attribute_decl_map_t::const_iterator iterDecl = el.cast_decl()->_attributes.begin(); iterDecl != el.cast_decl()->_attributes.end(); ++iterDecl)
00327 attrStates.insert(*_tmp_allocator, &*iterDecl, 0);
00328
00329
00330 skip_white_space();
00331 ub1_t symbol = pick();
00332
00333 while (symbol && symbol != ch_forward_slash && symbol != ch_close_angle)
00334 {
00335 if (symbol == ch_ampersand)
00336 parseGeneralReference(true);
00337
00338 string_t attr_name(_tmp_allocator);
00339 string_t attr_value(_tmp_allocator);
00340
00341 parseAttributeValue(attr_name, attr_value);
00342
00343 xml_value_node* attrRef = _doc.add_attribute(el, attr_name, attr_value);
00344
00345 iterState = attrStates.find(attrRef->cast_to_attribute());
00346
00347 if (iterState == attrStates.end())
00348 attrStates.insert(*_tmp_allocator, attrRef->cast_to_attribute(), attrRef);
00349 else if (*iterState)
00350 throw_exception("Dublicate attribute");
00351 else
00352 *iterState = attrRef;
00353
00354
00355 symbol = pick();
00356 switch (symbol)
00357 {
00358 case ch_forward_slash:
00359 case ch_close_angle:
00360 break;
00361 default:
00362
00363 skip_white_space(true, "Must be separator between attributes");
00364 symbol = pick();
00365 }
00366 }
00367
00368
00369 if (!_doc.is_on_fly() && attrStates.size())
00370 _doc.add_def_attributes(el, attrStates);
00371
00372
00373 const attributeDecl* decl_xml_space = _doc.find_attribute_decl(*el.cast_decl(), str_xml_space);
00374 if (decl_xml_space && (iterState = attrStates.find(decl_xml_space)) != attrStates.end() && *iterState)
00375 {
00376
00377 for (_list< enumNodeDecl >::const_iterator iterEnum = decl_xml_space->_enum.begin(); iterEnum != decl_xml_space->_enum.end(); ++iterEnum)
00378 {
00379 if (iterEnum->_id == (*iterState)->_value.lVal)
00380 {
00381 _preserve_white_space = iterEnum->_value == str_preserve;
00382 xml_white_space_handler handler(&el, _preserve_white_space);
00383 _white_space_stack.push(*_white_space_allocator, handler);
00384 break;
00385 }
00386 }
00387 }
00388 }
00389
00390
00391 void
00392 xml_processor::parseCDATA()
00393 {
00394
00395 skip_string(str_CDATA, "Invalid CDATA syntax");
00396 skip_sign(ch_open_square, false, false, "Expected '[' symbol after <![CDATA instruction");
00397 size_t square_counter = 0;
00398 _tmp_store1.reset();
00399 ub1_t symbol = pick();
00400 while (symbol)
00401 {
00402 switch (symbol)
00403 {
00404 case ch_close_square:
00405 ++square_counter;
00406 break;
00407 case ch_close_angle:
00408 if (square_counter >= 2)
00409 {
00410 pop();
00411 while (square_counter-- > 2)
00412 _tmp_store1 << ch_close_square;
00413
00414 _doc.add_cdata(_tmp_store1.persist());
00415 return;
00416 }
00417 default:
00418 while (square_counter--)
00419 _tmp_store1 << ch_close_square;
00420
00421 _tmp_store1 << symbol;
00422 square_counter = 0;
00423 }
00424
00425 symbol = pop();
00426 }
00427
00428
00429
00430 throw_exception("Incompleted CDATA section");
00431 }
00432
00433 void
00434 xml_processor::_parseCharData()
00435 {
00436
00437 ub1_t symbol = pick();
00438 size_t illegal = 0;
00439
00440 while (symbol && symbol != ch_open_angle)
00441 {
00442
00443 switch (symbol)
00444 {
00445 case ch_ampersand:
00446 if (_tmp_store1.size())
00447 {
00448 _tmp_store3 << _tmp_store1.persist();
00449 _tmp_store1.reset();
00450 }
00451
00452
00453 if (resolveEntity(_tmp_store2))
00454 {
00455 size_t len = 0;
00456 const ub1_t* bptr = _tmp_store2.persist(len);
00457 push(bptr, len);
00458 if (_tmp_store3.size())
00459 _doc.add_text(_tmp_store3.persist());
00460
00461 reset_all_tmp();
00462 return;
00463 }
00464 else
00465 {
00466 _tmp_store3 << _tmp_store2.persist();
00467 illegal = 0;
00468 }
00469
00470 _tmp_store2.reset();
00471 _tmp_store1.reset();
00472 symbol = pick();
00473 break;
00474 case ch_close_angle:
00475 if (illegal >= 2)
00476 throw_exception("Illegal char sequence ]]> in CharData");
00477
00478 if (_tmp_store1.size())
00479 {
00480 _tmp_store3 << _tmp_store1.persist();
00481 _tmp_store1.reset();
00482 }
00483
00484 illegal = 0;
00485 _tmp_store3 << symbol;
00486 symbol = pop();
00487 break;
00488 case ch_close_square:
00489 if (_tmp_store1.size())
00490 {
00491 _tmp_store3 << _tmp_store1.persist();
00492 _tmp_store1.reset();
00493 }
00494
00495 ++illegal;
00496 _tmp_store3 << symbol;
00497 symbol = pop();
00498 break;
00499 default:
00500 if (is_white_space(symbol))
00501 _tmp_store1 << symbol;
00502 else
00503 {
00504 if (_tmp_store1.size())
00505 {
00506 _tmp_store3 << _tmp_store1.persist();
00507 _tmp_store1.reset();
00508 }
00509
00510 _tmp_store3 << symbol;
00511 }
00512
00513 illegal = 0;
00514 symbol = pop();
00515 break;
00516 }
00517 }
00518
00519 if (_tmp_store1.size() && _preserve_white_space)
00520 _tmp_store3 << _tmp_store1.persist();
00521 if (_tmp_store3.size())
00522 _doc.add_text(_tmp_store3.persist());
00523
00524 reset_all_tmp();
00525 }
00526
00527 void
00528 xml_processor::parseDTD(const char* location)
00529 {
00530
00531 if (location)
00532 {
00533 string_t full_path(0, _tmp_allocator);
00534 xml_stream_attribute attr_new;
00535 if (!attr_new.complete_to_full(location, _stream.get_location(), full_path))
00536 {
00537 string_t ex("Can't create full path from location: ");
00538 ex += location;
00539 ex += " and url: ";
00540 ex += _stream.get_location();
00541 exception::_throw(ex);
00542 }
00543
00544 stream_input_common stream(_small_pool, _big_pool, _xml_size, true);
00545 if (!stream.open(attr_new))
00546 {
00547 string_t str = "Can't open location: ";
00548 str += location;
00549 exception::_throw(str);
00550 }
00551
00552
00553 dtd_processor dtd(stream, _doc, _small_pool, _big_pool, _xml_size);
00554 dtd.parse();
00555 stream.close();
00556 }
00557 else
00558 {
00559 dtd_processor dtd(_stream, _doc, _small_pool, _big_pool, _xml_size);
00560
00561 dtd.parse();
00562
00563 pop();
00564 }
00565 }
00566
00567 void
00568 xml_processor::skipDTD()
00569 {
00570 xml_document skipSchema(_small_pool, _big_pool, _xml_size, 0);
00571 dtd_processor dtd(_stream, skipSchema, _small_pool, _big_pool, _xml_size);
00572 dtd.parse();
00573
00574 pop();
00575 }
00576
00577 void
00578 xml_processor::resolve_references()
00579 {
00580 _doc.resolve_references();
00581 }
00582
00583 void
00584 xml_processor::parseGeneralReference(bool skip_after)
00585 {
00586
00587 _tmp_store1.reset();
00588 resolveEntity(_tmp_store1);
00589 size_t count = 0;
00590 const ub1_t* ptr = _tmp_store1.persist(count);
00591 push(ptr, count);
00592 _tmp_store1.reset();
00593 if (skip_after)
00594 skip_white_space();
00595
00596 }
00597
00598 #pragma pack()
00599 END_TERIMBER_NAMESPACE
00600