Terimber Corporation

00001 /*
00002  * The Software License
00003  * =================================================================================
00004  * Copyright (c) 2003-.The Terimber Corporation. All rights reserved.
00005  * =================================================================================
00006  * Redistributions of source code must retain the above copyright notice, 
00007  * this list of conditions and the following disclaimer.
00008  * Redistributions in binary form must reproduce the above copyright notice, 
00009  * this list of conditions and the following disclaimer in the documentation 
00010  * and/or other materials provided with the distribution.
00011  * The end-user documentation included with the redistribution, if any, 
00012  * must include the following acknowledgment:
00013  * "This product includes software developed by the Terimber Corporation."
00014  * =================================================================================
00015  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, 
00016  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
00017  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  
00018  * IN NO EVENT SHALL THE TERIMBER CORPORATION OR ITS CONTRIBUTORS BE LIABLE FOR 
00019  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
00020  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
00021  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
00022  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00023  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00024  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00025  * ================================================================================
00026 */
00027 
00028 #include "xml/dtdxml.h"
00029 #include "xml/declxml.hpp"
00030 #include "xml/defxml.hpp"
00031 #include "xml/mngxml.hpp"
00032 #include "xml/sxml.hpp"
00033 #include "xml/sxs.hpp"
00034 #include "xml/storexml.hpp"
00035 #include "xml/miscxml.hpp"
00036 
00037 #include "base/list.hpp"
00038 #include "base/map.hpp"
00039 #include "base/stack.hpp"
00040 #include "base/string.hpp"
00041 #include "base/common.hpp"
00042 #include "base/memory.hpp"
00043 
00044 BEGIN_TERIMBER_NAMESPACE
00045 #pragma pack(4)
00046 
00047 dtd_processor::dtd_processor(   byte_source& stream, 
00048                                                                 xml_document& doc,
00049                                                                 mem_pool_t& small_pool, 
00050                                                                 mem_pool_t& big_pool,
00051                                                                 size_t xml_size) :
00052         byte_manager(stream, doc, small_pool, big_pool, xml_size)
00053 {
00054 }
00056 // we can parse dtd as external and then stopSymbol = 0
00057 // but also it can be a build-in dtd so we are parsing dtd 
00058 // until stopSymbol = ']' appears
00059 
00060 void 
00061 dtd_processor::parse()
00062 {
00063         // [' (markupdecl | DeclSep)* ']'
00064         parseSubSet(!get_subset() ? ch_close_square : ch_null, get_subset());
00065         validate();
00066         // set standalone
00067         _doc._standalone = 1;
00068 }
00069 
00070 void 
00071 dtd_processor::parseSubSet(char stopSymbol, bool include_allowed)
00072 {
00073         ub1_t symbol = 0;
00074         while ((symbol = pick()) && symbol != stopSymbol)
00075         {
00076                 switch (symbol)
00077                 {
00078                         // [28a]    DeclSep    ::=    PEReference | S 
00079                         // [69]    PEReference    ::=    '%' Name ';' 
00080                         // [29]    markupdecl    ::=    elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment  
00081                         case ch_percent:
00082                                 // [69]    PEReference    ::=    '%' Name ';' 
00083                                 parsePEReference(false, true);
00084                                 break;
00085                         case ch_open_angle:
00086                                 // [45]    elementdecl    ::=    '<!ELEMENT' S Name S contentspec S? '>' 
00087                                 // [52]    AttlistDecl    ::=    '<!ATTLIST' S Name AttDef* S? '>' 
00088                                 // [70]    EntityDecl    ::=    GEDecl | PEDecl 
00089                                 // [71]    GEDecl    ::=    '<!ENTITY' S Name S EntityDef S? '>' 
00090                                 // [72]    PEDecl    ::=    '<!ENTITY' S '%' S Name S PEDef S? '>' 
00091                                 // [82]    NotationDecl    ::=    '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' 
00092                                 // [16]    PI    ::=    '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 
00093                                 // [15]    Comment    ::=    '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 
00094                                 switch (pop()) // skip '<' and check next char
00095                                 {
00096                                         case ch_question:
00097                                                 // '<?'
00098                                                 parsePI();
00099                                                 break;
00100                                         case ch_bang:
00101                                                 // '<!ELEMENT'
00102                                                 // '<!ATTLIST'
00103                                                 // '<!ENTITY'
00104                                                 // '<!NOTATION'
00105                                                 // '<!--'
00106                                                 switch (pop()) // skip '!' and checks the next char
00107                                                 {
00108                                                         case ch_E:
00109                                                                 switch (pop())
00110                                                                 {
00111                                                                         case ch_L: // '<!ELEMENT'
00112                                                                                 parseElement();
00113                                                                                 break;
00114                                                                         case ch_N: // '<!ENTITY'
00115                                                                                 parseEntity();
00116                                                                                 break;
00117                                                                         default:
00118                                                                                 throw_exception("Invalid ELEMENT or ENTITY sections syntax");
00119                                                                 }
00120                                                                 break;
00121                                                         case ch_A: // '<!ATTRLIST'
00122                                                                 parseAttrList();
00123                                                                 break;
00124                                                         case ch_N: // '<!NOTATION'
00125                                                                 parseNotation();
00126                                                                 break;
00127                                                         case ch_dash: // '<!-'
00128                                                                 parseComment();
00129                                                         break;
00130                                                         case ch_open_square: 
00131                                                                 //if (stopSymbol != ch_null) // internal subset
00132                                                                 if (!include_allowed)
00133                                                                         throw_exception("IGNORE or INCLUDE sections are not allowed in internal dtd subset");
00134 
00135                                                                 pop(); // skip '['
00136                                                                 skip_sign(ch_I, true, false, "Expected IGNORE or INCLUDE sections");
00137                                                                 switch (pick()) // tests the next char
00138                                                                 {
00139                                                                         case ch_G:
00140                                                                                 // '<![' S? 'IGNORE'
00141                                                                                 parseIgnore();
00142                                                                                 break;
00143                                                                         case ch_N:
00144                                                                                 // '<![' S? 'INCLUDE'
00145                                                                                 parseInclude();
00146                                                                                 break;
00147                                                                         default:
00148                                                                                 throw_exception("Invalid IGNORE or INCLUDE sections syntax");
00149                                                                                 break;
00150                                                                 } // switch
00151                                                                 break;
00152                                                         default:
00153                                                                 throw_exception("Invalid markup instruction syntax in the internal dtd section");
00154                                                                 break;
00155                                                 } // switch 
00156                                                 break;
00157                                         default:
00158                                                 throw_exception("Unexpected char in dtd markup language");
00159                                                 break;
00160                                 } // switch
00161                                 break;
00162                         default: // only 'S'
00163                                 skip_white_space(true, "White space expected");
00164                                 break;
00165                 } // switch
00166 
00167                 reset_all_tmp(true);
00168                 _doc.get_tmp_allocator().reset();
00169         } // while
00170 
00171 }
00172 
00173 void  
00174 dtd_processor::parseIgnore()
00175 {
00176         // [63]    ignoreSect    ::=    '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
00177         // [64]    ignoreSectContents    ::=    Ignore ('<![' ignoreSectContents ']]>' Ignore)* 
00178         // [65]    Ignore    ::=    Char* - (Char* ('<![' | ']]>') Char*)  
00179 
00180         // first char of the IGNORE word has been eaten by caller
00181         skip_string(str_IGNORE + 1, "Invalid IGNORE section syntax");
00182         skip_sign(ch_open_square, true, true, "Expected IGNORE Bracket");
00183 
00184         // It assumes that
00185         //  we are already in the body, i.e. we've seen <![IGNORE[ at this point. So
00186         //  we just have to scan until we see a matching ]]> closing markup.
00187         //
00188 
00189         //  Depth starts at one because we are already in one section and want
00190     //  to parse until we hit its end.
00191     //
00192     size_t depth = 1;
00193         ub1_t symbol = 0;
00194 
00195     while (0 != (symbol = pick()))
00196     {
00197                 switch (symbol)
00198                 {
00199                         case ch_open_angle:
00200                                 if (pop() == ch_bang && pop() == ch_open_square)
00201                                 {
00202                                         pop();
00203                                         ++depth;        
00204                                 }
00205                                 continue;
00206                         case ch_close_square:
00207                                 if (pop() == ch_close_square && pop() == ch_close_angle)
00208                                 {
00209                                         pop();
00210                                         if (!--depth)
00211                                                 return;
00212                                 }
00213                                 continue;
00214                         default:
00215                                 break;
00216         } // switch
00217 
00218                 pop();
00219                 skip_white_space();
00220     } // while
00221 
00222         // 
00223         throw_exception("Invalid IGNORE section syntax");
00224 }
00225 
00226 void  
00227 dtd_processor::parseInclude()
00228 {
00229         // [62]    includeSect    ::=    '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
00230         // first char of the INCLUDE word has been eaten by caller
00231         skip_string(str_INCLUDE + 1, "Invalid INCLUDE section syntax");
00232         skip_sign(ch_open_square, true, true, "Expected open square tag symbol after INCLUDE section");
00233         parseSubSet(ch_close_square, true);
00234         skip_sign(ch_close_square, true, false, "Expected INCLUDE close bracket");
00235         skip_sign(ch_close_square, false, false, "Expected INCLUDE second close bracket");
00236         skip_sign(ch_close_angle, false, false, "Expected INCLUDE close angle");
00237 }
00238 
00239 
00240 void  
00241 dtd_processor::parseElement()
00242 {
00243         // [45]    elementdecl    ::=    '<!ELEMENT' S Name S contentspec S? '>' 
00244         // first char of the ELEMENT word has been eaten by caller
00245         skip_string(str_ELEMENT + 1, "Invalid ELEMENT section syntax");
00246         skip_white_space(true, "Expected white space");
00247         
00248         parsePEReference(false, true);
00249 
00250         elementDecl& decl = _doc.add_element_decl(parseName(), false, true, false);
00251         skip_white_space(true, "Expected white space");
00252         parsePEReference(false, false);
00253 
00254     // And now scan the content model for this guy.
00255     parseContentSpec(decl);
00256     // Another check for a PE ref, but we don't require whitespace here
00257         skip_sign(ch_close_angle, true, false, "Expected close tag");
00258         _doc.add_element_desc(decl);
00259 }
00260 
00261 void  
00262 dtd_processor::parseContentSpec(elementDecl& decl)
00263 {
00264         skip_white_space();
00265 
00266         switch (pick())
00267         {
00268                 case ch_E:
00269                         skip_string(str_EMPTY, "Invalid EMPTY declaration");
00270                         decl._content = CONTENT_EMPTY;
00271                         break;
00272                 case ch_A:
00273                         skip_string(str_ANY, "Invalid ANY declaration");
00274                         decl._content = CONTENT_ANY;
00275                         break;
00276                 case ch_open_paren:
00277                         pop();
00278                         // We could have a PE ref here, but don't require space
00279                         parsePEReference(true, true);
00280                         //  Now we look for a PCDATA string_t. If its PCDATA, then it must be a
00281                         //  MIXED model. Otherwise, it must be a regular list of children in
00282                         //  a regular expression perhaps.
00283                         switch (pick())
00284                         {
00285                                 case ch_pound:
00286                                         skip_string(str__PCDATA, "Invalid PCDATA declaration");
00287                                         decl._content = CONTENT_MIXED;
00288                                         decl._token = parseMixed();
00289                                         return;
00290                                 default: // must be children
00291                                         decl._content = CONTENT_CHILDREN;
00292                                         decl._token = parseChildren();
00293                                         return;
00294                         } // switch
00295                 default:
00296                         throw_exception("Invalid element content");
00297     } // switch
00298 
00299         // validates Deterministic Content Models (Non-Normative)
00300         deterministic_model(decl._token);
00301 }
00302 
00303 dfa_token*  
00304 dtd_processor::parseMixed()
00305 {
00306     //  Creates an initial content spec node. Its just a leaf node with a
00307     //  PCDATA element id. This current node pointer will be pushed down the
00308     //  tree as we go.
00309     
00310         dfa_token* curToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token)))) dfa_token(DFA_LEAF, 0, 0, 0);
00311         dfa_token* orgToken = curToken;
00312         dfa_token* headToken = orgToken;
00313    
00314         _list< const elementDecl* > uniqueList;
00315         //
00316     //  We just loop around, getting the | character at the top and then
00317     //  looking for the next element name. We keep up with the last node
00318     //  and add each new one to its last node.
00319     //
00320         bool hasElement = false;
00321         ub1_t symbol = 0;
00322 
00323     while (0 != (symbol = pick()))
00324     {
00325         skip_white_space();   
00326                 switch (symbol = pick())
00327                 {
00328                         case ch_percent:
00329                                 parsePEReference(false, false);
00330                                 break;
00331                         case ch_pipe:   
00332                                 pop(); // skips '|'
00333                                 parsePEReference(true, true);
00334 
00335                                 // parses name of element
00336                                 {
00337                                         const elementDecl& decl = _doc.add_element_decl(parseName(), false, false, false);
00338 
00339                                         // checks the first element after #PCDATA
00340                                         if (curToken == orgToken)
00341                                         {
00342                                                 hasElement = true;
00343                                                 curToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token)))) 
00344                                                                                                         dfa_token(DFA_CHOICE, 0, curToken, new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token)))) 
00345                                                                                                                                                                                         dfa_token(DFA_LEAF, &decl, 0, 0));
00346                                                 // Remembers the top token
00347                                                 headToken = curToken;
00348                                         }
00349                                         else
00350                                         {
00351                                                 // searches for the present
00352                                                 for (_list< const elementDecl* >::const_iterator iter = uniqueList.begin(); iter != uniqueList.end(); ++iter)
00353                                                         if (*iter == &decl)
00354                                                                 throw_exception("Dublicate elements in Mixed model");
00355 
00356                                                 dfa_token* oldLast = curToken->_last;
00357                                                 curToken->_last = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token))))
00358                                                                                                         dfa_token(DFA_CHOICE, 0, oldLast, new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token))))
00359                                                                                                                                                                                         dfa_token(DFA_LEAF, &decl, 0, 0));
00360 
00361                                                 // Make the new last node the current node
00362                                                 curToken = curToken->_last;
00363                                         }
00364 
00365                                         // pushes to list
00366                                         uniqueList.push_back(*_tmp_allocator, &decl);
00367                                 }
00368                                 break;
00369                         case ch_close_paren:
00370                                 if (ch_asterisk == pop()) // skips ')'
00371                                         skip_sign(ch_asterisk, false, false, "Expected asterisk symbol");
00372                                 
00373                 //
00374                 //  Creates a zero or more node and makes the original head
00375                 //  node its first child.
00376                 //
00377                 if (hasElement) 
00378                                 {
00379                     headToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token))))
00380                                                                                                         dfa_token(DFA_ASTERISK, 0, headToken, 0);
00381                 }
00382 
00383                 return headToken;
00384                         default: // unkonwn
00385                                 throw_exception("Invalid element PCDATA syntax");
00386         } // switch
00387     } // while
00388 
00389         throw_exception("Invalid element PCDATA syntax");
00390         return 0;
00391 }
00392 
00393 dfa_token*
00394 dtd_processor::checkRepeation(ub1_t symbol, dfa_token* token)
00395 {
00396         dfa_token* retVal = token;
00397 
00398         switch (symbol)
00399         {
00400                 case ch_question: // '?'
00401                         retVal = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token)))) dfa_token(DFA_QUESTION, 0, token, 0);
00402                         pop();
00403                         break;
00404                 case ch_asterisk: // '*'
00405                         retVal = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token)))) dfa_token(DFA_ASTERISK, 0, token, 0);
00406                         pop();
00407                         break;
00408                 case ch_plus: // '+'
00409                         retVal = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token)))) dfa_token(DFA_PLUS, 0, token, 0);
00410                         pop();
00411                         break;
00412                 default:
00413                         break;
00414         }
00415 
00416         return retVal;
00417 }
00418 
00419 dfa_token*
00420 dtd_processor::parseChildren()
00421 {
00422         parsePEReference(true, true);
00423 
00424         ub1_t symbol = 0;
00425         ub1_t type_char = 0;
00426         dfa_token* curToken = 0;
00427         dfa_token* headToken = 0;
00428 
00429         switch (pick())
00430         {
00431                 case ch_open_paren: // group
00432                         {
00433                                 pop(); // skips '('
00434                                 // here we don't know the sub group type
00435                                 // Lets call ourself and get back the resulting node
00436                                 curToken = parseChildren();
00437                         }
00438                         break;
00439                 default: // element name
00440                         {
00441                                 const elementDecl& decl = _doc.add_element_decl(parseName(), false, false, false);
00442                                 //  Creates a leaf token for it.
00443                                 curToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token)))) 
00444                                                                                                         dfa_token(DFA_LEAF, &decl, 0, 0);
00445 
00446                                 parsePEReference(true, true);
00447                                 curToken = checkRepeation(pick(), curToken);
00448                         }
00449         } // switch
00450 
00451     // checks the next symbol
00452         parsePEReference(true, true);
00453 
00454         switch (type_char = pick())
00455         {
00456                 case ch_comma:
00457                 case ch_pipe:
00458                         headToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token))))
00459                                                 dfa_token(type_char == ch_comma ? DFA_SEQUENCE : DFA_CHOICE, 0, curToken, 0);
00460                         curToken = headToken;
00461                         break;
00462                 case ch_close_paren:
00463                         headToken = curToken;
00464                         pop();
00465                         break;
00466                 default:
00467                         throw_exception("Expected CHOICE or SEQUENCE or CLOSE PAREN syntax");
00468         } // switch
00469 
00470         if ((type_char == ch_comma) || (type_char == ch_pipe))
00471         {
00472                 dfa_token* lastToken = 0;
00473                 dfaRule typeRule = type_char == ch_comma ? DFA_SEQUENCE : DFA_CHOICE;
00474 
00475                 while (pick())
00476                 {
00477                         parsePEReference(true, true);
00478 
00479                         symbol = pick();
00480 
00481                         if (symbol == ch_close_paren)
00482                         {
00483                                 pop(); // skips ')'
00484                                 //  We've hit the end of this section, so break out. But, we
00485                                 //  need to see if we left a partial sequence of choice nodes
00486                                 //  without a second node. If so, we have to undo that and
00487                                 //  put its left child into the right node of the previous
00488                                 //  node.
00489                                 if (!curToken->_last)
00490                                 {
00491                                         dfa_token* oldFirst = curToken->_first;
00492                                         curToken->_first = 0;
00493                                         lastToken->_last = oldFirst;
00494                                         curToken = lastToken;
00495                                 }
00496                                 break;
00497                         }
00498                         else if (symbol == ch_comma || symbol == ch_pipe)
00499                         {
00500                                 if (symbol != type_char)
00501                                         throw_exception("Expected legal CHOICE/SEQUENCE syntax");
00502 
00503                                 pop(); // skips '|' or ','
00504                                 parsePEReference(true, true);
00505 
00506                                 if ((symbol = pick()) == ch_open_paren)
00507                                 {
00508                                         pop(); // skips '('
00509                                         // Recurses to handle this new guy
00510                                         dfa_token* newToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token))))
00511                                                                                                 dfa_token(typeRule, 0, parseChildren(), 0);
00512 
00513                                         
00514                                         curToken->_last = newToken;
00515                                         lastToken = curToken;
00516                                         curToken = newToken;
00517                                 }
00518                                 else
00519                                 {
00520                                         //
00521                                         //  Has to be a leaf node, so gets a name. If it cannot get
00522                                         //  one, then it cleans up and gets out of here.
00523                                         //
00524                                         const elementDecl& decl = _doc.add_element_decl(parseName(), false, false, false);
00525                                         dfa_token* tmpToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token))))
00526                                                                                                 dfa_token(DFA_LEAF, &decl, 0, 0);
00527 
00528                                         parsePEReference(true, true);
00529 
00530                                         tmpToken = checkRepeation(pick(), tmpToken);
00531 
00532                                         //
00533                                         //  Creates a new sequence or choice node, with the leaf
00534                                         //  (or rep surrounding it) we just recieved as its first node.
00535                                         //  Makes the new node the second node of the current node,
00536                                         //  and then makes it the current node.
00537                                         //
00538                                         dfa_token* newToken = new(check_pointer(_doc.get_model_allocator().allocate(sizeof(dfa_token))))
00539                                                                                                 dfa_token(typeRule, 0, tmpToken, 0);
00540 
00541                                         curToken->_last = newToken;
00542                                         lastToken = curToken;
00543                                         curToken = newToken;
00544                                 }
00545                         } // else if
00546                         else
00547                                 throw_exception("Expected legal CHOICE/SEQUENCE syntax");
00548                 } // while
00549         } // if
00550 
00551     //
00552     //  We saw the terminating parenthesis so lets check for any repetition
00553     //  character, and create a node for that, making the head node its child
00554     //  
00555     //
00556         return checkRepeation(pick(), headToken);
00557 }
00558 
00559 void  
00560 dtd_processor::parseEntity()
00561 {
              // Parses one ENTITY declaration (general or parameter entity) up to and
              // including the closing '>', and hands the result to _doc through
              // add_entity_decl() / add_entity_desc().
00562         // [70]    EntityDecl    ::=    GEDecl | PEDecl 
00563         // first char of the ENTITY word has been eaten by caller
00564         skip_string(str_ENTITY + 1, "Invalid ENTITY section syntax");
00565         skip_white_space(true, "Expected white space");
00566 
00567         // next must be either the percent sign (parameter entity) or the entity name
00568         bool bPEDecl = false;
00569         if (pick() == ch_percent)
00570         {
00571                 pop(); // skips '%'
00572                 skip_white_space(true, "Expected white space");
00573                 bPEDecl = true;
00574         }
00575 
00576 
00577         // checks in the entity map
              // When add_entity_decl() reports wasAdded == false the definition is still
              // parsed, but into the local 'dummy' object, so the entry returned by the
              // document is left untouched (presumably the name was already declared -
              // TODO confirm against add_entity_decl()).
00578         bool wasAdded = false;
00579         entityDecl dummy(0, _tmp_allocator);
00580         entityDecl& entry = _doc.add_entity_decl(parseName(), wasAdded);
00581         entityDecl& entity = wasAdded ? entry : dummy;
00582 
00583         entity._is_parameter = bPEDecl;
00584 
00585         skip_white_space(true, "Expected white space");
00586         parsePEReference(false, true);
00587 
00588         // [73]    EntityDef    ::=    EntityValue | (ExternalID NDataDecl?) 
00589         parseEntityDef(entity);
00590         skip_sign(ch_close_angle, true, false, "Expected close tag");
              // only a newly added declaration is passed on to add_entity_desc()
00591         if (wasAdded)
00592                 _doc.add_entity_desc(entry);
00593 }
00594 
00595 void  
00596 dtd_processor::parseEntityDef(entityDecl& decl)
00597 {
              // Parses the definition part of an entity declaration into 'decl'.
              // A quoted literal is accumulated in _tmp_store3 and stored in decl._value;
              // otherwise an ExternalID is read into decl._systemId / decl._publicId,
              // optionally followed (non-parameter entities only) by an NDATA notation name.
              // Calls throw_exception() when the literal is not terminated or when a
              // parameter entity reference resolves to the entity being defined.
00598         // [73]    EntityDef    ::=    EntityValue | (ExternalID NDataDecl?)
00599         reset_all_tmp();
00600 
00601         ub1_t symbol = 0;
              // true while a #xD has been read and it is not yet known whether a #xA follows
00603         bool met_xD = false;
00604 
00605         // checks quote
00606         if ((symbol = pick()) == ch_double_quote || symbol == ch_single_quote)
00607         {
00608                 ub1_t quote = skip_quote(0);
00609                 while (0 != (symbol = pick()))
00610                 {
00611                         // checks the next char
00612                         if(symbol == ch_percent) // PERef
00613                         {
                                      // a pending #xD is not followed by #xA: emit it as #xA now,
                                      // so that it stays in front of the expanded text
                                      if (met_xD)
                                      {
                                              _tmp_store3 << ch_lf;
                                              met_xD = false;
                                      }
00614                                 // expands the Parameter Entity
00615                                 // and continue the parsing process
00616                                 // preventing recursive reference
00617                                 if (&decl == expandPEReference(_tmp_store2))
00618                                         throw_exception("Recursive PE is not allowed");
00619 
                                      // the replacement text is pushed back onto the input
                                      // and scanned by this same loop
00620                                 size_t len = 0;
00621                                 const ub1_t* ptr = _tmp_store2.persist(len);
00622                                 push(ptr, len);
00623                                 _tmp_store2.reset();
00624                                 continue;
00625                         }
00626                         else if (symbol == ch_ampersand)
00627                         {
                                      // same end-of-line handling as for the PERef above
                                      if (met_xD)
                                      {
                                              _tmp_store3 << ch_lf;
                                              met_xD = false;
                                      }
00628                                 symbol = pop(); // skips '&'
00629                                 if (symbol == ch_pound) // CharRef
00630                                         parseCharRef(_tmp_store3);
00631                                 else // Entity Reference as it is
00632                                 {
00633                                         // NB!!! inside _tmp_store1 will be used
00634                                         const char* value = parseName();
00635                                         skip_sign(ch_semicolon, false, false, "Expected semicolon after Entity Reference");
00636                                         _tmp_store3 << ch_ampersand << value << ch_semicolon;
00637                                 }
00638                         }
00639                         else if (symbol == quote)
00640                         {
00641                                 pop();
00642                                 // a trailing #xD (not followed by #xA) is translated to #xA
00643                                 if (met_xD) _tmp_store3 << ch_lf;
00644                                 decl._value = _tmp_store3.persist();
                                      // NOTE(review): this early return skips the decl._is_in_subset
                                      // assignment at the end of the function - confirm that is intended.
00645                                 return;
00646                         }
00647                         else
00648                         {
00649                                 // translates the two-character sequence #xD #xA 
00650                                 // and any #xD that is not followed by #xA to #xA 
00651                                 // on input before parsing
00652                                 switch (symbol)
00653                                 {
00654                                         case ch_lf: // #xA
00655                                                 _tmp_store3 << ch_lf;
00656                                                 // resets boolean
00657                                                 met_xD = false; // a #xD directly before this #xA was skipped
00658                                                 break;
00659                                         case ch_cr:     // #xD
                                                      // the previous #xD was not followed by #xA
                                                      if (met_xD) _tmp_store3 << ch_lf;
00660                                                 // sets boolean
00661                                                 met_xD = true;
00662                                                 break;
00663                                         default:
00664                                                 if (met_xD) // we previously got xD but there isn't a next xA
00665                                                 {
00666                                                         // replaces to xA
00667                                                         _tmp_store3 << ch_lf;
00668                                                         // resets boolean
00669                                                         met_xD = false;
00670                                                 }
00671                                                 _tmp_store3 << symbol;
00672                                 } // switch
00673                                 pop();
00674                         }
00675                 } // while
00676 
                      // the loop only ends when pick() returned 0, i.e. before the closing quote was seen
00678                 throw_exception("Invalid ENTITY syntax");
00679         } // if
00680 
00681         // [75]    ExternalID    ::=    'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral  
00682         skip_white_space();
00683         string_t value_system(_tmp_allocator);
00684         string_t value_public(_tmp_allocator);
00685         parseExternalID(value_system, value_public, true);
00686 
00687         decl._systemId = value_system;
00688         decl._publicId = value_public;
00689 
00690         if (!decl._is_parameter && is_white_space(pick())) // NDATA can be
00691         {
00692                 // [76]    NDataDecl    ::=    S 'NDATA' S Name
00693                 parsePEReference(true, true);
00694 
00695                 if (pick() == ch_N)
00696                 {
00697                         skip_string(str_NDATA, "Invalid NDATA syntax");
00698                         skip_white_space(true, "Expected white space");
00699                         parsePEReference(false, true);
00700 
00701                         decl._notation = parseName();
00702                         decl._is_unparsed = true;
00703                 } // if
00704         } // if
00705 
00706         decl._is_in_subset = true;
00707 }
00708         
00709 void  
00710 dtd_processor::parseAttrList()
00711 {
              // Parses an attribute-list declaration: the keyword, mandatory white space,
              // the element name, then attribute definitions until the closing '>'.
              // The element declaration is obtained from _doc.add_element_decl() and every
              // definition is parsed into it by parseAttDef().
              // Calls throw_exception() if the input ends before '>' is seen.
00712         skip_string(str_ATTRLIST, "Invalid ATTRLIST section syntax");
00713         // Space is required here
00714         skip_white_space(true, "Expected white space");
00715         parsePEReference(false, true);
00716 
00717 
00718         elementDecl& decl = _doc.add_element_decl(parseName(), false, false, false);
00719 
              // loops while pick() returns a non-zero char
00720         while (pick())
00721         {
00722                 skip_white_space();
00723 
00724                 switch (pick())
00725                 {
00726                         case ch_close_angle:
00727                                 pop(); // skips '>'
00728                                 return;
00729                         case ch_percent:
                                      // parameter entity reference: its replacement text is pushed
                                      // back onto the input and handled by the next iteration
00730                                 parsePEReference(false, false);
00731                                 break;
00732                         default: // attribute Def
00733                                 parseAttDef(decl);
00734                                 break;
00735                 }
00736         } // while 
00737 
              // reached only when pick() returned 0 before the closing '>'
00738         throw_exception("Invalid ATTLIST syntax");
00739 }
00740 
00741 void  
00742 dtd_processor::parseAttDef(elementDecl& decl)
00743 {
00744         bool wasAdded = false;
00745         attributeDecl dummy(0, _tmp_allocator);
00746 
00747         attributeDecl& new_decl = _doc.add_attribute_decl(decl, parseName(), false, wasAdded);
00748         attributeDecl& attr_decl = wasAdded ? new_decl : dummy;
00749 
00750         skip_white_space(true, "Expected white space");
00751         // next must be type
00752         parsePEReference(false, false);
00753 
00754         switch (pick())
00755         {
00756                 case ch_C: // CDATA
00757                         if (pop() == ch_D)
00758                         {
00759                                 skip_string(str_CDATA + 1, "Invalid CDATA syntax");
00760                                 attr_decl._atype = ATTR_TYPE_CDATA;
00761                                 attr_decl._ctype = vt_string;
00762                         }
00763                         else
00764                         {
00765                                 skip_string(str_CTYPE + 1, "Invalid CTYPE syntax");
00766                                 attr_decl._atype = ATTR_TYPE_CDATA;
00767                                 skip_white_space(true, "Expected white space");
00768                                 attr_decl._ctype = convert_ctype(parseName());
00769                         }
00770                         break;
00771                 case ch_I: // ID, IDREF, IDREFS
00772                         skip_string(str_ID, "Invalid CDATA syntax");
00773                         if (pick() != ch_R)
00774                         {
00775                                 attr_decl._atype = ATTR_TYPE_ID;
00776                                 if (wasAdded)
00777                                 {
00778                                         // checks id uniqueness
00779                                         for (attribute_decl_map_t::const_iterator iter = decl._attributes.begin(); iter != decl._attributes.end(); ++iter)
00780                                                 if (&*iter != &new_decl && iter->_atype == ATTR_TYPE_ID)
00781                                                         throw_exception("Dublicate ID type for the same element");
00782                                 }
00783                         }
00784                         else
00785                         {
00786                                 skip_string(str_REF, "Invalid IDREF syntax");
00787                                 if (pick() == ch_S)
00788                                 {
00789                                         attr_decl._atype = ATTR_TYPE_IDREFS;
00790                                         pop(); // skips 'S'
00791                                 }
00792                                 else
00793                                         attr_decl._atype = ATTR_TYPE_IDREF;
00794                         }
00795 
00796                         attr_decl._ctype = vt_string;
00797                         break;
00798                 case ch_E:
00799                         skip_string(str_ENTIT, "Invalid ENTITY syntax");
00800                         if (pick() == ch_Y)
00801                         {
00802                                 attr_decl._atype = ATTR_TYPE_ENTITY;
00803                                 pop(); // skips 'Y'
00804                         }
00805                         else if (pick() == ch_I && pop() == ch_E && pop() == ch_S)
00806                         {
00807                                 attr_decl._atype = ATTR_TYPE_ENTITIES;
00808                                 pop(); // skips 'S'
00809                         }
00810                         else
00811                                 throw_exception("Invalid ENTITY syntax");
00812 
00813                         attr_decl._ctype = vt_string;
00814                         break;
00815                 case ch_N:
00816                         if (pop() == ch_M)
00817                         {
00818                                 skip_string(str_NMTOKEN + 1, "Invalid NMTOKEN syntax");
00819                                 if (pick() == ch_S)
00820                                 {
00821                                         attr_decl._atype = ATTR_TYPE_NMTOKENS;
00822                                         pop(); // skips 'S'
00823                                 }
00824                                 else
00825                                         attr_decl._atype = ATTR_TYPE_NMTOKEN;
00826 
00827                                 attr_decl._ctype = vt_string;
00828                         }
00829                         else// notation
00830                         {
00831                                 skip_string(str_NOTATION + 1, "Invalid NOTATION syntax");
00832                                 skip_white_space(true, "Expected white space");
00833                                 attr_decl._atype = ATTR_TYPE_NOTATION;
00834                                 attr_decl._ctype = vt_enum;
00835 
00836                                 if (wasAdded)
00837                                 {
00838                                         // checks empty element
00839                                         if (decl._content == CONTENT_EMPTY)
00840                                                 throw_exception("An attribute of type NOTATION must not be declared on an element declared EMPTY");
00841 
00842                                         // checks id uniqueness
00843                                         for (attribute_decl_map_t::const_iterator iter = decl._attributes.begin(); iter != decl._attributes.end(); ++iter)
00844                                                 if (&*iter != &new_decl && iter->_atype == ATTR_TYPE_NOTATION)
00845                                                         throw_exception("No element type may have more than one NOTATION attribute specified");
00846                                 }
00847 
00848                                 parseAttrEnumeration(attr_decl);
00849                         }
00850                         break;
00851                 case ch_open_paren:
00852                         {
00853                                 attr_decl._atype = ATTR_TYPE_ENUMERATION;
00854                                 attr_decl._ctype = vt_enum;
00855                                 parseAttrEnumeration(attr_decl);
00856                         }
00857                         break;
00858                 default:
00859                         throw_exception("Unexpected char in attribute definition");
00860     } // switch
00861 
00862         skip_white_space(true, "Expected white space");
00863         parsePEReference(false, true);
00864 
00865     // And then scans for the optional default value declaration
00866     parseDefaultDecl(attr_decl);
00867 
00868         // validate there something if first
00869         if (!wasAdded)
00870                 return;
00871 
00872         if (attr_decl._atype == ATTR_TYPE_ID && attr_decl._rule != ATTR_RULE_IMPLIED && attr_decl._rule != ATTR_RULE_REQUIRED)
00873                 throw_exception("Invalid AttrDef syntax");
00874 
00875         // A special attribute named xml:space may be attached to an element to signal an intention that in that element, 
00876         // white space should be preserved by applications. In valid documents, this attribute, like any other, 
00877         // must be declared if it is used. 
00878         // When declared, it must be given as an enumerated type whose values are one or both of "default" and "preserve". 
00879         if (!strcmp(str_xml_space, attr_decl._name))
00880         {
00881                 if (attr_decl._atype == ATTR_TYPE_ENUMERATION)
00882                 {
00883                         size_t count = attr_decl._enum.size();
00884 
00885                         if (count < 1 || count > 2)
00886                                 throw_exception("Invalid xml space syntax");
00887 
00888                         if (count == 1 && 
00889                                 !(      attr_decl._enum.front()._value == str_default 
00890                                         //|| attr_decl._enum.front()._value != str_preserve)
00891                                         || attr_decl._enum.front()._value == str_preserve)
00892                                 )
00893                                 throw_exception("Invalid xml space syntax");
00894 
00895                         if (count == 2 && 
00896                                 !(attr_decl._enum.front()._value == str_default
00897                                         && attr_decl._enum.back()._value == str_preserve
00898                                         || attr_decl._enum.back()._value == str_default
00899                                         && attr_decl._enum.front()._value == str_preserve)
00900                                 )
00901                                 throw_exception("Invalid xml space syntax");
00902                 }
00903                 else if (attr_decl._atype == ATTR_TYPE_CDATA && attr_decl._rule == ATTR_RULE_FIXED)
00904                 {
00905                         if (attr_decl._defval != str_default
00906                                 && attr_decl._defval != str_preserve)
00907                                 throw_exception("Invalid xml space syntax");
00908                 }
00909                 else
00910                         throw_exception("Invalid xml space syntax");
00911         }
00912         
00913         // check default
00914         if (attr_decl._defval.length())
00915         {
00916                 const char* value_ = attr_decl._defval;
00917                 switch (attr_decl._atype)
00918                 {
00919                         case ATTR_TYPE_ENUMERATION:
00920                                 {
00921                                         bool findDefault = false;
00922                                         for (_list< enumNodeDecl >::const_iterator iter = attr_decl._enum.begin(); iter != attr_decl._enum.end(); ++iter)
00923                                         {
00924                                                 if (!findDefault && attr_decl._defval == iter->_value)
00925                                                 {
00926                             findDefault = true;
00927                                                         break;
00928                                                 }
00929                                         }
00930 
00931                                         if (!findDefault)
00932                                                 throw_exception("Default value doesn't match the enumeration items");
00933                                 }
00934                                 break;
00935                         case ATTR_TYPE_ID:
00936                         case ATTR_TYPE_NMTOKEN:
00937                                 //check_string_content(attr_decl._defval, attr_decl._defval.length(), is_name_char, ATTR_TYPE_ID ? "Invalid char in ID default value" : "Invalid char in NMTOKEN default value");
00938                                 break;
00939                         case ATTR_TYPE_NMTOKENS:
00940                                 {
00941                                         _list< const char* > values;
00942                                         tokenValues(value_, values, *_tmp_allocator);
00943                                         _list< const char* > defvalues;
00944                                         tokenValues(attr_decl._defval, defvalues, *_tmp_allocator); 
00945 
00946                                         if (values.empty())
00947                                                 throw_exception("Invalid ENTITIES default value syntax");
00948 
00949                                         for (_list< const char* >::const_iterator iter = values.begin(); iter != values.end(); ++iter)
00950                                         {
00951                                                 //check_string_content(*iter, -1, is_name_char, "Illigal token char in ENTITIES default value");
00952                                                 bool findDefault = false;
00953                                                 for (_list< const char* >::const_iterator defiter = defvalues.begin(); defiter != defvalues.end(); ++defiter)
00954                                                 {
00955                                                         if (!findDefault && !strcmp(*defiter, *iter))
00956                                                         {
00957                                                                 findDefault = true;
00958                                                                 break;
00959                                                         }
00960                                                 }
00961                                         
00962                                                 if (!findDefault)
00963                                                         throw_exception("Default value doesn't match the enumeration items");
00964                                         }
00965                                 }
00966                                 break;
00967                         default:
00968                                 break;
00969                 } // switch
00970         } // if
00971 }
00972 
00973 void  
00974 dtd_processor::parseAttrEnumeration(attributeDecl& decl)
00975 {
              // Parses a parenthesized, '|'-separated list of values and appends one
              // enumNodeDecl per value to decl._enum; items get consecutive ids starting at 0.
              // Calls throw_exception() when the list is malformed or the input ends
              // before the closing paren.
00976         //if (decl._atype == ATTR_TYPE_NOTATION)
00977         //      skip_white_space(true, "Expected white space");
00978 
00979         skip_sign(ch_open_paren, false, false, "Expected open paren symbol");
00980 
00981         size_t counter = 0;
00982 
00983     while (pick())
00984     {
00985                 parsePEReference(true, true);
00986 
                      // the item is built locally and then handed to the list together with the model allocator
00987                 enumNodeDecl attrEnum(&_doc.get_model_allocator());
00988                 attrEnum._id = counter++;
00989                 attrEnum._value = parseValue();
00990                 decl._enum.push_back(_doc.get_model_allocator(), attrEnum);
00991 
00992                 skip_white_space();
00993         // Checks for the terminating paren
00994                 if (pick() == ch_close_paren)
00995                 {
00996                         pop();
00997                         return;
00998                 }
00999 
01000                 skip_sign(ch_pipe, false, false, "Expected Enum Separator");
01001     }
              // reached only when pick() returned 0, i.e. the input ended inside the
              // enumeration; report it instead of returning as if the list were complete
              throw_exception("Invalid enumeration syntax");
01002 }
01003 
01004 void  
01005 dtd_processor::parseDefaultDecl(attributeDecl& decl)
01006 {
              // Parses the default declaration of an attribute definition into 'decl':
              // a '#' keyword (REQUIRED, IMPLIED, or FIXED followed by a quoted value)
              // or a bare quoted default value. Sets decl._rule and, where a value is
              // given, decl._defval. Calls throw_exception() on an unknown '#' keyword
              // and on a bare default value for an attribute of type ID.
01007         switch (pick())
01008         {
01009                 case ch_pound: // required, implied or fixed
                              // pop() consumes '#'; its result is compared with the first keyword char
01010                         if (pop() == ch_R)
01011                         {
01012                                 skip_string(str_REQUIRED, "Invalid REQUIRED syntax");
01013                                 decl._rule = ATTR_RULE_REQUIRED;
01014                         }
01015                         else if (pick() == ch_I)
01016                         {
01017                                 skip_string(str_IMPLIED, "Invalid IMPLIED syntax");
01018                                 decl._rule = ATTR_RULE_IMPLIED;
01019                         }
01020                         else if (pick() == ch_F) // fixed
01021                         {
01022                                 skip_string(str_FIXED, "Invalid FIXED syntax");
01023                                 decl._rule = ATTR_RULE_FIXED;
01024 
01025                                 skip_white_space(true, "Expected white space");
01026                                 // must be default value
01027                                 // resolves entities
01028                                 //decl._defval = parseQuotedValue(true, false, is_name_char, "Illigal token char in attribute default value");
01029                                 decl._defval = parseQuotedValue(true, false, 0, 0);
01030                         }
                              else // '#' followed by anything else is not a legal default declaration
                                      throw_exception("Expected REQUIRED, IMPLIED or FIXED after '#'");
01031                         break;
01032                 default:
01033                         {
01034                                 // checks for ID constrain
01035                                 if (decl._atype == ATTR_TYPE_ID)
01036                                         throw_exception("An ID attribute must have a declared default of #IMPLIED or #REQUIRED");
                                      // NOTE(review): a bare default value is recorded with ATTR_RULE_REQUIRED;
                                      // confirm whether a dedicated rule for plain defaults was intended.
01037                                 decl._rule = ATTR_RULE_REQUIRED;
01038                                 // must be default value
01039                                 //decl._defval = parseQuotedValue(true, false, is_name_char, "Illigal token char in attribute default value");
01040                                 decl._defval = parseQuotedValue(true, false, 0, 0);
01041                         }
01042                         break;
01043         } // switch
01044 }
01045 
01046 void  
01047 dtd_processor::parseNotation()
01048 {
              // Parses a NOTATION declaration: the keyword, the notation name and its
              // external identifier, up to and including the closing '>'.
              // The declaration object comes from _doc.add_notation_decl() and receives
              // the parsed public and system identifiers.
01049         skip_string(str_NOTATION, "Invalid NOTATION section syntax");
01050         skip_white_space(true, "Expected white space");
01051         parsePEReference(false, true);
01052 
01053         notationDecl& decl = _doc.add_notation_decl(parseName());
01054 
01055         skip_white_space(true, "Expected white space");
01056         parsePEReference(false, true);
01057 
              // the identifiers are collected in temporaries before being copied into the declaration
01058         string_t value_system(_tmp_allocator);
01059         string_t value_public(_tmp_allocator);
01060 
              // NOTE(review): the last argument is false here but true in parseEntityDef();
              // presumably it controls whether a system literal is mandatory - confirm
              // against parseExternalID().
01061         parseExternalID(value_system, value_public, false);
01062 
01063         decl._publicId = value_public;
01064         decl._systemId = value_system;
01065         skip_sign(ch_close_angle, true, false, "Expected close tag");
01066 }
01067 
01068 
01069 void  
01070 dtd_processor::validate()
01071 {
              // Currently a no-op: the body is empty.
01072 }
01073 
01074 void  
01075 dtd_processor::parsePEReference(bool skip_junk_before, bool skip_junk_after)
01076 {
              // If the current char (after optional white space) is '%', expands the
              // parameter entity reference and pushes its replacement text back onto
              // the input; otherwise only the optional leading white space is consumed.
              // skip_junk_before - skip white space before looking for '%'
              // skip_junk_after  - skip white space found after the text was pushed back
01077         // [69]    PEReference    ::=    '%' Name ';'
01078         if (skip_junk_before && is_white_space(pick()))
01079                 skip_white_space();
01080 
01081         if (pick() == ch_percent)
01082         {
01083                 // resolves value
01084                 _tmp_store2.reset();
01085                 expandPEReference(_tmp_store2);
                      // pushes the collected replacement text back onto the input and
                      // releases the temporary store
01086                 size_t len = 0;
01087                 const ub1_t* ptr = _tmp_store2.persist(len);
01088                 push(ptr, len);
01089                 _tmp_store2.reset();
01090 
01091                 if (skip_junk_after && is_white_space(pick()))
01092                         skip_white_space();
01093         }
01094 }
01095 
01096 const entityDecl*  
01097 dtd_processor::expandPEReference(paged_buffer& buffer)
01098 {
              // Consumes a '%' Name ';' reference from the input and appends the
              // replacement text of the named entity to 'buffer'.
              // The current char must be '%' (asserted).
              // Returns the entity declaration found by _doc.find_entity_decl();
              // calls throw_exception() when no declaration with that name exists.
01099         assert(pick() == ch_percent);
01100         pop(); // skips '%'
01101         
01102         // parses name
              // NOTE(review): the entry is not checked for _is_parameter here - confirm
              // that find_entity_decl() cannot return a general entity for this name.
01103         const entityDecl* entry = _doc.find_entity_decl(parseName());
01104         if (!entry)
01105                 throw_exception("Unresolved parameter entity");
01106 
01107         skip_sign(ch_semicolon, false, false, "Expected semicolon symbol");
01108 
01109         // we have entity value
01110         if (entry->_value.length())
01111                 buffer << entry->_value;
01112         else if (entry->_systemId.length())
01113                 // no literal value: loads the content named by the system id into the buffer
01114                 buffer_loader::load(_stream.get_location(), entry->_systemId, _small_pool, _big_pool, buffer, false);
01115 
              // when both the value and the system id are empty nothing is appended
01116         return entry;
01117 }
01118 
01119 void 
01120 dtd_processor::deterministic_model(const dfa_token* token)
01121 {
              // Currently a no-op: the body is empty and 'token' is not used.
01122 }
01123 
01124 vt_types 
01125 dtd_processor::convert_ctype(const char* x)
01126 {
01127         if (!x)
01128                 throw_exception("Expected valid type");
01129 
01130         // predefined types
01133         // x can be one of the next
01134         if (*x != ch_v || *++x != ch_t || *++x != ch_underscore)
01135                 throw_exception("Unknown ctype");
01136 
01137         switch (*++x)
01138         {
01139                 case ch_u: // vt_unknown, vt_ub1, vt_ub2, vt_ub4, vt_ub8
01140                         if (*++x != ch_b) throw_exception("Unknown ctype");
01141                         switch (*++x)
01142                         {
01143                                 case ch_1:
01144                                         if (*++x) throw_exception("Unknown ctype");
01145                                         return vt_ub1;
01146                                 case ch_2:
01147                                         if (*++x) throw_exception("Unknown ctype");
01148                                         return vt_ub2;
01149                                 case ch_4:
01150                                         if (*++x) throw_exception("Unknown ctype");
01151                                         return vt_ub4;
01152                                 case ch_8:
01153                                         if (*++x) throw_exception("Unknown ctype");
01154                                         return vt_ub8;
01155                                 default:
01156                                         throw_exception("Unknown ctype");
01157                         }
01158                 case ch_s: // vt_sb1, vt_sb2, vt_sb4, vt_sb8, vt_string
01159                         switch (*++x)
01160                         {
01161                                 case ch_b: // vt_sb1, vt_sb2, vt_sb4, vt_sb8
01162                                         switch (*++x)
01163                                         {
01164                                                 case ch_1:
01165                                                         if (*++x) throw_exception("Unknown ctype");
01166                                                         return vt_sb1;
01167                                                 case ch_2:
01168                                                         if (*++x) throw_exception("Unknown ctype");
01169                                                         return vt_sb2;
01170                                                 case ch_4:
01171                                                         if (*++x) throw_exception("Unknown ctype");
01172                                                         return vt_sb4;
01173                                                 case ch_8:
01174                                                         if (*++x) throw_exception("Unknown ctype");
01175                                                         return vt_sb8;
01176                                                 default:
01177                                                         throw_exception("Unknown ctype");
01178                                         }
01179                                 case ch_t:
01180                                         if (*++x != ch_r || *++x != ch_i || *++x != ch_n || *++x != ch_g || *++x) throw_exception("Unknown ctype");
01181                                         return vt_string;
01182                                 default:
01183                                         throw_exception("Unknown ctype");
01184                         }
01185                 case ch_f: // 
01186                         if (*++x != ch_l || *++x != ch_t) throw_exception("Unknown ctype");
01187                         switch (*++x)
01188                         {
01189                                 case ch_3:
01190                                         if (*++x != ch_2 || *++x) throw_exception("Unknown ctype");
01191                                         return vt_float;
01192                                 case ch_6:
01193                                         if (*++x != ch_4 || *++x) throw_exception("Unknown ctype");
01194                                         return vt_double;
01195                                 default:
01196                                         throw_exception("Unknown ctype");
01197                         }
01198                 case ch_b:
01199                         switch (*++x)
01200                         {
01201                                 case ch_o:
01202                                         if (*++x != ch_o || *++x != ch_l || *++x) throw_exception("Unknown ctype");
01203                                         return vt_bool;
01204                                 case ch_i:
01205                                         if (*++x != ch_n || *++x != ch_a || *++x != ch_r || *++x != ch_y || *++x) throw_exception("Unknown ctype");
01206                                         return vt_binary;
01207                                 default:
01208                                         throw_exception("Unknown ctype");
01209                         }
01210                 case ch_g:
01211                         if (*++x != ch_u || *++x != ch_i || *++x != ch_d || *++x) throw_exception("Unknown ctype");
01212                         return vt_guid;
01213                 case ch_d:
01214                         switch (*++x)
01215                         {
01216                                 case ch_a:
01217                                         if (*++x != ch_t || *++x != ch_e || *++x) throw_exception("Unknown ctype");
01218                                         return vt_date;
01219                                 case ch_e:
01220                                         if (*++x != ch_c || *++x != ch_i || *++x != ch_m || *++x != ch_a || *++x != ch_l || *++x) throw_exception("Unknown ctype");
01221                                         return vt_decimal;
01222                                 default:
01223                                         throw_exception("Unknown ctype");
01224                         }
01225                 case ch_w:
01226                         if (*++x != ch_s || *++x != ch_t || *++x != ch_r || *++x != ch_i || *++x != ch_n || *++x != ch_g || *++x) throw_exception("Unknown ctype");
01227                         return vt_wstring;
01228                 case ch_n:
01229                         if (*++x != ch_u || *++x != ch_m || *++x != ch_e || *++x != ch_r || *++x != ch_i || *++x != ch_c || *++x) throw_exception("Unknown ctype");
01230                         return vt_numeric;
01231                 default:
01232                         throw_exception("Unknown ctype");
01233         } // switch
01234 
01235         return vt_unknown;
01236 }
01237 
01238 #pragma pack()
01239 END_TERIMBER_NAMESPACE
dtdxml.cpp