00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef _terimber_defxml_hpp_
00029 #define _terimber_defxml_hpp_
00030
00031 #include "xml/defxml.h"
00032
00033 BEGIN_TERIMBER_NAMESPACE
00034 #pragma pack(4)
00035
00036 xml_forceinline
00037 bool
00038 usascii_to_utf8(ub4_t in, ub1_t* out, size_t& count)
00039 {
00040
00041
00042
00043 switch (in)
00044 {
00045 case ch_hor_tab:
00046 case ch_lf:
00047 case ch_cr:
00048 break;
00049 default:
00050 if (in >= 0x20 && in <= 0xD7FF
00051 || in >= 0xE000 && in <= 0xFFFD
00052 || in >= 0x10000 && in <= 0x10FFFF)
00053 break;
00054 else
00055 return false;
00056 }
00057
00058
00059
00060
00061
00062
00063
00064
00065 if (in < 0x80)
00066 count = 1;
00067 else if (in < 0x800)
00068 count = 2;
00069 else if (in < 0x10000)
00070 count = 3;
00071 else if (in < 0x200000)
00072 count = 4;
00073 else if (in < 0x4000000)
00074 count = 5;
00075 else if (in <= 0x7FFFFFFF)
00076 count = 6;
00077 else
00078 return false;
00079
00080 out[count] = 0;
00081 out += count;
00082
00083 switch(count)
00084 {
00085 case 6 : *--out = (ub1_t)((in | 0x80UL) & 0xBFUL); in >>= 6;
00086 case 5 : *--out = (ub1_t)((in | 0x80UL) & 0xBFUL); in >>= 6;
00087 case 4 : *--out = (ub1_t)((in | 0x80UL) & 0xBFUL); in >>= 6;
00088 case 3 : *--out = (ub1_t)((in | 0x80UL) & 0xBFUL); in >>= 6;
00089 case 2 : *--out = (ub1_t)((in | 0x80UL) & 0xBFUL); in >>= 6;
00090 case 1 : *--out = (ub1_t)(in | s_leadingByte[count - 1]);
00091 }
00092
00093 return count != 1 || out[0] != 0;
00094 }
00095
00096 xml_forceinline
00097 bool
00098 fixedN_to_utf8(encodingSchema schema, const ub1_t* in, size_t count, ub1_t* out, size_t& converted, size_t& processed, size_t& more)
00099 {
00100 processed = 0;
00101 converted = 0;
00102 more = 0;
00103 size_t byte_count = 0;
00104
00105 ub4_t value = 0, surrogate = 0;
00106
00107 while (count)
00108 {
00109 switch (schema)
00110 {
00111 case UTF_16B:
00112
00113 value = (ub4_t(*in) << 8) | ub4_t(*(in + 1));
00114 surrogate = 0;
00115 if (value >= 0xD800 && value <= 0xDBFF)
00116 {
00117 if (count < 4)
00118 {
00119 more = 4 - count;
00120 return false;
00121 }
00122
00123 count -= 2;
00124 in += 2;
00125 processed += 2;
00126 surrogate = value;
00127 value = (ub4_t(*in) << 8) | ub4_t(*(in + 1));
00128 value = ub4_t((surrogate - 0xD800) << 10) + ub4_t((value - 0xDC00) + 0x10000);
00129 }
00130
00131 count -= 2;
00132 in += 2;
00133 processed += 2;
00134 break;
00135 case UTF_16L:
00136
00137 value = (ub4_t(*(in + 1)) << 8) | ub4_t(*in);
00138 surrogate = 0;
00139 if (value >= 0xD800 && value <= 0xDBFF)
00140 {
00141 if (count < 4)
00142 {
00143 more = 4 - count;
00144 return false;
00145 }
00146
00147 count -= 2;
00148 in += 2;
00149 processed += 2;
00150 surrogate = value;
00151 value = (ub4_t(*(in + 1)) << 8) | ub4_t(*in);
00152 value = ub4_t((surrogate - 0xD800) << 10) + ub4_t((value - 0xDC00) + 0x10000);
00153 }
00154
00155 count -= 2;
00156 in += 2;
00157 processed += 2;
00158 break;
00159 case UCS_4B:
00160 value = (ub4_t(*in) << 24) | (ub4_t(*(in + 1)) << 16) | (ub4_t(*(in + 2)) << 8) | ub4_t(*(in + 3));
00161 count -= 4;
00162 in += 4;
00163 processed += 4;
00164 break;
00165 case UCS_4BS:
00166 value = (ub4_t(*(in + 2)) << 24) | (ub4_t(*(in + 3)) << 16) | (ub4_t(*in) << 8) | ub4_t(*(in + 1));
00167 count -= 4;
00168 in += 4;
00169 processed += 4;
00170 break;
00171 case UCS_4L:
00172 value = (ub4_t(*(in + 3)) << 24) | (ub4_t(*(in + 2)) << 16) | (ub4_t(*(in + 1)) << 8) | ub4_t(*in);
00173 count -= 4;
00174 in += 4;
00175 processed += 4;
00176 break;
00177 case UCS_4LS:
00178 value = (ub4_t(*(in + 1)) << 24) | (ub4_t(*in) << 16) | (ub4_t(*(in + 3)) << 8) | ub4_t(*(in + 2));
00179 count -= 4;
00180 in += 4;
00181 processed += 4;
00182 break;
00183 default:
00184 assert(false);
00185 }
00186
00187
00188
00189 switch (value)
00190 {
00191 case ch_hor_tab:
00192 case ch_lf:
00193 case ch_cr:
00194 break;
00195 default:
00196 if (value >= 0x20 && value <= 0xD7FF
00197 || value >= 0xE000 && value <= 0xFFFD
00198 || value >= 0x10000 && value <= 0x10FFFF)
00199 break;
00200 else
00201 return false;
00202 }
00203
00204
00205
00206 if (value < 0x80)
00207 byte_count = 1;
00208 else if (value < 0x800)
00209 byte_count = 2;
00210 else if (value < 0x10000)
00211 byte_count = 3;
00212 else if (value < 0x200000)
00213 byte_count = 4;
00214 else if (value < 0x4000000)
00215 byte_count = 5;
00216 else if (value <= 0x7FFFFFFF)
00217 byte_count = 6;
00218 else
00219 return false;
00220
00221 out += byte_count;
00222 converted += byte_count;
00223
00224 switch(byte_count)
00225 {
00226 case 6 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00227 case 5 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00228 case 4 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00229 case 3 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00230 case 2 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00231 case 1 : *--out = (ub1_t)(value | s_leadingByte[byte_count - 1]);
00232 }
00233
00234 out += byte_count;
00235 }
00236
00237 return true;
00238 }
00239
00240 xml_forceinline
00241 bool
00242 utf8_to_utf8(const ub1_t* in, size_t count, size_t& processed, size_t& more)
00243 {
00244 processed = 0;
00245 more = 0;
00246 size_t byte_count = 0;
00247 size_t max_byte = 0;
00248 ub4_t value = 0;
00249
00250 while (count)
00251 {
00252 switch (byte_count)
00253 {
00254 case 0:
00255
00256 max_byte = 0;
00257 if (*in <= 0x7F)
00258 {
00259
00260 value = *in;
00261
00262 break;
00263 }
00264
00265 if (*in < 0xC0 || *in > 0xFD)
00266 return false;
00267
00268 if (*in < 0xE0)
00269 max_byte = byte_count = 1;
00270 else if (*in < 0xF0)
00271 max_byte = byte_count = 2;
00272 else if (*in < 0xF8)
00273 max_byte = byte_count = 3;
00274 else if (*in < 0xFC)
00275 max_byte = byte_count = 4;
00276 else
00277 max_byte = byte_count = 5;
00278
00279
00280 value = *in & ((2 << (5 - byte_count)) - 1);
00281 break;
00282 default:
00283 if (*in > 0xBF || *in < 0x80)
00284 return false;
00285
00286
00287 value <<= 6;
00288 value |= *in & 0x3F;
00289
00290
00291 --byte_count;
00292 }
00293
00294
00295 if (!byte_count)
00296 {
00297 switch (value)
00298 {
00299 case ch_hor_tab:
00300 case ch_lf:
00301 case ch_cr:
00302 break;
00303 default:
00304 if (value >= 0x20 && value <= 0xD7FF
00305 || value >= 0xE000 && value <= 0xFFFD
00306 || value >= 0x10000 && value <= 0x10FFFF)
00307 break;
00308 else
00309 return false;
00310 }
00311 }
00312
00313
00314 ++in;
00315 --count;
00316 ++processed;
00317 }
00318
00319 if (byte_count)
00320 {
00321 processed += byte_count;
00322 processed -= max_byte + 1;
00323 more = byte_count;
00324 }
00325
00326 return true;
00327 }
00328
00329 xml_forceinline
00330 void windowsN_to_utf8(encodingSchema schema, const ub1_t* in, size_t count, ub1_t* out, size_t& converted, size_t& processed)
00331 {
00332 processed = 0;
00333 converted = 0;
00334 size_t byte_count = 0;
00335
00336 ub2_t value = 0;
00337
00338 while (count)
00339 {
00340 switch (schema)
00341 {
00342 case WINDOWS_1251:
00343 value = encoding_table_winodws_1251[*in];
00344 count -= 1;
00345 in += 1;
00346 processed += 1;
00347 break;
00348 default:
00349 assert(false);
00350 }
00351
00352
00353 if (value < 0x80)
00354 byte_count = 1;
00355 else if (value < 0x800)
00356 byte_count = 2;
00357 else
00358 byte_count = 3;
00359
00360 out += byte_count;
00361 converted += byte_count;
00362
00363 switch(byte_count)
00364 {
00365 case 3 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00366 case 2 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00367 case 1 : *--out = (ub1_t)(value | s_leadingByte[byte_count - 1]);
00368 }
00369
00370 out += byte_count;
00371 }
00372 }
00373
00374 xml_forceinline
00375 void isoN_to_utf8(encodingSchema schema, const ub1_t* in, size_t count, ub1_t* out, size_t& converted, size_t& processed)
00376 {
00377 processed = 0;
00378 converted = 0;
00379 size_t byte_count = 0;
00380
00381 ub1_t value = 0;
00382
00383 while (count)
00384 {
00385 switch (schema)
00386 {
00387 case UTF_ISO88591:
00388 value = *in;
00389 count -= 1;
00390 in += 1;
00391 processed += 1;
00392 break;
00393 default:
00394 assert(false);
00395 }
00396
00397
00398 if (value < 0x80)
00399 byte_count = 1;
00400 else
00401 byte_count = 2;
00402
00403 out += byte_count;
00404 converted += byte_count;
00405
00406 switch(byte_count)
00407 {
00408 case 2 : *--out = (ub1_t)((value | 0x80UL) & 0xBFUL); value >>= 6;
00409 case 1 : *--out = (ub1_t)(value | s_leadingByte[byte_count - 1]);
00410 }
00411
00412 out += byte_count;
00413 }
00414 }
00415
00416 #pragma pack()
00417 END_TERIMBER_NAMESPACE
00418
00419 #endif // _terimber_defxml_hpp_