Geant4 Cross Reference |
1 /* 1 /* 2 __ __ 2 __ __ _ 3 ___\ \/ /_ __ __ _| 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_| 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parse 7 |_| XML parser 8 8 9 Copyright (c) 1997-2000 Thai Open Source So 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000 Clark Cooper <coope 10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net> 11 Copyright (c) 2001-2003 Fred L. Drake, Jr. 11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net> 12 Copyright (c) 2002 Greg Stein <gstein@ 12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net> 13 Copyright (c) 2002-2016 Karl Waclawek <karl 13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net> 14 Copyright (c) 2005-2009 Steven Solie <steve 14 Copyright (c) 2005-2009 Steven Solie <steven@solie.ca> 15 Copyright (c) 2016-2022 Sebastian Pipping < 15 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org> 16 Copyright (c) 2016 Pascal Cuoq <cuoq@t 16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com> 17 Copyright (c) 2016 Don Lewis <truckman 17 Copyright (c) 2016 Don Lewis <truckman@apache.org> 18 Copyright (c) 2017 Rhodri James <rhodr 18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk> 19 Copyright (c) 2017 Alexander Bluhm <al 19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net> 20 Copyright (c) 2017 Benbuck Nason <bnas 20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com> 21 Copyright (c) 2017 José Gutiérrez de 21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com> 22 Copyright (c) 2019 David Loffredo <lof 22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com> 23 Copyright (c) 2021 Dong-hee Na <donghe 23 Copyright (c) 2021 Dong-hee Na <donghee.na@python.org> 24 Copyright (c) 2022 Martin Ettl <ettl.m 24 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com> 25 Licensed under the MIT license: 25 Licensed under the MIT license: 26 26 27 Permission is hereby granted, free of cha 27 Permission is hereby granted, free of charge, to any person obtaining 28 a copy of this software and associat 28 a copy of this software and associated documentation files (the 29 "Software"), to deal in the Software w 29 "Software"), to deal in the Software without restriction, including 30 without limitation the rights to use, c 30 without limitation the rights to use, copy, modify, merge, publish, 31 distribute, sublicense, and/or sell copies 31 distribute, sublicense, and/or sell copies of the Software, and to permit 32 persons to whom the Software is furnish 32 persons to whom the Software is furnished to do so, subject to the 33 following conditions: 33 following conditions: 34 34 35 The above copyright notice and this permis 35 The above copyright notice and this permission notice shall be included 36 in all copies or substantial portions of th 36 in all copies or substantial portions of the Software. 37 37 38 THE SOFTWARE IS PROVIDED "AS IS", WIT 38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 39 EXPRESS OR IMPLIED, INCLUDING BUT NOT L 39 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 40 MERCHANTABILITY, FITNESS FOR A PARTICULAR P 40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 41 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HO 41 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 42 DAMAGES OR OTHER LIABILITY, WHETHER IN AN 42 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 43 OTHERWISE, ARISING FROM, OUT OF OR IN CONNE 43 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 44 USE OR OTHER DEALINGS IN THE SOFTWARE. 44 USE OR OTHER DEALINGS IN THE SOFTWARE. 45 */ 45 */ 46 46 47 #include <expat_config.h> 47 #include <expat_config.h> 48 48 49 #include <stddef.h> 49 #include <stddef.h> 50 #include <string.h> /* memcpy */ 50 #include <string.h> /* memcpy */ 51 #include <stdbool.h> 51 #include <stdbool.h> 52 52 53 #ifdef _WIN32 53 #ifdef _WIN32 54 # include "winconfig.h" 54 # include "winconfig.h" 55 #endif 55 #endif 56 56 57 #include "expat_external.h" 57 #include "expat_external.h" 58 #include "internal.h" 58 #include "internal.h" 59 #include "xmltok.h" 59 #include "xmltok.h" 60 #include "nametab.h" 60 #include "nametab.h" 61 61 62 #ifdef XML_DTD 62 #ifdef XML_DTD 63 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(i 63 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 64 #else 64 #else 65 # define IGNORE_SECTION_TOK_VTABLE /* as noth 65 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 66 #endif 66 #endif 67 67 68 #define VTABLE1 68 #define VTABLE1 \ 69 {PREFIX(prologTok), PREFIX(contentTok), 69 {PREFIX(prologTok), PREFIX(contentTok), \ 70 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_ 70 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \ 71 {PREFIX(attributeValueTok), PREFIX(entit 71 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \ 72 PREFIX(nameMatchesAscii), PREFIX(nameLen 72 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \ 73 PREFIX(getAtts), PREFIX(charRefNumber), 73 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \ 74 PREFIX(updatePosition), PREFIX(isPublicI 74 PREFIX(updatePosition), PREFIX(isPublicId) 75 75 76 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX 76 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 77 77 78 #define UCS2_GET_NAMING(pages, hi, lo) 78 #define UCS2_GET_NAMING(pages, hi, lo) \ 79 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5) 79 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F))) 80 80 81 /* A 2 byte UTF-8 representation splits the ch 81 /* A 2 byte UTF-8 representation splits the characters 11 bits between 82 the bottom 5 and 6 bits of the bytes. We n 82 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 83 pages, 3 bits to add to that index and 5 bi 83 pages, 3 bits to add to that index and 5 bits to generate the mask. 84 */ 84 */ 85 #define UTF8_GET_NAMING2(pages, byte) 85 #define UTF8_GET_NAMING2(pages, byte) \ 86 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 86 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 87 + ((((byte)[0]) & 3) << 1) + ( 87 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \ 88 & (1u << (((byte)[1]) & 0x1F))) 88 & (1u << (((byte)[1]) & 0x1F))) 89 89 90 /* A 3 byte UTF-8 representation splits the ch 90 /* A 3 byte UTF-8 representation splits the characters 16 bits between 91 the bottom 4, 6 and 6 bits of the bytes. W 91 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 92 into pages, 3 bits to add to that index and 92 into pages, 3 bits to add to that index and 5 bits to generate the 93 mask. 93 mask. 94 */ 94 */ 95 #define UTF8_GET_NAMING3(pages, byte) 95 #define UTF8_GET_NAMING3(pages, byte) \ 96 (namingBitmap 96 (namingBitmap \ 97 [((pages)[((((byte)[0]) & 0xF) << 4) + 97 [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \ 98 << 3) 98 << 3) \ 99 + ((((byte)[1]) & 3) << 1) + ((((byte) 99 + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \ 100 & (1u << (((byte)[2]) & 0x1F))) 100 & (1u << (((byte)[2]) & 0x1F))) 101 101 102 /* Detection of invalid UTF-8 sequences is bas 102 /* Detection of invalid UTF-8 sequences is based on Table 3.1B 103 of Unicode 3.2: http://www.unicode.org/unic 103 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 104 with the additional restriction of not allo 104 with the additional restriction of not allowing the Unicode 105 code points 0xFFFF and 0xFFFE (sequences EF 105 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 106 Implementation details: 106 Implementation details: 107 (A & 0x80) == 0 means A < 0x80 107 (A & 0x80) == 0 means A < 0x80 108 and 108 and 109 (A & 0xC0) == 0xC0 means A > 0xBF 109 (A & 0xC0) == 0xC0 means A > 0xBF 110 */ 110 */ 111 111 112 #define UTF8_INVALID2(p) 112 #define UTF8_INVALID2(p) \ 113 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p) 113 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 114 114 115 #define UTF8_INVALID3(p) 115 #define UTF8_INVALID3(p) \ 116 (((p)[2] & 0x80) == 0 116 (((p)[2] & 0x80) == 0 \ 117 || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] 117 || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \ 118 : ((p)[2 118 : ((p)[2] & 0xC0) == 0xC0) \ 119 || ((*p) == 0xE0 119 || ((*p) == 0xE0 \ 120 ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) 120 ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 121 : ((p)[1] & 0x80) == 0 121 : ((p)[1] & 0x80) == 0 \ 122 || ((*p) == 0xED ? (p)[1] > 0 122 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 123 123 124 #define UTF8_INVALID4(p) 124 #define UTF8_INVALID4(p) \ 125 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 125 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \ 126 || ((p)[2] & 0xC0) == 0xC0 126 || ((p)[2] & 0xC0) == 0xC0 \ 127 || ((*p) == 0xF0 127 || ((*p) == 0xF0 \ 128 ? (p)[1] < 0x90 || ((p)[1] & 0xC0) 128 ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 129 : ((p)[1] & 0x80) == 0 129 : ((p)[1] & 0x80) == 0 \ 130 || ((*p) == 0xF4 ? (p)[1] > 0 130 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 131 131 132 static int PTRFASTCALL 132 static int PTRFASTCALL 133 isNever(const ENCODING *enc, const char *p) { 133 isNever(const ENCODING *enc, const char *p) { 134 UNUSED_P(enc); 134 UNUSED_P(enc); 135 UNUSED_P(p); 135 UNUSED_P(p); 136 return 0; 136 return 0; 137 } 137 } 138 138 139 static int PTRFASTCALL 139 static int PTRFASTCALL 140 utf8_isName2(const ENCODING *enc, const char * 140 utf8_isName2(const ENCODING *enc, const char *p) { 141 UNUSED_P(enc); 141 UNUSED_P(enc); 142 return UTF8_GET_NAMING2(namePages, (const un 142 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 143 } 143 } 144 144 145 static int PTRFASTCALL 145 static int PTRFASTCALL 146 utf8_isName3(const ENCODING *enc, const char * 146 utf8_isName3(const ENCODING *enc, const char *p) { 147 UNUSED_P(enc); 147 UNUSED_P(enc); 148 return UTF8_GET_NAMING3(namePages, (const un 148 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 149 } 149 } 150 150 151 #define utf8_isName4 isNever 151 #define utf8_isName4 isNever 152 152 153 static int PTRFASTCALL 153 static int PTRFASTCALL 154 utf8_isNmstrt2(const ENCODING *enc, const char 154 utf8_isNmstrt2(const ENCODING *enc, const char *p) { 155 UNUSED_P(enc); 155 UNUSED_P(enc); 156 return UTF8_GET_NAMING2(nmstrtPages, (const 156 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 157 } 157 } 158 158 159 static int PTRFASTCALL 159 static int PTRFASTCALL 160 utf8_isNmstrt3(const ENCODING *enc, const char 160 utf8_isNmstrt3(const ENCODING *enc, const char *p) { 161 UNUSED_P(enc); 161 UNUSED_P(enc); 162 return UTF8_GET_NAMING3(nmstrtPages, (const 162 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 163 } 163 } 164 164 165 #define utf8_isNmstrt4 isNever 165 #define utf8_isNmstrt4 isNever 166 166 167 static int PTRFASTCALL 167 static int PTRFASTCALL 168 utf8_isInvalid2(const ENCODING *enc, const cha 168 utf8_isInvalid2(const ENCODING *enc, const char *p) { 169 UNUSED_P(enc); 169 UNUSED_P(enc); 170 return UTF8_INVALID2((const unsigned char *) 170 return UTF8_INVALID2((const unsigned char *)p); 171 } 171 } 172 172 173 static int PTRFASTCALL 173 static int PTRFASTCALL 174 utf8_isInvalid3(const ENCODING *enc, const cha 174 utf8_isInvalid3(const ENCODING *enc, const char *p) { 175 UNUSED_P(enc); 175 UNUSED_P(enc); 176 return UTF8_INVALID3((const unsigned char *) 176 return UTF8_INVALID3((const unsigned char *)p); 177 } 177 } 178 178 179 static int PTRFASTCALL 179 static int PTRFASTCALL 180 utf8_isInvalid4(const ENCODING *enc, const cha 180 utf8_isInvalid4(const ENCODING *enc, const char *p) { 181 UNUSED_P(enc); 181 UNUSED_P(enc); 182 return UTF8_INVALID4((const unsigned char *) 182 return UTF8_INVALID4((const unsigned char *)p); 183 } 183 } 184 184 185 struct normal_encoding { 185 struct normal_encoding { 186 ENCODING enc; 186 ENCODING enc; 187 unsigned char type[256]; 187 unsigned char type[256]; 188 #ifdef XML_MIN_SIZE 188 #ifdef XML_MIN_SIZE 189 int(PTRFASTCALL *byteType)(const ENCODING *, 189 int(PTRFASTCALL *byteType)(const ENCODING *, const char *); 190 int(PTRFASTCALL *isNameMin)(const ENCODING * 190 int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 191 int(PTRFASTCALL *isNmstrtMin)(const ENCODING 191 int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 192 int(PTRFASTCALL *byteToAscii)(const ENCODING 192 int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 193 int(PTRCALL *charMatches)(const ENCODING *, 193 int(PTRCALL *charMatches)(const ENCODING *, const char *, int); 194 #endif /* XML_MIN_SIZE */ 194 #endif /* XML_MIN_SIZE */ 195 int(PTRFASTCALL *isName2)(const ENCODING *, 195 int(PTRFASTCALL *isName2)(const ENCODING *, const char *); 196 int(PTRFASTCALL *isName3)(const ENCODING *, 196 int(PTRFASTCALL *isName3)(const ENCODING *, const char *); 197 int(PTRFASTCALL *isName4)(const ENCODING *, 197 int(PTRFASTCALL *isName4)(const ENCODING *, const char *); 198 int(PTRFASTCALL *isNmstrt2)(const ENCODING * 198 int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 199 int(PTRFASTCALL *isNmstrt3)(const ENCODING * 199 int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 200 int(PTRFASTCALL *isNmstrt4)(const ENCODING * 200 int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 201 int(PTRFASTCALL *isInvalid2)(const ENCODING 201 int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 202 int(PTRFASTCALL *isInvalid3)(const ENCODING 202 int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 203 int(PTRFASTCALL *isInvalid4)(const ENCODING 203 int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 204 }; 204 }; 205 205 206 #define AS_NORMAL_ENCODING(enc) ((const struct 206 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc)) 207 207 208 #ifdef XML_MIN_SIZE 208 #ifdef XML_MIN_SIZE 209 209 210 # define STANDARD_VTABLE(E) 210 # define STANDARD_VTABLE(E) \ 211 E##byteType, E##isNameMin, E##isNmstrtMin, 211 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches, 212 212 213 #else 213 #else 214 214 215 # define STANDARD_VTABLE(E) /* as nothing */ 215 # define STANDARD_VTABLE(E) /* as nothing */ 216 216 217 #endif 217 #endif 218 218 219 #define NORMAL_VTABLE(E) 219 #define NORMAL_VTABLE(E) \ 220 E##isName2, E##isName3, E##isName4, E##isNms 220 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \ 221 E##isNmstrt4, E##isInvalid2, E##isInvali 221 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4 222 222 223 #define NULL_VTABLE 223 #define NULL_VTABLE \ 224 /* isName2 */ NULL, /* isName3 */ NULL, /* i 224 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \ 225 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NU 225 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \ 226 /* isInvalid2 */ NULL, /* isInvalid3 */ 226 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL 227 227 228 static int FASTCALL checkCharRefNumber(int); 228 static int FASTCALL checkCharRefNumber(int); 229 229 230 #include "xmltok_impl.h" 230 #include "xmltok_impl.h" 231 #include "ascii.h" 231 #include "ascii.h" 232 232 233 #ifdef XML_MIN_SIZE 233 #ifdef XML_MIN_SIZE 234 # define sb_isNameMin isNever 234 # define sb_isNameMin isNever 235 # define sb_isNmstrtMin isNever 235 # define sb_isNmstrtMin isNever 236 #endif 236 #endif 237 237 238 #ifdef XML_MIN_SIZE 238 #ifdef XML_MIN_SIZE 239 # define MINBPC(enc) ((enc)->minBytesPerChar) 239 # define MINBPC(enc) ((enc)->minBytesPerChar) 240 #else 240 #else 241 /* minimum bytes per character */ 241 /* minimum bytes per character */ 242 # define MINBPC(enc) 1 242 # define MINBPC(enc) 1 243 #endif 243 #endif 244 244 245 #define SB_BYTE_TYPE(enc, p) 245 #define SB_BYTE_TYPE(enc, p) \ 246 (((struct normal_encoding *)(enc))->type[(un 246 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 247 247 248 #ifdef XML_MIN_SIZE 248 #ifdef XML_MIN_SIZE 249 static int PTRFASTCALL 249 static int PTRFASTCALL 250 sb_byteType(const ENCODING *enc, const char *p 250 sb_byteType(const ENCODING *enc, const char *p) { 251 return SB_BYTE_TYPE(enc, p); 251 return SB_BYTE_TYPE(enc, p); 252 } 252 } 253 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODIN 253 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 254 #else 254 #else 255 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, 255 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 256 #endif 256 #endif 257 257 258 #ifdef XML_MIN_SIZE 258 #ifdef XML_MIN_SIZE 259 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENC 259 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 260 static int PTRFASTCALL 260 static int PTRFASTCALL 261 sb_byteToAscii(const ENCODING *enc, const char 261 sb_byteToAscii(const ENCODING *enc, const char *p) { 262 UNUSED_P(enc); 262 UNUSED_P(enc); 263 return *p; 263 return *p; 264 } 264 } 265 #else 265 #else 266 # define BYTE_TO_ASCII(enc, p) (*(p)) 266 # define BYTE_TO_ASCII(enc, p) (*(p)) 267 #endif 267 #endif 268 268 269 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENC 269 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p)) 270 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_E 270 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p)) 271 #ifdef XML_MIN_SIZE 271 #ifdef XML_MIN_SIZE 272 # define IS_INVALID_CHAR(enc, p, n) 272 # define IS_INVALID_CHAR(enc, p, n) \ 273 (AS_NORMAL_ENCODING(enc)->isInvalid##n 273 (AS_NORMAL_ENCODING(enc)->isInvalid##n \ 274 && AS_NORMAL_ENCODING(enc)->isInvalid##n( 274 && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p)) 275 #else 275 #else 276 # define IS_INVALID_CHAR(enc, p, n) 276 # define IS_INVALID_CHAR(enc, p, n) \ 277 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc 277 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p)) 278 #endif 278 #endif 279 279 280 #ifdef XML_MIN_SIZE 280 #ifdef XML_MIN_SIZE 281 # define IS_NAME_CHAR_MINBPC(enc, p) 281 # define IS_NAME_CHAR_MINBPC(enc, p) \ 282 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p 282 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 283 # define IS_NMSTRT_CHAR_MINBPC(enc, p) 283 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 284 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, 284 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 285 #else 285 #else 286 # define IS_NAME_CHAR_MINBPC(enc, p) (0) 286 # define IS_NAME_CHAR_MINBPC(enc, p) (0) 287 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 287 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 288 #endif 288 #endif 289 289 290 #ifdef XML_MIN_SIZE 290 #ifdef XML_MIN_SIZE 291 # define CHAR_MATCHES(enc, p, c) 291 # define CHAR_MATCHES(enc, p, c) \ 292 (AS_NORMAL_ENCODING(enc)->charMatches(enc, 292 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 293 static int PTRCALL 293 static int PTRCALL 294 sb_charMatches(const ENCODING *enc, const char 294 sb_charMatches(const ENCODING *enc, const char *p, int c) { 295 UNUSED_P(enc); 295 UNUSED_P(enc); 296 return *p == c; 296 return *p == c; 297 } 297 } 298 #else 298 #else 299 /* c is an ASCII character */ 299 /* c is an ASCII character */ 300 # define CHAR_MATCHES(enc, p, c) (*(p) == (c) 300 # define CHAR_MATCHES(enc, p, c) (*(p) == (c)) 301 #endif 301 #endif 302 302 303 #define PREFIX(ident) normal_##ident 303 #define PREFIX(ident) normal_##ident 304 #define XML_TOK_IMPL_C 304 #define XML_TOK_IMPL_C 305 #include "xmltok_impl.c" 305 #include "xmltok_impl.c" 306 #undef XML_TOK_IMPL_C 306 #undef XML_TOK_IMPL_C 307 307 308 #undef MINBPC 308 #undef MINBPC 309 #undef BYTE_TYPE 309 #undef BYTE_TYPE 310 #undef BYTE_TO_ASCII 310 #undef BYTE_TO_ASCII 311 #undef CHAR_MATCHES 311 #undef CHAR_MATCHES 312 #undef IS_NAME_CHAR 312 #undef IS_NAME_CHAR 313 #undef IS_NAME_CHAR_MINBPC 313 #undef IS_NAME_CHAR_MINBPC 314 #undef IS_NMSTRT_CHAR 314 #undef IS_NMSTRT_CHAR 315 #undef IS_NMSTRT_CHAR_MINBPC 315 #undef IS_NMSTRT_CHAR_MINBPC 316 #undef IS_INVALID_CHAR 316 #undef IS_INVALID_CHAR 317 317 318 enum { /* UTF8_cvalN is value of masked first 318 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 319 UTF8_cval1 = 0x00, 319 UTF8_cval1 = 0x00, 320 UTF8_cval2 = 0xc0, 320 UTF8_cval2 = 0xc0, 321 UTF8_cval3 = 0xe0, 321 UTF8_cval3 = 0xe0, 322 UTF8_cval4 = 0xf0 322 UTF8_cval4 = 0xf0 323 }; 323 }; 324 324 325 void 325 void 326 _INTERNAL_trim_to_complete_utf8_characters(con 326 _INTERNAL_trim_to_complete_utf8_characters(const char *from, 327 con 327 const char **fromLimRef) { 328 const char *fromLim = *fromLimRef; 328 const char *fromLim = *fromLimRef; 329 size_t walked = 0; 329 size_t walked = 0; 330 for (; fromLim > from; fromLim--, walked++) 330 for (; fromLim > from; fromLim--, walked++) { 331 const unsigned char prev = (unsigned char) 331 const unsigned char prev = (unsigned char)fromLim[-1]; 332 if ((prev & 0xf8u) 332 if ((prev & 0xf8u) 333 == 0xf0u) { /* 4-byte character, lead 333 == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */ 334 if (walked + 1 >= 4) { 334 if (walked + 1 >= 4) { 335 fromLim += 4 - 1; 335 fromLim += 4 - 1; 336 break; 336 break; 337 } else { 337 } else { 338 walked = 0; 338 walked = 0; 339 } 339 } 340 } else if ((prev & 0xf0u) 340 } else if ((prev & 0xf0u) 341 == 0xe0u) { /* 3-byte character 341 == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */ 342 if (walked + 1 >= 3) { 342 if (walked + 1 >= 3) { 343 fromLim += 3 - 1; 343 fromLim += 3 - 1; 344 break; 344 break; 345 } else { 345 } else { 346 walked = 0; 346 walked = 0; 347 } 347 } 348 } else if ((prev & 0xe0u) 348 } else if ((prev & 0xe0u) 349 == 0xc0u) { /* 2-byte character 349 == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */ 350 if (walked + 1 >= 2) { 350 if (walked + 1 >= 2) { 351 fromLim += 2 - 1; 351 fromLim += 2 - 1; 352 break; 352 break; 353 } else { 353 } else { 354 walked = 0; 354 walked = 0; 355 } 355 } 356 } else if ((prev & 0x80u) 356 } else if ((prev & 0x80u) 357 == 0x00u) { /* 1-byte character 357 == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */ 358 break; 358 break; 359 } 359 } 360 } 360 } 361 *fromLimRef = fromLim; 361 *fromLimRef = fromLim; 362 } 362 } 363 363 364 static enum XML_Convert_Result PTRCALL 364 static enum XML_Convert_Result PTRCALL 365 utf8_toUtf8(const ENCODING *enc, const char ** 365 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 366 char **toP, const char *toLim) { 366 char **toP, const char *toLim) { 367 bool input_incomplete = false; 367 bool input_incomplete = false; 368 bool output_exhausted = false; 368 bool output_exhausted = false; 369 369 370 /* Avoid copying partial characters (due to 370 /* Avoid copying partial characters (due to limited space). */ 371 const ptrdiff_t bytesAvailable = fromLim - * 371 const ptrdiff_t bytesAvailable = fromLim - *fromP; 372 const ptrdiff_t bytesStorable = toLim - *toP 372 const ptrdiff_t bytesStorable = toLim - *toP; 373 UNUSED_P(enc); 373 UNUSED_P(enc); 374 if (bytesAvailable > bytesStorable) { 374 if (bytesAvailable > bytesStorable) { 375 fromLim = *fromP + bytesStorable; 375 fromLim = *fromP + bytesStorable; 376 output_exhausted = true; 376 output_exhausted = true; 377 } 377 } 378 378 379 /* Avoid copying partial characters (from in 379 /* Avoid copying partial characters (from incomplete input). */ 380 { 380 { 381 const char *const fromLimBefore = fromLim; 381 const char *const fromLimBefore = fromLim; 382 _INTERNAL_trim_to_complete_utf8_characters 382 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim); 383 if (fromLim < fromLimBefore) { 383 if (fromLim < fromLimBefore) { 384 input_incomplete = true; 384 input_incomplete = true; 385 } 385 } 386 } 386 } 387 387 388 { 388 { 389 const ptrdiff_t bytesToCopy = fromLim - *f 389 const ptrdiff_t bytesToCopy = fromLim - *fromP; 390 memcpy(*toP, *fromP, bytesToCopy); 390 memcpy(*toP, *fromP, bytesToCopy); 391 *fromP += bytesToCopy; 391 *fromP += bytesToCopy; 392 *toP += bytesToCopy; 392 *toP += bytesToCopy; 393 } 393 } 394 394 395 if (output_exhausted) /* needs to go first * 395 if (output_exhausted) /* needs to go first */ 396 return XML_CONVERT_OUTPUT_EXHAUSTED; 396 return XML_CONVERT_OUTPUT_EXHAUSTED; 397 else if (input_incomplete) 397 else if (input_incomplete) 398 return XML_CONVERT_INPUT_INCOMPLETE; 398 return XML_CONVERT_INPUT_INCOMPLETE; 399 else 399 else 400 return XML_CONVERT_COMPLETED; 400 return XML_CONVERT_COMPLETED; 401 } 401 } 402 402 403 static enum XML_Convert_Result PTRCALL 403 static enum XML_Convert_Result PTRCALL 404 utf8_toUtf16(const ENCODING *enc, const char * 404 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 405 unsigned short **toP, const unsig 405 unsigned short **toP, const unsigned short *toLim) { 406 enum XML_Convert_Result res = XML_CONVERT_CO 406 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 407 unsigned short *to = *toP; 407 unsigned short *to = *toP; 408 const char *from = *fromP; 408 const char *from = *fromP; 409 while (from < fromLim && to < toLim) { 409 while (from < fromLim && to < toLim) { 410 switch (((struct normal_encoding *)enc)->t 410 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 411 case BT_LEAD2: 411 case BT_LEAD2: 412 if (fromLim - from < 2) { 412 if (fromLim - from < 2) { 413 res = XML_CONVERT_INPUT_INCOMPLETE; 413 res = XML_CONVERT_INPUT_INCOMPLETE; 414 goto after; 414 goto after; 415 } 415 } 416 *to++ = (unsigned short)(((from[0] & 0x1 416 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 417 from += 2; 417 from += 2; 418 break; 418 break; 419 case BT_LEAD3: 419 case BT_LEAD3: 420 if (fromLim - from < 3) { 420 if (fromLim - from < 3) { 421 res = XML_CONVERT_INPUT_INCOMPLETE; 421 res = XML_CONVERT_INPUT_INCOMPLETE; 422 goto after; 422 goto after; 423 } 423 } 424 *to++ = (unsigned short)(((from[0] & 0xf 424 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) 425 | (from[2] & 0x 425 | (from[2] & 0x3f)); 426 from += 3; 426 from += 3; 427 break; 427 break; 428 case BT_LEAD4: { 428 case BT_LEAD4: { 429 unsigned long n; 429 unsigned long n; 430 if (toLim - to < 2) { 430 if (toLim - to < 2) { 431 res = XML_CONVERT_OUTPUT_EXHAUSTED; 431 res = XML_CONVERT_OUTPUT_EXHAUSTED; 432 goto after; 432 goto after; 433 } 433 } 434 if (fromLim - from < 4) { 434 if (fromLim - from < 4) { 435 res = XML_CONVERT_INPUT_INCOMPLETE; 435 res = XML_CONVERT_INPUT_INCOMPLETE; 436 goto after; 436 goto after; 437 } 437 } 438 n = ((from[0] & 0x7) << 18) | ((from[1] 438 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 439 | ((from[2] & 0x3f) << 6) | (from[3] 439 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 440 n -= 0x10000; 440 n -= 0x10000; 441 to[0] = (unsigned short)((n >> 10) | 0xD 441 to[0] = (unsigned short)((n >> 10) | 0xD800); 442 to[1] = (unsigned short)((n & 0x3FF) | 0 442 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 443 to += 2; 443 to += 2; 444 from += 4; 444 from += 4; 445 } break; 445 } break; 446 default: 446 default: 447 *to++ = *from++; 447 *to++ = *from++; 448 break; 448 break; 449 } 449 } 450 } 450 } 451 if (from < fromLim) 451 if (from < fromLim) 452 res = XML_CONVERT_OUTPUT_EXHAUSTED; 452 res = XML_CONVERT_OUTPUT_EXHAUSTED; 453 after: 453 after: 454 *fromP = from; 454 *fromP = from; 455 *toP = to; 455 *toP = to; 456 return res; 456 return res; 457 } 457 } 458 458 459 #ifdef XML_NS 459 #ifdef XML_NS 460 static const struct normal_encoding utf8_encod 460 static const struct normal_encoding utf8_encoding_ns 461 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 461 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 462 { 462 { 463 # include "asciitab.h" 463 # include "asciitab.h" 464 # include "utf8tab.h" 464 # include "utf8tab.h" 465 }, 465 }, 466 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8 466 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 467 #endif 467 #endif 468 468 469 static const struct normal_encoding utf8_encod 469 static const struct normal_encoding utf8_encoding 470 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 470 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 471 { 471 { 472 #define BT_COLON BT_NMSTRT 472 #define BT_COLON BT_NMSTRT 473 #include "asciitab.h" 473 #include "asciitab.h" 474 #undef BT_COLON 474 #undef BT_COLON 475 #include "utf8tab.h" 475 #include "utf8tab.h" 476 }, 476 }, 477 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8 477 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 478 478 479 #ifdef XML_NS 479 #ifdef XML_NS 480 480 481 static const struct normal_encoding internal_u 481 static const struct normal_encoding internal_utf8_encoding_ns 482 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 482 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 483 { 483 { 484 # include "iasciitab.h" 484 # include "iasciitab.h" 485 # include "utf8tab.h" 485 # include "utf8tab.h" 486 }, 486 }, 487 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8 487 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 488 488 489 #endif 489 #endif 490 490 491 static const struct normal_encoding internal_u 491 static const struct normal_encoding internal_utf8_encoding 492 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 492 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 493 { 493 { 494 #define BT_COLON BT_NMSTRT 494 #define BT_COLON BT_NMSTRT 495 #include "iasciitab.h" 495 #include "iasciitab.h" 496 #undef BT_COLON 496 #undef BT_COLON 497 #include "utf8tab.h" 497 #include "utf8tab.h" 498 }, 498 }, 499 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8 499 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 500 500 501 static enum XML_Convert_Result PTRCALL 501 static enum XML_Convert_Result PTRCALL 502 latin1_toUtf8(const ENCODING *enc, const char 502 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 503 char **toP, const char *toLim) { 503 char **toP, const char *toLim) { 504 UNUSED_P(enc); 504 UNUSED_P(enc); 505 for (;;) { 505 for (;;) { 506 unsigned char c; 506 unsigned char c; 507 if (*fromP == fromLim) 507 if (*fromP == fromLim) 508 return XML_CONVERT_COMPLETED; 508 return XML_CONVERT_COMPLETED; 509 c = (unsigned char)**fromP; 509 c = (unsigned char)**fromP; 510 if (c & 0x80) { 510 if (c & 0x80) { 511 if (toLim - *toP < 2) 511 if (toLim - *toP < 2) 512 return XML_CONVERT_OUTPUT_EXHAUSTED; 512 return XML_CONVERT_OUTPUT_EXHAUSTED; 513 *(*toP)++ = (char)((c >> 6) | UTF8_cval2 513 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 514 *(*toP)++ = (char)((c & 0x3f) | 0x80); 514 *(*toP)++ = (char)((c & 0x3f) | 0x80); 515 (*fromP)++; 515 (*fromP)++; 516 } else { 516 } else { 517 if (*toP == toLim) 517 if (*toP == toLim) 518 return XML_CONVERT_OUTPUT_EXHAUSTED; 518 return XML_CONVERT_OUTPUT_EXHAUSTED; 519 *(*toP)++ = *(*fromP)++; 519 *(*toP)++ = *(*fromP)++; 520 } 520 } 521 } 521 } 522 } 522 } 523 523 524 static enum XML_Convert_Result PTRCALL 524 static enum XML_Convert_Result PTRCALL 525 latin1_toUtf16(const ENCODING *enc, const char 525 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 526 unsigned short **toP, const uns 526 unsigned short **toP, const unsigned short *toLim) { 527 UNUSED_P(enc); 527 UNUSED_P(enc); 528 while (*fromP < fromLim && *toP < toLim) 528 while (*fromP < fromLim && *toP < toLim) 529 *(*toP)++ = (unsigned char)*(*fromP)++; 529 *(*toP)++ = (unsigned char)*(*fromP)++; 530 530 531 if ((*toP == toLim) && (*fromP < fromLim)) 531 if ((*toP == toLim) && (*fromP < fromLim)) 532 return XML_CONVERT_OUTPUT_EXHAUSTED; 532 return XML_CONVERT_OUTPUT_EXHAUSTED; 533 else 533 else 534 return XML_CONVERT_COMPLETED; 534 return XML_CONVERT_COMPLETED; 535 } 535 } 536 536 537 #ifdef XML_NS 537 #ifdef XML_NS 538 538 539 static const struct normal_encoding latin1_enc 539 static const struct normal_encoding latin1_encoding_ns 540 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16 540 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 541 { 541 { 542 # include "asciitab.h" 542 # include "asciitab.h" 543 # include "latin1tab.h" 543 # include "latin1tab.h" 544 }, 544 }, 545 STANDARD_VTABLE(sb_) NULL_VTABLE}; 545 STANDARD_VTABLE(sb_) NULL_VTABLE}; 546 546 547 #endif 547 #endif 548 548 549 static const struct normal_encoding latin1_enc 549 static const struct normal_encoding latin1_encoding 550 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16 550 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 551 { 551 { 552 #define BT_COLON BT_NMSTRT 552 #define BT_COLON BT_NMSTRT 553 #include "asciitab.h" 553 #include "asciitab.h" 554 #undef BT_COLON 554 #undef BT_COLON 555 #include "latin1tab.h" 555 #include "latin1tab.h" 556 }, 556 }, 557 STANDARD_VTABLE(sb_) NULL_VTABLE}; 557 STANDARD_VTABLE(sb_) NULL_VTABLE}; 558 558 559 static enum XML_Convert_Result PTRCALL 559 static enum XML_Convert_Result PTRCALL 560 ascii_toUtf8(const ENCODING *enc, const char * 560 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 561 char **toP, const char *toLim) { 561 char **toP, const char *toLim) { 562 UNUSED_P(enc); 562 UNUSED_P(enc); 563 while (*fromP < fromLim && *toP < toLim) 563 while (*fromP < fromLim && *toP < toLim) 564 *(*toP)++ = *(*fromP)++; 564 *(*toP)++ = *(*fromP)++; 565 565 566 if ((*toP == toLim) && (*fromP < fromLim)) 566 if ((*toP == toLim) && (*fromP < fromLim)) 567 return XML_CONVERT_OUTPUT_EXHAUSTED; 567 return XML_CONVERT_OUTPUT_EXHAUSTED; 568 else 568 else 569 return XML_CONVERT_COMPLETED; 569 return XML_CONVERT_COMPLETED; 570 } 570 } 571 571 572 #ifdef XML_NS 572 #ifdef XML_NS 573 573 574 static const struct normal_encoding ascii_enco 574 static const struct normal_encoding ascii_encoding_ns 575 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 575 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 576 { 576 { 577 # include "asciitab.h" 577 # include "asciitab.h" 578 /* BT_NONXML == 0 */ 578 /* BT_NONXML == 0 */ 579 }, 579 }, 580 STANDARD_VTABLE(sb_) NULL_VTABLE}; 580 STANDARD_VTABLE(sb_) NULL_VTABLE}; 581 581 582 #endif 582 #endif 583 583 584 static const struct normal_encoding ascii_enco 584 static const struct normal_encoding ascii_encoding 585 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 585 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 586 { 586 { 587 #define BT_COLON BT_NMSTRT 587 #define BT_COLON BT_NMSTRT 588 #include "asciitab.h" 588 #include "asciitab.h" 589 #undef BT_COLON 589 #undef BT_COLON 590 /* BT_NONXML == 0 */ 590 /* BT_NONXML == 0 */ 591 }, 591 }, 592 STANDARD_VTABLE(sb_) NULL_VTABLE}; 592 STANDARD_VTABLE(sb_) NULL_VTABLE}; 593 593 594 static int PTRFASTCALL 594 static int PTRFASTCALL 595 unicode_byte_type(char hi, char lo) { 595 unicode_byte_type(char hi, char lo) { 596 switch ((unsigned char)hi) { 596 switch ((unsigned char)hi) { 597 /* 0xD800-0xDBFF first 16-bit code unit or h 597 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */ 598 case 0xD8: 598 case 0xD8: 599 case 0xD9: 599 case 0xD9: 600 case 0xDA: 600 case 0xDA: 601 case 0xDB: 601 case 0xDB: 602 return BT_LEAD4; 602 return BT_LEAD4; 603 /* 0xDC00-0xDFFF second 16-bit code unit or 603 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */ 604 case 0xDC: 604 case 0xDC: 605 case 0xDD: 605 case 0xDD: 606 case 0xDE: 606 case 0xDE: 607 case 0xDF: 607 case 0xDF: 608 return BT_TRAIL; 608 return BT_TRAIL; 609 case 0xFF: 609 case 0xFF: 610 switch ((unsigned char)lo) { 610 switch ((unsigned char)lo) { 611 case 0xFF: /* noncharacter-FFFF */ 611 case 0xFF: /* noncharacter-FFFF */ 612 case 0xFE: /* noncharacter-FFFE */ 612 case 0xFE: /* noncharacter-FFFE */ 613 return BT_NONXML; 613 return BT_NONXML; 614 } 614 } 615 break; 615 break; 616 } 616 } 617 return BT_NONASCII; 617 return BT_NONASCII; 618 } 618 } 619 619 620 #define DEFINE_UTF16_TO_UTF8(E) 620 #define DEFINE_UTF16_TO_UTF8(E) \ 621 static enum XML_Convert_Result PTRCALL E##to 621 static enum XML_Convert_Result PTRCALL E##toUtf8( \ 622 const ENCODING *enc, const char **fromP, 622 const ENCODING *enc, const char **fromP, const char *fromLim, \ 623 char **toP, const char *toLim) { 623 char **toP, const char *toLim) { \ 624 const char *from = *fromP; 624 const char *from = *fromP; \ 625 UNUSED_P(enc); 625 UNUSED_P(enc); \ 626 fromLim = from + (((fromLim - from) >> 1) 626 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ 627 for (; from < fromLim; from += 2) { 627 for (; from < fromLim; from += 2) { \ 628 int plane; 628 int plane; \ 629 unsigned char lo2; 629 unsigned char lo2; \ 630 unsigned char lo = GET_LO(from); 630 unsigned char lo = GET_LO(from); \ 631 unsigned char hi = GET_HI(from); 631 unsigned char hi = GET_HI(from); \ 632 switch (hi) { 632 switch (hi) { \ 633 case 0: 633 case 0: \ 634 if (lo < 0x80) { 634 if (lo < 0x80) { \ 635 if (*toP == toLim) { 635 if (*toP == toLim) { \ 636 *fromP = from; 636 *fromP = from; \ 637 return XML_CONVERT_OUTPUT_EXHAUSTE 637 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 638 } 638 } \ 639 *(*toP)++ = lo; 639 *(*toP)++ = lo; \ 640 break; 640 break; \ 641 } 641 } \ 642 /* fall through */ 642 /* fall through */ \ 643 case 0x1: 643 case 0x1: \ 644 case 0x2: 644 case 0x2: \ 645 case 0x3: 645 case 0x3: \ 646 case 0x4: 646 case 0x4: \ 647 case 0x5: 647 case 0x5: \ 648 case 0x6: 648 case 0x6: \ 649 case 0x7: 649 case 0x7: \ 650 if (toLim - *toP < 2) { 650 if (toLim - *toP < 2) { \ 651 *fromP = from; 651 *fromP = from; \ 652 return XML_CONVERT_OUTPUT_EXHAUSTED; 652 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 653 } 653 } \ 654 *(*toP)++ = ((lo >> 6) | (hi << 2) | U 654 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 655 *(*toP)++ = ((lo & 0x3f) | 0x80); 655 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 656 break; 656 break; \ 657 default: 657 default: \ 658 if (toLim - *toP < 3) { 658 if (toLim - *toP < 3) { \ 659 *fromP = from; 659 *fromP = from; \ 660 return XML_CONVERT_OUTPUT_EXHAUSTED; 660 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 661 } 661 } \ 662 /* 16 bits divided 4, 6, 6 amongst 3 b 662 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 663 *(*toP)++ = ((hi >> 4) | UTF8_cval3); 663 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 664 *(*toP)++ = (((hi & 0xf) << 2) | (lo > 664 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 665 *(*toP)++ = ((lo & 0x3f) | 0x80); 665 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 666 break; 666 break; \ 667 case 0xD8: 667 case 0xD8: \ 668 case 0xD9: 668 case 0xD9: \ 669 case 0xDA: 669 case 0xDA: \ 670 case 0xDB: 670 case 0xDB: \ 671 if (toLim - *toP < 4) { 671 if (toLim - *toP < 4) { \ 672 *fromP = from; 672 *fromP = from; \ 673 return XML_CONVERT_OUTPUT_EXHAUSTED; 673 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 674 } 674 } \ 675 if (fromLim - from < 4) { 675 if (fromLim - from < 4) { \ 676 *fromP = from; 676 *fromP = from; \ 677 return XML_CONVERT_INPUT_INCOMPLETE; 677 return XML_CONVERT_INPUT_INCOMPLETE; \ 678 } 678 } \ 679 plane = (((hi & 0x3) << 2) | ((lo >> 6 679 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 680 *(*toP)++ = (char)((plane >> 2) | UTF8 680 *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \ 681 *(*toP)++ = (((lo >> 2) & 0xF) | ((pla 681 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 682 from += 2; 682 from += 2; \ 683 lo2 = GET_LO(from); 683 lo2 = GET_LO(from); \ 684 *(*toP)++ = (((lo & 0x3) << 4) | ((GET 684 *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \ 685 | (lo2 >> 6) | 0x80); 685 | (lo2 >> 6) | 0x80); \ 686 *(*toP)++ = ((lo2 & 0x3f) | 0x80); 686 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 687 break; 687 break; \ 688 } 688 } \ 689 } 689 } \ 690 *fromP = from; 690 *fromP = from; \ 691 if (from < fromLim) 691 if (from < fromLim) \ 692 return XML_CONVERT_INPUT_INCOMPLETE; 692 return XML_CONVERT_INPUT_INCOMPLETE; \ 693 else 693 else \ 694 return XML_CONVERT_COMPLETED; 694 return XML_CONVERT_COMPLETED; \ 695 } 695 } 696 696 697 #define DEFINE_UTF16_TO_UTF16(E) 697 #define DEFINE_UTF16_TO_UTF16(E) \ 698 static enum XML_Convert_Result PTRCALL E##to 698 static enum XML_Convert_Result PTRCALL E##toUtf16( \ 699 const ENCODING *enc, const char **fromP, 699 const ENCODING *enc, const char **fromP, const char *fromLim, \ 700 unsigned short **toP, const unsigned sho 700 unsigned short **toP, const unsigned short *toLim) { \ 701 enum XML_Convert_Result res = XML_CONVERT_ 701 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ 702 UNUSED_P(enc); 702 UNUSED_P(enc); \ 703 fromLim = *fromP + (((fromLim - *fromP) >> 703 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ 704 /* Avoid copying first half only of surrog 704 /* Avoid copying first half only of surrogate */ \ 705 if (fromLim - *fromP > ((toLim - *toP) << 705 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 706 && (GET_HI(fromLim - 2) & 0xF8) == 0xD 706 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ 707 fromLim -= 2; 707 fromLim -= 2; \ 708 res = XML_CONVERT_INPUT_INCOMPLETE; 708 res = XML_CONVERT_INPUT_INCOMPLETE; \ 709 } 709 } \ 710 for (; *fromP < fromLim && *toP < toLim; * 710 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ 711 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_ 711 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 712 if ((*toP == toLim) && (*fromP < fromLim)) 712 if ((*toP == toLim) && (*fromP < fromLim)) \ 713 return XML_CONVERT_OUTPUT_EXHAUSTED; 713 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 714 else 714 else \ 715 return res; 715 return res; \ 716 } 716 } 717 717 718 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff) 718 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8))) 719 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 719 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 720 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 720 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 721 721 722 DEFINE_UTF16_TO_UTF8(little2_) 722 DEFINE_UTF16_TO_UTF8(little2_) 723 DEFINE_UTF16_TO_UTF16(little2_) 723 DEFINE_UTF16_TO_UTF16(little2_) 724 724 725 #undef SET2 725 #undef SET2 726 #undef GET_LO 726 #undef GET_LO 727 #undef GET_HI 727 #undef GET_HI 728 728 729 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8) 729 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF))) 730 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 730 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 731 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 731 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 732 732 733 DEFINE_UTF16_TO_UTF8(big2_) 733 DEFINE_UTF16_TO_UTF8(big2_) 734 DEFINE_UTF16_TO_UTF16(big2_) 734 DEFINE_UTF16_TO_UTF16(big2_) 735 735 736 #undef SET2 736 #undef SET2 737 #undef GET_LO 737 #undef GET_LO 738 #undef GET_HI 738 #undef GET_HI 739 739 740 #define LITTLE2_BYTE_TYPE(enc, p) 740 #define LITTLE2_BYTE_TYPE(enc, p) \ 741 ((p)[1] == 0 ? ((struct normal_encoding *)(e 741 ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 742 : unicode_byte_type((p)[1], (p) 742 : unicode_byte_type((p)[1], (p)[0])) 743 #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 743 #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1) 744 #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 744 #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c)) 745 #define LITTLE2_IS_NAME_CHAR_MINBPC(p) 745 #define LITTLE2_IS_NAME_CHAR_MINBPC(p) \ 746 UCS2_GET_NAMING(namePages, (unsigned char)p[ 746 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 747 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) 747 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \ 748 UCS2_GET_NAMING(nmstrtPages, (unsigned char) 748 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 749 749 750 #ifdef XML_MIN_SIZE 750 #ifdef XML_MIN_SIZE 751 751 752 static int PTRFASTCALL 752 static int PTRFASTCALL 753 little2_byteType(const ENCODING *enc, const ch 753 little2_byteType(const ENCODING *enc, const char *p) { 754 return LITTLE2_BYTE_TYPE(enc, p); 754 return LITTLE2_BYTE_TYPE(enc, p); 755 } 755 } 756 756 757 static int PTRFASTCALL 757 static int PTRFASTCALL 758 little2_byteToAscii(const ENCODING *enc, const 758 little2_byteToAscii(const ENCODING *enc, const char *p) { 759 UNUSED_P(enc); 759 UNUSED_P(enc); 760 return LITTLE2_BYTE_TO_ASCII(p); 760 return LITTLE2_BYTE_TO_ASCII(p); 761 } 761 } 762 762 763 static int PTRCALL 763 static int PTRCALL 764 little2_charMatches(const ENCODING *enc, const 764 little2_charMatches(const ENCODING *enc, const char *p, int c) { 765 UNUSED_P(enc); 765 UNUSED_P(enc); 766 return LITTLE2_CHAR_MATCHES(p, c); 766 return LITTLE2_CHAR_MATCHES(p, c); 767 } 767 } 768 768 769 static int PTRFASTCALL 769 static int PTRFASTCALL 770 little2_isNameMin(const ENCODING *enc, const c 770 little2_isNameMin(const ENCODING *enc, const char *p) { 771 UNUSED_P(enc); 771 UNUSED_P(enc); 772 return LITTLE2_IS_NAME_CHAR_MINBPC(p); 772 return LITTLE2_IS_NAME_CHAR_MINBPC(p); 773 } 773 } 774 774 775 static int PTRFASTCALL 775 static int PTRFASTCALL 776 little2_isNmstrtMin(const ENCODING *enc, const 776 little2_isNmstrtMin(const ENCODING *enc, const char *p) { 777 UNUSED_P(enc); 777 UNUSED_P(enc); 778 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p); 778 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p); 779 } 779 } 780 780 781 # undef VTABLE 781 # undef VTABLE 782 # define VTABLE VTABLE1, little2_toUtf8, litt 782 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 783 783 784 #else /* not XML_MIN_SIZE */ 784 #else /* not XML_MIN_SIZE */ 785 785 786 # undef PREFIX 786 # undef PREFIX 787 # define PREFIX(ident) little2_##ident 787 # define PREFIX(ident) little2_##ident 788 # define MINBPC(enc) 2 788 # define MINBPC(enc) 2 789 /* CHAR_MATCHES is guaranteed to have MINBPC b 789 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 790 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE( 790 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 791 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_T 791 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p) 792 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR 792 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c) 793 # define IS_NAME_CHAR(enc, p, n) 0 793 # define IS_NAME_CHAR(enc, p, n) 0 794 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_ 794 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p) 795 # define IS_NMSTRT_CHAR(enc, p, n) (0) 795 # define IS_NMSTRT_CHAR(enc, p, n) (0) 796 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE 796 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) 797 797 798 # define XML_TOK_IMPL_C 798 # define XML_TOK_IMPL_C 799 # include "xmltok_impl.c" 799 # include "xmltok_impl.c" 800 # undef XML_TOK_IMPL_C 800 # undef XML_TOK_IMPL_C 801 801 802 # undef MINBPC 802 # undef MINBPC 803 # undef BYTE_TYPE 803 # undef BYTE_TYPE 804 # undef BYTE_TO_ASCII 804 # undef BYTE_TO_ASCII 805 # undef CHAR_MATCHES 805 # undef CHAR_MATCHES 806 # undef IS_NAME_CHAR 806 # undef IS_NAME_CHAR 807 # undef IS_NAME_CHAR_MINBPC 807 # undef IS_NAME_CHAR_MINBPC 808 # undef IS_NMSTRT_CHAR 808 # undef IS_NMSTRT_CHAR 809 # undef IS_NMSTRT_CHAR_MINBPC 809 # undef IS_NMSTRT_CHAR_MINBPC 810 # undef IS_INVALID_CHAR 810 # undef IS_INVALID_CHAR 811 811 812 #endif /* not XML_MIN_SIZE */ 812 #endif /* not XML_MIN_SIZE */ 813 813 814 #ifdef XML_NS 814 #ifdef XML_NS 815 815 816 static const struct normal_encoding little2_en 816 static const struct normal_encoding little2_encoding_ns 817 = {{VTABLE, 2, 0, 817 = {{VTABLE, 2, 0, 818 # if BYTEORDER == 1234 818 # if BYTEORDER == 1234 819 1 819 1 820 # else 820 # else 821 0 821 0 822 # endif 822 # endif 823 }, 823 }, 824 { 824 { 825 # include "asciitab.h" 825 # include "asciitab.h" 826 # include "latin1tab.h" 826 # include "latin1tab.h" 827 }, 827 }, 828 STANDARD_VTABLE(little2_) NULL_VTABLE}; 828 STANDARD_VTABLE(little2_) NULL_VTABLE}; 829 829 830 #endif 830 #endif 831 831 832 static const struct normal_encoding little2_en 832 static const struct normal_encoding little2_encoding 833 = {{VTABLE, 2, 0, 833 = {{VTABLE, 2, 0, 834 #if BYTEORDER == 1234 834 #if BYTEORDER == 1234 835 1 835 1 836 #else 836 #else 837 0 837 0 838 #endif 838 #endif 839 }, 839 }, 840 { 840 { 841 #define BT_COLON BT_NMSTRT 841 #define BT_COLON BT_NMSTRT 842 #include "asciitab.h" 842 #include "asciitab.h" 843 #undef BT_COLON 843 #undef BT_COLON 844 #include "latin1tab.h" 844 #include "latin1tab.h" 845 }, 845 }, 846 STANDARD_VTABLE(little2_) NULL_VTABLE}; 846 STANDARD_VTABLE(little2_) NULL_VTABLE}; 847 847 848 #if BYTEORDER != 4321 848 #if BYTEORDER != 4321 849 849 850 # ifdef XML_NS 850 # ifdef XML_NS 851 851 852 static const struct normal_encoding internal_l 852 static const struct normal_encoding internal_little2_encoding_ns 853 = {{VTABLE, 2, 0, 1}, 853 = {{VTABLE, 2, 0, 1}, 854 { 854 { 855 # include "iasciitab.h" 855 # include "iasciitab.h" 856 # include "latin1tab.h" 856 # include "latin1tab.h" 857 }, 857 }, 858 STANDARD_VTABLE(little2_) NULL_VTABLE}; 858 STANDARD_VTABLE(little2_) NULL_VTABLE}; 859 859 860 # endif 860 # endif 861 861 862 static const struct normal_encoding internal_l 862 static const struct normal_encoding internal_little2_encoding 863 = {{VTABLE, 2, 0, 1}, 863 = {{VTABLE, 2, 0, 1}, 864 { 864 { 865 # define BT_COLON BT_NMSTRT 865 # define BT_COLON BT_NMSTRT 866 # include "iasciitab.h" 866 # include "iasciitab.h" 867 # undef BT_COLON 867 # undef BT_COLON 868 # include "latin1tab.h" 868 # include "latin1tab.h" 869 }, 869 }, 870 STANDARD_VTABLE(little2_) NULL_VTABLE}; 870 STANDARD_VTABLE(little2_) NULL_VTABLE}; 871 871 872 #endif 872 #endif 873 873 874 #define BIG2_BYTE_TYPE(enc, p) 874 #define BIG2_BYTE_TYPE(enc, p) \ 875 ((p)[0] == 0 875 ((p)[0] == 0 \ 876 ? ((struct normal_encoding *)(enc))->ty 876 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 877 : unicode_byte_type((p)[0], (p)[1])) 877 : unicode_byte_type((p)[0], (p)[1])) 878 #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? ( 878 #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1) 879 #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 & 879 #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c)) 880 #define BIG2_IS_NAME_CHAR_MINBPC(p) 880 #define BIG2_IS_NAME_CHAR_MINBPC(p) \ 881 UCS2_GET_NAMING(namePages, (unsigned char)p[ 881 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 882 #define BIG2_IS_NMSTRT_CHAR_MINBPC(p) 882 #define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \ 883 UCS2_GET_NAMING(nmstrtPages, (unsigned char) 883 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 884 884 885 #ifdef XML_MIN_SIZE 885 #ifdef XML_MIN_SIZE 886 886 887 static int PTRFASTCALL 887 static int PTRFASTCALL 888 big2_byteType(const ENCODING *enc, const char 888 big2_byteType(const ENCODING *enc, const char *p) { 889 return BIG2_BYTE_TYPE(enc, p); 889 return BIG2_BYTE_TYPE(enc, p); 890 } 890 } 891 891 892 static int PTRFASTCALL 892 static int PTRFASTCALL 893 big2_byteToAscii(const ENCODING *enc, const ch 893 big2_byteToAscii(const ENCODING *enc, const char *p) { 894 UNUSED_P(enc); 894 UNUSED_P(enc); 895 return BIG2_BYTE_TO_ASCII(p); 895 return BIG2_BYTE_TO_ASCII(p); 896 } 896 } 897 897 898 static int PTRCALL 898 static int PTRCALL 899 big2_charMatches(const ENCODING *enc, const ch 899 big2_charMatches(const ENCODING *enc, const char *p, int c) { 900 UNUSED_P(enc); 900 UNUSED_P(enc); 901 return BIG2_CHAR_MATCHES(p, c); 901 return BIG2_CHAR_MATCHES(p, c); 902 } 902 } 903 903 904 static int PTRFASTCALL 904 static int PTRFASTCALL 905 big2_isNameMin(const ENCODING *enc, const char 905 big2_isNameMin(const ENCODING *enc, const char *p) { 906 UNUSED_P(enc); 906 UNUSED_P(enc); 907 return BIG2_IS_NAME_CHAR_MINBPC(p); 907 return BIG2_IS_NAME_CHAR_MINBPC(p); 908 } 908 } 909 909 910 static int PTRFASTCALL 910 static int PTRFASTCALL 911 big2_isNmstrtMin(const ENCODING *enc, const ch 911 big2_isNmstrtMin(const ENCODING *enc, const char *p) { 912 UNUSED_P(enc); 912 UNUSED_P(enc); 913 return BIG2_IS_NMSTRT_CHAR_MINBPC(p); 913 return BIG2_IS_NMSTRT_CHAR_MINBPC(p); 914 } 914 } 915 915 916 # undef VTABLE 916 # undef VTABLE 917 # define VTABLE VTABLE1, big2_toUtf8, big2_to 917 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 918 918 919 #else /* not XML_MIN_SIZE */ 919 #else /* not XML_MIN_SIZE */ 920 920 921 # undef PREFIX 921 # undef PREFIX 922 # define PREFIX(ident) big2_##ident 922 # define PREFIX(ident) big2_##ident 923 # define MINBPC(enc) 2 923 # define MINBPC(enc) 2 924 /* CHAR_MATCHES is guaranteed to have MINBPC b 924 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 925 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc 925 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 926 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_A 926 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p) 927 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MA 927 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c) 928 # define IS_NAME_CHAR(enc, p, n) 0 928 # define IS_NAME_CHAR(enc, p, n) 0 929 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_ 929 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p) 930 # define IS_NMSTRT_CHAR(enc, p, n) (0) 930 # define IS_NMSTRT_CHAR(enc, p, n) (0) 931 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_I 931 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p) 932 932 933 # define XML_TOK_IMPL_C 933 # define XML_TOK_IMPL_C 934 # include "xmltok_impl.c" 934 # include "xmltok_impl.c" 935 # undef XML_TOK_IMPL_C 935 # undef XML_TOK_IMPL_C 936 936 937 # undef MINBPC 937 # undef MINBPC 938 # undef BYTE_TYPE 938 # undef BYTE_TYPE 939 # undef BYTE_TO_ASCII 939 # undef BYTE_TO_ASCII 940 # undef CHAR_MATCHES 940 # undef CHAR_MATCHES 941 # undef IS_NAME_CHAR 941 # undef IS_NAME_CHAR 942 # undef IS_NAME_CHAR_MINBPC 942 # undef IS_NAME_CHAR_MINBPC 943 # undef IS_NMSTRT_CHAR 943 # undef IS_NMSTRT_CHAR 944 # undef IS_NMSTRT_CHAR_MINBPC 944 # undef IS_NMSTRT_CHAR_MINBPC 945 # undef IS_INVALID_CHAR 945 # undef IS_INVALID_CHAR 946 946 947 #endif /* not XML_MIN_SIZE */ 947 #endif /* not XML_MIN_SIZE */ 948 948 949 #ifdef XML_NS 949 #ifdef XML_NS 950 950 951 static const struct normal_encoding big2_encod 951 static const struct normal_encoding big2_encoding_ns 952 = {{VTABLE, 2, 0, 952 = {{VTABLE, 2, 0, 953 # if BYTEORDER == 4321 953 # if BYTEORDER == 4321 954 1 954 1 955 # else 955 # else 956 0 956 0 957 # endif 957 # endif 958 }, 958 }, 959 { 959 { 960 # include "asciitab.h" 960 # include "asciitab.h" 961 # include "latin1tab.h" 961 # include "latin1tab.h" 962 }, 962 }, 963 STANDARD_VTABLE(big2_) NULL_VTABLE}; 963 STANDARD_VTABLE(big2_) NULL_VTABLE}; 964 964 965 #endif 965 #endif 966 966 967 static const struct normal_encoding big2_encod 967 static const struct normal_encoding big2_encoding 968 = {{VTABLE, 2, 0, 968 = {{VTABLE, 2, 0, 969 #if BYTEORDER == 4321 969 #if BYTEORDER == 4321 970 1 970 1 971 #else 971 #else 972 0 972 0 973 #endif 973 #endif 974 }, 974 }, 975 { 975 { 976 #define BT_COLON BT_NMSTRT 976 #define BT_COLON BT_NMSTRT 977 #include "asciitab.h" 977 #include "asciitab.h" 978 #undef BT_COLON 978 #undef BT_COLON 979 #include "latin1tab.h" 979 #include "latin1tab.h" 980 }, 980 }, 981 STANDARD_VTABLE(big2_) NULL_VTABLE}; 981 STANDARD_VTABLE(big2_) NULL_VTABLE}; 982 982 983 #if BYTEORDER != 1234 983 #if BYTEORDER != 1234 984 984 985 # ifdef XML_NS 985 # ifdef XML_NS 986 986 987 static const struct normal_encoding internal_b 987 static const struct normal_encoding internal_big2_encoding_ns 988 = {{VTABLE, 2, 0, 1}, 988 = {{VTABLE, 2, 0, 1}, 989 { 989 { 990 # include "iasciitab.h" 990 # include "iasciitab.h" 991 # include "latin1tab.h" 991 # include "latin1tab.h" 992 }, 992 }, 993 STANDARD_VTABLE(big2_) NULL_VTABLE}; 993 STANDARD_VTABLE(big2_) NULL_VTABLE}; 994 994 995 # endif 995 # endif 996 996 997 static const struct normal_encoding internal_b 997 static const struct normal_encoding internal_big2_encoding 998 = {{VTABLE, 2, 0, 1}, 998 = {{VTABLE, 2, 0, 1}, 999 { 999 { 1000 # define BT_COLON BT_NMSTRT 1000 # define BT_COLON BT_NMSTRT 1001 # include "iasciitab.h" 1001 # include "iasciitab.h" 1002 # undef BT_COLON 1002 # undef BT_COLON 1003 # include "latin1tab.h" 1003 # include "latin1tab.h" 1004 }, 1004 }, 1005 STANDARD_VTABLE(big2_) NULL_VTABLE}; 1005 STANDARD_VTABLE(big2_) NULL_VTABLE}; 1006 1006 1007 #endif 1007 #endif 1008 1008 1009 #undef PREFIX 1009 #undef PREFIX 1010 1010 1011 static int FASTCALL 1011 static int FASTCALL 1012 streqci(const char *s1, const char *s2) { 1012 streqci(const char *s1, const char *s2) { 1013 for (;;) { 1013 for (;;) { 1014 char c1 = *s1++; 1014 char c1 = *s1++; 1015 char c2 = *s2++; 1015 char c2 = *s2++; 1016 if (ASCII_a <= c1 && c1 <= ASCII_z) 1016 if (ASCII_a <= c1 && c1 <= ASCII_z) 1017 c1 += ASCII_A - ASCII_a; 1017 c1 += ASCII_A - ASCII_a; 1018 if (ASCII_a <= c2 && c2 <= ASCII_z) 1018 if (ASCII_a <= c2 && c2 <= ASCII_z) 1019 /* The following line will never get ex 1019 /* The following line will never get executed. streqci() is 1020 * only called from two places, both of 1020 * only called from two places, both of which guarantee to put 1021 * upper-case strings into s2. 1021 * upper-case strings into s2. 1022 */ 1022 */ 1023 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_L 1023 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */ 1024 if (c1 != c2) 1024 if (c1 != c2) 1025 return 0; 1025 return 0; 1026 if (! c1) 1026 if (! c1) 1027 break; 1027 break; 1028 } 1028 } 1029 return 1; 1029 return 1; 1030 } 1030 } 1031 1031 1032 static void PTRCALL 1032 static void PTRCALL 1033 initUpdatePosition(const ENCODING *enc, const 1033 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end, 1034 POSITION *pos) { 1034 POSITION *pos) { 1035 UNUSED_P(enc); 1035 UNUSED_P(enc); 1036 normal_updatePosition(&utf8_encoding.enc, p 1036 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 1037 } 1037 } 1038 1038 1039 static int 1039 static int 1040 toAscii(const ENCODING *enc, const char *ptr, 1040 toAscii(const ENCODING *enc, const char *ptr, const char *end) { 1041 char buf[1]; 1041 char buf[1]; 1042 char *p = buf; 1042 char *p = buf; 1043 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1043 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1044 if (p == buf) 1044 if (p == buf) 1045 return -1; 1045 return -1; 1046 else 1046 else 1047 return buf[0]; 1047 return buf[0]; 1048 } 1048 } 1049 1049 1050 static int FASTCALL 1050 static int FASTCALL 1051 isSpace(int c) { 1051 isSpace(int c) { 1052 switch (c) { 1052 switch (c) { 1053 case 0x20: 1053 case 0x20: 1054 case 0xD: 1054 case 0xD: 1055 case 0xA: 1055 case 0xA: 1056 case 0x9: 1056 case 0x9: 1057 return 1; 1057 return 1; 1058 } 1058 } 1059 return 0; 1059 return 0; 1060 } 1060 } 1061 1061 1062 /* Return 1 if there's just optional white sp 1062 /* Return 1 if there's just optional white space or there's an S 1063 followed by name=val. 1063 followed by name=val. 1064 */ 1064 */ 1065 static int 1065 static int 1066 parsePseudoAttribute(const ENCODING *enc, con 1066 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end, 1067 const char **namePtr, co 1067 const char **namePtr, const char **nameEndPtr, 1068 const char **valPtr, con 1068 const char **valPtr, const char **nextTokPtr) { 1069 int c; 1069 int c; 1070 char open; 1070 char open; 1071 if (ptr == end) { 1071 if (ptr == end) { 1072 *namePtr = NULL; 1072 *namePtr = NULL; 1073 return 1; 1073 return 1; 1074 } 1074 } 1075 if (! isSpace(toAscii(enc, ptr, end))) { 1075 if (! isSpace(toAscii(enc, ptr, end))) { 1076 *nextTokPtr = ptr; 1076 *nextTokPtr = ptr; 1077 return 0; 1077 return 0; 1078 } 1078 } 1079 do { 1079 do { 1080 ptr += enc->minBytesPerChar; 1080 ptr += enc->minBytesPerChar; 1081 } while (isSpace(toAscii(enc, ptr, end))); 1081 } while (isSpace(toAscii(enc, ptr, end))); 1082 if (ptr == end) { 1082 if (ptr == end) { 1083 *namePtr = NULL; 1083 *namePtr = NULL; 1084 return 1; 1084 return 1; 1085 } 1085 } 1086 *namePtr = ptr; 1086 *namePtr = ptr; 1087 for (;;) { 1087 for (;;) { 1088 c = toAscii(enc, ptr, end); 1088 c = toAscii(enc, ptr, end); 1089 if (c == -1) { 1089 if (c == -1) { 1090 *nextTokPtr = ptr; 1090 *nextTokPtr = ptr; 1091 return 0; 1091 return 0; 1092 } 1092 } 1093 if (c == ASCII_EQUALS) { 1093 if (c == ASCII_EQUALS) { 1094 *nameEndPtr = ptr; 1094 *nameEndPtr = ptr; 1095 break; 1095 break; 1096 } 1096 } 1097 if (isSpace(c)) { 1097 if (isSpace(c)) { 1098 *nameEndPtr = ptr; 1098 *nameEndPtr = ptr; 1099 do { 1099 do { 1100 ptr += enc->minBytesPerChar; 1100 ptr += enc->minBytesPerChar; 1101 } while (isSpace(c = toAscii(enc, ptr, 1101 } while (isSpace(c = toAscii(enc, ptr, end))); 1102 if (c != ASCII_EQUALS) { 1102 if (c != ASCII_EQUALS) { 1103 *nextTokPtr = ptr; 1103 *nextTokPtr = ptr; 1104 return 0; 1104 return 0; 1105 } 1105 } 1106 break; 1106 break; 1107 } 1107 } 1108 ptr += enc->minBytesPerChar; 1108 ptr += enc->minBytesPerChar; 1109 } 1109 } 1110 if (ptr == *namePtr) { 1110 if (ptr == *namePtr) { 1111 *nextTokPtr = ptr; 1111 *nextTokPtr = ptr; 1112 return 0; 1112 return 0; 1113 } 1113 } 1114 ptr += enc->minBytesPerChar; 1114 ptr += enc->minBytesPerChar; 1115 c = toAscii(enc, ptr, end); 1115 c = toAscii(enc, ptr, end); 1116 while (isSpace(c)) { 1116 while (isSpace(c)) { 1117 ptr += enc->minBytesPerChar; 1117 ptr += enc->minBytesPerChar; 1118 c = toAscii(enc, ptr, end); 1118 c = toAscii(enc, ptr, end); 1119 } 1119 } 1120 if (c != ASCII_QUOT && c != ASCII_APOS) { 1120 if (c != ASCII_QUOT && c != ASCII_APOS) { 1121 *nextTokPtr = ptr; 1121 *nextTokPtr = ptr; 1122 return 0; 1122 return 0; 1123 } 1123 } 1124 open = (char)c; 1124 open = (char)c; 1125 ptr += enc->minBytesPerChar; 1125 ptr += enc->minBytesPerChar; 1126 *valPtr = ptr; 1126 *valPtr = ptr; 1127 for (;; ptr += enc->minBytesPerChar) { 1127 for (;; ptr += enc->minBytesPerChar) { 1128 c = toAscii(enc, ptr, end); 1128 c = toAscii(enc, ptr, end); 1129 if (c == open) 1129 if (c == open) 1130 break; 1130 break; 1131 if (! (ASCII_a <= c && c <= ASCII_z) && ! 1131 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z) 1132 && ! (ASCII_0 <= c && c <= ASCII_9) & 1132 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD 1133 && c != ASCII_MINUS && c != ASCII_UND 1133 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) { 1134 *nextTokPtr = ptr; 1134 *nextTokPtr = ptr; 1135 return 0; 1135 return 0; 1136 } 1136 } 1137 } 1137 } 1138 *nextTokPtr = ptr + enc->minBytesPerChar; 1138 *nextTokPtr = ptr + enc->minBytesPerChar; 1139 return 1; 1139 return 1; 1140 } 1140 } 1141 1141 1142 static const char KW_version[] 1142 static const char KW_version[] 1143 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, AS 1143 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'}; 1144 1144 1145 static const char KW_encoding[] = {ASCII_e, A 1145 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, 1146 ASCII_i, A 1146 ASCII_i, ASCII_n, ASCII_g, '\0'}; 1147 1147 1148 static const char KW_standalone[] 1148 static const char KW_standalone[] 1149 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, AS 1149 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, 1150 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\ 1150 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'}; 1151 1151 1152 static const char KW_yes[] = {ASCII_y, ASCII_ 1152 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'}; 1153 1153 1154 static const char KW_no[] = {ASCII_n, ASCII_o 1154 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'}; 1155 1155 1156 static int 1156 static int 1157 doParseXmlDecl(const ENCODING *(*encodingFind 1157 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *, 1158 1158 const char *), 1159 int isGeneralTextEntity, const 1159 int isGeneralTextEntity, const ENCODING *enc, const char *ptr, 1160 const char *end, const char ** 1160 const char *end, const char **badPtr, const char **versionPtr, 1161 const char **versionEndPtr, co 1161 const char **versionEndPtr, const char **encodingName, 1162 const ENCODING **encoding, int 1162 const ENCODING **encoding, int *standalone) { 1163 const char *val = NULL; 1163 const char *val = NULL; 1164 const char *name = NULL; 1164 const char *name = NULL; 1165 const char *nameEnd = NULL; 1165 const char *nameEnd = NULL; 1166 ptr += 5 * enc->minBytesPerChar; 1166 ptr += 5 * enc->minBytesPerChar; 1167 end -= 2 * enc->minBytesPerChar; 1167 end -= 2 * enc->minBytesPerChar; 1168 if (! parsePseudoAttribute(enc, ptr, end, & 1168 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1169 || ! name) { 1169 || ! name) { 1170 *badPtr = ptr; 1170 *badPtr = ptr; 1171 return 0; 1171 return 0; 1172 } 1172 } 1173 if (! XmlNameMatchesAscii(enc, name, nameEn 1173 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1174 if (! isGeneralTextEntity) { 1174 if (! isGeneralTextEntity) { 1175 *badPtr = name; 1175 *badPtr = name; 1176 return 0; 1176 return 0; 1177 } 1177 } 1178 } else { 1178 } else { 1179 if (versionPtr) 1179 if (versionPtr) 1180 *versionPtr = val; 1180 *versionPtr = val; 1181 if (versionEndPtr) 1181 if (versionEndPtr) 1182 *versionEndPtr = ptr; 1182 *versionEndPtr = ptr; 1183 if (! parsePseudoAttribute(enc, ptr, end, 1183 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1184 *badPtr = ptr; 1184 *badPtr = ptr; 1185 return 0; 1185 return 0; 1186 } 1186 } 1187 if (! name) { 1187 if (! name) { 1188 if (isGeneralTextEntity) { 1188 if (isGeneralTextEntity) { 1189 /* a TextDecl must have an EncodingDe 1189 /* a TextDecl must have an EncodingDecl */ 1190 *badPtr = ptr; 1190 *badPtr = ptr; 1191 return 0; 1191 return 0; 1192 } 1192 } 1193 return 1; 1193 return 1; 1194 } 1194 } 1195 } 1195 } 1196 if (XmlNameMatchesAscii(enc, name, nameEnd, 1196 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1197 int c = toAscii(enc, val, end); 1197 int c = toAscii(enc, val, end); 1198 if (! (ASCII_a <= c && c <= ASCII_z) && ! 1198 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) { 1199 *badPtr = val; 1199 *badPtr = val; 1200 return 0; 1200 return 0; 1201 } 1201 } 1202 if (encodingName) 1202 if (encodingName) 1203 *encodingName = val; 1203 *encodingName = val; 1204 if (encoding) 1204 if (encoding) 1205 *encoding = encodingFinder(enc, val, pt 1205 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1206 if (! parsePseudoAttribute(enc, ptr, end, 1206 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1207 *badPtr = ptr; 1207 *badPtr = ptr; 1208 return 0; 1208 return 0; 1209 } 1209 } 1210 if (! name) 1210 if (! name) 1211 return 1; 1211 return 1; 1212 } 1212 } 1213 if (! XmlNameMatchesAscii(enc, name, nameEn 1213 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1214 || isGeneralTextEntity) { 1214 || isGeneralTextEntity) { 1215 *badPtr = name; 1215 *badPtr = name; 1216 return 0; 1216 return 0; 1217 } 1217 } 1218 if (XmlNameMatchesAscii(enc, val, ptr - enc 1218 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1219 if (standalone) 1219 if (standalone) 1220 *standalone = 1; 1220 *standalone = 1; 1221 } else if (XmlNameMatchesAscii(enc, val, pt 1221 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1222 if (standalone) 1222 if (standalone) 1223 *standalone = 0; 1223 *standalone = 0; 1224 } else { 1224 } else { 1225 *badPtr = val; 1225 *badPtr = val; 1226 return 0; 1226 return 0; 1227 } 1227 } 1228 while (isSpace(toAscii(enc, ptr, end))) 1228 while (isSpace(toAscii(enc, ptr, end))) 1229 ptr += enc->minBytesPerChar; 1229 ptr += enc->minBytesPerChar; 1230 if (ptr != end) { 1230 if (ptr != end) { 1231 *badPtr = ptr; 1231 *badPtr = ptr; 1232 return 0; 1232 return 0; 1233 } 1233 } 1234 return 1; 1234 return 1; 1235 } 1235 } 1236 1236 1237 static int FASTCALL 1237 static int FASTCALL 1238 checkCharRefNumber(int result) { 1238 checkCharRefNumber(int result) { 1239 switch (result >> 8) { 1239 switch (result >> 8) { 1240 case 0xD8: 1240 case 0xD8: 1241 case 0xD9: 1241 case 0xD9: 1242 case 0xDA: 1242 case 0xDA: 1243 case 0xDB: 1243 case 0xDB: 1244 case 0xDC: 1244 case 0xDC: 1245 case 0xDD: 1245 case 0xDD: 1246 case 0xDE: 1246 case 0xDE: 1247 case 0xDF: 1247 case 0xDF: 1248 return -1; 1248 return -1; 1249 case 0: 1249 case 0: 1250 if (latin1_encoding.type[result] == BT_NO 1250 if (latin1_encoding.type[result] == BT_NONXML) 1251 return -1; 1251 return -1; 1252 break; 1252 break; 1253 case 0xFF: 1253 case 0xFF: 1254 if (result == 0xFFFE || result == 0xFFFF) 1254 if (result == 0xFFFE || result == 0xFFFF) 1255 return -1; 1255 return -1; 1256 break; 1256 break; 1257 } 1257 } 1258 return result; 1258 return result; 1259 } 1259 } 1260 1260 1261 int FASTCALL 1261 int FASTCALL 1262 XmlUtf8Encode(int c, char *buf) { 1262 XmlUtf8Encode(int c, char *buf) { 1263 enum { 1263 enum { 1264 /* minN is minimum legal resulting value 1264 /* minN is minimum legal resulting value for N byte sequence */ 1265 min2 = 0x80, 1265 min2 = 0x80, 1266 min3 = 0x800, 1266 min3 = 0x800, 1267 min4 = 0x10000 1267 min4 = 0x10000 1268 }; 1268 }; 1269 1269 1270 if (c < 0) 1270 if (c < 0) 1271 return 0; /* LCOV_EXCL_LINE: this case is 1271 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */ 1272 if (c < min2) { 1272 if (c < min2) { 1273 buf[0] = (char)(c | UTF8_cval1); 1273 buf[0] = (char)(c | UTF8_cval1); 1274 return 1; 1274 return 1; 1275 } 1275 } 1276 if (c < min3) { 1276 if (c < min3) { 1277 buf[0] = (char)((c >> 6) | UTF8_cval2); 1277 buf[0] = (char)((c >> 6) | UTF8_cval2); 1278 buf[1] = (char)((c & 0x3f) | 0x80); 1278 buf[1] = (char)((c & 0x3f) | 0x80); 1279 return 2; 1279 return 2; 1280 } 1280 } 1281 if (c < min4) { 1281 if (c < min4) { 1282 buf[0] = (char)((c >> 12) | UTF8_cval3); 1282 buf[0] = (char)((c >> 12) | UTF8_cval3); 1283 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80) 1283 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1284 buf[2] = (char)((c & 0x3f) | 0x80); 1284 buf[2] = (char)((c & 0x3f) | 0x80); 1285 return 3; 1285 return 3; 1286 } 1286 } 1287 if (c < 0x110000) { 1287 if (c < 0x110000) { 1288 buf[0] = (char)((c >> 18) | UTF8_cval4); 1288 buf[0] = (char)((c >> 18) | UTF8_cval4); 1289 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80 1289 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1290 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80) 1290 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1291 buf[3] = (char)((c & 0x3f) | 0x80); 1291 buf[3] = (char)((c & 0x3f) | 0x80); 1292 return 4; 1292 return 4; 1293 } 1293 } 1294 return 0; /* LCOV_EXCL_LINE: this case too 1294 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */ 1295 } 1295 } 1296 1296 1297 int FASTCALL 1297 int FASTCALL 1298 XmlUtf16Encode(int charNum, unsigned short *b 1298 XmlUtf16Encode(int charNum, unsigned short *buf) { 1299 if (charNum < 0) 1299 if (charNum < 0) 1300 return 0; 1300 return 0; 1301 if (charNum < 0x10000) { 1301 if (charNum < 0x10000) { 1302 buf[0] = (unsigned short)charNum; 1302 buf[0] = (unsigned short)charNum; 1303 return 1; 1303 return 1; 1304 } 1304 } 1305 if (charNum < 0x110000) { 1305 if (charNum < 0x110000) { 1306 charNum -= 0x10000; 1306 charNum -= 0x10000; 1307 buf[0] = (unsigned short)((charNum >> 10) 1307 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1308 buf[1] = (unsigned short)((charNum & 0x3F 1308 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1309 return 2; 1309 return 2; 1310 } 1310 } 1311 return 0; 1311 return 0; 1312 } 1312 } 1313 1313 1314 struct unknown_encoding { 1314 struct unknown_encoding { 1315 struct normal_encoding normal; 1315 struct normal_encoding normal; 1316 CONVERTER convert; 1316 CONVERTER convert; 1317 void *userData; 1317 void *userData; 1318 unsigned short utf16[256]; 1318 unsigned short utf16[256]; 1319 char utf8[256][4]; 1319 char utf8[256][4]; 1320 }; 1320 }; 1321 1321 1322 #define AS_UNKNOWN_ENCODING(enc) ((const stru 1322 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc)) 1323 1323 1324 int 1324 int 1325 XmlSizeOfUnknownEncoding(void) { 1325 XmlSizeOfUnknownEncoding(void) { 1326 return sizeof(struct unknown_encoding); 1326 return sizeof(struct unknown_encoding); 1327 } 1327 } 1328 1328 1329 static int PTRFASTCALL 1329 static int PTRFASTCALL 1330 unknown_isName(const ENCODING *enc, const cha 1330 unknown_isName(const ENCODING *enc, const char *p) { 1331 const struct unknown_encoding *uenc = AS_UN 1331 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1332 int c = uenc->convert(uenc->userData, p); 1332 int c = uenc->convert(uenc->userData, p); 1333 if (c & ~0xFFFF) 1333 if (c & ~0xFFFF) 1334 return 0; 1334 return 0; 1335 return UCS2_GET_NAMING(namePages, c >> 8, c 1335 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1336 } 1336 } 1337 1337 1338 static int PTRFASTCALL 1338 static int PTRFASTCALL 1339 unknown_isNmstrt(const ENCODING *enc, const c 1339 unknown_isNmstrt(const ENCODING *enc, const char *p) { 1340 const struct unknown_encoding *uenc = AS_UN 1340 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1341 int c = uenc->convert(uenc->userData, p); 1341 int c = uenc->convert(uenc->userData, p); 1342 if (c & ~0xFFFF) 1342 if (c & ~0xFFFF) 1343 return 0; 1343 return 0; 1344 return UCS2_GET_NAMING(nmstrtPages, c >> 8, 1344 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1345 } 1345 } 1346 1346 1347 static int PTRFASTCALL 1347 static int PTRFASTCALL 1348 unknown_isInvalid(const ENCODING *enc, const 1348 unknown_isInvalid(const ENCODING *enc, const char *p) { 1349 const struct unknown_encoding *uenc = AS_UN 1349 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1350 int c = uenc->convert(uenc->userData, p); 1350 int c = uenc->convert(uenc->userData, p); 1351 return (c & ~0xFFFF) || checkCharRefNumber( 1351 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1352 } 1352 } 1353 1353 1354 static enum XML_Convert_Result PTRCALL 1354 static enum XML_Convert_Result PTRCALL 1355 unknown_toUtf8(const ENCODING *enc, const cha 1355 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 1356 char **toP, const char *toLim) 1356 char **toP, const char *toLim) { 1357 const struct unknown_encoding *uenc = AS_UN 1357 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1358 char buf[XML_UTF8_ENCODE_MAX]; 1358 char buf[XML_UTF8_ENCODE_MAX]; 1359 for (;;) { 1359 for (;;) { 1360 const char *utf8; 1360 const char *utf8; 1361 int n; 1361 int n; 1362 if (*fromP == fromLim) 1362 if (*fromP == fromLim) 1363 return XML_CONVERT_COMPLETED; 1363 return XML_CONVERT_COMPLETED; 1364 utf8 = uenc->utf8[(unsigned char)**fromP] 1364 utf8 = uenc->utf8[(unsigned char)**fromP]; 1365 n = *utf8++; 1365 n = *utf8++; 1366 if (n == 0) { 1366 if (n == 0) { 1367 int c = uenc->convert(uenc->userData, * 1367 int c = uenc->convert(uenc->userData, *fromP); 1368 n = XmlUtf8Encode(c, buf); 1368 n = XmlUtf8Encode(c, buf); 1369 if (n > toLim - *toP) 1369 if (n > toLim - *toP) 1370 return XML_CONVERT_OUTPUT_EXHAUSTED; 1370 return XML_CONVERT_OUTPUT_EXHAUSTED; 1371 utf8 = buf; 1371 utf8 = buf; 1372 *fromP += (AS_NORMAL_ENCODING(enc)->typ 1372 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1373 - (BT_LEAD2 - 2)); 1373 - (BT_LEAD2 - 2)); 1374 } else { 1374 } else { 1375 if (n > toLim - *toP) 1375 if (n > toLim - *toP) 1376 return XML_CONVERT_OUTPUT_EXHAUSTED; 1376 return XML_CONVERT_OUTPUT_EXHAUSTED; 1377 (*fromP)++; 1377 (*fromP)++; 1378 } 1378 } 1379 memcpy(*toP, utf8, n); 1379 memcpy(*toP, utf8, n); 1380 *toP += n; 1380 *toP += n; 1381 } 1381 } 1382 } 1382 } 1383 1383 1384 static enum XML_Convert_Result PTRCALL 1384 static enum XML_Convert_Result PTRCALL 1385 unknown_toUtf16(const ENCODING *enc, const ch 1385 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 1386 unsigned short **toP, const u 1386 unsigned short **toP, const unsigned short *toLim) { 1387 const struct unknown_encoding *uenc = AS_UN 1387 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1388 while (*fromP < fromLim && *toP < toLim) { 1388 while (*fromP < fromLim && *toP < toLim) { 1389 unsigned short c = uenc->utf16[(unsigned 1389 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1390 if (c == 0) { 1390 if (c == 0) { 1391 c = (unsigned short)uenc->convert(uenc- 1391 c = (unsigned short)uenc->convert(uenc->userData, *fromP); 1392 *fromP += (AS_NORMAL_ENCODING(enc)->typ 1392 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1393 - (BT_LEAD2 - 2)); 1393 - (BT_LEAD2 - 2)); 1394 } else 1394 } else 1395 (*fromP)++; 1395 (*fromP)++; 1396 *(*toP)++ = c; 1396 *(*toP)++ = c; 1397 } 1397 } 1398 1398 1399 if ((*toP == toLim) && (*fromP < fromLim)) 1399 if ((*toP == toLim) && (*fromP < fromLim)) 1400 return XML_CONVERT_OUTPUT_EXHAUSTED; 1400 return XML_CONVERT_OUTPUT_EXHAUSTED; 1401 else 1401 else 1402 return XML_CONVERT_COMPLETED; 1402 return XML_CONVERT_COMPLETED; 1403 } 1403 } 1404 1404 1405 ENCODING * 1405 ENCODING * 1406 XmlInitUnknownEncoding(void *mem, int *table, 1406 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, 1407 void *userData) { 1407 void *userData) { 1408 int i; 1408 int i; 1409 struct unknown_encoding *e = (struct unknow 1409 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1410 memcpy(mem, &latin1_encoding, sizeof(struct 1410 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding)); 1411 for (i = 0; i < 128; i++) 1411 for (i = 0; i < 128; i++) 1412 if (latin1_encoding.type[i] != BT_OTHER 1412 if (latin1_encoding.type[i] != BT_OTHER 1413 && latin1_encoding.type[i] != BT_NONX 1413 && latin1_encoding.type[i] != BT_NONXML && table[i] != i) 1414 return 0; 1414 return 0; 1415 for (i = 0; i < 256; i++) { 1415 for (i = 0; i < 256; i++) { 1416 int c = table[i]; 1416 int c = table[i]; 1417 if (c == -1) { 1417 if (c == -1) { 1418 e->normal.type[i] = BT_MALFORM; 1418 e->normal.type[i] = BT_MALFORM; 1419 /* This shouldn't really get used. */ 1419 /* This shouldn't really get used. */ 1420 e->utf16[i] = 0xFFFF; 1420 e->utf16[i] = 0xFFFF; 1421 e->utf8[i][0] = 1; 1421 e->utf8[i][0] = 1; 1422 e->utf8[i][1] = 0; 1422 e->utf8[i][1] = 0; 1423 } else if (c < 0) { 1423 } else if (c < 0) { 1424 if (c < -4) 1424 if (c < -4) 1425 return 0; 1425 return 0; 1426 /* Multi-byte sequences need a converte 1426 /* Multi-byte sequences need a converter function */ 1427 if (! convert) 1427 if (! convert) 1428 return 0; 1428 return 0; 1429 e->normal.type[i] = (unsigned char)(BT_ 1429 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1430 e->utf8[i][0] = 0; 1430 e->utf8[i][0] = 0; 1431 e->utf16[i] = 0; 1431 e->utf16[i] = 0; 1432 } else if (c < 0x80) { 1432 } else if (c < 0x80) { 1433 if (latin1_encoding.type[c] != BT_OTHER 1433 if (latin1_encoding.type[c] != BT_OTHER 1434 && latin1_encoding.type[c] != BT_NO 1434 && latin1_encoding.type[c] != BT_NONXML && c != i) 1435 return 0; 1435 return 0; 1436 e->normal.type[i] = latin1_encoding.typ 1436 e->normal.type[i] = latin1_encoding.type[c]; 1437 e->utf8[i][0] = 1; 1437 e->utf8[i][0] = 1; 1438 e->utf8[i][1] = (char)c; 1438 e->utf8[i][1] = (char)c; 1439 e->utf16[i] = (unsigned short)(c == 0 ? 1439 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1440 } else if (checkCharRefNumber(c) < 0) { 1440 } else if (checkCharRefNumber(c) < 0) { 1441 e->normal.type[i] = BT_NONXML; 1441 e->normal.type[i] = BT_NONXML; 1442 /* This shouldn't really get used. */ 1442 /* This shouldn't really get used. */ 1443 e->utf16[i] = 0xFFFF; 1443 e->utf16[i] = 0xFFFF; 1444 e->utf8[i][0] = 1; 1444 e->utf8[i][0] = 1; 1445 e->utf8[i][1] = 0; 1445 e->utf8[i][1] = 0; 1446 } else { 1446 } else { 1447 if (c > 0xFFFF) 1447 if (c > 0xFFFF) 1448 return 0; 1448 return 0; 1449 if (UCS2_GET_NAMING(nmstrtPages, c >> 8 1449 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1450 e->normal.type[i] = BT_NMSTRT; 1450 e->normal.type[i] = BT_NMSTRT; 1451 else if (UCS2_GET_NAMING(namePages, c > 1451 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1452 e->normal.type[i] = BT_NAME; 1452 e->normal.type[i] = BT_NAME; 1453 else 1453 else 1454 e->normal.type[i] = BT_OTHER; 1454 e->normal.type[i] = BT_OTHER; 1455 e->utf8[i][0] = (char)XmlUtf8Encode(c, 1455 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1456 e->utf16[i] = (unsigned short)c; 1456 e->utf16[i] = (unsigned short)c; 1457 } 1457 } 1458 } 1458 } 1459 e->userData = userData; 1459 e->userData = userData; 1460 e->convert = convert; 1460 e->convert = convert; 1461 if (convert) { 1461 if (convert) { 1462 e->normal.isName2 = unknown_isName; 1462 e->normal.isName2 = unknown_isName; 1463 e->normal.isName3 = unknown_isName; 1463 e->normal.isName3 = unknown_isName; 1464 e->normal.isName4 = unknown_isName; 1464 e->normal.isName4 = unknown_isName; 1465 e->normal.isNmstrt2 = unknown_isNmstrt; 1465 e->normal.isNmstrt2 = unknown_isNmstrt; 1466 e->normal.isNmstrt3 = unknown_isNmstrt; 1466 e->normal.isNmstrt3 = unknown_isNmstrt; 1467 e->normal.isNmstrt4 = unknown_isNmstrt; 1467 e->normal.isNmstrt4 = unknown_isNmstrt; 1468 e->normal.isInvalid2 = unknown_isInvalid; 1468 e->normal.isInvalid2 = unknown_isInvalid; 1469 e->normal.isInvalid3 = unknown_isInvalid; 1469 e->normal.isInvalid3 = unknown_isInvalid; 1470 e->normal.isInvalid4 = unknown_isInvalid; 1470 e->normal.isInvalid4 = unknown_isInvalid; 1471 } 1471 } 1472 e->normal.enc.utf8Convert = unknown_toUtf8; 1472 e->normal.enc.utf8Convert = unknown_toUtf8; 1473 e->normal.enc.utf16Convert = unknown_toUtf1 1473 e->normal.enc.utf16Convert = unknown_toUtf16; 1474 return &(e->normal.enc); 1474 return &(e->normal.enc); 1475 } 1475 } 1476 1476 1477 /* If this enumeration is changed, getEncodin 1477 /* If this enumeration is changed, getEncodingIndex and encodings 1478 must also be changed. */ 1478 must also be changed. */ 1479 enum { 1479 enum { 1480 UNKNOWN_ENC = -1, 1480 UNKNOWN_ENC = -1, 1481 ISO_8859_1_ENC = 0, 1481 ISO_8859_1_ENC = 0, 1482 US_ASCII_ENC, 1482 US_ASCII_ENC, 1483 UTF_8_ENC, 1483 UTF_8_ENC, 1484 UTF_16_ENC, 1484 UTF_16_ENC, 1485 UTF_16BE_ENC, 1485 UTF_16BE_ENC, 1486 UTF_16LE_ENC, 1486 UTF_16LE_ENC, 1487 /* must match encodingNames up to here */ 1487 /* must match encodingNames up to here */ 1488 NO_ENC 1488 NO_ENC 1489 }; 1489 }; 1490 1490 1491 static const char KW_ISO_8859_1[] 1491 static const char KW_ISO_8859_1[] 1492 = {ASCII_I, ASCII_S, ASCII_O, ASCII_M 1492 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, 1493 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1 1493 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'}; 1494 static const char KW_US_ASCII[] 1494 static const char KW_US_ASCII[] 1495 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A 1495 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, 1496 ASCII_C, ASCII_I, ASCII_I, '\0'}; 1496 ASCII_C, ASCII_I, ASCII_I, '\0'}; 1497 static const char KW_UTF_8[] 1497 static const char KW_UTF_8[] 1498 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS 1498 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'}; 1499 static const char KW_UTF_16[] 1499 static const char KW_UTF_16[] 1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS 1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'}; 1501 static const char KW_UTF_16BE[] 1501 static const char KW_UTF_16BE[] 1502 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS 1502 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1503 ASCII_6, ASCII_B, ASCII_E, '\0'}; 1503 ASCII_6, ASCII_B, ASCII_E, '\0'}; 1504 static const char KW_UTF_16LE[] 1504 static const char KW_UTF_16LE[] 1505 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS 1505 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1506 ASCII_6, ASCII_L, ASCII_E, '\0'}; 1506 ASCII_6, ASCII_L, ASCII_E, '\0'}; 1507 1507 1508 static int FASTCALL 1508 static int FASTCALL 1509 getEncodingIndex(const char *name) { 1509 getEncodingIndex(const char *name) { 1510 static const char *const encodingNames[] = 1510 static const char *const encodingNames[] = { 1511 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, K 1511 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE, 1512 }; 1512 }; 1513 int i; 1513 int i; 1514 if (name == NULL) 1514 if (name == NULL) 1515 return NO_ENC; 1515 return NO_ENC; 1516 for (i = 0; i < (int)(sizeof(encodingNames) 1516 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++) 1517 if (streqci(name, encodingNames[i])) 1517 if (streqci(name, encodingNames[i])) 1518 return i; 1518 return i; 1519 return UNKNOWN_ENC; 1519 return UNKNOWN_ENC; 1520 } 1520 } 1521 1521 1522 /* For binary compatibility, we store the ind 1522 /* For binary compatibility, we store the index of the encoding 1523 specified at initialization in the isUtf16 1523 specified at initialization in the isUtf16 member. 1524 */ 1524 */ 1525 1525 1526 #define INIT_ENC_INDEX(enc) ((int)(enc)->init 1526 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1527 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->in 1527 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1528 1528 1529 /* This is what detects the encoding. encodi 1529 /* This is what detects the encoding. encodingTable maps from 1530 encoding indices to encodings; INIT_ENC_IN 1530 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1531 the external (protocol) specified encoding 1531 the external (protocol) specified encoding; state is 1532 XML_CONTENT_STATE if we're parsing an exte 1532 XML_CONTENT_STATE if we're parsing an external text entity, and 1533 XML_PROLOG_STATE otherwise. 1533 XML_PROLOG_STATE otherwise. 1534 */ 1534 */ 1535 1535 1536 static int 1536 static int 1537 initScan(const ENCODING *const *encodingTable 1537 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc, 1538 int state, const char *ptr, const ch 1538 int state, const char *ptr, const char *end, const char **nextTokPtr) { 1539 const ENCODING **encPtr; 1539 const ENCODING **encPtr; 1540 1540 1541 if (ptr >= end) 1541 if (ptr >= end) 1542 return XML_TOK_NONE; 1542 return XML_TOK_NONE; 1543 encPtr = enc->encPtr; 1543 encPtr = enc->encPtr; 1544 if (ptr + 1 == end) { 1544 if (ptr + 1 == end) { 1545 /* only a single byte available for auto- 1545 /* only a single byte available for auto-detection */ 1546 #ifndef XML_DTD /* FIXME */ 1546 #ifndef XML_DTD /* FIXME */ 1547 /* a well-formed document entity must hav 1547 /* a well-formed document entity must have more than one byte */ 1548 if (state != XML_CONTENT_STATE) 1548 if (state != XML_CONTENT_STATE) 1549 return XML_TOK_PARTIAL; 1549 return XML_TOK_PARTIAL; 1550 #endif 1550 #endif 1551 /* so we're parsing an external text enti 1551 /* so we're parsing an external text entity... */ 1552 /* if UTF-16 was externally specified, th 1552 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1553 switch (INIT_ENC_INDEX(enc)) { 1553 switch (INIT_ENC_INDEX(enc)) { 1554 case UTF_16_ENC: 1554 case UTF_16_ENC: 1555 case UTF_16LE_ENC: 1555 case UTF_16LE_ENC: 1556 case UTF_16BE_ENC: 1556 case UTF_16BE_ENC: 1557 return XML_TOK_PARTIAL; 1557 return XML_TOK_PARTIAL; 1558 } 1558 } 1559 switch ((unsigned char)*ptr) { 1559 switch ((unsigned char)*ptr) { 1560 case 0xFE: 1560 case 0xFE: 1561 case 0xFF: 1561 case 0xFF: 1562 case 0xEF: /* possibly first byte of UTF- 1562 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1563 if (INIT_ENC_INDEX(enc) == ISO_8859_1_E 1563 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1564 break; 1564 break; 1565 /* fall through */ 1565 /* fall through */ 1566 case 0x00: 1566 case 0x00: 1567 case 0x3C: 1567 case 0x3C: 1568 return XML_TOK_PARTIAL; 1568 return XML_TOK_PARTIAL; 1569 } 1569 } 1570 } else { 1570 } else { 1571 switch (((unsigned char)ptr[0] << 8) | (u 1571 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1572 case 0xFEFF: 1572 case 0xFEFF: 1573 if (INIT_ENC_INDEX(enc) == ISO_8859_1_E 1573 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1574 break; 1574 break; 1575 *nextTokPtr = ptr + 2; 1575 *nextTokPtr = ptr + 2; 1576 *encPtr = encodingTable[UTF_16BE_ENC]; 1576 *encPtr = encodingTable[UTF_16BE_ENC]; 1577 return XML_TOK_BOM; 1577 return XML_TOK_BOM; 1578 /* 00 3C is handled in the default case * 1578 /* 00 3C is handled in the default case */ 1579 case 0x3C00: 1579 case 0x3C00: 1580 if ((INIT_ENC_INDEX(enc) == UTF_16BE_EN 1580 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1581 || INIT_ENC_INDEX(enc) == UTF_16_E 1581 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1582 && state == XML_CONTENT_STATE) 1582 && state == XML_CONTENT_STATE) 1583 break; 1583 break; 1584 *encPtr = encodingTable[UTF_16LE_ENC]; 1584 *encPtr = encodingTable[UTF_16LE_ENC]; 1585 return XmlTok(*encPtr, state, ptr, end, 1585 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1586 case 0xFFFE: 1586 case 0xFFFE: 1587 if (INIT_ENC_INDEX(enc) == ISO_8859_1_E 1587 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1588 break; 1588 break; 1589 *nextTokPtr = ptr + 2; 1589 *nextTokPtr = ptr + 2; 1590 *encPtr = encodingTable[UTF_16LE_ENC]; 1590 *encPtr = encodingTable[UTF_16LE_ENC]; 1591 return XML_TOK_BOM; 1591 return XML_TOK_BOM; 1592 case 0xEFBB: 1592 case 0xEFBB: 1593 /* Maybe a UTF-8 BOM (EF BB BF) */ 1593 /* Maybe a UTF-8 BOM (EF BB BF) */ 1594 /* If there's an explicitly specified ( 1594 /* If there's an explicitly specified (external) encoding 1595 of ISO-8859-1 or some flavour of UTF 1595 of ISO-8859-1 or some flavour of UTF-16 1596 and this is an external text entity, 1596 and this is an external text entity, 1597 don't look for the BOM, 1597 don't look for the BOM, 1598 because it might be a legal data. 1598 because it might be a legal data. 1599 */ 1599 */ 1600 if (state == XML_CONTENT_STATE) { 1600 if (state == XML_CONTENT_STATE) { 1601 int e = INIT_ENC_INDEX(enc); 1601 int e = INIT_ENC_INDEX(enc); 1602 if (e == ISO_8859_1_ENC || e == UTF_1 1602 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC 1603 || e == UTF_16_ENC) 1603 || e == UTF_16_ENC) 1604 break; 1604 break; 1605 } 1605 } 1606 if (ptr + 2 == end) 1606 if (ptr + 2 == end) 1607 return XML_TOK_PARTIAL; 1607 return XML_TOK_PARTIAL; 1608 if ((unsigned char)ptr[2] == 0xBF) { 1608 if ((unsigned char)ptr[2] == 0xBF) { 1609 *nextTokPtr = ptr + 3; 1609 *nextTokPtr = ptr + 3; 1610 *encPtr = encodingTable[UTF_8_ENC]; 1610 *encPtr = encodingTable[UTF_8_ENC]; 1611 return XML_TOK_BOM; 1611 return XML_TOK_BOM; 1612 } 1612 } 1613 break; 1613 break; 1614 default: 1614 default: 1615 if (ptr[0] == '\0') { 1615 if (ptr[0] == '\0') { 1616 /* 0 isn't a legal data character. Fu 1616 /* 0 isn't a legal data character. Furthermore a document 1617 entity can only start with ASCII c 1617 entity can only start with ASCII characters. So the only 1618 way this can fail to be big-endian 1618 way this can fail to be big-endian UTF-16 if it it's an 1619 external parsed general entity tha 1619 external parsed general entity that's labelled as 1620 UTF-16LE. 1620 UTF-16LE. 1621 */ 1621 */ 1622 if (state == XML_CONTENT_STATE && INI 1622 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1623 break; 1623 break; 1624 *encPtr = encodingTable[UTF_16BE_ENC] 1624 *encPtr = encodingTable[UTF_16BE_ENC]; 1625 return XmlTok(*encPtr, state, ptr, en 1625 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1626 } else if (ptr[1] == '\0') { 1626 } else if (ptr[1] == '\0') { 1627 /* We could recover here in the case: 1627 /* We could recover here in the case: 1628 - parsing an external entity 1628 - parsing an external entity 1629 - second byte is 0 1629 - second byte is 0 1630 - no externally specified encodin 1630 - no externally specified encoding 1631 - no encoding declaration 1631 - no encoding declaration 1632 by assuming UTF-16LE. But we don' 1632 by assuming UTF-16LE. But we don't, because this would mean when 1633 presented just with a single byte, 1633 presented just with a single byte, we couldn't reliably determine 1634 whether we needed further bytes. 1634 whether we needed further bytes. 1635 */ 1635 */ 1636 if (state == XML_CONTENT_STATE) 1636 if (state == XML_CONTENT_STATE) 1637 break; 1637 break; 1638 *encPtr = encodingTable[UTF_16LE_ENC] 1638 *encPtr = encodingTable[UTF_16LE_ENC]; 1639 return XmlTok(*encPtr, state, ptr, en 1639 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1640 } 1640 } 1641 break; 1641 break; 1642 } 1642 } 1643 } 1643 } 1644 *encPtr = encodingTable[INIT_ENC_INDEX(enc) 1644 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1645 return XmlTok(*encPtr, state, ptr, end, nex 1645 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1646 } 1646 } 1647 1647 1648 #define NS(x) x 1648 #define NS(x) x 1649 #define ns(x) x 1649 #define ns(x) x 1650 #define XML_TOK_NS_C 1650 #define XML_TOK_NS_C 1651 #include "xmltok_ns.c" 1651 #include "xmltok_ns.c" 1652 #undef XML_TOK_NS_C 1652 #undef XML_TOK_NS_C 1653 #undef NS 1653 #undef NS 1654 #undef ns 1654 #undef ns 1655 1655 1656 #ifdef XML_NS 1656 #ifdef XML_NS 1657 1657 1658 # define NS(x) x##NS 1658 # define NS(x) x##NS 1659 # define ns(x) x##_ns 1659 # define ns(x) x##_ns 1660 1660 1661 # define XML_TOK_NS_C 1661 # define XML_TOK_NS_C 1662 # include "xmltok_ns.c" 1662 # include "xmltok_ns.c" 1663 # undef XML_TOK_NS_C 1663 # undef XML_TOK_NS_C 1664 1664 1665 # undef NS 1665 # undef NS 1666 # undef ns 1666 # undef ns 1667 1667 1668 ENCODING * 1668 ENCODING * 1669 XmlInitUnknownEncodingNS(void *mem, int *tabl 1669 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert, 1670 void *userData) { 1670 void *userData) { 1671 ENCODING *enc = XmlInitUnknownEncoding(mem, 1671 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1672 if (enc) 1672 if (enc) 1673 ((struct normal_encoding *)enc)->type[ASC 1673 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1674 return enc; 1674 return enc; 1675 } 1675 } 1676 1676 1677 #endif /* XML_NS */ 1677 #endif /* XML_NS */ 1678 1678