1 /*
2 * $Id: XmlChars.java,v 1.1.1.1 2000/11/23 01:53:35 edwingo Exp $
3 *
4 * The Apache Software License, Version 1.1
5 *
6 *
7 * Copyright (c) 2000 The Apache Software Foundation. All rights
8 * reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
20 * distribution.
21 *
22 * 3. The end-user documentation included with the redistribution,
23 * if any, must include the following acknowledgment:
24 * "This product includes software developed by the
25 * Apache Software Foundation (http://www.apache.org/)."
26 * Alternately, this acknowledgment may appear in the software itself,
27 * if and wherever such third-party acknowledgments normally appear.
28 *
29 * 4. The names "Crimson" and "Apache Software Foundation" must
30 * not be used to endorse or promote products derived from this
31 * software without prior written permission. For written
32 * permission, please contact apache@apache.org.
33 *
34 * 5. Products derived from this software may not be called "Apache",
35 * nor may "Apache" appear in their name, without prior written
36 * permission of the Apache Software Foundation.
37 *
38 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49 * SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This software consists of voluntary contributions made by many
53 * individuals on behalf of the Apache Software Foundation and was
54 * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
55 * http://www.sun.com. For more information on the Apache Software
56 * Foundation, please see <http://www.apache.org/>.
57 */
58
59 package com.bea.xml.stream.reader;
60
61
/**
 * Static predicates that decide which roles a character may play in an
 * XML document: legal document character, name character, whitespace
 * or letter.  The same tests serve both for parsing documents and for
 * producing them.
 *
 * @version 1.8
 * @author David Brownell
 */
public class XmlChars
{
    /** Not instantiable: every member of this class is static. */
    private XmlChars () { }

    /**
     * Tells whether a UCS-4 character code may appear in an XML document.
     * Unicode characters occupy the low sixteen bits of a UCS-4 code, and
     * a pair of Unicode <em>surrogate characters</em> can be combined to
     * encode a UCS-4 character in a document holding only Unicode.  (The
     * Java <code>char</code> type represents Unicode characters, unpaired
     * surrogates included.)
     *
     * <P> XML also lets UCS-4 characters be written as <em>character
     * references</em> such as <b>&amp;#x12345678;</b> -- which happens to
     * name a character that is not allowed in XML documents.  Every UCS-4
     * character that XML does allow fits in one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true when the code matches the XML <code>Char</code>
     *  production, false otherwise (including surrogate code points,
     *  0xFFFE, 0xFFFF, negative values and values above 0x10FFFF).
     */
    public static boolean isChar (int ucs4char)
    {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //            ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]

        // below the space character only tab, LF and CR are legal
        if (ucs4char < 0x0020) {
            return ucs4char == 0x0009
                || ucs4char == 0x000A
                || ucs4char == 0x000D;
        }
        if (ucs4char <= 0xD7FF) {
            return true;
        }
        // 0xD800-0xDFFF is the surrogate block
        if (ucs4char < 0xE000) {
            return false;
        }
        if (ucs4char <= 0xFFFD) {
            return true;
        }
        // 0xFFFE and 0xFFFF fall through to here and are rejected
        return ucs4char >= 0x10000 && ucs4char <= 0x10ffff;
    }

    /**
     * Tells whether a character may appear after the first position of
     * a name, following the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is acceptable as a non-initial
     *  name character
     * @see #isNCNameChar
     * @see #isLetter
     */
    public static boolean isNameChar (char c)
    {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // letters, digits and combining characters
        if (isLetter2 (c)) {
            return true;
        }

        switch (c) {
            // the punctuation permitted inside names
            case '.':
            case '-':
            case '_':
            case ':':
                return true;

            default:
                return isExtender (c);
        }
    }

    /**
     * Tells whether a character may appear after the first position of
     * an unscoped name under the XML Namespaces rules.  The only
     * difference from {@link #isNameChar} is that the colon, which
     * separates a name from its scope, is refused.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is not a colon and satisfies
     *  {@link #isNameChar}
     * @see #isNameChar
     * @see #isLetter
     */
    public static boolean isNCNameChar (char c)
    {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        if (c == ':') {
            return false;
        }
        return isNameChar (c);
    }

    /**
     * Tells whether a character is one of the four that XML accepts as
     * whitespace: space, tab, line feed or carriage return.
     *
     * @param c the character being tested
     * @return true for XML whitespace, false for anything else
     */
    public static boolean isSpace (char c)
    {
        switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                return true;

            default:
                return false;
        }
    }


    /*
     * For reference, the java.lang.Character.getType() categories that
     * the tests below rely on:
     *
     *   UPPERCASE_LETTER       (Lu)    LOWERCASE_LETTER  (Ll)
     *   TITLECASE_LETTER       (Lt)    MODIFIER_LETTER   (Lm)
     *   OTHER_LETTER           (Lo)    LETTER_NUMBER     (Nl)
     *   NON_SPACING_MARK       (Mn)    ENCLOSING_MARK    (Me)
     *   COMBINING_SPACING_MARK (Mc)    DECIMAL_DIGIT_NUMBER (Nd)
     */

    /**
     * Tells whether a character is an XML "letter".  XML names have to
     * begin with a letter or one of a few other characters; characters
     * further along a name only need to pass <em>isNameChar</em>.
     *
     * @param c the character being tested
     * @return true if <code>c</code> counts as an XML letter
     * @see #isNameChar
     * @see #isNCNameChar
     */
    public static boolean isLetter (char c)
    {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        // fast answers for the characters seen most often
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
            return true;
        }
        if (c == '/') {
            return false;
        }

        // The spec's tables are impractical to embed, so the Unicode
        // general category drives the decision, as the spec's
        // footnotes describe.
        if (isNameStartCategory (Character.getType (c))) {
            return survivesExclusions (c);
        }

        // a handful of characters outside those categories are
        // nevertheless "alphabetic"
        return between (c, 0x02bb, 0x02c1)
            || c == 0x0559
            || c == 0x06e5
            || c == 0x06e6;
    }

    // Inclusive range test shared by the table lookups in this class.
    private static boolean between (char c, int low, int high)
    {
        return c >= low && c <= high;
    }

    // True for the general categories that an appendix B footnote
    // calls 'name start' characters: Ll, Lu, Lo, Lt and Nl.
    private static boolean isNameStartCategory (int category)
    {
        switch (category) {
            case Character.LOWERCASE_LETTER:    // Ll
            case Character.UPPERCASE_LETTER:    // Lu
            case Character.OTHER_LETTER:        // Lo
            case Character.TITLECASE_LETTER:    // Lt
            case Character.LETTER_NUMBER:       // Nl
                return true;

            default:
                return false;
        }
    }

    // True for the general categories of name characters 'other than
    // name start characters': Mc, Me, Mn, Lm and Nd.
    private static boolean isNameOnlyCategory (int category)
    {
        switch (category) {
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                return true;

            default:
                return false;
        }
    }

    // Applies the exceptions that override an otherwise acceptable
    // general category: compatibility characters are refused, and so
    // are the combiners 0x20dd-0x20e0 (per "5.14 of Unicode").
    private static boolean survivesExclusions (char c)
    {
        if (isCompatibilityChar (c)) {
            return false;
        }
        return !between (c, 0x20dd, 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names.  They
    // exist so that information held in older, non-Unicode character
    // sets can pass through, and each has an alternative Unicode
    // representation (e.g. one built from combining characters).
    //
    private static boolean isCompatibilityChar (char c)
    {
        // Dispatching on the high byte keeps the number of
        // comparisons actually executed small.
        final int page = c >>> 8;

        // 0xf900 and up: the whole "compatibility" area is for
        // that purpose
        if (page >= 0xf9) {
            return true;
        }

        switch (page) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return between (c, 0x0132, 0x0133)
                    || between (c, 0x013f, 0x0140)
                    || c == 0x0149
                    || c == 0x017f
                    || between (c, 0x01c4, 0x01cc)
                    || between (c, 0x01f1, 0x01f3);

            case 0x02:
                // some spacing modifiers
                return between (c, 0x02b0, 0x02b8)
                    || between (c, 0x02e0, 0x02e4);

            case 0x03:
                // Greek
                return c == 0x037a;

            case 0x05:
                // Armenian
                return c == 0x0587;

            case 0x0e:
                // Laotian
                return between (c, 0x0edc, 0x0edd);

            case 0x11:
                return isCompatibilityJamo (c);

            case 0x20:
                // superscript
                return c == 0x207f;

            case 0x21:
                return isCompatibilityLetterlike (c);

            case 0x30:
                // some Hiragana
                return between (c, 0x309b, 0x309c);

            case 0x31:
                // all Hangul Compatibility Jamo
                return between (c, 0x3131, 0x318e);

            default:
                // most of Unicode isn't flagged as being for
                // compatibility
                return false;
        }
    }

    // Big chunks of Hangul Jamo (0x11xx) are all "compatibility";
    // this is the table of the ones that are.
    private static boolean isCompatibilityJamo (char c)
    {
        // isolated code points first ...
        switch (c) {
            case 0x1101:
            case 0x1104:
            case 0x1108:
            case 0x110a:
            case 0x110d:
            case 0x113d:
            case 0x113f:
            case 0x114d:
            case 0x114f:
            case 0x1162:
            case 0x1164:
            case 0x1166:
            case 0x1168:
            case 0x1174:
            case 0x11b9:
            case 0x11bb:
                return true;

            default:
                break;
        }

        // ... then the contiguous runs
        return between (c, 0x1113, 0x113b)
            || between (c, 0x1141, 0x114b)
            || between (c, 0x1151, 0x1153)
            || between (c, 0x1156, 0x1158)
            || between (c, 0x116a, 0x116c)
            || between (c, 0x116f, 0x1171)
            || between (c, 0x1176, 0x119d)
            || between (c, 0x119f, 0x11a2)
            || between (c, 0x11a9, 0x11aa)
            || between (c, 0x11ac, 0x11ad)
            || between (c, 0x11b0, 0x11b6)
            || between (c, 0x11c3, 0x11ea)
            || between (c, 0x11ec, 0x11ef)
            || between (c, 0x11f1, 0x11f8);
    }

    // Compatibility characters on page 0x21: various letterlike
    // symbols, plus most Roman numerals (less 1K, 5K, 10K).
    private static boolean isCompatibilityLetterlike (char c)
    {
        switch (c) {
            case 0x2102:
            case 0x2107:
            case 0x2115:
            case 0x2124:
            case 0x2128:
                return true;

            default:
                break;
        }

        return between (c, 0x210a, 0x2113)
            || between (c, 0x2118, 0x211d)
            || between (c, 0x212c, 0x212d)
            || between (c, 0x212f, 0x2138)
            // the Roman numerals
            || between (c, 0x2160, 0x217f);
    }

    // guts of isNameChar/isNCNameChar: letters, combining characters
    // and digits, i.e. everything a name may hold apart from the
    // punctuation and extenders that isNameChar tests itself
    private static boolean isLetter2 (char c)
    {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        // fast answers for the characters seen most often
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
            return true;
        }
        if (c == '>') {
            return false;
        }

        // As in isLetter, the general category stands in for the
        // spec's tables; here the 'name only' categories count too.
        final int category = Character.getType (c);

        if (isNameStartCategory (category)
                || isNameOnlyCategory (category)) {
            return survivesExclusions (c);
        }

        // one extra character is accepted regardless of category
        return c == 0x0387;
    }

    // [88] Digit ::= ...
    //
    // java.lang.Character.isDigit is correct from the XML point of
    // view except that it allows "fullwidth" digits, so those are
    // refused up front.
    private static boolean isDigit (char c)
    {
        if (between (c, 0xff10, 0xff19)) {
            return false;
        }
        return Character.isDigit (c);
    }

    // [89] Extender ::= ...
    private static boolean isExtender (char c)
    {
        switch (c) {
            case 0x00b7:
            case 0x02d0:
            case 0x02d1:
            case 0x0387:
            case 0x0640:
            case 0x0e46:
            case 0x0ec6:
            case 0x3005:
                return true;

            default:
                return between (c, 0x3031, 0x3035)
                    || between (c, 0x309d, 0x309e)
                    || between (c, 0x30fc, 0x30fe);
        }
    }
}