View Javadoc

1   /*
2    * $Id: XmlChars.java,v 1.1.1.1 2000/11/23 01:53:35 edwingo Exp $
3    *
4    * The Apache Software License, Version 1.1
5    *
6    *
7    * Copyright (c) 2000 The Apache Software Foundation.  All rights 
8    * reserved.
9    *
10   * Redistribution and use in source and binary forms, with or without
11   * modification, are permitted provided that the following conditions
12   * are met:
13   *
14   * 1. Redistributions of source code must retain the above copyright
15   *    notice, this list of conditions and the following disclaimer. 
16   *
17   * 2. Redistributions in binary form must reproduce the above copyright
18   *    notice, this list of conditions and the following disclaimer in
19   *    the documentation and/or other materials provided with the
20   *    distribution.
21   *
22   * 3. The end-user documentation included with the redistribution,
23   *    if any, must include the following acknowledgment:  
24   *       "This product includes software developed by the
25   *        Apache Software Foundation (http://www.apache.org/)."
26   *    Alternately, this acknowledgment may appear in the software itself,
27   *    if and wherever such third-party acknowledgments normally appear.
28   *
29   * 4. The names "Crimson" and "Apache Software Foundation" must
30   *    not be used to endorse or promote products derived from this
31   *    software without prior written permission. For written 
32   *    permission, please contact apache@apache.org.
33   *
34   * 5. Products derived from this software may not be called "Apache",
35   *    nor may "Apache" appear in their name, without prior written
36   *    permission of the Apache Software Foundation.
37   *
38   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49   * SUCH DAMAGE.
50   * ====================================================================
51   *
52   * This software consists of voluntary contributions made by many
53   * individuals on behalf of the Apache Software Foundation and was
54   * originally based on software copyright (c) 1999, Sun Microsystems, Inc., 
55   * http://www.sun.com.  For more information on the Apache Software 
56   * Foundation, please see <http://www.apache.org/>.
57   */
58  
59  package com.bea.xml.stream.reader;
60  
61  
/**
 * Static predicates that decide whether a character may appear in a given
 * role in an XML 1.0 document (document character, name character, letter,
 * whitespace).  They are usable both when parsing and when generating XML.
 *
 * <p>The class only holds static methods and cannot be instantiated.
 *
 * <p>NOTE(review): the letter/name tests rely on
 * {@link Character#getType(char)}, so the exact set of accepted non-ASCII
 * characters follows the Unicode tables of the running JVM -- confirm this
 * is acceptable if strict XML 1.0 (Unicode 2.0 based) conformance matters.
 *
 * @version 1.8
 * @author David Brownell
 */
public final class XmlChars {

    /** Not instantiable: this is a holder for static predicates only. */
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Codes up to 0xFFFF fit in a single Java
     * <code>char</code>; codes in the range 0x10000-0x10FFFF are the ones
     * that a pair of surrogate <code>char</code>s encodes.  The surrogate
     * code units themselves (0xD800-0xDFFF) are rejected, as are 0xFFFE,
     * 0xFFFF and every control code other than tab, line feed and
     * carriage return.
     *
     * <p>In XML such characters may also be written as <em>character
     * references</em>, e.g. <b>&amp;#x12345678;</b> -- which happens to
     * name a code that this method rejects.
     *
     * @param ucs4char the 32-bit UCS-4 character code being tested
     * @return true if the code matches production [2] <code>Char</code>
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]      (surrogates excluded)
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10FFFF]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
            || ucs4char == 0x0009
            || ucs4char == 0x000A
            || ucs4char == 0x000D
            || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
            || (ucs4char >= 0x10000 && ucs4char <= 0x10FFFF);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is a letter, digit, combining
     *         character, extender, or one of <code>. - _ :</code>
     * @see #isNCNameChar
     * @see #isLetter
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // Letters, digits and combining characters are all decided by
        // isLetter2 (which also short-circuits the common '>' terminator).
        if (isLetter2(c)) {
            return true;
        }
        return c == '.' || c == '-' || c == '_' || c == ':' || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces recommendation.  This is exactly {@link #isNameChar}
     * with the colon (the prefix/local-name separator) excluded.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is a name character other than ':'
     * @see #isNameChar
     * @see #isLetter
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested
     * @return true for space, tab, line feed and carriage return only
     */
    public static boolean isSpace(char c) {
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the {@link #isNameChar} predicate.
     *
     * <p>Note that '_' and ':' are <em>not</em> letters for this method;
     * callers validating the first character of a name have to allow
     * them separately.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is accepted as a Letter
     * @see #isNameChar
     * @see #isNCNameChar
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        // Fast paths for the typical (ASCII) case; '/' gets an explicit
        // early rejection so it never reaches the category lookup.
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        // The spec's tables are too large to embed; the Unicode general
        // category footnotes of appendix B drive the test instead.
        switch (Character.getType(c)) {
            // categories that appendix B calls "name start characters"
            case Character.LOWERCASE_LETTER:    // Ll
            case Character.UPPERCASE_LETTER:    // Lu
            case Character.OTHER_LETTER:        // Lo
            case Character.TITLECASE_LETTER:    // Lt
            case Character.LETTER_NUMBER:       // Nl
                return passesNameExclusions(c);

            default:
                // a few characters outside those categories are still
                // treated as "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                    || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    /**
     * Guts of {@link #isNameChar}/{@link #isNCNameChar}: accepts letters
     * plus the categories that are legal only after the first character
     * of a name (combining marks, modifier letters, decimal digits).
     *
     * <p>Decimal digits need no separate test: they are accepted through
     * the <code>Nd</code> category, and the fullwidth digits
     * (0xFF10-0xFF19) are rejected because the whole 0xFF page counts as
     * a compatibility area.
     */
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [87] CombiningChar ::= ... too much to repeat
        // [88] Digit ::= ... too much to repeat

        // Fast paths for the typical (ASCII) case; '>' gets an explicit
        // early rejection so it never reaches the category lookup.
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        switch (Character.getType(c)) {
            // categories that appendix B calls "name start characters" ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and the name characters "other than name start
            // characters"
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                return passesNameExclusions(c);

            default:
                // one extra character is accepted regardless of category
                return c == 0x0387;
        }
    }

    /**
     * Applies the exclusions shared by {@link #isLetter} and
     * {@link #isLetter2} once the general category has been accepted:
     * compatibility characters are refused, and so are the combiners
     * 0x20DD-0x20E0 (ruled out per "5.14 of Unicode").
     */
    private static boolean passesNameExclusions(char c) {
        return !isCompatibilityChar(c) && !(c >= 0x20dd && c <= 0x20e0);
    }

    /**
     * Returns true for "compatibility" characters, which XML 1.0
     * discourages in names.  These were defined to permit passing
     * through information stored in older non-Unicode character sets and
     * always have alternative representations in Unicode, e.g. using
     * combining characters.
     */
    private static boolean isCompatibilityChar(char c) {
        // The many comparisons seem unavoidable, but switching on the
        // high byte limits how many of them actually execute.
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                    || (c >= 0x013f && c <= 0x0140)
                    || c == 0x0149
                    || c == 0x017f
                    || (c >= 0x01c4 && c <= 0x01cc)
                    || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                    || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                     // Greek

            case 0x05:
                return c == 0x0587;                     // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;      // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                    || c == 0x1104
                    || c == 0x1108
                    || c == 0x110a
                    || c == 0x110d
                    || (c >= 0x1113 && c <= 0x113b)
                    || c == 0x113d
                    || c == 0x113f
                    || (c >= 0x1141 && c <= 0x114b)
                    || c == 0x114d
                    || c == 0x114f
                    || (c >= 0x1151 && c <= 0x1153)
                    || (c >= 0x1156 && c <= 0x1158)
                    || c == 0x1162
                    || c == 0x1164
                    || c == 0x1166
                    || c == 0x1168
                    || (c >= 0x116a && c <= 0x116c)
                    || (c >= 0x116f && c <= 0x1171)
                    || c == 0x1174
                    || (c >= 0x1176 && c <= 0x119d)
                    || (c >= 0x119f && c <= 0x11a2)
                    || (c >= 0x11a9 && c <= 0x11aa)
                    || (c >= 0x11ac && c <= 0x11ad)
                    || (c >= 0x11b0 && c <= 0x11b6)
                    || c == 0x11b9
                    || c == 0x11bb
                    || (c >= 0x11c3 && c <= 0x11ea)
                    || (c >= 0x11ec && c <= 0x11ef)
                    || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                     // superscript

            case 0x21:
                // various letterlike symbols
                return c == 0x2102
                    || c == 0x2107
                    || (c >= 0x210a && c <= 0x2113)
                    || c == 0x2115
                    || (c >= 0x2118 && c <= 0x211d)
                    || c == 0x2124
                    || c == 0x2128
                    || (c >= 0x212c && c <= 0x212d)
                    || (c >= 0x212f && c <= 0x2138)
                    // most Roman numerals (less 1K, 5K, 10K)
                    || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    /** Returns true if the character matches production [89] Extender. */
    private static boolean isExtender(char c) {
        // [89] Extender ::= #x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640
        //                 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035]
        //                 | [#x309D-#x309E] | [#x30FC-#x30FE]
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
            || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
            || (c >= 0x309d && c <= 0x309e)
            || (c >= 0x30fc && c <= 0x30fe);
    }
}